diff --git a/.asf.yaml b/.asf.yaml index 0657d888cb2c6..ef141f3bb3fa6 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -49,6 +49,7 @@ github: protected_branches: master: {} + release-2.52.0: {} release-2.51.0: {} release-2.50.0: {} release-2.49.0: {} diff --git a/.github/REVIEWERS.yml b/.github/REVIEWERS.yml index f472568f34793..9fd61727f1064 100644 --- a/.github/REVIEWERS.yml +++ b/.github/REVIEWERS.yml @@ -40,6 +40,7 @@ labels: - kennknowles - robertwb - bvolpato + - m-trieu exclusionList: [] - name: IO reviewers: @@ -52,6 +53,10 @@ labels: - name: spanner reviewers: - nielm + - name: bigtable + reviewers: + - igorbernstein2 + - mutianf exclusionList: [] - name: Build reviewers: diff --git a/.github/actions/common-rc-validation/action.yaml b/.github/actions/common-rc-validation/action.yaml index 23efa93d1533f..51738e1381221 100644 --- a/.github/actions/common-rc-validation/action.yaml +++ b/.github/actions/common-rc-validation/action.yaml @@ -36,19 +36,19 @@ runs: shell: bash run: | echo "---------------------Downloading Python Staging RC----------------------------" - wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.zip - wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.zip.sha512 - if [[ ! -f apache-beam-$RELEASE_VER.zip ]]; then + wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.tar.gz + wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.tar.gz.sha512 + if [[ ! -f apache-beam-$RELEASE_VER.tar.gz ]]; then { echo "Fail to download Python Staging RC files." ;exit 1; } fi echo "--------------------------Verifying Hashes------------------------------------" - sha512sum -c apache-beam-${RELEASE_VER}.zip.sha512 + sha512sum -c apache-beam-${RELEASE_VER}.tar.gz.sha512 `which pip` install --upgrade pip `which pip` install --upgrade setuptools - name: Installing python SDK shell: bash - run: pip install apache-beam-${RELEASE_VER}.zip[gcp] + run: pip install apache-beam-${RELEASE_VER}.tar.gz[gcp] \ No newline at end of file diff --git a/.github/actions/gradle-command-self-hosted-action/action.yml b/.github/actions/gradle-command-self-hosted-action/action.yml index cb793714aa91d..906b35169d9db 100644 --- a/.github/actions/gradle-command-self-hosted-action/action.yml +++ b/.github/actions/gradle-command-self-hosted-action/action.yml @@ -24,12 +24,6 @@ inputs: required: false description: 'Gradle options' default: '' - default-arguments: - required: false - description: 'Default gradle switches' # Copied from CommonJobProperties.groovy' - default: | - --continue -Dorg.gradle.jvmargs=-Xms2g -Dorg.gradle.jvmargs=-Xmx6g \ - -Dorg.gradle.vfs.watch=false -Pdocker-pull-licenses max-workers: required: false description: 'Max number of workers' @@ -47,5 +41,6 @@ runs: if [ -f ~/.m2/settings.xml ]; then rm ~/.m2/settings.xml fi - ./gradlew ${{ inputs.gradle-command }} --max-workers=${{ inputs.max-workers }} ${{ inputs.arguments }} \ - ${{ inputs.default-arguments }} \ No newline at end of file + ./gradlew ${{ inputs.gradle-command }} --max-workers=${{ inputs.max-workers }} --continue \ + -Dorg.gradle.jvmargs=-Xms2g -Dorg.gradle.jvmargs=-Xmx6g -Dorg.gradle.vfs.watch=false -Pdocker-pull-licenses \ + ${{ inputs.arguments }} diff --git a/.github/actions/setup-action/action.yml b/.github/actions/setup-action/action.yml index da69dd9a97ddc..743e89a931da4 100644 --- a/.github/actions/setup-action/action.yml +++ b/.github/actions/setup-action/action.yml @@ -69,6 +69,4 @@ runs: - name: expose gcloud path 
shell: bash run: | - echo KUBELET_GCLOUD_CONFIG_PATH=/var/lib/kubelet/pods/$POD_UID/volumes/kubernetes.io~empty-dir/gcloud >> $GITHUB_ENV - - name: Setup environment - uses: ./.github/actions/setup-environment-action + echo KUBELET_GCLOUD_CONFIG_PATH=/var/lib/kubelet/pods/$POD_UID/volumes/kubernetes.io~empty-dir/gcloud >> $GITHUB_ENV \ No newline at end of file diff --git a/.github/actions/setup-environment-action/action.yml b/.github/actions/setup-environment-action/action.yml index 3452a16c132c2..5c6151ca6e1f4 100644 --- a/.github/actions/setup-environment-action/action.yml +++ b/.github/actions/setup-environment-action/action.yml @@ -30,6 +30,10 @@ inputs: required: false description: 'Install Go version' default: '' + disable-cache: + required: false + description: 'Whether to disable the gradle cache' + default: false runs: using: "composite" @@ -48,7 +52,7 @@ runs: - name: Setup Gradle uses: gradle/gradle-build-action@v2 with: - cache-read-only: false + cache-read-only: ${{ inputs.disable-cache }} - name: Install Go if: ${{ inputs.go-version != '' }} uses: actions/setup-go@v3 diff --git a/.github/autolabeler.yml b/.github/autolabeler.yml index 5a8a22044da43..57c8f65c6ac17 100644 --- a/.github/autolabeler.yml +++ b/.github/autolabeler.yml @@ -31,6 +31,7 @@ python: ["sdks/python/**/*", "learning/katas/python/**/*"] typescript: ["sdks/typescript/**/*"] vendor: ["vendor/**/*"] website: ["website/**/*"] +yaml: ["sdks/python/apache_beam/yaml/**"] # Extensions extensions: ["sdks/java/extensions/**/*", "runners/extensions-java/**/*"] @@ -68,6 +69,7 @@ io: ["sdks/go/pkg/beam/io/**/*", "sdks/java/io/**/*", "sdks/python/apache_beam/ "redis": ["sdks/java/io/redis/**/*"] "solr": ["sdks/java/io/solr/**/*"] "spanner": ["sdks/go/pkg/beam/io/spannerio/**/*", "sdks/python/apache_beam/io/gcp/spanner.py", "sdks/python/apache_beam/io/gcp/experimental/spannerio.py", "sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/**/*"] +"bigtable": ["sdks/go/pkg/beam/io/bigtableio/**/*", "sdks/go/pkg/beam/io/xlang/bigtableio/**/*", "sdks/python/apache_beam/io/gcp/bigtableio.py", "sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/**/*"] "synthetic": ["sdks/java/io/synthetic/**/*"] "tests": ["sdks/java/io/file-based-io-tests/**/*"] "thrift": ["sdks/java/io/thrift/**/*"] diff --git a/.github/build.gradle b/.github/build.gradle new file mode 100644 index 0000000000000..acfe7f3686a97 --- /dev/null +++ b/.github/build.gradle @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +buildscript { + repositories { + mavenCentral() + } + dependencies { + classpath group: 'org.yaml', name: 'snakeyaml', version: '2.2' + } +} + +/** check that the yml files are valid */ +task check { + doLast { + fileTree("${project.projectDir}/workflows").matching { + include "*.yml" + include "*.yaml" + }.each { + def fname = it.getName() + // attempt to load the yml to make sure it is valid + def workflow = new org.yaml.snakeyaml.Yaml().load(it.newInputStream()) + + // additional guards for the run-all-tests functionality + // TODO(yathu) also expand this to postcommits, prior to tearing down the Jenkins postcommits + if (fname.startsWith("beam_PreCommit")) { + List paths + try { + paths = workflow.getAt(true).pull_request_target.paths as List + } catch (Exception e) { + throw new GradleException("Failed to get the trigger path for ${fname}. " + + "Make sure the precommit has a pull_request_target trigger.", e) + } + // the precommit should be triggered by this specific file + if (paths != null && !paths.contains('release/trigger_all_tests.json')) { + throw new GradleException("Error validating ${fname}: " + + "Please add 'release/trigger_all_tests.json' to the trigger paths so that the release verification runs properly") + } + } + } + } +} + +task preCommit { + dependsOn check +} \ No newline at end of file diff --git a/.github/gh-actions-self-hosted-runners/arc/config/arc_autoscaler.tpl b/.github/gh-actions-self-hosted-runners/arc/config/arc_autoscaler.tpl index f6da0aff038ae..4b04c5ad8eb15 100644 --- a/.github/gh-actions-self-hosted-runners/arc/config/arc_autoscaler.tpl +++ b/.github/gh-actions-self-hosted-runners/arc/config/arc_autoscaler.tpl @@ -27,7 +27,7 @@ spec: name: ${name} minReplicas: ${min_runners} maxReplicas: ${max_runners} - %{~ if webhook_scaling == "true" ~} + %{~ if webhook_scaling ~} scaleUpTriggers: - githubEvent: workflowJob: {} diff --git a/.github/gh-actions-self-hosted-runners/arc/config/arc_deployment.tpl b/.github/gh-actions-self-hosted-runners/arc/config/arc_deployment.tpl index 6234571c55a30..41c6700b18b92 100644 --- a/.github/gh-actions-self-hosted-runners/arc/config/arc_deployment.tpl +++ b/.github/gh-actions-self-hosted-runners/arc/config/arc_deployment.tpl @@ -22,7 +22,11 @@ metadata: name: ${name} spec: template: + metadata: + annotations: + cluster-autoscaler.kubernetes.io/safe-to-evict: "false" spec: + dockerMTU: 1460 %{~ if selector == true ~} nodeSelector: runner-pool: ${name} diff --git a/.github/gh-actions-self-hosted-runners/arc/environments/beam.env b/.github/gh-actions-self-hosted-runners/arc/environments/beam.env index 9de66b628c898..7f58e016a85ca 100644 --- a/.github/gh-actions-self-hosted-runners/arc/environments/beam.env +++ b/.github/gh-actions-self-hosted-runners/arc/environments/beam.env @@ -20,7 +20,7 @@ project_id = "apache-beam-testing" region = "us-central1" zone = "us-central1-b" -environment = "beam" +environment = "beam-prod" ingress_domain = "action.beam.apache.org" organization = "apache" repository = "beam" @@ -28,16 +28,20 @@ github_app_id_secret_name = "gh-app_id" github_app_install_id_secret_name = "gh-app_installation_id" github_private_key_secret_name = "gh-pem_key" deploy_webhook = "true" +existing_vpc_name = "default" +existing_ip_name = "beam-arc-webhook-ip" +subnetwork_cidr_range = "10.119.0.0/20" +service_account_id = "beam-github-actions@apache-beam-testing.iam.gserviceaccount.com" runner_group = "beam" main_runner = { name = "main-runner" runner_image =
"us-central1-docker.pkg.dev/apache-beam-testing/beam-github-actions/beam-arc-runner:2b20e26bb3b99d8e4f41a3d1d9d2e7080043de5c" machine_type = "e2-standard-16" min_node_count = "1" - max_node_count = "24" + max_node_count = "30" min_replicas = "1" - max_replicas = "200" - webhook_scaling = true + max_replicas = "240" + webhook_scaling = false disk_size_gb = 200 requests = { cpu = "2" @@ -49,10 +53,10 @@ additional_runner_pools = [{ machine_type = "e2-standard-2" runner_image = "us-central1-docker.pkg.dev/apache-beam-testing/beam-github-actions/beam-arc-runner:2b20e26bb3b99d8e4f41a3d1d9d2e7080043de5c" min_node_count = "1" - max_node_count = "10" + max_node_count = "15" min_replicas = "1" - max_replicas = "10" - webhook_scaling = "true" + max_replicas = "15" + webhook_scaling = false requests = { cpu = "1500m" memory = "5Gi" @@ -66,10 +70,10 @@ additional_runner_pools = [{ machine_type = "c3-highmem-8" runner_image = "us-central1-docker.pkg.dev/apache-beam-testing/beam-github-actions/beam-arc-runner:2b20e26bb3b99d8e4f41a3d1d9d2e7080043de5c" min_node_count = "1" - max_node_count = "10" + max_node_count = "15" min_replicas = "1" - max_replicas = "10" - webhook_scaling = "true" + max_replicas = "15" + webhook_scaling = false requests = { cpu = "7.5" memory = "5Gi" diff --git a/.github/gh-actions-self-hosted-runners/arc/gke.tf b/.github/gh-actions-self-hosted-runners/arc/gke.tf index bfb048885570a..45421ad38b472 100644 --- a/.github/gh-actions-self-hosted-runners/arc/gke.tf +++ b/.github/gh-actions-self-hosted-runners/arc/gke.tf @@ -21,7 +21,7 @@ resource "google_container_cluster" "actions-runner-gke" { project = var.project_id location = var.zone initial_node_count = 1 - network = google_compute_network.actions-runner-network.id + network = data.google_compute_network.actions-runner-network.id subnetwork = google_compute_subnetwork.actions-runner-subnetwork.id remove_default_node_pool = true @@ -45,6 +45,7 @@ resource "google_container_node_pool" "main-actions-runner-pool" { oauth_scopes = [ "https://www.googleapis.com/auth/cloud-platform" ] + service_account = data.google_service_account.service_account.email tags = ["actions-runner-pool"] } } @@ -72,6 +73,7 @@ resource "google_container_node_pool" "additional_runner_pools" { oauth_scopes = [ "https://www.googleapis.com/auth/cloud-platform" ] + service_account = data.google_service_account.service_account.email tags = ["actions-runner-pool"] labels = { "runner-pool" = each.value.name @@ -90,5 +92,15 @@ resource "google_container_node_pool" "additional_runner_pools" { resource "google_compute_global_address" "actions-runner-ip" { - name = "${var.environment}-actions-runner-ip" + count = var.deploy_webhook == "true" && var.existing_ip_name == "" ? 1 : 0 + name = "${var.environment}-actions-runner-ip" +} + +data "google_compute_global_address" "actions-runner-ip" { + count = var.deploy_webhook == "true" ? 1 : 0 + name = var.existing_ip_name == "" ? 
google_compute_global_address.actions-runner-ip[0].name : var.existing_ip_name +} + +data "google_service_account" "service_account" { + account_id = var.service_account_id } \ No newline at end of file diff --git a/.github/gh-actions-self-hosted-runners/arc/kubernetes.tf b/.github/gh-actions-self-hosted-runners/arc/kubernetes.tf index bafb653896d73..0a36e1fa2ba63 100644 --- a/.github/gh-actions-self-hosted-runners/arc/kubernetes.tf +++ b/.github/gh-actions-self-hosted-runners/arc/kubernetes.tf @@ -27,6 +27,7 @@ resource "kubectl_manifest" "arc_autoscaler" { depends_on = [helm_release.arc] } resource "kubectl_manifest" "arc_webhook_certificate" { + count = var.deploy_webhook != "false" ? 1 : 0 yaml_body = templatefile("config/arc_certificate.tpl", { ingress_domain = var.ingress_domain }) override_namespace = "arc" depends_on = [helm_release.arc] diff --git a/.github/gh-actions-self-hosted-runners/arc/locals.tf b/.github/gh-actions-self-hosted-runners/arc/locals.tf index 170193b8b6b6d..a69d069ab8657 100644 --- a/.github/gh-actions-self-hosted-runners/arc/locals.tf +++ b/.github/gh-actions-self-hosted-runners/arc/locals.tf @@ -19,20 +19,18 @@ locals { - subnetwork_cidr_range = "10.128.0.0/20" arc_values = { - "githubWebhookServer.enabled" = "true" + "githubWebhookServer.enabled" = "${var.deploy_webhook}" "authSecret.create" = "true" "authSecret.github_app_id" = data.google_secret_manager_secret_version.github_app_id.secret_data "authSecret.github_app_installation_id" = data.google_secret_manager_secret_version.github_app_install_id.secret_data "authSecret.github_app_private_key" = data.google_secret_manager_secret_version.github_private_key.secret_data - "githubWebhookServer.ingress.enabled" = "true" + "githubWebhookServer.ingress.enabled" = "${var.deploy_webhook}" "githubWebhookServer.ingress.hosts[0].host" = var.ingress_domain "githubWebhookServer.ingress.hosts[0].paths[0].path" = "/" "githubWebhookServer.ingress.hosts[0].paths[0].pathType" = "ImplementationSpecific" "githubWebhookServer.service.type" = "NodePort" - #"githubWebhookServer.ingress.tls[0].hosts[0]" = var.ingress_domain - "githubWebhookServer.ingress.annotations.kubernetes\\.io/ingress\\.global-static-ip-name" = google_compute_global_address.actions-runner-ip.name + "githubWebhookServer.ingress.annotations.kubernetes\\.io/ingress\\.global-static-ip-name" = var.deploy_webhook != "false" ? data.google_compute_global_address.actions-runner-ip[0].name : "not-configured" "githubWebhookServer.ingress.annotations.networking\\.gke\\.io/managed-certificates" = "managed-cert" "githubWebhookServer.ingress.annotations.kubernetes\\.io/ingress\\.class" = "gce" } diff --git a/.github/gh-actions-self-hosted-runners/arc/network.tf b/.github/gh-actions-self-hosted-runners/arc/network.tf index fb7c23a7a3c64..1bc685641337c 100644 --- a/.github/gh-actions-self-hosted-runners/arc/network.tf +++ b/.github/gh-actions-self-hosted-runners/arc/network.tf @@ -18,15 +18,21 @@ # resource "google_compute_network" "actions-runner-network" { + count = var.existing_vpc_name == "" ? 1 : 0 project = var.project_id name = "${var.environment}-actions-runner-network" auto_create_subnetworks = false } +data "google_compute_network" "actions-runner-network" { + name = var.existing_vpc_name == "" ?
google_compute_network.actions-runner-network[0].name : var.existing_vpc_name + project = var.project_id +} + resource "google_compute_subnetwork" "actions-runner-subnetwork" { - ip_cidr_range = local.subnetwork_cidr_range + ip_cidr_range = var.subnetwork_cidr_range name = "${var.environment}-actions-runner-subnetwork" - network = google_compute_network.actions-runner-network.id + network = data.google_compute_network.actions-runner-network.id region = var.region project = var.project_id } diff --git a/.github/gh-actions-self-hosted-runners/arc/outputs.tf b/.github/gh-actions-self-hosted-runners/arc/outputs.tf index 1e805ca74ce1f..f7450911aaf70 100644 --- a/.github/gh-actions-self-hosted-runners/arc/outputs.tf +++ b/.github/gh-actions-self-hosted-runners/arc/outputs.tf @@ -24,9 +24,8 @@ output "cluster_endpoint" { value = google_container_cluster.actions-runner-gke.endpoint } output "ingress_ip" { - value = google_compute_global_address.actions-runner-ip.address + value = var.deploy_webhook != "false" ? data.google_compute_global_address.actions-runner-ip[0].address : "Not Configured" } - output "get_kubeconfig_command" { value = "gcloud container clusters get-credentials ${google_container_cluster.actions-runner-gke.name} --region ${var.zone} --project ${var.project_id}" } diff --git a/.github/gh-actions-self-hosted-runners/arc/provider.tf b/.github/gh-actions-self-hosted-runners/arc/provider.tf index 11aa604fb288a..dc557b62a559d 100644 --- a/.github/gh-actions-self-hosted-runners/arc/provider.tf +++ b/.github/gh-actions-self-hosted-runners/arc/provider.tf @@ -19,7 +19,7 @@ terraform { backend "gcs" { - prefix = "test-state" + prefix = "prod" } required_providers { @@ -28,8 +28,8 @@ terraform { version = "~> 4.62.0" } kubectl = { - source = "gavinbunney/kubectl" - version = ">= 1.7.0" + source = "alekc/kubectl" + version = ">= 2.0.2" } } } diff --git a/.github/gh-actions-self-hosted-runners/arc/variables.tf b/.github/gh-actions-self-hosted-runners/arc/variables.tf index 43f51938b7d1f..3caeffe5a5232 100644 --- a/.github/gh-actions-self-hosted-runners/arc/variables.tf +++ b/.github/gh-actions-self-hosted-runners/arc/variables.tf @@ -58,6 +58,23 @@ variable "deploy_webhook" { description = "Enable Github Webhook deployment. use this if the Github App has permissions to create webhooks" default = "false" } +variable "existing_vpc_name" { + description = "Name of existing VPC to use for deployment" + default = "" +} +variable "existing_ip_name" { + description = "Name of existing IP to use for ingress" + default = "" +} +variable "subnetwork_cidr_range" { + description = "CIDR range for subnetwork" + default = "10.128.0.0/20" + +} +variable "service_account_id" { + description = "ID of service account to use for deployment. 
This can be Name, full Email or Fully Qualified Path" + default = "" +} variable "runner_group" { description = "value for the runner group label" default = "" diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 8fba73ed8fc38..bd4dcc7d067c2 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -49,7 +49,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Job Phrase' steps: @@ -78,14 +78,14 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || startsWith(github.event.comment.body, 'Run Job With Matrix') steps: - uses: actions/checkout@v3 - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{matrix.job_phrase}} + comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) ``` @@ -119,7 +119,7 @@ Concurrency groups are a way of making sure that no more than one Actions run is ``` concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true ``` @@ -178,175 +178,287 @@ You can do this by changing runs-on: [self-hosted, ubuntu-20.04, main] (self-hos # Workflows Please note that jobs with matrix need to have matrix element in the comment. 
Example: ```Run Python PreCommit (3.8)``` + +### PreCommit Jobs + | Workflow name | Matrix | Trigger Phrase | Cron Status | |:-------------:|:------:|:--------------:|:-----------:| -| [ Java InfluxDbIO Integration Test ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_InfluxDbIO_IT.yml) | N/A |`Run Java InfluxDbIO_IT`| [![.github/workflows/beam_PostCommit_Java_InfluxDbIO_IT.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_InfluxDbIO_IT.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_InfluxDbIO_IT.yml) -| [ Load Tests GBK Dataflow Batch Go ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml) | N/A |`Run Load Tests Go GBK Dataflow Batch`| [![.github/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml) -| [ Load Tests CoGBK Dataflow Streaming Java ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming.yml) | N/A |`Run Load Tests Java CoGBK Dataflow Streaming`| [![.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming.yml) -| [ Load Tests Combine Dataflow Batch Python ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml) | N/A |`Run Load Tests Python Combine Dataflow Batch`| [![.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml) -| [ Load Tests Combine Dataflow Batch Python ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml) | N/A |`Run Load Tests Python Combine Dataflow Batch`| [![.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml) -| [ Load Tests FnApiRunner Microbenchmark Python ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_FnApiRunner_Microbenchmark.yml) | N/A |`Run Python Load Tests FnApiRunner Microbenchmark`| [![.github/workflows/beam_LoadTests_Python_FnApiRunner_Microbenchmark](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_FnApiRunner_Microbenchmark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_FnApiRunner_Microbenchmark.yml) -| [ Load Tests ParDo Dataflow Batch Go ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_ParDo_Dataflow_Batch.yml) | N/A |`Run Load Tests Go ParDo Dataflow Batch`| [![.github/workflows/beam_LoadTests_Go_ParDo_Dataflow_Batch](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_ParDo_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_ParDo_Dataflow_Batch.yml) -| [ Performance Tests AvroIOIT 
HDFS ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_AvroIOIT_HDFS.yml) | N/A |`Run Java AvroIO Performance Test HDFS`| [![.github/workflows/beam_PerformanceTests_AvroIOIT_HDFS.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_AvroIOIT_HDFS.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_AvroIOIT_HDFS.yml) -| [ Performance Tests AvroIOIT ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_AvroIOIT.yml) | N/A |`Run Java AvroIO Performance Test`| [![.github/workflows/beam_PerformanceTests_AvroIOIT.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_AvroIOIT.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_AvroIOIT.yml) -| [ Performance Tests BigQueryIO Batch Java Avro ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro.yml) | N/A |`Run BigQueryIO Batch Performance Test Java Avro`| [![.github/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro.yml) -| [ Performance Tests BigQueryIO Batch Java Json ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json.yml) | N/A |`Run BigQueryIO Batch Performance Test Java Json`| [![.github/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json.yml) -| [ Performance Tests BigQueryIO Streaming Java ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java.yml) | N/A |`Run BigQueryIO Streaming Performance Test Java`| [![.github/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java.yml) -| [ PostCommit BeamMetrics Publish ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_BeamMetrics_Publish.yml) | N/A |`Run Beam Metrics Deployment`| [![.github/workflows/beam_PostCommit_BeamMetrics_Publish.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_BeamMetrics_Publish.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_BeamMetrics_Publish.yml) -| [ PostCommit Go ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go.yml) | N/A |`Run Go PostCommit`| [![.github/workflows/beam_PostCommit_Go.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go.yml) | -| [ PostCommit Go Dataflow ARM](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_Dataflow_ARM.yml) | N/A |`Run Go PostCommit Dataflow ARM`| 
[![.github/workflows/beam_PostCommit_Go_Dataflow_ARM.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_Dataflow_ARM.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_Dataflow_ARM.yml) | -| [ PostCommit Go VR Flink](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Flink.yml) | N/A |`Run Go Flink ValidatesRunner`| [![.github/workflows/beam_PostCommit_Go_VR_Flink.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Flink.yml) | -| [ PostCommit Go VR Samza](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Samza.yml) | N/A |`Run Go Samza ValidatesRunner`| [![.github/workflows/beam_PostCommit_Go_VR_Samza.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Samza.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Samza.yml) | -| [ PostCommit Go VR Spark](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Spark.yml) | N/A |`Run Go Spark ValidatesRunner`| [![.github/workflows/beam_PostCommit_Go_VR_Spark.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Spark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Spark.yml) | -| [ PostCommit Java Avro Versions ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Avro_Versions.yml) | N/A |`Run Java Avro Versions PostCommit`| [![.github/workflows/beam_PostCommit_Java_Avro_Versions.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Avro_Versions.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Avro_Versions.yml) | -| [ PostCommit Java Dataflow V1 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_DataflowV1.yml) | N/A |`Run PostCommit_Java_Dataflow`| [![.github/workflows/beam_PostCommit_Java_DataflowV1.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_DataflowV1.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_DataflowV1.yml) | -| [ PostCommit Java Dataflow V2 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_DataflowV2.yml) | N/A |`Run PostCommit_Java_DataflowV2`| [![.github/workflows/beam_PostCommit_Java_DataflowV2.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_DataflowV2.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_DataflowV2.yml) | -| [ PostCommit Java Examples Dataflow ARM ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml) | ['8','11','17'] |`Run Java_Examples_Dataflow_ARM PostCommit (matrix_element)`| [![.github/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml) | -| [ PostCommit Java Examples Dataflow](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow.yml) | N/A |`Run Java examples on Dataflow`| 
[![.github/workflows/beam_PostCommit_Java_Examples_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow.yml) | -| [ PostCommit Java Examples Dataflow Java ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_Java.yml) | ['11','17'] |`Run Java examples on Dataflow Java (matrix_element)`| [![.github/workflows/beam_PostCommit_Java_Examples_Dataflow_Java.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_Java.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_Java.yml) | -| [ PostCommit Java Examples Direct ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Direct.yml) | N/A |`Run Java Examples_Direct`| [![.github/workflows/beam_PostCommit_Java_Examples_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Direct.yml) | -| [ PostCommit Java Examples Flink ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Flink.yml) | N/A |`Run Java Examples_Flink`| [![.github/workflows/beam_PostCommit_Java_Examples_Flink.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Flink.yml) | -| [ PostCommit Java Examples Spark ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Spark.yml) | N/A |`Run Java Examples_Spark`| [![.github/workflows/beam_PostCommit_Java_Examples_Spark.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Spark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Spark.yml) | -| [ PostCommit Java Hadoop Versions ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Hadoop_Versions.yml) | N/A |`Run PostCommit_Java_Hadoop_Versions`| [![.github/workflows/beam_PostCommit_Java_Hadoop_Versions.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Hadoop_Versions.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Hadoop_Versions.yml) | -| [ PostCommit Java Jpms Dataflow Java11 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java11.yml) | N/A |`Run Jpms Dataflow Java 11 PostCommit`| [![.github/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java11](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java11.yml) | -| [ PostCommit Java Jpms Dataflow Java17 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java17.yml) | N/A |`Run Jpms Dataflow Java 17 PostCommit`| [![.github/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java17](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java17.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java17.yml) | -| [ PostCommit Java Jpms Direct Java11 
](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java11.yml) | N/A |`Run Jpms Direct Java 11 PostCommit`| [![.github/workflows/beam_PostCommit_Java_Jpms_Direct_Java11](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java11.yml) | -| [ PostCommit Java Jpms Direct Java17 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java17.yml) | N/A |`Run Jpms Direct Java 17 PostCommit`| [![.github/workflows/beam_PostCommit_Java_Jpms_Direct_Java17](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java17.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java17.yml) | -| [ PostCommit Java Jpms Flink Java11 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Flink_Java11.yml) | N/A |`Run Jpms Flink Java 11 PostCommit`| [![.github/workflows/beam_PostCommit_Java_Jpms_Flink_Java11](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Flink_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Flink_Java11.yml) | -| [ PostCommit Java Jpms Spark Java11 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Spark_Java11.yml) | N/A |`Run Jpms Spark Java 11 PostCommit`| [![.github/workflows/beam_PostCommit_Java_Jpms_Spark_Java11](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Spark_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Spark_Java11.yml) | -| [ PostCommit Java Nexmark Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Dataflow.yml) | N/A |`Run Dataflow Runner Nexmark Tests`| [![.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Dataflow.yml) | -| [ PostCommit Java Nexmark Dataflow V2 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2.yml) | N/A |`Run Dataflow Runner V2 Nexmark Tests`| [![.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2.yml) | -| [ PostCommit Java Nexmark Dataflow V2 Java ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2_Java.yml) | ['11','17'] |`Run Dataflow Runner V2 Java (matrix) Nexmark Tests`| [![.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2_Java.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2_Java.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2_Java.yml) | -| [ PostCommit Java Nexmark Direct ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Direct.yml) | N/A |`Run Direct Runner Nexmark Tests`| 
[![.github/workflows/beam_PostCommit_Java_Nexmark_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Direct.yml) | -| [ PostCommit Java Nexmark Flink ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Flink.yml) | N/A |`Run Flink Runner Nexmark Tests`| [![.github/workflows/beam_PostCommit_Java_Nexmark_Flink.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Flink.yml) | -| [ PostCommit Java Nexmark Spark ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Spark.yml) | N/A |`Run Spark Runner Nexmark Tests`| [![.github/workflows/beam_PostCommit_Java_Nexmark_Spark.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Spark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Spark.yml) | -| [ PostCommit Java PVR Flink Streaming ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Flink_Streaming.yml) | N/A |`Run Java Flink PortableValidatesRunner Streaming`| [![PostCommit Java PVR Flink Streaming](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Flink_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Flink_Streaming.yml) | -| [ PostCommit Java PVR Samza ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Samza.yml) | N/A |`Run Java Samza PortableValidatesRunner`| [![PostCommit Java PVR Samza](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Samza.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Samza.yml) | -| [ PostCommit Java PVR Spark3 Streaming ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Spark3_Streaming.yml) | N/A |`Run Java Spark v3 PortableValidatesRunner Streaming`| [![PostCommit Java PVR Spark3 Streaming](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Spark3_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Spark3_Streaming.yml) | -| [ PostCommit Java PVR Spark Batch ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Spark_Batch.yml) | N/A |`Run Java Spark PortableValidatesRunner Batch`| [![PostCommit Java PVR Spark Batch](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Spark_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Spark_Batch.yml) | -| [ PostCommit Java Sickbay ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Sickbay.yml) | N/A |`Run Java Sickbay`| [![.github/workflows/beam_PostCommit_Java_Sickbay.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Sickbay.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Sickbay.yml) | -| [ PostCommit Java Tpcds Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Tpcds_Dataflow.yml) | N/A |`Run Dataflow Runner Tpcds Tests`| 
[![.github/workflows/beam_PostCommit_Java_Tpcds_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Tpcds_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Tpcds_Dataflow.yml) | -| [ PostCommit Java Tpcds Flink ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Tpcds_Flink.yml) | N/A |`Run Flink Runner Tpcds Tests`| [![.github/workflows/beam_PostCommit_Java_Tpcds_Flink.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Tpcds_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Tpcds_Flink.yml) | -| [ PostCommit Java Tpcds Spark ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Tpcds_Spark.yml) | N/A |`Run Spark Runner Tpcds Tests`| [![.github/workflows/beam_PostCommit_Java_Tpcds_Spark.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Tpcds_Spark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Tpcds_Spark.yml) | -| [ PostCommit Java ValidatesRunner Dataflow JavaVersions ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_Java.yml) | ['11','17'] |`Run Dataflow ValidatesRunner Java (matrix_element)`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_Java.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_Java.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_Java.yml) | -| [ PostCommit Java ValidatesRunner Dataflow Streaming ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_Streaming.yml) | N/A |`Run Dataflow Streaming ValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_Streaming.yml) | -| [ PostCommit Java ValidatesRunner Dataflow V2 Streaming ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2_Streaming.yml) | N/A |`Run Java Dataflow V2 ValidatesRunner Streaming`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2_Streaming.yml) | -| [ PostCommit Java ValidatesRunner Dataflow V2 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2.yml) | N/A |`Run Java Dataflow V2 ValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2.yml) | -| [ PostCommit Java ValidatesRunner Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow.yml) | N/A |`Run Dataflow ValidatesRunner`| 
[![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow.yml) | -| [ PostCommit Java ValidatesRunner Direct JavaVersions ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Direct_Java.yml) | ['11','17'] |`Run Direct ValidatesRunner Java (matrix_element)`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Direct_Java.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Direct_Java.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Direct_Java.yml) | -| [ PostCommit Java ValidatesRunner Direct ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Direct.yml) | N/A |`Run Direct ValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Direct.yml) | -| [ PostCommit Java ValidatesRunner Flink Java11 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Flink_Java11.yml) | N/A |`Run Flink ValidatesRunner Java 11`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Flink_Java11.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Flink_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Flink_Java11.yml) | -| [ PostCommit Java ValidatesRunner Flink ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Flink.yml) | N/A |`Run Flink ValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Flink.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Flink.yml) | -| [ PostCommit Java ValidatesRunner Samza ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Samza.yml) | N/A |`Run Samza ValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Samza.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Samza.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Samza.yml) | -| [ PostCommit Java ValidatesRunner Spark Java11 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.yml) | N/A |`Run Spark ValidatesRunner Java 11`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.yml) | -| [ PostCommit Java ValidatesRunner Spark ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Spark.yml) | N/A |`Run Spark ValidatesRunner`| 
[![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Spark.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Spark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Spark.yml) | -| [ PostCommit Java ValidatesRunner SparkStructuredStreaming ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.yml) | N/A |`Run Spark StructuredStreaming ValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.yml) | -| [ PostCommit Java ValidatesRunner Twister2 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Twister2.yml) | N/A |`Run Twister2 ValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Twister2.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Twister2.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Twister2.yml) | -| [ PostCommit Java ValidatesRunner ULR ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_ULR.yml) | N/A |`Run ULR Loopback ValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_ULR.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_ULR.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_ULR.yml) | -| [ PostCommit Java ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java.yml) | N/A |`Run Java PostCommit`| [![.github/workflows/beam_PostCommit_Java.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java.yml) | -| [ PostCommit Javadoc ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Javadoc.yml) | N/A |`Run Javadoc PostCommit`| [![.github/workflows/beam_PostCommit_Javadoc.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Javadoc.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Javadoc.yml) | -| [ PostCommit PortableJar Flink ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_PortableJar_Flink.yml) | N/A |`Run PortableJar_Flink PostCommit`| [![.github/workflows/beam_PostCommit_PortableJar_Flink.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_PortableJar_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_PortableJar_Flink.yml) | -| [ PostCommit PortableJar Spark ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_PortableJar_Spark.yml) | N/A |`Run PortableJar_Spark PostCommit`| [![.github/workflows/beam_PostCommit_PortableJar_Spark.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_PortableJar_Spark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_PortableJar_Spark.yml) | -| [ PostCommit Python ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python.yml) | ['3.8','3.9','3.10','3.11'] |`Run 
Python PostCommit (matrix_element)`| [![.github/workflows/beam_PostCommit_Python.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python.yml) | -| [ PostCommit Python Arm](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Arm.yml) | ['3.8','3.9','3.10','3.11'] |`Run Python PostCommit Arm (matrix_element)`| [![.github/workflows/beam_PostCommit_Python_Arm.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Arm.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Arm.yml) | -| [ PostCommit Python Examples Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Dataflow.yml) | N/A |`Run Python Examples_Dataflow`| [![.github/workflows/beam_PostCommit_Python_Examples_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Dataflow.yml) | -| [ PostCommit Python Examples Direct ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Direct.yml) | ['3.8','3.9','3.10','3.11'] |`Run Python Examples_Direct (matrix_element)`| [![.github/workflows/beam_PostCommit_Python_Examples_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Direct.yml) | -| [ PostCommit Python Examples Flink ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Flink.yml) | ['3.8','3.11'] |`Run Python Examples_Flink (matrix_element)`| [![.github/workflows/beam_PostCommit_Python_Examples_Flink.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Flink.yml) | -| [ PostCommit Python Examples Spark ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Spark.yml) | ['3.8','3.11'] |`Run Python Examples_Spark (matrix_element)`| [![.github/workflows/beam_PostCommit_Python_Examples_Spark.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Spark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Spark.yml) | -| [ PostCommit Python MongoDBIO IT ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_MongoDBIO_IT.yml) | N/A |`Run Python MongoDBIO_IT`| [![.github/workflows/beam_PostCommit_Python_MongoDBIO_IT.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_MongoDBIO_IT.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_MongoDBIO_IT.yml) | -| [ PostCommit Python Nexmark Direct ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Nexmark_Direct.yml) | N/A |`Run Python Direct Runner Nexmark Tests`| [![.github/workflows/beam_PostCommit_Python_Nexmark_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Nexmark_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Nexmark_Direct.yml) | -| [ PostCommit Python ValidatesContainer Dataflow 
](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml) | ['3.8','3.9','3.10','3.11'] |`Run Python Dataflow ValidatesContainer (matrix_element)`| [![.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml) | -| [ PostCommit Python ValidatesContainer Dataflow With RC ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml) | ['3.8','3.9','3.10','3.11'] |`Run Python RC Dataflow ValidatesContainer (matrix_element)`| [![.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml) | -| [ PostCommit Python ValidatesRunner Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml) | ['3.8','3.11'] |`Run Python Dataflow ValidatesRunner (matrix_element)`| [![PostCommit Python ValidatesRunner Dataflow](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml) | -| [ PostCommit Python ValidatesRunner Flink ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml) | ['3.8','3.11'] |`Run Python Flink ValidatesRunner (matrix_element)`| [![PostCommit Python ValidatesRunner Flink](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml) | -| [ PostCommit Python ValidatesRunner Samza ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml) | ['3.8','3.11'] |`Run Python Samza ValidatesRunner (matrix_element)`| [![PostCommit Python ValidatesRunner Samza](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml) | -| [ PostCommit Python ValidatesRunner Spark ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Spark.yml) | ['3.8','3.9','3.11'] |`Run Python Spark ValidatesRunner (matrix_element)`| [![PostCommit Python ValidatesRunner Spark](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Spark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Spark.yml) | -| [ PostCommit Python Xlang Gcp Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Xlang_Gcp_Dataflow.yml) | N/A |`Run Python_Xlang_Gcp_Dataflow PostCommit`| 
[![.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Xlang_Gcp_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Xlang_Gcp_Dataflow.yml) | -| [ PostCommit Python Xlang Gcp Direct ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Xlang_Gcp_Direct.yml) | N/A |`Run Python_Xlang_Gcp_Direct PostCommit`| [![.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Xlang_Gcp_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Xlang_Gcp_Direct.yml) | -| [ PostCommit Python Xlang IO Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Xlang_IO_Dataflow.yml) | N/A |`Run Python_Xlang_IO_Dataflow PostCommit`| [![.github/workflows/beam_PostCommit_Python_Xlang_IO_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Xlang_IO_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Xlang_IO_Dataflow.yml) | -| [ PostCommit Sickbay Python ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Sickbay_Python.yml) | ['3.8','3.9','3.10','3.11'] |`Run Python (matrix_element) PostCommit Sickbay`| [![.github/workflows/beam_PostCommit_Sickbay_Python.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Sickbay_Python.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Sickbay_Python.yml) | -| [ PostCommit SQL ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_SQL.yml) | N/A |`Run SQL PostCommit`| [![.github/workflows/beam_PostCommit_SQL.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_SQL.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_SQL.yml) | -| [ PostCommit TransformService Direct ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_TransformService_Direct.yml) | N/A |`Run TransformService_Direct PostCommit`| [![.github/workflows/beam_PostCommit_TransformService_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_TransformService_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_TransformService_Direct.yml) -| [ PostCommit Website Publish ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Website_Publish.yml) | N/A | N/A | [![.github/workflows/beam_PostCommit_Website_Publish.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Website_Publish.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Website_Publish.yml) | -| [ PostCommit Website Test](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Website_Test.yml) | N/A |`Run Full Website Test`| [![.github/workflows/beam_PostCommit_Website_Test](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Website_Test.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Website_Test.yml) | -| [ PostCommit XVR GoUsingJava Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_GoUsingJava_Dataflow.yml) | N/A |`Run XVR_GoUsingJava_Dataflow PostCommit`| 
[![.github/workflows/beam_PostCommit_XVR_GoUsingJava_Dataflow](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_GoUsingJava_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_GoUsingJava_Dataflow.yml) | -| [ PostCommit XVR Direct ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Direct.yml) | N/A |`Run XVR_Direct PostCommit`| [![.github/workflows/beam_PostCommit_XVR_Direct](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Direct.yml) | -| [ PostCommit XVR Flink ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Flink.yml) | N/A |`Run XVR_Flink PostCommit`| [![.github/workflows/beam_PostCommit_XVR_Flink](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Flink.yml) | -| [ PostCommit XVR JavaUsingPython Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_JavaUsingPython_Dataflow.yml) | N/A |`Run XVR_JavaUsingPython_Dataflow PostCommit`| [![.github/workflows/beam_PostCommit_XVR_JavaUsingPython_Dataflow](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_JavaUsingPython_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_JavaUsingPython_Dataflow.yml) | -| [ PostCommit XVR PythonUsingJava Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_PythonUsingJava_Dataflow.yml) | N/A |`Run XVR_PythonUsingJava_Dataflow PostCommit`| [![.github/workflows/beam_PostCommit_XVR_PythonUsingJava_Dataflow](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_PythonUsingJava_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_PythonUsingJava_Dataflow.yml) | -| [ PostCommit XVR PythonUsingJavaSQL Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_PythonUsingJavaSQL_Dataflow.yml) | N/A |`Run XVR_PythonUsingJavaSQL_Dataflow PostCommit`| [![.github/workflows/beam_PostCommit_XVR_PythonUsingJavaSQL_Dataflow](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_PythonUsingJavaSQL_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_PythonUsingJavaSQL_Dataflow.yml) | -| [ PostCommit XVR Samza ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Samza.yml) | N/A |`Run XVR_Samza PostCommit`| [![.github/workflows/beam_PostCommit_XVR_Samza](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Samza.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Samza.yml) | -| [ PostCommit XVR Spark3 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Spark3.yml) | N/A |`Run XVR_Spark3 PostCommit`| [![.github/workflows/beam_PostCommit_XVR_Spark3](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Spark3.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Spark3.yml) | -| [ PreCommit Community Metrics ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_CommunityMetrics.yml) | N/A |`Run CommunityMetrics PreCommit`| 
[![.github/workflows/beam_PreCommit_CommunityMetrics.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_CommunityMetrics.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_CommunityMetrics.yml) | -| [ PreCommit Go ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Go.yml) | N/A |`Run Go PreCommit`| [![.github/workflows/beam_PreCommit_Go.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Go.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Go.yml) | -| [ PreCommit Java ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java.yml) | N/A |`Run Java PreCommit`| [![.github/workflows/beam_PreCommit_Java.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java.yml) | -| [ PreCommit Java Amazon Web Services IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Amazon-Web-Services_IO_Direct.yml) | N/A |`Run Java_Amazon-Web-Services_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Amazon-Web-Services_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Amazon-Web-Services_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Amazon-Web-Services_IO_Direct.yml) | -| [ PreCommit Java Amazon Web Services2 IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Amazon-Web-Services2_IO_Direct.yml) | N/A |`Run Java_Amazon-Web-Services2_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Amazon-Web-Services2_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Amazon-Web-Services2_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Amazon-Web-Services2_IO_Direct.yml) | -| [ PreCommit Java Amqp IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Amqp_IO_Direct.yml) | N/A |`Run Java_Amqp_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Amqp_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Amqp_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Amqp_IO_Direct.yml) | -| [ PreCommit Java Azure IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Azure_IO_Direct.yml) | N/A |`Run Java_Azure_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Azure_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Azure_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Azure_IO_Direct.yml) | -| [ PreCommit Java Cassandra IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Cassandra_IO_Direct.yml) | N/A |`Run Java_Cassandra_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Cassandra_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Cassandra_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Cassandra_IO_Direct.yml) | -| [ PreCommit Java Cdap IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Cdap_IO_Direct.yml) | N/A |`Run Java_Cdap_IO_Direct PreCommit`| 
[![.github/workflows/beam_PreCommit_Java_Cdap_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Cdap_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Cdap_IO_Direct.yml) | -| [ PreCommit Java Clickhouse IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Clickhouse_IO_Direct.yml) | N/A |`Run Java_Clickhouse_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Clickhouse_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Clickhouse_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Clickhouse_IO_Direct.yml) | -| [ PreCommit Java Csv IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Csv_IO_Direct.yml) | N/A |`Run Java_Csv_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Csv_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Csv_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Csv_IO_Direct.yml) | -| [ PreCommit Java Debezium IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Debezium_IO_Direct.yml) | N/A |`Run Java_Debezium_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Debezium_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Debezium_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Debezium_IO_Direct.yml) | -| [ PreCommit Java ElasticSearch IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_ElasticSearch_IO_Direct.yml) | N/A |`Run Java_ElasticSearch_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_ElasticSearch_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_ElasticSearch_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_ElasticSearch_IO_Direct.yml) | -| [ PreCommit Java Examples Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Examples_Dataflow.yml) | N/A |`Run Java_Examples_Dataflow PreCommit`| [![.github/workflows/beam_PreCommit_Java_Examples_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Examples_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Examples_Dataflow.yml) | -| [ PreCommit Java Flink Versions ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Flink_Versions.yml) | N/A |`Run Java_Flink_Versions PreCommit`| [![.github/workflows/beam_PreCommit_Java_Flink_Versions.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Flink_Versions.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Flink_Versions.yml) | -| [ PreCommit Java GCP IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_GCP_IO_Direct.yml) | N/A |`Run Java_GCP_IO_Direct PreCommit`| [![.github\workflows\beam_PreCommit_Java_GCP_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_GCP_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_GCP_IO_Direct.yml) | -| [ PreCommit Java Examples Dataflow Java11 
](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Examples_Dataflow_Java11.yml) | N/A | `Run Java_Examples_Dataflow_Java11 PreCommit` | [![.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java11.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Examples_Dataflow_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Examples_Dataflow_Java11.yml) | -| [ PreCommit Java Examples Dataflow Java17 ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Examples_Dataflow_Java17.yml) | N/A | `Run Java_Examples_Dataflow_Java17 PreCommit` | [![.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java17.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Examples_Dataflow_Java17.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Examples_Dataflow_Java17.yml) | -| [ PreCommit Java File-schema-transform IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_File-schema-transform_IO_Direct.yml) | N/A |`Run Java_File-schema-transform_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_File-schema-transform_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_File-schema-transform_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_File-schema-transform_IO_Direct.yml) | -| [ PreCommit Java Hadoop IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Hadoop_IO_Direct.yml) | N/A |`Run Java_Hadoop_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Hadoop_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Hadoop_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Hadoop_IO_Direct.yml) | -| [ PreCommit Java HBase IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_HBase_IO_Direct.yml) | N/A |`Run Java_HBase_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_HBase_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_HBase_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_HBase_IO_Direct.yml) | -| [ PreCommit Java HCatalog IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_HCatalog_IO_Direct.yml) | N/A |`Run Java_HCatalog_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_HCatalog_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_HCatalog_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_HCatalog_IO_Direct.yml) | -| [ PreCommit Java Kafka IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Kafka_IO_Direct.yml) | N/A |`Run Java_Kafka_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Kafka_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Kafka_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Kafka_IO_Direct.yml) | -| [ PreCommit Java InfluxDb IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_InfluxDb_IO_Direct.yml) | N/A |`Run Java_InfluxDb_IO_Direct PreCommit`| 
[![.github/workflows/beam_PreCommit_Java_InfluxDb_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_InfluxDb_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_InfluxDb_IO_Direct.yml) | +| [ PreCommit Community Metrics ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_CommunityMetrics.yml) | N/A |`Run CommunityMetrics PreCommit`| [![.github/workflows/beam_PreCommit_CommunityMetrics.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_CommunityMetrics.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_CommunityMetrics.yml?query=event%3Aschedule) | +| [ PreCommit GHA ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_GHA.yml) | N/A |`Run GHA PreCommit`| [![.github/workflows/beam_PreCommit_GHA.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_GHA.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_GHA.yml?query=event%3Aschedule) | +| [ PreCommit Go ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Go.yml) | N/A |`Run Go PreCommit`| [![.github/workflows/beam_PreCommit_Go.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Go.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Go.yml?query=event%3Aschedule) | +| [ PreCommit GoPortable ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_GoPortable.yml) | N/A |`Run GoPortable PreCommit`| [![.github/workflows/beam_PreCommit_GoPortable.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_GoPortable.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_GoPortable.yml?query=event%3Aschedule) | +| [ PreCommit Java ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java.yml) | N/A |`Run Java PreCommit`| [![.github/workflows/beam_PreCommit_Java.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java.yml?query=event%3Aschedule) | +| [ PreCommit Java Amazon Web Services IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Amazon-Web-Services_IO_Direct.yml) | N/A |`Run Java_Amazon-Web-Services_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Amazon-Web-Services_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Amazon-Web-Services_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Amazon-Web-Services_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Amazon Web Services2 IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Amazon-Web-Services2_IO_Direct.yml) | N/A |`Run Java_Amazon-Web-Services2_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Amazon-Web-Services2_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Amazon-Web-Services2_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Amazon-Web-Services2_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Amqp IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Amqp_IO_Direct.yml) | N/A |`Run Java_Amqp_IO_Direct PreCommit`| 
[![.github/workflows/beam_PreCommit_Java_Amqp_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Amqp_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Amqp_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Azure IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Azure_IO_Direct.yml) | N/A |`Run Java_Azure_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Azure_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Azure_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Azure_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Cassandra IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Cassandra_IO_Direct.yml) | N/A |`Run Java_Cassandra_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Cassandra_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Cassandra_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Cassandra_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Cdap IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Cdap_IO_Direct.yml) | N/A |`Run Java_Cdap_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Cdap_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Cdap_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Cdap_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Clickhouse IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Clickhouse_IO_Direct.yml) | N/A |`Run Java_Clickhouse_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Clickhouse_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Clickhouse_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Clickhouse_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Csv IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Csv_IO_Direct.yml) | N/A |`Run Java_Csv_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Csv_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Csv_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Csv_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Debezium IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Debezium_IO_Direct.yml) | N/A |`Run Java_Debezium_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Debezium_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Debezium_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Debezium_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java ElasticSearch IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_ElasticSearch_IO_Direct.yml) | N/A |`Run Java_ElasticSearch_IO_Direct PreCommit`| 
[![.github/workflows/beam_PreCommit_Java_ElasticSearch_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_ElasticSearch_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_ElasticSearch_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Examples Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Examples_Dataflow.yml) | N/A |`Run Java_Examples_Dataflow PreCommit`| [![.github/workflows/beam_PreCommit_Java_Examples_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Examples_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Examples_Dataflow.yml?query=event%3Aschedule) | +| [ PreCommit Java Examples Dataflow Java11 ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Examples_Dataflow_Java11.yml) | N/A | `Run Java_Examples_Dataflow_Java11 PreCommit` | [![.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java11.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Examples_Dataflow_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Examples_Dataflow_Java11.yml?query=event%3Aschedule) | +| [ PreCommit Java Examples Dataflow Java17 ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Examples_Dataflow_Java17.yml) | N/A | `Run Java_Examples_Dataflow_Java17 PreCommit` | [![.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java17.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Examples_Dataflow_Java17.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Examples_Dataflow_Java17.yml?query=event%3Aschedule) | +| [ PreCommit Java File-schema-transform IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_File-schema-transform_IO_Direct.yml) | N/A |`Run Java_File-schema-transform_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_File-schema-transform_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_File-schema-transform_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_File-schema-transform_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Flink Versions ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Flink_Versions.yml) | N/A |`Run Java_Flink_Versions PreCommit`| [![.github/workflows/beam_PreCommit_Java_Flink_Versions.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Flink_Versions.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Flink_Versions.yml?query=event%3Aschedule) | +| [ PreCommit Java GCP IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_GCP_IO_Direct.yml) | N/A |`Run Java_GCP_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_GCP_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_GCP_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_GCP_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Google-ads IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Google-ads_IO_Direct.yml) | N/A |`Run Java_Google-ads_IO_Direct PreCommit`| 
[![.github/workflows/beam_PreCommit_Java_Google-ads_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Google-ads_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Google-ads_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Hadoop IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Hadoop_IO_Direct.yml) | N/A |`Run Java_Hadoop_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Hadoop_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Hadoop_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Hadoop_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java HBase IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_HBase_IO_Direct.yml) | N/A |`Run Java_HBase_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_HBase_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_HBase_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_HBase_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java HCatalog IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_HCatalog_IO_Direct.yml) | N/A |`Run Java_HCatalog_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_HCatalog_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_HCatalog_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_HCatalog_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Kafka IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Kafka_IO_Direct.yml) | N/A |`Run Java_Kafka_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Kafka_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Kafka_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Kafka_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java InfluxDb IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_InfluxDb_IO_Direct.yml) | N/A |`Run Java_InfluxDb_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_InfluxDb_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_InfluxDb_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_InfluxDb_IO_Direct.yml?query=event%3Aschedule) | | [ PreCommit Java IOs Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_IOs_Direct.yml) | N/A |`Run Java_IOs_Direct PreCommit`| N/A | -| [ PreCommit Java JDBC IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_JDBC_IO_Direct.yml) | N/A |`Run Java_JDBC_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_JDBC_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_JDBC_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_JDBC_IO_Direct.yml) | -| [ PreCommit Java Jms IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Jms_IO_Direct.yml) | N/A |`Run Java_Jms_IO_Direct PreCommit`| 
[![.github/workflows/beam_PreCommit_Java_Jms_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Jms_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Jms_IO_Direct.yml) | -| [ PreCommit Java Kinesis IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Kinesis_IO_Direct.yml) | N/A |`Run Java_Kinesis_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Kinesis_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Kinesis_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Kinesis_IO_Direct.yml) | -| [ PreCommit Java Kudu IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Kudu_IO_Direct.yml) | N/A |`Run Java_Kudu_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Kudu_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Kudu_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Kudu_IO_Direct.yml) | -| [ PreCommit Java MongoDb IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_MongoDb_IO_Direct.yml) | N/A |`Run Java_MongoDb_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_MongoDb_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_MongoDb_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_MongoDb_IO_Direct.yml) | -| [ PreCommit Java Mqtt IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Mqtt_IO_Direct.yml) | N/A |`Run Java_Mqtt_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Mqtt_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Mqtt_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Mqtt_IO_Direct.yml) | -| [ PreCommit Java Neo4j IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Neo4j_IO_Direct.yml) | N/A |`Run Java_Neo4j_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Neo4j_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Neo4j_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Neo4j_IO_Direct.yml) | -| [ PreCommit Java Parquet IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Parquet_IO_Direct.yml) | N/A |`Run Java_Parquet_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Parquet_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Parquet_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Parquet_IO_Direct.yml) | -| [ PreCommit Java Pulsar IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Pulsar_IO_Direct.yml) | N/A |`Run Java_Pulsar_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Pulsar_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Pulsar_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Pulsar_IO_Direct.yml) | -| [ PreCommit Java PVR Flink Batch ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_PVR_Flink_Batch.yml) | N/A |`Run 
Java_PVR_Flink_Batch PreCommit`| [![.github/workflows/beam_PreCommit_Java_PVR_Flink_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_PVR_Flink_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_PVR_Flink_Batch.yml) | -| [ PreCommit Java PVR Flink Docker ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_PVR_Flink_Docker.yml) | N/A |`Run Java_PVR_Flink_Docker PreCommit`| [![.github/workflows/beam_PreCommit_Java_PVR_Flink_Docker.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_PVR_Flink_Docker.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_PVR_Flink_Docker.yml) | -| [ PreCommit Java RabbitMq IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_RabbitMq_IO_Direct.yml) | N/A |`Run Java_RabbitMq_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_RabbitMq_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_RabbitMq_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_RabbitMq_IO_Direct.yml) | -| [ PreCommit Java Redis IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Redis_IO_Direct.yml) | N/A |`Run Java_Redis_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Redis_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Redis_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Redis_IO_Direct.yml) | -| [ PreCommit Java SingleStore IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_SingleStore_IO_Direct.yml) | N/A |`Run Java_SingleStore_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_SingleStore_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_SingleStore_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_SingleStore_IO_Direct.yml) | -| [ PreCommit Java Snowflake IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Snowflake_IO_Direct.yml) | N/A |`Run Java_Snowflake_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Snowflake_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Snowflake_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Snowflake_IO_Direct.yml) | -| [ PreCommit Java Solr IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Solr_IO_Direct.yml) | N/A |`Run Java_Solr_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Solr_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Solr_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Solr_IO_Direct.yml) | -| [ PreCommit Java Spark3 Versions ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Spark3_Versions.yml) | N/A | `Run Java_Spark3_Versions PreCommit` | [![.github/workflows/beam_PreCommit_Java_Spark3_Versions.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Spark3_Versions.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Spark3_Versions.yml) | -| [ PreCommit Java Splunk IO Direct 
](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Splunk_IO_Direct.yml) | N/A |`Run Java_Splunk_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Splunk_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Splunk_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Splunk_IO_Direct.yml) | -| [ PreCommit Java Thrift IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Thrift_IO_Direct.yml) | N/A |`Run Java_Thrift_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Thrift_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Thrift_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Thrift_IO_Direct.yml) | -| [ PreCommit Java Tika IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Tika_IO_Direct.yml) | N/A |`Run Java_Tika_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Tika_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Tika_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Tika_IO_Direct.yml) | -| [ PreCommit Python ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python.yml) | ['3.8','3.9','3.10','3.11'] | `Run Python PreCommit (matrix_element)` | [![.github/workflows/beam_PreCommit_Python.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python.yml) | -| [ PreCommit Python Coverage ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Coverage.yml) | N/A | `Run Python_Coverage PreCommit`| [![.github/workflows/beam_PreCommit_Python_Coverage.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Coverage.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Coverage.yml) | -| [ PreCommit Python Dataframes ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Dataframes.yml) | ['3.8','3.9','3.10','3.11'] | `Run Python_Dataframes PreCommit (matrix_element)`| [![.github/workflows/beam_PreCommit_Python_Dataframes.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Dataframes.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Dataframes.yml) | -| [ PreCommit Python Docker ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonDocker.yml) | ['3.8','3.9','3.10','3.11'] | `Run PythonDocker PreCommit (matrix_element)`| [![.github/workflows/beam_PreCommit_PythonDocker.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonDocker.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonDocker.yml) | -| [ PreCommit Python Docs ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonDocs.yml) | N/A | `Run PythonDocs PreCommit`| [![.github/workflows/beam_PreCommit_PythonDocs.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonDocs.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonDocs.yml) | -| [ PreCommit Python Examples ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Examples.yml) | ['3.8','3.9','3.10','3.11'] | 
`Run Python_Examples PreCommit (matrix_element)` | [![.github/workflows/beam_PreCommit_Python_Examples.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Examples.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Examples.yml) | -| [ PreCommit Python Formatter ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonFormatter.yml) | N/A | `Run PythonFormatter PreCommit`| [![.github/workflows/beam_PreCommit_PythonFormatter.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonFormatter.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonFormatter.yml) | -| [ PreCommit Python Integration](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Integration.yml) | ['3.8','3.11'] | `Run Python_Integration PreCommit (matrix_element)` | [![.github/workflows/beam_PreCommit_Python_Integration.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Integration.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Integration.yml) | -| [ PreCommit Python Lint ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonLint.yml) | N/A | `Run PythonLint PreCommit` | [![.github/workflows/beam_PreCommit_PythonLint.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonLint.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonLint.yml) | -| [ PreCommit Python PVR Flink ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_PVR_Flink.yml) | N/A | `Run Python_PVR_Flink PreCommit` | [![.github/workflows/beam_PreCommit_Python_PVR_Flink.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_PVR_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_PVR_Flink.yml) | -| [ PreCommit Python Runners ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Runners.yml) | ['3.8','3.9','3.10','3.11'] | `Run Python_Runners PreCommit (matrix_element)`| [![.github/workflows/beam_PreCommit_Python_Runners.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Runners.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Runners.yml) | -| [ PreCommit Python Transforms ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Transforms.yml) | ['3.8','3.9','3.10','3.11'] | `Run Python_Transforms PreCommit (matrix_element)`| [![.github/workflows/beam_PreCommit_Python_Transforms.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Transforms.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Transforms.yml) | -| [ PreCommit RAT ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_RAT.yml) | N/A | `Run RAT PreCommit` | [![.github/workflows/beam_PreCommit_RAT.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_RAT.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_RAT.yml) | -| [ PreCommit Spotless ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Spotless.yml) | N/A | `Run Spotless PreCommit` | 
[![.github/workflows/beam_PreCommit_Spotless.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Spotless.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Spotless.yml) | -| [ PreCommit SQL ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_SQL.yml) | N/A |`Run SQL PreCommit`| [![.github/workflows/beam_PreCommit_SQL.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_SQL.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_SQL.yml) | -| [ PreCommit SQL Java11 ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_SQL_Java11.yml) | N/A |`Run SQL_Java11 PreCommit`| [![.github/workflows/beam_PreCommit_SQL_Java11.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_SQL_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_SQL_Java11.yml) | -| [ PreCommit SQL Java17 ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_SQL_Java17.yml) | N/A |`Run SQL_Java17 PreCommit`| [![.github/workflows/beam_PreCommit_SQL_Java17.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_SQL_Java17.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_SQL_Java17.yml) | -| [ PreCommit Typescript ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Typescript.yml) | N/A |`Run Typescript PreCommit`| [![.github/workflows/beam_PreCommit_Typescript.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Typescript.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Typescript.yml) | -| [ PreCommit Website ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Website.yml) | N/A |`Run Website PreCommit`| [![.github/workflows/beam_PreCommit_Website.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Website.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Website.yml) | -| [ PreCommit Website Stage GCS ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Website_Stage_GCS.yml) | N/A |`Run Website_Stage_GCS PreCommit`| [![PreCommit Website Stage GCS](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Website_Stage_GCS.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Website_Stage_GCS.yml) | -| [ PreCommit Whitespace ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Whitespace.yml) | N/A |`Run Whitespace PreCommit`| [![.github/workflows/beam_PreCommit_Whitespace.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Whitespace.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Whitespace.yml) | -| [ Python Validates Container Dataflow ARM ](https://github.com/apache/beam/actions/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml) | ['3.8','3.9','3.10','3.11'] | `Run Python ValidatesContainer Dataflow ARM (matrix_element)`| [![.github/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml](https://github.com/apache/beam/actions/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml) | -| [ PreCommit GoPortable ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_GoPortable.yml) | N/A |`Run GoPortable PreCommit`| 
[![.github/workflows/beam_PreCommit_GoPortable.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_GoPortable.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_GoPortable.yml) | -| [ PreCommit Kotlin Examples ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Kotlin_Examples.yml) | N/A | `Run Kotlin_Examples PreCommit` | [![.github/workflows/beam_PreCommit_Kotlin_Examples.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Kotlin_Examples.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Kotlin_Examples.yml) | -| [ PreCommit Portable Python ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Portable_Python.yml) | ['3.8','3.11'] | `Run Portable_Python PreCommit` | [![.github/workflows/beam_PreCommit_Portable_Python.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Portable_Python.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Portable_Python.yml) | -| [ Cancel Stale Dataflow Jobs ](https://github.com/apache/beam/actions/workflows/beam_CancelStaleDataflowJobs.yml) | N/A | `Run Cancel Stale Dataflow Jobs` | [![.github/workflows/beam_CancelStaleDataflowJobs.yml](https://github.com/apache/beam/actions/workflows/beam_CancelStaleDataflowJobs.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_CancelStaleDataflowJobs.yml) | -| [ Clean Up GCP Resources ](https://github.com/apache/beam/actions/workflows/beam_CleanUpGCPResources.yml) | N/A | `Run Clean GCP Resources` | [![.github/workflows/beam_CleanUpGCPResources.yml](https://github.com/apache/beam/actions/workflows/beam_CleanUpGCPResources.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_CleanUpGCPResources.yml) | -| [ Clean Up Prebuilt SDK Images ](https://github.com/apache/beam/actions/workflows/beam_CleanUpPrebuiltSDKImages.yml) | N/A | `Run Clean Prebuilt Images` | [![.github/workflows/beam_beam_CleanUpPrebuiltSDKImages.yml](https://github.com/apache/beam/actions/workflows/beam_CleanUpPrebuiltSDKImages.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_CleanUpPrebuiltSDKImages.yml) | +| [ PreCommit Java JDBC IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_JDBC_IO_Direct.yml) | N/A |`Run Java_JDBC_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_JDBC_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_JDBC_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_JDBC_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Jms IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Jms_IO_Direct.yml) | N/A |`Run Java_Jms_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Jms_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Jms_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Jms_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Kinesis IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Kinesis_IO_Direct.yml) | N/A |`Run Java_Kinesis_IO_Direct PreCommit`| 
[![.github/workflows/beam_PreCommit_Java_Kinesis_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Kinesis_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Kinesis_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Kudu IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Kudu_IO_Direct.yml) | N/A |`Run Java_Kudu_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Kudu_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Kudu_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Kudu_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java MongoDb IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_MongoDb_IO_Direct.yml) | N/A |`Run Java_MongoDb_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_MongoDb_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_MongoDb_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_MongoDb_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Mqtt IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Mqtt_IO_Direct.yml) | N/A |`Run Java_Mqtt_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Mqtt_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Mqtt_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Mqtt_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Neo4j IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Neo4j_IO_Direct.yml) | N/A |`Run Java_Neo4j_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Neo4j_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Neo4j_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Neo4j_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Parquet IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Parquet_IO_Direct.yml) | N/A |`Run Java_Parquet_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Parquet_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Parquet_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Parquet_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Pulsar IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Pulsar_IO_Direct.yml) | N/A |`Run Java_Pulsar_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Pulsar_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Pulsar_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Pulsar_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java PVR Flink Batch ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_PVR_Flink_Batch.yml) | N/A |`Run Java_PVR_Flink_Batch PreCommit`| 
[![.github/workflows/beam_PreCommit_Java_PVR_Flink_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_PVR_Flink_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_PVR_Flink_Batch.yml?query=event%3Aschedule) | +| [ PreCommit Java PVR Flink Docker ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_PVR_Flink_Docker.yml) | N/A |`Run Java_PVR_Flink_Docker PreCommit`| [![.github/workflows/beam_PreCommit_Java_PVR_Flink_Docker.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_PVR_Flink_Docker.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_PVR_Flink_Docker.yml?query=event%3Aschedule) | +| [ PreCommit Java RabbitMq IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_RabbitMq_IO_Direct.yml) | N/A |`Run Java_RabbitMq_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_RabbitMq_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_RabbitMq_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_RabbitMq_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Redis IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Redis_IO_Direct.yml) | N/A |`Run Java_Redis_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Redis_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Redis_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Redis_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java RequestResponse IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_RequestResponse_IO_Direct.yml) | N/A |`Run Java_RequestResponse_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_RequestResponse_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_RequestResponse_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_RequestResponse_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java SingleStore IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_SingleStore_IO_Direct.yml) | N/A |`Run Java_SingleStore_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_SingleStore_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_SingleStore_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_SingleStore_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Snowflake IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Snowflake_IO_Direct.yml) | N/A |`Run Java_Snowflake_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Snowflake_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Snowflake_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Snowflake_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Solr IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Solr_IO_Direct.yml) | N/A |`Run Java_Solr_IO_Direct PreCommit`| 
[![.github/workflows/beam_PreCommit_Java_Solr_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Solr_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Solr_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Spark3 Versions ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Spark3_Versions.yml) | N/A | `Run Java_Spark3_Versions PreCommit` | [![.github/workflows/beam_PreCommit_Java_Spark3_Versions.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Spark3_Versions.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Spark3_Versions.yml?query=event%3Aschedule) | +| [ PreCommit Java Splunk IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Splunk_IO_Direct.yml) | N/A |`Run Java_Splunk_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Splunk_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Splunk_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Splunk_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Thrift IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Thrift_IO_Direct.yml) | N/A |`Run Java_Thrift_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Thrift_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Thrift_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Thrift_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Java Tika IO Direct ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Tika_IO_Direct.yml) | N/A |`Run Java_Tika_IO_Direct PreCommit`| [![.github/workflows/beam_PreCommit_Java_Tika_IO_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Tika_IO_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Java_Tika_IO_Direct.yml?query=event%3Aschedule) | +| [ PreCommit Kotlin Examples ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Kotlin_Examples.yml) | N/A | `Run Kotlin_Examples PreCommit` | [![.github/workflows/beam_PreCommit_Kotlin_Examples.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Kotlin_Examples.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Kotlin_Examples.yml?query=event%3Aschedule) | +| [ PreCommit Portable Python ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Portable_Python.yml) | ['3.8','3.11'] | `Run Portable_Python PreCommit` | [![.github/workflows/beam_PreCommit_Portable_Python.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Portable_Python.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Portable_Python.yml?query=event%3Aschedule) | +| [ PreCommit Python ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python.yml) | ['3.8','3.9','3.10','3.11'] | `Run Python PreCommit (matrix_element)` | [![.github/workflows/beam_PreCommit_Python.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python.yml?query=event%3Aschedule) | +| [ PreCommit Python Coverage 
](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Coverage.yml) | N/A | `Run Python_Coverage PreCommit`| [![.github/workflows/beam_PreCommit_Python_Coverage.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Coverage.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Coverage.yml?query=event%3Aschedule) | +| [ PreCommit Python Dataframes ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Dataframes.yml) | ['3.8','3.9','3.10','3.11'] | `Run Python_Dataframes PreCommit (matrix_element)`| [![.github/workflows/beam_PreCommit_Python_Dataframes.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Dataframes.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Dataframes.yml?query=event%3Aschedule) | +| [ PreCommit Python Docker ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonDocker.yml) | ['3.8','3.9','3.10','3.11'] | `Run PythonDocker PreCommit (matrix_element)`| [![.github/workflows/beam_PreCommit_PythonDocker.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonDocker.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonDocker.yml?query=event%3Aschedule) | +| [ PreCommit Python Docs ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonDocs.yml) | N/A | `Run PythonDocs PreCommit`| [![.github/workflows/beam_PreCommit_PythonDocs.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonDocs.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonDocs.yml?query=event%3Aschedule) | +| [ PreCommit Python Examples ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Examples.yml) | ['3.8','3.9','3.10','3.11'] | `Run Python_Examples PreCommit (matrix_element)` | [![.github/workflows/beam_PreCommit_Python_Examples.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Examples.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Examples.yml?query=event%3Aschedule) | +| [ PreCommit Python Formatter ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonFormatter.yml) | N/A | `Run PythonFormatter PreCommit`| [![.github/workflows/beam_PreCommit_PythonFormatter.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonFormatter.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonFormatter.yml?query=event%3Aschedule) | +| [ PreCommit Python Integration](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Integration.yml) | ['3.8','3.11'] | `Run Python_Integration PreCommit (matrix_element)` | [![.github/workflows/beam_PreCommit_Python_Integration.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Integration.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Integration.yml?query=event%3Aschedule) | +| [ PreCommit Python Lint ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonLint.yml) | N/A | `Run PythonLint PreCommit` | 
[![.github/workflows/beam_PreCommit_PythonLint.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonLint.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_PythonLint.yml?query=event%3Aschedule) | +| [ PreCommit Python PVR Flink ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_PVR_Flink.yml) | N/A | `Run Python_PVR_Flink PreCommit` | [![.github/workflows/beam_PreCommit_Python_PVR_Flink.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_PVR_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_PVR_Flink.yml?query=event%3Aschedule) | +| [ PreCommit Python Runners ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Runners.yml) | ['3.8','3.9','3.10','3.11'] | `Run Python_Runners PreCommit (matrix_element)`| [![.github/workflows/beam_PreCommit_Python_Runners.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Runners.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Runners.yml?query=event%3Aschedule) | +| [ PreCommit Python Transforms ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Transforms.yml) | ['3.8','3.9','3.10','3.11'] | `Run Python_Transforms PreCommit (matrix_element)`| [![.github/workflows/beam_PreCommit_Python_Transforms.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Transforms.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Python_Transforms.yml?query=event%3Aschedule) | +| [ PreCommit RAT ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_RAT.yml) | N/A | `Run RAT PreCommit` | [![.github/workflows/beam_PreCommit_RAT.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_RAT.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_RAT.yml?query=event%3Aschedule) | +| [ PreCommit Spotless ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Spotless.yml) | N/A | `Run Spotless PreCommit` | [![.github/workflows/beam_PreCommit_Spotless.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Spotless.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Spotless.yml?query=event%3Aschedule) | +| [ PreCommit SQL ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_SQL.yml) | N/A |`Run SQL PreCommit`| [![.github/workflows/beam_PreCommit_SQL.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_SQL.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_SQL.yml?query=event%3Aschedule) | +| [ PreCommit SQL Java11 ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_SQL_Java11.yml) | N/A |`Run SQL_Java11 PreCommit`| [![.github/workflows/beam_PreCommit_SQL_Java11.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_SQL_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_SQL_Java11.yml?query=event%3Aschedule) | +| [ PreCommit SQL Java17 ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_SQL_Java17.yml) | N/A |`Run SQL_Java17 PreCommit`| 
[![.github/workflows/beam_PreCommit_SQL_Java17.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_SQL_Java17.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_SQL_Java17.yml?query=event%3Aschedule) | +| [ PreCommit Typescript ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Typescript.yml) | N/A |`Run Typescript PreCommit`| [![.github/workflows/beam_PreCommit_Typescript.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Typescript.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Typescript.yml?query=event%3Aschedule) | +| [ PreCommit Website ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Website.yml) | N/A |`Run Website PreCommit`| [![.github/workflows/beam_PreCommit_Website.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Website.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Website.yml?query=event%3Aschedule) | +| [ PreCommit Website Stage GCS ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Website_Stage_GCS.yml) | N/A |`Run Website_Stage_GCS PreCommit`| [![.github/workflows/beam_PreCommit_Website_Stage_GCS.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Website_Stage_GCS.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Website_Stage_GCS.yml?query=event%3Aschedule) | +| [ PreCommit Whitespace ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Whitespace.yml) | N/A |`Run Whitespace PreCommit`| [![.github/workflows/beam_PreCommit_Whitespace.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Whitespace.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Whitespace.yml?query=event%3Aschedule) | + +### PostCommit Jobs + +| Workflow name | Matrix | Trigger Phrase | Cron Status | +|:-------------:|:------:|:--------------:|:-----------:| +| [ PostCommit BeamMetrics Publish ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_BeamMetrics_Publish.yml) | N/A |`Run Beam Metrics Deployment`| [![.github/workflows/beam_PostCommit_BeamMetrics_Publish.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_BeamMetrics_Publish.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_BeamMetrics_Publish.yml?query=event%3Aschedule) +| [ PostCommit Go ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go.yml) | N/A |`Run Go PostCommit`| [![.github/workflows/beam_PostCommit_Go.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go.yml?query=event%3Aschedule) | +| [ PostCommit Go Dataflow ARM](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_Dataflow_ARM.yml) | N/A |`Run Go PostCommit Dataflow ARM`| [![.github/workflows/beam_PostCommit_Go_Dataflow_ARM.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_Dataflow_ARM.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_Dataflow_ARM.yml?query=event%3Aschedule) | +| [ PostCommit Go VR Flink](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Flink.yml) | N/A |`Run Go Flink ValidatesRunner`| 
[![.github/workflows/beam_PostCommit_Go_VR_Flink.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Flink.yml?query=event%3Aschedule) | +| [ PostCommit Go VR Samza](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Samza.yml) | N/A |`Run Go Samza ValidatesRunner`| [![.github/workflows/beam_PostCommit_Go_VR_Samza.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Samza.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Samza.yml?query=event%3Aschedule) | +| [ PostCommit Go VR Spark](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Spark.yml) | N/A |`Run Go Spark ValidatesRunner`| [![.github/workflows/beam_PostCommit_Go_VR_Spark.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Spark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Spark.yml?query=event%3Aschedule) | +| [ PostCommit Java Avro Versions ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Avro_Versions.yml) | N/A |`Run Java Avro Versions PostCommit`| [![.github/workflows/beam_PostCommit_Java_Avro_Versions.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Avro_Versions.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Avro_Versions.yml?query=event%3Aschedule) | +| [ PostCommit Java Dataflow V1 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_DataflowV1.yml) | N/A |`Run PostCommit_Java_Dataflow`| [![.github/workflows/beam_PostCommit_Java_DataflowV1.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_DataflowV1.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_DataflowV1.yml?query=event%3Aschedule) | +| [ PostCommit Java Dataflow V2 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_DataflowV2.yml) | N/A |`Run PostCommit_Java_DataflowV2`| [![.github/workflows/beam_PostCommit_Java_DataflowV2.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_DataflowV2.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_DataflowV2.yml?query=event%3Aschedule) | +| [ PostCommit Java Examples Dataflow ARM ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml) | ['8','11','17','21'] |`Run Java_Examples_Dataflow_ARM PostCommit (matrix_element)`| [![.github/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml?query=event%3Aschedule) | +| [ PostCommit Java Examples Dataflow](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow.yml) | N/A |`Run Java examples on Dataflow`| [![.github/workflows/beam_PostCommit_Java_Examples_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow.yml?query=event%3Aschedule) | +| [ PostCommit Java Examples Dataflow Java 
](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_Java.yml) | ['11','17','21'] |`Run Java examples on Dataflow Java (matrix_element)`| [![.github/workflows/beam_PostCommit_Java_Examples_Dataflow_Java.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_Java.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_Java.yml?query=event%3Aschedule) | +| [ PostCommit Java Examples Dataflow V2 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_V2.yml) | N/A |`Run Java Examples on Dataflow Runner V2`| [![.github/workflows/beam_PostCommit_Java_Examples_Dataflow_V2.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_V2.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_V2.yml?query=event%3Aschedule) | +| [ PostCommit Java Examples Dataflow V2 Java ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_V2_Java.yml) | ['11','17','21'] |`Run Java (matrix_element) Examples on Dataflow Runner V2`| [![.github/workflows/beam_PostCommit_Java_Examples_Dataflow_V2_Java.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_V2_Java.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_V2_Java.yml?query=event%3Aschedule) | +| [ PostCommit Java Examples Direct ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Direct.yml) | N/A |`Run Java Examples_Direct`| [![.github/workflows/beam_PostCommit_Java_Examples_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Direct.yml?query=event%3Aschedule) | +| [ PostCommit Java Examples Flink ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Flink.yml) | N/A |`Run Java Examples_Flink`| [![.github/workflows/beam_PostCommit_Java_Examples_Flink.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Flink.yml?query=event%3Aschedule) | +| [ PostCommit Java Examples Spark ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Spark.yml) | N/A |`Run Java Examples_Spark`| [![.github/workflows/beam_PostCommit_Java_Examples_Spark.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Spark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Spark.yml?query=event%3Aschedule) | +| [ PostCommit Java Hadoop Versions ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Hadoop_Versions.yml) | N/A |`Run PostCommit_Java_Hadoop_Versions`| [![.github/workflows/beam_PostCommit_Java_Hadoop_Versions.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Hadoop_Versions.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Hadoop_Versions.yml?query=event%3Aschedule) | +| [ PostCommit Java InfluxDbIO Integration Test ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_InfluxDbIO_IT.yml) | N/A |`Run Java 
InfluxDbIO_IT`| [![.github/workflows/beam_PostCommit_Java_InfluxDbIO_IT.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_InfluxDbIO_IT.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_InfluxDbIO_IT.yml?query=event%3Aschedule) +| [ PostCommit Java Jpms Dataflow Java11 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java11.yml) | N/A |`Run Jpms Dataflow Java 11 PostCommit`| [![.github/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java11.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java11.yml?query=event%3Aschedule) | +| [ PostCommit Java Jpms Dataflow Java17 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java17.yml) | N/A |`Run Jpms Dataflow Java 17 PostCommit`| [![.github/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java17.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java17.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java17.yml?query=event%3Aschedule) | +| [ PostCommit Java Jpms Direct Java11 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java11.yml) | N/A |`Run Jpms Direct Java 11 PostCommit`| [![.github/workflows/beam_PostCommit_Java_Jpms_Direct_Java11.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java11.yml?query=event%3Aschedule) | +| [ PostCommit Java Jpms Direct Java17 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java17.yml) | N/A |`Run Jpms Direct Java 17 PostCommit`| [![.github/workflows/beam_PostCommit_Java_Jpms_Direct_Java17.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java17.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java17.yml?query=event%3Aschedule) | +| [ PostCommit Java Jpms Direct Java21 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java21.yml) | N/A |`Run Jpms Direct Java21 PostCommit`| [![.github/workflows/beam_PostCommit_Java_Jpms_Direct_Java21.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java21.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java21.yml?query=event%3Aschedule) | +| [ PostCommit Java Jpms Flink Java11 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Flink_Java11.yml) | N/A |`Run Jpms Flink Java 11 PostCommit`| [![.github/workflows/beam_PostCommit_Java_Jpms_Flink_Java11.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Flink_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Flink_Java11.yml?query=event%3Aschedule) | +| [ PostCommit Java Jpms Spark Java11 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Spark_Java11.yml) | N/A |`Run Jpms Spark Java 11 PostCommit`| 
[![.github/workflows/beam_PostCommit_Java_Jpms_Spark_Java11.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Spark_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Spark_Java11.yml?query=event%3Aschedule) | +| [ PostCommit Java Nexmark Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Dataflow.yml) | N/A |`Run Dataflow Runner Nexmark Tests`| [![.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Dataflow.yml?query=event%3Aschedule) | +| [ PostCommit Java Nexmark Dataflow V2 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2.yml) | N/A |`Run Dataflow Runner V2 Nexmark Tests`| [![.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2.yml?query=event%3Aschedule) | +| [ PostCommit Java Nexmark Dataflow V2 Java ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2_Java.yml) | ['11','17'] |`Run Dataflow Runner V2 Java (matrix) Nexmark Tests`| [![.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2_Java.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2_Java.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2_Java.yml?query=event%3Aschedule) | +| [ PostCommit Java Nexmark Direct ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Direct.yml) | N/A |`Run Direct Runner Nexmark Tests`| [![.github/workflows/beam_PostCommit_Java_Nexmark_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Direct.yml?query=event%3Aschedule) | +| [ PostCommit Java Nexmark Flink ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Flink.yml) | N/A |`Run Flink Runner Nexmark Tests`| [![.github/workflows/beam_PostCommit_Java_Nexmark_Flink.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Flink.yml?query=event%3Aschedule) | +| [ PostCommit Java Nexmark Spark ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Spark.yml) | N/A |`Run Spark Runner Nexmark Tests`| [![.github/workflows/beam_PostCommit_Java_Nexmark_Spark.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Spark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Nexmark_Spark.yml?query=event%3Aschedule) | +| [ PostCommit Java PVR Flink Streaming ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Flink_Streaming.yml) | N/A |`Run Java Flink PortableValidatesRunner Streaming`| 
[![.github/workflows/beam_PostCommit_Java_PVR_Flink_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Flink_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Flink_Streaming.yml?query=event%3Aschedule) | +| [ PostCommit Java PVR Samza ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Samza.yml) | N/A |`Run Java Samza PortableValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_PVR_Samza.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Samza.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Samza.yml?query=event%3Aschedule) | +| [ PostCommit Java SingleStoreIO IT ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_SingleStoreIO_IT.yml) | N/A |`Run Java SingleStoreIO_IT`| [![.github/workflows/beam_PostCommit_Java_SingleStoreIO_IT.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_SingleStoreIO_IT.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_SingleStoreIO_IT.yml?query=event%3Aschedule) | +| [ PostCommit Java PVR Spark3 Streaming ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Spark3_Streaming.yml) | N/A |`Run Java Spark v3 PortableValidatesRunner Streaming`| [![.github/workflows/beam_PostCommit_Java_PVR_Spark3_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Spark3_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Spark3_Streaming.yml?query=event%3Aschedule) | +| [ PostCommit Java PVR Spark Batch ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Spark_Batch.yml) | N/A |`Run Java Spark PortableValidatesRunner Batch`| [![.github/workflows/beam_PostCommit_Java_PVR_Spark_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Spark_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_PVR_Spark_Batch.yml?query=event%3Aschedule) | +| [ PostCommit Java Sickbay ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Sickbay.yml) | N/A |`Run Java Sickbay`| [![.github/workflows/beam_PostCommit_Java_Sickbay.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Sickbay.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Sickbay.yml?query=event%3Aschedule) | +| [ PostCommit Java Tpcds Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Tpcds_Dataflow.yml) | N/A |`Run Dataflow Runner Tpcds Tests`| [![.github/workflows/beam_PostCommit_Java_Tpcds_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Tpcds_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Tpcds_Dataflow.yml?query=event%3Aschedule) | +| [ PostCommit Java Tpcds Flink ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Tpcds_Flink.yml) | N/A |`Run Flink Runner Tpcds Tests`| [![.github/workflows/beam_PostCommit_Java_Tpcds_Flink.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Tpcds_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Tpcds_Flink.yml?query=event%3Aschedule) | +| [ PostCommit 
Java Tpcds Spark ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Tpcds_Spark.yml) | N/A |`Run Spark Runner Tpcds Tests`| [![.github/workflows/beam_PostCommit_Java_Tpcds_Spark.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Tpcds_Spark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Tpcds_Spark.yml?query=event%3Aschedule) | +| [ PostCommit Java ValidatesRunner Dataflow JavaVersions ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_JavaVersions.yml) | ['11','17'] |`Run Dataflow ValidatesRunner Java (matrix_element)`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_JavaVersions.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_JavaVersions.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_JavaVersions.yml?query=event%3Aschedule) | +| [ PostCommit Java ValidatesRunner Dataflow Streaming ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_Streaming.yml) | N/A |`Run Dataflow Streaming ValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_Streaming.yml?query=event%3Aschedule) | +| [ PostCommit Java ValidatesRunner Dataflow V2 Streaming ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2_Streaming.yml) | N/A |`Run Java Dataflow V2 ValidatesRunner Streaming`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2_Streaming.yml?query=event%3Aschedule) | +| [ PostCommit Java ValidatesRunner Dataflow V2 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2.yml) | N/A |`Run Java Dataflow V2 ValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2.yml?query=event%3Aschedule) | +| [ PostCommit Java ValidatesRunner Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow.yml) | N/A |`Run Dataflow ValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow.yml?query=event%3Aschedule) | +| [ PostCommit Java ValidatesRunner Direct JavaVersions ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Direct_JavaVersions.yml) | ['11','17'] |`Run Direct ValidatesRunner Java (matrix_element)`| 
[![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Direct_JavaVersions.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Direct_JavaVersions.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Direct_JavaVersions.yml?query=event%3Aschedule) | +| [ PostCommit Java ValidatesRunner Direct ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Direct.yml) | N/A |`Run Direct ValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Direct.yml?query=event%3Aschedule) | +| [ PostCommit Java ValidatesRunner Flink Java11 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Flink_Java11.yml) | N/A |`Run Flink ValidatesRunner Java 11`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Flink_Java11.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Flink_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Flink_Java11.yml?query=event%3Aschedule) | +| [ PostCommit Java ValidatesRunner Flink ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Flink.yml) | N/A |`Run Flink ValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Flink.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Flink.yml?query=event%3Aschedule) | +| [ PostCommit Java ValidatesRunner Samza ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Samza.yml) | N/A |`Run Samza ValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Samza.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Samza.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Samza.yml?query=event%3Aschedule) | +| [ PostCommit Java ValidatesRunner Spark Java11 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.yml) | N/A |`Run Spark ValidatesRunner Java 11`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.yml?query=event%3Aschedule) | +| [ PostCommit Java ValidatesRunner Spark ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Spark.yml) | N/A |`Run Spark ValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Spark.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Spark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Spark.yml?query=event%3Aschedule) | +| [ PostCommit Java ValidatesRunner SparkStructuredStreaming 
](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.yml) | N/A |`Run Spark StructuredStreaming ValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.yml?query=event%3Aschedule) | +| [ PostCommit Java ValidatesRunner Twister2 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Twister2.yml) | N/A |`Run Twister2 ValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_Twister2.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Twister2.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_Twister2.yml?query=event%3Aschedule) | +| [ PostCommit Java ValidatesRunner ULR ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_ULR.yml) | N/A |`Run ULR Loopback ValidatesRunner`| [![.github/workflows/beam_PostCommit_Java_ValidatesRunner_ULR.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_ULR.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_ValidatesRunner_ULR.yml?query=event%3Aschedule) | +| [ PostCommit Java ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java.yml) | N/A |`Run Java PostCommit`| [![.github/workflows/beam_PostCommit_Java.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java.yml?query=event%3Aschedule) | +| [ PostCommit Javadoc ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Javadoc.yml) | N/A |`Run Javadoc PostCommit`| [![.github/workflows/beam_PostCommit_Javadoc.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Javadoc.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Javadoc.yml?query=event%3Aschedule) | +| [ PostCommit PortableJar Flink ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_PortableJar_Flink.yml) | N/A |`Run PortableJar_Flink PostCommit`| [![.github/workflows/beam_PostCommit_PortableJar_Flink.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_PortableJar_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_PortableJar_Flink.yml?query=event%3Aschedule) | +| [ PostCommit PortableJar Spark ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_PortableJar_Spark.yml) | N/A |`Run PortableJar_Spark PostCommit`| [![.github/workflows/beam_PostCommit_PortableJar_Spark.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_PortableJar_Spark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_PortableJar_Spark.yml?query=event%3Aschedule) | +| [ PostCommit Python ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python.yml) | ['3.8','3.9','3.10','3.11'] |`Run Python PostCommit (matrix_element)`| 
[![.github/workflows/beam_PostCommit_Python.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python.yml?query=event%3Aschedule) | +| [ PostCommit Python Arm](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Arm.yml) | ['3.8','3.9','3.10','3.11'] |`Run Python PostCommit Arm (matrix_element)`| [![.github/workflows/beam_PostCommit_Python_Arm.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Arm.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Arm.yml?query=event%3Aschedule) | +| [ PostCommit Python Examples Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Dataflow.yml) | N/A |`Run Python Examples_Dataflow`| [![.github/workflows/beam_PostCommit_Python_Examples_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Dataflow.yml?query=event%3Aschedule) | +| [ PostCommit Python Examples Direct ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Direct.yml) | ['3.8','3.9','3.10','3.11'] |`Run Python Examples_Direct (matrix_element)`| [![.github/workflows/beam_PostCommit_Python_Examples_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Direct.yml?query=event%3Aschedule) | +| [ PostCommit Python Examples Flink ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Flink.yml) | ['3.8','3.11'] |`Run Python Examples_Flink (matrix_element)`| [![.github/workflows/beam_PostCommit_Python_Examples_Flink.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Flink.yml?query=event%3Aschedule) | +| [ PostCommit Python Examples Spark ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Spark.yml) | ['3.8','3.11'] |`Run Python Examples_Spark (matrix_element)`| [![.github/workflows/beam_PostCommit_Python_Examples_Spark.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Spark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Spark.yml?query=event%3Aschedule) | +| [ PostCommit Python MongoDBIO IT ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_MongoDBIO_IT.yml) | N/A |`Run Python MongoDBIO_IT`| [![.github/workflows/beam_PostCommit_Python_MongoDBIO_IT.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_MongoDBIO_IT.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_MongoDBIO_IT.yml?query=event%3Aschedule) | +| [ PostCommit Python Nexmark Direct ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Nexmark_Direct.yml) | N/A |`Run Python Direct Runner Nexmark Tests`| 
[![.github/workflows/beam_PostCommit_Python_Nexmark_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Nexmark_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Nexmark_Direct.yml?query=event%3Aschedule) | +| [ PostCommit Python ValidatesContainer Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml) | ['3.8','3.9','3.10','3.11'] |`Run Python Dataflow ValidatesContainer (matrix_element)`| [![.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml?query=event%3Aschedule) | +| [ PostCommit Python ValidatesContainer Dataflow With RC ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml) | ['3.8','3.9','3.10','3.11'] |`Run Python RC Dataflow ValidatesContainer (matrix_element)`| [![.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml?query=event%3Aschedule) | +| [ PostCommit Python ValidatesRunner Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml) | ['3.8','3.11'] |`Run Python Dataflow ValidatesRunner (matrix_element)`| [![.github/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml?query=event%3Aschedule) | +| [ PostCommit Python ValidatesRunner Flink ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml) | ['3.8','3.11'] |`Run Python Flink ValidatesRunner (matrix_element)`| [![.github/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml?query=event%3Aschedule) | +| [ PostCommit Python ValidatesRunner Samza ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml) | ['3.8','3.11'] |`Run Python Samza ValidatesRunner (matrix_element)`| [![.github/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml?query=event%3Aschedule) | +| [ PostCommit Python ValidatesRunner Spark ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Spark.yml) | ['3.8','3.9','3.11'] |`Run Python Spark ValidatesRunner (matrix_element)`| 
[![.github/workflows/beam_PostCommit_Python_ValidatesRunner_Spark.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Spark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_ValidatesRunner_Spark.yml?query=event%3Aschedule) | +| [ PostCommit Python Xlang Gcp Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Xlang_Gcp_Dataflow.yml) | N/A |`Run Python_Xlang_Gcp_Dataflow PostCommit`| [![.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Xlang_Gcp_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Xlang_Gcp_Dataflow.yml?query=event%3Aschedule) | +| [ PostCommit Python Xlang Gcp Direct ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Xlang_Gcp_Direct.yml) | N/A |`Run Python_Xlang_Gcp_Direct PostCommit`| [![.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Xlang_Gcp_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Xlang_Gcp_Direct.yml?query=event%3Aschedule) | +| [ PostCommit Python Xlang IO Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Xlang_IO_Dataflow.yml) | N/A |`Run Python_Xlang_IO_Dataflow PostCommit`| [![.github/workflows/beam_PostCommit_Python_Xlang_IO_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Xlang_IO_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Xlang_IO_Dataflow.yml?query=event%3Aschedule) | +| [ PostCommit Sickbay Python ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Sickbay_Python.yml) | ['3.8','3.9','3.10','3.11'] |`Run Python (matrix_element) PostCommit Sickbay`| [![.github/workflows/beam_PostCommit_Sickbay_Python.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Sickbay_Python.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Sickbay_Python.yml?query=event%3Aschedule) | +| [ PostCommit SQL ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_SQL.yml) | N/A |`Run SQL PostCommit`| [![.github/workflows/beam_PostCommit_SQL.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_SQL.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_SQL.yml?query=event%3Aschedule) | +| [ PostCommit TransformService Direct ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_TransformService_Direct.yml) | N/A |`Run TransformService_Direct PostCommit`| [![.github/workflows/beam_PostCommit_TransformService_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_TransformService_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_TransformService_Direct.yml?query=event%3Aschedule) +| [ PostCommit Website Publish ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Website_Publish.yml) | N/A | N/A | 
[![.github/workflows/beam_PostCommit_Website_Publish.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Website_Publish.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Website_Publish.yml?query=event%3Aschedule) | +| [ PostCommit Website Test](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Website_Test.yml) | N/A |`Run Full Website Test`| [![.github/workflows/beam_PostCommit_Website_Test.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Website_Test.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Website_Test.yml?query=event%3Aschedule) | +| [ PostCommit XVR GoUsingJava Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_GoUsingJava_Dataflow.yml) | N/A |`Run XVR_GoUsingJava_Dataflow PostCommit`| [![.github/workflows/beam_PostCommit_XVR_GoUsingJava_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_GoUsingJava_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_GoUsingJava_Dataflow.yml?query=event%3Aschedule) | +| [ PostCommit XVR Direct ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Direct.yml) | N/A |`Run XVR_Direct PostCommit`| [![.github/workflows/beam_PostCommit_XVR_Direct.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Direct.yml?query=event%3Aschedule) | +| [ PostCommit XVR Flink ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Flink.yml) | N/A |`Run XVR_Flink PostCommit`| [![.github/workflows/beam_PostCommit_XVR_Flink.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Flink.yml?query=event%3Aschedule) | +| [ PostCommit XVR JavaUsingPython Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_JavaUsingPython_Dataflow.yml) | N/A |`Run XVR_JavaUsingPython_Dataflow PostCommit`| [![.github/workflows/beam_PostCommit_XVR_JavaUsingPython_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_JavaUsingPython_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_JavaUsingPython_Dataflow.yml?query=event%3Aschedule) | +| [ PostCommit XVR PythonUsingJava Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_PythonUsingJava_Dataflow.yml) | N/A |`Run XVR_PythonUsingJava_Dataflow PostCommit`| [![.github/workflows/beam_PostCommit_XVR_PythonUsingJava_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_PythonUsingJava_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_PythonUsingJava_Dataflow.yml?query=event%3Aschedule) | +| [ PostCommit XVR PythonUsingJavaSQL Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_PythonUsingJavaSQL_Dataflow.yml) | N/A |`Run XVR_PythonUsingJavaSQL_Dataflow PostCommit`| 
[![.github/workflows/beam_PostCommit_XVR_PythonUsingJavaSQL_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_PythonUsingJavaSQL_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_PythonUsingJavaSQL_Dataflow.yml?query=event%3Aschedule) | +| [ PostCommit XVR Samza ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Samza.yml) | N/A |`Run XVR_Samza PostCommit`| [![.github/workflows/beam_PostCommit_XVR_Samza.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Samza.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Samza.yml?query=event%3Aschedule) | +| [ PostCommit XVR Spark3 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Spark3.yml) | N/A |`Run XVR_Spark3 PostCommit`| [![.github/workflows/beam_PostCommit_XVR_Spark3.yml](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Spark3.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_XVR_Spark3.yml?query=event%3Aschedule) | +| [ Python Validates Container Dataflow ARM ](https://github.com/apache/beam/actions/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml) | ['3.8','3.9','3.10','3.11'] |`Run Python ValidatesContainer Dataflow ARM (matrix_element)`|[![.github/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml](https://github.com/apache/beam/actions/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml?query=event%3Aschedule) | + +### PerformanceTests and Benchmark Jobs + +| Workflow name | Matrix | Trigger Phrase | Cron Status | +|:-------------:|:------:|:--------------:|:-----------:| +| [ CloudML Benchmarks Dataflow ](https://github.com/apache/beam/actions/workflows/beam_CloudML_Benchmarks_Dataflow.yml) | N/A |`Run TFT Criteo Benchmarks`| [![.github/workflows/beam_CloudML_Benchmarks_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_CloudML_Benchmarks_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_CloudML_Benchmarks_Dataflow.yml?query=event%3Aschedule) +| [ Inference Python Benchmarks Dataflow ](https://github.com/apache/beam/actions/workflows/beam_Inference_Python_Benchmarks_Dataflow.yml) | N/A |`Run Inference Benchmarks`| [![.github/workflows/beam_Inference_Python_Benchmarks_Dataflow.yml](https://github.com/apache/beam/actions/workflows/beam_Inference_Python_Benchmarks_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_Inference_Python_Benchmarks_Dataflow.yml?query=event%3Aschedule) +| [ Java JMH ](https://github.com/apache/beam/actions/workflows/beam_Java_JMH.yml) | N/A | N/A | [![.github/workflows/beam_Java_JMH.yml](https://github.com/apache/beam/actions/workflows/beam_Java_JMH.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_Java_JMH.yml?query=event%3Aschedule) +| [ Performance Tests AvroIOIT HDFS ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_AvroIOIT_HDFS.yml) | N/A |`Run Java AvroIO Performance Test HDFS`| 
[![.github/workflows/beam_PerformanceTests_AvroIOIT_HDFS.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_AvroIOIT_HDFS.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_AvroIOIT_HDFS.yml?query=event%3Aschedule) +| [ Performance Tests AvroIOIT ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_AvroIOIT.yml) | N/A |`Run Java AvroIO Performance Test`| [![.github/workflows/beam_PerformanceTests_AvroIOIT.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_AvroIOIT.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_AvroIOIT.yml?query=event%3Aschedule) +| [ Performance Tests BigQueryIO Batch Java Avro ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro.yml) | N/A |`Run BigQueryIO Batch Performance Test Java Avro`| [![.github/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro.yml?query=event%3Aschedule) +| [ Performance Tests BigQueryIO Batch Java Json ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json.yml) | N/A |`Run BigQueryIO Batch Performance Test Java Json`| [![.github/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json.yml?query=event%3Aschedule) +| [ Performance Tests BigQueryIO Streaming Java ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java.yml) | N/A |`Run BigQueryIO Streaming Performance Test Java`| [![.github/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java.yml?query=event%3Aschedule) +| [ Performance Tests BigQueryIO Read Python ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BiqQueryIO_Read_Python.yml) | N/A |`Run BigQueryIO Read Performance Test Python`| [![.github/workflows/beam_PerformanceTests_BiqQueryIO_Read_Python.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BiqQueryIO_Read_Python.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BiqQueryIO_Read_Python.yml?query=event%3Aschedule) +| [ Performance Tests BigQueryIO Write Python Batch ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch.yml) | N/A |`Run BigQueryIO Write Performance Test Python`| [![.github/workflows/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch.yml?query=event%3Aschedule) +| [ PerformanceTests Cdap ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_Cdap.yml) | N/A |`Run Java 
CdapIO Performance Test`| [![.github/workflows/beam_PerformanceTests_Cdap.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_Cdap.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_Cdap.yml?query=event%3Aschedule) +| [ PerformanceTests Compressed TextIOIT HDFS ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_Compressed_TextIOIT_HDFS.yml) | N/A |`Run Java CompressedTextIO Performance Test HDFS`| [![.github/workflows/beam_PerformanceTests_Compressed_TextIOIT_HDFS.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_Compressed_TextIOIT_HDFS.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_Compressed_TextIOIT_HDFS.yml?query=event%3Aschedule) +| [ PerformanceTests Compressed TextIOIT ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_Compressed_TextIOIT.yml) | N/A |`Run Java CompressedTextIO Performance Test`| [![.github/workflows/beam_PerformanceTests_Compressed_TextIOIT.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_Compressed_TextIOIT.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_Compressed_TextIOIT.yml?query=event%3Aschedule) +| [ PerformanceTests HadoopFormat ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_HadoopFormat.yml) | N/A |`Run Java HadoopFormatIO Performance Test`| [![.github/workflows/beam_PerformanceTests_HadoopFormat.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_HadoopFormat.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_HadoopFormat.yml?query=event%3Aschedule) +| [ PerformanceTests JDBC ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_JDBC.yml) | N/A |`Run Java JdbcIO Performance Test`| [![.github/workflows/beam_PerformanceTests_JDBC.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_JDBC.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_JDBC.yml?query=event%3Aschedule) +| [ PerformanceTests Kafka IO ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_Kafka_IO.yml) | N/A |`Run Java KafkaIO Performance Test`| [![.github/workflows/beam_PerformanceTests_Kafka_IO.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_Kafka_IO.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_Kafka_IO.yml?query=event%3Aschedule) +| [ PerformanceTests ManyFiles TextIOIT HDFS ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_ManyFiles_TextIOIT_HDFS.yml) | N/A |`Run Java ManyFilesTextIO Performance Test HDFS`| [![.github/workflows/beam_PerformanceTests_ManyFiles_TextIOIT_HDFS.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_ManyFiles_TextIOIT_HDFS.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_ManyFiles_TextIOIT_HDFS.yml?query=event%3Aschedule) +| [ PerformanceTests ManyFiles TextIOIT ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_ManyFiles_TextIOIT.yml) | N/A |`Run Java ManyFilesTextIO Performance Test`| 
[![.github/workflows/beam_PerformanceTests_ManyFiles_TextIOIT.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_ManyFiles_TextIOIT.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_ManyFiles_TextIOIT.yml?query=event%3Aschedule) +| [ PerformanceTests MongoDBIO IT ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_MongoDBIO_IT.yml) | N/A |`Run Java MongoDBIO Performance Test`| [![.github/workflows/beam_PerformanceTests_MongoDBIO_IT.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_MongoDBIO_IT.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_MongoDBIO_IT.yml?query=event%3Aschedule) +| [ PerformanceTests ParquetIOIT HDFS ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_ParquetIOIT_HDFS.yml) | N/A |`Run Java ParquetIO Performance Test HDFS`| [![.github/workflows/beam_PerformanceTests_ParquetIOIT_HDFS.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_ParquetIOIT_HDFS.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_ParquetIOIT_HDFS.yml?query=event%3Aschedule) +| [ PerformanceTests ParquetIOIT ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_ParquetIOIT.yml) | N/A |`Run Java ParquetIO Performance Test`| [![.github/workflows/beam_PerformanceTests_ParquetIOIT.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_ParquetIOIT.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_ParquetIOIT.yml?query=event%3Aschedule) +| [ PerformanceTests PubsubIOIT Python Streaming ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_PubsubIOIT_Python_Streaming.yml) | N/A |`Run PubsubIO Performance Test Python`| [![.github/workflows/beam_PerformanceTests_PubsubIOIT_Python_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_PubsubIOIT_Python_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_PubsubIOIT_Python_Streaming.yml?query=event%3Aschedule) +| [ PerformanceTests SingleStoreIO ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_SingleStoreIO.yml) | N/A |`Run Java SingleStoreIO Performance Test`| [![.github/workflows/beam_PerformanceTests_SingleStoreIO.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_SingleStoreIO.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_SingleStoreIO.yml?query=event%3Aschedule) +| [ PerformanceTests SpannerIO Read 2GB Python ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_SpannerIO_Read_2GB_Python.yml) | N/A |`Run SpannerIO Read 2GB Performance Test Python`| [![.github/workflows/beam_PerformanceTests_SpannerIO_Read_2GB_Python.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_SpannerIO_Read_2GB_Python.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_SpannerIO_Read_2GB_Python.yml?query=event%3Aschedule) +| [ PerformanceTests SpannerIO Write 2GB Python Batch ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch.yml) | N/A |`Run SpannerIO Write 2GB Performance Test Python Batch`| 
[![.github/workflows/beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch.yml?query=event%3Aschedule) +| [ PerformanceTests SparkReceiver IO ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_SparkReceiver_IO.yml) | N/A |`Run Java SparkReceiverIO Performance Test`| [![.github/workflows/beam_PerformanceTests_SparkReceiver_IO.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_SparkReceiver_IO.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_SparkReceiver_IO.yml?query=event%3Aschedule) +| [ PerformanceTests SQLBigQueryIO Batch Java ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_SQLBigQueryIO_Batch_Java.yml) | N/A |`Run SQLBigQueryIO Batch Performance Test Java`| [![.github/workflows/beam_PerformanceTests_SQLBigQueryIO_Batch_Java.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_SQLBigQueryIO_Batch_Java.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_SQLBigQueryIO_Batch_Java.yml?query=event%3Aschedule) +| [ PerformanceTests TextIOIT HDFS ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_TextIOIT_HDFS.yml) | N/A |`Run Java TextIO Performance Test HDFS`| [![.github/workflows/beam_PerformanceTests_TextIOIT_HDFS.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_TextIOIT_HDFS.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_TextIOIT_HDFS.yml?query=event%3Aschedule) +| [ PerformanceTests TextIOIT Python ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_TextIOIT_Python.yml) | N/A |`Run Python TextIO Performance Test`| [![.github/workflows/beam_PerformanceTests_TextIOIT_Python.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_TextIOIT_Python.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_TextIOIT_Python.yml?query=event%3Aschedule) +| [ PerformanceTests TextIOIT ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_TextIOIT.yml) | N/A |`Run Java TextIO Performance Test`| [![.github/workflows/beam_PerformanceTests_TextIOIT.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_TextIOIT.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_TextIOIT.yml?query=event%3Aschedule) +| [ PerformanceTests TFRecordIOIT HDFS ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_TFRecordIOIT_HDFS.yml) | N/A |`Run Java TFRecordIO Performance Test HDFS`| [![.github/workflows/beam_PerformanceTests_TFRecordIOIT_HDFS.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_TFRecordIOIT_HDFS.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_TFRecordIOIT_HDFS.yml?query=event%3Aschedule) +| [ PerformanceTests TFRecordIOIT ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_TFRecordIOIT.yml) | N/A |`Run Java TFRecordIO Performance Test`| 
[![.github/workflows/beam_PerformanceTests_TFRecordIOIT.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_TFRecordIOIT.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_TFRecordIOIT.yml?query=event%3Aschedule) +| [ PerformanceTests WordCountIT PythonVersions ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_WordCountIT_PythonVersions.yml) | ['3.8'] |`Run Python (matrix_element) WordCountIT Performance Test`| [![.github/workflows/beam_PerformanceTests_WordCountIT_PythonVersions.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_WordCountIT_PythonVersions.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_WordCountIT_PythonVersions.yml?query=event%3Aschedule) +| [ PerformanceTests XmlIOIT HDFS ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_XmlIOIT_HDFS.yml) | N/A |`Run Java XmlIO Performance Test HDFS`| [![.github/workflows/beam_PerformanceTests_XmlIOIT_HDFS.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_XmlIOIT_HDFS.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_XmlIOIT_HDFS.yml?query=event%3Aschedule) +| [ PerformanceTests XmlIOIT ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_XmlIOIT.yml) | N/A |`Run Java XmlIO Performance Test`| [![.github/workflows/beam_PerformanceTests_XmlIOIT.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_XmlIOIT.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_XmlIOIT.yml?query=event%3Aschedule) +| [ PerformanceTests xlang KafkaIO Python ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_xlang_KafkaIO_Python.yml) | N/A |`Run Python xlang KafkaIO Performance Test`| [![.github/workflows/beam_PerformanceTests_xlang_KafkaIO_Python.yml](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_xlang_KafkaIO_Python.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_xlang_KafkaIO_Python.yml?query=event%3Aschedule) + +### LoadTests Jobs + +| Workflow name | Matrix | Trigger Phrase | Cron Status | +|:-------------:|:------:|:--------------:|:-----------:| +| [ LoadTests Go CoGBK Dataflow Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_CoGBK_Dataflow_Batch.yml) | N/A |`Run LoadTests Go CoGBK Dataflow Batch`| [![.github/workflows/beam_LoadTests_Go_CoGBK_Dataflow_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_CoGBK_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_CoGBK_Dataflow_Batch.yml?query=event%3Aschedule) +| [ LoadTests Go CoGBK Flink Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_CoGBK_Flink_batch.yml) | N/A |`Run Load Tests Go CoGBK Flink Batch`| [![.github/workflows/beam_LoadTests_Go_CoGBK_Flink_batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_CoGBK_Flink_batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_CoGBK_Flink_batch.yml?query=event%3Aschedule) +| [ LoadTests Go Combine Dataflow Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_Combine_Dataflow_Batch.yml) | N/A |`Run Load Tests Go Combine Dataflow Batch`| 
[![.github/workflows/beam_LoadTests_Go_Combine_Dataflow_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_Combine_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_Combine_Dataflow_Batch.yml?query=event%3Aschedule) +| [ LoadTests Go Combine Flink Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_Combine_Flink_Batch.yml) | N/A |`Run Load Tests Go Combine Flink Batch`| [![.github/workflows/beam_LoadTests_Go_Combine_Flink_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_Combine_Flink_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_Combine_Flink_Batch.yml?query=event%3Aschedule) +| [ LoadTests Go GBK Dataflow Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml) | N/A |`Run Load Tests Go GBK Dataflow Batch`| [![.github/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml?query=event%3Aschedule) +| [ LoadTests Go GBK Flink Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_GBK_Flink_Batch.yml) | N/A |`Run Load Tests Go GBK Flink Batch`| [![.github/workflows/beam_LoadTests_Go_GBK_Flink_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_GBK_Flink_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_GBK_Flink_Batch.yml?query=event%3Aschedule) +| [ LoadTests Go ParDo Dataflow Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_ParDo_Dataflow_Batch.yml) | N/A |`Run Load Tests Go ParDo Dataflow Batch`| [![.github/workflows/beam_LoadTests_Go_ParDo_Dataflow_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_ParDo_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_ParDo_Dataflow_Batch.yml?query=event%3Aschedule) +| [ LoadTests Go ParDo Flink Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_ParDo_Flink_Batch.yml) | N/A |`Run Load Tests Go ParDo Flink Batch`| [![.github/workflows/beam_LoadTests_Go_ParDo_Flink_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_ParDo_Flink_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_ParDo_Flink_Batch.yml?query=event%3Aschedule) +| [ LoadTests Go SideInput Dataflow Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_SideInput_Dataflow_Batch.yml) | N/A |`Run Load Tests Go SideInput Dataflow Batch`| [![.github/workflows/beam_LoadTests_Go_SideInput_Dataflow_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_SideInput_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_SideInput_Dataflow_Batch.yml?query=event%3Aschedule) +| [ LoadTests Go SideInput Flink Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_SideInput_Flink_Batch.yml) | N/A |`Run Load Tests Go SideInput Flink Batch`| 
[![.github/workflows/beam_LoadTests_Go_SideInput_Flink_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_SideInput_Flink_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_SideInput_Flink_Batch.yml?query=event%3Aschedule) +| [ LoadTests Java CoGBK Dataflow Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Batch.yml) | N/A |`Run Load Tests Java CoGBK Dataflow Batch`| [![.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Batch.yml?query=event%3Aschedule) +| [ LoadTests Java CoGBK Dataflow Streaming ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming.yml) | N/A |`Run Load Tests Java CoGBK Dataflow Streaming`| [![.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming.yml?query=event%3Aschedule) +| [ LoadTests Java CoGBK Dataflow V2 Batch JavaVersions ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_V2_Batch_JavaVersions.yml) | ['11','17'] |`Run Load Tests Java (matrix_element) CoGBK Dataflow V2 Batch`| [![.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_V2_Batch_JavaVersions.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_V2_Batch_JavaVersions.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_V2_Batch_JavaVersions.yml?query=event%3Aschedule) +| [ LoadTests Java CoGBK Dataflow V2 Streaming JavaVersions ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_V2_Streaming_JavaVersions.yml) | ['11','17'] |`Run Load Tests Java (matrix_element) CoGBK Dataflow V2 Streaming`| [![.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_V2_Streaming_JavaVersions.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_V2_Streaming_JavaVersions.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_V2_Streaming_JavaVersions.yml?query=event%3Aschedule) +| [ LoadTests Java CoGBK SparkStructuredStreaming Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_SparkStructuredStreaming_Batch.yml) | N/A |`Run Load Tests Java CoGBK SparkStructuredStreaming Batch`| [![.github/workflows/beam_LoadTests_Java_CoGBK_SparkStructuredStreaming_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_SparkStructuredStreaming_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_SparkStructuredStreaming_Batch.yml?query=event%3Aschedule) +| [ LoadTests Java Combine Dataflow Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_Combine_Dataflow_Batch.yml) | N/A |`Run Load Tests Java Combine Dataflow Batch`| 
[![.github/workflows/beam_LoadTests_Java_Combine_Dataflow_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_Combine_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_Combine_Dataflow_Batch.yml?query=event%3Aschedule) +| [ LoadTests Java Combine Dataflow Streaming ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_Combine_Dataflow_Streaming.yml) | N/A |`Run Load Tests Java Combine Dataflow Streaming`| [![.github/workflows/beam_LoadTests_Java_Combine_Dataflow_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_Combine_Dataflow_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_Combine_Dataflow_Streaming.yml?query=event%3Aschedule) +| [ LoadTests Java Combine SparkStructuredStreaming Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_Combine_SparkStructuredStreaming_Batch.yml) | N/A |`Run Load Tests Java Combine SparkStructuredStreaming Batch`| [![.github/workflows/beam_LoadTests_Java_Combine_SparkStructuredStreaming_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_Combine_SparkStructuredStreaming_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_Combine_SparkStructuredStreaming_Batch.yml?query=event%3Aschedule) +| [ LoadTests Java GBK Dataflow Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Dataflow_Batch.yml) | N/A |`Run Load Tests Java GBK Dataflow Batch`| [![.github/workflows/beam_LoadTests_Java_GBK_Dataflow_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Dataflow_Batch.yml?query=event%3Aschedule) +| [ LoadTests Java GBK Dataflow Streaming ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Dataflow_Streaming.yml) | N/A |`Run Load Tests Java GBK Dataflow Streaming`| [![.github/workflows/beam_LoadTests_Java_GBK_Dataflow_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Dataflow_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Dataflow_Streaming.yml?query=event%3Aschedule) +| [ LoadTests Java GBK Dataflow V2 Batch Java11 ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java11.yml) | N/A |`Run Load Tests Java 11 GBK Dataflow V2 Batch`| [![.github/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java11.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java11.yml?query=event%3Aschedule) +| [ LoadTests Java GBK Dataflow V2 Batch Java17 ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java17.yml) | N/A |`Run Load Tests Java 17 GBK Dataflow V2 Batch`| [![.github/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java17.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java17.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java17.yml?query=event%3Aschedule) +| [ LoadTests Java GBK Dataflow V2 
Streaming Java11 ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java11.yml) | N/A |`Run Load Tests Java 11 GBK Dataflow V2 Streaming`| [![.github/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java11.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java11.yml?query=event%3Aschedule) +| [ LoadTests Java GBK Dataflow V2 Streaming Java17 ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java17.yml) | N/A |`Run Load Tests Java 17 GBK Dataflow V2 Streaming`| [![.github/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java17.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java17.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java17.yml?query=event%3Aschedule) +| [ LoadTests Java GBK Smoke ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Smoke.yml) | N/A |`Run Java Load Tests GBK Smoke`| [![.github/workflows/beam_LoadTests_Java_GBK_Smoke.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Smoke.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_Smoke.yml?query=event%3Aschedule) +| [ LoadTests Java GBK SparkStructuredStreaming Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_SparkStructuredStreaming_Batch.yml) | N/A |`Run Load Tests Java GBK SparkStructuredStreaming Batch`| [![.github/workflows/beam_LoadTests_Java_GBK_SparkStructuredStreaming_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_SparkStructuredStreaming_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_GBK_SparkStructuredStreaming_Batch.yml?query=event%3Aschedule) +| [ LoadTests Java ParDo Dataflow Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_ParDo_Dataflow_Batch.yml) | N/A |`Run Load Tests Java ParDo Dataflow Batch`| [![.github/workflows/beam_LoadTests_Java_ParDo_Dataflow_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_ParDo_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_ParDo_Dataflow_Batch.yml?query=event%3Aschedule) +| [ LoadTests Java ParDo Dataflow Streaming ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_ParDo_Dataflow_Streaming.yml) | N/A |`Run Load Tests Java ParDo Dataflow Streaming`| [![.github/workflows/beam_LoadTests_Java_ParDo_Dataflow_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_ParDo_Dataflow_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_ParDo_Dataflow_Streaming.yml?query=event%3Aschedule) +| [ LoadTests Java ParDo Dataflow V2 Batch JavaVersions ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_ParDo_Dataflow_V2_Batch_JavaVersions.yml) | ['11','17'] |`Run Load Tests Java (matrix_element) ParDo Dataflow V2 Batch`| 
[![.github/workflows/beam_LoadTests_Java_ParDo_Dataflow_V2_Batch_JavaVersions.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_ParDo_Dataflow_V2_Batch_JavaVersions.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_ParDo_Dataflow_V2_Batch_JavaVersions.yml?query=event%3Aschedule) +| [ LoadTests Java ParDo Dataflow V2 Streaming JavaVersions ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_ParDo_Dataflow_V2_Streaming_JavaVersions.yml) | ['11','17'] |`Run Load Tests Java (matrix_element) ParDo Dataflow V2 Streaming`| [![.github/workflows/beam_LoadTests_Java_ParDo_Dataflow_V2_Streaming_JavaVersions.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_ParDo_Dataflow_V2_Streaming_JavaVersions.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_ParDo_Dataflow_V2_Streaming_JavaVersions.yml?query=event%3Aschedule) +| [ LoadTests Java ParDo SparkStructuredStreaming Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_ParDo_SparkStructuredStreaming_Batch.yml) | N/A |`Run Load Tests Java ParDo SparkStructuredStreaming Batch`| [![.github/workflows/beam_LoadTests_Java_ParDo_SparkStructuredStreaming_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_ParDo_SparkStructuredStreaming_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_ParDo_SparkStructuredStreaming_Batch.yml?query=event%3Aschedule) +| [ LoadTests Java Combine Smoke ](https://github.com/apache/beam/actions/workflows/beam_Java_LoadTests_Combine_Smoke.yml) | N/A | N/A | [![.github/workflows/beam_Java_LoadTests_Combine_Smoke.yml](https://github.com/apache/beam/actions/workflows/beam_Java_LoadTests_Combine_Smoke.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_Java_LoadTests_Combine_Smoke.yml?query=event%3Aschedule) +| [ LoadTests Python CoGBK Dataflow Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Batch.yml) | N/A |`Run Load Tests Python CoGBK Dataflow Batch`| [![.github/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Batch.yml?query=event%3Aschedule) +| [ LoadTests Python CoGBK Dataflow Streaming ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Streaming.yml) | N/A |`Run Load Tests Python CoGBK Dataflow Streaming`| [![.github/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Streaming.yml?query=event%3Aschedule) +| [ LoadTests Python CoGBK Flink Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml) | N/A |`Run Load Tests Python CoGBK Flink Batch`| [![.github/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml?query=event%3Aschedule) +| [ LoadTests Python Combine 
Dataflow Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml) | N/A |`Run Load Tests Python Combine Dataflow Batch`| [![.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml?query=event%3Aschedule) +| [ LoadTests Python Combine Dataflow Streaming ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Dataflow_Streaming.yml) | N/A |`Run Load Tests Python Combine Dataflow Streaming`| [![.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Dataflow_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Dataflow_Streaming.yml?query=event%3Aschedule) +| [ LoadTests Python Combine Flink Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Flink_Batch.yml) | N/A |`Run Load Tests Python Combine Flink Batch`| [![.github/workflows/beam_LoadTests_Python_Combine_Flink_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Flink_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Flink_Batch.yml?query=event%3Aschedule) +| [ LoadTests Python Combine Flink Streaming ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Flink_Streaming.yml) | N/A |`Run Load Tests Python Combine Flink Streaming`| [![.github/workflows/beam_LoadTests_Python_Combine_Flink_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Flink_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Flink_Streaming.yml?query=event%3Aschedule) +| [ LoadTests Python FnApiRunner Microbenchmark ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_FnApiRunner_Microbenchmark.yml) | N/A |`Run Python Load Tests FnApiRunner Microbenchmark`| [![.github/workflows/beam_LoadTests_Python_FnApiRunner_Microbenchmark.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_FnApiRunner_Microbenchmark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_FnApiRunner_Microbenchmark.yml?query=event%3Aschedule) +| [ LoadTests Python GBK Dataflow Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_GBK_Dataflow_Batch.yml) | N/A |`Run Load Tests Python GBK Dataflow Batch`| [![.github/workflows/beam_LoadTests_Python_GBK_Dataflow_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_GBK_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_GBK_Dataflow_Batch.yml?query=event%3Aschedule) +| [ LoadTests Python GBK Dataflow Streaming ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_GBK_Dataflow_Streaming.yml) | N/A |`Run Load Tests Python GBK Dataflow Streaming`| 
[![.github/workflows/beam_LoadTests_Python_GBK_Dataflow_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_GBK_Dataflow_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_GBK_Dataflow_Streaming.yml?query=event%3Aschedule) +| [ LoadTests Python GBK Flink Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_GBK_Flink_Batch.yml) | N/A |`Run Load Tests Python GBK Flink Batch`| [![.github/workflows/beam_LoadTests_Python_GBK_Flink_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_GBK_Flink_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_GBK_Flink_Batch.yml?query=event%3Aschedule) +| [ LoadTests Python GBK reiterate Dataflow Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch.yml) | N/A |`Run Load Tests Python GBK reiterate Dataflow Batch`| [![.github/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch.yml?query=event%3Aschedule) +| [ LoadTests Python GBK reiterate Dataflow Streaming ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming.yml) | N/A |`Run Load Tests Python GBK reiterate Dataflow Streaming`| [![.github/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming.yml?query=event%3Aschedule) +| [ LoadTests Python ParDo Dataflow Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_ParDo_Dataflow_Batch.yml) | N/A |`Run Load Tests Python ParDo Dataflow Batch`| [![.github/workflows/beam_LoadTests_Python_ParDo_Dataflow_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_ParDo_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_ParDo_Dataflow_Batch.yml?query=event%3Aschedule) +| [ LoadTests Python ParDo Dataflow Streaming ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_ParDo_Dataflow_Streaming.yml) | N/A |`Run Python Load Tests ParDo Dataflow Streaming`| [![.github/workflows/beam_LoadTests_Python_ParDo_Dataflow_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_ParDo_Dataflow_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_ParDo_Dataflow_Streaming.yml?query=event%3Aschedule) +| [ LoadTests Python ParDo Flink Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml) | N/A |`Run Load Tests Python ParDo Flink Batch`| [![.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml?query=event%3Aschedule) +| [ LoadTests Python ParDo Flink Streaming 
](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml) | N/A |`Run Load Tests Python ParDo Flink Streaming`| [![.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml?query=event%3Aschedule) +| [ LoadTests Python SideInput Dataflow Batch ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_SideInput_Dataflow_Batch.yml) | N/A |`Run Load Tests Python SideInput Dataflow Batch`| [![.github/workflows/beam_LoadTests_Python_SideInput_Dataflow_Batch.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_SideInput_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_SideInput_Dataflow_Batch.yml?query=event%3Aschedule) +| [ LoadTests Python Smoke ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Smoke.yml) | N/A |`Run Python Load Tests Smoke`| [![.github/workflows/beam_LoadTests_Python_Smoke.yml](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Smoke.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Smoke.yml?query=event%3Aschedule) + +### Other Jobs + +| Workflow name | Matrix | Trigger Phrase | Cron Status | +|:-------------:|:------:|:--------------:|:-----------:| +| [ Cancel Stale Dataflow Jobs ](https://github.com/apache/beam/actions/workflows/beam_CancelStaleDataflowJobs.yml) | N/A | `Run Cancel Stale Dataflow Jobs` | [![.github/workflows/beam_CancelStaleDataflowJobs.yml](https://github.com/apache/beam/actions/workflows/beam_CancelStaleDataflowJobs.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_CancelStaleDataflowJobs.yml?query=event%3Aschedule) | +| [ Clean Up GCP Resources ](https://github.com/apache/beam/actions/workflows/beam_CleanUpGCPResources.yml) | N/A | `Run Clean GCP Resources` | [![.github/workflows/beam_CleanUpGCPResources.yml](https://github.com/apache/beam/actions/workflows/beam_CleanUpGCPResources.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_CleanUpGCPResources.yml?query=event%3Aschedule) | +| [ Clean Up Prebuilt SDK Images ](https://github.com/apache/beam/actions/workflows/beam_CleanUpPrebuiltSDKImages.yml) | N/A | `Run Clean Prebuilt Images` | [![.github/workflows/beam_CleanUpPrebuiltSDKImages.yml](https://github.com/apache/beam/actions/workflows/beam_CleanUpPrebuiltSDKImages.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_CleanUpPrebuiltSDKImages.yml?query=event%3Aschedule) | +| [ Cleanup Dataproc Resources ](https://github.com/apache/beam/actions/workflows/beam_CleanUpDataprocResources.yml) | N/A | N/A | [![.github/workflows/beam_CleanUpDataprocResources.yml](https://github.com/apache/beam/actions/workflows/beam_CleanUpDataprocResources.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_CleanUpDataprocResources.yml?query=event%3Aschedule) +| [ Community Metrics Prober ](https://github.com/apache/beam/actions/workflows/beam_Prober_CommunityMetrics.yml) | N/A |`Run Community Metrics Prober`| 
[![.github/workflows/beam_Prober_CommunityMetrics.yml](https://github.com/apache/beam/actions/workflows/beam_Prober_CommunityMetrics.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_Prober_CommunityMetrics.yml?query=event%3Aschedule) +| [ Publish Beam SDK Snapshots ](https://github.com/apache/beam/actions/workflows/beam_Publish_Beam_SDK_Snapshots.yml) | N/A | N/A | [![.github/workflows/beam_Publish_Beam_SDK_Snapshots.yml](https://github.com/apache/beam/actions/workflows/beam_Publish_Beam_SDK_Snapshots.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_Publish_Beam_SDK_Snapshots.yml?query=event%3Aschedule) | +| [ Publish Docker Snapshots ](https://github.com/apache/beam/actions/workflows/beam_Publish_Docker_Snapshots.yml) | N/A |`Publish Docker Snapshots`| [![.github/workflows/beam_Publish_Docker_Snapshots.yml](https://github.com/apache/beam/actions/workflows/beam_Publish_Docker_Snapshots.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_Publish_Docker_Snapshots.yml?query=event%3Aschedule) | +| [ Rotate IO-Datastores Cluster Credentials ](https://github.com/apache/beam/actions/workflows/beam_IODatastoresCredentialsRotation.yml) | N/A | N/A | [![.github/workflows/beam_IODatastoresCredentialsRotation.yml](https://github.com/apache/beam/actions/workflows/beam_IODatastoresCredentialsRotation.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_IODatastoresCredentialsRotation.yml?query=event%3Aschedule) | +| [ Rotate Metrics Cluster Credentials ](https://github.com/apache/beam/actions/workflows/beam_MetricsCredentialsRotation.yml) | N/A | N/A | [![.github/workflows/beam_MetricsCredentialsRotation.yml](https://github.com/apache/beam/actions/workflows/beam_MetricsCredentialsRotation.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_MetricsCredentialsRotation.yml?query=event%3Aschedule) | diff --git a/.github/workflows/beam_CancelStaleDataflowJobs.yml b/.github/workflows/beam_CancelStaleDataflowJobs.yml index 226ee78f01aa7..78cfe67da8511 100644 --- a/.github/workflows/beam_CancelStaleDataflowJobs.yml +++ b/.github/workflows/beam_CancelStaleDataflowJobs.yml @@ -18,15 +18,13 @@ name: Cancel Stale Dataflow Jobs on: - issue_comment: - types: [created] schedule: - cron: '0 */4 * * *' workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +50,7 @@ permissions: jobs: beam_CancelStaleDataflowJobs: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 120 strategy: @@ -60,7 +58,7 @@ jobs: job_name: [beam_CancelStaleDataflowJobs] job_phrase: [Run Cancel Stale Dataflow Jobs] if: | - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Cancel Stale Dataflow Jobs' 
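+    # The repository check added above restricts scheduled (cron) runs to apache/beam itself, so forks do not execute them; the same guard pattern recurs in the other workflows changed in this commit.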
steps: @@ -71,6 +69,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Authenticate on GCP id: auth uses: google-github-actions/auth@v1 diff --git a/.github/workflows/beam_CleanUpDataprocResources.yml b/.github/workflows/beam_CleanUpDataprocResources.yml new file mode 100644 index 0000000000000..7ab5029902a12 --- /dev/null +++ b/.github/workflows/beam_CleanUpDataprocResources.yml @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Cleanup Dataproc Resources + +on: + schedule: + - cron: '0 */6 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +jobs: + beam_CleanUpDataprocResources: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 100 + name: "beam_CleanUpDataprocResources" + steps: + - uses: actions/checkout@v3 + - name: Delete leaked resources for all the jobs that generates flink clusters + run: | + cd ${{ github.workspace }}/.test-infra/dataproc; ./cleanup.sh -xe \ No newline at end of file diff --git a/.github/workflows/beam_CleanUpGCPResources.yml b/.github/workflows/beam_CleanUpGCPResources.yml index 42aeccb1a3b0b..acebf427f6e98 100644 --- a/.github/workflows/beam_CleanUpGCPResources.yml +++ b/.github/workflows/beam_CleanUpGCPResources.yml @@ -18,15 +18,13 @@ name: Clean Up GCP Resources on: - issue_comment: - types: [created] schedule: - - cron: '0 0 * * *' + - cron: '0 0,12 * * *' workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ 
github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +50,7 @@ permissions: jobs: beam_CleanUpGCPResources: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 strategy: @@ -60,7 +58,7 @@ jobs: job_name: [beam_CleanUpGCPResources] job_phrase: [Run Clean GCP Resources] if: | - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Clean GCP Resources' steps: @@ -71,12 +69,17 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Authenticate on GCP id: auth - uses: google-github-actions/auth@v1 + uses: google-github-actions/setup-gcloud@v0 with: - credentials_json: ${{ secrets.GCP_SA_KEY }} + service_account_email: ${{ secrets.GCP_SA_EMAIL }} + service_account_key: ${{ secrets.GCP_SA_KEY }} project_id: ${{ secrets.GCP_PROJECT_ID }} + - name: Install gcloud bigtable cli + run: gcloud components install cbt - name: run cleanup GCP resources uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_CleanUpPrebuiltSDKImages.yml b/.github/workflows/beam_CleanUpPrebuiltSDKImages.yml index 62197137ca0c0..81ece47832f68 100644 --- a/.github/workflows/beam_CleanUpPrebuiltSDKImages.yml +++ b/.github/workflows/beam_CleanUpPrebuiltSDKImages.yml @@ -18,15 +18,13 @@ name: Clean Up Prebuilt SDK Images on: - issue_comment: - types: [created] schedule: - cron: '0 0 * * *' workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +50,7 @@ permissions: jobs: beam_CleanUpPrebuiltSDKImages: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 strategy: @@ -60,7 +58,7 @@ jobs: job_name: [beam_CleanUpPrebuiltSDKImages] job_phrase: [Run Clean Prebuilt Images] if: | - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Clean Prebuilt Images' steps: @@ -71,6 +69,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Authenticate on GCP id: auth uses: google-github-actions/auth@v1 diff --git 
a/.github/workflows/beam_CloudML_Benchmarks_Dataflow.yml b/.github/workflows/beam_CloudML_Benchmarks_Dataflow.yml new file mode 100644 index 0000000000000..ccc9822950ec6 --- /dev/null +++ b/.github/workflows/beam_CloudML_Benchmarks_Dataflow.yml @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: CloudML Benchmarks Dataflow + +on: + schedule: + - cron: '10 21 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_CloudML_Benchmarks_Dataflow: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run TFT Criteo Benchmarks' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 360 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_CloudML_Benchmarks_Dataflow"] + job_phrase: ["Run TFT Criteo Benchmarks"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup Python environment + uses: ./.github/actions/setup-environment-action + with: + python-version: | + 3.8 + 3.9 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/beam_CloudML_Benchmarks_Dataflow_arguments.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run TFT Criteo Benchmarks + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: 
:sdks:python:test-suites:dataflow:tftTests + arguments: | + -PpythonVersion=3.9 \ + -Prunner=DataflowRunner \ + '-Popts=${{ env.beam_CloudML_Benchmarks_Dataflow_test_arguments_1 }}' \ No newline at end of file diff --git a/.github/workflows/beam_IODatastoresCredentialsRotation.yml b/.github/workflows/beam_IODatastoresCredentialsRotation.yml new file mode 100644 index 0000000000000..c24d3e52ca710 --- /dev/null +++ b/.github/workflows/beam_IODatastoresCredentialsRotation.yml @@ -0,0 +1,98 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Rotate IO-Datastores Cluster Credentials + +on: + schedule: + - cron: '0 2 1 * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +jobs: + beam_IODatastoresCredentialsRotation: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 100 + name: ${{ matrix.job_name }} + strategy: + matrix: + job_name: ["beam_IODatastoresCredentialsRotation"] + job_phrase: ["N/A"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: Starting credential rotation + run: | + gcloud container clusters update io-datastores --start-credential-rotation --zone=us-central1-a --quiet + - name: Rebuilding the nodes + run: | + gcloud container clusters upgrade io-datastores --node-pool=pool-1 --zone=us-central1-a --quiet + - name: Completing the rotation + run: | + gcloud container clusters update io-datastores --complete-credential-rotation --zone=us-central1-a --quiet + - name: Generate Date + if: failure() + run: | + date=$(date -u +"%Y-%m-%d") + echo "date=$date" >> $GITHUB_ENV + - name: 
Send email + uses: dawidd6/action-send-mail@v3 + if: failure() + with: + server_address: smtp.gmail.com + server_port: 465 + secure: true + username: ${{ secrets.ISSUE_REPORT_SENDER_EMAIL_ADDRESS }} + password: ${{ secrets.ISSUE_REPORT_SENDER_EMAIL_PASSWORD }} + subject: Credentials Rotation Failure on IO-Datastores cluster (${{ env.date }}) + to: dev@beam.apache.org + from: gactions@beam.apache.org + body: | + Something went wrong during the automatic credentials rotation for the IO-Datastores cluster, performed at ${{ env.date }}. It may be necessary to check the state of the cluster certificates. For further details refer to the following links: + * Failing job: https://github.com/apache/beam/actions/workflows/beam_IODatastoresCredentialsRotation.yml + * Job configuration: https://github.com/apache/beam/blob/master/.github/workflows/beam_IODatastoresCredentialsRotation.yml + * Cluster URL: https://pantheon.corp.google.com/kubernetes/clusters/details/us-central1-a/io-datastores/details?mods=dataflow_dev&project=apache-beam-testing \ No newline at end of file diff --git a/.github/workflows/beam_Inference_Python_Benchmarks_Dataflow.yml b/.github/workflows/beam_Inference_Python_Benchmarks_Dataflow.yml new file mode 100644 index 0000000000000..2ca9953ce4b88 --- /dev/null +++ b/.github/workflows/beam_Inference_Python_Benchmarks_Dataflow.yml @@ -0,0 +1,144 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
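The `if:` condition added to each scheduled job in this change follows a single pattern: cron-triggered runs are restricted to the canonical apache/beam repository so that forks do not consume self-hosted runner capacity, while manual dispatch (and, where wired up, trigger-phrase comments) keeps working everywhere. A minimal standalone sketch of that guard; the workflow name, job name, and cron below are hypothetical, and only the shape of the `if:` expression is taken from this change:

    name: Example Scheduled Job        # hypothetical name, for illustration only
    on:
      schedule:
        - cron: '0 12 * * *'           # hypothetical schedule
      workflow_dispatch:
    jobs:
      example_scheduled_job:
        # Scheduled runs fire only in the upstream repo; forks can still run it manually.
        if: |
          github.event_name == 'workflow_dispatch' ||
          (github.event_name == 'schedule' && github.repository == 'apache/beam')
        runs-on: ubuntu-latest
        steps:
          - run: echo "guarded scheduled run"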
+ +name: Inference Python Benchmarks Dataflow + +on: + schedule: + - cron: '50 3 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_Inference_Python_Benchmarks_Dataflow: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Inference Benchmarks' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 900 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_Inference_Python_Benchmarks_Dataflow"] + job_phrase: ["Run Inference Benchmarks"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup Python environment + uses: ./.github/actions/setup-environment-action + with: + python-version: '3.8' + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Vision_Classification_Resnet_101.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Imagenet_Classification_Resnet_152.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Language_Modeling_Bert_Base_Uncased.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Language_Modeling_Bert_Large_Uncased.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Imagenet_Classification_Resnet_152_Tesla_T4_GPU.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: get current time + run: echo "NOW_UTC=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_ENV + - name: run Pytorch Vision Classification with Resnet 101 + uses: ./.github/actions/gradle-command-self-hosted-action + timeout-minutes: 180 + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.benchmarks.inference.pytorch_image_classification_benchmarks \ + -Prunner=DataflowRunner \ + 
-PpythonVersion=3.8 \ + -PloadTest.requirementsTxtFile=apache_beam/ml/inference/torch_tests_requirements.txt \ + '-PloadTest.args=${{ env.beam_Inference_Python_Benchmarks_Dataflow_test_arguments_1 }} --job_name=benchmark-tests-pytorch-imagenet-python-101-${{env.NOW_UTC}} --output=gs://temp-storage-for-end-to-end-tests/torch/result_resnet101-${{env.NOW_UTC}}.txt' \ + - name: run Pytorch Imagenet Classification with Resnet 152 + uses: ./.github/actions/gradle-command-self-hosted-action + timeout-minutes: 180 + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.benchmarks.inference.pytorch_image_classification_benchmarks \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + -PloadTest.requirementsTxtFile=apache_beam/ml/inference/torch_tests_requirements.txt \ + '-PloadTest.args=${{ env.beam_Inference_Python_Benchmarks_Dataflow_test_arguments_2 }} --job_name=benchmark-tests-pytorch-imagenet-python-152-${{env.NOW_UTC}} --output=gs://temp-storage-for-end-to-end-tests/torch/result_resnet152-${{env.NOW_UTC}}.txt' \ + - name: run Pytorch Language Modeling using Hugging Face bert-base-uncased model + uses: ./.github/actions/gradle-command-self-hosted-action + timeout-minutes: 180 + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.benchmarks.inference.pytorch_language_modeling_benchmarks \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + -PloadTest.requirementsTxtFile=apache_beam/ml/inference/torch_tests_requirements.txt \ + '-PloadTest.args=${{ env.beam_Inference_Python_Benchmarks_Dataflow_test_arguments_3 }} --job_name=benchmark-tests-pytorch-language-modeling-bert-base-uncased-${{env.NOW_UTC}} --output=gs://temp-storage-for-end-to-end-tests/torch/result_bert_base_uncased-${{env.NOW_UTC}}.txt' \ + - name: run Pytorch Language Modeling using Hugging Face bert-large-uncased model + uses: ./.github/actions/gradle-command-self-hosted-action + timeout-minutes: 180 + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.benchmarks.inference.pytorch_language_modeling_benchmarks \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + -PloadTest.requirementsTxtFile=apache_beam/ml/inference/torch_tests_requirements.txt \ + '-PloadTest.args=${{ env.beam_Inference_Python_Benchmarks_Dataflow_test_arguments_4 }} --job_name=benchmark-tests-pytorch-language-modeling-bert-large-uncased-${{env.NOW_UTC}} --output=gs://temp-storage-for-end-to-end-tests/torch/result_bert_large_uncased-${{env.NOW_UTC}}.txt' \ + - name: run Pytorch Imagenet Classification with Resnet 152 with Tesla T4 GPU + uses: ./.github/actions/gradle-command-self-hosted-action + timeout-minutes: 180 + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.benchmarks.inference.pytorch_image_classification_benchmarks \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + -PloadTest.requirementsTxtFile=apache_beam/ml/inference/torch_tests_requirements.txt \ + '-PloadTest.args=${{ env.beam_Inference_Python_Benchmarks_Dataflow_test_arguments_5 }} --job_name=benchmark-tests-pytorch-imagenet-python-gpu-${{env.NOW_UTC}} --output=gs://temp-storage-for-end-to-end-tests/torch/result_resnet152_gpu-${{env.NOW_UTC}}.txt' \ No newline at end of file diff --git a/.github/workflows/beam_Java_JMH.yml b/.github/workflows/beam_Java_JMH.yml new file mode
100644 index 0000000000000..d05d69cf8ddcc --- /dev/null +++ b/.github/workflows/beam_Java_JMH.yml @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Java JMH + +on: + schedule: + - cron: '0 0 * * 0' + pull_request_target: + paths: ['release/trigger_all_tests.json'] + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_HOST: http://10.128.0.96:8086 + INFLUXDB_DATABASE: beam_test_metrics + +jobs: + beam_Java_JMH: + if: | + github.event_name == 'workflow_dispatch' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 900 + name: "beam_Java_JMH" + steps: + - uses: actions/checkout@v3 + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: run the Java JMH micro-benchmark harness suite + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:harness:jmh:jmh + - name: run the Java JMH micro-benchmark core suite + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:core:jmh:jmh \ No newline at end of file diff --git a/.github/workflows/beam_Java_LoadTests_Combine_Smoke.yml b/.github/workflows/beam_Java_LoadTests_Combine_Smoke.yml new file mode 100644 index 0000000000000..5fb71d01ced2c --- /dev/null +++ b/.github/workflows/beam_Java_LoadTests_Combine_Smoke.yml @@ -0,0 +1,107 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: LoadTests Java Combine Smoke + +on: + # schedule: + # - cron: '10 12 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_Java_LoadTests_Combine_Smoke: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Java Load Tests Combine Smoke' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_Java_LoadTests_Combine_Smoke"] + job_phrase: ["Run Java Load Tests Combine Smoke"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_LoadTests_Combine_Smoke_CombineLoadTest_load_test_Dataflow-1.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_LoadTests_Combine_Smoke_CombineLoadTest_load_test_Dataflow-2.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_LoadTests_Combine_Smoke_CombineLoadTest_load_test_Dataflow-3.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run CombineLoadTest load test Dataflow-1 + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CombineLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_Java_LoadTests_Combine_Smoke_test_arguments_1 }}' \ + - name: run CombineLoadTest load test Dataflow-2 + uses: 
./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CombineLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_Java_LoadTests_Combine_Smoke_test_arguments_2 }}' \ + - name: run CombineLoadTest load test Dataflow-3 + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CombineLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_Java_LoadTests_Combine_Smoke_test_arguments_3 }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Go_CoGBK_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Go_CoGBK_Dataflow_Batch.yml index d90b7a5a4cc30..2124bee4c9e92 100644 --- a/.github/workflows/beam_LoadTests_Go_CoGBK_Dataflow_Batch.yml +++ b/.github/workflows/beam_LoadTests_Go_CoGBK_Dataflow_Batch.yml @@ -13,13 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: Load Tests CoGBK Dataflow Batch Go +name: LoadTests Go CoGBK Dataflow Batch on: - issue_comment: - types: [created] schedule: - - cron: '40 23 * * *' + - cron: '10 12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_LoadTests_Go_CoGBK_Dataflow_Batch: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Load Tests Go CoGBK Dataflow Batch' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 720 @@ -69,16 +69,18 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: test-type: load test-language: go argument-file-paths: | - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/config_CoGBK_Go_Batch_SingleKey.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/config_CoGBK_Go_Batch_MultipleKey.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/config_CoGBK_Go_Batch_Reiteration_10KB.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/config_CoGBK_Go_Batch_Reiteration_2MB.txt + ${{ 
github.workspace }}/.github/workflows/load-tests-pipeline-options/go_CoGBK_Dataflow_Batch_SingleKey.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_CoGBK_Dataflow_Batch_MultipleKey.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_CoGBK_Dataflow_Batch_Reiteration_10KB.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_CoGBK_Dataflow_Batch_Reiteration_2MB.txt arguments: | --job_name=load-tests-go-dataflow-batch-cogbk-$(date '+%m%d%H%M%S' --utc) - name: run CoGBK Dataflow Batch Go Load Test 1 (single key) diff --git a/.github/workflows/beam_LoadTests_Go_CoGBK_Flink_batch.yml b/.github/workflows/beam_LoadTests_Go_CoGBK_Flink_batch.yml index cf355a2ee98df..93e062925f393 100644 --- a/.github/workflows/beam_LoadTests_Go_CoGBK_Flink_batch.yml +++ b/.github/workflows/beam_LoadTests_Go_CoGBK_Flink_batch.yml @@ -16,10 +16,8 @@ name: LoadTests Go CoGBK Flink Batch on: - issue_comment: - types: [created] schedule: - - cron: '10 14 * * *' + - cron: '10 12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,13 +38,15 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} GCLOUD_ZONE: us-central1-a CLUSTER_NAME: beam-loadtests-go-cogbk-flink-batch-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster @@ -62,7 +62,7 @@ jobs: beam_LoadTests_Go_CoGBK_Flink_Batch: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Load Tests Go CoGBK Flink Batch' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 720 @@ -79,15 +79,17 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: test-type: load test-language: go argument-file-paths: | - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/go_CoGBK_Flink_Batch_MultipleKey.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/go_CoGBK_Flink_Batch_Reiteration_10KB.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/go_CoGBK_Flink_Batch_Reiteration_2MB.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_CoGBK_Flink_Batch_MultipleKey.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_CoGBK_Flink_Batch_Reiteration_10KB.txt + ${{ github.workspace 
}}/.github/workflows/load-tests-pipeline-options/go_CoGBK_Flink_Batch_Reiteration_2MB.txt arguments: | --job_name=load-tests-go-flink-batch-cogbk-$(date '+%m%d%H%M%S' --utc) - name: Start Flink with parallelism 5 diff --git a/.github/workflows/beam_LoadTests_Go_Combine_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Go_Combine_Dataflow_Batch.yml index 423290d3fdc6a..d927c16ffa399 100644 --- a/.github/workflows/beam_LoadTests_Go_Combine_Dataflow_Batch.yml +++ b/.github/workflows/beam_LoadTests_Go_Combine_Dataflow_Batch.yml @@ -13,13 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: Load Tests Combine Dataflow Batch Go +name: LoadTests Go Combine Dataflow Batch on: - issue_comment: - types: [created] schedule: - - cron: '40 23 * * *' + - cron: '10 12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_LoadTests_Go_Combine_Dataflow_Batch: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Load Tests Go Combine Dataflow Batch' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 720 @@ -69,19 +69,20 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Prepare configs - #Reads config files, excludes comments, appends current date to the job_name parameter - id: set_configs - shell: bash - run: | - CURDATE=$(date '+%m%d%H%M%S' --utc) - CONFIG_ARR=('config_Combine_Go_Batch_10b.txt' 'config_Combine_Go_Batch_Fanout_4.txt' 'config_Combine_Go_Batch_Fanout_8.txt') - for INDEX in ${!CONFIG_ARR[@]} - do - CURCONFIG=$(grep -v "^#.*" ./.github/workflows/load-tests-job-configs/${CONFIG_ARR[INDEX]} | tr '\n' ' ') - CURCONFIG=$(echo "${CURCONFIG/load-tests-go-dataflow-batch-combine-$((INDEX + 1))-/load-tests-go-dataflow-batch-combine-$((INDEX + 1))-$CURDATE}") - echo "prepared_config_$((INDEX + 1))=$CURCONFIG" >> $GITHUB_OUTPUT - done + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: go + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_Combine_Dataflow_Batch_10b.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_Combine_Dataflow_Batch_Fanout_4.txt + ${{ github.workspace 
}}/.github/workflows/load-tests-pipeline-options/go_Combine_Dataflow_Batch_Fanout_8.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: get current time + run: echo "NOW_UTC=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_ENV - name: run Combine Dataflow Batch Go Load Test 1 (single key) uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -89,7 +90,7 @@ jobs: arguments: | -PloadTest.mainClass=combine \ -Prunner=DataflowRunner \ - '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_1 }}' \ + '-PloadTest.args=${{ env.beam_LoadTests_Go_Combine_Dataflow_Batch_test_arguments_1 }} --job_name=load-tests-go-dataflow-batch-combine-1-${{env.NOW_UTC}}' \ - name: run Combine Dataflow Batch Go Load Test 2 (multiple keys) uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -97,7 +98,7 @@ jobs: arguments: | -PloadTest.mainClass=combine \ -Prunner=DataflowRunner \ - '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_2 }}' \ + '-PloadTest.args=${{ env.beam_LoadTests_Go_Combine_Dataflow_Batch_test_arguments_2 }} --job_name=load-tests-go-dataflow-batch-combine-2-${{env.NOW_UTC}}' \ - name: run Combine Dataflow Batch Go Load Test 3 (reiterate 10KB) uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -105,4 +106,4 @@ jobs: arguments: | -PloadTest.mainClass=combine \ -Prunner=DataflowRunner \ - '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_3 }}' \ No newline at end of file + '-PloadTest.args=${{ env.beam_LoadTests_Go_Combine_Dataflow_Batch_test_arguments_3 }} --job_name=load-tests-go-dataflow-batch-combine-3-${{env.NOW_UTC}}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Go_Combine_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Go_Combine_Flink_Batch.yml index 0226b003b58e8..3ec1097e64af4 100644 --- a/.github/workflows/beam_LoadTests_Go_Combine_Flink_Batch.yml +++ b/.github/workflows/beam_LoadTests_Go_Combine_Flink_Batch.yml @@ -16,10 +16,8 @@ name: LoadTests Go Combine Flink Batch on: - issue_comment: - types: [created] schedule: - - cron: '40 6 * * *' + - cron: '10 12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,13 +38,15 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} GCLOUD_ZONE: us-central1-a CLUSTER_NAME: beam-loadtests-go-combine-flink-batch-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster @@ -62,7 +62,7 @@ jobs: beam_LoadTests_Go_Combine_Flink_Batch: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || 
github.event.comment.body == 'Run Load Tests Go Combine Flink Batch' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 720 @@ -85,9 +85,9 @@ jobs: test-type: load test-language: go argument-file-paths: | - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/go_Combine_Flink_Batch_10b.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/go_Combine_Flink_Batch_Fanout_4.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/go_Combine_Flink_Batch_Fanout_8.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_Combine_Flink_Batch_10b.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_Combine_Flink_Batch_Fanout_4.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_Combine_Flink_Batch_Fanout_8.txt arguments: | --job_name=load-tests-go-flink-batch-combine-$(date '+%m%d%H%M%S' --utc) - name: Start Flink with parallelism 5 diff --git a/.github/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml index bfdb19c1f5d5b..0b682ebdf5528 100644 --- a/.github/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml +++ b/.github/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml @@ -13,13 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: Load Tests GBK Dataflow Batch Go +name: LoadTests Go GBK Dataflow Batch on: - issue_comment: - types: [created] schedule: - - cron: '40 23 * * *' + - cron: '50 12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_LoadTests_Go_GBK_Dataflow_Batch: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Load Tests Go GBK Dataflow Batch' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 720 @@ -69,19 +69,24 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Prepare configs - #Reads config files, excludes comments, appends current date to the job_name parameter - id: set_configs - shell: bash - run: | - CURDATE=$(date '+%m%d%H%M%S' --utc) - CONFIG_ARR=('config_GBK_Go_Batch_10b.txt' 'config_GBK_Go_Batch_100b.txt' 'config_GBK_Go_Batch_100b.txt' 'config_GBK_Go_Batch_Fanout_4.txt' 'config_GBK_Go_Batch_Fanout_8.txt' 'config_GBK_Go_Batch_Reiteration_10KB.txt', 'config_GBK_Go_Batch_Reiteration_2MB.txt') - 
for INDEX in ${!CONFIG_ARR[@]} - do - CURCONFIG=$(grep -v "^#.*" ./.github/workflows/load-tests-job-configs/${CONFIG_ARR[INDEX]} | tr '\n' ' ') - CURCONFIG=$(echo "${CURCONFIG/load-tests-go-dataflow-batch-gbk-$((INDEX + 1))-/load-tests-go-dataflow-batch-gbk-$((INDEX + 1))-$CURDATE}") - echo "prepared_config_$((INDEX + 1))=$CURCONFIG" >> $GITHUB_OUTPUT - done + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: go + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_10b.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_100b.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_100kb.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_Fanout_4.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_Fanout_8.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_Reiteration_10KB.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_Reiteration_2MB.txt + - name: get current time + run: echo "NOW_UTC=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_ENV + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run GBK Dataflow Batch Go Load Test 1 (10 b records) uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -89,7 +94,7 @@ jobs: arguments: | -PloadTest.mainClass=group_by_key \ -Prunner=DataflowRunner \ - '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_1 }}' \ + '-PloadTest.args=${{ env.beam_LoadTests_Go_GBK_Dataflow_Batch_test_arguments_1 }} --job_name=load-tests-go-dataflow-batch-gbk-1-${{env.NOW_UTC}}' \ - name: run GBK Dataflow Batch Go Load Test 2 (100 b records) uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -97,7 +102,7 @@ jobs: arguments: | -PloadTest.mainClass=group_by_key \ -Prunner=DataflowRunner \ - '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_2 }}' \ + '-PloadTest.args=${{ env.beam_LoadTests_Go_GBK_Dataflow_Batch_test_arguments_2 }} --job_name=load-tests-go-dataflow-batch-gbk-2-${{env.NOW_UTC}}' \ - name: run GBK Dataflow Batch Go Load Test 3 (100 kb records) uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -105,7 +110,7 @@ jobs: arguments: | -PloadTest.mainClass=group_by_key \ -Prunner=DataflowRunner \ - '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_3 }}' \ + '-PloadTest.args=${{ env.beam_LoadTests_Go_GBK_Dataflow_Batch_test_arguments_3 }} --job_name=load-tests-go-dataflow-batch-gbk-3-${{env.NOW_UTC}}' \ - name: run GBK Dataflow Batch Go Load Test 4 (fanout 4) uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -113,7 +118,7 @@ jobs: arguments: | -PloadTest.mainClass=group_by_key \ -Prunner=DataflowRunner \ - '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_4 }}' \ + '-PloadTest.args=${{ env.beam_LoadTests_Go_GBK_Dataflow_Batch_test_arguments_4 }} --job_name=load-tests-go-dataflow-batch-gbk-4-${{env.NOW_UTC}}' \ - name: run GBK Dataflow Batch Go Load Test 5 (fanout 8) uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -121,7 +126,7 @@ jobs: arguments: | -PloadTest.mainClass=group_by_key \ -Prunner=DataflowRunner \ - 
'-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_5 }}' \ + '-PloadTest.args=${{ env.beam_LoadTests_Go_GBK_Dataflow_Batch_test_arguments_5 }} --job_name=load-tests-go-dataflow-batch-gbk-5-${{env.NOW_UTC}}' \ - name: run GBK Dataflow Batch Go Load Test 6 (reiterate 4 times 10 kb) uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -129,7 +134,7 @@ jobs: arguments: | -PloadTest.mainClass=group_by_key \ -Prunner=DataflowRunner \ - '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_6 }}' \ + '-PloadTest.args=${{ env.beam_LoadTests_Go_GBK_Dataflow_Batch_test_arguments_6 }} --job_name=load-tests-go-dataflow-batch-gbk-6-${{env.NOW_UTC}}' \ - name: run GBK Dataflow Batch Go Load Test 7 (reiterate 4 times 2 mb) uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -137,4 +142,4 @@ jobs: arguments: | -PloadTest.mainClass=group_by_key \ -Prunner=DataflowRunner \ - '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_7 }}' \ No newline at end of file + '-PloadTest.args=${{ env.beam_LoadTests_Go_GBK_Dataflow_Batch_test_arguments_7 }} --job_name=load-tests-go-dataflow-batch-gbk-7-${{env.NOW_UTC}}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Go_GBK_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Go_GBK_Flink_Batch.yml index 7c4d95738a097..d0870f4174521 100644 --- a/.github/workflows/beam_LoadTests_Go_GBK_Flink_Batch.yml +++ b/.github/workflows/beam_LoadTests_Go_GBK_Flink_Batch.yml @@ -16,10 +16,8 @@ name: LoadTests Go GBK Flink Batch on: - issue_comment: - types: [created] schedule: - - cron: '20 1 * * *' + - cron: '50 12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,13 +38,15 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} GCLOUD_ZONE: us-central1-a CLUSTER_NAME: beam-loadtests-go-gbk-flink-batch-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster @@ -62,7 +62,7 @@ jobs: beam_LoadTests_Go_GBK_Flink_Batch: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Load Tests Go GBK Flink Batch' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 720 @@ -79,18 +79,20 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: test-type: load test-language: go argument-file-paths: | - ${{ github.workspace 
}}/.github/workflows/load-tests-job-configs/go_GBK_Flink_Batch_10b.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/go_GBK_Flink_Batch_100b.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/go_GBK_Flink_Batch_100kb.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/go_GBK_Flink_Batch_Fanout_4.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/go_GBK_Flink_Batch_Fanout_8.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/go_GBK_Flink_Batch_Reiteration_10KB.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_GBK_Flink_Batch_10b.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_GBK_Flink_Batch_100b.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_GBK_Flink_Batch_100kb.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_GBK_Flink_Batch_Fanout_4.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_GBK_Flink_Batch_Fanout_8.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_GBK_Flink_Batch_Reiteration_10KB.txt arguments: | --job_name=load-tests-go-flink-batch-gbk-$(date '+%m%d%H%M%S' --utc) - name: Start Flink with parallelism 5 diff --git a/.github/workflows/beam_LoadTests_Go_ParDo_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Go_ParDo_Dataflow_Batch.yml index e4b4f0997d4db..47b2c51471f3f 100644 --- a/.github/workflows/beam_LoadTests_Go_ParDo_Dataflow_Batch.yml +++ b/.github/workflows/beam_LoadTests_Go_ParDo_Dataflow_Batch.yml @@ -13,13 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: Load Tests ParDo Dataflow Batch Go +name: LoadTests Go ParDo Dataflow Batch on: - issue_comment: - types: [created] schedule: - - cron: '15 18 * * *' + - cron: '50 12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_LoadTests_Go_ParDo_Dataflow_Batch: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Load Tests Go ParDo Dataflow Batch' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 720 @@ -69,16 +69,18 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare 
test arguments uses: ./.github/actions/test-arguments-action with: test-type: load test-language: go argument-file-paths: | - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/config_ParDo_Go_Batch_10_Iterations.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/config_ParDo_Go_Batch_200_Iterations.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/config_ParDo_Go_Batch_10_Counters.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/config_ParDo_Go_Batch_100_Counters.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_ParDo_Dataflow_Batch_10_Iterations.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_ParDo_Dataflow_Batch_200_Iterations.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_ParDo_Dataflow_Batch_10_Counters.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_ParDo_Dataflow_Batch_100_Counters.txt arguments: | --job_name=load-tests-go-dataflow-batch-pardo-$(date '+%m%d%H%M%S' --utc) - name: run ParDo Dataflow Batch Go Load Test 1 (10 iterations) diff --git a/.github/workflows/beam_LoadTests_Go_ParDo_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Go_ParDo_Flink_Batch.yml index 34211f9270ff2..c6929905d429b 100644 --- a/.github/workflows/beam_LoadTests_Go_ParDo_Flink_Batch.yml +++ b/.github/workflows/beam_LoadTests_Go_ParDo_Flink_Batch.yml @@ -16,10 +16,8 @@ name: LoadTests Go ParDo Flink Batch on: - issue_comment: - types: [created] schedule: - - cron: '40 2 * * *' + - cron: '50 12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,13 +38,15 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} GCLOUD_ZONE: us-central1-a CLUSTER_NAME: beam-loadtests-go-pardo-flink-batch-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster @@ -62,7 +62,7 @@ jobs: beam_LoadTests_Go_ParDo_Flink_Batch: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Load Tests Go ParDo Flink Batch' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 720 @@ -79,16 +79,18 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: test-type: load test-language: go argument-file-paths: | - ${{ github.workspace 
}}/.github/workflows/load-tests-job-configs/go_ParDo_Flink_Batch_10_times.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/go_ParDo_Flink_Batch_200_times.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/go_ParDo_Flink_Batch_10_counters.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/go_ParDo_Flink_Batch_100_counters.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_ParDo_Flink_Batch_10_times.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_ParDo_Flink_Batch_200_times.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_ParDo_Flink_Batch_10_counters.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_ParDo_Flink_Batch_100_counters.txt arguments: | --job_name=load-tests-go-flink-batch-pardo-$(date '+%m%d%H%M%S' --utc) - name: Start Flink with parallelism 5 diff --git a/.github/workflows/beam_LoadTests_Go_SideInput_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Go_SideInput_Dataflow_Batch.yml index cad15e4eae0cb..181365e2d5612 100644 --- a/.github/workflows/beam_LoadTests_Go_SideInput_Dataflow_Batch.yml +++ b/.github/workflows/beam_LoadTests_Go_SideInput_Dataflow_Batch.yml @@ -13,13 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: Load Tests SideInput Dataflow Batch Go +name: LoadTests Go SideInput Dataflow Batch on: - issue_comment: - types: [created] schedule: - - cron: '40 23 * * *' + - cron: '50 12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_LoadTests_Go_SideInput_Dataflow_Batch: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Load Tests Go SideInput Dataflow Batch' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 720 @@ -69,14 +69,16 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: test-type: load test-language: go argument-file-paths: | - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/config_SideInput_Go_Batch_Dataflow_First_Iterable.txt - ${{ github.workspace 
}}/.github/workflows/load-tests-job-configs/config_SideInput_Go_Batch_Dataflow_Iterable.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_SideInput_Dataflow_Batch_First_Iterable.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_SideInput_Dataflow_Batch_Iterable.txt arguments: | --job_name=load-tests-go-dataflow-batch-sideinput-$(date '+%m%d%H%M%S' --utc) - name: run SideInput Dataflow Batch Go Load Test 1 (first iterable) diff --git a/.github/workflows/beam_LoadTests_Go_SideInput_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Go_SideInput_Flink_Batch.yml index 8600c5cd37177..955c54c238ca1 100644 --- a/.github/workflows/beam_LoadTests_Go_SideInput_Flink_Batch.yml +++ b/.github/workflows/beam_LoadTests_Go_SideInput_Flink_Batch.yml @@ -16,10 +16,8 @@ name: LoadTests Go SideInput Flink Batch on: - issue_comment: - types: [created] schedule: - - cron: '40 23 * * *' + - cron: '10 13 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,13 +38,15 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} GCLOUD_ZONE: us-central1-a CLUSTER_NAME: beam-loadtests-go-sideinput-flink-batch-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster @@ -62,7 +62,7 @@ jobs: beam_LoadTests_Go_SideInput_Flink_Batch: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Load Tests Go SideInput Flink Batch' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 720 @@ -79,14 +79,16 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: test-type: load test-language: go argument-file-paths: | - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/go_SideInput_Flink_Batch_First_Iterable.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/go_SideInput_Flink_Batch_Iterable.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_SideInput_Flink_Batch_First_Iterable.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/go_SideInput_Flink_Batch_Iterable.txt arguments: | --job_name=load-tests-go-flink-batch-sideinput-$(date '+%m%d%H%M%S' --utc) - name: Start Flink with parallelism 10 diff --git a/.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Batch.yml 
b/.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Batch.yml new file mode 100644 index 0000000000000..265e3dfc9d38c --- /dev/null +++ b/.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Batch.yml @@ -0,0 +1,116 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: LoadTests Java CoGBK Dataflow Batch + +on: + schedule: + - cron: '10 13 * * *' + workflow_dispatch: + +# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Java_CoGBK_Dataflow_Batch: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Java CoGBK Dataflow Batch' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Java_CoGBK_Dataflow_Batch"] + job_phrase: ["Run Load Tests Java CoGBK Dataflow Batch"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_100b_Single_Key.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_100b_Multiple_Keys.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_10kB.txt + ${{ github.workspace 
}}/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_2MB.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run CoGBK 2GB 100 byte records - single key + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_Dataflow_Batch_test_arguments_1 }} --appName=load_tests_Java_Dataflow_batch_CoGBK_1' \ + - name: run CoGBK 2GB 100 byte records - multiple keys + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_Dataflow_Batch_test_arguments_2 }} --appName=load_tests_Java_Dataflow_batch_CoGBK_2' \ + - name: run CoGBK 2GB reiteration 10kB value + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_Dataflow_Batch_test_arguments_3 }} --appName=load_tests_Java_Dataflow_batch_CoGBK_3' \ + - name: run CoGBK 2GB reiteration 2MB value + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_Dataflow_Batch_test_arguments_4 }} --appName=load_tests_Java_Dataflow_batch_CoGBK_4' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming.yml b/.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming.yml index 0cb601522a818..ffb38e34a4541 100644 --- a/.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming.yml +++ b/.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming.yml @@ -13,13 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-name: Load Tests CoGBK Dataflow Streaming Java +name: LoadTests Java CoGBK Dataflow Streaming on: - issue_comment: - types: [created] schedule: - - cron: '50 10 * * *' + - cron: '10 13 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,26 +38,28 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_LoadTests_Java_CoGBK_Dataflow_Streaming: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Load Tests Java CoGBK Dataflow Streaming' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 240 name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) strategy: matrix: - job_name: [ "beam_LoadTests_Java_CoGBK_Dataflow_Streaming" ] + job_name: ["beam_LoadTests_Java_CoGBK_Dataflow_Streaming"] job_phrase: ["Run Load Tests Java CoGBK Dataflow Streaming"] steps: - uses: actions/checkout@v4 @@ -69,18 +69,19 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Prepare configs - id: set_configs - shell: bash - run: | - CURCONFIG=$(grep -v "^#.*" ./.github/workflows/load-tests-job-configs/config_CoGBK_Java_Streaming_2GB_SingleKey.txt | tr '\n' ' ') - echo "prepared_config_1=$CURCONFIG" >> $GITHUB_OUTPUT - CURCONFIG=$(grep -v "^#.*" ./.github/workflows/load-tests-job-configs/config_CoGBK_Java_Streaming_2GB_MultipleKey.txt | tr '\n' ' ') - echo "prepared_config_2=$CURCONFIG" >> $GITHUB_OUTPUT - CURCONFIG=$(grep -v "^#.*" ./.github/workflows/load-tests-job-configs/config_CoGBK_Java_Streaming_2GB_Reiteration_10KB.txt | tr '\n' ' ') - echo "prepared_config_3=$CURCONFIG" >> $GITHUB_OUTPUT - CURCONFIG=$(grep -v "^#.*" ./.github/workflows/load-tests-job-configs/config_CoGBK_Java_Streaming_2GB_Reiteration_2MB.txt | tr '\n' ' ') - echo "prepared_config_4=$CURCONFIG" >> $GITHUB_OUTPUT + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Streaming_2GB_SingleKey.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Streaming_2GB_MultipleKey.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Streaming_2GB_Reiteration_10KB.txt + ${{ github.workspace 
}}/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Streaming_2GB_Reiteration_2MB.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run CoGBK Dataflow Streaming Java Load Test 1 (single key) uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -88,7 +89,7 @@ jobs: arguments: | -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ -Prunner=:runners:google-cloud-dataflow-java \ - '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_1 }}' \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_Dataflow_Streaming_test_arguments_1 }}' \ - name: run CoGBK Dataflow Streaming Java Load Test 2 (multiple key) uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -96,7 +97,7 @@ jobs: arguments: | -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ -Prunner=:runners:google-cloud-dataflow-java \ - '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_2 }}' \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_Dataflow_Streaming_test_arguments_2 }}' \ - name: run CoGBK Dataflow Streaming Java Load Test 3 (reiteration 10KB value) uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -104,7 +105,7 @@ jobs: arguments: | -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ -Prunner=:runners:google-cloud-dataflow-java \ - '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_3 }}' \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_Dataflow_Streaming_test_arguments_3 }}' \ - name: run CoGBK Dataflow Streaming Java Load Test 4 (reiteration 2MB value) uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -112,7 +113,7 @@ jobs: arguments: | -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ -Prunner=:runners:google-cloud-dataflow-java \ - '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_4 }}' \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_Dataflow_Streaming_test_arguments_4 }}' \ - name: Archive JUnit Test Results uses: actions/upload-artifact@v3 if: failure() diff --git a/.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_V2_Batch_JavaVersions.yml b/.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_V2_Batch_JavaVersions.yml new file mode 100644 index 0000000000000..b1cc0bc7b147d --- /dev/null +++ b/.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_V2_Batch_JavaVersions.yml @@ -0,0 +1,132 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
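Note: the pattern above recurs throughout this diff: the inline "Prepare configs" shell steps are removed in favor of the shared test-arguments-action. A minimal sketch of the flattening that action presumably performs per argument file, assuming it mirrors the grep/tr logic it replaces (the local variable name is illustrative):

    # Drop "#" comment lines, join the remaining one-option-per-line entries
    # into a single string, and publish it for later steps as
    # <job_name>_test_arguments_<N>.
    ARGS=$(grep -v "^#.*" ./.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Streaming_2GB_SingleKey.txt | tr '\n' ' ')
    echo "beam_LoadTests_Java_CoGBK_Dataflow_Streaming_test_arguments_1=$ARGS" >> $GITHUB_ENV

The run steps then interpolate these env vars into -PloadTest.args.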
+ +name: LoadTests Java CoGBK Dataflow V2 Batch JavaVersions + +on: + schedule: + - cron: '10 13 * * *' + workflow_dispatch: + +# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Java_CoGBK_Dataflow_V2_Batch_JavaVersions: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + (contains(github.event.comment.body, 'Run Load Tests Java') && + contains(github.event.comment.body, 'CoGBK Dataflow V2 Batch')) + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase_1 }} ${{ matrix.java_version }} ${{ matrix.job_phrase_2 }}) + strategy: + fail-fast: false + matrix: + job_name: ["beam_LoadTests_Java_CoGBK_Dataflow_V2_Batch_JavaVersions"] + job_phrase_1: ["Run Load Tests Java"] + job_phrase_2: ["CoGBK Dataflow V2 Batch"] + java_version: ['11','17'] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase_1 }} ${{ matrix.java_version }} ${{ matrix.job_phrase_2 }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase_1 }} ${{ matrix.java_version }} ${{ matrix.job_phrase_2 }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + java-version: ${{ matrix.java_version }} + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_100b_Single_Key.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_100b_Multiple_Keys.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_10kB.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_2MB.txt + arguments: | + --influxTags={\"runnerVersion\":\"v2\",\"jdk\":\"java${{ matrix.java_version }}\"} + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run CoGBK 2GB 100 byte records - single key + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PtestJavaVersion=${{ matrix.java_version }} \ + -Pjava${{ matrix.java_version }}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \ + 
-PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_Dataflow_V2_Batch_JavaVersions_test_arguments_1 }} --appName=load_tests_Java${{ matrix.java_version }}_Dataflow_V2_batch_CoGBK_1' \ + - name: run CoGBK 2GB 100 byte records - multiple keys + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PtestJavaVersion=${{ matrix.java_version }} \ + -Pjava${{ matrix.java_version }}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \ + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_Dataflow_V2_Batch_JavaVersions_test_arguments_2 }} --appName=load_tests_Java${{ matrix.java_version }}_Dataflow_V2_batch_CoGBK_2' \ + - name: run CoGBK 2GB reiteration 10kB value + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PtestJavaVersion=${{ matrix.java_version }} \ + -Pjava${{ matrix.java_version }}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \ + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_Dataflow_V2_Batch_JavaVersions_test_arguments_3 }} --appName=load_tests_Java${{ matrix.java_version }}_Dataflow_V2_batch_CoGBK_3' \ + - name: run CoGBK 2GB reiteration 2MB value + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PtestJavaVersion=${{ matrix.java_version }} \ + -Pjava${{ matrix.java_version }}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \ + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_Dataflow_V2_Batch_JavaVersions_test_arguments_4 }} --appName=load_tests_Java${{ matrix.java_version }}_Dataflow_V2_batch_CoGBK_4' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_V2_Streaming_JavaVersions.yml b/.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_V2_Streaming_JavaVersions.yml new file mode 100644 index 0000000000000..2b38f2e964824 --- /dev/null +++ b/.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_V2_Streaming_JavaVersions.yml @@ -0,0 +1,132 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
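Note: with the java_version matrix above, the escaped --influxTags argument expands per JDK before being appended to the prepared test arguments; for java_version '11' it resolves to

    --influxTags={"runnerVersion":"v2","jdk":"java11"}

so runs of the same load test can be distinguished in InfluxDB by runner version and JDK.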
+ +name: LoadTests Java CoGBK Dataflow V2 Streaming JavaVersions + +on: + schedule: + - cron: '10 13 * * *' + workflow_dispatch: + +# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Java_CoGBK_Dataflow_V2_Streaming_JavaVersions: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + (contains(github.event.comment.body, 'Run Load Tests Java') && + contains(github.event.comment.body, 'CoGBK Dataflow V2 Streaming')) + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 240 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase_1 }} ${{ matrix.java_version }} ${{ matrix.job_phrase_2 }}) + strategy: + fail-fast: false + matrix: + job_name: ["beam_LoadTests_Java_CoGBK_Dataflow_V2_Streaming_JavaVersions"] + job_phrase_1: ["Run Load Tests Java"] + job_phrase_2: ["CoGBK Dataflow V2 Streaming"] + java_version: ['11','17'] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase_1 }} ${{ matrix.java_version }} ${{ matrix.job_phrase_2 }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase_1 }} ${{ matrix.java_version }} ${{ matrix.job_phrase_2 }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + java-version: ${{ matrix.java_version }} + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_V2_Streaming_Java_100b_Single_Key.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_V2_Streaming_Java_100b_Multiple_Keys.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_V2_Streaming_Java_10kB.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_V2_Streaming_Java_2MB.txt + arguments: | + --influxTags={\"runnerVersion\":\"v2\",\"jdk\":\"java${{ matrix.java_version }}\"} + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run CoGBK 2GB 100 byte records - single key + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PtestJavaVersion=${{ matrix.java_version }} \ + -Pjava${{ matrix.java_version 
}}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \ + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_Dataflow_V2_Streaming_JavaVersions_test_arguments_1 }} --appName=load_tests_Java${{ matrix.java_version }}_Dataflow_V2_streaming_CoGBK_1' \ + - name: run CoGBK 2GB 100 byte records - multiple keys + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PtestJavaVersion=${{ matrix.java_version }} \ + -Pjava${{ matrix.java_version }}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \ + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_Dataflow_V2_Streaming_JavaVersions_test_arguments_2 }} --appName=load_tests_Java${{ matrix.java_version }}_Dataflow_V2_streaming_CoGBK_2' \ + - name: run CoGBK 2GB reiteration 10kB value + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PtestJavaVersion=${{ matrix.java_version }} \ + -Pjava${{ matrix.java_version }}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \ + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_Dataflow_V2_Streaming_JavaVersions_test_arguments_3 }} --appName=load_tests_Java${{ matrix.java_version }}_Dataflow_V2_streaming_CoGBK_3' \ + - name: run CoGBK 2GB reiteration 2MB value + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PtestJavaVersion=${{ matrix.java_version }} \ + -Pjava${{ matrix.java_version }}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \ + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_Dataflow_V2_Streaming_JavaVersions_test_arguments_4 }} --appName=load_tests_Java${{ matrix.java_version }}_Dataflow_V2_streaming_CoGBK_4' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Java_CoGBK_SparkStructuredStreaming_Batch.yml b/.github/workflows/beam_LoadTests_Java_CoGBK_SparkStructuredStreaming_Batch.yml new file mode 100644 index 0000000000000..1fd32911dc7f6 --- /dev/null +++ b/.github/workflows/beam_LoadTests_Java_CoGBK_SparkStructuredStreaming_Batch.yml @@ -0,0 +1,116 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
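Note: the pipeline-options .txt files referenced in argument-file-paths hold one pipeline option per line, with "#" comment lines stripped during argument preparation. A hypothetical illustration of the format only; the option names are common Beam load-test options and the values are invented, not the file's real contents:

    # java_CoGBK_Dataflow_V2_Streaming_Java_100b_Single_Key.txt (illustrative)
    --influxMeasurement=java_streaming_cogbk_1
    --sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90}
    --iterations=1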
+ +name: LoadTests Java CoGBK SparkStructuredStreaming Batch + +on: + schedule: + - cron: '50 13 * * *' + workflow_dispatch: + +# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Java_CoGBK_SparkStructuredStreaming_Batch: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Java CoGBK SparkStructuredStreaming Batch' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Java_CoGBK_SparkStructuredStreaming_Batch"] + job_phrase: ["Run Load Tests Java CoGBK SparkStructuredStreaming Batch"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_CoGBK_SparkStructuredStreaming_Batch_100b_Single_Key.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_CoGBK_SparkStructuredStreaming_Batch_100b_Multiple_Keys.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_CoGBK_SparkStructuredStreaming_Batch_10kB.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_CoGBK_SparkStructuredStreaming_Batch_2MB.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run CoGBK 2GB 100 byte records - single key + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ + -Prunner=:runners:spark:3 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_SparkStructuredStreaming_Batch_test_arguments_1 }}' \ + - name: run CoGBK 2GB 100 byte records - multiple keys + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ + 
-Prunner=:runners:spark:3 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_SparkStructuredStreaming_Batch_test_arguments_2 }}' \ + - name: run CoGBK 2GB reiteration 10kB value + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ + -Prunner=:runners:spark:3 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_SparkStructuredStreaming_Batch_test_arguments_3 }}' \ + - name: run CoGBK 2GB reiteration 2MB value + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \ + -Prunner=:runners:spark:3 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_CoGBK_SparkStructuredStreaming_Batch_test_arguments_4 }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Java_Combine_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Java_Combine_Dataflow_Batch.yml index 758d196f273cc..53c7a000d3850 100644 --- a/.github/workflows/beam_LoadTests_Java_Combine_Dataflow_Batch.yml +++ b/.github/workflows/beam_LoadTests_Java_Combine_Dataflow_Batch.yml @@ -13,13 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: Load Tests Combine Dataflow Batch Java +name: LoadTests Java Combine Dataflow Batch on: - issue_comment: - types: [created] schedule: - - cron: '35 7 * * *' + - cron: '50 13 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_LoadTests_Java_Combine_Dataflow_Batch: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Load Tests Java Combine Dataflow Batch' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 720 @@ -69,15 +69,17 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: test-type: load test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/config_Combine_Java_Dataflow_Batch_10b.txt - ${{ github.workspace 
}}/.github/workflows/load-tests-job-configs/config_Combine_Java_Dataflow_Batch_Fanout_4.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/config_Combine_Java_Dataflow_Batch_Fanout_8.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_Combine_Dataflow_Batch_10b.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_Combine_Dataflow_Batch_Fanout_4.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_Combine_Dataflow_Batch_Fanout_8.txt - name: run Combine Dataflow Batch Java Load Test 1 (10 b records) uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_LoadTests_Java_Combine_Dataflow_Streaming.yml b/.github/workflows/beam_LoadTests_Java_Combine_Dataflow_Streaming.yml index b0df52354722b..a19db5ff90112 100644 --- a/.github/workflows/beam_LoadTests_Java_Combine_Dataflow_Streaming.yml +++ b/.github/workflows/beam_LoadTests_Java_Combine_Dataflow_Streaming.yml @@ -16,10 +16,8 @@ name: LoadTests Java Combine Dataflow Streaming on: - issue_comment: - types: [created] schedule: - - cron: '25 14 * * *' + - cron: '50 13 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,22 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_LoadTests_Java_Combine_Dataflow_Streaming_test_arguments_1: '' - beam_LoadTests_Java_Combine_Dataflow_Streaming_test_arguments_2: '' - beam_LoadTests_Java_Combine_Dataflow_Streaming_test_arguments_3: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_LoadTests_Java_Combine_Dataflow_Streaming: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Load Tests Java Combine Dataflow Streaming' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 240 @@ -72,15 +69,18 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: test-type: load test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/java_Combine_Dataflow_Streaming_10b.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/java_Combine_Dataflow_Streaming_Fanout_4.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/java_Combine_Dataflow_Streaming_Fanout_8.txt + ${{ github.workspace 
}}/.github/workflows/load-tests-pipeline-options/java_Combine_Dataflow_Streaming_10b.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_Combine_Dataflow_Streaming_Fanout_4.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_Combine_Dataflow_Streaming_Fanout_8.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run Combine Dataflow Streaming Java Load Test 1 (10 b records) uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_LoadTests_Java_Combine_SparkStructuredStreaming_Batch.yml b/.github/workflows/beam_LoadTests_Java_Combine_SparkStructuredStreaming_Batch.yml index 28c17ffb535c8..1f9a5a7a96392 100644 --- a/.github/workflows/beam_LoadTests_Java_Combine_SparkStructuredStreaming_Batch.yml +++ b/.github/workflows/beam_LoadTests_Java_Combine_SparkStructuredStreaming_Batch.yml @@ -16,10 +16,8 @@ name: LoadTests Java Combine SparkStructuredStreaming Batch on: - issue_comment: - types: [created] schedule: - - cron: '15 18 * * *' + - cron: '50 13 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,22 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_LoadTests_Java_Combine_SparkStructuredStreaming_Batch_test_arguments_1: '' - beam_LoadTests_Java_Combine_SparkStructuredStreaming_Batch_test_arguments_2: '' - beam_LoadTests_Java_Combine_SparkStructuredStreaming_Batch_test_arguments_3: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_LoadTests_Java_Combine_SparkStructuredStreaming_Batch: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Load Tests Java Combine SparkStructuredStreaming Batch' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 720 @@ -72,15 +69,18 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: test-type: load test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/java_Combine_SparkStructuredStreaming_Batch_10b.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/java_Combine_SparkStructuredStreaming_Batch_Fanout_4.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/java_Combine_SparkStructuredStreaming_Batch_Fanout_8.txt + ${{ 
github.workspace }}/.github/workflows/load-tests-pipeline-options/java_Combine_SparkStructuredStreaming_Batch_10b.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_Combine_SparkStructuredStreaming_Batch_Fanout_4.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_Combine_SparkStructuredStreaming_Batch_Fanout_8.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run Combine SparkStructuredStreaming Batch Java Load Test 1 (10b records) uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_LoadTests_Java_GBK_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Java_GBK_Dataflow_Batch.yml new file mode 100644 index 0000000000000..c9ab32def649b --- /dev/null +++ b/.github/workflows/beam_LoadTests_Java_GBK_Dataflow_Batch.yml @@ -0,0 +1,143 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: LoadTests Java GBK Dataflow Batch + +on: + schedule: + - cron: '50 13 * * *' + workflow_dispatch: + +# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: write + checks: write + contents: read + deployments: read + id-token: none + issues: write + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Java_GBK_Dataflow_Batch: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Java GBK Dataflow Batch' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 240 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Java_GBK_Dataflow_Batch"] + job_phrase: ["Run Load Tests Java GBK Dataflow Batch"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ 
secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_2GB_of_10B_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_2GB_of_100B_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_2GB_of_100kB_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_reiterate_4_times_10kB_values.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_reiterate_4_times_2MB_values.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run Load test 2GB of 10B records + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_Batch_test_arguments_1 }}' \ + - name: run Load test 2GB of 100B records + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_Batch_test_arguments_2 }}' \ + - name: run Load test 2GB of 100kB records + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_Batch_test_arguments_3 }}' \ + - name: run Load test fanout 4 times with 2GB 10-byte records total + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_Batch_test_arguments_4 }}' \ + - name: run Load test fanout 8 times with 2GB 10-byte records total + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_Batch_test_arguments_5 }}' \ + - name: run Load test reiterate 4 times 10kB values + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + 
-PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_Batch_test_arguments_6 }}' \ + - name: run Load test reiterate 4 times 2MB values + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_Batch_test_arguments_7 }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Java_GBK_Dataflow_Streaming.yml b/.github/workflows/beam_LoadTests_Java_GBK_Dataflow_Streaming.yml new file mode 100644 index 0000000000000..c73c7f084437c --- /dev/null +++ b/.github/workflows/beam_LoadTests_Java_GBK_Dataflow_Streaming.yml @@ -0,0 +1,143 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: LoadTests Java GBK Dataflow Streaming + +on: + schedule: + - cron: '10 14 * * *' + workflow_dispatch: + +# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: write + checks: write + contents: read + deployments: read + id-token: none + issues: write + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Java_GBK_Dataflow_Streaming: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Java GBK Dataflow Streaming' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 240 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Java_GBK_Dataflow_Streaming"] + job_phrase: ["Run Load Tests Java GBK Dataflow Streaming"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ 
matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_2GB_of_10B_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_2GB_of_100B_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_2GB_of_100kB_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_fanout_4_times_with_2GB_10-byte_records_total.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_fanout_8_times_with_2GB_10-byte_records_total.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_reiterate_4_times_10kB_values.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_reiterate_4_times_2MB_values.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run Load test 2GB of 10B records + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_Streaming_test_arguments_1 }}' \ + - name: run Load test 2GB of 100B records + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_Streaming_test_arguments_2 }}' \ + - name: run Load test 2GB of 100kB records + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_Streaming_test_arguments_3 }}' \ + - name: run Load test fanout 4 times with 2GB 10-byte records total + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_Streaming_test_arguments_4 }}' \ + - name: run Load test fanout 8 times with 2GB 10-byte records total + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_Streaming_test_arguments_5 }}' \ + - name: run Load test reiterate 4 times 10kB values + uses: ./.github/actions/gradle-command-self-hosted-action + with: + 
gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_Streaming_test_arguments_6 }}' \ + - name: run Load test reiterate 4 times 2MB values + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_Streaming_test_arguments_7 }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java11.yml b/.github/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java11.yml new file mode 100644 index 0000000000000..a2e60076e19a5 --- /dev/null +++ b/.github/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java11.yml @@ -0,0 +1,166 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
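Note: the per-JDK Gradle flags in the steps below combine into roughly the following invocation (a sketch based only on the flags visible in this diff; the trailing argument string is abbreviated):

    ./gradlew :sdks:java:testing:load-tests:run \
      -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \
      -Prunner=:runners:google-cloud-dataflow-java \
      -Prunner.version=V2 \
      -PtestJavaVersion=11 \
      -Pjava11Home=$JAVA_HOME_11_X64 \
      '-PloadTest.args=<flattened options from the .txt files>'

where JAVA_HOME_11_X64 points at the JDK preinstalled in the runner environment.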
+ +name: LoadTests Java GBK Dataflow V2 Batch Java11 + +on: + schedule: + - cron: '10 14 * * *' + workflow_dispatch: + +# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: write + checks: write + contents: read + deployments: read + id-token: none + issues: write + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java11: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Java 11 GBK Dataflow V2 Batch' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 240 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java11"] + job_phrase: ["Run Load Tests Java 11 GBK Dataflow V2 Batch"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + java-version: 11 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_2GB_of_10B_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_2GB_of_100B_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_2GB_of_100kB_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_fanout_4_times_with_2GB_10-byte_records_total.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_fanout_8_times_with_2GB_10-byte_records_total.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_reiterate_4_times_10kB_values.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_reiterate_4_times_2MB_values.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run Load test 2GB of 10B records + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + 
-PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=11 \ + -Pjava11Home=$JAVA_HOME_11_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java11_test_arguments_1 }}' \ + - name: run Load test 2GB of 100B records + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=11 \ + -Pjava11Home=$JAVA_HOME_11_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java11_test_arguments_2 }}' \ + - name: run Load test 2GB of 100kB records + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=11 \ + -Pjava11Home=$JAVA_HOME_11_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java11_test_arguments_3 }}' \ + - name: run Load test fanout 4 times with 2GB 10-byte records total + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=11 \ + -Pjava11Home=$JAVA_HOME_11_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java11_test_arguments_4 }}' \ + - name: run Load test fanout 8 times with 2GB 10-byte records total + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=11 \ + -Pjava11Home=$JAVA_HOME_11_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java11_test_arguments_5 }}' \ + - name: run Load test reiterate 4 times 10kB values + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=11 \ + -Pjava11Home=$JAVA_HOME_11_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java11_test_arguments_6 }}' \ + - name: run Load test reiterate 4 times 2MB values + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=11 \ + -Pjava11Home=$JAVA_HOME_11_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java11_test_arguments_7 }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java17.yml b/.github/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java17.yml new file mode 100644 index 0000000000000..7a658b2cfdc55 --- /dev/null +++ 
b/.github/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java17.yml @@ -0,0 +1,168 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: LoadTests Java GBK Dataflow V2 Batch Java17 + +on: + schedule: + - cron: '10 14 * * *' + workflow_dispatch: + +# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: write + checks: write + contents: read + deployments: read + id-token: none + issues: write + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java17: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Java 17 GBK Dataflow V2 Batch' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 240 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java17"] + job_phrase: ["Run Load Tests Java 17 GBK Dataflow V2 Batch"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + java-version: | + 17 + 8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_2GB_of_10B_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_2GB_of_100B_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_2GB_of_100kB_records.txt + ${{ github.workspace 
}}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_fanout_4_times_with_2GB_10-byte_records_total.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_fanout_8_times_with_2GB_10-byte_records_total.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_reiterate_4_times_10kB_values.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_reiterate_4_times_2MB_values.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run Load test 2GB of 10B records + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=17 \ + -Pjava17Home=$JAVA_HOME_17_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java17_test_arguments_1 }}' \ + - name: run Load test 2GB of 100B records + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=17 \ + -Pjava17Home=$JAVA_HOME_17_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java17_test_arguments_2 }}' \ + - name: run Load test 2GB of 100kB records + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=17 \ + -Pjava17Home=$JAVA_HOME_17_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java17_test_arguments_3 }}' \ + - name: run Load test fanout 4 times with 2GB 10-byte records total + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=17 \ + -Pjava17Home=$JAVA_HOME_17_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java17_test_arguments_4 }}' \ + - name: run Load test fanout 8 times with 2GB 10-byte records total + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=17 \ + -Pjava17Home=$JAVA_HOME_17_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java17_test_arguments_5 }}' \ + - name: run Load test reiterate 4 times 10kB values + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=17 \ + -Pjava17Home=$JAVA_HOME_17_X64 \ + 
'-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java17_test_arguments_6 }}' \ + - name: run Load test reiterate 4 times 2MB values + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=17 \ + -Pjava17Home=$JAVA_HOME_17_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java17_test_arguments_7 }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java11.yml b/.github/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java11.yml new file mode 100644 index 0000000000000..6595c9b00e890 --- /dev/null +++ b/.github/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java11.yml @@ -0,0 +1,166 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: LoadTests Java GBK Dataflow V2 Streaming Java11 + +on: + schedule: + - cron: '10 14 * * *' + workflow_dispatch: + +# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: write + checks: write + contents: read + deployments: read + id-token: none + issues: write + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java11: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Java 11 GBK Dataflow V2 Streaming' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 240 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java11"] + job_phrase: ["Run Load Tests Java 11 GBK Dataflow V2 Streaming"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: 
./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + java-version: 11 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_2GB_of_10B_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_2GB_of_100B_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_2GB_of_100kB_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_fanout_4_times_with_2GB_10-byte_records_total.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_fanout_8_times_with_2GB_10-byte_records_total.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_reiterate_4_times_10kB_values.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_reiterate_4_times_2MB_values.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run Load test 2GB of 10B records + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=11 \ + -Pjava11Home=$JAVA_HOME_11_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java11_test_arguments_1 }}' \ + - name: run Load test 2GB of 100B records + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=11 \ + -Pjava11Home=$JAVA_HOME_11_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java11_test_arguments_2 }}' \ + - name: run Load test 2GB of 100kB records + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=11 \ + -Pjava11Home=$JAVA_HOME_11_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java11_test_arguments_3 }}' \ + - name: run Load test fanout 4 times with 2GB 10-byte records total + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=11 \ + -Pjava11Home=$JAVA_HOME_11_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java11_test_arguments_4 }}' \ + - name: run 
Load test fanout 8 times with 2GB 10-byte records total + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=11 \ + -Pjava11Home=$JAVA_HOME_11_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java11_test_arguments_5 }}' \ + - name: run Load test reiterate 4 times 10kB values + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=11 \ + -Pjava11Home=$JAVA_HOME_11_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java11_test_arguments_6 }}' \ + - name: run Load test reiterate 4 times 2MB values + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=11 \ + -Pjava11Home=$JAVA_HOME_11_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java11_test_arguments_7 }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java17.yml b/.github/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java17.yml new file mode 100644 index 0000000000000..33f5c26a86cb9 --- /dev/null +++ b/.github/workflows/beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java17.yml @@ -0,0 +1,168 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
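+
+# A worked example of the concurrency expression used below (values are illustrative):
+# GitHub's `a || b` yields the first truthy operand. On a scheduled run,
+# github.event.issue.number is empty, so the key falls back to github.sha, and
+# github.event.schedule carries the cron string, giving roughly
+#   'LoadTests Java GBK Dataflow V2 Streaming Java17 @ <commit-sha>-10 14 * * *'
+# On a trigger-phrase comment, github.event.issue.number and github.event.comment.id
+# are both set, so the key becomes '<workflow> @ <issue-number>-<comment-id>'.
+# Either way, cancel-in-progress: true lets a newer queued run cancel an older
+# in-progress run that shares the same key.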
+ +name: LoadTests Java GBK Dataflow V2 Streaming Java17 + +on: + schedule: + - cron: '10 14 * * *' + workflow_dispatch: + +# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: write + checks: write + contents: read + deployments: read + id-token: none + issues: write + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java17: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Java 17 GBK Dataflow V2 Streaming' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 240 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java17"] + job_phrase: ["Run Load Tests Java 17 GBK Dataflow V2 Streaming"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + java-version: | + 17 + 8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_2GB_of_10B_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_2GB_of_100B_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_2GB_of_100kB_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_fanout_4_times_with_2GB_10-byte_records_total.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_fanout_8_times_with_2GB_10-byte_records_total.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_reiterate_4_times_10kB_values.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_reiterate_4_times_2MB_values.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run Load test 2GB of 10B records + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + 
-PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=17 \ + -Pjava17Home=$JAVA_HOME_17_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java17_test_arguments_1 }}' \ + - name: run Load test 2GB of 100B records + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=17 \ + -Pjava17Home=$JAVA_HOME_17_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java17_test_arguments_2 }}' \ + - name: run Load test 2GB of 100kB records + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=17 \ + -Pjava17Home=$JAVA_HOME_17_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java17_test_arguments_3 }}' \ + - name: run Load test fanout 4 times with 2GB 10-byte records total + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=17 \ + -Pjava17Home=$JAVA_HOME_17_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java17_test_arguments_4 }}' \ + - name: run Load test fanout 8 times with 2GB 10-byte records total + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=17 \ + -Pjava17Home=$JAVA_HOME_17_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java17_test_arguments_5 }}' \ + - name: run Load test reiterate 4 times 10kB values + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=17 \ + -Pjava17Home=$JAVA_HOME_17_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java17_test_arguments_6 }}' \ + - name: run Load test reiterate 4 times 2MB values + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=17 \ + -Pjava17Home=$JAVA_HOME_17_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java17_test_arguments_7 }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Java_GBK_Smoke.yml b/.github/workflows/beam_LoadTests_Java_GBK_Smoke.yml new file mode 100644 index 0000000000000..cf31693a7884d --- /dev/null +++ 
b/.github/workflows/beam_LoadTests_Java_GBK_Smoke.yml @@ -0,0 +1,118 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: LoadTests Java GBK Smoke + +on: + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.comment.body || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Java_GBK_Smoke: + if: | + github.event_name == 'workflow_dispatch' || + github.event.comment.body == 'Run Java Load Tests GBK Smoke' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Java_GBK_Smoke"] + job_phrase: ["Run Java Load Tests GBK Smoke"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_Smoke_GroupByKey_Direct.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_Smoke_GroupByKey_Dataflow.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_Smoke_GroupByKey_Flink.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_Smoke_GroupByKey_Spark.txt + - name: Set current datetime + id: datetime + run: | + echo "datetime=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_OUTPUT + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run GroupByKey load test Direct + uses: 
./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:direct-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Smoke_test_arguments_1 }}' \ + - name: run GroupByKey load test Dataflow + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Smoke_test_arguments_2 }}' \ + - name: run GroupByKey load test Flink + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + --info \ + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:flink:1.15 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Smoke_test_arguments_3 }}' \ + - name: run GroupByKey load test Spark + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.GroupByKeyLoadTest \ + -Prunner=:runners:spark:3 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_GBK_Smoke_test_arguments_4 }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Java_GBK_SparkStructuredStreaming_Batch.yml b/.github/workflows/beam_LoadTests_Java_GBK_SparkStructuredStreaming_Batch.yml index 039a9fc3b7521..95f1ed712e214 100644 --- a/.github/workflows/beam_LoadTests_Java_GBK_SparkStructuredStreaming_Batch.yml +++ b/.github/workflows/beam_LoadTests_Java_GBK_SparkStructuredStreaming_Batch.yml @@ -16,10 +16,8 @@ name: LoadTests Java GBK SparkStructuredStreaming Batch on: - issue_comment: - types: [created] schedule: - - cron: '10 10 * * *' + - cron: '50 14 * * *' workflow_dispatch: # Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_LoadTests_Java_GBK_SparkStructuredStreaming_Batch: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Load Tests Java GBK SparkStructuredStreaming Batch' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -69,19 +69,21 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ 
matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: test-type: load test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_2GB_of_10B_records.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_2GB_of_100B_records.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_2GB_of_100kB_records.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_reiterate_4_times_10kB_values.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_reiterate_4_times_2MB_values.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_2GB_of_10B_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_2GB_of_100B_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_2GB_of_100kB_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_reiterate_4_times_10kB_values.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_reiterate_4_times_2MB_values.txt # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run Load test 2GB of 10B records uses: ./.github/actions/gradle-command-self-hosted-action diff --git a/.github/workflows/beam_LoadTests_Java_ParDo_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Java_ParDo_Dataflow_Batch.yml index 6bd52d2ebae03..4b0453152a29e 100644 --- a/.github/workflows/beam_LoadTests_Java_ParDo_Dataflow_Batch.yml +++ b/.github/workflows/beam_LoadTests_Java_ParDo_Dataflow_Batch.yml @@ -16,10 +16,8 @@ name: LoadTests Java ParDo Dataflow Batch on: - issue_comment: - types: [created] schedule: - - cron: '55 9 * * *' + - cron: '50 14 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,23 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || 
github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_LoadTests_Java_ParDo_Dataflow_Batch_test_arguments_1: '' - beam_LoadTests_Java_ParDo_Dataflow_Batch_test_arguments_2: '' - beam_LoadTests_Java_ParDo_Dataflow_Batch_test_arguments_3: '' - beam_LoadTests_Java_ParDo_Dataflow_Batch_test_arguments_4: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_LoadTests_Java_ParDo_Dataflow_Batch: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Load Tests Java ParDo Dataflow Batch' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 720 @@ -73,16 +69,19 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: test-type: load test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Batch_10_times.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Batch_200_times.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Batch_10_counters.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Batch_100_counters.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Batch_10_times.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Batch_200_times.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Batch_10_counters.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Batch_100_counters.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run ParDo Dataflow Batch Java Load Test 1 (10 times) uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_LoadTests_Java_ParDo_Dataflow_Streaming.yml b/.github/workflows/beam_LoadTests_Java_ParDo_Dataflow_Streaming.yml index dbee457b6ec5e..e1644a068570a 100644 --- a/.github/workflows/beam_LoadTests_Java_ParDo_Dataflow_Streaming.yml +++ b/.github/workflows/beam_LoadTests_Java_ParDo_Dataflow_Streaming.yml @@ -16,10 +16,8 @@ name: LoadTests Java ParDo Dataflow Streaming on: - issue_comment: - types: [created] schedule: - - cron: '10 11 * * *' + - cron: '50 14 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,23 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || 
github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_LoadTests_Java_ParDo_Dataflow_Streaming_test_arguments_1: '' - beam_LoadTests_Java_ParDo_Dataflow_Streaming_test_arguments_2: '' - beam_LoadTests_Java_ParDo_Dataflow_Streaming_test_arguments_3: '' - beam_LoadTests_Java_ParDo_Dataflow_Streaming_test_arguments_4: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_LoadTests_Java_ParDo_Dataflow_Streaming: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Load Tests Java ParDo Dataflow Streaming' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 240 @@ -73,16 +69,19 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: test-type: load test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Streaming_10_times.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Streaming_200_times.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Streaming_10_counters.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Streaming_100_counters.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Streaming_10_times.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Streaming_200_times.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Streaming_10_counters.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Streaming_100_counters.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run ParDo Dataflow Streaming Java Load Test 1 (10 times) uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_LoadTests_Java_ParDo_Dataflow_V2_Batch_JavaVersions.yml b/.github/workflows/beam_LoadTests_Java_ParDo_Dataflow_V2_Batch_JavaVersions.yml new file mode 100644 index 0000000000000..0993409d5122e --- /dev/null +++ b/.github/workflows/beam_LoadTests_Java_ParDo_Dataflow_V2_Batch_JavaVersions.yml @@ -0,0 +1,136 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: LoadTests Java ParDo Dataflow V2 Batch JavaVersions + +on: + schedule: + - cron: '50 14 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Java_ParDo_Dataflow_V2_Batch_JavaVersions: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + (contains(github.event.comment.body, 'Run Load Tests Java') && + contains(github.event.comment.body, 'ParDo Dataflow V2 Batch')) + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase_1 }} ${{ matrix.java_version }} ${{ matrix.job_phrase_2 }}) + strategy: + fail-fast: false + matrix: + job_name: ["beam_LoadTests_Java_ParDo_Dataflow_V2_Batch_JavaVersions"] + job_phrase_1: ["Run Load Tests Java"] + job_phrase_2: ["ParDo Dataflow V2 Batch"] + java_version: ['11','17'] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase_1 }} ${{ matrix.java_version }} ${{ matrix.job_phrase_2 }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase_1 }} ${{ matrix.java_version }} ${{ matrix.job_phrase_2 }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + java-version: ${{ matrix.java_version }} + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Batch_JavaVersions_10_times.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Batch_JavaVersions_200_times.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Batch_JavaVersions_10_counters.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Batch_JavaVersions_100_counters.txt + arguments: | + 
--influxTags={\"runnerVersion\":\"v2\",\"jdk\":\"java${{ matrix.java_version }}\"} + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run ParDo Dataflow V2 Batch Java Load Test 1 (10 times) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.ParDoLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=${{ matrix.java_version }} \ + -Pjava${{ matrix.java_version }}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_ParDo_Dataflow_V2_Batch_JavaVersions_test_arguments_1 }} --appName=load_tests_Java${{ matrix.java_version }}_Dataflow_V2_batch_ParDo_1' \ + - name: run ParDo Dataflow V2 Batch Java Load Test 2 (200 times) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.ParDoLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=${{ matrix.java_version }} \ + -Pjava${{ matrix.java_version }}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_ParDo_Dataflow_V2_Batch_JavaVersions_test_arguments_2 }} --appName=load_tests_Java${{ matrix.java_version }}_Dataflow_V2_batch_ParDo_2' \ + - name: run ParDo Dataflow V2 Batch Java Load Test 3 (10 counters) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.ParDoLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=${{ matrix.java_version }} \ + -Pjava${{ matrix.java_version }}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_ParDo_Dataflow_V2_Batch_JavaVersions_test_arguments_3 }} --appName=load_tests_Java${{ matrix.java_version }}_Dataflow_V2_batch_ParDo_3' \ + - name: run ParDo Dataflow V2 Batch Java Load Test 4 (100 counters) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.ParDoLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=${{ matrix.java_version }} \ + -Pjava${{ matrix.java_version }}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_ParDo_Dataflow_V2_Batch_JavaVersions_test_arguments_4 }} --appName=load_tests_Java${{ matrix.java_version }}_Dataflow_V2_batch_ParDo_4' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Java_ParDo_Dataflow_V2_Streaming_JavaVersions.yml b/.github/workflows/beam_LoadTests_Java_ParDo_Dataflow_V2_Streaming_JavaVersions.yml new file mode 100644 index 0000000000000..24b32d5f21978 --- /dev/null +++ b/.github/workflows/beam_LoadTests_Java_ParDo_Dataflow_V2_Streaming_JavaVersions.yml @@ -0,0 +1,136 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: LoadTests Java ParDo Dataflow V2 Streaming JavaVersions + +on: + schedule: + - cron: '50 14 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Java_ParDo_Dataflow_V2_Streaming_JavaVersions: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + (contains(github.event.comment.body, 'Run Load Tests Java') && + contains(github.event.comment.body, 'ParDo Dataflow V2 Streaming')) + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase_1 }} ${{ matrix.java_version }} ${{ matrix.job_phrase_2 }}) + strategy: + fail-fast: false + matrix: + job_name: ["beam_LoadTests_Java_ParDo_Dataflow_V2_Streaming_JavaVersions"] + job_phrase_1: ["Run Load Tests Java"] + job_phrase_2: ["ParDo Dataflow V2 Streaming"] + java_version: ['11','17'] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase_1 }} ${{ matrix.java_version }} ${{ matrix.job_phrase_2 }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase_1 }} ${{ matrix.java_version }} ${{ matrix.job_phrase_2 }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + java-version: ${{ matrix.java_version }} + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Streaming_JavaVersions_10_times.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Streaming_JavaVersions_200_times.txt + ${{ github.workspace 
}}/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Streaming_JavaVersions_10_counters.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Streaming_JavaVersions_100_counters.txt + arguments: | + --influxTags={\"runnerVersion\":\"v2\",\"jdk\":\"java${{ matrix.java_version }}\"} + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run ParDo Dataflow V2 Streaming Java Load Test 1 (10 times) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.ParDoLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=${{ matrix.java_version }} \ + -Pjava${{ matrix.java_version }}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_ParDo_Dataflow_V2_Streaming_JavaVersions_test_arguments_1 }} --appName=load_tests_Java${{ matrix.java_version }}_Dataflow_V2_streaming_ParDo_1' \ + - name: run ParDo Dataflow V2 Streaming Java Load Test 2 (200 times) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.ParDoLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=${{ matrix.java_version }} \ + -Pjava${{ matrix.java_version }}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_ParDo_Dataflow_V2_Streaming_JavaVersions_test_arguments_2 }} --appName=load_tests_Java${{ matrix.java_version }}_Dataflow_V2_streaming_ParDo_2' \ + - name: run ParDo Dataflow V2 Streaming Java Load Test 3 (10 counters) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.ParDoLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=${{ matrix.java_version }} \ + -Pjava${{ matrix.java_version }}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_ParDo_Dataflow_V2_Streaming_JavaVersions_test_arguments_3 }} --appName=load_tests_Java${{ matrix.java_version }}_Dataflow_V2_streaming_ParDo_3' \ + - name: run ParDo Dataflow V2 Streaming Java Load Test 4 (100 counters) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:load-tests:run + arguments: | + -PloadTest.mainClass=org.apache.beam.sdk.loadtests.ParDoLoadTest \ + -Prunner=:runners:google-cloud-dataflow-java \ + -Prunner.version=V2 \ + -PtestJavaVersion=${{ matrix.java_version }} \ + -Pjava${{ matrix.java_version }}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \ + '-PloadTest.args=${{ env.beam_LoadTests_Java_ParDo_Dataflow_V2_Streaming_JavaVersions_test_arguments_4 }} --appName=load_tests_Java${{ matrix.java_version }}_Dataflow_V2_streaming_ParDo_4' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Java_ParDo_SparkStructuredStreaming_Batch.yml b/.github/workflows/beam_LoadTests_Java_ParDo_SparkStructuredStreaming_Batch.yml index afbb79caec91f..c5972a7c5e937 100644 --- a/.github/workflows/beam_LoadTests_Java_ParDo_SparkStructuredStreaming_Batch.yml +++ b/.github/workflows/beam_LoadTests_Java_ParDo_SparkStructuredStreaming_Batch.yml @@ 
-16,10 +16,8 @@ name: LoadTests Java ParDo SparkStructuredStreaming Batch on: - issue_comment: - types: [created] schedule: - - cron: '25 8 * * *' + - cron: '10 15 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,23 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_LoadTests_Java_ParDo_SparkStructuredStreaming_Batch_test_arguments_1: '' - beam_LoadTests_Java_ParDo_SparkStructuredStreaming_Batch_test_arguments_2: '' - beam_LoadTests_Java_ParDo_SparkStructuredStreaming_Batch_test_arguments_3: '' - beam_LoadTests_Java_ParDo_SparkStructuredStreaming_Batch_test_arguments_4: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_LoadTests_Java_ParDo_SparkStructuredStreaming_Batch: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Load Tests Java ParDo SparkStructuredStreaming Batch' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 720 @@ -73,16 +69,19 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: test-type: load test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/java_ParDo_SparkStructuredStreaming_Batch_10_times.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/java_ParDo_SparkStructuredStreaming_Batch_200_times.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/java_ParDo_SparkStructuredStreaming_Batch_10_counters.txt - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/java_ParDo_SparkStructuredStreaming_Batch_100_counters.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_SparkStructuredStreaming_Batch_10_times.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_SparkStructuredStreaming_Batch_200_times.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_SparkStructuredStreaming_Batch_10_counters.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/java_ParDo_SparkStructuredStreaming_Batch_100_counters.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run ParDo SparkStructuredStreaming Batch Java Load Test 1 (10 times) uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git 
a/.github/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Batch.yml new file mode 100644 index 0000000000000..ffbd362a1eab3 --- /dev/null +++ b/.github/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Batch.yml @@ -0,0 +1,127 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: LoadTests Python CoGBK Dataflow Batch + +on: + schedule: + - cron: '10 15 * * *' + workflow_dispatch: + +# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Python_CoGBK_Dataflow_Batch: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Python CoGBK Dataflow Batch' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Python_CoGBK_Dataflow_Batch"] + job_phrase: ["Run Load Tests Python CoGBK Dataflow Batch"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Batch_100b_Single_Key.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Batch_100b_Multiple_Keys.txt + ${{ 
github.workspace }}/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Batch_10kB.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Batch_2MB.txt + - name: Set current datetime + id: datetime + run: | + echo "datetime=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_OUTPUT + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run CoGBK 2GB of 100B records with a single key + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + --info \ + -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Dataflow_Batch_test_arguments_1 }} --job_name=load-tests-python-dataflow-batch-cogbk-1-${{ steps.datetime.outputs.datetime }}' \ + - name: run CoGBK 2GB of 100B records with multiple keys + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Dataflow_Batch_test_arguments_2 }} --job_name=load-tests-python-dataflow-batch-cogbk-2-${{ steps.datetime.outputs.datetime }}' \ + - name: run CoGBK reiterate 4 times 10kB values + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Dataflow_Batch_test_arguments_3 }} --job_name=load-tests-python-dataflow-batch-cogbk-3-${{ steps.datetime.outputs.datetime }}' \ + - name: run CoGBK reiterate 4 times 2MB values + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Dataflow_Batch_test_arguments_4 }} --job_name=load-tests-python-dataflow-batch-cogbk-4-${{ steps.datetime.outputs.datetime }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Streaming.yml b/.github/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Streaming.yml new file mode 100644 index 0000000000000..f569237c7fb4d --- /dev/null +++ b/.github/workflows/beam_LoadTests_Python_CoGBK_Dataflow_Streaming.yml @@ -0,0 +1,126 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +name: LoadTests Python CoGBK Dataflow Streaming + +on: + schedule: + - cron: '10 15 * * *' + workflow_dispatch: + +# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Python_CoGBK_Dataflow_Streaming: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Python CoGBK Dataflow Streaming' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Python_CoGBK_Dataflow_Streaming"] + job_phrase: ["Run Load Tests Python CoGBK Dataflow Streaming"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Streaming_100b_Single_Key.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Streaming_100b_Multiple_Keys.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Streaming_10kB.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Streaming_2MB.txt + - name: Set current datetime + id: datetime + run: | + echo "datetime=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_OUTPUT + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run CoGBK 2GB of 100B records with a single key + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Dataflow_Streaming_test_arguments_1 }} --job_name=load-tests-python-dataflow-streaming-cogbk-1-${{ steps.datetime.outputs.datetime }}' \ 
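# A sketch of how the pieces above compose at runtime: test-arguments-action
# flattens each listed .txt file into the numbered env variable, and the step
# appends a unique --job_name derived from the datetime output. The resolved
# invocation then looks roughly like this (the pipeline-option values shown are
# illustrative assumptions, not the real file contents):
#   ./gradlew :sdks:python:apache_beam:testing:load_tests:run \
#     -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \
#     -Prunner=DataflowRunner \
#     -PpythonVersion=3.8 \
#     '-PloadTest.args=--input_options=... --influx_measurement=... --job_name=load-tests-python-dataflow-streaming-cogbk-1-1019120000'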
+ - name: run CoGBK 2GB of 100B records with multiple keys + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Dataflow_Streaming_test_arguments_2 }} --job_name=load-tests-python-dataflow-streaming-cogbk-2-${{ steps.datetime.outputs.datetime }}' \ + - name: run CoGBK reiterate 4 times 10kB values + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Dataflow_Streaming_test_arguments_3 }} --job_name=load-tests-python-dataflow-streaming-cogbk-3-${{ steps.datetime.outputs.datetime }}' \ + - name: run CoGBK reiterate 4 times 2MB values + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Dataflow_Streaming_test_arguments_4 }} --job_name=load-tests-python-dataflow-streaming-cogbk-4-${{ steps.datetime.outputs.datetime }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml new file mode 100644 index 0000000000000..2493f14585b9a --- /dev/null +++ b/.github/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml @@ -0,0 +1,135 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
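# Each of these workflows fires three ways, per the recurring if: guard: a manual
# workflow_dispatch, the cron schedule (now gated on github.repository ==
# 'apache/beam' so forks do not run the suite), or a PR comment matching the job
# phrase exactly. A sketch of the two manual routes for the workflow above,
# assuming an authenticated GitHub CLI:
#   As a PR comment, post exactly:  Run Load Tests Python CoGBK Dataflow Streaming
#   Or dispatch directly:
#     gh workflow run beam_LoadTests_Python_CoGBK_Dataflow_Streaming.yml --repo apache/beam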
+ +name: LoadTests Python CoGBK Flink Batch + +on: + schedule: + - cron: '10 15 * * *' + workflow_dispatch: + +# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + GCLOUD_ZONE: us-central1-a + CLUSTER_NAME: beam-loadtests-python-cogbk-flink-batch-${{ github.run_id }} + GCS_BUCKET: gs://beam-flink-cluster + FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz + HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar + FLINK_TASKMANAGER_SLOTS: 1 + DETACHED_MODE: true + HARNESS_IMAGES_TO_PULL: gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest + JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.15_job_server:latest + ARTIFACTS_DIR: gs://beam-flink-cluster/beam-loadtests-python-cogbk-flink-batch-${{ github.run_id }} + +jobs: + beam_LoadTests_Python_CoGBK_Flink_Batch: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Python CoGBK Flink Batch' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Python_CoGBK_Flink_Batch"] + job_phrase: ["Run Load Tests Python CoGBK Flink Batch"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Flink_Batch_100b_Single_Key.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Flink_Batch_100b_Multiple_Keys.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Flink_Batch_10kB.txt + - name: Start Flink with parallelism 5 + env: + FLINK_NUM_WORKERS: 5 + run: | + cd ${{ github.workspace }}/.test-infra/dataproc; ./flink_cluster.sh create + - name: Set current datetime + id: datetime + run: | + echo "datetime=$(date '+%m%d%H%M%S' 
--utc)" >> $GITHUB_OUTPUT + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run CoGBK 2GB of 100B records with a single key + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + --info \ + -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ + -Prunner=FlinkRunner \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Flink_Batch_test_arguments_1 }} --job_name=load-tests-python-flink-batch-cogbk-1-${{ steps.datetime.outputs.datetime }}' \ + - name: run CoGBK 2GB of 100B records with multiple keys + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + --info \ + -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ + -Prunner=FlinkRunner \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Flink_Batch_test_arguments_2 }} --job_name=load-tests-python-flink-batch-cogbk-2-${{ steps.datetime.outputs.datetime }}' \ + - name: run CoGBK reiterate 4 times 10kB values + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + --info \ + -PloadTest.mainClass=apache_beam.testing.load_tests.co_group_by_key_test \ + -Prunner=FlinkRunner \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_CoGBK_Flink_Batch_test_arguments_2 }} --job_name=load-tests-python-flink-batch-cogbk-3-${{ steps.datetime.outputs.datetime }}' \ + - name: Teardown Flink + if: always() + run: | + ${{ github.workspace }}/.test-infra/dataproc/flink_cluster.sh delete \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml index f7d7a056d5953..3e3b9be9754fa 100644 --- a/.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml +++ b/.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml @@ -13,13 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-name: Load Tests Combine Dataflow Batch Python +name: LoadTests Python Combine Dataflow Batch on: - issue_comment: - types: [created] schedule: - - cron: '40 5 * * *' + - cron: '10 15 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_LoadTests_Python_Combine_Dataflow_Batch: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Load Tests Python Combine Dataflow Batch' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 720 @@ -69,23 +69,22 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - python-version: '3.8' - - name: Prepare configs - #Reads config files, excludes comments, appends current date to the job_name parameter - id: set_configs - shell: bash - run: | - CURDATE=$(date '+%m%d%H%M%S' --utc) - CONFIG_ARR=('config_Combine_Python_Batch_2GB_10b.txt' 'config_Combine_Python_Batch_2GB_Fanout_4.txt' 'config_Combine_Python_Batch_2GB_Fanout_8.txt') - for INDEX in ${!CONFIG_ARR[@]} - do - CURCONFIG=$(grep -v "^#.*" ./.github/workflows/load-tests-job-configs/${CONFIG_ARR[INDEX]} | tr '\n' ' ') - CURCONFIG=$(echo "${CURCONFIG/load-tests-python-dataflow-batch-combine-$((INDEX + 1))-/load-tests-python-dataflow-batch-combine-$((INDEX + 1))-$CURDATE}") - echo "prepared_config_$((INDEX + 1))=$CURCONFIG" >> $GITHUB_OUTPUT - done + python-version: 3.8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Batch_2GB_10b.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Batch_2GB_Fanout_4.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Batch_2GB_Fanout_8.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: get current time + run: echo "NOW_UTC=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_ENV - name: run Combine Dataflow Batch Python Load Test 1 (10 bytes records) uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -94,7 +93,7 @@ jobs: 
-PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ -Prunner=DataflowRunner \ -PpythonVersion=3.8 \ - '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_1 }}' \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Dataflow_Batch_test_arguments_1 }} --job_name=load-tests-python-dataflow-batch-combine-1-${{env.NOW_UTC}}' \ - name: run Combine Dataflow Batch Python Load Test 2 (fanout 4) uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -103,7 +102,7 @@ jobs: -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ -Prunner=DataflowRunner \ -PpythonVersion=3.8 \ - '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_2 }}' \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Dataflow_Batch_test_arguments_2 }} --job_name=load-tests-python-dataflow-batch-combine-2-${{env.NOW_UTC}}' \ - name: run Combine Dataflow Batch Python Load Test 3 (fanout 8) uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -112,4 +111,4 @@ jobs: -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ -Prunner=DataflowRunner \ -PpythonVersion=3.8 \ - '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_3 }}' \ No newline at end of file + '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Dataflow_Batch_test_arguments_3 }} --job_name=load-tests-python-dataflow-batch-combine-3-${{env.NOW_UTC}}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Streaming.yml b/.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Streaming.yml new file mode 100644 index 0000000000000..746e6bc193001 --- /dev/null +++ b/.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Streaming.yml @@ -0,0 +1,114 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
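# The deleted "Prepare configs" step in the Combine Dataflow Batch diff above
# documents what the shared test-arguments-action now does for every workflow:
# drop comment lines from each .txt config, flatten it to a single line of
# pipeline options, and make a UTC timestamp part of the job name. A condensed
# sketch of that retired logic, assuming a config file with one option per line:
#   CURDATE=$(date '+%m%d%H%M%S' --utc)
#   CURCONFIG=$(grep -v "^#.*" config_Combine_Python_Batch_2GB_10b.txt | tr '\n' ' ')
#   # the date used to be spliced into the job_name inside the config itself;
#   # the new steps instead pass --job_name=...-${NOW_UTC} explicitly per invocation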
+ +name: LoadTests Python Combine Dataflow Streaming + +on: + schedule: + - cron: '50 15 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Python_Combine_Dataflow_Streaming: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Python Combine Dataflow Streaming' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Python_Combine_Dataflow_Streaming"] + job_phrase: ["Run Load Tests Python Combine Dataflow Streaming"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Streaming_2GB_10_byte_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Streaming_2GB_Fanout_4.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Streaming_2GB_Fanout_8.txt + - name: get current time + run: echo "NOW_UTC=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_ENV + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run 2GB 10 byte records test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Dataflow_Streaming_test_arguments_1 }} --job_name=load-tests-python-dataflow-streaming-combine-1-${{env.NOW_UTC}}' \ + - name: run 2GB Fanout 4 test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ + 
-Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Dataflow_Streaming_test_arguments_2 }} --job_name=load-tests-python-dataflow-streaming-combine-4-${{env.NOW_UTC}}' \ + - name: run 2GB Fanout 8 test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Dataflow_Streaming_test_arguments_3 }} --job_name=load-tests-python-dataflow-streaming-combine-5-${{env.NOW_UTC}}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_Combine_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Python_Combine_Flink_Batch.yml new file mode 100644 index 0000000000000..815f3dbc50d89 --- /dev/null +++ b/.github/workflows/beam_LoadTests_Python_Combine_Flink_Batch.yml @@ -0,0 +1,140 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
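# Two equivalent timestamp idioms coexist in the files above: the CoGBK workflows
# write to $GITHUB_OUTPUT on a step with id: datetime and read it back as
# steps.datetime.outputs.datetime, while the Combine workflows export NOW_UTC
# through $GITHUB_ENV and read it as env.NOW_UTC. Side by side:
#   # step output: scoped to the step id, read via ${{ steps.datetime.outputs.datetime }}
#   echo "datetime=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_OUTPUT
#   # job env var: visible to all later steps, read via ${{ env.NOW_UTC }}
#   echo "NOW_UTC=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_ENV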
+ +name: LoadTests Python Combine Flink Batch + +on: + schedule: + - cron: '50 15 * * *' + workflow_dispatch: + +# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + GCLOUD_ZONE: us-central1-a + CLUSTER_NAME: beam-loadtests-py-cmb-flink-batch-${{ github.run_id }} + GCS_BUCKET: gs://beam-flink-cluster + FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz + HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar + FLINK_TASKMANAGER_SLOTS: 1 + DETACHED_MODE: true + HARNESS_IMAGES_TO_PULL: gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest + JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.15_job_server:latest + ARTIFACTS_DIR: gs://beam-flink-cluster/beam-loadtests-py-cmb-flink-batch-${{ github.run_id }} + +jobs: + beam_LoadTests_Python_Combine_Flink_Batch: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Python Combine Flink Batch' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Python_Combine_Flink_Batch"] + job_phrase: ["Run Load Tests Python Combine Flink Batch"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Batch_2GB_10_byte_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Batch_2GB_Fanout_4.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Batch_2GB_Fanout_8.txt + - name: Start Flink with parallelism 5 + env: + FLINK_NUM_WORKERS: 5 + run: | + cd ${{ github.workspace }}/.test-infra/dataproc; ./flink_cluster.sh create + - name: get current time + run: echo "NOW_UTC=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_ENV + # The env variables 
are created and populated in the test-arguments-action as "_test_arguments_" + - name: run Load test 2GB 10 byte records + env: + CLOUDSDK_CONFIG: ${{ env.KUBELET_GCLOUD_CONFIG_PATH}} + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PpythonVersion=3.8 \ + -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ + -Prunner=PortableRunner \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Flink_Batch_test_arguments_1 }} --job_name=load-tests-python-flink-batch-combine-1-${{env.NOW_UTC}}' \ + - name: Restart Flink with parallelism 16 + env: + FLINK_NUM_WORKERS: 16 + run: | + cd ${{ github.workspace }}/.test-infra/dataproc; ./flink_cluster.sh restart + - name: run Load test 2GB Fanout 4 + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PpythonVersion=3.8 \ + -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ + -Prunner=PortableRunner \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Flink_Batch_test_arguments_2 }} --job_name=load-tests-python-flink-batch-combine-4-${{env.NOW_UTC}}' \ + - name: run Load test 2GB Fanout 8 + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PpythonVersion=3.8 \ + -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ + -Prunner=PortableRunner \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Flink_Batch_test_arguments_3 }} --job_name=load-tests-python-flink-batch-combine-5-${{env.NOW_UTC}}' \ + - name: Teardown Flink + if: always() + run: | + ${{ github.workspace }}/.test-infra/dataproc/flink_cluster.sh delete \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_Combine_Flink_Streaming.yml b/.github/workflows/beam_LoadTests_Python_Combine_Flink_Streaming.yml new file mode 100644 index 0000000000000..24fdce175f2ca --- /dev/null +++ b/.github/workflows/beam_LoadTests_Python_Combine_Flink_Streaming.yml @@ -0,0 +1,126 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
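# One detail worth flagging in the Combine Flink Batch job above: the cluster is
# created with FLINK_NUM_WORKERS=5 for the 10-byte-records case and then resized
# in place with flink_cluster.sh restart under FLINK_NUM_WORKERS=16 before the
# fanout cases, rather than provisioning the larger cluster up front. A sketch of
# that resize, assuming the same script interface:
#   export FLINK_NUM_WORKERS=16
#   cd .test-infra/dataproc && ./flink_cluster.sh restart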
+ +name: LoadTests Python Combine Flink Streaming + +on: + schedule: + - cron: '50 15 * * *' + workflow_dispatch: + +# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + GCLOUD_ZONE: us-central1-a + CLUSTER_NAME: beam-loadtests-py-cmb-flink-streaming-${{ github.run_id }} + GCS_BUCKET: gs://beam-flink-cluster + FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz + HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar + FLINK_TASKMANAGER_SLOTS: 1 + DETACHED_MODE: true + HARNESS_IMAGES_TO_PULL: gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest + JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.15_job_server:latest + ARTIFACTS_DIR: gs://beam-flink-cluster/beam-loadtests-py-cmb-flink-streaming-${{ github.run_id }} + +jobs: + beam_LoadTests_Python_Combine_Flink_Streaming: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Python Combine Flink Streaming' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Python_Combine_Flink_Streaming"] + job_phrase: ["Run Load Tests Python Combine Flink Streaming"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_Fanout_4.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_Fanout_8.txt + - name: Start Flink with parallelism 16 + env: + FLINK_NUM_WORKERS: 16 + run: | + cd ${{ github.workspace }}/.test-infra/dataproc; ./flink_cluster.sh create + - name: get current time + run: echo "NOW_UTC=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_ENV + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - 
name: run Load test 2GB Fanout 4 + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PpythonVersion=3.8 \ + -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ + -Prunner=PortableRunner \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Flink_Streaming_test_arguments_1 }} --job_name=load-tests-python-flink-streaming-combine-4-${{env.NOW_UTC}}' \ + - name: run Load test 2GB Fanout 8 + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PpythonVersion=3.8 \ + -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ + -Prunner=PortableRunner \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_Combine_Flink_Streaming_test_arguments_2 }} --job_name=load-tests-python-flink-streaming-combine-5-${{env.NOW_UTC}}' \ + - name: Teardown Flink + if: always() + run: | + ${{ github.workspace }}/.test-infra/dataproc/flink_cluster.sh delete + + # // TODO(https://github.com/apache/beam/issues/20402). Skipping some cases because they are too slow: + # load-tests-python-flink-streaming-combine-1' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_FnApiRunner_Microbenchmark.yml b/.github/workflows/beam_LoadTests_Python_FnApiRunner_Microbenchmark.yml index b74a44c647c8a..1169a45dfc2dd 100644 --- a/.github/workflows/beam_LoadTests_Python_FnApiRunner_Microbenchmark.yml +++ b/.github/workflows/beam_LoadTests_Python_FnApiRunner_Microbenchmark.yml @@ -13,11 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: Load Tests FnApiRunner Microbenchmark Python +name: LoadTests Python FnApiRunner Microbenchmark on: - issue_comment: - types: [created] schedule: - cron: '0 */6 * * *' workflow_dispatch: @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_LoadTests_Python_FnApiRunner_Microbenchmark: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Python Load Tests FnApiRunner Microbenchmark' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 120 @@ -69,17 +69,17 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - python-version: '3.8' + python-version: 3.8 - name: Prepare test arguments uses: 
./.github/actions/test-arguments-action with: test-type: load test-language: python argument-file-paths: | - ${{ github.workspace }}/.github/workflows/load-tests-job-configs/config_FnApiRunner_Python_Microbenchmark.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_FnApiRunner_Microbenchmark.txt - name: run FnApiRunner Microbenchmark Python Load Test uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_LoadTests_Python_GBK_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Python_GBK_Dataflow_Batch.yml new file mode 100644 index 0000000000000..4631c2b310886 --- /dev/null +++ b/.github/workflows/beam_LoadTests_Python_GBK_Dataflow_Batch.yml @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: LoadTests Python GBK Dataflow Batch + +on: + schedule: + - cron: '50 15 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Python_GBK_Dataflow_Batch: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Python GBK Dataflow Batch' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Python_GBK_Dataflow_Batch"] + job_phrase: ["Run Load Tests Python GBK Dataflow Batch"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: 
+ python-version: 3.8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_2GB_of_10B_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_2GB_of_100B_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_2GB_of_100kB_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt + - name: get current time + run: echo "NOW_UTC=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_ENV + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run 2GB of 10B records test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_Dataflow_Batch_test_arguments_1 }} --job_name=load-tests-python-dataflow-batch-gbk-1-${{env.NOW_UTC}}' \ + - name: run 2GB of 100B records test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_Dataflow_Batch_test_arguments_2 }} --job_name=load-tests-python-dataflow-batch-gbk-2-${{env.NOW_UTC}}' \ + - name: run 2GB of 100kB records test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_Dataflow_Batch_test_arguments_3 }} --job_name=load-tests-python-dataflow-batch-gbk-3-${{env.NOW_UTC}}' \ + - name: run fanout 4 times with 2GB 10-byte records test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_Dataflow_Batch_test_arguments_4 }} --job_name=load-tests-python-dataflow-batch-gbk-4-${{env.NOW_UTC}}' \ + - name: run fanout 8 times with 2GB 10-byte records total test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_Dataflow_Batch_test_arguments_5 }} --job_name=load-tests-python-dataflow-batch-gbk-5-${{env.NOW_UTC}}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_GBK_Dataflow_Streaming.yml 
b/.github/workflows/beam_LoadTests_Python_GBK_Dataflow_Streaming.yml new file mode 100644 index 0000000000000..a28532f9d71ce --- /dev/null +++ b/.github/workflows/beam_LoadTests_Python_GBK_Dataflow_Streaming.yml @@ -0,0 +1,100 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: LoadTests Python GBK Dataflow Streaming + +on: + schedule: + - cron: '10 16 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Python_GBK_Dataflow_Streaming: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Python GBK Dataflow Streaming' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Python_GBK_Dataflow_Streaming"] + job_phrase: ["Run Load Tests Python GBK Dataflow Streaming"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Streaming_2GB_of_100kB_records.txt + - name: get current time + run: echo "NOW_UTC=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_ENV + # The env variable is created and populated in the test-arguments-action as "_test_arguments_" + 
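# Per the comment above, test-arguments-action exports one variable per entry in
# argument-file-paths, named <job_name>_test_arguments_<N> in file order (the
# naming is inferred from the pattern visible throughout these workflows). For
# this workflow, with a single file, the mapping is:
#   python_GBK_Dataflow_Streaming_2GB_of_100kB_records.txt  (1st listed file)
#     -> env.beam_LoadTests_Python_GBK_Dataflow_Streaming_test_arguments_1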
- name: run 2GB of 100kB records test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_Dataflow_Streaming_test_arguments_1 }} --job_name=load-tests-python-dataflow-streaming-gbk-3-${{env.NOW_UTC}}' \ + + # TODO(https://github.com/apache/beam/issues/20403). Skipping some cases because they are too slow: + # load-tests-python-dataflow-streaming-gbk-1 + # load-tests-python-dataflow-streaming-gbk-2 + # load-tests-python-dataflow-streaming-gbk-4 + # load-tests-python-dataflow-streaming-gbk-5 \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_GBK_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Python_GBK_Flink_Batch.yml new file mode 100644 index 0000000000000..d2a99b3711e0b --- /dev/null +++ b/.github/workflows/beam_LoadTests_Python_GBK_Flink_Batch.yml @@ -0,0 +1,155 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +name: LoadTests Python GBK Flink Batch + +on: + # schedule: + # - cron: '10 16 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + GCLOUD_ZONE: us-central1-a + CLUSTER_NAME: beam-loadtests-py-gbk-flk-batch-${{ github.run_id }} + GCS_BUCKET: gs://beam-flink-cluster + FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz + HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar + FLINK_TASKMANAGER_SLOTS: 1 + DETACHED_MODE: true + HARNESS_IMAGES_TO_PULL: gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest + JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.15_job_server:latest + ARTIFACTS_DIR: gs://beam-flink-cluster/beam-loadtests-py-gbk-flk-batch-${{ github.run_id }} + +jobs: + beam_LoadTests_Python_GBK_Flink_Batch: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Python GBK Flink Batch' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Python_GBK_Flink_Batch"] + job_phrase: ["Run Load Tests Python GBK Flink Batch"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_2GB_of_10B_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_2GB_of_100B_records.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt + ${{ github.workspace 
}}/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_reiterate_4_times_10kB_values.txt + - name: get current time + run: echo "NOW_UTC=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_ENV + - name: Start Flink with parallelism 5 + env: + FLINK_NUM_WORKERS: 5 + run: | + cd ${{ github.workspace }}/.test-infra/dataproc; ./flink_cluster.sh create + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run Flink Batch 2GB of 10B records test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run --info + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ + -Prunner=PortableRunner \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_Flink_Batch_test_arguments_1 }} --job_name=load-tests-python-flink-batch-gbk-1-${{env.NOW_UTC}}' \ + - name: run Flink Batch 2GB of 100B records test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ + -Prunner=PortableRunner \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_Flink_Batch_test_arguments_2 }} --job_name=load-tests-python-flink-batch-gbk-2-${{env.NOW_UTC}}' \ + - name: run reiterate 4 times 10kB values test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ + -Prunner=PortableRunner \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_Flink_Batch_test_arguments_5 }} --job_name=load-tests-python-flink-batch-gbk-6-${{env.NOW_UTC}}' \ + - name: Restart Flink with parallelism 16 + env: + FLINK_NUM_WORKERS: 16 + run: | + cd ${{ github.workspace }}/.test-infra/dataproc; ./flink_cluster.sh restart + - name: run fanout 4 times with 2GB 10-byte records total test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ + -Prunner=PortableRunner \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_Flink_Batch_test_arguments_3 }} --job_name=load-tests-python-flink-batch-gbk-4-${{env.NOW_UTC}}' \ + - name: run fanout 8 times with 2GB 10-byte records total test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ + -Prunner=PortableRunner \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_Flink_Batch_test_arguments_4 }} --job_name=load-tests-python-flink-batch-gbk-5-${{env.NOW_UTC}}' \ + - name: Teardown Flink + if: always() + run: | + ${{ github.workspace }}/.test-infra/dataproc/flink_cluster.sh delete + + # TODO(https://github.com/apache/beam/issues/20146) Re-enable auto builds after these tests pass. 
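The Flink-based load tests above all drive the cluster through .test-infra/dataproc/flink_cluster.sh: create a cluster sized by FLINK_NUM_WORKERS, run the low-parallelism tests, restart with more workers for the fanout cases, and delete the cluster in an if: always() step so teardown happens even when a test fails. A condensed sketch of that lifecycle, using only the entry points the workflow itself calls:

    # Cluster lifecycle as driven by the workflow; env vars mirror its env block.
    cd .test-infra/dataproc
    FLINK_NUM_WORKERS=5 ./flink_cluster.sh create    # small cluster for the 2GB GBK tests
    # ... run the parallelism-5 load tests via Gradle ...
    FLINK_NUM_WORKERS=16 ./flink_cluster.sh restart  # scale up for the fanout tests
    # ... run the fanout load tests ...
    ./flink_cluster.sh delete                        # always runs, even if a test failed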
\ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch.yml new file mode 100644 index 0000000000000..e08b99c1f6781 --- /dev/null +++ b/.github/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch.yml @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: LoadTests Python GBK reiterate Dataflow Batch + +on: + schedule: + - cron: '10 16 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Python GBK reiterate Dataflow Batch' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch"] + job_phrase: ["Run Load Tests Python GBK reiterate Dataflow Batch"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_GBK_reiterate_Dataflow_Batch_reiterate_4_times_10kB_values.txt + ${{ 
github.workspace }}/.github/workflows/load-tests-pipeline-options/python_GBK_reiterate_Dataflow_Batch_reiterate_4_times_2MB_values.txt + - name: get current time + run: echo "NOW_UTC=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_ENV + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run reiterate 4 times 10kB values test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch_test_arguments_1 }} --job_name=load-tests-python-dataflow-batch-gbk-6-${{env.NOW_UTC}}' \ + - name: run reiterate 4 times 2MB values test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch_test_arguments_2 }} --job_name=load-tests-python-dataflow-batch-gbk-7-${{env.NOW_UTC}}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming.yml b/.github/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming.yml new file mode 100644 index 0000000000000..9028dedf876d9 --- /dev/null +++ b/.github/workflows/beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming.yml @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: LoadTests Python GBK reiterate Dataflow Streaming + +on: + schedule: + - cron: '10 16 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Python GBK reiterate Dataflow Streaming' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming"] + job_phrase: ["Run Load Tests Python GBK reiterate Dataflow Streaming"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_GBK_reiterate_Dataflow_Streaming_reiterate_4_times_10kB_values.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_GBK_reiterate_Dataflow_Streaming_reiterate_4_times_2MB_values.txt + - name: get current time + run: echo "NOW_UTC=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_ENV + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run reiterate 4 times 10kB values test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming_test_arguments_1 }} --job_name=load-tests-python-dataflow-streaming-gbk-6-${{env.NOW_UTC}}' \ + - name: run reiterate 4 times 2MB values test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ + -Prunner=DataflowRunner \ + 
-PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming_test_arguments_2 }} --job_name=load-tests-python-dataflow-streaming-gbk-7-${{env.NOW_UTC}}' \ diff --git a/.github/workflows/beam_LoadTests_Python_ParDo_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Python_ParDo_Dataflow_Batch.yml new file mode 100644 index 0000000000000..dc3738e83bf2c --- /dev/null +++ b/.github/workflows/beam_LoadTests_Python_ParDo_Dataflow_Batch.yml @@ -0,0 +1,126 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: LoadTests Python ParDo Dataflow Batch + +on: + schedule: + - cron: '10 16 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Python_ParDo_Dataflow_Batch: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Python ParDo Dataflow Batch' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 200 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Python_ParDo_Dataflow_Batch"] + job_phrase: ["Run Load Tests Python ParDo Dataflow Batch"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace 
}}/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Batch_10_Iterations.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Batch_200_Iterations.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Batch_10_Counters.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Batch_100_Counters.txt + - name: Set current datetime + id: datetime + run: | + echo "datetime=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_OUTPUT + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run ParDo Dataflow Batch Python Load Test 1 (10 iterations) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Dataflow_Batch_test_arguments_1 }} --job_name=load-tests-python-dataflow-batch-pardo-1-${{ steps.datetime.outputs.datetime }}' \ + - name: run ParDo Dataflow Batch Python Load Test 2 (200 iterations) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Dataflow_Batch_test_arguments_2 }} --job_name=load-tests-python-dataflow-batch-pardo-2-${{ steps.datetime.outputs.datetime }}' \ + - name: run ParDo Dataflow Batch Python Load Test 3 (10 counters) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Dataflow_Batch_test_arguments_3 }} --job_name=load-tests-python-dataflow-batch-pardo-3-${{ steps.datetime.outputs.datetime }}' \ + - name: run ParDo Dataflow Batch Python Load Test 4 (100 counters) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Dataflow_Batch_test_arguments_4 }} --job_name=load-tests-python-dataflow-batch-pardo-4-${{ steps.datetime.outputs.datetime }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_ParDo_Dataflow_Streaming.yml b/.github/workflows/beam_LoadTests_Python_ParDo_Dataflow_Streaming.yml new file mode 100644 index 0000000000000..447460fecf0dd --- /dev/null +++ b/.github/workflows/beam_LoadTests_Python_ParDo_Dataflow_Streaming.yml @@ -0,0 +1,126 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: LoadTests Python ParDo Dataflow Streaming + +on: + schedule: + - cron: '50 16 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Python_ParDo_Dataflow_Streaming: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Python Load Tests ParDo Dataflow Streaming' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 200 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Python_ParDo_Dataflow_Streaming"] + job_phrase: ["Run Python Load Tests ParDo Dataflow Streaming"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Streaming_10_Iterations.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Streaming_200_Iterations.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Streaming_10_Counters.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Streaming_100_Counters.txt + - name: Set current datetime + id: datetime + run: | + echo "datetime=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_OUTPUT + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run ParDo Dataflow Streaming Python Load Test 1 (10 iterations) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + 
arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Dataflow_Streaming_test_arguments_1 }} --job_name=load-tests-python-dataflow-streaming-pardo-1-${{ steps.datetime.outputs.datetime }}' \ + - name: run ParDo Dataflow Streaming Python Load Test 2 (200 iterations) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Dataflow_Streaming_test_arguments_2 }} --job_name=load-tests-python-dataflow-streaming-pardo-2-${{ steps.datetime.outputs.datetime }}' \ + - name: run ParDo Dataflow Streaming Python Load Test 3 (10 counters) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Dataflow_Streaming_test_arguments_3 }} --job_name=load-tests-python-dataflow-streaming-pardo-3-${{ steps.datetime.outputs.datetime }}' \ + - name: run ParDo Dataflow Streaming Python Load Test 4 (100 counters) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Dataflow_Streaming_test_arguments_4 }} --job_name=load-tests-python-dataflow-streaming-pardo-4-${{ steps.datetime.outputs.datetime }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml new file mode 100644 index 0000000000000..97211b0f02072 --- /dev/null +++ b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml @@ -0,0 +1,131 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: LoadTests Python ParDo Flink Batch + +on: + schedule: + - cron: '50 16 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + GCLOUD_ZONE: us-central1-a + CLUSTER_NAME: beam-loadtests-python-pardo-flink-batch-${{ github.run_id }} + GCS_BUCKET: gs://beam-flink-cluster + FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz + HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar + FLINK_TASKMANAGER_SLOTS: 1 + DETACHED_MODE: true + HARNESS_IMAGES_TO_PULL: gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest + JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.15_job_server:latest + ARTIFACTS_DIR: gs://beam-flink-cluster/beam-loadtests-python-pardo-flink-batch-${{ github.run_id }} + +jobs: + beam_LoadTests_Python_ParDo_Flink_Batch: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Python ParDo Flink Batch' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Python_ParDo_Flink_Batch"] + job_phrase: ["Run Load Tests Python ParDo Flink Batch"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_10_Iterations.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_200_Iterations.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_10_Counters.txt + - name: Start Flink with parallelism 5 + env: + FLINK_NUM_WORKERS: 5 + run: | + cd ${{ github.workspace }}/.test-infra/dataproc; ./flink_cluster.sh create + - name: Set current datetime + id: datetime + run: | + echo "datetime=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_OUTPUT + # 
The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run ParDo Flink Batch Python Load Test 1 (10 iterations) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ + -Prunner=PortableRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Flink_Batch_test_arguments_1 }} --job_name=load-tests-python-flink-batch-pardo-1-${{ steps.datetime.outputs.datetime }}' \ + - name: run ParDo Flink Batch Python Load Test 2 (200 iterations) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ + -Prunner=PortableRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Flink_Batch_test_arguments_2 }} --job_name=load-tests-python-flink-batch-pardo-3-${{ steps.datetime.outputs.datetime }}' \ + - name: run ParDo Flink Batch Python Load Test 3 (10 counters) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ + -Prunner=PortableRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Flink_Batch_test_arguments_3 }} --job_name=load-tests-python-flink-batch-pardo-4-${{ steps.datetime.outputs.datetime }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml new file mode 100644 index 0000000000000..46437e7653887 --- /dev/null +++ b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml @@ -0,0 +1,155 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: LoadTests Python ParDo Flink Streaming + +on: + schedule: + - cron: '50 16 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + GCLOUD_ZONE: us-central1-a + CLUSTER_NAME: beam-loadtests-python-pardo-flink-stream-${{ github.run_id }} + GCS_BUCKET: gs://beam-flink-cluster + FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz + HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar + FLINK_TASKMANAGER_SLOTS: 1 + DETACHED_MODE: true + HARNESS_IMAGES_TO_PULL: gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest + JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.15_job_server:latest + ARTIFACTS_DIR: gs://beam-flink-cluster/beam-loadtests-python-pardo-flink-stream-${{ github.run_id }} + +jobs: + beam_LoadTests_Python_ParDo_Flink_Streaming: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Python ParDo Flink Streaming' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Python_ParDo_Flink_Streaming"] + job_phrase: ["Run Load Tests Python ParDo Flink Streaming"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_10_Iterations.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_200_Iterations.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_10_Counters.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_100_Counters.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_5_Iterations.txt + - name: 
Start Flink with parallelism 5 + env: + FLINK_NUM_WORKERS: 5 + run: | + cd ${{ github.workspace }}/.test-infra/dataproc; ./flink_cluster.sh create + - name: Set current datetime + id: datetime + run: | + echo "datetime=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_OUTPUT + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run ParDo Flink Streaming Python Load Test 1 (10 iterations) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ + -Prunner=PortableRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Flink_Streaming_test_arguments_1 }} --job_name=load-tests-python-flink-streaming-pardo-1-${{ steps.datetime.outputs.datetime }}' \ + - name: run ParDo Flink Streaming Python Load Test 2 (200 iterations) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ + -Prunner=PortableRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Flink_Streaming_test_arguments_2 }} --job_name=load-tests-python-flink-streaming-pardo-2-${{ steps.datetime.outputs.datetime }}' \ + - name: run ParDo Flink Streaming Python Load Test 3 (10 counters) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ + -Prunner=PortableRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Flink_Streaming_test_arguments_3 }} --job_name=load-tests-python-flink-streaming-pardo-3-${{ steps.datetime.outputs.datetime }}' \ + - name: run ParDo Flink Streaming Python Load Test 4 (100 counters) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ + -Prunner=PortableRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Flink_Streaming_test_arguments_4 }} --job_name=load-tests-python-flink-streaming-pardo-4-${{ steps.datetime.outputs.datetime }}' \ + - name: run ParDo Flink Streaming Python Load Test 5 (5 iterations) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \ + -Prunner=PortableRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_ParDo_Flink_Streaming_test_arguments_5 }} --job_name=load-tests-python-flink-streaming-pardo-6-${{ steps.datetime.outputs.datetime }}' \ + - name: Teardown Flink + if: always() + run: | + ${{ github.workspace }}/.test-infra/dataproc/flink_cluster.sh delete \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_SideInput_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Python_SideInput_Dataflow_Batch.yml new file mode 100644 index 0000000000000..08e5567e6a0e0 --- /dev/null +++ b/.github/workflows/beam_LoadTests_Python_SideInput_Dataflow_Batch.yml @@ -0,0 +1,186 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license 
agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: LoadTests Python SideInput Dataflow Batch + +on: + schedule: + - cron: '50 16 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Python_SideInput_Dataflow_Batch: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Load Tests Python SideInput Dataflow Batch' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Python_SideInput_Dataflow_Batch"] + job_phrase: ["Run Load Tests Python SideInput Dataflow Batch"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1window_1key_percent_dict.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1window_99key_percent_dict.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_10gb_1window_first_iterable.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_10gb_1window_iterable.txt + ${{ github.workspace 
}}/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1window_first_list.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1window_list.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1000window_1key_percent_dict.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1000window_99key_percent_dict.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_10gb_1000window_first_iterable.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_10gb_1000window_iterable.txt + - name: Set current datetime + id: datetime + run: | + echo "datetime=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_OUTPUT + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run SideInput Dataflow Batch Python Load Test 1 (1gb-1kb-10workers-1window-1key-percent-dict) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_1 }} --job_name=load-tests-python-dataflow-batch-sideinput-1-${{ steps.datetime.outputs.datetime }}' \ + - name: run SideInput Dataflow Batch Python Load Test 2 (1gb-1kb-10workers-1window-99key-percent-dict) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_2 }} --job_name=load-tests-python-dataflow-batch-sideinput-2-${{ steps.datetime.outputs.datetime }}' \ + - name: run SideInput Dataflow Batch Python Load Test 3 (10gb-1kb-10workers-1window-first-iterable) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_3 }} --job_name=load-tests-python-dataflow-batch-sideinput-3-${{ steps.datetime.outputs.datetime }}' \ + - name: run SideInput Dataflow Batch Python Load Test 4 (10gb-1kb-10workers-1window-iterable) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_4 }} --job_name=load-tests-python-dataflow-batch-sideinput-4-${{ steps.datetime.outputs.datetime }}' \ + - name: run SideInput Dataflow Batch Python Load Test 5 (1gb-1kb-10workers-1window-first-list) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + 
-PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_5 }} --job_name=load-tests-python-dataflow-batch-sideinput-5-${{ steps.datetime.outputs.datetime }}' \ + - name: run SideInput Dataflow Batch Python Load Test 6 (1gb-1kb-10workers-1window-list) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_6 }} --job_name=load-tests-python-dataflow-batch-sideinput-6-${{ steps.datetime.outputs.datetime }}' \ + - name: run SideInput Dataflow Batch Python Load Test 7 (1gb-1kb-10workers-1000window-1key-percent-dict) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_7 }} --job_name=load-tests-python-dataflow-batch-sideinput-7-${{ steps.datetime.outputs.datetime }}' \ + - name: run SideInput Dataflow Batch Python Load Test 8 (1gb-1kb-10workers-1000window-99key-percent-dict) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_8 }} --job_name=load-tests-python-dataflow-batch-sideinput-8-${{ steps.datetime.outputs.datetime }}' \ + - name: run SideInput Dataflow Batch Python Load Test 9 (10gb-1kb-10workers-1000window-first-iterable) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_9 }} --job_name=load-tests-python-dataflow-batch-sideinput-9-${{ steps.datetime.outputs.datetime }}' \ + - name: run SideInput Dataflow Batch Python Load Test 10 (10gb-1kb-10workers-1000window-iterable) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_SideInput_Dataflow_Batch_test_arguments_10 }} --job_name=load-tests-python-dataflow-batch-sideinput-10-${{ steps.datetime.outputs.datetime }}' \ No newline at end of file diff --git a/.github/workflows/beam_LoadTests_Python_Smoke.yml b/.github/workflows/beam_LoadTests_Python_Smoke.yml new file mode 100644 index 0000000000000..8150956fe8db7 --- /dev/null +++ b/.github/workflows/beam_LoadTests_Python_Smoke.yml @@ -0,0 +1,103 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: LoadTests Python Smoke + +on: + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.comment.body || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_LoadTests_Python_Smoke: + if: | + github.event_name == 'workflow_dispatch' || + github.event.comment.body == 'Run Python Load Tests Smoke' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Python_Smoke"] + job_phrase: ["Run Python Load Tests Smoke"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.8 + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: load + test-language: python + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_Smoke_GroupByKey_Direct.txt + ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/python_Smoke_GroupByKey_Dataflow.txt + - name: Set current datetime + id: datetime + run: | + echo "datetime=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_OUTPUT + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" + - name: run GroupByKey Python load test Direct + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ + -Prunner=DirectRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_Smoke_test_arguments_1 }} 
--job_name=load-tests-python-direct-batch-gbk-smoke-${{ steps.datetime.outputs.datetime }}' \ + - name: run GroupByKey Python load test Dataflow + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.group_by_key_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ env.beam_LoadTests_Python_Smoke_test_arguments_2 }} --job_name=load-tests-python-dataflow-batch-gbk-smoke-${{ steps.datetime.outputs.datetime }}' \ No newline at end of file diff --git a/.github/workflows/beam_MetricsCredentialsRotation.yml b/.github/workflows/beam_MetricsCredentialsRotation.yml new file mode 100644 index 0000000000000..777477fe20576 --- /dev/null +++ b/.github/workflows/beam_MetricsCredentialsRotation.yml @@ -0,0 +1,98 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Rotate Metrics Cluster Credentials + +on: + schedule: + - cron: '0 2 1 * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +jobs: + beam_MetricsCredentialsRotation: + if: | + github.event_name == 'workflow_dispatch' || + github.event_name == 'schedule' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 100 + name: ${{ matrix.job_name }} + strategy: + matrix: + job_name: ["beam_MetricsCredentialsRotation"] + job_phrase: ["N/A"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: Starting credential rotation + run: | + gcloud container clusters update metrics --start-credential-rotation --zone=us-central1-a --quiet + - name: Rebuilding the nodes + run: | + gcloud container clusters upgrade metrics 
--node-pool=default-pool --zone=us-central1-a --quiet + - name: Completing the rotation + run: | + gcloud container clusters update metrics --complete-credential-rotation --zone=us-central1-a --quiet + - name: Generate Date + if: failure() + run: | + date=$(date -u +"%Y-%m-%d") + echo "date=$date" >> $GITHUB_ENV + - name: Send email + uses: dawidd6/action-send-mail@v3 + if: failure() + with: + server_address: smtp.gmail.com + server_port: 465 + secure: true + username: ${{ secrets.ISSUE_REPORT_SENDER_EMAIL_ADDRESS }} + password: ${{ secrets.ISSUE_REPORT_SENDER_EMAIL_PASSWORD }} + subject: Credentials Rotation Failure on Metrics cluster (${{ env.date }}) + to: dev@beam.apache.org + from: gactions@beam.apache.org + body: | + Something went wrong during the automatic credentials rotation for Metrics Cluster, performed at ${{ env.date }}. It may be necessary to check the state of the cluster certificates. For further details refer to the following links:\n * Failing job: https://github.com/apache/beam/actions/workflows/beam_MetricsCredentialsRotation.yml \n * Job configuration: https://github.com/apache/beam/blob/master/.github/workflows/beam_MetricsCredentialsRotation.yml \n * Cluster URL: https://pantheon.corp.google.com/kubernetes/clusters/details/us-central1-a/metrics/details?mods=dataflow_dev&project=apache-beam-testing \ No newline at end of file diff --git a/.github/workflows/beam_Metrics_Report.yml b/.github/workflows/beam_Metrics_Report.yml new file mode 100644 index 0000000000000..8ed0c66480f0f --- /dev/null +++ b/.github/workflows/beam_Metrics_Report.yml @@ -0,0 +1,94 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
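The rotation workflow above hands the failure date from one step to the next through `$GITHUB_ENV`: a `key=value` pair appended to that file becomes `${{ env.key }}` in every later step of the same job. A minimal sketch of that hand-off (the `demo` job and echo text are illustrative, not part of the diff):

```yaml
jobs:
  demo:
    runs-on: ubuntu-latest
    steps:
      - name: Generate Date
        run: |
          # Appending key=value to $GITHUB_ENV exports the pair to all later steps.
          echo "date=$(date -u +"%Y-%m-%d")" >> $GITHUB_ENV
      - name: Use Date
        # ${{ env.date }} resolves to the value written by the previous step.
        run: echo "rotation failed on ${{ env.date }}"
```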
+ +name: Beam Metrics Report + +on: + schedule: + - cron: '0 11 * * 2' + workflow_dispatch: + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +jobs: + beam_Metrics_Report: + name: beam_Metrics_Report + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 100 + if: | + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event_name == 'workflow_dispatch' + + steps: + - uses: actions/checkout@v3 + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.8 + - name: Run Metrics Report + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :beam-test-jenkins:generateMetricsReport + arguments: --info -PinfluxDb=beam_test_metrics -PinfluxHost='10.128.0.96' -PinfluxPort=8086 + env: + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + - name: Archive Report + uses: actions/upload-artifact@v3 + with: + name: Metrics Report + path: "${{ github.workspace }}/.test-infra/jenkins/metrics_report/beam-metrics_report.html" + - name: Generate Date + run: | + date=$(date -u +"%Y-%m-%d") + echo "date=$date" >> $GITHUB_ENV + - name: Send mail + uses: dawidd6/action-send-mail@v3 + with: + server_address: smtp.gmail.com + server_port: 465 + secure: true + username: ${{ secrets.ISSUE_REPORT_SENDER_EMAIL_ADDRESS }} + password: ${{ secrets.ISSUE_REPORT_SENDER_EMAIL_PASSWORD }} + subject: Beam Metrics Report ${{ env.date }} + to: dev@beam.apache.org + from: beamactions@gmail.com + html_body: file://${{ github.workspace }}/.test-infra/jenkins/metrics_report/beam-metrics_report.html diff --git a/.github/workflows/beam_PerformanceTests_AvroIOIT.yml b/.github/workflows/beam_PerformanceTests_AvroIOIT.yml index a6c56287da418..18bd56855c82a 100644 --- a/.github/workflows/beam_PerformanceTests_AvroIOIT.yml +++ b/.github/workflows/beam_PerformanceTests_AvroIOIT.yml @@ -13,13 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License.
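The Metrics Report job's `if:` condition and the `schedule` guard added to each performance-test workflow below rely on the same pattern: scheduled runs are restricted to the upstream `apache/beam` repository so that forks, which inherit the cron triggers, do not run the jobs on the self-hosted runners, while `workflow_dispatch` keeps working anywhere. A minimal sketch of the guard (the job name and step body are placeholders):

```yaml
on:
  schedule:
    - cron: '10 9/12 * * *'
  workflow_dispatch:

jobs:
  guarded_job:
    # Cron fires in every fork of the repository; the repository check
    # ensures only the canonical repo actually runs the scheduled work.
    if: |
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'schedule' && github.repository == 'apache/beam')
    runs-on: ubuntu-latest
    steps:
      - run: echo "running in ${{ github.repository }}"
```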
-name: Performance Tests AvroIOIT +name: PerformanceTests AvroIOIT on: - issue_comment: - types: [created] schedule: - - cron: '10 1/13 * * *' + - cron: '10 9/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,20 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_AvroIOIT_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_AvroIOIT: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java AvroIO Performance Test' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -70,15 +69,18 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/config_AvroIOIT.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/avroIOIT.txt arguments: | --filenamePrefix=gs://temp-storage-for-perf-tests/${{ matrix.job_name }}/${{github.run_id}}/ + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run integrationTest uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_AvroIOIT_HDFS.yml b/.github/workflows/beam_PerformanceTests_AvroIOIT_HDFS.yml index 7393e0e39b371..dab1d9d6e942f 100644 --- a/.github/workflows/beam_PerformanceTests_AvroIOIT_HDFS.yml +++ b/.github/workflows/beam_PerformanceTests_AvroIOIT_HDFS.yml @@ -13,13 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
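The concurrency change repeated across these workflows swaps `github.event.comment.body` for `github.event.comment.id` in the group key. Keyed on the body, two distinct trigger comments with identical text on the same issue would share a group and cancel each other; keyed on the id, only re-runs for the same comment collide. The resulting block, with the fallback chain intact:

```yaml
# Each `||` falls through to the next identifier that is non-empty for the
# triggering event, so schedule, comment, and manual runs all get a stable key.
concurrency:
  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
  cancel-in-progress: true
```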
-name: Performance Tests AvroIOIT HDFS +name: PerformanceTests AvroIOIT HDFS on: - issue_comment: - types: [created] schedule: - - cron: '10 1/13 * * *' + - cron: '10 9/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,20 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_AvroIOIT_HDFS_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_AvroIOIT_HDFS: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java AvroIO Performance Test HDFS' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -70,6 +69,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Authenticate on GCP id: auth uses: google-github-actions/auth@v1 @@ -94,10 +95,11 @@ jobs: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/config_AvroIOIT_HDFS.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/avroIOIT_HDFS.txt arguments: | --filenamePrefix=hdfs://${{ steps.install_hadoop.outputs.hadoop_IP }}:9000/TEXTIO_IT_ --hdfsConfiguration=[{\\\"fs.defaultFS\\\":\\\"hdfs:${{ steps.install_hadoop.outputs.hadoop_IP }}:9000\\\",\\\"dfs.replication\\\":1}] + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run integrationTest uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro.yml b/.github/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro.yml index d29acbfc765fc..8727b2387403d 100644 --- a/.github/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro.yml +++ b/.github/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro.yml @@ -13,13 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
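From here on, every converted workflow follows the same two-step hand-off: a `Prepare test arguments` step reads one or more pipeline-options files and exports their contents as environment variables named `<job_name>_test_arguments_<N>` (as the inline comments note), and the Gradle step splices the matching variable into the invocation. A condensed sketch of the pair (the options file, Gradle task, test class, and job name are placeholders, not from the diff):

```yaml
- name: Prepare test arguments
  uses: ./.github/actions/test-arguments-action
  with:
    test-type: performance
    test-language: java
    # One numbered env variable is produced per listed file.
    argument-file-paths: |
      ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/exampleIOIT.txt
- name: run integrationTest
  uses: ./.github/actions/gradle-command-self-hosted-action
  with:
    gradle-command: :sdks:java:io:example:integrationTest
    arguments: |
      --tests org.apache.beam.sdk.io.example.ExampleIOIT \
      -DintegrationTestRunner=dataflow \
      -DintegrationTestPipelineOptions='[${{ env.beam_PerformanceTests_Example_test_arguments_1 }}]'
```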
-name: Performance Tests BigQueryIO Batch Java Avro +name: PerformanceTests BigQueryIO Batch Java Avro on: - issue_comment: - types: [created] schedule: - - cron: '10 1,13 * * *' + - cron: '10 9/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_BigQueryIO_Batch_Java_Avro: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run BigQueryIO Batch Performance Test Java Avro' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -69,14 +69,18 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Prepare config - id: set_config - shell: bash - run: | - CURDATE=$(date '+%m%d%H%M%S' --utc) - CURCONFIG=$(grep -v "^#.*" ./.github/workflows/performance-tests-job-configs/config_BigQueryIO_Batch_Java_Avro.txt | tr '\n' ' ') - CONFIGWITHDATE=$(echo "${CURCONFIG/bqio_write_10GB_java_avro_/bqio_write_10GB_java_avro_$CURDATE}") - echo "prepared_config=$CONFIGWITHDATE" >> $GITHUB_OUTPUT + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: performance + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/bigQueryIO_Batch_Java_Avro.txt + arguments: | + --testBigQueryTable=bqio_write_10GB_java_avro_$(date '+%m%d%H%M%S' --utc) + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run Java BigQueryIO Batch Avro Performance Test uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -85,7 +89,7 @@ jobs: --tests org.apache.beam.sdk.bigqueryioperftests.BigQueryIOIT \ --info \ -DintegrationTestRunner=dataflow \ - -DintegrationTestPipelineOptions=${{ steps.set_config.outputs.prepared_config }} \ + -DintegrationTestPipelineOptions='[${{ env.beam_PerformanceTests_BigQueryIO_Batch_Java_Avro_test_arguments_1 }}]' \ - name: Archive JUnit Test Results uses: actions/upload-artifact@v3 if: failure() diff --git a/.github/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json.yml b/.github/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json.yml index 067d0e4b95b8a..a231d00f5ede3 100644 --- a/.github/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json.yml +++ 
b/.github/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json.yml @@ -13,13 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: Performance Tests BigQueryIO Batch Java Json +name: PerformanceTests BigQueryIO Batch Java Json on: - issue_comment: - types: [created] schedule: - - cron: '30 8,20 * * *' + - cron: '10 9/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_BigQueryIO_Batch_Java_Json: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run BigQueryIO Batch Performance Test Java Json' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -69,14 +69,18 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Prepare config - id: set_config - shell: bash - run: | - CURDATE=$(date '+%m%d%H%M%S' --utc) - CURCONFIG=$(grep -v "^#.*" ./.github/workflows/performance-tests-job-configs/config_BigQueryIO_Batch_Java_Json.txt | tr '\n' ' ') - CONFIGWITHDATE=$(echo "${CURCONFIG/bqio_write_10GB_java_json_/bqio_write_10GB_java_json_$CURDATE}") - echo "prepared_config=$CONFIGWITHDATE" >> $GITHUB_OUTPUT + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: performance + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/bigQueryIO_Batch_Java_Json.txt + arguments: | + --testBigQueryTable=bqio_write_10GB_java_json_$(date '+%m%d%H%M%S' --utc) + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run Java BigQueryIO Batch Json Performance Test uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -85,7 +89,7 @@ jobs: --tests org.apache.beam.sdk.bigqueryioperftests.BigQueryIOIT \ --info \ -DintegrationTestRunner=dataflow \ - -DintegrationTestPipelineOptions=${{ steps.set_config.outputs.prepared_config }} \ + -DintegrationTestPipelineOptions='[${{ env.beam_PerformanceTests_BigQueryIO_Batch_Java_Json_test_arguments_1 }}]' \ - name: Archive JUnit Test Results uses: actions/upload-artifact@v3 if: failure() diff --git a/.github/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java.yml 
b/.github/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java.yml index bf10d4be522e2..7259fb1838f24 100644 --- a/.github/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java.yml +++ b/.github/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java.yml @@ -13,13 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: Performance Tests BigQueryIO Streaming Java +name: PerformanceTests BigQueryIO Streaming Java on: - issue_comment: - types: [created] schedule: - - cron: '20 15,22 * * *' + - cron: '50 9/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_BigQueryIO_Streaming_Java: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run BigQueryIO Streaming Performance Test Java' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -69,14 +69,18 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Prepare config - id: set_config - shell: bash - run: | - CURDATE=$(date '+%m%d%H%M%S' --utc) - CURCONFIG=$(grep -v "^#.*" ./.github/workflows/performance-tests-job-configs/config_BigQueryIO_Streaming_Java.txt | tr '\n' ' ') - CONFIGWITHDATE=$(echo "${CURCONFIG/bqio_write_10GB_java_stream_/bqio_write_10GB_java_stream_$CURDATE}") - echo "prepared_config=$CONFIGWITHDATE" >> $GITHUB_OUTPUT + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: performance + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/bigQueryIO_Streaming_Java.txt + arguments: | + --testBigQueryTable=bqio_write_10GB_java_stream_$(date '+%m%d%H%M%S' --utc) + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run Java BigQueryIO Streaming Performance Test uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -85,7 +89,7 @@ jobs: --tests org.apache.beam.sdk.bigqueryioperftests.BigQueryIOIT \ --info \ -DintegrationTestRunner=dataflow \ - -DintegrationTestPipelineOptions=${{ steps.set_config.outputs.prepared_config }} \ + -DintegrationTestPipelineOptions='[${{ env.beam_PerformanceTests_BigQueryIO_Streaming_Java_test_arguments_1 }}]' \ - name: Archive 
JUnit Test Results uses: actions/upload-artifact@v3 if: failure() diff --git a/.github/workflows/beam_PerformanceTests_BiqQueryIO_Read_Python.yml b/.github/workflows/beam_PerformanceTests_BiqQueryIO_Read_Python.yml index 58958de69c891..11fa89767d617 100644 --- a/.github/workflows/beam_PerformanceTests_BiqQueryIO_Read_Python.yml +++ b/.github/workflows/beam_PerformanceTests_BiqQueryIO_Read_Python.yml @@ -16,10 +16,8 @@ name: PerformanceTests BiqQueryIO Read Python on: - issue_comment: - types: [created] schedule: - - cron: '0 2 * * *' + - cron: '30 9 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,20 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_BiqQueryIO_Read_Python_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_BiqQueryIO_Read_Python: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run BigQueryIO Read Performance Test Python' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 240 @@ -80,9 +79,10 @@ jobs: test-type: load test-language: python argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/biqQueryIO_Read_Python.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/biqQueryIO_Read_Python.txt arguments: | --job_name=performance-tests-bqio-read-python-10gb$(date '+%m%d%H%M%S' --utc) + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: Run BigQueryIO Read Performance Test Python uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch.yml b/.github/workflows/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch.yml index f0fcd20bd3b5f..fa7a3a78d5d10 100644 --- a/.github/workflows/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch.yml +++ b/.github/workflows/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch.yml @@ -16,10 +16,8 @@ name: PerformanceTests BiqQueryIO Write Python Batch on: - issue_comment: - types: [created] schedule: - - cron: '0 1 * * *' + - cron: '30 9 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,20 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule 
|| github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_BiqQueryIO_Write_Python_Batch_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_BiqQueryIO_Write_Python_Batch: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run BigQueryIO Write Performance Test Python' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 240 @@ -80,9 +79,10 @@ jobs: test-type: load test-language: python argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/biqQueryIO_Write_Python_Batch.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/biqQueryIO_Write_Python_Batch.txt arguments: | --job_name=performance-tests-bqio-write-python-batch-10gb$(date '+%m%d%H%M%S' --utc) + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run BigQueryIO Write Batch Python Performance Test uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_Cdap.yml b/.github/workflows/beam_PerformanceTests_Cdap.yml index f45419a1223f4..b0d29bbf02b53 100644 --- a/.github/workflows/beam_PerformanceTests_Cdap.yml +++ b/.github/workflows/beam_PerformanceTests_Cdap.yml @@ -16,10 +16,8 @@ name: PerformanceTests Cdap on: - issue_comment: - types: [created] schedule: - - cron: '13 4/12 * * *' + - cron: '50 9/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,20 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_Cdap_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_Cdap: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java CdapIO Performance Test' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -70,6 +69,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ 
secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Authenticate on GCP id: auth uses: google-github-actions/auth@v1 @@ -94,9 +95,10 @@ jobs: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/cdap.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/cdap.txt arguments: | --postgresServerName=${{ steps.install_postgres.outputs.postgres_IP }} + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run integrationTest uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_Compressed_TextIOIT.yml b/.github/workflows/beam_PerformanceTests_Compressed_TextIOIT.yml index 0e82c0fdf7d10..3b68bdeeb509c 100644 --- a/.github/workflows/beam_PerformanceTests_Compressed_TextIOIT.yml +++ b/.github/workflows/beam_PerformanceTests_Compressed_TextIOIT.yml @@ -16,10 +16,8 @@ name: PerformanceTests Compressed TextIOIT on: - issue_comment: - types: [created] schedule: - - cron: '10 1/12 * * *' + - cron: '50 9/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_Compressed_TextIOIT: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java CompressedTextIO Performance Test' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -69,6 +69,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action # The env variable is created and populated in the test-arguments-action as "_test_arguments_" - name: Prepare test arguments uses: ./.github/actions/test-arguments-action @@ -76,7 +78,7 @@ jobs: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/config_Compressed_TextIOIT.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/compressed_TextIOIT.txt arguments: | --filenamePrefix=gs://temp-storage-for-perf-tests/${{ matrix.job_name }}/${{github.run_id}}/ - name: run integrationTest diff --git 
a/.github/workflows/beam_PerformanceTests_Compressed_TextIOIT_HDFS.yml b/.github/workflows/beam_PerformanceTests_Compressed_TextIOIT_HDFS.yml index 78c9d3e8ab7a6..414889a159c46 100644 --- a/.github/workflows/beam_PerformanceTests_Compressed_TextIOIT_HDFS.yml +++ b/.github/workflows/beam_PerformanceTests_Compressed_TextIOIT_HDFS.yml @@ -16,10 +16,8 @@ name: PerformanceTests Compressed TextIOIT HDFS on: - issue_comment: - types: [created] schedule: - - cron: '50 1/12 * * *' + - cron: '50 9/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_Compressed_TextIOIT_HDFS: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java CompressedTextIO Performance Test HDFS' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -69,6 +69,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Authenticate on GCP id: auth uses: google-github-actions/auth@v1 @@ -93,7 +95,7 @@ jobs: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/config_Compressed_TextIOIT_HDFS.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/compressed_TextIOIT_HDFS.txt arguments: | --filenamePrefix=hdfs://${{ steps.install_hadoop.outputs.hadoop_IP }}:9000/TEXTIO_IT_ --hdfsConfiguration=[{\\\"fs.defaultFS\\\":\\\"hdfs:${{ steps.install_hadoop.outputs.hadoop_IP }}:9000\\\",\\\"dfs.replication\\\":1}] diff --git a/.github/workflows/beam_PerformanceTests_HadoopFormat.yml b/.github/workflows/beam_PerformanceTests_HadoopFormat.yml index d73c6f6fb5f5a..002dd9865728a 100644 --- a/.github/workflows/beam_PerformanceTests_HadoopFormat.yml +++ b/.github/workflows/beam_PerformanceTests_HadoopFormat.yml @@ -16,10 +16,8 @@ name: PerformanceTests HadoopFormat on: - issue_comment: - types: [created] schedule: - - cron: '16 7/12 * * *' + - cron: '10 10/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,20 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || 
github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_HadoopFormat_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_HadoopFormat: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java HadoopFormatIO Performance Test' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -70,6 +69,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Authenticate on GCP id: auth uses: google-github-actions/auth@v1 @@ -94,9 +95,10 @@ jobs: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/hadoopFormat.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/hadoopFormat.txt arguments: | --postgresServerName=${{ steps.install_postgres.outputs.postgres_IP }} + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run integrationTest uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_JDBC.yml b/.github/workflows/beam_PerformanceTests_JDBC.yml index c5995480e9d5d..caa2955b6c237 100644 --- a/.github/workflows/beam_PerformanceTests_JDBC.yml +++ b/.github/workflows/beam_PerformanceTests_JDBC.yml @@ -16,10 +16,8 @@ name: PerformanceTests JDBC on: - issue_comment: - types: [created] schedule: - - cron: '30 1,13 * * *' + - cron: '10 10/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,20 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_JDBC_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_JDBC: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + 
(github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java JdbcIO Performance Test' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -70,6 +69,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Authenticate on GCP id: auth uses: google-github-actions/auth@v1 @@ -94,9 +95,10 @@ jobs: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/JDBC.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/JDBC.txt arguments: | --postgresServerName=${{ steps.postgres_for_dev.outputs.pfd_ip }} + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: Run Java JdbcIO Performance Test uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_Kafka_IO.yml b/.github/workflows/beam_PerformanceTests_Kafka_IO.yml index ea709238b8f3b..ac96a50efcb70 100644 --- a/.github/workflows/beam_PerformanceTests_Kafka_IO.yml +++ b/.github/workflows/beam_PerformanceTests_Kafka_IO.yml @@ -16,10 +16,8 @@ name: PerformanceTests Kafka IO on: - issue_comment: - types: [created] schedule: - - cron: '30 2,14 * * *' + - cron: '10 10/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,21 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_Kafka_IO_test_arguments_1: '' - beam_PerformanceTests_Kafka_IO_test_arguments_2: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_Kafka_IO: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java KafkaIO Performance Test' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 120 @@ -73,6 +71,14 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: Authenticate on GCP + id: auth + uses: google-github-actions/auth@v1 + with: + credentials_json: ${{ secrets.GCP_SA_KEY }} + project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: ./.github/actions/setup-k8s-access with: @@ -98,10 +104,11 @@ jobs: test-type: performance test-language: java argument-file-paths: | - ${{ 
github.workspace }}/.github/workflows/performance-tests-job-configs/kafka_IO_Streaming.txt - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/kafka_IO_Batch.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/kafka_IO_Streaming.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/kafka_IO_Batch.txt arguments: | --kafkaBootstrapServerAddresses=${{ steps.set_brokers.outputs.KAFKA_SERVICE_BROKER_0 }}:${{ env.KAFKA_SERVICE_PORT }},${{ steps.set_brokers.outputs.KAFKA_SERVICE_BROKER_1 }}:${{ env.KAFKA_SERVICE_PORT }},${{ steps.set_brokers.outputs.KAFKA_SERVICE_BROKER_2 }}:${{ env.KAFKA_SERVICE_PORT }} + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: Run Java KafkaIO Performance Streaming Test uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_ManyFiles_TextIOIT.yml b/.github/workflows/beam_PerformanceTests_ManyFiles_TextIOIT.yml index 5834bd8ab3e0a..3b6e54bc7b92a 100644 --- a/.github/workflows/beam_PerformanceTests_ManyFiles_TextIOIT.yml +++ b/.github/workflows/beam_PerformanceTests_ManyFiles_TextIOIT.yml @@ -16,10 +16,8 @@ name: PerformanceTests ManyFiles TextIOIT on: - issue_comment: - types: [created] schedule: - - cron: '10 2/12 * * *' + - cron: '10 10/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,20 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_ManyFiles_TextIOIT_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_ManyFiles_TextIOIT: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java ManyFilesTextIO Performance Test' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -70,15 +69,18 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/config_ManyFiles_TextIOIT.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/manyFiles_TextIOIT.txt arguments: | --filenamePrefix=gs://temp-storage-for-perf-tests/${{ matrix.job_name }}/${{github.run_id}}/ + # The env variables are 
created and populated in the test-arguments-action as "_test_arguments_" - name: run integrationTest uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_ManyFiles_TextIOIT_HDFS.yml b/.github/workflows/beam_PerformanceTests_ManyFiles_TextIOIT_HDFS.yml index 03163a41dcf95..8a34dae133aa6 100644 --- a/.github/workflows/beam_PerformanceTests_ManyFiles_TextIOIT_HDFS.yml +++ b/.github/workflows/beam_PerformanceTests_ManyFiles_TextIOIT_HDFS.yml @@ -16,10 +16,8 @@ name: PerformanceTests ManyFiles TextIOIT HDFS on: - issue_comment: - types: [created] schedule: - - cron: '50 2/12 * * *' + - cron: '10 10/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,20 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_ManyFiles_TextIOIT_HDFS_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_ManyFiles_TextIOIT_HDFS: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java ManyFilesTextIO Performance Test HDFS' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -70,6 +69,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Authenticate on GCP id: auth uses: google-github-actions/auth@v1 @@ -93,10 +94,11 @@ jobs: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/config_ManyFiles_TextIOIT_HDFS.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/manyFiles_TextIOIT_HDFS.txt arguments: | --filenamePrefix=hdfs://${{ steps.install_hadoop.outputs.hadoop_IP }}:9000/TEXTIO_IT_ --hdfsConfiguration=[{\\\"fs.defaultFS\\\":\\\"hdfs:${{ steps.install_hadoop.outputs.hadoop_IP }}:9000\\\",\\\"dfs.replication\\\":1}] + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run integrationTest uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_MongoDBIO_IT.yml b/.github/workflows/beam_PerformanceTests_MongoDBIO_IT.yml index 0ad21f99f8d45..6d4a9f4e2b88c 100644 --- a/.github/workflows/beam_PerformanceTests_MongoDBIO_IT.yml +++ b/.github/workflows/beam_PerformanceTests_MongoDBIO_IT.yml @@ -16,10 +16,8 @@ name: PerformanceTests MongoDBIO IT on: - issue_comment: - 
types: [created] schedule: - - cron: '14 5/12 * * *' + - cron: '50 10/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,20 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_MongoDBIO_IT_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_MongoDBIO_IT: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java MongoDBIO Performance Test' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -70,6 +69,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Authenticate on GCP id: auth uses: google-github-actions/auth@v1 @@ -94,9 +95,10 @@ jobs: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/mongoDBIO_IT.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/mongoDBIO_IT.txt arguments: | --mongoDBHostName=${{ steps.install_mongo.outputs.mongo_IP }} + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run integrationTest uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_ParquetIOIT.yml b/.github/workflows/beam_PerformanceTests_ParquetIOIT.yml index ceb540b16b1ff..d2d045f6ba023 100644 --- a/.github/workflows/beam_PerformanceTests_ParquetIOIT.yml +++ b/.github/workflows/beam_PerformanceTests_ParquetIOIT.yml @@ -16,10 +16,8 @@ name: PerformanceTests ParquetIOIT on: - issue_comment: - types: [created] schedule: - - cron: '10 3/12 * * *' + - cron: '50 10/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ 
secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_ParquetIOIT: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java ParquetIO Performance Test' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -69,6 +69,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action # The env variable is created and populated in the test-arguments-action as "_test_arguments_" - name: Prepare test arguments uses: ./.github/actions/test-arguments-action @@ -76,7 +78,7 @@ jobs: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/config_ParquetIOIT.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/parquetIOIT.txt arguments: | --filenamePrefix=gs://temp-storage-for-perf-tests/${{ matrix.job_name }}/${{github.run_id}}/ - name: run integrationTest diff --git a/.github/workflows/beam_PerformanceTests_ParquetIOIT_HDFS.yml b/.github/workflows/beam_PerformanceTests_ParquetIOIT_HDFS.yml index d0c40599eb62f..e5a85d06c20a1 100644 --- a/.github/workflows/beam_PerformanceTests_ParquetIOIT_HDFS.yml +++ b/.github/workflows/beam_PerformanceTests_ParquetIOIT_HDFS.yml @@ -16,10 +16,8 @@ name: PerformanceTests ParquetIOIT HDFS on: - issue_comment: - types: [created] schedule: - - cron: '50 3/12 * * *' + - cron: '50 10/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_ParquetIOIT_HDFS: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java ParquetIO Performance Test HDFS' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -69,6 +69,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Authenticate on 
GCP id: auth uses: google-github-actions/auth@v1 @@ -93,7 +95,7 @@ jobs: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/config_ParquetIOIT_HDFS.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/parquetIOIT_HDFS.txt arguments: | --filenamePrefix=hdfs://${{ steps.install_hadoop.outputs.hadoop_IP }}:9000/TEXTIO_IT_ --hdfsConfiguration=[{\\\"fs.defaultFS\\\":\\\"hdfs:${{ steps.install_hadoop.outputs.hadoop_IP }}:9000\\\",\\\"dfs.replication\\\":1}] diff --git a/.github/workflows/beam_PerformanceTests_PubsubIOIT_Python_Streaming.yml b/.github/workflows/beam_PerformanceTests_PubsubIOIT_Python_Streaming.yml index 3a16e482979f4..8a1d8196178bb 100644 --- a/.github/workflows/beam_PerformanceTests_PubsubIOIT_Python_Streaming.yml +++ b/.github/workflows/beam_PerformanceTests_PubsubIOIT_Python_Streaming.yml @@ -16,10 +16,8 @@ name: PerformanceTests PubsubIOIT Python Streaming on: - issue_comment: - types: [created] schedule: - - cron: '11 2 * * *' + - cron: '30 10 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,20 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_PubsubIOIT_Python_Streaming_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_PubsubIOIT_Python_Streaming: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run PubsubIO Performance Test Python' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 240 @@ -80,9 +79,10 @@ jobs: test-type: load test-language: python argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/pubsubIOIT_Python_Streaming.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/pubsubIOIT_Python_Streaming.txt arguments: | --job_name=performance-tests-psio-python-2gb$(date '+%m%d%H%M%S' --utc) + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: Run test uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_SQLBigQueryIO_Batch_Java.yml b/.github/workflows/beam_PerformanceTests_SQLBigQueryIO_Batch_Java.yml index 06014a56e682c..0fdd94bcfe533 100644 --- a/.github/workflows/beam_PerformanceTests_SQLBigQueryIO_Batch_Java.yml +++ b/.github/workflows/beam_PerformanceTests_SQLBigQueryIO_Batch_Java.yml @@ -16,10 +16,8 @@ name: PerformanceTests SQLBigQueryIO Batch Java on: - issue_comment: - types: 
[created] schedule: - - cron: '0 7,19 * * *' + - cron: '10 11/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,20 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_SQLBigQueryIO_Batch_Java_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_SQLBigQueryIO_Batch_Java: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run SQLBigQueryIO Batch Performance Test Java' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -80,7 +79,8 @@ jobs: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/SQLBigQueryIO_Batch_Java.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/SQLBigQueryIO_Batch_Java.txt + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: Run SQLBigQueryIO Batch Performance Test Java uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_SingleStoreIO.yml b/.github/workflows/beam_PerformanceTests_SingleStoreIO.yml new file mode 100644 index 0000000000000..0efb001a4eb6e --- /dev/null +++ b/.github/workflows/beam_PerformanceTests_SingleStoreIO.yml @@ -0,0 +1,119 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
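+# The `concurrency` group below chains `||` fallbacks: a GitHub Actions
+# expression returns its first truthy operand, so a scheduled run is keyed by
+# its cron string while a comment-triggered run is keyed by the immutable
+# comment id. Keying on the id rather than the comment body (the change made
+# across this patch) stops two identical "Run ..." trigger comments from
+# cancelling each other's runs.
+# Sketch of how the group resolves for a scheduled run (hypothetical sha):
+#   PerformanceTests SingleStoreIO @ 1a2b3c4-0 */12 * * *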
+ +name: PerformanceTests SingleStoreIO + +on: + schedule: + - cron: '0 */12 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} + +jobs: + beam_PerformanceTests_SingleStoreIO: + name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 100 + strategy: + matrix: + job_name: [beam_PerformanceTests_SingleStoreIO] + job_phrase: [Run Java SingleStoreIO Performance Test] + if: | + github.event_name == 'push' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event_name == 'workflow_dispatch' || + github.event.comment.body == 'Run Java SingleStoreIO Performance Test' + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Authenticate on GCP + id: auth + uses: google-github-actions/auth@v1 + with: + credentials_json: ${{ secrets.GCP_SA_KEY }} + project_id: ${{ secrets.GCP_PROJECT_ID }} + - name: Set k8s access + uses: ./.github/actions/setup-k8s-access + with: + cluster_name: io-datastores + k8s_namespace: ${{ matrix.job_name }}-${{ github.run_id }} + - name: Install Singlestore operator + run: | + kubectl apply -f ${{github.workspace}}/.test-infra/kubernetes/singlestore/sdb-rbac.yaml + kubectl apply -f ${{github.workspace}}/.test-infra/kubernetes/singlestore/sdb-cluster-crd.yaml + kubectl apply -f ${{github.workspace}}/.test-infra/kubernetes/singlestore/sdb-operator.yaml + kubectl wait --for=condition=Ready pod -l name=sdb-operator --timeout=300s + - name: Install Singlestore cluster + id: install_singlestore + run: | + kubectl apply -f ${{github.workspace}}/.test-infra/kubernetes/singlestore/sdb-cluster.yaml + kubectl wait --for=jsonpath='{.status.phase}'=Running memsqlclusters.memsql.com --all --timeout=300s + kubectl wait svc/svc-sdb-cluster-ddl --for=jsonpath='{.status.loadBalancer.ingress[0].ip}' --timeout=300s + loadbalancer_IP=$(kubectl get svc svc-sdb-cluster-ddl -o jsonpath='{.status.loadBalancer.ingress[0].ip}') + echo lb_ip=$loadbalancer_IP >> $GITHUB_OUTPUT + - name: Prepare test arguments + uses: ./.github/actions/test-arguments-action + with: + test-type: performance + test-language: java + argument-file-paths: | + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/config_PerformanceTests_SingleStoreIO.txt + arguments: | + --singleStoreServerName=${{ 
steps.install_singlestore.outputs.lb_ip }} + # The env variable is created and populated in the test-arguments-action as "_test_arguments_" + - name: Run Java SingleStore IO Performance Test + id: run_java_singlestore_io_performance_test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:io:singlestore:integrationTest + arguments: | + --tests org.apache.beam.sdk.io.singlestore.SingleStoreIOPerformanceIT \ + --info \ + -DintegrationTestRunner=dataflow \ + -DintegrationTestPipelineOptions='[${{ env.beam_PerformanceTests_SingleStoreIO_test_arguments_1 }}]' \ No newline at end of file diff --git a/.github/workflows/beam_PerformanceTests_SpannerIO_Read_2GB_Python.yml b/.github/workflows/beam_PerformanceTests_SpannerIO_Read_2GB_Python.yml index 9e464ef58900e..8e77a0edd66f9 100644 --- a/.github/workflows/beam_PerformanceTests_SpannerIO_Read_2GB_Python.yml +++ b/.github/workflows/beam_PerformanceTests_SpannerIO_Read_2GB_Python.yml @@ -16,10 +16,8 @@ name: PerformanceTests SpannerIO Read 2GB Python on: - issue_comment: - types: [created] schedule: - - cron: '30 4 * * *' + - cron: '30 10 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,20 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_SpannerIO_Read_2GB_Python_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_SpannerIO_Read_2GB_Python: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run SpannerIO Read 2GB Performance Test Python' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 480 @@ -80,9 +79,10 @@ jobs: test-type: load test-language: python argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/spannerIO_Read_2GB_Python.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/spannerIO_Read_2GB_Python.txt arguments: | --job_name=performance-tests-spanner-read-python-2gb$(date '+%m%d%H%M%S' --utc) + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run Performance SpannerIO Read 2GB Test Python uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch.yml b/.github/workflows/beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch.yml index 8cfce643f7499..92664757e70ab 100644 --- a/.github/workflows/beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch.yml +++ 
b/.github/workflows/beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch.yml @@ -16,10 +16,8 @@ name: PerformanceTests SpannerIO Write 2GB Python Batch on: - issue_comment: - types: [created] schedule: - - cron: '0 5 * * *' + - cron: '30 11 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,20 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run SpannerIO Write 2GB Performance Test Python Batch' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 480 @@ -80,9 +79,10 @@ jobs: test-type: load test-language: python argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/spannerIO_Write_2GB_Python.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/spannerIO_Write_2GB_Python.txt arguments: | --job_name=performance-tests-spannerio-write-python-batch-2gb$(date '+%m%d%H%M%S' --utc) + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run Performance SpannerIO Write 2GB Test Python uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_SparkReceiver_IO.yml b/.github/workflows/beam_PerformanceTests_SparkReceiver_IO.yml index 58c561f6ef649..33d8122775d9d 100644 --- a/.github/workflows/beam_PerformanceTests_SparkReceiver_IO.yml +++ b/.github/workflows/beam_PerformanceTests_SparkReceiver_IO.yml @@ -16,10 +16,8 @@ name: PerformanceTests SparkReceiver IO on: - issue_comment: - types: [created] schedule: - - cron: '15 6/12 * * *' + - cron: '10 11/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,20 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: 
GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_SparkReceiver_IO_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_SparkReceiver_IO: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java SparkReceiverIO Performance Test' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 120 @@ -70,6 +69,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Authenticate on GCP id: auth uses: google-github-actions/auth@v1 @@ -94,9 +95,10 @@ jobs: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/sparkReceiver_IO.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/sparkReceiver_IO.txt arguments: | --rabbitMqBootstrapServerAddress=amqp://guest:guest@${{ steps.install_rabbitmq.outputs.rabbitmq_IP }}:5672 + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run integrationTest uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_TFRecordIOIT.yml b/.github/workflows/beam_PerformanceTests_TFRecordIOIT.yml index aa96b7e2bf815..84438ff31584b 100644 --- a/.github/workflows/beam_PerformanceTests_TFRecordIOIT.yml +++ b/.github/workflows/beam_PerformanceTests_TFRecordIOIT.yml @@ -16,10 +16,8 @@ name: PerformanceTests TFRecordIOIT on: - issue_comment: - types: [created] schedule: - - cron: '10 4/12 * * *' + - cron: '50 11/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,20 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_TFRecordIOIT_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_TFRecordIOIT: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java TFRecordIO Performance Test' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -70,15 +69,18 @@ jobs: comment_phrase: ${{ 
matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/config_TFRecordIOIT.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/TFRecordIOIT.txt arguments: | --filenamePrefix=gs://temp-storage-for-perf-tests/${{ matrix.job_name }}/${{github.run_id}}/ + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run integrationTest uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_TFRecordIOIT_HDFS.yml b/.github/workflows/beam_PerformanceTests_TFRecordIOIT_HDFS.yml index bbc4a79aa0f3c..6cc273bbe9c0f 100644 --- a/.github/workflows/beam_PerformanceTests_TFRecordIOIT_HDFS.yml +++ b/.github/workflows/beam_PerformanceTests_TFRecordIOIT_HDFS.yml @@ -18,10 +18,8 @@ name: PerformanceTests TFRecordIOIT HDFS on: # TODO(https://github.com/apache/beam/issues/18796) TFRecord performance test is failing only when running on hdfs. # We need to fix this before enabling this job on jenkins. - # issue_comment: - # types: [created] # schedule: - # - cron: '17 8/20 * * *' + # - cron: '50 11/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -42,20 +40,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_TFRecordIOIT_HDFS_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_TFRecordIOIT_HDFS: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java TFRecordIO Performance Test HDFS' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -72,6 +71,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Authenticate on GCP id: auth uses: google-github-actions/auth@v1 @@ -96,10 +97,11 @@ jobs: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/TFRecordIOIT_HDFS.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/TFRecordIOIT_HDFS.txt 
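+      # A sketch of the mechanism (naming taken from the placeholder removed in
+      # the hunk above): test-arguments-action merges the options from every file
+      # listed under argument-file-paths with the inline `arguments` block and
+      # exports the result as env vars named <job_name>_test_arguments_<n>, here
+      #   beam_PerformanceTests_TFRecordIOIT_HDFS_test_arguments_1
+      # which the "run integrationTest" step passes to Gradle, e.g.
+      #   -DintegrationTestPipelineOptions='[${{ env.beam_PerformanceTests_TFRecordIOIT_HDFS_test_arguments_1 }}]'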
arguments: | --filenamePrefix=hdfs://${{ steps.install_hadoop.outputs.hadoop_IP }}:9000/TEXTIO_IT_ --hdfsConfiguration=[{\\\"fs.defaultFS\\\":\\\"hdfs:${{ steps.install_hadoop.outputs.hadoop_IP }}:9000\\\",\\\"dfs.replication\\\":1}] + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run integrationTest uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_TextIOIT.yml b/.github/workflows/beam_PerformanceTests_TextIOIT.yml index 9daa2b29dd2a0..96b20ad3a6f66 100644 --- a/.github/workflows/beam_PerformanceTests_TextIOIT.yml +++ b/.github/workflows/beam_PerformanceTests_TextIOIT.yml @@ -16,10 +16,8 @@ name: PerformanceTests TextIOIT on: - issue_comment: - types: [created] schedule: - - cron: '0 7,19 * * *' + - cron: '10 11/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_TextIOIT: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java TextIO Performance Test' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -69,13 +69,15 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/textIOIT.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/textIOIT.txt arguments: | --filenamePrefix=gs://temp-storage-for-perf-tests/${{ matrix.job_name }}/${{github.run_id}} # The env variable is created and populated in the test-arguments-action as "beam_PerformanceTests_TextIOIT_test_arguments_1" diff --git a/.github/workflows/beam_PerformanceTests_TextIOIT_HDFS.yml b/.github/workflows/beam_PerformanceTests_TextIOIT_HDFS.yml index a98379b281a9c..aca9739c3fb11 100644 --- a/.github/workflows/beam_PerformanceTests_TextIOIT_HDFS.yml +++ b/.github/workflows/beam_PerformanceTests_TextIOIT_HDFS.yml @@ -16,10 +16,8 @@ name: PerformanceTests TextIOIT HDFS on: - issue_comment: - types: [created] schedule: - - cron: '30 7,19 * * *' + - cron: '10 11/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the 
default permissions which are `write-all` in case of pull_request_target event @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_TextIOIT_HDFS: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java TextIO Performance Test HDFS' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -69,6 +69,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Set k8s access uses: ./.github/actions/setup-k8s-access with: @@ -87,7 +89,7 @@ jobs: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/textIOIT_HDFS.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/textIOIT_HDFS.txt arguments: | --filenamePrefix=hdfs://${{ steps.install_hadoop.outputs.hadoop_IP }}:9000/TEXTIO_IT_ --hdfsConfiguration=[{\\\"fs.defaultFS\\\":\\\"hdfs:${{ steps.install_hadoop.outputs.hadoop_IP }}:9000\\\",\\\"dfs.replication\\\":1}] diff --git a/.github/workflows/beam_PerformanceTests_TextIOIT_Python.yml b/.github/workflows/beam_PerformanceTests_TextIOIT_Python.yml index cb2b7fb34a9fa..2bc8bdbb194c2 100644 --- a/.github/workflows/beam_PerformanceTests_TextIOIT_Python.yml +++ b/.github/workflows/beam_PerformanceTests_TextIOIT_Python.yml @@ -16,10 +16,8 @@ name: PerformanceTests TextIOIT Python on: - issue_comment: - types: [created] schedule: - - cron: '0 8,20 * * *' + - cron: '30 11 * * *' workflow_dispatch: # Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,19 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + 
INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_TextIOIT_Python: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Python TextIO Performance Test' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -79,7 +79,7 @@ jobs: test-type: load test-language: python argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/textIOIT_Python.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/textIOIT_Python.txt arguments: | --filename_prefix=gs://temp-storage-for-perf-tests/${{ matrix.job_name }}/${{github.run_id}} # The env variable is created and populated in the test-arguments-action as "beam_PerformanceTests_TextIOIT_Python_test_arguments_1" diff --git a/.github/workflows/beam_PerformanceTests_WordCountIT_PythonVersions.yml b/.github/workflows/beam_PerformanceTests_WordCountIT_PythonVersions.yml index 53b157d691c59..b3e606fcc3904 100644 --- a/.github/workflows/beam_PerformanceTests_WordCountIT_PythonVersions.yml +++ b/.github/workflows/beam_PerformanceTests_WordCountIT_PythonVersions.yml @@ -16,10 +16,8 @@ name: PerformanceTests WordCountIT PythonVersions on: - issue_comment: - types: [created] schedule: - - cron: '12 3 * * *' + - cron: '50 11/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,25 +38,26 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_WordCountIT_PythonVersions_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_WordCountIT_PythonVersions: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || (startswith(github.event.comment.body, 'Run Python') && endswith(github.event.comment.body, 'WordCountIT Performance Test')) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 - name: ${{ matrix.job_name }} (${{ matrix.job_phrase_1 }}${{matrix.python_version}} ${{ matrix.job_phrase_2 }}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase_1 }} ${{ matrix.python_version }} ${{ matrix.job_phrase_2 }}) strategy: fail-fast: false matrix: @@ -71,13 +70,13 @@ jobs: - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{ matrix.job_phrase_1 }}${{matrix.python_version}} ${{ matrix.job_phrase_2 }} + comment_phrase: ${{ matrix.job_phrase_1 }} ${{ matrix.python_version }} ${{ matrix.job_phrase_2 }} github_token: ${{ secrets.GITHUB_TOKEN }} - github_job: ${{ 
matrix.job_name }} (${{ matrix.job_phrase_1 }}${{matrix.python_version}} ${{ matrix.job_phrase_2 }}) + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase_1 }} ${{ matrix.python_version }} ${{ matrix.job_phrase_2 }}) - name: Setup environment uses: ./.github/actions/setup-environment-action with: - python-version: ${{matrix.python_version}} + python-version: ${{ matrix.python_version }} - name: Set PY_VER_CLEAN id: set_py_ver_clean run: | @@ -90,11 +89,12 @@ jobs: test-type: load test-language: python argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/wordCountIT_Python.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/wordCountIT_Python.txt arguments: | --job_name=performance-tests-wordcount-python${{steps.set_py_ver_clean.outputs.py_ver_clean}}-batch-1gb$(date '+%m%d%H%M%S' --utc) --metrics_table=wordcount_py${{steps.set_py_ver_clean.outputs.py_ver_clean}}_pkb_results --influx_measurement=wordcount_py${{steps.set_py_ver_clean.outputs.py_ver_clean}}_results + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: Run test uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -102,4 +102,17 @@ jobs: arguments: | --info \ -Ptest=apache_beam/examples/wordcount_it_test.py::WordCountIT::test_wordcount_it \ - "-Ptest-pipeline-options=${{ env.beam_PerformanceTests_WordCountIT_PythonVersions_test_arguments_1 }}" \ No newline at end of file + "-Ptest-pipeline-options=${{ env.beam_PerformanceTests_WordCountIT_PythonVersions_test_arguments_1 }}" + - name: Archive Python Test Results + uses: actions/upload-artifact@v3 + if: failure() + with: + name: Python Test Results + path: '**/pytest*.xml' + - name: Publish Python Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/pytest*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PerformanceTests_XmlIOIT.yml b/.github/workflows/beam_PerformanceTests_XmlIOIT.yml index cd0245f269d11..cf83d2ea40155 100644 --- a/.github/workflows/beam_PerformanceTests_XmlIOIT.yml +++ b/.github/workflows/beam_PerformanceTests_XmlIOIT.yml @@ -16,10 +16,8 @@ name: PerformanceTests XmlIOIT on: - issue_comment: - types: [created] schedule: - - cron: '30 4/12 * * *' + - cron: '50 11/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,20 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_XmlIOIT_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} 
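+# The empty beam_PerformanceTests_XmlIOIT_test_arguments_1 placeholder is dropped
+# here as in the other workflows: test-arguments-action now defines that variable
+# at runtime, presumably by appending to $GITHUB_ENV, while the InfluxDB
+# credentials are mapped once at workflow scope. A minimal sketch of the runtime
+# export (hypothetical line inside the action):
+#   echo "beam_PerformanceTests_XmlIOIT_test_arguments_1=${PIPELINE_OPTS}" >> "$GITHUB_ENV"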
jobs: beam_PerformanceTests_XmlIOIT: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java XmlIO Performance Test' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -70,15 +69,18 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Prepare test arguments uses: ./.github/actions/test-arguments-action with: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/config_XmlIOIT.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/xmlIOIT.txt arguments: | --filenamePrefix=gs://temp-storage-for-perf-tests/${{ matrix.job_name }}/${{github.run_id}}/ + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run integrationTest uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_XmlIOIT_HDFS.yml b/.github/workflows/beam_PerformanceTests_XmlIOIT_HDFS.yml index a89f0b5dcb0f4..3740a30f129f4 100644 --- a/.github/workflows/beam_PerformanceTests_XmlIOIT_HDFS.yml +++ b/.github/workflows/beam_PerformanceTests_XmlIOIT_HDFS.yml @@ -16,10 +16,8 @@ name: PerformanceTests XmlIOIT HDFS on: - issue_comment: - types: [created] schedule: - - cron: '50 4/12 * * *' + - cron: '50 11/12 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,20 +38,21 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - beam_PerformanceTests_XmlIOIT_HDFS_test_arguments_1: '' + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_XmlIOIT_HDFS: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java XmlIO Performance Test HDFS' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -70,6 +69,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Authenticate on GCP id: auth uses: google-github-actions/auth@v1 @@ -93,10 +94,11 @@ jobs: test-type: performance test-language: java argument-file-paths: | - ${{ github.workspace 
}}/.github/workflows/performance-tests-job-configs/config_XmlIOIT_HDFS.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/xmlIOIT_HDFS.txt arguments: | --filenamePrefix=hdfs://${{ steps.install_hadoop.outputs.hadoop_IP }}:9000/TEXTIO_IT_ --hdfsConfiguration=[{\\\"fs.defaultFS\\\":\\\"hdfs:${{ steps.install_hadoop.outputs.hadoop_IP }}:9000\\\",\\\"dfs.replication\\\":1}] + # The env variables are created and populated in the test-arguments-action as "_test_arguments_" - name: run integrationTest uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PerformanceTests_xlang_KafkaIO_Python.yml b/.github/workflows/beam_PerformanceTests_xlang_KafkaIO_Python.yml index a29454ced4bff..c6c60e1657219 100644 --- a/.github/workflows/beam_PerformanceTests_xlang_KafkaIO_Python.yml +++ b/.github/workflows/beam_PerformanceTests_xlang_KafkaIO_Python.yml @@ -16,10 +16,8 @@ name: PerformanceTests xlang KafkaIO Python on: - issue_comment: - types: [created] schedule: - - cron: '10 5 * * *' + - cron: '30 11 * * *' workflow_dispatch: # Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,27 +38,29 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} + INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} jobs: beam_PerformanceTests_xlang_KafkaIO_Python: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || - github.event.comment.body == 'Run Java CompressedTextIO Performance Test' + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Python xlang KafkaIO Performance Test' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 240 name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) strategy: matrix: job_name: ["beam_PerformanceTests_xlang_KafkaIO_Python"] - job_phrase: ["Run Java CompressedTextIO Performance Test"] + job_phrase: ["Run Python xlang KafkaIO Performance Test"] steps: - uses: actions/checkout@v3 - name: Setup repository @@ -73,6 +73,12 @@ jobs: uses: ./.github/actions/setup-environment-action with: python-version: 3.8 + - name: Authenticate on GCP + id: auth + uses: google-github-actions/auth@v1 + with: + credentials_json: ${{ secrets.GCP_SA_KEY }} + project_id: ${{ secrets.GCP_PROJECT_ID }} - name: Set k8s access uses: ./.github/actions/setup-k8s-access with: @@ -101,7 +107,7 @@ jobs: test-type: load test-language: python argument-file-paths: | - ${{ github.workspace }}/.github/workflows/performance-tests-job-configs/xlang_KafkaIO_Python.txt + ${{ github.workspace }}/.github/workflows/performance-tests-pipeline-options/xlang_KafkaIO_Python.txt arguments: | --filename_prefix=gs://temp-storage-for-perf-tests/${{ 
matrix.job_name }}/${{github.run_id}}/ --bootstrap_servers=${{ steps.kafka_ip.outputs.KAFKA_BROKER_0 }}:32400,${{ steps.kafka_ip.outputs.KAFKA_BROKER_1 }}:32400,${{ steps.kafka_ip.outputs.KAFKA_BROKER_2 }}:32400 diff --git a/.github/workflows/beam_PostCommit_BeamMetrics_Publish.yml b/.github/workflows/beam_PostCommit_BeamMetrics_Publish.yml index 0ea3207b505ad..b655eab8088c3 100644 --- a/.github/workflows/beam_PostCommit_BeamMetrics_Publish.yml +++ b/.github/workflows/beam_PostCommit_BeamMetrics_Publish.yml @@ -16,12 +16,6 @@ name: PostCommit BeamMetrics Publish on: - push: - tags: ['v*'] - branches: ['master', 'release-*'] - paths: ['.github/workflows/beam_PostCommit_BeamMetrics_Publish.yml'] - issue_comment: - types: [created] schedule: - cron: '24 2 * * *' workflow_dispatch: @@ -44,7 +38,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true @@ -58,7 +52,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Beam Metrics Deployment' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -75,6 +69,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Authenticate on GCP uses: google-github-actions/setup-gcloud@v0 with: diff --git a/.github/workflows/beam_PostCommit_Go.yml b/.github/workflows/beam_PostCommit_Go.yml index dc1180314d67c..f9da27ff2ce90 100644 --- a/.github/workflows/beam_PostCommit_Go.yml +++ b/.github/workflows/beam_PostCommit_Go.yml @@ -16,10 +16,10 @@ name: PostCommit Go on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 3/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +52,8 @@ jobs: beam_PostCommit_Go: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Go PostCommit' runs-on: [self-hosted, ubuntu-20.04, main] 
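    # Scheduled runs are now gated on github.repository == 'apache/beam' so forks
    # do not consume the self-hosted runners, and the new pull_request_target
    # trigger on release/trigger_all_tests.json lets a release-validation PR fan
    # out all PostCommit suites by touching that single file (inferred intent).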
timeout-minutes: 300 @@ -69,6 +70,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Set up Docker Buildx uses: docker/setup-buildx-action@v1 - name: Authenticate on GCP diff --git a/.github/workflows/beam_PostCommit_Go_Dataflow_ARM.yml b/.github/workflows/beam_PostCommit_Go_Dataflow_ARM.yml index 2473a44b010b5..532c48e7b0724 100644 --- a/.github/workflows/beam_PostCommit_Go_Dataflow_ARM.yml +++ b/.github/workflows/beam_PostCommit_Go_Dataflow_ARM.yml @@ -16,14 +16,12 @@ name: PostCommit Go Dataflow ARM on: - push: - tags: ['v*'] - branches: ['master', 'release-*'] - paths: ['model/**', 'sdks/go.**', 'release/**', '.github/workflows/beam_PostCommit_Go_Dataflow_ARM.yml'] issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 3/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -44,7 +42,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -57,7 +55,8 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Go PostCommit Dataflow ARM' runs-on: [self-hosted, ubuntu-20.04, main] name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) diff --git a/.github/workflows/beam_PostCommit_Go_VR_Flink.yml b/.github/workflows/beam_PostCommit_Go_VR_Flink.yml index 21dcf7f8e72a2..ce1e82d22e2a0 100644 --- a/.github/workflows/beam_PostCommit_Go_VR_Flink.yml +++ b/.github/workflows/beam_PostCommit_Go_VR_Flink.yml @@ -16,10 +16,10 @@ name: PostCommit Go VR Flink on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 3/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +52,8 @@ jobs: beam_PostCommit_Go_VR_Flink: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + 
github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Go Flink ValidatesRunner' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -69,6 +70,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Go Flink ValidatesRunner script env: CLOUDSDK_CONFIG: ${{ env.KUBELET_GCLOUD_CONFIG_PATH}} diff --git a/.github/workflows/beam_PostCommit_Go_VR_Samza.yml b/.github/workflows/beam_PostCommit_Go_VR_Samza.yml index 90e107ee0c175..5d1ab6a8d13eb 100644 --- a/.github/workflows/beam_PostCommit_Go_VR_Samza.yml +++ b/.github/workflows/beam_PostCommit_Go_VR_Samza.yml @@ -16,10 +16,10 @@ name: PostCommit Go VR Samza on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 3/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +52,8 @@ jobs: beam_PostCommit_Go_VR_Samza: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Go Samza ValidatesRunner' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 diff --git a/.github/workflows/beam_PostCommit_Go_VR_Spark.yml b/.github/workflows/beam_PostCommit_Go_VR_Spark.yml index 07cd627059b20..ed9f505831330 100644 --- a/.github/workflows/beam_PostCommit_Go_VR_Spark.yml +++ b/.github/workflows/beam_PostCommit_Go_VR_Spark.yml @@ -16,10 +16,10 @@ name: PostCommit Go VR Spark on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 3/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +52,8 @@ jobs: beam_PostCommit_Go_VR_Spark: if: | github.event_name == 'workflow_dispatch' || - 
github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Go Spark ValidatesRunner' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -69,6 +70,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Go Spark ValidatesRunner script env: CLOUDSDK_CONFIG: ${{ env.KUBELET_GCLOUD_CONFIG_PATH}} diff --git a/.github/workflows/beam_PostCommit_Java.yml b/.github/workflows/beam_PostCommit_Java.yml index 3eed85bc2026a..d9fa4d52f9656 100644 --- a/.github/workflows/beam_PostCommit_Java.yml +++ b/.github/workflows/beam_PostCommit_Java.yml @@ -18,15 +18,15 @@ name: PostCommit Java on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 5/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -52,7 +52,7 @@ env: jobs: beam_PostCommit_Java: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 240 strategy: @@ -61,7 +61,8 @@ jobs: job_phrase: [Run Java PostCommit] if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java PostCommit' steps: - uses: actions/checkout@v4 @@ -71,6 +72,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run PostCommit Java script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PostCommit_Java_Avro_Versions.yml b/.github/workflows/beam_PostCommit_Java_Avro_Versions.yml index 1bd828d08ee05..36bdec265dd57 100644 --- a/.github/workflows/beam_PostCommit_Java_Avro_Versions.yml +++ b/.github/workflows/beam_PostCommit_Java_Avro_Versions.yml @@ -18,15 +18,15 @@ name: PostCommit Java Avro Versions on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 3/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || 
github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -52,7 +52,7 @@ env: jobs: beam_PostCommit_Java_Avro_Versions: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 240 strategy: @@ -61,7 +61,8 @@ jobs: job_phrase: [Run Java Avro Versions PostCommit] if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java Avro Versions PostCommit' steps: - uses: actions/checkout@v4 @@ -71,6 +72,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run PostCommit Java Avro Versions script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java11.yml b/.github/workflows/beam_PostCommit_Java_BigQueryEarlyRollout.yml similarity index 57% rename from .github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java11.yml rename to .github/workflows/beam_PostCommit_Java_BigQueryEarlyRollout.yml index 03f4d32861b1b..f21f301cc2651 100644 --- a/.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java11.yml +++ b/.github/workflows/beam_PostCommit_Java_BigQueryEarlyRollout.yml @@ -15,54 +15,29 @@ # specific language governing permissions and limitations # under the License. 
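Almost every PostCommit workflow touched by this patch converges on the same trigger-and-guard shape. A minimal sketch of that shape, assembled from the hunks in this diff (beam_PostCommit_Example and 'Run Example PostCommit' are placeholder names; each workflow staggers its own cron offset):

    on:
      schedule:
        - cron: '45 3/6 * * *'                    # offset varies per workflow
      pull_request_target:
        paths: ['release/trigger_all_tests.json']
      workflow_dispatch:

    jobs:
      beam_PostCommit_Example:                    # placeholder job id
        if: |
          github.event_name == 'workflow_dispatch' ||
          github.event_name == 'pull_request_target' ||
          (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
          github.event.comment.body == 'Run Example PostCommit'

The repository guard keeps scheduled runs from firing in forks, and editing release/trigger_all_tests.json in a pull request exercises the whole suite through pull_request_target. The same hunks also replace per-workflow actions/setup-java steps with the shared local action, roughly (the Java version varies by workflow):

    - name: Setup environment
      uses: ./.github/actions/setup-environment-action
      with:
        java-version: 11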
-name: PreCommit Java Examples Dataflow Java11 +name: PostCommit Java BigQueryEarlyRollout on: - push: - tags: ['v*'] - branches: ['master', 'release-*'] - paths: - - 'model/**' - - 'sdks/java/**' - - 'runners/google-cloud-dataflow-java/**' - - 'examples/java/**' - - 'examples/kotlin/**' - - 'release/**' - - '.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java11.yml' - pull_request_target: - branches: ['master', 'release-*'] - paths: - - 'model/**' - - 'sdks/java/**' - - 'runners/google-cloud-dataflow-java/**' - - 'examples/java/**' - - 'examples/kotlin/**' - - 'release/**' - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 3/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true -env: - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} - GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} - GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: write + pull-requests: read checks: write contents: read deployments: read id-token: none - issues: write + issues: read discussions: read packages: read pages: read @@ -70,29 +45,35 @@ permissions: security-events: read statuses: read +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + jobs: - beam_PreCommit_Java_Examples_Dataflow_Java11: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + beam_PostCommit_Java_BigQueryEarlyRollout: + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 100 strategy: matrix: - job_name: [beam_PreCommit_Java_Examples_Dataflow_Java11] - job_phrase: [Run Java_Examples_Dataflow_Java11 PreCommit] - timeout-minutes: 60 + job_name: [beam_PostCommit_Java_BigQueryEarlyRollout] + job_phrase: [Run Java BigQueryEarlyRollout PostCommit] if: | - github.event_name == 'push' || - github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || - github.event.comment.body == 'Run Java_Examples_Dataflow_Java11 PreCommit' + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Java BigQueryEarlyRollout PostCommit' steps: - uses: actions/checkout@v4 - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{matrix.job_phrase}} + comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} - github_job: ${{ matrix.job_name }} (${{matrix.job_phrase}}) + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: 
./.github/actions/setup-environment-action - name: Authenticate on GCP uses: google-github-actions/setup-gcloud@v0 with: @@ -100,23 +81,10 @@ jobs: service_account_key: ${{ secrets.GCP_SA_KEY }} project_id: ${{ secrets.GCP_PROJECT_ID }} export_default_credentials: true - # The workflow installs java 11 and as default jvm. This is different from - # PreCommit_Java_Examples_Dataflow_Java17 where the build system and sources are compiled with Java8 - - name: Set up Java - uses: actions/setup-java@v3.8.0 - with: - distribution: 'temurin' - java-version: '11' - - name: run javaExamplesDataflowPrecommit script + - name: run PostCommit Java BigQueryEarlyRollout script uses: ./.github/actions/gradle-command-self-hosted-action with: - gradle-command: :runners:google-cloud-dataflow-java:examples:preCommit - arguments: | - -PdisableSpotlessCheck=true \ - -PdisableCheckStyle=true \ - -PskipCheckerFramework \ - -PcompileAndRunTestsWithJava11 \ - -Pjava11Home=$JAVA_HOME_11_X64 \ + gradle-command: :sdks:java:io:google-cloud-platform:bigQueryEarlyRolloutIntegrationTest - name: Archive JUnit Test Results uses: actions/upload-artifact@v3 if: failure() diff --git a/.github/workflows/beam_PostCommit_Java_DataflowV1.yml b/.github/workflows/beam_PostCommit_Java_DataflowV1.yml index 55e4f99afc74d..68c901c2e5277 100644 --- a/.github/workflows/beam_PostCommit_Java_DataflowV1.yml +++ b/.github/workflows/beam_PostCommit_Java_DataflowV1.yml @@ -18,15 +18,15 @@ name: PostCommit Java Dataflow V1 on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 3/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -52,7 +52,7 @@ env: jobs: beam_PostCommit_Java_DataflowV1: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 240 strategy: @@ -61,7 +61,8 @@ jobs: job_phrase: [Run PostCommit_Java_Dataflow] if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run PostCommit_Java_Dataflow' steps: - uses: actions/checkout@v4 @@ -71,10 +72,9 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Set up Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'temurin' java-version: | 11 8 diff --git a/.github/workflows/beam_PostCommit_Java_DataflowV2.yml b/.github/workflows/beam_PostCommit_Java_DataflowV2.yml index 7ca2f57ce78d9..90a97296df774 100644 --- 
a/.github/workflows/beam_PostCommit_Java_DataflowV2.yml +++ b/.github/workflows/beam_PostCommit_Java_DataflowV2.yml @@ -18,15 +18,15 @@ name: PostCommit Java Dataflow V2 on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 3/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -52,7 +52,7 @@ env: jobs: beam_PostCommit_Java_DataflowV2: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 240 strategy: @@ -61,7 +61,8 @@ jobs: job_phrase: [Run PostCommit_Java_DataflowV2] if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run PostCommit_Java_DataflowV2' steps: - uses: actions/checkout@v4 @@ -71,6 +72,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run PostCommit Java Dataflow V2 script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PostCommit_Java_Examples_Dataflow.yml b/.github/workflows/beam_PostCommit_Java_Examples_Dataflow.yml index 3b3b51b94f256..488031a9244a3 100644 --- a/.github/workflows/beam_PostCommit_Java_Examples_Dataflow.yml +++ b/.github/workflows/beam_PostCommit_Java_Examples_Dataflow.yml @@ -18,15 +18,15 @@ name: PostCommit Java Examples Dataflow on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 3/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -52,7 +52,7 @@ env: jobs: beam_PostCommit_Java_Examples_Dataflow_V2: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 180 strategy: @@ -61,7 +61,8 
@@ jobs: job_phrase: [Run Java examples on Dataflow] if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java Examples on Dataflow' steps: - uses: actions/checkout@v4 @@ -71,6 +72,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run PostCommit Java Examples Dataflow script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml b/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml index 939d2646d352b..26ea487e0dcf4 100644 --- a/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml +++ b/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml @@ -18,26 +18,17 @@ name: PostCommit Java Examples Dataflow ARM on: - push: - tags: ['v*'] - branches: ['master', 'release-*'] - paths: - - 'model/**' - - 'sdks/java/**' - - 'runners/google-cloud-dataflow-java/**' - - 'examples/java/**' - - 'examples/kotlin/**' - - 'release/**' - - '.github/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 3/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -63,7 +54,7 @@ permissions: jobs: beam_PostCommit_Java_Examples__Dataflow_ARM: - name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{matrix.java_version}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.java_version }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 120 strategy: @@ -71,20 +62,21 @@ jobs: matrix: job_name: [beam_PostCommit_Java_Examples__Dataflow_ARM] job_phrase: [Run Java_Examples_Dataflow_ARM PostCommit] - java_version: ['8','11','17'] + java_version: ['8','11','17','21'] if: | github.event_name == 'push' || - github.event_name == 'schedule' || - github.event_name == 'workflow_dispatch' ||
+ (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event_name == 'workflow_dispatch' || + github.event_name == 'pull_request_target' || startswith(github.event.comment.body, 'Run Java_Examples_Dataflow_ARM PostCommit') steps: - uses: actions/checkout@v4 - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{ matrix.job_phrase }} ${{matrix.java_version}} + comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.java_version }} github_token: ${{ secrets.GITHUB_TOKEN }} - github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{matrix.java_version}}) + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.java_version }}) - name: Setup environment uses: ./.github/actions/setup-environment-action with: @@ -111,7 +103,7 @@ jobs: gradle-command: :runners:google-cloud-dataflow-java:arm:examplesJavaRunnerV2IntegrationTestARM max-workers: 12 arguments: | - -PcompileAndRunTestsWithJava${{ matrix.java_version }} \ + -PtestJavaVersion=${{ matrix.java_version }} \ -Pjava${{ matrix.java_version }}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \ -Pcontainer-architecture-list=arm64,amd64 \ -Ppush-containers \ diff --git a/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_Java.yml b/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_Java.yml index a998e02422543..747c9f0983fce 100644 --- a/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_Java.yml +++ b/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_Java.yml @@ -18,15 +18,15 @@ name: PostCommit Java Examples Dataflow Java on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 3/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -60,23 +60,23 @@ jobs: matrix: job_name: [beam_PostCommit_Java_Examples_Dataflow_Java] job_phrase: [Run Java examples on Dataflow Java] - java_version: ['11','17'] + java_version: ['11','17','21'] if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || startswith(github.event.comment.body, 'Run Java examples on Dataflow Java') steps: - uses: actions/checkout@v4 - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{ matrix.job_phrase }} ${{matrix.java_version}} + comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.java_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.java_version }}) - - name: Set up Java${{ matrix.java_version }} - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: -
distribution: 'temurin' java-version: | ${{ matrix.java_version }} 8 diff --git a/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_V2.yml b/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_V2.yml index 0c87a81c5bedf..ae50ec506dff5 100644 --- a/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_V2.yml +++ b/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_V2.yml @@ -18,15 +18,15 @@ name: PostCommit Java Examples Dataflow V2 on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 3/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -52,7 +52,7 @@ env: jobs: beam_PostCommit_Java_Examples_Dataflow_V2: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 180 strategy: @@ -61,7 +61,8 @@ jobs: job_phrase: [Run Java Examples on Dataflow Runner V2] if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java Examples on Dataflow Runner V2' steps: - uses: actions/checkout@v4 @@ -71,6 +72,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run PostCommit Java Examples Dataflow V2 script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -86,4 +89,6 @@ jobs: uses: EnricoMi/publish-unit-test-result-action@v2 if: always() with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} files: '**/build/test-results/**/*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_V2_Java.yml b/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_V2_Java.yml index 2526f9b565313..1ed3cf5eb981a 100644 --- a/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_V2_Java.yml +++ b/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_V2_Java.yml @@ -18,15 +18,15 @@ name: PostCommit Java Examples Dataflow V2 Java on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 3/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || 
github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -52,7 +52,7 @@ env: jobs: beam_PostCommit_Java_Examples_Dataflow_V2_Java: - name: ${{matrix.job_name}} (${{matrix.job_phrase_1}}${{matrix.job_phrase_2}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase_1 }} ${{ matrix.java_version }} ${{ matrix.job_phrase_2 }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 180 strategy: @@ -61,10 +61,11 @@ jobs: job_name: [beam_PostCommit_Java_Examples_Dataflow_V2_Java] job_phrase_1: [Run Java ] job_phrase_2: [Examples on Dataflow Runner V2] - java_version: ['11', '17'] + java_version: ['11', '17', '21'] if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || (contains(github.event.comment.body, 'Run Java') && contains(github.event.comment.body, 'Examples on Dataflow Runner V2')) steps: @@ -72,14 +73,16 @@ jobs: - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{ matrix.job_phrase_1 }} ${{matrix.java_version}} ${{ matrix.job_phrase_2 }} + comment_phrase: ${{ matrix.job_phrase_1 }} ${{ matrix.java_version }} ${{ matrix.job_phrase_2 }} github_token: ${{ secrets.GITHUB_TOKEN }} - github_job: ${{ matrix.job_name }} ${{ matrix.job_phrase_1 }} ${{matrix.java_version}} ${{ matrix.job_phrase_2 }} - - name: Set up Java${{ matrix.java_version }} - uses: actions/setup-java@v3.8.0 + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase_1 }} ${{ matrix.java_version }} ${{ matrix.job_phrase_2 }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: distribution: 'temurin' - java-version: ${{ matrix.java_version }} + java-version: | + ${{ matrix.java_version }} + 8 - name: run PostCommit Java Examples Dataflow V2 Java${{ matrix.java_version }} script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -88,7 +91,7 @@ jobs: -PdisableSpotlessCheck=true \ -PdisableCheckStyle=true \ -PskipCheckerFramework \ - -PcompileAndRunTestsWithJava${{ matrix.java_version }} \ + -PtestJavaVersion=${{ matrix.java_version }} \ -Pjava${{ matrix.java_version }}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \ - name: Archive JUnit Test Results uses: actions/upload-artifact@v3 @@ -100,4 +103,6 @@ jobs: uses: EnricoMi/publish-unit-test-result-action@v2 if: always() with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} files: '**/build/test-results/**/*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PostCommit_Java_Examples_Direct.yml b/.github/workflows/beam_PostCommit_Java_Examples_Direct.yml index 6fd1150aecd11..fa7e76942808b 100644 --- a/.github/workflows/beam_PostCommit_Java_Examples_Direct.yml +++ b/.github/workflows/beam_PostCommit_Java_Examples_Direct.yml @@ -18,15 +18,15 @@ name: PostCommit Java Examples Direct on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 3/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: # This allows a subsequently queued workflow 
run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -52,7 +52,7 @@ env: jobs: beam_PostCommit_Java_Examples_Direct: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 120 strategy: @@ -61,7 +61,8 @@ jobs: job_phrase: [Run Java Examples_Direct] if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java Examples_Direct' steps: - uses: actions/checkout@v4 @@ -71,11 +72,10 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' + java-version: 8 - name: run examplesIntegrationTest script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PostCommit_Java_Examples_Flink.yml b/.github/workflows/beam_PostCommit_Java_Examples_Flink.yml index b123134cd239f..8e7a99f12ac86 100644 --- a/.github/workflows/beam_PostCommit_Java_Examples_Flink.yml +++ b/.github/workflows/beam_PostCommit_Java_Examples_Flink.yml @@ -18,15 +18,15 @@ name: PostCommit Java Examples Flink on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 3/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -52,7 +52,7 @@ env: jobs: beam_PostCommit_Java_Examples_Flink: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 120 strategy: @@ -61,7 +61,8 @@ jobs: job_phrase: [Run Java Examples_Flink] if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java 
Examples_Flink' steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/beam_PostCommit_Java_Examples_Spark.yml b/.github/workflows/beam_PostCommit_Java_Examples_Spark.yml index fa28cdf402bf8..f927fa8b3b462 100644 --- a/.github/workflows/beam_PostCommit_Java_Examples_Spark.yml +++ b/.github/workflows/beam_PostCommit_Java_Examples_Spark.yml @@ -18,15 +18,15 @@ name: PostCommit Java Examples Spark on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 3/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -52,7 +52,7 @@ env: jobs: beam_PostCommit_Java_Examples_Spark: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 120 strategy: @@ -61,7 +61,8 @@ jobs: job_phrase: [Run Java Examples_Spark] if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java Examples_Spark' steps: - uses: actions/checkout@v4 @@ -71,11 +72,10 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' + java-version: 8 - name: run examplesIntegrationTest script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PostCommit_Java_Hadoop_Versions.yml b/.github/workflows/beam_PostCommit_Java_Hadoop_Versions.yml index f9c7175ccd21b..c0c85fb0cd311 100644 --- a/.github/workflows/beam_PostCommit_Java_Hadoop_Versions.yml +++ b/.github/workflows/beam_PostCommit_Java_Hadoop_Versions.yml @@ -16,10 +16,10 @@ name: PostCommit Java Hadoop Versions on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 3/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ 
github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -59,7 +59,8 @@ jobs: job_phrase: [Run PostCommit_Java_Hadoop_Versions] if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run PostCommit_Java_Hadoop_Versions' steps: - uses: actions/checkout@v4 @@ -69,11 +70,10 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' + java-version: 8 - name: run validatesRunner script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PostCommit_Java_IO_Performance_Tests.yml b/.github/workflows/beam_PostCommit_Java_IO_Performance_Tests.yml index 471782621fa77..4bbc23b13de4b 100644 --- a/.github/workflows/beam_PostCommit_Java_IO_Performance_Tests.yml +++ b/.github/workflows/beam_PostCommit_Java_IO_Performance_Tests.yml @@ -16,14 +16,12 @@ name: PostCommit Java IO Performance Tests on: - push: - tags: ['v*'] - branches: ['master', 'release-*'] - paths: ['it/google-cloud-platform/**','.github/workflows/beam_PostCommit_Java_IO_Performance_Tests.yml'] issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 4/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -44,7 +42,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.sender.login }}-${{ github.event.schedule }}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -57,7 +55,8 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java PostCommit IO Performance Tests' runs-on: [self-hosted, ubuntu-20.04, main] name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.test_case }}) @@ -65,7 +64,7 @@ jobs: matrix: job_name: ["beam_PostCommit_Java_IO_Performance_Tests"] job_phrase: ["Run Java PostCommit IO Performance Tests"] - test_case: ["GCSPerformanceTest", "BigTablePerformanceTest"] + test_case: ["GCSPerformanceTest", "BigTablePerformanceTest", "BigQueryStorageApiStreamingPerformanceTest"] steps: - uses: actions/checkout@v4 - name: Setup repository diff --git a/.github/workflows/beam_PostCommit_Java_InfluxDbIO_IT.yml b/.github/workflows/beam_PostCommit_Java_InfluxDbIO_IT.yml index f048fdc6f1f7b..3e0022ba1bea1 100644 --- a/.github/workflows/beam_PostCommit_Java_InfluxDbIO_IT.yml +++ b/.github/workflows/beam_PostCommit_Java_InfluxDbIO_IT.yml @@ -15,18 +15,18 @@ # specific 
language governing permissions and limitations # under the License. -name: Java InfluxDbIO Integration Test +name: PostCommit Java InfluxDbIO Integration Test on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 4/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +52,7 @@ permissions: jobs: beam_PostCommit_Java_InfluxDbIO_IT: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 strategy: @@ -60,8 +60,9 @@ jobs: job_name: [beam_PostCommit_Java_InfluxDbIO_IT] job_phrase: [Run Java InfluxDbIO_IT] if: | - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || + github.event_name == 'pull_request_target' || github.event.comment.body == 'Run Java InfluxDbIO_IT' steps: - uses: actions/checkout@v3 @@ -71,6 +72,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: Authenticate on GCP id: auth uses: google-github-actions/auth@v1 diff --git a/.github/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java11.yml b/.github/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java11.yml index 3ee009747a883..2449054a10732 100644 --- a/.github/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java11.yml +++ b/.github/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java11.yml @@ -16,10 +16,10 @@ name: PostCommit Java Jpms Dataflow Java11 on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 4/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +52,8 @@ jobs: beam_PostCommit_Java_Jpms_Dataflow_Java11: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Jpms Dataflow Java 11 PostCommit' runs-on: 
[self-hosted, ubuntu-20.04, main] timeout-minutes: 240 @@ -69,11 +70,10 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Set up Java 11 - uses: actions/setup-java@v3.11.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'temurin' - java-version: '11' + java-version: 11 - name: run PostCommit Java Jpms Dataflow Java11 script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java17.yml b/.github/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java17.yml index 398be301eecfd..611c8a9d31deb 100644 --- a/.github/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java17.yml +++ b/.github/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java17.yml @@ -16,10 +16,10 @@ name: PostCommit Java Jpms Dataflow Java17 on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 4/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +52,8 @@ jobs: beam_PostCommit_Java_Jpms_Dataflow_Java17: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Jpms Dataflow Java 17 PostCommit' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 240 @@ -69,10 +70,9 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Set up Java - uses: actions/setup-java@v3.11.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'temurin' java-version: | 17 8 @@ -82,7 +82,7 @@ jobs: gradle-command: :sdks:java:testing:jpms-tests:dataflowRunnerIntegrationTest arguments: -PskipCheckerFramework - -PcompileAndRunTestsWithJava17 + -PtestJavaVersion=17 -Pjava17Home=$JAVA_HOME_17_X64 - name: Archive JUnit Test Results uses: actions/upload-artifact@v3 diff --git a/.github/workflows/beam_PostCommit_Java_Jpms_Direct_Java11.yml b/.github/workflows/beam_PostCommit_Java_Jpms_Direct_Java11.yml index 44b89acb642e5..6fdff9c451eb0 100644 --- a/.github/workflows/beam_PostCommit_Java_Jpms_Direct_Java11.yml +++ b/.github/workflows/beam_PostCommit_Java_Jpms_Direct_Java11.yml @@ -16,10 +16,10 @@ name: PostCommit Java Jpms Direct Java11 on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 4/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default 
permissions which are `write-all` in case of pull_request_target event @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +52,8 @@ jobs: beam_PostCommit_Java_Jpms_Direct_Java11: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Jpms Direct Java 11 PostCommit' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 240 @@ -69,11 +70,10 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Set up Java 11 - uses: actions/setup-java@v3.11.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'temurin' - java-version: '11' + java-version: 11 - name: run PostCommit Java Jpms Direct Java11 script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PostCommit_Java_Jpms_Direct_Java17.yml b/.github/workflows/beam_PostCommit_Java_Jpms_Direct_Java17.yml index dbeb84ab660b1..3f62861eb025c 100644 --- a/.github/workflows/beam_PostCommit_Java_Jpms_Direct_Java17.yml +++ b/.github/workflows/beam_PostCommit_Java_Jpms_Direct_Java17.yml @@ -16,10 +16,10 @@ name: PostCommit Java Jpms Direct Java17 on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 4/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +52,8 @@ jobs: beam_PostCommit_Java_Jpms_Direct_Java17: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Jpms Direct Java 17 PostCommit' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 240 @@ -69,10 +70,9 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Set up Java - uses: actions/setup-java@v3.11.0 + - name: Setup environment + uses: 
./.github/actions/setup-environment-action with: - distribution: 'temurin' java-version: | 17 8 @@ -82,7 +82,7 @@ jobs: gradle-command: :sdks:java:testing:jpms-tests:directRunnerIntegrationTest arguments: -PskipCheckerFramework - -PcompileAndRunTestsWithJava17 + -PtestJavaVersion=17 -Pjava17Home=$JAVA_HOME_17_X64 - name: Archive JUnit Test Results uses: actions/upload-artifact@v3 diff --git a/.github/workflows/beam_PostCommit_Java_Jpms_Direct_Java21.yml b/.github/workflows/beam_PostCommit_Java_Jpms_Direct_Java21.yml new file mode 100644 index 0000000000000..e1926a4300696 --- /dev/null +++ b/.github/workflows/beam_PostCommit_Java_Jpms_Direct_Java21.yml @@ -0,0 +1,100 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: PostCommit Java Jpms Direct Java21 + +on: + schedule: + - cron: '0 */6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: write + checks: write + contents: read + deployments: read + id-token: none + issues: write + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +jobs: + beam_PostCommit_Java_Jpms_Direct_Java21: + if: | + github.event_name == 'workflow_dispatch' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Jpms Direct Java 21 PostCommit' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 240 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_PostCommit_Java_Jpms_Direct_Java21"] + job_phrase: ["Run Jpms Direct Java 21 PostCommit"] + steps: + - uses: actions/checkout@v4 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Set up Java + uses: actions/setup-java@v3.11.0 + with: + distribution: 'temurin' + java-version: | + 21 + 8 + - 
name: run PostCommit Java Jpms Direct Java21 script + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:testing:jpms-tests:directRunnerIntegrationTest + arguments: + -PskipCheckerFramework + -PtestJavaVersion=21 + -Pjava21Home=$JAVA_HOME_21_X64 + - name: Archive JUnit Test Results + uses: actions/upload-artifact@v3 + if: failure() + with: + name: JUnit Test Results + path: "**/build/reports/tests/" + - name: Publish JUnit Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/build/test-results/**/*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PostCommit_Java_Jpms_Flink_Java11.yml b/.github/workflows/beam_PostCommit_Java_Jpms_Flink_Java11.yml index f4ba8ada32ba6..fbc0d16e4994a 100644 --- a/.github/workflows/beam_PostCommit_Java_Jpms_Flink_Java11.yml +++ b/.github/workflows/beam_PostCommit_Java_Jpms_Flink_Java11.yml @@ -16,10 +16,10 @@ name: PostCommit Java Jpms Flink Java11 on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 4/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +52,8 @@ jobs: beam_PostCommit_Java_Jpms_Flink_Java11: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Jpms Flink Java 11 PostCommit' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 240 @@ -69,11 +70,10 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Set up Java 11 - uses: actions/setup-java@v3.11.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'temurin' - java-version: '11' + java-version: 11 - name: run PostCommit Java Jpms Flink Java11 script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PostCommit_Java_Jpms_Spark_Java11.yml b/.github/workflows/beam_PostCommit_Java_Jpms_Spark_Java11.yml index 9758fda66eb76..2c2e2acc6f191 100644 --- a/.github/workflows/beam_PostCommit_Java_Jpms_Spark_Java11.yml +++ b/.github/workflows/beam_PostCommit_Java_Jpms_Spark_Java11.yml @@ -16,10 +16,10 @@ name: PostCommit Java Jpms Spark Java11 on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 4/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to 
diff --git a/.github/workflows/beam_PostCommit_Java_Jpms_Flink_Java11.yml b/.github/workflows/beam_PostCommit_Java_Jpms_Flink_Java11.yml
index f4ba8ada32ba6..fbc0d16e4994a 100644
--- a/.github/workflows/beam_PostCommit_Java_Jpms_Flink_Java11.yml
+++ b/.github/workflows/beam_PostCommit_Java_Jpms_Flink_Java11.yml
@@ -16,10 +16,10 @@
 name: PostCommit Java Jpms Flink Java11

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '0 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -40,7 +40,7 @@ permissions:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 env:
@@ -52,7 +52,8 @@ jobs:
   beam_PostCommit_Java_Jpms_Flink_Java11:
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Jpms Flink Java 11 PostCommit'
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 240
@@ -69,11 +70,10 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
-      - name: Set up Java 11
-        uses: actions/setup-java@v3.11.0
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
-          distribution: 'temurin'
-          java-version: '11'
+          java-version: 11
       - name: run PostCommit Java Jpms Flink Java11 script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
diff --git a/.github/workflows/beam_PostCommit_Java_Jpms_Spark_Java11.yml b/.github/workflows/beam_PostCommit_Java_Jpms_Spark_Java11.yml
index 9758fda66eb76..2c2e2acc6f191 100644
--- a/.github/workflows/beam_PostCommit_Java_Jpms_Spark_Java11.yml
+++ b/.github/workflows/beam_PostCommit_Java_Jpms_Spark_Java11.yml
@@ -16,10 +16,10 @@
 name: PostCommit Java Jpms Spark Java11

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '0 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -40,7 +40,7 @@ permissions:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 env:
@@ -52,7 +52,8 @@ jobs:
   beam_PostCommit_Java_Jpms_Spark_Java11:
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Jpms Spark Java 11 PostCommit'
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 240
@@ -69,11 +70,10 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
-      - name: Set up Java 11
-        uses: actions/setup-java@v3.11.0
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
-          distribution: 'temurin'
-          java-version: '11'
+          java-version: 11
       - name: run PostCommit Java Jpms Spark Java11 script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
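A recurring change in this batch: the schedules move from `0 */6 * * *` to offset expressions such as `0 4/6 * * *` or `15 4/6 * * *`, staggering the post-commits (hours 4, 10, 16, 22 UTC, with varying minute offsets) instead of launching every suite at the top of the same hour. The trigger block the files converge on, with values taken from the diff:

on:
  schedule:
    - cron: '0 4/6 * * *'   # 04:00, 10:00, 16:00, 22:00 UTC
  pull_request_target:
    paths: ['release/trigger_all_tests.json']
  workflow_dispatch: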
diff --git a/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow.yml b/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow.yml
index 74b786b599e16..f4e96961061e6 100644
--- a/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow.yml
+++ b/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow.yml
@@ -18,15 +18,15 @@
 name: PostCommit Java Nexmark Dataflow

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '15 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -75,7 +75,7 @@ env:
 jobs:
   beam_PostCommit_Java_Nexmark_Dataflow:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 240
     strategy:
@@ -87,7 +87,8 @@ jobs:
         queryLanguage: [sql, zetasql, none]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Dataflow Runner Nexmark Tests'
     steps:
       - uses: actions/checkout@v4
@@ -97,6 +98,8 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
       - name: run PostCommit Java Nexmark Dataflow (${{ matrix.streaming }} ${{ matrix.queryLanguage }}) script
         if: matrix.queryLanguage != 'none'
         uses: ./.github/actions/gradle-command-self-hosted-action
diff --git a/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2.yml b/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2.yml
index 3b7836990b697..875f54ea76320 100644
--- a/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2.yml
+++ b/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2.yml
@@ -18,15 +18,15 @@
 name: PostCommit Java Nexmark Dataflow V2

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '15 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -76,7 +76,7 @@ env:
 jobs:
   beam_PostCommit_Java_Nexmark_Dataflow_V2:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 240
     strategy:
@@ -87,7 +87,8 @@ jobs:
         streaming: [false, true]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Dataflow Runner V2 Nexmark Tests'
     steps:
       - uses: actions/checkout@v4
@@ -97,6 +98,8 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
       - name: run PostCommit Java Nexmark Dataflow V2 (streaming = ${{ matrix.streaming }}) script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
diff --git a/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2_Java.yml b/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2_Java.yml
index a0957f6432791..ba3f8bb610051 100644
--- a/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2_Java.yml
+++ b/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2_Java.yml
@@ -18,15 +18,15 @@
 name: PostCommit Java Nexmark Dataflow V2 Java

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '0 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -75,7 +75,7 @@ env:
 jobs:
   beam_PostCommit_Java_Nexmark_Dataflow_V2_Java:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase_1}} ${{matrix.java_version}} ${{matrix.job_phrase_2}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase_1 }} ${{ matrix.java_version }} ${{ matrix.job_phrase_2 }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 240
     strategy:
@@ -88,7 +88,8 @@ jobs:
         java_version: ['11','17']
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       (contains(github.event.comment.body, 'Run Dataflow Runner V2 Java') &&
       contains(github.event.comment.body, 'Nexmark Tests'))
     steps:
@@ -96,21 +97,20 @@ jobs:
       - uses: actions/checkout@v4
       - name: Setup repository
         uses: ./.github/actions/setup-action
         with:
-          comment_phrase: ${{ matrix.job_phrase_1 }} ${{matrix.java_version}} ${{ matrix.job_phrase_2 }}
+          comment_phrase: ${{ matrix.job_phrase_1 }} ${{ matrix.java_version }} ${{ matrix.job_phrase_2 }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
-          github_job: ${{ matrix.job_name }} ${{ matrix.job_phrase_1 }} ${{matrix.java_version}} ${{ matrix.job_phrase_2 }}
-      - name: Set up Java${{ matrix.java_version }}
-        uses: actions/setup-java@v3.8.0
+          github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase_1 }} ${{ matrix.java_version }} ${{ matrix.job_phrase_2 }})
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
-          distribution: 'temurin'
           java-version: ${{ matrix.java_version }}
-      - name: run PostCommit Java ${{matrix.java_version}} Nexmark Dataflow V2 (streaming = ${{ matrix.streaming }}) script
+      - name: run PostCommit Java ${{ matrix.java_version }} Nexmark Dataflow V2 (streaming = ${{ matrix.streaming }}) script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
           gradle-command: :sdks:java:testing:nexmark:run
           arguments: |
-            -PcompileAndRunTestsWithJava${{ matrix.java_version }} \
+            -PtestJavaVersion=${{ matrix.java_version }} \
             -Pjava${{ matrix.java_version }}Home=$JAVA_HOME_${{ matrix.java_version }}_X64 \
             -Pnexmark.runner.version=V2 \
             -Pnexmark.runner=:runners:google-cloud-dataflow-java \
-            '${{ env.GRADLE_COMMAND_ARGUMENTS }}--influxTags={"runnerVersion":"V2","javaVersion":"${{matrix.java_version}}"}--streaming=${{ matrix.streaming }}'
\ No newline at end of file
+            '${{ env.GRADLE_COMMAND_ARGUMENTS }}--influxTags={"runnerVersion":"V2","javaVersion":"${{ matrix.java_version }}"}--streaming=${{ matrix.streaming }}'
\ No newline at end of file
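The Nexmark jobs fan out over a two-dimensional matrix, so each trigger produces six runs (three query languages times batch/streaming); the `none` leg is handled by a separate step guarded on `matrix.queryLanguage`. A sketch of the matrix as it appears in these files (`fail-fast` is an editorial assumption, not part of the diff):

    strategy:
      fail-fast: false   # assumption: let the other combinations finish if one fails
      matrix:
        streaming: [false, true]
        queryLanguage: [sql, zetasql, none]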
diff --git a/.github/workflows/beam_PostCommit_Java_Nexmark_Direct.yml b/.github/workflows/beam_PostCommit_Java_Nexmark_Direct.yml
index 4daa13da8b13c..23d766c89823d 100644
--- a/.github/workflows/beam_PostCommit_Java_Nexmark_Direct.yml
+++ b/.github/workflows/beam_PostCommit_Java_Nexmark_Direct.yml
@@ -18,15 +18,15 @@
 name: PostCommit Java Nexmark Direct

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '15 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -70,7 +70,7 @@ env:
 jobs:
   beam_PostCommit_Java_Nexmark_Direct:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 240
     strategy:
@@ -82,7 +82,8 @@ jobs:
         queryLanguage: [sql, zetasql, none]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Direct Runner Nexmark Tests'
     steps:
       - uses: actions/checkout@v4
@@ -92,6 +93,8 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
       - name: run PostCommit Java Nexmark Direct (${{ matrix.streaming }} ${{ matrix.queryLanguage }}) script
         if: matrix.queryLanguage != 'none'
         uses: ./.github/actions/gradle-command-self-hosted-action
diff --git a/.github/workflows/beam_PostCommit_Java_Nexmark_Flink.yml b/.github/workflows/beam_PostCommit_Java_Nexmark_Flink.yml
index a03c447416f36..dd87fec145a4c 100644
--- a/.github/workflows/beam_PostCommit_Java_Nexmark_Flink.yml
+++ b/.github/workflows/beam_PostCommit_Java_Nexmark_Flink.yml
@@ -18,15 +18,15 @@
 name: PostCommit Java Nexmark Flink

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '15 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -69,7 +69,7 @@ env:
 jobs:
   beam_PostCommit_Java_Nexmark_Flink:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 240
     strategy:
@@ -81,7 +81,8 @@ jobs:
         queryLanguage: [sql, zetasql, none]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Flink Runner Nexmark Tests'
     steps:
       - uses: actions/checkout@v4
@@ -91,6 +92,8 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
       - name: run PostCommit Java Nexmark Flink (${{ matrix.streaming }} ${{ matrix.queryLanguage }}) script
         if: matrix.queryLanguage != 'none'
         uses: ./.github/actions/gradle-command-self-hosted-action
diff --git a/.github/workflows/beam_PostCommit_Java_Nexmark_Spark.yml b/.github/workflows/beam_PostCommit_Java_Nexmark_Spark.yml
index a43e7e6d311fe..05229fb4c89b9 100644
--- a/.github/workflows/beam_PostCommit_Java_Nexmark_Spark.yml
+++ b/.github/workflows/beam_PostCommit_Java_Nexmark_Spark.yml
@@ -18,15 +18,15 @@
 name: PostCommit Java Nexmark Spark

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '15 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -69,7 +69,7 @@ env:
 jobs:
   beam_PostCommit_Java_Nexmark_Spark:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 240
     strategy:
@@ -81,7 +81,8 @@ jobs:
         queryLanguage: [sql, zetasql, none]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Spark Runner Nexmark Tests'
     steps:
       - uses: actions/checkout@v4
@@ -91,6 +92,8 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
       - name: run PostCommit Java Nexmark Spark (runner = ${{ matrix.runner }} queryLanguage = ${{ matrix.queryLanguage }}) script
         if: matrix.queryLanguage != 'none'
         uses: ./.github/actions/gradle-command-self-hosted-action
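Every `if:` guard in this section picks up the same two edits: `pull_request_target` becomes an accepted trigger, and scheduled runs are restricted to the canonical repository so forks with these workflows enabled do not burn self-hosted capacity on the cron. The recurring condition, copied from the diff:

    if: |
      github.event_name == 'workflow_dispatch' ||
      github.event_name == 'pull_request_target' ||
      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
      github.event.comment.body == 'Run Spark Runner Nexmark Tests'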
diff --git a/.github/workflows/beam_PostCommit_Java_PVR_Flink_Streaming.yml b/.github/workflows/beam_PostCommit_Java_PVR_Flink_Streaming.yml
index 991e4f71b1c41..cffe074f58eb5 100644
--- a/.github/workflows/beam_PostCommit_Java_PVR_Flink_Streaming.yml
+++ b/.github/workflows/beam_PostCommit_Java_PVR_Flink_Streaming.yml
@@ -18,15 +18,15 @@
 name: PostCommit Java PVR Flink Streaming

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '15 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -52,7 +52,7 @@ env:
 jobs:
   beam_PostCommit_Java_PVR_Flink_Streaming:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 120
     strategy:
@@ -61,7 +61,8 @@ jobs:
       job_phrase: [Run Java Flink PortableValidatesRunner Streaming]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Java Flink PortableValidatesRunner Streaming'
     steps:
       - uses: actions/checkout@v4
@@ -71,7 +72,22 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
       - name: run PostCommit Java Flink PortableValidatesRunner Streaming script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
           gradle-command: runners:flink:1.15:job-server:validatesPortableRunnerStreaming
+      - name: Archive JUnit Test Results
+        uses: actions/upload-artifact@v3
+        if: failure()
+        with:
+          name: JUnit Test Results
+          path: "**/build/reports/tests/"
+      - name: Publish JUnit Test Results
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: always()
+        with:
+          commit: '${{ env.prsha || env.GITHUB_SHA }}'
+          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
+          files: '**/build/test-results/**/*.xml'
\ No newline at end of file
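The PVR workflows also gain result publication that runs even when the Gradle step fails. `commit` pins the check to the PR head SHA (`prsha` is presumably exported by the repository's setup action, an assumption from its usage here), and `comment_mode` posts a PR comment only for comment-triggered runs:

      - name: Publish JUnit Test Results
        uses: EnricoMi/publish-unit-test-result-action@v2
        if: always()
        with:
          commit: '${{ env.prsha || env.GITHUB_SHA }}'
          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
          files: '**/build/test-results/**/*.xml'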
diff --git a/.github/workflows/beam_PostCommit_Java_PVR_Samza.yml b/.github/workflows/beam_PostCommit_Java_PVR_Samza.yml
index 041c031f3f761..729e95fe52193 100644
--- a/.github/workflows/beam_PostCommit_Java_PVR_Samza.yml
+++ b/.github/workflows/beam_PostCommit_Java_PVR_Samza.yml
@@ -18,15 +18,15 @@
 name: PostCommit Java PVR Samza

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '15 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -52,7 +52,7 @@ env:
 jobs:
   beam_PostCommit_Java_PVR_Samza:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 120
     strategy:
@@ -61,7 +61,8 @@ jobs:
       job_phrase: [Run Java Samza PortableValidatesRunner]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Java Samza PortableValidatesRunner'
     steps:
       - uses: actions/checkout@v4
@@ -71,6 +72,8 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
       - name: run PostCommit Java Samza script
         env:
           CLOUDSDK_CONFIG: ${{ env.KUBELET_GCLOUD_CONFIG_PATH}}
@@ -87,4 +90,6 @@ jobs:
         uses: EnricoMi/publish-unit-test-result-action@v2
         if: always()
         with:
+          commit: '${{ env.prsha || env.GITHUB_SHA }}'
+          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
           files: '**/build/test-results/**/*.xml'
\ No newline at end of file
diff --git a/.github/workflows/beam_PostCommit_Java_PVR_Spark3_Streaming.yml b/.github/workflows/beam_PostCommit_Java_PVR_Spark3_Streaming.yml
index 2a375d71981a1..d3508e968fe0d 100644
--- a/.github/workflows/beam_PostCommit_Java_PVR_Spark3_Streaming.yml
+++ b/.github/workflows/beam_PostCommit_Java_PVR_Spark3_Streaming.yml
@@ -18,15 +18,15 @@
 name: PostCommit Java PVR Spark3 Streaming

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '15 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -52,7 +52,7 @@ env:
 jobs:
   beam_PostCommit_Java_PVR_Spark3_Streaming:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 120
     strategy:
@@ -61,7 +61,8 @@ jobs:
       job_phrase: [Run Java Spark v3 PortableValidatesRunner Streaming]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Java Spark v3 PortableValidatesRunner Streaming'
     steps:
       - uses: actions/checkout@v4
@@ -71,6 +72,8 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
       - name: run PostCommit Java PortableValidatesRunner Spark3 Streaming script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
@@ -85,4 +88,6 @@ jobs:
         uses: EnricoMi/publish-unit-test-result-action@v2
         if: always()
         with:
-          files: '**/build/test-results/**/*.xml'
+          commit: '${{ env.prsha || env.GITHUB_SHA }}'
+          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
+          files: '**/build/test-results/**/*.xml'
\ No newline at end of file
diff --git a/.github/workflows/beam_PostCommit_Java_PVR_Spark_Batch.yml b/.github/workflows/beam_PostCommit_Java_PVR_Spark_Batch.yml
index 8d54c77072589..8cc977ddea824 100644
--- a/.github/workflows/beam_PostCommit_Java_PVR_Spark_Batch.yml
+++ b/.github/workflows/beam_PostCommit_Java_PVR_Spark_Batch.yml
@@ -18,15 +18,15 @@
 name: PostCommit Java PVR Spark Batch

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '15 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -52,7 +52,7 @@ env:
 jobs:
   beam_PostCommit_Java_PVR_Spark_Batch:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 240
     strategy:
@@ -61,8 +61,8 @@ jobs:
       job_phrase: [Run Java Spark PortableValidatesRunner Batch]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
-      github.event.comment.body == 'Run Java Spark PortableValidatesRunner Batch'
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam')
     steps:
       - uses: actions/checkout@v4
       - name: Setup repository
@@ -71,12 +71,16 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
       - name: run PostCommit Java PortableValidatesRunner Spark Batch script
+        env:
+          CLOUDSDK_CONFIG: ${{ env.KUBELET_GCLOUD_CONFIG_PATH}}
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
           gradle-command: |
-            :runners:spark:3:job-server:validatesPortableRunnerBatch
-            :runners:spark:3:job-server:validatesPortableRunnerDocker
+            :runners:spark:3:job-server:validatesPortableRunnerBatch \
+            :runners:spark:3:job-server:validatesPortableRunnerDocker \
       - name: Archive JUnit Test Results
         uses: actions/upload-artifact@v3
         if: failure()
@@ -87,4 +91,12 @@ jobs:
         uses: EnricoMi/publish-unit-test-result-action@v2
         if: always()
         with:
-          files: '**/build/test-results/**/*.xml'
\ No newline at end of file
+          large_files: true
+          commit: '${{ env.prsha || env.GITHUB_SHA }}'
+          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
+          files: '**/build/test-results/**/*.xml'
+      - name: Archive SpotBugs Results
+        uses: actions/upload-artifact@v3
+        with:
+          name: SpotBugs Results
+          path: "**/build/reports/spotbugs/*.html"
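The concurrency key is rewritten the same way in every file: `github.event.pull_request.head.label` is dropped from the group expression and the missing space before the closing `}}` of `sender.login` is added. The group resolves to the first non-empty identifier for the triggering event, so a newly queued run cancels an in-flight run of the same workflow on the same ref:

concurrency:
  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
  cancel-in-progress: true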
diff --git a/.github/workflows/beam_PostCommit_Java_Sickbay.yml b/.github/workflows/beam_PostCommit_Java_Sickbay.yml
index 7f914dc358808..e6de4b2538b51 100644
--- a/.github/workflows/beam_PostCommit_Java_Sickbay.yml
+++ b/.github/workflows/beam_PostCommit_Java_Sickbay.yml
@@ -18,15 +18,15 @@
 name: PostCommit Java Sickbay

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 0 * * *'
+    - cron: '30 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -52,7 +52,7 @@ env:
 jobs:
   beam_PostCommit_Java_Sickbay:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 120
     strategy:
@@ -61,7 +61,8 @@ jobs:
       job_phrase: [Run Java Sickbay]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Java Sickbay'
     steps:
       - uses: actions/checkout@v4
@@ -71,6 +72,8 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
       - name: run PostCommit Java Sickbay script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
diff --git a/.github/workflows/beam_PostCommit_Java_SingleStoreIO_IT.yml b/.github/workflows/beam_PostCommit_Java_SingleStoreIO_IT.yml
index 40a1dc8faa646..680bcec727d33 100644
--- a/.github/workflows/beam_PostCommit_Java_SingleStoreIO_IT.yml
+++ b/.github/workflows/beam_PostCommit_Java_SingleStoreIO_IT.yml
@@ -18,10 +18,10 @@
 name: PostCommit Java SingleStoreIO IT

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */23 * * *'
+    - cron: '30 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
 permissions:
@@ -42,7 +42,7 @@ permissions:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 env:
@@ -52,7 +52,7 @@ env:
 jobs:
   beam_PostCommit_Java_SingleStoreIO_IT:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 100
     strategy:
@@ -62,8 +62,9 @@ jobs:
       job_phrase: [Run Java SingleStoreIO_IT]
     if: |
       github.event_name == 'push' ||
-      github.event_name == 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'pull_request_target' ||
       github.event.comment.body == 'Run Java SingleStoreIO_IT'
     steps:
       - uses: actions/checkout@v3
@@ -73,6 +74,8 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
       - name: Authenticate on GCP
         id: auth
         uses: google-github-actions/auth@v1
@@ -101,5 +104,5 @@ jobs:
       - name: Run Java SingleStore IO IT
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
-          gradle-command: :sdks:java:io:singlestore:integrationTest --tests org.apache.beam.sdk.io.singlestore.SingleStoreIODefaultMapperIT
+          gradle-command: :sdks:java:io:singlestore:integrationTest --tests org.apache.beam.sdk.io.singlestore.SingleStoreIODefaultMapperIT :sdks:java:io:singlestore:integrationTest --tests org.apache.beam.sdk.io.singlestore.SingleStoreIOSchemaTransformIT :sdks:java:io:singlestore:integrationTest --tests org.apache.beam.sdk.io.singlestore.SingleStoreIOConnectionAttributesIT
           arguments: --info -DintegrationTestRunner=dataflow -DintegrationTestPipelineOptions='["--tempRoot=gs://temp-storage-for-perf-tests","--project=apache-beam-testing","--runner=DataflowRunner","--singleStoreUsername=admin","--singleStorePassword=secretpass","--singleStorePort=3306","--numberOfRecords=1000", "--singleStoreServerName=${{ steps.install_singlestore.outputs.lb_ip }}"]'
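The SingleStoreIO job now covers three integration test classes in one Gradle invocation by repeating the task with a `--tests` filter per class. A trimmed sketch of the resulting step (the folded `>-` scalar and the elided pipeline options are editorial; the task and class names come from the diff):

      - name: Run Java SingleStore IO IT
        uses: ./.github/actions/gradle-command-self-hosted-action
        with:
          gradle-command: >-
            :sdks:java:io:singlestore:integrationTest --tests org.apache.beam.sdk.io.singlestore.SingleStoreIODefaultMapperIT
            :sdks:java:io:singlestore:integrationTest --tests org.apache.beam.sdk.io.singlestore.SingleStoreIOSchemaTransformIT
            :sdks:java:io:singlestore:integrationTest --tests org.apache.beam.sdk.io.singlestore.SingleStoreIOConnectionAttributesIT
          arguments: --info -DintegrationTestRunner=dataflow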
diff --git a/.github/workflows/beam_PostCommit_Java_Tpcds_Dataflow.yml b/.github/workflows/beam_PostCommit_Java_Tpcds_Dataflow.yml
index e19831c607322..a7cc537a69983 100644
--- a/.github/workflows/beam_PostCommit_Java_Tpcds_Dataflow.yml
+++ b/.github/workflows/beam_PostCommit_Java_Tpcds_Dataflow.yml
@@ -16,10 +16,10 @@
 name: PostCommit Java Tpcds Dataflow

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '30 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -40,7 +40,7 @@ permissions:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 env:
@@ -77,7 +77,8 @@ jobs:
   beam_PostCommit_Java_Tpcds_Dataflow:
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Dataflow Runner Tpcds Tests'
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 240
@@ -94,6 +95,8 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
       - name: run PostCommit Java Tpcds Dataflow script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
diff --git a/.github/workflows/beam_PostCommit_Java_Tpcds_Flink.yml b/.github/workflows/beam_PostCommit_Java_Tpcds_Flink.yml
index fd35fe4ec776a..a7b9daa569959 100644
--- a/.github/workflows/beam_PostCommit_Java_Tpcds_Flink.yml
+++ b/.github/workflows/beam_PostCommit_Java_Tpcds_Flink.yml
@@ -16,10 +16,10 @@
 name: PostCommit Java Tpcds Flink

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '30 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -40,7 +40,7 @@ permissions:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 env:
@@ -74,7 +74,8 @@ jobs:
   beam_PostCommit_Java_Tpcds_Flink:
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Flink Runner Tpcds Tests'
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 240
@@ -91,6 +92,8 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
       - name: run PostCommit Java Tpcds Flink script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
diff --git a/.github/workflows/beam_PostCommit_Java_Tpcds_Spark.yml b/.github/workflows/beam_PostCommit_Java_Tpcds_Spark.yml
index 2284e7fa06bf2..82dfef6600533 100644
--- a/.github/workflows/beam_PostCommit_Java_Tpcds_Spark.yml
+++ b/.github/workflows/beam_PostCommit_Java_Tpcds_Spark.yml
@@ -16,10 +16,10 @@
 name: PostCommit Java Tpcds Spark

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '30 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -40,7 +40,7 @@ permissions:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 env:
@@ -73,7 +73,8 @@ jobs:
   beam_PostCommit_Java_Tpcds_Spark:
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Spark Runner Tpcds Tests'
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 240
@@ -90,6 +91,8 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
       - name: run PostCommit Java Tpcds Spark script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
diff --git a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow.yml b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow.yml
index 596ef873c9646..338b148ff3fbb 100644
--- a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow.yml
+++ b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow.yml
@@ -18,15 +18,15 @@
 name: PostCommit Java ValidatesRunner Dataflow

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '30 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -52,7 +52,7 @@ env:
 jobs:
   beam_PostCommit_Java_ValidatesRunner_Dataflow:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 480
     strategy:
@@ -61,7 +61,8 @@ jobs:
       job_phrase: [Run Dataflow ValidatesRunner]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Dataflow ValidatesRunner'
     steps:
       - uses: actions/checkout@v4
@@ -71,11 +72,10 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
-      - name: Install Java
-        uses: actions/setup-java@v3.8.0
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
-          distribution: 'zulu'
-          java-version: '8'
+          java-version: 8
       - name: run validatesRunner script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
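Across the ValidatesRunner files, the per-job `actions/setup-java` step (zulu or temurin, pinned per workflow) is replaced by the repository's composite `setup-environment-action`, which takes the JDK selection as a single input. Both a scalar version and a multi-line list (test JDK first, Java 8 for Gradle) appear in this diff:

      - name: Setup environment
        uses: ./.github/actions/setup-environment-action
        with:
          java-version: |
            11
            8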
diff --git a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_JavaVersions.yml b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_JavaVersions.yml
index 19c2f3f8cb165..86b6b32a4abbb 100644
--- a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_JavaVersions.yml
+++ b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_JavaVersions.yml
@@ -18,15 +18,15 @@
 name: PostCommit Java ValidatesRunner Dataflow JavaVersions

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '30 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -52,7 +52,7 @@ env:
 jobs:
   beam_PostCommit_Java_ValidatesRunner_Dataflow_JavaVersions:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{matrix.java_version}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.java_version }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 480
     strategy:
@@ -63,29 +63,27 @@ jobs:
       java_version: ['11','17']
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       startswith(github.event.comment.body, 'Run Dataflow ValidatesRunner Java')
     steps:
       - uses: actions/checkout@v4
       - name: Setup repository
         uses: ./.github/actions/setup-action
         with:
-          comment_phrase: ${{ matrix.job_phrase }} ${{matrix.java_version}}
+          comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.java_version }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
-          github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) ${{matrix.java_version}}
-      - name: Set up Java${{ matrix.java_version }}
-        uses: actions/setup-java@v3.8.0
+          github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) ${{ matrix.java_version }}
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
-          distribution: 'temurin'
           java-version: |
             ${{ matrix.java_version }}
             8
       - name: run jar Java${{ matrix.java_version }} script
-        uses: ./.github/actions/gradle-command-self-hosted-action
-        with:
-          gradle-command: :runners:google-cloud-dataflow-java:testJar :runners:google-cloud-dataflow-java:worker:shadowJar
-          arguments: |
-            -Dorg.gradle.java.home=$JAVA_HOME_8_X64 \
+        run: |
+          ./gradlew runners:google-cloud-dataflow-java:testJar :runners:google-cloud-dataflow-java:worker:shadowJar \
+          -Dorg.gradle.java.home=$JAVA_HOME_8_X64
       - name: run validatesRunner Java${{ matrix.java_version }} script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
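The JavaVersions jobs split the work across JDKs: the runner jars are built first with Gradle itself on JDK 8 (`-Dorg.gradle.java.home`), then the ValidatesRunner suite runs against the matrix JDK. The prebuild also moves from the composite Gradle action to a plain `run:` step, as in the diff:

      - name: run jar Java${{ matrix.java_version }} script
        run: |
          ./gradlew runners:google-cloud-dataflow-java:testJar :runners:google-cloud-dataflow-java:worker:shadowJar \
            -Dorg.gradle.java.home=$JAVA_HOME_8_X64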
diff --git a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_Streaming.yml b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_Streaming.yml
index 013b34bc807b6..abe21ac3f7837 100644
--- a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_Streaming.yml
+++ b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_Streaming.yml
@@ -18,15 +18,15 @@
 name: PostCommit Java ValidatesRunner Dataflow Streaming

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '30 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -52,7 +52,7 @@ env:
 jobs:
   beam_PostCommit_Java_ValidatesRunner_Dataflow_Streaming:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 720
     strategy:
@@ -61,7 +61,8 @@ jobs:
       job_phrase: [Run Dataflow Streaming ValidatesRunner]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Dataflow Streaming ValidatesRunner'
     steps:
       - uses: actions/checkout@v4
@@ -71,11 +72,10 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
-      - name: Install Java
-        uses: actions/setup-java@v3.8.0
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
-          distribution: 'zulu'
-          java-version: '8'
+          java-version: 8
       - name: run validatesRunnerStreaming script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
diff --git a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2.yml b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2.yml
index 3a9e0140f8182..63625b48ea2f8 100644
--- a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2.yml
+++ b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2.yml
@@ -18,15 +18,15 @@
 name: PostCommit Java ValidatesRunner Dataflow V2

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */8 * * *'
+    - cron: '30 6/8 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -52,7 +52,7 @@ env:
 jobs:
   beam_PostCommit_Java_ValidatesRunner_Dataflow_V2:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 390
     strategy:
@@ -61,7 +61,8 @@ jobs:
       job_phrase: [Run Java Dataflow V2 ValidatesRunner]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Java Dataflow V2 ValidatesRunner'
     steps:
       - uses: actions/checkout@v4
@@ -71,11 +72,10 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
-      - name: Install Java
-        uses: actions/setup-java@v3.8.0
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
-          distribution: 'zulu'
-          java-version: '8'
+          java-version: 8
       - name: run validatesRunnerV2 script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
diff --git a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2_Streaming.yml b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2_Streaming.yml
index edf9a04f22c3b..b372c4c2acdac 100644
--- a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2_Streaming.yml
+++ b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Dataflow_V2_Streaming.yml
@@ -18,15 +18,15 @@
 name: PostCommit Java ValidatesRunner Dataflow V2 Streaming

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */8 * * *'
+    - cron: '30 6/8 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -52,7 +52,7 @@ env:
 jobs:
   beam_PostCommit_Java_ValidatesRunner_Dataflow_V2_Streaming:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 510
     strategy:
@@ -61,7 +61,8 @@ jobs:
       job_phrase: [Run Java Dataflow V2 ValidatesRunner Streaming]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Java Dataflow V2 ValidatesRunner Streaming'
     steps:
       - uses: actions/checkout@v4
@@ -71,11 +72,10 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
-      - name: Install Java
-        uses: actions/setup-java@v3.8.0
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
-          distribution: 'zulu'
-          java-version: '8'
+          java-version: 8
       - name: run validatesRunnerV2Streaming script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
diff --git a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Direct.yml b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Direct.yml
index 79447f6108097..16e21c9faa264 100644
--- a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Direct.yml
+++ b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Direct.yml
@@ -18,15 +18,15 @@
 name: PostCommit Java ValidatesRunner Direct

 on:
-  issue_comment:
-    types: [created]
   schedule:
-    - cron: '0 */6 * * *'
+    - cron: '45 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:

 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true

 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -52,7 +52,7 @@ env:
 jobs:
   beam_PostCommit_Java_ValidatesRunner_Direct:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 180
     strategy:
@@ -61,7 +61,8 @@ jobs:
       job_phrase: [Run Direct ValidatesRunner]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Direct ValidatesRunner'
     steps:
       - uses: actions/checkout@v4
@@ -71,15 +72,12 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
-      - name: Install Java
-        uses: actions/setup-java@v3.8.0
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
-          distribution: 'zulu'
-          java-version: '8'
+          java-version: 8
       - name: run validatesRunner script
-        uses: ./.github/actions/gradle-command-self-hosted-action
-        with:
-          gradle-command: :runners:direct-java:validatesRunner
+        run: ./gradlew :runners:direct-java:validatesRunner
       - name: Archive JUnit Test Results
         uses: actions/upload-artifact@v3
         if: failure()
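For the Direct runner the Gradle wrapper is now invoked directly instead of through the composite action, which also means the step no longer inherits whatever shared switches that action appends; the whole step shrinks to a one-liner:

      - name: run validatesRunner script
        run: ./gradlew :runners:direct-java:validatesRunner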
@@ env: jobs: beam_PostCommit_Java_ValidatesRunner_Direct_JavaVersions: - name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{matrix.java_version}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.java_version }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 480 strategy: @@ -63,29 +63,27 @@ jobs: java_version: ['11','17'] if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || startswith(github.event.comment.body, 'Run Direct ValidatesRunner Java') steps: - uses: actions/checkout@v4 - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{ matrix.job_phrase }} ${{matrix.java_version}} + comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.java_version }} github_token: ${{ secrets.GITHUB_TOKEN }} - github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) ${{matrix.java_version}} - - name: Set up Java${{ matrix.java_version }} - uses: actions/setup-java@v3.8.0 + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) ${{ matrix.java_version }} + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'temurin' java-version: | ${{ matrix.java_version }} 8 - name: run jar Java${{ matrix.java_version }} script - uses: ./.github/actions/gradle-command-self-hosted-action - with: - gradle-command: :runners:direct-java:shadowJar :runners:direct-java:shadowTestJar - arguments: | - -Dorg.gradle.java.home=$JAVA_HOME_8_X64 \ + run: | + ./gradlew :runners:direct-java:shadowJar :runners:direct-java:shadowTestJar \ + -Dorg.gradle.java.home=$JAVA_HOME_8_X64 - name: run validatesRunner Java${{ matrix.java_version }} script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Flink.yml b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Flink.yml index 2ff883dafa75a..8171760f5528a 100644 --- a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Flink.yml +++ b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Flink.yml @@ -16,10 +16,10 @@ name: PostCommit Java ValidatesRunner Flink on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 4/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -59,7 +59,8 @@ jobs: job_phrase: [Run Flink ValidatesRunner] if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Flink ValidatesRunner' steps: - uses: actions/checkout@v4 @@ 
diff --git a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Flink_Java11.yml b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Flink_Java11.yml
index c0339100845ef..6bc1b0ffa03c2 100644
--- a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Flink_Java11.yml
+++ b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Flink_Java11.yml
@@ -18,15 +18,15 @@ name: PostCommit Java ValidatesRunner Flink Java11
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '45 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -52,7 +52,7 @@ env:
 jobs:
   beam_PostCommit_Java_ValidatesRunner_Flink_Java11:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 270
     strategy:
@@ -61,7 +61,8 @@
       job_phrase: [Run Flink ValidatesRunner Java 11]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       startswith(github.event.comment.body, 'Run Flink ValidatesRunner Java 11')
     steps:
       - uses: actions/checkout@v4
@@ -71,19 +72,16 @@
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
-      - name: Set up Java
-        uses: actions/setup-java@v3.8.0
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
-          distribution: 'temurin'
           java-version: |
             11
             8
       - name: run jar Java8 script
-        uses: ./.github/actions/gradle-command-self-hosted-action
-        with:
-          gradle-command: :runners:flink:1.15:jar :runners:flink:1.15:testJar
-          arguments: |
-            -Dorg.gradle.java.home=$JAVA_HOME_8_X64 \
+        run: |
+          ./gradlew :runners:flink:1.15:jar :runners:flink:1.15:testJar \
+            -Dorg.gradle.java.home=$JAVA_HOME_8_X64
       - name: run validatesRunner Java11 script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
@@ -109,6 +107,7 @@ jobs:
         uses: EnricoMi/publish-unit-test-result-action@v2
         if: always()
         with:
+          large_files: true
           commit: '${{ env.prsha || env.GITHUB_SHA }}'
           comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
           files: '**/build/test-results/**/*.xml'
\ No newline at end of file
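Many of these hunks swap `actions/setup-java` (and, further down, `actions/setup-python`) for a single local composite action taking `java-version` and `python-version` inputs. The action's own definition is not part of this diff; the sketch below is only a plausible shape for it, inferred from how the workflows call it, and the distribution default is an assumption:

```yaml
# Hypothetical .github/actions/setup-environment-action/action.yml (not in this diff).
name: 'Setup environment action'
description: 'Installs the Java and/or Python versions a calling workflow asks for'
inputs:
  java-version:
    required: false
    description: 'Java version(s) to install, one per line'
    default: ''
  python-version:
    required: false
    description: 'Python version to install'
    default: ''
runs:
  using: composite
  steps:
    - name: Install Java
      if: ${{ inputs.java-version != '' }}
      uses: actions/setup-java@v3
      with:
        distribution: 'temurin'  # assumed default; callers no longer pass one
        java-version: ${{ inputs.java-version }}
    - name: Install Python
      if: ${{ inputs.python-version != '' }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ inputs.python-version }}
```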
diff --git a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Samza.yml b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Samza.yml
index 97e35490c25d3..8f8993f3108ea 100644
--- a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Samza.yml
+++ b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Samza.yml
@@ -16,10 +16,10 @@ name: PostCommit Java ValidatesRunner Samza
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '45 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -40,7 +40,7 @@ permissions:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 env:
@@ -59,7 +59,8 @@ jobs:
       job_phrase: [Run Samza ValidatesRunner]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Samza ValidatesRunner'
     steps:
       - uses: actions/checkout@v4
@@ -69,11 +70,10 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
-      - name: Install Java
-        uses: actions/setup-java@v3.8.0
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
-          distribution: 'zulu'
-          java-version: '8'
+          java-version: 8
       - name: run validatesRunner script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
diff --git a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Spark.yml b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Spark.yml
index 5800c338ed16e..ce67510b1e50a 100644
--- a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Spark.yml
+++ b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Spark.yml
@@ -16,10 +16,10 @@ name: PostCommit Java ValidatesRunner Spark
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '45 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -40,7 +40,7 @@ permissions:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 env:
@@ -59,7 +59,8 @@ jobs:
       job_phrase: [Run Spark ValidatesRunner]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Spark ValidatesRunner'
     steps:
       - uses: actions/checkout@v4
@@ -69,11 +70,10 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
-      - name: Install Java
-        uses: actions/setup-java@v3.8.0
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
-          distribution: 'zulu'
-          java-version: '8'
+          java-version: 8
       - name: run validatesRunner script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
diff --git a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.yml b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.yml
index 08504316333a0..7883218d262ab 100644
--- a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.yml
+++ b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.yml
@@ -16,10 +16,10 @@ name: PostCommit Java ValidatesRunner SparkStructuredStreaming
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '45 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -40,7 +40,7 @@ permissions:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 env:
@@ -59,7 +59,8 @@ jobs:
       job_phrase: [Run Spark StructuredStreaming ValidatesRunner]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Spark StructuredStreaming ValidatesRunner'
     steps:
       - uses: actions/checkout@v4
@@ -69,11 +70,10 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
-      - name: Install Java
-        uses: actions/setup-java@v3.8.0
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
-          distribution: 'zulu'
-          java-version: '8'
+          java-version: 8
       - name: run validatesStructuredStreamingRunnerBatch script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
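Every `if:` guard in these workflows gains the same clause: cron-scheduled runs now fire only in the canonical `apache/beam` repository, so forks with Actions enabled no longer consume self-hosted runner time, while `pull_request_target` joins the accepted event list. The assembled condition, taken from the hunks above (only the trailing comment phrase differs per workflow):

```yaml
if: |
  github.event_name == 'workflow_dispatch' ||
  github.event_name == 'pull_request_target' ||
  (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
  github.event.comment.body == 'Run Spark ValidatesRunner'
```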
diff --git a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.yml b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.yml
index 1cc0193b1e693..099f9ab1e724f 100644
--- a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.yml
+++ b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.yml
@@ -18,15 +18,15 @@ name: PostCommit Java ValidatesRunner Spark Java11
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '45 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -52,7 +52,7 @@ env:
 jobs:
   beam_PostCommit_Java_ValidatesRunner_Spark_Java11:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 270
     strategy:
@@ -61,7 +61,8 @@ jobs:
       job_phrase: [Run Spark ValidatesRunner Java 11]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       startswith(github.event.comment.body, 'Run Spark ValidatesRunner Java 11')
     steps:
       - uses: actions/checkout@v4
@@ -71,19 +72,16 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
-      - name: Set up Java
-        uses: actions/setup-java@v3.8.0
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
-          distribution: 'temurin'
           java-version: |
             11
             8
       - name: run jar Java8 script
-        uses: ./.github/actions/gradle-command-self-hosted-action
-        with:
-          gradle-command: :runners:spark:3:jar :runners:spark:3:testJar
-          arguments: |
-            -Dorg.gradle.java.home=$JAVA_HOME_8_X64 \
+        run: |
+          ./gradlew :runners:spark:3:jar :runners:spark:3:testJar \
+            -Dorg.gradle.java.home=$JAVA_HOME_8_X64
       - name: run validatesRunner Java11 script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
diff --git a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Twister2.yml b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Twister2.yml
index 21bacac11d590..de3d4914052e8 100644
--- a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Twister2.yml
+++ b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Twister2.yml
@@ -16,10 +16,10 @@ name: PostCommit Java ValidatesRunner Twister2
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '45 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -40,7 +40,7 @@ permissions:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 env:
@@ -59,7 +59,8 @@ jobs:
       job_phrase: [Run Twister2 ValidatesRunner]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Twister2 ValidatesRunner'
     steps:
       - uses: actions/checkout@v4
@@ -69,11 +70,10 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
-      - name: Install Java
-        uses: actions/setup-java@v3.8.0
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
-          distribution: 'zulu'
-          java-version: '8'
+          java-version: 8
       - name: run validatesRunner script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
diff --git a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_ULR.yml b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_ULR.yml
index 75c07bc49783b..eb2139c562e9f 100644
--- a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_ULR.yml
+++ b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_ULR.yml
@@ -16,10 +16,10 @@ name: PostCommit Java ValidatesRunner ULR
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '45 4/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -40,7 +40,7 @@ permissions:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 env:
@@ -59,7 +59,8 @@ jobs:
       job_phrase: [Run ULR Loopback ValidatesRunner]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run ULR Loopback ValidatesRunner'
     steps:
       - uses: actions/checkout@v4
@@ -69,19 +70,13 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
-      - name: Install Java
-        uses: actions/setup-java@v3.8.0
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
-          distribution: 'zulu'
-          java-version: '8'
-      - name: Install Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.8'
+          java-version: 8
+          python-version: 3.8
       - name: run ulrLoopbackValidatesRunner script
-        uses: ./.github/actions/gradle-command-self-hosted-action
-        with:
-          gradle-command: :runners:portability:java:ulrLoopbackValidatesRunner
+        run: ./gradlew :runners:portability:java:ulrLoopbackValidatesRunner
       - name: Archive JUnit Test Results
         uses: actions/upload-artifact@v3
         if: failure()
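The Java11 variants keep two JDKs on the runner: the jars are still built with JDK 8 by pointing Gradle at the `JAVA_HOME_8_X64` path the setup step exports, while the validates-runner launch itself uses the newer JDK. Condensed from the hunks above:

```yaml
- name: Setup environment
  uses: ./.github/actions/setup-environment-action
  with:
    java-version: |
      11
      8
- name: run jar Java8 script
  run: |
    ./gradlew :runners:spark:3:jar :runners:spark:3:testJar \
      -Dorg.gradle.java.home=$JAVA_HOME_8_X64
```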
diff --git a/.github/workflows/beam_PostCommit_Javadoc.yml b/.github/workflows/beam_PostCommit_Javadoc.yml
index 7185f588f463a..8207cbe9ca9af 100644
--- a/.github/workflows/beam_PostCommit_Javadoc.yml
+++ b/.github/workflows/beam_PostCommit_Javadoc.yml
@@ -18,15 +18,15 @@ name: PostCommit Javadoc
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '0 5/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -52,7 +52,7 @@ env:
 jobs:
   beam_PostCommit_Javadoc:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 240
     strategy:
@@ -61,7 +61,8 @@ jobs:
      job_phrase: [Run Javadoc PostCommit]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Javadoc PostCommit'
     steps:
       - uses: actions/checkout@v4
@@ -71,6 +72,8 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
       - name: run aggregateJavadoc script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
diff --git a/.github/workflows/beam_PostCommit_PortableJar_Flink.yml b/.github/workflows/beam_PostCommit_PortableJar_Flink.yml
index 3f682ea57dde6..2f066979681e2 100644
--- a/.github/workflows/beam_PostCommit_PortableJar_Flink.yml
+++ b/.github/workflows/beam_PostCommit_PortableJar_Flink.yml
@@ -16,10 +16,10 @@ name: PostCommit PortableJar Flink
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '0 5/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -40,7 +40,7 @@ permissions:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 env:
@@ -52,8 +52,9 @@ jobs:
   beam_PostCommit_PortableJar_Flink:
     name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     if: |
-      github.event_name == 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'pull_request_target' ||
       github.event.comment.body == 'Run PortableJar_Flink PostCommit'
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 120
@@ -69,8 +70,8 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
-      - name: Install Python
-        uses: actions/setup-python@v4
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
           python-version: 3.8
       - name: run testPipelineJarFlinkRunner script
@@ -81,8 +82,16 @@
           gradle-command: :sdks:python:test-suites:portable:py38:testPipelineJarFlinkRunner
           arguments: |
             -PpythonVersion=3.8 \
-      - name: Archive code coverage results
+      - name: Archive Python Test Results
         uses: actions/upload-artifact@v3
+        if: failure()
         with:
-          name: python-code-coverage-report
-          path: "**/pytest*.xml"
\ No newline at end of file
+          name: Python Test Results
+          path: '**/pytest*.xml'
+      - name: Publish Python Test Results
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: always()
+        with:
+          commit: '${{ env.prsha || env.GITHUB_SHA }}'
+          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
+          files: '**/pytest*.xml'
\ No newline at end of file
diff --git a/.github/workflows/beam_PostCommit_PortableJar_Spark.yml b/.github/workflows/beam_PostCommit_PortableJar_Spark.yml
index f9884d6519781..8b5bc031f7fa2 100644
--- a/.github/workflows/beam_PostCommit_PortableJar_Spark.yml
+++ b/.github/workflows/beam_PostCommit_PortableJar_Spark.yml
@@ -16,21 +16,21 @@ name: PostCommit PortableJar Spark
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '0 5/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
 permissions:
   actions: write
-  pull-requests: read
-  checks: read
+  pull-requests: write
+  checks: write
   contents: read
   deployments: read
   id-token: none
-  issues: read
+  issues: write
   discussions: read
   packages: read
   pages: read
@@ -40,7 +40,7 @@ permissions:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 env:
@@ -52,8 +52,9 @@ jobs:
   beam_PostCommit_PortableJar_Spark:
     name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     if: |
-      github.event_name == 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'pull_request_target' ||
       github.event.comment.body == 'Run PortableJar_Spark PostCommit'
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 100
@@ -69,8 +70,8 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
-      - name: Install Python
-        uses: actions/setup-python@v4
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
           python-version: 3.8
       - name: run testPipelineJarSparkRunner script
@@ -81,8 +82,16 @@
           gradle-command: :sdks:python:test-suites:portable:py38:testPipelineJarSparkRunner
           arguments: |
             -PpythonVersion=3.8 \
-      - name: Archive code coverage results
+      - name: Archive Python Test Results
         uses: actions/upload-artifact@v3
+        if: failure()
         with:
-          name: python-code-coverage-report
-          path: "**/pytest*.xml"
\ No newline at end of file
+          name: Python Test Results
+          path: '**/pytest*.xml'
+      - name: Publish Python Test Results
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: always()
+        with:
+          commit: '${{ env.prsha || env.GITHUB_SHA }}'
+          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
+          files: '**/pytest*.xml'
\ No newline at end of file
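The Python suites all converge on the same result-handling tail: raw pytest XML is uploaded as an artifact only when the job fails, while the publish step always runs so the check annotation reflects both outcomes, and PR comments are emitted only for comment-triggered runs. The shared pattern, assembled from the hunks above:

```yaml
- name: Archive Python Test Results
  uses: actions/upload-artifact@v3
  if: failure()  # keep the raw XML only when something broke
  with:
    name: Python Test Results
    path: '**/pytest*.xml'
- name: Publish Python Test Results
  uses: EnricoMi/publish-unit-test-result-action@v2
  if: always()   # annotate the run whether it passed or failed
  with:
    commit: '${{ env.prsha || env.GITHUB_SHA }}'
    comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
    files: '**/pytest*.xml'
```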
diff --git a/.github/workflows/beam_PostCommit_Python.yml b/.github/workflows/beam_PostCommit_Python.yml
index a7a214c7c5a98..0b22466f517bb 100644
--- a/.github/workflows/beam_PostCommit_Python.yml
+++ b/.github/workflows/beam_PostCommit_Python.yml
@@ -18,26 +18,26 @@ name: PostCommit Python
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '30 5/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
 permissions:
   actions: write
-  pull-requests: read
-  checks: read
+  pull-requests: write
+  checks: write
   contents: read
   deployments: read
   id-token: none
-  issues: read
+  issues: write
   discussions: read
   packages: read
   pages: read
@@ -52,7 +52,7 @@ env:
 jobs:
   beam_PostCommit_Python:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{matrix.python_version}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }})
     runs-on: [self-hosted, ubuntu-20.04, highmem]
     timeout-minutes: 240
     strategy:
@@ -63,8 +63,9 @@
       python_version: ['3.8', '3.9', '3.10', '3.11']
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
-      github.event.comment.body == 'Run Python PostCommit'
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
+      startswith(github.event.comment.body, 'Run Python PostCommit 3.')
     steps:
       - uses: actions/checkout@v4
       - name: Setup repository
@@ -73,10 +74,10 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }})
-      - name: Install Python
-        uses: actions/setup-python@v4
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
-          python-version: ${{matrix.python_version}}
+          python-version: ${{ matrix.python_version }}
       - name: Install docker compose
         run: |
           sudo curl -L https://github.com/docker/compose/releases/download/1.22.0/docker-compose-$(uname -s)-$(uname -m) -o /usr/local/bin/docker-compose
@@ -96,8 +97,16 @@ jobs:
             -PpythonVersion=${{ matrix.python_version }} \
         env:
           CLOUDSDK_CONFIG: ${{ env.KUBELET_GCLOUD_CONFIG_PATH}}
-      - name: Archive code coverage results
+      - name: Archive Python Test Results
         uses: actions/upload-artifact@v3
+        if: failure()
         with:
-          name: python-code-coverage-report
-          path: "**/pytest*.xml"
\ No newline at end of file
+          name: Python Test Results
+          path: '**/pytest*.xml'
+      - name: Publish Python Test Results
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: always()
+        with:
+          commit: '${{ env.prsha || env.GITHUB_SHA }}'
+          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
+          files: '**/pytest*.xml'
\ No newline at end of file
diff --git a/.github/workflows/beam_PostCommit_Python_Arm.yml b/.github/workflows/beam_PostCommit_Python_Arm.yml
index a77c4e96dc513..1a88c468a67c2 100644
--- a/.github/workflows/beam_PostCommit_Python_Arm.yml
+++ b/.github/workflows/beam_PostCommit_Python_Arm.yml
@@ -18,26 +18,28 @@ name: PostCommit Python Arm
 on:
-  # issue_comment:
-  #   types: [created]
-  # schedule:
-  # - cron: '0 */6 * * *'
+  issue_comment:
+    types: [created]
+  schedule:
+  - cron: '0 5/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
 permissions:
   actions: write
-  pull-requests: read
-  checks: read
+  pull-requests: write
+  checks: write
   contents: read
   deployments: read
   id-token: none
-  issues: read
+  issues: write
   discussions: read
   packages: read
   pages: read
@@ -52,7 +54,7 @@ env:
 jobs:
   beam_PostCommit_Python_Arm:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{matrix.python_version}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 240
     strategy:
@@ -63,7 +65,8 @@ jobs:
       python_version: ['3.8', '3.9', '3.10', '3.11']
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       startsWith(github.event.comment.body, 'Run Python PostCommit Arm')
     steps:
       - uses: actions/checkout@v4
@@ -73,20 +76,28 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }})
-      - name: Install Python
-        uses: actions/setup-python@v4
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
-          python-version: ${{matrix.python_version}}
+          python-version: ${{ matrix.python_version }}
       - name: Install docker compose
         run: |
           sudo curl -L https://github.com/docker/compose/releases/download/1.22.0/docker-compose-$(uname -s)-$(uname -m) -o /usr/local/bin/docker-compose
           sudo chmod +x /usr/local/bin/docker-compose
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+      - name: GCloud Docker credential helper
+        run: |
+          gcloud auth configure-docker us.gcr.io
       - name: Set PY_VER_CLEAN
         id: set_py_ver_clean
         run: |
           PY_VER=${{ matrix.python_version }}
           PY_VER_CLEAN=${PY_VER//.}
           echo "py_ver_clean=$PY_VER_CLEAN" >> $GITHUB_OUTPUT
+      - name: Generate TAG unique variable based on timestamp
+        id: set_tag
+        run: echo "TAG=$(date +'%Y%m%d-%H%M%S%N')" >> $GITHUB_OUTPUT
       - name: run PostCommit Python ${{ matrix.python_version }} script
         uses: ./.github/actions/gradle-command-self-hosted-action
         with:
@@ -94,10 +105,24 @@ jobs:
           arguments: |
             -PuseWheelDistribution \
             -PpythonVersion=${{ matrix.python_version }} \
+            -Pcontainer-architecture-list=arm64,amd64 \
+            -Pdocker-repository-root=us.gcr.io/apache-beam-testing/github-actions \
+            -Pdocker-tag=${{ steps.set_tag.outputs.TAG }} \
+            -Ppush-containers \
         env:
           CLOUDSDK_CONFIG: ${{ env.KUBELET_GCLOUD_CONFIG_PATH}}
-      - name: Archive code coverage results
+          MULTIARCH_TAG: ${{ steps.set_tag.outputs.TAG }}
+          USER: github-actions
+      - name: Archive Python Test Results
         uses: actions/upload-artifact@v3
+        if: failure()
+        with:
+          name: Python Test Results
+          path: '**/pytest*.xml'
+      - name: Publish Python Test Results
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: always()
         with:
-          name: python-code-coverage-report
-          path: "**/pytest*.xml"
+          commit: '${{ env.prsha || env.GITHUB_SHA }}'
+          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
+          files: '**/pytest*.xml'
\ No newline at end of file
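The Arm suite now builds and pushes multi-architecture SDK containers before running tests; a timestamp-derived tag keeps concurrent matrix jobs from clobbering each other's images in the shared us.gcr.io repository. The moving parts, condensed from the hunk above (the `gradle-command` value is not visible in this diff, so the one below is illustrative only):

```yaml
- name: Generate TAG unique variable based on timestamp
  id: set_tag
  run: echo "TAG=$(date +'%Y%m%d-%H%M%S%N')" >> $GITHUB_OUTPUT
- name: run PostCommit Python script
  uses: ./.github/actions/gradle-command-self-hosted-action
  with:
    gradle-command: :sdks:python:test-suites:dataflow:py38:postCommitIT  # illustrative target
    arguments: |
      -Pcontainer-architecture-list=arm64,amd64 \
      -Pdocker-repository-root=us.gcr.io/apache-beam-testing/github-actions \
      -Pdocker-tag=${{ steps.set_tag.outputs.TAG }} \
      -Ppush-containers \
  env:
    MULTIARCH_TAG: ${{ steps.set_tag.outputs.TAG }}
    USER: github-actions
```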
diff --git a/.github/workflows/beam_PostCommit_Python_Examples_Dataflow.yml b/.github/workflows/beam_PostCommit_Python_Examples_Dataflow.yml
index 40c508b38e6e2..ae07cb0273e0b 100644
--- a/.github/workflows/beam_PostCommit_Python_Examples_Dataflow.yml
+++ b/.github/workflows/beam_PostCommit_Python_Examples_Dataflow.yml
@@ -16,21 +16,21 @@ name: PostCommit Python Examples Dataflow
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '0 5/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
 permissions:
   actions: write
-  pull-requests: read
-  checks: read
+  pull-requests: write
+  checks: write
   contents: read
   deployments: read
   id-token: none
-  issues: read
+  issues: write
   discussions: read
   packages: read
   pages: read
@@ -40,7 +40,7 @@ permissions:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 env:
@@ -52,8 +52,9 @@ jobs:
   beam_PostCommit_Python_Examples_Dataflow:
     name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     if: |
-      github.event_name == 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'pull_request_target' ||
       github.event.comment.body == 'Run Python Examples_Dataflow'
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 180
@@ -81,8 +82,16 @@
           arguments: |
             -PuseWheelDistribution \
             -PpythonVersion=3.11 \
-      - name: Archive code coverage results
+      - name: Archive Python Test Results
         uses: actions/upload-artifact@v3
+        if: failure()
         with:
-          name: python-code-coverage-report
-          path: "**/pytest*.xml"
\ No newline at end of file
+          name: Python Test Results
+          path: '**/pytest*.xml'
+      - name: Publish Python Test Results
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: always()
+        with:
+          commit: '${{ env.prsha || env.GITHUB_SHA }}'
+          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
+          files: '**/pytest*.xml'
\ No newline at end of file
diff --git a/.github/workflows/beam_PostCommit_Python_Examples_Direct.yml b/.github/workflows/beam_PostCommit_Python_Examples_Direct.yml
index 85d766b0575c3..b4b620e5dd1f0 100644
--- a/.github/workflows/beam_PostCommit_Python_Examples_Direct.yml
+++ b/.github/workflows/beam_PostCommit_Python_Examples_Direct.yml
@@ -16,21 +16,21 @@ name: PostCommit Python Examples Direct
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '0 5/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
 permissions:
   actions: write
-  pull-requests: read
-  checks: read
+  pull-requests: write
+  checks: write
   contents: read
   deployments: read
   id-token: none
-  issues: read
+  issues: write
   discussions: read
   packages: read
   pages: read
@@ -40,7 +40,7 @@ permissions:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 env:
@@ -51,8 +51,9 @@ env:
 jobs:
   beam_PostCommit_Python_Examples_Direct:
     if: |
-      github.event_name == 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'pull_request_target' ||
       startsWith(github.event.comment.body, 'Run Python Examples_Direct')
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 120
@@ -88,8 +89,16 @@
           gradle-command: :sdks:python:test-suites:direct:py${{steps.set_py_ver_clean.outputs.py_ver_clean}}:examples
           arguments: |
             -PpythonVersion=${{ matrix.python_version }} \
-      - name: Archive code coverage results
+      - name: Archive Python Test Results
         uses: actions/upload-artifact@v3
+        if: failure()
         with:
-          name: python-code-coverage-report
-          path: "**/pytest*.xml"
\ No newline at end of file
+          name: Python Test Results
+          path: '**/pytest*.xml'
+      - name: Publish Python Test Results
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: always()
+        with:
+          commit: '${{ env.prsha || env.GITHUB_SHA }}'
+          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
+          files: '**/pytest*.xml'
\ No newline at end of file
diff --git a/.github/workflows/beam_PostCommit_Python_Examples_Flink.yml b/.github/workflows/beam_PostCommit_Python_Examples_Flink.yml
index 17112cf18e2ae..1f334f6a99636 100644
--- a/.github/workflows/beam_PostCommit_Python_Examples_Flink.yml
+++ b/.github/workflows/beam_PostCommit_Python_Examples_Flink.yml
@@ -16,21 +16,21 @@ name: PostCommit Python Examples Flink
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '0 5/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
 permissions:
   actions: write
-  pull-requests: read
-  checks: read
+  pull-requests: write
+  checks: write
   contents: read
   deployments: read
   id-token: none
-  issues: read
+  issues: write
   discussions: read
   packages: read
   pages: read
@@ -40,7 +40,7 @@ permissions:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 env:
@@ -51,11 +51,12 @@ env:
 jobs:
   beam_PostCommit_Python_Examples_Flink:
     if: |
-      github.event_name == 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'pull_request_target' ||
       startsWith(github.event.comment.body, 'Run Python Examples_Flink')
     runs-on: [self-hosted, ubuntu-20.04, main]
-    timeout-minutes: 120
+    timeout-minutes: 240
     name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }})
     strategy:
       fail-fast: false
@@ -88,8 +89,16 @@
           gradle-command: :sdks:python:test-suites:portable:py${{steps.set_py_ver_clean.outputs.py_ver_clean}}:flinkExamples
           arguments: |
             -PpythonVersion=${{ matrix.python_version }} \
-      - name: Archive code coverage results
+      - name: Archive Python Test Results
         uses: actions/upload-artifact@v3
+        if: failure()
         with:
-          name: python-code-coverage-report
-          path: "**/pytest*.xml"
\ No newline at end of file
+          name: Python Test Results
+          path: '**/pytest*.xml'
+      - name: Publish Python Test Results
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: always()
+        with:
+          commit: '${{ env.prsha || env.GITHUB_SHA }}'
+          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
+          files: '**/pytest*.xml'
\ No newline at end of file
diff --git a/.github/workflows/beam_PostCommit_Python_Examples_Spark.yml b/.github/workflows/beam_PostCommit_Python_Examples_Spark.yml
index ccc7a998df27f..6a33c63f24a08 100644
--- a/.github/workflows/beam_PostCommit_Python_Examples_Spark.yml
+++ b/.github/workflows/beam_PostCommit_Python_Examples_Spark.yml
@@ -16,21 +16,21 @@ name: PostCommit Python Examples Spark
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '0 5/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
 permissions:
   actions: write
-  pull-requests: read
-  checks: read
+  pull-requests: write
+  checks: write
   contents: read
   deployments: read
   id-token: none
-  issues: read
+  issues: write
   discussions: read
   packages: read
   pages: read
@@ -40,7 +40,7 @@ permissions:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 env:
@@ -51,8 +51,9 @@ env:
 jobs:
   beam_PostCommit_Python_Examples_Spark:
     if: |
-      github.event_name == 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'pull_request_target' ||
       startsWith(github.event.comment.body, 'Run Python Examples_Spark')
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 120
@@ -88,8 +89,16 @@
           gradle-command: :sdks:python:test-suites:portable:py${{steps.set_py_ver_clean.outputs.py_ver_clean}}:sparkExamples
           arguments: |
             -PpythonVersion=${{ matrix.python_version }} \
-      - name: Archive code coverage results
+      - name: Archive Python Test Results
         uses: actions/upload-artifact@v3
+        if: failure()
         with:
-          name: python-code-coverage-report
-          path: "**/pytest*.xml"
\ No newline at end of file
+          name: Python Test Results
+          path: '**/pytest*.xml'
+      - name: Publish Python Test Results
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: always()
+        with:
+          commit: '${{ env.prsha || env.GITHUB_SHA }}'
+          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
+          files: '**/pytest*.xml'
\ No newline at end of file
diff --git a/.github/workflows/beam_PostCommit_Python_MongoDBIO_IT.yml b/.github/workflows/beam_PostCommit_Python_MongoDBIO_IT.yml
index 0fb2302b8ed19..1c8daf4cffa39 100644
--- a/.github/workflows/beam_PostCommit_Python_MongoDBIO_IT.yml
+++ b/.github/workflows/beam_PostCommit_Python_MongoDBIO_IT.yml
@@ -16,21 +16,21 @@ name: PostCommit Python MongoDBIO IT
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '15 5/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
 permissions:
   actions: write
-  pull-requests: read
-  checks: read
+  pull-requests: write
+  checks: write
   contents: read
   deployments: read
   id-token: none
-  issues: read
+  issues: write
   discussions: read
   packages: read
   pages: read
@@ -40,7 +40,7 @@ permissions:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 env:
@@ -52,8 +52,9 @@ jobs:
   beam_PostCommit_Python_MongoDBIO_IT:
     name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     if: |
-      github.event_name == 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'pull_request_target' ||
       github.event.comment.body == 'Run Python MongoDBIO_IT'
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 100
@@ -80,8 +81,16 @@
           gradle-command: :sdks:python:test-suites:direct:py311:mongodbioIT
           arguments: |
             -PpythonVersion=3.11 \
-      - name: Archive code coverage results
+      - name: Archive Python Test Results
         uses: actions/upload-artifact@v3
+        if: failure()
         with:
-          name: python-code-coverage-report
-          path: "**/pytest*.xml"
\ No newline at end of file
+          name: Python Test Results
+          path: '**/pytest*.xml'
+      - name: Publish Python Test Results
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: always()
+        with:
+          commit: '${{ env.prsha || env.GITHUB_SHA }}'
+          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
+          files: '**/pytest*.xml'
\ No newline at end of file
diff --git a/.github/workflows/beam_PostCommit_Python_Nexmark_Direct.yml b/.github/workflows/beam_PostCommit_Python_Nexmark_Direct.yml
index db95a48a3007e..f63461369be00 100644
--- a/.github/workflows/beam_PostCommit_Python_Nexmark_Direct.yml
+++ b/.github/workflows/beam_PostCommit_Python_Nexmark_Direct.yml
@@ -18,15 +18,15 @@ name: PostCommit Python Nexmark Direct
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '15 5/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
@@ -89,7 +89,7 @@ env:
 jobs:
   beam_PostCommit_Python_Nexmark_Direct:
-    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 240
    strategy:
@@ -104,7 +104,8 @@ jobs:
       query: [0, 2, 3, 5, 7, 8, 10, 11]
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request_target' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Python Direct Runner Nexmark Tests'
     steps:
       - uses: actions/checkout@v4
@@ -114,8 +115,8 @@ jobs:
           comment_phrase: ${{ matrix.job_phrase }}
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
-      - name: Install Python
-        uses: actions/setup-python@v4
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
         with:
           python-version: 3.8
       - name: run Java Testing Nexmark (query ${{ matrix.query }})
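The permission blocks in the Python workflows are raised from read to write on exactly the scopes the new publish step needs, while everything else stays read-only. The post-change block, assembled from the hunks above (the write rationales in the comments are this edit's reading, not text from the patch):

```yaml
permissions:
  actions: write
  pull-requests: write  # presumably so the publish step can comment on the PR
  checks: write         # presumably so it can create the check run with annotations
  contents: read
  deployments: read
  id-token: none
  issues: write         # comment_mode may post to the triggering issue thread
  discussions: read
  packages: read
  pages: read
  # remaining scopes are unchanged read-only entries
```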
diff --git a/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml b/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml
index fb7102a5b52e1..196dd1eaa84ed 100644
--- a/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml
+++ b/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml
@@ -16,21 +16,23 @@ name: PostCommit Python ValidatesContainer Dataflow
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '15 5/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
+  issue_comment:
+    types: [created]
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
 permissions:
   actions: write
-  pull-requests: read
-  checks: read
+  pull-requests: write
+  checks: write
   contents: read
   deployments: read
   id-token: none
-  issues: read
+  issues: write
   discussions: read
   packages: read
   pages: read
@@ -40,7 +42,7 @@ permissions:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 env:
@@ -51,8 +53,9 @@ env:
 jobs:
   beam_PostCommit_Python_ValidatesContainer_Dataflow:
     if: |
-      github.event_name == 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'pull_request_target' ||
       startsWith(github.event.comment.body, 'Run Python Dataflow ValidatesContainer')
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 100
@@ -74,7 +77,9 @@ jobs:
       - name: Setup environment
         uses: ./.github/actions/setup-environment-action
         with:
-          java-version: 8
+          java-version: |
+            11
+            8
           python-version: ${{ matrix.python_version }}
       - name: Set PY_VER_CLEAN
         id: set_py_ver_clean
@@ -90,8 +95,16 @@ jobs:
           gradle-command: :sdks:python:test-suites:dataflow:py${{steps.set_py_ver_clean.outputs.py_ver_clean}}:validatesContainer
           arguments: |
             -PpythonVersion=${{ matrix.python_version }} \
-      - name: Archive code coverage results
+      - name: Archive Python Test Results
         uses: actions/upload-artifact@v3
+        if: failure()
+        with:
+          name: Python Test Results
+          path: '**/pytest*.xml'
+      - name: Publish Python Test Results
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: always()
         with:
-          name: python-code-coverage-report
-          path: "**/pytest*.xml"
\ No newline at end of file
+          commit: '${{ env.prsha || env.GITHUB_SHA }}'
+          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
+          files: '**/pytest*.xml'
diff --git a/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml b/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml
index a95682e415d4f..ac2179dd25214 100644
--- a/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml
+++ b/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml
@@ -16,21 +16,21 @@ name: PostCommit Python ValidatesContainer Dataflow With RC
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '15 5/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
 permissions:
   actions: write
-  pull-requests: read
-  checks: read
+  pull-requests: write
+  checks: write
   contents: read
   deployments: read
   id-token: none
-  issues: read
+  issues: write
   discussions: read
   packages: read
   pages: read
@@ -40,7 +40,7 @@ permissions:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 env:
@@ -51,8 +51,9 @@ env:
 jobs:
   beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC:
     if: |
-      github.event_name == 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'pull_request_target' ||
       startsWith(github.event.comment.body, 'Run Python RC Dataflow ValidatesContainer')
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 100
@@ -74,7 +75,9 @@ jobs:
       - name: Setup environment
         uses: ./.github/actions/setup-environment-action
         with:
-          java-version: 8
+          java-version: |
+            11
+            8
           python-version: ${{ matrix.python_version }}
       - name: Set PY_VER_CLEAN
         id: set_py_ver_clean
@@ -89,10 +92,18 @@
         with:
           gradle-command: :sdks:python:test-suites:dataflow:py${{steps.set_py_ver_clean.outputs.py_ver_clean}}:validatesContainer
           arguments: |
-            -PtestRCDependencies=true
+            -PtestRCDependencies=true \
             -PpythonVersion=${{ matrix.python_version }} \
-      - name: Archive code coverage results
+      - name: Archive Python Test Results
         uses: actions/upload-artifact@v3
+        if: failure()
         with:
-          name: python-code-coverage-report
-          path: "**/pytest*.xml"
\ No newline at end of file
+          name: Python Test Results
+          path: '**/pytest*.xml'
+      - name: Publish Python Test Results
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: always()
+        with:
+          commit: '${{ env.prsha || env.GITHUB_SHA }}'
+          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
+          files: '**/pytest*.xml'
\ No newline at end of file
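The cron rewrites are not only about ownership of the schedule: the `M H/6 * * *` form staggers start times, where `5/6` in the hour field means "starting at hour 5, then every 6 hours". For example:

```yaml
schedule:
# fires at 05:15, 11:15, 17:15 and 23:15 UTC, rather than at the shared
# 00:00/06:00/12:00/18:00 slots the old '0 */6 * * *' produced
- cron: '15 5/6 * * *'
```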
diff --git a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml
index 572f477773b6c..79bc303f1117c 100644
--- a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml
+++ b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml
@@ -16,21 +16,21 @@ name: PostCommit Python ValidatesRunner Dataflow
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '15 5/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
 permissions:
   actions: write
-  pull-requests: read
-  checks: read
+  pull-requests: write
+  checks: write
   contents: read
   deployments: read
   id-token: none
-  issues: read
+  issues: write
   discussions: read
   packages: read
   pages: read
@@ -40,7 +40,7 @@ permissions:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 env:
@@ -51,8 +51,9 @@ env:
 jobs:
   beam_PostCommit_Python_ValidatesRunner_Dataflow:
     if: |
-      github.event_name == 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'pull_request_target' ||
       startsWith(github.event.comment.body, 'Run Python Dataflow ValidatesRunner')
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 200
@@ -96,8 +97,16 @@
           arguments: |
             -PuseWheelDistribution \
             -PpythonVersion=${{ matrix.python_version }} \
-      - name: Archive code coverage results
+      - name: Archive Python Test Results
         uses: actions/upload-artifact@v3
+        if: failure()
         with:
-          name: python-code-coverage-report
-          path: "**/pytest*.xml"
\ No newline at end of file
+          name: Python Test Results
+          path: '**/pytest*.xml'
+      - name: Publish Python Test Results
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: always()
+        with:
+          commit: '${{ env.prsha || env.GITHUB_SHA }}'
+          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
+          files: '**/pytest*.xml'
\ No newline at end of file
diff --git a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml
index a1652f3a18de3..b403f76b9f9e3 100644
--- a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml
+++ b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml
@@ -16,21 +16,21 @@ name: PostCommit Python ValidatesRunner Flink
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '15 5/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
 permissions:
   actions: write
-  pull-requests: read
-  checks: read
+  pull-requests: write
+  checks: write
   contents: read
   deployments: read
   id-token: none
-  issues: read
+  issues: write
   discussions: read
   packages: read
   pages: read
@@ -40,7 +40,7 @@ permissions:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 env:
@@ -51,8 +51,9 @@ env:
 jobs:
   beam_PostCommit_Python_ValidatesRunner_Flink:
     if: |
-      github.event_name == 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'pull_request_target' ||
       startsWith(github.event.comment.body, 'Run Python Flink ValidatesRunner')
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 100
@@ -90,8 +91,16 @@
           gradle-command: :sdks:python:test-suites:portable:py${{steps.set_py_ver_clean.outputs.py_ver_clean}}:flinkValidatesRunner
           arguments: |
             -PpythonVersion=${{ matrix.python_version }} \
-      - name: Archive code coverage results
+      - name: Archive Python Test Results
         uses: actions/upload-artifact@v3
+        if: failure()
         with:
-          name: python-code-coverage-report
-          path: "**/pytest*.xml"
\ No newline at end of file
+          name: Python Test Results
+          path: '**/pytest*.xml'
+      - name: Publish Python Test Results
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: always()
+        with:
+          commit: '${{ env.prsha || env.GITHUB_SHA }}'
+          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
+          files: '**/pytest*.xml'
\ No newline at end of file
diff --git a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml
index c41605391d40c..4229304278c01 100644
--- a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml
+++ b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml
@@ -16,21 +16,21 @@ name: PostCommit Python ValidatesRunner Samza
 on:
-  issue_comment:
-    types: [created]
   schedule:
-  - cron: '0 */6 * * *'
+  - cron: '15 5/6 * * *'
+  pull_request_target:
+    paths: ['release/trigger_all_tests.json']
   workflow_dispatch:
 #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
 permissions:
   actions: write
-  pull-requests: read
-  checks: read
+  pull-requests: write
+  checks: write
   contents: read
   deployments: read
   id-token: none
-  issues: read
+  issues: write
   discussions: read
   packages: read
   pages: read
@@ -40,7 +40,7 @@ permissions:
 # This allows a subsequently queued workflow run to interrupt previous runs
 concurrency:
-  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
   cancel-in-progress: true
 env:
@@ -51,8 +51,9 @@ env:
 jobs:
   beam_PostCommit_Python_ValidatesRunner_Samza:
     if: |
-      github.event_name == 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'pull_request_target' ||
       startsWith(github.event.comment.body, 'Run Python Samza ValidatesRunner')
     runs-on: [self-hosted, ubuntu-20.04, main]
     timeout-minutes: 100
@@ -88,8 +89,16 @@
           gradle-command: :sdks:python:test-suites:portable:py${{steps.set_py_ver_clean.outputs.py_ver_clean}}:samzaValidatesRunner
           arguments: |
             -PpythonVersion=${{ matrix.python_version }} \
-      - name: Archive code coverage results
+      - name: Archive Python Test Results
         uses: actions/upload-artifact@v3
+        if: failure()
         with:
-          name: python-code-coverage-report
-          path: "**/pytest*.xml"
\ No newline at end of file
+          name: Python Test Results
+          path: '**/pytest*.xml'
+      - name: Publish Python Test Results
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: always()
+        with:
+          commit: '${{ env.prsha || env.GITHUB_SHA }}'
+          comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
+          files: '**/pytest*.xml'
\ No newline at end of file
*/6 * * *' + - cron: '15 5/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none - issues: read + issues: write discussions: read packages: read pages: read @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -51,8 +51,9 @@ env: jobs: beam_PostCommit_Python_ValidatesRunner_Spark: if: | - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || + github.event_name == 'pull_request_target' || startsWith(github.event.comment.body, 'Run Python Spark ValidatesRunner') runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -88,8 +89,16 @@ jobs: gradle-command: :sdks:python:test-suites:portable:py${{steps.set_py_ver_clean.outputs.py_ver_clean}}:sparkValidatesRunner arguments: | -PpythonVersion=${{ matrix.python_version }} \ - - name: Archive code coverage results + - name: Archive Python Test Results uses: actions/upload-artifact@v3 + if: failure() with: - name: python-code-coverage-report - path: "**/pytest*.xml" \ No newline at end of file + name: Python Test Results + path: '**/pytest*.xml' + - name: Publish Python Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/pytest*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Dataflow.yml b/.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Dataflow.yml index 31025df81ddce..dd899a538e9a9 100644 --- a/.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Dataflow.yml +++ b/.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Dataflow.yml @@ -16,21 +16,21 @@ name: PostCommit Python Xlang Gcp Dataflow on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 5/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none - issues: read + issues: write discussions: read packages: read pages: read @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || 
github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +52,8 @@ jobs: beam_PostCommit_Python_Xlang_Gcp_Dataflow: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Python_Xlang_Gcp_Dataflow PostCommit' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 180 @@ -69,8 +70,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: python-version: | 3.8 @@ -80,9 +81,16 @@ jobs: with: gradle-command: :sdks:python:test-suites:dataflow:gcpCrossLanguagePostCommit arguments: -PuseWheelDistribution - - name: Archive code coverage results + - name: Archive Python Test Results uses: actions/upload-artifact@v3 + if: failure() + with: + name: Python Test Results + path: '**/pytest*.xml' + - name: Publish Python Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 if: always() with: - name: archiveJunit - path: "**/pytest*.xml" \ No newline at end of file + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/pytest*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Direct.yml b/.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Direct.yml index d17d15029c2c5..33eb748a2f84b 100644 --- a/.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Direct.yml +++ b/.github/workflows/beam_PostCommit_Python_Xlang_Gcp_Direct.yml @@ -16,21 +16,21 @@ name: PostCommit Python Xlang Gcp Direct on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 5/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none - issues: read + issues: write discussions: read packages: read pages: read @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +52,8 @@ jobs: beam_PostCommit_Python_Xlang_Gcp_Direct: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run 
Python_Xlang_Gcp_Direct PostCommit' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -69,8 +70,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: python-version: | 3.8 @@ -79,9 +80,16 @@ jobs: uses: ./.github/actions/gradle-command-self-hosted-action with: gradle-command: :sdks:python:test-suites:direct:gcpCrossLanguagePostCommit - - name: Archive code coverage results + - name: Archive Python Test Results uses: actions/upload-artifact@v3 + if: failure() + with: + name: Python Test Results + path: '**/pytest*.xml' + - name: Publish Python Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 if: always() with: - name: archiveJunit - path: "**/pytest*.xml" \ No newline at end of file + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/pytest*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PostCommit_Python_Xlang_IO_Dataflow.yml b/.github/workflows/beam_PostCommit_Python_Xlang_IO_Dataflow.yml index 386d915187956..4d71e507fe325 100644 --- a/.github/workflows/beam_PostCommit_Python_Xlang_IO_Dataflow.yml +++ b/.github/workflows/beam_PostCommit_Python_Xlang_IO_Dataflow.yml @@ -16,21 +16,21 @@ name: PostCommit Python Xlang IO Dataflow on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 5/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none - issues: read + issues: write discussions: read packages: read pages: read @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +52,8 @@ jobs: beam_PostCommit_Python_Xlang_IO_Dataflow: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Python_Xlang_IO_Dataflow PostCommit' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 180 @@ -69,8 +70,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: python-version: | 3.8 @@ -82,9 +83,16 @@ jobs: arguments: | -PuseWheelDistribution \ 
-PkafkaBootstrapServer=10.128.0.40:9094,10.128.0.28:9094,10.128.0.165:9094 \ - - name: Archive code coverage results + - name: Archive Python Test Results uses: actions/upload-artifact@v3 + if: failure() + with: + name: Python Test Results + path: '**/pytest*.xml' + - name: Publish Python Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 if: always() with: - name: archiveJunit - path: "**/pytest*.xml" \ No newline at end of file + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/pytest*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PostCommit_SQL.yml b/.github/workflows/beam_PostCommit_SQL.yml index a753cde6e56d9..eae7d4374d5b5 100644 --- a/.github/workflows/beam_PostCommit_SQL.yml +++ b/.github/workflows/beam_PostCommit_SQL.yml @@ -18,15 +18,15 @@ name: PostCommit SQL on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 5/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -52,7 +52,7 @@ env: jobs: beam_PostCommit_SQL: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 240 strategy: @@ -61,7 +61,8 @@ jobs: job_phrase: [Run SQL PostCommit] if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run SQL PostCommit' steps: - uses: actions/checkout@v4 @@ -71,6 +72,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run PostCommit SQL script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PostCommit_Sickbay_Python.yml b/.github/workflows/beam_PostCommit_Sickbay_Python.yml index 2153e9908898a..b4d054f07a3d7 100644 --- a/.github/workflows/beam_PostCommit_Sickbay_Python.yml +++ b/.github/workflows/beam_PostCommit_Sickbay_Python.yml @@ -18,26 +18,26 @@ name: PostCommit Sickbay Python on: - issue_comment: - types: [created] schedule: - - cron: '0 0 * * *' + - cron: '0 8 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || 
github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none - issues: read + issues: write discussions: read packages: read pages: read @@ -52,7 +52,7 @@ env: jobs: beam_PostCommit_Sickbay_Python: - name: ${{matrix.job_name}} (${{matrix.job_phrase_1}} ${{matrix.python_version}} ${{matrix.job_phrase_2}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase_1 }} ${{ matrix.python_version }} ${{ matrix.job_phrase_2 }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 180 strategy: @@ -64,7 +64,8 @@ jobs: python_version: ['3.8', '3.9', '3.10', '3.11'] if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || (startswith(github.event.comment.body, 'Run Python') && endswith(github.event.comment.body, 'PostCommit Sickbay')) steps: @@ -72,13 +73,13 @@ jobs: - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{ matrix.job_phrase_1 }} ${{matrix.python_version}} ${{ matrix.job_phrase_2 }} + comment_phrase: ${{ matrix.job_phrase_1 }} ${{ matrix.python_version }} ${{ matrix.job_phrase_2 }} github_token: ${{ secrets.GITHUB_TOKEN }} - github_job: ${{ matrix.job_name }} ${{ matrix.job_phrase_1 }} ${{matrix.python_version}} ${{ matrix.job_phrase_2 }} - - name: Install Python - uses: actions/setup-python@v4 + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase_1 }} ${{ matrix.python_version }} ${{ matrix.job_phrase_2 }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - python-version: ${{matrix.python_version}} + python-version: ${{ matrix.python_version }} - name: Set PY_VER_CLEAN id: set_py_ver_clean run: | @@ -91,8 +92,16 @@ jobs: gradle-command: :sdks:python:test-suites:dataflow:py${{steps.set_py_ver_clean.outputs.py_ver_clean}}:postCommitSickbay arguments: | -PpythonVersion=${{ matrix.python_version }} \ - - name: Archive code coverage results + - name: Archive Python Test Results uses: actions/upload-artifact@v3 + if: failure() with: - name: python-code-coverage-report - path: "**/pytest*.xml" \ No newline at end of file + name: Python Test Results + path: '**/pytest*.xml' + - name: Publish Python Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/pytest*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PostCommit_TransformService_Direct.yml b/.github/workflows/beam_PostCommit_TransformService_Direct.yml index 9387e9f748313..e40112f0c5b19 100644 --- a/.github/workflows/beam_PostCommit_TransformService_Direct.yml +++ b/.github/workflows/beam_PostCommit_TransformService_Direct.yml @@ -16,21 +16,21 @@ name: PostCommit TransformService Direct on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 5/6 * * *' + pull_request_target: + paths: 
['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none - issues: read + issues: write discussions: read packages: read pages: read @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,11 +52,12 @@ jobs: beam_PostCommit_TransformService_Direct: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run TransformService_Direct PostCommit' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 120 - name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{ matrix.python_version }}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) strategy: matrix: job_name: ["beam_PostCommit_TransformService_Direct"] @@ -70,14 +71,10 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Set up Java 11 - uses: actions/setup-java@v3.11.0 - with: - distribution: 'temurin' - java-version: '11' - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 11 python-version: | 3.8 3.11 @@ -86,13 +83,20 @@ jobs: with: gradle-command: :sdks:python:test-suites:direct:xlang:transformServicePythonUsingJava arguments: | - -PcompileAndRunTestsWithJava11 \ + -PtestJavaVersion=11 \ -Pjava11Home=$JAVA_HOME_11_X64 \ -PuseWheelDistribution \ -PpythonVersion=${{ matrix.python_version }} \ - - name: Archive code coverage results + - name: Archive Python Test Results uses: actions/upload-artifact@v3 + if: failure() + with: + name: Python Test Results + path: '**/pytest*.xml' + - name: Publish Python Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 if: always() with: - name: archiveJunit - path: "**/pytest*.xml" \ No newline at end of file + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/pytest*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PostCommit_Website_Publish.yml b/.github/workflows/beam_PostCommit_Website_Publish.yml index a1e23b530a260..ed1729021e988 100644 --- a/.github/workflows/beam_PostCommit_Website_Publish.yml +++ b/.github/workflows/beam_PostCommit_Website_Publish.yml @@ -17,7 +17,7 @@ name: PostCommit Website Publish on: schedule: - - cron: '0 */6 * * *' + - cron: '30 5/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are 
`write-all` in case of pull_request_target event @@ -38,7 +38,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -50,14 +50,19 @@ jobs: beam_PostCommit_Website_Publish: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' + (github.event_name == 'schedule' && github.repository == 'apache/beam') runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 30 name: beam_PostCommit_Website_Publish steps: - uses: actions/checkout@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + disable-cache: true - name: run PostCommit Website Publish script uses: ./.github/actions/gradle-command-self-hosted-action with: gradle-command: :website:clean :website:publishWebsite - arguments: -PgitPublishRemote="https://github.com/apache/beam.git" \ No newline at end of file + arguments: -PgitPublishRemote="https://github.com/apache/beam.git" + - uses: actions/checkout@v4 # Extra checkout to make sure we're on master for post steps. diff --git a/.github/workflows/beam_PostCommit_Website_Test.yml b/.github/workflows/beam_PostCommit_Website_Test.yml index cd06181c456d2..ba372d2232817 100644 --- a/.github/workflows/beam_PostCommit_Website_Test.yml +++ b/.github/workflows/beam_PostCommit_Website_Test.yml @@ -16,10 +16,10 @@ name: PostCommit Website Test on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 5/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +52,8 @@ jobs: beam_PostCommit_Website_Test: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Full Website Test' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 60 @@ -69,6 +70,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run PostCommit Website Test script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git 
a/.github/workflows/beam_PostCommit_XVR_Direct.yml b/.github/workflows/beam_PostCommit_XVR_Direct.yml index eaee2b669873b..ee90f9176f6d4 100644 --- a/.github/workflows/beam_PostCommit_XVR_Direct.yml +++ b/.github/workflows/beam_PostCommit_XVR_Direct.yml @@ -16,17 +16,17 @@ name: PostCommit XVR Direct on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 5/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,11 +52,12 @@ jobs: beam_PostCommit_XVR_Direct: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run XVR_Direct PostCommit' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 120 - name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{ matrix.python_version }}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) strategy: matrix: job_name: ["beam_PostCommit_XVR_Direct"] @@ -70,8 +71,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: python-version: | 3.8 @@ -96,9 +97,16 @@ jobs: arguments: | -PpythonVersion=${{ matrix.python_version }} \ -PskipNonPythonTask=false \ - - name: Archive code coverage results + - name: Archive JUnit Test Results uses: actions/upload-artifact@v3 + if: failure() + with: + name: JUnit Test Results + path: "**/build/reports/tests/" + - name: Publish JUnit Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 if: always() with: - name: archiveJunit - path: "**/build/test-results/**/*.xml" \ No newline at end of file + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/build/test-results/**/*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PostCommit_XVR_Flink.yml b/.github/workflows/beam_PostCommit_XVR_Flink.yml index 67effa70b8b56..00c756a714f3e 100644 --- a/.github/workflows/beam_PostCommit_XVR_Flink.yml +++ b/.github/workflows/beam_PostCommit_XVR_Flink.yml @@ -16,17 +16,17 @@ name: PostCommit XVR Flink on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 5/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] 
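# The trigger rewrite above repeats across these PostCommit suites: the
# issue_comment trigger goes away, the cron is staggered off the top of the
# hour, and a pull_request_target trigger scoped to
# release/trigger_all_tests.json lets a release-validation PR fan out every
# suite at once, while the schedule guard keeps forks from running the cron.
# A minimal sketch of the resulting trigger-and-guard shape; the workflow
# and comment-phrase names here are purely illustrative:
#
#   on:
#     schedule:
#       - cron: '30 5/6 * * *'   # staggered so suites do not all fire at :00
#     pull_request_target:
#       paths: ['release/trigger_all_tests.json']
#     workflow_dispatch:
#   jobs:
#     example_suite:
#       if: |
#         github.event_name == 'workflow_dispatch' ||
#         github.event_name == 'pull_request_target' ||
#         (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
#         github.event.comment.body == 'Run Example PostCommit'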
workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -53,11 +53,12 @@ jobs: beam_PostCommit_XVR_Flink: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run XVR_Flink PostCommit' - runs-on: [self-hosted, ubuntu-20.04, main] + runs-on: [self-hosted, ubuntu-20.04, highmem] timeout-minutes: 100 - name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{ matrix.python_version }}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) strategy: matrix: job_name: ["beam_PostCommit_XVR_Flink"] @@ -71,8 +72,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: python-version: | 3.8 @@ -97,9 +98,16 @@ jobs: arguments: | -PpythonVersion=${{ matrix.python_version }} \ -PskipNonPythonTask=false \ - - name: Archive code coverage results + - name: Archive JUnit Test Results uses: actions/upload-artifact@v3 + if: failure() + with: + name: JUnit Test Results + path: "**/build/reports/tests/" + - name: Publish JUnit Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 if: always() with: - name: archiveJunit - path: "**/build/test-results/**/*.xml" \ No newline at end of file + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/build/test-results/**/*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PostCommit_XVR_GoUsingJava_Dataflow.yml b/.github/workflows/beam_PostCommit_XVR_GoUsingJava_Dataflow.yml index 2f8a2eb8a3da2..aab8a0e0a84f0 100644 --- a/.github/workflows/beam_PostCommit_XVR_GoUsingJava_Dataflow.yml +++ b/.github/workflows/beam_PostCommit_XVR_GoUsingJava_Dataflow.yml @@ -16,10 +16,10 @@ name: PostCommit XVR GoUsingJava Dataflow on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 5/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || 
github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +52,8 @@ jobs: beam_PostCommit_XVR_GoUsingJava_Dataflow: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run XVR_GoUsingJava_Dataflow PostCommit' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -69,12 +70,13 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - python-version: '3.8' + python-version: 3.8 - name: run XVR GoUsingJava Dataflow script env: + USER: github-actions CLOUDSDK_CONFIG: ${{ env.KUBELET_GCLOUD_CONFIG_PATH}} uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PostCommit_XVR_JavaUsingPython_Dataflow.yml b/.github/workflows/beam_PostCommit_XVR_JavaUsingPython_Dataflow.yml index 386fa304606d5..113c516260782 100644 --- a/.github/workflows/beam_PostCommit_XVR_JavaUsingPython_Dataflow.yml +++ b/.github/workflows/beam_PostCommit_XVR_JavaUsingPython_Dataflow.yml @@ -16,17 +16,17 @@ name: PostCommit XVR JavaUsingPython Dataflow on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 5/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,11 +52,12 @@ jobs: beam_PostCommit_XVR_JavaUsingPython_Dataflow: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run XVR_JavaUsingPython_Dataflow PostCommit' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 - name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{ matrix.python_version }}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) strategy: matrix: job_name: ["beam_PostCommit_XVR_JavaUsingPython_Dataflow"] @@ -70,8 +71,8 @@ jobs: comment_phrase: ${{ 
matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: python-version: | 3.8 @@ -84,9 +85,16 @@ jobs: gradle-command: :runners:google-cloud-dataflow-java:validatesCrossLanguageRunnerJavaUsingPython arguments: | -PpythonVersion=${{ matrix.python_version }} \ - - name: Archive code coverage results + - name: Archive JUnit Test Results uses: actions/upload-artifact@v3 + if: failure() + with: + name: JUnit Test Results + path: "**/build/reports/tests/" + - name: Publish JUnit Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 if: always() with: - name: archiveJunit - path: "**/build/test-results/**/*.xml" \ No newline at end of file + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/build/test-results/**/*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PostCommit_XVR_PythonUsingJavaSQL_Dataflow.yml b/.github/workflows/beam_PostCommit_XVR_PythonUsingJavaSQL_Dataflow.yml index 2432ddc30e2a4..6759930d5de81 100644 --- a/.github/workflows/beam_PostCommit_XVR_PythonUsingJavaSQL_Dataflow.yml +++ b/.github/workflows/beam_PostCommit_XVR_PythonUsingJavaSQL_Dataflow.yml @@ -16,21 +16,21 @@ name: PostCommit XVR PythonUsingJavaSQL Dataflow on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 5/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none - issues: read + issues: write discussions: read packages: read pages: read @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,7 +52,8 @@ jobs: beam_PostCommit_XVR_PythonUsingJavaSQL_Dataflow: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run XVR_PythonUsingJavaSQL_Dataflow PostCommit' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 @@ -69,8 +70,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: python-version: 3.11 - name: run PostCommit XVR PythonUsingJavaSQL Dataflow script @@ -81,9 +82,16 @@ jobs: gradle-command: 
:runners:google-cloud-dataflow-java:validatesCrossLanguageRunnerPythonUsingSql arguments: | -PpythonVersion=3.11 \ - - name: Archive code coverage results + - name: Archive Python Test Results uses: actions/upload-artifact@v3 + if: failure() + with: + name: Python Test Results + path: '**/pytest*.xml' + - name: Publish Python Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 if: always() with: - name: archiveJunit - path: "**/pytest*.xml" \ No newline at end of file + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/pytest*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PostCommit_XVR_PythonUsingJava_Dataflow.yml b/.github/workflows/beam_PostCommit_XVR_PythonUsingJava_Dataflow.yml index bebac78a6315e..dda068049a172 100644 --- a/.github/workflows/beam_PostCommit_XVR_PythonUsingJava_Dataflow.yml +++ b/.github/workflows/beam_PostCommit_XVR_PythonUsingJava_Dataflow.yml @@ -16,21 +16,21 @@ name: PostCommit XVR PythonUsingJava Dataflow on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 5/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none - issues: read + issues: write discussions: read packages: read pages: read @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,11 +52,12 @@ jobs: beam_PostCommit_XVR_PythonUsingJava_Dataflow: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run XVR_PythonUsingJava_Dataflow PostCommit' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 - name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{ matrix.python_version }}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) strategy: matrix: job_name: ["beam_PostCommit_XVR_PythonUsingJava_Dataflow"] @@ -70,8 +71,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: python-version: | 3.8 @@ -84,9 +85,16 @@ jobs: gradle-command: :runners:google-cloud-dataflow-java:validatesCrossLanguageRunnerPythonUsingJava arguments: | -PpythonVersion=${{ matrix.python_version }} \ - - name: Archive code coverage results + - name: Archive Python Test Results uses: 
actions/upload-artifact@v3 + if: failure() + with: + name: Python Test Results + path: '**/pytest*.xml' + - name: Publish Python Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 if: always() with: - name: archiveJunit - path: "**/pytest*.xml" \ No newline at end of file + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/pytest*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PostCommit_XVR_Samza.yml b/.github/workflows/beam_PostCommit_XVR_Samza.yml index 8cf16cfe225d3..b05b588e0cf84 100644 --- a/.github/workflows/beam_PostCommit_XVR_Samza.yml +++ b/.github/workflows/beam_PostCommit_XVR_Samza.yml @@ -16,17 +16,17 @@ name: PostCommit XVR Samza on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 5/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,11 +52,12 @@ jobs: beam_PostCommit_XVR_Samza: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run XVR_Samza PostCommit' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 - name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{ matrix.python_version }}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) strategy: matrix: job_name: ["beam_PostCommit_XVR_Samza"] @@ -70,8 +71,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: python-version: | 3.8 @@ -96,9 +97,16 @@ jobs: arguments: | -PpythonVersion=${{ matrix.python_version }} \ -PskipNonPythonTask=false \ - - name: Archive code coverage results + - name: Archive JUnit Test Results uses: actions/upload-artifact@v3 + if: failure() + with: + name: JUnit Test Results + path: "**/build/reports/tests/" + - name: Publish JUnit Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 if: always() with: - name: archiveJunit - path: "**/build/test-results/**/*.xml" \ No newline at end of file + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/build/test-results/**/*.xml' \ No newline at end of file diff 
--git a/.github/workflows/beam_PostCommit_XVR_Spark3.yml b/.github/workflows/beam_PostCommit_XVR_Spark3.yml index 193526ffe7f34..0742196a69608 100644 --- a/.github/workflows/beam_PostCommit_XVR_Spark3.yml +++ b/.github/workflows/beam_PostCommit_XVR_Spark3.yml @@ -16,17 +16,17 @@ name: PostCommit XVR Spark3 on: - issue_comment: - types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 5/6 * * *' + pull_request_target: + paths: ['release/trigger_all_tests.json'] workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none @@ -40,7 +40,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -52,11 +52,12 @@ jobs: beam_PostCommit_XVR_Spark3: if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run XVR_Spark3 PostCommit' runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 100 - name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{ matrix.python_version }}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) strategy: matrix: job_name: ["beam_PostCommit_XVR_Spark3"] @@ -70,8 +71,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: python-version: | 3.8 @@ -96,9 +97,16 @@ jobs: arguments: | -PpythonVersion=${{ matrix.python_version }} \ -PskipNonPythonTask=false \ - - name: Archive code coverage results + - name: Archive JUnit Test Results uses: actions/upload-artifact@v3 + if: failure() + with: + name: JUnit Test Results + path: "**/build/reports/tests/" + - name: Publish JUnit Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 if: always() with: - name: archiveJunit - path: "**/build/test-results/**/*.xml" \ No newline at end of file + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/build/test-results/**/*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PostRelease_NightlySnapshot.yml b/.github/workflows/beam_PostRelease_NightlySnapshot.yml new file mode 100644 index 0000000000000..73b3d46f5b923 --- /dev/null +++ b/.github/workflows/beam_PostRelease_NightlySnapshot.yml @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: PostRelease Nightly Snapshot + +on: + workflow_dispatch: + inputs: + RELEASE: + description: Beam version of current release (e.g. 2.XX.0) + required: true + default: '2.XX.0' + SNAPSHOT_URL: + description: Location of the staged artifacts in Maven central (https://repository.apache.org/content/repositories/orgapachebeam-NNNN/). + required: true + schedule: + - cron: '15 16 * * *' + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +jobs: + beam_PostRelease_NightlySnapshot: + name: beam_PostRelease_NightlySnapshot + runs-on: [self-hosted, ubuntu-20.04, main] + steps: + - uses: actions/checkout@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + java-version: 8 + - name: run PostRelease validation script + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :release:runJavaExamplesValidationTask + arguments: | + -Pver='${{ github.event.inputs.RELEASE }}' \ + -Prepourl='${{ github.event.inputs.SNAPSHOT_URL }}' \ diff --git a/.github/workflows/beam_PreCommit_CommunityMetrics.yml b/.github/workflows/beam_PreCommit_CommunityMetrics.yml index f044b154c0ab2..570dd5dc3354f 100644 --- a/.github/workflows/beam_PreCommit_CommunityMetrics.yml +++ b/.github/workflows/beam_PreCommit_CommunityMetrics.yml @@ -19,14 +19,14 @@ on: push: tags: ['v*'] branches: ['master', 'release-*'] - paths: ['.test-infra/metrics/**', '.github/workflows/beam_PreCommit_CommunityMetrics.yml'] + paths: ['.test-infra/metrics/**', 'buildSrc/build.gradle.kts', '.github/workflows/beam_PreCommit_CommunityMetrics.yml'] pull_request_target: branches: ['master', 'release-*'] - paths: ['.test-infra/metrics/**'] + paths: ['.test-infra/metrics/**', 'buildSrc/build.gradle.kts', 'release/trigger_all_tests.json'] issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -47,7 +47,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ 
github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -57,7 +57,7 @@ env: jobs: beam_PreCommit_CommunityMetrics: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) timeout-minutes: 120 runs-on: [self-hosted, ubuntu-20.04, main] strategy: @@ -67,7 +67,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run CommunityMetrics PreCommit' steps: @@ -75,7 +75,7 @@ jobs: - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{matrix.job_phrase}} + comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment diff --git a/.github/workflows/beam_PreCommit_GHA.yml b/.github/workflows/beam_PreCommit_GHA.yml new file mode 100644 index 0000000000000..94811bc0f507f --- /dev/null +++ b/.github/workflows/beam_PreCommit_GHA.yml @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
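# (The concurrency stanza used throughout these workflows pairs one stable
# identifier for the triggering context with a second discriminator for the
# schedule slot, comment, or sender, so a newer run of the same workflow for
# the same context cancels the older one. A minimal sketch of that shape,
# with the group key shortened for illustration:
#
#   concurrency:
#     group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
#     cancel-in-progress: true
#
# `cancel-in-progress: true` is what allows a subsequently queued run to
# interrupt a previous one.)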
+ +name: PreCommit GHA + +on: + push: + tags: ['v*'] + branches: ['master', 'release-*'] + paths: ['.github/**/*.yml'] + pull_request_target: + branches: ['master', 'release-*' ] + paths: ['.github/**/*.yml', 'release/trigger_all_tests.json'] + issue_comment: + types: [created] + schedule: + - cron: '0 */6 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +jobs: + beam_PreCommit_GHA: + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + runs-on: [self-hosted, ubuntu-20.04, main] + strategy: + matrix: + job_name: [beam_PreCommit_GHA] + job_phrase: [Run GHA PreCommit] + timeout-minutes: 30 + if: | + github.event_name == 'push' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event_name == 'workflow_dispatch' || + github.event.comment.body == 'Run GHA PreCommit' + steps: + - uses: actions/checkout@v4 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + java-version: 8 + go-version: 1.21 + - name: run GHA PreCommit script + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :beam-test-gha:preCommit \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Go.yml b/.github/workflows/beam_PreCommit_Go.yml index b7b561d54c1fd..a59a2ed5e34d6 100644 --- a/.github/workflows/beam_PreCommit_Go.yml +++ b/.github/workflows/beam_PreCommit_Go.yml @@ -22,11 +22,11 @@ on: paths: ['model/**', 'sdks/go.**', 'release/**', '.github/workflows/beam_PreCommit_Go.yml'] pull_request_target: branches: ['master', 'release-*' ] - paths: ['model/**', 'sdks/go.**', 'release/**'] + paths: ['model/**', 'sdks/go.**', 'release/**', 'release/trigger_all_tests.json'] issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -47,7 +47,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ 
github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -67,7 +67,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Go PreCommit' steps: diff --git a/.github/workflows/beam_PreCommit_GoPortable.yml b/.github/workflows/beam_PreCommit_GoPortable.yml index 1c40a3c8d129c..231828ee6208c 100644 --- a/.github/workflows/beam_PreCommit_GoPortable.yml +++ b/.github/workflows/beam_PreCommit_GoPortable.yml @@ -22,16 +22,16 @@ on: paths: ['model/**', 'sdks/go.**', 'release/**','.github/workflows/beam_PreCommit_GoPortable.yml'] pull_request_target: branches: ['master', 'release-*'] - paths: ['model/**', 'sdks/go.**', 'release/**'] + paths: ['model/**', 'sdks/go.**', 'release/**', 'release/trigger_all_tests.json'] issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 1/6 * * *' workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -57,7 +57,7 @@ permissions: jobs: beam_PreCommit_GoPortable: - name: ${{matrix.job_name}} (${{ matrix.job_phrase }}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] strategy: matrix: @@ -67,7 +67,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run GoPortable PreCommit' steps: diff --git a/.github/workflows/beam_PreCommit_GoPrism.yml b/.github/workflows/beam_PreCommit_GoPrism.yml index 0e60579ec64d4..00da770600b0c 100644 --- a/.github/workflows/beam_PreCommit_GoPrism.yml +++ b/.github/workflows/beam_PreCommit_GoPrism.yml @@ -22,16 +22,16 @@ on: paths: ['model/**', 'sdks/go.**', 'release/**','.github/workflows/beam_PreCommit_GoPrism.yml'] pull_request_target: branches: ['master', 'release-*'] - paths: ['model/**', 'sdks/go.**', 'release/**'] + paths: ['model/**', 'sdks/go.**', 'release/**', 'release/trigger_all_tests.json'] issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 1/6 * * *' workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ 
github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -57,7 +57,7 @@ permissions: jobs: beam_PreCommit_GoPrism: - name: ${{matrix.job_name}} (${{ matrix.job_phrase }}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] strategy: matrix: @@ -67,7 +67,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run GoPrism PreCommit' steps: diff --git a/.github/workflows/beam_PreCommit_ItFramework.yml b/.github/workflows/beam_PreCommit_ItFramework.yml index 6161f14225663..cf4cd29801760 100644 --- a/.github/workflows/beam_PreCommit_ItFramework.yml +++ b/.github/workflows/beam_PreCommit_ItFramework.yml @@ -28,15 +28,16 @@ on: branches: ['master', 'release-*'] paths: - 'it/**' + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '10 */6 * * *' + - cron: '0 1/6 * * *' workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -61,7 +62,7 @@ permissions: statuses: read jobs: beam_PreCommit_ItFramework: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] strategy: matrix: @@ -70,7 +71,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run It_Framework PreCommit' steps: @@ -78,7 +79,7 @@ jobs: - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{matrix.job_phrase}} + comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment diff --git a/.github/workflows/beam_PreCommit_Java.yml b/.github/workflows/beam_PreCommit_Java.yml index 9a58f42fef78b..fe694b4eab702 100644 --- a/.github/workflows/beam_PreCommit_Java.yml +++ b/.github/workflows/beam_PreCommit_Java.yml @@ -40,6 +40,7 @@ on: - '!sdks/java/io/elasticsearch/**' - '!sdks/java/io/elasticsearch-tests/**' - '!sdks/java/io/file-schema-transform/**' + - '!sdks/java/io/google-ads/**' - '!sdks/java/io/google-cloud-platform/**' - '!sdks/java/io/hadoop-common/**' - '!sdks/java/io/hadoop-file-system/**' @@ -59,6 +60,7 @@ on: - '!sdks/java/io/pulsar/**' - '!sdks/java/io/rabbitmq/**' - '!sdks/java/io/redis/**' + - '!sdks/java/io/rrio/**' - '!sdks/java/io/singlestore/**' - '!sdks/java/io/snowflake/**' - '!sdks/java/io/solr/**' @@ -75,6 +77,7 @@ on: - 'examples/java/**' - 'examples/kotlin/**' - 'release/**' + - 'release/trigger_all_tests.json' - 
'!sdks/java/extensions/avro/**' - '!sdks/java/extensions/sql/**' - '!sdks/java/io/amazon-web-services/**' @@ -89,6 +92,7 @@ on: - '!sdks/java/io/elasticsearch/**' - '!sdks/java/io/elasticsearch-tests/**' - '!sdks/java/io/file-schema-transform/**' + - '!sdks/java/io/google-ads/**' - '!sdks/java/io/google-cloud-platform/**' - '!sdks/java/io/hadoop-common/**' - '!sdks/java/io/hadoop-file-system/**' @@ -108,6 +112,7 @@ on: - '!sdks/java/io/pulsar/**' - '!sdks/java/io/rabbitmq/**' - '!sdks/java/io/redis/**' + - '!sdks/java/io/rrio/**' - '!sdks/java/io/singlestore/**' - '!sdks/java/io/snowflake/**' - '!sdks/java/io/solr/**' @@ -117,12 +122,12 @@ on: issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 2/6 * * *' workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -148,7 +153,7 @@ env: jobs: beam_PreCommit_Java: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) timeout-minutes: 180 runs-on: [self-hosted, ubuntu-20.04, main] strategy: @@ -159,7 +164,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java PreCommit' steps: @@ -170,6 +175,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Java PreCommit script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -192,6 +199,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Amazon-Web-Services2_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Amazon-Web-Services2_IO_Direct.yml index 2d27ecc38be06..16d6562f827f4 100644 --- a/.github/workflows/beam_PreCommit_Java_Amazon-Web-Services2_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Amazon-Web-Services2_IO_Direct.yml @@ -37,6 +37,7 @@ on: - "sdks/java/io/amazon-web-services2/**" - "sdks/java/io/common/**" - "sdks/java/core/src/main/**" + - 'release/trigger_all_tests.json' - "build.gradle" - "buildSrc/**" - "gradle/**" @@ -47,7 +48,7 @@ on: issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 1/6 * * *' 
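+      # '0 1/6 * * *' fires at 01:00, 07:00, 13:00 and 19:00 UTC. The precommit
+      # crons are staggered across offsets (e.g. '15 1/6', '30 1/6', '30 2/6')
+      # rather than all firing at '0 */6', presumably to spread scheduled load.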
workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -68,7 +69,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -87,7 +88,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Amazon-Web-Services2_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -99,6 +100,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Amazon-Web-Services2 IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -128,6 +131,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Amazon-Web-Services_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Amazon-Web-Services_IO_Direct.yml index c812aac4d447f..f7c1d66e2f1e7 100644 --- a/.github/workflows/beam_PreCommit_Java_Amazon-Web-Services_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Amazon-Web-Services_IO_Direct.yml @@ -37,6 +37,7 @@ on: - "sdks/java/io/amazon-web-services/**" - "sdks/java/io/common/**" - "sdks/java/core/src/main/**" + - 'release/trigger_all_tests.json' - "build.gradle" - "buildSrc/**" - "gradle/**" @@ -47,7 +48,7 @@ on: issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -68,7 +69,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -87,7 +88,7 @@ jobs: if: | github.event_name == 'push' 
|| github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Amazon-Web-Services_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -99,6 +100,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Amazon-Web-Services IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -128,6 +131,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Amqp_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Amqp_IO_Direct.yml index 8898cf36f1880..8df097d8428f9 100644 --- a/.github/workflows/beam_PreCommit_Java_Amqp_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Amqp_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/amqp/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Amqp_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Amqp IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -103,6 +106,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: 
jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Azure_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Azure_IO_Direct.yml index 84ffe4dab834d..d32361708491b 100644 --- a/.github/workflows/beam_PreCommit_Java_Azure_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Azure_IO_Direct.yml @@ -37,6 +37,7 @@ on: - "sdks/java/io/azure/**" - "sdks/java/io/common/**" - "sdks/java/core/src/main/**" + - 'release/trigger_all_tests.json' - "build.gradle" - "buildSrc/**" - "gradle/**" @@ -47,7 +48,7 @@ on: issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -68,7 +69,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -87,7 +88,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Azure_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -99,6 +100,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Azure IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -121,6 +124,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Cassandra_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Cassandra_IO_Direct.yml index 35f0e061f0918..04e1b10c23bd7 100644 --- a/.github/workflows/beam_PreCommit_Java_Cassandra_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Cassandra_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/cassandra/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to 
interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Cassandra_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Cassandra IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -103,6 +106,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Cdap_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Cdap_IO_Direct.yml index b23490afe8f79..f23aad3ea4552 100644 --- a/.github/workflows/beam_PreCommit_Java_Cdap_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Cdap_IO_Direct.yml @@ -30,10 +30,11 @@ on: - "sdks/java/io/cdap/**" - "sdks/java/io/hadoop-common/**" - "sdks/java/io/hadoop-format/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -54,7 +55,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -73,7 +74,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Cdap_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -85,6 +86,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: 
${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Cdap IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -107,6 +110,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Clickhouse_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Clickhouse_IO_Direct.yml index 0dd75cb0a6d20..f7c5ea1b86151 100644 --- a/.github/workflows/beam_PreCommit_Java_Clickhouse_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Clickhouse_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/clickhouse/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Clickhouse_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Clickhouse IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -103,6 +106,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Csv_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Csv_IO_Direct.yml index 17e1876d20f37..496e0c15ea667 100644 --- a/.github/workflows/beam_PreCommit_Java_Csv_IO_Direct.yml 
+++ b/.github/workflows/beam_PreCommit_Java_Csv_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/csv/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Csv_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Csv IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -103,6 +106,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Debezium_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Debezium_IO_Direct.yml index 6f10c210b010f..97264ac146a58 100644 --- a/.github/workflows/beam_PreCommit_Java_Debezium_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Debezium_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/debezium/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || 
github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Debezium_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Debezium IO build task uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -112,6 +115,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_ElasticSearch_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_ElasticSearch_IO_Direct.yml index 6775708c2b8f0..f51e50aa82005 100644 --- a/.github/workflows/beam_PreCommit_Java_ElasticSearch_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_ElasticSearch_IO_Direct.yml @@ -28,10 +28,11 @@ on: paths: - "sdks/java/io/elasticsearch/**" - "sdks/java/io/elasticsearch-tests/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -52,7 +53,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -71,7 +72,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_ElasticSearch_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -83,6 +84,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run ElasticSearch IO build task uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -116,6 +119,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: 
"**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Examples_Dataflow.yml b/.github/workflows/beam_PreCommit_Java_Examples_Dataflow.yml index 29a637baff3a1..b96e459470cfa 100644 --- a/.github/workflows/beam_PreCommit_Java_Examples_Dataflow.yml +++ b/.github/workflows/beam_PreCommit_Java_Examples_Dataflow.yml @@ -38,15 +38,16 @@ on: - 'examples/java/**' - 'examples/kotlin/**' - 'release/**' + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 1/6 * * *' workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -71,7 +72,7 @@ permissions: statuses: read jobs: beam_PreCommit_Java_Examples_Dataflow: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) timeout-minutes: 60 runs-on: [self-hosted, ubuntu-20.04, main] strategy: @@ -81,7 +82,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Examples_Dataflow PreCommit' steps: @@ -89,7 +90,7 @@ jobs: - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{matrix.job_phrase}} + comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment diff --git a/.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java17.yml b/.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java17.yml deleted file mode 100644 index cf8cefe388f31..0000000000000 --- a/.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java17.yml +++ /dev/null @@ -1,139 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -name: PreCommit Java Examples Dataflow Java17 - -on: - push: - tags: ['v*'] - branches: ['master', 'release-*'] - paths: - - 'model/**' - - 'sdks/java/**' - - 'runners/google-cloud-dataflow-java/**' - - 'examples/java/**' - - 'examples/kotlin/**' - - 'release/**' - - '.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java17.yml' - pull_request_target: - branches: ['master', 'release-*'] - paths: - - 'model/**' - - 'sdks/java/**' - - 'runners/google-cloud-dataflow-java/**' - - 'examples/java/**' - - 'examples/kotlin/**' - - 'release/**' - issue_comment: - types: [created] - schedule: - - cron: '0 */6 * * *' - workflow_dispatch: - -# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event -permissions: - actions: write - pull-requests: write - checks: write - contents: read - deployments: read - id-token: none - issues: write - discussions: read - packages: read - pages: read - repository-projects: read - security-events: read - statuses: read - -# This allows a subsequently queued workflow run to interrupt previous runs -concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' - cancel-in-progress: true - -env: - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} - GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} - GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - -jobs: - beam_PreCommit_Java_Examples_Dataflow_Java17: - name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - strategy: - matrix: - job_name: ["beam_PreCommit_Java_Examples_Dataflow_Java17"] - job_phrase: ["Run Java_Examples_Dataflow_Java17 PreCommit"] - timeout-minutes: 60 - if: | - github.event_name == 'push' || - github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || - github.event_name == 'workflow_dispatch' || - github.event.comment.body == 'Run Java_Examples_Dataflow_Java17 PreCommit' - runs-on: [self-hosted, ubuntu-20.04, main] - steps: - - uses: actions/checkout@v4 - - name: Setup repository - uses: ./.github/actions/setup-action - with: - comment_phrase: ${{ matrix.job_phrase }} - github_token: ${{ secrets.GITHUB_TOKEN }} - github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - # The test requires Java 17 and Java 8 versions. - # Java 8 is installed second because JAVA_HOME needs to point to Java 8. 
- - name: Set up Java 17 and 8 - uses: actions/setup-java@v3.11.0 - with: - distribution: 'temurin' - java-version: | - 17 - 8 - - name: Clean - uses: ./.github/actions/gradle-command-self-hosted-action - with: - gradle-command: :clean - arguments: | - -PdisableSpotlessCheck=true \ - -PdisableCheckStyle=true \ - -PskipCheckerFramework \ - - name: Build and Test - uses: ./.github/actions/gradle-command-self-hosted-action - with: - gradle-command: :runners:google-cloud-dataflow-java:examples:preCommit - arguments: | - -PdisableSpotlessCheck=true \ - -PdisableCheckStyle=true \ - -PcompileAndRunTestsWithJava17 \ - -PskipCheckerFramework \ - -Pjava17Home=$JAVA_HOME_17_X64 \ - max-workers: 12 - - name: Archive JUnit Test Results - uses: actions/upload-artifact@v3 - if: failure() - with: - name: JUnit Test Results - path: "**/build/reports/tests/" - - name: Publish JUnit Test Results - uses: EnricoMi/publish-unit-test-result-action@v2 - if: always() - with: - commit: '${{ env.prsha || env.GITHUB_SHA }}' - comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} - files: '**/build/test-results/**/*.xml' - - name: Archive SpotBugs Results - uses: actions/upload-artifact@v3 - with: - name: SpotBugs Results - path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java21.yml b/.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java21.yml new file mode 100644 index 0000000000000..f482e22ec4a24 --- /dev/null +++ b/.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java21.yml @@ -0,0 +1,139 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
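+# Replacement for the Java 17 variant removed above: the same Dataflow examples
+# precommit, now compiled and run with Java 21 (Java 8 still provides the base
+# JAVA_HOME, as noted in the steps below).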
+ +name: PreCommit Java Examples Dataflow Java21 + +on: + push: + tags: ['v*'] + branches: ['master', 'release-*'] + paths: + - 'model/**' + - 'sdks/java/**' + - 'runners/google-cloud-dataflow-java/**' + - 'examples/java/**' + - 'examples/kotlin/**' + - 'release/**' + - '.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java21.yml' + pull_request_target: + branches: ['master', 'release-*'] + paths: + - 'model/**' + - 'sdks/java/**' + - 'runners/google-cloud-dataflow-java/**' + - 'examples/java/**' + - 'examples/kotlin/**' + - 'release/**' + - 'release/trigger_all_tests.json' + issue_comment: + types: [created] + schedule: + - cron: '30 1/6 * * *' + workflow_dispatch: + +# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: write + checks: write + contents: read + deployments: read + id-token: none + issues: write + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +jobs: + beam_PreCommit_Java_Examples_Dataflow_Java21: + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_PreCommit_Java_Examples_Dataflow_Java21"] + job_phrase: ["Run Java_Examples_Dataflow_Java21 PreCommit"] + timeout-minutes: 60 + if: | + github.event_name == 'push' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event_name == 'workflow_dispatch' || + github.event.comment.body == 'Run Java_Examples_Dataflow_Java21 PreCommit' + runs-on: [self-hosted, ubuntu-20.04, main] + steps: + - uses: actions/checkout@v4 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + # The test requires Java 21 and Java 8 versions. + # Java 8 is installed second because JAVA_HOME needs to point to Java 8. 
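+      # The versions are installed in the order listed, so the 8 listed last ends
+      # up as the default JAVA_HOME, while 21 remains reachable through
+      # $JAVA_HOME_21_X64 (consumed by -Pjava21Home in the Build and Test step).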
+ - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + java-version: | + 21 + 8 + - name: Clean + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :clean + arguments: | + -PdisableSpotlessCheck=true \ + -PdisableCheckStyle=true \ + -PskipCheckerFramework \ + - name: Build and Test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :runners:google-cloud-dataflow-java:examples:preCommit + arguments: | + -PdisableSpotlessCheck=true \ + -PdisableCheckStyle=true \ + -PtestJavaVersion=21 \ + -PskipCheckerFramework \ + -Pjava21Home=$JAVA_HOME_21_X64 \ + max-workers: 12 + - name: Archive JUnit Test Results + uses: actions/upload-artifact@v3 + if: failure() + with: + name: JUnit Test Results + path: "**/build/reports/tests/" + - name: Publish JUnit Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/build/test-results/**/*.xml' + - name: Archive SpotBugs Results + uses: actions/upload-artifact@v3 + with: + name: SpotBugs Results + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_File-schema-transform_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_File-schema-transform_IO_Direct.yml index 1bfd58d6a0d62..8ce77fef64165 100644 --- a/.github/workflows/beam_PreCommit_Java_File-schema-transform_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_File-schema-transform_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/file-schema-transform/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_File-schema-transform_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run File-schema-transform IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -104,6 +107,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: 
actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Flink_Versions.yml b/.github/workflows/beam_PreCommit_Java_Flink_Versions.yml index cefbf897de854..d5c244b2dbc9d 100644 --- a/.github/workflows/beam_PreCommit_Java_Flink_Versions.yml +++ b/.github/workflows/beam_PreCommit_Java_Flink_Versions.yml @@ -30,10 +30,11 @@ on: - 'model/**' - 'runners/flink/**' - 'release/**' + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '20 */6 * * *' + - cron: '30 1/6 * * *' workflow_dispatch: permissions: @@ -53,7 +54,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -70,7 +71,7 @@ jobs: github.event_name == 'push' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event.comment.body == 'Run Java_Flink_Versions PreCommit' steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/beam_PreCommit_Java_GCP_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_GCP_IO_Direct.yml index 7c821a0247421..73dd9d63d872e 100644 --- a/.github/workflows/beam_PreCommit_Java_GCP_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_GCP_IO_Direct.yml @@ -36,6 +36,7 @@ on: paths: - "runners/core-construction-java/**" - "runners/core-java/**" + - 'release/trigger_all_tests.json' - "sdks/java/core/src/main/**" - "sdks/java/extensions/arrow/**" - "sdks/java/extensions/google-cloud-platform-core/**" @@ -47,7 +48,7 @@ on: issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -68,7 +69,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -87,10 +88,10 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && 
github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_GCP_IO_Direct PreCommit' - runs-on: [self-hosted, ubuntu-20.04, highmem] + runs-on: [self-hosted, ubuntu-20.04, main] steps: - uses: actions/checkout@v4 - name: Setup repository @@ -99,6 +100,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run PreCommit Java GCP IO Direct script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -125,6 +128,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Google-ads_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Google-ads_IO_Direct.yml new file mode 100644 index 0000000000000..7e0ca73e7efd4 --- /dev/null +++ b/.github/workflows/beam_PreCommit_Java_Google-ads_IO_Direct.yml @@ -0,0 +1,116 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
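+# PreCommit job for the Java Google Ads IO against the Direct runner, following the same per-IO template as the sibling workflows in this directory: +# it runs on pushes and pull requests touching sdks/java/io/google-ads, on the comment phrase 'Run Java_Google-ads_IO_Direct PreCommit', +# on a six-hourly cron, and on manual dispatch, then archives and publishes the JUnit and SpotBugs results.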
+ +name: PreCommit Java Google-ads IO Direct + +on: + push: + tags: ['v*'] + branches: ['master', 'release-*'] + paths: + - "sdks/java/io/google-ads/**" + - ".github/workflows/beam_PreCommit_Java_Google-ads_IO_Direct.yml" + pull_request_target: + branches: ['master', 'release-*'] + paths: + - "sdks/java/io/google-ads/**" + - 'release/trigger_all_tests.json' + issue_comment: + types: [created] + schedule: + - cron: '0 */6 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: write + checks: write + contents: read + deployments: read + id-token: none + issues: write + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +jobs: + beam_PreCommit_Java_Google-ads_IO_Direct: + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_PreCommit_Java_Google-ads_IO_Direct"] + job_phrase: ["Run Java_Google-ads_IO_Direct PreCommit"] + timeout-minutes: 60 + if: | + github.event_name == 'push' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event_name == 'workflow_dispatch' || + github.event.comment.body == 'Run Java_Google-ads_IO_Direct PreCommit' + runs-on: [self-hosted, ubuntu-20.04, main] + steps: + - uses: actions/checkout@v4 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: run Google-ads IO build script + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:io:google-ads:build + arguments: | + -PdisableSpotlessCheck=true \ + -PdisableCheckStyle=true \ + - name: Archive JUnit Test Results + uses: actions/upload-artifact@v3 + if: failure() + with: + name: JUnit Test Results + path: "**/build/reports/tests/" + - name: Publish JUnit Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/build/test-results/**/*.xml' + - name: Archive SpotBugs Results + uses: actions/upload-artifact@v3 + if: always() + with: + name: SpotBugs Results + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_HBase_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_HBase_IO_Direct.yml index 23d0afb933ded..805561f3475df 100644 --- 
a/.github/workflows/beam_PreCommit_Java_HBase_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_HBase_IO_Direct.yml @@ -28,10 +28,11 @@ on: paths: - "sdks/java/io/hbase/**" - "sdks/java/io/hadoop-common/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -52,7 +53,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -71,7 +72,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_HBase_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -83,6 +84,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run HBase IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -105,6 +108,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_HCatalog_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_HCatalog_IO_Direct.yml index 777725d7c3ec3..bb9e5ee31f273 100644 --- a/.github/workflows/beam_PreCommit_Java_HCatalog_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_HCatalog_IO_Direct.yml @@ -28,10 +28,11 @@ on: paths: - "sdks/java/io/hcatalog/**" - "sdks/java/io/hadoop-common/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -52,7 +53,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref 
}}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -71,7 +72,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_HCatalog_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -83,6 +84,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run HCatalog IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -105,6 +108,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Hadoop_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Hadoop_IO_Direct.yml index 2b93fa4c524be..ea09a8df70d6a 100644 --- a/.github/workflows/beam_PreCommit_Java_Hadoop_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Hadoop_IO_Direct.yml @@ -52,10 +52,11 @@ on: - "sdks/java/testing/test-utils/**" - "sdks/java/io/hadoop-common/**" - "sdks/java/io/hadoop-format/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -76,7 +77,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -95,7 +96,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Hadoop_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -107,6 +108,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Hadoop IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -143,6 +146,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: 
always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_IOs_Direct.yml b/.github/workflows/beam_PreCommit_Java_IOs_Direct.yml index 2825719a24ef6..4a221a9110090 100644 --- a/.github/workflows/beam_PreCommit_Java_IOs_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_IOs_Direct.yml @@ -28,6 +28,7 @@ on: paths: - "sdks/java/io/common/**" - "sdks/java/core/src/main/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] workflow_dispatch: @@ -51,7 +52,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Java IOs PreCommit script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -104,6 +107,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_InfluxDb_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_InfluxDb_IO_Direct.yml index 6d6bd34deef21..52a03ceea2048 100644 --- a/.github/workflows/beam_PreCommit_Java_InfluxDb_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_InfluxDb_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/influxdb/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || 
github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_InfluxDb_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run InfluxDb IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -103,6 +106,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_JDBC_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_JDBC_IO_Direct.yml index 2d36c05d2881f..f328dd513c889 100644 --- a/.github/workflows/beam_PreCommit_Java_JDBC_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_JDBC_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/jdbc/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_JDBC_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run JDBC IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -110,6 +113,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + 
path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Jms_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Jms_IO_Direct.yml index fa8d2721d38e3..9abd63bb79c03 100644 --- a/.github/workflows/beam_PreCommit_Java_Jms_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Jms_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/jms/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Jms_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Jms IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -110,6 +113,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Kafka_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Kafka_IO_Direct.yml index 629190e328e56..c73c384c78818 100644 --- a/.github/workflows/beam_PreCommit_Java_Kafka_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Kafka_IO_Direct.yml @@ -34,10 +34,11 @@ on: - "sdks/java/expansion-service/**" - "sdks/java/io/synthetic/**" - "sdks/java/io/expansion-service/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '45 1/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -58,7 +59,7 @@ permissions: # This allows a subsequently queued workflow run to 
interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -77,7 +78,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Kafka_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -89,6 +90,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Kafka IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -112,6 +115,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Kinesis_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Kinesis_IO_Direct.yml index 47411e0df379a..2aa4d4c10fd8e 100644 --- a/.github/workflows/beam_PreCommit_Java_Kinesis_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Kinesis_IO_Direct.yml @@ -44,10 +44,11 @@ on: - "gradlew" - "gradle.bat" - "settings.gradle.kts" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -68,7 +69,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -87,7 +88,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Kinesis_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -99,6 +100,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} 
github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Kinesis IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -135,6 +138,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Kudu_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Kudu_IO_Direct.yml index 40fdb06d3556a..25f90072db5c5 100644 --- a/.github/workflows/beam_PreCommit_Java_Kudu_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Kudu_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/kudu/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Kudu_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Kudu IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -103,6 +106,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_MongoDb_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_MongoDb_IO_Direct.yml index f210affbb21f9..34fa18a23c717 100644 --- a/.github/workflows/beam_PreCommit_Java_MongoDb_IO_Direct.yml +++ 
b/.github/workflows/beam_PreCommit_Java_MongoDb_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/mongodb/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_MongoDb_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run MongoDb IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -103,6 +106,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Mqtt_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Mqtt_IO_Direct.yml index 6f30a3dd84ba5..0b26f28e8f697 100644 --- a/.github/workflows/beam_PreCommit_Java_Mqtt_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Mqtt_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/mqtt/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || 
github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Mqtt_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Mqtt IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -103,6 +106,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Neo4j_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Neo4j_IO_Direct.yml index 8e8041d434c7a..ed6d6baf456b0 100644 --- a/.github/workflows/beam_PreCommit_Java_Neo4j_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Neo4j_IO_Direct.yml @@ -28,10 +28,11 @@ on: paths: - "sdks/java/io/neo4j/**" - "sdks/java/testing/test-utils/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -52,7 +53,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -71,7 +72,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Neo4j_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -83,6 +84,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Neo4j IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -112,6 +115,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + 
path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_PVR_Flink_Batch.yml b/.github/workflows/beam_PreCommit_Java_PVR_Flink_Batch.yml index b592d704661d1..0ff7785650def 100644 --- a/.github/workflows/beam_PreCommit_Java_PVR_Flink_Batch.yml +++ b/.github/workflows/beam_PreCommit_Java_PVR_Flink_Batch.yml @@ -32,15 +32,16 @@ on: - 'runners/flink/**' - 'runners/java-fn-execution/**' - 'sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/**' + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 2/6 * * *' workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -76,7 +77,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_PVR_Flink_Batch PreCommit' steps: @@ -87,6 +88,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run validatesPortableRunnerBatch script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -99,10 +102,9 @@ jobs: with: name: JUnit Test Results path: "**/build/reports/tests/" - - name: Publish JUnit Test Results - uses: EnricoMi/publish-unit-test-result-action@v2 - if: always() + - name: Upload test report + uses: actions/upload-artifact@v3 with: - commit: '${{ env.prsha || env.GITHUB_SHA }}' - comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} - files: '**/build/test-results/**/*.xml' \ No newline at end of file + name: java-code-coverage-report + path: "**/build/test-results/**/*.xml" +# TODO: Investigate 'Max retries exceeded' issue with EnricoMi/publish-unit-test-result-action@v2. 
\ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_PVR_Flink_Docker.yml b/.github/workflows/beam_PreCommit_Java_PVR_Flink_Docker.yml index 327f654365607..841c6dbdcbabf 100644 --- a/.github/workflows/beam_PreCommit_Java_PVR_Flink_Docker.yml +++ b/.github/workflows/beam_PreCommit_Java_PVR_Flink_Docker.yml @@ -36,10 +36,11 @@ on: - 'sdks/java/harness/**' - 'runners/flink/**' - 'runners/java-fn-execution/**' + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -60,7 +61,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -69,8 +70,8 @@ env: GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} jobs: - beam_PreCommit_Java: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + beam_PreCommit_Java_PVR_Flink_Docker: + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] strategy: fail-fast: false @@ -80,7 +81,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_PVR_Flink_Docker PreCommit' timeout-minutes: 240 @@ -92,6 +93,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run PreCommit Java PVR Flink Docker script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PreCommit_Java_Parquet_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Parquet_IO_Direct.yml index c00d0e43b619a..998a6bbbac046 100644 --- a/.github/workflows/beam_PreCommit_Java_Parquet_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Parquet_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/parquet/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || 
github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Parquet_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Parquet IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -103,6 +106,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Pulsar_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Pulsar_IO_Direct.yml index b0b73d9442751..00becbfd7ee66 100644 --- a/.github/workflows/beam_PreCommit_Java_Pulsar_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Pulsar_IO_Direct.yml @@ -37,6 +37,7 @@ on: - "sdks/java/io/pulsar/**" - "sdks/java/io/common/**" - "sdks/java/core/src/main/**" + - 'release/trigger_all_tests.json' - "build.gradle" - "buildSrc/**" - "gradle/**" @@ -47,7 +48,7 @@ on: issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -68,7 +69,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -87,7 +88,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Pulsar_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -99,6 +100,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Pulsar IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -121,6 
+124,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_RabbitMq_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_RabbitMq_IO_Direct.yml index a259c09beb749..7adbfe74bab58 100644 --- a/.github/workflows/beam_PreCommit_Java_RabbitMq_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_RabbitMq_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/rabbitmq/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_RabbitMq_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run RabbitMq IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -103,6 +106,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Redis_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Redis_IO_Direct.yml index cd2741ead81aa..255df740a57f7 100644 --- a/.github/workflows/beam_PreCommit_Java_Redis_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Redis_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/redis/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 2/6 * * *' 
workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Redis_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Redis IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -103,6 +106,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_RequestResponse_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_RequestResponse_IO_Direct.yml new file mode 100644 index 0000000000000..cfe721c1e9486 --- /dev/null +++ b/.github/workflows/beam_PreCommit_Java_RequestResponse_IO_Direct.yml @@ -0,0 +1,116 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
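+# PreCommit job for the Java RequestResponse IO (sdks/java/io/rrio) against the Direct runner, using the same per-IO template: +# push and pull request triggers on the rrio sources, the comment phrase 'Run Java_RequestResponse_IO_Direct PreCommit', +# a six-hourly cron, and manual dispatch; it runs :sdks:java:io:rrio:build and publishes the JUnit and SpotBugs results.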
+ +name: PreCommit Java RequestResponse IO Direct + +on: + push: + tags: ['v*'] + branches: ['master', 'release-*'] + paths: + - "sdks/java/io/rrio/**" + - ".github/workflows/beam_PreCommit_Java_RequestResponse_IO_Direct.yml" + pull_request_target: + branches: ['master', 'release-*'] + paths: + - "sdks/java/io/rrio/**" + - 'release/trigger_all_tests.json' + issue_comment: + types: [created] + schedule: + - cron: '0 */6 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: write + checks: write + contents: read + deployments: read + id-token: none + issues: write + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +jobs: + beam_PreCommit_Java_RequestResponse_IO_Direct: + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_PreCommit_Java_RequestResponse_IO_Direct"] + job_phrase: ["Run Java_RequestResponse_IO_Direct PreCommit"] + timeout-minutes: 60 + if: | + github.event_name == 'push' || + github.event_name == 'pull_request_target' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event_name == 'workflow_dispatch' || + github.event.comment.body == 'Run Java_RequestResponse_IO_Direct PreCommit' + runs-on: [self-hosted, ubuntu-20.04, main] + steps: + - uses: actions/checkout@v4 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: run RequestResponse IO build script + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:java:io:rrio:build + arguments: | + -PdisableSpotlessCheck=true \ + -PdisableCheckStyle=true \ + - name: Archive JUnit Test Results + uses: actions/upload-artifact@v3 + if: failure() + with: + name: JUnit Test Results + path: "**/build/reports/tests/" + - name: Publish JUnit Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/build/test-results/**/*.xml' + - name: Archive SpotBugs Results + uses: actions/upload-artifact@v3 + if: always() + with: + name: SpotBugs Results + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_SingleStore_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_SingleStore_IO_Direct.yml index 9c370e607368c..733921da7ad2b 100644 --- 
a/.github/workflows/beam_PreCommit_Java_SingleStore_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_SingleStore_IO_Direct.yml @@ -28,10 +28,11 @@ on: paths: - "sdks/java/io/singlestore/**" - "sdks/java/testing/test-utils/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -52,7 +53,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -71,7 +72,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_SingleStore_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -83,6 +84,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run SingleStore IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -105,6 +108,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Snowflake_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Snowflake_IO_Direct.yml index 3e27d5c5cfe89..c84f0026b7263 100644 --- a/.github/workflows/beam_PreCommit_Java_Snowflake_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Snowflake_IO_Direct.yml @@ -30,10 +30,11 @@ on: - "sdks/java/io/snowflake/**" - "sdks/java/extensions/google-cloud-platform-core/**" - "sdks/java/testing/test-utils/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -54,7 +55,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || 
github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -73,7 +74,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Snowflake_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -85,6 +86,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Snowflake IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -114,6 +117,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Solr_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Solr_IO_Direct.yml index c7d13e49f2a45..f3142b41dd4d1 100644 --- a/.github/workflows/beam_PreCommit_Java_Solr_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Solr_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/solr/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Solr_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Solr IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -103,6 +106,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: 
actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Spark3_Versions.yml b/.github/workflows/beam_PreCommit_Java_Spark3_Versions.yml index 9b2ada1da7766..9665e9770bc03 100644 --- a/.github/workflows/beam_PreCommit_Java_Spark3_Versions.yml +++ b/.github/workflows/beam_PreCommit_Java_Spark3_Versions.yml @@ -28,15 +28,16 @@ on: branches: ['master', 'release-*'] paths: - 'runners/spark/**' + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 2/6 * * *' workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -62,7 +63,7 @@ env: jobs: beam_PreCommit_Java_Spark3_Versions: - name: ${{matrix.job_name}} (${{ matrix.job_phrase }}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] strategy: matrix: @@ -72,7 +73,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Spark3_Versions PreCommit' steps: @@ -80,7 +81,7 @@ jobs: - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{matrix.job_phrase}} + comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Authenticate on GCP @@ -90,13 +91,10 @@ jobs: service_account_key: ${{ secrets.GCP_SA_KEY }} project_id: ${{ secrets.GCP_PROJECT_ID }} export_default_credentials: true - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' - cache: 'gradle' - check-latest: true + java-version: 8 - name: run sparkVersionsTest script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PreCommit_Java_Splunk_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Splunk_IO_Direct.yml index 7e453c3792810..7032cc62a62ae 100644 --- a/.github/workflows/beam_PreCommit_Java_Splunk_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Splunk_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/splunk/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 2/6 * 
* *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Splunk_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Splunk IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -103,6 +106,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Thrift_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Thrift_IO_Direct.yml index 71528a8843127..da26e226e105c 100644 --- a/.github/workflows/beam_PreCommit_Java_Thrift_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Thrift_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/thrift/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || 
github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Thrift_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Thrift IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -103,6 +106,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Java_Tika_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Tika_IO_Direct.yml index 7d26f12ee635d..20eff37521e2e 100644 --- a/.github/workflows/beam_PreCommit_Java_Tika_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Tika_IO_Direct.yml @@ -26,10 +26,11 @@ on: branches: ['master', 'release-*'] paths: - "sdks/java/io/tika/**" + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -50,7 +51,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -69,7 +70,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Java_Tika_IO_Direct PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -81,6 +82,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Tika IO build script uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -103,6 +106,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results - path: "**/build/reports/spotbugs/*.html" \ No newline at end of file + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git 
a/.github/workflows/beam_PreCommit_Kotlin_Examples.yml b/.github/workflows/beam_PreCommit_Kotlin_Examples.yml index fb6c0f7696aaa..13d533442575d 100644 --- a/.github/workflows/beam_PreCommit_Kotlin_Examples.yml +++ b/.github/workflows/beam_PreCommit_Kotlin_Examples.yml @@ -38,15 +38,16 @@ on: - 'runners/direct-java/**' - 'examples/kotlin/**' - 'release/**' + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 2/6 * * *' workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -72,7 +73,7 @@ env: jobs: beam_PreCommit_Kotlin_Examples: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) timeout-minutes: 120 runs-on: [self-hosted, ubuntu-20.04, main] strategy: @@ -82,7 +83,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Kotlin_Examples PreCommit' steps: @@ -90,16 +91,13 @@ jobs: - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{matrix.job_phrase}} + comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' - cache: 'gradle' - check-latest: true + java-version: 8 - name: run Kotlin Examples script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PreCommit_Portable_Python.yml b/.github/workflows/beam_PreCommit_Portable_Python.yml index 0134c497b836a..169bdb74a649e 100644 --- a/.github/workflows/beam_PreCommit_Portable_Python.yml +++ b/.github/workflows/beam_PreCommit_Portable_Python.yml @@ -42,10 +42,11 @@ on: - 'runners/reference/**' - 'sdks/python/**' - 'release/**' + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '30 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -66,7 +67,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label 
|| github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -76,7 +77,7 @@ env: jobs: beam_PreCommit_Portable_Python: - name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{matrix.python_version}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) timeout-minutes: 120 runs-on: ['self-hosted', ubuntu-20.04, main] strategy: @@ -88,28 +89,22 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || startsWith(github.event.comment.body, 'Run Portable_Python PreCommit') steps: - uses: actions/checkout@v4 - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{matrix.job_phrase}} ${{matrix.python_version}} + comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Java - uses: actions/setup-java@v3.8.0 - with: - distribution: 'zulu' - java-version: '8' - cache: 'gradle' - check-latest: true - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: | - ${{matrix.python_version}} + ${{ matrix.python_version }} 3.8 - name: Set PY_VER_CLEAN id: set_py_ver_clean diff --git a/.github/workflows/beam_PreCommit_Python.yml b/.github/workflows/beam_PreCommit_Python.yml index c891a79cefd07..670f41875ffbf 100644 --- a/.github/workflows/beam_PreCommit_Python.yml +++ b/.github/workflows/beam_PreCommit_Python.yml @@ -17,7 +17,7 @@ name: PreCommit Python on: pull_request_target: branches: [ "master", "release-*" ] - paths: [ "model/**","sdks/python/**","release/**"] + paths: [ "model/**","sdks/python/**","release/**", 'release/trigger_all_tests.json'] issue_comment: types: [created] push: @@ -25,18 +25,18 @@ on: branches: ['master', 'release-*'] paths: [ "model/**","sdks/python/**","release/**",".github/workflows/beam_PreCommit_Python.yml"] schedule: - - cron: '0 */6 * * *' + - cron: '0 3/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none - issues: read + issues: write discussions: read packages: read pages: read @@ -46,7 +46,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -56,7 +56,7 @@ env: jobs: beam_PreCommit_Python: - name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{matrix.python_version}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) 
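
Several hunks in this region also swap per-workflow `actions/setup-java`, `actions/setup-python`, and `actions/setup-go` steps for the shared `./.github/actions/setup-environment-action`, passing only the versions each workflow needs. The action body is not part of this diff; the sketch below is an assumed shape for such a composite action, reusing the input names the call sites pass (only `java-version` shown):

    # Assumed shape of setup-environment-action/action.yml; illustrative only.
    inputs:
      java-version:
        required: false
        default: ''
    runs:
      using: 'composite'
      steps:
        - name: Install Java
          if: ${{ inputs.java-version != '' }}
          uses: actions/setup-java@v3
          with:
            distribution: 'temurin'
            java-version: ${{ inputs.java-version }}
            cache: 'gradle'

Centralizing toolchain setup this way turns version bumps (such as the `go-version: 1.16` to `1.21` change further down) into one-line call-site edits instead of repeated step rewrites.
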
runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 180 strategy: @@ -68,7 +68,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || startsWith(github.event.comment.body, 'Run Python PreCommit') steps: @@ -97,8 +97,17 @@ jobs: arguments: | -Pposargs="--ignore=apache_beam/dataframe/ --ignore=apache_beam/examples/ --ignore=apache_beam/runners/ --ignore=apache_beam/transforms/" \ -PpythonVersion=${{ matrix.python_version }} \ - - name: Archive code coverage results + -PuseWheelDistribution + - name: Archive Python Test Results uses: actions/upload-artifact@v3 + if: failure() with: - name: python-code-coverage-report - path: "**/pytest*.xml" \ No newline at end of file + name: Python Test Results + path: '**/pytest*.xml' + - name: Publish Python Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/pytest*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_PythonDocker.yml b/.github/workflows/beam_PreCommit_PythonDocker.yml index aa119cf674c2d..26ecf97c9b52f 100644 --- a/.github/workflows/beam_PreCommit_PythonDocker.yml +++ b/.github/workflows/beam_PreCommit_PythonDocker.yml @@ -17,7 +17,7 @@ name: PreCommit Python Docker on: pull_request_target: branches: [ "master", "release-*" ] - paths: [ "model/**","sdks/python/**","release/**"] + paths: [ "model/**","sdks/python/**","release/**", 'release/trigger_all_tests.json'] issue_comment: types: [created] push: @@ -25,7 +25,7 @@ on: branches: ['master', 'release-*'] paths: [ "model/**","sdks/python/**","release/**",".github/workflows/beam_PreCommit_PythonDocker.yml"] schedule: - - cron: '0 */6 * * *' + - cron: '0 3/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -46,7 +46,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -67,15 +67,16 @@ jobs: python_version: ['3.8','3.9','3.10','3.11'] if: | github.event_name == 'push' || - github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + github.event_name == 'pull_request_target' || + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || startsWith(github.event.comment.body, 'Run PythonDocker PreCommit') steps: - uses: actions/checkout@v4 - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{matrix.job_phrase}} ${{ matrix.python_version }} + comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ 
matrix.job_phrase }} ${{ matrix.python_version }}) - name: Setup environment @@ -83,7 +84,7 @@ jobs: with: java-version: 8 python-version: ${{ matrix.python_version }} - go-version: 1.16 + go-version: 1.21 - name: Setup Buildx uses: docker/setup-buildx-action@v2 with: @@ -101,4 +102,4 @@ jobs: gradle-command: :sdks:python:container:py${{steps.set_py_ver_clean.outputs.py_ver_clean}}:docker arguments: | -Pposargs=apache_beam/dataframe/ \ - -PpythonVersion=${{ matrix.python_version }} \ No newline at end of file + -PpythonVersion=${{ matrix.python_version }} diff --git a/.github/workflows/beam_PreCommit_PythonDocs.yml b/.github/workflows/beam_PreCommit_PythonDocs.yml index 844d41f2cb520..2ada891473210 100644 --- a/.github/workflows/beam_PreCommit_PythonDocs.yml +++ b/.github/workflows/beam_PreCommit_PythonDocs.yml @@ -18,7 +18,7 @@ name: PreCommit Python Docs on: pull_request_target: branches: [ "master", "release-*" ] - paths: ["sdks/python/**"] + paths: ["sdks/python/**", 'release/trigger_all_tests.json'] issue_comment: types: [created] push: @@ -26,7 +26,7 @@ on: branches: ['master', 'release-*'] paths: ["sdks/python/**",".github/workflows/beam_PreCommit_PythonDocs.yml"] schedule: - - cron: '0 */6 * * *' + - cron: '0 3/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -47,7 +47,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -57,7 +57,7 @@ env: jobs: beam_PreCommit_PythonDocs: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] strategy: matrix: @@ -67,7 +67,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run PythonDocs PreCommit' steps: @@ -75,7 +75,7 @@ jobs: - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{matrix.job_phrase}} + comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment diff --git a/.github/workflows/beam_PreCommit_PythonFormatter.yml b/.github/workflows/beam_PreCommit_PythonFormatter.yml index 23093e1db006b..17d9cd855138e 100644 --- a/.github/workflows/beam_PreCommit_PythonFormatter.yml +++ b/.github/workflows/beam_PreCommit_PythonFormatter.yml @@ -17,7 +17,7 @@ name: PreCommit Python Formatter on: pull_request_target: branches: [ "master", "release-*" ] - paths: [ "sdks/python/apache_beam/**"] + paths: [ "sdks/python/apache_beam/**", 'release/trigger_all_tests.json'] issue_comment: types: [created] push: @@ -25,7 +25,7 @@ on: branches: ['master', 'release-*'] paths: [ 
"sdks/python/apache_beam/**",".github/workflows/beam_PreCommit_PythonFormatter.yml"] schedule: - - cron: '0 */6 * * *' + - cron: '0 3/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -46,7 +46,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -56,7 +56,7 @@ env: jobs: beam_PreCommit_PythonFormatter: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] strategy: matrix: @@ -66,7 +66,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run PythonFormatter PreCommit' steps: @@ -74,7 +74,7 @@ jobs: - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{matrix.job_phrase}} + comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment diff --git a/.github/workflows/beam_PreCommit_PythonLint.yml b/.github/workflows/beam_PreCommit_PythonLint.yml index 8a4558c8988d0..583c484e8e93a 100644 --- a/.github/workflows/beam_PreCommit_PythonLint.yml +++ b/.github/workflows/beam_PreCommit_PythonLint.yml @@ -17,7 +17,7 @@ name: PreCommit Python Lint on: pull_request_target: branches: [ "master", "release-*" ] - paths: ["sdks/python/**","release/**"] + paths: ["sdks/python/**","release/**", 'release/trigger_all_tests.json'] issue_comment: types: [created] push: @@ -25,7 +25,7 @@ on: branches: ['master', 'release-*'] paths: ["sdks/python/**","release/**",".github/workflows/beam_PreCommit_PythonLint.yml"] schedule: - - cron: '0 */6 * * *' + - cron: '0 3/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -46,7 +46,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -56,7 +56,7 @@ env: jobs: beam_PreCommit_PythonLint: - name: ${{matrix.job_name}} (${{ matrix.job_phrase }}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] strategy: matrix: @@ -66,7 
+66,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run PythonLint PreCommit' steps: diff --git a/.github/workflows/beam_PreCommit_Python_Coverage.yml b/.github/workflows/beam_PreCommit_Python_Coverage.yml index 33be9644d34ad..f0444013df598 100644 --- a/.github/workflows/beam_PreCommit_Python_Coverage.yml +++ b/.github/workflows/beam_PreCommit_Python_Coverage.yml @@ -17,7 +17,7 @@ name: PreCommit Python Coverage on: pull_request_target: branches: [ "master", "release-*" ] - paths: [ "model/**","sdks/python/**","release/**"] + paths: [ "model/**","sdks/python/**","release/**", 'release/trigger_all_tests.json'] issue_comment: types: [created] push: @@ -25,18 +25,18 @@ on: branches: ['master', 'release-*'] paths: [ "model/**","sdks/python/**","release/**", ".github/workflows/beam_PreCommit_Python_Coverage.yml"] schedule: - - cron: '0 */6 * * *' + - cron: '45 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none - issues: read + issues: write discussions: read packages: read pages: read @@ -46,7 +46,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -56,7 +56,7 @@ env: jobs: beam_PreCommit_Python_Coverage: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] strategy: matrix: @@ -66,7 +66,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Python_Coverage PreCommit' steps: @@ -74,7 +74,7 @@ jobs: - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{matrix.job_phrase}} + comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment @@ -86,8 +86,18 @@ jobs: uses: ./.github/actions/gradle-command-self-hosted-action with: gradle-command: :sdks:python:test-suites:tox:py38:preCommitPyCoverage - - name: Archive code coverage results + arguments: | + -PuseWheelDistribution + - name: Archive Python Test Results uses: actions/upload-artifact@v3 + if: failure() with: - name: python-code-coverage-report - path: "**/pytest*.xml" \ No newline at end of file + name: Python Test Results + path: '**/pytest*.xml' + - name: Publish Python Test Results + uses: 
EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/pytest*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Python_Dataframes.yml b/.github/workflows/beam_PreCommit_Python_Dataframes.yml index 2862d7d5936c7..822c10862ac7f 100644 --- a/.github/workflows/beam_PreCommit_Python_Dataframes.yml +++ b/.github/workflows/beam_PreCommit_Python_Dataframes.yml @@ -17,7 +17,7 @@ name: PreCommit Python Dataframes on: pull_request_target: branches: [ "master", "release-*" ] - paths: [ "model/**","sdks/python/**","release/**"] + paths: [ "model/**","sdks/python/**","release/**", 'release/trigger_all_tests.json'] issue_comment: types: [created] push: @@ -25,18 +25,18 @@ on: branches: ['master', 'release-*'] paths: [ "model/**","sdks/python/**","release/**",".github/workflows/beam_PreCommit_Python_Dataframes.yml"] schedule: - - cron: '0 */6 * * *' + - cron: '45 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none - issues: read + issues: write discussions: read packages: read pages: read @@ -46,7 +46,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -56,7 +56,7 @@ env: jobs: beam_PreCommit_Python_Dataframes: - name: ${{matrix.job_name}} (${{ matrix.job_phrase}} ${{ matrix.python_version}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase}} ${{ matrix.python_version}}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 180 strategy: @@ -68,7 +68,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || startsWith(github.event.comment.body, 'Run Python_Dataframes PreCommit') steps: @@ -97,8 +97,17 @@ jobs: arguments: | -Pposargs=apache_beam/dataframe/ \ -PpythonVersion=${{ matrix.python_version }} \ - - name: Archive code coverage results + -PuseWheelDistribution + - name: Archive Python Test Results uses: actions/upload-artifact@v3 + if: failure() with: - name: python-code-coverage-report - path: "**/pytest*.xml" \ No newline at end of file + name: Python Test Results + path: '**/pytest*.xml' + - name: Publish Python Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/pytest*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Python_Examples.yml b/.github/workflows/beam_PreCommit_Python_Examples.yml 
index 7f980885180a2..36a0bde3ebdc7 100644 --- a/.github/workflows/beam_PreCommit_Python_Examples.yml +++ b/.github/workflows/beam_PreCommit_Python_Examples.yml @@ -17,7 +17,7 @@ name: PreCommit Python Examples on: pull_request_target: branches: [ "master", "release-*" ] - paths: [ "model/**","sdks/python/**","release/**"] + paths: [ "model/**","sdks/python/**","release/**", 'release/trigger_all_tests.json'] issue_comment: types: [created] push: @@ -25,18 +25,18 @@ on: branches: ['master', 'release-*'] paths: [ "model/**","sdks/python/**","release/**",".github/workflows/beam_PreCommit_Python_Examples.yml"] schedule: - - cron: '0 */6 * * *' + - cron: '45 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none - issues: read + issues: write discussions: read packages: read pages: read @@ -46,7 +46,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -56,7 +56,7 @@ env: jobs: beam_PreCommit_Python_Examples: - name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{ matrix.python_version }}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 180 strategy: @@ -68,7 +68,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || startsWith(github.event.comment.body, 'Run Python_Examples PreCommit') steps: @@ -97,8 +97,17 @@ jobs: arguments: | -Pposargs=apache_beam/examples/ \ -PpythonVersion=${{ matrix.python_version }} \ - - name: Archive code coverage results + -PuseWheelDistribution + - name: Archive Python Test Results uses: actions/upload-artifact@v3 + if: failure() with: - name: python-code-coverage-report - path: "**/pytest*.xml" \ No newline at end of file + name: Python Test Results + path: '**/pytest*.xml' + - name: Publish Python Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/pytest*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Python_Integration.yml b/.github/workflows/beam_PreCommit_Python_Integration.yml index 5b377f23774e2..d0af4932168ee 100644 --- a/.github/workflows/beam_PreCommit_Python_Integration.yml +++ b/.github/workflows/beam_PreCommit_Python_Integration.yml @@ -17,7 +17,7 @@ name: PreCommit Python Integration on: pull_request_target: branches: [ "master", "release-*" ] - paths: ["model/**", "sdks/python/**", "release/**"] + paths: ["model/**", "sdks/python/**", 
"release/**", 'release/trigger_all_tests.json'] issue_comment: types: [created] push: @@ -25,18 +25,18 @@ on: branches: ['master', 'release-*'] paths: ["model/**", "sdks/python/**", "release/**", ".github/workflows/beam_PreCommit_Python_Integration.yml"] schedule: - - cron: '0 */6 * * *' + - cron: '45 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none - issues: read + issues: write discussions: read packages: read pages: read @@ -46,7 +46,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -56,7 +56,7 @@ env: jobs: beam_PreCommit_Python_Integration: - name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{matrix.python_version}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 180 strategy: @@ -68,7 +68,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || startsWith(github.event.comment.body, 'Run Python_Integration PreCommit') steps: @@ -104,8 +104,16 @@ jobs: arguments: | -PuseWheelDistribution \ -PpythonVersion=${{ matrix.python_version }} \ - - name: Archive code coverage results + - name: Archive Python Test Results uses: actions/upload-artifact@v3 + if: failure() with: - name: python-code-coverage-report - path: "**/pytest*.xml" + name: Python Test Results + path: '**/pytest*.xml' + - name: Publish Python Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/pytest*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Python_PVR_Flink.yml b/.github/workflows/beam_PreCommit_Python_PVR_Flink.yml index c268b4ed78f7c..7e083bf84313f 100644 --- a/.github/workflows/beam_PreCommit_Python_PVR_Flink.yml +++ b/.github/workflows/beam_PreCommit_Python_PVR_Flink.yml @@ -29,6 +29,7 @@ on: - 'runners/flink/**' - 'runners/java-fn-execution/**' - 'runners/reference/**' + - 'release/trigger_all_tests.json' issue_comment: types: [created] push: @@ -47,18 +48,18 @@ on: - 'runners/reference/**' - '.github/workflows/beam_PreCommit_Python_PVR_Flink.yml' schedule: - - cron: '* */6 * * *' + - cron: '45 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read 
id-token: none - issues: read + issues: write discussions: read packages: read pages: read @@ -68,7 +69,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -87,7 +88,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Python_PVR_Flink PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -99,8 +100,8 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: python-version: 3.11 - name: run Python PVR Flink PreCommit script @@ -111,8 +112,16 @@ jobs: gradle-command: :sdks:python:test-suites:portable:py311:flinkValidatesRunner arguments: | -PpythonVersion=3.11 \ - - name: Archive code coverage results + - name: Archive Python Test Results uses: actions/upload-artifact@v3 + if: failure() with: - name: python-code-coverage-report + name: Python Test Results path: '**/pytest*.xml' + - name: Publish Python Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/pytest*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Python_Runners.yml b/.github/workflows/beam_PreCommit_Python_Runners.yml index b0c5ab4fa34a3..f43c4eb47b6ea 100644 --- a/.github/workflows/beam_PreCommit_Python_Runners.yml +++ b/.github/workflows/beam_PreCommit_Python_Runners.yml @@ -17,7 +17,7 @@ name: PreCommit Python Runners on: pull_request_target: branches: [ "master", "release-*" ] - paths: [ "model/**","sdks/python/**","release/**"] + paths: [ "model/**","sdks/python/**","release/**", 'release/trigger_all_tests.json'] issue_comment: types: [created] push: @@ -25,18 +25,18 @@ on: branches: ['master', 'release-*'] paths: [ "model/**","sdks/python/**","release/**",".github/workflows/beam_PreCommit_Python_Runners.yml"] schedule: - - cron: '0 */6 * * *' + - cron: '45 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none - issues: read + issues: write discussions: read packages: read pages: read @@ -46,7 +46,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref 
|| github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -56,7 +56,7 @@ env: jobs: beam_PreCommit_Python_Runners: - name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{matrix.python_version}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 180 strategy: @@ -68,7 +68,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || startsWith(github.event.comment.body, 'Run Python_Runners PreCommit') steps: @@ -97,8 +97,17 @@ jobs: arguments: | -Pposargs=apache_beam/runners/ \ -PpythonVersion=${{ matrix.python_version }} \ - - name: Archive code coverage results + -PuseWheelDistribution + - name: Archive Python Test Results uses: actions/upload-artifact@v3 + if: failure() with: - name: python-code-coverage-report - path: "**/pytest*.xml" \ No newline at end of file + name: Python Test Results + path: '**/pytest*.xml' + - name: Publish Python Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/pytest*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Python_Transforms.yml b/.github/workflows/beam_PreCommit_Python_Transforms.yml index 88ac59c9de96d..6d06596ed4c85 100644 --- a/.github/workflows/beam_PreCommit_Python_Transforms.yml +++ b/.github/workflows/beam_PreCommit_Python_Transforms.yml @@ -17,7 +17,7 @@ name: PreCommit Python Transforms on: pull_request_target: branches: [ "master", "release-*" ] - paths: [ "model/**","sdks/python/**","release/**"] + paths: [ "model/**","sdks/python/**","release/**", 'release/trigger_all_tests.json'] issue_comment: types: [created] push: @@ -25,18 +25,18 @@ on: branches: ['master', 'release-*'] paths: [ "model/**","sdks/python/**","release/**",".github/workflows/beam_PreCommit_Python_Transforms.yml"] schedule: - - cron: '0 */6 * * *' + - cron: '45 2/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event permissions: actions: write - pull-requests: read - checks: read + pull-requests: write + checks: write contents: read deployments: read id-token: none - issues: read + issues: write discussions: read packages: read pages: read @@ -46,7 +46,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ 
-56,7 +56,7 @@ env: jobs: beam_PreCommit_Python_Transforms: - name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{matrix.python_version}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) runs-on: [self-hosted, ubuntu-20.04, main] timeout-minutes: 180 strategy: @@ -68,7 +68,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || startsWith(github.event.comment.body, 'Run Python_Transforms PreCommit') steps: @@ -97,8 +97,17 @@ jobs: arguments: | -Pposargs=apache_beam/transforms/ \ -PpythonVersion=${{ matrix.python_version }} \ - - name: Archive code coverage results + -PuseWheelDistribution + - name: Archive Python Test Results uses: actions/upload-artifact@v3 + if: failure() with: - name: python-code-coverage-report - path: "**/pytest*.xml" \ No newline at end of file + name: Python Test Results + path: '**/pytest*.xml' + - name: Publish Python Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/pytest*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_RAT.yml b/.github/workflows/beam_PreCommit_RAT.yml index c129fc12114f6..c45ae4fc55437 100644 --- a/.github/workflows/beam_PreCommit_RAT.yml +++ b/.github/workflows/beam_PreCommit_RAT.yml @@ -24,7 +24,7 @@ on: issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 3/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -45,7 +45,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -55,7 +55,7 @@ env: jobs: beam_PreCommit_RAT: - name: ${{matrix.job_name}} (${{ matrix.job_phrase }}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] strategy: matrix: @@ -65,7 +65,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run RAT PreCommit' steps: diff --git a/.github/workflows/beam_PreCommit_SQL.yml b/.github/workflows/beam_PreCommit_SQL.yml index ee9799346f1f3..ac72f6b752dae 100644 --- a/.github/workflows/beam_PreCommit_SQL.yml +++ b/.github/workflows/beam_PreCommit_SQL.yml @@ -22,16 +22,16 @@ on: paths: ['sdks/java/extensions/sql/**','.github/workflows/beam_PreCommit_SQL.yml'] pull_request_target: branches: ['master', 'release-*'] - paths: [sdks/java/extensions/sql/**] + paths: ['sdks/java/extensions/sql/**', 'release/trigger_all_tests.json'] 
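
The cron edits throughout this diff replace the shared `'0 */6 * * *'` slot with staggered ones (`'45 2/6 * * *'`, `'15 3/6 * * *'`, `'30 2/6 * * *'`, and so on), spreading scheduled PreCommit load across the self-hosted runner pool rather than launching every job at the top of the same hour. The `3/6`-style hour field is start/step syntax, annotated here on one of the new entries:

    schedule:
      # '15 3/6 * * *': minute 15 of every 6th hour starting at 03:00 UTC,
      # i.e. 03:15, 09:15, 15:15, and 21:15. Varying both the minute and the
      # starting hour keeps sibling workflows off the same trigger instant.
      - cron: '15 3/6 * * *'
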
issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 3/6 * * *' workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true # Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -67,7 +67,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run SQL PreCommit' steps: @@ -78,11 +78,10 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Set up Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'temurin' - java-version: '11' + java-version: 11 - name: Build and Test uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -106,6 +105,13 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: Publish SpotBugs path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_SQL_Java11.yml b/.github/workflows/beam_PreCommit_SQL_Java11.yml index f1c733418b8d3..9dadb4dcc2fff 100644 --- a/.github/workflows/beam_PreCommit_SQL_Java11.yml +++ b/.github/workflows/beam_PreCommit_SQL_Java11.yml @@ -22,16 +22,16 @@ on: paths: ['sdks/java/extensions/sql/**','.github/workflows/beam_PreCommit_SQL_Java11.yml'] pull_request_target: branches: ['master', 'release-*'] - paths: [sdks/java/extensions/sql/**] + paths: ['sdks/java/extensions/sql/**', 'release/trigger_all_tests.json'] issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 3/6 * * *' workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -67,7 +67,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 
'workflow_dispatch' || github.event.comment.body == 'Run SQL_Java11 PreCommit' steps: @@ -78,19 +78,12 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Set up Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'temurin' - java-version: '11' - - name: Install Go - uses: actions/setup-go@v4 - with: - go-version: '1.21' - - name: Install Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' + java-version: 11 + python-version: 3.8 + go-version: 1.21 - name: Install Flutter uses: subosito/flutter-action@v2 with: @@ -103,7 +96,7 @@ jobs: arguments: | -PdisableSpotlessCheck=true \ -PdisableCheckStyle=true \ - -PcompileAndRunTestsWithJava11 \ + -PtestJavaVersion=11 \ -PskipCheckerFramework \ -Pjava11Home=$JAVA_HOME_11_X64 \ - name: Archive JUnit Test Results @@ -121,6 +114,7 @@ jobs: files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results path: '**/build/reports/spotbugs/*.html' @@ -128,5 +122,5 @@ jobs: uses: jwgmeligmeyling/spotbugs-github-action@v1.2 if: always() with: - name: SpotBugs + name: Publish SpotBugs path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_SQL_Java17.yml b/.github/workflows/beam_PreCommit_SQL_Java17.yml index 7547bd396815e..8a1e88f7c6543 100644 --- a/.github/workflows/beam_PreCommit_SQL_Java17.yml +++ b/.github/workflows/beam_PreCommit_SQL_Java17.yml @@ -22,11 +22,11 @@ on: paths: ['sdks/java/extensions/sql/**','.github/workflows/beam_PreCommit_SQL_Java17.yml'] pull_request_target: branches: ['master', 'release-*'] - paths: [sdks/java/extensions/sql/**] + paths: ['sdks/java/extensions/sql/**', 'release/trigger_all_tests.json'] issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 3/6 * * *' workflow_dispatch: # Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -47,7 +47,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -66,7 +66,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run SQL_Java17 PreCommit' runs-on: [self-hosted, ubuntu-20.04, main] @@ -80,20 +80,13 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) # The test requires Java 17 and Java 8 versions. # Java 8 is installed second because JAVA_HOME needs to point to Java 8. 
- - name: Set up Java 17 - uses: actions/setup-java@v3.11.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'temurin' - java-version: '17' - - name: Set up Java 8 - uses: actions/setup-java@v3.11.0 - with: - distribution: 'temurin' - java-version: '8' - - name: Install Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' + java-version: | + 17 + 8 + python-version: 3.8 - name: Build and Test uses: ./.github/actions/gradle-command-self-hosted-action with: @@ -101,7 +94,7 @@ jobs: arguments: | -PdisableSpotlessCheck=true \ -PdisableCheckStyle=true \ - -PcompileAndRunTestsWithJava17 \ + -PtestJavaVersion=17 \ -PskipCheckerFramework \ -Pjava17Home=$JAVA_HOME_17_X64 \ - name: Archive JUnit Test Results @@ -110,15 +103,22 @@ jobs: with: name: JUnit Test Results path: "**/build/reports/tests/" + - name: Publish JUnit Test Results + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() + with: + commit: '${{ env.prsha || env.GITHUB_SHA }}' + comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} + files: '**/build/test-results/**/*.xml' - name: Archive SpotBugs Results uses: actions/upload-artifact@v3 + if: always() with: name: SpotBugs Results path: '**/build/reports/spotbugs/*.html' - - name: Publish JUnit Test Results - uses: EnricoMi/publish-unit-test-result-action@v2 + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 if: always() with: - commit: '${{ env.prsha || env.GITHUB_SHA }}' - comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }} - files: '**/build/test-results/**/*.xml' \ No newline at end of file + name: Publish SpotBugs + path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Spotless.yml b/.github/workflows/beam_PreCommit_Spotless.yml index 394f627adbab0..6aa590e7556eb 100644 --- a/.github/workflows/beam_PreCommit_Spotless.yml +++ b/.github/workflows/beam_PreCommit_Spotless.yml @@ -35,15 +35,16 @@ on: - 'examples/java/**' - 'examples/kotlin/**' - '.test-infra/jenkins/' + - 'release/trigger_all_tests.json' issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '0 3/6 * * *' workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event @@ -69,7 +70,7 @@ env: jobs: beam_PreCommit_Spotless: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) timeout-minutes: 120 runs-on: [self-hosted, ubuntu-20.04, main] strategy: @@ -80,7 +81,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run 
Spotless PreCommit' steps: @@ -91,10 +92,12 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action - name: run Spotless PreCommit script uses: ./.github/actions/gradle-command-self-hosted-action with: - gradle-command: spotlessCheck checkStyleMain checkStyleTest + gradle-command: spotlessCheck checkStyleMain checkStyleTest :buildSrc:spotlessCheck - name: Upload test report uses: actions/upload-artifact@v3 with: diff --git a/.github/workflows/beam_PreCommit_Typescript.yml b/.github/workflows/beam_PreCommit_Typescript.yml index a2df0033f98bd..f3184dc0f90d0 100644 --- a/.github/workflows/beam_PreCommit_Typescript.yml +++ b/.github/workflows/beam_PreCommit_Typescript.yml @@ -24,16 +24,16 @@ on: paths: ['sdks/python/apache_beam/runners/interactive/extensions/**', '.github/workflows/beam_PreCommit_Typescript.yml'] pull_request_target: branches: ['master', 'release-*'] - paths: ['sdks/python/apache_beam/runners/interactive/extensions/**'] + paths: ['sdks/python/apache_beam/runners/interactive/extensions/**', 'release/trigger_all_tests.json'] issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 3/6 * * *' workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -58,7 +58,7 @@ permissions: statuses: read jobs: beam_PreCommit_Typescript: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) timeout-minutes: 120 runs-on: [self-hosted, ubuntu-20.04, main] strategy: @@ -68,7 +68,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Typescript PreCommit' steps: @@ -76,7 +76,7 @@ jobs: - name: Setup repository uses: ./.github/actions/setup-action with: - comment_phrase: ${{matrix.job_phrase}} + comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Setup environment diff --git a/.github/workflows/beam_PreCommit_Website.yml b/.github/workflows/beam_PreCommit_Website.yml index 4ae4e1a99a0c8..848b4d44deec0 100644 --- a/.github/workflows/beam_PreCommit_Website.yml +++ b/.github/workflows/beam_PreCommit_Website.yml @@ -22,11 +22,11 @@ on: paths: ['website/**','.github/workflows/beam_PreCommit_Website.yml'] pull_request_target: branches: ['master', 'release-*'] - paths: ['website/**'] + paths: ['website/**', 'release/trigger_all_tests.json'] issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 3/6 * * *' workflow_dispatch: #Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of 
pull_request_target event @@ -47,7 +47,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -57,7 +57,7 @@ env: jobs: beam_PreCommit_Website: - name: ${{matrix.job_name}} (${{ matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase}}) runs-on: [self-hosted, ubuntu-20.04, small] strategy: matrix: @@ -67,7 +67,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Website PreCommit' steps: diff --git a/.github/workflows/beam_PreCommit_Website_Stage_GCS.yml b/.github/workflows/beam_PreCommit_Website_Stage_GCS.yml index 00cda54912ac3..c7b51f137290b 100644 --- a/.github/workflows/beam_PreCommit_Website_Stage_GCS.yml +++ b/.github/workflows/beam_PreCommit_Website_Stage_GCS.yml @@ -22,16 +22,16 @@ on: paths: ['website/**','.github/workflows/beam_PreCommit_Website_Stage_GCS.yml'] pull_request_target: branches: ['master', 'release-*'] - paths: ['website/**'] + paths: ['website/**', 'release/trigger_all_tests.json'] issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 3/6 * * *' workflow_dispatch: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -59,7 +59,7 @@ permissions: jobs: beam_PreCommit_Website_Stage_GCS: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] strategy: matrix: @@ -69,7 +69,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Website_Stage_GCS PreCommit' steps: diff --git a/.github/workflows/beam_PreCommit_Whitespace.yml b/.github/workflows/beam_PreCommit_Whitespace.yml index 065c5cc7fd8f2..da58d309f3544 100644 --- a/.github/workflows/beam_PreCommit_Whitespace.yml +++ b/.github/workflows/beam_PreCommit_Whitespace.yml @@ -22,11 +22,11 @@ on: paths: ['**.md', '**.build.gradle', 'build.gradle.kts', '.github/workflows/beam_PreCommit_Whitespace.yml'] pull_request_target: branches: ['master', 'release-*'] - paths: ['**.md', '**.build.gradle', 'build.gradle.kts'] + paths: ['**.md', 
'**.build.gradle', 'build.gradle.kts', 'release/trigger_all_tests.json'] issue_comment: types: [created] schedule: - - cron: '0 */6 * * *' + - cron: '15 3/6 * * *' workflow_dispatch: permissions: @@ -46,7 +46,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -56,7 +56,7 @@ env: jobs: beam_PreCommit_Whitespace: - name: ${{matrix.job_name}} (${{ matrix.job_phrase }}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] strategy: matrix: @@ -66,7 +66,7 @@ jobs: if: | github.event_name == 'push' || github.event_name == 'pull_request_target' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || github.event.comment.body == 'Run Whitespace PreCommit' steps: diff --git a/.github/workflows/beam_Prober_CommunityMetrics.yml b/.github/workflows/beam_Prober_CommunityMetrics.yml new file mode 100644 index 0000000000000..3526b3ced8c0d --- /dev/null +++ b/.github/workflows/beam_Prober_CommunityMetrics.yml @@ -0,0 +1,79 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
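A note on the two trigger edits that recur in almost every hunk above: the cron strings move off the top of the hour (e.g. '0 */6 * * *' becomes '15 3/6 * * *'), staggering the precommit jobs, presumably so they do not all land on the shared self-hosted runners at once, and every schedule condition gains a repository guard, since scheduled workflows also fire in forks that enable Actions. A minimal sketch of the guard pattern (job and step names are illustrative, not taken from this diff):

    on:
      schedule:
        - cron: '15 3/6 * * *'   # minute 15 of hours 3, 9, 15, 21
      workflow_dispatch:
    jobs:
      example:
        # Cron runs only in the upstream repo; forks can still dispatch manually.
        if: |
          github.event_name == 'workflow_dispatch' ||
          (github.event_name == 'schedule' && github.repository == 'apache/beam')
        runs-on: ubuntu-latest
        steps:
          - run: echo "scheduled work"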
+ +name: Community Metrics Prober + +on: + issue_comment: + types: [created] + schedule: + - cron: '0 */6 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +jobs: + beam_Prober_CommunityMetrics: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Run Community Metrics Prober' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 100 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_Prober_CommunityMetrics"] + job_phrase: ["Run Community Metrics Prober"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: Health check probes for the Community Metrics infrastructure + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :communityMetricsProber + arguments: | + --rerun-tasks \ No newline at end of file diff --git a/.github/workflows/beam_Publish_Beam_SDK_Snapshots.yml b/.github/workflows/beam_Publish_Beam_SDK_Snapshots.yml new file mode 100644 index 0000000000000..f91717a86d5bc --- /dev/null +++ b/.github/workflows/beam_Publish_Beam_SDK_Snapshots.yml @@ -0,0 +1,108 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
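The concurrency blocks this diff keeps normalizing (the whitespace-only `sender.login}}` to `sender.login }}` change) lean on GitHub Actions expression semantics: `||` evaluates to its first truthy operand, so a single `group` key adapts to whichever trigger fired. A reduced sketch with editorial comments:

    concurrency:
      # issue_comment runs group by issue number, pull_request_target runs by
      # head label, pushes and schedules fall through to the commit SHA / ref.
      group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha }}'
      cancel-in-progress: true

With `cancel-in-progress: true`, a newer run holding the same key interrupts the older one; the schedule string or comment id appended in the full expressions presumably keeps runs from unrelated trigger types out of each other's groups.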
+ +name: Publish Beam SDK Snapshots + +on: + schedule: + - cron: '0 */4 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + docker_registry: gcr.io + +jobs: + beam_Publish_Beam_SDK_Snapshots: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 100 + name: ${{ matrix.job_name }} (${{ matrix.container_task }}) + strategy: + fail-fast: false + matrix: + job_name: ["beam_Publish_Beam_SDK_Snapshots"] + job_phrase: ["N/A"] + container_task: + - "go:container" + - "java:container:java8" + - "java:container:java11" + - "java:container:java17" + - "java:container:java21" + - "python:container:py38" + - "python:container:py39" + - "python:container:py310" + - "python:container:py311" + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.container_task }}) + - name: Authenticate on GCP + uses: google-github-actions/setup-gcloud@v0 + with: + service_account_email: ${{ secrets.GCP_SA_EMAIL }} + service_account_key: ${{ secrets.GCP_SA_KEY }} + project_id: ${{ secrets.GCP_PROJECT_ID }} + export_default_credentials: true + - name: GCloud Docker credential helper + run: | + gcloud auth configure-docker ${{ env.docker_registry }} + - name: Setup Java environment + if: ${{ startsWith(matrix.container_task, 'java') }} + uses: ./.github/actions/setup-environment-action + with: + java-version: 11 + - name: Setup Python environment + if: ${{ startsWith(matrix.container_task, 'python') }} + uses: ./.github/actions/setup-environment-action + with: + python-version: '3.8' + - name: run Publish Beam SDK Snapshots script + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:${{ matrix.container_task }}:dockerTagPush + arguments: | + -Pjava11Home=$JAVA_HOME_11_X64 \ + -Pdocker-repository-root=gcr.io/apache-beam-testing/beam-sdk \ + -Pdocker-tag-list=${{ github.sha }},latest \ No newline at end of file diff --git a/.github/workflows/beam_Publish_Docker_Snapshots.yml b/.github/workflows/beam_Publish_Docker_Snapshots.yml new file mode 100644 index 0000000000000..158f51c57d943 --- /dev/null +++ b/.github/workflows/beam_Publish_Docker_Snapshots.yml @@ -0,0 +1,96 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Publish Docker Snapshots + +on: + schedule: + - cron: '0 13 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + docker_registry: gcr.io + +jobs: + beam_Publish_Docker_Snapshots: + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || + github.event.comment.body == 'Publish Docker Snapshots' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 100 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_Publish_Docker_Snapshots"] + job_phrase: ["Publish Docker Snapshots"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Setup environment + uses: ./.github/actions/setup-environment-action + - name: Authenticate on GCP + uses: google-github-actions/setup-gcloud@v0 + with: + service_account_email: ${{ secrets.GCP_SA_EMAIL }} + service_account_key: ${{ secrets.GCP_SA_KEY }} + project_id: ${{ secrets.GCP_PROJECT_ID }} + export_default_credentials: true + - name: GCloud Docker credential helper + run: | + gcloud auth configure-docker ${{ env.docker_registry }} + - name: run Publish Docker Snapshots script for Spark + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :runners:spark:3:job-server:container:dockerPush + arguments: | + -Pdocker-repository-root=gcr.io/apache-beam-testing/beam_portability \ + -Pdocker-tag-list=latest \ + - name: run Publish Docker Snapshots script for Flink + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :runners:flink:1.15:job-server-container:dockerPush + arguments: | + -Pdocker-repository-root=gcr.io/apache-beam-testing/beam_portability \ + -Pdocker-tag-list=latest \ No newline at end of file diff --git a/.github/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml 
b/.github/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml index 9a4ff4144ac59..6681669f59589 100644 --- a/.github/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml +++ b/.github/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml @@ -16,12 +16,6 @@ name: Python ValidatesContainer Dataflow ARM on: - issue_comment: - types: [created] - push: - tags: ['v*'] - branches: ['master', 'release-*'] - paths: ["sdks/python/**",".github/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml"] schedule: - cron: '0 */6 * * *' workflow_dispatch: @@ -44,7 +38,7 @@ permissions: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}' + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}' cancel-in-progress: true env: @@ -54,7 +48,7 @@ env: jobs: beam_Python_ValidatesContainer_Dataflow_ARM: - name: ${{matrix.job_name}} (${{matrix.job_phrase}} ${{ matrix.python_version }}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) strategy: fail-fast: false matrix: @@ -63,7 +57,7 @@ jobs: python_version: ['3.8','3.9','3.10','3.11'] if: | github.event_name == 'push' || - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') || github.event_name == 'workflow_dispatch' || startsWith(github.event.comment.body, 'Run Python ValidatesContainer Dataflow ARM') @@ -117,6 +111,7 @@ jobs: USER: github-actions - name: Archive code coverage results uses: actions/upload-artifact@v3 + if: always() with: name: python-code-coverage-report path: "**/pytest*.xml" diff --git a/.github/workflows/beam_Release_NightlySnapshot.yml b/.github/workflows/beam_Release_NightlySnapshot.yml index a4be830cd3c50..8f6b85218a9a8 100644 --- a/.github/workflows/beam_Release_NightlySnapshot.yml +++ b/.github/workflows/beam_Release_NightlySnapshot.yml @@ -43,7 +43,7 @@ env: jobs: beam_Release_NightlySnapshot: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] strategy: matrix: @@ -51,16 +51,15 @@ jobs: job_phrase: [Release Nightly Snapshot] if: | github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' - + (github.event_name == 'schedule' && github.repository == 'apache/beam') steps: - uses: actions/checkout@v4 - name: Setup repository uses: ./.github/actions/setup-action with: - github_job: ${{matrix.job_name}} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) github_token: ${{ secrets.GITHUB_TOKEN }} - comment_phrase: "Release Nightly Snapshot" + comment_phrase: ${{ matrix.job_phrase }} - name: Setup environment uses: ./.github/actions/setup-environment-action with: diff --git a/.github/workflows/beam_Release_Python_NightlySnapshot.yml b/.github/workflows/beam_Release_Python_NightlySnapshot.yml index 62019c536969f..eca74ea5952a7 100644 --- a/.github/workflows/beam_Release_Python_NightlySnapshot.yml +++ b/.github/workflows/beam_Release_Python_NightlySnapshot.yml @@ -43,23 +43,23 @@ env: jobs: beam_Release_Python_NightlySnapshot: - name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + name: ${{ 
matrix.job_name }} (${{ matrix.job_phrase }}) runs-on: [self-hosted, ubuntu-20.04, main] strategy: matrix: job_name: [beam_Release_Python_NightlySnapshot] job_phrase: [Release Nightly Snapshot Python] - if: github.event_name == 'workflow_dispatch' || - github.event_name == 'schedule' - + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'apache/beam') steps: - uses: actions/checkout@v4 - name: Setup repository uses: ./.github/actions/setup-action with: - github_job: ${{matrix.job_name}} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) github_token: ${{ secrets.GITHUB_TOKEN }} - comment_phrase: ${{matrix.job_phrase}} + comment_phrase: ${{ matrix.job_phrase }} - name: Setup environment uses: ./.github/actions/setup-environment-action with: diff --git a/.github/workflows/build_release_candidate.yml b/.github/workflows/build_release_candidate.yml index ded0896673649..eac2655492ced 100644 --- a/.github/workflows/build_release_candidate.yml +++ b/.github/workflows/build_release_candidate.yml @@ -20,7 +20,7 @@ on: description: Your Apache password. Required if you want to stage artifacts into https://dist.apache.org/repos/dist/dev/beam/ required: false BEAM_SITE_TOKEN: - description: Github Personal Access Token with repo permission if you want to create the beam-site docs PR. See https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens. + description: Github Personal Access Token with apache/beam-site repo permission if you want to create the beam-site docs PR. See https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens. default: '' PUBLISH_JAVA_ARTIFACTS: description: Whether to publish java artifacts to https://repository.apache.org/#stagingRepositories (yes/no) @@ -34,6 +34,10 @@ on: description: Whether to stage SDK docker images to docker hub Apache organization required: true default: 'no' + STAGE_PYTHON_ARTIFACTS: + description: Whether to stage the python artifacts into https://dist.apache.org/repos/dist/dev/beam/ + required: true + default: 'no' CREATE_BEAM_SITE_PR: description: Whether to create the documentation update PR against apache/beam-site. 
required: true @@ -147,7 +151,85 @@ jobs: svn status svn commit -m "Staging Java artifacts for Apache Beam ${{ github.event.inputs.RELEASE }} RC${{ github.event.inputs.RC }}" --non-interactive --username ${{ github.event.inputs.APACHE_ID }} --password ${{ github.event.inputs.APACHE_PASSWORD }} - + stage_python_artifacts: + if: ${{github.event.inputs.STAGE_PYTHON_ARTIFACTS == 'yes'}} + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Validate and mask apache id/password + run: | + echo "::add-mask::${{ github.event.inputs.APACHE_PASSWORD }}" + if [ "${{ github.event.inputs.APACHE_ID }}" == "" ] + then + echo "Must provide an apache id to stage artifacts to https://dist.apache.org/repos/dist/dev/beam/" + fi + if [ "${{ github.event.inputs.APACHE_PASSWORD }}" == "" ] + then + echo "Must provide an apache password to stage artifacts to https://dist.apache.org/repos/dist/dev/beam/" + fi + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.8 + - name: Import GPG key + id: import_gpg + uses: crazy-max/ghaction-import-gpg@111c56156bcc6918c056dbef52164cfa583dc549 + with: + gpg_private_key: ${{ secrets.GPG_PRIVATE_KEY }} + - name: Install dependencies + run: | + pip install python-dateutil + pip install requests + - name: stage python artifacts + env: + RC_TAG: "v${{ github.event.inputs.RELEASE }}-RC${{ github.event.inputs.RC }}" + GIT_REPO_BASE_URL: https://github.com/apache/beam + RELEASE_DIR: "beam/${{ github.event.inputs.RELEASE }}" + RELEASE: "${{ github.event.inputs.RELEASE }}" + SCRIPT_DIR: release/src/main/scripts + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + SVN_ARTIFACTS_DIR: "beam/${{ github.event.inputs.RELEASE }}/python" + run: | + svn co https://dist.apache.org/repos/dist/dev/beam + mkdir -p "${SVN_ARTIFACTS_DIR}" + + git fetch --all --tags --prune + RELEASE_COMMIT=$(git rev-list -n 1 "tags/${RC_TAG}") + + python "${SCRIPT_DIR}/download_github_actions_artifacts.py" \ + --github-token-var GITHUB_TOKEN \ + --repo-url "apache/beam" \ + --rc-tag "${RC_TAG}" \ + --release-commit "${RELEASE_COMMIT}" \ + --artifacts_dir "${RELEASE_DIR}/python" \ + --yes True + + cd "${RELEASE_DIR}"/python + ls + + echo "------Checking Hash Value for apache-beam-${RELEASE}.tar.gz-----" + sha512sum -c "apache-beam-${RELEASE}.tar.gz.sha512" + + echo "------Signing Source Release apache-beam-${RELEASE}.tar.gz------" + gpg --local-user "${{steps.import_gpg.outputs.name}}" --armor --detach-sig "apache-beam-${RELEASE}.tar.gz" + + for artifact in *.whl; do + echo "----------Checking Hash Value for ${artifact} wheel-----------" + sha512sum -c "${artifact}.sha512" + done + + for artifact in *.whl; do + echo "------------------Signing ${artifact} wheel-------------------" + gpg --local-user "${{steps.import_gpg.outputs.name}}" --armor --batch --yes --detach-sig "${artifact}" + done + + cd .. 
+ svn add --force python + svn status + svn commit -m "Staging Python artifacts for Apache Beam ${RELEASE} RC${{ github.event.inputs.RC }}" --non-interactive --username ${{ github.event.inputs.APACHE_ID }} --password ${{ github.event.inputs.APACHE_PASSWORD }} + + stage_docker: if: ${{github.event.inputs.STAGE_DOCKER_ARTIFACTS == 'yes'}} # Note: if this ever changes to self-hosted, remove the "Remove default github maven configuration" step @@ -192,7 +274,6 @@ jobs: - name: Push docker images run: ./gradlew :pushAllDockerImages -PisRelease -Pdocker-pull-licenses -Pprune-images -Pdocker-tag=${{ github.event.inputs.RELEASE }}rc${{ github.event.inputs.RC }} -Pjava11Home=${{steps.export-java11.outputs.JAVA11_HOME}} --no-daemon --no-parallel - beam_site_pr: if: ${{github.event.inputs.CREATE_BEAM_SITE_PR == 'yes'}} # Note: if this ever changes to self-hosted, remove the "Remove default github maven configuration" step @@ -221,7 +302,7 @@ jobs: with: python-version: '3.8' - name: Install node - uses: actions/setup-node@v3 + uses: actions/setup-node@v4 with: node-version: '16' - name: Install Java 8 @@ -244,7 +325,7 @@ jobs: pip install -U pip pip install tox # TODO(https://github.com/apache/beam/issues/20209): Don't hardcode py version in this file. - pip install -r build-requirements.txt && tox -e py38-docs + tox -e py38-docs rm -rf target/docs/_build/.doctrees - name: Build Typescript Docs working-directory: beam/sdks/typescript diff --git a/.github/workflows/build_runner_image.yml b/.github/workflows/build_runner_image.yml index 6071d936958ad..c0f4cacd86fe8 100644 --- a/.github/workflows/build_runner_image.yml +++ b/.github/workflows/build_runner_image.yml @@ -30,6 +30,7 @@ env: docker_repo: apache-beam-testing/beam-github-actions/beam-arc-runner jobs: build-and-version-runner: + if: github.repository == 'apache/beam' env: working-directory: .github/gh-actions-self-hosted-runners/arc/images/ runs-on: [self-hosted, ubuntu-20.04] diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 94248be0c008f..58d14c6fcd68b 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -92,11 +92,6 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.8 - - name: Get build dependencies - working-directory: ./sdks/python - run: python -m pip install -r build-requirements.txt - - name: Install wheels - run: python -m pip install wheel - name: Get tag id: get_tag run: | @@ -117,15 +112,15 @@ jobs: echo "RELEASE_VERSION=$RELEASE_VERSION" >> $GITHUB_OUTPUT - name: Build source working-directory: ./sdks/python - run: python setup.py sdist --formats=zip + run: pip install -U build && python -m build --sdist - name: Add checksums working-directory: ./sdks/python/dist run: | - file=$(ls | grep .zip | head -n 1) + file=$(ls | grep .tar.gz | head -n 1) sha512sum $file > ${file}.sha512 - name: Unzip source working-directory: ./sdks/python - run: unzip dist/$(ls dist | grep .zip | head -n 1) + run: tar -xzvf dist/$(ls dist | grep .tar.gz | head -n 1) - name: Rename source directory working-directory: ./sdks/python run: mv $(ls | grep apache-beam) apache-beam-source @@ -155,17 +150,17 @@ jobs: - name: Build RC source if: steps.is_rc.outputs.is_rc == 1 working-directory: ./sdks/python - run: python setup.py sdist --formats=zip + run: pip install -U build && python -m build --sdist - name: Add RC checksums if: steps.is_rc.outputs.is_rc == 1 working-directory: ./sdks/python/dist run: | - file=$(ls | grep .zip | head -n 1) + file=$(ls | grep .tar.gz | head -n 1) sha512sum
$file > ${file}.sha512 - name: Unzip RC source if: steps.is_rc.outputs.is_rc == 1 working-directory: ./sdks/python - run: unzip dist/$(ls dist | grep .zip | head -n 1) + run: tar -xzvf dist/$(ls dist | grep .tar.gz | head -n 1) - name: Rename RC source directory if: steps.is_rc.outputs.is_rc == 1 working-directory: ./sdks/python diff --git a/.github/workflows/dask_runner_tests.yml b/.github/workflows/dask_runner_tests.yml index 423a304db8259..35c320086992d 100644 --- a/.github/workflows/dask_runner_tests.yml +++ b/.github/workflows/dask_runner_tests.yml @@ -44,12 +44,9 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.8 - - name: Get build dependencies - working-directory: ./sdks/python - run: pip install pip setuptools --upgrade && pip install -r build-requirements.txt - name: Build source working-directory: ./sdks/python - run: python setup.py sdist + run: pip install -U build && python -m build --sdist - name: Rename source file working-directory: ./sdks/python/dist run: mv $(ls | grep "apache-beam.*tar\.gz") apache-beam-source.tar.gz @@ -78,9 +75,6 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.params.py_ver }} - - name: Get build dependencies - working-directory: ./sdks/python - run: pip install -r build-requirements.txt - name: Install tox run: pip install tox - name: Install SDK with dask diff --git a/.github/workflows/deploy_release_candidate_pypi.yaml b/.github/workflows/deploy_release_candidate_pypi.yaml new file mode 100644 index 0000000000000..fd3994f658e84 --- /dev/null +++ b/.github/workflows/deploy_release_candidate_pypi.yaml @@ -0,0 +1,89 @@ +name: deploy_release_candidate_pypi + +# Workflow added after https://github.com/apache/beam/commit/4183e747becebd18becee5fff547af365910fc9c +# If help is needed debugging issues, you can view the release guide at that commit for guidance on how to do this manually. +# (https://github.com/apache/beam/blob/4183e747becebd18becee5fff547af365910fc9c/website/www/site/content/en/contribute/release-guide.md) +on: + workflow_dispatch: + inputs: + RELEASE: + description: Beam version of current release (e.g. 2.XX.0) + required: true + default: '2.XX.0' + RC: + description: Integer RC version for the release (e.g. 
3 for RC3) + required: true + PYPI_USER: + description: PyPi username to perform the upload with + required: true + PYPI_PASSWORD: + description: PyPi password to perform the upload with + required: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +jobs: + deploy_release_candidate_pypi: + runs-on: [self-hosted, ubuntu-20.04, main] + steps: + - name: Mask PyPi id/password + run: | + echo "::add-mask::${{ github.event.inputs.PYPI_USER }}" + echo "::add-mask::${{ github.event.inputs.PYPI_PASSWORD }}" + - name: Checkout + uses: actions/checkout@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.11 + - name: Install dependencies + run: | + pip install python-dateutil + pip install requests + pip install twine + - name: Deploy to Pypi + env: + RC_TAG: "v${{ github.event.inputs.RELEASE }}-RC${{ github.event.inputs.RC }}" + GIT_REPO_BASE_URL: https://github.com/apache/beam + RELEASE_DIR: "beam/${{ github.event.inputs.RELEASE }}" + RELEASE: "${{ github.event.inputs.RELEASE }}" + SCRIPT_DIR: release/src/main/scripts + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + SVN_ARTIFACTS_DIR: "beam/${{ github.event.inputs.RELEASE }}/python" + run: | + git fetch --all --tags --prune + RELEASE_COMMIT=$(git rev-list -n 1 $RC_TAG) + + PYTHON_ARTIFACTS_DIR="./python" + python "release/src/main/scripts/download_github_actions_artifacts.py" \ + --github-token-var GITHUB_TOKEN \ + --repo-url "apache/beam" \ + --rc-tag "${RC_TAG}" \ + --release-commit "${RELEASE_COMMIT}" \ + --artifacts_dir "${PYTHON_ARTIFACTS_DIR}" \ + --rc_number "${{ github.event.inputs.RC }}" \ + --yes True + + cd "${PYTHON_ARTIFACTS_DIR}" + ls + + echo "------Checking Hash Value for apache-beam-${RELEASE}rc${{ github.event.inputs.RC }}.tar.gz-----" + sha512sum -c "apache-beam-${RELEASE}rc${{ github.event.inputs.RC }}.tar.gz.sha512" + + for artifact in *.whl; do + echo "----------Checking Hash Value for ${artifact} wheel-----------" + sha512sum -c "${artifact}.sha512" + done + + echo "===================Removing sha512 files=======================" + rm $(ls | grep -i ".*.sha512$") + + echo "====================Upload rc to pypi========================" + mkdir dist && mv $(ls | grep apache) dist && cd dist + echo "Will upload the following files to PyPI:" + ls + + twine upload * -u ${{ github.event.inputs.PYPI_USER }} -p ${{ github.event.inputs.PYPI_PASSWORD }} diff --git a/.github/workflows/finalize_release.yml b/.github/workflows/finalize_release.yml new file mode 100644 index 0000000000000..cb1d9571a34a9 --- /dev/null +++ b/.github/workflows/finalize_release.yml @@ -0,0 +1,141 @@ +name: finalize_release + +# Workflow added after https://github.com/apache/beam/commit/4183e747becebd18becee5fff547af365910fc9c +# If help is needed debugging issues, you can view the release guide at that commit for guidance on how to do this manually. +# (https://github.com/apache/beam/blob/4183e747becebd18becee5fff547af365910fc9c/website/www/site/content/en/contribute/release-guide.md) +on: + workflow_dispatch: + inputs: + RELEASE: + description: Beam version of current release (e.g. 2.XX.0) + required: true + default: '2.XX.0' + RC: + description: Integer RC version for the release that we'd like to finalize (e.g. 
3 for RC3) + required: true + PYPI_USER: + description: PyPi username to perform the PyPi upload with + required: false + PYPI_PASSWORD: + description: PyPi password to perform the PyPi upload with + required: false + PUSH_DOCKER_ARTIFACTS: + description: Whether to push SDK docker images to docker hub Apache organization. Should be yes unless you've already completed this step. + required: true + default: 'no' + PUBLISH_PYTHON_ARTIFACTS: + description: Whether to publish the python artifacts into PyPi. Should be yes unless you've already completed this step. + required: true + default: 'no' + TAG_RELEASE: + description: Whether to tag the release on GitHub. Should be yes unless you've already completed this step. + required: true + default: 'no' +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +jobs: + push_docker_artifacts: + if: ${{github.event.inputs.PUSH_DOCKER_ARTIFACTS == 'yes'}} + runs-on: [self-hosted, ubuntu-20.04, main] + steps: + - name: Publish to Docker + env: + RELEASE: "${{ github.event.inputs.RELEASE }}" + RC_NUM: "${{ github.event.inputs.RC }}" + RC_VERSION: "rc${{ github.event.inputs.RC }}" + run: | + + echo "Publish SDK docker images to Docker Hub." + + echo "================Pull RC Containers from DockerHub===========" + IMAGES=$(docker search apache/beam_ --format "{{.Name}}" --limit 100) + KNOWN_IMAGES=() + echo "We are using ${RC_VERSION} to push docker images for ${RELEASE}." + while read IMAGE; do + # Try pull verified RC from dockerhub. + if docker pull "${IMAGE}:${RELEASE}${RC_VERSION}" 2>/dev/null ; then + KNOWN_IMAGES+=( $IMAGE ) + fi + done < <(echo "${IMAGES}") + + echo "================Confirming Release and RC version===========" + echo "Publishing the following images:" + # Sort by name for easy examination + IFS=$'\n' KNOWN_IMAGES=($(sort <<<"${KNOWN_IMAGES[*]}")) + unset IFS + printf "%s\n" ${KNOWN_IMAGES[@]} + + for IMAGE in "${KNOWN_IMAGES[@]}"; do + # Perform a carbon copy of ${RC_VERSION} to dockerhub with a new tag as ${RELEASE}. + docker buildx imagetools create --tag "${IMAGE}:${RELEASE}" "${IMAGE}:${RELEASE}${RC_VERSION}" + + # Perform a carbon copy of ${RC_VERSION} to dockerhub with a new tag as latest.
+ docker buildx imagetools create --tag "${IMAGE}:latest" "${IMAGE}:${RELEASE}" + done + + publish_python_artifacts: + if: ${{github.event.inputs.PUBLISH_PYTHON_ARTIFACTS == 'yes'}} + runs-on: [self-hosted, ubuntu-20.04, main] + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Mask and validate PyPi id/password + run: | + echo "::add-mask::${{ github.event.inputs.PYPI_USER }}" + echo "::add-mask::${{ github.event.inputs.PYPI_PASSWORD }}" + if [ "${{ github.event.inputs.PYPI_USER }}" == "" ] + then + echo "Must provide a PyPi username to publish artifacts to PyPi" + exit 1 + fi + if [ "${{ github.event.inputs.PYPI_PASSWORD }}" == "" ] + then + echo "Must provide a PyPi password to publish artifacts to PyPi" + exit 1 + fi + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + python-version: 3.11 + - name: Install dependencies + run: | + pip install python-dateutil + pip install requests + pip install twine + - name: Deploy to Pypi + env: + RELEASE: "${{ github.event.inputs.RELEASE }}" + run: | + wget -r --no-parent -A tar.gz,whl "https://dist.apache.org/repos/dist/dev/beam/${RELEASE}/python" + cd "dist.apache.org/repos/dist/dev/beam/${RELEASE}/python/" + echo "Will upload the following files to PyPI:" + ls + twine upload * -u ${{ github.event.inputs.PYPI_USER }} -p ${{ github.event.inputs.PYPI_PASSWORD }} + + push_git_tags: + if: ${{github.event.inputs.TAG_RELEASE == 'yes'}} + runs-on: [self-hosted, ubuntu-20.04, main] + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Push tags + env: + VERSION_TAG: "v${{ github.event.inputs.RELEASE }}" + RC_TAG: "v${{ github.event.inputs.RELEASE }}-RC${{ github.event.inputs.RC }}" + run: | + # Ensure local tags are in sync. If there's a mismatch, it will tell you. + git fetch --all --tags + + # If the tag exists, a commit number is produced, otherwise there's an error. + git rev-list $RC_TAG -n 1 + + # Tag for Go SDK + git tag -s "sdks/$VERSION_TAG" "$RC_TAG" + git push https://github.com/apache/beam "sdks/$VERSION_TAG" + + # Tag for repo root. + git tag -s "$VERSION_TAG" "$RC_TAG" + git push https://github.com/apache/beam "$VERSION_TAG" diff --git a/.github/workflows/go_tests.yml b/.github/workflows/go_tests.yml index 6884dd6925225..02947eff5ca03 100644 --- a/.github/workflows/go_tests.yml +++ b/.github/workflows/go_tests.yml @@ -46,6 +46,8 @@ jobs: - uses: actions/setup-go@v4 with: go-version: '1.21' + cache-dependency-path: | + sdks/go.sum - name: Delete old coverage run: "cd sdks && rm -rf .coverage.txt || :" - name: Run coverage @@ -66,8 +68,3 @@ jobs: go install "honnef.co/go/tools/cmd/staticcheck@2023.1.3" cd sdks/go/pkg/beam $(go env GOPATH)/bin/staticcheck ./... - - uses: golang/govulncheck-action@v1.0.1 - with: - work-dir: ./sdks - go-package: ./... - go-version-input: 1.21 \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/beam_CloudML_Benchmarks_Dataflow_arguments.txt b/.github/workflows/load-tests-pipeline-options/beam_CloudML_Benchmarks_Dataflow_arguments.txt new file mode 100644 index 0000000000000..b1b45c4cc9e44 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/beam_CloudML_Benchmarks_Dataflow_arguments.txt @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership.
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--metrics_dataset=beam_cloudml +--publish_to_big_query=true +--region=us-central1 +--staging_location=gs://temp-storage-for-perf-tests/loadtests +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--runner=DataflowRunner +--requirements_file=apache_beam/testing/benchmarks/cloudml/requirements.txt \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Imagenet_Classification_Resnet_152.txt b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Imagenet_Classification_Resnet_152.txt new file mode 100644 index 0000000000000..c65317b495731 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Imagenet_Classification_Resnet_152.txt @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
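The `load-tests-pipeline-options/*.txt` files added from here on hold one `--flag=value` pipeline option per line beneath the license header. How they are consumed is not shown in this section; a plausible consumer, sketched with an assumed checkout layout and a placeholder module name:

    - name: Run benchmark (illustrative sketch, not part of this diff)
      run: |
        # Drop the '#' license header, then collapse the remaining
        # '--flag=value' lines into one argument string.
        ARGS=$(grep -v '^#' .github/workflows/load-tests-pipeline-options/beam_CloudML_Benchmarks_Dataflow_arguments.txt | tr '\n' ' ')
        python -m <benchmark_module> $ARGS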
+ +--region=us-central1 +--machine_type=n1-standard-2 +--num_workers=75 +--disk_size_gb=50 +--autoscaling_algorithm=NONE +--staging_location=gs://temp-storage-for-perf-tests/loadtests +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--requirements_file=apache_beam/ml/inference/torch_tests_requirements.txt +--publish_to_big_query=true +--metrics_dataset=beam_run_inference +--metrics_table=torch_inference_imagenet_results_resnet152 +--input_options={} +--influx_measurement=torch_inference_imagenet_resnet152 +--pretrained_model_name=resnet152 +--device=CPU +--input_file=gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt +--model_state_dict_path=gs://apache-beam-ml/models/torchvision.models.resnet152.pth +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Imagenet_Classification_Resnet_152_Tesla_T4_GPU.txt b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Imagenet_Classification_Resnet_152_Tesla_T4_GPU.txt new file mode 100644 index 0000000000000..e79369befad72 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Imagenet_Classification_Resnet_152_Tesla_T4_GPU.txt @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
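The Resnet-152 options above pin the Dataflow worker pool instead of letting it autoscale, which keeps throughput numbers comparable between runs, and they name two metrics sinks. An annotated excerpt (annotations are editorial; the flags themselves are from the file above):

    --autoscaling_algorithm=NONE    # fixed-size pool for run-to-run comparability
    --num_workers=75                # 75 n1-standard-2 workers
    --publish_to_big_query=true     # rows land in the beam_run_inference dataset
    --metrics_table=torch_inference_imagenet_results_resnet152
    --influx_measurement=torch_inference_imagenet_resnet152   # InfluxDB series for dashboards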
+ +--region=us-central1 +--machine_type=n1-standard-2 +--num_workers=30 +--disk_size_gb=50 +--autoscaling_algorithm=NONE +--staging_location=gs://temp-storage-for-perf-tests/loadtests +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--requirements_file=apache_beam/ml/inference/torch_tests_requirements.txt +--publish_to_big_query=true +--metrics_dataset=beam_run_inference +--metrics_table=torch_inference_imagenet_results_resnet152_tesla_t4 +--input_options={} +--influx_measurement=torch_inference_imagenet_resnet152_tesla_t4 +--pretrained_model_name=resnet152 +--device=GPU +--experiments=worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver +--sdk_container_image=us.gcr.io/apache-beam-testing/python-postcommit-it/tensor_rt:latest +--input_file=gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt +--model_state_dict_path=gs://apache-beam-ml/models/torchvision.models.resnet152.pth +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Language_Modeling_Bert_Base_Uncased.txt b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Language_Modeling_Bert_Base_Uncased.txt new file mode 100644 index 0000000000000..66aca5fdbcd7a --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Language_Modeling_Bert_Base_Uncased.txt @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
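The BERT language-modeling options below (`--bert_tokenizer`, `--model_state_dict_path`) imply a keyed-tensor PyTorch handler over masked-LM inputs. A hedged sketch of that wiring — the real benchmark also tokenizes the sentences from `--input_file` and decodes the predictions:

```python
# Sketch of the model-handler setup the BERT options below suggest.
# The handler and config classes are real Beam/HuggingFace APIs; treating
# the checkpoint as a plain state dict for BertForMaskedLM is an assumption.
from transformers import BertConfig, BertForMaskedLM
from apache_beam.ml.inference.pytorch_inference import (
    PytorchModelHandlerKeyedTensor)

config = BertConfig.from_pretrained('bert-base-uncased', is_decoder=False)
model_handler = PytorchModelHandlerKeyedTensor(
    state_dict_path=('gs://apache-beam-ml/models/'
                     'huggingface.BertForMaskedLM.bert-base-uncased.pth'),
    model_class=BertForMaskedLM,
    model_params={'config': config})
# Tokenized sentences (dicts of keyed tensors) would then flow through
# RunInference(model_handler), as in the ResNet sketch earlier.
```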
+ +--region=us-central1 +--machine_type=n1-standard-2 +--num_workers=250 +--disk_size_gb=50 +--autoscaling_algorithm=NONE +--staging_location=gs://temp-storage-for-perf-tests/loadtests +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--requirements_file=apache_beam/ml/inference/torch_tests_requirements.txt +--publish_to_big_query=true +--metrics_dataset=beam_run_inference +--metrics_table=torch_language_modeling_bert_base_uncased +--input_options={} +--influx_measurement=torch_language_modeling_bert_base_uncased +--device=CPU +--input_file=gs://apache-beam-ml/testing/inputs/sentences_50k.txt +--bert_tokenizer=bert-base-uncased +--model_state_dict_path=gs://apache-beam-ml/models/huggingface.BertForMaskedLM.bert-base-uncased.pth +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Language_Modeling_Bert_Large_Uncased.txt b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Language_Modeling_Bert_Large_Uncased.txt new file mode 100644 index 0000000000000..d6406271685bc --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Language_Modeling_Bert_Large_Uncased.txt @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--machine_type=n1-standard-2 +--num_workers=250 +--disk_size_gb=50 +--autoscaling_algorithm=NONE +--staging_location=gs://temp-storage-for-perf-tests/loadtests +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--requirements_file=apache_beam/ml/inference/torch_tests_requirements.txt +--publish_to_big_query=true +--metrics_dataset=beam_run_inference +--metrics_table=torch_language_modeling_bert_large_uncased +--input_options={} +--influx_measurement=torch_language_modeling_bert_large_uncased +--device=CPU +--input_file=gs://apache-beam-ml/testing/inputs/sentences_50k.txt +--bert_tokenizer=bert-large-uncased +--model_state_dict_path=gs://apache-beam-ml/models/huggingface.BertForMaskedLM.bert-large-uncased.pth +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Vision_Classification_Resnet_101.txt b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Vision_Classification_Resnet_101.txt new file mode 100644 index 0000000000000..5a0d250439097 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Pytorch_Vision_Classification_Resnet_101.txt @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--machine_type=n1-standard-2 +--num_workers=75 +--disk_size_gb=50 +--autoscaling_algorithm=NONE +--staging_location=gs://temp-storage-for-perf-tests/loadtests +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--requirements_file=apache_beam/ml/inference/torch_tests_requirements.txt +--publish_to_big_query=true +--metrics_dataset=beam_run_inference +--metrics_table=torch_inference_imagenet_results_resnet101 +--input_options={} +--influx_measurement=torch_inference_imagenet_resnet101 +--pretrained_model_name=resnet101 +--device=CPU +--input_file=gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt +--model_state_dict_path=gs://apache-beam-ml/models/torchvision.models.resnet101.pth +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_CoGBK_Go_Batch_MultipleKey.txt b/.github/workflows/load-tests-pipeline-options/go_CoGBK_Dataflow_Batch_MultipleKey.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/config_CoGBK_Go_Batch_MultipleKey.txt rename to .github/workflows/load-tests-pipeline-options/go_CoGBK_Dataflow_Batch_MultipleKey.txt diff --git a/.github/workflows/load-tests-job-configs/config_CoGBK_Go_Batch_Reiteration_10KB.txt b/.github/workflows/load-tests-pipeline-options/go_CoGBK_Dataflow_Batch_Reiteration_10KB.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/config_CoGBK_Go_Batch_Reiteration_10KB.txt rename to .github/workflows/load-tests-pipeline-options/go_CoGBK_Dataflow_Batch_Reiteration_10KB.txt diff --git a/.github/workflows/load-tests-job-configs/config_CoGBK_Go_Batch_Reiteration_2MB.txt b/.github/workflows/load-tests-pipeline-options/go_CoGBK_Dataflow_Batch_Reiteration_2MB.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/config_CoGBK_Go_Batch_Reiteration_2MB.txt rename to .github/workflows/load-tests-pipeline-options/go_CoGBK_Dataflow_Batch_Reiteration_2MB.txt diff --git a/.github/workflows/load-tests-job-configs/config_CoGBK_Go_Batch_SingleKey.txt b/.github/workflows/load-tests-pipeline-options/go_CoGBK_Dataflow_Batch_SingleKey.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/config_CoGBK_Go_Batch_SingleKey.txt rename to .github/workflows/load-tests-pipeline-options/go_CoGBK_Dataflow_Batch_SingleKey.txt diff --git a/.github/workflows/load-tests-job-configs/go_CoGBK_Flink_Batch_MultipleKey.txt b/.github/workflows/load-tests-pipeline-options/go_CoGBK_Flink_Batch_MultipleKey.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/go_CoGBK_Flink_Batch_MultipleKey.txt rename to .github/workflows/load-tests-pipeline-options/go_CoGBK_Flink_Batch_MultipleKey.txt diff --git a/.github/workflows/load-tests-job-configs/go_CoGBK_Flink_Batch_Reiteration_10KB.txt 
b/.github/workflows/load-tests-pipeline-options/go_CoGBK_Flink_Batch_Reiteration_10KB.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/go_CoGBK_Flink_Batch_Reiteration_10KB.txt rename to .github/workflows/load-tests-pipeline-options/go_CoGBK_Flink_Batch_Reiteration_10KB.txt diff --git a/.github/workflows/load-tests-job-configs/go_CoGBK_Flink_Batch_Reiteration_2MB.txt b/.github/workflows/load-tests-pipeline-options/go_CoGBK_Flink_Batch_Reiteration_2MB.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/go_CoGBK_Flink_Batch_Reiteration_2MB.txt rename to .github/workflows/load-tests-pipeline-options/go_CoGBK_Flink_Batch_Reiteration_2MB.txt diff --git a/.github/workflows/load-tests-job-configs/config_Combine_Go_Batch_10b.txt b/.github/workflows/load-tests-pipeline-options/go_Combine_Dataflow_Batch_10b.txt similarity index 78% rename from .github/workflows/load-tests-job-configs/config_Combine_Go_Batch_10b.txt rename to .github/workflows/load-tests-pipeline-options/go_Combine_Dataflow_Batch_10b.txt index b9ad28105903b..d97a3731bb972 100644 --- a/.github/workflows/load-tests-job-configs/config_Combine_Go_Batch_10b.txt +++ b/.github/workflows/load-tests-pipeline-options/go_Combine_Dataflow_Batch_10b.txt @@ -1,4 +1,3 @@ -############################################################################### # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -13,10 +12,8 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. -############################################################################### ---job_name=load-tests-go-dataflow-batch-combine-1- ---project=apache-beam-testing +# limitations under the License. + --region=us-central1 --temp_location=gs://temp-storage-for-perf-tests/loadtests --staging_location=gs://temp-storage-for-perf-tests/loadtests @@ -29,6 +26,4 @@ --autoscaling_algorithm=NONE --environment_type=DOCKER --environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest ---influx_db_name=beam_test_metrics ---influx_hostname=http://10.128.0.96:8086 --runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_Combine_Go_Batch_Fanout_4.txt b/.github/workflows/load-tests-pipeline-options/go_Combine_Dataflow_Batch_Fanout_4.txt similarity index 78% rename from .github/workflows/load-tests-job-configs/config_Combine_Go_Batch_Fanout_4.txt rename to .github/workflows/load-tests-pipeline-options/go_Combine_Dataflow_Batch_Fanout_4.txt index 5f3a185832703..a64b0c9e7c2c2 100644 --- a/.github/workflows/load-tests-job-configs/config_Combine_Go_Batch_Fanout_4.txt +++ b/.github/workflows/load-tests-pipeline-options/go_Combine_Dataflow_Batch_Fanout_4.txt @@ -1,4 +1,3 @@ -############################################################################### # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -13,10 +12,8 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and -# limitations under the License. -############################################################################### ---job_name=load-tests-go-dataflow-batch-combine-2- ---project=apache-beam-testing +# limitations under the License. + --region=us-central1 --temp_location=gs://temp-storage-for-perf-tests/loadtests --staging_location=gs://temp-storage-for-perf-tests/loadtests @@ -29,6 +26,4 @@ --autoscaling_algorithm=NONE --environment_type=DOCKER --environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest ---influx_db_name=beam_test_metrics ---influx_hostname=http://10.128.0.96:8086 --runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_Combine_Go_Batch_Fanout_8.txt b/.github/workflows/load-tests-pipeline-options/go_Combine_Dataflow_Batch_Fanout_8.txt similarity index 78% rename from .github/workflows/load-tests-job-configs/config_Combine_Go_Batch_Fanout_8.txt rename to .github/workflows/load-tests-pipeline-options/go_Combine_Dataflow_Batch_Fanout_8.txt index eba65b666a257..23ef62663b1c1 100644 --- a/.github/workflows/load-tests-job-configs/config_Combine_Go_Batch_Fanout_8.txt +++ b/.github/workflows/load-tests-pipeline-options/go_Combine_Dataflow_Batch_Fanout_8.txt @@ -1,4 +1,3 @@ -############################################################################### # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -13,10 +12,8 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. -############################################################################### ---job_name=load-tests-go-dataflow-batch-combine-3- ---project=apache-beam-testing +# limitations under the License. 
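Note the pattern in the hunks around here: `--job_name`, `--project`, `--influx_db_name`, and `--influx_hostname` are deleted from the checked-in Go config files, leaving only the stable flags. Presumably the launching workflow now injects the run-specific values at execution time (job names need unique suffixes; the project and InfluxDB endpoints are environment-specific). A hypothetical sketch of that split — the helper names are assumptions, not code from this change:

```python
# Hypothetical launch-time merge: stable flags from the .txt file plus
# environment-specific flags supplied by the workflow. The flag names are
# exactly the ones removed from the files in this change.
def load_flags(path):
    with open(path) as f:
        return [ln.strip() for ln in f
                if ln.strip() and not ln.lstrip().startswith('#')]

def launch_args(options_file, job_name, project):
    return load_flags(options_file) + [
        f'--job_name={job_name}',   # unique per run, so no longer checked in
        f'--project={project}',     # environment-specific
        # InfluxDB host/database would likewise come from the environment.
    ]
```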
+ --region=us-central1 --temp_location=gs://temp-storage-for-perf-tests/loadtests --staging_location=gs://temp-storage-for-perf-tests/loadtests @@ -29,6 +26,4 @@ --autoscaling_algorithm=NONE --environment_type=DOCKER --environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest ---influx_db_name=beam_test_metrics ---influx_hostname=http://10.128.0.96:8086 --runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/go_Combine_Flink_Batch_10b.txt b/.github/workflows/load-tests-pipeline-options/go_Combine_Flink_Batch_10b.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/go_Combine_Flink_Batch_10b.txt rename to .github/workflows/load-tests-pipeline-options/go_Combine_Flink_Batch_10b.txt diff --git a/.github/workflows/load-tests-job-configs/go_Combine_Flink_Batch_Fanout_4.txt b/.github/workflows/load-tests-pipeline-options/go_Combine_Flink_Batch_Fanout_4.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/go_Combine_Flink_Batch_Fanout_4.txt rename to .github/workflows/load-tests-pipeline-options/go_Combine_Flink_Batch_Fanout_4.txt diff --git a/.github/workflows/load-tests-job-configs/go_Combine_Flink_Batch_Fanout_8.txt b/.github/workflows/load-tests-pipeline-options/go_Combine_Flink_Batch_Fanout_8.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/go_Combine_Flink_Batch_Fanout_8.txt rename to .github/workflows/load-tests-pipeline-options/go_Combine_Flink_Batch_Fanout_8.txt diff --git a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_100b.txt b/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_100b.txt similarity index 78% rename from .github/workflows/load-tests-job-configs/config_GBK_Go_Batch_100b.txt rename to .github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_100b.txt index f3ebed91b6ade..b5266fa83047d 100644 --- a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_100b.txt +++ b/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_100b.txt @@ -1,4 +1,3 @@ -############################################################################### # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -13,10 +12,8 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. -############################################################################### ---job_name=load-tests-go-dataflow-batch-gbk-2- ---project=apache-beam-testing +# limitations under the License. 
+ --region=us-central1 --temp_location=gs://temp-storage-for-perf-tests/loadtests --staging_location=gs://temp-storage-for-perf-tests/loadtests @@ -29,6 +26,4 @@ --autoscaling_algorithm=NONE --environment_type=DOCKER --environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest ---influx_db_name=beam_test_metrics ---influx_hostname=http://10.128.0.96:8086 --runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_100kb.txt b/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_100kb.txt similarity index 78% rename from .github/workflows/load-tests-job-configs/config_GBK_Go_Batch_100kb.txt rename to .github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_100kb.txt index e5007c7d5b90d..072ab494515ca 100644 --- a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_100kb.txt +++ b/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_100kb.txt @@ -1,4 +1,3 @@ -############################################################################### # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -13,10 +12,8 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. -############################################################################### ---job_name=load-tests-go-dataflow-batch-gbk-3- ---project=apache-beam-testing +# limitations under the License. + --region=us-central1 --temp_location=gs://temp-storage-for-perf-tests/loadtests --staging_location=gs://temp-storage-for-perf-tests/loadtests @@ -29,6 +26,4 @@ --autoscaling_algorithm=NONE --environment_type=DOCKER --environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest ---influx_db_name=beam_test_metrics ---influx_hostname=http://10.128.0.96:8086 --runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_10b.txt b/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_10b.txt similarity index 78% rename from .github/workflows/load-tests-job-configs/config_GBK_Go_Batch_10b.txt rename to .github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_10b.txt index 7683eac5cb934..2e69dd0b457b7 100644 --- a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_10b.txt +++ b/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_10b.txt @@ -1,4 +1,3 @@ -############################################################################### # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -13,10 +12,8 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. -############################################################################### ---job_name=load-tests-go-dataflow-batch-gbk-1- ---project=apache-beam-testing +# limitations under the License. 
+ --region=us-central1 --temp_location=gs://temp-storage-for-perf-tests/loadtests --staging_location=gs://temp-storage-for-perf-tests/loadtests @@ -29,6 +26,4 @@ --autoscaling_algorithm=NONE --environment_type=DOCKER --environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest ---influx_db_name=beam_test_metrics ---influx_hostname=http://10.128.0.96:8086 --runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Fanout_4.txt b/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_Fanout_4.txt similarity index 78% rename from .github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Fanout_4.txt rename to .github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_Fanout_4.txt index 5792b3bf0b95f..6371123142d82 100644 --- a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Fanout_4.txt +++ b/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_Fanout_4.txt @@ -1,4 +1,3 @@ -############################################################################### # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -13,10 +12,8 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. -############################################################################### ---job_name=load-tests-go-dataflow-batch-gbk-4- ---project=apache-beam-testing +# limitations under the License. + --region=us-central1 --temp_location=gs://temp-storage-for-perf-tests/loadtests --staging_location=gs://temp-storage-for-perf-tests/loadtests @@ -29,6 +26,4 @@ --autoscaling_algorithm=NONE --environment_type=DOCKER --environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest ---influx_db_name=beam_test_metrics ---influx_hostname=http://10.128.0.96:8086 --runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Fanout_8.txt b/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_Fanout_8.txt similarity index 78% rename from .github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Fanout_8.txt rename to .github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_Fanout_8.txt index 369fb25aa0e19..77d5f2e0162bc 100644 --- a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Fanout_8.txt +++ b/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_Fanout_8.txt @@ -1,4 +1,3 @@ -############################################################################### # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -13,10 +12,8 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. -############################################################################### ---job_name=load-tests-go-dataflow-batch-gbk-5- ---project=apache-beam-testing +# limitations under the License. 
+ --region=us-central1 --temp_location=gs://temp-storage-for-perf-tests/loadtests --staging_location=gs://temp-storage-for-perf-tests/loadtests @@ -29,6 +26,4 @@ --autoscaling_algorithm=NONE --environment_type=DOCKER --environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest ---influx_db_name=beam_test_metrics ---influx_hostname=http://10.128.0.96:8086 --runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Reiteration_10KB.txt b/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_Reiteration_10KB.txt similarity index 78% rename from .github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Reiteration_10KB.txt rename to .github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_Reiteration_10KB.txt index 9eb878d4e9fb9..7b27693ed8cc9 100644 --- a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Reiteration_10KB.txt +++ b/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_Reiteration_10KB.txt @@ -1,4 +1,3 @@ -############################################################################### # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -13,10 +12,8 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. -############################################################################### ---job_name=load-tests-go-dataflow-batch-gbk-6 ---project=apache-beam-testing +# limitations under the License. + --region=us-central1 --temp_location=gs://temp-storage-for-perf-tests/loadtests --staging_location=gs://temp-storage-for-perf-tests/loadtests @@ -29,6 +26,4 @@ --autoscaling_algorithm=NONE --environment_type=DOCKER --environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest ---influx_db_name=beam_test_metrics ---influx_hostname=http://10.128.0.96:8086 --runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Reiteration_2MB.txt b/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_Reiteration_2MB.txt similarity index 78% rename from .github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Reiteration_2MB.txt rename to .github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_Reiteration_2MB.txt index aa26473ca4335..40971e5464547 100644 --- a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Reiteration_2MB.txt +++ b/.github/workflows/load-tests-pipeline-options/go_GBK_Dataflow_Batch_Reiteration_2MB.txt @@ -1,4 +1,3 @@ -############################################################################### # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -13,10 +12,8 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. 
-############################################################################### ---job_name=load-tests-go-dataflow-batch-gbk-7 ---project=apache-beam-testing +# limitations under the License. + --region=us-central1 --temp_location=gs://temp-storage-for-perf-tests/loadtests --staging_location=gs://temp-storage-for-perf-tests/loadtests @@ -29,6 +26,4 @@ --autoscaling_algorithm=NONE --environment_type=DOCKER --environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest ---influx_db_name=beam_test_metrics ---influx_hostname=http://10.128.0.96:8086 --runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/go_GBK_Flink_Batch_100b.txt b/.github/workflows/load-tests-pipeline-options/go_GBK_Flink_Batch_100b.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/go_GBK_Flink_Batch_100b.txt rename to .github/workflows/load-tests-pipeline-options/go_GBK_Flink_Batch_100b.txt diff --git a/.github/workflows/load-tests-job-configs/go_GBK_Flink_Batch_100kb.txt b/.github/workflows/load-tests-pipeline-options/go_GBK_Flink_Batch_100kb.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/go_GBK_Flink_Batch_100kb.txt rename to .github/workflows/load-tests-pipeline-options/go_GBK_Flink_Batch_100kb.txt diff --git a/.github/workflows/load-tests-job-configs/go_GBK_Flink_Batch_10b.txt b/.github/workflows/load-tests-pipeline-options/go_GBK_Flink_Batch_10b.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/go_GBK_Flink_Batch_10b.txt rename to .github/workflows/load-tests-pipeline-options/go_GBK_Flink_Batch_10b.txt diff --git a/.github/workflows/load-tests-job-configs/go_GBK_Flink_Batch_Fanout_4.txt b/.github/workflows/load-tests-pipeline-options/go_GBK_Flink_Batch_Fanout_4.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/go_GBK_Flink_Batch_Fanout_4.txt rename to .github/workflows/load-tests-pipeline-options/go_GBK_Flink_Batch_Fanout_4.txt diff --git a/.github/workflows/load-tests-job-configs/go_GBK_Flink_Batch_Fanout_8.txt b/.github/workflows/load-tests-pipeline-options/go_GBK_Flink_Batch_Fanout_8.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/go_GBK_Flink_Batch_Fanout_8.txt rename to .github/workflows/load-tests-pipeline-options/go_GBK_Flink_Batch_Fanout_8.txt diff --git a/.github/workflows/load-tests-job-configs/go_GBK_Flink_Batch_Reiteration_10KB.txt b/.github/workflows/load-tests-pipeline-options/go_GBK_Flink_Batch_Reiteration_10KB.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/go_GBK_Flink_Batch_Reiteration_10KB.txt rename to .github/workflows/load-tests-pipeline-options/go_GBK_Flink_Batch_Reiteration_10KB.txt diff --git a/.github/workflows/load-tests-job-configs/config_ParDo_Go_Batch_100_Counters.txt b/.github/workflows/load-tests-pipeline-options/go_ParDo_Dataflow_Batch_100_Counters.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/config_ParDo_Go_Batch_100_Counters.txt rename to .github/workflows/load-tests-pipeline-options/go_ParDo_Dataflow_Batch_100_Counters.txt diff --git a/.github/workflows/load-tests-job-configs/config_ParDo_Go_Batch_10_Counters.txt b/.github/workflows/load-tests-pipeline-options/go_ParDo_Dataflow_Batch_10_Counters.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/config_ParDo_Go_Batch_10_Counters.txt rename to .github/workflows/load-tests-pipeline-options/go_ParDo_Dataflow_Batch_10_Counters.txt diff 
--git a/.github/workflows/load-tests-job-configs/config_ParDo_Go_Batch_10_Iterations.txt b/.github/workflows/load-tests-pipeline-options/go_ParDo_Dataflow_Batch_10_Iterations.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/config_ParDo_Go_Batch_10_Iterations.txt rename to .github/workflows/load-tests-pipeline-options/go_ParDo_Dataflow_Batch_10_Iterations.txt diff --git a/.github/workflows/load-tests-job-configs/config_ParDo_Go_Batch_200_Iterations.txt b/.github/workflows/load-tests-pipeline-options/go_ParDo_Dataflow_Batch_200_Iterations.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/config_ParDo_Go_Batch_200_Iterations.txt rename to .github/workflows/load-tests-pipeline-options/go_ParDo_Dataflow_Batch_200_Iterations.txt diff --git a/.github/workflows/load-tests-job-configs/go_ParDo_Flink_Batch_100_counters.txt b/.github/workflows/load-tests-pipeline-options/go_ParDo_Flink_Batch_100_counters.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/go_ParDo_Flink_Batch_100_counters.txt rename to .github/workflows/load-tests-pipeline-options/go_ParDo_Flink_Batch_100_counters.txt diff --git a/.github/workflows/load-tests-job-configs/go_ParDo_Flink_Batch_10_counters.txt b/.github/workflows/load-tests-pipeline-options/go_ParDo_Flink_Batch_10_counters.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/go_ParDo_Flink_Batch_10_counters.txt rename to .github/workflows/load-tests-pipeline-options/go_ParDo_Flink_Batch_10_counters.txt diff --git a/.github/workflows/load-tests-job-configs/go_ParDo_Flink_Batch_10_times.txt b/.github/workflows/load-tests-pipeline-options/go_ParDo_Flink_Batch_10_times.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/go_ParDo_Flink_Batch_10_times.txt rename to .github/workflows/load-tests-pipeline-options/go_ParDo_Flink_Batch_10_times.txt diff --git a/.github/workflows/load-tests-job-configs/go_ParDo_Flink_Batch_200_times.txt b/.github/workflows/load-tests-pipeline-options/go_ParDo_Flink_Batch_200_times.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/go_ParDo_Flink_Batch_200_times.txt rename to .github/workflows/load-tests-pipeline-options/go_ParDo_Flink_Batch_200_times.txt diff --git a/.github/workflows/load-tests-job-configs/config_SideInput_Go_Batch_Dataflow_First_Iterable.txt b/.github/workflows/load-tests-pipeline-options/go_SideInput_Dataflow_Batch_First_Iterable.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/config_SideInput_Go_Batch_Dataflow_First_Iterable.txt rename to .github/workflows/load-tests-pipeline-options/go_SideInput_Dataflow_Batch_First_Iterable.txt diff --git a/.github/workflows/load-tests-job-configs/config_SideInput_Go_Batch_Dataflow_Iterable.txt b/.github/workflows/load-tests-pipeline-options/go_SideInput_Dataflow_Batch_Iterable.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/config_SideInput_Go_Batch_Dataflow_Iterable.txt rename to .github/workflows/load-tests-pipeline-options/go_SideInput_Dataflow_Batch_Iterable.txt diff --git a/.github/workflows/load-tests-job-configs/go_SideInput_Flink_Batch_First_Iterable.txt b/.github/workflows/load-tests-pipeline-options/go_SideInput_Flink_Batch_First_Iterable.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/go_SideInput_Flink_Batch_First_Iterable.txt rename to 
.github/workflows/load-tests-pipeline-options/go_SideInput_Flink_Batch_First_Iterable.txt diff --git a/.github/workflows/load-tests-job-configs/go_SideInput_Flink_Batch_Iterable.txt b/.github/workflows/load-tests-pipeline-options/go_SideInput_Flink_Batch_Iterable.txt similarity index 100% rename from .github/workflows/load-tests-job-configs/go_SideInput_Flink_Batch_Iterable.txt rename to .github/workflows/load-tests-pipeline-options/go_SideInput_Flink_Batch_Iterable.txt diff --git a/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_100b_Multiple_Keys.txt b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_100b_Multiple_Keys.txt new file mode 100644 index 0000000000000..6402c43d1567d --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_100b_Multiple_Keys.txt @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--tempLocation=gs://temp-storage-for-perf-tests/loadtests +--influxMeasurement=java_batch_cogbk_2 +--publishToInfluxDB=true +--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":5} +--coSourceOptions={"numRecords":2000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":1000} +--iterations=1 +--numWorkers=5 +--autoscalingAlgorithm=NONE +--streaming=false +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_100b_Single_Key.txt b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_100b_Single_Key.txt new file mode 100644 index 0000000000000..5d53c3b3d7ef0 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_100b_Single_Key.txt @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
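The `--sourceOptions`/`--coSourceOptions` JSON in these Java CoGBK configs pins the synthetic workload: for the 100-byte cases, 20,000,000 records of 10-byte keys plus 90-byte values is roughly 2 GB of main input, co-grouped with 2,000,000 such records (about 200 MB), while `numHotKeys` sets the key skew (1 key, 5 keys, or 200,000 keys) and `--iterations` presumably controls how many times each grouped iterable is re-read in the reiteration variants. A quick size check:

```python
# Back-of-the-envelope sizing of the synthetic source described above.
import json

source = json.loads('{"numRecords":20000000,"keySizeBytes":10,'
                    '"valueSizeBytes":90,"numHotKeys":5}')
total_gb = source['numRecords'] * (
    source['keySizeBytes'] + source['valueSizeBytes']) / 1e9
print(total_gb)  # 2.0 -> about 2 GB of main input
```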
+ +--region=us-central1 +--tempLocation=gs://temp-storage-for-perf-tests/loadtests +--influxMeasurement=java_batch_cogbk_1 +--publishToInfluxDB=true +--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":1} +--coSourceOptions={"numRecords":2000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":1000} +--iterations=1 +--numWorkers=5 +--autoscalingAlgorithm=NONE +--streaming=false +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_10kB.txt b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_10kB.txt new file mode 100644 index 0000000000000..501fa6b3a57f7 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_10kB.txt @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--tempLocation=gs://temp-storage-for-perf-tests/loadtests +--influxMeasurement=java_batch_cogbk_3 +--publishToInfluxDB=true +--sourceOptions={"numRecords":2000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":200000} +--coSourceOptions={"numRecords":2000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":1000} +--iterations=4 +--numWorkers=5 +--autoscalingAlgorithm=NONE +--streaming=false +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_2MB.txt b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_2MB.txt new file mode 100644 index 0000000000000..0d5f57fc47b8d --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Batch_2MB.txt @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--region=us-central1 +--tempLocation=gs://temp-storage-for-perf-tests/loadtests +--influxMeasurement=java_batch_cogbk_4 +--publishToInfluxDB=true +--sourceOptions={"numRecords":2000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":1000} +--coSourceOptions={"numRecords":2000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":1000} +--iterations=4 +--numWorkers=5 +--autoscalingAlgorithm=NONE +--streaming=false +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_CoGBK_Java_Streaming_2GB_MultipleKey.txt b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Streaming_2GB_MultipleKey.txt similarity index 81% rename from .github/workflows/load-tests-job-configs/config_CoGBK_Java_Streaming_2GB_MultipleKey.txt rename to .github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Streaming_2GB_MultipleKey.txt index 5fd9518bc8d09..94cbd383a7361 100644 --- a/.github/workflows/load-tests-job-configs/config_CoGBK_Java_Streaming_2GB_MultipleKey.txt +++ b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Streaming_2GB_MultipleKey.txt @@ -1,4 +1,3 @@ -############################################################################### # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -13,9 +12,8 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. -############################################################################### ---project=apache-beam-testing +# limitations under the License. + --region=us-central1 --appName=load_tests_Java_Dataflow_streaming_CoGBK_2 --tempLocation=gs://temp-storage-for-perf-tests/loadtests @@ -29,6 +27,4 @@ --streaming=true --inputWindowDurationSec=1200 --coInputWindowDurationSec=1200 ---influxDatabase=beam_test_metrics ---influxHost=http://10.128.0.96:8086 --runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_CoGBK_Java_Streaming_2GB_Reiteration_10KB.txt b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Streaming_2GB_Reiteration_10KB.txt similarity index 81% rename from .github/workflows/load-tests-job-configs/config_CoGBK_Java_Streaming_2GB_Reiteration_10KB.txt rename to .github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Streaming_2GB_Reiteration_10KB.txt index 2840fe75d5af2..6b16e4546ff6a 100644 --- a/.github/workflows/load-tests-job-configs/config_CoGBK_Java_Streaming_2GB_Reiteration_10KB.txt +++ b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Streaming_2GB_Reiteration_10KB.txt @@ -1,4 +1,3 @@ -############################################################################### # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -13,9 +12,8 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. 
-############################################################################### ---project=apache-beam-testing +# limitations under the License. + --region=us-central1 --appName=load_tests_Java_Dataflow_streaming_CoGBK_3 --tempLocation=gs://temp-storage-for-perf-tests/loadtests @@ -29,6 +27,4 @@ --streaming=true --inputWindowDurationSec=1200 --coInputWindowDurationSec=1200 ---influxDatabase=beam_test_metrics ---influxHost=http://10.128.0.96:8086 --runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_CoGBK_Java_Streaming_2GB_Reiteration_2MB.txt b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Streaming_2GB_Reiteration_2MB.txt similarity index 81% rename from .github/workflows/load-tests-job-configs/config_CoGBK_Java_Streaming_2GB_Reiteration_2MB.txt rename to .github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Streaming_2GB_Reiteration_2MB.txt index bcc8a36cf31fb..d643e21d18cb4 100644 --- a/.github/workflows/load-tests-job-configs/config_CoGBK_Java_Streaming_2GB_Reiteration_2MB.txt +++ b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Streaming_2GB_Reiteration_2MB.txt @@ -1,4 +1,3 @@ -############################################################################### # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -13,9 +12,8 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. -############################################################################### ---project=apache-beam-testing +# limitations under the License. + --region=us-central1 --appName=load_tests_Java_Dataflow_streaming_CoGBK_4 --tempLocation=gs://temp-storage-for-perf-tests/loadtests @@ -29,6 +27,4 @@ --streaming=true --inputWindowDurationSec=1200 --coInputWindowDurationSec=1200 ---influxDatabase=beam_test_metrics ---influxHost=http://10.128.0.96:8086 --runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_CoGBK_Java_Streaming_2GB_SingleKey.txt b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Streaming_2GB_SingleKey.txt similarity index 81% rename from .github/workflows/load-tests-job-configs/config_CoGBK_Java_Streaming_2GB_SingleKey.txt rename to .github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Streaming_2GB_SingleKey.txt index afae1a1bd6bf8..82c6612e920aa 100644 --- a/.github/workflows/load-tests-job-configs/config_CoGBK_Java_Streaming_2GB_SingleKey.txt +++ b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_Streaming_2GB_SingleKey.txt @@ -1,4 +1,3 @@ -############################################################################### # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -13,9 +12,8 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. 
-############################################################################### ---project=apache-beam-testing +# limitations under the License. + --region=us-central1 --appName=load_tests_Java_Dataflow_streaming_CoGBK_1 --tempLocation=gs://temp-storage-for-perf-tests/loadtests @@ -29,6 +27,4 @@ --streaming=true --inputWindowDurationSec=1200 --coInputWindowDurationSec=1200 ---influxDatabase=beam_test_metrics ---influxHost=http://10.128.0.96:8086 --runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_V2_Streaming_Java_100b_Multiple_Keys.txt b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_V2_Streaming_Java_100b_Multiple_Keys.txt new file mode 100644 index 0000000000000..24aff12bad79d --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_V2_Streaming_Java_100b_Multiple_Keys.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--tempLocation=gs://temp-storage-for-perf-tests/loadtests +--influxMeasurement=java_streaming_cogbk_2 +--publishToInfluxDB=true +--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":5} +--coSourceOptions={"numRecords":2000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":1000} +--iterations=1 +--numWorkers=5 +--autoscalingAlgorithm=NONE +--streaming=true +--inputWindowDurationSec=1200 +--coInputWindowDurationSec=1200 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_V2_Streaming_Java_100b_Single_Key.txt b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_V2_Streaming_Java_100b_Single_Key.txt new file mode 100644 index 0000000000000..eead04aeb7e4e --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_V2_Streaming_Java_100b_Single_Key.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--region=us-central1 +--tempLocation=gs://temp-storage-for-perf-tests/loadtests +--influxMeasurement=java_streaming_cogbk_1 +--publishToInfluxDB=true +--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":1} +--coSourceOptions={"numRecords":2000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":1000} +--iterations=1 +--numWorkers=5 +--autoscalingAlgorithm=NONE +--streaming=true +--inputWindowDurationSec=1200 +--coInputWindowDurationSec=1200 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_V2_Streaming_Java_10kB.txt b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_V2_Streaming_Java_10kB.txt new file mode 100644 index 0000000000000..4b45c7df3f9e9 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_V2_Streaming_Java_10kB.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--tempLocation=gs://temp-storage-for-perf-tests/loadtests +--influxMeasurement=java_streaming_cogbk_3 +--publishToInfluxDB=true +--sourceOptions={"numRecords":2000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":200000} +--coSourceOptions={"numRecords":2000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":1000} +--iterations=4 +--numWorkers=5 +--autoscalingAlgorithm=NONE +--streaming=true +--inputWindowDurationSec=1200 +--coInputWindowDurationSec=1200 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_V2_Streaming_Java_2MB.txt b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_V2_Streaming_Java_2MB.txt new file mode 100644 index 0000000000000..f522fd7ec36dc --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/java_CoGBK_Dataflow_V2_Streaming_Java_2MB.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
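The streaming variants add `--inputWindowDurationSec=1200` and `--coInputWindowDurationSec=1200`: both synthetic inputs are windowed into fixed 20-minute windows before the co-group, keeping the join state bounded per window. Illustrative fragment only (the suite itself is Java; this shows the equivalent Python construct):

```python
# Equivalent windowing construct for the streaming CoGBK configs here:
# fixed windows of inputWindowDurationSec seconds applied to each input.
import apache_beam as beam
from apache_beam.transforms.window import FixedWindows

def window_input(pcoll, duration_sec=1200):
    return pcoll | beam.WindowInto(FixedWindows(duration_sec))
```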
+ +--region=us-central1 +--tempLocation=gs://temp-storage-for-perf-tests/loadtests +--influxMeasurement=java_streaming_cogbk_4 +--publishToInfluxDB=true +--sourceOptions={"numRecords":2000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":1000} +--coSourceOptions={"numRecords":2000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":1000} +--iterations=4 +--numWorkers=5 +--autoscalingAlgorithm=NONE +--streaming=true +--inputWindowDurationSec=1200 +--coInputWindowDurationSec=1200 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/java_CoGBK_SparkStructuredStreaming_Batch_100b_Multiple_Keys.txt b/.github/workflows/load-tests-pipeline-options/java_CoGBK_SparkStructuredStreaming_Batch_100b_Multiple_Keys.txt new file mode 100644 index 0000000000000..747f495b144d2 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/java_CoGBK_SparkStructuredStreaming_Batch_100b_Multiple_Keys.txt @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--appName=load_tests_Java_SparkStructuredStreaming_batch_CoGBK_2 +--tempLocation=gs://temp-storage-for-perf-tests/loadtests +--influxMeasurement=java_batch_cogbk_2 +--publishToInfluxDB=true +--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":5} +--coSourceOptions={"numRecords":2000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":1000} +--iterations=1 +--streaming=false +--runner=SparkStructuredStreamingRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/java_CoGBK_SparkStructuredStreaming_Batch_100b_Single_Key.txt b/.github/workflows/load-tests-pipeline-options/java_CoGBK_SparkStructuredStreaming_Batch_100b_Single_Key.txt new file mode 100644 index 0000000000000..b17549a702f35 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/java_CoGBK_SparkStructuredStreaming_Batch_100b_Single_Key.txt @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+--appName=load_tests_Java_SparkStructuredStreaming_batch_CoGBK_1
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_cogbk_1
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":1}
+--coSourceOptions={"numRecords":2000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":1000}
+--iterations=1
+--streaming=false
+--runner=SparkStructuredStreamingRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_CoGBK_SparkStructuredStreaming_Batch_10kB.txt b/.github/workflows/load-tests-pipeline-options/java_CoGBK_SparkStructuredStreaming_Batch_10kB.txt
new file mode 100644
index 0000000000000..84f53ee120a2e
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_CoGBK_SparkStructuredStreaming_Batch_10kB.txt
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--appName=load_tests_Java_SparkStructuredStreaming_batch_CoGBK_3
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_cogbk_3
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":2000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":200000}
+--coSourceOptions={"numRecords":2000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":1000}
+--iterations=4
+--streaming=false
+--runner=SparkStructuredStreamingRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_CoGBK_SparkStructuredStreaming_Batch_2MB.txt b/.github/workflows/load-tests-pipeline-options/java_CoGBK_SparkStructuredStreaming_Batch_2MB.txt
new file mode 100644
index 0000000000000..8f8bdb8995516
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_CoGBK_SparkStructuredStreaming_Batch_2MB.txt
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--appName=load_tests_Java_SparkStructuredStreaming_batch_CoGBK_4
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_cogbk_4
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":2000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":1000}
+--coSourceOptions={"numRecords":2000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":1000}
+--iterations=4
+--streaming=false
+--runner=SparkStructuredStreamingRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-job-configs/config_Combine_Java_Dataflow_Batch_10b.txt b/.github/workflows/load-tests-pipeline-options/java_Combine_Dataflow_Batch_10b.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/config_Combine_Java_Dataflow_Batch_10b.txt
rename to .github/workflows/load-tests-pipeline-options/java_Combine_Dataflow_Batch_10b.txt
diff --git a/.github/workflows/load-tests-job-configs/config_Combine_Java_Dataflow_Batch_Fanout_4.txt b/.github/workflows/load-tests-pipeline-options/java_Combine_Dataflow_Batch_Fanout_4.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/config_Combine_Java_Dataflow_Batch_Fanout_4.txt
rename to .github/workflows/load-tests-pipeline-options/java_Combine_Dataflow_Batch_Fanout_4.txt
diff --git a/.github/workflows/load-tests-job-configs/config_Combine_Java_Dataflow_Batch_Fanout_8.txt b/.github/workflows/load-tests-pipeline-options/java_Combine_Dataflow_Batch_Fanout_8.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/config_Combine_Java_Dataflow_Batch_Fanout_8.txt
rename to .github/workflows/load-tests-pipeline-options/java_Combine_Dataflow_Batch_Fanout_8.txt
diff --git a/.github/workflows/load-tests-job-configs/java_Combine_Dataflow_Streaming_10b.txt b/.github/workflows/load-tests-pipeline-options/java_Combine_Dataflow_Streaming_10b.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/java_Combine_Dataflow_Streaming_10b.txt
rename to .github/workflows/load-tests-pipeline-options/java_Combine_Dataflow_Streaming_10b.txt
diff --git a/.github/workflows/load-tests-job-configs/java_Combine_Dataflow_Streaming_Fanout_4.txt b/.github/workflows/load-tests-pipeline-options/java_Combine_Dataflow_Streaming_Fanout_4.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/java_Combine_Dataflow_Streaming_Fanout_4.txt
rename to .github/workflows/load-tests-pipeline-options/java_Combine_Dataflow_Streaming_Fanout_4.txt
diff --git a/.github/workflows/load-tests-job-configs/java_Combine_Dataflow_Streaming_Fanout_8.txt b/.github/workflows/load-tests-pipeline-options/java_Combine_Dataflow_Streaming_Fanout_8.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/java_Combine_Dataflow_Streaming_Fanout_8.txt
rename to .github/workflows/load-tests-pipeline-options/java_Combine_Dataflow_Streaming_Fanout_8.txt
diff --git a/.github/workflows/load-tests-job-configs/java_Combine_SparkStructuredStreaming_Batch_10b.txt b/.github/workflows/load-tests-pipeline-options/java_Combine_SparkStructuredStreaming_Batch_10b.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/java_Combine_SparkStructuredStreaming_Batch_10b.txt
rename to .github/workflows/load-tests-pipeline-options/java_Combine_SparkStructuredStreaming_Batch_10b.txt
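Each file added above is a flag-per-line pipeline-options list: the "#" lines carry the license header, and every remaining line is a single --name=value argument. As a minimal sketch of how a workflow step could consume such a file, assuming the flags are flattened into the -PloadTest.args property of Beam's :sdks:java:testing:load-tests Gradle task (the file path below is one of the files from this change; the exact wiring in the workflows may differ):

  # Illustrative only: strip comments and blank lines, join the flags into one string.
  OPTS_FILE=.github/workflows/load-tests-pipeline-options/java_CoGBK_SparkStructuredStreaming_Batch_10kB.txt
  PIPELINE_ARGS=$(grep -v '^#' "$OPTS_FILE" | grep -v '^[[:space:]]*$' | tr '\n' ' ')
  ./gradlew :sdks:java:testing:load-tests:run \
    -PloadTest.mainClass=org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest \
    "-PloadTest.args=${PIPELINE_ARGS}"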
diff --git a/.github/workflows/load-tests-job-configs/java_Combine_SparkStructuredStreaming_Batch_Fanout_4.txt b/.github/workflows/load-tests-pipeline-options/java_Combine_SparkStructuredStreaming_Batch_Fanout_4.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/java_Combine_SparkStructuredStreaming_Batch_Fanout_4.txt
rename to .github/workflows/load-tests-pipeline-options/java_Combine_SparkStructuredStreaming_Batch_Fanout_4.txt
diff --git a/.github/workflows/load-tests-job-configs/java_Combine_SparkStructuredStreaming_Batch_Fanout_8.txt b/.github/workflows/load-tests-pipeline-options/java_Combine_SparkStructuredStreaming_Batch_Fanout_8.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/java_Combine_SparkStructuredStreaming_Batch_Fanout_8.txt
rename to .github/workflows/load-tests-pipeline-options/java_Combine_SparkStructuredStreaming_Batch_Fanout_8.txt
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_2GB_of_100B_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_2GB_of_100B_records.txt
new file mode 100644
index 0000000000000..29d0ded11fac8
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_2GB_of_100B_records.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java_Dataflow_batch_GBK_2
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_2
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90}
+--fanout=1
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_2GB_of_100kB_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_2GB_of_100kB_records.txt
new file mode 100644
index 0000000000000..920f0a60d198a
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_2GB_of_100kB_records.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java_Dataflow_batch_GBK_3
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_3
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000,"keySizeBytes":10000,"valueSizeBytes":90000}
+--fanout=1
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_2GB_of_10B_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_2GB_of_10B_records.txt
new file mode 100644
index 0000000000000..7ccacc6c791cd
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_2GB_of_10B_records.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java_Dataflow_batch_GBK_1
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_1
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":200000000,"keySizeBytes":1,"valueSizeBytes":9}
+--fanout=1
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt
new file mode 100644
index 0000000000000..8c6f6f1c89a08
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java_Dataflow_batch_GBK_4
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_4
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":5000000,"keySizeBytes":10,"valueSizeBytes":90}
+--fanout=4
+--iterations=1
+--numWorkers=16
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt
new file mode 100644
index 0000000000000..43f04dac1d02a
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java_Dataflow_batch_GBK_5
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_5
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":2500000,"keySizeBytes":10,"valueSizeBytes":90}
+--fanout=8
+--iterations=1
+--numWorkers=16
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_reiterate_4_times_10kB_values.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_reiterate_4_times_10kB_values.txt
new file mode 100644
index 0000000000000..c8f9ecc34770f
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_reiterate_4_times_10kB_values.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java_Dataflow_batch_GBK_6
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_6
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":200,"hotKeyFraction":1}
+--fanout=1
+--iterations=4
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_reiterate_4_times_2MB_values.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_reiterate_4_times_2MB_values.txt
new file mode 100644
index 0000000000000..653004c6a04e7
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Batch_reiterate_4_times_2MB_values.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java_Dataflow_batch_GBK_7
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_7
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":10,"hotKeyFraction":1}
+--fanout=1
+--iterations=4
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
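The dataset sizes in the java_GBK_* file names can be checked against sourceOptions as total input bytes = numRecords * (keySizeBytes + valueSizeBytes): 2GB_of_100B_records is 20,000,000 * (10 + 90) B = 2 GB, 2GB_of_10B_records is 200,000,000 * (1 + 9) B = 2 GB, and 2GB_of_100kB_records is 20,000 * (10,000 + 90,000) B = 2 GB. The fanout variants shrink the source so that the fanned-out total stays at 2 GB (fanout_4: 5,000,000 * 100 B * 4; fanout_8: 2,500,000 * 100 B * 8), and the reiterate variants keep the 2 GB input but set iterations=4 with a skewed key distribution (numHotKeys with hotKeyFraction=1).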
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_2GB_of_100B_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_2GB_of_100B_records.txt
new file mode 100644
index 0000000000000..a6523c64edbc1
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_2GB_of_100B_records.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java_Dataflow_streaming_GBK_2
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_2
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90}
+--fanout=1
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_2GB_of_100kB_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_2GB_of_100kB_records.txt
new file mode 100644
index 0000000000000..422557e84a664
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_2GB_of_100kB_records.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java_Dataflow_streaming_GBK_3
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_3
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000,"keySizeBytes":10000,"valueSizeBytes":90000}
+--fanout=1
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_2GB_of_10B_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_2GB_of_10B_records.txt
new file mode 100644
index 0000000000000..da6c04d0d284e
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_2GB_of_10B_records.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java_Dataflow_streaming_GBK_1
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_1
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":200000000,"keySizeBytes":1,"valueSizeBytes":9}
+--fanout=1
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_fanout_4_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_fanout_4_times_with_2GB_10-byte_records_total.txt
new file mode 100644
index 0000000000000..33181a9397009
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_fanout_4_times_with_2GB_10-byte_records_total.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java_Dataflow_streaming_GBK_4
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_4
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":5000000,"keySizeBytes":10,"valueSizeBytes":90}
+--fanout=4
+--iterations=1
+--numWorkers=16
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_fanout_8_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_fanout_8_times_with_2GB_10-byte_records_total.txt
new file mode 100644
index 0000000000000..32b59df2a2e0b
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_fanout_8_times_with_2GB_10-byte_records_total.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java_Dataflow_streaming_GBK_5
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_5
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":2500000,"keySizeBytes":10,"valueSizeBytes":90}
+--fanout=8
+--iterations=1
+--numWorkers=16
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_reiterate_4_times_10kB_values.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_reiterate_4_times_10kB_values.txt
new file mode 100644
index 0000000000000..376d7e3244a8c
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_reiterate_4_times_10kB_values.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java_Dataflow_streaming_GBK_6
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_6
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":200,"hotKeyFraction":1}
+--fanout=1
+--iterations=4
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_reiterate_4_times_2MB_values.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_reiterate_4_times_2MB_values.txt
new file mode 100644
index 0000000000000..b7bf9db40e825
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_Streaming_reiterate_4_times_2MB_values.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java_Dataflow_streaming_GBK_7
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_7
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":10,"hotKeyFraction":1}
+--fanout=1
+--iterations=4
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
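The Dataflow V2 variants that follow reuse the same Influx measurement names as the runs above (java_batch_gbk_N, java_streaming_gbk_N); the series are kept apart only by the influxTags option, which tags each point with runnerVersion ("v2") and jdk ("java11" or "java17").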
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_2GB_of_100B_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_2GB_of_100B_records.txt
new file mode 100644
index 0000000000000..d9b1918383b26
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_2GB_of_100B_records.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java11_Dataflow_V2_batch_GBK_2
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_2
+--influxTags={"runnerVersion":"v2","jdk":"java11"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90}
+--fanout=1
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_2GB_of_100kB_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_2GB_of_100kB_records.txt
new file mode 100644
index 0000000000000..66911feb7d173
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_2GB_of_100kB_records.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java11_Dataflow_V2_batch_GBK_3
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_3
+--influxTags={"runnerVersion":"v2","jdk":"java11"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000,"keySizeBytes":10000,"valueSizeBytes":90000}
+--fanout=1
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_2GB_of_10B_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_2GB_of_10B_records.txt
new file mode 100644
index 0000000000000..e265724ec1c16
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_2GB_of_10B_records.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java11_Dataflow_V2_batch_GBK_1
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_1
+--influxTags={"runnerVersion":"v2","jdk":"java11"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":200000000,"keySizeBytes":1,"valueSizeBytes":9}
+--fanout=1
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_fanout_4_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_fanout_4_times_with_2GB_10-byte_records_total.txt
new file mode 100644
index 0000000000000..8610a9dff9f98
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_fanout_4_times_with_2GB_10-byte_records_total.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java11_Dataflow_V2_batch_GBK_4
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_4
+--influxTags={"runnerVersion":"v2","jdk":"java11"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":5000000,"keySizeBytes":10,"valueSizeBytes":90}
+--fanout=4
+--iterations=1
+--numWorkers=16
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_fanout_8_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_fanout_8_times_with_2GB_10-byte_records_total.txt
new file mode 100644
index 0000000000000..65aa569932945
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_fanout_8_times_with_2GB_10-byte_records_total.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java11_Dataflow_V2_batch_GBK_5
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_5
+--influxTags={"runnerVersion":"v2","jdk":"java11"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":2500000,"keySizeBytes":10,"valueSizeBytes":90}
+--fanout=8
+--iterations=1
+--numWorkers=16
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_reiterate_4_times_10kB_values.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_reiterate_4_times_10kB_values.txt
new file mode 100644
index 0000000000000..aaf66ef03505c
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_reiterate_4_times_10kB_values.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java11_Dataflow_V2_batch_GBK_6
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_6
+--influxTags={"runnerVersion":"v2","jdk":"java11"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":200,"hotKeyFraction":1}
+--fanout=1
+--iterations=4
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_reiterate_4_times_2MB_values.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_reiterate_4_times_2MB_values.txt
new file mode 100644
index 0000000000000..8698ac90cac11
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java11_reiterate_4_times_2MB_values.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java11_Dataflow_V2_batch_GBK_7
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_7
+--influxTags={"runnerVersion":"v2","jdk":"java11"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":10,"hotKeyFraction":1}
+--fanout=1
+--iterations=4
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_2GB_of_100B_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_2GB_of_100B_records.txt
new file mode 100644
index 0000000000000..f5a432b6d1403
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_2GB_of_100B_records.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java17_Dataflow_V2_batch_GBK_2
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_2
+--influxTags={"runnerVersion":"v2","jdk":"java17"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90}
+--fanout=1
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_2GB_of_100kB_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_2GB_of_100kB_records.txt
new file mode 100644
index 0000000000000..976bd20f7dff2
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_2GB_of_100kB_records.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java17_Dataflow_V2_batch_GBK_3
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_3
+--influxTags={"runnerVersion":"v2","jdk":"java17"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000,"keySizeBytes":10000,"valueSizeBytes":90000}
+--fanout=1
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_2GB_of_10B_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_2GB_of_10B_records.txt
new file mode 100644
index 0000000000000..89d356201a76a
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_2GB_of_10B_records.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java17_Dataflow_V2_batch_GBK_1
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_1
+--influxTags={"runnerVersion":"v2","jdk":"java17"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":200000000,"keySizeBytes":1,"valueSizeBytes":9}
+--fanout=1
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_fanout_4_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_fanout_4_times_with_2GB_10-byte_records_total.txt
new file mode 100644
index 0000000000000..0735c9bafc45b
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_fanout_4_times_with_2GB_10-byte_records_total.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java17_Dataflow_V2_batch_GBK_4
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_4
+--influxTags={"runnerVersion":"v2","jdk":"java17"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":5000000,"keySizeBytes":10,"valueSizeBytes":90}
+--fanout=4
+--iterations=1
+--numWorkers=16
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_fanout_8_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_fanout_8_times_with_2GB_10-byte_records_total.txt
new file mode 100644
index 0000000000000..4ea7f402cc581
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_fanout_8_times_with_2GB_10-byte_records_total.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java17_Dataflow_V2_batch_GBK_5
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_5
+--influxTags={"runnerVersion":"v2","jdk":"java17"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":2500000,"keySizeBytes":10,"valueSizeBytes":90}
+--fanout=8
+--iterations=1
+--numWorkers=16
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_reiterate_4_times_10kB_values.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_reiterate_4_times_10kB_values.txt
new file mode 100644
index 0000000000000..881640714df2d
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_reiterate_4_times_10kB_values.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java17_Dataflow_V2_batch_GBK_6
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_6
+--influxTags={"runnerVersion":"v2","jdk":"java17"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":200,"hotKeyFraction":1}
+--fanout=1
+--iterations=4
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_reiterate_4_times_2MB_values.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_reiterate_4_times_2MB_values.txt
new file mode 100644
index 0000000000000..a09c16d1e66a5
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Batch_Java17_reiterate_4_times_2MB_values.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java17_Dataflow_V2_batch_GBK_7
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_gbk_7
+--influxTags={"runnerVersion":"v2","jdk":"java17"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":10,"hotKeyFraction":1}
+--fanout=1
+--iterations=4
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_2GB_of_100B_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_2GB_of_100B_records.txt
new file mode 100644
index 0000000000000..84349e3dc06ea
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_2GB_of_100B_records.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java11_Dataflow_V2_streaming_GBK_2
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_2
+--influxTags={"runnerVersion":"v2","jdk":"java11"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90}
+--fanout=1
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_2GB_of_100kB_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_2GB_of_100kB_records.txt
new file mode 100644
index 0000000000000..761fad3d11ba3
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_2GB_of_100kB_records.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java11_Dataflow_V2_streaming_GBK_3
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_3
+--influxTags={"runnerVersion":"v2","jdk":"java11"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000,"keySizeBytes":10000,"valueSizeBytes":90000}
+--fanout=1
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_2GB_of_10B_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_2GB_of_10B_records.txt
new file mode 100644
index 0000000000000..1b6624e52482c
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_2GB_of_10B_records.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java11_Dataflow_V2_streaming_GBK_1
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_1
+--influxTags={"runnerVersion":"v2","jdk":"java11"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":200000000,"keySizeBytes":1,"valueSizeBytes":9}
+--fanout=1
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_fanout_4_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_fanout_4_times_with_2GB_10-byte_records_total.txt
new file mode 100644
index 0000000000000..cf173fa845c3b
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_fanout_4_times_with_2GB_10-byte_records_total.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java11_Dataflow_V2_streaming_GBK_4
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_4
+--influxTags={"runnerVersion":"v2","jdk":"java11"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":5000000,"keySizeBytes":10,"valueSizeBytes":90}
+--fanout=4
+--iterations=1
+--numWorkers=16
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_fanout_8_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_fanout_8_times_with_2GB_10-byte_records_total.txt
new file mode 100644
index 0000000000000..8e751d638ceb0
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_fanout_8_times_with_2GB_10-byte_records_total.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java11_Dataflow_V2_streaming_GBK_5
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_5
+--influxTags={"runnerVersion":"v2","jdk":"java11"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":2500000,"keySizeBytes":10,"valueSizeBytes":90}
+--fanout=8
+--iterations=1
+--numWorkers=16
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_reiterate_4_times_10kB_values.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_reiterate_4_times_10kB_values.txt
new file mode 100644
index 0000000000000..2e9c847cb8327
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_reiterate_4_times_10kB_values.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java11_Dataflow_V2_streaming_GBK_6
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_6
+--influxTags={"runnerVersion":"v2","jdk":"java11"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":200,"hotKeyFraction":1}
+--fanout=1
+--iterations=4
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_reiterate_4_times_2MB_values.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_reiterate_4_times_2MB_values.txt
new file mode 100644
index 0000000000000..dd410a81487c2
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java11_reiterate_4_times_2MB_values.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java11_Dataflow_V2_streaming_GBK_7
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_7
+--influxTags={"runnerVersion":"v2","jdk":"java11"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":10,"hotKeyFraction":1}
+--fanout=1
+--iterations=4
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_2GB_of_100B_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_2GB_of_100B_records.txt
new file mode 100644
index 0000000000000..05d93f213ec7e
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_2GB_of_100B_records.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java17_Dataflow_V2_streaming_GBK_2
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_2
+--influxTags={"runnerVersion":"v2","jdk":"java17"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90}
+--fanout=1
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_2GB_of_100kB_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_2GB_of_100kB_records.txt
new file mode 100644
index 0000000000000..2391bfe4e416c
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_2GB_of_100kB_records.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java17_Dataflow_V2_streaming_GBK_3
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_3
+--influxTags={"runnerVersion":"v2","jdk":"java17"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000,"keySizeBytes":10000,"valueSizeBytes":90000}
+--fanout=1
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_2GB_of_10B_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_2GB_of_10B_records.txt
new file mode 100644
index 0000000000000..3c4e04e02adfa
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_2GB_of_10B_records.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java17_Dataflow_V2_streaming_GBK_1
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_1
+--influxTags={"runnerVersion":"v2","jdk":"java17"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":200000000,"keySizeBytes":1,"valueSizeBytes":9}
+--fanout=1
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_fanout_4_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_fanout_4_times_with_2GB_10-byte_records_total.txt
new file mode 100644
index 0000000000000..d43ab9d84688c
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_fanout_4_times_with_2GB_10-byte_records_total.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java17_Dataflow_V2_streaming_GBK_4
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_4
+--influxTags={"runnerVersion":"v2","jdk":"java17"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":5000000,"keySizeBytes":10,"valueSizeBytes":90}
+--fanout=4
+--iterations=1
+--numWorkers=16
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_fanout_8_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_fanout_8_times_with_2GB_10-byte_records_total.txt
new file mode 100644
index 0000000000000..fd5233180ff62
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_fanout_8_times_with_2GB_10-byte_records_total.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java17_Dataflow_V2_streaming_GBK_5
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_5
+--influxTags={"runnerVersion":"v2","jdk":"java17"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":2500000,"keySizeBytes":10,"valueSizeBytes":90}
+--fanout=8
+--iterations=1
+--numWorkers=16
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_reiterate_4_times_10kB_values.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_reiterate_4_times_10kB_values.txt
new file mode 100644
index 0000000000000..50d17886b7c91
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_reiterate_4_times_10kB_values.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java17_Dataflow_V2_streaming_GBK_6
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_6
+--influxTags={"runnerVersion":"v2","jdk":"java17"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":200,"hotKeyFraction":1}
+--fanout=1
+--iterations=4
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_reiterate_4_times_2MB_values.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_reiterate_4_times_2MB_values.txt
new file mode 100644
index 0000000000000..e28d4d895ada7
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_GBK_Dataflow_V2_Streaming_Java17_reiterate_4_times_2MB_values.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=load_tests_Java17_Dataflow_V2_streaming_GBK_7
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_gbk_7
+--influxTags={"runnerVersion":"v2","jdk":"java17"}
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90,"numHotKeys":10,"hotKeyFraction":1}
+--fanout=1
+--iterations=4
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_2GB_of_100B_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_2GB_of_100B_records.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_2GB_of_100B_records.txt
rename to .github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_2GB_of_100B_records.txt
diff --git a/.github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_2GB_of_100kB_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_2GB_of_100kB_records.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_2GB_of_100kB_records.txt
rename to .github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_2GB_of_100kB_records.txt
diff --git a/.github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_2GB_of_10B_records.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_2GB_of_10B_records.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_2GB_of_10B_records.txt
rename to .github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_2GB_of_10B_records.txt
diff --git a/.github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt
rename to .github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt
diff --git a/.github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt
rename to .github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt
diff --git a/.github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_reiterate_4_times_10kB_values.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_reiterate_4_times_10kB_values.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_reiterate_4_times_10kB_values.txt
rename to .github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_reiterate_4_times_10kB_values.txt
diff --git a/.github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_reiterate_4_times_2MB_values.txt b/.github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_reiterate_4_times_2MB_values.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/Java_GBK_SparkStructuredStreaming_Batch_reiterate_4_times_2MB_values.txt
rename to .github/workflows/load-tests-pipeline-options/java_GBK_SparkStructuredStreaming_Batch_reiterate_4_times_2MB_values.txt
diff --git a/.github/workflows/load-tests-pipeline-options/java_LoadTests_Combine_Smoke_CombineLoadTest_load_test_Dataflow-1.txt b/.github/workflows/load-tests-pipeline-options/java_LoadTests_Combine_Smoke_CombineLoadTest_load_test_Dataflow-1.txt
new file mode 100644
index 0000000000000..7cd503ad44992
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_LoadTests_Combine_Smoke_CombineLoadTest_load_test_Dataflow-1.txt
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=smoke-dsl-java
+--tempLocation=gs://temp-storage-for-perf-tests/smoketests
+--sourceOptions={"numRecords":100000,"splitPointFrequencyRecords":1}
+--stepOptions={"outputRecordsPerInputRecord":1,"preservesInputKeyDistribution":true}
+--fanout=10
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_LoadTests_Combine_Smoke_CombineLoadTest_load_test_Dataflow-2.txt b/.github/workflows/load-tests-pipeline-options/java_LoadTests_Combine_Smoke_CombineLoadTest_load_test_Dataflow-2.txt
new file mode 100644
index 0000000000000..bccffadea8d7f
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_LoadTests_Combine_Smoke_CombineLoadTest_load_test_Dataflow-2.txt
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=smoke-dsl-java
+--tempLocation=gs://temp-storage-for-perf-tests/smoketests
+--sourceOptions={"numRecords":100000,"keySizeBytes":1,"valueSizeBytes":1}
+--fanout=1
+--iterations=1
+--numWorkers=3
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_LoadTests_Combine_Smoke_CombineLoadTest_load_test_Dataflow-3.txt b/.github/workflows/load-tests-pipeline-options/java_LoadTests_Combine_Smoke_CombineLoadTest_load_test_Dataflow-3.txt
new file mode 100644
index 0000000000000..44bd342b462c3
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_LoadTests_Combine_Smoke_CombineLoadTest_load_test_Dataflow-3.txt
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--appName=smoke-dsl-java
+--tempLocation=gs://temp-storage-for-perf-tests/smoketests
+--sourceOptions={"numRecords":20000,"keySizeBytes":1,"valueSizeBytes":1}
+--fanout=10
+--iterations=1
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Batch_100_counters.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Batch_100_counters.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Batch_100_counters.txt
rename to .github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Batch_100_counters.txt
diff --git a/.github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Batch_10_counters.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Batch_10_counters.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Batch_10_counters.txt
rename to .github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Batch_10_counters.txt
diff --git a/.github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Batch_10_times.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Batch_10_times.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Batch_10_times.txt
rename to .github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Batch_10_times.txt
diff --git a/.github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Batch_200_times.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Batch_200_times.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Batch_200_times.txt
rename to .github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Batch_200_times.txt
diff --git a/.github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Streaming_100_counters.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Streaming_100_counters.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Streaming_100_counters.txt
rename to .github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Streaming_100_counters.txt
diff --git a/.github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Streaming_10_counters.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Streaming_10_counters.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Streaming_10_counters.txt
rename to .github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Streaming_10_counters.txt
diff --git a/.github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Streaming_10_times.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Streaming_10_times.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Streaming_10_times.txt
rename to .github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Streaming_10_times.txt
diff --git a/.github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Streaming_200_times.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Streaming_200_times.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/java_ParDo_Dataflow_Streaming_200_times.txt
rename to .github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_Streaming_200_times.txt
diff --git a/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Batch_JavaVersions_100_counters.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Batch_JavaVersions_100_counters.txt
new file mode 100644
index 0000000000000..7f05aedab7636
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Batch_JavaVersions_100_counters.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_pardo_4
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90}
+--iterations=1
+--numberOfCounters=1
+--numberOfCounterOperations=100
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Batch_JavaVersions_10_counters.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Batch_JavaVersions_10_counters.txt
new file mode 100644
index 0000000000000..e9c158698c57a
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Batch_JavaVersions_10_counters.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_pardo_3
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90}
+--iterations=1
+--numberOfCounters=1
+--numberOfCounterOperations=10
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Batch_JavaVersions_10_times.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Batch_JavaVersions_10_times.txt
new file mode 100644
index 0000000000000..cd6b006d8f3ae
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Batch_JavaVersions_10_times.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_pardo_1
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90}
+--iterations=10
+--numberOfCounters=1
+--numberOfCounterOperations=0
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Batch_JavaVersions_200_times.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Batch_JavaVersions_200_times.txt
new file mode 100644
index 0000000000000..d4b6bef42f787
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Batch_JavaVersions_200_times.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_batch_pardo_2
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90}
+--iterations=200
+--numberOfCounters=1
+--numberOfCounterOperations=0
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=false
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Streaming_JavaVersions_100_counters.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Streaming_JavaVersions_100_counters.txt
new file mode 100644
index 0000000000000..1dcb6f4a7cbb8
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Streaming_JavaVersions_100_counters.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_pardo_4
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90}
+--iterations=1
+--numberOfCounters=1
+--numberOfCounterOperations=100
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Streaming_JavaVersions_10_counters.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Streaming_JavaVersions_10_counters.txt
new file mode 100644
index 0000000000000..34fb4b4658d83
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Streaming_JavaVersions_10_counters.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_pardo_3
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90}
+--iterations=1
+--numberOfCounters=1
+--numberOfCounterOperations=10
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Streaming_JavaVersions_10_times.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Streaming_JavaVersions_10_times.txt
new file mode 100644
index 0000000000000..9d4d918f2e6d2
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Streaming_JavaVersions_10_times.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_pardo_1
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90}
+--iterations=10
+--numberOfCounters=1
+--numberOfCounterOperations=0
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Streaming_JavaVersions_200_times.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Streaming_JavaVersions_200_times.txt
new file mode 100644
index 0000000000000..c7d0e3e4835ac
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_ParDo_Dataflow_V2_Streaming_JavaVersions_200_times.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--tempLocation=gs://temp-storage-for-perf-tests/loadtests
+--influxMeasurement=java_streaming_pardo_2
+--publishToInfluxDB=true
+--sourceOptions={"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90}
+--iterations=200
+--numberOfCounters=1
+--numberOfCounterOperations=0
+--numWorkers=5
+--autoscalingAlgorithm=NONE
+--streaming=true
+--inputWindowDurationSec=1200
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-job-configs/java_ParDo_SparkStructuredStreaming_Batch_100_counters.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_SparkStructuredStreaming_Batch_100_counters.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/java_ParDo_SparkStructuredStreaming_Batch_100_counters.txt
rename to .github/workflows/load-tests-pipeline-options/java_ParDo_SparkStructuredStreaming_Batch_100_counters.txt
diff --git a/.github/workflows/load-tests-job-configs/java_ParDo_SparkStructuredStreaming_Batch_10_counters.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_SparkStructuredStreaming_Batch_10_counters.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/java_ParDo_SparkStructuredStreaming_Batch_10_counters.txt
rename to .github/workflows/load-tests-pipeline-options/java_ParDo_SparkStructuredStreaming_Batch_10_counters.txt
diff --git a/.github/workflows/load-tests-job-configs/java_ParDo_SparkStructuredStreaming_Batch_10_times.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_SparkStructuredStreaming_Batch_10_times.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/java_ParDo_SparkStructuredStreaming_Batch_10_times.txt
rename to .github/workflows/load-tests-pipeline-options/java_ParDo_SparkStructuredStreaming_Batch_10_times.txt
diff --git a/.github/workflows/load-tests-job-configs/java_ParDo_SparkStructuredStreaming_Batch_200_times.txt b/.github/workflows/load-tests-pipeline-options/java_ParDo_SparkStructuredStreaming_Batch_200_times.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/java_ParDo_SparkStructuredStreaming_Batch_200_times.txt
rename to .github/workflows/load-tests-pipeline-options/java_ParDo_SparkStructuredStreaming_Batch_200_times.txt
diff --git a/.github/workflows/load-tests-pipeline-options/java_Smoke_GroupByKey_Dataflow.txt b/.github/workflows/load-tests-pipeline-options/java_Smoke_GroupByKey_Dataflow.txt
new file mode 100644
index 0000000000000..411aeaeaf5869
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/java_Smoke_GroupByKey_Dataflow.txt
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +--region=us-central1 +--tempLocation=gs://temp-storage-for-perf-tests/loadtests +--sourceOptions={"numRecords":100000,"splitPointFrequencyRecords":1} +--stepOptions={"outputRecordsPerInputRecord":1,"preservesInputKeyDistribution":true} +--fanout=10 +--iterations=1 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/java_Smoke_GroupByKey_Direct.txt b/.github/workflows/load-tests-pipeline-options/java_Smoke_GroupByKey_Direct.txt new file mode 100644 index 0000000000000..a062dbc4846a1 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/java_Smoke_GroupByKey_Direct.txt @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--sourceOptions={"numRecords":100000,"splitPointFrequencyRecords":1} +--stepOptions={"outputRecordsPerInputRecord":1,"preservesInputKeyDistribution":true} +--fanout=10 +--iterations=1 \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/java_Smoke_GroupByKey_Flink.txt b/.github/workflows/load-tests-pipeline-options/java_Smoke_GroupByKey_Flink.txt new file mode 100644 index 0000000000000..bfa07b5f2dcd3 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/java_Smoke_GroupByKey_Flink.txt @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--sourceOptions={"numRecords":100000,"splitPointFrequencyRecords":1} +--stepOptions={"outputRecordsPerInputRecord":1,"preservesInputKeyDistribution":true} +--fanout=10 +--iterations=1 +--runner=FlinkRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/java_Smoke_GroupByKey_Spark.txt b/.github/workflows/load-tests-pipeline-options/java_Smoke_GroupByKey_Spark.txt new file mode 100644 index 0000000000000..9eed902195c71 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/java_Smoke_GroupByKey_Spark.txt @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--sparkMaster=local[4] +--sourceOptions={"numRecords":100000,"splitPointFrequencyRecords":1} +--stepOptions={"outputRecordsPerInputRecord":1,"preservesInputKeyDistribution":true} +--fanout=10 +--iterations=1 +--runner=SparkRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Batch_100b_Multiple_Keys.txt b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Batch_100b_Multiple_Keys.txt new file mode 100644 index 0000000000000..d5ba43180738e --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Batch_100b_Multiple_Keys.txt @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_cogbk_2 +--influx_measurement=python_batch_cogbk_2 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":5,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}'' +--co_input_options=''{\\"num_records\\":2000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":1000,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}'' +--iterations=1 +--num_workers=5 +--autoscaling_algorithm=NONE +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Batch_100b_Single_Key.txt b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Batch_100b_Single_Key.txt new file mode 100644 index 0000000000000..47ebf22dc8354 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Batch_100b_Single_Key.txt @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_cogbk_1 +--influx_measurement=python_batch_cogbk_1 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":1,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}'' +--co_input_options=''{\\"num_records\\":2000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":1000,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}'' +--iterations=1 +--num_workers=5 +--autoscaling_algorithm=NONE +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Batch_10kB.txt b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Batch_10kB.txt new file mode 100644 index 0000000000000..13161125b570e --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Batch_10kB.txt @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_cogbk_3 +--influx_measurement=python_batch_cogbk_3 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":200000,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}'' +--co_input_options=''{\\"num_records\\":2000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":1000,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}'' +--iterations=4 +--num_workers=5 +--autoscaling_algorithm=NONE +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Batch_2MB.txt b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Batch_2MB.txt new file mode 100644 index 0000000000000..052c2464a1cc3 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Batch_2MB.txt @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_dataflow_batch_cogbk_4
+--influx_measurement=python_batch_cogbk_4
+--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":1000,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}''
+--co_input_options=''{\\"num_records\\":2000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":1000,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}''
+--iterations=4
+--num_workers=5
+--autoscaling_algorithm=NONE
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Flink_Batch_100b_Multiple_Keys.txt b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Flink_Batch_100b_Multiple_Keys.txt
new file mode 100644
index 0000000000000..4b8a2f72010b5
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Flink_Batch_100b_Multiple_Keys.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
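+
+# Main input: 20M records x ~100 B (~2 GB) concentrated on 5 hot keys; the
+# 2M-record (~200 MB) co-input is joined against it via CoGroupByKey.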
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_flink_batch_cogbk_2
+--influx_measurement=python_batch_cogbk_2
+--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":5,\\"hot_key_fraction\\":1}''
+--co_input_options=''{\\"num_records\\":2000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":5,\\"hot_key_fraction\\":1}''
+--iterations=1
+--parallelism=5
+--job_endpoint=localhost:8099
+--environment_type=DOCKER
+--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Flink_Batch_100b_Single_Key.txt b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Flink_Batch_100b_Single_Key.txt
new file mode 100644
index 0000000000000..3aeb927f04eed
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Flink_Batch_100b_Single_Key.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_flink_batch_cogbk_1
+--influx_measurement=python_batch_cogbk_1
+--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":1,\\"hot_key_fraction\\":1}''
+--co_input_options=''{\\"num_records\\":2000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":1000,\\"hot_key_fraction\\":1}''
+--iterations=1
+--parallelism=5
+--job_endpoint=localhost:8099
+--environment_type=DOCKER
+--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Flink_Batch_10kB.txt b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Flink_Batch_10kB.txt
new file mode 100644
index 0000000000000..e350e2d29944d
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Flink_Batch_10kB.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_flink_batch_cogbk_3
+--influx_measurement=python_batch_cogbk_3
+--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":200000,\\"hot_key_fraction\\":1}''
+--co_input_options=''{\\"num_records\\":2000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":1000,\\"hot_key_fraction\\":1}''
+--iterations=4
+--parallelism=5
+--job_endpoint=localhost:8099
+--environment_type=DOCKER
+--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Streaming_100b_Multiple_Keys.txt b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Streaming_100b_Multiple_Keys.txt
new file mode 100644
index 0000000000000..a687f0cf5de7d
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Streaming_100b_Multiple_Keys.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_dataflow_streaming_cogbk_2
+--influx_measurement=python_streaming_cogbk_2
+--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":5,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}''
+--co_input_options=''{\\"num_records\\":2000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":1000,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}''
+--iterations=1
+--num_workers=5
+--autoscaling_algorithm=NONE
+--streaming
+--worker_machine_type=n1-highmem-4
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Streaming_100b_Single_Key.txt b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Streaming_100b_Single_Key.txt
new file mode 100644
index 0000000000000..9141182b90fc1
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Streaming_100b_Single_Key.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_dataflow_streaming_cogbk_1
+--influx_measurement=python_streaming_cogbk_1
+--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":1,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}''
+--co_input_options=''{\\"num_records\\":2000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":1000,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}''
+--iterations=1
+--num_workers=5
+--autoscaling_algorithm=NONE
+--streaming
+--worker_machine_type=n1-highmem-4
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Streaming_10kB.txt b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Streaming_10kB.txt
new file mode 100644
index 0000000000000..7250f073f25ea
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Streaming_10kB.txt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
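+
+# 20M x ~100 B records spread over 200k hot keys, i.e. roughly 10 kB of
+# values per key; runs as a streaming pipeline on n1-highmem-4 workers.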
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_dataflow_streaming_cogbk_3
+--influx_measurement=python_streaming_cogbk_3
+--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":200000,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}''
+--co_input_options=''{\\"num_records\\":2000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":1000,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}''
+--iterations=4
+--num_workers=5
+--autoscaling_algorithm=NONE
+--streaming
+--worker_machine_type=n1-highmem-4
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Streaming_2MB.txt b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Streaming_2MB.txt
new file mode 100644
index 0000000000000..59723107d53c8
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_CoGBK_Dataflow_Streaming_2MB.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_dataflow_streaming_cogbk_4
+--influx_measurement=python_streaming_cogbk_4
+--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":1000,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}''
+--co_input_options=''{\\"num_records\\":2000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":1000,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}''
+--iterations=4
+--num_workers=5
+--autoscaling_algorithm=NONE
+--streaming
+--worker_machine_type=n1-highmem-4
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_10b.txt b/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Batch_2GB_10b.txt
similarity index 77%
rename from .github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_10b.txt
rename to .github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Batch_2GB_10b.txt
index a6dabb5e50868..29b3015512833 100644
--- a/.github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_10b.txt
+++ b/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Batch_2GB_10b.txt
@@ -1,4 +1,3 @@
-###############################################################################
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements. See the NOTICE file
 # distributed with this work for additional information
@@ -13,10 +12,8 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License.
-###############################################################################
---job_name=load-tests-python-dataflow-batch-combine-1-
---project=apache-beam-testing
+# limitations under the License.
+
 --region=us-central1
 --temp_location=gs://temp-storage-for-perf-tests/smoketests
 --publish_to_big_query=true
@@ -27,6 +24,4 @@
 --num_workers=5
 --autoscaling_algorithm=NONE
 --top_count=20
---influxDatabase=beam_test_metrics
---influxHost=http://10.128.0.96:8086
 --runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_Fanout_4.txt b/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Batch_2GB_Fanout_4.txt
similarity index 79%
rename from .github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_Fanout_4.txt
rename to .github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Batch_2GB_Fanout_4.txt
index 7639456296b6d..7405d6b75a82c 100644
--- a/.github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_Fanout_4.txt
+++ b/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Batch_2GB_Fanout_4.txt
@@ -1,4 +1,3 @@
-###############################################################################
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements. See the NOTICE file
 # distributed with this work for additional information
@@ -14,9 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-###############################################################################
---job_name=load-tests-python-dataflow-batch-combine-2-
---project=apache-beam-testing
+
 --region=us-central1
 --temp_location=gs://temp-storage-for-perf-tests/smoketests
 --publish_to_big_query=true
@@ -28,6 +25,4 @@
 --autoscaling_algorithm=NONE
 --fanout=4
 --top_count=20
---influxDatabase=beam_test_metrics
---influxHost=http://10.128.0.96:8086
 --runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_Fanout_8.txt b/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Batch_2GB_Fanout_8.txt
similarity index 79%
rename from .github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_Fanout_8.txt
rename to .github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Batch_2GB_Fanout_8.txt
index e5d46791a83c1..b45c4eb1bd4ab 100644
--- a/.github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_Fanout_8.txt
+++ b/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Batch_2GB_Fanout_8.txt
@@ -1,4 +1,3 @@
-###############################################################################
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements. See the NOTICE file
 # distributed with this work for additional information
@@ -14,9 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-###############################################################################
---job_name=load-tests-python-dataflow-batch-combine-3-
---project=apache-beam-testing
+
 --region=us-central1
 --temp_location=gs://temp-storage-for-perf-tests/smoketests
 --publish_to_big_query=true
@@ -28,6 +25,4 @@
 --autoscaling_algorithm=NONE
 --fanout=8
 --top_count=20
---influxDatabase=beam_test_metrics
---influxHost=http://10.128.0.96:8086
 --runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Streaming_2GB_10_byte_records.txt b/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Streaming_2GB_10_byte_records.txt
new file mode 100644
index 0000000000000..8535f85c737d8
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Streaming_2GB_10_byte_records.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/smoketests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_dataflow_streaming_combine_1
+--influx_measurement=python_streaming_combine_1
+--input_options=''{\\"num_records\\":200000000,\\"key_size\\":1,\\"value_size\\":9,\\"algorithm\\":\\"lcg\\"}''
+--num_workers=5
+--autoscaling_algorithm=NONE
+--top_count=20
+--streaming
+--experiments=use_runner_v2
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Streaming_2GB_Fanout_4.txt b/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Streaming_2GB_Fanout_4.txt
new file mode 100644
index 0000000000000..03b26b9d4e13e
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Streaming_2GB_Fanout_4.txt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
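+
+# fanout=4 consumes the same ~500 MB synthetic input (5M x ~100 B records)
+# four times, i.e. ~2 GB of combined streaming load across 16 workers.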
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/smoketests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_dataflow_streaming_combine_4
+--influx_measurement=python_streaming_combine_4
+--input_options=''{\\"num_records\\":5000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}''
+--num_workers=16
+--autoscaling_algorithm=NONE
+--fanout=4
+--top_count=20
+--streaming
+--experiments=use_runner_v2
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Streaming_2GB_Fanout_8.txt b/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Streaming_2GB_Fanout_8.txt
new file mode 100644
index 0000000000000..46d68261342f7
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_Combine_Dataflow_Streaming_2GB_Fanout_8.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/smoketests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_dataflow_streaming_combine_5
+--influx_measurement=python_streaming_combine_5
+--input_options=''{\\"num_records\\":2500000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}''
+--num_workers=16
+--autoscaling_algorithm=NONE
+--fanout=8
+--top_count=20
+--streaming
+--experiments=use_runner_v2
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Batch_2GB_10_byte_records.txt b/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Batch_2GB_10_byte_records.txt
new file mode 100644
index 0000000000000..8295d1c8aa860
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Batch_2GB_10_byte_records.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
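+
+# The JSON quotes are double-escaped so that the value survives the shell and
+# Gradle argument passing used by the load-test workflows.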
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_flink_batch_combine_1
+--influx_measurement=python_batch_combine_1
+--input_options=''{\\"num_records\\":200000000,\\"key_size\\":1,\\"value_size\\":9,\\"algorithm\\":\\"lcg\\"}''
+--parallelism=5
+--job_endpoint=localhost:8099
+--environment_type=DOCKER
+--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest
+--top_count=20
+--runner=PortableRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Batch_2GB_Fanout_4.txt b/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Batch_2GB_Fanout_4.txt
new file mode 100644
index 0000000000000..82f8bcc7c0aee
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Batch_2GB_Fanout_4.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_flink_batch_combine_4
+--influx_measurement=python_batch_combine_4
+--input_options=''{\\"num_records\\":5000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}''
+--parallelism=16
+--job_endpoint=localhost:8099
+--environment_type=DOCKER
+--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest
+--fanout=4
+--top_count=20
+--runner=PortableRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Batch_2GB_Fanout_8.txt b/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Batch_2GB_Fanout_8.txt
new file mode 100644
index 0000000000000..45425b6bf1536
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Batch_2GB_Fanout_8.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_flink_batch_combine_5
+--influx_measurement=python_batch_combine_5
+--input_options=''{\\"num_records\\":2500000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}''
+--parallelism=16
+--job_endpoint=localhost:8099
+--environment_type=DOCKER
+--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest
+--fanout=8
+--top_count=20
+--runner=PortableRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_10_byte_records.txt b/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_10_byte_records.txt
new file mode 100644
index 0000000000000..12ffc1790e46f
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_10_byte_records.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_flink_streaming_combine_1
+--influx_measurement=python_streaming_combine_1
+--input_options=''{\\"num_records\\":200000000,\\"key_size\\":1,\\"value_size\\":9,\\"algorithm\\":\\"lcg\\"}''
+--parallelism=5
+--job_endpoint=localhost:8099
+--environment_type=DOCKER
+--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest
+--top_count=20
+--streaming
+--use_stateful_load_generator
+--runner=PortableRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_Fanout_4.txt b/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_Fanout_4.txt
new file mode 100644
index 0000000000000..c7d5552a03bd6
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_Fanout_4.txt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
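+
+# --use_stateful_load_generator selects the stateful variant of the synthetic
+# source so the input suits a streaming (--streaming) Flink run.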
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_flink_streaming_combine_4
+--influx_measurement=python_streaming_combine_4
+--input_options=''{\\"num_records\\":5000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}''
+--parallelism=16
+--job_endpoint=localhost:8099
+--environment_type=DOCKER
+--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest
+--fanout=4
+--top_count=20
+--streaming
+--use_stateful_load_generator
+--runner=PortableRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_Fanout_8.txt b/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_Fanout_8.txt
new file mode 100644
index 0000000000000..bffdeab2cb11f
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_Combine_Flink_Streaming_2GB_Fanout_8.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_flink_streaming_combine_5
+--influx_measurement=python_streaming_combine_5
+--input_options=''{\\"num_records\\":2500000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}''
+--parallelism=16
+--job_endpoint=localhost:8099
+--environment_type=DOCKER
+--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest
+--fanout=8
+--top_count=20
+--streaming
+--use_stateful_load_generator
+--runner=PortableRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-job-configs/config_FnApiRunner_Python_Microbenchmark.txt b/.github/workflows/load-tests-pipeline-options/python_FnApiRunner_Microbenchmark.txt
similarity index 100%
rename from .github/workflows/load-tests-job-configs/config_FnApiRunner_Python_Microbenchmark.txt
rename to .github/workflows/load-tests-pipeline-options/python_FnApiRunner_Microbenchmark.txt
diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_2GB_of_100B_records.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_2GB_of_100B_records.txt
new file mode 100644
index 0000000000000..ad05bf1e85d30
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_2GB_of_100B_records.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_dataflow_batch_gbk_2
+--influx_measurement=python_batch_gbk_2
+--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}''
+--iterations=1
+--fanout=1
+--num_workers=5
+--autoscaling_algorithm=NONE
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_2GB_of_100kB_records.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_2GB_of_100kB_records.txt
new file mode 100644
index 0000000000000..8d3358a12f98e
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_2GB_of_100kB_records.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_dataflow_batch_gbk_3
+--influx_measurement=python_batch_gbk_3
+--input_options=''{\\"num_records\\":20000,\\"key_size\\":10000,\\"value_size\\":90000,\\"algorithm\\":\\"lcg\\"}''
+--iterations=1
+--fanout=1
+--num_workers=5
+--autoscaling_algorithm=NONE
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_2GB_of_10B_records.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_2GB_of_10B_records.txt
new file mode 100644
index 0000000000000..885c5ca61954c
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_2GB_of_10B_records.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_dataflow_batch_gbk_1
+--influx_measurement=python_batch_gbk_1
+--input_options=''{\\"num_records\\":200000000,\\"key_size\\":1,\\"value_size\\":9,\\"algorithm\\":\\"lcg\\"}''
+--iterations=1
+--fanout=1
+--num_workers=5
+--autoscaling_algorithm=NONE
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt
new file mode 100644
index 0000000000000..1663e646f542a
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_dataflow_batch_gbk_4
+--influx_measurement=python_batch_gbk_4
+--input_options=''{\\"num_records\\":5000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}''
+--iterations=1
+--fanout=4
+--num_workers=16
+--autoscaling_algorithm=NONE
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt
new file mode 100644
index 0000000000000..4a1768c9d17dd
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_dataflow_batch_gbk_5
+--influx_measurement=python_batch_gbk_5
+--input_options=''{\\"num_records\\":2500000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}''
+--iterations=1
+--fanout=8
+--num_workers=16
+--autoscaling_algorithm=NONE
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Streaming_2GB_of_100B_records.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Streaming_2GB_of_100B_records.txt
new file mode 100644
index 0000000000000..057f71d5627c8
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Streaming_2GB_of_100B_records.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_dataflow_streaming_gbk_2
+--influx_measurement=python_streaming_gbk_2
+--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}''
+--iterations=1
+--fanout=1
+--num_workers=5
+--autoscaling_algorithm=NONE
+--streaming
+--experiments=use_runner_v2
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Streaming_2GB_of_100kB_records.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Streaming_2GB_of_100kB_records.txt
new file mode 100644
index 0000000000000..57c1be11d592e
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Streaming_2GB_of_100kB_records.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_dataflow_streaming_gbk_3
+--influx_measurement=python_streaming_gbk_3
+--input_options=''{\\"num_records\\":20000,\\"key_size\\":10000,\\"value_size\\":90000,\\"algorithm\\":\\"lcg\\"}''
+--iterations=1
+--fanout=1
+--num_workers=5
+--autoscaling_algorithm=NONE
+--streaming
+--experiments=use_runner_v2
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Streaming_2GB_of_10B_records.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Streaming_2GB_of_10B_records.txt
new file mode 100644
index 0000000000000..64d224a4663f1
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Streaming_2GB_of_10B_records.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_dataflow_streaming_gbk_1
+--influx_measurement=python_streaming_gbk_1
+--input_options=''{\\"num_records\\":200000000,\\"key_size\\":1,\\"value_size\\":9,\\"algorithm\\":\\"lcg\\"}''
+--iterations=1
+--fanout=1
+--num_workers=5
+--autoscaling_algorithm=NONE
+--streaming
+--experiments=use_runner_v2
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Streaming_fanout_4_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Streaming_fanout_4_times_with_2GB_10-byte_records_total.txt
new file mode 100644
index 0000000000000..8e38713cc66de
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Streaming_fanout_4_times_with_2GB_10-byte_records_total.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_dataflow_streaming_gbk_4
+--influx_measurement=python_streaming_gbk_4
+--input_options=''{\\"num_records\\":5000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}''
+--iterations=1
+--fanout=4
+--num_workers=16
+--autoscaling_algorithm=NONE
+--streaming
+--experiments=use_runner_v2
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Streaming_fanout_8_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Streaming_fanout_8_times_with_2GB_10-byte_records_total.txt
new file mode 100644
index 0000000000000..35508480662c3
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Dataflow_Streaming_fanout_8_times_with_2GB_10-byte_records_total.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_dataflow_streaming_gbk_5
+--influx_measurement=python_streaming_gbk_5
+--input_options=''{\\"num_records\\":2500000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}''
+--iterations=1
+--fanout=8
+--num_workers=16
+--autoscaling_algorithm=NONE
+--streaming
+--experiments=use_runner_v2
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_2GB_of_100B_records.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_2GB_of_100B_records.txt
new file mode 100644
index 0000000000000..4cb5bfb0d9885
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_2GB_of_100B_records.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_flink_batch_GBK_2
+--influx_measurement=python_batch_gbk_2
+--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90}''
+--iterations=1
+--fanout=4
+--parallelism=5
+--job_endpoint=localhost:8099
+--environment_type=DOCKER
+--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest
+--runner=PortableRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_2GB_of_10B_records.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_2GB_of_10B_records.txt
new file mode 100644
index 0000000000000..2427e21cde454
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_2GB_of_10B_records.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_flink_batch_GBK_1
+--influx_measurement=python_batch_gbk_1
+--input_options=''{\\"num_records\\":200000000,\\"key_size\\":1,\\"value_size\\":9}''
+--iterations=1
+--fanout=1
+--parallelism=5
+--job_endpoint=localhost:8099
+--environment_type=DOCKER
+--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest
+--runner=PortableRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt
new file mode 100644
index 0000000000000..bf9085141eab8
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_fanout_4_times_with_2GB_10-byte_records_total.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_flink_batch_GBK_4
+--influx_measurement=python_batch_gbk_4
+--input_options=''{\\"num_records\\":5000000,\\"key_size\\":10,\\"value_size\\":90}''
+--iterations=1
+--fanout=4
+--parallelism=16
+--job_endpoint=localhost:8099
+--environment_type=DOCKER
+--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest
+--runner=PortableRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt
new file mode 100644
index 0000000000000..a59f873eb775e
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_fanout_8_times_with_2GB_10-byte_records_total.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--publish_to_big_query=true
+--metrics_dataset=load_test
+--metrics_table=python_flink_batch_GBK_5
+--influx_measurement=python_batch_gbk_5
+--input_options=''{\\"num_records\\":2500000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}''
+--iterations=1
+--fanout=8
+--parallelism=16
+--job_endpoint=localhost:8099
+--environment_type=DOCKER
+--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest
+--runner=PortableRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_reiterate_4_times_10kB_values.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_reiterate_4_times_10kB_values.txt
new file mode 100644
index 0000000000000..0e5d00b961519
--- /dev/null
+++ b/.github/workflows/load-tests-pipeline-options/python_GBK_Flink_Batch_reiterate_4_times_10kB_values.txt
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
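+# iterations=4 re-reads each grouped value set four times to measure the
+# cost of reiterating GBK results.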
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_flink_batch_GBK_6 +--influx_measurement=python_batch_gbk_6 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":200,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}'' +--iterations=4 +--fanout=1 +--parallelism=5 +--job_endpoint=localhost:8099 +--environment_type=DOCKER +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_reiterate_Dataflow_Batch_reiterate_4_times_10kB_values.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_reiterate_Dataflow_Batch_reiterate_4_times_10kB_values.txt new file mode 100644 index 0000000000000..fb5a3db9e6bb3 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_GBK_reiterate_Dataflow_Batch_reiterate_4_times_10kB_values.txt @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_gbk_6 +--influx_measurement=python_batch_gbk_6 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":200,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}'' +--iterations=4 +--fanout=1 +--num_workers=5 +--autoscaling_algorithm=NONE +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_reiterate_Dataflow_Batch_reiterate_4_times_2MB_values.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_reiterate_Dataflow_Batch_reiterate_4_times_2MB_values.txt new file mode 100644 index 0000000000000..b4b46682869ae --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_GBK_reiterate_Dataflow_Batch_reiterate_4_times_2MB_values.txt @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_gbk_7 +--influx_measurement=python_batch_gbk_7 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":10,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}'' +--iterations=4 +--fanout=1 +--num_workers=5 +--autoscaling_algorithm=NONE +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_reiterate_Dataflow_Streaming_reiterate_4_times_10kB_values.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_reiterate_Dataflow_Streaming_reiterate_4_times_10kB_values.txt new file mode 100644 index 0000000000000..6cb1e68aeafc5 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_GBK_reiterate_Dataflow_Streaming_reiterate_4_times_10kB_values.txt @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_streaming_gbk_6 +--influx_measurement=python_streaming_gbk_6 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":200,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}'' +--iterations=4 +--fanout=1 +--num_workers=5 +--autoscaling_algorithm=NONE +--streaming +--experiments=use_runner_v2 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_GBK_reiterate_Dataflow_Streaming_reiterate_4_times_2MB_values.txt b/.github/workflows/load-tests-pipeline-options/python_GBK_reiterate_Dataflow_Streaming_reiterate_4_times_2MB_values.txt new file mode 100644 index 0000000000000..712749090aec7 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_GBK_reiterate_Dataflow_Streaming_reiterate_4_times_2MB_values.txt @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_streaming_gbk_7 +--influx_measurement=python_streaming_gbk_7 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"num_hot_keys\\":10,\\"hot_key_fraction\\":1,\\"algorithm\\":\\"lcg\\"}'' +--iterations=4 +--fanout=1 +--num_workers=5 +--autoscaling_algorithm=NONE +--streaming +--experiments=use_runner_v2 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Batch_100_Counters.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Batch_100_Counters.txt new file mode 100644 index 0000000000000..a5bb7979b86b4 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Batch_100_Counters.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_pardo_4 +--influx_measurement=python_batch_pardo_4 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}'' +--iterations=1 +--number_of_counter_operations=100 +--number_of_counters=1 +--num_workers=5 +--autoscaling_algorithm=NONE +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Batch_10_Counters.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Batch_10_Counters.txt new file mode 100644 index 0000000000000..7e35ef74dfa1e --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Batch_10_Counters.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_pardo_3 +--influx_measurement=python_batch_pardo_3 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}'' +--iterations=1 +--number_of_counter_operations=10 +--number_of_counters=1 +--num_workers=5 +--autoscaling_algorithm=NONE +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Batch_10_Iterations.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Batch_10_Iterations.txt new file mode 100644 index 0000000000000..734360397c9b4 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Batch_10_Iterations.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_pardo_1 +--influx_measurement=python_batch_pardo_1 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}'' +--iterations=10 +--number_of_counter_operations=0 +--number_of_counters=0 +--num_workers=5 +--autoscaling_algorithm=NONE +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Batch_200_Iterations.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Batch_200_Iterations.txt new file mode 100644 index 0000000000000..825fee427a31a --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Batch_200_Iterations.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_pardo_2 +--influx_measurement=python_batch_pardo_2 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}'' +--iterations=200 +--number_of_counter_operations=0 +--number_of_counters=0 +--num_workers=5 +--autoscaling_algorithm=NONE +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Streaming_100_Counters.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Streaming_100_Counters.txt new file mode 100644 index 0000000000000..71fc818d0e070 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Streaming_100_Counters.txt @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_streaming_pardo_4 +--influx_measurement=python_streaming_pardo_4 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}'' +--iterations=1 +--number_of_counter_operations=100 +--number_of_counters=1 +--num_workers=5 +--autoscaling_algorithm=NONE +--streaming +--experiments=use_runner_v2 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Streaming_10_Counters.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Streaming_10_Counters.txt new file mode 100644 index 0000000000000..fe48dedfa8373 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Streaming_10_Counters.txt @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_streaming_pardo_3 +--influx_measurement=python_streaming_pardo_3 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}'' +--iterations=1 +--number_of_counter_operations=10 +--number_of_counters=1 +--num_workers=5 +--autoscaling_algorithm=NONE +--streaming +--experiments=use_runner_v2 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Streaming_10_Iterations.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Streaming_10_Iterations.txt new file mode 100644 index 0000000000000..84d2cdb7a85bd --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Streaming_10_Iterations.txt @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_streaming_pardo_1 +--influx_measurement=python_streaming_pardo_1 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}'' +--iterations=10 +--number_of_counter_operations=0 +--number_of_counters=0 +--num_workers=5 +--autoscaling_algorithm=NONE +--streaming +--experiments=use_runner_v2 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Streaming_200_Iterations.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Streaming_200_Iterations.txt new file mode 100644 index 0000000000000..02636335cb372 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Dataflow_Streaming_200_Iterations.txt @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_streaming_pardo_2 +--influx_measurement=python_streaming_pardo_2 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}'' +--iterations=200 +--number_of_counter_operations=0 +--number_of_counters=0 +--num_workers=5 +--autoscaling_algorithm=NONE +--streaming +--experiments=use_runner_v2 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_10_Counters.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_10_Counters.txt new file mode 100644 index 0000000000000..4d8bda8ac2f8c --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_10_Counters.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_flink_batch_pardo_3 +--influx_measurement=python_batch_pardo_3 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}'' +--iterations=1 +--number_of_counter_operations=10 +--number_of_counters=1 +--parallelism=5 +--job_endpoint=localhost:8099 +--environment_type=DOCKER +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_10_Iterations.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_10_Iterations.txt new file mode 100644 index 0000000000000..e84cee2f50cf2 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_10_Iterations.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_flink_batch_pardo_1 +--influx_measurement=python_batch_pardo_1 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}'' +--iterations=10 +--number_of_counter_operations=0 +--number_of_counters=0 +--parallelism=5 +--job_endpoint=localhost:8099 +--environment_type=DOCKER +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_200_Iterations.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_200_Iterations.txt new file mode 100644 index 0000000000000..4d8bda8ac2f8c --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Batch_200_Iterations.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_flink_batch_pardo_2 +--influx_measurement=python_batch_pardo_2 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}'' +--iterations=200 +--number_of_counter_operations=0 +--number_of_counters=0 +--parallelism=5 +--job_endpoint=localhost:8099 +--environment_type=DOCKER +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_100_Counters.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_100_Counters.txt new file mode 100644 index 0000000000000..b17e2cecc2c80 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_100_Counters.txt @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_flink_streaming_pardo_4 +--influx_measurement=python_streaming_pardo_4 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}'' +--iterations=1 +--number_of_counter_operations=100 +--number_of_counters=1 +--parallelism=5 +--streaming +--job_endpoint=localhost:8099 +--environment_type=DOCKER +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--use_stateful_load_generator +--runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_10_Counters.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_10_Counters.txt new file mode 100644 index 0000000000000..957bc6c086d82 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_10_Counters.txt @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_flink_streaming_pardo_3 +--influx_measurement=python_streaming_pardo_3 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}'' +--iterations=1 +--number_of_counter_operations=10 +--number_of_counters=1 +--parallelism=5 +--streaming +--job_endpoint=localhost:8099 +--environment_type=DOCKER +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--use_stateful_load_generator +--runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_10_Iterations.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_10_Iterations.txt new file mode 100644 index 0000000000000..baa34ec455b50 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_10_Iterations.txt @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_flink_streaming_pardo_5 +--influx_measurement=python_streaming_pardo_1 +--input_options=''{\\"num_records\\":2000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}'' +--iterations=10 +--number_of_counter_operations=0 +--number_of_counters=0 +--parallelism=5 +--streaming +--stateful +--job_endpoint=localhost:8099 +--environment_type=DOCKER +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--use_stateful_load_generator +--runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_200_Iterations.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_200_Iterations.txt new file mode 100644 index 0000000000000..44483a6e51ccf --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_200_Iterations.txt @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_flink_streaming_pardo_2 +--influx_measurement=python_streaming_pardo_2 +--input_options=''{\\"num_records\\":20000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}'' +--iterations=200 +--number_of_counter_operations=0 +--number_of_counters=0 +--parallelism=5 +--streaming +--job_endpoint=localhost:8099 +--environment_type=DOCKER +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--use_stateful_load_generator +--runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_5_Iterations.txt b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_5_Iterations.txt new file mode 100644 index 0000000000000..571b33fb7a490 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_ParDo_Flink_Streaming_5_Iterations.txt @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_flink_streaming_pardo_6 +--influx_measurement=python_streaming_pardo_6 +--input_options=''{\\"num_records\\":2000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}'' +--iterations=5 +--number_of_counter_operations=10 +--number_of_counters=3 +--parallelism=5 +--streaming +--stateful +--checkpointing_interval=10000 +--report_checkpoint_duration=python_flink_streaming_pardo_6 +--shutdown_sources_after_idle_ms=300000 +--job_endpoint=localhost:8099 +--environment_type=DOCKER +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_python3.8_sdk:latest +--use_stateful_load_generator +--runner=PortableRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_10gb_1000window_first_iterable.txt b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_10gb_1000window_first_iterable.txt new file mode 100644 index 0000000000000..204c07bc16a28 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_10gb_1000window_first_iterable.txt @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_sideinput_9 +--influx_measurement=python_batch_sideinput_9 +--num_workers=10 +--autoscaling_algorithm=NONE +--experiments=use_runner_v2 +--input_options=''{\\"num_records\\":10000000,\\"key_size\\":100,\\"value_size\\":900,\\"algorithm\\":\\"lcg\\"}'' +--side_input_type=iter +--access_percentage=1 +--window_count=1000 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_10gb_1000window_iterable.txt b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_10gb_1000window_iterable.txt new file mode 100644 index 0000000000000..1ae64bb4a3697 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_10gb_1000window_iterable.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_sideinput_10 +--influx_measurement=python_batch_sideinput_10 +--num_workers=10 +--autoscaling_algorithm=NONE +--experiments=use_runner_v2 +--input_options=''{\\"num_records\\":10000000,\\"key_size\\":100,\\"value_size\\":900,\\"algorithm\\":\\"lcg\\"}'' +--side_input_type=iter +--window_count=1000 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_10gb_1window_first_iterable.txt b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_10gb_1window_first_iterable.txt new file mode 100644 index 0000000000000..0759517d9c2ed --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_10gb_1window_first_iterable.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_sideinput_3 +--influx_measurement=python_batch_sideinput_3 +--num_workers=10 +--autoscaling_algorithm=NONE +--experiments=use_runner_v2 +--input_options=''{\\"num_records\\":10000000,\\"key_size\\":100,\\"value_size\\":900,\\"algorithm\\":\\"lcg\\"}'' +--side_input_type=iter +--access_percentage=1 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_10gb_1window_iterable.txt b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_10gb_1window_iterable.txt new file mode 100644 index 0000000000000..c555c0d32d4c3 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_10gb_1window_iterable.txt @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_sideinput_4 +--influx_measurement=python_batch_sideinput_4 +--num_workers=10 +--autoscaling_algorithm=NONE +--experiments=use_runner_v2 +--input_options=''{\\"num_records\\":10000000,\\"key_size\\":100,\\"value_size\\":900,\\"algorithm\\":\\"lcg\\"}'' +--side_input_type=iter +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1000window_1key_percent_dict.txt b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1000window_1key_percent_dict.txt new file mode 100644 index 0000000000000..4b3cee817f430 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1000window_1key_percent_dict.txt @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_sideinput_7 +--influx_measurement=python_batch_sideinput_7 +--num_workers=10 +--autoscaling_algorithm=NONE +--experiments=use_runner_v2 +--input_options=''{\\"num_records\\":1000000,\\"key_size\\":100,\\"value_size\\":900,\\"algorithm\\":\\"lcg\\"}'' +--side_input_type=dict +--access_percentage=1 +--window_count=1000 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1000window_99key_percent_dict.txt b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1000window_99key_percent_dict.txt new file mode 100644 index 0000000000000..00ba6feef50cc --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1000window_99key_percent_dict.txt @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_sideinput_8 +--influx_measurement=python_batch_sideinput_8 +--num_workers=10 +--autoscaling_algorithm=NONE +--experiments=use_runner_v2 +--input_options=''{\\"num_records\\":1000000,\\"key_size\\":100,\\"value_size\\":900,\\"algorithm\\":\\"lcg\\"}'' +--side_input_type=dict +--access_percentage=99 +--window_count=1000 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1window_1key_percent_dict.txt b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1window_1key_percent_dict.txt new file mode 100644 index 0000000000000..07e4a5ecba621 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1window_1key_percent_dict.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_sideinput_1 +--influx_measurement=python_batch_sideinput_1 +--num_workers=10 +--autoscaling_algorithm=NONE +--experiments=use_runner_v2 +--input_options=''{\\"num_records\\":1000000,\\"key_size\\":100,\\"value_size\\":900,\\"algorithm\\":\\"lcg\\"}'' +--side_input_type=dict +--access_percentage=1 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1window_99key_percent_dict.txt b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1window_99key_percent_dict.txt new file mode 100644 index 0000000000000..b565598c1a167 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1window_99key_percent_dict.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_sideinput_2 +--influx_measurement=python_batch_sideinput_2 +--num_workers=10 +--autoscaling_algorithm=NONE +--experiments=use_runner_v2 +--input_options=''{\\"num_records\\":1000000,\\"key_size\\":100,\\"value_size\\":900,\\"algorithm\\":\\"lcg\\"}'' +--side_input_type=dict +--access_percentage=99 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1window_first_list.txt b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1window_first_list.txt new file mode 100644 index 0000000000000..de5b02198177b --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1window_first_list.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_sideinput_5 +--influx_measurement=python_batch_sideinput_5 +--num_workers=10 +--autoscaling_algorithm=NONE +--experiments=use_runner_v2 +--input_options=''{\\"num_records\\":1000000,\\"key_size\\":100,\\"value_size\\":900,\\"algorithm\\":\\"lcg\\"}'' +--side_input_type=list +--access_percentage=1 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1window_list.txt b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1window_list.txt new file mode 100644 index 0000000000000..078ce2cc74fec --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_SideInput_Dataflow_Batch_1gb_1window_list.txt @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_sideinput_6 +--influx_measurement=python_batch_sideinput_6 +--num_workers=10 +--autoscaling_algorithm=NONE +--experiments=use_runner_v2 +--input_options=''{\\"num_records\\":1000000,\\"key_size\\":100,\\"value_size\\":900,\\"algorithm\\":\\"lcg\\"}'' +--side_input_type=list +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_Smoke_GroupByKey_Dataflow.txt b/.github/workflows/load-tests-pipeline-options/python_Smoke_GroupByKey_Dataflow.txt new file mode 100644 index 0000000000000..9a069df2bd1c5 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_Smoke_GroupByKey_Dataflow.txt @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test_SMOKE +--metrics_table=python_dataflow_gbk +--input_options=''{\\"num_records\\":100000,\\"key_size\\":1,\\"value_size\\":1}'' +--max_num_workers=1 \ No newline at end of file diff --git a/.github/workflows/load-tests-pipeline-options/python_Smoke_GroupByKey_Direct.txt b/.github/workflows/load-tests-pipeline-options/python_Smoke_GroupByKey_Direct.txt new file mode 100644 index 0000000000000..7490675e43831 --- /dev/null +++ b/.github/workflows/load-tests-pipeline-options/python_Smoke_GroupByKey_Direct.txt @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--publish_to_big_query=true +--metrics_dataset=load_test_SMOKE +--metrics_table=python_direct_gbk +--input_options=''{\\"num_records\\":100000,\\"key_size\\":1,\\"value_size\\":1}'' +--max_num_workers=1 \ No newline at end of file diff --git a/.github/workflows/performance-tests-job-configs/config_BigQueryIO_Batch_Java_Avro.txt b/.github/workflows/performance-tests-job-configs/config_BigQueryIO_Batch_Java_Avro.txt deleted file mode 100644 index 5e7e53821231c..0000000000000 --- a/.github/workflows/performance-tests-job-configs/config_BigQueryIO_Batch_Java_Avro.txt +++ /dev/null @@ -1,38 +0,0 @@ -############################################################################### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-############################################################################### -'["--tempLocation=gs://temp-storage-for-perf-tests/loadtests", -"--project=apache-beam-testing", -"--tempRoot=gs://temp-storage-for-perf-tests/loadtests", -"--writeMethod=FILE_LOADS", -"--writeFormat=AVRO", -"--testBigQueryDataset=beam_performance", -"--testBigQueryTable=bqio_write_10GB_java_avro_", -"--metricsBigQueryDataset=beam_performance", -"--metricsBigQueryTable=bqio_10GB_results_java_batch_avro", -"--influxMeasurement=bqio_10GB_results_java_batch_avro", -"--sourceOptions={ -\"numRecords\":\"10485760\", -\"keySizeBytes\":\"1\", -\"valueSizeBytes\":\"1024\" -}", -"--runner=DataflowRunner", -"--maxNumWorkers=5", -"--numWorkers=5", -"--autoscalingAlgorithm=NONE", -"--influxDatabase=beam_test_metrics", -"--influxHost=http://10.128.0.96:8086"]' \ No newline at end of file diff --git a/.github/workflows/performance-tests-job-configs/config_BigQueryIO_Batch_Java_Json.txt b/.github/workflows/performance-tests-job-configs/config_BigQueryIO_Batch_Java_Json.txt deleted file mode 100644 index 7bd9c30ae7380..0000000000000 --- a/.github/workflows/performance-tests-job-configs/config_BigQueryIO_Batch_Java_Json.txt +++ /dev/null @@ -1,38 +0,0 @@ -############################################################################### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -############################################################################### -'["--tempLocation=gs://temp-storage-for-perf-tests/loadtests", -"--project=apache-beam-testing", -"--tempRoot=gs://temp-storage-for-perf-tests/loadtests", -"--writeMethod=FILE_LOADS", -"--writeFormat=JSON", -"--testBigQueryDataset=beam_performance", -"--testBigQueryTable=bqio_write_10GB_java_json_", -"--metricsBigQueryDataset=beam_performance", -"--metricsBigQueryTable=bqio_10GB_results_java_batch_json", -"--influxMeasurement=bqio_10GB_results_java_batch_json", -"--sourceOptions={ -\"numRecords\":\"10485760\", -\"keySizeBytes\":\"1\", -\"valueSizeBytes\":\"1024\" -}", -"--runner=DataflowRunner", -"--maxNumWorkers=5", -"--numWorkers=5", -"--autoscalingAlgorithm=NONE", -"--influxDatabase=beam_test_metrics", -"--influxHost=http://10.128.0.96:8086"]' \ No newline at end of file diff --git a/.github/workflows/performance-tests-job-configs/config_BigQueryIO_Streaming_Java.txt b/.github/workflows/performance-tests-job-configs/config_BigQueryIO_Streaming_Java.txt deleted file mode 100644 index 8bddea5fcb8bd..0000000000000 --- a/.github/workflows/performance-tests-job-configs/config_BigQueryIO_Streaming_Java.txt +++ /dev/null @@ -1,39 +0,0 @@ -############################################################################### -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -############################################################################### -'["--tempLocation=gs://temp-storage-for-perf-tests/loadtests", -"--project=apache-beam-testing", -"--tempRoot=gs://temp-storage-for-perf-tests/loadtests", -"--writeMethod=STREAMING_INSERTS", -"--writeFormat=JSON", -"--pipelineTimeout=1200", -"--testBigQueryDataset=beam_performance", -"--testBigQueryTable=bqio_write_10GB_java_stream_", -"--metricsBigQueryDataset=beam_performance", -"--metricsBigQueryTable=bqio_10GB_results_java_stream", -"--influxMeasurement=bqio_10GB_results_java_stream", -"--sourceOptions={ -\"numRecords\":\"10485760\", -\"keySizeBytes\":\"1\", -\"valueSizeBytes\":\"1024\" -}", -"--runner=DataflowRunner", -"--maxNumWorkers=5", -"--numWorkers=5", -"--autoscalingAlgorithm=NONE", -"--influxDatabase=beam_test_metrics", -"--influxHost=http://10.128.0.96:8086"]' \ No newline at end of file diff --git a/.github/workflows/performance-tests-job-configs/JDBC.txt b/.github/workflows/performance-tests-pipeline-options/JDBC.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/JDBC.txt rename to .github/workflows/performance-tests-pipeline-options/JDBC.txt diff --git a/.github/workflows/performance-tests-job-configs/SQLBigQueryIO_Batch_Java.txt b/.github/workflows/performance-tests-pipeline-options/SQLBigQueryIO_Batch_Java.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/SQLBigQueryIO_Batch_Java.txt rename to .github/workflows/performance-tests-pipeline-options/SQLBigQueryIO_Batch_Java.txt diff --git a/.github/workflows/performance-tests-job-configs/config_TFRecordIOIT.txt b/.github/workflows/performance-tests-pipeline-options/TFRecordIOIT.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/config_TFRecordIOIT.txt rename to .github/workflows/performance-tests-pipeline-options/TFRecordIOIT.txt diff --git a/.github/workflows/performance-tests-job-configs/TFRecordIOIT_HDFS.txt b/.github/workflows/performance-tests-pipeline-options/TFRecordIOIT_HDFS.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/TFRecordIOIT_HDFS.txt rename to .github/workflows/performance-tests-pipeline-options/TFRecordIOIT_HDFS.txt diff --git a/.github/workflows/performance-tests-job-configs/config_AvroIOIT.txt b/.github/workflows/performance-tests-pipeline-options/avroIOIT.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/config_AvroIOIT.txt rename to .github/workflows/performance-tests-pipeline-options/avroIOIT.txt diff --git a/.github/workflows/performance-tests-job-configs/config_AvroIOIT_HDFS.txt b/.github/workflows/performance-tests-pipeline-options/avroIOIT_HDFS.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/config_AvroIOIT_HDFS.txt rename to 
.github/workflows/performance-tests-pipeline-options/avroIOIT_HDFS.txt diff --git a/.github/workflows/performance-tests-pipeline-options/bigQueryIO_Batch_Java_Avro.txt b/.github/workflows/performance-tests-pipeline-options/bigQueryIO_Batch_Java_Avro.txt new file mode 100644 index 0000000000000..922114e63d0e0 --- /dev/null +++ b/.github/workflows/performance-tests-pipeline-options/bigQueryIO_Batch_Java_Avro.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--tempLocation=gs://temp-storage-for-perf-tests/loadtests +--tempRoot=gs://temp-storage-for-perf-tests/loadtests +--writeMethod=FILE_LOADS +--writeFormat=AVRO +--testBigQueryDataset=beam_performance +--metricsBigQueryDataset=beam_performance +--metricsBigQueryTable=bqio_10GB_results_java_batch_avro +--influxMeasurement=bqio_10GB_results_java_batch_avro +--sourceOptions={\"numRecords\":\"10485760\",\"keySizeBytes\":\"1\",\"valueSizeBytes\":\"1024\"} +--runner=DataflowRunner +--maxNumWorkers=5 +--numWorkers=5 +--autoscalingAlgorithm=NONE \ No newline at end of file diff --git a/.github/workflows/performance-tests-pipeline-options/bigQueryIO_Batch_Java_Json.txt b/.github/workflows/performance-tests-pipeline-options/bigQueryIO_Batch_Java_Json.txt new file mode 100644 index 0000000000000..627368ec0e24a --- /dev/null +++ b/.github/workflows/performance-tests-pipeline-options/bigQueryIO_Batch_Java_Json.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--tempLocation=gs://temp-storage-for-perf-tests/loadtests +--tempRoot=gs://temp-storage-for-perf-tests/loadtests +--writeMethod=FILE_LOADS +--writeFormat=JSON +--testBigQueryDataset=beam_performance +--metricsBigQueryDataset=beam_performance +--metricsBigQueryTable=bqio_10GB_results_java_batch_json +--influxMeasurement=bqio_10GB_results_java_batch_json +--sourceOptions={\"numRecords\":\"10485760\",\"keySizeBytes\":\"1\",\"valueSizeBytes\":\"1024\"} +--runner=DataflowRunner +--maxNumWorkers=5 +--numWorkers=5 +--autoscalingAlgorithm=NONE \ No newline at end of file diff --git a/.github/workflows/performance-tests-pipeline-options/bigQueryIO_Streaming_Java.txt b/.github/workflows/performance-tests-pipeline-options/bigQueryIO_Streaming_Java.txt new file mode 100644 index 0000000000000..99bd30e25994b --- /dev/null +++ b/.github/workflows/performance-tests-pipeline-options/bigQueryIO_Streaming_Java.txt @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--tempLocation=gs://temp-storage-for-perf-tests/loadtests +--tempRoot=gs://temp-storage-for-perf-tests/loadtests +--writeMethod=STREAMING_INSERTS +--writeFormat=JSON +--pipelineTimeout=1200 +--testBigQueryDataset=beam_performance +--metricsBigQueryDataset=beam_performance +--metricsBigQueryTable=bqio_10GB_results_java_stream +--influxMeasurement=bqio_10GB_results_java_stream +--sourceOptions={\"numRecords\":\"10485760\",\"keySizeBytes\":\"1\",\"valueSizeBytes\":\"1024\"} +--runner=DataflowRunner +--maxNumWorkers=5 +--numWorkers=5 +--autoscalingAlgorithm=NONE \ No newline at end of file diff --git a/.github/workflows/performance-tests-job-configs/biqQueryIO_Read_Python.txt b/.github/workflows/performance-tests-pipeline-options/biqQueryIO_Read_Python.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/biqQueryIO_Read_Python.txt rename to .github/workflows/performance-tests-pipeline-options/biqQueryIO_Read_Python.txt diff --git a/.github/workflows/performance-tests-job-configs/biqQueryIO_Write_Python_Batch.txt b/.github/workflows/performance-tests-pipeline-options/biqQueryIO_Write_Python_Batch.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/biqQueryIO_Write_Python_Batch.txt rename to .github/workflows/performance-tests-pipeline-options/biqQueryIO_Write_Python_Batch.txt diff --git a/.github/workflows/performance-tests-job-configs/cdap.txt b/.github/workflows/performance-tests-pipeline-options/cdap.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/cdap.txt rename to .github/workflows/performance-tests-pipeline-options/cdap.txt diff --git a/.github/workflows/performance-tests-job-configs/config_Compressed_TextIOIT.txt b/.github/workflows/performance-tests-pipeline-options/compressed_TextIOIT.txt 
similarity index 100% rename from .github/workflows/performance-tests-job-configs/config_Compressed_TextIOIT.txt rename to .github/workflows/performance-tests-pipeline-options/compressed_TextIOIT.txt diff --git a/.github/workflows/performance-tests-job-configs/config_Compressed_TextIOIT_HDFS.txt b/.github/workflows/performance-tests-pipeline-options/compressed_TextIOIT_HDFS.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/config_Compressed_TextIOIT_HDFS.txt rename to .github/workflows/performance-tests-pipeline-options/compressed_TextIOIT_HDFS.txt diff --git a/.github/workflows/performance-tests-pipeline-options/config_PerformanceTests_SingleStoreIO.txt b/.github/workflows/performance-tests-pipeline-options/config_PerformanceTests_SingleStoreIO.txt new file mode 100644 index 0000000000000..909042791c9fe --- /dev/null +++ b/.github/workflows/performance-tests-pipeline-options/config_PerformanceTests_SingleStoreIO.txt @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--tempRoot=gs://temp-storage-for-perf-tests +--runner=DataflowRunner +--singleStoreUsername=admin +--singleStorePassword=secretpass +--singleStorePort=3306 +--numberOfRecords=5000000 +--influxMeasurement=singlestoreioit_results diff --git a/.github/workflows/performance-tests-job-configs/hadoopFormat.txt b/.github/workflows/performance-tests-pipeline-options/hadoopFormat.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/hadoopFormat.txt rename to .github/workflows/performance-tests-pipeline-options/hadoopFormat.txt diff --git a/.github/workflows/performance-tests-job-configs/kafka_IO_Batch.txt b/.github/workflows/performance-tests-pipeline-options/kafka_IO_Batch.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/kafka_IO_Batch.txt rename to .github/workflows/performance-tests-pipeline-options/kafka_IO_Batch.txt diff --git a/.github/workflows/performance-tests-job-configs/kafka_IO_Streaming.txt b/.github/workflows/performance-tests-pipeline-options/kafka_IO_Streaming.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/kafka_IO_Streaming.txt rename to .github/workflows/performance-tests-pipeline-options/kafka_IO_Streaming.txt diff --git a/.github/workflows/performance-tests-job-configs/config_ManyFiles_TextIOIT.txt b/.github/workflows/performance-tests-pipeline-options/manyFiles_TextIOIT.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/config_ManyFiles_TextIOIT.txt rename to .github/workflows/performance-tests-pipeline-options/manyFiles_TextIOIT.txt diff --git a/.github/workflows/performance-tests-job-configs/config_ManyFiles_TextIOIT_HDFS.txt 
b/.github/workflows/performance-tests-pipeline-options/manyFiles_TextIOIT_HDFS.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/config_ManyFiles_TextIOIT_HDFS.txt rename to .github/workflows/performance-tests-pipeline-options/manyFiles_TextIOIT_HDFS.txt diff --git a/.github/workflows/performance-tests-job-configs/mongoDBIO_IT.txt b/.github/workflows/performance-tests-pipeline-options/mongoDBIO_IT.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/mongoDBIO_IT.txt rename to .github/workflows/performance-tests-pipeline-options/mongoDBIO_IT.txt diff --git a/.github/workflows/performance-tests-job-configs/config_ParquetIOIT.txt b/.github/workflows/performance-tests-pipeline-options/parquetIOIT.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/config_ParquetIOIT.txt rename to .github/workflows/performance-tests-pipeline-options/parquetIOIT.txt diff --git a/.github/workflows/performance-tests-job-configs/config_ParquetIOIT_HDFS.txt b/.github/workflows/performance-tests-pipeline-options/parquetIOIT_HDFS.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/config_ParquetIOIT_HDFS.txt rename to .github/workflows/performance-tests-pipeline-options/parquetIOIT_HDFS.txt diff --git a/.github/workflows/performance-tests-job-configs/pubsubIOIT_Python_Streaming.txt b/.github/workflows/performance-tests-pipeline-options/pubsubIOIT_Python_Streaming.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/pubsubIOIT_Python_Streaming.txt rename to .github/workflows/performance-tests-pipeline-options/pubsubIOIT_Python_Streaming.txt diff --git a/.github/workflows/performance-tests-job-configs/spannerIO_Read_2GB_Python.txt b/.github/workflows/performance-tests-pipeline-options/spannerIO_Read_2GB_Python.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/spannerIO_Read_2GB_Python.txt rename to .github/workflows/performance-tests-pipeline-options/spannerIO_Read_2GB_Python.txt diff --git a/.github/workflows/performance-tests-job-configs/spannerIO_Write_2GB_Python.txt b/.github/workflows/performance-tests-pipeline-options/spannerIO_Write_2GB_Python.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/spannerIO_Write_2GB_Python.txt rename to .github/workflows/performance-tests-pipeline-options/spannerIO_Write_2GB_Python.txt diff --git a/.github/workflows/performance-tests-job-configs/sparkReceiver_IO.txt b/.github/workflows/performance-tests-pipeline-options/sparkReceiver_IO.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/sparkReceiver_IO.txt rename to .github/workflows/performance-tests-pipeline-options/sparkReceiver_IO.txt diff --git a/.github/workflows/performance-tests-job-configs/textIOIT.txt b/.github/workflows/performance-tests-pipeline-options/textIOIT.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/textIOIT.txt rename to .github/workflows/performance-tests-pipeline-options/textIOIT.txt diff --git a/.github/workflows/performance-tests-job-configs/textIOIT_HDFS.txt b/.github/workflows/performance-tests-pipeline-options/textIOIT_HDFS.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/textIOIT_HDFS.txt rename to .github/workflows/performance-tests-pipeline-options/textIOIT_HDFS.txt diff --git a/.github/workflows/performance-tests-job-configs/textIOIT_Python.txt 
b/.github/workflows/performance-tests-pipeline-options/textIOIT_Python.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/textIOIT_Python.txt rename to .github/workflows/performance-tests-pipeline-options/textIOIT_Python.txt diff --git a/.github/workflows/performance-tests-job-configs/wordCountIT_Python.txt b/.github/workflows/performance-tests-pipeline-options/wordCountIT_Python.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/wordCountIT_Python.txt rename to .github/workflows/performance-tests-pipeline-options/wordCountIT_Python.txt diff --git a/.github/workflows/performance-tests-job-configs/xlang_KafkaIO_Python.txt b/.github/workflows/performance-tests-pipeline-options/xlang_KafkaIO_Python.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/xlang_KafkaIO_Python.txt rename to .github/workflows/performance-tests-pipeline-options/xlang_KafkaIO_Python.txt diff --git a/.github/workflows/performance-tests-job-configs/config_XmlIOIT.txt b/.github/workflows/performance-tests-pipeline-options/xmlIOIT.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/config_XmlIOIT.txt rename to .github/workflows/performance-tests-pipeline-options/xmlIOIT.txt diff --git a/.github/workflows/performance-tests-job-configs/config_XmlIOIT_HDFS.txt b/.github/workflows/performance-tests-pipeline-options/xmlIOIT_HDFS.txt similarity index 100% rename from .github/workflows/performance-tests-job-configs/config_XmlIOIT_HDFS.txt rename to .github/workflows/performance-tests-pipeline-options/xmlIOIT_HDFS.txt diff --git a/.github/workflows/pr-bot-new-prs.yml b/.github/workflows/pr-bot-new-prs.yml index ef825e067b7d9..0f17d662db9c0 100644 --- a/.github/workflows/pr-bot-new-prs.yml +++ b/.github/workflows/pr-bot-new-prs.yml @@ -35,7 +35,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Setup Node - uses: actions/setup-node@v3 + uses: actions/setup-node@v4 with: node-version: 16 - name: Install pr-bot npm dependencies diff --git a/.github/workflows/pr-bot-pr-updates.yml b/.github/workflows/pr-bot-pr-updates.yml index c882c18feeba3..02c8a2473ff39 100644 --- a/.github/workflows/pr-bot-pr-updates.yml +++ b/.github/workflows/pr-bot-pr-updates.yml @@ -18,6 +18,7 @@ on: pull_request_target: types: ["synchronize"] # Synchronize is the action that runs after pushes to the user branch issue_comment: + types: [created] permissions: read-all jobs: process-pr-update: @@ -39,7 +40,7 @@ jobs: with: ref: 'master' - name: Setup Node - uses: actions/setup-node@v3 + uses: actions/setup-node@v4 with: node-version: 16 - name: Install pr-bot npm dependencies diff --git a/.github/workflows/pr-bot-prs-needing-attention.yml b/.github/workflows/pr-bot-prs-needing-attention.yml index 9dff7c8565a46..95be91e8dcb48 100644 --- a/.github/workflows/pr-bot-prs-needing-attention.yml +++ b/.github/workflows/pr-bot-prs-needing-attention.yml @@ -35,7 +35,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Setup Node - uses: actions/setup-node@v3 + uses: actions/setup-node@v4 with: node-version: 16 - name: Install pr-bot npm dependencies diff --git a/.github/workflows/pr-bot-update-reviewers.yml b/.github/workflows/pr-bot-update-reviewers.yml index b4c41b66f9d63..ac80d1e346e6b 100644 --- a/.github/workflows/pr-bot-update-reviewers.yml +++ b/.github/workflows/pr-bot-update-reviewers.yml @@ -35,7 +35,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Setup Node - uses: actions/setup-node@v3 + uses: actions/setup-node@v4 
with: node-version: 16 - name: Install pr-bot npm dependencies diff --git a/.github/workflows/python_dependency_tests.yml b/.github/workflows/python_dependency_tests.yml index 6fd865bda7546..166899df90cbc 100644 --- a/.github/workflows/python_dependency_tests.yml +++ b/.github/workflows/python_dependency_tests.yml @@ -7,7 +7,7 @@ on: branches: ['master', 'release-*'] tags: 'v*' # paths where Beam Python's dependencies are configured. - paths: ['sdks/python/setup.py', 'sdks/python/build-requirements.txt', 'sdks/python/container/base_image_requirements_manual.txt'] + paths: ['sdks/python/setup.py', 'sdks/python/pyproject.toml', 'sdks/python/container/base_image_requirements_manual.txt'] # This allows a subsequently queued workflow run to interrupt previous runs concurrency: @@ -38,9 +38,6 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.params.py_ver }} - - name: Install Build dependencies - working-directory: ./sdks/python - run: pip install -r build-requirements.txt - name: Install base_image_requirements.txt working-directory: ./sdks/python run: pip install --no-deps -r container/${{ matrix.params.py_env }}/base_image_requirements.txt diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 57ec895c24310..0309329e84e13 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -78,12 +78,9 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.8 - - name: Get build dependencies - working-directory: ./sdks/python - run: pip install pip setuptools --upgrade && pip install -r build-requirements.txt - name: Build source working-directory: ./sdks/python - run: python setup.py sdist + run: pip install -U build && python -m build --sdist - name: Rename source file working-directory: ./sdks/python/dist run: mv $(ls | grep "apache-beam.*tar\.gz") apache-beam-source.tar.gz @@ -99,7 +96,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest, macos-latest, windows-latest] + os: [macos-latest, windows-latest] params: [ {"py_ver": "3.8", "tox_env": "py38"}, {"py_ver": "3.9", "tox_env": "py39"}, @@ -113,9 +110,6 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.params.py_ver }} - - name: Get build dependencies - working-directory: ./sdks/python - run: pip install -r build-requirements.txt --use-pep517 - name: Install tox run: pip install tox - name: Run tests basic unix @@ -148,9 +142,6 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python }} - - name: Get build dependencies - working-directory: ./sdks/python - run: pip install -r build-requirements.txt - name: Install requirements working-directory: ./sdks/python run: pip install setuptools --upgrade && pip install -e . 
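(The hunks above swap the removed build-requirements.txt bootstrap for a PEP 517 source build; a minimal local sketch of the new flow, assuming a checkout with sdks/python as the working directory and the PyPA `build` frontend:)
    pip install -U build       # PyPA build frontend, not a Beam-specific tool
    python -m build --sdist    # produces dist/apache-beam-*.tar.gz; replaces `python setup.py sdist`
    pip install -e .           # editable install; no build-requirements.txt step needed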
@@ -192,9 +183,6 @@ jobs: service_account_key: ${{ secrets.GCP_SA_KEY }} project_id: ${{ secrets.GCP_PROJECT_ID }} export_default_credentials: true - - name: Get build dependencies - working-directory: ./sdks/python - run: pip install -r build-requirements.txt - name: Install requirements working-directory: ./sdks/python run: pip install setuptools --upgrade && pip install -e ".[gcp]" diff --git a/.github/workflows/reportGenerator.yml b/.github/workflows/reportGenerator.yml index 8f6bccddcfad0..e568e84f7d2d2 100644 --- a/.github/workflows/reportGenerator.yml +++ b/.github/workflows/reportGenerator.yml @@ -28,7 +28,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Setup Node - uses: actions/setup-node@v3 + uses: actions/setup-node@v4 with: node-version: 16 - run: | diff --git a/.github/workflows/run_perf_alert_tool.yml b/.github/workflows/run_perf_alert_tool.yml index 6946011f06171..bc59bd945fe23 100644 --- a/.github/workflows/run_perf_alert_tool.yml +++ b/.github/workflows/run_perf_alert_tool.yml @@ -30,7 +30,7 @@ on: jobs: python_run_change_point_analysis: name: Run Change Point Analysis. - runs-on: ubuntu-latest + runs-on: [self-hosted, ubuntu-20.04, main] permissions: issues: write steps: @@ -46,9 +46,6 @@ jobs: with: service_account_key: ${{ secrets.GCP_SA_KEY }} export_default_credentials: true - - name: Get Apache Beam Build dependencies - working-directory: ./sdks/python - run: pip install pip setuptools --upgrade && pip install -r build-requirements.txt - name: Install Apache Beam working-directory: ./sdks/python run: pip install -e .[gcp,test] @@ -59,10 +56,17 @@ jobs: - name: Run Change Point Analysis. working-directory: ./sdks/python/apache_beam/testing/analyzers shell: bash - run: python perf_analysis.py + run: python perf_analysis.py --config_file_path=./tests_config.yaml --save_alert_metadata if: github.event_name != 'pull_request' env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Run Change Point Analysis. + working-directory: ./sdks/python/apache_beam/testing/analyzers + shell: bash + run: python perf_analysis.py --config_file_path=./tests_config.yaml + if: github.event_name == 'pull_request' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Run change point analysis tests. 
working-directory: ./sdks/python/apache_beam/testing/analyzers shell: bash diff --git a/.github/workflows/run_rc_validation.yml b/.github/workflows/run_rc_validation.yml index 4902fee810160..35fb3ce05eae2 100644 --- a/.github/workflows/run_rc_validation.yml +++ b/.github/workflows/run_rc_validation.yml @@ -17,14 +17,14 @@ # To learn more about GitHub Actions in Apache Beam check the CI.m -name: Run RC Validation +name: Run RC Validation on: workflow_dispatch: inputs: RELEASE_VER: description: Beam current Release Version required: true - default: 2.42.0 + default: 2.42.0 USER_GCS_BUCKET: description: Bucket to upload results required: true @@ -57,9 +57,9 @@ on: type: boolean required: true default: true -env: +env: RC_TAG: "v${{github.event.inputs.RELEASE_VER}}-RC${{github.event.inputs.RC_NUM}}" - RELEASE_VER: ${{github.event.inputs.RELEASE_VER}} + RELEASE_VER: ${{github.event.inputs.RELEASE_VER}} USER_GCP_PROJECT: apache-beam-testing PYTHON_RC_DOWNLOAD_URL: https://dist.apache.org/repos/dist/dev/beam USER_GCP_REGION: us-central1 @@ -100,19 +100,19 @@ jobs: - name: Comment on PR to Trigger Python ReleaseCandidate Test run: | gh pr comment "$GITHUB_PR_URL" --body "Run Python ReleaseCandidate" - + sql_taxi_with_dataflow: runs-on: [self-hosted,ubuntu-20.04] if: ${{github.event.inputs.RUN_SQL_TAXI_WITH_DATAFLOW == 'true'}} strategy: - matrix: + matrix: py_version: [3.8] steps: - name: Checkout code uses: actions/checkout@v4 - with: - ref: ${{env.RC_TAG}} - + with: + ref: ${{env.RC_TAG}} + - name: Install Python uses: actions/setup-python@v4 with: @@ -154,23 +154,23 @@ jobs: --num_workers 5 \ --output_topic projects/${USER_GCP_PROJECT}/topics/${SQL_TAXI_TOPIC} \ --beam_services="{\":sdks:java:extensions:sql:expansion-service:shadowJar\": \"${SQL_EXPANSION_SERVICE_JAR}\"}" \ - --sdk_location apache-beam-${RELEASE_VER}.zip || true + --sdk_location apache-beam-${RELEASE_VER}.tar.gz || true - name: Checking Results run: | gcloud pubsub subscriptions pull --project=${USER_GCP_PROJECT} --limit=5 ${SQL_TAXI_SUBSCRIPTION} gcloud pubsub subscriptions pull --project=${USER_GCP_PROJECT} --limit=5 ${SQL_TAXI_SUBSCRIPTION} - name: Removing Pub Sub Topic if: always() - run: | + run: | gcloud pubsub topics delete --project=${USER_GCP_PROJECT} ${SQL_TAXI_TOPIC} gcloud pubsub subscriptions delete --project=${USER_GCP_PROJECT} ${SQL_TAXI_SUBSCRIPTION} python_cross_validation: runs-on: [self-hosted,ubuntu-20.04] - if: ${{github.event.inputs.RUN_PYTHON_CROSS_VALIDATION == 'true'}} + if: ${{github.event.inputs.RUN_PYTHON_CROSS_VALIDATION == 'true'}} strategy: - matrix: + matrix: py_version: [3.8] steps: - name: Checkout code @@ -183,7 +183,7 @@ jobs: echo "====================Checking Environment & Variables=================" echo "" echo "running validations on release ${{github.event.inputs.RELEASE_VER}} RC${{github.event.inputs.RC_NUM}}." 
- - name: Install Kubectl + - name: Install Kubectl uses: azure/setup-kubectl@v3 - name: Setup Java JDK @@ -196,7 +196,7 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{matrix.py_version}} - + - name: Setting python env uses: ./.github/actions/common-rc-validation @@ -206,19 +206,19 @@ jobs: - name: Installing gcloud-auth-plugin run: sudo apt-get install google-cloud-sdk-gke-gcloud-auth-plugin - - name: Setting Kafka Cluster Name + - name: Setting Kafka Cluster Name run: | - echo "KAFKA_CLUSTER_NAME=xlang-kafka-cluster-$RANDOM">> $GITHUB_ENV - + echo "KAFKA_CLUSTER_NAME=xlang-kafka-cluster-$RANDOM">> $GITHUB_ENV + - name: Creating Kafka Cluster run: | gcloud container clusters create --project=${USER_GCP_PROJECT} --region=${USER_GCP_REGION} --no-enable-ip-alias $KAFKA_CLUSTER_NAME kubectl apply -R -f .test-infra/kubernetes/kafka-cluster - + - name: Waiting for Kafka cluster to be ready run: kubectl wait --for=condition=Ready pod/kafka-0 --timeout=1200s - - name: Start xlang Kafka Taxi with Dataflow Runner + - name: Start xlang Kafka Taxi with Dataflow Runner run: | echo "BOOTSTRAP_SERVERS=$(kubectl get svc outside-0 -o jsonpath='{.status.loadBalancer.ingress[0].ip}'):32400" >> $GITHUB_ENV echo "KAFKA_TAXI_DF_DATASET=${GITHUB_ACTOR}_python_validations_$(date +%m%d)_$RANDOM" >> $GITHUB_ENV @@ -243,16 +243,16 @@ jobs: --temp_location=${USER_GCS_BUCKET}/temp/ \ --with_metadata \ --beam_services="{\"sdks:java:io:expansion-service:shadowJar\": \"${KAFKA_EXPANSION_SERVICE_JAR}\"}" \ - --sdk_location apache-beam-${RELEASE_VER}.zip || true + --sdk_location apache-beam-${RELEASE_VER}.tar.gz || true - name: Checking executions results run: | - bq head -n 10 ${KAFKA_TAXI_DF_DATASET}.xlang_kafka_taxi + bq head -n 10 ${KAFKA_TAXI_DF_DATASET}.xlang_kafka_taxi - name: Remove BigQuery Dataset if: always() run: | bq rm -f ${KAFKA_TAXI_DF_DATASET}.xlang_kafka_taxi bq rm -f ${KAFKA_TAXI_DF_DATASET} - + - name: Delete Kafka Cluster if: always() run: gcloud container clusters delete --project=${USER_GCP_PROJECT} --region=${USER_GCP_REGION} --async -q $KAFKA_CLUSTER_NAME @@ -266,8 +266,8 @@ jobs: steps: - name: Sending PubSub name to env run: | - echo "SHARED_PUBSUB_TOPIC=leader_board-${GITHUB_ACTOR}-python-topic-$(date +%m%d)_$RANDOM" >> $GITHUB_ENV - - id: generate_pubsub_name + echo "SHARED_PUBSUB_TOPIC=leader_board-${GITHUB_ACTOR}-python-topic-$(date +%m%d)_$RANDOM" >> $GITHUB_ENV + - id: generate_pubsub_name run: | echo "::set-output name=pubsub::$SHARED_PUBSUB_TOPIC" - name: Creating Pub Sub Topics @@ -287,7 +287,7 @@ jobs: uses: actions/setup-python@v4 with: python-version: '3.8' - + - name: Setting python env uses: ./.github/actions/common-rc-validation with: @@ -343,14 +343,14 @@ jobs: run: | ls cd word-count-beam - timeout --preserve-status 50m mvn compile exec:java -Dexec.mainClass=org.apache.beam.examples.complete.game.injector.Injector -Dexec.args="${USER_GCP_PROJECT} ${{needs.generate_shared_pubsub.outputs.name}} none" || true - + timeout --preserve-status 50m mvn compile exec:java -Dexec.mainClass=org.apache.beam.examples.complete.game.injector.Injector -Dexec.args="${USER_GCP_PROJECT} ${{needs.generate_shared_pubsub.outputs.name}} none" || true + direct_runner_leaderboard: runs-on: [self-hosted, ubuntu-20.04] if: ${{github.event.inputs.RUN_DIRECT_RUNNER_TESTS == 'true' }} strategy: - matrix: + matrix: py_version: [3.8] needs: generate_shared_pubsub steps: @@ -363,13 +363,13 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{matrix.py_version}} - + - name: Setting 
python env uses: ./.github/actions/common-rc-validation with: RELEASE_VER: ${{env.RELEASE_VER}} PYTHON_RC_DOWNLOAD_URL: ${{env.PYTHON_RC_DOWNLOAD_URL}} - + - name: Exporting leaderboard Dataset Name run: echo "LEADERBOARD_DIRECT_DATASET=${GITHUB_ACTOR}_python_validations_$(date +%m%d)_$RANDOM" >> $GITHUB_ENV - name: Creating Dataset @@ -389,11 +389,11 @@ jobs: bq head -n 10 ${LEADERBOARD_DIRECT_DATASET}.leader_board_teams - name: Removing BigQuery Dataset if: always() - run: | + run: | bq rm -f ${LEADERBOARD_DIRECT_DATASET}.leader_board_users bq rm -f ${LEADERBOARD_DIRECT_DATASET}.leader_board_teams bq rm -f $LEADERBOARD_DIRECT_DATASET - + dataflow_runner_leaderboard: runs-on: [self-hosted,ubuntu-20.04] if: ${{github.event.inputs.RUN_DATAFLOW_RUNNER_TESTS=='true'}} @@ -411,13 +411,13 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{matrix.py_version}} - + - name: Setting python env uses: ./.github/actions/common-rc-validation with: RELEASE_VER: ${{env.RELEASE_VER}} PYTHON_RC_DOWNLOAD_URL: ${{env.PYTHON_RC_DOWNLOAD_URL}} - + - name: Exporting Dataflow Dataset Name run: echo "LEADERBOARD_DF_DATASET=${GITHUB_ACTOR}_python_validations_$(date +%m%d)_$RANDOM" >> $GITHUB_ENV - name: Creating Dataset @@ -434,18 +434,18 @@ jobs: --dataset ${LEADERBOARD_DF_DATASET} \ --runner DataflowRunner \ --temp_location=${USER_GCS_BUCKET}/temp/ \ - --sdk_location apache-beam-${RELEASE_VER}.zip || true + --sdk_location apache-beam-${RELEASE_VER}.tar.gz || true - name: Checking results run: | bq head -n 10 ${LEADERBOARD_DF_DATASET}.leader_board_users bq head -n 10 ${LEADERBOARD_DF_DATASET}.leader_board_teams - name: Removing BigQuery Dataset if: always() - run: | + run: | bq rm -f ${LEADERBOARD_DF_DATASET}.leader_board_users bq rm -f ${LEADERBOARD_DF_DATASET}.leader_board_teams bq rm -f $LEADERBOARD_DF_DATASET - + direct_runner_gamestats: runs-on: [self-hosted,ubuntu-20.04] @@ -463,13 +463,13 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{matrix.py_version}} - + - name: Setting python env uses: ./.github/actions/common-rc-validation with: RELEASE_VER: ${{env.RELEASE_VER}} PYTHON_RC_DOWNLOAD_URL: ${{env.PYTHON_RC_DOWNLOAD_URL}} - + - name: Exporting Gamestates Direct Dataset Name run: echo "GAMESTATS_DIRECT_DATASET=${GITHUB_ACTOR}_python_validations_$(date +%m%d)_$RANDOM" >> $GITHUB_ENV - name: Creating Dataset @@ -490,7 +490,7 @@ jobs: bq head -n 10 ${GAMESTATS_DIRECT_DATASET}.game_stats_sessions - name: Removing BigQuery Dataset if: always() - run: | + run: | bq rm -f ${GAMESTATS_DIRECT_DATASET}.game_stats_sessions bq rm -f ${GAMESTATS_DIRECT_DATASET}.game_stats_teams bq rm -f $GAMESTATS_DIRECT_DATASET @@ -512,13 +512,13 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{matrix.py_version}} - + - name: Setting python env uses: ./.github/actions/common-rc-validation with: RELEASE_VER: ${{env.RELEASE_VER}} PYTHON_RC_DOWNLOAD_URL: ${{env.PYTHON_RC_DOWNLOAD_URL}} - + - name: Exporting Gamestates Direct Dataset Name run: echo "GAMESTATS_DF_DATASET=${GITHUB_ACTOR}_python_validations_$(date +%m%d)_$RANDOM" >> $GITHUB_ENV - name: Creating Dataset @@ -535,7 +535,7 @@ jobs: --dataset ${GAMESTATS_DF_DATASET} \ --runner DataflowRunner \ --temp_location=${USER_GCS_BUCKET}/temp/ \ - --sdk_location apache-beam-${RELEASE_VER}.zip \ + --sdk_location apache-beam-${RELEASE_VER}.tar.gz \ --fixed_window_duration ${FIXED_WINDOW_DURATION} || true - name: Checking Results run: | @@ -545,9 +545,9 @@ jobs: if: always() run: | bq rm -f ${GAMESTATS_DF_DATASET}.game_stats_teams - bq rm -f 
${GAMESTATS_DF_DATASET}.game_stats_sessions + bq rm -f ${GAMESTATS_DF_DATASET}.game_stats_sessions bq rm -f $GAMESTATS_DF_DATASET - + remove_shared_pubsub: runs-on: [self-hosted,ubuntu-20.04] needs: [java_injector, generate_shared_pubsub] @@ -555,4 +555,3 @@ jobs: steps: - name: Deleting Shared Pub Sub run: gcloud pubsub topics delete --project=${USER_GCP_PROJECT} ${{needs.generate_shared_pubsub.outputs.name}} - \ No newline at end of file diff --git a/.github/workflows/self-assign.yml b/.github/workflows/self-assign.yml index c6b7cc69ce978..29ad240cd0ddb 100644 --- a/.github/workflows/self-assign.yml +++ b/.github/workflows/self-assign.yml @@ -16,7 +16,7 @@ name: Assign or close an issue on: issue_comment: - + types: [created] jobs: assign: permissions: diff --git a/.github/workflows/typescript_tests.yml b/.github/workflows/typescript_tests.yml index a4aa14c42efd6..e71834f09ffb2 100644 --- a/.github/workflows/typescript_tests.yml +++ b/.github/workflows/typescript_tests.yml @@ -50,7 +50,7 @@ jobs: persist-credentials: false submodules: recursive - name: Install node - uses: actions/setup-node@v3 + uses: actions/setup-node@v4 with: node-version: '16' - run: npm ci @@ -79,7 +79,7 @@ jobs: persist-credentials: false submodules: recursive - name: Install Node - uses: actions/setup-node@v3 + uses: actions/setup-node@v4 with: node-version: '16' - name: Install Python @@ -89,10 +89,8 @@ jobs: - name: Setup Beam Python working-directory: ./sdks/python run: | - pip install pip setuptools --upgrade - pip install -r build-requirements.txt pip install 'pandas>=1.0,<1.5' - python setup.py develop + pip install -e . - run: npm ci working-directory: ./sdks/typescript - run: npm run build @@ -136,7 +134,7 @@ jobs: persist-credentials: false submodules: recursive - name: Install node - uses: actions/setup-node@v3 + uses: actions/setup-node@v4 with: node-version: '16' - name: Install python @@ -146,10 +144,7 @@ jobs: - name: Setup Beam Python working-directory: ./sdks/python run: | - pip install pip setuptools --upgrade - pip install -r build-requirements.txt pip install 'pandas>=1.0,<1.5' - python setup.py develop pip install -e ".[gcp]" - name: Authenticate on GCP uses: google-github-actions/setup-gcloud@v0 diff --git a/.gitignore b/.gitignore index d69995de84ecf..0852e63dbd3e5 100644 --- a/.gitignore +++ b/.gitignore @@ -143,3 +143,6 @@ playground/cloudfunction.zip # Exception to .gitignore .test-infra/pipelines related files !.test-infra/pipelines/**/apache-beam-testing.tfvars + +# Ignore .test-infra/mock-apis related files +.test-infra/mock-apis/**/charts/ diff --git a/.test-infra/dataproc/cleanup.sh b/.test-infra/dataproc/cleanup.sh index e535fabb77764..ac12a0bff05c2 100755 --- a/.test-infra/dataproc/cleanup.sh +++ b/.test-infra/dataproc/cleanup.sh @@ -20,11 +20,8 @@ clustersList=( ) toDeleteList=( ) -generatedResources=("beam-loadtests-go-cogbk-flink" "beam-loadtests-python-cogbk-flink" \ -"beam-loadtests-go-combine-flink" "beam-loadtests-python-combine-flink" \ -"beam-loadtests-go-gbk-flink" "beam-loadtests-python-gbk-flink" \ -"beam-loadtests-go-pardo-flink" "beam-loadtests-python-pardo-flink" \ -"beam-postcommit-python-chicago" ) +generatedResources=("beam-loadtests-go-*-flink" "beam-loadtests-python-*-flink" \ +"beam-loadtests-py-*-flink" "beam-postcommit-python-chicago" ) function deleteFilteredClusters(){ for cluster in ${toDeleteList[@]};do @@ -51,7 +48,7 @@ function filterClusters(){ if [[ $elapsedHours -ge 2 ]]; then for name in ${generatedResources[@]}; do # Only resources generated by the 
groovy jobs set are queued for deletion - if [[ "$cluster" == *"$name"* ]]; then + if [[ "$cluster" == *${name}* && ! ("$cluster" =~ nokill) ]]; then toDeleteList+=( "$cluster" ) break fi diff --git a/.test-infra/jenkins/JavaTestProperties.groovy b/.test-infra/jenkins/JavaTestProperties.groovy index ce7446a6e71dd..5403cee5cf9a6 100644 --- a/.test-infra/jenkins/JavaTestProperties.groovy +++ b/.test-infra/jenkins/JavaTestProperties.groovy @@ -17,5 +17,10 @@ */ class JavaTestProperties { - final static List SUPPORTED_CONTAINER_TASKS = ['java8', 'java11', 'java17'] + final static List SUPPORTED_CONTAINER_TASKS = [ + 'java8', + 'java11', + 'java17', + 'java21' + ] } diff --git a/.test-infra/jenkins/NexmarkBuilder.groovy b/.test-infra/jenkins/NexmarkBuilder.groovy index 044b0cbb95612..69fa3dcc4277c 100644 --- a/.test-infra/jenkins/NexmarkBuilder.groovy +++ b/.test-infra/jenkins/NexmarkBuilder.groovy @@ -145,7 +145,7 @@ class NexmarkBuilder { rootBuildScriptDir(commonJobProperties.checkoutDir) tasks(':sdks:java:testing:nexmark:run') commonJobProperties.setGradleSwitches(delegate) - switches("-PcompileAndRunTestsWithJava11") + switches("-PtestJavaVersion=11") switches("-Pjava11Home=${commonJobProperties.JAVA_11_HOME}") switches("-Pnexmark.runner=${runner.getDependencyBySDK(sdk)}") switches("-Pnexmark.args=\"${parseOptions(options)}\"") @@ -168,7 +168,7 @@ class NexmarkBuilder { rootBuildScriptDir(commonJobProperties.checkoutDir) tasks(':sdks:java:testing:nexmark:run') commonJobProperties.setGradleSwitches(delegate) - switches("-PcompileAndRunTestsWithJava17") + switches("-PtestJavaVersion=17") switches("-Pjava17Home=${commonJobProperties.JAVA_17_HOME}") switches("-Pnexmark.runner=${runner.getDependencyBySDK(sdk)}") switches("-Pnexmark.args=\"${parseOptions(options)}\"") diff --git a/.test-infra/jenkins/job_CancelStaleDataflowJobs.groovy b/.test-infra/jenkins/job_CancelStaleDataflowJobs.groovy deleted file mode 100644 index a0ecd1eacfdd2..0000000000000 --- a/.test-infra/jenkins/job_CancelStaleDataflowJobs.groovy +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties - -job("beam_CancelStaleDataflowJobs") { - description("Cancel stale dataflow jobs") - - // Set common parameters. - commonJobProperties.setTopLevelMainJobProperties(delegate) - - // Sets that this is a cron job, run once randomly per day. - commonJobProperties.setCronJob(delegate, '0 */4 * * *') - - // Allows triggering this build against pull requests. - commonJobProperties.enablePhraseTriggeringFromPullRequest( - delegate, - 'Cancel Stale Dataflow Jobs', - 'Run Cancel Stale Dataflow Jobs') - - // Gradle goals for this job. 
- steps { - gradle { - rootBuildScriptDir(commonJobProperties.checkoutDir) - tasks(':beam-test-tools:cancelStaleDataflowJobs') - commonJobProperties.setGradleSwitches(delegate) - } - } -} diff --git a/.test-infra/jenkins/job_CleanUpGCPResources.groovy b/.test-infra/jenkins/job_CleanUpGCPResources.groovy deleted file mode 100644 index 59b78358e769e..0000000000000 --- a/.test-infra/jenkins/job_CleanUpGCPResources.groovy +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties - -job("beam_CleanUpGCPResources") { - description("Clean up stale resources on Beam's GCP testing project (BQ datasets, )") - - // Set common parameters. - commonJobProperties.setTopLevelMainJobProperties(delegate) - - // Sets that this is a cron job, run once randomly per day. - commonJobProperties.setCronJob(delegate, 'H H * * *') - - // Allows triggering this build against pull requests. - commonJobProperties.enablePhraseTriggeringFromPullRequest( - delegate, - 'Clean Up GCP Resources', - 'Run Clean GCP Resources') - - // Gradle goals for this job. - steps { - gradle { - rootBuildScriptDir(commonJobProperties.checkoutDir) - tasks(':beam-test-tools:cleanupOtherStaleResources') - commonJobProperties.setGradleSwitches(delegate) - } - } -} diff --git a/.test-infra/jenkins/job_CleanUpPrebuiltSDKImages.groovy b/.test-infra/jenkins/job_CleanUpPrebuiltSDKImages.groovy deleted file mode 100644 index 224c1bbeac52e..0000000000000 --- a/.test-infra/jenkins/job_CleanUpPrebuiltSDKImages.groovy +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties - -job("beam_CleanUpPrebuiltSDKImages") { - description("Clean up stale dataflow prebuilt sdk container images") - - // Set common parameters. - commonJobProperties.setTopLevelMainJobProperties(delegate) - - // Sets that this is a cron job, run once randomly per day. 
- commonJobProperties.setCronJob(delegate, '0 H * * *') - - // Allows triggering this build against pull requests. - commonJobProperties.enablePhraseTriggeringFromPullRequest( - delegate, - 'Clean Up Prebuilt SDK Images', - 'Run Clean Prebuilt Images') - - // Gradle goals for this job. - steps { - gradle { - rootBuildScriptDir(commonJobProperties.checkoutDir) - tasks(':beam-test-tools:removeStaleSDKContainerImages') - commonJobProperties.setGradleSwitches(delegate) - } - } -} diff --git a/.test-infra/jenkins/job_CloudMLBenchmarkTests_Python.groovy b/.test-infra/jenkins/job_CloudMLBenchmarkTests_Python.groovy deleted file mode 100644 index 1867cccf77541..0000000000000 --- a/.test-infra/jenkins/job_CloudMLBenchmarkTests_Python.groovy +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import PhraseTriggeringPostCommitBuilder -import CronJobBuilder - -def cloudMLJob = { scope -> - scope.description('Runs the TFT Criteo Examples on the Dataflow runner.') - - // Set common parameters. - commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 360) - - Map pipelineOptions = [ - influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl, - metrics_dataset : 'beam_cloudml', - publish_to_big_query: true, - project : 'apache-beam-testing', - region : 'us-central1', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - runner : 'DataflowRunner', - requirements_file : "apache_beam/testing/benchmarks/cloudml/requirements.txt" - ] - // Gradle goals for this job. - scope.steps { - gradle { - rootBuildScriptDir(commonJobProperties.checkoutDir) - commonJobProperties.setGradleSwitches(delegate) - switches("-Popts=\'${commonJobProperties.mapToArgString(pipelineOptions)}\'") - tasks(':sdks:python:test-suites:dataflow:tftTests') - } - } -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_CloudML_Benchmarks_Dataflow', - 'Run TFT Criteo Benchmarks', - 'TFT Criteo benchmarks on Dataflow(\"Run TFT Criteo Benchmarks"\"")', - this - ) { - cloudMLJob(delegate) - } - -CronJobBuilder.cronJob( - 'beam_CloudML_Benchmarks_Dataflow', - 'H H * * *', - this - ) { - cloudMLJob(delegate) - } diff --git a/.test-infra/jenkins/job_IODatastoresCredentialsRotation.groovy b/.test-infra/jenkins/job_IODatastoresCredentialsRotation.groovy deleted file mode 100644 index 58d18205b54aa..0000000000000 --- a/.test-infra/jenkins/job_IODatastoresCredentialsRotation.groovy +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties - -job('Rotate IO-Datastores Cluster Credentials') { - description('Rotates Certificates and performs an IP rotation for Metrics cluster') - - // Set common parameters. - commonJobProperties.setTopLevelMainJobProperties(delegate) - - // Sets that this is a cron job. - commonJobProperties.setCronJob(delegate, 'H 2 1 * *')// At 00:02am every month. - def date = new Date().format('E MMM dd HH:mm:ss z yyyy') - - steps { - //Starting credential rotation - shell('''gcloud container clusters update io-datastores \ - --start-credential-rotation --zone=us-central1-a --quiet''') - - //Rebuilding the nodes - shell('''gcloud container clusters upgrade io-datastores \ - --node-pool=pool-1 --zone=us-central1-a --quiet''') - - //Completing the rotation - shell('''gcloud container clusters update io-datastores \ - --complete-credential-rotation --zone=us-central1-a --quiet''') - } - - publishers { - extendedEmail { - triggers { - failure { - subject('Credentials Rotation Failure on IO-Datastores cluster') - content("Something went wrong during the automatic credentials rotation for IO-Datastores Cluster, performed at ${date}. It may be necessary to check the state of the cluster certificates. For further details refer to the following links:\n * Failing job: https://ci-beam.apache.org/job/Rotate%20IO-Datastores%20Cluster%20Credentials/ \n * Job configuration: https://github.com/apache/beam/blob/master/.test-infra/jenkins/job_IODatastoresCredentialsRotation.groovy \n * Cluster URL: https://pantheon.corp.google.com/kubernetes/clusters/details/us-central1-a/io-datastores/details?mods=dataflow_dev&project=apache-beam-testing") - recipientList('dev@beam.apache.org') - } - } - } - } -} diff --git a/.test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy b/.test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy deleted file mode 100644 index a98b8d170437e..0000000000000 --- a/.test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import CronJobBuilder -import static PythonTestProperties.RUN_INFERENCE_TEST_PYTHON_VERSION - - -def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -def loadTestConfigurations = { - -> - [ - // Benchmark test config. Add multiple configs for multiple models. - [ - title : 'Pytorch Vision Classification with Resnet 101', - test : 'apache_beam.testing.benchmarks.inference.pytorch_image_classification_benchmarks', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : 'benchmark-tests-pytorch-imagenet-python' + now, - project : 'apache-beam-testing', - region : 'us-central1', - machine_type : 'n1-standard-2', - num_workers : 75, - disk_size_gb : 50, - autoscaling_algorithm : 'NONE', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - requirements_file : 'apache_beam/ml/inference/torch_tests_requirements.txt', - publish_to_big_query : true, - metrics_dataset : 'beam_run_inference', - metrics_table : 'torch_inference_imagenet_results_resnet101', - input_options : '{}', // this option is not required for RunInference tests. - influx_measurement : 'torch_inference_imagenet_resnet101', - influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl, - pretrained_model_name : 'resnet101', - device : 'CPU', - input_file : 'gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt', - model_state_dict_path : 'gs://apache-beam-ml/models/torchvision.models.resnet101.pth', - output : 'gs://temp-storage-for-end-to-end-tests/torch/result_101' + now + '.txt' - ] - ], - [ - title : 'Pytorch Imagenet Classification with Resnet 152', - test : 'apache_beam.testing.benchmarks.inference.pytorch_image_classification_benchmarks', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : 'benchmark-tests-pytorch-imagenet-python' + now, - project : 'apache-beam-testing', - region : 'us-central1', - machine_type : 'n1-standard-2', - num_workers : 75, - disk_size_gb : 50, - autoscaling_algorithm : 'NONE', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - requirements_file : 'apache_beam/ml/inference/torch_tests_requirements.txt', - publish_to_big_query : true, - metrics_dataset : 'beam_run_inference', - metrics_table : 'torch_inference_imagenet_results_resnet152', - input_options : '{}', // this option is not required for RunInference tests. 
-        influx_measurement    : 'torch_inference_imagenet_resnet152',
-        influx_db_name        : InfluxDBCredentialsHelper.InfluxDBDatabaseName,
-        influx_hostname       : InfluxDBCredentialsHelper.InfluxDBHostUrl,
-        pretrained_model_name : 'resnet152',
-        device                : 'CPU',
-        input_file            : 'gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt',
-        model_state_dict_path : 'gs://apache-beam-ml/models/torchvision.models.resnet152.pth',
-        output                : 'gs://temp-storage-for-end-to-end-tests/torch/result_resnet152' + now + '.txt'
-      ]
-    ],
-    // Pytorch language modeling test using HuggingFace BERT models
-    [
-      title          : 'Pytorch Language Modeling using Hugging Face bert-base-uncased model',
-      test           : 'apache_beam.testing.benchmarks.inference.pytorch_language_modeling_benchmarks',
-      runner         : CommonTestProperties.Runner.DATAFLOW,
-      pipelineOptions: [
-        job_name              : 'benchmark-tests-pytorch-language-modeling-bert-base-uncased' + now,
-        project               : 'apache-beam-testing',
-        region                : 'us-central1',
-        machine_type          : 'n1-standard-2',
-        num_workers           : 250,
-        disk_size_gb          : 50,
-        autoscaling_algorithm : 'NONE',
-        staging_location      : 'gs://temp-storage-for-perf-tests/loadtests',
-        temp_location         : 'gs://temp-storage-for-perf-tests/loadtests',
-        requirements_file     : 'apache_beam/ml/inference/torch_tests_requirements.txt',
-        publish_to_big_query  : true,
-        metrics_dataset       : 'beam_run_inference',
-        metrics_table         : 'torch_language_modeling_bert_base_uncased',
-        input_options         : '{}', // this option is not required for RunInference tests.
-        influx_measurement    : 'torch_language_modeling_bert_base_uncased',
-        influx_db_name        : InfluxDBCredentialsHelper.InfluxDBDatabaseName,
-        influx_hostname       : InfluxDBCredentialsHelper.InfluxDBHostUrl,
-        device                : 'CPU',
-        input_file            : 'gs://apache-beam-ml/testing/inputs/sentences_50k.txt',
-        bert_tokenizer        : 'bert-base-uncased',
-        model_state_dict_path : 'gs://apache-beam-ml/models/huggingface.BertForMaskedLM.bert-base-uncased.pth',
-        output                : 'gs://temp-storage-for-end-to-end-tests/torch/result_bert_base_uncased' + now + '.txt',
-      ]
-    ],
-    [
-      title          : 'Pytorch Language Modeling using Hugging Face bert-large-uncased model',
-      test           : 'apache_beam.testing.benchmarks.inference.pytorch_language_modeling_benchmarks',
-      runner         : CommonTestProperties.Runner.DATAFLOW,
-      pipelineOptions: [
-        job_name              : 'benchmark-tests-pytorch-language-modeling-bert-large-uncased' + now,
-        project               : 'apache-beam-testing',
-        region                : 'us-central1',
-        machine_type          : 'n1-standard-2',
-        num_workers           : 250,
-        disk_size_gb          : 50,
-        autoscaling_algorithm : 'NONE',
-        staging_location      : 'gs://temp-storage-for-perf-tests/loadtests',
-        temp_location         : 'gs://temp-storage-for-perf-tests/loadtests',
-        requirements_file     : 'apache_beam/ml/inference/torch_tests_requirements.txt',
-        publish_to_big_query  : true,
-        metrics_dataset       : 'beam_run_inference',
-        metrics_table         : 'torch_language_modeling_bert_large_uncased',
-        input_options         : '{}', // this option is not required for RunInference tests.
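-        // bert_tokenizer and model_state_dict_path below pair the HuggingFace
-        // tokenizer name with a pre-staged BertForMaskedLM state dict in GCS.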
-        influx_measurement    : 'torch_language_modeling_bert_large_uncased',
-        influx_db_name        : InfluxDBCredentialsHelper.InfluxDBDatabaseName,
-        influx_hostname       : InfluxDBCredentialsHelper.InfluxDBHostUrl,
-        device                : 'CPU',
-        input_file            : 'gs://apache-beam-ml/testing/inputs/sentences_50k.txt',
-        bert_tokenizer        : 'bert-large-uncased',
-        model_state_dict_path : 'gs://apache-beam-ml/models/huggingface.BertForMaskedLM.bert-large-uncased.pth',
-        output                : 'gs://temp-storage-for-end-to-end-tests/torch/result_bert_large_uncased' + now + '.txt'
-      ]
-    ],
-    [
-      title          : 'Pytorch Imagenet Classification with Resnet 152 with Tesla T4 GPU',
-      test           : 'apache_beam.testing.benchmarks.inference.pytorch_image_classification_benchmarks',
-      runner         : CommonTestProperties.Runner.DATAFLOW,
-      pipelineOptions: [
-        job_name              : 'benchmark-tests-pytorch-imagenet-python-gpu' + now,
-        project               : 'apache-beam-testing',
-        region                : 'us-central1',
-        machine_type          : 'n1-standard-2',
-        num_workers           : 75, // this could be lowered: the apache-beam-testing project quota is 32 T4 GPUs as of November 28, 2022.
-        disk_size_gb          : 50,
-        autoscaling_algorithm : 'NONE',
-        staging_location      : 'gs://temp-storage-for-perf-tests/loadtests',
-        temp_location         : 'gs://temp-storage-for-perf-tests/loadtests',
-        requirements_file     : 'apache_beam/ml/inference/torch_tests_requirements.txt',
-        publish_to_big_query  : true,
-        metrics_dataset       : 'beam_run_inference',
-        metrics_table         : 'torch_inference_imagenet_results_resnet152_tesla_t4',
-        input_options         : '{}', // this option is not required for RunInference tests.
-        influx_measurement    : 'torch_inference_imagenet_resnet152_tesla_t4',
-        influx_db_name        : InfluxDBCredentialsHelper.InfluxDBDatabaseName,
-        influx_hostname       : InfluxDBCredentialsHelper.InfluxDBHostUrl,
-        pretrained_model_name : 'resnet152',
-        device                : 'GPU',
-        experiments           : 'worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver',
-        sdk_container_image   : 'us.gcr.io/apache-beam-testing/python-postcommit-it/tensor_rt:latest',
-        input_file            : 'gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt',
-        model_state_dict_path : 'gs://apache-beam-ml/models/torchvision.models.resnet152.pth',
-        output                : 'gs://temp-storage-for-end-to-end-tests/torch/result_resnet152_gpu' + now + '.txt'
-      ]
-    ],
-  ]
-}
-
-def loadTestJob = { scope ->
-  List testScenarios = loadTestConfigurations()
-  for (Map testConfig : testScenarios) {
-    commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 180)
-    loadTestsBuilder.loadTest(scope, testConfig.title, testConfig.runner, CommonTestProperties.SDK.PYTHON, testConfig.pipelineOptions, testConfig.test, null,
-        testConfig.pipelineOptions.requirements_file, RUN_INFERENCE_TEST_PYTHON_VERSION)
-  }
-}
-
-PhraseTriggeringPostCommitBuilder.postCommitJob(
-    'beam_Inference_Python_Benchmarks_Dataflow',
-    'Run Inference Benchmarks',
-    'RunInference benchmarks on Dataflow (\"Run Inference Benchmarks\")',
-    this
-    ) {
-      loadTestJob(delegate)
-    }
-
-CronJobBuilder.cronJob(
-    'beam_Inference_Python_Benchmarks_Dataflow', 'H H * * *',
-    this
-    ) {
-      loadTestJob(delegate)
-    }
diff --git a/.test-infra/jenkins/job_Inventory.groovy b/.test-infra/jenkins/job_Inventory.groovy
index 4723dfdf97e16..e115603d0d42f 100644
--- a/.test-infra/jenkins/job_Inventory.groovy
+++ b/.test-infra/jenkins/job_Inventory.groovy
@@ -32,7 +32,7 @@ nums.each {
     commonJobProperties.setTopLevelMainJobProperties(delegate)
 
     // Sets that this is a cron job.
- commonJobProperties.setCronJob(delegate, '45 6,18 * * *') + commonJobProperties.setCronJob(delegate, '45 */8 * * *') // Allows triggering this build against pull requests. commonJobProperties.enablePhraseTriggeringFromPullRequest( @@ -50,7 +50,7 @@ nums.each { } stringParam { name("tmp_unaccessed_for") - defaultValue("48") + defaultValue("24") description("Files from /tmp dir that were not accessed for last `tmp_unaccessed_for` hours will be deleted.") trim(true) } diff --git a/.test-infra/jenkins/job_LoadTests_CoGBK_Dataflow_V2_Java11.groovy b/.test-infra/jenkins/job_LoadTests_CoGBK_Dataflow_V2_Java11.groovy deleted file mode 100644 index fc7f39d28a0d2..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_CoGBK_Dataflow_V2_Java11.groovy +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import CronJobBuilder -import InfluxDBCredentialsHelper - -def loadTestConfigurations = { mode, isStreaming -> - [ - [ - title : 'Load test: CoGBK 2GB 100 byte records - single key', - test : 'org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java11_Dataflow_V2_${mode}_CoGBK_1", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_cogbk_1", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java11" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1 - } - """.trim().replaceAll("\\s", ""), - coSourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - iterations : 1, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - title : 'Load test: CoGBK 2GB 100 byte records - multiple keys', - test : 'org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java11_Dataflow_V2_${mode}_CoGBK_2", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_cogbk_2", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java11" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 5 
- } - """.trim().replaceAll("\\s", ""), - coSourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - iterations : 1, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - - title : 'Load test: CoGBK 2GB reiteration 10kB value', - test : 'org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java11_Dataflow_V2_${mode}_CoGBK_3", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_cogbk_3", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java11" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 200000 - } - """.trim().replaceAll("\\s", ""), - coSourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - iterations : 4, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - - ], - [ - title : 'Load test: CoGBK 2GB reiteration 2MB value', - test : 'org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java11_Dataflow_V2_${mode}_CoGBK_4", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_cogbk_4", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java11" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - coSourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - iterations : 4, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ] - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def final JOB_SPECIFIC_SWITCHES = [ - '-Prunner.version="V2"', - '-PcompileAndRunTestsWithJava11', - "-Pjava11Home=${commonJobProperties.JAVA_11_HOME}" -] - -def streamingLoadTestJob = { scope, triggeringContext -> - scope.description('Runs Java 11 CoGBK load tests on Dataflow runner V2 in streaming mode') - commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 240) - - for (testConfiguration in loadTestConfigurations('streaming', true)) { - testConfiguration.pipelineOptions << [inputWindowDurationSec: 1200, coInputWindowDurationSec: 1200] - loadTestsBuilder.loadTest(scope, testConfiguration.title, testConfiguration.runner, CommonTestProperties.SDK.JAVA, - testConfiguration.pipelineOptions, testConfiguration.test, JOB_SPECIFIC_SWITCHES) - } -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_CoGBK_Dataflow_V2_Streaming_Java11', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 
'beam_LoadTests_Java_CoGBK_Dataflow_V2_Streaming_Java11', - 'Run Load Tests Java 11 CoGBK Dataflow V2 Streaming', - 'Load Tests Java 11 CoGBK Dataflow V2 Streaming suite', - this - ) { - additionalPipelineArgs = [:] - streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } - - -def batchLoadTestJob = { scope, triggeringContext -> - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.JAVA, loadTestConfigurations('batch', false), - "CoGBK", "batch", JOB_SPECIFIC_SWITCHES) -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_CoGBK_Dataflow_V2_Batch_Java11', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_CoGBK_Dataflow_V2_Batch_Java11', - 'Run Load Tests Java 11 CoGBK Dataflow V2 Batch', - 'Load Tests Java 11 CoGBK Dataflow V2 Batch suite', - this - ) { - additionalPipelineArgs = [:] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } diff --git a/.test-infra/jenkins/job_LoadTests_CoGBK_Dataflow_V2_Java17.groovy b/.test-infra/jenkins/job_LoadTests_CoGBK_Dataflow_V2_Java17.groovy deleted file mode 100644 index ca8c6689ad0f7..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_CoGBK_Dataflow_V2_Java17.groovy +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import CronJobBuilder -import InfluxDBCredentialsHelper - -def loadTestConfigurations = { mode, isStreaming -> - [ - [ - title : 'Load test: CoGBK 2GB 100 byte records - single key', - test : 'org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java17_Dataflow_V2_${mode}_CoGBK_1", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_cogbk_1", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java17" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1 - } - """.trim().replaceAll("\\s", ""), - coSourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - iterations : 1, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - title : 'Load test: CoGBK 2GB 100 byte records - multiple keys', - test : 'org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java17_Dataflow_V2_${mode}_CoGBK_2", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_cogbk_2", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java17" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 5 - } - """.trim().replaceAll("\\s", ""), - coSourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - iterations : 1, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - - title : 'Load test: CoGBK 2GB reiteration 10kB value', - test : 'org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java17_Dataflow_V2_${mode}_CoGBK_3", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_cogbk_3", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java17" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 200000 - } - """.trim().replaceAll("\\s", ""), - coSourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - iterations : 4, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - - ], - [ - title : 'Load test: CoGBK 2GB reiteration 2MB value', - test : 'org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : 
"load_tests_Java17_Dataflow_V2_${mode}_CoGBK_4", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_cogbk_4", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java17" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - coSourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - iterations : 4, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ] - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def final JOB_SPECIFIC_SWITCHES = [ - '-Prunner.version="V2"', - '-PcompileAndRunTestsWithJava17', - "-Pjava17Home=${commonJobProperties.JAVA_17_HOME}" -] - -def streamingLoadTestJob = { scope, triggeringContext -> - scope.description('Runs Java 17 CoGBK load tests on Dataflow runner V2 in streaming mode') - commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 240) - - for (testConfiguration in loadTestConfigurations('streaming', true)) { - testConfiguration.pipelineOptions << [inputWindowDurationSec: 1200, coInputWindowDurationSec: 1200] - loadTestsBuilder.loadTest(scope, testConfiguration.title, testConfiguration.runner, CommonTestProperties.SDK.JAVA, - testConfiguration.pipelineOptions, testConfiguration.test, JOB_SPECIFIC_SWITCHES) - } -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_CoGBK_Dataflow_V2_Streaming_Java17', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_CoGBK_Dataflow_V2_Streaming_Java17', - 'Run Load Tests Java 17 CoGBK Dataflow V2 Streaming', - 'Load Tests Java 17 CoGBK Dataflow V2 Streaming suite', - this - ) { - additionalPipelineArgs = [:] - streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } - - -def batchLoadTestJob = { scope, triggeringContext -> - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.JAVA, loadTestConfigurations('batch', false), - "CoGBK", "batch", JOB_SPECIFIC_SWITCHES) -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_CoGBK_Dataflow_V2_Batch_Java17', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_CoGBK_Dataflow_V2_Batch_Java17', - 'Run Load Tests Java 17 CoGBK Dataflow V2 Batch', - 'Load Tests Java 17 CoGBK Dataflow V2 Batch suite', - this - ) { - additionalPipelineArgs = [:] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } diff --git a/.test-infra/jenkins/job_LoadTests_CoGBK_Java.groovy b/.test-infra/jenkins/job_LoadTests_CoGBK_Java.groovy deleted file mode 100644 index 373e11e5a7af6..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_CoGBK_Java.groovy +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import CronJobBuilder -import InfluxDBCredentialsHelper - -def loadTestConfigurations = { mode, isStreaming -> - [ - [ - title : 'Load test: CoGBK 2GB 100 byte records - single key', - test : 'org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java_Dataflow_${mode}_CoGBK_1", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_cogbk_1", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1 - } - """.trim().replaceAll("\\s", ""), - coSourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - iterations : 1, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - title : 'Load test: CoGBK 2GB 100 byte records - multiple keys', - test : 'org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java_Dataflow_${mode}_CoGBK_2", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_cogbk_2", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 5 - } - """.trim().replaceAll("\\s", ""), - coSourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - iterations : 1, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - - title : 'Load test: CoGBK 2GB reiteration 10kB value', - test : 'org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java_Dataflow_${mode}_CoGBK_3", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_cogbk_3", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 200000 - } - """.trim().replaceAll("\\s", ""), - coSourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - iterations : 4, - numWorkers : 5, - 
autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - - ], - [ - title : 'Load test: CoGBK 2GB reiteration 2MB value', - test : 'org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java_Dataflow_${mode}_CoGBK_4", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_cogbk_4", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - coSourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - iterations : 4, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ] - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def streamingLoadTestJob = { scope, triggeringContext -> - scope.description('Runs Java CoGBK load tests on Dataflow runner in streaming mode') - commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 240) - - for (testConfiguration in loadTestConfigurations('streaming', true)) { - testConfiguration.pipelineOptions << [inputWindowDurationSec: 1200, coInputWindowDurationSec: 1200] - loadTestsBuilder.loadTest(scope, testConfiguration.title, testConfiguration.runner, CommonTestProperties.SDK.JAVA, testConfiguration.pipelineOptions, testConfiguration.test) - } -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_CoGBK_Dataflow_Streaming', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_CoGBK_Dataflow_Streaming', - 'Run Load Tests Java CoGBK Dataflow Streaming', - 'Load Tests Java CoGBK Dataflow Streaming suite', - this - ) { - additionalPipelineArgs = [:] - streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } - - -def batchLoadTestJob = { scope, triggeringContext -> - - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.JAVA, loadTestConfigurations('batch', false), "CoGBK", "batch") -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_CoGBK_Dataflow_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_CoGBK_Dataflow_Batch', - 'Run Load Tests Java CoGBK Dataflow Batch', - 'Load Tests Java CoGBK Dataflow Batch suite', - this - ) { - additionalPipelineArgs = [:] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } diff --git a/.test-infra/jenkins/job_LoadTests_CoGBK_Java_spark_structured_streaming.groovy b/.test-infra/jenkins/job_LoadTests_CoGBK_Java_spark_structured_streaming.groovy deleted file mode 100644 index 1d02dd7c6d818..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_CoGBK_Java_spark_structured_streaming.groovy +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonTestProperties -import CronJobBuilder -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -def loadTestConfigurations = { mode, isStreaming -> - [ - [ - title : 'Load test: CoGBK 2GB 100 byte records - single key', - test : 'org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest', - runner : CommonTestProperties.Runner.SPARK_STRUCTURED_STREAMING, - pipelineOptions: [ - project : 'apache-beam-testing', - appName : "load_tests_Java_SparkStructuredStreaming_${mode}_CoGBK_1", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_cogbk_1", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1 - } - """.trim().replaceAll("\\s", ""), - coSourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - iterations : 1, - streaming : isStreaming - ] - ], - [ - title : 'Load test: CoGBK 2GB 100 byte records - multiple keys', - test : 'org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest', - runner : CommonTestProperties.Runner.SPARK_STRUCTURED_STREAMING, - pipelineOptions: [ - project : 'apache-beam-testing', - appName : "load_tests_Java_SparkStructuredStreaming_${mode}_CoGBK_2", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_cogbk_2", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 5 - } - """.trim().replaceAll("\\s", ""), - coSourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - iterations : 1, - streaming : isStreaming - ] - ], - [ - - title : 'Load test: CoGBK 2GB reiteration 10kB value', - test : 'org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest', - runner : CommonTestProperties.Runner.SPARK_STRUCTURED_STREAMING, - pipelineOptions: [ - project : 'apache-beam-testing', - appName : "load_tests_Java_SparkStructuredStreaming_${mode}_CoGBK_3", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_cogbk_3", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 200000 - } - """.trim().replaceAll("\\s", ""), - coSourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - iterations : 4, - streaming : isStreaming - ] - - ], - [ - title : 'Load test: CoGBK 2GB reiteration 2MB value', - test : 
'org.apache.beam.sdk.loadtests.CoGroupByKeyLoadTest', - runner : CommonTestProperties.Runner.SPARK_STRUCTURED_STREAMING, - pipelineOptions: [ - project : 'apache-beam-testing', - appName : "load_tests_Java_SparkStructuredStreaming_${mode}_CoGBK_4", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_cogbk_4", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - coSourceOptions : """ - { - "numRecords": 2000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 1000 - } - """.trim().replaceAll("\\s", ""), - iterations : 4, - streaming : isStreaming - ] - ] - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def batchLoadTestJob = { scope, triggeringContext -> - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.JAVA, loadTestConfigurations('batch', false), "CoGBK", "batch") -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_CoGBK_SparkStructuredStreaming_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_CoGBK_SparkStructuredStreaming_Batch', - 'Run Load Tests Java CoGBK SparkStructuredStreaming Batch', - 'Load Tests Java CoGBK SparkStructuredStreaming Batch suite', - this - ) { - additionalPipelineArgs = [:] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } diff --git a/.test-infra/jenkins/job_LoadTests_Combine_Flink_Go.groovy b/.test-infra/jenkins/job_LoadTests_Combine_Flink_Go.groovy deleted file mode 100644 index 9b8adc732f98a..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_Combine_Flink_Go.groovy +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import Flink -import InfluxDBCredentialsHelper - -import static LoadTestsBuilder.DOCKER_BEAM_JOBSERVER -import static LoadTestsBuilder.GO_SDK_CONTAINER - - -String now = new Date().format('MMddHHmmss', TimeZone.getTimeZone('UTC')) - -def batchScenarios = { - [ - [ - title : 'Combine Go Load test: 2GB of 10B records', - test : 'combine', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-combine-1-${now}", - influx_namespace : 'flink', - influx_measurement : 'go_batch_combine_1', - input_options : '\'{' + - '"num_records": 200000000,' + - '"key_size": 1,' + - '"value_size": 9}\'', - fanout : 1, - top_count : 20, - parallelism : 5, - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'Combine Go Load test: fanout 4 times with 2GB 10-byte records total', - test : 'combine', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-combine-4-${now}", - influx_namespace : 'flink', - influx_measurement : 'go_batch_combine_4', - input_options : '\'{' + - '"num_records": 5000000,' + - '"key_size": 10,' + - '"value_size": 90}\'', - fanout : 4, - top_count : 20, - parallelism : 16, - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'Combine Go Load test: fanout 8 times with 2GB 10-byte records total', - test : 'combine', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-combine-5-${now}", - influx_namespace : 'flink', - influx_measurement : 'go_batch_combine_5', - fanout : 8, - top_count : 20, - parallelism : 16, - input_options : '\'{' + - '"num_records": 2500000,' + - '"key_size": 10,' + - '"value_size": 90}\'', - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def loadTestJob = { scope, triggeringContext, mode -> - Map testScenariosByParallelism = batchScenarios().groupBy { test -> - test.pipelineOptions.parallelism - } - Integer initialParallelism = testScenariosByParallelism.keySet().iterator().next() - List initialScenarios = testScenariosByParallelism.remove(initialParallelism) - - def flink = new Flink(scope, "beam_LoadTests_Go_Combine_Flink_${mode.capitalize()}") - flink.setUp( - [ - GO_SDK_CONTAINER - ], - initialParallelism, - "${DOCKER_BEAM_JOBSERVER}/beam_flink${CommonTestProperties.getFlinkVersion()}_job_server:latest") - - // Execute all scenarios connected with initial parallelism. - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.GO, initialScenarios, 'combine', mode) - - // Execute the rest of scenarios. 
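-  // (Scenarios were grouped by parallelism above; the cluster is rescaled once
-  // per remaining group before that group's scenarios run.)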
- testScenariosByParallelism.each { parallelism, scenarios -> - flink.scaleCluster(parallelism) - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.GO, scenarios, 'combine', mode) - } -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Go_Combine_Flink_Batch', - 'Run Load Tests Go Combine Flink Batch', - 'Load Tests Go Combine Flink Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'batch') - } - -CronJobBuilder.cronJob('beam_LoadTests_Go_Combine_Flink_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - // TODO(BEAM): Fix this test. - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'batch') -} diff --git a/.test-infra/jenkins/job_LoadTests_Combine_Flink_Python.groovy b/.test-infra/jenkins/job_LoadTests_Combine_Flink_Python.groovy deleted file mode 100644 index 54b92fdade264..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_Combine_Flink_Python.groovy +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import Flink -import InfluxDBCredentialsHelper - -import static LoadTestsBuilder.DOCKER_CONTAINER_REGISTRY -import static LoadTestsBuilder.DOCKER_BEAM_JOBSERVER -import static LoadTestsBuilder.DOCKER_BEAM_SDK_IMAGE - -String now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -// TODO(https://github.com/apache/beam/issues/20402): Skipping some cases because they are too slow. 
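-// Each entry is matched as a job_name prefix (see the startsWith filter at the
-// end of loadTestConfigurations below).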
-def TESTS_TO_SKIP = [ - 'load-tests-python-flink-streaming-combine-1', -] - -def loadTestConfigurations = { mode, datasetName -> - [ - [ - title : 'Combine Python Load test: 2GB 10 byte records', - test : 'apache_beam.testing.load_tests.combine_test', - runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - job_name : "load-tests-python-flink-${mode}-combine-1-${now}", - project : 'apache-beam-testing', - publish_to_big_query: true, - metrics_dataset : datasetName, - metrics_table : "python_flink_${mode}_combine_1", - influx_measurement : "python_${mode}_combine_1", - input_options : '\'{' + - '"num_records": 200000000,' + - '"key_size": 1,' + - '"value_size": 9,' + - '"algorithm": "lcg"}\'', - parallelism : 5, - job_endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - top_count : 20, - ] - ], - [ - title : 'Combine Python Load test: 2GB Fanout 4', - test : 'apache_beam.testing.load_tests.combine_test', - runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - job_name : "load-tests-python-flink-${mode}-combine-4-${now}", - project : 'apache-beam-testing', - publish_to_big_query: true, - metrics_dataset : datasetName, - metrics_table : "python_flink_${mode}_combine_4", - influx_measurement : "python_${mode}_combine_4", - input_options : '\'{' + - '"num_records": 5000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"algorithm": "lcg"}\'', - parallelism : 16, - job_endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - fanout : 4, - top_count : 20, - ] - ], - [ - title : 'Combine Python Load test: 2GB Fanout 8', - test : 'apache_beam.testing.load_tests.combine_test', - runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - job_name : "load-tests-python-flink-${mode}-combine-5-${now}", - project : 'apache-beam-testing', - publish_to_big_query: true, - metrics_dataset : datasetName, - metrics_table : "python_flink_${mode}_combine_5", - influx_measurement : "python_${mode}_combine_5", - input_options : '\'{' + - '"num_records": 2500000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"algorithm": "lcg"}\'', - parallelism : 16, - job_endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - fanout : 8, - top_count : 20, - ] - ] - ] - .each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } - .each { test -> (mode != 'streaming') ?: addStreamingOptions(test) } - .collectMany { test -> - TESTS_TO_SKIP.any { element -> test.pipelineOptions.job_name.startsWith(element) } ? 
[]: [test] - } -} - -def addStreamingOptions(test) { - test.pipelineOptions << [streaming: null, - use_stateful_load_generator: null - ] -} - -def loadTestJob = { scope, triggeringContext, mode -> - def datasetName = loadTestsBuilder.getBigQueryDataset('load_test', triggeringContext) - List testScenarios = loadTestConfigurations(mode, datasetName) - Map testScenariosByParallelism = testScenarios.groupBy { test -> - test.pipelineOptions.parallelism - } - Integer initialParallelism = testScenariosByParallelism.keySet().iterator().next() - List initialScenarios = testScenariosByParallelism.remove(initialParallelism) - - def flink = new Flink(scope, "beam_LoadTests_Python_Combine_Flink_${mode.capitalize()}") - flink.setUp( - [ - "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}" - ], - initialParallelism, - "${DOCKER_BEAM_JOBSERVER}/beam_flink${CommonTestProperties.getFlinkVersion()}_job_server:latest") - - // Execute all scenarios connected with initial parallelism. - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.PYTHON, initialScenarios, 'Combine', mode) - - // Execute the rest of scenarios. - testScenariosByParallelism.each { parallelism, scenarios -> - flink.scaleCluster(parallelism) - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.PYTHON, scenarios, 'Combine', mode) - } -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Python_Combine_Flink_Batch', - 'Run Load Tests Python Combine Flink Batch', - 'Load Tests Python Combine Flink Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'batch') - } - -CronJobBuilder.cronJob('beam_LoadTests_Python_Combine_Flink_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'batch') -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Python_Combine_Flink_Streaming', - 'Run Load Tests Python Combine Flink Streaming', - 'Load Tests Python Combine Flink Streaming suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'streaming') - } - -CronJobBuilder.cronJob('beam_LoadTests_Python_Combine_Flink_Streaming', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - // TODO(BEAM): Fix this test. - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'streaming') -} diff --git a/.test-infra/jenkins/job_LoadTests_Combine_Go.groovy b/.test-infra/jenkins/job_LoadTests_Combine_Go.groovy deleted file mode 100644 index 0d6b7b8bb5b49..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_Combine_Go.groovy +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -import static LoadTestsBuilder.GO_SDK_CONTAINER - - -String now = new Date().format('MMddHHmmss', TimeZone.getTimeZone('UTC')) - -def batchScenarios = { - [ - [ - title : 'Combine Go Load test: 2GB of 10B records', - test : 'combine', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-combine-1-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_namespace : 'dataflow', - influx_measurement : 'go_batch_combine_1', - input_options : '\'{' + - '"num_records": 200000000,' + - '"key_size": 1,' + - '"value_size": 9}\'', - fanout : 1, - top_count : 20, - num_workers : 5, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'Combine Go Load test: fanout 4 times with 2GB 10-byte records total', - test : 'combine', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-combine-4-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_namespace : 'dataflow', - influx_measurement : 'go_batch_combine_4', - input_options : '\'{' + - '"num_records": 5000000,' + - '"key_size": 10,' + - '"value_size": 90}\'', - fanout : 4, - top_count : 20, - num_workers : 16, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'Combine Go Load test: fanout 8 times with 2GB 10-byte records total', - test : 'combine', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-combine-5-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_namespace : 'dataflow', - influx_measurement : 'go_batch_combine_5', - input_options : '\'{' + - '"num_records": 2500000,' + - '"key_size": 10,' + - '"value_size": 90}\'', - fanout : 8, - top_count : 20, - num_workers : 16, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def loadTestJob = { scope, triggeringContext, mode -> - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.GO, batchScenarios(), 'combine', mode) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Go_Combine_Dataflow_Batch', - 'Run Load Tests Go Combine Dataflow Batch', - 'Load Tests Go Combine Dataflow Batch suite', - this - ) { - additionalPipelineArgs 
= [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'batch') - } - -CronJobBuilder.cronJob('beam_LoadTests_Go_Combine_Dataflow_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'batch') -} diff --git a/.test-infra/jenkins/job_LoadTests_Combine_Java.groovy b/.test-infra/jenkins/job_LoadTests_Combine_Java.groovy deleted file mode 100644 index a4254ca219493..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_Combine_Java.groovy +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import CronJobBuilder -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -def commonLoadTestConfig = { jobType, isStreaming -> - [ - [ - title : 'Load test: 2GB of 10B records', - test : 'org.apache.beam.sdk.loadtests.CombineLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java_Dataflow_${jobType}_Combine_1", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_combine_1", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 200000000, - "keySizeBytes": 1, - "valueSizeBytes": 9 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 1, - topCount : 20, - numWorkers : 5, - autoscalingAlgorithm: "NONE", - perKeyCombiner : "TOP_LARGEST", - streaming : isStreaming - ] - ], - [ - title : 'Load test: fanout 4 times with 2GB 10-byte records total', - test : 'org.apache.beam.sdk.loadtests.CombineLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java_Dataflow_${jobType}_Combine_4", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_combine_4", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 5000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - fanout : 4, - iterations : 1, - topCount : 20, - numWorkers : 16, - autoscalingAlgorithm: "NONE", - perKeyCombiner : "TOP_LARGEST", - streaming : isStreaming - ] - ], - [ - title : 'Load test: fanout 8 times with 2GB 10-byte records total', - test : 'org.apache.beam.sdk.loadtests.CombineLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 
'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java_Dataflow_${jobType}_Combine_5", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_combine_5", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 2500000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - fanout : 8, - iterations : 1, - topCount : 20, - numWorkers : 16, - autoscalingAlgorithm: "NONE", - perKeyCombiner : "TOP_LARGEST", - streaming : isStreaming - ] - ] - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - - -def batchLoadTestJob = { scope, triggeringContext -> - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.JAVA, commonLoadTestConfig('batch', false), "Combine", "batch") -} - -def streamingLoadTestJob = {scope, triggeringContext -> - scope.description('Runs Java Combine load tests on Dataflow runner in streaming mode') - commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 240) - - for (testConfiguration in commonLoadTestConfig('streaming', true)) { - testConfiguration.pipelineOptions << [inputWindowDurationSec: 1200] - loadTestsBuilder.loadTest(scope, testConfiguration.title, testConfiguration.runner, CommonTestProperties.SDK.JAVA, testConfiguration.pipelineOptions, testConfiguration.test) - } -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_Combine_Dataflow_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_Combine_Dataflow_Streaming', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_Combine_Dataflow_Batch', - 'Run Load Tests Java Combine Dataflow Batch', - 'Load Tests Java Combine Dataflow Batch suite', - this - ) { - additionalPipelineArgs = [:] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_Combine_Dataflow_Streaming', - 'Run Load Tests Java Combine Dataflow Streaming', - 'Load Tests Java Combine Dataflow Streaming suite', - this - ) { - additionalPipelineArgs = [:] - streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } diff --git a/.test-infra/jenkins/job_LoadTests_Combine_Java_Smoke.groovy b/.test-infra/jenkins/job_LoadTests_Combine_Java_Smoke.groovy deleted file mode 100644 index fa22932716d92..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_Combine_Java_Smoke.groovy +++ /dev/null @@ -1,85 +0,0 @@ - -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder - -import static LoadTestConfig.fromTemplate -import static LoadTestConfig.templateConfig - -def smokeTestConfigurations = { - def template = templateConfig { - title 'CombineLoadTest load test Dataflow-1' - test 'org.apache.beam.sdk.loadtests.CombineLoadTest' - dataflow() - pipelineOptions { - java() - appName 'smoke-dsl-java' - project 'apache-beam-testing' - tempLocation 'gs://temp-storage-for-perf-tests/smoketests' - numWorkers 5 - autoscalingAlgorithm 'NONE' - sourceOptions { - numRecords 100000 - splitPointFrequencyRecords 1 - } - stepOptions { - outputRecordsPerInputRecord 1 - preservesInputKeyDistribution true - } - specificParameters([ - fanout: 10, - iterations: 1 - ]) - } - } - [ - fromTemplate(template), - fromTemplate(template) { - title 'CombineLoadTest load test Dataflow-2' - pipelineOptions { - numWorkers 3 - specificParameters([ - fanout: 1 - ]) - } - }, - fromTemplate(template) { - title 'CombineLoadTest load test Dataflow-3' - pipelineOptions { - sourceOptions { - numRecords 20000 - } - } - }, - ] -} - - - -// Runs a tiny version of the load test suite to ensure nothing is broken. -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_Java_LoadTests_Combine_Smoke', - 'Run Java Load Tests Combine Smoke', - 'Java Load Tests Combine Smoke', - this - ) { - loadTestsBuilder.loadTests(delegate, CommonTestProperties.SDK.JAVA, smokeTestConfigurations(), "Combine", "smoke") - } diff --git a/.test-infra/jenkins/job_LoadTests_Combine_Java_spark_structured_streaming.groovy b/.test-infra/jenkins/job_LoadTests_Combine_Java_spark_structured_streaming.groovy deleted file mode 100644 index ff1dc2bf190a4..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_Combine_Java_spark_structured_streaming.groovy +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
- */ - -import CommonTestProperties -import CronJobBuilder -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -def commonLoadTestConfig = { jobType, isStreaming -> - [ - [ - title : 'Load test: 2GB of 10B records', - test : 'org.apache.beam.sdk.loadtests.CombineLoadTest', - runner : CommonTestProperties.Runner.SPARK_STRUCTURED_STREAMING, - pipelineOptions: [ - project : 'apache-beam-testing', - appName : "load_tests_Java_SparkStructuredStreaming_${jobType}_Combine_1", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_combine_1", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 200000000, - "keySizeBytes": 1, - "valueSizeBytes": 9 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 1, - topCount : 20, - perKeyCombiner : "TOP_LARGEST", - streaming : isStreaming - ] - ], - [ - title : 'Load test: fanout 4 times with 2GB 10-byte records total', - test : 'org.apache.beam.sdk.loadtests.CombineLoadTest', - runner : CommonTestProperties.Runner.SPARK_STRUCTURED_STREAMING, - pipelineOptions: [ - project : 'apache-beam-testing', - appName : "load_tests_Java_SparkStructuredStreaming_${jobType}_Combine_4", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_combine_4", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 5000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - fanout : 4, - iterations : 1, - topCount : 20, - perKeyCombiner : "TOP_LARGEST", - streaming : isStreaming - ] - ], - [ - title : 'Load test: fanout 8 times with 2GB 10-byte records total', - test : 'org.apache.beam.sdk.loadtests.CombineLoadTest', - runner : CommonTestProperties.Runner.SPARK_STRUCTURED_STREAMING, - pipelineOptions: [ - project : 'apache-beam-testing', - appName : "load_tests_Java_SparkStructuredStreaming_${jobType}_Combine_5", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_combine_5", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 2500000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - fanout : 8, - iterations : 1, - topCount : 20, - perKeyCombiner : "TOP_LARGEST", - streaming : isStreaming - ] - ] - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - - -def batchLoadTestJob = { scope, triggeringContext -> - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.JAVA, commonLoadTestConfig('batch', false), "Combine", "batch") -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_Combine_SparkStructuredStreaming_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_Combine_SparkStructuredStreaming_Batch', - 'Run Load Tests Java Combine SparkStructuredStreaming Batch', - 'Load Tests Java Combine SparkStructuredStreaming Batch suite', - this - ) { - additionalPipelineArgs = [:] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } diff --git a/.test-infra/jenkins/job_LoadTests_Combine_Python.groovy b/.test-infra/jenkins/job_LoadTests_Combine_Python.groovy deleted file mode 100644 index 
b1adca8e8f195..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_Combine_Python.groovy +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -def loadTestConfigurations = { datasetName, mode -> - [ - [ - title : 'Combine Python Load test: 2GB 10 byte records', - test : 'apache_beam.testing.load_tests.combine_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-python-dataflow-${mode}-combine-1-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/smoketests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_combine_1", - influx_measurement : "python_${mode}_combine_1", - input_options : '\'{' + - '"num_records": 200000000,' + - '"key_size": 1,' + - '"value_size": 9,' + - '"algorithm": "lcg"}\'', - num_workers : 5, - autoscaling_algorithm: "NONE", - top_count : 20, - ] - ], - [ - title : 'Combine Python Load test: 2GB Fanout 4', - test : 'apache_beam.testing.load_tests.combine_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-python-dataflow-${mode}-combine-4-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/smoketests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_combine_4", - influx_measurement : "python_${mode}_combine_4", - input_options : '\'{' + - '"num_records": 5000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"algorithm": "lcg"}\'', - num_workers : 16, - autoscaling_algorithm: "NONE", - fanout : 4, - top_count : 20, - ] - ], - [ - title : 'Combine Python Load test: 2GB Fanout 8', - test : 'apache_beam.testing.load_tests.combine_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-python-dataflow-${mode}-combine-5-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/smoketests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_combine_5", - influx_measurement : "python_${mode}_combine_5", - input_options : '\'{' + - '"num_records": 2500000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"algorithm": "lcg"}\'', - num_workers : 16, - autoscaling_algorithm: "NONE", - fanout : 8, - top_count : 20, - ] - ], - ] - 
.each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } - .each{ test -> (mode != 'streaming') ?: addStreamingOptions(test) } -} - -def addStreamingOptions(test){ - test.pipelineOptions << [streaming: null, - experiments: "use_runner_v2" - ] -} - -def loadTestJob = { scope, triggeringContext, jobType -> - scope.description("Runs Python Combine load tests on Dataflow runner in ${jobType} mode") - commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 720) - - def datasetName = loadTestsBuilder.getBigQueryDataset('load_test', triggeringContext) - for (testConfiguration in loadTestConfigurations(datasetName, jobType)) { - loadTestsBuilder.loadTest(scope, testConfiguration.title, testConfiguration.runner, CommonTestProperties.SDK.PYTHON, testConfiguration.pipelineOptions, testConfiguration.test) - } -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Python_Combine_Dataflow_Batch', - 'Run Load Tests Python Combine Dataflow Batch', - 'Load Tests Python Combine Dataflow Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, "batch") - } - -CronJobBuilder.cronJob('beam_LoadTests_Python_Combine_Dataflow_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, "batch") -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Python_Combine_Dataflow_Streaming', - 'Run Load Tests Python Combine Dataflow Streaming', - 'Load Tests Python Combine Dataflow Streaming suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, "streaming") - } - -CronJobBuilder.cronJob('beam_LoadTests_Python_Combine_Dataflow_Streaming', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, "streaming") -} diff --git a/.test-infra/jenkins/job_LoadTests_FnApiRunner_Python.groovy b/.test-infra/jenkins/job_LoadTests_FnApiRunner_Python.groovy deleted file mode 100644 index 538ef2be908cf..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_FnApiRunner_Python.groovy +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import CommonJobProperties as commonJobProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -def loadTestConfigurations = { datasetName -> - [ - [ - title : 'FnApiRunner Python load test - microbenchmark', - test : 'apache_beam.testing.load_tests.microbenchmarks_test', - runner : CommonTestProperties.Runner.DIRECT, - pipelineOptions: [ - publish_to_big_query: true, - influx_measurement : 'python_direct_microbenchmarks', - project : 'apache-beam-testing', - metrics_dataset : datasetName, - metrics_table : 'python_direct_microbenchmarks', - input_options : '\'{}\'', - ] - ], - ] - .each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def loadTestJob = { scope, triggeringContext -> - scope.description("Runs Python FnApiRunner Microbenchmark") - commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 120) - - def datasetName = loadTestsBuilder.getBigQueryDataset('load_test', triggeringContext) - for (testConfiguration in loadTestConfigurations(datasetName)) { - loadTestsBuilder.loadTest(scope, testConfiguration.title, testConfiguration.runner, CommonTestProperties.SDK.PYTHON, testConfiguration.pipelineOptions, testConfiguration.test) - } -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_Python_LoadTests_FnApiRunner_Microbenchmark', - 'Run Python Load Tests FnApiRunner Microbenchmark', - 'Python Load Tests FnApiRunner Microbenchmark', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } - - -// Run this job every 6 hours on a random minute. -CronJobBuilder.cronJob('beam_Python_LoadTests_FnApiRunner_Microbenchmark', 'H H/6 * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - diff --git a/.test-infra/jenkins/job_LoadTests_GBK_Dataflow_V2_Java11.groovy b/.test-infra/jenkins/job_LoadTests_GBK_Dataflow_V2_Java11.groovy deleted file mode 100644 index cc2d5d2e55540..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_GBK_Dataflow_V2_Java11.groovy +++ /dev/null @@ -1,311 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import CronJobBuilder -import InfluxDBCredentialsHelper - -def loadTestConfigurations = { mode, isStreaming -> - [ - [ - title : 'Load test: 2GB of 10B records', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java11_Dataflow_V2_${mode}_GBK_1", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_1", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java11" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 200000000, - "keySizeBytes": 1, - "valueSizeBytes": 9 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 1, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - title : 'Load test: 2GB of 100B records', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java11_Dataflow_V2_${mode}_GBK_2", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_2", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java11" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 1, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - - title : 'Load test: 2GB of 100kB records', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java11_Dataflow_V2_${mode}_GBK_3", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_3", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java11" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000, - "keySizeBytes": 10000, - "valueSizeBytes": 90000 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 1, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - - ], - [ - title : 'Load test: fanout 4 times with 2GB 10-byte records total', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java11_Dataflow_V2_${mode}_GBK_4", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_4", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java11" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 5000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - fanout : 4, - iterations : 1, - numWorkers : 16, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - title : 'Load test: fanout 8 times with 2GB 
10-byte records total', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java11_Dataflow_V2_${mode}_GBK_5", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_5", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java11" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 2500000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - fanout : 8, - iterations : 1, - numWorkers : 16, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - title : 'Load test: reiterate 4 times 10kB values', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java11_Dataflow_V2_${mode}_GBK_6", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_6", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java11" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 200, - "hotKeyFraction": 1 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 4, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - title : 'Load test: reiterate 4 times 2MB values', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java11_Dataflow_V2_${mode}_GBK_7", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_7", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java11" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 10, - "hotKeyFraction": 1 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 4, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ] - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def final JOB_SPECIFIC_SWITCHES = [ - '-Prunner.version="V2"', - '-PcompileAndRunTestsWithJava11', - "-Pjava11Home=${commonJobProperties.JAVA_11_HOME}" -] - -def streamingLoadTestJob = { scope, triggeringContext -> - scope.description('Runs Java 11 GBK load tests on Dataflow runner V2 in streaming mode') - commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 240) - - for (testConfiguration in loadTestConfigurations('streaming', true)) { - testConfiguration.pipelineOptions << [inputWindowDurationSec: 1200] - loadTestsBuilder.loadTest(scope, testConfiguration.title, testConfiguration.runner, CommonTestProperties.SDK.JAVA, - testConfiguration.pipelineOptions, testConfiguration.test, JOB_SPECIFIC_SWITCHES) - } -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java11', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - 
streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java11', - 'Run Load Tests Java 11 GBK Dataflow V2 Streaming', - 'Load Tests Java 11 GBK Dataflow V2 Streaming suite', - this - ) { - additionalPipelineArgs = [:] - streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } - - -def batchLoadTestJob = { scope, triggeringContext -> - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.JAVA, loadTestConfigurations('batch', false), - "GBK", "batch", JOB_SPECIFIC_SWITCHES) -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java11', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java11', - 'Run Load Tests Java 11 GBK Dataflow V2 Batch', - 'Load Tests Java 11 GBK Dataflow V2 Batch suite', - this - ) { - additionalPipelineArgs = [:] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } diff --git a/.test-infra/jenkins/job_LoadTests_GBK_Dataflow_V2_Java17.groovy b/.test-infra/jenkins/job_LoadTests_GBK_Dataflow_V2_Java17.groovy deleted file mode 100644 index 7405f9154b838..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_GBK_Dataflow_V2_Java17.groovy +++ /dev/null @@ -1,311 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import CronJobBuilder -import InfluxDBCredentialsHelper - -def loadTestConfigurations = { mode, isStreaming -> - [ - [ - title : 'Load test: 2GB of 10B records', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java17_Dataflow_V2_${mode}_GBK_1", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_1", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java17" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 200000000, - "keySizeBytes": 1, - "valueSizeBytes": 9 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 1, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - title : 'Load test: 2GB of 100B records', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java17_Dataflow_V2_${mode}_GBK_2", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_2", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java17" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 1, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - - title : 'Load test: 2GB of 100kB records', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java17_Dataflow_V2_${mode}_GBK_3", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_3", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java17" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000, - "keySizeBytes": 10000, - "valueSizeBytes": 90000 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 1, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - - ], - [ - title : 'Load test: fanout 4 times with 2GB 10-byte records total', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java17_Dataflow_V2_${mode}_GBK_4", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_4", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java17" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 5000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - fanout : 4, - iterations : 1, - numWorkers : 16, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - title : 'Load test: fanout 8 times with 2GB 
10-byte records total', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java17_Dataflow_V2_${mode}_GBK_5", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_5", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java17" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 2500000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - fanout : 8, - iterations : 1, - numWorkers : 16, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - title : 'Load test: reiterate 4 times 10kB values', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java17_Dataflow_V2_${mode}_GBK_6", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_6", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java17" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 200, - "hotKeyFraction": 1 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 4, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - title : 'Load test: reiterate 4 times 2MB values', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java17_Dataflow_V2_${mode}_GBK_7", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_7", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java17" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 10, - "hotKeyFraction": 1 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 4, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ] - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def final JOB_SPECIFIC_SWITCHES = [ - '-Prunner.version="V2"', - '-PcompileAndRunTestsWithJava17', - "-Pjava17Home=${commonJobProperties.JAVA_17_HOME}" -] - -def streamingLoadTestJob = { scope, triggeringContext -> - scope.description('Runs Java 17 GBK load tests on Dataflow runner V2 in streaming mode') - commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 240) - - for (testConfiguration in loadTestConfigurations('streaming', true)) { - testConfiguration.pipelineOptions << [inputWindowDurationSec: 1200] - loadTestsBuilder.loadTest(scope, testConfiguration.title, testConfiguration.runner, CommonTestProperties.SDK.JAVA, - testConfiguration.pipelineOptions, testConfiguration.test, JOB_SPECIFIC_SWITCHES) - } -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java17', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - 
streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_GBK_Dataflow_V2_Streaming_Java17', - 'Run Load Tests Java 17 GBK Dataflow V2 Streaming', - 'Load Tests Java 17 GBK Dataflow V2 Streaming suite', - this - ) { - additionalPipelineArgs = [:] - streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } - - -def batchLoadTestJob = { scope, triggeringContext -> - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.JAVA, loadTestConfigurations('batch', false), - "GBK", "batch", JOB_SPECIFIC_SWITCHES) -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java17', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_GBK_Dataflow_V2_Batch_Java17', - 'Run Load Tests Java 17 GBK Dataflow V2 Batch', - 'Load Tests Java 17 GBK Dataflow V2 Batch suite', - this - ) { - additionalPipelineArgs = [:] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } diff --git a/.test-infra/jenkins/job_LoadTests_GBK_Flink_Go.groovy b/.test-infra/jenkins/job_LoadTests_GBK_Flink_Go.groovy deleted file mode 100644 index d5a6910b2a0d1..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_GBK_Flink_Go.groovy +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import Flink -import InfluxDBCredentialsHelper - -import static LoadTestsBuilder.DOCKER_BEAM_JOBSERVER -import static LoadTestsBuilder.GO_SDK_CONTAINER - -String now = new Date().format('MMddHHmmss', TimeZone.getTimeZone('UTC')) - -// TODO(https://github.com/apache/beam/issues/20146): Skipping some cases because they are too slow or have memory errors. 
-def TESTS_TO_SKIP = [ - 'load-tests-go-flink-batch-gbk-7', -] - -def batchScenarios = { - [ - [ - title : 'Group By Key Go Load test: 2GB of 10B records', - test : 'group_by_key', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-gbk-1-${now}", - influx_namespace : 'flink', - influx_measurement : 'go_batch_gbk_1', - input_options : '\'{' + - '"num_records": 200000000,' + - '"key_size": 1,' + - '"value_size": 9}\'', - iterations : 1, - fanout : 1, - parallelism : 5, - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'Group By Key Go Load test: 2GB of 100B records', - test : 'group_by_key', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-gbk-2-${now}", - influx_namespace : 'flink', - influx_measurement : 'go_batch_gbk_2', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90}\'', - iterations : 1, - fanout : 1, - parallelism : 5, - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'Group By Key Go Load test: 2GB of 100kB records', - test : 'group_by_key', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-gbk-3-${now}", - influx_namespace : 'flink', - influx_measurement : 'go_batch_gbk_3', - iterations : 1, - fanout : 1, - parallelism : 5, - input_options : '\'{' + - '"num_records": 20000,' + - '"key_size": 10000,' + - '"value_size": 90000}\'', - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'Group By Key Go Load test: fanout 4 times with 2GB 10-byte records total', - test : 'group_by_key', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-gbk-4-${now}", - influx_namespace : 'flink', - influx_measurement : 'go_batch_gbk_4', - iterations : 1, - fanout : 4, - parallelism : 16, - input_options : '\'{' + - '"num_records": 5000000,' + - '"key_size": 10,' + - '"value_size": 90}\'', - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'Group By Key Go Load test: fanout 8 times with 2GB 10-byte records total', - test : 'group_by_key', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-gbk-5-${now}", - influx_namespace : 'flink', - influx_measurement : 'go_batch_gbk_5', - iterations : 1, - fanout : 8, - parallelism : 16, - input_options : '\'{' + - '"num_records": 2500000,' + - '"key_size": 10,' + - '"value_size": 90}\'', - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'Group By Key Go Load test: reiterate 4 times 10kB values', - test : 'group_by_key', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-gbk-6-${now}", - influx_namespace : 'flink', - influx_measurement : 'go_batch_gbk_6', - iterations : 4, - fanout : 1, - parallelism : 5, - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 200,' + - '"hot_key_fraction": 1}\'', - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'Group By Key Go Load test: 
reiterate 4 times 2MB values', - test : 'group_by_key', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-gbk-7-${now}", - influx_namespace : 'flink', - influx_measurement : 'go_batch_gbk_7', - iterations : 4, - fanout : 1, - parallelism : 5, - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 10,' + - '"hot_key_fraction": 1}\'', - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - ] - .each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } - .collectMany { test -> - TESTS_TO_SKIP.any { element -> test.pipelineOptions.job_name.startsWith(element) } ? []: [test] - } -} - -def loadTestJob = { scope, triggeringContext, mode -> - Map testScenariosByParallelism = batchScenarios().groupBy { test -> - test.pipelineOptions.parallelism - } - Integer initialParallelism = testScenariosByParallelism.keySet().iterator().next() - List initialScenarios = testScenariosByParallelism.remove(initialParallelism) - - def flink = new Flink(scope, "beam_LoadTests_Go_GBK_Flink_${mode.capitalize()}") - flink.setUp( - [ - GO_SDK_CONTAINER - ], - initialParallelism, - "${DOCKER_BEAM_JOBSERVER}/beam_flink${CommonTestProperties.getFlinkVersion()}_job_server:latest") - - // Execute all scenarios connected with initial parallelism. - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.GO, initialScenarios, 'group_by_key', mode) - - // Execute the rest of scenarios. - testScenariosByParallelism.each { parallelism, scenarios -> - flink.scaleCluster(parallelism) - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.GO, scenarios, 'group_by_key', mode) - } -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Go_GBK_Flink_Batch', - 'Run Load Tests Go GBK Flink Batch', - 'Load Tests Go GBK Flink Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'batch') - } - -CronJobBuilder.cronJob('beam_LoadTests_Go_GBK_Flink_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - // TODO(BEAM): Fix this test. - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'batch') -} diff --git a/.test-infra/jenkins/job_LoadTests_GBK_Flink_Python.groovy b/.test-infra/jenkins/job_LoadTests_GBK_Flink_Python.groovy deleted file mode 100644 index 25e2647ebf32f..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_GBK_Flink_Python.groovy +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import Flink -import InfluxDBCredentialsHelper - -import static LoadTestsBuilder.DOCKER_CONTAINER_REGISTRY -import static LoadTestsBuilder.DOCKER_BEAM_JOBSERVER -import static LoadTestsBuilder.DOCKER_BEAM_SDK_IMAGE - -String now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -def scenarios = { datasetName -> - [ - [ - title : 'Load test: 2GB of 10B records', - test : 'apache_beam.testing.load_tests.group_by_key_test', - runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - job_name : "load_tests_Python_Flink_Batch_GBK_1_${now}", - publish_to_big_query: true, - project : 'apache-beam-testing', - metrics_dataset : datasetName, - metrics_table : "python_flink_batch_GBK_1", - influx_measurement : 'python_batch_gbk_1', - input_options : '\'{"num_records": 200000000,"key_size": 1,"value_size":9}\'', - iterations : 1, - fanout : 1, - parallelism : 5, - job_endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - ] - ], - [ - title : 'Load test: 2GB of 100B records', - test : 'apache_beam.testing.load_tests.group_by_key_test', - runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - job_name : "load_tests_Python_Flink_Batch_GBK_2_${now}", - publish_to_big_query: true, - project : 'apache-beam-testing', - metrics_dataset : datasetName, - metrics_table : "python_flink_batch_GBK_2", - influx_measurement : 'python_batch_gbk_2', - input_options : '\'{"num_records": 20000000,"key_size": 10,"value_size":90}\'', - iterations : 1, - fanout : 1, - parallelism : 5, - job_endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - ] - ], - [ - title : 'Load test: fanout 4 times with 2GB 10-byte records total', - test : 'apache_beam.testing.load_tests.group_by_key_test', - runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - job_name : "load_tests_Python_Flink_Batch_GBK_4_${now}", - publish_to_big_query: true, - project : 'apache-beam-testing', - metrics_dataset : datasetName, - metrics_table : "python_flink_batch_GBK_4", - influx_measurement : 'python_batch_gbk_4', - input_options : '\'{"num_records": 5000000,"key_size": 10,"value_size":90}\'', - iterations : 1, - fanout : 4, - parallelism : 16, - job_endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - ] - ], - [ - title : 'Load test: fanout 8 times with 2GB 10-byte records total', - test : 'apache_beam.testing.load_tests.group_by_key_test', - runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - job_name : "load_tests_Python_Flink_Batch_GBK_5_${now}", - publish_to_big_query: true, - project : 'apache-beam-testing', - metrics_dataset : datasetName, - metrics_table : "python_flink_batch_GBK_5", - influx_measurement : 'python_batch_gbk_5', - input_options : '\'{"num_records":2500000,"key_size":10,"value_size":90,"algorithm":"lcg"}\'', - iterations : 1, - fanout : 8, - parallelism : 16, - job_endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - ] - ], - [ - title : 'Load test: reiterate 4 times 10kB values', - test : 'apache_beam.testing.load_tests.group_by_key_test', - 
runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - job_name : "load_tests_Python_Flink_Batch_GBK_6_${now}", - publish_to_big_query: true, - project : 'apache-beam-testing', - metrics_dataset : datasetName, - metrics_table : "python_flink_batch_GBK_6", - influx_measurement : 'python_batch_gbk_6', - input_options : '\'{"num_records":20000000,"key_size":10,"value_size":90,"num_hot_keys":200,"hot_key_fraction":1,"algorithm":"lcg"}\'', - iterations : 4, - fanout : 1, - parallelism : 5, - job_endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - ] - ], - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def loadTest = { scope, triggeringContext -> - def sdk = CommonTestProperties.SDK.PYTHON - - def datasetName = loadTestsBuilder.getBigQueryDataset('load_test', triggeringContext) - def numberOfWorkers = 16 - List testScenarios = scenarios(datasetName) - - def flink = new Flink(scope, 'beam_LoadTests_Python_GBK_Flink_Batch') - flink.setUp( - [ - "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}" - ], - numberOfWorkers, - "${DOCKER_BEAM_JOBSERVER}/beam_flink${CommonTestProperties.getFlinkVersion()}_job_server:latest") - - def configurations = testScenarios.findAll { it.pipelineOptions?.parallelism?.value == numberOfWorkers } - loadTestsBuilder.loadTests(scope, sdk, configurations, "GBK", "batch") - - numberOfWorkers = 5 - flink.scaleCluster(numberOfWorkers) - - configurations = testScenarios.findAll { it.pipelineOptions?.parallelism?.value == numberOfWorkers } - loadTestsBuilder.loadTests(scope, sdk, configurations, "GBK", "batch") -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Python_GBK_Flink_Batch', - 'Run Load Tests Python GBK Flink Batch', - 'Load Tests Python GBK Flink Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTest(delegate, CommonTestProperties.TriggeringContext.PR) - } - -// TODO(https://github.com/apache/beam/issues/20146) Re-enable auto builds after these tests pass. diff --git a/.test-infra/jenkins/job_LoadTests_GBK_Go.groovy b/.test-infra/jenkins/job_LoadTests_GBK_Go.groovy deleted file mode 100644 index da31a692f61d5..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_GBK_Go.groovy +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -import static LoadTestsBuilder.GO_SDK_CONTAINER - -String now = new Date().format('MMddHHmmss', TimeZone.getTimeZone('UTC')) - -def batchScenarios = { - [ - [ - title : 'Group By Key Go Load test: 2GB of 10B records', - test : 'group_by_key', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-gbk-1-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_namespace : 'dataflow', - influx_measurement : 'go_batch_gbk_1', - input_options : '\'{' + - '"num_records": 200000000,' + - '"key_size": 1,' + - '"value_size": 9}\'', - iterations : 1, - fanout : 1, - num_workers : 5, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'Group By Key Go Load test: 2GB of 100B records', - test : 'group_by_key', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-gbk-2-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_namespace : 'dataflow', - influx_measurement : 'go_batch_gbk_2', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90}\'', - iterations : 1, - fanout : 1, - num_workers : 5, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'Group By Key Go Load test: 2GB of 100kB records', - test : 'group_by_key', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-gbk-3-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_namespace : 'dataflow', - influx_measurement : 'go_batch_gbk_3', - input_options : '\'{' + - '"num_records": 20000,' + - '"key_size": 10000,' + - '"value_size": 90000}\'', - iterations : 1, - fanout : 1, - num_workers : 5, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'Group By Key Go Load test: fanout 4 times with 2GB 10-byte records total', - test : 'group_by_key', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-gbk-4-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_namespace : 'dataflow', - influx_measurement : 'go_batch_gbk_4', - input_options : '\'{' + - '"num_records": 5000000,' + - '"key_size": 10,' + - '"value_size": 90}\'', - iterations : 1, - fanout : 4, - num_workers : 16, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'Group By Key Go Load test: fanout 8 times with 2GB 10-byte records total', - test : 'group_by_key', - runner : 
CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-gbk-5-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_namespace : 'dataflow', - influx_measurement : 'go_batch_gbk_5', - input_options : '\'{' + - '"num_records": 2500000,' + - '"key_size": 10,' + - '"value_size": 90}\'', - iterations : 1, - fanout : 8, - num_workers : 16, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'Group By Key Go Load test: reiterate 4 times 10kB values', - test : 'group_by_key', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-gbk-6-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_namespace : 'dataflow', - influx_measurement : 'go_batch_gbk_6', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 200,' + - '"hot_key_fraction": 1}\'', - iterations : 4, - fanout : 1, - num_workers : 5, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'Group By Key Go Load test: reiterate 4 times 2MB values', - test : 'group_by_key', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-gbk-7-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_namespace : 'dataflow', - influx_measurement : 'go_batch_gbk_7', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 10,' + - '"hot_key_fraction": 1}\'', - iterations : 4, - fanout : 1, - num_workers : 5, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - ] - .each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def loadTestJob = { scope, triggeringContext, mode -> - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.GO, batchScenarios(), 'group_by_key', mode) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Go_GBK_Dataflow_Batch', - 'Run Load Tests Go GBK Dataflow Batch', - 'Load Tests Go GBK Dataflow Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'batch') - } - -CronJobBuilder.cronJob('beam_LoadTests_Go_GBK_Dataflow_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'batch') -} diff --git a/.test-infra/jenkins/job_LoadTests_GBK_Java.groovy b/.test-infra/jenkins/job_LoadTests_GBK_Java.groovy deleted file mode 100644 index 6427b5b123579..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_GBK_Java.groovy +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import CronJobBuilder -import InfluxDBCredentialsHelper - -def loadTestConfigurations = { mode, isStreaming -> - [ - [ - title : 'Load test: 2GB of 10B records', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java_Dataflow_${mode}_GBK_1", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_1", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 200000000, - "keySizeBytes": 1, - "valueSizeBytes": 9 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 1, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - title : 'Load test: 2GB of 100B records', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java_Dataflow_${mode}_GBK_2", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_2", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 1, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - - title : 'Load test: 2GB of 100kB records', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java_Dataflow_${mode}_GBK_3", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_3", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000, - "keySizeBytes": 10000, - "valueSizeBytes": 90000 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 1, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - - ], - [ - title : 'Load test: fanout 4 times with 2GB 10-byte records total', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java_Dataflow_${mode}_GBK_4", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_4", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 5000000,
- "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - fanout : 4, - iterations : 1, - numWorkers : 16, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - title : 'Load test: fanout 8 times with 2GB 10-byte records total', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java_Dataflow_${mode}_GBK_5", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_5", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 2500000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - fanout : 8, - iterations : 1, - numWorkers : 16, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - title : 'Load test: reiterate 4 times 10kB values', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java_Dataflow_${mode}_GBK_6", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_6", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 200, - "hotKeyFraction": 1 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 4, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ], - [ - title : 'Load test: reiterate 4 times 2MB values', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java_Dataflow_${mode}_GBK_7", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_7", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 10, - "hotKeyFraction": 1 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 4, - numWorkers : 5, - autoscalingAlgorithm : "NONE", - streaming : isStreaming - ] - ] - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def streamingLoadTestJob = { scope, triggeringContext -> - scope.description('Runs Java GBK load tests on Dataflow runner in streaming mode') - commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 240) - - for (testConfiguration in loadTestConfigurations('streaming', true)) { - testConfiguration.pipelineOptions << [inputWindowDurationSec: 1200] - loadTestsBuilder.loadTest(scope, testConfiguration.title, testConfiguration.runner, CommonTestProperties.SDK.JAVA, testConfiguration.pipelineOptions, testConfiguration.test) - } -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_GBK_Dataflow_Streaming', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_GBK_Dataflow_Streaming', - 'Run Load Tests Java GBK Dataflow Streaming', - 'Load Tests Java GBK Dataflow Streaming suite', - this - ) 
{ - additionalPipelineArgs = [:] - streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } - - -def batchLoadTestJob = { scope, triggeringContext -> - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.JAVA, loadTestConfigurations('batch', false), "GBK", "batch") -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_GBK_Dataflow_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_GBK_Dataflow_Batch', - 'Run Load Tests Java GBK Dataflow Batch', - 'Load Tests Java GBK Dataflow Batch suite', - this - ) { - additionalPipelineArgs = [:] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } diff --git a/.test-infra/jenkins/job_LoadTests_GBK_Java_Smoke.groovy b/.test-infra/jenkins/job_LoadTests_GBK_Java_Smoke.groovy deleted file mode 100644 index 131c26efe2e5e..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_GBK_Java_Smoke.groovy +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
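Every suite in these files is wired up twice: CronJobBuilder.cronJob registers a nightly post-commit run that injects the InfluxDB connection settings, while PhraseTriggeringPostCommitBuilder.postCommitJob runs the same closure on demand when a reviewer comments the trigger phrase on a PR, with no Influx publishing. A sketch of that shared pattern, using the builder calls exactly as they appear above (the job names are illustrative); note that Groovy only interpolates ${...} inside double-quoted GStrings, which is why per-mode names such as "load_tests_Java_Dataflow_${mode}_GBK_1" must be double-quoted:

    def exampleLoadTestJob = { scope, triggeringContext ->
      loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.JAVA,
          loadTestConfigurations('batch', false), 'GBK', 'batch')
    }

    // Nightly run: publishes metrics to InfluxDB.
    CronJobBuilder.cronJob('beam_LoadTests_Example_Batch', 'H H * * *', this) {
      additionalPipelineArgs = [
        influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName,
        influxHost    : InfluxDBCredentialsHelper.InfluxDBHostUrl,
      ]
      exampleLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT)
    }

    // PR run: triggered by a GitHub comment, publishes nothing.
    PhraseTriggeringPostCommitBuilder.postCommitJob(
        'beam_LoadTests_Example_Batch',
        'Run Example Load Tests Batch',
        'Example load tests batch suite',
        this) {
      additionalPipelineArgs = [:]
      exampleLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR)
    }
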
- */ - -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder - -def smokeTestConfigurations = { - [ - [ - title : 'GroupByKey load test Direct', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DIRECT, - pipelineOptions: [ - sourceOptions : '{"numRecords":100000,"splitPointFrequencyRecords":1}', - stepOptions : '{"outputRecordsPerInputRecord":1,"preservesInputKeyDistribution":true}', - fanout : 10, - iterations : 1, - ] - ], - [ - title : 'GroupByKey load test Dataflow', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - tempLocation : 'gs://temp-storage-for-perf-tests/smoketests', - sourceOptions : '{"numRecords":100000,"splitPointFrequencyRecords":1}', - stepOptions : '{"outputRecordsPerInputRecord":1,"preservesInputKeyDistribution":true}', - fanout : 10, - iterations : 1, - ] - ], - [ - title : 'GroupByKey load test Flink', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - sourceOptions : '{"numRecords":100000,"splitPointFrequencyRecords":1}', - stepOptions : '{"outputRecordsPerInputRecord":1,"preservesInputKeyDistribution":true}', - fanout : 10, - iterations : 1, - ] - ], - [ - title : 'GroupByKey load test Spark', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.SPARK, - pipelineOptions: [ - sparkMaster : 'local[4]', - sourceOptions : '{"numRecords":100000,"splitPointFrequencyRecords":1}', - stepOptions : '{"outputRecordsPerInputRecord":1,"preservesInputKeyDistribution":true}', - fanout : 10, - iterations : 1, - ] - ] - ] -} - - -// Runs a tiny version load test suite to ensure nothing is broken. -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_Java_LoadTests_GBK_Smoke', - 'Run Java Load Tests GBK Smoke', - 'Java Load Tests GBK Smoke', - this - ) { - loadTestsBuilder.loadTests(delegate, CommonTestProperties.SDK.JAVA, smokeTestConfigurations(), "GBK", "smoke") - } diff --git a/.test-infra/jenkins/job_LoadTests_GBK_Java_spark_structured_streaming.groovy b/.test-infra/jenkins/job_LoadTests_GBK_Java_spark_structured_streaming.groovy deleted file mode 100644 index 8e5ec703f3b07..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_GBK_Java_spark_structured_streaming.groovy +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
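The smoke suite above deliberately runs one small GroupByKeyLoadTest configuration on four runners, varying only the runner and runner-specific options such as sparkMaster. A hedged refactor sketch (an assumption, never part of the deleted file) deriving per-runner configurations from a shared base map:

    // Hypothetical deduplication: one base configuration, varied per runner.
    def base = [
      test           : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest',
      pipelineOptions: [
        sourceOptions: '{"numRecords":100000,"splitPointFrequencyRecords":1}',
        stepOptions  : '{"outputRecordsPerInputRecord":1,"preservesInputKeyDistribution":true}',
        fanout       : 10,
        iterations   : 1,
      ]
    ]

    def smokeTestConfigurations = [
      [title: 'GroupByKey load test Direct', runner: CommonTestProperties.Runner.DIRECT, extra: [:]],
      [title: 'GroupByKey load test Spark',  runner: CommonTestProperties.Runner.SPARK,  extra: [sparkMaster: 'local[4]']],
    ].collect { variant ->
      def config = base + [title: variant.title, runner: variant.runner]
      config.pipelineOptions = base.pipelineOptions + variant.extra
      config
    }
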
- */ - -import CommonTestProperties -import CronJobBuilder -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -def loadTestConfigurations = { mode, isStreaming -> - [ - [ - title : 'Load test: 2GB of 10B records', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.SPARK_STRUCTURED_STREAMING, - pipelineOptions: [ - project : 'apache-beam-testing', - appName : "load_tests_Java_SparkStructuredStreaming_${mode}_GBK_1", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_1", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 200000000, - "keySizeBytes": 1, - "valueSizeBytes": 9 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 1, - streaming : isStreaming - ] - ], - [ - title : 'Load test: 2GB of 100B records', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.SPARK_STRUCTURED_STREAMING, - pipelineOptions: [ - project : 'apache-beam-testing', - appName : "load_tests_Java_SparkStructuredStreaming_${mode}_GBK_2", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_2", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 1, - streaming : isStreaming - ] - ], - [ - - title : 'Load test: 2GB of 100kB records', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.SPARK_STRUCTURED_STREAMING, - pipelineOptions: [ - project : 'apache-beam-testing', - appName : "load_tests_Java_SparkStructuredStreaming_${mode}_GBK_3", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_3", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000, - "keySizeBytes": 10000, - "valueSizeBytes": 90000 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 1, - streaming : isStreaming - ] - - ], - [ - title : 'Load test: fanout 4 times with 2GB 10-byte records total', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.SPARK_STRUCTURED_STREAMING, - pipelineOptions: [ - project : 'apache-beam-testing', - appName : "load_tests_Java_SparkStructuredStreaming_${mode}_GBK_4", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_4", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 5000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - fanout : 4, - iterations : 1, - streaming : isStreaming - ] - ], - [ - title : 'Load test: fanout 8 times with 2GB 10-byte records total', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.SPARK_STRUCTURED_STREAMING, - pipelineOptions: [ - project : 'apache-beam-testing', - appName : "load_tests_Java_SparkStructuredStreaming_${mode}_GBK_5", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_5", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 2500000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - fanout : 8, - iterations : 1, - streaming : isStreaming - ] - ], - [ - title : 'Load test: reiterate 4 times 10kB
values', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.SPARK_STRUCTURED_STREAMING, - pipelineOptions: [ - project : 'apache-beam-testing', - appName : "load_tests_Java_SparkStructuredStreaming_${mode}_GBK_6", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_6", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 200, - "hotKeyFraction": 1 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 4, - streaming : isStreaming - ] - ], - [ - title : 'Load test: reiterate 4 times 2MB values', - test : 'org.apache.beam.sdk.loadtests.GroupByKeyLoadTest', - runner : CommonTestProperties.Runner.SPARK_STRUCTURED_STREAMING, - pipelineOptions: [ - project : 'apache-beam-testing', - appName : "load_tests_Java_SparkStructuredStreaming_${mode}_GBK_7", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${mode}_gbk_7", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90, - "numHotKeys": 10, - "hotKeyFraction": 1 - } - """.trim().replaceAll("\\s", ""), - fanout : 1, - iterations : 4, - streaming : isStreaming - ] - ] - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def batchLoadTestJob = { scope, triggeringContext -> - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.JAVA, loadTestConfigurations('batch', false), "GBK", "batch") -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_GBK_SparkStructuredStreaming_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_GBK_SparkStructuredStreaming_Batch', - 'Run Load Tests Java GBK SparkStructuredStreaming Batch', - 'Load Tests Java GBK SparkStructuredStreaming Batch suite', - this - ) { - additionalPipelineArgs = [:] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } diff --git a/.test-infra/jenkins/job_LoadTests_GBK_Python.groovy b/.test-infra/jenkins/job_LoadTests_GBK_Python.groovy deleted file mode 100644 index 8538e2bcc4220..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_GBK_Python.groovy +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
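All of the Java suites in these files compact their multi-line sourceOptions JSON with .trim().replaceAll("\\s", "") before passing it as a single command-line argument. That idiom strips every whitespace character, so it is only safe while no JSON string value contains a space, as is the case here:

    // The compaction idiom used for sourceOptions throughout these files.
    def sourceOptions = """
        {
          "numRecords": 20000000,
          "keySizeBytes": 10,
          "valueSizeBytes": 90
        }
        """.trim().replaceAll("\\s", "")

    assert sourceOptions == '{"numRecords":20000000,"keySizeBytes":10,"valueSizeBytes":90}'
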
- */ - -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -// TODO(https://github.com/apache/beam/issues/20403): Skipping some cases because they are too slow. -def TESTS_TO_SKIP = [ - 'load-tests-python-dataflow-streaming-gbk-1', - 'load-tests-python-dataflow-streaming-gbk-2', - 'load-tests-python-dataflow-streaming-gbk-4', - 'load-tests-python-dataflow-streaming-gbk-5', -] - -def loadTestConfigurations = { mode, datasetName -> - [ - [ - title : 'GroupByKey Python Load test: 2GB of 10B records', - test : 'apache_beam.testing.load_tests.group_by_key_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-python-dataflow-${mode}-gbk-1-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_gbk_1", - influx_measurement : "python_${mode}_gbk_1", - input_options : '\'{"num_records": 200000000,' + - '"key_size": 1,' + - '"value_size": 9,' + - '"algorithm": "lcg"}\'', - iterations : 1, - fanout : 1, - num_workers : 5, - autoscaling_algorithm: 'NONE', - ] - ], - [ - title : 'GroupByKey Python Load test: 2GB of 100B records', - test : 'apache_beam.testing.load_tests.group_by_key_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-python-dataflow-${mode}-gbk-2-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_gbk_2", - influx_measurement : "python_${mode}_gbk_2", - input_options : '\'{"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"algorithm": "lcg"}\'', - iterations : 1, - fanout : 1, - num_workers : 5, - autoscaling_algorithm: 'NONE', - ] - ], - [ - title : 'GroupByKey Python Load test: 2GB of 100kB records', - test : 'apache_beam.testing.load_tests.group_by_key_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-python-dataflow-${mode}-gbk-3-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_gbk_3", - influx_measurement : "python_${mode}_gbk_3", - input_options : '\'{"num_records": 20000,' + - '"key_size": 10000,' + - '"value_size": 90000,' + - '"algorithm": "lcg"}\'', - iterations : 1, - fanout : 1, - num_workers : 5, - autoscaling_algorithm: 'NONE', - ] - ], - [ - title : 'GroupByKey Python Load test: fanout 4 times with 2GB 10-byte records total', - test : 'apache_beam.testing.load_tests.group_by_key_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-python-dataflow-${mode}-gbk-4-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_gbk_4", - influx_measurement : "python_${mode}_gbk_4", - input_options : '\'{"num_records": 5000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - 
'"algorithm": "lcg"}\'', - iterations : 1, - fanout : 4, - num_workers : 16, - autoscaling_algorithm: 'NONE', - ] - ], - [ - title : 'GroupByKey Python Load test: fanout 8 times with 2GB 10-byte records total', - test : 'apache_beam.testing.load_tests.group_by_key_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-python-dataflow-${mode}-gbk-5-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_gbk_5", - influx_measurement : "python_${mode}_gbk_5", - input_options : '\'{"num_records": 2500000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"algorithm": "lcg"}\'', - iterations : 1, - fanout : 8, - num_workers : 16, - autoscaling_algorithm: 'NONE', - ] - ], - ] - .each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } - .each { test -> (mode != 'streaming') ?: addStreamingOptions(test) } - .collectMany { test -> - TESTS_TO_SKIP.any { element -> test.pipelineOptions.job_name.startsWith(element) } ? []: [test] - } -} - -def addStreamingOptions(test) { - test.pipelineOptions << [ - streaming: null, - // Use the new Dataflow runner, which offers improved efficiency of Dataflow jobs. - // See https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline#dataflow-runner-v2 - // for more details. - experiments: 'use_runner_v2', - ] -} - -def loadTestJob = { scope, triggeringContext, mode -> - def datasetName = loadTestsBuilder.getBigQueryDataset('load_test', triggeringContext) - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.PYTHON, - loadTestConfigurations(mode, datasetName), 'GBK', mode) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Python_GBK_Dataflow_Batch', - 'Run Load Tests Python GBK Dataflow Batch', - 'Load Tests Python GBK Dataflow Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'batch') - } - -CronJobBuilder.cronJob('beam_LoadTests_Python_GBK_Dataflow_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'batch') -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Python_GBK_Dataflow_Streaming', - 'Run Load Tests Python GBK Dataflow Streaming', - 'Load Tests Python GBK Dataflow Streaming suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'streaming') - } - -CronJobBuilder.cronJob('beam_LoadTests_Python_GBK_Dataflow_Streaming', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'streaming') -} - diff --git a/.test-infra/jenkins/job_LoadTests_GBK_Python_reiterate.groovy b/.test-infra/jenkins/job_LoadTests_GBK_Python_reiterate.groovy deleted file mode 100644 index b958dd8a907a8..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_GBK_Python_reiterate.groovy +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import CronJobBuilder -import InfluxDBCredentialsHelper - -def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -def loadTestConfigurations = { mode, datasetName -> - [ - [ - title : 'GroupByKey Python Load test: reiterate 4 times 10kB values', - test : 'apache_beam.testing.load_tests.group_by_key_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - job_name : "load-tests-python-dataflow-${mode}-gbk-6-${now}", - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_gbk_6", - influx_measurement : "python_${mode}_gbk_6", - input_options : '\'{"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 200,' + - '"hot_key_fraction": 1,' + - '"algorithm": "lcg"}\'', - iterations : 4, - fanout : 1, - num_workers : 5, - autoscaling_algorithm: 'NONE', - ] - ], - [ - title : 'GroupByKey Python Load test: reiterate 4 times 2MB values', - test : 'apache_beam.testing.load_tests.group_by_key_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - job_name : "load-tests-python-dataflow-${mode}-gbk-7-${now}", - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_gbk_7", - influx_measurement : "python_${mode}_gbk_7", - input_options : '\'{"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 10,' + - '"hot_key_fraction": 1,' + - '"algorithm": "lcg"}\'', - iterations : 4, - fanout : 1, - num_workers : 5, - autoscaling_algorithm: 'NONE', - ] - ] - ] - .each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } - .each { test -> (mode != 'streaming') ?: addStreamingOptions(test) } -} - -def addStreamingOptions(test) { - test.pipelineOptions << [ - streaming: null, - // Use the new Dataflow runner, which offers improved efficiency of Dataflow jobs. - // See https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline#dataflow-runner-v2 - // for more details. 
- experiments: 'use_runner_v2', - ] -} - -def loadTestJob = { scope, triggeringContext, mode -> - def datasetName = loadTestsBuilder.getBigQueryDataset('load_test', triggeringContext) - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.PYTHON, - loadTestConfigurations(mode, datasetName), 'GBK reiterate', mode) -} - -CronJobBuilder.cronJob('beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch', - 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'batch') - } - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Python_GBK_reiterate_Dataflow_Batch', - 'Run Load Tests Python GBK reiterate Dataflow Batch', - 'Load Tests Python GBK reiterate Dataflow Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'batch') - } - -CronJobBuilder.cronJob('beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming', - 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'streaming') - } - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Python_GBK_reiterate_Dataflow_Streaming', - 'Run Load Tests Python GBK reiterate Dataflow Streaming', - 'Load Tests Python GBK reiterate Dataflow Streaming suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'streaming') - } diff --git a/.test-infra/jenkins/job_LoadTests_ParDo_Dataflow_V2_Java11.groovy b/.test-infra/jenkins/job_LoadTests_ParDo_Dataflow_V2_Java11.groovy deleted file mode 100644 index a4535d52e6cf7..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_ParDo_Dataflow_V2_Java11.groovy +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
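In the Python configurations above, a null map value marks a bare flag: streaming: null is meant to become --streaming on the command line, while ordinary entries become --key=value. A sketch of that rendering, under the assumption that this approximates what the shared Groovy helpers do internally (the real conversion lives in LoadTestsBuilder, which is not shown in this diff):

    // Assumed option-to-flag rendering; illustrative only.
    String toPipelineArgs(Map options) {
      options.collect { k, v -> v == null ? "--$k" : "--$k=$v" }.join(' ')
    }

    assert toPipelineArgs([streaming: null, experiments: 'use_runner_v2']) ==
        '--streaming --experiments=use_runner_v2'
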
- */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import CronJobBuilder -import InfluxDBCredentialsHelper - -def commonLoadTestConfig = { jobType, isStreaming -> - [ - [ - title : 'Load test: ParDo 2GB 100 byte records 10 times', - test : 'org.apache.beam.sdk.loadtests.ParDoLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java11_Dataflow_V2_${jobType}_ParDo_1", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_pardo_1", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java11" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - iterations : 10, - numberOfCounters : 1, - numberOfCounterOperations: 0, - numWorkers : 5, - autoscalingAlgorithm: "NONE", - streaming : isStreaming - ] - ], - [ - title : 'Load test: ParDo 2GB 100 byte records 200 times', - test : 'org.apache.beam.sdk.loadtests.ParDoLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java11_Dataflow_V2_${jobType}_ParDo_2", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_pardo_2", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java11" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - iterations : 200, - numberOfCounters : 1, - numberOfCounterOperations: 0, - numWorkers : 5, - autoscalingAlgorithm: "NONE", - streaming : isStreaming - ] - ], - [ - - title : 'Load test: ParDo 2GB 100 byte records 10 counters', - test : 'org.apache.beam.sdk.loadtests.ParDoLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java11_Dataflow_V2_${jobType}_ParDo_3", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_pardo_3", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java11" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - iterations : 1, - numberOfCounters : 1, - numberOfCounterOperations: 10, - numWorkers : 5, - autoscalingAlgorithm: "NONE", - streaming : isStreaming - ] - - ], - [ - title : 'Load test: ParDo 2GB 100 byte records 100 counters', - test : 'org.apache.beam.sdk.loadtests.ParDoLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java11_Dataflow_V2_${jobType}_ParDo_4", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_pardo_4", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java11" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - 
""".trim().replaceAll("\\s", ""), - iterations : 1, - numberOfCounters : 1, - numberOfCounterOperations: 100, - numWorkers : 5, - autoscalingAlgorithm: "NONE", - streaming : isStreaming - ] - ] - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def final JOB_SPECIFIC_SWITCHES = [ - '-Prunner.version="V2"', - '-PcompileAndRunTestsWithJava11', - "-Pjava11Home=${commonJobProperties.JAVA_11_HOME}" -] - -def batchLoadTestJob = { scope, triggeringContext -> - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.JAVA, commonLoadTestConfig('batch', false), - "ParDo", "batch", JOB_SPECIFIC_SWITCHES) -} - -def streamingLoadTestJob = {scope, triggeringContext -> - scope.description('Runs Java 11 ParDo load tests on Dataflow runner V2 in streaming mode') - commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 240) - - for (testConfiguration in commonLoadTestConfig('streaming', true)) { - testConfiguration.pipelineOptions << [inputWindowDurationSec: 1200] - loadTestsBuilder.loadTest(scope, testConfiguration.title, testConfiguration.runner, CommonTestProperties.SDK.JAVA, - testConfiguration.pipelineOptions, testConfiguration.test, JOB_SPECIFIC_SWITCHES) - } -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_ParDo_Dataflow_V2_Batch_Java11', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_ParDo_Dataflow_V2_Streaming_Java11', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_ParDo_Dataflow_V2_Batch_Java11', - 'Run Load Tests Java 11 ParDo Dataflow V2 Batch', - 'Load Tests Java 11 ParDo Dataflow V2 Batch suite', - this - ) { - additionalPipelineArgs = [:] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_ParDo_Dataflow_V2_Streaming_Java11', - 'Run Load Tests Java 11 ParDo Dataflow V2 Streaming', - 'Load Tests Java 11 ParDo Dataflow V2 Streaming suite', - this - ) { - additionalPipelineArgs = [:] - streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } diff --git a/.test-infra/jenkins/job_LoadTests_ParDo_Dataflow_V2_Java17.groovy b/.test-infra/jenkins/job_LoadTests_ParDo_Dataflow_V2_Java17.groovy deleted file mode 100644 index f0f2179ebb3bd..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_ParDo_Dataflow_V2_Java17.groovy +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import CronJobBuilder -import InfluxDBCredentialsHelper - -def commonLoadTestConfig = { jobType, isStreaming -> - [ - [ - title : 'Load test: ParDo 2GB 100 byte records 10 times', - test : 'org.apache.beam.sdk.loadtests.ParDoLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java17_Dataflow_V2_${jobType}_ParDo_1", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_pardo_1", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java17" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - iterations : 10, - numberOfCounters : 1, - numberOfCounterOperations: 0, - numWorkers : 5, - autoscalingAlgorithm: "NONE", - streaming : isStreaming - ] - ], - [ - title : 'Load test: ParDo 2GB 100 byte records 200 times', - test : 'org.apache.beam.sdk.loadtests.ParDoLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java17_Dataflow_V2_${jobType}_ParDo_2", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_pardo_2", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java17" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - iterations : 200, - numberOfCounters : 1, - numberOfCounterOperations: 0, - numWorkers : 5, - autoscalingAlgorithm: "NONE", - streaming : isStreaming - ] - ], - [ - - title : 'Load test: ParDo 2GB 100 byte records 10 counters', - test : 'org.apache.beam.sdk.loadtests.ParDoLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java17_Dataflow_V2_${jobType}_ParDo_3", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_pardo_3", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java17" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - iterations : 1, - numberOfCounters : 1, - numberOfCounterOperations: 10, - numWorkers : 5, - autoscalingAlgorithm: "NONE", - streaming : isStreaming - ] - - ], - [ - title : 'Load test: ParDo 2GB 100 byte records 100 counters', - test : 'org.apache.beam.sdk.loadtests.ParDoLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - 
appName : "load_tests_Java17_Dataflow_V2_${jobType}_ParDo_4", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_pardo_4", - influxTags : """ - { - "runnerVersion": "v2", - "jdk": "java17" - } - """.trim().replaceAll("\\s", ""), - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - iterations : 1, - numberOfCounters : 1, - numberOfCounterOperations: 100, - numWorkers : 5, - autoscalingAlgorithm: "NONE", - streaming : isStreaming - ] - ] - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def final JOB_SPECIFIC_SWITCHES = [ - '-Prunner.version="V2"', - '-PcompileAndRunTestsWithJava17', - "-Pjava17Home=${commonJobProperties.JAVA_17_HOME}" -] - -def batchLoadTestJob = { scope, triggeringContext -> - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.JAVA, commonLoadTestConfig('batch', false), - "ParDo", "batch", JOB_SPECIFIC_SWITCHES) -} - -def streamingLoadTestJob = {scope, triggeringContext -> - scope.description('Runs Java 17 ParDo load tests on Dataflow runner V2 in streaming mode') - commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 240) - - for (testConfiguration in commonLoadTestConfig('streaming', true)) { - testConfiguration.pipelineOptions << [inputWindowDurationSec: 1200] - loadTestsBuilder.loadTest(scope, testConfiguration.title, testConfiguration.runner, CommonTestProperties.SDK.JAVA, - testConfiguration.pipelineOptions, testConfiguration.test, JOB_SPECIFIC_SWITCHES) - } -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_ParDo_Dataflow_V2_Batch_Java17', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_ParDo_Dataflow_V2_Streaming_Java17', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_ParDo_Dataflow_V2_Batch_Java17', - 'Run Load Tests Java 17 ParDo Dataflow V2 Batch', - 'Load Tests Java 17 ParDo Dataflow V2 Batch suite', - this - ) { - additionalPipelineArgs = [:] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_ParDo_Dataflow_V2_Streaming_Java17', - 'Run Load Tests Java 17 ParDo Dataflow V2 Streaming', - 'Load Tests Java 17 ParDo Dataflow V2 Streaming suite', - this - ) { - additionalPipelineArgs = [:] - streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } diff --git a/.test-infra/jenkins/job_LoadTests_ParDo_Flink_Go.groovy b/.test-infra/jenkins/job_LoadTests_ParDo_Flink_Go.groovy deleted file mode 100644 index df20312f27b5a..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_ParDo_Flink_Go.groovy +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import Flink -import InfluxDBCredentialsHelper - -import static LoadTestsBuilder.DOCKER_BEAM_JOBSERVER -import static LoadTestsBuilder.GO_SDK_CONTAINER - -String now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - - -def batchScenarios = { - [ - [ - title : 'ParDo Go Load test: 20M 100 byte records 10 iterations', - test : 'pardo', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-pardo-1-${now}", - influx_measurement : 'go_batch_pardo_1', - influx_namespace : 'flink', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90}\'', - iterations : 10, - number_of_counter_operations: 0, - number_of_counters : 0, - parallelism : 5, - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'ParDo Go Load test: 20M 100 byte records 200 times', - test : 'pardo', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-pardo-2-${now}", - influx_measurement : 'go_batch_pardo_2', - influx_namespace : 'flink', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90}\'', - iterations : 200, - number_of_counter_operations: 0, - number_of_counters : 0, - parallelism : 5, - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'ParDo Go Load test: 20M 100 byte records 10 counters', - test : 'pardo', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-pardo-3-${now}", - influx_measurement : 'go_batch_pardo_3', - influx_namespace : 'flink', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90}\'', - iterations : 1, - number_of_counter_operations: 10, - number_of_counters : 1, - parallelism : 5, - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'ParDo Go Load test: 20M 100 byte records 100 counters', - test : 'pardo', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-pardo-4-${now}", - influx_measurement : 'go_batch_pardo_4', - influx_namespace : 'flink', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90}\'', - iterations : 1, - number_of_counter_operations: 100, - number_of_counters : 1, - parallelism : 5, - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def loadTestJob = { scope, triggeringContext, 
mode -> - def numberOfWorkers = 5 - - Flink flink = new Flink(scope, "beam_LoadTests_Go_ParDo_Flink_${mode.capitalize()}") - flink.setUp( - [ - GO_SDK_CONTAINER - ], - numberOfWorkers, - "${DOCKER_BEAM_JOBSERVER}/beam_flink${CommonTestProperties.getFlinkVersion()}_job_server:latest") - - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.GO, batchScenarios(), 'ParDo', mode) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Go_ParDo_Flink_Batch', - 'Run Load Tests Go ParDo Flink Batch', - 'Load Tests Go ParDo Flink Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'batch') - } - -CronJobBuilder.cronJob('beam_LoadTests_Go_ParDo_Flink_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - // TODO(BEAM): Fix this test. - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'batch') -} diff --git a/.test-infra/jenkins/job_LoadTests_ParDo_Flink_Python.groovy b/.test-infra/jenkins/job_LoadTests_ParDo_Flink_Python.groovy deleted file mode 100644 index 4af2efd1be6a6..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_ParDo_Flink_Python.groovy +++ /dev/null @@ -1,374 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import Flink -import InfluxDBCredentialsHelper - -import static LoadTestsBuilder.DOCKER_CONTAINER_REGISTRY -import static LoadTestsBuilder.DOCKER_BEAM_JOBSERVER -import static LoadTestsBuilder.DOCKER_BEAM_SDK_IMAGE - -String now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -/** - * The test results for these load tests reside in BigQuery in the load_test/load_test_PRs table of the - * apache-beam-testing project. 
A dashboard is available here: - * https://apache-beam-testing.appspot.com/explore?dashboard=5751884853805056 - * - * For example: - * SELECT - * timestamp, value - * FROM - * apache-beam-testing.load_test_PRs.python_flink_batch_pardo_1 - * ORDER BY - * timestamp - * - * The following query has been used to visualize the checkpoint results of python_flink_streaming_pardo_6: - * Select timestamp, min, sum/count as avg, max - * FROM ( - * SELECT - * timestamp, - * MAX(IF(metric LIKE "%\\_min\\_%", value, null)) min, - * MAX(IF(metric LIKE "%\\_sum\\_%", value, null)) sum, - * MAX(IF(metric LIKE "%\\_count\\_%", value, null)) count, - * MAX(IF(metric LIKE "%\\_max\\_%", value, null)) max - * FROM apache-beam-testing.load_test_PRs.python_flink_streaming_pardo_6 - * WHERE metric like "%loadgenerator/impulse%" - * GROUP BY test_id, timestamp - * ORDER BY timestamp - * ); - * - * Subsumed by the new Grafana dashboard: - * http://metrics.beam.apache.org/d/MOi-kf3Zk/pardo-load-tests?orgId=1&var-processingType=streaming&var-sdk=python - */ - -def batchScenarios = { datasetName -> - [ - [ - title : 'ParDo Python Load test: 20M 100 byte records 10 iterations', - test : 'apache_beam.testing.load_tests.pardo_test', - runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - job_name : 'load-tests-python-flink-batch-pardo-1-' + now, - project : 'apache-beam-testing', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : 'python_flink_batch_pardo_1', - influx_measurement : 'python_batch_pardo_1', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"algorithm": "lcg"}\'', - iterations : 10, - number_of_counter_operations: 0, - number_of_counters : 0, - parallelism : 5, - job_endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - ] - ], - // TODO(BEAM-10270): Takes too long to execute (currently more than 3 hours). Re-enable - // the test after its overhead is reduced.
- // [ - // title : 'ParDo Python Load test: 20M 100 byte records 200 times', - // test : 'apache_beam.testing.load_tests.pardo_test', - // runner : CommonTestProperties.Runner.PORTABLE, - // pipelineOptions: [ - // job_name : 'load-tests-python-flink-batch-pardo-2-' + now, - // project : 'apache-beam-testing', - // publish_to_big_query : true, - // metrics_dataset : datasetName, - // metrics_table : 'python_flink_batch_pardo_2', - // influx_measurement : 'python_batch_pardo_2', - // input_options : '\'{' + - // '"num_records": 20000000,' + - // '"key_size": 10,' + - // '"value_size": 90,' + - // '"algorithm": "lcg"}\'', - // iterations : 200, - // number_of_counter_operations: 0, - // number_of_counters : 0, - // parallelism : 5, - // job_endpoint : 'localhost:8099', - // environment_type : 'DOCKER', - // environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - // ] - // ], - [ - title : 'ParDo Python Load test: 20M 100 byte records 10 counters', - test : 'apache_beam.testing.load_tests.pardo_test', - runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - job_name : 'load-tests-python-flink-batch-pardo-3-' + now, - project : 'apache-beam-testing', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : 'python_flink_batch_pardo_3', - influx_measurement : 'python_batch_pardo_3', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"algorithm": "lcg"}\'', - iterations : 1, - number_of_counter_operations: 10, - number_of_counters : 1, - parallelism : 5, - job_endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - ] - ], - [ - title : 'ParDo Python Load test: 20M 100 byte records 100 counters', - test : 'apache_beam.testing.load_tests.pardo_test', - runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - job_name : 'load-tests-python-flink-batch-pardo-4-' + now, - project : 'apache-beam-testing', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : 'python_flink_batch_pardo_4', - influx_measurement : 'python_batch_pardo_4', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"algorithm": "lcg"}\'', - iterations : 1, - number_of_counter_operations: 100, - number_of_counters : 1, - parallelism : 5, - job_endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - ] - ], - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def streamingScenarios = { datasetName -> - [ - [ - title : 'ParDo Python Stateful Streaming Load test: 2M 100 byte records', - test : 'apache_beam.testing.load_tests.pardo_test', - runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - job_name : 'load-tests-python-flink-streaming-pardo-1-' + now, - project : 'apache-beam-testing', - publish_to_big_query : true, - metrics_dataset : datasetName, - // Keep the old name to not break the legacy dashboard - metrics_table : 'python_flink_streaming_pardo_5', - influx_measurement : 'python_streaming_pardo_1', - input_options : '\'{' + - '"num_records": 2000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"algorithm": "lcg"}\'', - iterations : 10, - number_of_counter_operations: 0, - number_of_counters : 0, - parallelism : 5, - // Turn on streaming mode (flags are indicated with null values) - streaming : null, 
- stateful : null, - job_endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - use_stateful_load_generator: null, - ] - ], - [ - title : 'ParDo Python Load test: 20M 100 byte records 200 times', - test : 'apache_beam.testing.load_tests.pardo_test', - runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - job_name : 'load-tests-python-flink-streaming-pardo-2-' + now, - project : 'apache-beam-testing', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : 'python_flink_streaming_pardo_2', - influx_measurement : 'python_streaming_pardo_2', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"algorithm": "lcg"}\'', - iterations : 200, - number_of_counter_operations: 0, - number_of_counters : 0, - parallelism : 5, - streaming : null, - job_endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - use_stateful_load_generator: null, - ] - ], - [ - title : 'ParDo Python Load test: 20M 100 byte records 10 counters', - test : 'apache_beam.testing.load_tests.pardo_test', - runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - job_name : 'load-tests-python-flink-streaming-pardo-3-' + now, - project : 'apache-beam-testing', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : 'python_flink_streaming_pardo_3', - influx_measurement : 'python_streaming_pardo_3', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"algorithm": "lcg"}\'', - iterations : 1, - number_of_counter_operations: 10, - number_of_counters : 1, - parallelism : 5, - streaming : null, - job_endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - use_stateful_load_generator: null, - ] - ], - [ - title : 'ParDo Python Load test: 20M 100 byte records 100 counters', - test : 'apache_beam.testing.load_tests.pardo_test', - runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - job_name : 'load-tests-python-flink-streaming-pardo-4-' + now, - project : 'apache-beam-testing', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : 'python_flink_streaming_pardo_4', - influx_measurement : 'python_streaming_pardo_4', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"algorithm": "lcg"}\'', - iterations : 1, - number_of_counter_operations: 100, - number_of_counters : 1, - parallelism : 5, - streaming : null, - job_endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - use_stateful_load_generator: null, - ] - ], - [ - title : 'ParDo Python Stateful Streaming with Checkpointing test: 2M 100 byte records', - test : 'apache_beam.testing.load_tests.pardo_test', - runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - job_name : 'load-tests-python-flink-streaming-pardo-6-' + now, - project : 'apache-beam-testing', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : 'python_flink_streaming_pardo_6', - influx_measurement : 'python_streaming_pardo_6', - input_options : '\'{' + - '"num_records": 2000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"algorithm": "lcg"}\'', - 
iterations : 5, - number_of_counter_operations: 10, - number_of_counters : 3, - parallelism : 5, - // Turn on streaming mode (flags are indicated with null values) - streaming : null, - stateful : null, - // Enable checkpointing every 10 seconds - checkpointing_interval : 10000, - // Report checkpointing stats to this namespace - report_checkpoint_duration : 'python_flink_streaming_pardo_6', - // Ensure that we can checkpoint the pipeline for at least 5 minutes to gather checkpointing stats - shutdown_sources_after_idle_ms: 300000, - job_endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - use_stateful_load_generator: null, - ] - ], - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def loadTestJob = { scope, triggeringContext, mode -> - def datasetName = loadTestsBuilder.getBigQueryDataset('load_test', triggeringContext) - def numberOfWorkers = 5 - List testScenarios = mode == 'batch' ? batchScenarios(datasetName) : streamingScenarios(datasetName) - - Flink flink = new Flink(scope, "beam_LoadTests_Python_ParDo_Flink_${mode.capitalize()}") - flink.setUp( - [ - "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}" - ], - numberOfWorkers, - "${DOCKER_BEAM_JOBSERVER}/beam_flink${CommonTestProperties.getFlinkVersion()}_job_server:latest") - - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.PYTHON, testScenarios, 'ParDo', mode) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Python_ParDo_Flink_Batch', - 'Run Load Tests Python ParDo Flink Batch', - 'Load Tests Python ParDo Flink Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'batch') - } - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Python_ParDo_Flink_Streaming', - 'Run Load Tests Python ParDo Flink Streaming', - 'Load Tests Python ParDo Flink Streaming suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'streaming') - } - -CronJobBuilder.cronJob('beam_LoadTests_Python_ParDo_Flink_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - // TODO(BEAM): Fix this test. - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'batch') -} - -CronJobBuilder.cronJob('beam_LoadTests_Python_ParDo_Flink_Streaming', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - // TODO(BEAM): Fix this test. - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'streaming') -} diff --git a/.test-infra/jenkins/job_LoadTests_ParDo_Go.groovy b/.test-infra/jenkins/job_LoadTests_ParDo_Go.groovy deleted file mode 100644 index a45a146ca93ca..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_ParDo_Go.groovy +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -import static LoadTestsBuilder.GO_SDK_CONTAINER - -String now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - - -def batchScenarios = { - [ - [ - title : 'ParDo Go Load test: 20M 100 byte records 10 iterations', - test : 'pardo', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-pardo-1-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_measurement : 'go_batch_pardo_1', - influx_namespace : 'dataflow', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90}\'', - iterations : 10, - number_of_counter_operations: 0, - number_of_counters : 0, - num_workers : 5, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'ParDo Go Load test: 20M 100 byte records 200 times', - test : 'pardo', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-pardo-2-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_measurement : 'go_batch_pardo_2', - influx_namespace : 'dataflow', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90}\'', - iterations : 200, - number_of_counter_operations: 0, - number_of_counters : 0, - num_workers : 5, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'ParDo Go Load test: 20M 100 byte records 10 counters', - test : 'pardo', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-pardo-3-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_measurement : 'go_batch_pardo_3', - influx_namespace : 'dataflow', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90}\'', - iterations : 1, - number_of_counter_operations: 10, - number_of_counters : 1, - num_workers : 5, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'ParDo Go Load test: 20M 100 byte records 100 counters', - test : 'pardo', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-pardo-4-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - 
staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_measurement : 'go_batch_pardo_4', - influx_namespace : 'dataflow', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90}\'', - iterations : 1, - number_of_counter_operations: 100, - number_of_counters : 1, - num_workers : 5, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def loadTestJob = { scope, triggeringContext, mode -> - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.GO, batchScenarios(), 'ParDo', mode) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Go_ParDo_Dataflow_Batch', - 'Run Load Tests Go ParDo Dataflow Batch', - 'Load Tests Go ParDo Dataflow Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'batch') - } - -CronJobBuilder.cronJob('beam_LoadTests_Go_ParDo_Dataflow_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'batch') -} diff --git a/.test-infra/jenkins/job_LoadTests_ParDo_Java.groovy b/.test-infra/jenkins/job_LoadTests_ParDo_Java.groovy deleted file mode 100644 index 974ae6b4d4f39..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_ParDo_Java.groovy +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import CronJobBuilder -import InfluxDBCredentialsHelper - -def commonLoadTestConfig = { jobType, isStreaming -> - [ - [ - title : 'Load test: ParDo 2GB 100 byte records 10 times', - test : 'org.apache.beam.sdk.loadtests.ParDoLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java_Dataflow_${jobType}_ParDo_1", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_pardo_1", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - iterations : 10, - numberOfCounters : 1, - numberOfCounterOperations: 0, - numWorkers : 5, - autoscalingAlgorithm: "NONE", - streaming : isStreaming - ] - ], - [ - title : 'Load test: ParDo 2GB 100 byte records 200 times', - test : 'org.apache.beam.sdk.loadtests.ParDoLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java_Dataflow_${jobType}_ParDo_2", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_pardo_2", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - iterations : 200, - numberOfCounters : 1, - numberOfCounterOperations: 0, - numWorkers : 5, - autoscalingAlgorithm: "NONE", - streaming : isStreaming - ] - ], - [ - - title : 'Load test: ParDo 2GB 100 byte records 10 counters', - test : 'org.apache.beam.sdk.loadtests.ParDoLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java_Dataflow_${jobType}_ParDo_3", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_pardo_3", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - iterations : 1, - numberOfCounters : 1, - numberOfCounterOperations: 10, - numWorkers : 5, - autoscalingAlgorithm: "NONE", - streaming : isStreaming - ] - - ], - [ - title : 'Load test: ParDo 2GB 100 byte records 100 counters', - test : 'org.apache.beam.sdk.loadtests.ParDoLoadTest', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - appName : "load_tests_Java_Dataflow_${jobType}_ParDo_4", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_pardo_4", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - iterations : 1, - numberOfCounters : 1, - numberOfCounterOperations: 100, - numWorkers : 5, - autoscalingAlgorithm: "NONE", - streaming : isStreaming - ] - ] - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - - -def batchLoadTestJob = { scope, triggeringContext -> - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.JAVA, commonLoadTestConfig('batch', false), "ParDo", "batch") -} - 
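// --- Illustrative sketch (editorial; not part of the deleted file) ---
// Every scenario list in these jobs ends with
//   .each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) }
// which overlays the cron-supplied Influx settings onto each scenario's
// pipelineOptions map, injected keys winning on collision. All names and
// values below are invented for the demonstration.
def demoArgs = [influxDatabase: 'beam', influxHost: 'http://localhost:8086']
def demoScenarios = [
  [title: 'demo', pipelineOptions: [project: 'apache-beam-testing', influxHost: 'stale-host']]
].each { test -> test.pipelineOptions.putAll(demoArgs) }
assert demoScenarios[0].pipelineOptions.influxHost == 'http://localhost:8086' // overridden
assert demoScenarios[0].pipelineOptions.influxDatabase == 'beam'              // added
// --- end sketch ---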
-def streamingLoadTestJob = {scope, triggeringContext -> - scope.description('Runs Java ParDo load tests on Dataflow runner in streaming mode') - commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 240) - - for (testConfiguration in commonLoadTestConfig('streaming', true)) { - testConfiguration.pipelineOptions << [inputWindowDurationSec: 1200] - loadTestsBuilder.loadTest(scope, testConfiguration.title, testConfiguration.runner, CommonTestProperties.SDK.JAVA, testConfiguration.pipelineOptions, testConfiguration.test) - } -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_ParDo_Dataflow_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -CronJobBuilder.cronJob('beam_LoadTests_Java_ParDo_Dataflow_Streaming', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_ParDo_Dataflow_Batch', - 'Run Load Tests Java ParDo Dataflow Batch', - 'Load Tests Java ParDo Dataflow Batch suite', - this - ) { - additionalPipelineArgs = [:] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_ParDo_Dataflow_Streaming', - 'Run Load Tests Java ParDo Dataflow Streaming', - 'Load Tests Java ParDo Dataflow Streaming suite', - this - ) { - additionalPipelineArgs = [:] - streamingLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } diff --git a/.test-infra/jenkins/job_LoadTests_ParDo_Java_spark_structured_streaming.groovy b/.test-infra/jenkins/job_LoadTests_ParDo_Java_spark_structured_streaming.groovy deleted file mode 100644 index 99639636bf644..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_ParDo_Java_spark_structured_streaming.groovy +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import CommonTestProperties -import CronJobBuilder -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -def commonLoadTestConfig = { jobType, isStreaming -> - [ - [ - title : 'Load test: ParDo 2GB 100 byte records 10 times', - test : 'org.apache.beam.sdk.loadtests.ParDoLoadTest', - runner : CommonTestProperties.Runner.SPARK_STRUCTURED_STREAMING, - pipelineOptions: [ - project : 'apache-beam-testing', - appName : "load_tests_Java_SparkStructuredStreaming_${jobType}_ParDo_1", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_pardo_1", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - iterations : 10, - numberOfCounters : 1, - numberOfCounterOperations: 0, - streaming : isStreaming - ] - ], - [ - title : 'Load test: ParDo 2GB 100 byte records 200 times', - test : 'org.apache.beam.sdk.loadtests.ParDoLoadTest', - runner : CommonTestProperties.Runner.SPARK_STRUCTURED_STREAMING, - pipelineOptions: [ - project : 'apache-beam-testing', - appName : "load_tests_Java_SparkStructuredStreaming_${jobType}_ParDo_2", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_pardo_2", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - iterations : 200, - numberOfCounters : 1, - numberOfCounterOperations: 0, - streaming : isStreaming - ] - ], - [ - - title : 'Load test: ParDo 2GB 100 byte records 10 counters', - test : 'org.apache.beam.sdk.loadtests.ParDoLoadTest', - runner : CommonTestProperties.Runner.SPARK_STRUCTURED_STREAMING, - pipelineOptions: [ - project : 'apache-beam-testing', - appName : "load_tests_Java_SparkStructuredStreaming_${jobType}_ParDo_3", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_pardo_3", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - iterations : 1, - numberOfCounters : 1, - numberOfCounterOperations: 10, - streaming : isStreaming - ] - - ], - [ - title : 'Load test: ParDo 2GB 100 byte records 100 counters', - test : 'org.apache.beam.sdk.loadtests.ParDoLoadTest', - runner : CommonTestProperties.Runner.SPARK_STRUCTURED_STREAMING, - pipelineOptions: [ - project : 'apache-beam-testing', - appName : "load_tests_Java_SparkStructuredStreaming_${jobType}_ParDo_4", - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - influxMeasurement : "java_${jobType}_pardo_4", - publishToInfluxDB : true, - sourceOptions : """ - { - "numRecords": 20000000, - "keySizeBytes": 10, - "valueSizeBytes": 90 - } - """.trim().replaceAll("\\s", ""), - iterations : 1, - numberOfCounters : 1, - numberOfCounterOperations: 100, - streaming : isStreaming - ] - ] - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - - -def batchLoadTestJob = { scope, triggeringContext -> - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.JAVA, commonLoadTestConfig('batch', false), "ParDo", "batch") -} - - -CronJobBuilder.cronJob('beam_LoadTests_Java_ParDo_SparkStructuredStreaming_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: 
InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT) -} - - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Java_ParDo_SparkStructuredStreaming_Batch', - 'Run Load Tests Java ParDo SparkStructuredStreaming Batch', - 'Load Tests Java ParDo SparkStructuredStreaming Batch suite', - this - ) { - additionalPipelineArgs = [:] - batchLoadTestJob(delegate, CommonTestProperties.TriggeringContext.PR) - } - diff --git a/.test-infra/jenkins/job_LoadTests_ParDo_Python.groovy b/.test-infra/jenkins/job_LoadTests_ParDo_Python.groovy deleted file mode 100644 index 31a64fade3194..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_ParDo_Python.groovy +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -def loadTestConfigurations = { mode, datasetName -> - [ - [ - title : 'ParDo Python Load test: 2GB 100 byte records 10 times', - test : 'apache_beam.testing.load_tests.pardo_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-python-dataflow-${mode}-pardo-1-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_pardo_1", - influx_measurement : "python_${mode}_pardo_1", - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"algorithm": "lcg"}\'', - iterations : 10, - number_of_counter_operations: 0, - number_of_counters : 0, - num_workers : 5, - autoscaling_algorithm: 'NONE', - ] - ], - [ - title : 'ParDo Python Load test: 2GB 100 byte records 200 times', - test : 'apache_beam.testing.load_tests.pardo_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-python-dataflow-${mode}-pardo-2-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_pardo_2", - influx_measurement : "python_${mode}_pardo_2", - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"algorithm": "lcg"}\'', - iterations : 200, - number_of_counter_operations: 0, - number_of_counters : 0, - num_workers : 5, - autoscaling_algorithm: 'NONE', - ] - ], - [ - title : 
'ParDo Python Load test: 2GB 100 byte records 10 counters', - test : 'apache_beam.testing.load_tests.pardo_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-python-dataflow-${mode}-pardo-3-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_pardo_3", - influx_measurement : "python_${mode}_pardo_3", - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"algorithm": "lcg"}\'', - iterations : 1, - number_of_counter_operations: 10, - number_of_counters : 1, - num_workers : 5, - autoscaling_algorithm: 'NONE', - ] - ], - [ - title : 'ParDo Python Load test: 2GB 100 byte records 100 counters', - test : 'apache_beam.testing.load_tests.pardo_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-python-dataflow-${mode}-pardo-4-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_pardo_4", - influx_measurement : "python_${mode}_pardo_4", - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"algorithm": "lcg"}\'', - iterations : 1, - number_of_counter_operations: 100, - number_of_counters : 1, - num_workers : 5, - autoscaling_algorithm: 'NONE', - ] - ], - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } - .each{ test -> (mode != 'streaming') ?: addStreamingOptions(test) } -} - -def addStreamingOptions(test) { - test.pipelineOptions << [ - streaming: null, - // Use the new Dataflow runner, which offers improved efficiency of Dataflow jobs. - // See https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline#dataflow-runner-v2 - // for more details. 
- experiments: 'use_runner_v2', - ] -} - -def loadTestJob = { scope, triggeringContext, jobType -> - scope.description("Runs Python ParDo load tests on Dataflow runner in ${jobType} mode") - commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 200) - - def datasetName = loadTestsBuilder.getBigQueryDataset('load_test', triggeringContext) - for (testConfiguration in loadTestConfigurations(jobType, datasetName)) { - loadTestsBuilder.loadTest(scope, testConfiguration.title, testConfiguration.runner, CommonTestProperties.SDK.PYTHON, testConfiguration.pipelineOptions, testConfiguration.test) - } -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Python_ParDo_Dataflow_Batch', - 'Run Load Tests Python ParDo Dataflow Batch', - 'Load Tests Python ParDo Dataflow Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, "batch") - } - -CronJobBuilder.cronJob('beam_LoadTests_Python_ParDo_Dataflow_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, "batch") -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Python_ParDo_Dataflow_Streaming', - 'Run Python Load Tests ParDo Dataflow Streaming', - 'Load Tests Python ParDo Dataflow Streaming suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, "streaming") - } - -CronJobBuilder.cronJob('beam_LoadTests_Python_ParDo_Dataflow_Streaming', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, "streaming") -} diff --git a/.test-infra/jenkins/job_LoadTests_Python_Smoke.groovy b/.test-infra/jenkins/job_LoadTests_Python_Smoke.groovy deleted file mode 100644 index 82a6e958021fc..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_Python_Smoke.groovy +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder - -def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -def smokeTestConfigurations = { datasetName -> - [ - [ - title : 'GroupByKey Python load test Direct', - test : 'apache_beam.testing.load_tests.group_by_key_test', - runner : CommonTestProperties.Runner.DIRECT, - pipelineOptions: [ - publish_to_big_query: true, - project : 'apache-beam-testing', - metrics_dataset : datasetName, - metrics_table : 'python_direct_gbk', - input_options : '\'{"num_records": 100000,' + - '"key_size": 1,' + - '"value_size":1}\'', - - ] - ], - [ - title : 'GroupByKey Python load test Dataflow', - test : 'apache_beam.testing.load_tests.group_by_key_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : 'load-tests-python-dataflow-batch-gbk-smoke-' + now, - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/smoketests', - publish_to_big_query: true, - metrics_dataset : datasetName, - metrics_table : 'python_dataflow_gbk', - input_options : '\'{"num_records": 100000,' + - '"key_size": 1,' + - '"value_size":1}\'', - max_num_workers : 1, - ] - ], - ] -} - -// Runs a tiny version load test suite to ensure nothing is broken. -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_Python_LoadTests_Smoke', - 'Run Python Load Tests Smoke', - 'Python Load Tests Smoke', - this - ) { - def datasetName = loadTestsBuilder.getBigQueryDataset('load_test_SMOKE', CommonTestProperties.TriggeringContext.PR) - loadTestsBuilder.loadTests(delegate, CommonTestProperties.SDK.PYTHON, smokeTestConfigurations(datasetName), "GBK", "smoke") - } diff --git a/.test-infra/jenkins/job_LoadTests_SideInput_Flink_Go.groovy b/.test-infra/jenkins/job_LoadTests_SideInput_Flink_Go.groovy deleted file mode 100644 index bd0eaa4f23e6b..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_SideInput_Flink_Go.groovy +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import CommonTestProperties -import CommonJobProperties as commonJobProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -import static LoadTestsBuilder.DOCKER_BEAM_JOBSERVER -import static LoadTestsBuilder.GO_SDK_CONTAINER - -def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -def batchScenarios = { - [ - [ - title : 'SideInput Go Load test: 400mb-1kb-10workers-1window-first-iterable', - test : 'sideinput', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-sideinput-3-${now}", - influx_namespace : 'flink', - influx_measurement : 'go_batch_sideinput_3', - input_options : '\'{' + - '"num_records": 400000,' + - '"key_size": 100,' + - '"value_size": 900}\'', - access_percentage : 1, - parallelism : 10, - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'SideInput Go Load test: 400mb-1kb-10workers-1window-iterable', - test : 'sideinput', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-sideinput-4-${now}", - influx_namespace : 'flink', - influx_measurement : 'go_batch_sideinput_4', - input_options : '\'{' + - '"num_records": 400000,' + - '"key_size": 100,' + - '"value_size": 900}\'', - parallelism : 10, - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - ] - .each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def loadTestJob = { scope, triggeringContext, mode -> - def numberOfWorkers = 10 - - Flink flink = new Flink(scope, "beam_LoadTests_Go_SideInput_Flink_${mode.capitalize()}") - flink.setUp( - [ - GO_SDK_CONTAINER - ], - numberOfWorkers, - "${DOCKER_BEAM_JOBSERVER}/beam_flink${CommonTestProperties.getFlinkVersion()}_job_server:latest") - - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.GO, - batchScenarios(), 'SideInput', mode) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Go_SideInput_Flink_Batch', - 'Run Load Tests Go SideInput Flink Batch', - 'Load Tests Go SideInput Flink Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'batch') - } - -CronJobBuilder.cronJob('beam_LoadTests_Go_SideInput_Flink_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - // TODO(BEAM): Fix this test. - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'batch') -} diff --git a/.test-infra/jenkins/job_LoadTests_SideInput_Go.groovy b/.test-infra/jenkins/job_LoadTests_SideInput_Go.groovy deleted file mode 100644 index 35484d4cc3ea6..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_SideInput_Go.groovy +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -import static LoadTestsBuilder.GO_SDK_CONTAINER - -String now = new Date().format('MMddHHmmss', TimeZone.getTimeZone('UTC')) - -def batchScenarios = { - [ - [ - title : 'SideInput Go Load test: 10gb-1kb-10workers-1window-first-iterable', - test : 'sideinput', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-sideinput-3-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_namespace : 'dataflow', - influx_measurement : 'go_batch_sideinput_3', - input_options : '\'{' + - '"num_records": 10000000,' + - '"key_size": 100,' + - '"value_size": 900}\'', - access_percentage: 1, - num_workers : 10, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'SideInput Go Load test: 10gb-1kb-10workers-1window-iterable', - test : 'sideinput', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-sideinput-4-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_namespace : 'dataflow', - influx_measurement : 'go_batch_sideinput_4', - input_options : '\'{' + - '"num_records": 10000000,' + - '"key_size": 100,' + - '"value_size": 900}\'', - num_workers : 10, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ] - ] - .each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def loadTestJob = { scope, triggeringContext, mode -> - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.GO, batchScenarios(), 'sideinput', mode) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Go_SideInput_Dataflow_Batch', - 'Run Load Tests Go SideInput Dataflow Batch', - 'Load Tests Go SideInput Dataflow Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'batch') - } - -CronJobBuilder.cronJob('beam_LoadTests_Go_SideInput_Dataflow_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'batch') -} diff --git a/.test-infra/jenkins/job_LoadTests_SideInput_Python.groovy b/.test-infra/jenkins/job_LoadTests_SideInput_Python.groovy deleted file mode 100644 index f6655bfe43244..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_SideInput_Python.groovy +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) 
under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -def fromTemplate = { mode, name, id, datasetName, testSpecificOptions -> - [ - title : "SideInput Python Load test: ${name}", - test : 'apache_beam.testing.load_tests.sideinput_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-python-dataflow-${mode}-sideinput-${id}-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_sideinput_${id}", - influx_measurement : "python_${mode}_sideinput_${id}", - num_workers : 10, - autoscaling_algorithm: 'NONE', - experiments : 'use_runner_v2', - ] << testSpecificOptions - ] -} - -def loadTestConfigurations = { mode, datasetName -> - [ - [ - name: '1gb-1kb-10workers-1window-1key-percent-dict', - testSpecificOptions: [ - input_options : '\'{' + - '"num_records": 1000000,' + - '"key_size": 100,' + - '"value_size": 900,' + - '"algorithm": "lcg"}\'', - side_input_type : 'dict', - access_percentage: 1, - ] - ], - [ - name: '1gb-1kb-10workers-1window-99key-percent-dict', - testSpecificOptions: [ - input_options : '\'{' + - '"num_records": 1000000,' + - '"key_size": 100,' + - '"value_size": 900,' + - '"algorithm": "lcg"}\'', - side_input_type : 'dict', - access_percentage: 99, - ] - ], - [ - name: '10gb-1kb-10workers-1window-first-iterable', - testSpecificOptions: [ - input_options : '\'{' + - '"num_records": 10000000,' + - '"key_size": 100,' + - '"value_size": 900,' + - '"algorithm": "lcg"}\'', - side_input_type : 'iter', - access_percentage: 1, - ] - ], - [ - name: '10gb-1kb-10workers-1window-iterable', - testSpecificOptions: [ - input_options : '\'{' + - '"num_records": 10000000,' + - '"key_size": 100,' + - '"value_size": 900,' + - '"algorithm": "lcg"}\'', - side_input_type : 'iter', - ] - ], - [ - name: '1gb-1kb-10workers-1window-first-list', - testSpecificOptions: [ - input_options : '\'{' + - '"num_records": 1000000,' + - '"key_size": 100,' + - '"value_size": 900,' + - '"algorithm": "lcg"}\'', - side_input_type : 'list', - access_percentage: 1, - ] - ], - [ - name: '1gb-1kb-10workers-1window-list', - testSpecificOptions: [ - input_options : '\'{' + - '"num_records": 1000000,' + - '"key_size": 100,' + - '"value_size": 900,' + - '"algorithm": "lcg"}\'', - side_input_type : 'list', - ] - ], - [ - name: '1gb-1kb-10workers-1000window-1key-percent-dict', - testSpecificOptions: [ - input_options : '\'{' + - '"num_records": 1000000,' + - 
'"key_size": 100,' + - '"value_size": 900,' + - '"algorithm": "lcg"}\'', - side_input_type : 'dict', - access_percentage: 1, - window_count : 1000, - ] - ], - [ - name: '1gb-1kb-10workers-1000window-99key-percent-dict', - testSpecificOptions: [ - input_options : '\'{' + - '"num_records": 1000000,' + - '"key_size": 100,' + - '"value_size": 900,' + - '"algorithm": "lcg"}\'', - side_input_type : 'dict', - access_percentage: 99, - window_count : 1000, - ] - ], - [ - name: '10gb-1kb-10workers-1000window-first-iterable', - testSpecificOptions: [ - input_options : '\'{' + - '"num_records": 10000000,' + - '"key_size": 100,' + - '"value_size": 900,' + - '"algorithm": "lcg"}\'', - side_input_type : 'iter', - access_percentage: 1, - window_count : 1000, - ] - ], - [ - name: '10gb-1kb-10workers-1000window-iterable', - testSpecificOptions: [ - input_options : '\'{' + - '"num_records": 10000000,' + - '"key_size": 100,' + - '"value_size": 900,' + - '"algorithm": "lcg"}\'', - side_input_type : 'iter', - window_count : 1000, - ] - ], - ].indexed().collect { index, it -> - fromTemplate(mode, it.name, index + 1, datasetName, it.testSpecificOptions << additionalPipelineArgs) - } -} - - -def loadTestJob = { scope, triggeringContext, mode -> - def datasetName = loadTestsBuilder.getBigQueryDataset('load_test', triggeringContext) - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.PYTHON, - loadTestConfigurations(mode, datasetName), 'SideInput', mode) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Python_SideInput_Dataflow_Batch', - 'Run Load Tests Python SideInput Dataflow Batch', - 'Load Tests Python SideInput Dataflow Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'batch') - } - -CronJobBuilder.cronJob('beam_LoadTests_Python_SideInput_Dataflow_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'batch') -} diff --git a/.test-infra/jenkins/job_LoadTests_coGBK_Flink_Go.groovy b/.test-infra/jenkins/job_LoadTests_coGBK_Flink_Go.groovy deleted file mode 100644 index 8c7a60e724f9a..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_coGBK_Flink_Go.groovy +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import Flink -import InfluxDBCredentialsHelper - -import static LoadTestsBuilder.DOCKER_BEAM_JOBSERVER -import static LoadTestsBuilder.GO_SDK_CONTAINER - -String now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -// TODO(BEAM-11398): Skipping the first test because it is too slow. -def TESTS_TO_SKIP = [ - 'load-tests-go-flink-batch-cogbk-1-', -] - -def batchScenarios = { - [ - [ - title : 'CoGroupByKey Go Load test: 2GB of 100B records with a single key', - test : 'cogbk', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-cogbk-1-${now}", - influx_measurement : 'go_batch_cogbk_1', - influx_namespace : 'flink', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1,' + - '"hot_key_fraction": 1}\'', - co_input_options : '\'{' + - '"num_records": 2000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1000,' + - '"hot_key_fraction": 1}\'', - iterations : 1, - parallelism : 5, - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'CoGroupByKey Go Load test: 2GB of 100B records with multiple keys', - test : 'cogbk', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-cogbk-2-${now}", - influx_measurement : 'go_batch_cogbk_2', - influx_namespace : 'flink', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 5,' + - '"hot_key_fraction": 1}\'', - co_input_options : '\'{' + - '"num_records": 2000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1000,' + - '"hot_key_fraction": 1}\'', - iterations : 1, - parallelism : 5, - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'CoGroupByKey Go Load test: reiterate 4 times 10kB values', - test : 'cogbk', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-cogbk-3-${now}", - influx_measurement : 'go_batch_cogbk_3', - influx_namespace : 'flink', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 200000,' + - '"hot_key_fraction": 1}\'', - co_input_options : '\'{' + - '"num_records": 2000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1000,' + - '"hot_key_fraction": 1}\'', - iterations : 4, - parallelism : 5, - endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'CoGroupByKey Go Load test: reiterate 4 times 2MB values', - test : 'cogbk', - runner : CommonTestProperties.Runner.FLINK, - pipelineOptions: [ - job_name : "load-tests-go-flink-batch-cogbk-4-${now}", - influx_measurement : 'go_batch_cogbk_4', - influx_namespace : 'flink', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1000,' + - '"hot_key_fraction": 1}\'', - co_input_options : '\'{' + - '"num_records": 2000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1000,' + - '"hot_key_fraction": 1}\'', - iterations : 4, - parallelism : 5, - endpoint : 'localhost:8099', - 
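// (Editorial note, not in the deleted file: 20,000,000 records of 100 B, 10 B key + 90 B value, spread over 1,000 hot keys give ~20,000 values, i.e. ~2 MB, per key — the "2MB values" that the 4 iterations above re-read.)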
environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - ] - .each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } - .collectMany { test -> - TESTS_TO_SKIP.any { element -> test.pipelineOptions.job_name.startsWith(element) } ? []: [test] - } -} - -def loadTestJob = { scope, triggeringContext, mode -> - def numberOfWorkers = 5 - - def flink = new Flink(scope, "beam_LoadTests_Go_CoGBK_Flink_${mode.capitalize()}") - flink.setUp( - [ - GO_SDK_CONTAINER - ], - numberOfWorkers, - "${DOCKER_BEAM_JOBSERVER}/beam_flink${CommonTestProperties.getFlinkVersion()}_job_server:latest") - - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.GO, batchScenarios(), 'CoGBK', mode) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Go_CoGBK_Flink_Batch', - 'Run Load Tests Go CoGBK Flink Batch', - 'Load Tests Go CoGBK Flink Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'batch') - } - -CronJobBuilder.cronJob('beam_LoadTests_Go_CoGBK_Flink_batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - // TODO(BEAM): Fix this test. - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'batch') -} diff --git a/.test-infra/jenkins/job_LoadTests_coGBK_Flink_Python.groovy b/.test-infra/jenkins/job_LoadTests_coGBK_Flink_Python.groovy deleted file mode 100644 index 9a0798f8107b2..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_coGBK_Flink_Python.groovy +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import Flink -import InfluxDBCredentialsHelper - -import static LoadTestsBuilder.DOCKER_CONTAINER_REGISTRY -import static LoadTestsBuilder.DOCKER_BEAM_SDK_IMAGE -import static LoadTestsBuilder.DOCKER_BEAM_JOBSERVER - -String now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -def scenarios = { datasetName -> - [ - [ - title : 'CoGroupByKey Python Load test: 2GB of 100B records with a single key', - test : 'apache_beam.testing.load_tests.co_group_by_key_test', - runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - project : 'apache-beam-testing', - job_name : 'load-tests-python-flink-batch-cogbk-1-' + now, - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_flink_batch_cogbk_1", - influx_measurement : 'python_batch_cogbk_1', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1,' + - '"hot_key_fraction": 1}\'', - co_input_options : '\'{' + - '"num_records": 2000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1000,' + - '"hot_key_fraction": 1}\'', - iterations : 1, - parallelism : 5, - job_endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - ] - ], - [ - title : 'CoGroupByKey Python Load test: 2GB of 100B records with multiple keys', - test : 'apache_beam.testing.load_tests.co_group_by_key_test', - runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - project : 'apache-beam-testing', - job_name : 'load-tests-python-flink-batch-cogbk-2-' + now, - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : 'python_flink_batch_cogbk_2', - influx_measurement : 'python_batch_cogbk_2', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 5,' + - '"hot_key_fraction": 1}\'', - co_input_options : '\'{' + - '"num_records": 2000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1000,' + - '"hot_key_fraction": 1}\'', - iterations : 1, - parallelism : 5, - job_endpoint : 'localhost:8099', - environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - ] - ], - [ - title : 'CoGroupByKey Python Load test: reiterate 4 times 10kB values', - test : 'apache_beam.testing.load_tests.co_group_by_key_test', - runner : CommonTestProperties.Runner.PORTABLE, - pipelineOptions: [ - project : 'apache-beam-testing', - job_name : 'load-tests-python-flink-batch-cogbk-3-' + now, - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_flink_batch_cogbk_3", - influx_measurement : 'python_batch_cogbk_3', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 200000,' + - '"hot_key_fraction": 1}\'', - co_input_options : '\'{' + - '"num_records": 2000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1000,' + - '"hot_key_fraction": 1}\'', - iterations : 4, - parallelism : 5, - job_endpoint : 'localhost:8099', - 
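// (Editorial note, not in the deleted file: 20,000,000 records of 100 B over 200,000 hot keys give ~100 values, i.e. ~10 kB, per key — the "10kB values" that the 4 iterations above re-read.)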
environment_type : 'DOCKER', - environment_config : "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}", - ] - ], - ].each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def loadTest = { scope, triggeringContext -> - def datasetName = loadTestsBuilder.getBigQueryDataset('load_test', triggeringContext) - def numberOfWorkers = 5 - List testScenarios = scenarios(datasetName) - - def flink = new Flink(scope, 'beam_LoadTests_Python_CoGBK_Flink_Batch') - flink.setUp( - [ - "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}" - ], - numberOfWorkers, - "${DOCKER_BEAM_JOBSERVER}/beam_flink${CommonTestProperties.getFlinkVersion()}_job_server:latest") - - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.PYTHON, testScenarios, 'CoGBK', 'batch') -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Python_CoGBK_Flink_Batch', - 'Run Load Tests Python CoGBK Flink Batch', - 'Load Tests Python CoGBK Flink Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTest(delegate, CommonTestProperties.TriggeringContext.PR) - } - -// TODO(https://github.com/apache/beam/issues/20146) Re-enable auto builds after these tests pass. diff --git a/.test-infra/jenkins/job_LoadTests_coGBK_Go.groovy b/.test-infra/jenkins/job_LoadTests_coGBK_Go.groovy deleted file mode 100644 index cab73e55968ab..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_coGBK_Go.groovy +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
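The "2GB of 100B records" wording in the scenario titles below follows from input_options: each synthetic record occupies key_size + value_size bytes. A quick check of the arithmetic used throughout these suites:

    // Main input: 20,000,000 records x (10 B key + 90 B value) = 2 GB.
    long numRecords = 20_000_000
    long keySize = 10
    long valueSize = 90
    assert numRecords * (keySize + valueSize) == 2_000_000_000L
    // Co-input: 2,000,000 records x 100 B = 200 MB.
    long coInputRecords = 2_000_000
    assert coInputRecords * (keySize + valueSize) == 200_000_000L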
- */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -import static LoadTestsBuilder.GO_SDK_CONTAINER - -String now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -def batchScenarios = { - [ - [ - title : 'CoGroupByKey Go Load test: 2GB of 100B records with a single key', - test : 'cogbk', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-cogbk-1-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_measurement : 'go_batch_cogbk_1', - influx_namespace : 'dataflow', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1,' + - '"hot_key_fraction": 1}\'', - co_input_options : '\'{' + - '"num_records": 2000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1000,' + - '"hot_key_fraction": 1}\'', - iterations : 1, - num_workers : 5, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'CoGroupByKey Go Load test: 2GB of 100B records with multiple keys', - test : 'cogbk', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-cogbk-2-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_measurement : 'go_batch_cogbk_2', - influx_namespace : 'dataflow', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 5,' + - '"hot_key_fraction": 1}\'', - co_input_options : '\'{' + - '"num_records": 2000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1000,' + - '"hot_key_fraction": 1}\'', - iterations : 1, - num_workers : 5, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'CoGroupByKey Go Load test: reiterate 4 times 10kB values', - test : 'cogbk', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-cogbk-3-${now}", - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_measurement : 'go_batch_cogbk_3', - influx_namespace : 'dataflow', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 200000,' + - '"hot_key_fraction": 1}\'', - co_input_options : '\'{' + - '"num_records": 2000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1000,' + - '"hot_key_fraction": 1}\'', - iterations : 4, - num_workers : 5, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - [ - title : 'CoGroupByKey Go Load test: reiterate 4 times 2MB values', - test : 'cogbk', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : "load-tests-go-dataflow-batch-cogbk-4-${now}", - project : 'apache-beam-testing', - region : 
'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - staging_location : 'gs://temp-storage-for-perf-tests/loadtests', - influx_measurement : 'go_batch_cogbk_4', - influx_namespace : 'dataflow', - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1000,' + - '"hot_key_fraction": 1}\'', - co_input_options : '\'{' + - '"num_records": 2000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1000,' + - '"hot_key_fraction": 1}\'', - iterations : 4, - num_workers : 5, - autoscaling_algorithm: 'NONE', - environment_type : 'DOCKER', - environment_config : GO_SDK_CONTAINER, - ] - ], - ] - .each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } -} - -def loadTestJob = { scope, triggeringContext, mode -> - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.GO, batchScenarios(), 'CoGBK', mode) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Go_CoGBK_Dataflow_Batch', - 'Run Load Tests Go CoGBK Dataflow Batch', - 'Load Tests Go CoGBK Dataflow Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'batch') - } - -CronJobBuilder.cronJob('beam_LoadTests_Go_CoGBK_Dataflow_batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'batch') -} diff --git a/.test-infra/jenkins/job_LoadTests_coGBK_Python.groovy b/.test-infra/jenkins/job_LoadTests_coGBK_Python.groovy deleted file mode 100644 index 2e20a1369efd6..0000000000000 --- a/.test-infra/jenkins/job_LoadTests_coGBK_Python.groovy +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
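The Python Dataflow file that follows appends streaming-only options with Groovy's Map.leftShift. A small sketch of that merge (the two option values are copied from the file, the starting map is illustrative); the null value presumably just registers the key so the harness can emit a bare --streaming flag:

    def pipelineOptions = [num_workers: 5]               // illustrative starting map
    pipelineOptions << [streaming: null, worker_machine_type: 'n1-highmem-4']
    assert pipelineOptions.containsKey('streaming')      // key present, value null
    assert pipelineOptions.worker_machine_type == 'n1-highmem-4'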
- */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import CronJobBuilder -import InfluxDBCredentialsHelper - -def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -def loadTestConfigurations = { mode, datasetName -> - [ - [ - title : 'CoGroupByKey Python Load test: 2GB of 100B records with a single key', - test : 'apache_beam.testing.load_tests.co_group_by_key_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - job_name : "load-tests-python-dataflow-${mode}-cogbk-1-${now}", - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_cogbk_1", - influx_measurement : "python_${mode}_cogbk_1", - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1,' + - '"hot_key_fraction": 1,' + - '"algorithm": "lcg"}\'', - co_input_options : '\'{' + - '"num_records": 2000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1000,' + - '"hot_key_fraction": 1,' + - '"algorithm": "lcg"}\'', - iterations : 1, - num_workers : 5, - autoscaling_algorithm: 'NONE' - ] - ], - [ - title : 'CoGroupByKey Python Load test: 2GB of 100B records with multiple keys', - test : 'apache_beam.testing.load_tests.co_group_by_key_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - job_name : "load-tests-python-dataflow-${mode}-cogbk-2-${now}", - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_cogbk_2", - influx_measurement : "python_${mode}_cogbk_2", - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 5,' + - '"hot_key_fraction": 1,' + - '"algorithm": "lcg"}\'', - co_input_options : '\'{' + - '"num_records": 2000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1000,' + - '"hot_key_fraction": 1,' + - '"algorithm": "lcg"}\'', - iterations : 1, - num_workers : 5, - autoscaling_algorithm: 'NONE' - ] - ], - [ - title : 'CoGroupByKey Python Load test: reiterate 4 times 10kB values', - test : 'apache_beam.testing.load_tests.co_group_by_key_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - job_name : "load-tests-python-dataflow-${mode}-cogbk-3-${now}", - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_cogbk_3", - influx_measurement : "python_${mode}_cogbk_3", - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 200000,' + - '"hot_key_fraction": 1,' + - '"algorithm": "lcg"}\'', - co_input_options : '\'{' + - '"num_records": 2000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1000,' + - '"hot_key_fraction": 1,' + - '"algorithm": "lcg"}\'', - iterations : 4, - num_workers : 5, - autoscaling_algorithm: 'NONE' - ] - ], - [ - title : 'CoGroupByKey Python Load test: reiterate 4 times 2MB values', - test : 
'apache_beam.testing.load_tests.co_group_by_key_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - project : 'apache-beam-testing', - region : 'us-central1', - job_name : "load-tests-python-dataflow-${mode}-cogbk-4-${now}", - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : datasetName, - metrics_table : "python_dataflow_${mode}_cogbk_4", - influx_measurement : "python_${mode}_cogbk_4", - input_options : '\'{' + - '"num_records": 20000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1000,' + - '"hot_key_fraction": 1,' + - '"algorithm": "lcg"}\'', - co_input_options : '\'{' + - '"num_records": 2000000,' + - '"key_size": 10,' + - '"value_size": 90,' + - '"num_hot_keys": 1000,' + - '"hot_key_fraction": 1,' + - '"algorithm": "lcg"}\'', - iterations : 4, - num_workers : 5, - autoscaling_algorithm: 'NONE' - ] - ], - ] - .each { test -> test.pipelineOptions.putAll(additionalPipelineArgs) } - .each { test -> (mode != 'streaming') ?: addStreamingOptions(test) } -} - -def addStreamingOptions(test) { - // Use highmem workers to prevent out of memory issues. - test.pipelineOptions << [streaming: null, - worker_machine_type: 'n1-highmem-4' - ] -} - -def loadTestJob = { scope, triggeringContext, mode -> - def datasetName = loadTestsBuilder.getBigQueryDataset('load_test', triggeringContext) - loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.PYTHON, - loadTestConfigurations(mode, datasetName), 'CoGBK', mode) -} - -CronJobBuilder.cronJob('beam_LoadTests_Python_CoGBK_Dataflow_Batch', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'batch') -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Python_CoGBK_Dataflow_Batch', - 'Run Load Tests Python CoGBK Dataflow Batch', - 'Load Tests Python CoGBK Dataflow Batch suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'batch') - } - -CronJobBuilder.cronJob('beam_LoadTests_Python_CoGBK_Dataflow_Streaming', 'H H * * *', this) { - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.POST_COMMIT, 'streaming') -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_LoadTests_Python_CoGBK_Dataflow_Streaming', - 'Run Load Tests Python CoGBK Dataflow Streaming', - 'Load Tests Python CoGBK Dataflow Streaming suite', - this - ) { - additionalPipelineArgs = [:] - loadTestJob(delegate, CommonTestProperties.TriggeringContext.PR, 'streaming') - } diff --git a/.test-infra/jenkins/job_MetricsCredentialsRotation.groovy b/.test-infra/jenkins/job_MetricsCredentialsRotation.groovy deleted file mode 100644 index 800899d0cd939..0000000000000 --- a/.test-infra/jenkins/job_MetricsCredentialsRotation.groovy +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties - -job('Rotate Metrics Cluster Credentials') { - description('Rotates certificates and performs an IP rotation for the Metrics cluster') - - // Set common parameters. - commonJobProperties.setTopLevelMainJobProperties(delegate) - - // Sets that this is a cron job. - commonJobProperties.setCronJob(delegate, 'H 2 1 * *') // Around 2am on the first day of each month. - def date = new Date().format('E MMM dd HH:mm:ss z yyyy') - - steps { - // Starting credential rotation - shell('''gcloud container clusters update metrics \ - --start-credential-rotation --zone=us-central1-a --quiet''') - - // Rebuilding the nodes - shell('''gcloud container clusters upgrade metrics \ - --node-pool=default-pool --zone=us-central1-a --quiet''') - - // Completing the rotation - shell('''gcloud container clusters update metrics \ - --complete-credential-rotation --zone=us-central1-a --quiet''') - } - - publishers { - extendedEmail { - triggers { - failure { - subject('Credentials Rotation Failure on Metrics cluster') - content("Something went wrong during the automatic credentials rotation for Metrics Cluster, performed at ${date}. It may be necessary to check the state of the cluster certificates. For further details refer to the following links:\n * Failing job: https://ci-beam.apache.org/job/Rotate%20Metrics%20Cluster%20Credentials/ \n * Job configuration: https://github.com/apache/beam/blob/master/.test-infra/jenkins/job_MetricsCredentialsRotation.groovy \n * Cluster URL: https://pantheon.corp.google.com/kubernetes/clusters/details/us-central1-a/metrics/details?mods=dataflow_dev&project=apache-beam-testing") - recipientList('dev@beam.apache.org') - } - } - } - } -} diff --git a/.test-infra/jenkins/job_Metrics_Report.groovy b/.test-infra/jenkins/job_Metrics_Report.groovy deleted file mode 100644 index d1d3582212133..0000000000000 --- a/.test-infra/jenkins/job_Metrics_Report.groovy +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import InfluxDBCredentialsHelper - -job('beam_Metrics_Report') { - description('Runs Beam metrics report.') - - // Set common parameters.
- commonJobProperties.setTopLevelMainJobProperties( - delegate, 'master', 100, true, 'beam', false) - InfluxDBCredentialsHelper.useCredentials(delegate) - - def influxDb = InfluxDBCredentialsHelper.InfluxDBDatabaseName - def influxHost = InfluxDBCredentialsHelper.InfluxDBHost - def influxPort = InfluxDBCredentialsHelper.InfluxDBPort - - // Allows triggering this build against pull requests. - commonJobProperties.enablePhraseTriggeringFromPullRequest( - delegate, - 'Beam Metrics Report', - 'Run Metrics Report', - false - ) - - commonJobProperties.setAutoJob( - delegate, - '@weekly') - - steps { - gradle { - rootBuildScriptDir(commonJobProperties.checkoutDir) - commonJobProperties.setGradleSwitches(delegate) - switches("-PinfluxDb=${influxDb}") - switches("-PinfluxHost=${influxHost}") - switches("-PinfluxPort=${influxPort}") - tasks(':beam-test-jenkins:generateMetricsReport') - } - } - - def date = new Date().format('yyyy-MM-dd') - publishers { - extendedEmail { - triggers { - always { - recipientList('dev@beam.apache.org') - contentType('text/html') - subject("Beam Metrics Report (${date})") - content('''${FILE, path="src/.test-infra/jenkins/metrics_report/beam-metrics_report.html"}''') - } - } - } - archiveArtifacts { - pattern('src/.test-infra/jenkins/metrics_report/beam-metrics_report.html') - onlyIfSuccessful() - } - wsCleanup { - excludePattern('src/.test-infra/jenkins/metrics_report/beam-metrics_report.html') - } - } -} diff --git a/.test-infra/jenkins/job_PerformanceTests_BigQueryIO_Java.groovy b/.test-infra/jenkins/job_PerformanceTests_BigQueryIO_Java.groovy deleted file mode 100644 index 270aea0b7334b..0000000000000 --- a/.test-infra/jenkins/job_PerformanceTests_BigQueryIO_Java.groovy +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
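The BigQueryIO configs below embed sourceOptions as a multi-line JSON string flattened with trim().replaceAll so it survives the Gradle command line. A sketch of the transformation, using the exact values from those configs:

    def sourceOptions = """
        {
          "numRecords": "10485760",
          "keySizeBytes": "1",
          "valueSizeBytes": "1024"
        }
        """.trim().replaceAll("\\s", "")   // strip all whitespace, including newlines
    assert sourceOptions == '{"numRecords":"10485760","keySizeBytes":"1","valueSizeBytes":"1024"}'
    // 10,485,760 records x 1 KiB values = 10 GiB, matching the "10 GB" in the titles.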
- */ - -import CommonJobProperties as common -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -def jobConfigs = [ - [ - title : 'BigQueryIO Performance Test Streaming Java 10 GB', - triggerPhrase: 'Run BigQueryIO Streaming Performance Test Java', - name : 'beam_PerformanceTests_BiqQueryIO_Streaming_Java', - itClass : 'org.apache.beam.sdk.bigqueryioperftests.BigQueryIOIT', - properties: [ - project : 'apache-beam-testing', - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - tempRoot : 'gs://temp-storage-for-perf-tests/loadtests', - writeMethod : 'STREAMING_INSERTS', - writeFormat : 'JSON', - pipelineTimeout : '1200', - testBigQueryDataset : 'beam_performance', - testBigQueryTable : 'bqio_write_10GB_java_stream_' + now, - metricsBigQueryDataset: 'beam_performance', - metricsBigQueryTable : 'bqio_10GB_results_java_stream', - influxMeasurement : 'bqio_10GB_results_java_stream', - sourceOptions : """ - { - "numRecords": "10485760", - "keySizeBytes": "1", - "valueSizeBytes": "1024" - } - """.trim().replaceAll("\\s", ""), - runner : 'DataflowRunner', - maxNumWorkers : '5', - numWorkers : '5', - autoscalingAlgorithm : 'NONE', - ] - ], - [ - title : 'BigQueryIO Performance Test Batch Java 10 GB JSON', - triggerPhrase: 'Run BigQueryIO Batch Performance Test Java Json', - name : 'beam_PerformanceTests_BiqQueryIO_Batch_Java_Json', - itClass : 'org.apache.beam.sdk.bigqueryioperftests.BigQueryIOIT', - properties: [ - project : 'apache-beam-testing', - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - tempRoot : 'gs://temp-storage-for-perf-tests/loadtests', - writeMethod : 'FILE_LOADS', - writeFormat : 'JSON', - testBigQueryDataset : 'beam_performance', - testBigQueryTable : 'bqio_write_10GB_java_json_' + now, - metricsBigQueryDataset: 'beam_performance', - metricsBigQueryTable : 'bqio_10GB_results_java_batch_json', - influxMeasurement : 'bqio_10GB_results_java_batch_json', - sourceOptions : """ - { - "numRecords": "10485760", - "keySizeBytes": "1", - "valueSizeBytes": "1024" - } - """.trim().replaceAll("\\s", ""), - runner : "DataflowRunner", - maxNumWorkers : '5', - numWorkers : '5', - autoscalingAlgorithm : 'NONE', - ] - ], - [ - title : 'BigQueryIO Performance Test Batch Java 10 GB AVRO', - triggerPhrase: 'Run BigQueryIO Batch Performance Test Java Avro', - name : 'beam_PerformanceTests_BiqQueryIO_Batch_Java_Avro', - itClass : 'org.apache.beam.sdk.bigqueryioperftests.BigQueryIOIT', - properties: [ - project : 'apache-beam-testing', - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - tempRoot : 'gs://temp-storage-for-perf-tests/loadtests', - writeMethod : 'FILE_LOADS', - writeFormat : 'AVRO', - testBigQueryDataset : 'beam_performance', - testBigQueryTable : 'bqio_write_10GB_java_avro_' + now, - metricsBigQueryDataset: 'beam_performance', - metricsBigQueryTable : 'bqio_10GB_results_java_batch_avro', - influxMeasurement : 'bqio_10GB_results_java_batch_avro', - sourceOptions : """ - { - "numRecords": "10485760", - "keySizeBytes": "1", - "valueSizeBytes": "1024" - } - """.trim().replaceAll("\\s", ""), - runner : "DataflowRunner", - maxNumWorkers : '5', - numWorkers : '5', - autoscalingAlgorithm : 'NONE', - ] - ] -] - -jobConfigs.forEach { jobConfig -> createPostCommitJob(jobConfig)} - -private void createPostCommitJob(jobConfig) { - job(jobConfig.name) { - description(jobConfig.description) - common.setTopLevelMainJobProperties(delegate) - 
common.enablePhraseTriggeringFromPullRequest(delegate, jobConfig.title, jobConfig.triggerPhrase) - common.setAutoJob(delegate, 'H H/12 * * *') - publishers { - archiveJunit('**/build/test-results/**/*.xml') - } - InfluxDBCredentialsHelper.useCredentials(delegate) - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - jobConfig.properties.putAll(additionalPipelineArgs) - - steps { - gradle { - rootBuildScriptDir(common.checkoutDir) - common.setGradleSwitches(delegate) - switches("--info") - switches("-DintegrationTestPipelineOptions=\'${common.joinOptionsWithNestedJsonValues(jobConfig.properties)}\'") - switches("-DintegrationTestRunner=dataflow") - tasks(":sdks:java:io:bigquery-io-perf-tests:integrationTest --tests ${jobConfig.itClass}") - } - } - } -} diff --git a/.test-infra/jenkins/job_PerformanceTests_BigQueryIO_Python.groovy b/.test-infra/jenkins/job_PerformanceTests_BigQueryIO_Python.groovy deleted file mode 100644 index 571410cb3563b..0000000000000 --- a/.test-infra/jenkins/job_PerformanceTests_BigQueryIO_Python.groovy +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -def bqio_read_test = [ - title : 'BigQueryIO Read Performance Test Python 10 GB', - test : 'apache_beam.io.gcp.bigquery_read_perf_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : 'performance-tests-bqio-read-python-10gb' + now, - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - input_dataset : 'beam_performance', - input_table : 'bqio_read_10GB', - publish_to_big_query : true, - metrics_dataset : 'beam_performance', - metrics_table : 'bqio_read_10GB_results', - influx_measurement : 'python_bqio_read_10GB_results', - influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl, - input_options : '\'{' + - '"num_records": 10485760,' + - '"key_size": 1,' + - '"value_size": 1024,' + - '"algorithm": "lcg"}\'', - num_workers : 5, - autoscaling_algorithm: 'NONE', // Disable autoscaling of the worker pool.
- ] -] - -def bqio_write_test = [ - title : 'BigQueryIO Write Performance Test Python Batch 10 GB', - test : 'apache_beam.io.gcp.bigquery_write_perf_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : 'performance-tests-bqio-write-python-batch-10gb' + now, - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - output_dataset : 'beam_performance', - output_table : 'bqio_write_10GB', - publish_to_big_query : true, - metrics_dataset : 'beam_performance', - metrics_table : 'bqio_write_10GB_results', - influx_measurement : 'python_bqio_write_10GB_results', - influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl, - input_options : '\'{' + - '"num_records": 10485760,' + - '"key_size": 1,' + - '"value_size": 1024,' + - '"algorithm": "lcg"}\'', - num_workers : 5, - autoscaling_algorithm: 'NONE', // Disable autoscaling of the worker pool. - ] -] - -def executeJob = { scope, testConfig -> - commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 240) - - loadTestsBuilder.loadTest(scope, testConfig.title, testConfig.runner, CommonTestProperties.SDK.PYTHON, testConfig.pipelineOptions, testConfig.test) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_PerformanceTests_BiqQueryIO_Read_Python', - 'Run BigQueryIO Read Performance Test Python', - 'BigQueryIO Read Performance Test Python', - this - ) { - executeJob(delegate, bqio_read_test) - } - -CronJobBuilder.cronJob('beam_PerformanceTests_BiqQueryIO_Read_Python', 'H H * * *', this) { - executeJob(delegate, bqio_read_test) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_PerformanceTests_BiqQueryIO_Write_Python_Batch', - 'Run BigQueryIO Write Performance Test Python Batch', - 'BigQueryIO Write Performance Test Python Batch', - this - ) { - executeJob(delegate, bqio_write_test) - } - -CronJobBuilder.cronJob('beam_PerformanceTests_BiqQueryIO_Write_Python_Batch', 'H H * * *', this) { - executeJob(delegate, bqio_write_test) -} diff --git a/.test-infra/jenkins/job_PerformanceTests_CdapIO.groovy b/.test-infra/jenkins/job_PerformanceTests_CdapIO.groovy deleted file mode 100644 index 94b4d677587b4..0000000000000 --- a/.test-infra/jenkins/job_PerformanceTests_CdapIO.groovy +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
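The CdapIO job below, like the JDBC and HadoopFormat jobs later in this diff, serializes its option map with common.joinPipelineOptions before passing it via -DintegrationTestPipelineOptions. That helper is not part of this diff; the sketch below assumes it renders the map as a JSON-style array of "--key=value" strings, which is an educated guess rather than the verified implementation:

    // Assumed behaviour of CommonJobProperties.joinPipelineOptions (not in this diff).
    def joinPipelineOptions(Map pipelineOptions) {
      def args = pipelineOptions.collect { k, v -> "\"--$k=$v\"" }
      '[' + args.join(',') + ']'
    }
    assert joinPipelineOptions([numWorkers: '5', runner: 'DataflowRunner']) ==
        '["--numWorkers=5","--runner=DataflowRunner"]'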
- */ - -import CommonJobProperties as common -import Kubernetes -import InfluxDBCredentialsHelper - -String jobName = "beam_PerformanceTests_Cdap" - -job(jobName) { - common.setTopLevelMainJobProperties(delegate) - common.setAutoJob(delegate, 'H H/12 * * *') - common.enablePhraseTriggeringFromPullRequest( - delegate, - 'Java CdapIO Performance Test', - 'Run Java CdapIO Performance Test') - InfluxDBCredentialsHelper.useCredentials(delegate) - - String namespace = common.getKubernetesNamespace(jobName) - String kubeconfig = common.getKubeconfigLocationForNamespace(namespace) - Kubernetes k8s = Kubernetes.create(delegate, kubeconfig, namespace) - - k8s.apply(common.makePathAbsolute("src/.test-infra/kubernetes/postgres/postgres-service-for-local-dev.yml")) - String postgresHostName = "LOAD_BALANCER_IP" - k8s.loadBalancerIP("postgres-for-dev", postgresHostName) - - Map pipelineOptions = [ - tempRoot : 'gs://temp-storage-for-perf-tests', - project : 'apache-beam-testing', - runner : 'DataflowRunner', - numberOfRecords : '5000000', - bigQueryDataset : 'beam_performance', - bigQueryTable : 'cdapioit_results', - influxMeasurement : 'cdapioit_results', - influxDatabase : InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost : InfluxDBCredentialsHelper.InfluxDBHostUrl, - postgresUsername : 'postgres', - postgresPassword : 'uuinkks', - postgresDatabaseName : 'postgres', - postgresServerName : "\$${postgresHostName}", - postgresSsl : false, - postgresPort : '5432', - numWorkers : '5', - autoscalingAlgorithm : 'NONE' - ] - - steps { - gradle { - rootBuildScriptDir(common.checkoutDir) - common.setGradleSwitches(delegate) - switches("--info") - switches("-DintegrationTestPipelineOptions=\'${common.joinPipelineOptions(pipelineOptions)}\'") - switches("-DintegrationTestRunner=dataflow") - tasks(":sdks:java:io:cdap:integrationTest --tests org.apache.beam.sdk.io.cdap.CdapIOIT") - } - } -} diff --git a/.test-infra/jenkins/job_PerformanceTests_FileBasedIO_IT.groovy b/.test-infra/jenkins/job_PerformanceTests_FileBasedIO_IT.groovy deleted file mode 100644 index c2208d8b00ad4..0000000000000 --- a/.test-infra/jenkins/job_PerformanceTests_FileBasedIO_IT.groovy +++ /dev/null @@ -1,384 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
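The CdapIO job above, and the JDBC and HadoopFormat jobs further down, pass postgresServerName as "\$${postgresHostName}". That GString mixes an escaped dollar with interpolation of the variable holding the env var's name, so Groovy produces literal text that the shell, not Groovy, expands later. A tiny sketch:

    String postgresHostName = "LOAD_BALANCER_IP"   // name of the env var set by k8s.loadBalancerIP
    String serverName = "\$${postgresHostName}"
    assert serverName == '$LOAD_BALANCER_IP'       // literal text, expanded at shell time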
- */ - -import CommonJobProperties as common -import InfluxDBCredentialsHelper - -def jobs = [ - [ - name : 'beam_PerformanceTests_TextIOIT', - description : 'Runs performance tests for TextIOIT', - test : 'org.apache.beam.sdk.io.text.TextIOIT', - githubTitle : 'Java TextIO Performance Test', - githubTriggerPhrase: 'Run Java TextIO Performance Test', - pipelineOptions : [ - bigQueryDataset : 'beam_performance', - bigQueryTable : 'textioit_results', - influxMeasurement : 'textioit_results', - numberOfRecords : '25000000', - expectedHash : 'f8453256ccf861e8a312c125dfe0e436', - datasetSize : '1062290000', - numWorkers : '5', - autoscalingAlgorithm: 'NONE' - ] - ], - [ - name : 'beam_PerformanceTests_Compressed_TextIOIT', - description : 'Runs performance tests for TextIOIT with GZIP compression', - test : 'org.apache.beam.sdk.io.text.TextIOIT', - githubTitle : 'Java CompressedTextIO Performance Test', - githubTriggerPhrase: 'Run Java CompressedTextIO Performance Test', - pipelineOptions : [ - bigQueryDataset : 'beam_performance', - bigQueryTable : 'compressed_textioit_results', - influxMeasurement : 'compressed_textioit_results', - numberOfRecords : '450000000', - expectedHash : '8a3de973354abc6fba621c6797cc0f06', - datasetSize : '1097840000', - compressionType : 'GZIP', - numWorkers : '5', - autoscalingAlgorithm: 'NONE' - ] - ], - [ - name : 'beam_PerformanceTests_ManyFiles_TextIOIT', - description : 'Runs performance tests for TextIOIT with many output files', - test : 'org.apache.beam.sdk.io.text.TextIOIT', - githubTitle : 'Java ManyFilesTextIO Performance Test', - githubTriggerPhrase: 'Run Java ManyFilesTextIO Performance Test', - pipelineOptions : [ - bigQueryDataset : 'beam_performance', - bigQueryTable : 'many_files_textioit_results', - influxMeasurement : 'many_files_textioit_results', - reportGcsPerformanceMetrics: 'true', - gcsPerformanceMetrics : 'true', - numberOfRecords : '25000000', - expectedHash : 'f8453256ccf861e8a312c125dfe0e436', - datasetSize : '1062290000', - numberOfShards : '1000', - numWorkers : '5', - autoscalingAlgorithm : 'NONE' - ] - - ], - [ - name : 'beam_PerformanceTests_AvroIOIT', - description : 'Runs performance tests for AvroIOIT', - test : 'org.apache.beam.sdk.io.avro.AvroIOIT', - githubTitle : 'Java AvroIO Performance Test', - githubTriggerPhrase: 'Run Java AvroIO Performance Test', - pipelineOptions : [ - numberOfRecords : '225000000', - expectedHash : '2f9f5ca33ea464b25109c0297eb6aecb', - datasetSize : '1089730000', - bigQueryDataset : 'beam_performance', - bigQueryTable : 'avroioit_results', - influxMeasurement : 'avroioit_results', - numWorkers : '5', - autoscalingAlgorithm: 'NONE' - ] - ], - [ - name : 'beam_PerformanceTests_TFRecordIOIT', - description : 'Runs performance tests for beam_PerformanceTests_TFRecordIOIT', - test : 'org.apache.beam.sdk.io.tfrecord.TFRecordIOIT', - githubTitle : 'Java TFRecordIO Performance Test', - githubTriggerPhrase: 'Run Java TFRecordIO Performance Test', - pipelineOptions : [ - bigQueryDataset : 'beam_performance', - bigQueryTable : 'tfrecordioit_results', - influxMeasurement : 'tfrecordioit_results', - numberOfRecords : '18000000', - expectedHash : '543104423f8b6eb097acb9f111c19fe4', - datasetSize : '1019380000', - numWorkers : '5', - autoscalingAlgorithm: 'NONE' - ] - ], - [ - name : 'beam_PerformanceTests_XmlIOIT', - description : 'Runs performance tests for beam_PerformanceTests_XmlIOIT', - test : 'org.apache.beam.sdk.io.xml.XmlIOIT', - githubTitle : 'Java XmlIOPerformance Test', - githubTriggerPhrase: 'Run Java 
XmlIO Performance Test', - pipelineOptions : [ - bigQueryDataset : 'beam_performance', - bigQueryTable : 'xmlioit_results', - influxMeasurement : 'xmlioit_results', - numberOfRecords : '12000000', - expectedHash : 'b3b717e7df8f4878301b20f314512fb3', - datasetSize : '1076590000', - charset : 'UTF-8', - numWorkers : '5', - autoscalingAlgorithm: 'NONE' - ] - ], - [ - name : 'beam_PerformanceTests_ParquetIOIT', - description : 'Runs performance tests for beam_PerformanceTests_ParquetIOIT', - test : 'org.apache.beam.sdk.io.parquet.ParquetIOIT', - githubTitle : 'Java ParquetIOPerformance Test', - githubTriggerPhrase: 'Run Java ParquetIO Performance Test', - pipelineOptions : [ - bigQueryDataset : 'beam_performance', - bigQueryTable : 'parquetioit_results', - influxMeasurement : 'parquetioit_results', - numberOfRecords : '225000000', - expectedHash : '2f9f5ca33ea464b25109c0297eb6aecb', - datasetSize : '1087370000', - numWorkers : '5', - autoscalingAlgorithm: 'NONE' - ] - ], - [ - name : 'beam_PerformanceTests_TextIOIT_HDFS', - description : 'Runs performance tests for TextIOIT on HDFS', - test : 'org.apache.beam.sdk.io.text.TextIOIT', - githubTitle : 'Java TextIO Performance Test on HDFS', - githubTriggerPhrase: 'Run Java TextIO Performance Test HDFS', - pipelineOptions : [ - bigQueryDataset : 'beam_performance', - bigQueryTable : 'textioit_hdfs_results', - influxMeasurement : 'textioit_hdfs_results', - numberOfRecords : '25000000', - expectedHash : 'f8453256ccf861e8a312c125dfe0e436', - datasetSize : '1062290000', - numWorkers : '5', - autoscalingAlgorithm: 'NONE' - ] - - ], - [ - name : 'beam_PerformanceTests_Compressed_TextIOIT_HDFS', - description : 'Runs performance tests for TextIOIT with GZIP compression on HDFS', - test : 'org.apache.beam.sdk.io.text.TextIOIT', - githubTitle : 'Java CompressedTextIO Performance Test on HDFS', - githubTriggerPhrase: 'Run Java CompressedTextIO Performance Test HDFS', - pipelineOptions : [ - bigQueryDataset : 'beam_performance', - bigQueryTable : 'compressed_textioit_hdfs_results', - influxMeasurement : 'compressed_textioit_hdfs_results', - numberOfRecords : '450000000', - expectedHash : '8a3de973354abc6fba621c6797cc0f06', - datasetSize : '1097840000', - compressionType : 'GZIP', - numWorkers : '5', - autoscalingAlgorithm: 'NONE' - ] - ], - [ - name : 'beam_PerformanceTests_ManyFiles_TextIOIT_HDFS', - description : 'Runs performance tests for TextIOIT with many output files on HDFS', - test : 'org.apache.beam.sdk.io.text.TextIOIT', - githubTitle : 'Java ManyFilesTextIO Performance Test on HDFS', - githubTriggerPhrase: 'Run Java ManyFilesTextIO Performance Test HDFS', - pipelineOptions : [ - bigQueryDataset : 'beam_performance', - bigQueryTable : 'many_files_textioit_hdfs_results', - influxMeasurement : 'many_files_textioit_hdfs_results', - reportGcsPerformanceMetrics: 'true', - gcsPerformanceMetrics : 'true', - numberOfRecords : '25000000', - expectedHash : 'f8453256ccf861e8a312c125dfe0e436', - datasetSize : '1062290000', - numberOfShards : '1000', - numWorkers : '5', - autoscalingAlgorithm : 'NONE' - ] - - ], - [ - name : 'beam_PerformanceTests_AvroIOIT_HDFS', - description : 'Runs performance tests for AvroIOIT on HDFS', - test : 'org.apache.beam.sdk.io.avro.AvroIOIT', - githubTitle : 'Java AvroIO Performance Test on HDFS', - githubTriggerPhrase: 'Run Java AvroIO Performance Test HDFS', - pipelineOptions : [ - bigQueryDataset : 'beam_performance', - bigQueryTable : 'avroioit_hdfs_results', - influxMeasurement : 'avroioit_hdfs_results', - numberOfRecords : 
'225000000', - expectedHash : '2f9f5ca33ea464b25109c0297eb6aecb', - datasetSize : '1089730000', - numWorkers : '5', - autoscalingAlgorithm: 'NONE' - ] - ], - [ - name : 'beam_PerformanceTests_TFRecordIOIT_HDFS', - description : 'Runs performance tests for beam_PerformanceTests_TFRecordIOIT on HDFS', - test : 'org.apache.beam.sdk.io.tfrecord.TFRecordIOIT', - githubTitle : 'Java TFRecordIO Performance Test on HDFS', - githubTriggerPhrase: 'Run Java TFRecordIO Performance Test HDFS', - pipelineOptions : [ - numberOfRecords : '18000000', - expectedHash : '543104423f8b6eb097acb9f111c19fe4', - datasetSize : '1019380000', - numWorkers : '5', - autoscalingAlgorithm: 'NONE' - ] - ], - [ - name : 'beam_PerformanceTests_XmlIOIT_HDFS', - description : 'Runs performance tests for beam_PerformanceTests_XmlIOIT on HDFS', - test : 'org.apache.beam.sdk.io.xml.XmlIOIT', - githubTitle : 'Java XmlIOPerformance Test on HDFS', - githubTriggerPhrase: 'Run Java XmlIO Performance Test HDFS', - pipelineOptions : [ - bigQueryDataset : 'beam_performance', - bigQueryTable : 'xmlioit_hdfs_results', - influxMeasurement : 'xmlioit_hdfs_results', - numberOfRecords : '12000000', - expectedHash : 'b3b717e7df8f4878301b20f314512fb3', - datasetSize : '1076590000', - charset : 'UTF-8', - numWorkers : '5', - autoscalingAlgorithm: 'NONE' - ] - ], - [ - name : 'beam_PerformanceTests_ParquetIOIT_HDFS', - description : 'Runs performance tests for beam_PerformanceTests_ParquetIOIT on HDFS', - test : 'org.apache.beam.sdk.io.parquet.ParquetIOIT', - githubTitle : 'Java ParquetIOPerformance Test on HDFS', - githubTriggerPhrase: 'Run Java ParquetIO Performance Test HDFS', - pipelineOptions : [ - bigQueryDataset : 'beam_performance', - bigQueryTable : 'parquetioit_hdfs_results', - influxMeasurement : 'parquetioit_hdfs_results', - numberOfRecords : '225000000', - expectedHash : '2f9f5ca33ea464b25109c0297eb6aecb', - datasetSize : '1087370000', - numWorkers : '5', - autoscalingAlgorithm: 'NONE' - ] - ] -] - -jobs.findAll { - it.name in [ - 'beam_PerformanceTests_TextIOIT', - 'beam_PerformanceTests_Compressed_TextIOIT', - 'beam_PerformanceTests_ManyFiles_TextIOIT', - 'beam_PerformanceTests_AvroIOIT', - 'beam_PerformanceTests_TFRecordIOIT', - 'beam_PerformanceTests_XmlIOIT', - 'beam_PerformanceTests_ParquetIOIT' - ] -}.forEach { testJob -> createGCSFileBasedIOITTestJob(testJob) } - -private void createGCSFileBasedIOITTestJob(testJob) { - job(testJob.name) { - description(testJob.description) - common.setTopLevelMainJobProperties(delegate) - common.enablePhraseTriggeringFromPullRequest(delegate, testJob.githubTitle, testJob.githubTriggerPhrase) - common.setAutoJob(delegate, 'H H/12 * * *') - InfluxDBCredentialsHelper.useCredentials(delegate) - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - testJob.pipelineOptions.putAll(additionalPipelineArgs) - - def dataflowSpecificOptions = [ - runner : 'DataflowRunner', - project : 'apache-beam-testing', - tempRoot : 'gs://temp-storage-for-perf-tests', - filenamePrefix: "gs://temp-storage-for-perf-tests/${testJob.name}/\${BUILD_ID}/", - ] - - Map allPipelineOptions = dataflowSpecificOptions << testJob.pipelineOptions - String runner = "dataflow" - String filesystem = "gcs" - String testTask = ":sdks:java:io:file-based-io-tests:integrationTest" - - steps { - gradle { - rootBuildScriptDir(common.checkoutDir) - common.setGradleSwitches(delegate) - switches("--info") - 
switches("-DintegrationTestPipelineOptions=\'${common.joinPipelineOptions(allPipelineOptions)}\'") - switches("-Dfilesystem=\'${filesystem}\'") - switches("-DintegrationTestRunner=\'${runner}\'") - tasks("${testTask} --tests ${testJob.test}") - } - } - } -} - -jobs.findAll { - it.name in [ - 'beam_PerformanceTests_TextIOIT_HDFS', - 'beam_PerformanceTests_Compressed_TextIOIT_HDFS', - 'beam_PerformanceTests_ManyFiles_TextIOIT_HDFS', - // TODO(https://github.com/apache/beam/issues/18796) TFRecord performance test is failing only when running on hdfs. - // We need to fix this before enabling this job on jenkins. - //'beam_PerformanceTests_TFRecordIOIT_HDFS', - 'beam_PerformanceTests_AvroIOIT_HDFS', - 'beam_PerformanceTests_XmlIOIT_HDFS', - 'beam_PerformanceTests_ParquetIOIT_HDFS' - ] -}.forEach { testJob -> createHDFSFileBasedIOITTestJob(testJob) } - -private void createHDFSFileBasedIOITTestJob(testJob) { - job(testJob.name) { - description(testJob.description) - common.setTopLevelMainJobProperties(delegate) - common.enablePhraseTriggeringFromPullRequest(delegate, testJob.githubTitle, testJob.githubTriggerPhrase) - common.setAutoJob(delegate, 'H H/12 * * *') - InfluxDBCredentialsHelper.useCredentials(delegate) - additionalPipelineArgs = [ - influxDatabase: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - testJob.pipelineOptions.putAll(additionalPipelineArgs) - - String namespace = common.getKubernetesNamespace(testJob.name) - String kubeconfig = common.getKubeconfigLocationForNamespace(namespace) - Kubernetes k8s = Kubernetes.create(delegate, kubeconfig, namespace) - - k8s.apply(common.makePathAbsolute("src/.test-infra/kubernetes/hadoop/LargeITCluster/hdfs-multi-datanode-cluster.yml")) - String hostName = "LOAD_BALANCER_IP" - k8s.loadBalancerIP("hadoop", hostName) - - Map additionalOptions = [ - runner : 'DataflowRunner', - project : 'apache-beam-testing', - tempRoot : 'gs://temp-storage-for-perf-tests', - hdfsConfiguration: /[{\\\"fs.defaultFS\\\":\\\"hdfs:$${hostName}:9000\\\",\\\"dfs.replication\\\":1}]/, - filenamePrefix : "hdfs://\$${hostName}:9000/TEXTIO_IT_" - ] - - Map allPipelineOptions = testJob.pipelineOptions << additionalOptions - String runner = "dataflow" - String filesystem = "hdfs" - String testTask = ":sdks:java:io:file-based-io-tests:integrationTest" - - steps { - gradle { - rootBuildScriptDir(common.checkoutDir) - common.setGradleSwitches(delegate) - switches("--info") - switches("-DintegrationTestPipelineOptions=\'${common.joinPipelineOptions(allPipelineOptions)}\'") - switches("-Dfilesystem=\'${filesystem}\'") - switches("-DintegrationTestRunner=\'${runner}\'") - tasks("${testTask} --tests ${testJob.test}") - } - } - } -} diff --git a/.test-infra/jenkins/job_PerformanceTests_FileBasedIO_Python.groovy b/.test-infra/jenkins/job_PerformanceTests_FileBasedIO_Python.groovy deleted file mode 100644 index 21fef23138950..0000000000000 --- a/.test-infra/jenkins/job_PerformanceTests_FileBasedIO_Python.groovy +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as common -import LoadTestsBuilder as loadTestsBuilder -import InfluxDBCredentialsHelper - -def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -def jobs = [ - [ - name : 'beam_PerformanceTests_TextIOIT_Python', - description : 'Runs performance tests for Python TextIOIT', - test : 'apache_beam.io.filebasedio_perf_test', - githubTitle : 'Python TextIO Performance Test', - githubTriggerPhrase: 'Run Python TextIO Performance Test', - pipelineOptions : [ - publish_to_big_query : true, - metrics_dataset : 'beam_performance', - metrics_table : 'python_textio_1GB_results', - influx_measurement : 'python_textio_1GB_results', - test_class : 'TextIOPerfTest', - input_options : '\'{' + - '"num_records": 25000000,' + - '"key_size": 9,' + - '"value_size": 21,' + - '"algorithm": "lcg"}\'', - dataset_size : '1050000000', - num_workers : '5', - autoscaling_algorithm: 'NONE' - ] - ] -] - -jobs.findAll { - it.name in [ - 'beam_PerformanceTests_TextIOIT_Python', - ] -}.forEach { testJob -> createGCSFileBasedIOITTestJob(testJob) } - -private void createGCSFileBasedIOITTestJob(testJob) { - job(testJob.name) { - description(testJob.description) - common.setTopLevelMainJobProperties(delegate) - common.enablePhraseTriggeringFromPullRequest(delegate, testJob.githubTitle, testJob.githubTriggerPhrase) - common.setAutoJob(delegate, 'H H * * *') - InfluxDBCredentialsHelper.useCredentials(delegate) - additionalPipelineArgs = [ - influx_db_name: InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname: InfluxDBCredentialsHelper.InfluxDBHostUrl, - ] - testJob.pipelineOptions.putAll(additionalPipelineArgs) - - def dataflowSpecificOptions = [ - runner : 'DataflowRunner', - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/', - filename_prefix : "gs://temp-storage-for-perf-tests/${testJob.name}/\${BUILD_ID}/", - ] - - Map allPipelineOptions = dataflowSpecificOptions << testJob.pipelineOptions - - loadTestsBuilder.loadTest( - delegate, testJob.name, CommonTestProperties.Runner.DATAFLOW, CommonTestProperties.SDK.PYTHON, allPipelineOptions, testJob.test) - } -} diff --git a/.test-infra/jenkins/job_PerformanceTests_HadoopFormat.groovy b/.test-infra/jenkins/job_PerformanceTests_HadoopFormat.groovy deleted file mode 100644 index 3c267e4a2da77..0000000000000 --- a/.test-infra/jenkins/job_PerformanceTests_HadoopFormat.groovy +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as common -import Kubernetes -import InfluxDBCredentialsHelper - -String jobName = "beam_PerformanceTests_HadoopFormat" - -job(jobName) { - common.setTopLevelMainJobProperties(delegate) - common.setAutoJob(delegate, 'H H/12 * * *') - common.enablePhraseTriggeringFromPullRequest( - delegate, - 'Java HadoopFormatIO Performance Test', - 'Run Java HadoopFormatIO Performance Test') - InfluxDBCredentialsHelper.useCredentials(delegate) - - String namespace = common.getKubernetesNamespace(jobName) - String kubeconfig = common.getKubeconfigLocationForNamespace(namespace) - Kubernetes k8s = Kubernetes.create(delegate, kubeconfig, namespace) - - k8s.apply(common.makePathAbsolute("src/.test-infra/kubernetes/postgres/postgres-service-for-local-dev.yml")) - String postgresHostName = "LOAD_BALANCER_IP" - k8s.loadBalancerIP("postgres-for-dev", postgresHostName) - - Map pipelineOptions = [ - tempRoot : 'gs://temp-storage-for-perf-tests', - project : 'apache-beam-testing', - runner : 'DataflowRunner', - numberOfRecords : '5000000', - bigQueryDataset : 'beam_performance', - bigQueryTable : 'hadoopformatioit_results', - influxMeasurement : 'hadoopformatioit_results', - influxDatabase : InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost : InfluxDBCredentialsHelper.InfluxDBHostUrl, - postgresUsername : 'postgres', - postgresPassword : 'uuinkks', - postgresDatabaseName : 'postgres', - postgresServerName : "\$${postgresHostName}", - postgresSsl : false, - postgresPort : '5432', - numWorkers : '5', - autoscalingAlgorithm : 'NONE' - ] - - steps { - gradle { - rootBuildScriptDir(common.checkoutDir) - common.setGradleSwitches(delegate) - switches("--info") - switches("-DintegrationTestPipelineOptions=\'${common.joinPipelineOptions(pipelineOptions)}\'") - switches("-DintegrationTestRunner=dataflow") - tasks(":sdks:java:io:hadoop-format:integrationTest --tests org.apache.beam.sdk.io.hadoop.format.HadoopFormatIOIT") - } - } -} - diff --git a/.test-infra/jenkins/job_PerformanceTests_JDBC.groovy b/.test-infra/jenkins/job_PerformanceTests_JDBC.groovy deleted file mode 100644 index d4885ae851056..0000000000000 --- a/.test-infra/jenkins/job_PerformanceTests_JDBC.groovy +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
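The KafkaIO job further down allocates NodePorts in a loop where each availablePort search starts from the previously assigned port, avoiding collisions between the three outside services. A sketch of how the loop's arguments evaluate (availablePort semantics paraphrased from the comment in that file):

    // k8s.availablePort(lower, upper, envVar): find a free port in [lower, upper]
    // and export it as envVar. Chained lower bounds, as computed by the loop:
    def calls = []
    def configuredPorts = ["32400", "32401", "32402"]
    (0..2).each { service ->
      def lower = service == 0 ? configuredPorts[service] : "\$KAFKA_SERVICE_PORT_${service - 1}"
      calls << [lower.toString(), "32767", "KAFKA_SERVICE_PORT_$service".toString()]
    }
    assert calls[1] == ['$KAFKA_SERVICE_PORT_0', '32767', 'KAFKA_SERVICE_PORT_1']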
- */ - -import CommonJobProperties as common -import Kubernetes -import InfluxDBCredentialsHelper - -String jobName = "beam_PerformanceTests_JDBC" - -job(jobName) { - common.setTopLevelMainJobProperties(delegate) - common.setAutoJob(delegate, 'H H/12 * * *') - common.enablePhraseTriggeringFromPullRequest( - delegate, - 'Java JdbcIO Performance Test', - 'Run Java JdbcIO Performance Test') - InfluxDBCredentialsHelper.useCredentials(delegate) - - String namespace = common.getKubernetesNamespace(jobName) - String kubeconfig = common.getKubeconfigLocationForNamespace(namespace) - Kubernetes k8s = Kubernetes.create(delegate, kubeconfig, namespace) - - k8s.apply(common.makePathAbsolute("src/.test-infra/kubernetes/postgres/postgres-service-for-local-dev.yml")) - String postgresHostName = "LOAD_BALANCER_IP" - k8s.loadBalancerIP("postgres-for-dev", postgresHostName) - - Map pipelineOptions = [ - tempRoot : 'gs://temp-storage-for-perf-tests', - project : 'apache-beam-testing', - runner : 'DataflowRunner', - numberOfRecords : '5000000', - bigQueryDataset : 'beam_performance', - bigQueryTable : 'jdbcioit_results', - influxMeasurement : 'jdbcioit_results', - influxDatabase : InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost : InfluxDBCredentialsHelper.InfluxDBHostUrl, - postgresUsername : 'postgres', - postgresPassword : 'uuinkks', - postgresDatabaseName : 'postgres', - postgresServerName : "\$${postgresHostName}", - postgresSsl : false, - postgresPort : '5432', - autoscalingAlgorithm : 'NONE', - numWorkers : '5' - ] - - steps { - gradle { - rootBuildScriptDir(common.checkoutDir) - common.setGradleSwitches(delegate) - switches("--info") - switches("-DintegrationTestPipelineOptions=\'${common.joinPipelineOptions(pipelineOptions)}\'") - switches("-DintegrationTestRunner=dataflow") - tasks(":sdks:java:io:jdbc:integrationTest --tests org.apache.beam.sdk.io.jdbc.JdbcIOIT") - } - } -} - diff --git a/.test-infra/jenkins/job_PerformanceTests_KafkaIO_IT.groovy b/.test-infra/jenkins/job_PerformanceTests_KafkaIO_IT.groovy deleted file mode 100644 index d513dd96a7e20..0000000000000 --- a/.test-infra/jenkins/job_PerformanceTests_KafkaIO_IT.groovy +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as common -import Kubernetes -import InfluxDBCredentialsHelper - -String jobName = "beam_PerformanceTests_Kafka_IO" -String HIGH_RANGE_PORT = "32767" - -/** - * This job runs the Kafka IO performance tests. - It runs on a Kafka cluster that is built by applying the folder .test-infra/kubernetes/kafka-cluster, - in an existing Kubernetes cluster (DEFAULT_CLUSTER in Kubernetes.groovy).
- The services created to run this test are: - Pods: 3 kafka pods, 3 zookeeper pods, 1 kafka-config pod which runs a job that creates topics. - Services: 1 bootstrap, 1 broker, 3 outside, 1 zookeeper - Job: job.batch/kafka-config-eff079ec - When the performance tests finish, all resources are cleaned up by a postBuild step in Kubernetes.groovy - */ -job(jobName) { - common.setTopLevelMainJobProperties(delegate, 'master', 120) - common.setAutoJob(delegate, 'H H/12 * * *') - common.enablePhraseTriggeringFromPullRequest( - delegate, - 'Java KafkaIO Performance Test', - 'Run Java KafkaIO Performance Test') - InfluxDBCredentialsHelper.useCredentials(delegate) - - String namespace = common.getKubernetesNamespace(jobName) - String kubeconfig = common.getKubeconfigLocationForNamespace(namespace) - Kubernetes k8s = Kubernetes.create(delegate, kubeconfig, namespace) - - String kafkaDir = common.makePathAbsolute("src/.test-infra/kubernetes/kafka-cluster") - String kafkaTopicJob = "job.batch/kafka-config-eff079ec" - - /** - * Specifies steps to avoid port collisions when the Kafka outside services (1,2,3) are created. - Function k8s.availablePort finds unused ports in the Kubernetes cluster in a range from 32400 - to 32767 by querying used ports; those ports are stored in env vars like KAFKA_SERVICE_PORT_${service}, - which are used to replace the default ports in the outside-${service}.yml files before the apply command. - */ - steps { - String[] configuredPorts = ["32400", "32401", "32402"] - (0..2).each { service -> - k8s.availablePort(service == 0 ? configuredPorts[service] : "\$KAFKA_SERVICE_PORT_${service-1}", - HIGH_RANGE_PORT, "KAFKA_SERVICE_PORT_$service") - shell("sed -i -e s/${configuredPorts[service]}/\$KAFKA_SERVICE_PORT_$service/ \ - ${kafkaDir}/04-outside-services/outside-${service}.yml") - } - } - k8s.apply(kafkaDir) - (0..2).each { k8s.loadBalancerIP("outside-$it", "KAFKA_BROKER_$it") } - k8s.waitForJob(kafkaTopicJob,"40m") - - Map pipelineOptions = [ - tempRoot : 'gs://temp-storage-for-perf-tests', - project : 'apache-beam-testing', - runner : 'DataflowRunner', - sourceOptions : """ - { - "numRecords": "100000000", - "keySizeBytes": "10", - "valueSizeBytes": "90" - } - """.trim().replaceAll("\\s", ""), - bigQueryDataset : 'beam_performance', - bigQueryTable : 'kafkaioit_results', - influxMeasurement : 'kafkaioit_results', - influxDatabase : InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost : InfluxDBCredentialsHelper.InfluxDBHostUrl, - kafkaBootstrapServerAddresses: "\$KAFKA_BROKER_0:\$KAFKA_SERVICE_PORT_0,\$KAFKA_BROKER_1:\$KAFKA_SERVICE_PORT_1," + - "\$KAFKA_BROKER_2:\$KAFKA_SERVICE_PORT_2", // KAFKA_BROKER_ holds the IP and KAFKA_SERVICE_PORT_ the port of the outside services - kafkaTopic : 'beam-batch', - readTimeout : '1800', - numWorkers : '5', - autoscalingAlgorithm : 'NONE' - ] - - // We are using a smaller number of records for the streaming test since streaming read is much slower - // than batch read.
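// Editor's aside (hedged sketch, not part of the original job): the
// dataflowRunnerV2SdfPipelineOptions map below is built with Groovy map
// addition, where the right-hand map's entries override matching keys and
// every other entry is inherited from pipelineOptions. The names here are
// illustrative only:
def sketchBase = [kafkaTopic: 'beam-batch', readTimeout: '1800']
def sketchV2 = sketchBase + [kafkaTopic: 'beam-sdf', readTimeout: '1500']
assert sketchV2.kafkaTopic == 'beam-sdf'      // overridden by the right-hand map
assert sketchBase.kafkaTopic == 'beam-batch'  // the original map is untouched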
- Map dataflowRunnerV2SdfPipelineOptions = pipelineOptions + [ - sourceOptions : """ - { - "numRecords": "100000000", - "keySizeBytes": "10", - "valueSizeBytes": "90" - } - """.trim().replaceAll("\\s", ""), - kafkaTopic : 'beam-sdf', - readTimeout : '1500', - bigQueryTable : 'kafkaioit_results_runner_v2', - influxMeasurement : 'kafkaioit_results_runner_v2', - experiments : 'use_runner_v2,use_unified_worker', - ] - - steps { - gradle { - rootBuildScriptDir(common.checkoutDir) - common.setGradleSwitches(delegate) - switches("--info") - switches("-DintegrationTestPipelineOptions=\'${common.joinOptionsWithNestedJsonValues(dataflowRunnerV2SdfPipelineOptions)}\'") - switches("-DintegrationTestRunner=dataflow") - tasks(":sdks:java:io:kafka:integrationTest --tests org.apache.beam.sdk.io.kafka.KafkaIOIT.testKafkaIOReadsAndWritesCorrectlyInStreaming") - } - gradle { - rootBuildScriptDir(common.checkoutDir) - common.setGradleSwitches(delegate) - switches("--info") - switches("-DintegrationTestPipelineOptions=\'${common.joinOptionsWithNestedJsonValues(pipelineOptions)}\'") - switches("-DintegrationTestRunner=dataflow") - tasks(":sdks:java:io:kafka:integrationTest --tests org.apache.beam.sdk.io.kafka.KafkaIOIT.testKafkaIOReadsAndWritesCorrectlyInBatch") - } - } -} diff --git a/.test-infra/jenkins/job_PerformanceTests_MongoDBIO_IT.groovy b/.test-infra/jenkins/job_PerformanceTests_MongoDBIO_IT.groovy deleted file mode 100644 index 73ef6c4a53ea6..0000000000000 --- a/.test-infra/jenkins/job_PerformanceTests_MongoDBIO_IT.groovy +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import CommonJobProperties as common -import Kubernetes -import InfluxDBCredentialsHelper - -String jobName = "beam_PerformanceTests_MongoDBIO_IT" - -job(jobName) { - common.setTopLevelMainJobProperties(delegate) - common.setAutoJob(delegate,'H H/12 * * *') - common.enablePhraseTriggeringFromPullRequest( - delegate, - 'Java MongoDBIO Performance Test', - 'Run Java MongoDBIO Performance Test') - InfluxDBCredentialsHelper.useCredentials(delegate) - - String namespace = common.getKubernetesNamespace(jobName) - String kubeconfigPath = common.getKubeconfigLocationForNamespace(namespace) - Kubernetes k8s = Kubernetes.create(delegate, kubeconfigPath, namespace) - - k8s.apply(common.makePathAbsolute("src/.test-infra/kubernetes/mongodb/load-balancer/mongo.yml")) - String mongoHostName = "LOAD_BALANCER_IP" - k8s.loadBalancerIP("mongo-load-balancer-service", mongoHostName) - - Map pipelineOptions = [ - tempRoot : 'gs://temp-storage-for-perf-tests', - project : 'apache-beam-testing', - numberOfRecords : '10000000', - bigQueryDataset : 'beam_performance', - bigQueryTable : 'mongodbioit_results', - influxMeasurement : 'mongodbioit_results', - influxDatabase : InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost : InfluxDBCredentialsHelper.InfluxDBHostUrl, - mongoDBDatabaseName : 'beam', - mongoDBHostName : "\$${mongoHostName}", - mongoDBPort : 27017, - mongoDBUsername : 'root', - mongoDBPassword : 'uuinkkS', - runner : 'DataflowRunner', - autoscalingAlgorithm: 'NONE', - numWorkers : '5' - ] - - steps { - gradle { - rootBuildScriptDir(common.checkoutDir) - common.setGradleSwitches(delegate) - switches("--info") - switches("-DintegrationTestPipelineOptions=\'${common.joinPipelineOptions(pipelineOptions)}\'") - switches("-DintegrationTestRunner=dataflow") - tasks(":sdks:java:io:mongodb:integrationTest --tests org.apache.beam.sdk.io.mongodb.MongoDBIOIT") - } - } -} diff --git a/.test-infra/jenkins/job_PerformanceTests_PubsubIO_Python.groovy b/.test-infra/jenkins/job_PerformanceTests_PubsubIO_Python.groovy deleted file mode 100644 index 8ec84dc9b5e4e..0000000000000 --- a/.test-infra/jenkins/job_PerformanceTests_PubsubIO_Python.groovy +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import CommonJobProperties as commonJobProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder - -import static java.util.UUID.randomUUID - -def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -def psio_test = [ - title : 'PubsubIO Write Performance Test Python 2GB', - test : 'apache_beam.io.gcp.pubsub_io_perf_test', - runner : CommonTestProperties.Runner.TEST_DATAFLOW, - pipelineOptions: [ - job_name : 'performance-tests-psio-python-2gb' + now, - project : 'apache-beam-testing', - region : 'us-central1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - publish_to_big_query : true, - metrics_dataset : 'beam_performance', - metrics_table : 'psio_io_2GB_results', - influx_measurement : 'python_psio_2GB_results', - influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl, - input_options : '\'{' + - '"num_records": 2097152,' + - '"key_size": 1,' + - '"value_size": 1024,' + - '"algorithm": "lcg"}\'', - num_workers : 5, - autoscaling_algorithm : 'NONE', // Disable autoscaling of the worker pool. - pubsub_namespace_prefix : 'pubsub_io_performance_', - wait_until_finish_duration: 1000 * 60 * 12, // in milliseconds - ] -] - -def executeJob = { scope, testConfig -> - commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 240) - - loadTestsBuilder.loadTest(scope, testConfig.title, testConfig.runner, - CommonTestProperties.SDK.PYTHON, testConfig.pipelineOptions, testConfig.test) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_PerformanceTests_PubsubIOIT_Python_Streaming', - 'Run PubsubIO Performance Test Python', - 'PubsubIO Performance Test Python', - this - ) { - executeJob(delegate, psio_test) - } - -CronJobBuilder.cronJob('beam_PerformanceTests_PubsubIOIT_Python_Streaming', 'H H * * *', this) { - executeJob(delegate, psio_test) -} diff --git a/.test-infra/jenkins/job_PerformanceTests_Python.groovy b/.test-infra/jenkins/job_PerformanceTests_Python.groovy deleted file mode 100644 index 04c8fc9995307..0000000000000 --- a/.test-infra/jenkins/job_PerformanceTests_Python.groovy +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import InfluxDBCredentialsHelper - -def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -// Common pipeline args for Dataflow job.
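// Editor's aside (hedged): option maps like dataflowPipelineArgs below are
// eventually flattened into one command-line string, e.g. via
// commonJobProperties.mapToArgString further down; the helper here is a
// hypothetical stand-in that shows the general idiom (the real helper's
// quoting and escaping rules are an assumption, not verified):
static String mapToArgStringSketch(Map<String, Object> opts) {
  opts.collect { k, v -> "--$k=$v" }.join(' ')
}
assert mapToArgStringSketch([project: 'apache-beam-testing', num_workers: 5]) ==
    '--project=apache-beam-testing --num_workers=5'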
-def dataflowPipelineArgs = [ - project : 'apache-beam-testing', - region : 'us-central1', - staging_location: 'gs://temp-storage-for-end-to-end-tests/staging-it', - temp_location : 'gs://temp-storage-for-end-to-end-tests/temp-it', -] - -testConfigurations = [] -pythonVersions = ['38'] - -for (pythonVersion in pythonVersions) { - testConfigurations.add([ - jobName : "beam_PerformanceTests_WordCountIT_Py${pythonVersion}", - jobDescription : "Python SDK Performance Test - Run WordCountIT in Py${pythonVersion} with 1Gb files", - jobTriggerPhrase : "Run Python${pythonVersion} WordCountIT Performance Test", - test : "apache_beam/examples/wordcount_it_test.py::WordCountIT::test_wordcount_it", - gradleTaskName : ":sdks:python:test-suites:dataflow:py${pythonVersion}:runPerformanceTest", - pipelineOptions : dataflowPipelineArgs + [ - job_name : "performance-tests-wordcount-python${pythonVersion}-batch-1gb${now}", - runner : 'TestDataflowRunner', - publish_to_big_query : true, - metrics_dataset : 'beam_performance', - metrics_table : "wordcount_py${pythonVersion}_pkb_results", - influx_measurement : "wordcount_py${pythonVersion}_results", - influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl, - input : "gs://apache-beam-samples/input_small_files/ascii_sort_1MB_input.0000*", // 1Gb - output : "gs://temp-storage-for-end-to-end-tests/py-it-cloud/output", - expect_checksum : "ea0ca2e5ee4ea5f218790f28d0b9fe7d09d8d710", - num_workers : '10', - autoscaling_algorithm: "NONE", // Disable autoscaling of the worker pool. - ] - ]) -} - -for (testConfig in testConfigurations) { - createPythonPerformanceTestJob(testConfig) -} - -private void createPythonPerformanceTestJob(Map testConfig) { - // This job runs the Beam Python performance tests - job(testConfig.jobName) { - // Set default Beam job properties. - commonJobProperties.setTopLevelMainJobProperties(delegate) - - InfluxDBCredentialsHelper.useCredentials(delegate) - - // Run job in postcommit; don't trigger on every push. - commonJobProperties.setAutoJob(delegate, 'H H * * *') - - // Allows triggering this build against pull requests. - commonJobProperties.enablePhraseTriggeringFromPullRequest( - delegate, - testConfig.jobDescription, - testConfig.jobTriggerPhrase, - ) - - publishers { - archiveJunit('**/pytest*.xml') - } - - steps { - gradle { - rootBuildScriptDir(commonJobProperties.checkoutDir) - switches("--info") - switches("-Ptest-pipeline-options=\"${commonJobProperties.mapToArgString(testConfig.pipelineOptions)}\"") - switches("-Ptest=${testConfig.test}") - tasks(testConfig.gradleTaskName) - commonJobProperties.setGradleSwitches(delegate) - } - } - } -} diff --git a/.test-infra/jenkins/job_PerformanceTests_SQLIO_Java.groovy b/.test-infra/jenkins/job_PerformanceTests_SQLIO_Java.groovy deleted file mode 100644 index ceded537bb4ce..0000000000000 --- a/.test-infra/jenkins/job_PerformanceTests_SQLIO_Java.groovy +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License.
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -import CommonJobProperties as common - -def jobConfigs = [ - [ - title : 'SQL BigQueryIO with push-down Batch Performance Test Java', - triggerPhrase: 'Run SQLBigQueryIO Batch Performance Test Java', - name : 'beam_PerformanceTests_SQLBigQueryIO_Batch_Java', - previousName : 'beam_SQLBigQueryIO_Batch_Performance_Test_Java/', - itClass : 'org.apache.beam.sdk.extensions.sql.meta.provider.bigquery.BigQueryIOPushDownIT', - properties: [ - project : 'apache-beam-testing', - tempLocation : 'gs://temp-storage-for-perf-tests/loadtests', - tempRoot : 'gs://temp-storage-for-perf-tests/loadtests', - metricsBigQueryDataset: 'beam_performance', - metricsBigQueryTable : 'sql_bqio_read_java_batch', - runner : "DataflowRunner", - maxNumWorkers : '5', - numWorkers : '5', - autoscalingAlgorithm : 'NONE', - ] - ] -] - -jobConfigs.forEach { jobConfig -> createPostCommitJob(jobConfig)} - -private void createPostCommitJob(jobConfig) { - job(jobConfig.name) { - description(jobConfig.description) - common.setTopLevelMainJobProperties(delegate) - common.enablePhraseTriggeringFromPullRequest(delegate, jobConfig.title, jobConfig.triggerPhrase) - common.setAutoJob(delegate, 'H H/12 * * *') - if (jobConfig.containsKey('previousName')) { - previousNames(jobConfig.previousName) - } - publishers { - archiveJunit('**/build/test-results/**/*.xml') - } - - steps { - gradle { - rootBuildScriptDir(common.checkoutDir) - common.setGradleSwitches(delegate) - switches("--info") - switches("-DintegrationTestPipelineOptions=\'${common.joinOptionsWithNestedJsonValues(jobConfig.properties)}\'") - switches("-DintegrationTestRunner=dataflow") - tasks(":sdks:java:extensions:sql:perf-tests:integrationTest --tests ${jobConfig.itClass}") - } - } - } -} diff --git a/.test-infra/jenkins/job_PerformanceTests_SingleStoreIO.groovy b/.test-infra/jenkins/job_PerformanceTests_SingleStoreIO.groovy deleted file mode 100644 index a5d54f03213ff..0000000000000 --- a/.test-infra/jenkins/job_PerformanceTests_SingleStoreIO.groovy +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import CommonJobProperties as common -import Kubernetes -import InfluxDBCredentialsHelper - -String jobName = "beam_PerformanceTests_SingleStoreIO" - -void waitForPodWithLabel(job, Kubernetes k8s, String label) { - job.steps { - shell("${k8s.KUBERNETES_DIR}/singlestore/wait-for-pod-with-label.sh ${label} 600") - } -} - -void waitFor(job, Kubernetes k8s, String resource) { - job.steps { - shell("${k8s.KUBERNETES_DIR}/singlestore/wait-for.sh ${resource} 600") - } -} - -job(jobName) { - common.setTopLevelMainJobProperties(delegate) - common.setAutoJob(delegate,'H H/12 * * *') - common.enablePhraseTriggeringFromPullRequest( - delegate, - 'Java SingleStoreIO Performance Test', - 'Run Java SingleStoreIO Performance Test') - InfluxDBCredentialsHelper.useCredentials(delegate) - - - String namespace = common.getKubernetesNamespace(jobName) - String kubeconfigPath = common.getKubeconfigLocationForNamespace(namespace) - Kubernetes k8s = Kubernetes.create(delegate, kubeconfigPath, namespace) - - k8s.apply(common.makePathAbsolute("src/.test-infra/kubernetes/singlestore/sdb-rbac.yaml")) - k8s.apply(common.makePathAbsolute("src/.test-infra/kubernetes/singlestore/sdb-cluster-crd.yaml")) - k8s.apply(common.makePathAbsolute("src/.test-infra/kubernetes/singlestore/sdb-operator.yaml")) - waitForPodWithLabel(delegate, k8s, "sdb-operator") - - k8s.apply(common.makePathAbsolute("src/.test-infra/kubernetes/singlestore/sdb-cluster.yaml")) - waitFor(delegate, k8s, "memsqlclusters.memsql.com") - - String singlestoreHostName = "LOAD_BALANCER_IP" - k8s.loadBalancerIP("svc-sdb-cluster-ddl", singlestoreHostName) - - Map pipelineOptions = [ - tempRoot : 'gs://temp-storage-for-perf-tests', - project : 'apache-beam-testing', - runner : 'DataflowRunner', - singleStoreServerName : "\$${singlestoreHostName}", - singleStoreUsername : "admin", - singleStorePassword : "secretpass", - singleStorePort: "3306", - numberOfRecords: "5000000", - influxMeasurement : 'singlestoreioit_results', - influxDatabase : InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost : InfluxDBCredentialsHelper.InfluxDBHostUrl - ] - - steps { - gradle { - rootBuildScriptDir(common.checkoutDir) - common.setGradleSwitches(delegate) - switches("--info") - switches("-DintegrationTestPipelineOptions=\'${common.joinPipelineOptions(pipelineOptions)}\'") - switches("-DintegrationTestRunner=dataflow") - tasks(":sdks:java:io:singlestore:integrationTest --tests org.apache.beam.sdk.io.singlestore.SingleStoreIOPerformanceIT") - } - } -} diff --git a/.test-infra/jenkins/job_PerformanceTests_SpannerIO_Python.groovy b/.test-infra/jenkins/job_PerformanceTests_SpannerIO_Python.groovy deleted file mode 100644 index ed7afff43b791..0000000000000 --- a/.test-infra/jenkins/job_PerformanceTests_SpannerIO_Python.groovy +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import LoadTestsBuilder as loadTestsBuilder -import PhraseTriggeringPostCommitBuilder -import InfluxDBCredentialsHelper - -def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) - -def spannerio_read_test_2gb = [ - title : 'SpannerIO Read Performance Test Python 2 GB', - test : 'apache_beam.io.gcp.experimental.spannerio_read_perf_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : 'performance-tests-spanner-read-python-2gb' + now, - project : 'apache-beam-testing', - // Run in us-west1 to colocate with beam-test spanner instance (BEAM-13222) - region : 'us-west1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - spanner_instance : 'beam-test', - spanner_database : 'pyspanner_read_2gb', - publish_to_big_query : true, - metrics_dataset : 'beam_performance', - metrics_table : 'pyspanner_read_2GB_results', - influx_measurement : 'python_spannerio_read_2GB_results', - influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl, - input_options : '\'{' + - '"num_records": 2097152,' + - '"key_size": 1,' + - '"value_size": 1024,' + - '"algorithm": "lcg"}\'', - num_workers : 5, - autoscaling_algorithm: 'NONE', // Disable autoscaling of the worker pool. - ] -] - -def spannerio_write_test_2gb = [ - title : 'SpannerIO Write Performance Test Python Batch 2 GB', - test : 'apache_beam.io.gcp.experimental.spannerio_write_perf_test', - runner : CommonTestProperties.Runner.DATAFLOW, - pipelineOptions: [ - job_name : 'performance-tests-spannerio-write-python-batch-2gb' + now, - project : 'apache-beam-testing', - // Run in us-west1 to colocate with beam-test spanner instance (BEAM-13222) - region : 'us-west1', - temp_location : 'gs://temp-storage-for-perf-tests/loadtests', - spanner_instance : 'beam-test', - spanner_database : 'pyspanner_write_2gb', - publish_to_big_query : true, - metrics_dataset : 'beam_performance', - metrics_table : 'pyspanner_write_2GB_results', - influx_measurement : 'python_spanner_write_2GB_results', - influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl, - input_options : '\'{' + - '"num_records": 2097152,' + - '"key_size": 1,' + - '"value_size": 1024,' + - '"algorithm": "lcg"}\'', - num_workers : 5, - autoscaling_algorithm: 'NONE', // Disable autoscaling of the worker pool.
- ] -] - -def executeJob = { scope, testConfig -> - commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 480) - - loadTestsBuilder.loadTest(scope, testConfig.title, testConfig.runner, CommonTestProperties.SDK.PYTHON, testConfig.pipelineOptions, testConfig.test) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_PerformanceTests_SpannerIO_Read_2GB_Python', - 'Run SpannerIO Read 2GB Performance Test Python', - 'SpannerIO Read 2GB Performance Test Python', - this - ) { - executeJob(delegate, spannerio_read_test_2gb) - } - -CronJobBuilder.cronJob('beam_PerformanceTests_SpannerIO_Read_2GB_Python', 'H H * * *', this) { - executeJob(delegate, spannerio_read_test_2gb) -} - -PhraseTriggeringPostCommitBuilder.postCommitJob( - 'beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch', - 'Run SpannerIO Write 2GB Performance Test Python Batch', - 'SpannerIO Write 2GB Performance Test Python Batch', - this - ) { - executeJob(delegate, spannerio_write_test_2gb) - } - -CronJobBuilder.cronJob('beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch', 'H H * * *', this) { - executeJob(delegate, spannerio_write_test_2gb) -} diff --git a/.test-infra/jenkins/job_PerformanceTests_SparkReceiverIO_IT.groovy b/.test-infra/jenkins/job_PerformanceTests_SparkReceiverIO_IT.groovy deleted file mode 100644 index 6a2328749f058..0000000000000 --- a/.test-infra/jenkins/job_PerformanceTests_SparkReceiverIO_IT.groovy +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as common -import Kubernetes -import InfluxDBCredentialsHelper - -String jobName = "beam_PerformanceTests_SparkReceiver_IO" - -/** - * This job runs the SparkReceiver IO performance tests. - It runs on a RabbitMQ cluster that is built by applying the folder .test-infra/kubernetes/rabbit, - in an existing kubernetes cluster (DEFAULT_CLUSTER in Kubernetes.groovy). - The services created to run this test are: - Pods: 1 RabbitMQ pod.
- Services: 1 broker - When the performance tests finish, all resources are cleaned up by a postBuild step in Kubernetes.groovy - */ -job(jobName) { - common.setTopLevelMainJobProperties(delegate, 'master', 120) - common.setAutoJob(delegate, 'H H/12 * * *') - common.enablePhraseTriggeringFromPullRequest( - delegate, - 'Java SparkReceiverIO Performance Test', - 'Run Java SparkReceiverIO Performance Test') - InfluxDBCredentialsHelper.useCredentials(delegate) - - String namespace = common.getKubernetesNamespace(jobName) - String kubeconfig = common.getKubeconfigLocationForNamespace(namespace) - Kubernetes k8s = Kubernetes.create(delegate, kubeconfig, namespace) - - k8s.apply(common.makePathAbsolute("src/.test-infra/kubernetes/rabbit/rabbitmq.yaml")) - String rabbitMqHostName = "LOAD_BALANCER_IP" - k8s.loadBalancerIP("rabbitmq", rabbitMqHostName) - - Map pipelineOptions = [ - tempRoot : 'gs://temp-storage-for-perf-tests', - project : 'apache-beam-testing', - runner : 'DataflowRunner', - sourceOptions : """ - { - "numRecords": "5000000", - "keySizeBytes": "1", - "valueSizeBytes": "90" - } - """.trim().replaceAll("\\s", ""), - bigQueryDataset : 'beam_performance', - bigQueryTable : 'sparkreceiverioit_results', - influxMeasurement : 'sparkreceiverioit_results', - influxDatabase : InfluxDBCredentialsHelper.InfluxDBDatabaseName, - influxHost : InfluxDBCredentialsHelper.InfluxDBHostUrl, - rabbitMqBootstrapServerAddress: "amqp://guest:guest@\$${rabbitMqHostName}:5672", - streamName : 'rabbitMqTestStream', - readTimeout : '1800', - numWorkers : '1', - autoscalingAlgorithm : 'NONE' - ] - - steps { - gradle { - rootBuildScriptDir(common.checkoutDir) - common.setGradleSwitches(delegate) - switches("--info") - switches("-DintegrationTestPipelineOptions=\'${common.joinOptionsWithNestedJsonValues(pipelineOptions)}\'") - switches("-DintegrationTestRunner=dataflow") - tasks(":sdks:java:io:sparkreceiver:2:integrationTest --tests org.apache.beam.sdk.io.sparkreceiver.SparkReceiverIOIT") - } - } -} diff --git a/.test-infra/jenkins/job_PostCommit_Java_Examples_Dataflow_V2_Java11.groovy b/.test-infra/jenkins/job_PostCommit_Java_Examples_Dataflow_V2_Java11.groovy index 6687ae0e6f8a2..6229f7c48a72d 100644 --- a/.test-infra/jenkins/job_PostCommit_Java_Examples_Dataflow_V2_Java11.groovy +++ b/.test-infra/jenkins/job_PostCommit_Java_Examples_Dataflow_V2_Java11.groovy @@ -43,7 +43,7 @@ PostcommitJobBuilder.postCommitJob('beam_PostCommit_Java_Examples_Dataflow_V2_ja commonJobProperties.setGradleSwitches(delegate, 3 * Runtime.runtime.availableProcessors()) switches '-PdisableSpotlessCheck=true' switches '-PdisableCheckStyle=true' - switches '-PcompileAndRunTestsWithJava11' + switches '-PtestJavaVersion=11' switches '-PskipCheckerFramework' switches "-Pjava11Home=${commonJobProperties.JAVA_11_HOME}" } diff --git a/.test-infra/jenkins/job_PostCommit_Java_Examples_Dataflow_V2_Java17.groovy b/.test-infra/jenkins/job_PostCommit_Java_Examples_Dataflow_V2_Java17.groovy index b275fe9276d95..7e52a7e097892 100644 --- a/.test-infra/jenkins/job_PostCommit_Java_Examples_Dataflow_V2_Java17.groovy +++ b/.test-infra/jenkins/job_PostCommit_Java_Examples_Dataflow_V2_Java17.groovy @@ -43,7 +43,7 @@ PostcommitJobBuilder.postCommitJob('beam_PostCommit_Java_Examples_Dataflow_V2_ja commonJobProperties.setGradleSwitches(delegate, 3 * Runtime.runtime.availableProcessors()) switches '-PdisableSpotlessCheck=true' switches '-PdisableCheckStyle=true' - switches '-PcompileAndRunTestsWithJava17' + switches '-PtestJavaVersion=17' switches
'-PskipCheckerFramework' switches "-Pjava17Home=${commonJobProperties.JAVA_17_HOME}" } diff --git a/.test-infra/jenkins/job_PostCommit_Java_Jpms_Dataflow_Java17.groovy b/.test-infra/jenkins/job_PostCommit_Java_Jpms_Dataflow_Java17.groovy index 4e26c164319eb..f518985ca7a8c 100644 --- a/.test-infra/jenkins/job_PostCommit_Java_Jpms_Dataflow_Java17.groovy +++ b/.test-infra/jenkins/job_PostCommit_Java_Jpms_Dataflow_Java17.groovy @@ -42,7 +42,7 @@ PostcommitJobBuilder.postCommitJob('beam_PostCommit_Java_Jpms_Dataflow_Java17', tasks(':sdks:java:testing:jpms-tests:dataflowRunnerIntegrationTest') commonJobProperties.setGradleSwitches(delegate) switches("-PskipCheckerFramework") - switches("-PcompileAndRunTestsWithJava17") + switches("-PtestJavaVersion=17") switches("-Pjava17Home=${commonJobProperties.JAVA_17_HOME}") // Specify maven home on Jenkins, needed by Maven archetype integration tests. switches('-Pmaven_home=/home/jenkins/tools/maven/apache-maven-3.5.4') diff --git a/.test-infra/jenkins/job_PostCommit_Java_Jpms_Direct_Java17.groovy b/.test-infra/jenkins/job_PostCommit_Java_Jpms_Direct_Java17.groovy index f31373ecaadac..04c31389ecbe2 100644 --- a/.test-infra/jenkins/job_PostCommit_Java_Jpms_Direct_Java17.groovy +++ b/.test-infra/jenkins/job_PostCommit_Java_Jpms_Direct_Java17.groovy @@ -42,7 +42,7 @@ PostcommitJobBuilder.postCommitJob('beam_PostCommit_Java_Jpms_Direct_Java17', 'R tasks(':sdks:java:testing:jpms-tests:directRunnerIntegrationTest') commonJobProperties.setGradleSwitches(delegate) switches("-PskipCheckerFramework") - switches("-PcompileAndRunTestsWithJava17") + switches("-PtestJavaVersion=17") switches("-Pjava17Home=${commonJobProperties.JAVA_17_HOME}") // Specify maven home on Jenkins, needed by Maven archetype integration tests. switches('-Pmaven_home=/home/jenkins/tools/maven/apache-maven-3.5.4') diff --git a/.test-infra/jenkins/job_PostCommit_TransformService_Direct.groovy b/.test-infra/jenkins/job_PostCommit_TransformService_Direct.groovy index 0d7f58e717064..03d29069a52c8 100644 --- a/.test-infra/jenkins/job_PostCommit_TransformService_Direct.groovy +++ b/.test-infra/jenkins/job_PostCommit_TransformService_Direct.groovy @@ -43,7 +43,7 @@ PostcommitJobBuilder.postCommitJob('beam_PostCommit_TransformService_Direct', rootBuildScriptDir(commonJobProperties.checkoutDir) tasks(':sdks:python:test-suites:direct:xlang:transformServicePythonUsingJava') commonJobProperties.setGradleSwitches(delegate) - switches '-PcompileAndRunTestsWithJava11' + switches '-PtestJavaVersion=11' switches "-Pjava11Home=${commonJobProperties.JAVA_11_HOME}" switches("-PuseWheelDistribution") switches("-PpythonVersion=${pythonVersion}") diff --git a/.test-infra/jenkins/job_PreCommit_Java.groovy b/.test-infra/jenkins/job_PreCommit_Java.groovy index 41a3b418a015a..d1acb1ac73157 100644 --- a/.test-infra/jenkins/job_PreCommit_Java.groovy +++ b/.test-infra/jenkins/job_PreCommit_Java.groovy @@ -54,6 +54,7 @@ def excludePaths = [ 'io/pulsar', 'io/rabbitmq', 'io/redis', + 'io/rrio', 'io/singlestore', 'io/snowflake', 'io/solr', diff --git a/.test-infra/jenkins/job_PreCommit_Java_Examples_Dataflow_Java11.groovy b/.test-infra/jenkins/job_PreCommit_Java_Examples_Dataflow_Java11.groovy deleted file mode 100644 index cabf5dbdd4576..0000000000000 --- a/.test-infra/jenkins/job_PreCommit_Java_Examples_Dataflow_Java11.groovy +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import PrecommitJobBuilder -import CommonJobProperties as properties - -PrecommitJobBuilder builder = new PrecommitJobBuilder( - scope: this, - nameBase: 'Java_Examples_Dataflow_Java11', - gradleTask: ':clean', - gradleSwitches: [ - '-PdisableSpotlessCheck=true', - '-PdisableCheckStyle=true', - '-PskipCheckerFramework' // Gradle itself is running under JDK8 so plugin configures wrong for JDK11 - ], // spotless checked in separate pre-commit - triggerPathPatterns: [ - '^model/.*$', - '^sdks/java/.*$', - '^runners/google-cloud-dataflow-java/.*$', - '^examples/java/.*$', - '^examples/kotlin/.*$', - '^release/.*$', - ], - timeoutMins: 60, - ) -builder.build { - publishers { - archiveJunit('**/build/test-results/**/*.xml') - } - - steps { - gradle { - rootBuildScriptDir(properties.checkoutDir) - tasks ':runners:google-cloud-dataflow-java:examples:preCommit' - switches '-PdisableSpotlessCheck=true' - switches '-PdisableCheckStyle=true' - switches '-PskipCheckerFramework' // Gradle itself is running under JDK8 so plugin configures wrong for JDK11 - switches '-PcompileAndRunTestsWithJava11' - switches "-Pjava11Home=${properties.JAVA_11_HOME}" - properties.setGradleSwitches(delegate, 3 * Runtime.runtime.availableProcessors()) - } - } -} diff --git a/.test-infra/jenkins/job_PreCommit_Java_IOs.groovy b/.test-infra/jenkins/job_PreCommit_Java_IOs.groovy index edeeed5f09708..09bf1982d1271 100644 --- a/.test-infra/jenkins/job_PreCommit_Java_IOs.groovy +++ b/.test-infra/jenkins/job_PreCommit_Java_IOs.groovy @@ -73,34 +73,34 @@ def ioModulesMap = [ // These projects are also covered by 'Java_IOs_Direct', and won't trigger on default patterns. 
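// Editor's note (assumed semantics, inferred from the comment above): the
// boolean keys of this map appear to mark whether a module registers its own
// trigger path patterns; modules under 'false' are exercised only through the
// aggregate Java_IOs_Direct job, which is presumably why the change below
// comments them out rather than deleting them.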
false: [ - 'amqp', - 'cassandra', - 'cdap', - 'clickhouse', - 'csv', - 'debezium', - 'elasticsearch', - 'file-schema-transform', + // 'amqp', + // 'cassandra', + // 'cdap', + // 'clickhouse', + // 'csv', + // 'debezium', + // 'elasticsearch', + // 'file-schema-transform', 'google-ads', - 'hbase', - 'hcatalog', - 'influxdb', - 'jdbc', - 'jms', - 'kafka', - 'kudu', - 'mongodb', - 'mqtt', - 'neo4j', - 'parquet', - 'rabbitmq', - 'redis', - 'singlestore', - 'snowflake', - 'solr', - 'splunk', - 'thrift', - 'tika' + // 'hbase', + // 'hcatalog', + // 'influxdb', + // 'jdbc', + // 'jms', + // 'kafka', + // 'kudu', + // 'mongodb', + // 'mqtt', + // 'neo4j', + // 'parquet', + // 'rabbitmq', + // 'redis', + // 'singlestore', + // 'snowflake', + // 'solr', + // 'splunk', + // 'thrift', + // 'tika' ] ] diff --git a/.test-infra/jenkins/job_PreCommit_Java_PortableValidatesRunner_Flink_Docker.groovy b/.test-infra/jenkins/job_PreCommit_Java_PortableValidatesRunner_Flink_Docker.groovy deleted file mode 100644 index bb14a792291cd..0000000000000 --- a/.test-infra/jenkins/job_PreCommit_Java_PortableValidatesRunner_Flink_Docker.groovy +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonTestProperties -import PrecommitJobBuilder - -// This job runs a limited subset of ValidatesRunner tests against the Flink runner in the docker environment. -PrecommitJobBuilder builder = new PrecommitJobBuilder( - scope: this, - nameBase: 'Java_PVR_Flink_Docker', - gradleTask: ":runners:flink:${CommonTestProperties.getFlinkVersion()}:job-server:validatesPortableRunnerDocker", - timeoutMins: 240, - triggerPathPatterns: [ - '^sdks/java/core/src/test/java/org/apache/beam/sdk/.*$', - '^sdks/java/container/.*$', - '^sdks/java/harness/.*$', - '^runners/flink/.*$', - '^runners/java-fn-execution/.*$', - ], - ) -builder.build { - // Publish all test results to Jenkins. - publishers { - archiveJunit('**/build/test-results/**/*.xml') - } -} diff --git a/.test-infra/jenkins/job_PreCommit_Portable_Python.groovy b/.test-infra/jenkins/job_PreCommit_Portable_Python.groovy deleted file mode 100644 index 2992cbbd0d06d..0000000000000 --- a/.test-infra/jenkins/job_PreCommit_Portable_Python.groovy +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import PrecommitJobBuilder -import static PythonTestProperties.LOWEST_SUPPORTED -import static PythonTestProperties.HIGHEST_SUPPORTED - -PrecommitJobBuilder builder = new PrecommitJobBuilder( - scope: this, - nameBase: 'Portable_Python', - gradleTask: ':clean', // Do nothing here. Add test configs below. - triggerPathPatterns: [ - '^model/.*$', - '^runners/core-construction-java/.*$', - '^runners/core-java/.*$', - '^runners/extensions-java/.*$', - '^runners/flink/.*$', - '^runners/java-fn-execution/.*$', - '^runners/reference/.*$', - '^sdks/python/.*$', - '^release/.*$', - ] - ) - -builder.build { - // Due to BEAM-7993, running multiple Python versions of the portable - // precommit tests in parallel could crash the python3 container. We manually - // configure gradle steps here to run the tests sequentially. - def lowestSupported = LOWEST_SUPPORTED.replace('.', '') - def highestSupported = HIGHEST_SUPPORTED.replace('.', '') - steps { - gradle { - rootBuildScriptDir(commonJobProperties.checkoutDir) - tasks(":sdks:python:test-suites:portable:py${lowestSupported}:preCommitPy${lowestSupported}") - commonJobProperties.setGradleSwitches(delegate) - } - gradle { - rootBuildScriptDir(commonJobProperties.checkoutDir) - tasks(":sdks:python:test-suites:portable:py${highestSupported}:preCommitPy${highestSupported}") - commonJobProperties.setGradleSwitches(delegate) - } - } -} diff --git a/.test-infra/jenkins/job_PreCommit_Python.groovy b/.test-infra/jenkins/job_PreCommit_Python.groovy index 0e439d7888773..9c9740e3c97ee 100644 --- a/.test-infra/jenkins/job_PreCommit_Python.groovy +++ b/.test-infra/jenkins/job_PreCommit_Python.groovy @@ -29,6 +29,7 @@ PrecommitJobBuilder builder = new PrecommitJobBuilder( '^release/.*$', ], gradleSwitches: [ + '-PuseWheelDistribution', '-Pposargs=\"--ignore=apache_beam/dataframe/ --ignore=apache_beam/examples/ --ignore=apache_beam/runners/ --ignore=apache_beam/transforms/\"' // All these tests are covered by different jobs.
], numBuildsToRetain: 40 diff --git a/.test-infra/jenkins/job_PreCommit_Python_Coverage.groovy b/.test-infra/jenkins/job_PreCommit_Python_Coverage.groovy index c0cb48cf62319..43a204fd7cfc8 100644 --- a/.test-infra/jenkins/job_PreCommit_Python_Coverage.groovy +++ b/.test-infra/jenkins/job_PreCommit_Python_Coverage.groovy @@ -22,6 +22,9 @@ PrecommitJobBuilder builder = new PrecommitJobBuilder( scope: this, nameBase: 'Python_Coverage', gradleTask: ':sdks:python:test-suites:tox:py38:preCommitPyCoverage', + gradleSwitches: [ + '-PuseWheelDistribution' + ], timeoutMins: 180, triggerPathPatterns: [ '^model/.*$', diff --git a/.test-infra/jenkins/job_PreCommit_Python_Dataframes.groovy b/.test-infra/jenkins/job_PreCommit_Python_Dataframes.groovy index e2914e9bdb8e0..dea034f613a58 100644 --- a/.test-infra/jenkins/job_PreCommit_Python_Dataframes.groovy +++ b/.test-infra/jenkins/job_PreCommit_Python_Dataframes.groovy @@ -23,7 +23,8 @@ PrecommitJobBuilder builder = new PrecommitJobBuilder( nameBase: 'Python_Dataframes', gradleTask: ':pythonPreCommit', gradleSwitches: [ - '-Pposargs=apache_beam/dataframe/' + '-Pposargs=apache_beam/dataframe/', + '-PuseWheelDistribution' ], timeoutMins: 180, triggerPathPatterns: [ diff --git a/.test-infra/jenkins/job_PreCommit_Python_Examples.groovy b/.test-infra/jenkins/job_PreCommit_Python_Examples.groovy index f4ef9f51d7fbe..3dd7bf6f6f47c 100644 --- a/.test-infra/jenkins/job_PreCommit_Python_Examples.groovy +++ b/.test-infra/jenkins/job_PreCommit_Python_Examples.groovy @@ -23,7 +23,8 @@ PrecommitJobBuilder builder = new PrecommitJobBuilder( nameBase: 'Python_Examples', gradleTask: ':pythonPreCommit', gradleSwitches: [ - '-Pposargs=apache_beam/examples/' + '-Pposargs=apache_beam/examples/', + '-PuseWheelDistribution' ], timeoutMins: 180, triggerPathPatterns: [ diff --git a/.test-infra/jenkins/job_PreCommit_Python_Runners.groovy b/.test-infra/jenkins/job_PreCommit_Python_Runners.groovy index e80dba6cf5cd8..4ae1d283b7a9b 100644 --- a/.test-infra/jenkins/job_PreCommit_Python_Runners.groovy +++ b/.test-infra/jenkins/job_PreCommit_Python_Runners.groovy @@ -23,7 +23,8 @@ PrecommitJobBuilder builder = new PrecommitJobBuilder( nameBase: 'Python_Runners', gradleTask: ':pythonPreCommit', gradleSwitches: [ - '-Pposargs=apache_beam/runners/' + '-Pposargs=apache_beam/runners/', + '-PuseWheelDistribution' ], timeoutMins: 180, triggerPathPatterns: [ diff --git a/.test-infra/jenkins/job_PreCommit_Python_Transforms.groovy b/.test-infra/jenkins/job_PreCommit_Python_Transforms.groovy index dd16d48b1731a..ccd3f08b78ab0 100644 --- a/.test-infra/jenkins/job_PreCommit_Python_Transforms.groovy +++ b/.test-infra/jenkins/job_PreCommit_Python_Transforms.groovy @@ -23,7 +23,8 @@ PrecommitJobBuilder builder = new PrecommitJobBuilder( nameBase: 'Python_Transforms', gradleTask: ':pythonPreCommit', gradleSwitches: [ - '-Pposargs=apache_beam/transforms/' + '-Pposargs=apache_beam/transforms/', + '-PuseWheelDistribution' ], timeoutMins: 180, triggerPathPatterns: [ diff --git a/.test-infra/jenkins/job_PreCommit_SQL.groovy b/.test-infra/jenkins/job_PreCommit_SQL.groovy deleted file mode 100644 index 3cd81e330cac4..0000000000000 --- a/.test-infra/jenkins/job_PreCommit_SQL.groovy +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import PrecommitJobBuilder - -PrecommitJobBuilder builder = new PrecommitJobBuilder( - scope: this, - nameBase: 'SQL', - gradleTask: ':sqlPreCommit', - gradleSwitches: [ - '-PdisableSpotlessCheck=true', - '-PdisableCheckStyle=true', - '-PenableJacocoReport' - ], // spotless checked in job_PreCommit_Spotless - triggerPathPatterns: [ - '^sdks/java/extensions/sql.*$', - ], - numBuildsToRetain: 40 - ) -builder.build { - publishers { - archiveJunit('**/build/test-results/**/*.xml') - recordIssues { - tools { - errorProne() - spotBugs { - pattern('**/build/reports/spotbugs/*.xml') - } - } - enabledForFailure(true) - } - jacocoCodeCoverage { - execPattern('**/build/jacoco/*.exec') - exclusionPattern('**/AutoValue_*') - inclusionPattern("**/org/apache/beam/sdk/extensions/sql/**") - } - } -} diff --git a/.test-infra/jenkins/job_PreCommit_SQL_Java11.groovy b/.test-infra/jenkins/job_PreCommit_SQL_Java11.groovy deleted file mode 100644 index 9742ab756cf70..0000000000000 --- a/.test-infra/jenkins/job_PreCommit_SQL_Java11.groovy +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import PrecommitJobBuilder -import CommonJobProperties as properties - -PrecommitJobBuilder builder = new PrecommitJobBuilder( - scope: this, - nameBase: 'SQL_Java11', - gradleTask: ':sqlPreCommit', - gradleSwitches: [ - '-PdisableSpotlessCheck=true', - '-PdisableCheckStyle=true', - '-PcompileAndRunTestsWithJava11', - '-PskipCheckerFramework', - // Gradle itself is running under JDK8 so plugin configures wrong for JDK11 - "-Pjava11Home=${properties.JAVA_11_HOME}" - ], // spotless checked in job_PreCommit_Spotless - triggerPathPatterns: [ - '^sdks/java/extensions/sql.*$', - ] - ) -builder.build { - publishers { - archiveJunit('**/build/test-results/**/*.xml') - recordIssues { - tools { - errorProne() - java() - spotBugs { - pattern('**/build/reports/spotbugs/*.xml') - } - } - enabledForFailure(true) - } - } -} diff --git a/.test-infra/jenkins/job_PreCommit_SQL_Java17.groovy b/.test-infra/jenkins/job_PreCommit_SQL_Java17.groovy deleted file mode 100644 index 158fa683c1a8a..0000000000000 --- a/.test-infra/jenkins/job_PreCommit_SQL_Java17.groovy +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import PrecommitJobBuilder -import CommonJobProperties as properties - -PrecommitJobBuilder builder = new PrecommitJobBuilder( - scope: this, - nameBase: 'SQL_Java17', - gradleTask: ':sqlPreCommit', - gradleSwitches: [ - '-PdisableSpotlessCheck=true', - '-PdisableCheckStyle=true', - '-PcompileAndRunTestsWithJava17', - '-PskipCheckerFramework', - // Gradle itself is running under JDK8 so plugin configures wrong for JDK17 - "-Pjava17Home=${properties.JAVA_17_HOME}" - ], // spotless checked in job_PreCommit_Spotless - triggerPathPatterns: [ - '^sdks/java/extensions/sql.*$', - ] - ) -builder.build { - publishers { - archiveJunit('**/build/test-results/**/*.xml') - recordIssues { - tools { - java() - spotBugs { - pattern('**/build/reports/spotbugs/*.xml') - } - } - enabledForFailure(true) - } - } -} diff --git a/.test-infra/jenkins/job_Precommit_Java_Examples_Dataflow_Java17.groovy b/.test-infra/jenkins/job_Precommit_Java_Examples_Dataflow_Java17.groovy deleted file mode 100644 index 3654a4c75edbf..0000000000000 --- a/.test-infra/jenkins/job_Precommit_Java_Examples_Dataflow_Java17.groovy +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import PrecommitJobBuilder -import CommonJobProperties as properties - -PrecommitJobBuilder builder = new PrecommitJobBuilder( - scope: this, - nameBase: 'Java_Examples_Dataflow_Java17', - gradleTask: ':clean', - gradleSwitches: [ - '-PdisableSpotlessCheck=true', - '-PdisableCheckStyle=true', - '-PskipCheckerFramework' // Gradle itself is running under JDK8 so plugin configures wrong for JDK17 - ], // spotless checked in separate pre-commit - triggerPathPatterns: [ - '^model/.*$', - '^sdks/java/.*$', - '^runners/google-cloud-dataflow-java/.*$', - '^examples/java/.*$', - '^examples/kotlin/.*$', - '^release/.*$', - ], - timeoutMins: 60, - ) -builder.build { - publishers { - archiveJunit('**/build/test-results/**/*.xml') - } - - steps { - gradle { - rootBuildScriptDir(properties.checkoutDir) - tasks ':runners:google-cloud-dataflow-java:examples:preCommit' - switches '-PdisableSpotlessCheck=true' - switches '-PdisableCheckStyle=true' - switches '-PskipCheckerFramework' // Gradle itself is running under JDK8 so plugin configures wrong for JDK17 - switches '-PcompileAndRunTestsWithJava17' - switches "-Pjava17Home=${properties.JAVA_17_HOME}" - properties.setGradleSwitches(delegate, 3 * Runtime.runtime.availableProcessors()) - } - } -} diff --git a/.test-infra/jenkins/job_Publish_Docker_Snapshots.groovy b/.test-infra/jenkins/job_Publish_Docker_Snapshots.groovy deleted file mode 100644 index 510acd8f37d8e..0000000000000 --- a/.test-infra/jenkins/job_Publish_Docker_Snapshots.groovy +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import CommonTestProperties -import static PythonTestProperties.SUPPORTED_CONTAINER_TASKS - -job('beam_Publish_Docker_Snapshots') { - description('Builds SDK harness images and job server images for testing purposes.') - - // Set common parameters. - commonJobProperties.setTopLevelMainJobProperties(delegate) - - // Allows triggering this build against pull requests. - commonJobProperties.enablePhraseTriggeringFromPullRequest( - delegate, - 'Beam Publish Docker Snapshots', - 'Publish Docker Snapshots', - false - ) - - // Runs once per day. 
- commonJobProperties.setAutoJob(delegate, '@daily') - - steps { - gradle { - rootBuildScriptDir(commonJobProperties.checkoutDir) - commonJobProperties.setGradleSwitches(delegate) - tasks(":runners:spark:${CommonTestProperties.getSparkVersion()}:job-server:container:dockerPush") - tasks(":runners:flink:${CommonTestProperties.getFlinkVersion()}:job-server-container:dockerPush") - switches("-Pdocker-repository-root=gcr.io/apache-beam-testing/beam_portability") - switches("-Pdocker-tag=latest") - } - } -} diff --git a/.test-infra/jenkins/job_Publish_SDK_Image_Snapshots.groovy b/.test-infra/jenkins/job_Publish_SDK_Image_Snapshots.groovy deleted file mode 100644 index 77b593ab30851..0000000000000 --- a/.test-infra/jenkins/job_Publish_SDK_Image_Snapshots.groovy +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties -import static JavaTestProperties.SUPPORTED_CONTAINER_TASKS as SUPPORTED_JAVA_CONTAINER_TASKS -import static PythonTestProperties.SUPPORTED_CONTAINER_TASKS as SUPPORTED_PYTHON_CONTAINER_TASKS - -// This job publishes regular snapshots of the SDK harness containers for -// testing purposes. It builds and pushes the SDK container to the -// specified GCR repo, tagged at the current Git commit. -job('beam_Publish_Beam_SDK_Snapshots') { - description('Builds SDK harness image snapshots regularly for testing purposes.') - - // Set common parameters. - commonJobProperties.setTopLevelMainJobProperties(delegate) - - // Runs once every four hours.
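// Editor's note: in Jenkins cron syntax, 'H' is a hash derived from the job
// name that picks a stable value in the field's range, so the 'H H/4 * * *'
// schedule below runs every four hours at a job-specific minute and hour
// offset; compared with a literal '0 */4 * * *', this spreads start times
// across jobs and avoids load spikes on the build executors.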
- commonJobProperties.setAutoJob(delegate, 'H H/4 * * *') - - // Use jenkins env var interpolation - leave in single quotes - def imageRepo = 'gcr.io/apache-beam-testing/beam-sdk' - def imageTag = '${GIT_COMMIT}' - - steps { - shell("echo 'Pushing SDK snapshots to ${imageRepo} at tag: ${imageTag}'") - gradle { - rootBuildScriptDir(commonJobProperties.checkoutDir) - commonJobProperties.setGradleSwitches(delegate) - tasks(':sdks:go:container:dockerTagPush') - SUPPORTED_JAVA_CONTAINER_TASKS.each { taskVer -> - tasks(":sdks:java:container:${taskVer}:dockerTagPush") - } - SUPPORTED_PYTHON_CONTAINER_TASKS.each { taskVer -> - tasks(":sdks:python:container:${taskVer}:dockerTagPush") - } - switches("-Pdocker-repository-root=${imageRepo}") - switches("-Pdocker-tag-list=${imageTag},latest") - switches("-Pjava11Home=${commonJobProperties.JAVA_11_HOME}") - } - } -} diff --git a/.test-infra/jenkins/job_Release_NightlySnapshot.groovy b/.test-infra/jenkins/job_Release_NightlySnapshot.groovy deleted file mode 100644 index 2833c263b0577..0000000000000 --- a/.test-infra/jenkins/job_Release_NightlySnapshot.groovy +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import CommonJobProperties as commonJobProperties - -// This creates the nightly snapshot build. -// Into https://repository.apache.org/content/groups/snapshots/org/apache/beam. -job('beam_Release_NightlySnapshot') { - description('Publish a nightly snapshot.') - previousNames(/beam_Release_Gradle_NightlySnapshot/) - - // Execute concurrent builds if necessary. - concurrentBuild() - - // Set common parameters. Timeout is longer, to avoid [BEAM-5774]. - commonJobProperties.setTopLevelMainJobProperties(delegate, 'master', 200, true, 'ubuntu') - - // This is a post-commit job that runs once per day, not for every push. - commonJobProperties.setAutoJob( - delegate, - '@daily', - 'builds@beam.apache.org') - - - // Allows triggering this build against pull requests. - commonJobProperties.enablePhraseTriggeringFromPullRequest( - delegate, - './gradlew publish', - 'Run Gradle Publish') - - steps { - gradle { - rootBuildScriptDir(commonJobProperties.checkoutDir) - tasks('clean') - } - /* - * Skipping verification on 'ubuntu' labelled nodes since they don't have access to the - * some required GCP resources. - * TODO: Uncomment this after we publishing snapshots on 'beam' nodes. - gradle { - rootBuildScriptDir(commonJobProperties.checkoutDir) - tasks('build') - commonJobProperties.setGradleSwitches(delegate) - switches('--no-parallel') - switches('--continue') - } - */ - gradle { - rootBuildScriptDir(commonJobProperties.checkoutDir) - tasks('publish') - commonJobProperties.setGradleSwitches(delegate) - // Publish a snapshot build. 
- switches("-Ppublishing") - // No need to run checker framework for snapshot publishing - switches("-PskipCheckerFramework") - // Don't run tasks in parallel, currently the maven-publish/signing plugins - // cause build failures when run in parallel with messages like 'error snapshotting' - switches('--no-parallel') - switches('--continue') - } - } -} - diff --git a/.test-infra/jenkins/metrics_report/report_generator.py b/.test-infra/jenkins/metrics_report/report_generator.py index 7923fcce9dab9..bdaada04f30dc 100644 --- a/.test-infra/jenkins/metrics_report/report_generator.py +++ b/.test-infra/jenkins/metrics_report/report_generator.py @@ -27,11 +27,14 @@ INFLUXDB_USER = os.getenv("INFLUXDB_USER") INFLUXDB_USER_PASSWORD = os.getenv("INFLUXDB_USER_PASSWORD") -WORKING_SPACE = os.getenv("WORKSPACE", "") +WORKING_SPACE = os.getenv("GITHUB_WORKSPACE", os.getenv("WORKSPACE", "")) +if "GITHUB_WORKSPACE" in os.environ: + path_prefix = "" +else: + path_prefix= "src/" PERF_DASHBOARDS = os.path.join( WORKING_SPACE, - "src/.test-infra/metrics/grafana/dashboards/perftests_metrics/") - + path_prefix+".test-infra/metrics/grafana/dashboards/perftests_metrics/") TABLE_FIELD_NAMES = [ "Measurement", "Metric", diff --git a/.test-infra/jenkins/metrics_report/tox.ini b/.test-infra/jenkins/metrics_report/tox.ini index dbf68016c57b2..026db5dc4860c 100644 --- a/.test-infra/jenkins/metrics_report/tox.ini +++ b/.test-infra/jenkins/metrics_report/tox.ini @@ -32,5 +32,5 @@ commands = python -m unittest dashboards_parser.py [testenv:py38-generate-report] deps = -r requirements.txt -passenv = WORKSPACE,INFLUXDB_USER,INFLUXDB_USER_PASSWORD +passenv = WORKSPACE,INFLUXDB_USER,INFLUXDB_USER_PASSWORD,GITHUB_WORKSPACE commands = python report_generator.py {posargs} diff --git a/.test-infra/metrics/build.gradle b/.test-infra/metrics/build.gradle index febe2849ef565..f1ecba05f84df 100644 --- a/.test-infra/metrics/build.gradle +++ b/.test-infra/metrics/build.gradle @@ -106,7 +106,7 @@ task deploy { standardOutput = stdout } - // All images have the same tag, it doesn't matter which we choose. + // All images have the same tag, it doesn't matter which we choose. 
String image = (stdout.toString().split(' ') as List)[0] String currentImageTag = (image.split(':') as List)[1] println "Current image tag: ${currentImageTag}" diff --git a/.test-infra/metrics/docker-compose.yml b/.test-infra/metrics/docker-compose.yml index 77f07bad13fa6..3d847ff796761 100644 --- a/.test-infra/metrics/docker-compose.yml +++ b/.test-infra/metrics/docker-compose.yml @@ -85,7 +85,10 @@ services: - DB_DBNAME=beam_metrics - DB_DBUSERNAME=admin - DB_DBPWD= - - GH_ACCESS_TOKEN= + - GH_APP_ID= + - GH_APP_INSTALLATION_ID= + - GH_PEM_KEY= + - GH_NUMBER_OF_WORKFLOW_RUNS_TO_FETCH=30 syncjenkins: image: syncjenkins container_name: beamsyncjenkins diff --git a/.test-infra/metrics/grafana/dashboards/GA-Post-Commits_status_dashboard.json b/.test-infra/metrics/grafana/dashboards/GA-Post-Commits_status_dashboard.json index 380b31dcd074d..3d0ed21734feb 100644 --- a/.test-infra/metrics/grafana/dashboards/GA-Post-Commits_status_dashboard.json +++ b/.test-infra/metrics/grafana/dashboards/GA-Post-Commits_status_dashboard.json @@ -19,20 +19,8507 @@ ] }, "editable": true, - "gnetId": null, + "fiscalYearStartMonth": 0, "graphTooltip": 0, + "id": 35, "links": [], + "liveNow": false, "panels": [ { + "datasource": "BeamPSQL", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [ + { + "options": { + "0": { + "color": "semi-dark-red", + "index": 0, + "text": "Fail" + }, + "1": { + "color": "semi-dark-green", + "index": 1, + "text": "Success" + }, + "2": { + "color": "semi-dark-yellow", + "index": 2, + "text": "Pending" + }, + "3": { + "color": "semi-dark-purple", + "index": 3, + "text": "Cancelled" + }, + "4": { + "color": "light-blue", + "index": 4, + "text": "None" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "semi-dark-red", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "job_name" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "", + "url": "https://github.com/apache/beam/actions/${__data.fields.job_yml_filename}" + } + ] + }, + { + "id": "custom.align", + "value": "auto" + }, + { + "id": "mappings", + "value": [] + }, + { + "id": "displayName", + "value": "Job Name" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "job_yml_filename" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/run\\d+id/" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align", + "value": "center" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_1" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + 
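The docker-compose.yml change above replaces a long-lived personal access token (GH_ACCESS_TOKEN) with GitHub App credentials. The container code that consumes these variables is outside this diff; the sketch below shows the standard GitHub App token exchange that GH_APP_ID, GH_APP_INSTALLATION_ID, and GH_PEM_KEY enable, assuming PyJWT and requests. It is a sketch of the usual flow, not the sync job's actual client:

import time
import jwt        # PyJWT
import requests

def installation_token(app_id: str, installation_id: str, pem_key: str) -> str:
    # Sign a short-lived JWT as the App, then trade it for an installation token.
    now = int(time.time())
    app_jwt = jwt.encode(
        {"iat": now - 60, "exp": now + 9 * 60, "iss": app_id},
        pem_key,
        algorithm="RS256")
    resp = requests.post(
        f"https://api.github.com/app/installations/{installation_id}/access_tokens",
        headers={
            "Authorization": f"Bearer {app_jwt}",
            "Accept": "application/vnd.github+json",
        })
    resp.raise_for_status()
    return resp.json()["token"]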
{ + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run1id}" + } + ] + }, + { + "id": "displayName", + "value": "Run" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_2" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run2id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 2" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_3" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run3id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 3" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_4" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run4id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 4" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_5" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + 
"color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run5id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 5" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_6" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run6id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 6" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_7" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run7id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 7" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_8" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run8id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 8" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_9" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + 
"color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run9id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 9" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_10" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run10id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 10" + } + ] + } + ] + }, + "gridPos": { + "h": 20, + "w": 24, + "x": 0, + "y": 0 + }, "id": 2, + "links": [], + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.0.3", + "targets": [ + { + "aggregation": "Last", + "alias": "job", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "Never", + "editorMode": "code", + "format": "table", + "group": [], + "hide": false, + "metricColumn": "none", + "rawQuery": true, + "rawSql": "with workflows as (\n select\n *,\n case when run1 like 'success' then 1 when run1 like 'in_progress' then 2 when run1 like 'queued' then 2 when run1 like 'waiting' then 2 when run1 like 'cancelled' then 3 when run1 like 'None' then 4 else 0 end as run_1,\n case when run2 like 'success' then 1 when run2 like 'in_progress' then 2 when run2 like 'queued' then 2 when run2 like 'waiting' then 2 when run2 like 'cancelled' then 3 when run2 like 'None' then 4 else 0 end as run_2,\n case when run3 like 'success' then 1 when run3 like 'in_progress' then 2 when run3 like 'queued' then 2 when run3 like 'waiting' then 2 when run3 like 'cancelled' then 3 when run3 like 'None' then 4 else 0 end as run_3,\n case when run4 like 'success' then 1 when run4 like 'in_progress' then 2 when run4 like 'queued' then 2 when run4 like 'waiting' then 2 when run4 like 'cancelled' then 3 when run4 like 'None' then 4 else 0 end as run_4,\n case when run5 like 'success' then 1 when run5 like 'in_progress' then 2 when run5 like 'queued' then 2 when run5 like 'waiting' then 2 when run5 like 'cancelled' then 3 when run5 like 'None' then 4 else 0 end as run_5,\n case when run6 like 'success' then 1 when run6 like 'in_progress' then 2 when run6 like 'queued' then 2 when run6 like 'waiting' then 2 when run6 like 'cancelled' then 3 when run6 like 'None' then 4 else 0 end as run_6,\n case when run7 like 'success' then 1 when run7 like 'in_progress' then 2 when run7 like 'queued' then 2 when run7 like 'waiting' 
then 2 when run7 like 'cancelled' then 3 when run7 like 'None' then 4 else 0 end as run_7,\n case when run8 like 'success' then 1 when run8 like 'in_progress' then 2 when run8 like 'queued' then 2 when run8 like 'waiting' then 2 when run8 like 'cancelled' then 3 when run8 like 'None' then 4 else 0 end as run_8,\n case when run9 like 'success' then 1 when run9 like 'in_progress' then 2 when run9 like 'queued' then 2 when run9 like 'waiting' then 2 when run9 like 'cancelled' then 3 when run9 like 'None' then 4 else 0 end as run_9,\n case when run10 like 'success' then 1 when run10 like 'in_progress' then 2 when run10 like 'queued' then 2 when run10 like 'waiting' then 2 when run10 like 'cancelled' then 3 when run10 like 'None' then 4 else 0 end as run_10\n from\n github_workflows\n where\n dashboard_category = 'core_infra'\n)\nselect\n job_name,\n job_yml_filename,\n run_1,\n run1Id,\n run_2,\n run2Id,\n run_3,\n run3Id,\n run_4,\n run4Id,\n run_5,\n run5Id,\n run_6,\n run6Id,\n run_7,\n run7Id,\n run_8,\n run8Id,\n run_9,\n run9Id,\n run_10,\n run10Id\nfrom\n workflows;", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "units": "none", + "valueHandler": "Number Threshold", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Core Infrastructure", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [] + } + } + ], + "type": "table" + }, + { + "datasource": "BeamPSQL", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [ + { + "options": { + "0": { + "color": "semi-dark-red", + "index": 0, + "text": "Fail" + }, + "1": { + "color": "semi-dark-green", + "index": 1, + "text": "Success" + }, + "2": { + "color": "semi-dark-yellow", + "index": 2, + "text": "Pending" + }, + "3": { + "color": "semi-dark-purple", + "index": 3, + "text": "Cancelled" + }, + "4": { + "color": "light-blue", + "index": 4, + "text": "None" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "semi-dark-red", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "job_name" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "", + "url": "https://github.com/apache/beam/actions/${__data.fields.job_yml_filename}" + } + ] + }, + { + "id": "custom.align", + "value": "auto" + }, + { + "id": "mappings", + "value": [] + }, + { + "id": "displayName", + "value": "Job Name" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "job_yml_filename" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/run\\d+id/" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align", + "value": "center" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_1" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + 
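The rawSql above, repeated for every panel with a different dashboard_category, encodes each of the last ten workflow runs as a small integer, which the table's value mappings then label and color. A Python restatement of that CASE expression, for illustration only:

# Status encoding used by the dashboard queries; 0 is the catch-all.
STATUS_CODE = {
    "success": 1,      # shown as "Success" (green)
    "in_progress": 2,  # shown as "Pending" (yellow)
    "queued": 2,
    "waiting": 2,
    "cancelled": 3,    # shown as "Cancelled" (purple)
    "None": 4,         # shown as "None" (light blue)
}

def encode(status: str) -> int:
    # Any other status, e.g. "failure", falls through to 0, shown as "Fail" (red).
    return STATUS_CODE.get(status, 0)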
"color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run1id}" + } + ] + }, + { + "id": "displayName", + "value": "Run" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_2" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run2id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 2" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_3" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run3id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 3" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_4" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run4id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 4" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_5" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": 
{ + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run5id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 5" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_6" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run6id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 6" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_7" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run7id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 7" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_8" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run8id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 8" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_9" + }, + "properties": [ + { + "id": "color", + 
"value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run9id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 9" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_10" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run10id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 10" + } + ] + } + ] + }, + "gridPos": { + "h": 20, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 11, + "links": [], + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.0.3", + "targets": [ + { + "aggregation": "Last", + "alias": "job", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "Never", + "editorMode": "code", + "format": "table", + "group": [], + "hide": false, + "metricColumn": "none", + "rawQuery": true, + "rawSql": "with workflows as (\n select\n *,\n case when run1 like 'success' then 1 when run1 like 'in_progress' then 2 when run1 like 'queued' then 2 when run1 like 'waiting' then 2 when run1 like 'cancelled' then 3 when run1 like 'None' then 4 else 0 end as run_1,\n case when run2 like 'success' then 1 when run2 like 'in_progress' then 2 when run2 like 'queued' then 2 when run2 like 'waiting' then 2 when run2 like 'cancelled' then 3 when run2 like 'None' then 4 else 0 end as run_2,\n case when run3 like 'success' then 1 when run3 like 'in_progress' then 2 when run3 like 'queued' then 2 when run3 like 'waiting' then 2 when run3 like 'cancelled' then 3 when run3 like 'None' then 4 else 0 end as run_3,\n case when run4 like 'success' then 1 when run4 like 'in_progress' then 2 when run4 like 'queued' then 2 when run4 like 'waiting' then 2 when run4 like 'cancelled' then 3 when run4 like 'None' then 4 else 0 end as run_4,\n case when run5 like 'success' then 1 when run5 like 'in_progress' then 2 when run5 like 'queued' then 2 when run5 like 'waiting' then 2 when run5 like 'cancelled' then 3 when run5 like 'None' then 4 else 0 end as run_5,\n case when run6 like 'success' then 1 when 
run6 like 'in_progress' then 2 when run6 like 'queued' then 2 when run6 like 'waiting' then 2 when run6 like 'cancelled' then 3 when run6 like 'None' then 4 else 0 end as run_6,\n case when run7 like 'success' then 1 when run7 like 'in_progress' then 2 when run7 like 'queued' then 2 when run7 like 'waiting' then 2 when run7 like 'cancelled' then 3 when run7 like 'None' then 4 else 0 end as run_7,\n case when run8 like 'success' then 1 when run8 like 'in_progress' then 2 when run8 like 'queued' then 2 when run8 like 'waiting' then 2 when run8 like 'cancelled' then 3 when run8 like 'None' then 4 else 0 end as run_8,\n case when run9 like 'success' then 1 when run9 like 'in_progress' then 2 when run9 like 'queued' then 2 when run9 like 'waiting' then 2 when run9 like 'cancelled' then 3 when run9 like 'None' then 4 else 0 end as run_9,\n case when run10 like 'success' then 1 when run10 like 'in_progress' then 2 when run10 like 'queued' then 2 when run10 like 'waiting' then 2 when run10 like 'cancelled' then 3 when run10 like 'None' then 4 else 0 end as run_10\n from\n github_workflows\n where\n dashboard_category = 'core_java'\n)\nselect\n job_name,\n job_yml_filename,\n run_1,\n run1Id,\n run_2,\n run2Id,\n run_3,\n run3Id,\n run_4,\n run4Id,\n run_5,\n run5Id,\n run_6,\n run6Id,\n run_7,\n run7Id,\n run_8,\n run8Id,\n run_9,\n run9Id,\n run_10,\n run10Id\nfrom\n workflows;", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "units": "none", + "valueHandler": "Number Threshold", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Core Java Tests", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [] + } + } + ], + "type": "table" + }, + { + "datasource": "BeamPSQL", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [ + { + "options": { + "0": { + "color": "semi-dark-red", + "index": 0, + "text": "Fail" + }, + "1": { + "color": "semi-dark-green", + "index": 1, + "text": "Success" + }, + "2": { + "color": "semi-dark-yellow", + "index": 2, + "text": "Pending" + }, + "3": { + "color": "semi-dark-purple", + "index": 3, + "text": "Cancelled" + }, + "4": { + "color": "light-blue", + "index": 4, + "text": "None" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "semi-dark-red", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "job_name" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "", + "url": "https://github.com/apache/beam/actions/${__data.fields.job_yml_filename}" + } + ] + }, + { + "id": "custom.align", + "value": "auto" + }, + { + "id": "mappings", + "value": [] + }, + { + "id": "displayName", + "value": "Job Name" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "job_yml_filename" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/run\\d+id/" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align", + "value": "center" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_1" + }, + "properties": [ + { + "id": "color", + 
"value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run1id}" + } + ] + }, + { + "id": "displayName", + "value": "Run" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_2" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run2id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 2" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_3" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run3id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 3" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_4" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run4id}" + } + ] + }, + { + "id": 
"displayName", + "value": "Run 4" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_5" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run5id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 5" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_6" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run6id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 6" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_7" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run7id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 7" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_8" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": 
"custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run8id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 8" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_9" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run9id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 9" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_10" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run10id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 10" + } + ] + } + ] + }, + "gridPos": { + "h": 20, + "w": 24, + "x": 0, + "y": 40 + }, + "id": 10, + "links": [], + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.0.3", + "targets": [ + { + "aggregation": "Last", + "alias": "job", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "Never", + "editorMode": "code", + "format": "table", + "group": [], + "hide": false, + "metricColumn": "none", + "rawQuery": true, + "rawSql": "with workflows as (\n select\n *,\n case when run1 like 'success' then 1 when run1 like 'in_progress' then 2 when run1 like 'queued' then 2 when run1 like 'waiting' then 2 when run1 like 'cancelled' then 3 when run1 like 'None' then 4 else 0 end as run_1,\n case when run2 like 'success' then 1 when run2 like 'in_progress' then 2 when run2 like 'queued' then 2 when run2 like 'waiting' then 2 when run2 like 'cancelled' then 3 when run2 like 'None' then 4 else 0 end as run_2,\n case when run3 like 'success' then 1 when run3 like 'in_progress' then 2 when run3 like 'queued' then 2 when run3 like 'waiting' then 2 when run3 like 'cancelled' then 3 when run3 like 'None' then 4 else 0 end as run_3,\n case when run4 like 'success' then 1 when run4 like 'in_progress' then 2 when run4 like 'queued' then 2 when run4 like 'waiting' then 2 when run4 like 'cancelled' then 3 when run4 
like 'None' then 4 else 0 end as run_4,\n case when run5 like 'success' then 1 when run5 like 'in_progress' then 2 when run5 like 'queued' then 2 when run5 like 'waiting' then 2 when run5 like 'cancelled' then 3 when run5 like 'None' then 4 else 0 end as run_5,\n case when run6 like 'success' then 1 when run6 like 'in_progress' then 2 when run6 like 'queued' then 2 when run6 like 'waiting' then 2 when run6 like 'cancelled' then 3 when run6 like 'None' then 4 else 0 end as run_6,\n case when run7 like 'success' then 1 when run7 like 'in_progress' then 2 when run7 like 'queued' then 2 when run7 like 'waiting' then 2 when run7 like 'cancelled' then 3 when run7 like 'None' then 4 else 0 end as run_7,\n case when run8 like 'success' then 1 when run8 like 'in_progress' then 2 when run8 like 'queued' then 2 when run8 like 'waiting' then 2 when run8 like 'cancelled' then 3 when run8 like 'None' then 4 else 0 end as run_8,\n case when run9 like 'success' then 1 when run9 like 'in_progress' then 2 when run9 like 'queued' then 2 when run9 like 'waiting' then 2 when run9 like 'cancelled' then 3 when run9 like 'None' then 4 else 0 end as run_9,\n case when run10 like 'success' then 1 when run10 like 'in_progress' then 2 when run10 like 'queued' then 2 when run10 like 'waiting' then 2 when run10 like 'cancelled' then 3 when run10 like 'None' then 4 else 0 end as run_10\n from\n github_workflows\n where\n dashboard_category = 'dataflow_java'\n)\nselect\n job_name,\n job_yml_filename,\n run_1,\n run1Id,\n run_2,\n run2Id,\n run_3,\n run3Id,\n run_4,\n run4Id,\n run_5,\n run5Id,\n run_6,\n run6Id,\n run_7,\n run7Id,\n run_8,\n run8Id,\n run_9,\n run9Id,\n run_10,\n run10Id\nfrom\n workflows;", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "units": "none", + "valueHandler": "Number Threshold", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Dataflow Java Tests", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [] + } + } + ], + "type": "table" + }, + { + "datasource": "BeamPSQL", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [ + { + "options": { + "0": { + "color": "semi-dark-red", + "index": 0, + "text": "Fail" + }, + "1": { + "color": "semi-dark-green", + "index": 1, + "text": "Success" + }, + "2": { + "color": "semi-dark-yellow", + "index": 2, + "text": "Pending" + }, + "3": { + "color": "semi-dark-purple", + "index": 3, + "text": "Cancelled" + }, + "4": { + "color": "light-blue", + "index": 4, + "text": "None" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "semi-dark-red", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "job_name" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "", + "url": "https://github.com/apache/beam/actions/${__data.fields.job_yml_filename}" + } + ] + }, + { + "id": "custom.align", + "value": "auto" + }, + { + "id": "mappings", + "value": [] + }, + { + "id": "displayName", + "value": "Job Name" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "job_yml_filename" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } 
+ ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/run\\d+id/" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align", + "value": "center" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_1" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run1id}" + } + ] + }, + { + "id": "displayName", + "value": "Run" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_2" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run2id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 2" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_3" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run3id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 3" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_4" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": 
"purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run4id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 4" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_5" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run5id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 5" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_6" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run6id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 6" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_7" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run7id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 7" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_8" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": 
"semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run8id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 8" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_9" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run9id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 9" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_10" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run10id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 10" + } + ] + } + ] + }, + "gridPos": { + "h": 20, + "w": 24, + "x": 0, + "y": 60 + }, + "id": 12, + "links": [], + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.0.3", + "targets": [ + { + "aggregation": "Last", + "alias": "job", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "Never", + "editorMode": "code", + "format": "table", + "group": [], + "hide": false, + "metricColumn": "none", + "rawQuery": true, + "rawSql": "with workflows as (\n select\n *,\n case when run1 like 'success' then 1 when run1 like 'in_progress' then 2 when run1 like 'queued' then 2 when run1 like 'waiting' then 2 when run1 like 'cancelled' then 3 when run1 like 'None' then 4 else 0 end as run_1,\n case when run2 like 'success' then 1 when run2 like 'in_progress' then 2 when run2 like 'queued' then 2 when run2 like 'waiting' then 2 when run2 like 'cancelled' then 3 when run2 like 'None' then 4 else 0 end as run_2,\n case when run3 like 'success' then 1 when run3 like 'in_progress' then 2 when run3 like 'queued' 
then 2 when run3 like 'waiting' then 2 when run3 like 'cancelled' then 3 when run3 like 'None' then 4 else 0 end as run_3,\n case when run4 like 'success' then 1 when run4 like 'in_progress' then 2 when run4 like 'queued' then 2 when run4 like 'waiting' then 2 when run4 like 'cancelled' then 3 when run4 like 'None' then 4 else 0 end as run_4,\n case when run5 like 'success' then 1 when run5 like 'in_progress' then 2 when run5 like 'queued' then 2 when run5 like 'waiting' then 2 when run5 like 'cancelled' then 3 when run5 like 'None' then 4 else 0 end as run_5,\n case when run6 like 'success' then 1 when run6 like 'in_progress' then 2 when run6 like 'queued' then 2 when run6 like 'waiting' then 2 when run6 like 'cancelled' then 3 when run6 like 'None' then 4 else 0 end as run_6,\n case when run7 like 'success' then 1 when run7 like 'in_progress' then 2 when run7 like 'queued' then 2 when run7 like 'waiting' then 2 when run7 like 'cancelled' then 3 when run7 like 'None' then 4 else 0 end as run_7,\n case when run8 like 'success' then 1 when run8 like 'in_progress' then 2 when run8 like 'queued' then 2 when run8 like 'waiting' then 2 when run8 like 'cancelled' then 3 when run8 like 'None' then 4 else 0 end as run_8,\n case when run9 like 'success' then 1 when run9 like 'in_progress' then 2 when run9 like 'queued' then 2 when run9 like 'waiting' then 2 when run9 like 'cancelled' then 3 when run9 like 'None' then 4 else 0 end as run_9,\n case when run10 like 'success' then 1 when run10 like 'in_progress' then 2 when run10 like 'queued' then 2 when run10 like 'waiting' then 2 when run10 like 'cancelled' then 3 when run10 like 'None' then 4 else 0 end as run_10\n from\n github_workflows\n where\n dashboard_category = 'runners_java'\n)\nselect\n job_name,\n job_yml_filename,\n run_1,\n run1Id,\n run_2,\n run2Id,\n run_3,\n run3Id,\n run_4,\n run4Id,\n run_5,\n run5Id,\n run_6,\n run6Id,\n run_7,\n run7Id,\n run_8,\n run8Id,\n run_9,\n run9Id,\n run_10,\n run10Id\nfrom\n workflows;", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "units": "none", + "valueHandler": "Number Threshold", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Java Runners Tests", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [] + } + } + ], + "type": "table" + }, + { + "datasource": "BeamPSQL", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [ + { + "options": { + "0": { + "color": "semi-dark-red", + "index": 0, + "text": "Fail" + }, + "1": { + "color": "semi-dark-green", + "index": 1, + "text": "Success" + }, + "2": { + "color": "semi-dark-yellow", + "index": 2, + "text": "Pending" + }, + "3": { + "color": "semi-dark-purple", + "index": 3, + "text": "Cancelled" + }, + "4": { + "color": "light-blue", + "index": 4, + "text": "None" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "semi-dark-red", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "job_name" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "", + "url": "https://github.com/apache/beam/actions/${__data.fields.job_yml_filename}" + } + 
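The rawSql above (and in each sibling panel) folds GitHub workflow status strings into small integer codes that the table's value mappings then label and color: 0 = Fail, 1 = Success, 2 = Pending (in_progress/queued/waiting), 3 = Cancelled, 4 = None. Since the LIKE patterns contain no wildcards, each comparison behaves as a plain equality test. A minimal Python sketch of the same encoding, useful for sanity-checking it outside Grafana (the function name is illustrative, not anything in the dashboard):

# Mirrors the CASE expression in rawSql; anything unlisted (e.g. 'failure')
# falls through to 0 and renders as Fail via the value mappings.
PENDING = {"in_progress", "queued", "waiting"}

def status_code(status: str) -> int:
    if status == "success":
        return 1
    if status in PENDING:
        return 2
    if status == "cancelled":
        return 3
    if status == "None":  # the table stores the literal string 'None', per the CASE branch
        return 4
    return 0

assert [status_code(s) for s in
        ("success", "queued", "cancelled", "None", "failure")] == [1, 2, 3, 4, 0]
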
] + }, + { + "id": "custom.align", + "value": "auto" + }, + { + "id": "mappings", + "value": [] + }, + { + "id": "displayName", + "value": "Job Name" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "job_yml_filename" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/run\\d+id/" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align", + "value": "center" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_1" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run1id}" + } + ] + }, + { + "id": "displayName", + "value": "Run" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_2" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run2id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 2" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_3" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run3id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 3" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_4" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + 
"mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run4id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 4" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_5" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run5id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 5" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_6" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run6id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 6" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_7" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run7id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 7" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_8" + }, + "properties": [ + { + "id": "color", + "value": { 
+ "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run8id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 8" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_9" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run9id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 9" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_10" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run10id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 10" + } + ] + } + ] + }, + "gridPos": { + "h": 20, + "w": 24, + "x": 0, + "y": 80 + }, + "id": 3, + "links": [], + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.0.3", + "targets": [ + { + "aggregation": "Last", + "alias": "job", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "Never", + "editorMode": "code", + "format": "table", + "group": [], + "hide": false, + "metricColumn": "none", + "rawQuery": true, + "rawSql": "with workflows as (\n select\n *,\n case when run1 like 'success' then 1 when run1 like 'in_progress' then 2 when run1 like 'queued' then 2 when run1 like 'waiting' then 2 when run1 like 'cancelled' then 3 when run1 like 'None' then 4 else 0 end as run_1,\n case when run2 
like 'success' then 1 when run2 like 'in_progress' then 2 when run2 like 'queued' then 2 when run2 like 'waiting' then 2 when run2 like 'cancelled' then 3 when run2 like 'None' then 4 else 0 end as run_2,\n case when run3 like 'success' then 1 when run3 like 'in_progress' then 2 when run3 like 'queued' then 2 when run3 like 'waiting' then 2 when run3 like 'cancelled' then 3 when run3 like 'None' then 4 else 0 end as run_3,\n case when run4 like 'success' then 1 when run4 like 'in_progress' then 2 when run4 like 'queued' then 2 when run4 like 'waiting' then 2 when run4 like 'cancelled' then 3 when run4 like 'None' then 4 else 0 end as run_4,\n case when run5 like 'success' then 1 when run5 like 'in_progress' then 2 when run5 like 'queued' then 2 when run5 like 'waiting' then 2 when run5 like 'cancelled' then 3 when run5 like 'None' then 4 else 0 end as run_5,\n case when run6 like 'success' then 1 when run6 like 'in_progress' then 2 when run6 like 'queued' then 2 when run6 like 'waiting' then 2 when run6 like 'cancelled' then 3 when run6 like 'None' then 4 else 0 end as run_6,\n case when run7 like 'success' then 1 when run7 like 'in_progress' then 2 when run7 like 'queued' then 2 when run7 like 'waiting' then 2 when run7 like 'cancelled' then 3 when run7 like 'None' then 4 else 0 end as run_7,\n case when run8 like 'success' then 1 when run8 like 'in_progress' then 2 when run8 like 'queued' then 2 when run8 like 'waiting' then 2 when run8 like 'cancelled' then 3 when run8 like 'None' then 4 else 0 end as run_8,\n case when run9 like 'success' then 1 when run9 like 'in_progress' then 2 when run9 like 'queued' then 2 when run9 like 'waiting' then 2 when run9 like 'cancelled' then 3 when run9 like 'None' then 4 else 0 end as run_9,\n case when run10 like 'success' then 1 when run10 like 'in_progress' then 2 when run10 like 'queued' then 2 when run10 like 'waiting' then 2 when run10 like 'cancelled' then 3 when run10 like 'None' then 4 else 0 end as run_10\n from\n github_workflows\n where\n dashboard_category = 'load_perf_java'\n)\nselect\n job_name,\n job_yml_filename,\n run_1,\n run1Id,\n run_2,\n run2Id,\n run_3,\n run3Id,\n run_4,\n run4Id,\n run_5,\n run5Id,\n run_6,\n run6Id,\n run_7,\n run7Id,\n run_8,\n run8Id,\n run_9,\n run9Id,\n run_10,\n run10Id\nfrom\n workflows;", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "units": "none", + "valueHandler": "Number Threshold", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Java Load/Perf Tests", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [] + } + } + ], + "type": "table" + }, + { + "datasource": "BeamPSQL", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [ + { + "options": { + "0": { + "color": "semi-dark-red", + "index": 0, + "text": "Fail" + }, + "1": { + "color": "semi-dark-green", + "index": 1, + "text": "Success" + }, + "2": { + "color": "semi-dark-yellow", + "index": 2, + "text": "Pending" + }, + "3": { + "color": "semi-dark-purple", + "index": 3, + "text": "Cancelled" + }, + "4": { + "color": "light-blue", + "index": 4, + "text": "None" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "semi-dark-red", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": 
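The run_1 … run_10 column overrides in each panel are ten copies of one property set; only the column name, the link target (${__data.fields.runNid}), and the display name vary, and run_1 is labeled plain "Run" where the other nine follow the "Run N" pattern (possibly an oversight). If the dashboard JSON were generated rather than hand-edited, a loop like this sketch could stamp the overrides out (assumption: the JSON is assembled in Python; the threshold steps are copied verbatim from the overrides above):

import copy

STEPS = [  # identical threshold steps used by every run_N column above
    {"color": "semi-dark-red", "value": None},
    {"color": "semi-dark-red", "value": 0},
    {"color": "semi-dark-green", "value": 1},
    {"color": "semi-dark-yellow", "value": 2},
    {"color": "light-blue", "value": 3},
    {"color": "purple", "value": 4},
]

def run_override(n: int) -> dict:
    """One byName override for column run_<n>, linked to the hidden run<n>id field."""
    return {
        "matcher": {"id": "byName", "options": f"run_{n}"},
        "properties": [
            {"id": "color", "value": {"fixedColor": "dark-green", "mode": "fixed"}},
            {"id": "custom.hidden", "value": False},
            {"id": "thresholds", "value": {"mode": "percentage", "steps": copy.deepcopy(STEPS)}},
            {"id": "custom.cellOptions", "value": {"mode": "basic", "type": "color-background"}},
            {"id": "custom.align", "value": "center"},
            {"id": "links", "value": [{"title": "", "url": f"${{__data.fields.run{n}id}}"}]},
            {"id": "displayName", "value": f"Run {n}"},
        ],
    }

overrides = [run_override(n) for n in range(1, 11)]
assert overrides[0]["properties"][-1]["value"] == "Run 1"
assert overrides[9]["matcher"]["options"] == "run_10"
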
"byName", + "options": "job_name" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "", + "url": "https://github.com/apache/beam/actions/${__data.fields.job_yml_filename}" + } + ] + }, + { + "id": "custom.align", + "value": "auto" + }, + { + "id": "mappings", + "value": [] + }, + { + "id": "displayName", + "value": "Job Name" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "job_yml_filename" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/run\\d+id/" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align", + "value": "center" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_1" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run1id}" + } + ] + }, + { + "id": "displayName", + "value": "Run" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_2" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run2id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 2" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_3" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run3id}" + } + ] + }, + { + "id": 
"displayName", + "value": "Run 3" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_4" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run4id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 4" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_5" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run5id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 5" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_6" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run6id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 6" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_7" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": 
"custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run7id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 7" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_8" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run8id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 8" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_9" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run9id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 9" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_10" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run10id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 10" + } + ] + } + ] + }, + "gridPos": { + "h": 20, + "w": 24, + "x": 0, + "y": 100 + }, + "id": 9, + "links": [], + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.0.3", + "targets": [ + { + "aggregation": "Last", + "alias": "job", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "Never", + "editorMode": "code", + "format": "table", + "group": [], + "hide": false, + "metricColumn": 
"none", + "rawQuery": true, + "rawSql": "with workflows as (\n select\n *,\n case when run1 like 'success' then 1 when run1 like 'in_progress' then 2 when run1 like 'queued' then 2 when run1 like 'waiting' then 2 when run1 like 'cancelled' then 3 when run1 like 'None' then 4 else 0 end as run_1,\n case when run2 like 'success' then 1 when run2 like 'in_progress' then 2 when run2 like 'queued' then 2 when run2 like 'waiting' then 2 when run2 like 'cancelled' then 3 when run2 like 'None' then 4 else 0 end as run_2,\n case when run3 like 'success' then 1 when run3 like 'in_progress' then 2 when run3 like 'queued' then 2 when run3 like 'waiting' then 2 when run3 like 'cancelled' then 3 when run3 like 'None' then 4 else 0 end as run_3,\n case when run4 like 'success' then 1 when run4 like 'in_progress' then 2 when run4 like 'queued' then 2 when run4 like 'waiting' then 2 when run4 like 'cancelled' then 3 when run4 like 'None' then 4 else 0 end as run_4,\n case when run5 like 'success' then 1 when run5 like 'in_progress' then 2 when run5 like 'queued' then 2 when run5 like 'waiting' then 2 when run5 like 'cancelled' then 3 when run5 like 'None' then 4 else 0 end as run_5,\n case when run6 like 'success' then 1 when run6 like 'in_progress' then 2 when run6 like 'queued' then 2 when run6 like 'waiting' then 2 when run6 like 'cancelled' then 3 when run6 like 'None' then 4 else 0 end as run_6,\n case when run7 like 'success' then 1 when run7 like 'in_progress' then 2 when run7 like 'queued' then 2 when run7 like 'waiting' then 2 when run7 like 'cancelled' then 3 when run7 like 'None' then 4 else 0 end as run_7,\n case when run8 like 'success' then 1 when run8 like 'in_progress' then 2 when run8 like 'queued' then 2 when run8 like 'waiting' then 2 when run8 like 'cancelled' then 3 when run8 like 'None' then 4 else 0 end as run_8,\n case when run9 like 'success' then 1 when run9 like 'in_progress' then 2 when run9 like 'queued' then 2 when run9 like 'waiting' then 2 when run9 like 'cancelled' then 3 when run9 like 'None' then 4 else 0 end as run_9,\n case when run10 like 'success' then 1 when run10 like 'in_progress' then 2 when run10 like 'queued' then 2 when run10 like 'waiting' then 2 when run10 like 'cancelled' then 3 when run10 like 'None' then 4 else 0 end as run_10\n from\n github_workflows\n where\n dashboard_category = 'core_python'\n)\nselect\n job_name,\n job_yml_filename,\n run_1,\n run1Id,\n run_2,\n run2Id,\n run_3,\n run3Id,\n run_4,\n run4Id,\n run_5,\n run5Id,\n run_6,\n run6Id,\n run_7,\n run7Id,\n run_8,\n run8Id,\n run_9,\n run9Id,\n run_10,\n run10Id\nfrom\n workflows;", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "units": "none", + "valueHandler": "Number Threshold", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Core Python Tests", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [] + } + } + ], + "type": "table" + }, + { + "datasource": "BeamPSQL", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [ + { + "options": { + "0": { + "color": "semi-dark-red", + "index": 0, + "text": "Fail" + }, + "1": { + "color": "semi-dark-green", + "index": 1, + "text": "Success" + }, + "2": { + "color": "semi-dark-yellow", + "index": 2, + "text": "Pending" + }, + "3": { + "color": 
"semi-dark-purple", + "index": 3, + "text": "Cancelled" + }, + "4": { + "color": "light-blue", + "index": 4, + "text": "None" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "semi-dark-red", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "job_name" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "", + "url": "https://github.com/apache/beam/actions/${__data.fields.job_yml_filename}" + } + ] + }, + { + "id": "custom.align", + "value": "auto" + }, + { + "id": "mappings", + "value": [] + }, + { + "id": "displayName", + "value": "Job Name" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "job_yml_filename" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/run\\d+id/" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align", + "value": "center" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_1" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run1id}" + } + ] + }, + { + "id": "displayName", + "value": "Run" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_2" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run2id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 2" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_3" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { 
+ "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run3id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 3" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_4" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run4id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 4" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_5" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run5id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 5" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_6" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run6id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 6" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_7" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + 
"color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run7id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 7" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_8" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run8id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 8" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_9" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run9id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 9" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_10" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run10id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 10" + } + ] + } + ] + }, + "gridPos": { + "h": 20, + "w": 24, + "x": 0, + "y": 120 + }, + "id": 7, + "links": [], + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true 
+ }, + "pluginVersion": "10.0.3", + "targets": [ + { + "aggregation": "Last", + "alias": "job", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "Never", + "editorMode": "code", + "format": "table", + "group": [], + "hide": false, + "metricColumn": "none", + "rawQuery": true, + "rawSql": "with workflows as (\n select\n *,\n case when run1 like 'success' then 1 when run1 like 'in_progress' then 2 when run1 like 'queued' then 2 when run1 like 'waiting' then 2 when run1 like 'cancelled' then 3 when run1 like 'None' then 4 else 0 end as run_1,\n case when run2 like 'success' then 1 when run2 like 'in_progress' then 2 when run2 like 'queued' then 2 when run2 like 'waiting' then 2 when run2 like 'cancelled' then 3 when run2 like 'None' then 4 else 0 end as run_2,\n case when run3 like 'success' then 1 when run3 like 'in_progress' then 2 when run3 like 'queued' then 2 when run3 like 'waiting' then 2 when run3 like 'cancelled' then 3 when run3 like 'None' then 4 else 0 end as run_3,\n case when run4 like 'success' then 1 when run4 like 'in_progress' then 2 when run4 like 'queued' then 2 when run4 like 'waiting' then 2 when run4 like 'cancelled' then 3 when run4 like 'None' then 4 else 0 end as run_4,\n case when run5 like 'success' then 1 when run5 like 'in_progress' then 2 when run5 like 'queued' then 2 when run5 like 'waiting' then 2 when run5 like 'cancelled' then 3 when run5 like 'None' then 4 else 0 end as run_5,\n case when run6 like 'success' then 1 when run6 like 'in_progress' then 2 when run6 like 'queued' then 2 when run6 like 'waiting' then 2 when run6 like 'cancelled' then 3 when run6 like 'None' then 4 else 0 end as run_6,\n case when run7 like 'success' then 1 when run7 like 'in_progress' then 2 when run7 like 'queued' then 2 when run7 like 'waiting' then 2 when run7 like 'cancelled' then 3 when run7 like 'None' then 4 else 0 end as run_7,\n case when run8 like 'success' then 1 when run8 like 'in_progress' then 2 when run8 like 'queued' then 2 when run8 like 'waiting' then 2 when run8 like 'cancelled' then 3 when run8 like 'None' then 4 else 0 end as run_8,\n case when run9 like 'success' then 1 when run9 like 'in_progress' then 2 when run9 like 'queued' then 2 when run9 like 'waiting' then 2 when run9 like 'cancelled' then 3 when run9 like 'None' then 4 else 0 end as run_9,\n case when run10 like 'success' then 1 when run10 like 'in_progress' then 2 when run10 like 'queued' then 2 when run10 like 'waiting' then 2 when run10 like 'cancelled' then 3 when run10 like 'None' then 4 else 0 end as run_10\n from\n github_workflows\n where\n dashboard_category = 'runners_python'\n)\nselect\n job_name,\n job_yml_filename,\n run_1,\n run1Id,\n run_2,\n run2Id,\n run_3,\n run3Id,\n run_4,\n run4Id,\n run_5,\n run5Id,\n run_6,\n run6Id,\n run_7,\n run7Id,\n run_8,\n run8Id,\n run_9,\n run9Id,\n run_10,\n run10Id\nfrom\n workflows;", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "units": "none", + "valueHandler": "Number Threshold", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Python Runners Tests", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [] + } + } + ], + "type": "table" + }, + { + "datasource": "BeamPSQL", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + 
"inspect": false + }, + "mappings": [ + { + "options": { + "0": { + "color": "semi-dark-red", + "index": 0, + "text": "Fail" + }, + "1": { + "color": "semi-dark-green", + "index": 1, + "text": "Success" + }, + "2": { + "color": "semi-dark-yellow", + "index": 2, + "text": "Pending" + }, + "3": { + "color": "semi-dark-purple", + "index": 3, + "text": "Cancelled" + }, + "4": { + "color": "light-blue", + "index": 4, + "text": "None" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "semi-dark-red", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "job_name" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "", + "url": "https://github.com/apache/beam/actions/${__data.fields.job_yml_filename}" + } + ] + }, + { + "id": "custom.align", + "value": "auto" + }, + { + "id": "mappings", + "value": [] + }, + { + "id": "displayName", + "value": "Job Name" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "job_yml_filename" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/run\\d+id/" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align", + "value": "center" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_1" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run1id}" + } + ] + }, + { + "id": "displayName", + "value": "Run" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_2" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run2id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 2" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_3" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + 
"value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run3id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 3" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_4" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run4id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 4" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_5" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run5id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 5" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_6" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run6id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 6" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_7" + }, + "properties": [ + { + "id": "color", 
+ "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run7id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 7" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_8" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run8id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 8" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_9" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run9id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 9" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_10" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run10id}" + } + ] + }, + { + 
"id": "displayName", + "value": "Run 10" + } + ] + } + ] + }, + "gridPos": { + "h": 20, + "w": 24, + "x": 0, + "y": 140 + }, + "id": 8, + "links": [], + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.0.3", + "targets": [ + { + "aggregation": "Last", + "alias": "job", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "Never", + "editorMode": "code", + "format": "table", + "group": [], + "hide": false, + "metricColumn": "none", + "rawQuery": true, + "rawSql": "with workflows as (\n select\n *,\n case when run1 like 'success' then 1 when run1 like 'in_progress' then 2 when run1 like 'queued' then 2 when run1 like 'waiting' then 2 when run1 like 'cancelled' then 3 when run1 like 'None' then 4 else 0 end as run_1,\n case when run2 like 'success' then 1 when run2 like 'in_progress' then 2 when run2 like 'queued' then 2 when run2 like 'waiting' then 2 when run2 like 'cancelled' then 3 when run2 like 'None' then 4 else 0 end as run_2,\n case when run3 like 'success' then 1 when run3 like 'in_progress' then 2 when run3 like 'queued' then 2 when run3 like 'waiting' then 2 when run3 like 'cancelled' then 3 when run3 like 'None' then 4 else 0 end as run_3,\n case when run4 like 'success' then 1 when run4 like 'in_progress' then 2 when run4 like 'queued' then 2 when run4 like 'waiting' then 2 when run4 like 'cancelled' then 3 when run4 like 'None' then 4 else 0 end as run_4,\n case when run5 like 'success' then 1 when run5 like 'in_progress' then 2 when run5 like 'queued' then 2 when run5 like 'waiting' then 2 when run5 like 'cancelled' then 3 when run5 like 'None' then 4 else 0 end as run_5,\n case when run6 like 'success' then 1 when run6 like 'in_progress' then 2 when run6 like 'queued' then 2 when run6 like 'waiting' then 2 when run6 like 'cancelled' then 3 when run6 like 'None' then 4 else 0 end as run_6,\n case when run7 like 'success' then 1 when run7 like 'in_progress' then 2 when run7 like 'queued' then 2 when run7 like 'waiting' then 2 when run7 like 'cancelled' then 3 when run7 like 'None' then 4 else 0 end as run_7,\n case when run8 like 'success' then 1 when run8 like 'in_progress' then 2 when run8 like 'queued' then 2 when run8 like 'waiting' then 2 when run8 like 'cancelled' then 3 when run8 like 'None' then 4 else 0 end as run_8,\n case when run9 like 'success' then 1 when run9 like 'in_progress' then 2 when run9 like 'queued' then 2 when run9 like 'waiting' then 2 when run9 like 'cancelled' then 3 when run9 like 'None' then 4 else 0 end as run_9,\n case when run10 like 'success' then 1 when run10 like 'in_progress' then 2 when run10 like 'queued' then 2 when run10 like 'waiting' then 2 when run10 like 'cancelled' then 3 when run10 like 'None' then 4 else 0 end as run_10\n from\n github_workflows\n where\n dashboard_category = 'load_perf_python'\n)\nselect\n job_name,\n job_yml_filename,\n run_1,\n run1Id,\n run_2,\n run2Id,\n run_3,\n run3Id,\n run_4,\n run4Id,\n run_5,\n run5Id,\n run_6,\n run6Id,\n run_7,\n run7Id,\n run_8,\n run8Id,\n run_9,\n run9Id,\n run_10,\n run10Id\nfrom\n workflows;", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "units": "none", + "valueHandler": "Number Threshold", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Python 
Load/Perf Tests", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [] + } + } + ], + "type": "table" + }, + { + "datasource": "BeamPSQL", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [ + { + "options": { + "0": { + "color": "semi-dark-red", + "index": 0, + "text": "Fail" + }, + "1": { + "color": "semi-dark-green", + "index": 1, + "text": "Success" + }, + "2": { + "color": "semi-dark-yellow", + "index": 2, + "text": "Pending" + }, + "3": { + "color": "semi-dark-purple", + "index": 3, + "text": "Cancelled" + }, + "4": { + "color": "light-blue", + "index": 4, + "text": "None" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "semi-dark-red", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "job_name" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "", + "url": "https://github.com/apache/beam/actions/${__data.fields.job_yml_filename}" + } + ] + }, + { + "id": "custom.align", + "value": "auto" + }, + { + "id": "mappings", + "value": [] + }, + { + "id": "displayName", + "value": "Job Name" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "job_yml_filename" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/run\\d+id/" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align", + "value": "center" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_1" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run1id}" + } + ] + }, + { + "id": "displayName", + "value": "Run" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_2" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run2id}" + } + ] + 
}, + { + "id": "displayName", + "value": "Run 2" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_3" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run3id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 3" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_4" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run4id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 4" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_5" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run5id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 5" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_6" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + 
{ + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run6id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 6" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_7" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run7id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 7" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_8" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run8id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 8" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_9" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + "color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run9id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 9" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "run_10" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + }, + { + "id": "custom.hidden", + "value": false + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "semi-dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "semi-dark-green", + "value": 1 + }, + { + "color": "semi-dark-yellow", + "value": 2 + }, + { + "color": "light-blue", + "value": 3 + }, + { + 
"color": "purple", + "value": 4 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.align", + "value": "center" + }, + { + "id": "links", + "value": [ + { + "title": "", + "url": "${__data.fields.run10id}" + } + ] + }, + { + "id": "displayName", + "value": "Run 10" + } + ] + } + ] + }, "gridPos": { - "h": 32, + "h": 20, "w": 24, "x": 0, - "y": 0 + "y": 160 + }, + "id": 13, + "links": [], + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true }, - "type": "table", - "title": "GA Post-Commits Status", + "pluginVersion": "10.0.3", + "targets": [ + { + "aggregation": "Last", + "alias": "job", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "Never", + "editorMode": "code", + "format": "table", + "group": [], + "hide": false, + "metricColumn": "none", + "rawQuery": true, + "rawSql": "with workflows as (\n select\n *,\n case when run1 like 'success' then 1 when run1 like 'in_progress' then 2 when run1 like 'queued' then 2 when run1 like 'waiting' then 2 when run1 like 'cancelled' then 3 when run1 like 'None' then 4 else 0 end as run_1,\n case when run2 like 'success' then 1 when run2 like 'in_progress' then 2 when run2 like 'queued' then 2 when run2 like 'waiting' then 2 when run2 like 'cancelled' then 3 when run2 like 'None' then 4 else 0 end as run_2,\n case when run3 like 'success' then 1 when run3 like 'in_progress' then 2 when run3 like 'queued' then 2 when run3 like 'waiting' then 2 when run3 like 'cancelled' then 3 when run3 like 'None' then 4 else 0 end as run_3,\n case when run4 like 'success' then 1 when run4 like 'in_progress' then 2 when run4 like 'queued' then 2 when run4 like 'waiting' then 2 when run4 like 'cancelled' then 3 when run4 like 'None' then 4 else 0 end as run_4,\n case when run5 like 'success' then 1 when run5 like 'in_progress' then 2 when run5 like 'queued' then 2 when run5 like 'waiting' then 2 when run5 like 'cancelled' then 3 when run5 like 'None' then 4 else 0 end as run_5,\n case when run6 like 'success' then 1 when run6 like 'in_progress' then 2 when run6 like 'queued' then 2 when run6 like 'waiting' then 2 when run6 like 'cancelled' then 3 when run6 like 'None' then 4 else 0 end as run_6,\n case when run7 like 'success' then 1 when run7 like 'in_progress' then 2 when run7 like 'queued' then 2 when run7 like 'waiting' then 2 when run7 like 'cancelled' then 3 when run7 like 'None' then 4 else 0 end as run_7,\n case when run8 like 'success' then 1 when run8 like 'in_progress' then 2 when run8 like 'queued' then 2 when run8 like 'waiting' then 2 when run8 like 'cancelled' then 3 when run8 like 'None' then 4 else 0 end as run_8,\n case when run9 like 'success' then 1 when run9 like 'in_progress' then 2 when run9 like 'queued' then 2 when run9 like 'waiting' then 2 when run9 like 'cancelled' then 3 when run9 like 'None' then 4 else 0 end as run_9,\n case when run10 like 'success' then 1 when run10 like 'in_progress' then 2 when run10 like 'queued' then 2 when run10 like 'waiting' then 2 when run10 like 'cancelled' then 3 when run10 like 'None' then 4 else 0 end as run_10\n from\n github_workflows\n where\n dashboard_category = 'go'\n)\nselect\n job_name,\n job_yml_filename,\n run_1,\n run1Id,\n run_2,\n run2Id,\n run_3,\n run3Id,\n run_4,\n run4Id,\n run_5,\n run5Id,\n run_6,\n run6Id,\n run_7,\n run7Id,\n 
run_8,\n run8Id,\n run_9,\n run9Id,\n run_10,\n run10Id\nfrom\n workflows;", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "units": "none", + "valueHandler": "Number Threshold", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Go Tests", "transformations": [ { "id": "merge", @@ -41,46 +8528,52 @@ } } ], + "type": "table" + }, + { "datasource": "BeamPSQL", - "pluginVersion": "9.2.0", - "links": [], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "custom": { "align": "auto", - "displayMode": "auto", + "cellOptions": { + "type": "auto" + }, "inspect": false }, "mappings": [ { - "type": "value", "options": { "0": { + "color": "semi-dark-red", "index": 0, - "text": "Fail", - "color": "semi-dark-red" + "text": "Fail" }, "1": { + "color": "semi-dark-green", "index": 1, - "text": "Success", - "color": "semi-dark-green" + "text": "Success" }, "2": { + "color": "semi-dark-yellow", "index": 2, - "text": "Pending", - "color": "semi-dark-yellow" + "text": "Pending" }, "3": { + "color": "semi-dark-purple", "index": 3, - "text": "Cancelled", - "color": "semi-dark-purple" + "text": "Cancelled" }, "4": { + "color": "light-blue", "index": 4, - "text": "None", - "color": "light-blue" + "text": "None" } - } + }, + "type": "value" } ], "thresholds": { @@ -91,9 +8584,6 @@ "value": null } ] - }, - "color": { - "mode": "thresholds" } }, "overrides": [ @@ -172,8 +8662,8 @@ { "id": "color", "value": { - "mode": "fixed", - "fixedColor": "dark-green" + "fixedColor": "dark-green", + "mode": "fixed" } }, { @@ -194,27 +8684,30 @@ "value": 0 }, { - "value": 1, - "color": "semi-dark-green" + "color": "semi-dark-green", + "value": 1 }, { - "value": 2, - "color": "semi-dark-yellow" + "color": "semi-dark-yellow", + "value": 2 }, { - "value": 3, - "color": "light-blue" + "color": "light-blue", + "value": 3 }, { - "value": 4, - "color": "purple" + "color": "purple", + "value": 4 } ] } }, { - "id": "custom.displayMode", - "value": "color-background-solid" + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } }, { "id": "custom.align", @@ -244,8 +8737,8 @@ { "id": "color", "value": { - "mode": "fixed", - "fixedColor": "dark-green" + "fixedColor": "dark-green", + "mode": "fixed" } }, { @@ -266,27 +8759,30 @@ "value": 0 }, { - "value": 1, - "color": "semi-dark-green" + "color": "semi-dark-green", + "value": 1 }, { - "value": 2, - "color": "semi-dark-yellow" + "color": "semi-dark-yellow", + "value": 2 }, { - "value": 3, - "color": "light-blue" + "color": "light-blue", + "value": 3 }, { - "value": 4, - "color": "purple" + "color": "purple", + "value": 4 } ] } }, { - "id": "custom.displayMode", - "value": "color-background-solid" + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } }, { "id": "custom.align", @@ -316,8 +8812,8 @@ { "id": "color", "value": { - "mode": "fixed", - "fixedColor": "dark-green" + "fixedColor": "dark-green", + "mode": "fixed" } }, { @@ -338,27 +8834,30 @@ "value": 0 }, { - "value": 1, - "color": "semi-dark-green" + "color": "semi-dark-green", + "value": 1 }, { - "value": 2, - "color": "semi-dark-yellow" + "color": "semi-dark-yellow", + "value": 2 }, { - "value": 3, - "color": "light-blue" + "color": "light-blue", + "value": 3 }, { - "value": 4, - "color": "purple" + "color": "purple", + "value": 4 } ] } }, { - "id": "custom.displayMode", - "value": 
"color-background-solid" + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } }, { "id": "custom.align", @@ -388,8 +8887,8 @@ { "id": "color", "value": { - "mode": "fixed", - "fixedColor": "dark-green" + "fixedColor": "dark-green", + "mode": "fixed" } }, { @@ -410,27 +8909,30 @@ "value": 0 }, { - "value": 1, - "color": "semi-dark-green" + "color": "semi-dark-green", + "value": 1 }, { - "value": 2, - "color": "semi-dark-yellow" + "color": "semi-dark-yellow", + "value": 2 }, { - "value": 3, - "color": "light-blue" + "color": "light-blue", + "value": 3 }, { - "value": 4, - "color": "purple" + "color": "purple", + "value": 4 } ] } }, { - "id": "custom.displayMode", - "value": "color-background-solid" + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } }, { "id": "custom.align", @@ -460,8 +8962,8 @@ { "id": "color", "value": { - "mode": "fixed", - "fixedColor": "dark-green" + "fixedColor": "dark-green", + "mode": "fixed" } }, { @@ -482,27 +8984,30 @@ "value": 0 }, { - "value": 1, - "color": "semi-dark-green" + "color": "semi-dark-green", + "value": 1 }, { - "value": 2, - "color": "semi-dark-yellow" + "color": "semi-dark-yellow", + "value": 2 }, { - "value": 3, - "color": "light-blue" + "color": "light-blue", + "value": 3 }, { - "value": 4, - "color": "purple" + "color": "purple", + "value": 4 } ] } }, { - "id": "custom.displayMode", - "value": "color-background-solid" + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } }, { "id": "custom.align", @@ -532,8 +9037,8 @@ { "id": "color", "value": { - "mode": "fixed", - "fixedColor": "dark-green" + "fixedColor": "dark-green", + "mode": "fixed" } }, { @@ -554,27 +9059,30 @@ "value": 0 }, { - "value": 1, - "color": "semi-dark-green" + "color": "semi-dark-green", + "value": 1 }, { - "value": 2, - "color": "semi-dark-yellow" + "color": "semi-dark-yellow", + "value": 2 }, { - "value": 3, - "color": "light-blue" + "color": "light-blue", + "value": 3 }, { - "value": 4, - "color": "purple" + "color": "purple", + "value": 4 } ] } }, { - "id": "custom.displayMode", - "value": "color-background-solid" + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } }, { "id": "custom.align", @@ -604,8 +9112,8 @@ { "id": "color", "value": { - "mode": "fixed", - "fixedColor": "dark-green" + "fixedColor": "dark-green", + "mode": "fixed" } }, { @@ -626,27 +9134,30 @@ "value": 0 }, { - "value": 1, - "color": "semi-dark-green" + "color": "semi-dark-green", + "value": 1 }, { - "value": 2, - "color": "semi-dark-yellow" + "color": "semi-dark-yellow", + "value": 2 }, { - "value": 3, - "color": "light-blue" + "color": "light-blue", + "value": 3 }, { - "value": 4, - "color": "purple" + "color": "purple", + "value": 4 } ] } }, { - "id": "custom.displayMode", - "value": "color-background-solid" + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } }, { "id": "custom.align", @@ -676,8 +9187,8 @@ { "id": "color", "value": { - "mode": "fixed", - "fixedColor": "dark-green" + "fixedColor": "dark-green", + "mode": "fixed" } }, { @@ -698,27 +9209,30 @@ "value": 0 }, { - "value": 1, - "color": "semi-dark-green" + "color": "semi-dark-green", + "value": 1 }, { - "value": 2, - "color": "semi-dark-yellow" + "color": "semi-dark-yellow", + "value": 2 }, { - "value": 3, - "color": "light-blue" + "color": "light-blue", + "value": 3 }, { - "value": 4, - "color": "purple" + "color": "purple", + 
"value": 4 } ] } }, { - "id": "custom.displayMode", - "value": "color-background-solid" + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } }, { "id": "custom.align", @@ -748,8 +9262,8 @@ { "id": "color", "value": { - "mode": "fixed", - "fixedColor": "dark-green" + "fixedColor": "dark-green", + "mode": "fixed" } }, { @@ -770,27 +9284,30 @@ "value": 0 }, { - "value": 1, - "color": "semi-dark-green" + "color": "semi-dark-green", + "value": 1 }, { - "value": 2, - "color": "semi-dark-yellow" + "color": "semi-dark-yellow", + "value": 2 }, { - "value": 3, - "color": "light-blue" + "color": "light-blue", + "value": 3 }, { - "value": 4, - "color": "purple" + "color": "purple", + "value": 4 } ] } }, { - "id": "custom.displayMode", - "value": "color-background-solid" + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } }, { "id": "custom.align", @@ -820,8 +9337,8 @@ { "id": "color", "value": { - "mode": "fixed", - "fixedColor": "dark-green" + "fixedColor": "dark-green", + "mode": "fixed" } }, { @@ -842,27 +9359,30 @@ "value": 0 }, { - "value": 1, - "color": "semi-dark-green" + "color": "semi-dark-green", + "value": 1 }, { - "value": 2, - "color": "semi-dark-yellow" + "color": "semi-dark-yellow", + "value": 2 }, { - "value": 3, - "color": "light-blue" + "color": "light-blue", + "value": 3 }, { - "value": 4, - "color": "purple" + "color": "purple", + "value": 4 } ] } }, { - "id": "custom.displayMode", - "value": "color-background-solid" + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } }, { "id": "custom.align", @@ -885,16 +9405,27 @@ } ] }, + "gridPos": { + "h": 20, + "w": 24, + "x": 0, + "y": 180 + }, + "id": 6, + "links": [], "options": { - "showHeader": true, + "cellHeight": "sm", "footer": { - "show": false, + "countRows": false, + "fields": "", "reducer": [ "sum" ], - "fields": "" - } + "show": false + }, + "showHeader": true }, + "pluginVersion": "10.0.3", "targets": [ { "aggregation": "Last", @@ -903,12 +9434,13 @@ "displayAliasType": "Warning / Critical", "displayType": "Regular", "displayValueWithAlias": "Never", + "editorMode": "code", "format": "table", "group": [], "hide": false, "metricColumn": "none", "rawQuery": true, - "rawSql": "with workflows as (\n select\n *,\n case when run1 like 'success' then 1 when run1 like 'in_progress' then 2 when run1 like 'queued' then 2 when run1 like 'waiting' then 2 when run1 like 'cancelled' then 3 when run1 like 'None' then 4 else 0 end as run_1,\n case when run2 like 'success' then 1 when run2 like 'in_progress' then 2 when run2 like 'queued' then 2 when run2 like 'waiting' then 2 when run2 like 'cancelled' then 3 when run2 like 'None' then 4 else 0 end as run_2,\n case when run3 like 'success' then 1 when run3 like 'in_progress' then 2 when run3 like 'queued' then 2 when run3 like 'waiting' then 2 when run3 like 'cancelled' then 3 when run3 like 'None' then 4 else 0 end as run_3,\n case when run4 like 'success' then 1 when run4 like 'in_progress' then 2 when run4 like 'queued' then 2 when run4 like 'waiting' then 2 when run4 like 'cancelled' then 3 when run4 like 'None' then 4 else 0 end as run_4,\n case when run5 like 'success' then 1 when run5 like 'in_progress' then 2 when run5 like 'queued' then 2 when run5 like 'waiting' then 2 when run5 like 'cancelled' then 3 when run5 like 'None' then 4 else 0 end as run_5,\n case when run6 like 'success' then 1 when run6 like 'in_progress' then 2 when run6 like 'queued' then 2 
when run6 like 'waiting' then 2 when run6 like 'cancelled' then 3 when run6 like 'None' then 4 else 0 end as run_6,\n case when run7 like 'success' then 1 when run7 like 'in_progress' then 2 when run7 like 'queued' then 2 when run7 like 'waiting' then 2 when run7 like 'cancelled' then 3 when run7 like 'None' then 4 else 0 end as run_7,\n case when run8 like 'success' then 1 when run8 like 'in_progress' then 2 when run8 like 'queued' then 2 when run8 like 'waiting' then 2 when run8 like 'cancelled' then 3 when run8 like 'None' then 4 else 0 end as run_8,\n case when run9 like 'success' then 1 when run9 like 'in_progress' then 2 when run9 like 'queued' then 2 when run9 like 'waiting' then 2 when run9 like 'cancelled' then 3 when run9 like 'None' then 4 else 0 end as run_9,\n case when run10 like 'success' then 1 when run10 like 'in_progress' then 2 when run10 like 'queued' then 2 when run10 like 'waiting' then 2 when run10 like 'cancelled' then 3 when run10 like 'None' then 4 else 0 end as run_10\n from\n github_workflows\n)\nselect\n job_name,\n job_yml_filename,\n run_1,\n run1Id,\n run_2,\n run2Id,\n run_3,\n run3Id,\n run_4,\n run4Id,\n run_5,\n run5Id,\n run_6,\n run6Id,\n run_7,\n run7Id,\n run_8,\n run8Id,\n run_9,\n run9Id,\n run_10,\n run10Id\nfrom\n workflows;", + "rawSql": "with workflows as (\n select\n *,\n case when run1 like 'success' then 1 when run1 like 'in_progress' then 2 when run1 like 'queued' then 2 when run1 like 'waiting' then 2 when run1 like 'cancelled' then 3 when run1 like 'None' then 4 else 0 end as run_1,\n case when run2 like 'success' then 1 when run2 like 'in_progress' then 2 when run2 like 'queued' then 2 when run2 like 'waiting' then 2 when run2 like 'cancelled' then 3 when run2 like 'None' then 4 else 0 end as run_2,\n case when run3 like 'success' then 1 when run3 like 'in_progress' then 2 when run3 like 'queued' then 2 when run3 like 'waiting' then 2 when run3 like 'cancelled' then 3 when run3 like 'None' then 4 else 0 end as run_3,\n case when run4 like 'success' then 1 when run4 like 'in_progress' then 2 when run4 like 'queued' then 2 when run4 like 'waiting' then 2 when run4 like 'cancelled' then 3 when run4 like 'None' then 4 else 0 end as run_4,\n case when run5 like 'success' then 1 when run5 like 'in_progress' then 2 when run5 like 'queued' then 2 when run5 like 'waiting' then 2 when run5 like 'cancelled' then 3 when run5 like 'None' then 4 else 0 end as run_5,\n case when run6 like 'success' then 1 when run6 like 'in_progress' then 2 when run6 like 'queued' then 2 when run6 like 'waiting' then 2 when run6 like 'cancelled' then 3 when run6 like 'None' then 4 else 0 end as run_6,\n case when run7 like 'success' then 1 when run7 like 'in_progress' then 2 when run7 like 'queued' then 2 when run7 like 'waiting' then 2 when run7 like 'cancelled' then 3 when run7 like 'None' then 4 else 0 end as run_7,\n case when run8 like 'success' then 1 when run8 like 'in_progress' then 2 when run8 like 'queued' then 2 when run8 like 'waiting' then 2 when run8 like 'cancelled' then 3 when run8 like 'None' then 4 else 0 end as run_8,\n case when run9 like 'success' then 1 when run9 like 'in_progress' then 2 when run9 like 'queued' then 2 when run9 like 'waiting' then 2 when run9 like 'cancelled' then 3 when run9 like 'None' then 4 else 0 end as run_9,\n case when run10 like 'success' then 1 when run10 like 'in_progress' then 2 when run10 like 'queued' then 2 when run10 like 'waiting' then 2 when run10 like 'cancelled' then 3 when run10 like 'None' then 4 else 0 end 
as run_10\n from\n github_workflows\n where\n dashboard_category = 'misc'\n)\nselect\n job_name,\n job_yml_filename,\n run_1,\n run1Id,\n run_2,\n run2Id,\n run_3,\n run3Id,\n run_4,\n run4Id,\n run_5,\n run5Id,\n run_6,\n run6Id,\n run_7,\n run7Id,\n run_8,\n run8Id,\n run_9,\n run9Id,\n run_10,\n run10Id\nfrom\n workflows;", "refId": "A", "select": [ [ @@ -931,10 +9463,21 @@ } ] } - ] + ], + "title": "Misc Tests", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [] + } + } + ], + "type": "table" } ], - "schemaVersion": 30, + "refresh": "", + "schemaVersion": 38, "style": "dark", "tags": [], "templating": { @@ -948,5 +9491,6 @@ "timezone": "", "title": "GA Post-Commits Status", "uid": "CTYdoxP4z", - "version": 1 -} + "version": 2, + "weekStart": "" +} \ No newline at end of file diff --git a/.test-infra/metrics/sync/github/requirements.txt b/.test-infra/metrics/sync/github/requirements.txt index 14a64686e55a4..5b231565459fd 100644 --- a/.test-infra/metrics/sync/github/requirements.txt +++ b/.test-infra/metrics/sync/github/requirements.txt @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -requests +aiohttp +backoff psycopg2-binary -ddt - +PyGithub \ No newline at end of file diff --git a/.test-infra/metrics/sync/github/sync_workflows.py b/.test-infra/metrics/sync/github/sync_workflows.py index 0b4ddfec38022..a2c062b175ea4 100644 --- a/.test-infra/metrics/sync/github/sync_workflows.py +++ b/.test-infra/metrics/sync/github/sync_workflows.py @@ -1,4 +1,3 @@ -# # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -6,180 +5,647 @@ # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + ''' -This module queries GitHub to collect Beam-related workflows metrics and put them in -PostgreSQL. -This Script is running every 3 hours in a cloud function in apache-beam-testing project. -This cloud function is triggered by a pubsub topic. -You can find the cloud function in the next link +This module queries GitHub API to collect Beam-related workflows metrics and +put them in PostgreSQL. 
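For reference, the table panels above all decode run status the same way: each rawSql CASE expression folds the status string this sync job stores into a small integer, and the panels' value mappings label and color the result (0 Fail, 1 Success, 2 Pending, 3 Cancelled, 4 None). A minimal sketch of that encoding, with an illustrative function name that is not part of the repo:

```python
# Minimal sketch of the status encoding shared by the dashboards' rawSql CASE
# expressions and the panels' value mappings; names here are illustrative.
STATUS_TO_CODE = {
    'success': 1,      # rendered as "Success"
    'in_progress': 2,  # rendered as "Pending"
    'queued': 2,
    'waiting': 2,
    'cancelled': 3,    # rendered as "Cancelled"
    'None': 4,         # rendered as "None" (no run recorded)
}

def encode_run_status(status: str) -> int:
    # Any other value (e.g. 'failure') falls through to 0, rendered as "Fail".
    return STATUS_TO_CODE.get(status, 0)
```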
+This script runs every 3 hours as the cloud function +"github_actions_workflows_dashboard_sync" in the apache-beam-testing project: https://console.cloud.google.com/functions/details/us-central1/github_actions_workflows_dashboard_sync?env=gen1&project=apache-beam-testing -Pub sub topic : https://console.cloud.google.com/cloudpubsub/topic/detail/github_actions_workflows_sync?project=apache-beam-testing -Cron Job : https://console.cloud.google.com/cloudscheduler/jobs/edit/us-central1/github_actions_workflows_dashboard_sync?project=apache-beam-testing -Writing the latest 10 runs of every workflow in master branch in a beammetrics database +This cloud function is triggered by a pubsub topic: +https://console.cloud.google.com/cloudpubsub/topic/detail/github_actions_workflows_sync?project=apache-beam-testing +Cron Job: +https://console.cloud.google.com/cloudscheduler/jobs/edit/us-central1/github_actions_workflows_dashboard_sync?project=apache-beam-testing ''' +import asyncio +import aiohttp +import backoff +import math import os import sys import time import re -import requests import psycopg2 - -from datetime import datetime -from github import GithubIntegration +from github import GithubIntegration DB_HOST = os.environ['DB_HOST'] DB_PORT = os.environ['DB_PORT'] DB_NAME = os.environ['DB_DBNAME'] DB_USER_NAME = os.environ['DB_DBUSERNAME'] DB_PASSWORD = os.environ['DB_DBPWD'] -GH_WORKFLOWS_TABLE_NAME = "github_workflows" -# Number of workflows that fetch github API -GH_NUMBER_OF_WORKFLOWS = 100 -GH_WORKFLOWS_NUMBER_EXECUTIONS = 100 -WORKFLOWS_OBJECT_LIST = [] +GH_APP_ID = os.environ['GH_APP_ID'] +GH_APP_INSTALLATION_ID = os.environ['GH_APP_INSTALLATION_ID'] +GH_PEM_KEY = os.environ['GH_PEM_KEY'] +GH_NUMBER_OF_WORKFLOW_RUNS_TO_FETCH =\ + os.environ['GH_NUMBER_OF_WORKFLOW_RUNS_TO_FETCH'] + +# Maps workflows to a dashboard category. Any workflow not in one of these lists +# will get auto-mapped to misc.
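As a rough illustration, hypothetical workflow names that appear in none of the lists below would be bucketed by the keyword fallback in get_dashboard_category() further down:

```python
# Hypothetical workflow names; none appear in the category lists, so
# get_dashboard_category() falls back to keyword matching on the name.
assert get_dashboard_category('PreCommit Java Foo IO Direct') == 'core_java'
assert get_dashboard_category('LoadTests Java Foo Batch') == 'load_perf_java'
assert get_dashboard_category('PostCommit Python Foo Dataflow') == 'runners_python'
assert get_dashboard_category('Some Unrelated Workflow') == 'misc'
```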
+CORE_JAVA_TESTS = [ + 'PreCommit SQL Java17', + 'PreCommit SQL Java11', + 'LoadTests Java GBK Smoke', + 'PreCommit Java Amazon-Web-Services IO Direct', + 'PreCommit Java Amqp IO Direct', + 'PreCommit Java Amazon-Web-Services2 IO Direct', + 'PreCommit Java', + 'PreCommit Java Cassandra IO Direct', + 'PreCommit Java Azure IO Direct', + 'PreCommit Java Cdap IO Direct', + 'PreCommit Java Clickhouse IO Direct', + 'PreCommit Java Csv IO Direct', + 'Java Tests', + 'PostCommit Java Avro Versions', + 'PreCommit Java Debezium IO Direct', + 'PreCommit Java File-schema-transform IO Direct', + 'PostCommit Java', + 'PreCommit Java GCP IO Direct', + 'PostCommit Java BigQueryEarlyRollout', + 'PreCommit Java Google-ads IO Direct', + 'PreCommit Java HBase IO Direct', + 'PreCommit Java ElasticSearch IO Direct', + 'PreCommit Java HCatalog IO Direct', + 'PreCommit Java Hadoop IO Direct', + 'PreCommit Java IOs Direct', + 'PostCommit Java Hadoop Versions', + 'PreCommit Java Jms IO Direct', + 'PostCommit Java ValidatesRunner Direct JavaVersions', + 'PreCommit Java Kafka IO Direct', + 'PostCommit Java Examples Direct', + 'PreCommit Java JDBC IO Direct', + 'PostCommit Java ValidatesRunner Samza', + 'PreCommit Java Mqtt IO Direct', + 'PreCommit Java Kinesis IO Direct', + 'PreCommit Java MongoDb IO Direct', + 'PostCommit Java IO Performance Tests', + 'PreCommit Java Kudu IO Direct', + 'PostCommit Java InfluxDbIO Integration Test', + 'PostCommit Java Jpms Direct Java21', + 'PostCommit Java ValidatesRunner Twister2', + 'PreCommit Java Neo4j IO Direct', + 'PostCommit Java Jpms Direct Java11', + 'PostCommit Javadoc', + 'PostCommit Java Jpms Direct Java17', + 'PreCommit Java Pulsar IO Direct', + 'PostCommit Java ValidatesRunner ULR', + 'PreCommit Java Parquet IO Direct', + 'PreCommit Java Redis IO Direct', + 'Java JMH', + 'PreCommit Java RabbitMq IO Direct', + 'PreCommit Java RequestResponse IO Direct', + 'PostCommit Java Nexmark Direct', + 'PreCommit Java Splunk IO Direct', + 'PreCommit Java Thrift IO Direct', + 'PreCommit Java Snowflake IO Direct', + 'PreCommit Java Solr IO Direct', + 'PostCommit Java PVR Samza', + 'PreCommit Java Tika IO Direct', + 'PostCommit Java SingleStoreIO IT', + 'PostCommit Java Sickbay', + 'PostCommit Java ValidatesRunner Direct', + 'PreCommit Java SingleStore IO Direct', + 'PreCommit Java InfluxDb IO Direct', + 'PreCommit Spotless', + 'PreCommit Kotlin Examples' +] + +DATAFLOW_JAVA_TESTS = [ + 'PostCommit XVR GoUsingJava Dataflow', + 'PostCommit XVR PythonUsingJavaSQL Dataflow', + 'PostCommit XVR JavaUsingPython Dataflow', + 'PostCommit XVR PythonUsingJava Dataflow', + 'PreCommit Java Examples Dataflow Java11', + 'PreCommit Java Examples Dataflow Java17', + 'PreCommit Java Examples Dataflow Java21', + 'PreCommit Java Examples Dataflow', + 'PostCommit Java ValidatesRunner Dataflow', + 'PostCommit Java Dataflow V1', + 'PostCommit Java ValidatesRunner Dataflow Streaming', + 'PostCommit Java Dataflow V2', + 'PostCommit Java ValidatesRunner Dataflow V2', + 'PostCommit Java Examples Dataflow', + 'PostCommit Java Examples Dataflow ARM', + 'PostCommit Java ValidatesRunner Dataflow V2 Streaming', + 'PostCommit Java ValidatesRunner Dataflow JavaVersions', + 'PostCommit Java Examples Dataflow Java', + 'PostCommit Java Examples Dataflow V2 Java', + 'PostCommit Java Jpms Dataflow Java11', + 'PostCommit Java Jpms Dataflow Java17', + 'PostCommit Java Nexmark Dataflow', + 'PostCommit Java Nexmark Dataflow V2', + 'PostCommit Java Nexmark Dataflow V2 Java', + 'PostCommit Java Tpcds Dataflow', + 'PostCommit 
Java Examples Dataflow V2' +] + +RUNNERS_JAVA_TESTS = [ + 'PostCommit Java PVR Spark3 Streaming', + 'PostCommit Java ValidatesRunner Spark', + 'PostCommit Java Examples Spark', + 'PostCommit Java ValidatesRunner SparkStructuredStreaming', + 'PostCommit Java ValidatesRunner Spark Java11', + 'PostCommit Java PVR Spark Batch', + 'PreCommit Java Spark3 Versions', + 'PostCommit Java Tpcds Spark', + 'PostCommit Java Jpms Spark Java11', + 'PostCommit Java Nexmark Spark', + 'PostCommit Java Examples Flink', + 'PostCommit Java Tpcds Flink', + 'PostCommit Java PVR Flink Streaming', + 'PostCommit Java Jpms Flink Java11', + 'PreCommit Java PVR Flink Batch', + 'PostCommit Java Nexmark Flink', + 'PreCommit Java PVR Flink Docker', + 'PreCommit Java Flink Versions', + 'PostCommit Java ValidatesRunner Flink Java11', + 'PostCommit Java ValidatesRunner Flink' +] + +LOAD_PERF_JAVA_TESTS = [ + 'LoadTests Java CoGBK Dataflow Batch', + 'LoadTests Java CoGBK Dataflow V2 Streaming JavaVersions', + 'LoadTests Java CoGBK Dataflow Streaming', + 'LoadTests Java Combine Dataflow Batch', + 'LoadTests Java Combine Dataflow Streaming', + 'LoadTests Java CoGBK Dataflow V2 Batch JavaVersions', + 'LoadTests Java GBK Dataflow Batch', + 'LoadTests Java GBK Dataflow Streaming', + 'LoadTests Java GBK Dataflow V2 Batch Java11', + 'LoadTests Java GBK Dataflow V2 Streaming Java11', + 'LoadTests Java GBK Dataflow V2 Batch Java17', + 'LoadTests Java GBK Dataflow V2 Streaming Java17', + 'LoadTests Java ParDo Dataflow Streaming', + 'LoadTests Java ParDo Dataflow V2 Streaming JavaVersions', + 'LoadTests Java ParDo Dataflow V2 Batch JavaVersions', + 'LoadTests Java ParDo Dataflow Batch', + 'LoadTests Java ParDo SparkStructuredStreaming Batch', + 'LoadTests Java CoGBK SparkStructuredStreaming Batch', + 'LoadTests Java Combine SparkStructuredStreaming Batch', + 'LoadTests Java GBK SparkStructuredStreaming Batch', + 'PerformanceTests BigQueryIO Batch Java Avro', + 'PerformanceTests BigQueryIO Streaming Java', + 'PerformanceTests BigQueryIO Batch Java Json', + 'PerformanceTests SQLBigQueryIO Batch Java', + 'PerformanceTests XmlIOIT', + 'PostCommit XVR Samza', + 'PerformanceTests ManyFiles TextIOIT', + 'PerformanceTests XmlIOIT HDFS', + 'PerformanceTests ParquetIOIT', + 'PerformanceTests ParquetIOIT HDFS', + 'PerformanceTests AvroIOIT', + 'PerformanceTests ManyFiles TextIOIT HDFS', + 'PerformanceTests TFRecordIOIT', + 'PerformanceTests Cdap', + 'PerformanceTests TextIOIT', + 'PerformanceTests AvroIOIT HDFS', + 'PerformanceTests SingleStoreIO', + 'PerformanceTests SparkReceiver IO', + 'PerformanceTests Compressed TextIOIT', + 'PerformanceTests TextIOIT HDFS', + 'PerformanceTests Compressed TextIOIT HDFS', + 'PerformanceTests HadoopFormat', + 'PerformanceTests JDBC', + 'PerformanceTests Kafka IO' +] + +CORE_PYTHON_TESTS = [ + 'Python Dependency Tests', + 'PreCommit Python Dataframes', + 'PreCommit Python Examples', + 'PreCommit Python Integration', + 'PostCommit Python ValidatesRunner Samza', + 'LoadTests Python Smoke', + 'Update Python Depedencies', + 'PreCommit Python Runners', + 'PreCommit Python Transforms', + 'PostCommit Python Xlang Gcp Direct', + 'Build python source distribution and wheels', + 'Python tests', + 'PostCommit Sickbay Python', + 'PostCommit Python', + 'PostCommit Python Arm', + 'PostCommit Python Examples Direct', + 'PreCommit Portable Python', + 'PreCommit Python Coverage', + 'PreCommit Python Docker', + 'PreCommit Python', + 'PostCommit Python MongoDBIO IT', + 'PreCommit Python Docs', + 'PreCommit Python Formatter', + 
'PostCommit Python Nexmark Direct', + 'PreCommit Python Lint' +] + +RUNNERS_PYTHON_TESTS = [ + 'PostCommit Python ValidatesRunner Dataflow', + 'Python ValidatesContainer Dataflow ARM', + 'PostCommit Python Xlang Gcp Dataflow', + 'PostCommit Python Xlang IO Dataflow', + 'PostCommit Python Examples Dataflow', + 'PostCommit Python ValidatesContainer Dataflow', + 'PostCommit Python ValidatesContainer Dataflow With RC', + 'PostCommit Python ValidatesRunner Spark', + 'PostCommit Python Examples Spark', + 'PostCommit Python ValidatesRunner Flink', + 'PreCommit Python PVR Flink', + 'PostCommit Python Examples Flink' +] +LOAD_PERF_PYTHON_TESTS = [ + 'PerformanceTests xlang KafkaIO Python', + 'LoadTests Python FnApiRunner Microbenchmark', + 'PerformanceTests SpannerIO Write 2GB Python Batch', + 'PerformanceTests SpannerIO Read 2GB Python', + 'PerformanceTests BiqQueryIO Read Python', + 'PerformanceTests BiqQueryIO Write Python Batch', + 'PerformanceTests TextIOIT Python', + 'PerformanceTests WordCountIT PythonVersions', + 'Performance alerting tool on Python load/performance/benchmark tests.', + 'LoadTests Python SideInput Dataflow Batch', + 'LoadTests Python CoGBK Dataflow Batch', + 'LoadTests Python CoGBK Dataflow Streaming', + 'LoadTests Python Combine Dataflow Batch', + 'Inference Python Benchmarks Dataflow', + 'LoadTests Python Combine Dataflow Streaming', + 'LoadTests Python GBK Dataflow Batch', + 'LoadTests Python GBK Dataflow Streaming', + 'LoadTests Python GBK reiterate Dataflow Batch', + 'LoadTests Python GBK reiterate Dataflow Streaming', + 'LoadTests Python ParDo Dataflow Streaming', + 'CloudML Benchmarks Dataflow', + 'LoadTests Python ParDo Dataflow Batch', + 'LoadTests Python CoGBK Flink Batch', + 'LoadTests Python Combine Flink Batch', + 'LoadTests Python Combine Flink Streaming', + 'PerformanceTests PubsubIOIT Python Streaming', + 'LoadTests Python ParDo Flink Batch', + 'LoadTests Python ParDo Flink Streaming' +] + +GO_TESTS = [ + 'PerformanceTests MongoDBIO IT', + 'PreCommit Go', + 'PreCommit GoPortable', + 'PreCommit GoPrism', + 'PostCommit Go VR Samza', + 'Go tests', + 'PostCommit Go', + 'PostCommit Go Dataflow ARM', + 'LoadTests Go CoGBK Dataflow Batch', + 'LoadTests Go Combine Dataflow Batch', + 'LoadTests Go GBK Dataflow Batch', + 'LoadTests Go ParDo Dataflow Batch', + 'LoadTests Go SideInput Dataflow Batch', + 'PostCommit Go VR Spark', + 'PostCommit Go VR Flink', + 'LoadTests Go CoGBK Flink Batch', + 'LoadTests Go Combine Flink Batch', + 'LoadTests Go GBK Flink Batch', + 'LoadTests Go ParDo Flink Batch', + 'LoadTests Go SideInput Flink Batch' +] + +CORE_INFRA_TESTS = [ + 'Release Nightly Snapshot Python', + 'Rotate Metrics Cluster Credentials', + 'Community Metrics Prober', + 'Publish Docker Snapshots', + 'Clean Up GCP Resources', + 'Clean Up Prebuilt SDK Images', + 'Rotate IO-Datastores Cluster Credentials', + 'Release Nightly Snapshot', + 'Mark issue as triaged when assigned', + 'PostCommit BeamMetrics Publish', + 'PreCommit Community Metrics', + 'Beam Metrics Report', + 'Build and Version Runner Docker Image', + 'PreCommit GHA', + 'pr-bot-prs-needing-attention', + 'PreCommit RAT', + 'Assign or close an issue', + 'PostCommit Website Test', + 'PostCommit Website Publish', + 'PreCommit Website', + 'PreCommit Website Stage GCS', + 'Cleanup Dataproc Resources', + 'PreCommit Whitespace', + 'Publish Beam SDK Snapshots', + 'Cancel Stale Dataflow Jobs', + 'pr-bot-pr-updates', + 'pr-bot-new-prs' +] + +MISC_TESTS = [ + 'Tour of Beam Go integration tests', + 'Tour of Beam Go 
unittests', + 'Tour Of Beam Frontend Test', + 'PostCommit XVR Spark3', + 'TypeScript Tests', + 'Playground Frontend Test', + 'PostCommit PortableJar Flink', + 'PostCommit SQL', + 'Cancel', + 'PostCommit PortableJar Spark', + 'PreCommit Integration and Load Test Framework', + 'pr-bot-update-reviewers', + 'PostCommit TransformService Direct', + 'Cut Release Branch', + 'Generate issue report', + 'Dask Runner Tests', + 'PreCommit Typescript', + 'PostCommit XVR Direct', + 'Mark and close stale pull requests', + 'PostCommit XVR Flink', + 'IssueTagger', + 'Assign Milestone on issue close', + 'Local environment tests', + 'PreCommit SQL', + 'LabelPrs', + 'build_release_candidate' +] class Workflow: - def __init__(self,id,name,filename): - self.id = id - self.name = name - self.filename = filename - self.listOfRuns = [] - self.runUrl = [] - -# The table will save the latest ten run of every workflow -GH_WORKFLOWS_CREATE_TABLE_QUERY = f""" -CREATE TABLE IF NOT EXISTS {GH_WORKFLOWS_TABLE_NAME} ( - job_name text PRIMARY KEY, - job_yml_filename text""" -for i in range(0,GH_WORKFLOWS_NUMBER_EXECUTIONS): - i = i + 1 - GH_WORKFLOWS_CREATE_TABLE_QUERY += """,\n run{} text, - run{}Id text""".format(str(i),str(i)) -GH_WORKFLOWS_CREATE_TABLE_QUERY += ")\n" - -def githubWorkflowsGrafanaSync(data,context): - print('Started') - print('Updating table with recent workflow runs') - databaseOperations(initDbConnection(),fetchWorkflowData()) - print('Done') - return "Completed" - -def initDbConnection(): - '''Init connection with the Database''' - connection = None - maxRetries = 3 - i = 0 - while connection == None and i < maxRetries: - try: - connection = psycopg2.connect( - f"dbname='{DB_NAME}' user='{DB_USER_NAME}' host='{DB_HOST}'" - f" port='{DB_PORT}' password='{DB_PASSWORD}'") - except Exception as e: - print('Failed to connect to DB; retrying in 1 minute') - print(e) - time.sleep(60) - i = i + 1 - if i >= maxRetries: - print("Number of retries exceded ") - sys.exit(1) - return connection - -def getToken(): - git_integration = GithubIntegration( - os.environ["GH_APP_ID"], - os.environ["GH_PEM_KEY"]) - token=git_integration.get_access_token( - os.environ["GH_APP_INSTALLATION_ID"] - ).token - return token - -def retriesRequest(request): - requestSucceeded = False - retryFactor = 1 - while not requestSucceeded: - retryTime = 60 * retryFactor - if request.status_code != 200: - print('Failed to get the request with code {}'.format(request.status_code)) - time.sleep(retryTime) - retryFactor = retryFactor + retryFactor - if retryFactor * 60 >= 3600: - print("Error: The request take more than an hour") - sys.exit(1) - else: - requestSucceeded = True -def fetchWorkflowData(): - '''Return a json with all the workflows and the latests - ten executions''' - completed = False - page = 1 - workflows = [] + def __init__(self, id, name, filename): + self.id = id + self.name = name + self.filename = filename + self.runs = [] + +def get_dashboard_category(workflow_name): + # If you add or remove categories in this function, make sure to add or + # remove the corresponding panels here: + # https://github.com/apache/beam/blob/master/.test-infra/metrics/grafana/dashboards/GA-Post-Commits_status_dashboard.json + + if workflow_name in CORE_INFRA_TESTS: + return 'core_infra' + if workflow_name in CORE_JAVA_TESTS: + return 'core_java' + if workflow_name in DATAFLOW_JAVA_TESTS: + return 'dataflow_java' + if workflow_name in RUNNERS_JAVA_TESTS: + return 'runners_java' + if workflow_name in LOAD_PERF_JAVA_TESTS: + return 'load_perf_java' 
+ if workflow_name in CORE_PYTHON_TESTS: + return 'core_python' + if workflow_name in RUNNERS_PYTHON_TESTS: + return 'runners_python' + if workflow_name in LOAD_PERF_PYTHON_TESTS: + return 'load_perf_python' + if workflow_name in GO_TESTS: + return 'go' + if workflow_name in MISC_TESTS: + return 'misc' + + print(f'No category found for workflow: {workflow_name}') + print('Falling back to rules based assignment') + + workflow_name = workflow_name.lower() + if 'java' in workflow_name: + if 'dataflow' in workflow_name: + return 'dataflow_java' + if 'spark' in workflow_name or 'flink' in workflow_name: + return 'runners_java' + if 'performancetest' in workflow_name or 'loadtest' in workflow_name: + return 'load_perf_java' + return 'core_java' + elif 'python' in workflow_name: + if 'dataflow' in workflow_name or 'spark' in workflow_name or 'flink' in workflow_name: + return 'runners_python' + if 'performancetest' in workflow_name or 'loadtest' in workflow_name: + return 'load_perf_python' + return 'core_python' + elif 'go' in workflow_name: + return 'go' + + return 'misc' + +def github_workflows_dashboard_sync(data, context): + # Entry point for cloud function, don't change signature + return asyncio.run(sync_workflow_runs()) + +async def sync_workflow_runs(): + print('Started') + print('Updating table with recent workflow runs') + + if not GH_NUMBER_OF_WORKFLOW_RUNS_TO_FETCH or \ + not GH_NUMBER_OF_WORKFLOW_RUNS_TO_FETCH.isdigit(): + raise ValueError( + 'The number of workflow runs to fetch is not specified or not an integer' + ) + + database_operations(init_db_connection(), await fetch_workflow_data()) + + print('Done') + return "Completed" + +def init_db_connection(): + '''Init connection with the Database''' + connection = None + maxRetries = 3 + i = 0 + while connection is None and i < maxRetries: try: - while not completed: - url = "https://api.github.com/repos/apache/beam/actions/workflows" - queryOptions = { 'branch' : 'master', 'page': page, 'per_page' : GH_NUMBER_OF_WORKFLOWS } - response = requests.get(url = url, params = queryOptions) - retriesRequest(response) - jsonResponse = response.json() - if jsonResponse['total_count'] >= GH_NUMBER_OF_WORKFLOWS: - page = page + 1 - workflowsPage = jsonResponse['workflows'] - workflows.append(workflowsPage) - else: - completed = True - workflowsPage = jsonResponse['workflows'] - workflows.append(workflowsPage) - for pageItem in workflows: - for item in pageItem: - path = item['path'] - result = re.search('/(.*).yml', path) - path = (result.group(1)) + ".yml" - workflowObject = Workflow(item['id'],item['name'],path) - WORKFLOWS_OBJECT_LIST.append(workflowObject) - url = "https://api.github.com/repos/apache/beam/actions/workflows/" - queryOptions = { 'branch' : 'master', 'per_page' : GH_WORKFLOWS_NUMBER_EXECUTIONS, - 'page' :'1', 'exclude_pull_request':True } - for workflow in WORKFLOWS_OBJECT_LIST: - response = requests.get(url = "{}{}/runs".format(url,workflow.id), - params=queryOptions) - retriesRequest(response) - responseJson = response.json() - workflowsRuns = responseJson['workflow_runs'] - for item in workflowsRuns: - if item['status'] == 'completed': - workflow.runUrl.append(item['html_url']) - workflow.listOfRuns.append(item['conclusion']) - elif item['status'] != 'cancelled': - workflow.listOfRuns.append(item['status']) - workflow.runUrl.append(item['html_url']) - for i in range(0,GH_WORKFLOWS_NUMBER_EXECUTIONS): - if i >= len(workflow.listOfRuns): - workflow.listOfRuns.append('None') - workflow.runUrl.append('None') + connection = 
psycopg2.connect( + f"dbname='{DB_NAME}' user='{DB_USER_NAME}' host='{DB_HOST}'" + f" port='{DB_PORT}' password='{DB_PASSWORD}'") except Exception as e: - print('Failed to get GHA workflows') - print(e) - -def databaseOperations(connection,fetchWorkflows): - '''Create the table if not exist and update the table with the latest runs - of the workflows ''' - queryInsert = "INSERT INTO {} VALUES ".format(GH_WORKFLOWS_TABLE_NAME) - cursor = connection.cursor() - cursor.execute(GH_WORKFLOWS_CREATE_TABLE_QUERY) - cursor.execute("DELETE FROM {};".format(GH_WORKFLOWS_TABLE_NAME)) - query = "" - for workflow in WORKFLOWS_OBJECT_LIST: - rowInsert = "(\'{}\',\'{}\'".format(workflow.name,workflow.filename) - for run, runUrl in zip(workflow.listOfRuns,workflow.runUrl): - rowInsert += ",\'{}\',\'{}\'".format(run,runUrl) - query = query + rowInsert - query += ")," - query = query[:-1] + ";" - query = queryInsert + query - cursor.execute(query) - cursor.close() - connection.commit() - connection.close() + print('Failed to connect to DB; retrying in 1 minute') + print(e) + time.sleep(60) + i = i + 1 + if i >= maxRetries: + print("Number of retries exceeded") + sys.exit(1) + return connection + +def get_token(): + git_integration = GithubIntegration(GH_APP_ID, GH_PEM_KEY) + token = git_integration.get_access_token(GH_APP_INSTALLATION_ID).token + return f'Bearer {token}' + +@backoff.on_exception(backoff.constant, aiohttp.ClientResponseError, max_tries=5) +async def fetch(url, semaphore, params=None, headers=None, request_id=None): + async with semaphore: + async with aiohttp.ClientSession() as session: + async with session.get(url, params=params, headers=headers) as response: + if response.status == 200: + result = await response.json() + if request_id: + return request_id, result + return result + elif response.status == 403: + # The token has likely expired; refresh it and let backoff retry. + print(f'Retry for: {url}') + headers['Authorization'] = get_token() + raise aiohttp.ClientResponseError( + response.request_info, + response.history, + status=response.status, + message=response.reason, + headers=response.headers + ) + +async def fetch_workflow_data(): + def append_workflow_runs(workflow, runs): + for run in runs: + # Getting rid of all runs with a "skipped" status to display + # only actual runs + if run['conclusion'] != 'skipped': + status = '' + if run['status'] == 'completed': + status = run['conclusion'] + elif run['status'] != 'cancelled': + status = run['status'] + workflow.runs.append((int(run['id']), status, run['html_url'])) + + url = "https://api.github.com/repos/apache/beam/actions/workflows" + headers = {'Authorization': get_token()} + page = 1 + number_of_entries_per_page = 100 # The number of results per page (max 100) + params =\ + {'branch': 'master', 'page': page, 'per_page': number_of_entries_per_page} + concurrent_requests = 30 # Number of requests to send simultaneously + semaphore = asyncio.Semaphore(concurrent_requests) + + print("Start fetching recent workflow runs") + workflow_tasks = [] + response = await fetch(url, semaphore, params, headers) + pages_to_fetch =\ + math.ceil(response['total_count'] / number_of_entries_per_page) + while pages_to_fetch >= page: + params = { + 'branch': 'master', + 'page': page, + 'per_page': number_of_entries_per_page + } + workflow_tasks.append(fetch(url, semaphore, params, headers)) + page += 1 + + workflow_run_tasks = [] + for completed_task in asyncio.as_completed(workflow_tasks): + response = await completed_task + workflows = response.get('workflows', []) + for workflow in workflows: + runs_url = 
f"{url}/{workflow['id']}/runs" + page = 1 + pages_to_fetch = math.ceil( + int(GH_NUMBER_OF_WORKFLOW_RUNS_TO_FETCH) / number_of_entries_per_page + ) + while pages_to_fetch >= page: + params = { + 'branch': 'master', + 'page': page, + 'per_page': number_of_entries_per_page, + 'exclude_pull_requests': 'true' + } + workflow_run_tasks.append(fetch(runs_url, semaphore, params, headers)) + page += 1 + print("Successfully fetched workflow runs") + + print("Start fetching workflow runs details") + workflows = {} + workflow_ids_to_fetch_extra_runs = {} + for completed_task in asyncio.as_completed(workflow_run_tasks): + response = await completed_task + workflow_runs = response.get('workflow_runs') + if workflow_runs: + workflow_id = workflow_runs[0]['workflow_id'] + workflow = workflows.get(workflow_id) + if not workflow: + workflow_name = workflow_runs[0]['name'] + workflow_path = workflow_runs[0]['path'] + result = re.search(r'(workflows\/.*)$', workflow_path) + if result: + workflow_path = result.group(1) + workflow = Workflow(workflow_id, workflow_name, workflow_path) + + append_workflow_runs(workflow, workflow_runs) + workflows[workflow_id] = workflow + if len(workflow.runs) < int(GH_NUMBER_OF_WORKFLOW_RUNS_TO_FETCH): + workflow_ids_to_fetch_extra_runs[workflow_id] = workflow_id + else: + workflow_ids_to_fetch_extra_runs.pop(workflow_id, None) + print(f"Successfully fetched details for: {workflow.filename}") + + page = math.ceil( + int(GH_NUMBER_OF_WORKFLOW_RUNS_TO_FETCH) / number_of_entries_per_page + ) + 1 + # Fetch extra workflow runs if the specified number of runs is not reached + while workflow_ids_to_fetch_extra_runs: + extra_workflow_runs_tasks = [] + for workflow_id in list(workflow_ids_to_fetch_extra_runs.values()): + runs_url = f"{url}/{workflow_id}/runs" + params = { + 'branch': 'master', + 'page': page, + 'per_page': number_of_entries_per_page, + 'exclude_pull_requests': 'true' + } + extra_workflow_runs_tasks.append(fetch(runs_url, semaphore, params, headers, workflow_id)) + for completed_task in asyncio.as_completed(extra_workflow_runs_tasks): + workflow_id, response = await completed_task + workflow = workflows[workflow_id] + print(f"Fetching extra workflow runs for: {workflow.filename}") + workflow_runs = response.get('workflow_runs') + if workflow_runs: + append_workflow_runs(workflow, workflow_runs) + else: + number_of_runs_to_add =\ + int(GH_NUMBER_OF_WORKFLOW_RUNS_TO_FETCH) - len(workflow.runs) + workflow.runs.extend([(0, 'None', 'None')] * number_of_runs_to_add) + if len(workflow.runs) >= int(GH_NUMBER_OF_WORKFLOW_RUNS_TO_FETCH): + workflow_ids_to_fetch_extra_runs.pop(workflow_id, None) + print(f"Successfully fetched extra workflow runs for: {workflow.filename}") + page += 1 + print("Successfully fetched workflow runs details") + + for workflow in list(workflows.values()): + runs = sorted(workflow.runs, key=lambda r: r[0], reverse=True) + workflow.runs = runs[:int(GH_NUMBER_OF_WORKFLOW_RUNS_TO_FETCH)] + + return list(workflows.values()) + +def database_operations(connection, workflows): + # Create the table and update it with the latest workflow runs + if not workflows: + return + cursor = connection.cursor() + workflows_table_name = "github_workflows" + cursor.execute(f"DROP TABLE IF EXISTS {workflows_table_name};") + create_table_query = f""" + CREATE TABLE IF NOT EXISTS {workflows_table_name} ( + workflow_id integer NOT NULL PRIMARY KEY, + job_name text NOT NULL, + job_yml_filename text NOT NULL, + dashboard_category text NOT NULL""" + for i in 
range(int(GH_NUMBER_OF_WORKFLOW_RUNS_TO_FETCH)):
+    create_table_query += f""",
+    run{i+1} text,
+    run{i+1}Id text"""
+  create_table_query += ")\n"
+  cursor.execute(create_table_query)
+  insert_query = f"INSERT INTO {workflows_table_name} VALUES "
+  for workflow in workflows:
+    category = get_dashboard_category(workflow.name)
+    row_insert =\
+      f"(\'{workflow.id}\',\'{workflow.name}\',\'{workflow.filename}\',\'{category}\'"
+    for _, status, url in workflow.runs:
+      row_insert += f",\'{status}\',\'{url}\'"
+    insert_query += f"{row_insert}),"
+  insert_query = insert_query[:-1] + ";"
+  print(insert_query)
+  cursor.execute(insert_query)
+  cursor.close()
+  connection.commit()
+  connection.close()
+
+if __name__ == '__main__':
+  asyncio.run(github_workflows_dashboard_sync(None, None))
diff --git a/.test-infra/mock-apis/README.md b/.test-infra/mock-apis/README.md
new file mode 100644
index 0000000000000..e2148d390a0af
--- /dev/null
+++ b/.test-infra/mock-apis/README.md
@@ -0,0 +1,234 @@
+
+# Overview
+
+This directory holds code and related artifacts to support API related
+integration tests.
+
+## System overview
+
+The diagram below summarizes the system design. Integration tests use an API
+client that makes calls to a backend service. Prior to fulfilling the response,
+the service checks and decrements a quota. Said quota persists in a backend
+redis instance that is refreshed on an interval by the
+[Refresher](./src/main/go/cmd/service/refresher).
+
+## Echo Service
+
+The [Echo Service](./src/main/go/cmd/service/echo) implements a simple gRPC
+service that echoes a payload. See [echo.proto](./proto/echo/v1/echo.proto)
+for details.
+
+```mermaid
+flowchart LR
+    echoClient --> echoSvc
+    subgraph "Integration Tests"
+        echoClient[Echo Client]
+    end
+    subgraph Backend
+        echoSvc[Echo Service<./src/main/go/cmd/service/echo>]
+        refresher[Refresher<./src/main/go/cmd/service/refresher>]
+        redis[redis://:6379]
+        refresher -- SetQuota(<string>,<int64>,<time.Duration>) --> redis
+        echoSvc -- DecrementQuota(<string>) --> redis
+    end
+```
+
+# Development Dependencies
+
+| Dependency                                          | Reason                                                                                 |
+|-----------------------------------------------------|----------------------------------------------------------------------------------------|
+| [go](https://go.dev)                                | For making code changes in this directory. See [go.mod](go.mod) for required version. |
+| [buf](https://github.com/bufbuild/buf#installation) | Optional for when making changes to proto.                                             |
+| [ko](https://ko.build/install/)                     | To easily build Go container images.                                                   |
+| [poetry](https://python-poetry.org/)                | To manage Python dependencies.                                                         |
+
+# Testing
+
+## Unit
+
+To run unit tests in this project, execute the following command:
+
+```
+go test ./src/main/go/internal/...
+```
+
+## Integration
+
+Integration tests require the following values.
+
+### Quota ID
+
+Each allocated quota corresponds to a unique ID known as the Quota ID.
+Each allocated quota maps one-to-one to an overlay in
+[infrastructure/kubernetes/refresher/overlays](infrastructure/kubernetes/refresher/overlays).
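+
+For illustration only, the sketch below shows how an integration test might
+exercise the Echo service with a Quota ID. The generated Go package path and
+the message field names are assumptions inferred from the
+[echo.proto](./proto/echo/v1/echo.proto) naming conventions and
+[buf.gen.yaml](buf.gen.yaml) output directories, not guarantees of this
+repository's layout:
+
+```go
+package main
+
+import (
+	"context"
+	"log"
+	"time"
+
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/credentials/insecure"
+
+	// Assumed import path for the buf-generated stubs (see buf.gen.yaml).
+	echov1 "github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/proto/echo/v1"
+)
+
+func main() {
+	// Dial the port-forwarded gRPC endpoint (see Service Endpoint below).
+	conn, err := grpc.Dial("localhost:50051",
+		grpc.WithTransportCredentials(insecure.NewCredentials()))
+	if err != nil {
+		log.Fatalf("dial: %v", err)
+	}
+	defer conn.Close()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	// Assumption: the request id carries the Quota ID that the service
+	// decrements on each call.
+	client := echov1.NewEchoServiceClient(conn)
+	resp, err := client.Echo(ctx, &echov1.EchoRequest{
+		Id:      "echo-10-per-1s-quota",
+		Payload: []byte("ping"),
+	})
+	if err != nil {
+		log.Fatalf("echo: %v", err) // expected once the quota is spent
+	}
+	log.Printf("echoed %d bytes", len(resp.Payload))
+}
+```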
+
+To query the Kubernetes cluster for allocated Quota IDs:
+```
+kubectl get deploy --selector=app.kubernetes.io/name=refresher -o custom-columns='QUOTA_ID:.metadata.labels.quota-id'
+```
+
+### Service Endpoint
+
+To list available endpoints, run:
+
+```
+kubectl get svc -o=custom-columns='NAME:.metadata.name,HOST:.status.loadBalancer.ingress[*].ip,PORT_NAME:.spec.ports[*].name,PORT:.spec.ports[*].port'
+```
+
+You should see something similar to:
+
+```
+NAME   HOST       PORT_NAME   PORT
+echo   10.n.n.n   grpc,http   50051,8080
+```
+
+When running tests locally, first run:
+```
+kubectl port-forward service/echo 50051:50051 8080:8080
+```
+
+This allows you to access the gRPC service via `localhost:50051` and the HTTP
+endpoint via `http://localhost:8080/v1/echo`.
+
+When running tests on Dataflow, supply `10.n.n.n:50051` for gRPC and
+`http://10.n.n.n:8080/v1/echo` for HTTP.
+
+# Local Usage
+
+## Requirements
+
+To execute the services on your local machine, you'll need [redis](https://redis.io/docs/getting-started/installation/).
+
+## Execute services
+
+Follow these steps to run the services on your local machine.
+
+1. Start redis
+
+   Start redis using the following command.
+   ```
+   redis-server
+   ```
+
+1. Start the refresher service in a new terminal.
+   ```
+   export CACHE_HOST=localhost:6379; \
+   export QUOTA_ID=$(uuidgen); \
+   export QUOTA_REFRESH_INTERVAL=10s; \
+   export QUOTA_SIZE=100; \
+   go run ./src/main/go/cmd/service/refresher
+   ```
+1. Start the echo service in a new terminal.
+   ```
+   export HTTP_PORT=8080; \
+   export GRPC_PORT=50051; \
+   export CACHE_HOST=localhost:6379; \
+   go run ./src/main/go/cmd/service/echo
+   ```
+
+# Deployment
+
+The following has already been performed for the `apache-beam-testing` project
+and only needs to be done for a different Google Cloud project.
+
+To deploy the APIs and dependent services, run the following commands.
+
+## 1. Provision dependent resources in Google Cloud.
+
+```
+terraform -chdir=infrastructure/terraform init
+terraform -chdir=infrastructure/terraform apply -var-file=apache-beam-testing.tfvars
+```
+
+## 2. Set the KO_DOCKER_REPO environment variable.
+
+After the terraform module completes, you will need to set the following:
+
+```
+export KO_DOCKER_REPO=<region>-docker.pkg.dev/<project>/<repository>
+```
+
+where:
+
+- `<region>` - is the GCP compute region
+- `<project>` - is the GCP project id e.g. `apache-beam-testing`
+- `<repository>` - is the repository name created by the terraform module. To
+find this, run:
+`gcloud artifacts repositories list --project=<project> --location=<region>`.
+For example,
+`gcloud artifacts repositories list --project=apache-beam-testing --location=us-west1`
+
+## 3. Connect to the Kubernetes cluster
+
+Run the following command to set up credentials to the Kubernetes cluster.
+
+```
+gcloud container clusters get-credentials <cluster> --region <region> --project <project>
+```
+
+where:
+- `<region>` - is the GCP compute region
+- `<project>` - is the GCP project id e.g. `apache-beam-testing`
+- `<cluster>` - is the name of the cluster created by the terraform module.
+You can find this by running `gcloud container clusters list --project=<project> --region=<region>`
+
+## 4. Provision the Redis instance
+
+```
+kubectl kustomize --enable-helm infrastructure/kubernetes/redis | kubectl apply -f -
+```
+
+**You will initially see "Unschedulable" while the cluster applies the helm
+chart. It's important to wait until the helm chart completely provisions its
+resources before proceeding; Google Kubernetes Engine (GKE) Autopilot may take
+some time to autoscale appropriately.**
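+
+As a quick sanity check that the quota cache is reachable, the hedged Go
+sketch below reads a quota key through
+[go-redis](https://github.com/redis/go-redis), which is already a dependency
+in [go.mod](go.mod). It assumes you have port-forwarded redis to
+`localhost:6379` and that the Refresher stores each quota under its Quota ID;
+adjust the key to match your overlay:
+
+```go
+package main
+
+import (
+	"context"
+	"log"
+
+	"github.com/redis/go-redis/v9"
+)
+
+func main() {
+	ctx := context.Background()
+	rdb := redis.NewClient(&redis.Options{Addr: "localhost:6379"})
+
+	// Fails fast if the instance is not yet schedulable or reachable.
+	if err := rdb.Ping(ctx).Err(); err != nil {
+		log.Fatalf("redis not reachable: %v", err)
+	}
+
+	// Assumed key layout: the Refresher writes the remaining quota
+	// under the Quota ID (see the refresher overlays).
+	val, err := rdb.Get(ctx, "echo-10-per-1s-quota").Result()
+	if err != nil {
+		log.Fatalf("quota key not found (has the Refresher run?): %v", err)
+	}
+	log.Printf("remaining quota: %s", val)
+}
+```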
+
+## 5. Provision the Echo service
+
+Run the following command to provision the Echo service.
+
+```
+kubectl kustomize infrastructure/kubernetes/echo | ko resolve -f - | kubectl apply -f -
+```
+
+As before, you may see a "Does not have minimum availability" message in the
+status. It may take some time for GKE Autopilot to scale the node pool.
+
+## 6. Provision the Refresher services
+
+The Refresher service relies on [kustomize](https://kustomize.io) overlays
+which are located at [infrastructure/kubernetes/refresher/overlays](infrastructure/kubernetes/refresher/overlays).
+
+Each folder contained in [infrastructure/kubernetes/refresher/overlays](infrastructure/kubernetes/refresher/overlays)
+corresponds to an individual Refresher instance that is identified by a unique
+string id. You will need to deploy each one individually.
+
+For example:
+```
+kubectl kustomize infrastructure/kubernetes/refresher/overlays/echo-should-never-exceed-quota | ko resolve -f - | kubectl apply -f -
+```
+
+As before, you may see a "Does not have minimum availability" message in the
+status. It may take some time for GKE Autopilot to scale the node pool.
diff --git a/.test-infra/mock-apis/buf.gen.yaml b/.test-infra/mock-apis/buf.gen.yaml
new file mode 100644
index 0000000000000..e5f6e51c14d8f
--- /dev/null
+++ b/.test-infra/mock-apis/buf.gen.yaml
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# buf.gen.yaml configures proto stub generation using buf.
+#
+# Requirements:
+# - go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
+# - go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
+# - https://github.com/grpc/grpc-java/blob/master/compiler/README.md#grpc-java-codegen-plugin-for-protobuf-compiler
+# - https://grpc.io/docs/languages/python/quickstart/#grpc-tools
+#
+# Usage:
+# Open a terminal in the same directory as this file and run:
+#
+# buf generate
+#
+# See https://buf.build/docs/ for more details.
+
+version: v1
+plugins:
+- name: go
+  out: src/main/go/internal
+- name: go-grpc
+  out: src/main/go/internal
+- name: java
+  out: src/main/java
+- name: grpc-java
+  out: src/main/java
+- plugin: buf.build/protocolbuffers/python:v24.4
+  out: src/main/python
+- plugin: buf.build/grpc/python:v1.59.1
+  out: src/main/python
\ No newline at end of file
diff --git a/.test-infra/mock-apis/buf.lock b/.test-infra/mock-apis/buf.lock
new file mode 100644
index 0000000000000..1304ceb9973e0
--- /dev/null
+++ b/.test-infra/mock-apis/buf.lock
@@ -0,0 +1,7 @@
+# Generated by buf. DO NOT EDIT.
+version: v1 +deps: + - remote: buf.build + owner: googleapis + repository: googleapis + commit: 28151c0d0a1641bf938a7672c500e01d diff --git a/.test-infra/mock-apis/buf.yaml b/.test-infra/mock-apis/buf.yaml new file mode 100644 index 0000000000000..419e020247f94 --- /dev/null +++ b/.test-infra/mock-apis/buf.yaml @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Configures buf to include various proto dependencies. +# See buf.build for details. +version: v1 +deps: +- buf.build/googleapis/googleapis \ No newline at end of file diff --git a/.test-infra/mock-apis/build.gradle b/.test-infra/mock-apis/build.gradle new file mode 100644 index 0000000000000..64b7e8c614cc3 --- /dev/null +++ b/.test-infra/mock-apis/build.gradle @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + plugins { + id 'org.apache.beam.module' + } + + applyJavaNature( + exportJavadoc: false, + publish: false, + ) + +description = "Apache Beam :: Test Infra :: Mock APIs" +ext.summary = "Supports API related integration tests." + +def guavaVersion = "31.1-jre" +def ioGrpcApiVersion = "1.53.0" +def protobufVersion = "1.55.1" +def protobufJavaVersion = "3.23.2" + + dependencies { + + // Required by autogenerated proto classes. + implementation "io.grpc:grpc-api:${ioGrpcApiVersion}" + implementation "com.google.guava:guava:${guavaVersion}" + implementation "io.grpc:grpc-protobuf:${protobufVersion}" + implementation "com.google.protobuf:protobuf-java:${protobufJavaVersion}" + implementation "io.grpc:grpc-stub:${protobufVersion}" + } \ No newline at end of file diff --git a/.test-infra/mock-apis/go.mod b/.test-infra/mock-apis/go.mod new file mode 100644 index 0000000000000..cc65cfbaac76b --- /dev/null +++ b/.test-infra/mock-apis/go.mod @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. 
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This module contains the Go code for the mock API services in this
+// directory, such as the Echo and Refresher services under
+// src/main/go/cmd/service. It is kept separate from the Beam Go SDK module
+// so these test-only dependencies stay out of the SDK.
+module github.com/apache/beam/test-infra/mock-apis
+
+go 1.21
+
+require (
+	cloud.google.com/go/logging v1.8.1
+	cloud.google.com/go/monitoring v1.16.0
+	github.com/google/go-cmp v0.5.9
+	github.com/redis/go-redis/v9 v9.2.1
+	google.golang.org/genproto/googleapis/api v0.0.0-20230822172742-b8732ec3820d
+	google.golang.org/grpc v1.58.3
+	google.golang.org/protobuf v1.31.0
+)
+
+require (
+	cloud.google.com/go v0.110.6 // indirect
+	cloud.google.com/go/compute v1.23.0 // indirect
+	cloud.google.com/go/compute/metadata v0.2.3 // indirect
+	cloud.google.com/go/longrunning v0.5.1 // indirect
+	github.com/cespare/xxhash/v2 v2.2.0 // indirect
+	github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
+	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
+	github.com/golang/protobuf v1.5.3 // indirect
+	github.com/google/s2a-go v0.1.4 // indirect
+	github.com/googleapis/enterprise-certificate-proxy v0.2.4 // indirect
+	github.com/googleapis/gax-go/v2 v2.12.0 // indirect
+	go.opencensus.io v0.24.0 // indirect
+	golang.org/x/crypto v0.14.0 // indirect
+	golang.org/x/net v0.17.0 // indirect
+	golang.org/x/oauth2 v0.12.0 // indirect
+	golang.org/x/sync v0.3.0 // indirect
+	golang.org/x/sys v0.13.0 // indirect
+	golang.org/x/text v0.13.0 // indirect
+	google.golang.org/api v0.128.0 // indirect
+	google.golang.org/appengine v1.6.7 // indirect
+	google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5 // indirect
+	google.golang.org/genproto/googleapis/rpc v0.0.0-20230822172742-b8732ec3820d // indirect
+)
diff --git a/.test-infra/mock-apis/go.sum b/.test-infra/mock-apis/go.sum
new file mode 100644
index 0000000000000..a928e3dae2f1f
--- /dev/null
+++ b/.test-infra/mock-apis/go.sum
@@ -0,0 +1,214 @@
+cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
+cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
+cloud.google.com/go v0.110.6 h1:8uYAkj3YHTP/1iwReuHPxLSbdcyc+dSBbzFMrVwDR6Q=
+cloud.google.com/go v0.110.6/go.mod h1:+EYjdK8e5RME/VY/qLCAtuyALQ9q67dvuum8i+H5xsI=
+cloud.google.com/go/compute v1.23.0 h1:tP41Zoavr8ptEqaW6j+LQOnyBBhO7OkOMAGrgLopTwY=
+cloud.google.com/go/compute v1.23.0/go.mod h1:4tCnrn48xsqlwSAiLf1HXMQk8CONslYbdiEZc9FEIbM=
+cloud.google.com/go/compute/metadata v0.2.3 h1:mg4jlk7mCAj6xXp9UJ4fjI9VUI5rubuGBW5aJ7UnBMY=
+cloud.google.com/go/compute/metadata v0.2.3/go.mod h1:VAV5nSsACxMJvgaAuX6Pk2AawlZn8kiOGuCv6gTkwuA=
+cloud.google.com/go/iam v1.1.1 h1:lW7fzj15aVIXYHREOqjRBV9PsH0Z6u8Y46a1YGvQP4Y=
+cloud.google.com/go/iam v1.1.1/go.mod
h1:A5avdyVL2tCppe4unb0951eI9jreack+RJ0/d+KUZOU= +cloud.google.com/go/logging v1.8.1 h1:26skQWPeYhvIasWKm48+Eq7oUqdcdbwsCVwz5Ys0FvU= +cloud.google.com/go/logging v1.8.1/go.mod h1:TJjR+SimHwuC8MZ9cjByQulAMgni+RkXeI3wwctHJEI= +cloud.google.com/go/longrunning v0.5.1 h1:Fr7TXftcqTudoyRJa113hyaqlGdiBQkp0Gq7tErFDWI= +cloud.google.com/go/longrunning v0.5.1/go.mod h1:spvimkwdz6SPWKEt/XBij79E9fiTkHSQl/fRUUQJYJc= +cloud.google.com/go/monitoring v1.16.0 h1:rlndy4K8yknMY9JuGe2aK4SbCh21FXoCdX7SAGHmRgI= +cloud.google.com/go/monitoring v1.16.0/go.mod h1:Ptp15HgAyM1fNICAojDMoNc/wUmn67mLHQfyqbw+poY= +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= +github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= +github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= +github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= +github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= +github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= +github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= +github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= +github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= +github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= +github.com/cncf/udpa/go v0.0.0-20210930031921-04548b0d99d4/go.mod h1:6pvJx4me5XPnfI9Z40ddWsdw2W/uZgQLFXToKeRcDiI= +github.com/cncf/xds/go v0.0.0-20210805033703-aa0b78936158/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= +github.com/cncf/xds/go v0.0.0-20210922020428-25de7278fc84/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= +github.com/cncf/xds/go v0.0.0-20211011173535-cb28da3451f1/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= +github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= +github.com/envoyproxy/go-control-plane v0.9.9-0.20201210154907-fd9021fe5dad/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk= +github.com/envoyproxy/go-control-plane v0.9.10-0.20210907150352-cf90f659a021/go.mod h1:AFq3mo9L8Lqqiid3OhADV3RfLJnjiw63cSpi+fDTRC0= +github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= +github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod 
h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= +github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= +github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= +github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= +github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= +github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= +github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/s2a-go v0.1.4 h1:1kZ/sQM3srePvKs3tXAvQzo66XfcReoqFpIpIccE7Oc= +github.com/google/s2a-go v0.1.4/go.mod h1:Ej+mSEMGRnqRzjc7VtF+jdBwYG5fuJfiZ8ELkjEwM0A= +github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/googleapis/enterprise-certificate-proxy v0.2.4 h1:uGy6JWR/uMIILU8wbf+OkstIrNiMjGpEIyhx8f6W7s4= +github.com/googleapis/enterprise-certificate-proxy v0.2.4/go.mod h1:AwSRAtLfXpU5Nm3pW+v7rGDHp09LsPtGY9MduiEsR9k= +github.com/googleapis/gax-go/v2 v2.12.0 h1:A+gCJKdRfqXkr+BIRGtZLibNXf0m1f9E4HG56etFpas= +github.com/googleapis/gax-go/v2 v2.12.0/go.mod h1:y+aIqrI5eb1YGMVJfuV3185Ts/D7qKpsEkdD5+I6QGU= +github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= 
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/redis/go-redis/v9 v9.2.1 h1:WlYJg71ODF0dVspZZCpYmoF1+U1Jjk9Rwd7pq6QmlCg= +github.com/redis/go-redis/v9 v9.2.1/go.mod h1:hdY0cQFCN4fnSYT6TkisLufl/4W5UIXyv0b/CLO2V2M= +github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= +go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= +go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.0.0-20220314234659-1baeb1ce4c0b/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= +golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc= +golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= +golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod 
h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= +golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= +golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= +golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.12.0 h1:smVPGxink+n1ZI5pkQa8y6fZT0RW0MgCO5bFpepy4B4= +golang.org/x/oauth2 v0.12.0/go.mod h1:A74bZ3aGXgCY0qaIC9Ahg6Lglin4AMAco8cIv9baba4= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod 
h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= +golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/api v0.128.0 h1:RjPESny5CnQRn9V6siglged+DZCgfu9l6mO9dkX9VOg= +google.golang.org/api v0.128.0/go.mod h1:Y611qgqaE92On/7g65MQgxYul3c0rEB894kniWLY750= +google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= +google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20200513103714-09dca8ec2884/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= +google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5 h1:L6iMMGrtzgHsWofoFcihmDEMYeDR9KN/ThbPWGrh++g= +google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5/go.mod h1:oH/ZOT02u4kWEp7oYBGYFFkCdKS/uYR9Z7+0/xuuFp8= +google.golang.org/genproto/googleapis/api v0.0.0-20230822172742-b8732ec3820d h1:DoPTO70H+bcDXcd39vOqb2viZxgqeBeSGtZ55yZU4/Q= +google.golang.org/genproto/googleapis/api v0.0.0-20230822172742-b8732ec3820d/go.mod h1:KjSP20unUpOx5kyQUFa7k4OJg0qeJ7DEZflGDu2p6Bk= +google.golang.org/genproto/googleapis/rpc v0.0.0-20230822172742-b8732ec3820d h1:uvYuEyMHKNt+lT4K3bN6fGswmK8qSvcreM3BwjDh+y4= +google.golang.org/genproto/googleapis/rpc v0.0.0-20230822172742-b8732ec3820d/go.mod h1:+Bk1OCOj40wS2hwAMA+aCW9ypzm63QTBBHp6lQ3p+9M= +google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= +google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= 
+google.golang.org/grpc v1.33.1/go.mod h1:fr5YgcSWrqhRRxogOsw7RzIpsmvOZ6IcH4kBYTpR3n0= +google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= +google.golang.org/grpc v1.36.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= +google.golang.org/grpc v1.45.0/go.mod h1:lN7owxKUQEqMfSyQikvvk5tf/6zMPsrK+ONuO11+0rQ= +google.golang.org/grpc v1.58.3 h1:BjnpXut1btbtgN/6sp+brB2Kbm2LjNXnidYujAVbSoQ= +google.golang.org/grpc v1.58.3/go.mod h1:tgX3ZQDlNJGU96V6yHh1T/JeoBQ2TXdr43YbYSsCJk0= +google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= +google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= +google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= +google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= +google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= +google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= +google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.3/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/.test-infra/mock-apis/infrastructure/kubernetes/echo/configmap.yaml b/.test-infra/mock-apis/infrastructure/kubernetes/echo/configmap.yaml new file mode 100644 index 0000000000000..831bca3e67c02 --- /dev/null +++ b/.test-infra/mock-apis/infrastructure/kubernetes/echo/configmap.yaml @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Configures the Echo Service ConfigMap. +# See https://github.com/apache/beam/blob/master/.test-infra/mock-apis/src/main/go/cmd/service/echo/main.go +# for details on the Echo service executable and +# https://github.com/apache/beam/blob/master/.test-infra/mock-apis/src/main/go/internal/environment/variable.go +# for details on various environment variables. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: echo +data: + HTTP_PORT: "8080" + GRPC_PORT: "50051" + # See .test-infra/mock-apis/infrastructure/kubernetes/redis + CACHE_HOST: redis-master.default.svc.cluster.local:6379 diff --git a/.test-infra/mock-apis/infrastructure/kubernetes/echo/deployment.yaml b/.test-infra/mock-apis/infrastructure/kubernetes/echo/deployment.yaml new file mode 100644 index 0000000000000..55f246362d4b7 --- /dev/null +++ b/.test-infra/mock-apis/infrastructure/kubernetes/echo/deployment.yaml @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Configures the Echo Deployment on the Kubernetes cluster. +# See https://github.com/apache/beam/blob/master/.test-infra/mock-apis/src/main/go/cmd/service/echo/main.go +# for details on the Echo service executable. +# Assumes usage of https://ko.build/ to resolve the manifest: +# export KO_DOCKER_REPO=-docker.pkg.dev// +# kubectl kustomize .test-infra/mock-apis/infrastructure/echo | ko resolve -f - | kubectl apply -f - +# See .test-infra/mock-apis/README.md for details + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: echo + labels: + app: echo +spec: + replicas: 3 + selector: + matchLabels: + app: echo + template: + metadata: + labels: + app: echo + spec: + containers: + - name: echo + # Prefixed with ko:// to resolve with the ko utility. + # See https://ko.build/features/k8s/ for details. + image: ko://github.com/apache/beam/test-infra/mock-apis/src/main/go/cmd/service/echo + imagePullPolicy: IfNotPresent + envFrom: + - configMapRef: + name: echo diff --git a/.test-infra/mock-apis/infrastructure/kubernetes/echo/kustomization.yaml b/.test-infra/mock-apis/infrastructure/kubernetes/echo/kustomization.yaml new file mode 100644 index 0000000000000..2bfdec218afcd --- /dev/null +++ b/.test-infra/mock-apis/infrastructure/kubernetes/echo/kustomization.yaml @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Configures the provisioning of the Echo service on the Kubernetes cluster.
+# See https://github.com/apache/beam/blob/master/.test-infra/mock-apis/src/main/go/cmd/service/echo/main.go
+# for details on the Echo service executable.
+
+resources:
+- configmap.yaml
+- deployment.yaml
+- service.yaml
diff --git a/.test-infra/mock-apis/infrastructure/kubernetes/echo/service.yaml b/.test-infra/mock-apis/infrastructure/kubernetes/echo/service.yaml
new file mode 100644
index 0000000000000..da9a811146e53
--- /dev/null
+++ b/.test-infra/mock-apis/infrastructure/kubernetes/echo/service.yaml
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Configures the provisioning of the Echo Service on the Kubernetes cluster.
+# See https://github.com/apache/beam/blob/master/.test-infra/mock-apis/src/main/go/cmd/service/echo/main.go
+# for details on the Echo service executable.
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: echo
+  annotations:
+    # Configures the LoadBalancer to assign an internal private IP
+    # instead of an external public IP.
+    # See https://cloud.google.com/kubernetes-engine/docs/how-to/internal-load-balancing
+    networking.gke.io/load-balancer-type: "Internal"
+spec:
+  type: LoadBalancer
+  externalTrafficPolicy: Cluster
+  selector:
+    app: echo
+  # Ports must match the environment variables assigned in the ConfigMap/echo.
+  # See configmap.yaml.
+  ports:
+  - port: 50051
+    name: grpc
+    targetPort: 50051
+  - port: 8080
+    name: http
+    targetPort: 8080
diff --git a/.test-infra/mock-apis/infrastructure/kubernetes/redis/kustomization.yaml b/.test-infra/mock-apis/infrastructure/kubernetes/redis/kustomization.yaml
new file mode 100644
index 0000000000000..9d3c3256b696b
--- /dev/null
+++ b/.test-infra/mock-apis/infrastructure/kubernetes/redis/kustomization.yaml
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Configures a Redis instance using https://bitnami.com/stack/redis/helm. +helmCharts: + - name: redis + releaseName: redis + repo: https://charts.bitnami.com/bitnami + version: 18.1.5 + valuesFile: redis-values.yaml diff --git a/.test-infra/mock-apis/infrastructure/kubernetes/redis/redis-values.yaml b/.test-infra/mock-apis/infrastructure/kubernetes/redis/redis-values.yaml new file mode 100644 index 0000000000000..147534d4a22af --- /dev/null +++ b/.test-infra/mock-apis/infrastructure/kubernetes/redis/redis-values.yaml @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Assigns values to the https://bitnami.com/stack/redis/helm chart. +auth: + # The cluster is used for testing only in a private Google Kubernetes Engine + # (GKE) cluster. So setting enabled to false delegates role based + # access control to Google Cloud Identity and Access Management (IAM). + enabled: false + + # We set sentinel to false, since we do not need high availability. + # See https://developer.redis.com/operate/redis-at-scale/high-availability/understanding-sentinels/ + # for more details on the sentinel mode. + sentinel: false \ No newline at end of file diff --git a/.test-infra/mock-apis/infrastructure/kubernetes/refresher/base/configmap.yaml b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/base/configmap.yaml new file mode 100644 index 0000000000000..eebb099b23b59 --- /dev/null +++ b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/base/configmap.yaml @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Configures the Refresher ConfigMap. 
+# See https://github.com/apache/beam/blob/master/.test-infra/mock-apis/src/main/go/cmd/service/refresher/main.go +# Designed for use with kustomize patch overlays. +# See https://kubectl.docs.kubernetes.io/references/kustomize/kustomization/patches/ + +apiVersion: v1 +kind: ConfigMap +metadata: + name: refresher + labels: + app.kubernetes.io/name: refresher + + # targeted for overlay replacement + quota-id: quota-id-value +data: + CACHE_HOST: redis-master.default.svc.cluster.local:6379 + + # targeted for overlay replacement + QUOTA_ID: quota-id-value + QUOTA_SIZE: "100" + QUOTA_REFRESH_INTERVAL: "10s" + diff --git a/.test-infra/mock-apis/infrastructure/kubernetes/refresher/base/deployment.yaml b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/base/deployment.yaml new file mode 100644 index 0000000000000..d89c4f0601f60 --- /dev/null +++ b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/base/deployment.yaml @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Configures the Refresher Deployment. +# See https://github.com/apache/beam/blob/master/.test-infra/mock-apis/src/main/go/cmd/service/refresher/main.go +# Designed for use with kustomize patch overlays. +# See https://kubectl.docs.kubernetes.io/references/kustomize/kustomization/patches/ + +apiVersion: apps/v1 +kind: Deployment +metadata: + # name created using kustomize nameSuffix as refresher- + name: refresher + + labels: + app.kubernetes.io/name: refresher + + # targeted for overlay replacement + quota-id: quota-id-value +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: refresher + + # targeted for overlay replacement + quota-id: quota-id-value + template: + metadata: + labels: + app.kubernetes.io/name: refresher + + # targeted for overlay replacement + quota-id: quota-id-value + spec: + containers: + - name: refresher + image: ko://github.com/apache/beam/test-infra/mock-apis/src/main/go/cmd/service/refresher + imagePullPolicy: IfNotPresent + envFrom: + - configMapRef: + name: refresher \ No newline at end of file diff --git a/.test-infra/mock-apis/infrastructure/kubernetes/refresher/base/kustomization.yaml b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/base/kustomization.yaml new file mode 100644 index 0000000000000..8dedf1dd485d5 --- /dev/null +++ b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/base/kustomization.yaml @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Configures the provisioning of the Refresher service on the Kubernetes cluster.
+# See https://github.com/apache/beam/blob/master/.test-infra/mock-apis/src/main/go/cmd/service/refresher/main.go
+# for details on the Refresher service executable.
+
+resources:
+- configmap.yaml
+- deployment.yaml
diff --git a/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-10-per-1s-quota/configmap.yaml b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-10-per-1s-quota/configmap.yaml
new file mode 100644
index 0000000000000..d78c862c2afdc
--- /dev/null
+++ b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-10-per-1s-quota/configmap.yaml
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Configures patch for ../base/configmap.yaml
+# See https://kubectl.docs.kubernetes.io/references/kustomize/kustomization/patches/
+
+- op: replace
+  path: /metadata/labels/quota-id
+  value: echo-10-per-1s-quota
+- op: replace
+  path: /data/QUOTA_ID
+  value: echo-10-per-1s-quota
+- op: replace
+  path: /data/QUOTA_SIZE
+  value: "10"
+- op: replace
+  path: /data/QUOTA_REFRESH_INTERVAL
+  value: 1s
diff --git a/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-10-per-1s-quota/deployment.yaml b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-10-per-1s-quota/deployment.yaml
new file mode 100644
index 0000000000000..37af48ec97d4b
--- /dev/null
+++ b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-10-per-1s-quota/deployment.yaml
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +# Configures patch for ../base/deployment.yaml +# See https://kubectl.docs.kubernetes.io/references/kustomize/kustomization/patches/ + +- op: replace + path: /metadata/labels/quota-id + value: echo-10-per-1s-quota +- op: replace + path: /spec/selector/matchLabels/quota-id + value: echo-10-per-1s-quota +- op: replace + path: /spec/template/metadata/labels/quota-id + value: echo-10-per-1s-quota diff --git a/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-10-per-1s-quota/kustomization.yaml b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-10-per-1s-quota/kustomization.yaml new file mode 100644 index 0000000000000..d506983544355 --- /dev/null +++ b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-10-per-1s-quota/kustomization.yaml @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Configures the overlay for .test-infra/mock-apis/infrastructure/kubernetes/refresher/base +# Using the Quota Id: +# echo-10-per-1s-quota + +resources: +- ../../base + +nameSuffix: -echo-10-per-1s-quota + +patches: +- path: configmap.yaml + target: + kind: ConfigMap + name: refresher + +- path: deployment.yaml + target: + kind: Deployment + name: refresher diff --git a/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-should-exceed-quota/configmap.yaml b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-should-exceed-quota/configmap.yaml new file mode 100644 index 0000000000000..274ae43ebb895 --- /dev/null +++ b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-should-exceed-quota/configmap.yaml @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+# Configures patch for ../base/configmap.yaml
+# See https://kubectl.docs.kubernetes.io/references/kustomize/kustomization/patches/
+
+- op: replace
+  path: /metadata/labels/quota-id
+  value: echo-should-exceed-quota
+- op: replace
+  path: /data/QUOTA_ID
+  value: echo-should-exceed-quota
+- op: replace
+  path: /data/QUOTA_SIZE
+  # The size must be at least 1; keeping it at this minimum makes the quota easy to exceed
+  value: "1"
diff --git a/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-should-exceed-quota/deployment.yaml b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-should-exceed-quota/deployment.yaml
new file mode 100644
index 0000000000000..e903a6c7c29c0
--- /dev/null
+++ b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-should-exceed-quota/deployment.yaml
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Configures patch for ../base/deployment.yaml
+# See https://kubectl.docs.kubernetes.io/references/kustomize/kustomization/patches/
+
+- op: replace
+  path: /metadata/labels/quota-id
+  value: echo-should-exceed-quota
+- op: replace
+  path: /spec/selector/matchLabels/quota-id
+  value: echo-should-exceed-quota
+- op: replace
+  path: /spec/template/metadata/labels/quota-id
+  value: echo-should-exceed-quota
diff --git a/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-should-exceed-quota/kustomization.yaml b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-should-exceed-quota/kustomization.yaml
new file mode 100644
index 0000000000000..9330ea4c6c786
--- /dev/null
+++ b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-should-exceed-quota/kustomization.yaml
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
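+
+# kustomize appends the nameSuffix declared below to each base resource name,
+# e.g. the base ConfigMap becomes refresher-echo-should-exceed-quota.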
+ +# Configures the overlay for .test-infra/mock-apis/infrastructure/kubernetes/refresher/base +# Using the Quota Id: +# echo-should-exceed-quota + +resources: +- ../../base + +nameSuffix: -echo-should-exceed-quota + +patches: +- path: configmap.yaml + target: + kind: ConfigMap + name: refresher + +- path: deployment.yaml + target: + kind: Deployment + name: refresher diff --git a/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-should-never-exceed-quota/configmap.yaml b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-should-never-exceed-quota/configmap.yaml new file mode 100644 index 0000000000000..409d83a812691 --- /dev/null +++ b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-should-never-exceed-quota/configmap.yaml @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Configures patch for ../base/configmap.yaml +# See https://kubectl.docs.kubernetes.io/references/kustomize/kustomization/patches/ + +- op: replace + path: /metadata/labels/quota-id + value: echo-should-never-exceed-quota +- op: replace + path: /data/QUOTA_ID + value: echo-should-never-exceed-quota +- op: replace + path: /data/QUOTA_SIZE + value: "1000000000" diff --git a/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-should-never-exceed-quota/deployment.yaml b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-should-never-exceed-quota/deployment.yaml new file mode 100644 index 0000000000000..d550adf02048d --- /dev/null +++ b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-should-never-exceed-quota/deployment.yaml @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Configures patch for ../base/deployment.yaml +# See https://kubectl.docs.kubernetes.io/references/kustomize/kustomization/patches/ + +- op: replace + path: /metadata/labels/quota-id + value: echo-should-never-exceed-quota +- op: replace + path: /spec/selector/matchLabels/quota-id + value: echo-should-never-exceed-quota +- op: replace + path: /spec/template/metadata/labels/quota-id + value: echo-should-never-exceed-quota diff --git a/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-should-never-exceed-quota/kustomization.yaml b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-should-never-exceed-quota/kustomization.yaml new file mode 100644 index 0000000000000..1f8d23ba01bd3 --- /dev/null +++ b/.test-infra/mock-apis/infrastructure/kubernetes/refresher/overlays/echo-should-never-exceed-quota/kustomization.yaml @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Configures the overlay for .test-infra/mock-apis/infrastructure/kubernetes/refresher/base +# Using the Quota Id: +# echo-should-never-exceed-quota + +resources: +- ../../base + +nameSuffix: -echo-should-never-exceed-quota + +patches: +- path: configmap.yaml + target: + kind: ConfigMap + name: refresher + +- path: deployment.yaml + target: + kind: Deployment + name: refresher diff --git a/.test-infra/mock-apis/infrastructure/terraform/.terraform.lock.hcl b/.test-infra/mock-apis/infrastructure/terraform/.terraform.lock.hcl new file mode 100644 index 0000000000000..03be0106251e2 --- /dev/null +++ b/.test-infra/mock-apis/infrastructure/terraform/.terraform.lock.hcl @@ -0,0 +1,40 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/hashicorp/google" { + version = "5.3.0" + hashes = [ + "h1:FGOSaAf5Fcw2GVPlGMlytcmhDxnSt3f2PfAewDS9km4=", + "zh:17849daec20cd82de916c897c730285267c62b5291bc24cd3fbdac5d10be746a", + "zh:1bab50e2eb7382e7342095417a1119e65dee1b62a5c0d93f8df724be4421c3fd", + "zh:3a800e3ea8de0d2b3b69f3256461878a5e0a6cfd0801fd762a087578ad42a207", + "zh:3dc70168baa91f6815a7e1885c4e29cadd2c67f41d9267a9278b6626c8fac594", + "zh:4000c3e16ea1bc3b5636ec18dba080135a90c0d4365597331ead9f30860041af", + "zh:58d812b8869158b2bf9c4a1a9676b6283a1914104234e8e70c36d4e1985abded", + "zh:908ff6a2a144ee76f4b68ce88164533343b2f860b8ee510107ff8e026856f5c1", + "zh:b606b6516151a947b7d9485cf330366b9c1b439677f8732cae6677cc3dc0a71f", + "zh:b623cda8316699b40db50081f79e361935d6b66b07d9dd607ed3598e51a8ffdf", + "zh:e99693fc83a8017dab5136d41a688777bb1e76076e837f2039fd6d69fe5dcfc4", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + "zh:fd7a7e58aa0baa9f3dd05ec693a2849ed8f724c34b8c42b3cbc4919399e622cd", + ] +} + +provider "registry.terraform.io/hashicorp/random" { + version = "3.5.1" + hashes = [ + "h1:VSnd9ZIPyfKHOObuQCaKfnjIHRtR7qTw19Rz8tJxm+k=", + "zh:04e3fbd610cb52c1017d282531364b9c53ef72b6bc533acb2a90671957324a64", + "zh:119197103301ebaf7efb91df8f0b6e0dd31e6ff943d231af35ee1831c599188d", + "zh:4d2b219d09abf3b1bb4df93d399ed156cadd61f44ad3baf5cf2954df2fba0831", + "zh:6130bdde527587bbe2dcaa7150363e96dbc5250ea20154176d82bc69df5d4ce3", + "zh:6cc326cd4000f724d3086ee05587e7710f032f94fc9af35e96a386a1c6f2214f", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:b6d88e1d28cf2dfa24e9fdcc3efc77adcdc1c3c3b5c7ce503a423efbdd6de57b", + "zh:ba74c592622ecbcef9dc2a4d81ed321c4e44cddf7da799faa324da9bf52a22b2", + "zh:c7c5cde98fe4ef1143bd1b3ec5dc04baf0d4cc3ca2c5c7d40d17c0e9b2076865", + "zh:dac4bad52c940cd0dfc27893507c1e92393846b024c5a9db159a93c534a3da03", + "zh:de8febe2a2acd9ac454b844a4106ed295ae9520ef54dc8ed2faf29f12716b602", + "zh:eab0d0495e7e711cca367f7d4df6e322e6c562fc52151ec931176115b83ed014", + ] +} diff --git a/.test-infra/mock-apis/infrastructure/terraform/artifact_registry.tf b/.test-infra/mock-apis/infrastructure/terraform/artifact_registry.tf new file mode 100644 index 0000000000000..347ab86310888 --- /dev/null +++ b/.test-infra/mock-apis/infrastructure/terraform/artifact_registry.tf @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +resource "google_artifact_registry_repository" "default" { + depends_on = [google_project_service.required] + format = "DOCKER" + repository_id = local.resource_name + location = var.region +} + +// Bind the node pool service account to the roles/artifactregistry.reader role. 
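+// This lets cluster nodes pull the ko-published service images from the
+// repository provisioned above.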
+resource "google_artifact_registry_repository_iam_member" "default" { + depends_on = [google_project_service.required] + member = "serviceAccount:${google_service_account.node_pool.email}" + repository = google_artifact_registry_repository.default.id + role = "roles/artifactregistry.reader" +} diff --git a/.test-infra/mock-apis/infrastructure/terraform/cluster.tf b/.test-infra/mock-apis/infrastructure/terraform/cluster.tf new file mode 100644 index 0000000000000..e2a3c734ebcb0 --- /dev/null +++ b/.test-infra/mock-apis/infrastructure/terraform/cluster.tf @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Provision the Kubernetes cluster. +resource "google_container_cluster" "default" { + depends_on = [google_project_service.required] + deletion_protection = false + name = local.resource_name + location = var.region + enable_autopilot = true + private_cluster_config { + enable_private_nodes = true + enable_private_endpoint = false + } + network = data.google_compute_network.default.id + subnetwork = data.google_compute_network.default.id + node_config { + service_account = google_service_account.node_pool.email + } +} diff --git a/.test-infra/mock-apis/infrastructure/terraform/iam.tf b/.test-infra/mock-apis/infrastructure/terraform/iam.tf new file mode 100644 index 0000000000000..7946e2fa687df --- /dev/null +++ b/.test-infra/mock-apis/infrastructure/terraform/iam.tf @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Provision a custom service account for the node pool. +resource "google_service_account" "node_pool" { + depends_on = [google_project_service.required] + account_id = local.resource_name +} + +// Bind minimally permissive IAM roles to the node pool service account. 
+// See https://cloud.google.com/kubernetes-engine/docs/how-to/hardening-your-cluster#permissions
+resource "google_project_iam_member" "node_pool" {
+  for_each = toset([
+    "roles/logging.logWriter",
+    "roles/monitoring.metricWriter",
+    "roles/monitoring.viewer",
+    "roles/stackdriver.resourceMetadata.writer",
+    "roles/autoscaling.metricsWriter"
+  ])
+  member  = "serviceAccount:${google_service_account.node_pool.email}"
+  project = var.project
+  role    = each.key
+}
diff --git a/.test-infra/mock-apis/infrastructure/terraform/prerequisites.tf b/.test-infra/mock-apis/infrastructure/terraform/prerequisites.tf
new file mode 100644
index 0000000000000..ae1534c67102e
--- /dev/null
+++ b/.test-infra/mock-apis/infrastructure/terraform/prerequisites.tf
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+locals {
+  resource_name = "${var.resource_name_prefix}-${random_string.postfix.result}"
+}
+
+resource "google_project_service" "required" {
+  for_each = toset([
+    "artifactregistry",
+    "cloudresourcemanager",
+    "container",
+    "iam",
+  ])
+  service            = "${each.key}.googleapis.com"
+  disable_on_destroy = false
+}
+
+resource "random_string" "postfix" {
+  length  = 6
+  special = false
+  upper   = false
+}
+
+// Query the VPC network.
+data "google_compute_network" "default" {
+  name = var.network
+}
+
+// Query the subnetwork and validate its configuration.
+data "google_compute_subnetwork" "default" {
+  name   = var.subnetwork
+  region = var.region
+  lifecycle {
+    postcondition {
+      condition     = self.private_ip_google_access
+      error_message = "The subnetwork: regions/${var.region}/subnetworks/${var.subnetwork} in projects/${var.project}/networks/${var.network} does not have private google access enabled"
+    }
+  }
+}
+
+// Verify that the router exists.
+data "google_compute_router" "default" {
+  name    = var.router
+  region  = var.region
+  network = data.google_compute_network.default.id
+}
diff --git a/.test-infra/mock-apis/infrastructure/terraform/provider.tf b/.test-infra/mock-apis/infrastructure/terraform/provider.tf
new file mode 100644
index 0000000000000..313d2f19fad02
--- /dev/null
+++ b/.test-infra/mock-apis/infrastructure/terraform/provider.tf
@@ -0,0 +1,19 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+provider "google" {
+  project = var.project
+  region  = var.region
+}
\ No newline at end of file
diff --git a/.test-infra/mock-apis/infrastructure/terraform/variables.tf b/.test-infra/mock-apis/infrastructure/terraform/variables.tf
new file mode 100644
index 0000000000000..bd73851e1b11a
--- /dev/null
+++ b/.test-infra/mock-apis/infrastructure/terraform/variables.tf
@@ -0,0 +1,44 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+variable "project" {
+  type        = string
+  description = "The Google Cloud (GCP) project ID within which this module provisions resources"
+}
+
+variable "region" {
+  type        = string
+  description = "The Compute region within which this module provisions resources"
+}
+
+variable "resource_name_prefix" {
+  type        = string
+  description = "The prefix applied to resource names, followed by a random string"
+}
+
+variable "router" {
+  type        = string
+  description = "The name of the Compute Network Router"
+}
+
+variable "network" {
+  type        = string
+  description = "The Virtual Private Cloud (VPC) network ID"
+}
+
+variable "subnetwork" {
+  type        = string
+  description = "The Virtual Private Cloud (VPC) subnetwork ID"
+}
diff --git a/.test-infra/mock-apis/poetry.lock b/.test-infra/mock-apis/poetry.lock
new file mode 100644
index 0000000000000..b36baff7a74b6
--- /dev/null
+++ b/.test-infra/mock-apis/poetry.lock
@@ -0,0 +1,219 @@
+# This file is automatically @generated by Poetry 1.7.0 and should not be changed by hand.
+
+[[package]]
+name = "beautifulsoup4"
+version = "4.12.2"
+description = "Screen-scraping library"
+optional = false
+python-versions = ">=3.6.0"
+files = [
+    {file = "beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"},
+    {file = "beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"},
+]
+
+[package.dependencies]
+soupsieve = ">1.2"
+
+[package.extras]
+html5lib = ["html5lib"]
+lxml = ["lxml"]
+
+[[package]]
+name = "google"
+version = "3.0.0"
+description = "Python bindings to the Google search engine."
+optional = false +python-versions = "*" +files = [ + {file = "google-3.0.0-py2.py3-none-any.whl", hash = "sha256:889cf695f84e4ae2c55fbc0cfdaf4c1e729417fa52ab1db0485202ba173e4935"}, + {file = "google-3.0.0.tar.gz", hash = "sha256:143530122ee5130509ad5e989f0512f7cb218b2d4eddbafbad40fd10e8d8ccbe"}, +] + +[package.dependencies] +beautifulsoup4 = "*" + +[[package]] +name = "grpcio" +version = "1.59.2" +description = "HTTP/2-based RPC framework" +optional = false +python-versions = ">=3.7" +files = [ + {file = "grpcio-1.59.2-cp310-cp310-linux_armv7l.whl", hash = "sha256:d2fa68a96a30dd240be80bbad838a0ac81a61770611ff7952b889485970c4c71"}, + {file = "grpcio-1.59.2-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:cf0dead5a2c5a3347af2cfec7131d4f2a2e03c934af28989c9078f8241a491fa"}, + {file = "grpcio-1.59.2-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:e420ced29b5904cdf9ee5545e23f9406189d8acb6750916c2db4793dada065c6"}, + {file = "grpcio-1.59.2-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b230028a008ae1d0f430acb227d323ff8a619017415cf334c38b457f814119f"}, + {file = "grpcio-1.59.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a4a3833c0e067f3558538727235cd8a49709bff1003200bbdefa2f09334e4b1"}, + {file = "grpcio-1.59.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6b25ed37c27e652db01be341af93fbcea03d296c024d8a0e680017a268eb85dd"}, + {file = "grpcio-1.59.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:73abb8584b0cf74d37f5ef61c10722adc7275502ab71789a8fe3cb7ef04cf6e2"}, + {file = "grpcio-1.59.2-cp310-cp310-win32.whl", hash = "sha256:d6f70406695e3220f09cd7a2f879333279d91aa4a8a1d34303b56d61a8180137"}, + {file = "grpcio-1.59.2-cp310-cp310-win_amd64.whl", hash = "sha256:3c61d641d4f409c5ae46bfdd89ea42ce5ea233dcf69e74ce9ba32b503c727e29"}, + {file = "grpcio-1.59.2-cp311-cp311-linux_armv7l.whl", hash = "sha256:3059668df17627f0e0fa680e9ef8c995c946c792612e9518f5cc1503be14e90b"}, + {file = "grpcio-1.59.2-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:72ca2399097c0b758198f2ff30f7178d680de8a5cfcf3d9b73a63cf87455532e"}, + {file = "grpcio-1.59.2-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:c978f864b35f2261e0819f5cd88b9830b04dc51bcf055aac3c601e525a10d2ba"}, + {file = "grpcio-1.59.2-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9411e24328a2302e279e70cae6e479f1fddde79629fcb14e03e6d94b3956eabf"}, + {file = "grpcio-1.59.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb7e0fe6ad73b7f06d7e2b689c19a71cf5cc48f0c2bf8608469e51ffe0bd2867"}, + {file = "grpcio-1.59.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c2504eed520958a5b77cc99458297cb7906308cb92327f35fb7fbbad4e9b2188"}, + {file = "grpcio-1.59.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2171c39f355ba5b551c5d5928d65aa6c69807fae195b86ef4a7d125bcdb860a9"}, + {file = "grpcio-1.59.2-cp311-cp311-win32.whl", hash = "sha256:d2794f0e68b3085d99b4f6ff9c089f6fdd02b32b9d3efdfbb55beac1bf22d516"}, + {file = "grpcio-1.59.2-cp311-cp311-win_amd64.whl", hash = "sha256:2067274c88bc6de89c278a672a652b4247d088811ece781a4858b09bdf8448e3"}, + {file = "grpcio-1.59.2-cp312-cp312-linux_armv7l.whl", hash = "sha256:535561990e075fa6bd4b16c4c3c1096b9581b7bb35d96fac4650f1181e428268"}, + {file = "grpcio-1.59.2-cp312-cp312-macosx_10_10_universal2.whl", hash = "sha256:a213acfbf186b9f35803b52e4ca9addb153fc0b67f82a48f961be7000ecf6721"}, + {file = "grpcio-1.59.2-cp312-cp312-manylinux_2_17_aarch64.whl", hash = 
"sha256:6959fb07e8351e20501ffb8cc4074c39a0b7ef123e1c850a7f8f3afdc3a3da01"}, + {file = "grpcio-1.59.2-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e82c5cf1495244adf5252f925ac5932e5fd288b3e5ab6b70bec5593074b7236c"}, + {file = "grpcio-1.59.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:023088764012411affe7db183d1ada3ad9daf2e23ddc719ff46d7061de661340"}, + {file = "grpcio-1.59.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:da2d94c15f88cd40d7e67f7919d4f60110d2b9d5b1e08cf354c2be773ab13479"}, + {file = "grpcio-1.59.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:6009386a2df66159f64ac9f20425ae25229b29b9dd0e1d3dd60043f037e2ad7e"}, + {file = "grpcio-1.59.2-cp312-cp312-win32.whl", hash = "sha256:75c6ecb70e809cf1504465174343113f51f24bc61e22a80ae1c859f3f7034c6d"}, + {file = "grpcio-1.59.2-cp312-cp312-win_amd64.whl", hash = "sha256:cbe946b3e6e60a7b4618f091e62a029cb082b109a9d6b53962dd305087c6e4fd"}, + {file = "grpcio-1.59.2-cp37-cp37m-linux_armv7l.whl", hash = "sha256:f8753a6c88d1d0ba64302309eecf20f70d2770f65ca02d83c2452279085bfcd3"}, + {file = "grpcio-1.59.2-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:f1ef0d39bc1feb420caf549b3c657c871cad4ebbcf0580c4d03816b0590de0cf"}, + {file = "grpcio-1.59.2-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:4c93f4abbb54321ee6471e04a00139c80c754eda51064187963ddf98f5cf36a4"}, + {file = "grpcio-1.59.2-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:08d77e682f2bf730a4961eea330e56d2f423c6a9b91ca222e5b1eb24a357b19f"}, + {file = "grpcio-1.59.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ff16d68bf453275466a9a46739061a63584d92f18a0f5b33d19fc97eb69867c"}, + {file = "grpcio-1.59.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4abb717e320e74959517dc8e84a9f48fbe90e9abe19c248541e9418b1ce60acd"}, + {file = "grpcio-1.59.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:36f53c2b3449c015880e7d55a89c992c357f176327b0d2873cdaaf9628a37c69"}, + {file = "grpcio-1.59.2-cp37-cp37m-win_amd64.whl", hash = "sha256:cc3e4cd087f07758b16bef8f31d88dbb1b5da5671d2f03685ab52dece3d7a16e"}, + {file = "grpcio-1.59.2-cp38-cp38-linux_armv7l.whl", hash = "sha256:27f879ae604a7fcf371e59fba6f3ff4635a4c2a64768bd83ff0cac503142fef4"}, + {file = "grpcio-1.59.2-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:7cf05053242f61ba94014dd3a986e11a083400a32664058f80bf4cf817c0b3a1"}, + {file = "grpcio-1.59.2-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:e1727c1c0e394096bb9af185c6923e8ea55a5095b8af44f06903bcc0e06800a2"}, + {file = "grpcio-1.59.2-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5d573e70a6fe77555fb6143c12d3a7d3fa306632a3034b4e7c59ca09721546f8"}, + {file = "grpcio-1.59.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31176aa88f36020055ace9adff2405a33c8bdbfa72a9c4980e25d91b2f196873"}, + {file = "grpcio-1.59.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:11168ef43e4a43ff1b1a65859f3e0ef1a173e277349e7fb16923ff108160a8cd"}, + {file = "grpcio-1.59.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:53c9aa5ddd6857c0a1cd0287225a2a25873a8e09727c2e95c4aebb1be83a766a"}, + {file = "grpcio-1.59.2-cp38-cp38-win32.whl", hash = "sha256:3b4368b33908f683a363f376dfb747d40af3463a6e5044afee07cf9436addf96"}, + {file = "grpcio-1.59.2-cp38-cp38-win_amd64.whl", hash = "sha256:0a754aff9e3af63bdc4c75c234b86b9d14e14a28a30c4e324aed1a9b873d755f"}, + {file = "grpcio-1.59.2-cp39-cp39-linux_armv7l.whl", hash = 
"sha256:1f9524d1d701e399462d2c90ba7c193e49d1711cf429c0d3d97c966856e03d00"}, + {file = "grpcio-1.59.2-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:f93dbf58f03146164048be5426ffde298b237a5e059144847e4940f5b80172c3"}, + {file = "grpcio-1.59.2-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:6da6dea3a1bacf99b3c2187e296db9a83029ed9c38fd4c52b7c9b7326d13c828"}, + {file = "grpcio-1.59.2-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c5f09cffa619adfb44799fa4a81c2a1ad77c887187613fb0a8f201ab38d89ba1"}, + {file = "grpcio-1.59.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c35aa9657f5d5116d23b934568e0956bd50c615127810fffe3ac356a914c176a"}, + {file = "grpcio-1.59.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:74100fecaec8a535e380cf5f2fb556ff84957d481c13e54051c52e5baac70541"}, + {file = "grpcio-1.59.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:128e20f57c5f27cb0157e73756d1586b83c1b513ebecc83ea0ac37e4b0e4e758"}, + {file = "grpcio-1.59.2-cp39-cp39-win32.whl", hash = "sha256:686e975a5d16602dc0982c7c703948d17184bd1397e16c8ee03511ecb8c4cdda"}, + {file = "grpcio-1.59.2-cp39-cp39-win_amd64.whl", hash = "sha256:242adc47725b9a499ee77c6a2e36688fa6c96484611f33b1be4c57ab075a92dd"}, + {file = "grpcio-1.59.2.tar.gz", hash = "sha256:d8f9cd4ad1be90b0cf350a2f04a38a36e44a026cac1e036ac593dc48efe91d52"}, +] + +[package.extras] +protobuf = ["grpcio-tools (>=1.59.2)"] + +[[package]] +name = "grpcio-tools" +version = "1.59.2" +description = "Protobuf code generator for gRPC" +optional = false +python-versions = ">=3.7" +files = [ + {file = "grpcio-tools-1.59.2.tar.gz", hash = "sha256:75905266cf90f1866b322575c2edcd4b36532c33fc512bb1b380dc58d84b1030"}, + {file = "grpcio_tools-1.59.2-cp310-cp310-linux_armv7l.whl", hash = "sha256:9b2885c0e2c9a97bde33497a919032afbd8b5c6dc2f8d4dd4198e77226e0de05"}, + {file = "grpcio_tools-1.59.2-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:2f410375830a9bb7140a07da4d75bf380e0958377bed50d77d1dae302de4314e"}, + {file = "grpcio_tools-1.59.2-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:e21fc172522d2dda815223a359b2aca9bc317a1b5e5dea5a58cd5079333af133"}, + {file = "grpcio_tools-1.59.2-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:072a7ce979ea4f7579c3c99fcbde3d1882c3d1942a3b51d159f67af83b714cd8"}, + {file = "grpcio_tools-1.59.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b38f8edb2909702c2478b52f6213982c21e4f66f739ac953b91f97863ba2c06a"}, + {file = "grpcio_tools-1.59.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:12fdee2de80d83eadb1294e0f8a0cb6cefcd2e4988ed680038ab09cd04361ee4"}, + {file = "grpcio_tools-1.59.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a3cb707da722a0b6c4021fc2cc1c005a8d4037d8ad0252f93df318b9b8a6b4f3"}, + {file = "grpcio_tools-1.59.2-cp310-cp310-win32.whl", hash = "sha256:ec2fbb02ebb9f2ae1b1c69cccf913dee8c41f5acad94014d3ce11b53720376e3"}, + {file = "grpcio_tools-1.59.2-cp310-cp310-win_amd64.whl", hash = "sha256:b0dc271a200dbab6547b2c73fcbdb7efe94c31cb633aa20d073f7cf4493493e1"}, + {file = "grpcio_tools-1.59.2-cp311-cp311-linux_armv7l.whl", hash = "sha256:d634b65cc8ee769edccf1647d8a16861a27e0d8cbd787c711168d2c5e9bddbd1"}, + {file = "grpcio_tools-1.59.2-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:b0b712acec00a9cbc2204c271d638062a2cb8ce74f25d158b023ff6e93182659"}, + {file = "grpcio_tools-1.59.2-cp311-cp311-manylinux_2_17_aarch64.whl", hash = 
"sha256:dd5c78f8e7c6e721b9009c92481a0e3b30a9926ef721120723a03b8a34a34fb9"}, + {file = "grpcio_tools-1.59.2-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:724f4f0eecc17fa66216eebfff145631070f04ed7fb4ddf7a7d1c4f954ecc2a1"}, + {file = "grpcio_tools-1.59.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77ec33ddee691e60511e2a7c793aad4cf172ae20e08d95c786cbba395f6203a7"}, + {file = "grpcio_tools-1.59.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:fa1b9dee7811fad081816e884d063c4dd4946dba61aa54243b4c76c311090c48"}, + {file = "grpcio_tools-1.59.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ba8dba19e7b2b6f7369004533866f222ba483b9e14d2d152ecf9339c0df1283a"}, + {file = "grpcio_tools-1.59.2-cp311-cp311-win32.whl", hash = "sha256:df35d145bc2f6e5f57b74cb69f66526675a5f2dcf7d54617ce0deff0c82cca0a"}, + {file = "grpcio_tools-1.59.2-cp311-cp311-win_amd64.whl", hash = "sha256:99ddc0f5304071a355c261ae49ea5d29b9e9b6dcf422dfc55ada70a243e27e8f"}, + {file = "grpcio_tools-1.59.2-cp312-cp312-linux_armv7l.whl", hash = "sha256:670f5889853215999eb3511a623dd7dff01b1ce1a64610d13366e0fd337f8c79"}, + {file = "grpcio_tools-1.59.2-cp312-cp312-macosx_10_10_universal2.whl", hash = "sha256:1e949e66d4555ce319fd7acef90df625138078d8729c4dc6f6a9f05925034433"}, + {file = "grpcio_tools-1.59.2-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:09d809ca88999b2578119683f9f0f6a9b42de95ea21550852114a1540b6a642c"}, + {file = "grpcio_tools-1.59.2-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:db0925545180223fabd6da9b34513efac83aa16673ef8b1cb0cc678e8cf0923c"}, + {file = "grpcio_tools-1.59.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2ccb59dfbf2ebd668a5a7c4b7bb2b859859641d2b199114b557cd045aac6102"}, + {file = "grpcio_tools-1.59.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:12cc7698fad48866f68fdef831685cb31ef5814ac605d248c4e5fc964a6fb3f6"}, + {file = "grpcio_tools-1.59.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:55c401599d5093c4cfa83b8f0ee9757b4d6d3029b10bd67be2cffeada7a44961"}, + {file = "grpcio_tools-1.59.2-cp312-cp312-win32.whl", hash = "sha256:896f5cdf58f658025a4f7e4ea96c81183b4b6a4b1b4d92ae66d112ac91f062f1"}, + {file = "grpcio_tools-1.59.2-cp312-cp312-win_amd64.whl", hash = "sha256:b53db1523015a3acda75722357df6c94afae37f6023800c608e09a5c05393804"}, + {file = "grpcio_tools-1.59.2-cp37-cp37m-linux_armv7l.whl", hash = "sha256:d08b398509ea4d544bcecddd9a21f59dc556396916c3915904cac206af2db72b"}, + {file = "grpcio_tools-1.59.2-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:09749e832e06493841000275248b031f7154665900d1e1b0e42fc17a64bf904d"}, + {file = "grpcio_tools-1.59.2-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:e972746000aa192521715f776fab617a3437bed29e90fe0e0fd0d0d6f498d7d4"}, + {file = "grpcio_tools-1.59.2-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cbeeb3d8ec4cb25c92e17bfbdcef3c3669e85c5ee787a6e581cb942bc0ae2b88"}, + {file = "grpcio_tools-1.59.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed8e6632d8d839456332d97b96db10bd2dbf3078e728d063394ac2d54597ad80"}, + {file = "grpcio_tools-1.59.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:531f87c8e884c6a2e58f040039dfbfe997a4e33baa58f7c7d9993db37b1f5ad0"}, + {file = "grpcio_tools-1.59.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:feca316e17cfead823af6eae0fc20c0d5299a94d71cfb7531a0e92d050a5fb2f"}, + {file = "grpcio_tools-1.59.2-cp37-cp37m-win_amd64.whl", hash 
= "sha256:41b5dd6a06c2563ac3b3adda6d875b15e63eb7b1629e85fc9af608c3a76c4c82"}, + {file = "grpcio_tools-1.59.2-cp38-cp38-linux_armv7l.whl", hash = "sha256:7ec536cdae870a74080c665cfb1dca8d0784a931aa3c26376ef971a3a51b59d4"}, + {file = "grpcio_tools-1.59.2-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:9c106ebbed0db446f59f0efe5c3fce33a0a21bf75b392966585e4b5934891b92"}, + {file = "grpcio_tools-1.59.2-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:32141ef309543a446337e934f0b7a2565a6fca890ff4e543630a09ef72c8d00b"}, + {file = "grpcio_tools-1.59.2-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5f2ce5ecd63c492949b03af73b1dd6d502c567cc2f9c2057137e518b0c702a01"}, + {file = "grpcio_tools-1.59.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a9ce2a209871ed1c5ae2229e6f4f5a3ea96d83b7871df5d9773d72a72545683"}, + {file = "grpcio_tools-1.59.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:7f0e26af7c07bfa906c91ca9f5932514928a7f032f5f20aecad6b5541037de7e"}, + {file = "grpcio_tools-1.59.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:48782727c5cff8b8c96e028a8a58614ff6a37eadc0db85866516210c7aafe9ae"}, + {file = "grpcio_tools-1.59.2-cp38-cp38-win32.whl", hash = "sha256:4a1810bc5de51cc162a19ed3c11da8ddc64d8cfcba049ef337c20fcb397f048b"}, + {file = "grpcio_tools-1.59.2-cp38-cp38-win_amd64.whl", hash = "sha256:3cf9949a2aadcece3c1e0dd59249aea53dbfc8cc94f7d707797acd67cf6cf931"}, + {file = "grpcio_tools-1.59.2-cp39-cp39-linux_armv7l.whl", hash = "sha256:f52e0ce8f2dcf1f160c847304016c446075a83ab925d98933d4681bfa8af2962"}, + {file = "grpcio_tools-1.59.2-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:eb597d6bf9f5bfa54d00546e828f0d4e2c69250d1bc17c27903c0c7b66372135"}, + {file = "grpcio_tools-1.59.2-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:17ef468836d7cf0b2419f4d5c7ac84ec2d598a1ae410773585313edacf7c393e"}, + {file = "grpcio_tools-1.59.2-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dee5f7e7a56177234e61a483c70ca2ae34e73128372c801bb7039993870889f1"}, + {file = "grpcio_tools-1.59.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f50ff312b88918c5a6461e45c5e03869749a066b1c24a7327e8e13e117efe4fc"}, + {file = "grpcio_tools-1.59.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:a85da4200295ee17e3c1ae068189a43844420ed7e9d531a042440f52de486dfb"}, + {file = "grpcio_tools-1.59.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f518f22a3082de00f0d7a216e96366a87e6973111085ba1603c3bfa7dba2e728"}, + {file = "grpcio_tools-1.59.2-cp39-cp39-win32.whl", hash = "sha256:6e735a26e8ea8bb89dc69343d1d00ea607449c6d81e21f339ee118562f3d1931"}, + {file = "grpcio_tools-1.59.2-cp39-cp39-win_amd64.whl", hash = "sha256:3491cb69c909d586c23d7e6d0ac87844ca22f496f505ce429c0d3301234f2cf3"}, +] + +[package.dependencies] +grpcio = ">=1.59.2" +protobuf = ">=4.21.6,<5.0dev" +setuptools = "*" + +[[package]] +name = "protobuf" +version = "4.25.0" +description = "" +optional = false +python-versions = ">=3.8" +files = [ + {file = "protobuf-4.25.0-cp310-abi3-win32.whl", hash = "sha256:5c1203ac9f50e4853b0a0bfffd32c67118ef552a33942982eeab543f5c634395"}, + {file = "protobuf-4.25.0-cp310-abi3-win_amd64.whl", hash = "sha256:c40ff8f00aa737938c5378d461637d15c442a12275a81019cc2fef06d81c9419"}, + {file = "protobuf-4.25.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:cf21faba64cd2c9a3ed92b7a67f226296b10159dbb8fbc5e854fc90657d908e4"}, + {file = "protobuf-4.25.0-cp37-abi3-manylinux2014_aarch64.whl", hash = 
"sha256:32ac2100b0e23412413d948c03060184d34a7c50b3e5d7524ee96ac2b10acf51"}, + {file = "protobuf-4.25.0-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:683dc44c61f2620b32ce4927de2108f3ebe8ccf2fd716e1e684e5a50da154054"}, + {file = "protobuf-4.25.0-cp38-cp38-win32.whl", hash = "sha256:1a3ba712877e6d37013cdc3476040ea1e313a6c2e1580836a94f76b3c176d575"}, + {file = "protobuf-4.25.0-cp38-cp38-win_amd64.whl", hash = "sha256:b2cf8b5d381f9378afe84618288b239e75665fe58d0f3fd5db400959274296e9"}, + {file = "protobuf-4.25.0-cp39-cp39-win32.whl", hash = "sha256:63714e79b761a37048c9701a37438aa29945cd2417a97076048232c1df07b701"}, + {file = "protobuf-4.25.0-cp39-cp39-win_amd64.whl", hash = "sha256:d94a33db8b7ddbd0af7c467475fb9fde0c705fb315a8433c0e2020942b863a1f"}, + {file = "protobuf-4.25.0-py3-none-any.whl", hash = "sha256:1a53d6f64b00eecf53b65ff4a8c23dc95df1fa1e97bb06b8122e5a64f49fc90a"}, + {file = "protobuf-4.25.0.tar.gz", hash = "sha256:68f7caf0d4f012fd194a301420cf6aa258366144d814f358c5b32558228afa7c"}, +] + +[[package]] +name = "setuptools" +version = "68.2.2" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "setuptools-68.2.2-py3-none-any.whl", hash = "sha256:b454a35605876da60632df1a60f736524eb73cc47bbc9f3f1ef1b644de74fd2a"}, + {file = "setuptools-68.2.2.tar.gz", hash = "sha256:4ac1475276d2f1c48684874089fefcd83bd7162ddaafb81fac866ba0db282a87"}, +] + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.1)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] + +[[package]] +name = "soupsieve" +version = "2.5" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = false +python-versions = ">=3.8" +files = [ + {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, + {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, +] + +[metadata] +lock-version = "2.0" +python-versions = "^3.8" +content-hash = "35ed5a98dd3f951bbfc44b949ad9148634159976cb54ac6f257d119c12d9d924" diff --git a/.test-infra/mock-apis/proto/echo/v1/echo.proto b/.test-infra/mock-apis/proto/echo/v1/echo.proto new file mode 100644 index 0000000000000..826dc0f233fd5 --- /dev/null +++ b/.test-infra/mock-apis/proto/echo/v1/echo.proto @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Protocol buffers describing a simple mock API that echoes a request.
+ */
+
+syntax = "proto3";
+
+package proto.echo.v1;
+option go_package = "proto/echo/v1";
+option java_package = "org.apache.beam.testinfra.mockapis.echo.v1";
+
+// EchoService simulates a mock API that echoes a request.
+service EchoService {
+
+  // Echo an EchoRequest payload in an EchoResponse.
+  rpc Echo(EchoRequest) returns (EchoResponse) {}
+}
+
+// The request to echo a payload.
+message EchoRequest {
+  string id = 1;
+  bytes payload = 2;
+}
+
+// The response echoing a request payload.
+message EchoResponse {
+  string id = 1;
+  bytes payload = 2;
+}
\ No newline at end of file
diff --git a/.test-infra/mock-apis/pyproject.toml b/.test-infra/mock-apis/pyproject.toml
new file mode 100644
index 0000000000000..680bf489ba133
--- /dev/null
+++ b/.test-infra/mock-apis/pyproject.toml
@@ -0,0 +1,38 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+[tool.poetry]
+name = "mock-apis"
+version = "0.1.1"
+authors = ["Ritesh Ghorse , Damon Douglas <>"]
+license = "Apache-2.0"
+description = ""
+readme = "README.md"
+packages = [
+    { include = "src/main/python/"}
+]
+
+[tool.poetry.dependencies]
+python = "^3.8"
+google = "^3.0.0"
+grpcio = "^1.53.0"
+grpcio-tools = "^1.53.0"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/.test-infra/mock-apis/src/main/go/cmd/service/echo/main.go b/.test-infra/mock-apis/src/main/go/cmd/service/echo/main.go
new file mode 100644
index 0000000000000..891468a603a12
--- /dev/null
+++ b/.test-infra/mock-apis/src/main/go/cmd/service/echo/main.go
@@ -0,0 +1,148 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// echo is an executable that runs the echov1.EchoService. +package main + +import ( + "context" + "fmt" + "log/slog" + "net" + "net/http" + "os" + "os/signal" + + gcplogging "cloud.google.com/go/logging" + "github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/cache" + "github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/environment" + "github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/logging" + "github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/service/echo" + "github.com/redis/go-redis/v9" + "google.golang.org/grpc" +) + +var ( + env = []environment.Variable{ + environment.CacheHost, + environment.GrpcPort, + environment.HttpPort, + } + + logger *slog.Logger + logAttrs []slog.Attr + opts = &logging.Options{ + Name: "echo", + } +) + +func init() { + for _, v := range env { + logAttrs = append(logAttrs, slog.Attr{ + Key: v.Key(), + Value: slog.StringValue(v.Value()), + }) + } +} + +func main() { + ctx := context.Background() + + if !environment.ProjectId.Missing() { + client, err := gcplogging.NewClient(ctx, environment.ProjectId.Value()) + if err != nil { + slog.LogAttrs(ctx, slog.LevelError, err.Error(), logAttrs...) + os.Exit(1) + } + + opts.Client = client + } + + logger = logging.New(opts) + + if err := run(ctx); err != nil { + logger.LogAttrs(ctx, slog.LevelError, err.Error(), logAttrs...) + os.Exit(1) + } +} + +func run(ctx context.Context) error { + ctx, cancel := signal.NotifyContext(ctx, os.Interrupt) + defer cancel() + + if err := environment.Missing(env...); err != nil { + return err + } + + grpcPort, err := environment.GrpcPort.Int() + if err != nil { + return err + } + grpcAddress := fmt.Sprintf(":%v", grpcPort) + + httpPort, err := environment.HttpPort.Int() + if err != nil { + return err + } + httpAddress := fmt.Sprintf(":%v", httpPort) + + s := grpc.NewServer() + defer s.GracefulStop() + + r := redis.NewClient(&redis.Options{ + Addr: environment.CacheHost.Value(), + }) + + echoOpts := &echo.Options{ + Decrementer: (*cache.RedisCache)(r), + LoggingAttrs: logAttrs, + Logger: logger, + // TODO(damondouglas): add GCP metrics client + // MetricsWriter: + } + + handler, err := echo.Register(s, echoOpts) + if err != nil { + return err + } + + logger.LogAttrs(ctx, slog.LevelInfo, "starting service", logAttrs...) + + lis, err := net.Listen("tcp", grpcAddress) + if err != nil { + return err + } + + errChan := make(chan error) + go func() { + if err := s.Serve(lis); err != nil { + errChan <- err + } + }() + + go func() { + if err := http.ListenAndServe(httpAddress, handler); err != nil { + errChan <- err + } + }() + + select { + case err := <-errChan: + return err + case <-ctx.Done(): + logger.LogAttrs(ctx, slog.LevelInfo, "shutting down", logAttrs...) 
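+		// An interrupt triggers an orderly shutdown rather than an error; the
+		// deferred s.GracefulStop() drains in-flight RPCs before returning.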
+ return nil + } +} diff --git a/.test-infra/mock-apis/src/main/go/cmd/service/refresher/main.go b/.test-infra/mock-apis/src/main/go/cmd/service/refresher/main.go new file mode 100644 index 0000000000000..63e32671935c8 --- /dev/null +++ b/.test-infra/mock-apis/src/main/go/cmd/service/refresher/main.go @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// refresher is an executable that runs the cache.Refresher service. +package main + +import ( + "context" + "log/slog" + "os" + "os/signal" + + gcplogging "cloud.google.com/go/logging" + "github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/cache" + "github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/environment" + "github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/logging" + "github.com/redis/go-redis/v9" +) + +var ( + env = []environment.Variable{ + environment.CacheHost, + environment.QuotaId, + environment.QuotaSize, + environment.QuotaRefreshInterval, + } + logger *slog.Logger + logAttrs []slog.Attr + opts = &logging.Options{ + Name: "refresher", + } +) + +func init() { + for _, v := range env { + logAttrs = append(logAttrs, slog.Attr{ + Key: v.Key(), + Value: slog.StringValue(v.Value()), + }) + } +} + +func main() { + ctx := context.Background() + + if !environment.ProjectId.Missing() { + client, err := gcplogging.NewClient(ctx, environment.ProjectId.Value()) + if err != nil { + slog.LogAttrs(ctx, slog.LevelError, err.Error(), logAttrs...) + os.Exit(1) + } + + opts.Client = client + } + + logger = logging.New(opts) + if err := run(ctx); err != nil { + logger.LogAttrs(ctx, slog.LevelError, err.Error(), logAttrs...) 
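+		// Exit non-zero so that a supervisor (for this service, the refresher
+		// Deployment's restart policy) can bring it back up.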
+ os.Exit(1) + } +} + +func run(ctx context.Context) error { + ctx, cancel := signal.NotifyContext(ctx, os.Interrupt) + defer cancel() + + if err := environment.Missing(env...); err != nil { + return err + } + + size, err := environment.QuotaSize.UInt64() + if err != nil { + return err + } + + interval, err := environment.QuotaRefreshInterval.Duration() + if err != nil { + return err + } + + r := redis.NewClient(&redis.Options{ + Addr: environment.CacheHost.Value(), + }) + + opts := &cache.Options{ + Logger: logger, + Setter: (*cache.RedisCache)(r), + } + + ref, err := cache.NewRefresher(ctx, opts) + if err != nil { + return err + } + + errChan := make(chan error) + go func() { + if err := ref.Refresh(ctx, environment.QuotaId.Value(), size, interval); err != nil { + errChan <- err + } + }() + + select { + case err := <-errChan: + return err + case <-ctx.Done(): + return nil + } + +} diff --git a/.test-infra/mock-apis/src/main/go/internal/cache/cache.go b/.test-infra/mock-apis/src/main/go/internal/cache/cache.go new file mode 100644 index 0000000000000..cab20ad998ccc --- /dev/null +++ b/.test-infra/mock-apis/src/main/go/internal/cache/cache.go @@ -0,0 +1,122 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cache + +import ( + "context" + "errors" + "fmt" + "log/slog" + "reflect" + "time" + + "github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/logging" +) + +var ( + + // ErrNotExist is an error indicating that a resource does not exist + ErrNotExist = errors.New("resource does not exist") +) + +// IsNotExist is true when err is ErrNotExist. +func IsNotExist(err error) bool { + return errors.Is(err, ErrNotExist) +} + +// Options for running the Refresher. +type Options struct { + Setter UInt64Setter + Logger *slog.Logger +} + +// Refresher refreshes a value in a cache on a set interval. +type Refresher struct { + opts *Options + stop chan struct{} +} + +// NewRefresher instantiates a Refresher. +func NewRefresher(ctx context.Context, opts *Options) (*Refresher, error) { + if opts.Logger == nil { + opts.Logger = logging.New(&logging.Options{ + Name: reflect.TypeOf((*Refresher)(nil)).PkgPath(), + }) + } + + if opts.Setter == nil { + return nil, fmt.Errorf("%T.Setter is nil but required", opts) + } + + if err := opts.Setter.Alive(ctx); err != nil { + return nil, err + } + + ref := &Refresher{ + opts: opts, + } + + return ref, nil +} + +// Stop the Refresher. +func (ref *Refresher) Stop() { + ref.stop <- struct{}{} +} + +// Refresh the size of the associated key at an interval. 
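+// Refresh blocks until Stop is called or ctx is canceled. Each write passes
+// interval as the expiry, so the key lapses if refreshing halts.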
+func (ref *Refresher) Refresh(ctx context.Context, key string, size uint64, interval time.Duration) error { + ctx, cancel := context.WithCancel(ctx) + defer cancel() + ref.stop = make(chan struct{}) + attrs := []slog.Attr{ + { + Key: "key", + Value: slog.StringValue(key), + }, + { + Key: "size", + Value: slog.Uint64Value(size), + }, + { + Key: "interval", + Value: slog.StringValue(interval.String()), + }, + } + + ref.opts.Logger.LogAttrs(ctx, slog.LevelInfo, "starting refresher service", attrs...) + + if err := ref.opts.Setter.Set(ctx, key, size, interval); err != nil { + return err + } + ref.opts.Logger.LogAttrs(ctx, slog.LevelDebug, "successful initial refresh", attrs...) + + tick := time.Tick(interval) + for { + select { + case <-tick: + if err := ref.opts.Setter.Set(ctx, key, size, interval); err != nil { + return err + } + ref.opts.Logger.LogAttrs(ctx, slog.LevelDebug, "refresh successful", attrs...) + case <-ref.stop: + ref.opts.Logger.LogAttrs(ctx, slog.LevelInfo, "stopping refresher service", attrs...) + return nil + case <-ctx.Done(): + return nil + } + } +} diff --git a/.test-infra/mock-apis/src/main/go/internal/cache/doc.go b/.test-infra/mock-apis/src/main/go/internal/cache/doc.go new file mode 100644 index 0000000000000..c0f937aa8d599 --- /dev/null +++ b/.test-infra/mock-apis/src/main/go/internal/cache/doc.go @@ -0,0 +1,17 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package cache stores and retrieves data from a cache. +package cache diff --git a/.test-infra/mock-apis/src/main/go/internal/cache/interface.go b/.test-infra/mock-apis/src/main/go/internal/cache/interface.go new file mode 100644 index 0000000000000..8266f7205885a --- /dev/null +++ b/.test-infra/mock-apis/src/main/go/internal/cache/interface.go @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cache + +import ( + "context" + "time" +) + +// HealthChecker checks the health and availability of a resource. 
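
The interfaces defined below are deliberately small, so tests can swap in fakes for the Redis-backed implementation. Purely as an illustration (assuming it lives in this package, with context, sync, and time imported), an in-memory double might look like:

// memCache is a hypothetical test double; it ignores expiry, which the
// Redis-backed implementation honors.
type memCache struct {
	mu sync.Mutex
	m  map[string]int64
}

func (c *memCache) Alive(context.Context) error { return nil }

func (c *memCache) Set(_ context.Context, key string, value uint64, _ time.Duration) error {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.m == nil {
		c.m = map[string]int64{}
	}
	c.m[key] = int64(value)
	return nil
}

func (c *memCache) Decrement(_ context.Context, key string) (int64, error) {
	c.mu.Lock()
	defer c.mu.Unlock()
	v, ok := c.m[key]
	if !ok {
		return -1, ErrNotExist
	}
	v--
	c.m[key] = v
	return v, nil
}

// Compile-time checks, mirroring those in redis.go.
var _ UInt64Setter = (*memCache)(nil)
var _ Decrementer = (*memCache)(nil)
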
+type HealthChecker interface {
+
+	// Alive checks whether the resource is healthy and available.
+	Alive(ctx context.Context) error
+}
+
+// UInt64Setter associates a key with a value for an expiry time.Duration.
+type UInt64Setter interface {
+	HealthChecker
+
+	// Set a key with a value for an expiry time.Duration.
+	Set(ctx context.Context, key string, value uint64, expiry time.Duration) error
+}
+
+// Decrementer decrements a value associated with a key.
+type Decrementer interface {
+	HealthChecker
+
+	// Decrement the value associated with a key; returns the value after
+	// decrementing it.
+	Decrement(ctx context.Context, key string) (int64, error)
+}
diff --git a/.test-infra/mock-apis/src/main/go/internal/cache/redis.go b/.test-infra/mock-apis/src/main/go/internal/cache/redis.go
new file mode 100644
index 0000000000000..51ad73061cb78
--- /dev/null
+++ b/.test-infra/mock-apis/src/main/go/internal/cache/redis.go
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cache
+
+import (
+	"context"
+	"time"
+
+	"github.com/redis/go-redis/v9"
+)
+
+// Validate interface implementations
+var _ UInt64Setter = &RedisCache{}
+var _ Decrementer = &RedisCache{}
+var _ HealthChecker = &RedisCache{}
+
+// RedisCache implements the UInt64Setter, Decrementer, and HealthChecker
+// interfaces backed by a redis.Client.
+type RedisCache redis.Client
+
+// Set implements UInt64Setter's Set method using a redis cache; an expiry of
+// 0 means no expiration. Returns any error from the redis client.
+func (client *RedisCache) Set(ctx context.Context, key string, value uint64, expiry time.Duration) error {
+	r := (*redis.Client)(client)
+	return r.Set(ctx, key, value, expiry).Err()
+}
+
+// Decrement implements Decrementer's Decrement method using a redis cache.
+// Returns an error when the key does not exist or from the redis client.
+func (client *RedisCache) Decrement(ctx context.Context, key string) (int64, error) {
+	r := (*redis.Client)(client)
+	v, err := r.Exists(ctx, key).Result()
+	if err != nil {
+		return -1, err
+	}
+	if v == 0 {
+		return -1, ErrNotExist
+	}
+	return r.Decr(ctx, key).Result()
+}
+
+// Alive implements HealthChecker's Alive method by checking the availability
+// of the redis cache. Returns an error if no successful connection.
+func (client *RedisCache) Alive(ctx context.Context) error {
+	r := (*redis.Client)(client)
+	return r.Ping(ctx).Err()
+}
diff --git a/.test-infra/mock-apis/src/main/go/internal/environment/variable.go b/.test-infra/mock-apis/src/main/go/internal/environment/variable.go
new file mode 100644
index 0000000000000..b1e3a8ec40118
--- /dev/null
+++ b/.test-infra/mock-apis/src/main/go/internal/environment/variable.go
@@ -0,0 +1,118 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package environment provides helpers for interacting with environment variables.
+package environment
+
+import (
+	"fmt"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+)
+
+var (
+	// HttpPort is the port to bind an HTTP service.
+	HttpPort Variable = "HTTP_PORT"
+
+	// GrpcPort is the port to bind a gRPC service.
+	GrpcPort Variable = "GRPC_PORT"
+
+	// CacheHost is the host address of the cache.
+	CacheHost Variable = "CACHE_HOST"
+
+	// ProjectId is the ID of the Google Cloud host project.
+	ProjectId Variable = "PROJECT_ID"
+
+	// QuotaId uniquely identifies a quota measure.
+	QuotaId Variable = "QUOTA_ID"
+
+	// QuotaSize specifies the size of the quota.
+	QuotaSize Variable = "QUOTA_SIZE"
+
+	// QuotaRefreshInterval configures how often a quota is refreshed.
+	QuotaRefreshInterval Variable = "QUOTA_REFRESH_INTERVAL"
+)
+
+// Variable defines an environment variable via a string type; its underlying
+// string value is the key of the system environment variable.
+type Variable string
+
+// Default assigns value to the system environment variable if it is not
+// already set.
+func (v Variable) Default(value string) error {
+	if v.Missing() {
+		return os.Setenv((string)(v), value)
+	}
+	return nil
+}
+
+// MustDefault is like Default but panics on error.
+func (v Variable) MustDefault(value string) {
+	if err := v.Default(value); err != nil {
+		panic(err)
+	}
+}
+
+// Missing reports whether the system environment variable is an empty string.
+func (v Variable) Missing() bool {
+	return v.Value() == ""
+}
+
+// Key returns the system environment variable key.
+func (v Variable) Key() string {
+	return (string)(v)
+}
+
+// Value returns the system environment variable value.
+func (v Variable) Value() string {
+	return os.Getenv((string)(v))
+}
+
+// Int returns the system environment variable value parsed as an int.
+func (v Variable) Int() (int, error) {
+	return strconv.Atoi(v.Value())
+}
+
+// UInt64 returns the system environment variable value parsed as a uint64.
+func (v Variable) UInt64() (uint64, error) {
+	return strconv.ParseUint(v.Value(), 10, 64)
+}
+
+// Duration returns the system environment variable value parsed as a time.Duration.
+func (v Variable) Duration() (time.Duration, error) {
+	return time.ParseDuration(v.Value())
+}
+
+// KeyValue returns the system environment variable formatted as
+// <key>=<value>.
+func (v Variable) KeyValue() string {
+	return fmt.Sprintf("%s=%s", (string)(v), v.Value())
+}
+
+// Missing returns an error listing every Variable among vars that is
+// not assigned in the system environment.
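
A brief usage sketch of the Variable API above; DEMO_TIMEOUT is a hypothetical variable, not one the package defines. Default only writes when the variable is unset, so values already present in the environment win.

package main

import (
	"fmt"
	"log"

	"github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/environment"
)

// DemoTimeout is a hypothetical variable, not one defined by the package.
const DemoTimeout environment.Variable = "DEMO_TIMEOUT"

func main() {
	// Seed a default; an existing environment value is left untouched.
	DemoTimeout.MustDefault("2s")

	// Fail fast when required variables are unset.
	if err := environment.Missing(DemoTimeout); err != nil {
		log.Fatal(err)
	}

	d, err := DemoTimeout.Duration()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(DemoTimeout.KeyValue(), d) // DEMO_TIMEOUT=2s 2s
}
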
+func Missing(vars ...Variable) error { + var missing []string + for _, v := range vars { + if v.Missing() { + missing = append(missing, v.KeyValue()) + } + } + if len(missing) > 0 { + return fmt.Errorf("variables empty but expected from environment: %s", strings.Join(missing, "; ")) + } + return nil +} diff --git a/.test-infra/mock-apis/src/main/go/internal/environment/variable_test.go b/.test-infra/mock-apis/src/main/go/internal/environment/variable_test.go new file mode 100644 index 0000000000000..b566f14d446bf --- /dev/null +++ b/.test-infra/mock-apis/src/main/go/internal/environment/variable_test.go @@ -0,0 +1,312 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package environment + +import ( + "errors" + "os" + "testing" + + "github.com/google/go-cmp/cmp" +) + +func TestMissing(t *testing.T) { + type args struct { + vars []Variable + values []string + } + tests := []struct { + name string + args args + want error + }{ + { + name: "{}", + args: args{}, + }, + { + name: "{A=}", + args: args{ + vars: []Variable{ + "A", + }, + values: []string{ + "", + }, + }, + want: errors.New("variables empty but expected from environment: A="), + }, + { + name: "{A=1}", + args: args{ + vars: []Variable{ + "A", + }, + values: []string{ + "1", + }, + }, + want: nil, + }, + { + name: "{A=; B=}", + args: args{ + vars: []Variable{ + "A", + "B", + }, + values: []string{ + "", + "", + }, + }, + want: errors.New("variables empty but expected from environment: A=; B="), + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var got, want string + clearVars(tt.args.vars...) + set(t, tt.args.vars, tt.args.values) + err := Missing(tt.args.vars...) 
+ if err != nil { + got = err.Error() + } + if tt.want != nil { + want = tt.want.Error() + } + if diff := cmp.Diff(want, got); diff != "" { + t.Errorf("Missing() error returned unexpected difference in error messages (-want +got):\n%s", diff) + } + }) + } +} + +func TestVariable_Default(t *testing.T) { + type args struct { + setValue string + defaultValue string + } + tests := []struct { + name string + v Variable + args args + want string + }{ + { + name: "environment variable not set", + v: "A", + args: args{ + defaultValue: "1", + }, + want: "1", + }, + { + name: "environment variable default is overridden by set value", + v: "A", + args: args{ + setValue: "2", + defaultValue: "1", + }, + want: "2", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + clearVars(tt.v) + if tt.args.setValue != "" { + set(t, []Variable{tt.v}, []string{tt.args.setValue}) + } + if err := tt.v.Default(tt.args.defaultValue); err != nil { + t.Fatalf("could not set default environment variable value during test execution: %v", err) + } + got := os.Getenv(tt.v.Key()) + if got != tt.want { + t.Errorf("Default() = %s, want %s", got, tt.want) + } + }) + } +} + +func TestVariable_KeyValue(t *testing.T) { + tests := []struct { + name string + v Variable + value string + want string + }{ + { + name: "environment variable not set", + v: "A", + want: "A=", + }, + { + name: "environment variable is set", + v: "A", + value: "1", + want: "A=1", + }, + } + for _, tt := range tests { + clearVars(tt.v) + t.Run(tt.name, func(t *testing.T) { + set(t, []Variable{tt.v}, []string{tt.value}) + got := tt.v.KeyValue() + if got != tt.want { + t.Errorf("KeyValue() = %s, want %s", got, tt.want) + } + }) + } +} + +func TestVariable_Missing(t *testing.T) { + type args struct { + setValue string + defaultValue string + } + tests := []struct { + name string + args args + v Variable + want bool + }{ + { + name: "no default and not set", + args: args{}, + v: "A", + want: true, + }, + { + name: "has default but not set", + args: args{ + defaultValue: "1", + }, + v: "A", + want: false, + }, + { + name: "no default but set", + args: args{ + setValue: "1", + }, + v: "A", + want: false, + }, + { + name: "has default and set", + args: args{ + setValue: "2", + defaultValue: "1", + }, + v: "A", + want: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + clearVars(tt.v) + if tt.args.defaultValue != "" { + if err := tt.v.Default(tt.args.defaultValue); err != nil { + t.Fatalf("could not set default environment variable value during test execution: %v", err) + } + } + if tt.args.setValue != "" { + set(t, []Variable{tt.v}, []string{tt.args.setValue}) + } + if got := tt.v.Missing(); got != tt.want { + t.Errorf("Missing() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestVariable_Value(t *testing.T) { + type args struct { + setValue string + defaultValue string + } + tests := []struct { + name string + args args + v Variable + want string + }{ + { + name: "no default and not set", + args: args{}, + v: "A", + want: "", + }, + { + name: "has default but not set", + args: args{ + defaultValue: "1", + }, + v: "A", + want: "1", + }, + { + name: "no default but set", + args: args{ + setValue: "1", + }, + v: "A", + want: "1", + }, + { + name: "has default and set", + args: args{ + setValue: "2", + defaultValue: "1", + }, + v: "A", + want: "2", + }, + } + for _, tt := range tests { + clearVars(tt.v) + if tt.args.defaultValue != "" { + if err := tt.v.Default(tt.args.defaultValue); err != nil { + 
t.Fatalf("could not set default environment variable value during test execution: %v", err) + } + } + if tt.args.setValue != "" { + set(t, []Variable{tt.v}, []string{tt.args.setValue}) + } + t.Run(tt.name, func(t *testing.T) { + if got := tt.v.Value(); got != tt.want { + t.Errorf("Value() = %v, want %v", got, tt.want) + } + }) + } +} + +func clearVars(vars ...Variable) { + for _, k := range vars { + _ = os.Setenv(k.Key(), "") + } +} + +func set(t *testing.T, vars []Variable, values []string) { + if len(vars) != len(values) { + t.Fatalf("test cases should be configured with matching args.vars and args.values: len(tt.args.vars): %v != len(tt.args.values): %v", len(vars), len(values)) + } + for i := range vars { + key := vars[i].Key() + value := values[i] + _ = os.Setenv(key, value) + } +} diff --git a/.test-infra/mock-apis/src/main/go/internal/logging/logging.go b/.test-infra/mock-apis/src/main/go/internal/logging/logging.go new file mode 100644 index 0000000000000..53cead40b0d8a --- /dev/null +++ b/.test-infra/mock-apis/src/main/go/internal/logging/logging.go @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package logging performs structured output of log entries. +package logging + +import ( + "context" + "encoding/json" + "io" + "log/slog" + "os" + "path" + "runtime" + "sync" + + "cloud.google.com/go/logging" + "cloud.google.com/go/logging/apiv2/loggingpb" +) + +// Options for the slog.Logger +type Options struct { + *slog.HandlerOptions + Name string + Writer io.Writer + Client *logging.Client +} + +// New instantiates a slog.Logger to output using Google Cloud logging entries. +// When running locally, output is JSON strings of Cloud logging entries and +// does not make any API calls to the service. When running in Google Cloud, +// logging entries are submitted to the Cloud logging service. 
+func New(opts *Options) *slog.Logger { + if opts.HandlerOptions == nil { + opts.HandlerOptions = &slog.HandlerOptions{} + } + + opts.AddSource = true + + if opts.Writer == nil { + opts.Writer = os.Stdout + } + + handler := &gcpHandler{ + name: opts.Name, + mu: &sync.Mutex{}, + out: opts.Writer, + JSONHandler: slog.NewJSONHandler(opts.Writer, opts.HandlerOptions), + } + + if opts.Client != nil { + handler.logger = opts.Client.Logger(path.Base(opts.Name)) + } + + return slog.New(handler) +} + +var _ slog.Handler = &gcpHandler{} + +type gcpHandler struct { + name string + *slog.JSONHandler + mu *sync.Mutex + out io.Writer + logger *logging.Logger +} + +func (g *gcpHandler) Enabled(ctx context.Context, level slog.Level) bool { + return g.JSONHandler.Enabled(ctx, level) +} + +func severity(lvl slog.Level) logging.Severity { + switch lvl { + case slog.LevelDebug: + return logging.Debug + case slog.LevelInfo: + return logging.Info + case slog.LevelWarn: + return logging.Warning + case slog.LevelError: + return logging.Error + } + return logging.Default +} + +func (g *gcpHandler) Handle(_ context.Context, record slog.Record) error { + payload := map[string]any{ + "message": record.Message, + } + record.Attrs(func(attr slog.Attr) bool { + payload[attr.Key] = attr.Value.Any() + return true + }) + fs := runtime.CallersFrames([]uintptr{record.PC}) + f, _ := fs.Next() + entry := logging.Entry{ + LogName: g.name, + Timestamp: record.Time, + Severity: severity(record.Level), + Payload: payload, + SourceLocation: &loggingpb.LogEntrySourceLocation{ + File: f.File, + Line: int64(f.Line), + }, + } + g.mu.Lock() + defer g.mu.Unlock() + if g.logger == nil { + return json.NewEncoder(g.out).Encode(entry) + } + + entry.LogName = "" + g.logger.Log(entry) + return g.logger.Flush() +} + +func (g *gcpHandler) WithAttrs(attrs []slog.Attr) slog.Handler { + h := g.JSONHandler + return h.WithAttrs(attrs) +} + +func (g *gcpHandler) WithGroup(name string) slog.Handler { + h := g.JSONHandler + return h.WithGroup(name) +} diff --git a/.test-infra/mock-apis/src/main/go/internal/logging/logging_test.go b/.test-infra/mock-apis/src/main/go/internal/logging/logging_test.go new file mode 100644 index 0000000000000..87bfa160981cf --- /dev/null +++ b/.test-infra/mock-apis/src/main/go/internal/logging/logging_test.go @@ -0,0 +1,153 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
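
Note the fallback in the severity mapping above: slog levels other than the four standard ones map to the Default severity. A sketch of observing the mapping, decoding the entry the same way the tests below do (bytes, context, encoding/json, fmt, and log/slog imports assumed):

// Hypothetical snippet; would live alongside the tests below.
buf := &bytes.Buffer{}
l := logging.New(&logging.Options{
	HandlerOptions: &slog.HandlerOptions{Level: slog.LevelDebug}, // let Debug through
	Name:           "severity-demo",
	Writer:         buf,
})
l.LogAttrs(context.Background(), slog.LevelWarn, "disk nearly full")

var entry gcplogging.Entry
if err := json.NewDecoder(buf).Decode(&entry); err != nil {
	panic(err)
}
fmt.Println(entry.Severity == gcplogging.Warning) // true
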
+ +package logging_test + +import ( + "bytes" + "context" + "encoding/json" + "log/slog" + "runtime" + "testing" + "time" + + gcplogging "cloud.google.com/go/logging" + "cloud.google.com/go/logging/apiv2/loggingpb" + "github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/logging" + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" +) + +var ( + opts = []cmp.Option{ + cmpopts.IgnoreFields(loggingpb.LogEntrySourceLocation{}, "state", "sizeCache", "unknownFields"), + cmpopts.IgnoreFields(gcplogging.Entry{}, "Timestamp"), + } +) + +func Test_logger_Info(t *testing.T) { + type args struct { + message string + fields []slog.Attr + } + tests := []struct { + name string + args args + want gcplogging.Entry + }{ + { + name: "message only", + args: args{ + message: "hello log", + }, + want: gcplogging.Entry{ + LogName: "message only", + Severity: gcplogging.Info, + Payload: map[string]interface{}{ + "message": "hello log", + }, + }, + }, + { + name: "with flat fields", + args: args{ + message: "message with fields", + fields: []slog.Attr{ + { + Key: "string", + Value: slog.StringValue("a string"), + }, + { + Key: "int", + Value: slog.IntValue(1), + }, + { + Key: "bool", + Value: slog.BoolValue(true), + }, + { + Key: "float", + Value: slog.Float64Value(1.23456789), + }, + }, + }, + want: gcplogging.Entry{ + LogName: "with flat fields", + Severity: gcplogging.Info, + Payload: map[string]interface{}{ + "message": "message with fields", + "string": "a string", + "int": float64(1), + "bool": true, + "float": 1.23456789, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + buf := bytes.Buffer{} + l := logging.New(&logging.Options{ + Name: tt.name, + Writer: &buf, + }) + l.LogAttrs(context.Background(), slog.LevelInfo, tt.args.message, tt.args.fields...) + _, file, line, _ := runtime.Caller(0) + tt.want.SourceLocation = &loggingpb.LogEntrySourceLocation{ + File: file, + Line: int64(line) - 1, + } + var got gcplogging.Entry + if err := json.NewDecoder(&buf).Decode(&got); err != nil { + t.Fatal(err) + } + if diff := cmp.Diff(tt.want, got, opts...); diff != "" { + t.Errorf("LogAttrs(Info) yielded unexpected difference in log entry (-want, +got):\n%s", diff) + } + }) + } +} +func Test_logger_Error(t *testing.T) { + buf := bytes.Buffer{} + l := logging.New(&logging.Options{ + Name: "test logger error", + Writer: &buf, + }) + message := "some error" + fields := []slog.Attr{ + { + Key: "observed", + Value: slog.TimeValue(time.Unix(1000000000, 0)), + }, + } + l.LogAttrs(context.Background(), slog.LevelError, message, fields...) 
+ _, file, line, _ := runtime.Caller(0) + var got gcplogging.Entry + if err := json.NewDecoder(&buf).Decode(&got); err != nil { + t.Fatal(err) + } + if diff := cmp.Diff(gcplogging.Entry{ + LogName: "test logger error", + Severity: gcplogging.Error, + Payload: map[string]any{"message": "some error", "observed": "2001-09-09T01:46:40Z"}, + SourceLocation: &loggingpb.LogEntrySourceLocation{ + File: file, + Line: int64(line) - 1, + }, + }, got, opts...); diff != "" { + t.Errorf("LogAttrs(Error) yielded unexpected difference in log entry (-want, +got):\n%s", diff) + } +} diff --git a/.test-infra/mock-apis/src/main/go/internal/metric/doc.go b/.test-infra/mock-apis/src/main/go/internal/metric/doc.go new file mode 100644 index 0000000000000..43bfc771c1985 --- /dev/null +++ b/.test-infra/mock-apis/src/main/go/internal/metric/doc.go @@ -0,0 +1,17 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package metric supports monitoring. +package metric diff --git a/.test-infra/mock-apis/src/main/go/internal/metric/gcp.go b/.test-infra/mock-apis/src/main/go/internal/metric/gcp.go new file mode 100644 index 0000000000000..3d23d538955a0 --- /dev/null +++ b/.test-infra/mock-apis/src/main/go/internal/metric/gcp.go @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metric + +import ( + "context" + "path" + + monitoring "cloud.google.com/go/monitoring/apiv3" + "cloud.google.com/go/monitoring/apiv3/v2/monitoringpb" + "google.golang.org/genproto/googleapis/api/metric" + "google.golang.org/genproto/googleapis/api/monitoredres" + "google.golang.org/protobuf/types/known/timestamppb" +) + +const ( + metricTypePrefix = "custom.googleapis.com" + monitoredResourceType = "generic_task" +) + +// GcpGauge implements a Writer for a Google Cloud gauge. +// See https://cloud.google.com/monitoring/api/v3/kinds-and-types#metric-kinds +type GcpGauge monitoring.MetricClient + +// Write to a Google Cloud monitoring gauge. 
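
GcpGauge is one Writer; because callers depend only on the metric.Writer interface (defined in interface.go below), tests can capture points with a hypothetical in-memory implementation such as:

// memWriter is a hypothetical in-memory metric.Writer for tests
// (context, sync, and the metric package imports assumed).
type memWriter struct {
	mu     sync.Mutex
	points map[string][]*metric.Point
}

func (w *memWriter) Write(_ context.Context, name, unit string, points ...*metric.Point) error {
	w.mu.Lock()
	defer w.mu.Unlock()
	if w.points == nil {
		w.points = map[string][]*metric.Point{}
	}
	w.points[name] = append(w.points[name], points...)
	return nil
}

var _ metric.Writer = (*memWriter)(nil)
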
+func (writer *GcpGauge) Write(ctx context.Context, name string, unit string, points ...*Point) error { + var mPts []*monitoringpb.Point + for _, p := range points { + t := timestamppb.New(p.Timestamp) + mPts = append(mPts, &monitoringpb.Point{ + Interval: &monitoringpb.TimeInterval{ + StartTime: t, + EndTime: t, + }, + Value: &monitoringpb.TypedValue{ + Value: &monitoringpb.TypedValue_Int64Value{ + Int64Value: p.Value, + }, + }, + }) + } + ts := timeseries(name, unit, metric.MetricDescriptor_GAUGE, mPts) + + client := (*monitoring.MetricClient)(writer) + return client.CreateTimeSeries(ctx, &monitoringpb.CreateTimeSeriesRequest{ + Name: name, + TimeSeries: []*monitoringpb.TimeSeries{ts}, + }) +} + +func timeseries(name string, unit string, kind metric.MetricDescriptor_MetricKind, points []*monitoringpb.Point) *monitoringpb.TimeSeries { + return &monitoringpb.TimeSeries{ + Metric: &metric.Metric{ + Type: path.Join(metricTypePrefix, name), + }, + Resource: &monitoredres.MonitoredResource{ + Type: monitoredResourceType, + }, + MetricKind: kind, + ValueType: metric.MetricDescriptor_INT64, + Unit: unit, + Points: points, + } +} diff --git a/.test-infra/mock-apis/src/main/go/internal/metric/interface.go b/.test-infra/mock-apis/src/main/go/internal/metric/interface.go new file mode 100644 index 0000000000000..d0f7e385227e2 --- /dev/null +++ b/.test-infra/mock-apis/src/main/go/internal/metric/interface.go @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metric + +import ( + "context" + "time" +) + +// Writer writes to a metric sink. +type Writer interface { + + // Write to a metric sink. + Write(ctx context.Context, name string, unit string, points ...*Point) error +} + +// Point models a metric data point. +type Point struct { + + // Timestamp of the metric data point. + Timestamp time.Time + + // Value of the metric data point. + Value int64 +} diff --git a/.test-infra/mock-apis/src/main/go/internal/proto/echo/v1/echo.pb.go b/.test-infra/mock-apis/src/main/go/internal/proto/echo/v1/echo.pb.go new file mode 100644 index 0000000000000..97ced9227d7fa --- /dev/null +++ b/.test-infra/mock-apis/src/main/go/internal/proto/echo/v1/echo.pb.go @@ -0,0 +1,256 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Protocol buffers describing a simple mock API that echos a request. + +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.28.1 +// protoc (unknown) +// source: proto/echo/v1/echo.proto + +package v1 + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +// The request to echo a payload. +type EchoRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` + Payload []byte `protobuf:"bytes,2,opt,name=payload,proto3" json:"payload,omitempty"` +} + +func (x *EchoRequest) Reset() { + *x = EchoRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_proto_echo_v1_echo_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *EchoRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*EchoRequest) ProtoMessage() {} + +func (x *EchoRequest) ProtoReflect() protoreflect.Message { + mi := &file_proto_echo_v1_echo_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use EchoRequest.ProtoReflect.Descriptor instead. +func (*EchoRequest) Descriptor() ([]byte, []int) { + return file_proto_echo_v1_echo_proto_rawDescGZIP(), []int{0} +} + +func (x *EchoRequest) GetId() string { + if x != nil { + return x.Id + } + return "" +} + +func (x *EchoRequest) GetPayload() []byte { + if x != nil { + return x.Payload + } + return nil +} + +// The response echo of a request payload. 
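
Because these generated structs carry json tags, encoding/json can build request bodies for the service's HTTP alias; note that Go marshals []byte fields as base64, which is why the HTTP handler in echo.go expects a base64-encoded payload. A small sketch:

package main

import (
	"encoding/json"
	"fmt"
	"log"

	echov1 "github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/proto/echo/v1"
)

func main() {
	b, err := json.Marshal(&echov1.EchoRequest{
		Id:      "demo-quota", // hypothetical quota id
		Payload: []byte("hello"),
	})
	if err != nil {
		log.Fatal(err)
	}
	// encoding/json base64-encodes []byte fields:
	fmt.Println(string(b)) // {"id":"demo-quota","payload":"aGVsbG8="}
}
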
+type EchoResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` + Payload []byte `protobuf:"bytes,2,opt,name=payload,proto3" json:"payload,omitempty"` +} + +func (x *EchoResponse) Reset() { + *x = EchoResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_proto_echo_v1_echo_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *EchoResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*EchoResponse) ProtoMessage() {} + +func (x *EchoResponse) ProtoReflect() protoreflect.Message { + mi := &file_proto_echo_v1_echo_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use EchoResponse.ProtoReflect.Descriptor instead. +func (*EchoResponse) Descriptor() ([]byte, []int) { + return file_proto_echo_v1_echo_proto_rawDescGZIP(), []int{1} +} + +func (x *EchoResponse) GetId() string { + if x != nil { + return x.Id + } + return "" +} + +func (x *EchoResponse) GetPayload() []byte { + if x != nil { + return x.Payload + } + return nil +} + +var File_proto_echo_v1_echo_proto protoreflect.FileDescriptor + +var file_proto_echo_v1_echo_proto_rawDesc = []byte{ + 0x0a, 0x18, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x65, 0x63, 0x68, 0x6f, 0x2f, 0x76, 0x31, 0x2f, + 0x65, 0x63, 0x68, 0x6f, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x0d, 0x70, 0x72, 0x6f, 0x74, + 0x6f, 0x2e, 0x65, 0x63, 0x68, 0x6f, 0x2e, 0x76, 0x31, 0x22, 0x37, 0x0a, 0x0b, 0x45, 0x63, 0x68, + 0x6f, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x01, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x18, 0x0a, 0x07, 0x70, 0x61, 0x79, 0x6c, + 0x6f, 0x61, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x07, 0x70, 0x61, 0x79, 0x6c, 0x6f, + 0x61, 0x64, 0x22, 0x38, 0x0a, 0x0c, 0x45, 0x63, 0x68, 0x6f, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, + 0x73, 0x65, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, + 0x69, 0x64, 0x12, 0x18, 0x0a, 0x07, 0x70, 0x61, 0x79, 0x6c, 0x6f, 0x61, 0x64, 0x18, 0x02, 0x20, + 0x01, 0x28, 0x0c, 0x52, 0x07, 0x70, 0x61, 0x79, 0x6c, 0x6f, 0x61, 0x64, 0x32, 0x50, 0x0a, 0x0b, + 0x45, 0x63, 0x68, 0x6f, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x12, 0x41, 0x0a, 0x04, 0x45, + 0x63, 0x68, 0x6f, 0x12, 0x1a, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x65, 0x63, 0x68, 0x6f, + 0x2e, 0x76, 0x31, 0x2e, 0x45, 0x63, 0x68, 0x6f, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, + 0x1b, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x65, 0x63, 0x68, 0x6f, 0x2e, 0x76, 0x31, 0x2e, + 0x45, 0x63, 0x68, 0x6f, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x42, 0x3b, + 0x0a, 0x2a, 0x6f, 0x72, 0x67, 0x2e, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2e, 0x62, 0x65, 0x61, + 0x6d, 0x2e, 0x74, 0x65, 0x73, 0x74, 0x69, 0x6e, 0x66, 0x72, 0x61, 0x2e, 0x6d, 0x6f, 0x63, 0x6b, + 0x61, 0x70, 0x69, 0x73, 0x2e, 0x65, 0x63, 0x68, 0x6f, 0x2e, 0x76, 0x31, 0x5a, 0x0d, 0x70, 0x72, + 0x6f, 0x74, 0x6f, 0x2f, 0x65, 0x63, 0x68, 0x6f, 0x2f, 0x76, 0x31, 0x62, 0x06, 0x70, 0x72, 0x6f, + 0x74, 0x6f, 0x33, +} + +var ( + file_proto_echo_v1_echo_proto_rawDescOnce sync.Once + file_proto_echo_v1_echo_proto_rawDescData = file_proto_echo_v1_echo_proto_rawDesc +) + +func 
file_proto_echo_v1_echo_proto_rawDescGZIP() []byte { + file_proto_echo_v1_echo_proto_rawDescOnce.Do(func() { + file_proto_echo_v1_echo_proto_rawDescData = protoimpl.X.CompressGZIP(file_proto_echo_v1_echo_proto_rawDescData) + }) + return file_proto_echo_v1_echo_proto_rawDescData +} + +var file_proto_echo_v1_echo_proto_msgTypes = make([]protoimpl.MessageInfo, 2) +var file_proto_echo_v1_echo_proto_goTypes = []interface{}{ + (*EchoRequest)(nil), // 0: proto.echo.v1.EchoRequest + (*EchoResponse)(nil), // 1: proto.echo.v1.EchoResponse +} +var file_proto_echo_v1_echo_proto_depIdxs = []int32{ + 0, // 0: proto.echo.v1.EchoService.Echo:input_type -> proto.echo.v1.EchoRequest + 1, // 1: proto.echo.v1.EchoService.Echo:output_type -> proto.echo.v1.EchoResponse + 1, // [1:2] is the sub-list for method output_type + 0, // [0:1] is the sub-list for method input_type + 0, // [0:0] is the sub-list for extension type_name + 0, // [0:0] is the sub-list for extension extendee + 0, // [0:0] is the sub-list for field type_name +} + +func init() { file_proto_echo_v1_echo_proto_init() } +func file_proto_echo_v1_echo_proto_init() { + if File_proto_echo_v1_echo_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { + file_proto_echo_v1_echo_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*EchoRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_proto_echo_v1_echo_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*EchoResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_proto_echo_v1_echo_proto_rawDesc, + NumEnums: 0, + NumMessages: 2, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_proto_echo_v1_echo_proto_goTypes, + DependencyIndexes: file_proto_echo_v1_echo_proto_depIdxs, + MessageInfos: file_proto_echo_v1_echo_proto_msgTypes, + }.Build() + File_proto_echo_v1_echo_proto = out.File + file_proto_echo_v1_echo_proto_rawDesc = nil + file_proto_echo_v1_echo_proto_goTypes = nil + file_proto_echo_v1_echo_proto_depIdxs = nil +} diff --git a/.test-infra/mock-apis/src/main/go/internal/proto/echo/v1/echo_grpc.pb.go b/.test-infra/mock-apis/src/main/go/internal/proto/echo/v1/echo_grpc.pb.go new file mode 100644 index 0000000000000..3ce2bdeeeec1e --- /dev/null +++ b/.test-infra/mock-apis/src/main/go/internal/proto/echo/v1/echo_grpc.pb.go @@ -0,0 +1,107 @@ +// Code generated by protoc-gen-go-grpc. DO NOT EDIT. +// versions: +// - protoc-gen-go-grpc v1.2.0 +// - protoc (unknown) +// source: proto/echo/v1/echo.proto + +package v1 + +import ( + context "context" + grpc "google.golang.org/grpc" + codes "google.golang.org/grpc/codes" + status "google.golang.org/grpc/status" +) + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.32.0 or later. +const _ = grpc.SupportPackageIsVersion7 + +// EchoServiceClient is the client API for EchoService service. +// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. +type EchoServiceClient interface { + // Echo an EchoRequest payload in an EchoResponse. 
+ Echo(ctx context.Context, in *EchoRequest, opts ...grpc.CallOption) (*EchoResponse, error) +} + +type echoServiceClient struct { + cc grpc.ClientConnInterface +} + +func NewEchoServiceClient(cc grpc.ClientConnInterface) EchoServiceClient { + return &echoServiceClient{cc} +} + +func (c *echoServiceClient) Echo(ctx context.Context, in *EchoRequest, opts ...grpc.CallOption) (*EchoResponse, error) { + out := new(EchoResponse) + err := c.cc.Invoke(ctx, "/proto.echo.v1.EchoService/Echo", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +// EchoServiceServer is the server API for EchoService service. +// All implementations must embed UnimplementedEchoServiceServer +// for forward compatibility +type EchoServiceServer interface { + // Echo an EchoRequest payload in an EchoResponse. + Echo(context.Context, *EchoRequest) (*EchoResponse, error) + mustEmbedUnimplementedEchoServiceServer() +} + +// UnimplementedEchoServiceServer must be embedded to have forward compatible implementations. +type UnimplementedEchoServiceServer struct { +} + +func (UnimplementedEchoServiceServer) Echo(context.Context, *EchoRequest) (*EchoResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method Echo not implemented") +} +func (UnimplementedEchoServiceServer) mustEmbedUnimplementedEchoServiceServer() {} + +// UnsafeEchoServiceServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to EchoServiceServer will +// result in compilation errors. +type UnsafeEchoServiceServer interface { + mustEmbedUnimplementedEchoServiceServer() +} + +func RegisterEchoServiceServer(s grpc.ServiceRegistrar, srv EchoServiceServer) { + s.RegisterService(&EchoService_ServiceDesc, srv) +} + +func _EchoService_Echo_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(EchoRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(EchoServiceServer).Echo(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/proto.echo.v1.EchoService/Echo", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(EchoServiceServer).Echo(ctx, req.(*EchoRequest)) + } + return interceptor(ctx, in, info, handler) +} + +// EchoService_ServiceDesc is the grpc.ServiceDesc for EchoService service. +// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var EchoService_ServiceDesc = grpc.ServiceDesc{ + ServiceName: "proto.echo.v1.EchoService", + HandlerType: (*EchoServiceServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "Echo", + Handler: _EchoService_Echo_Handler, + }, + }, + Streams: []grpc.StreamDesc{}, + Metadata: "proto/echo/v1/echo.proto", +} diff --git a/.test-infra/mock-apis/src/main/go/internal/service/echo/echo.go b/.test-infra/mock-apis/src/main/go/internal/service/echo/echo.go new file mode 100644 index 0000000000000..d0682551775f7 --- /dev/null +++ b/.test-infra/mock-apis/src/main/go/internal/service/echo/echo.go @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. 
+// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package echo contains the EchoService API implementation. +package echo + +import ( + "context" + "encoding/json" + "fmt" + "log/slog" + "net/http" + "path" + "reflect" + "time" + + "github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/cache" + "github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/logging" + "github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/metric" + echov1 "github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/proto/echo/v1" + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/health/grpc_health_v1" + "google.golang.org/grpc/status" +) + +const ( + metricsNamePrefix = "echo" + echoPath = "/proto.echo.v1.EchoService/Echo" + PathAlias = "/v1/echo" + healthPath = "/grpc.health.v1.Health/Check" + healthPathAlias = "/v1/healthz" +) + +type Options struct { + Decrementer cache.Decrementer + MetricsWriter metric.Writer + Logger *slog.Logger + LoggingAttrs []slog.Attr +} + +// Register a grpc.Server with the echov1.EchoService. Returns a http.Handler or error. +func Register(s *grpc.Server, opts *Options) (http.Handler, error) { + if opts.Logger == nil { + opts.Logger = logging.New(&logging.Options{ + Name: reflect.TypeOf((*echo)(nil)).PkgPath(), + }) + } + var attrs []any + for _, attr := range opts.LoggingAttrs { + attrs = append(attrs, attr) + } + opts.Logger = opts.Logger.With(attrs...) + srv := &echo{ + opts: opts, + } + + echov1.RegisterEchoServiceServer(s, srv) + grpc_health_v1.RegisterHealthServer(s, srv) + + return srv, nil +} + +type echo struct { + echov1.UnimplementedEchoServiceServer + grpc_health_v1.UnimplementedHealthServer + opts *Options +} + +// ServeHTTP implements http.Handler, allowing echo to support HTTP clients in addition to gRPC. +func (srv *echo) ServeHTTP(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case echoPath, PathAlias: + srv.httpHandler(w, r) + case healthPath, healthPathAlias: + srv.checkHandler(w, r) + default: + http.Error(w, fmt.Sprintf("%s not found", r.URL.Path), http.StatusNotFound) + } +} + +// Check checks whether echo service's underlying decrementer is alive. +func (srv *echo) Check(ctx context.Context, _ *grpc_health_v1.HealthCheckRequest) (*grpc_health_v1.HealthCheckResponse, error) { + if err := srv.opts.Decrementer.Alive(ctx); err != nil { + return nil, err + } + return &grpc_health_v1.HealthCheckResponse{ + Status: grpc_health_v1.HealthCheckResponse_SERVING, + }, nil +} + +func (srv *echo) checkHandler(w http.ResponseWriter, r *http.Request) { + resp, err := srv.Check(r.Context(), nil) + if err != nil { + + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + if err := json.NewEncoder(w).Encode(resp); err != nil { + srv.opts.Logger.Log(r.Context(), slog.LevelError, err.Error()) + http.Error(w, err.Error(), http.StatusInternalServerError) + } +} + +// Watch the health of the echov1.EchoServiceServer. 
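
A client-side sketch of exercising this service over gRPC; the address localhost:50051 and the quota id "demo-quota" are assumptions for illustration. A ResourceExhausted status means the quota is spent until the refresher's next interval.

package main

import (
	"context"
	"log"

	echov1 "github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/proto/echo/v1"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/grpc/status"
)

func main() {
	conn, err := grpc.Dial("localhost:50051", grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()

	client := echov1.NewEchoServiceClient(conn)
	resp, err := client.Echo(context.Background(), &echov1.EchoRequest{
		Id:      "demo-quota", // hypothetical quota id
		Payload: []byte("ping"),
	})
	switch status.Code(err) {
	case codes.OK:
		log.Printf("echoed %q", resp.Payload)
	case codes.ResourceExhausted:
		log.Print("quota spent; retry after the refresh interval")
	default:
		log.Fatal(err)
	}
}
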
+func (srv *echo) Watch(request *grpc_health_v1.HealthCheckRequest, server grpc_health_v1.Health_WatchServer) error {
+	resp, err := srv.Check(server.Context(), request)
+	if err != nil {
+		srv.opts.Logger.Log(server.Context(), slog.LevelError, err.Error())
+		return err
+	}
+	return server.Send(resp)
+}
+
+// Echo responds to an EchoRequest with an EchoResponse. Decrements an underlying quota identified by the id of the request.
+// Returns a NotFound status error (detectable via cache.IsNotExist on the cause) when the request's id does not map to a key in the cache.
+// See cache.Refresher for how the cache refreshes the quota identified by the request id.
+func (srv *echo) Echo(ctx context.Context, request *echov1.EchoRequest) (*echov1.EchoResponse, error) {
+	v, err := srv.opts.Decrementer.Decrement(ctx, request.Id)
+	if cache.IsNotExist(err) {
+		return nil, status.Errorf(codes.NotFound, "error: source not found: %s, err %v", request.Id, err)
+	}
+	if err != nil {
+		srv.opts.Logger.Log(ctx, slog.LevelError, err.Error())
+		return nil, status.Errorf(codes.Internal, "error: encountered from cache for resource: %s, err %v", request.Id, err)
+	}
+
+	if err := srv.writeMetric(ctx, request.Id, v); err != nil {
+		return nil, err
+	}
+
+	if v < 0 {
+		return nil, status.Errorf(codes.ResourceExhausted, "error: resource exhausted for: %s", request.Id)
+	}
+
+	return &echov1.EchoResponse{
+		Id:      request.Id,
+		Payload: request.Payload,
+	}, nil
+}
+
+func (srv *echo) writeMetric(ctx context.Context, id string, value int64) error {
+	if srv.opts.MetricsWriter == nil {
+		return nil
+	}
+	if err := srv.opts.MetricsWriter.Write(ctx, path.Join(metricsNamePrefix, id), "unit", &metric.Point{
+		Timestamp: time.Now(),
+		Value:     value + 1,
+	}); err != nil {
+		srv.opts.Logger.Log(ctx, slog.LevelError, err.Error())
+	}
+	return nil
+}
+
+func (srv *echo) httpHandler(w http.ResponseWriter, r *http.Request) {
+	var body *echov1.EchoRequest
+	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
+		err = fmt.Errorf("error decoding request body, payload field of %T needs to be base64 encoded, error: %w", body, err)
+		srv.opts.Logger.Log(r.Context(), slog.LevelError, err.Error())
+		http.Error(w, err.Error(), http.StatusBadRequest)
+		return
+	}
+
+	resp, err := srv.Echo(r.Context(), body)
+
+	switch status.Code(err) {
+	case codes.OK:
+		if err := json.NewEncoder(w).Encode(resp); err != nil {
+			srv.opts.Logger.Log(r.Context(), slog.LevelError, err.Error())
+			http.Error(w, err.Error(), http.StatusInternalServerError)
+		}
+	case codes.InvalidArgument:
+		http.Error(w, err.Error(), http.StatusBadRequest)
+	case codes.DeadlineExceeded:
+		http.Error(w, err.Error(), http.StatusRequestTimeout)
+	case codes.NotFound:
+		http.Error(w, err.Error(), http.StatusNotFound)
+	case codes.ResourceExhausted:
+		http.Error(w, err.Error(), http.StatusTooManyRequests)
+	default:
+		srv.opts.Logger.Log(r.Context(), slog.LevelError, err.Error())
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+	}
+}
diff --git a/.test-infra/mock-apis/src/main/go/test/integration/echo/echo_test.go b/.test-infra/mock-apis/src/main/go/test/integration/echo/echo_test.go
new file mode 100644
index 0000000000000..102d8508d481f
--- /dev/null
+++ b/.test-infra/mock-apis/src/main/go/test/integration/echo/echo_test.go
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Tests for the src/main/go/cmd/service/echo service. +package echo + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "net/http" + "regexp" + "sync" + "testing" + "time" + + echov1 "github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/proto/echo/v1" + "github.com/apache/beam/test-infra/mock-apis/src/main/go/internal/service/echo" + "github.com/apache/beam/test-infra/mock-apis/src/main/go/test/integration" + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/status" +) + +const ( + // QuotaIds below correspond to: + // kubectl get deploy --selector=app.kubernetes.io/tag=refresher -o custom-columns='QUOTA_ID:.metadata.labels.quota-id' + // See https://github.com/apache/beam/tree/master/.test-infra/mock-apis#writing-integration-tests + shouldExceedQuotaId = "echo-should-exceed-quota" + shouldNeverExceedQuotaId = "echo-should-never-exceed-quota" + shouldNotExistId = "should-not-exist" + refresh10Per1s = "echo-10-per-1s-quota" + defaultNumCalls = 3 +) + +var ( + grpcOpts = []grpc.DialOption{ + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithBlock(), + } + + timeout = time.Second * 3 +) + +func TestEcho(t *testing.T) { + payload := []byte("payload") + + for _, tt := range []struct { + tag string + quotaId string + client echov1.EchoServiceClient + want *echov1.EchoResponse + numCalls int + wantErr error + }{ + { + tag: "http", + quotaId: shouldExceedQuotaId, + client: withHttp(t), + wantErr: errors.New("429 Too Many Requests"), + }, + { + tag: "grpc", + quotaId: shouldExceedQuotaId, + client: withGrpc(t), + wantErr: status.Error(codes.ResourceExhausted, "error: resource exhausted for: echo-should-exceed-quota"), + }, + { + tag: "http", + quotaId: shouldNotExistId, + client: withHttp(t), + wantErr: errors.New("404 Not Found"), + }, + { + tag: "grpc", + quotaId: shouldNotExistId, + client: withGrpc(t), + wantErr: status.Error(codes.NotFound, "error: source not found: should-not-exist, err resource does not exist"), + }, + { + tag: "http", + quotaId: shouldNeverExceedQuotaId, + client: withHttp(t), + want: &echov1.EchoResponse{ + Id: shouldNeverExceedQuotaId, + Payload: payload, + }, + }, + { + tag: "grpc", + quotaId: shouldNeverExceedQuotaId, + client: withGrpc(t), + want: &echov1.EchoResponse{ + Id: shouldNeverExceedQuotaId, + Payload: payload, + }, + }, + { + numCalls: 20, + tag: "grpc", + quotaId: refresh10Per1s, + client: withGrpc(t), + wantErr: status.Error(codes.ResourceExhausted, "error: resource exhausted for: echo-10-per-1s-quota"), + }, + } { + t.Run(fmt.Sprintf("%s/%s", tt.quotaId, tt.tag), func(t *testing.T) { + ctx, cancel := withTimeout() + defer cancel() + + if tt.numCalls == 0 { + tt.numCalls = defaultNumCalls + } + + wg := sync.WaitGroup{} + wg.Add(tt.numCalls) + + req := &echov1.EchoRequest{ + 
Id:      tt.quotaId,
+				Payload: payload,
+			}
+
+			var resps []*echov1.EchoResponse
+			var errs []error
+			var mu sync.Mutex
+
+			for i := 0; i < tt.numCalls; i++ {
+				go func() {
+					resp, err := tt.client.Echo(ctx, req)
+					// Guard the shared slices; the calls run concurrently.
+					mu.Lock()
+					if err != nil {
+						errs = append(errs, err)
+					}
+					if resp != nil {
+						resps = append(resps, resp)
+					}
+					mu.Unlock()
+					wg.Done()
+				}()
+			}
+
+			wg.Wait()
+
+			if tt.wantErr != nil && len(errs) == 0 {
+				t.Errorf("Echo(%+v) err = nil, wantErr = %v", req, tt.wantErr)
+				return
+			}
+
+			for _, err := range errs {
+				if tt.wantErr == nil {
+					t.Errorf("Echo(%+v) unexpected err = %v", req, err)
+					continue
+				}
+				if diff := cmp.Diff(tt.wantErr.Error(), err.Error()); diff != "" {
+					t.Errorf("Echo(%+v) err mismatch (-want +got)\n%s", req, diff)
+				}
+			}
+
+			if tt.want != nil {
+				for _, resp := range resps {
+					if diff := cmp.Diff(tt.want, resp, cmpopts.IgnoreUnexported(echov1.EchoResponse{})); diff != "" {
+						t.Errorf("Echo(%+v) mismatch (-want +got)\n%s", req, diff)
+					}
+				}
+			}
+		})
+	}
+}
+
+func TestMain(m *testing.M) {
+	integration.Run(m)
+}
+
+func withGrpc(t *testing.T) echov1.EchoServiceClient {
+	t.Helper()
+	ctx, cancel := withTimeout()
+	defer cancel()
+
+	if *integration.GRPCServiceEndpoint == "" {
+		t.Fatalf("missing flag: -%s", integration.GrpcServiceEndpointFlag)
+	}
+
+	conn, err := grpc.DialContext(ctx, *integration.GRPCServiceEndpoint, grpcOpts...)
+	if err != nil {
+		t.Fatalf("DialContext(%s) err %v", *integration.GRPCServiceEndpoint, err)
+	}
+	t.Cleanup(func() {
+		if err := conn.Close(); err != nil {
+			t.Fatal(err)
+		}
+	})
+
+	return echov1.NewEchoServiceClient(conn)
+}
+
+type httpCaller struct {
+	rawUrl string
+}
+
+func (h *httpCaller) Echo(ctx context.Context, in *echov1.EchoRequest, _ ...grpc.CallOption) (*echov1.EchoResponse, error) {
+	ctx, cancel := withTimeout()
+	defer cancel()
+	buf := bytes.Buffer{}
+	if err := json.NewEncoder(&buf).Encode(in); err != nil {
+		return nil, err
+	}
+
+	// Attach the timeout context to the request; http.Post would ignore it.
+	request, err := http.NewRequestWithContext(ctx, http.MethodPost, h.rawUrl, &buf)
+	if err != nil {
+		return nil, err
+	}
+	request.Header.Set("Content-Type", "application/json")
+	resp, err := http.DefaultClient.Do(request)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode > 299 {
+		return nil, errors.New(resp.Status)
+	}
+
+	var result *echov1.EchoResponse
+	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
+		return nil, err
+	}
+	return result, nil
+}
+
+func withHttp(t *testing.T) echov1.EchoServiceClient {
+	if *integration.HTTPServiceEndpoint == "" {
+		t.Fatalf("missing flag: -%s", integration.HttpServiceEndpointFlag)
+	}
+	p := regexp.MustCompile(`^http://`)
+	rawUrl := fmt.Sprint(*integration.HTTPServiceEndpoint, echo.PathAlias)
+	if !p.MatchString(rawUrl) {
+		t.Fatalf("missing 'http(s)' scheme from %s", *integration.HTTPServiceEndpoint)
+	}
+	return &httpCaller{
+		rawUrl: rawUrl,
+	}
+}
+
+func withTimeout() (context.Context, context.CancelFunc) {
+	return context.WithTimeout(context.Background(), timeout)
+}
diff --git a/.test-infra/mock-apis/src/main/go/test/integration/integration.go b/.test-infra/mock-apis/src/main/go/test/integration/integration.go
new file mode 100644
index 0000000000000..777225061eacf
--- /dev/null
+++ b/.test-infra/mock-apis/src/main/go/test/integration/integration.go
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. 
diff --git a/.test-infra/mock-apis/src/main/go/test/integration/integration.go b/.test-infra/mock-apis/src/main/go/test/integration/integration.go
new file mode 100644
index 0000000000000..777225061eacf
--- /dev/null
+++ b/.test-infra/mock-apis/src/main/go/test/integration/integration.go
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package integration provides functionality that needs to be shared between
+// all integration tests.
+package integration
+
+import (
+	"flag"
+	"os"
+	"testing"
+)
+
+// Run runs a testing.M, first calling flag.Parse if flags are not yet parsed.
+func Run(m *testing.M) {
+	if !flag.Parsed() {
+		flag.Parse()
+	}
+	os.Exit(m.Run())
+}
diff --git a/.test-infra/mock-apis/src/main/go/test/integration/vars.go b/.test-infra/mock-apis/src/main/go/test/integration/vars.go
new file mode 100644
index 0000000000000..a4bd9bb60e2ef
--- /dev/null
+++ b/.test-infra/mock-apis/src/main/go/test/integration/vars.go
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package integration
+
+import (
+	"flag"
+	"fmt"
+)
+
+const (
+	GrpcServiceEndpointFlag = "grpc_service_endpoint"
+	HttpServiceEndpointFlag = "http_service_endpoint"
+
+	moreInfoUrl = "https://github.com/apache/beam/tree/master/.test-infra/mock-apis#writing-integration-tests"
+)
+
+var (
+	moreInfo = fmt.Sprintf("See %s for more information on how to get the relevant value for your test.", moreInfoUrl)
+
+	requiredFlags = []string{
+		GrpcServiceEndpointFlag,
+		HttpServiceEndpointFlag,
+	}
+)
+
+// The following flags apply to one or more integration tests and are used via
+// go test ./src/main/go/test/integration/...
+var (
+	// GRPCServiceEndpoint is the address of the deployed gRPC service.
+	GRPCServiceEndpoint = flag.String(GrpcServiceEndpointFlag, "",
+		"The endpoint to target gRPC calls to a service. "+moreInfo)
+
+	// HTTPServiceEndpoint is the address of the deployed HTTP service.
+	HTTPServiceEndpoint = flag.String(HttpServiceEndpointFlag, "",
+		"The endpoint to target HTTP calls to a service. "+moreInfo)
+)
diff --git a/.test-infra/mock-apis/src/main/java/org/apache/beam/testinfra/mockapis/echo/v1/Echo.java b/.test-infra/mock-apis/src/main/java/org/apache/beam/testinfra/mockapis/echo/v1/Echo.java
new file mode 100644
index 0000000000000..4652ff716b87d
--- /dev/null
+++ b/.test-infra/mock-apis/src/main/java/org/apache/beam/testinfra/mockapis/echo/v1/Echo.java
@@ -0,0 +1,1447 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.testinfra.mockapis.echo.v1; + +@SuppressWarnings({ + "argument", + "assignment", + "initialization.fields.uninitialized", + "initialization.static.field.uninitialized", + "override.param", + "ClassTypeParameterName", + "ForbidNonVendoredGuava", + "JavadocStyle", + "LocalVariableName", + "MemberName", + "NeedBraces", + "MissingOverride", + "RedundantModifier", + "ReferenceEquality", + "UnusedVariable", +}) +public final class Echo { + private Echo() {} + + public static void registerAllExtensions(com.google.protobuf.ExtensionRegistryLite registry) {} + + public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) { + registerAllExtensions((com.google.protobuf.ExtensionRegistryLite) registry); + } + + public interface EchoRequestOrBuilder + extends + // @@protoc_insertion_point(interface_extends:proto.echo.v1.EchoRequest) + com.google.protobuf.MessageOrBuilder { + + /** + * string id = 1 [json_name = "id"]; + * + * @return The id. + */ + java.lang.String getId(); + /** + * string id = 1 [json_name = "id"]; + * + * @return The bytes for id. + */ + com.google.protobuf.ByteString getIdBytes(); + + /** + * bytes payload = 2 [json_name = "payload"]; + * + * @return The payload. + */ + com.google.protobuf.ByteString getPayload(); + } + /** + * + * + *
+   * The request to echo a payload.
+   * 
+ * + * Protobuf type {@code proto.echo.v1.EchoRequest} + */ + public static final class EchoRequest extends com.google.protobuf.GeneratedMessageV3 + implements + // @@protoc_insertion_point(message_implements:proto.echo.v1.EchoRequest) + EchoRequestOrBuilder { + private static final long serialVersionUID = 0L; + // Use EchoRequest.newBuilder() to construct. + private EchoRequest(com.google.protobuf.GeneratedMessageV3.Builder builder) { + super(builder); + } + + private EchoRequest() { + id_ = ""; + payload_ = com.google.protobuf.ByteString.EMPTY; + } + + @java.lang.Override + @SuppressWarnings({"unused"}) + protected java.lang.Object newInstance(UnusedPrivateParameter unused) { + return new EchoRequest(); + } + + @java.lang.Override + public final com.google.protobuf.UnknownFieldSet getUnknownFields() { + return this.unknownFields; + } + + public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() { + return org.apache.beam.testinfra.mockapis.echo.v1.Echo + .internal_static_proto_echo_v1_EchoRequest_descriptor; + } + + @java.lang.Override + protected com.google.protobuf.GeneratedMessageV3.FieldAccessorTable + internalGetFieldAccessorTable() { + return org.apache.beam.testinfra.mockapis.echo.v1.Echo + .internal_static_proto_echo_v1_EchoRequest_fieldAccessorTable + .ensureFieldAccessorsInitialized( + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest.class, + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest.Builder.class); + } + + public static final int ID_FIELD_NUMBER = 1; + + @SuppressWarnings("serial") + private volatile java.lang.Object id_ = ""; + /** + * string id = 1 [json_name = "id"]; + * + * @return The id. + */ + @java.lang.Override + public java.lang.String getId() { + java.lang.Object ref = id_; + if (ref instanceof java.lang.String) { + return (java.lang.String) ref; + } else { + com.google.protobuf.ByteString bs = (com.google.protobuf.ByteString) ref; + java.lang.String s = bs.toStringUtf8(); + id_ = s; + return s; + } + } + /** + * string id = 1 [json_name = "id"]; + * + * @return The bytes for id. + */ + @java.lang.Override + public com.google.protobuf.ByteString getIdBytes() { + java.lang.Object ref = id_; + if (ref instanceof java.lang.String) { + com.google.protobuf.ByteString b = + com.google.protobuf.ByteString.copyFromUtf8((java.lang.String) ref); + id_ = b; + return b; + } else { + return (com.google.protobuf.ByteString) ref; + } + } + + public static final int PAYLOAD_FIELD_NUMBER = 2; + private com.google.protobuf.ByteString payload_ = com.google.protobuf.ByteString.EMPTY; + /** + * bytes payload = 2 [json_name = "payload"]; + * + * @return The payload. 
+ */ + @java.lang.Override + public com.google.protobuf.ByteString getPayload() { + return payload_; + } + + private byte memoizedIsInitialized = -1; + + @java.lang.Override + public final boolean isInitialized() { + byte isInitialized = memoizedIsInitialized; + if (isInitialized == 1) return true; + if (isInitialized == 0) return false; + + memoizedIsInitialized = 1; + return true; + } + + @java.lang.Override + public void writeTo(com.google.protobuf.CodedOutputStream output) throws java.io.IOException { + if (!com.google.protobuf.GeneratedMessageV3.isStringEmpty(id_)) { + com.google.protobuf.GeneratedMessageV3.writeString(output, 1, id_); + } + if (!payload_.isEmpty()) { + output.writeBytes(2, payload_); + } + getUnknownFields().writeTo(output); + } + + @java.lang.Override + public int getSerializedSize() { + int size = memoizedSize; + if (size != -1) return size; + + size = 0; + if (!com.google.protobuf.GeneratedMessageV3.isStringEmpty(id_)) { + size += com.google.protobuf.GeneratedMessageV3.computeStringSize(1, id_); + } + if (!payload_.isEmpty()) { + size += com.google.protobuf.CodedOutputStream.computeBytesSize(2, payload_); + } + size += getUnknownFields().getSerializedSize(); + memoizedSize = size; + return size; + } + + @java.lang.Override + public boolean equals(final java.lang.Object obj) { + if (obj == this) { + return true; + } + if (!(obj instanceof org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest)) { + return super.equals(obj); + } + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest other = + (org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest) obj; + + if (!getId().equals(other.getId())) return false; + if (!getPayload().equals(other.getPayload())) return false; + if (!getUnknownFields().equals(other.getUnknownFields())) return false; + return true; + } + + @java.lang.Override + public int hashCode() { + if (memoizedHashCode != 0) { + return memoizedHashCode; + } + int hash = 41; + hash = (19 * hash) + getDescriptor().hashCode(); + hash = (37 * hash) + ID_FIELD_NUMBER; + hash = (53 * hash) + getId().hashCode(); + hash = (37 * hash) + PAYLOAD_FIELD_NUMBER; + hash = (53 * hash) + getPayload().hashCode(); + hash = (29 * hash) + getUnknownFields().hashCode(); + memoizedHashCode = hash; + return hash; + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest parseFrom( + java.nio.ByteBuffer data) throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest parseFrom( + java.nio.ByteBuffer data, com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data, extensionRegistry); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest parseFrom( + com.google.protobuf.ByteString data) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest parseFrom( + com.google.protobuf.ByteString data, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data, extensionRegistry); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest parseFrom(byte[] data) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data); + } + + public static 
org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest parseFrom( + byte[] data, com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data, extensionRegistry); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest parseFrom( + java.io.InputStream input) throws java.io.IOException { + return com.google.protobuf.GeneratedMessageV3.parseWithIOException(PARSER, input); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest parseFrom( + java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + return com.google.protobuf.GeneratedMessageV3.parseWithIOException( + PARSER, input, extensionRegistry); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest parseDelimitedFrom( + java.io.InputStream input) throws java.io.IOException { + return com.google.protobuf.GeneratedMessageV3.parseDelimitedWithIOException(PARSER, input); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest parseDelimitedFrom( + java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + return com.google.protobuf.GeneratedMessageV3.parseDelimitedWithIOException( + PARSER, input, extensionRegistry); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest parseFrom( + com.google.protobuf.CodedInputStream input) throws java.io.IOException { + return com.google.protobuf.GeneratedMessageV3.parseWithIOException(PARSER, input); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest parseFrom( + com.google.protobuf.CodedInputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + return com.google.protobuf.GeneratedMessageV3.parseWithIOException( + PARSER, input, extensionRegistry); + } + + @java.lang.Override + public Builder newBuilderForType() { + return newBuilder(); + } + + public static Builder newBuilder() { + return DEFAULT_INSTANCE.toBuilder(); + } + + public static Builder newBuilder( + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest prototype) { + return DEFAULT_INSTANCE.toBuilder().mergeFrom(prototype); + } + + @java.lang.Override + public Builder toBuilder() { + return this == DEFAULT_INSTANCE ? new Builder() : new Builder().mergeFrom(this); + } + + @java.lang.Override + protected Builder newBuilderForType( + com.google.protobuf.GeneratedMessageV3.BuilderParent parent) { + Builder builder = new Builder(parent); + return builder; + } + /** + * + * + *
+     * The request to echo a payload.
+     * 
+ * + * Protobuf type {@code proto.echo.v1.EchoRequest} + */ + public static final class Builder + extends com.google.protobuf.GeneratedMessageV3.Builder + implements + // @@protoc_insertion_point(builder_implements:proto.echo.v1.EchoRequest) + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequestOrBuilder { + public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() { + return org.apache.beam.testinfra.mockapis.echo.v1.Echo + .internal_static_proto_echo_v1_EchoRequest_descriptor; + } + + @java.lang.Override + protected com.google.protobuf.GeneratedMessageV3.FieldAccessorTable + internalGetFieldAccessorTable() { + return org.apache.beam.testinfra.mockapis.echo.v1.Echo + .internal_static_proto_echo_v1_EchoRequest_fieldAccessorTable + .ensureFieldAccessorsInitialized( + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest.class, + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest.Builder.class); + } + + // Construct using org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest.newBuilder() + private Builder() {} + + private Builder(com.google.protobuf.GeneratedMessageV3.BuilderParent parent) { + super(parent); + } + + @java.lang.Override + public Builder clear() { + super.clear(); + bitField0_ = 0; + id_ = ""; + payload_ = com.google.protobuf.ByteString.EMPTY; + return this; + } + + @java.lang.Override + public com.google.protobuf.Descriptors.Descriptor getDescriptorForType() { + return org.apache.beam.testinfra.mockapis.echo.v1.Echo + .internal_static_proto_echo_v1_EchoRequest_descriptor; + } + + @java.lang.Override + public org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest + getDefaultInstanceForType() { + return org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest.getDefaultInstance(); + } + + @java.lang.Override + public org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest build() { + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest result = buildPartial(); + if (!result.isInitialized()) { + throw newUninitializedMessageException(result); + } + return result; + } + + @java.lang.Override + public org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest buildPartial() { + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest result = + new org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest(this); + if (bitField0_ != 0) { + buildPartial0(result); + } + onBuilt(); + return result; + } + + private void buildPartial0( + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest result) { + int from_bitField0_ = bitField0_; + if (((from_bitField0_ & 0x00000001) != 0)) { + result.id_ = id_; + } + if (((from_bitField0_ & 0x00000002) != 0)) { + result.payload_ = payload_; + } + } + + @java.lang.Override + public Builder clone() { + return super.clone(); + } + + @java.lang.Override + public Builder setField( + com.google.protobuf.Descriptors.FieldDescriptor field, java.lang.Object value) { + return super.setField(field, value); + } + + @java.lang.Override + public Builder clearField(com.google.protobuf.Descriptors.FieldDescriptor field) { + return super.clearField(field); + } + + @java.lang.Override + public Builder clearOneof(com.google.protobuf.Descriptors.OneofDescriptor oneof) { + return super.clearOneof(oneof); + } + + @java.lang.Override + public Builder setRepeatedField( + com.google.protobuf.Descriptors.FieldDescriptor field, + int index, + java.lang.Object value) { + return super.setRepeatedField(field, index, value); + } + + @java.lang.Override + public Builder addRepeatedField( + 
com.google.protobuf.Descriptors.FieldDescriptor field, java.lang.Object value) { + return super.addRepeatedField(field, value); + } + + @java.lang.Override + public Builder mergeFrom(com.google.protobuf.Message other) { + if (other instanceof org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest) { + return mergeFrom((org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest) other); + } else { + super.mergeFrom(other); + return this; + } + } + + public Builder mergeFrom(org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest other) { + if (other + == org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest.getDefaultInstance()) + return this; + if (!other.getId().isEmpty()) { + id_ = other.id_; + bitField0_ |= 0x00000001; + onChanged(); + } + if (other.getPayload() != com.google.protobuf.ByteString.EMPTY) { + setPayload(other.getPayload()); + } + this.mergeUnknownFields(other.getUnknownFields()); + onChanged(); + return this; + } + + @java.lang.Override + public final boolean isInitialized() { + return true; + } + + @java.lang.Override + public Builder mergeFrom( + com.google.protobuf.CodedInputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + if (extensionRegistry == null) { + throw new java.lang.NullPointerException(); + } + try { + boolean done = false; + while (!done) { + int tag = input.readTag(); + switch (tag) { + case 0: + done = true; + break; + case 10: + { + id_ = input.readStringRequireUtf8(); + bitField0_ |= 0x00000001; + break; + } // case 10 + case 18: + { + payload_ = input.readBytes(); + bitField0_ |= 0x00000002; + break; + } // case 18 + default: + { + if (!super.parseUnknownField(input, extensionRegistry, tag)) { + done = true; // was an endgroup tag + } + break; + } // default: + } // switch (tag) + } // while (!done) + } catch (com.google.protobuf.InvalidProtocolBufferException e) { + throw e.unwrapIOException(); + } finally { + onChanged(); + } // finally + return this; + } + + private int bitField0_; + + private java.lang.Object id_ = ""; + /** + * string id = 1 [json_name = "id"]; + * + * @return The id. + */ + public java.lang.String getId() { + java.lang.Object ref = id_; + if (!(ref instanceof java.lang.String)) { + com.google.protobuf.ByteString bs = (com.google.protobuf.ByteString) ref; + java.lang.String s = bs.toStringUtf8(); + id_ = s; + return s; + } else { + return (java.lang.String) ref; + } + } + /** + * string id = 1 [json_name = "id"]; + * + * @return The bytes for id. + */ + public com.google.protobuf.ByteString getIdBytes() { + java.lang.Object ref = id_; + if (ref instanceof String) { + com.google.protobuf.ByteString b = + com.google.protobuf.ByteString.copyFromUtf8((java.lang.String) ref); + id_ = b; + return b; + } else { + return (com.google.protobuf.ByteString) ref; + } + } + /** + * string id = 1 [json_name = "id"]; + * + * @param value The id to set. + * @return This builder for chaining. + */ + public Builder setId(java.lang.String value) { + if (value == null) { + throw new NullPointerException(); + } + id_ = value; + bitField0_ |= 0x00000001; + onChanged(); + return this; + } + /** + * string id = 1 [json_name = "id"]; + * + * @return This builder for chaining. + */ + public Builder clearId() { + id_ = getDefaultInstance().getId(); + bitField0_ = (bitField0_ & ~0x00000001); + onChanged(); + return this; + } + /** + * string id = 1 [json_name = "id"]; + * + * @param value The bytes for id to set. + * @return This builder for chaining. 
+ */ + public Builder setIdBytes(com.google.protobuf.ByteString value) { + if (value == null) { + throw new NullPointerException(); + } + checkByteStringIsUtf8(value); + id_ = value; + bitField0_ |= 0x00000001; + onChanged(); + return this; + } + + private com.google.protobuf.ByteString payload_ = com.google.protobuf.ByteString.EMPTY; + /** + * bytes payload = 2 [json_name = "payload"]; + * + * @return The payload. + */ + @java.lang.Override + public com.google.protobuf.ByteString getPayload() { + return payload_; + } + /** + * bytes payload = 2 [json_name = "payload"]; + * + * @param value The payload to set. + * @return This builder for chaining. + */ + public Builder setPayload(com.google.protobuf.ByteString value) { + if (value == null) { + throw new NullPointerException(); + } + payload_ = value; + bitField0_ |= 0x00000002; + onChanged(); + return this; + } + /** + * bytes payload = 2 [json_name = "payload"]; + * + * @return This builder for chaining. + */ + public Builder clearPayload() { + bitField0_ = (bitField0_ & ~0x00000002); + payload_ = getDefaultInstance().getPayload(); + onChanged(); + return this; + } + + @java.lang.Override + public final Builder setUnknownFields( + final com.google.protobuf.UnknownFieldSet unknownFields) { + return super.setUnknownFields(unknownFields); + } + + @java.lang.Override + public final Builder mergeUnknownFields( + final com.google.protobuf.UnknownFieldSet unknownFields) { + return super.mergeUnknownFields(unknownFields); + } + + // @@protoc_insertion_point(builder_scope:proto.echo.v1.EchoRequest) + } + + // @@protoc_insertion_point(class_scope:proto.echo.v1.EchoRequest) + private static final org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest + DEFAULT_INSTANCE; + + static { + DEFAULT_INSTANCE = new org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest(); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest getDefaultInstance() { + return DEFAULT_INSTANCE; + } + + private static final com.google.protobuf.Parser PARSER = + new com.google.protobuf.AbstractParser() { + @java.lang.Override + public EchoRequest parsePartialFrom( + com.google.protobuf.CodedInputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + Builder builder = newBuilder(); + try { + builder.mergeFrom(input, extensionRegistry); + } catch (com.google.protobuf.InvalidProtocolBufferException e) { + throw e.setUnfinishedMessage(builder.buildPartial()); + } catch (com.google.protobuf.UninitializedMessageException e) { + throw e.asInvalidProtocolBufferException() + .setUnfinishedMessage(builder.buildPartial()); + } catch (java.io.IOException e) { + throw new com.google.protobuf.InvalidProtocolBufferException(e) + .setUnfinishedMessage(builder.buildPartial()); + } + return builder.buildPartial(); + } + }; + + public static com.google.protobuf.Parser parser() { + return PARSER; + } + + @java.lang.Override + public com.google.protobuf.Parser getParserForType() { + return PARSER; + } + + @java.lang.Override + public org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest getDefaultInstanceForType() { + return DEFAULT_INSTANCE; + } + } + + public interface EchoResponseOrBuilder + extends + // @@protoc_insertion_point(interface_extends:proto.echo.v1.EchoResponse) + com.google.protobuf.MessageOrBuilder { + + /** + * string id = 1 [json_name = "id"]; + * + * @return The id. 
+ */ + java.lang.String getId(); + /** + * string id = 1 [json_name = "id"]; + * + * @return The bytes for id. + */ + com.google.protobuf.ByteString getIdBytes(); + + /** + * bytes payload = 2 [json_name = "payload"]; + * + * @return The payload. + */ + com.google.protobuf.ByteString getPayload(); + } + /** + * + * + *
+   * The response echo of a request payload.
+   * 
+ * + * Protobuf type {@code proto.echo.v1.EchoResponse} + */ + public static final class EchoResponse extends com.google.protobuf.GeneratedMessageV3 + implements + // @@protoc_insertion_point(message_implements:proto.echo.v1.EchoResponse) + EchoResponseOrBuilder { + private static final long serialVersionUID = 0L; + // Use EchoResponse.newBuilder() to construct. + private EchoResponse(com.google.protobuf.GeneratedMessageV3.Builder builder) { + super(builder); + } + + private EchoResponse() { + id_ = ""; + payload_ = com.google.protobuf.ByteString.EMPTY; + } + + @java.lang.Override + @SuppressWarnings({"unused"}) + protected java.lang.Object newInstance(UnusedPrivateParameter unused) { + return new EchoResponse(); + } + + @java.lang.Override + public final com.google.protobuf.UnknownFieldSet getUnknownFields() { + return this.unknownFields; + } + + public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() { + return org.apache.beam.testinfra.mockapis.echo.v1.Echo + .internal_static_proto_echo_v1_EchoResponse_descriptor; + } + + @java.lang.Override + protected com.google.protobuf.GeneratedMessageV3.FieldAccessorTable + internalGetFieldAccessorTable() { + return org.apache.beam.testinfra.mockapis.echo.v1.Echo + .internal_static_proto_echo_v1_EchoResponse_fieldAccessorTable + .ensureFieldAccessorsInitialized( + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse.class, + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse.Builder.class); + } + + public static final int ID_FIELD_NUMBER = 1; + + @SuppressWarnings("serial") + private volatile java.lang.Object id_ = ""; + /** + * string id = 1 [json_name = "id"]; + * + * @return The id. + */ + @java.lang.Override + public java.lang.String getId() { + java.lang.Object ref = id_; + if (ref instanceof java.lang.String) { + return (java.lang.String) ref; + } else { + com.google.protobuf.ByteString bs = (com.google.protobuf.ByteString) ref; + java.lang.String s = bs.toStringUtf8(); + id_ = s; + return s; + } + } + /** + * string id = 1 [json_name = "id"]; + * + * @return The bytes for id. + */ + @java.lang.Override + public com.google.protobuf.ByteString getIdBytes() { + java.lang.Object ref = id_; + if (ref instanceof java.lang.String) { + com.google.protobuf.ByteString b = + com.google.protobuf.ByteString.copyFromUtf8((java.lang.String) ref); + id_ = b; + return b; + } else { + return (com.google.protobuf.ByteString) ref; + } + } + + public static final int PAYLOAD_FIELD_NUMBER = 2; + private com.google.protobuf.ByteString payload_ = com.google.protobuf.ByteString.EMPTY; + /** + * bytes payload = 2 [json_name = "payload"]; + * + * @return The payload. 
+ */ + @java.lang.Override + public com.google.protobuf.ByteString getPayload() { + return payload_; + } + + private byte memoizedIsInitialized = -1; + + @java.lang.Override + public final boolean isInitialized() { + byte isInitialized = memoizedIsInitialized; + if (isInitialized == 1) return true; + if (isInitialized == 0) return false; + + memoizedIsInitialized = 1; + return true; + } + + @java.lang.Override + public void writeTo(com.google.protobuf.CodedOutputStream output) throws java.io.IOException { + if (!com.google.protobuf.GeneratedMessageV3.isStringEmpty(id_)) { + com.google.protobuf.GeneratedMessageV3.writeString(output, 1, id_); + } + if (!payload_.isEmpty()) { + output.writeBytes(2, payload_); + } + getUnknownFields().writeTo(output); + } + + @java.lang.Override + public int getSerializedSize() { + int size = memoizedSize; + if (size != -1) return size; + + size = 0; + if (!com.google.protobuf.GeneratedMessageV3.isStringEmpty(id_)) { + size += com.google.protobuf.GeneratedMessageV3.computeStringSize(1, id_); + } + if (!payload_.isEmpty()) { + size += com.google.protobuf.CodedOutputStream.computeBytesSize(2, payload_); + } + size += getUnknownFields().getSerializedSize(); + memoizedSize = size; + return size; + } + + @java.lang.Override + public boolean equals(final java.lang.Object obj) { + if (obj == this) { + return true; + } + if (!(obj instanceof org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse)) { + return super.equals(obj); + } + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse other = + (org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse) obj; + + if (!getId().equals(other.getId())) return false; + if (!getPayload().equals(other.getPayload())) return false; + if (!getUnknownFields().equals(other.getUnknownFields())) return false; + return true; + } + + @java.lang.Override + public int hashCode() { + if (memoizedHashCode != 0) { + return memoizedHashCode; + } + int hash = 41; + hash = (19 * hash) + getDescriptor().hashCode(); + hash = (37 * hash) + ID_FIELD_NUMBER; + hash = (53 * hash) + getId().hashCode(); + hash = (37 * hash) + PAYLOAD_FIELD_NUMBER; + hash = (53 * hash) + getPayload().hashCode(); + hash = (29 * hash) + getUnknownFields().hashCode(); + memoizedHashCode = hash; + return hash; + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse parseFrom( + java.nio.ByteBuffer data) throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse parseFrom( + java.nio.ByteBuffer data, com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data, extensionRegistry); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse parseFrom( + com.google.protobuf.ByteString data) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse parseFrom( + com.google.protobuf.ByteString data, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data, extensionRegistry); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse parseFrom( + byte[] data) throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data); + } + + public 
static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse parseFrom( + byte[] data, com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + return PARSER.parseFrom(data, extensionRegistry); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse parseFrom( + java.io.InputStream input) throws java.io.IOException { + return com.google.protobuf.GeneratedMessageV3.parseWithIOException(PARSER, input); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse parseFrom( + java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + return com.google.protobuf.GeneratedMessageV3.parseWithIOException( + PARSER, input, extensionRegistry); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse parseDelimitedFrom( + java.io.InputStream input) throws java.io.IOException { + return com.google.protobuf.GeneratedMessageV3.parseDelimitedWithIOException(PARSER, input); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse parseDelimitedFrom( + java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + return com.google.protobuf.GeneratedMessageV3.parseDelimitedWithIOException( + PARSER, input, extensionRegistry); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse parseFrom( + com.google.protobuf.CodedInputStream input) throws java.io.IOException { + return com.google.protobuf.GeneratedMessageV3.parseWithIOException(PARSER, input); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse parseFrom( + com.google.protobuf.CodedInputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + return com.google.protobuf.GeneratedMessageV3.parseWithIOException( + PARSER, input, extensionRegistry); + } + + @java.lang.Override + public Builder newBuilderForType() { + return newBuilder(); + } + + public static Builder newBuilder() { + return DEFAULT_INSTANCE.toBuilder(); + } + + public static Builder newBuilder( + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse prototype) { + return DEFAULT_INSTANCE.toBuilder().mergeFrom(prototype); + } + + @java.lang.Override + public Builder toBuilder() { + return this == DEFAULT_INSTANCE ? new Builder() : new Builder().mergeFrom(this); + } + + @java.lang.Override + protected Builder newBuilderForType( + com.google.protobuf.GeneratedMessageV3.BuilderParent parent) { + Builder builder = new Builder(parent); + return builder; + } + /** + * + * + *
+     * The response echo of a request payload.
+     * 
+ * + * Protobuf type {@code proto.echo.v1.EchoResponse} + */ + public static final class Builder + extends com.google.protobuf.GeneratedMessageV3.Builder + implements + // @@protoc_insertion_point(builder_implements:proto.echo.v1.EchoResponse) + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponseOrBuilder { + public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() { + return org.apache.beam.testinfra.mockapis.echo.v1.Echo + .internal_static_proto_echo_v1_EchoResponse_descriptor; + } + + @java.lang.Override + protected com.google.protobuf.GeneratedMessageV3.FieldAccessorTable + internalGetFieldAccessorTable() { + return org.apache.beam.testinfra.mockapis.echo.v1.Echo + .internal_static_proto_echo_v1_EchoResponse_fieldAccessorTable + .ensureFieldAccessorsInitialized( + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse.class, + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse.Builder.class); + } + + // Construct using org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse.newBuilder() + private Builder() {} + + private Builder(com.google.protobuf.GeneratedMessageV3.BuilderParent parent) { + super(parent); + } + + @java.lang.Override + public Builder clear() { + super.clear(); + bitField0_ = 0; + id_ = ""; + payload_ = com.google.protobuf.ByteString.EMPTY; + return this; + } + + @java.lang.Override + public com.google.protobuf.Descriptors.Descriptor getDescriptorForType() { + return org.apache.beam.testinfra.mockapis.echo.v1.Echo + .internal_static_proto_echo_v1_EchoResponse_descriptor; + } + + @java.lang.Override + public org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse + getDefaultInstanceForType() { + return org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse.getDefaultInstance(); + } + + @java.lang.Override + public org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse build() { + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse result = buildPartial(); + if (!result.isInitialized()) { + throw newUninitializedMessageException(result); + } + return result; + } + + @java.lang.Override + public org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse buildPartial() { + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse result = + new org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse(this); + if (bitField0_ != 0) { + buildPartial0(result); + } + onBuilt(); + return result; + } + + private void buildPartial0( + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse result) { + int from_bitField0_ = bitField0_; + if (((from_bitField0_ & 0x00000001) != 0)) { + result.id_ = id_; + } + if (((from_bitField0_ & 0x00000002) != 0)) { + result.payload_ = payload_; + } + } + + @java.lang.Override + public Builder clone() { + return super.clone(); + } + + @java.lang.Override + public Builder setField( + com.google.protobuf.Descriptors.FieldDescriptor field, java.lang.Object value) { + return super.setField(field, value); + } + + @java.lang.Override + public Builder clearField(com.google.protobuf.Descriptors.FieldDescriptor field) { + return super.clearField(field); + } + + @java.lang.Override + public Builder clearOneof(com.google.protobuf.Descriptors.OneofDescriptor oneof) { + return super.clearOneof(oneof); + } + + @java.lang.Override + public Builder setRepeatedField( + com.google.protobuf.Descriptors.FieldDescriptor field, + int index, + java.lang.Object value) { + return super.setRepeatedField(field, index, value); + } + + @java.lang.Override + public Builder 
addRepeatedField( + com.google.protobuf.Descriptors.FieldDescriptor field, java.lang.Object value) { + return super.addRepeatedField(field, value); + } + + @java.lang.Override + public Builder mergeFrom(com.google.protobuf.Message other) { + if (other instanceof org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse) { + return mergeFrom((org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse) other); + } else { + super.mergeFrom(other); + return this; + } + } + + public Builder mergeFrom(org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse other) { + if (other + == org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse.getDefaultInstance()) + return this; + if (!other.getId().isEmpty()) { + id_ = other.id_; + bitField0_ |= 0x00000001; + onChanged(); + } + if (other.getPayload() != com.google.protobuf.ByteString.EMPTY) { + setPayload(other.getPayload()); + } + this.mergeUnknownFields(other.getUnknownFields()); + onChanged(); + return this; + } + + @java.lang.Override + public final boolean isInitialized() { + return true; + } + + @java.lang.Override + public Builder mergeFrom( + com.google.protobuf.CodedInputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + if (extensionRegistry == null) { + throw new java.lang.NullPointerException(); + } + try { + boolean done = false; + while (!done) { + int tag = input.readTag(); + switch (tag) { + case 0: + done = true; + break; + case 10: + { + id_ = input.readStringRequireUtf8(); + bitField0_ |= 0x00000001; + break; + } // case 10 + case 18: + { + payload_ = input.readBytes(); + bitField0_ |= 0x00000002; + break; + } // case 18 + default: + { + if (!super.parseUnknownField(input, extensionRegistry, tag)) { + done = true; // was an endgroup tag + } + break; + } // default: + } // switch (tag) + } // while (!done) + } catch (com.google.protobuf.InvalidProtocolBufferException e) { + throw e.unwrapIOException(); + } finally { + onChanged(); + } // finally + return this; + } + + private int bitField0_; + + private java.lang.Object id_ = ""; + /** + * string id = 1 [json_name = "id"]; + * + * @return The id. + */ + public java.lang.String getId() { + java.lang.Object ref = id_; + if (!(ref instanceof java.lang.String)) { + com.google.protobuf.ByteString bs = (com.google.protobuf.ByteString) ref; + java.lang.String s = bs.toStringUtf8(); + id_ = s; + return s; + } else { + return (java.lang.String) ref; + } + } + /** + * string id = 1 [json_name = "id"]; + * + * @return The bytes for id. + */ + public com.google.protobuf.ByteString getIdBytes() { + java.lang.Object ref = id_; + if (ref instanceof String) { + com.google.protobuf.ByteString b = + com.google.protobuf.ByteString.copyFromUtf8((java.lang.String) ref); + id_ = b; + return b; + } else { + return (com.google.protobuf.ByteString) ref; + } + } + /** + * string id = 1 [json_name = "id"]; + * + * @param value The id to set. + * @return This builder for chaining. + */ + public Builder setId(java.lang.String value) { + if (value == null) { + throw new NullPointerException(); + } + id_ = value; + bitField0_ |= 0x00000001; + onChanged(); + return this; + } + /** + * string id = 1 [json_name = "id"]; + * + * @return This builder for chaining. + */ + public Builder clearId() { + id_ = getDefaultInstance().getId(); + bitField0_ = (bitField0_ & ~0x00000001); + onChanged(); + return this; + } + /** + * string id = 1 [json_name = "id"]; + * + * @param value The bytes for id to set. + * @return This builder for chaining. 
+ */ + public Builder setIdBytes(com.google.protobuf.ByteString value) { + if (value == null) { + throw new NullPointerException(); + } + checkByteStringIsUtf8(value); + id_ = value; + bitField0_ |= 0x00000001; + onChanged(); + return this; + } + + private com.google.protobuf.ByteString payload_ = com.google.protobuf.ByteString.EMPTY; + /** + * bytes payload = 2 [json_name = "payload"]; + * + * @return The payload. + */ + @java.lang.Override + public com.google.protobuf.ByteString getPayload() { + return payload_; + } + /** + * bytes payload = 2 [json_name = "payload"]; + * + * @param value The payload to set. + * @return This builder for chaining. + */ + public Builder setPayload(com.google.protobuf.ByteString value) { + if (value == null) { + throw new NullPointerException(); + } + payload_ = value; + bitField0_ |= 0x00000002; + onChanged(); + return this; + } + /** + * bytes payload = 2 [json_name = "payload"]; + * + * @return This builder for chaining. + */ + public Builder clearPayload() { + bitField0_ = (bitField0_ & ~0x00000002); + payload_ = getDefaultInstance().getPayload(); + onChanged(); + return this; + } + + @java.lang.Override + public final Builder setUnknownFields( + final com.google.protobuf.UnknownFieldSet unknownFields) { + return super.setUnknownFields(unknownFields); + } + + @java.lang.Override + public final Builder mergeUnknownFields( + final com.google.protobuf.UnknownFieldSet unknownFields) { + return super.mergeUnknownFields(unknownFields); + } + + // @@protoc_insertion_point(builder_scope:proto.echo.v1.EchoResponse) + } + + // @@protoc_insertion_point(class_scope:proto.echo.v1.EchoResponse) + private static final org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse + DEFAULT_INSTANCE; + + static { + DEFAULT_INSTANCE = new org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse(); + } + + public static org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse + getDefaultInstance() { + return DEFAULT_INSTANCE; + } + + private static final com.google.protobuf.Parser PARSER = + new com.google.protobuf.AbstractParser() { + @java.lang.Override + public EchoResponse parsePartialFrom( + com.google.protobuf.CodedInputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + Builder builder = newBuilder(); + try { + builder.mergeFrom(input, extensionRegistry); + } catch (com.google.protobuf.InvalidProtocolBufferException e) { + throw e.setUnfinishedMessage(builder.buildPartial()); + } catch (com.google.protobuf.UninitializedMessageException e) { + throw e.asInvalidProtocolBufferException() + .setUnfinishedMessage(builder.buildPartial()); + } catch (java.io.IOException e) { + throw new com.google.protobuf.InvalidProtocolBufferException(e) + .setUnfinishedMessage(builder.buildPartial()); + } + return builder.buildPartial(); + } + }; + + public static com.google.protobuf.Parser parser() { + return PARSER; + } + + @java.lang.Override + public com.google.protobuf.Parser getParserForType() { + return PARSER; + } + + @java.lang.Override + public org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse + getDefaultInstanceForType() { + return DEFAULT_INSTANCE; + } + } + + private static final com.google.protobuf.Descriptors.Descriptor + internal_static_proto_echo_v1_EchoRequest_descriptor; + private static final com.google.protobuf.GeneratedMessageV3.FieldAccessorTable + internal_static_proto_echo_v1_EchoRequest_fieldAccessorTable; + private static final 
com.google.protobuf.Descriptors.Descriptor + internal_static_proto_echo_v1_EchoResponse_descriptor; + private static final com.google.protobuf.GeneratedMessageV3.FieldAccessorTable + internal_static_proto_echo_v1_EchoResponse_fieldAccessorTable; + + public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() { + return descriptor; + } + + private static com.google.protobuf.Descriptors.FileDescriptor descriptor; + + static { + java.lang.String[] descriptorData = { + "\n\030proto/echo/v1/echo.proto\022\rproto.echo.v" + + "1\"7\n\013EchoRequest\022\016\n\002id\030\001 \001(\tR\002id\022\030\n\007payl" + + "oad\030\002 \001(\014R\007payload\"8\n\014EchoResponse\022\016\n\002id" + + "\030\001 \001(\tR\002id\022\030\n\007payload\030\002 \001(\014R\007payload2P\n\013" + + "EchoService\022A\n\004Echo\022\032.proto.echo.v1.Echo" + + "Request\032\033.proto.echo.v1.EchoResponse\"\000B;" + + "\n*org.apache.beam.testinfra.mockapis.ech" + + "o.v1Z\rproto/echo/v1b\006proto3" + }; + descriptor = + com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom( + descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[] {}); + internal_static_proto_echo_v1_EchoRequest_descriptor = getDescriptor().getMessageTypes().get(0); + internal_static_proto_echo_v1_EchoRequest_fieldAccessorTable = + new com.google.protobuf.GeneratedMessageV3.FieldAccessorTable( + internal_static_proto_echo_v1_EchoRequest_descriptor, + new java.lang.String[] { + "Id", "Payload", + }); + internal_static_proto_echo_v1_EchoResponse_descriptor = + getDescriptor().getMessageTypes().get(1); + internal_static_proto_echo_v1_EchoResponse_fieldAccessorTable = + new com.google.protobuf.GeneratedMessageV3.FieldAccessorTable( + internal_static_proto_echo_v1_EchoResponse_descriptor, + new java.lang.String[] { + "Id", "Payload", + }); + } + + // @@protoc_insertion_point(outer_class_scope) +} diff --git a/.test-infra/mock-apis/src/main/java/org/apache/beam/testinfra/mockapis/echo/v1/EchoServiceGrpc.java b/.test-infra/mock-apis/src/main/java/org/apache/beam/testinfra/mockapis/echo/v1/EchoServiceGrpc.java new file mode 100644 index 0000000000000..14437899b69ce --- /dev/null +++ b/.test-infra/mock-apis/src/main/java/org/apache/beam/testinfra/mockapis/echo/v1/EchoServiceGrpc.java @@ -0,0 +1,393 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.testinfra.mockapis.echo.v1; + +import static io.grpc.MethodDescriptor.generateFullMethodName; + +/** + * + * + *
+ * EchoService simulates a mock API that echos a request.
+ * 
+ */ +@SuppressWarnings({ + "argument", + "assignment", + "initialization.fields.uninitialized", + "initialization.static.field.uninitialized", + "override.param", + "ClassTypeParameterName", + "ForbidNonVendoredGuava", + "JavadocStyle", + "LocalVariableName", + "MemberName", + "NeedBraces", + "MissingOverride", + "RedundantModifier", + "ReferenceEquality", + "UnusedVariable", +}) +@javax.annotation.Generated( + value = "by gRPC proto compiler (version 1.58.0)", + comments = "Source: proto/echo/v1/echo.proto") +@io.grpc.stub.annotations.GrpcGenerated +public final class EchoServiceGrpc { + + private EchoServiceGrpc() {} + + public static final java.lang.String SERVICE_NAME = "proto.echo.v1.EchoService"; + + // Static method descriptors that strictly reflect the proto. + private static volatile io.grpc.MethodDescriptor< + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest, + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse> + getEchoMethod; + + @io.grpc.stub.annotations.RpcMethod( + fullMethodName = SERVICE_NAME + '/' + "Echo", + requestType = org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest.class, + responseType = org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse.class, + methodType = io.grpc.MethodDescriptor.MethodType.UNARY) + public static io.grpc.MethodDescriptor< + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest, + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse> + getEchoMethod() { + io.grpc.MethodDescriptor< + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest, + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse> + getEchoMethod; + if ((getEchoMethod = EchoServiceGrpc.getEchoMethod) == null) { + synchronized (EchoServiceGrpc.class) { + if ((getEchoMethod = EchoServiceGrpc.getEchoMethod) == null) { + EchoServiceGrpc.getEchoMethod = + getEchoMethod = + io.grpc.MethodDescriptor + . 
+ newBuilder() + .setType(io.grpc.MethodDescriptor.MethodType.UNARY) + .setFullMethodName(generateFullMethodName(SERVICE_NAME, "Echo")) + .setSampledToLocalTracing(true) + .setRequestMarshaller( + io.grpc.protobuf.ProtoUtils.marshaller( + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest + .getDefaultInstance())) + .setResponseMarshaller( + io.grpc.protobuf.ProtoUtils.marshaller( + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse + .getDefaultInstance())) + .setSchemaDescriptor(new EchoServiceMethodDescriptorSupplier("Echo")) + .build(); + } + } + } + return getEchoMethod; + } + + /** Creates a new async stub that supports all call types for the service */ + public static EchoServiceStub newStub(io.grpc.Channel channel) { + io.grpc.stub.AbstractStub.StubFactory factory = + new io.grpc.stub.AbstractStub.StubFactory() { + @java.lang.Override + public EchoServiceStub newStub(io.grpc.Channel channel, io.grpc.CallOptions callOptions) { + return new EchoServiceStub(channel, callOptions); + } + }; + return EchoServiceStub.newStub(factory, channel); + } + + /** + * Creates a new blocking-style stub that supports unary and streaming output calls on the service + */ + public static EchoServiceBlockingStub newBlockingStub(io.grpc.Channel channel) { + io.grpc.stub.AbstractStub.StubFactory factory = + new io.grpc.stub.AbstractStub.StubFactory() { + @java.lang.Override + public EchoServiceBlockingStub newStub( + io.grpc.Channel channel, io.grpc.CallOptions callOptions) { + return new EchoServiceBlockingStub(channel, callOptions); + } + }; + return EchoServiceBlockingStub.newStub(factory, channel); + } + + /** Creates a new ListenableFuture-style stub that supports unary calls on the service */ + public static EchoServiceFutureStub newFutureStub(io.grpc.Channel channel) { + io.grpc.stub.AbstractStub.StubFactory factory = + new io.grpc.stub.AbstractStub.StubFactory() { + @java.lang.Override + public EchoServiceFutureStub newStub( + io.grpc.Channel channel, io.grpc.CallOptions callOptions) { + return new EchoServiceFutureStub(channel, callOptions); + } + }; + return EchoServiceFutureStub.newStub(factory, channel); + } + + /** + * + * + *
+   * EchoService simulates a mock API that echos a request.
+   * 
+ */ + public interface AsyncService { + + /** + * + * + *
+     * Echo an EchoRequest payload in an EchoResponse.
+     * 
+ */ + default void echo( + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest request, + io.grpc.stub.StreamObserver + responseObserver) { + io.grpc.stub.ServerCalls.asyncUnimplementedUnaryCall(getEchoMethod(), responseObserver); + } + } + + /** + * Base class for the server implementation of the service EchoService. + * + *
+   * EchoService simulates a mock API that echos a request.
+   * 
+ */ + public abstract static class EchoServiceImplBase + implements io.grpc.BindableService, AsyncService { + + @java.lang.Override + public final io.grpc.ServerServiceDefinition bindService() { + return EchoServiceGrpc.bindService(this); + } + } + + /** + * A stub to allow clients to do asynchronous rpc calls to service EchoService. + * + *
+   * EchoService simulates a mock API that echos a request.
+   * 
+ */ + public static final class EchoServiceStub + extends io.grpc.stub.AbstractAsyncStub { + private EchoServiceStub(io.grpc.Channel channel, io.grpc.CallOptions callOptions) { + super(channel, callOptions); + } + + @java.lang.Override + protected EchoServiceStub build(io.grpc.Channel channel, io.grpc.CallOptions callOptions) { + return new EchoServiceStub(channel, callOptions); + } + + /** + * + * + *
+     * Echo an EchoRequest payload in an EchoResponse.
+     * 
+ */ + public void echo( + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest request, + io.grpc.stub.StreamObserver + responseObserver) { + io.grpc.stub.ClientCalls.asyncUnaryCall( + getChannel().newCall(getEchoMethod(), getCallOptions()), request, responseObserver); + } + } + + /** + * A stub to allow clients to do synchronous rpc calls to service EchoService. + * + *
+   * EchoService simulates a mock API that echos a request.
+   * 
+ */ + public static final class EchoServiceBlockingStub + extends io.grpc.stub.AbstractBlockingStub { + private EchoServiceBlockingStub(io.grpc.Channel channel, io.grpc.CallOptions callOptions) { + super(channel, callOptions); + } + + @java.lang.Override + protected EchoServiceBlockingStub build( + io.grpc.Channel channel, io.grpc.CallOptions callOptions) { + return new EchoServiceBlockingStub(channel, callOptions); + } + + /** + * + * + *
+     * Echo an EchoRequest payload in an EchoResponse.
+     * 
+ */ + public org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse echo( + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest request) { + return io.grpc.stub.ClientCalls.blockingUnaryCall( + getChannel(), getEchoMethod(), getCallOptions(), request); + } + } + + /** + * A stub to allow clients to do ListenableFuture-style rpc calls to service EchoService. + * + *
+   * EchoService simulates a mock API that echos a request.
+   * 
+ */ + public static final class EchoServiceFutureStub + extends io.grpc.stub.AbstractFutureStub { + private EchoServiceFutureStub(io.grpc.Channel channel, io.grpc.CallOptions callOptions) { + super(channel, callOptions); + } + + @java.lang.Override + protected EchoServiceFutureStub build( + io.grpc.Channel channel, io.grpc.CallOptions callOptions) { + return new EchoServiceFutureStub(channel, callOptions); + } + + /** + * + * + *
+     * Echo an EchoRequest payload in an EchoResponse.
+     * 
+ */ + public com.google.common.util.concurrent.ListenableFuture< + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse> + echo(org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest request) { + return io.grpc.stub.ClientCalls.futureUnaryCall( + getChannel().newCall(getEchoMethod(), getCallOptions()), request); + } + } + + private static final int METHODID_ECHO = 0; + + private static final class MethodHandlers + implements io.grpc.stub.ServerCalls.UnaryMethod, + io.grpc.stub.ServerCalls.ServerStreamingMethod, + io.grpc.stub.ServerCalls.ClientStreamingMethod, + io.grpc.stub.ServerCalls.BidiStreamingMethod { + private final AsyncService serviceImpl; + private final int methodId; + + MethodHandlers(AsyncService serviceImpl, int methodId) { + this.serviceImpl = serviceImpl; + this.methodId = methodId; + } + + @java.lang.Override + @java.lang.SuppressWarnings("unchecked") + public void invoke(Req request, io.grpc.stub.StreamObserver responseObserver) { + switch (methodId) { + case METHODID_ECHO: + serviceImpl.echo( + (org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest) request, + (io.grpc.stub.StreamObserver< + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse>) + responseObserver); + break; + default: + throw new AssertionError(); + } + } + + @java.lang.Override + @java.lang.SuppressWarnings("unchecked") + public io.grpc.stub.StreamObserver invoke( + io.grpc.stub.StreamObserver responseObserver) { + switch (methodId) { + default: + throw new AssertionError(); + } + } + } + + public static final io.grpc.ServerServiceDefinition bindService(AsyncService service) { + return io.grpc.ServerServiceDefinition.builder(getServiceDescriptor()) + .addMethod( + getEchoMethod(), + io.grpc.stub.ServerCalls.asyncUnaryCall( + new MethodHandlers< + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest, + org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse>( + service, METHODID_ECHO))) + .build(); + } + + private abstract static class EchoServiceBaseDescriptorSupplier + implements io.grpc.protobuf.ProtoFileDescriptorSupplier, + io.grpc.protobuf.ProtoServiceDescriptorSupplier { + EchoServiceBaseDescriptorSupplier() {} + + @java.lang.Override + public com.google.protobuf.Descriptors.FileDescriptor getFileDescriptor() { + return org.apache.beam.testinfra.mockapis.echo.v1.Echo.getDescriptor(); + } + + @java.lang.Override + public com.google.protobuf.Descriptors.ServiceDescriptor getServiceDescriptor() { + return getFileDescriptor().findServiceByName("EchoService"); + } + } + + private static final class EchoServiceFileDescriptorSupplier + extends EchoServiceBaseDescriptorSupplier { + EchoServiceFileDescriptorSupplier() {} + } + + private static final class EchoServiceMethodDescriptorSupplier + extends EchoServiceBaseDescriptorSupplier + implements io.grpc.protobuf.ProtoMethodDescriptorSupplier { + private final java.lang.String methodName; + + EchoServiceMethodDescriptorSupplier(java.lang.String methodName) { + this.methodName = methodName; + } + + @java.lang.Override + public com.google.protobuf.Descriptors.MethodDescriptor getMethodDescriptor() { + return getServiceDescriptor().findMethodByName(methodName); + } + } + + private static volatile io.grpc.ServiceDescriptor serviceDescriptor; + + public static io.grpc.ServiceDescriptor getServiceDescriptor() { + io.grpc.ServiceDescriptor result = serviceDescriptor; + if (result == null) { + synchronized (EchoServiceGrpc.class) { + result = serviceDescriptor; + if (result == null) { + serviceDescriptor = + result = + 
io.grpc.ServiceDescriptor.newBuilder(SERVICE_NAME) + .setSchemaDescriptor(new EchoServiceFileDescriptorSupplier()) + .addMethod(getEchoMethod()) + .build(); + } + } + } + return result; + } +} diff --git a/.test-infra/mock-apis/src/main/java/org/apache/beam/testinfra/mockapis/echo/v1/package-info.java b/.test-infra/mock-apis/src/main/java/org/apache/beam/testinfra/mockapis/echo/v1/package-info.java new file mode 100644 index 0000000000000..00b7fa2f70b78 --- /dev/null +++ b/.test-infra/mock-apis/src/main/java/org/apache/beam/testinfra/mockapis/echo/v1/package-info.java @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Autogenerated code supporting a quota aware gRPC endpoint client. */ +package org.apache.beam.testinfra.mockapis.echo.v1; diff --git a/.test-infra/mock-apis/src/main/python/proto/echo/v1/echo_pb2.py b/.test-infra/mock-apis/src/main/python/proto/echo/v1/echo_pb2.py new file mode 100644 index 0000000000000..0a1bd2aff7715 --- /dev/null +++ b/.test-infra/mock-apis/src/main/python/proto/echo/v1/echo_pb2.py @@ -0,0 +1,47 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: proto/echo/v1/echo.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x18proto/echo/v1/echo.proto\x12\rproto.echo.v1\"7\n\x0b\x45\x63hoRequest\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12\x18\n\x07payload\x18\x02 \x01(\x0cR\x07payload\"8\n\x0c\x45\x63hoResponse\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12\x18\n\x07payload\x18\x02 \x01(\x0cR\x07payload2P\n\x0b\x45\x63hoService\x12\x41\n\x04\x45\x63ho\x12\x1a.proto.echo.v1.EchoRequest\x1a\x1b.proto.echo.v1.EchoResponse\"\x00\x42;\n*org.apache.beam.testinfra.mockapis.echo.v1Z\rproto/echo/v1b\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'proto.echo.v1.echo_pb2', _globals) +if _descriptor._USE_C_DESCRIPTORS == False: + _globals['DESCRIPTOR']._options = None + _globals['DESCRIPTOR']._serialized_options = b'\n*org.apache.beam.testinfra.mockapis.echo.v1Z\rproto/echo/v1' + _globals['_ECHOREQUEST']._serialized_start=43 + _globals['_ECHOREQUEST']._serialized_end=98 + _globals['_ECHORESPONSE']._serialized_start=100 + _globals['_ECHORESPONSE']._serialized_end=156 + _globals['_ECHOSERVICE']._serialized_start=158 + _globals['_ECHOSERVICE']._serialized_end=238 +# @@protoc_insertion_point(module_scope) diff --git a/.test-infra/mock-apis/src/main/python/proto/echo/v1/echo_pb2_grpc.py b/.test-infra/mock-apis/src/main/python/proto/echo/v1/echo_pb2_grpc.py new file mode 100644 index 0000000000000..0a92ee4af6c44 --- /dev/null +++ b/.test-infra/mock-apis/src/main/python/proto/echo/v1/echo_pb2_grpc.py @@ -0,0 +1,87 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +"""Client and server classes corresponding to protobuf-defined services.""" +import grpc + +from proto.echo.v1 import echo_pb2 as proto_dot_echo_dot_v1_dot_echo__pb2 + + +class EchoServiceStub(object): + """EchoService simulates a mock API that echos a request. + """ + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. + """ + self.Echo = channel.unary_unary( + '/proto.echo.v1.EchoService/Echo', + request_serializer=proto_dot_echo_dot_v1_dot_echo__pb2.EchoRequest.SerializeToString, + response_deserializer=proto_dot_echo_dot_v1_dot_echo__pb2.EchoResponse.FromString, + ) + + +class EchoServiceServicer(object): + """EchoService simulates a mock API that echos a request. 
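+
+    Implementation sketch (illustrative only, not part of the generated
+    file; ``echo_pb2`` stands for the generated ``proto.echo.v1`` module
+    imported above):
+
+        class Echoer(EchoServiceServicer):
+            def Echo(self, request, context):
+                # Echo the id and payload fields straight back to the caller.
+                return echo_pb2.EchoResponse(id=request.id, payload=request.payload)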
+ """ + + def Echo(self, request, context): + """Echo an EchoRequest payload in an EchoResponse. + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + +def add_EchoServiceServicer_to_server(servicer, server): + rpc_method_handlers = { + 'Echo': grpc.unary_unary_rpc_method_handler( + servicer.Echo, + request_deserializer=proto_dot_echo_dot_v1_dot_echo__pb2.EchoRequest.FromString, + response_serializer=proto_dot_echo_dot_v1_dot_echo__pb2.EchoResponse.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler( + 'proto.echo.v1.EchoService', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) + + + # This class is part of an EXPERIMENTAL API. +class EchoService(object): + """EchoService simulates a mock API that echos a request. + """ + + @staticmethod + def Echo(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/proto.echo.v1.EchoService/Echo', + proto_dot_echo_dot_v1_dot_echo__pb2.EchoRequest.SerializeToString, + proto_dot_echo_dot_v1_dot_echo__pb2.EchoResponse.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) diff --git a/.test-infra/tools/build.gradle b/.test-infra/tools/build.gradle index ea4a12a24fc69..274cd3022911e 100644 --- a/.test-infra/tools/build.gradle +++ b/.test-infra/tools/build.gradle @@ -24,6 +24,26 @@ task removeStaleSDKContainerImages(type: Exec) { commandLine './stale_dataflow_prebuilt_image_cleaner.sh' } -task cleanupOtherStaleResources(type: Exec) { +task removeStaleBqDatasets(type: Exec) { commandLine './stale_bq_datasets_cleaner.sh' } + +task removeStaleCbtInstances(type: Exec) { + commandLine './stale_cbt_instances_cleaner.sh' +} + +task removeStaleK8sWorkload(type: Exec) { + commandLine './stale_k8s_workload_cleaner.sh' +} + +task removeStaleSpannerResources(type: Exec) { + commandLine './stale_spanner_cleaner.sh' +} + +task cleanupOtherStaleResources { + // declared as finalizedBy dependency so that other task continue even if one dep task fails + finalizedBy tasks.removeStaleBqDatasets + finalizedBy tasks.removeStaleCbtInstances + finalizedBy tasks.removeStaleK8sWorkload + finalizedBy tasks.removeStaleSpannerResources +} diff --git a/.test-infra/tools/stale_bq_datasets_cleaner.sh b/.test-infra/tools/stale_bq_datasets_cleaner.sh index ee3f27ef671e0..47e8ea95498ee 100755 --- a/.test-infra/tools/stale_bq_datasets_cleaner.sh +++ b/.test-infra/tools/stale_bq_datasets_cleaner.sh @@ -41,7 +41,12 @@ for dataset in ${BQ_DATASETS[@]}; do LAST_MODIFIED=$(($LAST_MODIFIED_MS / 1000)) if [[ $GRACE_PERIOD -gt $LAST_MODIFIED ]]; then if bq --project_id=$PROJECT rm -r -f $dataset; then - echo "Deleted $dataset (modified `date -d @$LAST_MODIFIED`)" + if [[ $OSTYPE == "linux-gnu"* ]]; then + # date command usage depending on OS + echo "Deleted $dataset (modified `date -d @$LAST_MODIFIED`)" + elif [[ $OSTYPE == "darwin"* ]]; then + echo "Deleted $dataset (modified `date -r @$LAST_MODIFIED`)" + fi else failed_calls+=1 fi diff --git a/.test-infra/tools/stale_cbt_instances_cleaner.sh b/.test-infra/tools/stale_cbt_instances_cleaner.sh new file mode 100755 index 0000000000000..ed7f9df622e7e --- /dev/null +++ b/.test-infra/tools/stale_cbt_instances_cleaner.sh @@ -0,0 +1,66 @@ 
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Deletes stale and old Bigtable (CBT) instances that are left after tests.
+#
+
+set -euo pipefail
+
+PROJECT=apache-beam-testing
+
+# get first 50 instances
+CBT_INSTANCES=`cbt -project=$PROJECT listinstances | awk 'NR>2 {print $1} NR==52{exit}'`
+
+CLEANUP_INSTANCE_TEMPLATES=(bt-read-tests bt-write-xlang test[a-z]+)
+
+# A grace period of 5 days
+GRACE_PERIOD=$((`date +%s` - 24 * 3600 * 5))
+# count number of failed api calls
+declare -i failed_calls=0
+
+for instance in ${CBT_INSTANCES[@]}; do
+  for template in ${CLEANUP_INSTANCE_TEMPLATES[@]}; do
+    pattern=$template-"([0-9]{8})"-
+    if [[ $instance =~ $pattern ]]; then
+      CREATE_DATE=${BASH_REMATCH[1]}
+      if [[ $OSTYPE == "linux-gnu"* ]]; then
+        # skip if not a valid date
+        CREATED=`date -d ${CREATE_DATE} +%s` || continue
+      elif [[ $OSTYPE == "darwin"* ]]; then
+        # date command usage depending on OS
+        CREATED=`date -ju -f "%Y%m%d-%H%M%S" ${CREATE_DATE}-000000 +%s` || continue
+      else
+        echo "Unsupported OS $OSTYPE"
+        exit 1
+      fi
+      if [[ $GRACE_PERIOD -gt $CREATED ]]; then
+        if cbt -project=$PROJECT deleteinstance $instance; then
+          echo "Deleted $instance (created $CREATE_DATE)"
+        else
+          failed_calls+=1
+        fi
+      fi
+      break
+    fi
+  done
+done
+
+# fail the script if failed_calls is nonzero
+if [[ failed_calls -ne 0 ]]; then
+  echo "Failed to delete $failed_calls instances"
+  exit 1
+fi
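The cleaner above relies on a naming convention: test instances embed a `yyyyMMdd` creation stamp (e.g. `bt-read-tests-20231101-<suffix>`), and anything past the five-day grace period is deleted. A minimal Java sketch of the same age check, for illustration only (the class and method names are hypothetical):

```java
import java.time.Clock;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class StaleInstanceCheck {
  // Mirrors the script's "([0-9]{8})" capture between dashes.
  private static final Pattern DATE_STAMP = Pattern.compile("-([0-9]{8})-");

  /** True when the yyyyMMdd stamp in the name is older than the 5-day grace period. */
  static boolean isStale(String instanceName, Clock clock) {
    Matcher m = DATE_STAMP.matcher(instanceName);
    if (!m.find()) {
      return false; // no date stamp: skip, like the script's regex guard
    }
    LocalDate created = LocalDate.parse(m.group(1), DateTimeFormatter.BASIC_ISO_DATE);
    return created.isBefore(LocalDate.now(clock).minusDays(5));
  }

  public static void main(String[] args) {
    System.out.println(isStale("bt-read-tests-20231101-1", Clock.systemUTC()));
  }
}
```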
diff --git a/.test-infra/tools/stale_dataflow_jobs_cleaner.sh b/.test-infra/tools/stale_dataflow_jobs_cleaner.sh
index e6df7058427b3..e56304807475b 100755
--- a/.test-infra/tools/stale_dataflow_jobs_cleaner.sh
+++ b/.test-infra/tools/stale_dataflow_jobs_cleaner.sh
@@ -27,10 +27,3 @@ if [[ ${STALE_JOBS} ]]; then
 else
   echo "No stale jobs found."
 fi
-
-# Delete spanner databases older than 1 day.
-gcloud spanner databases list \
---instance beam-test \
---filter="createTime < $(date --iso-8601=s -d '1 day ago')" \
---format="value(name)" | \
-xargs -I{} gcloud spanner databases delete {} --instance beam-test --quiet
diff --git a/.test-infra/tools/stale_k8s_workload_cleaner.sh b/.test-infra/tools/stale_k8s_workload_cleaner.sh
new file mode 100755
index 0000000000000..9ddaf17f2ce8d
--- /dev/null
+++ b/.test-infra/tools/stale_k8s_workload_cleaner.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Deletes stale Kubernetes workloads (namespaces) that are left after tests.
+#
+
+set -euo pipefail
+
+# Clean up the stale kubernetes workload of a given cluster
+
+PROJECT=apache-beam-testing
+LOCATION=us-central1-a
+CLUSTER=io-datastores
+
+function should_teardown() {
+  if [[ $1 =~ ^([0-9]+)([a-z]) ]]; then
+    local time_scale=${BASH_REMATCH[1]}
+    local time_unit=${BASH_REMATCH[2]}
+    # cutoff = 8 h
+    if [ $time_unit == y ] || [ $time_unit == d ]; then
+      return 0
+    elif [ $time_unit == h ] && [ $time_scale -ge 8 ]; then
+      return 0
+    fi
+  fi
+  return 1
+}
+
+gcloud container clusters get-credentials io-datastores --zone us-central1-a --project apache-beam-testing
+
+while read NAME STATUS AGE; do
+  if [[ $NAME =~ ^beam-.+(test|-it) ]] && should_teardown $AGE; then
+    kubectl delete namespace $NAME
+  fi
+done < <( kubectl get namespaces --context=gke_${PROJECT}_${LOCATION}_${CLUSTER} )
diff --git a/sdks/python/build-requirements.txt b/.test-infra/tools/stale_spanner_cleaner.sh
old mode 100644
new mode 100755
similarity index 66%
rename from sdks/python/build-requirements.txt
rename to .test-infra/tools/stale_spanner_cleaner.sh
index 4fe47079d8d09..28ecc6e47c98c
--- a/sdks/python/build-requirements.txt
+++ b/.test-infra/tools/stale_spanner_cleaner.sh
@@ -1,3 +1,4 @@
+#!/usr/bin/env bash
 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
@@ -14,15 +15,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+# Deletes stale Spanner databases that are left after tests.
+set -euo pipefail
 
-# TODO(https://github.com/apache/beam/issues/20051): Consider PEP-517/PEP-518 instead of this file.
+PROJECT=apache-beam-testing
 
-setuptools
-wheel>=0.36.0
-grpcio-tools==1.53.0
-mypy-protobuf==3.5.0
-# Avoid https://github.com/pypa/virtualenv/issues/2006
-distlib==0.3.7
-
-# Numpy headers
-numpy>=1.14.3,<1.26
+# Delete spanner databases older than 1 day.
+gcloud spanner databases list \
+--instance beam-test \
+--project $PROJECT \
+--filter="createTime < $(date --iso-8601=s -d '1 day ago')" \
+--format="value(name)" | \
+xargs -I{} gcloud spanner databases delete {} --instance beam-test --quiet
diff --git a/CHANGES.md b/CHANGES.md
index 650b33c124072..523ef3455aec6 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -53,7 +53,7 @@
 * ([#X](https://github.com/apache/beam/issues/X)).
 -->
 
-# [2.51.0] - Unreleased
+# [2.53.0] - Unreleased
 
 ## Highlights
 
@@ -63,6 +63,70 @@
 
 ## I/Os
 
 * Support for X source added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)).
+* TextIO now supports skipping multiple header lines (Java) ([#17990](https://github.com/apache/beam/issues/17990)); see the sketch below.
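To illustrate the TextIO note above: a read can now drop a fixed number of leading header lines per file. A sketch, assuming the new option surfaces as `withSkipHeaderLines` and using a placeholder bucket path:

```java
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.values.PCollection;

public class SkipHeaderLinesExample {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
    // Read CSV-style files, dropping the first two header lines of each file.
    PCollection<String> rows =
        pipeline.apply(
            "ReadWithoutHeaders",
            TextIO.read().from("gs://my-bucket/input/*.csv").withSkipHeaderLines(2));
    pipeline.run().waitUntilFinish();
  }
}
```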
+
+## New Features / Improvements
+
+* X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)).
+
+## Breaking Changes
+
+* X behavior was changed ([#X](https://github.com/apache/beam/issues/X)).
+
+## Deprecations
+
+* X behavior is deprecated and will be removed in X versions ([#X](https://github.com/apache/beam/issues/X)).
+
+## Bugfixes
+
+* Fixed X (Java/Python) ([#X](https://github.com/apache/beam/issues/X)).
+
+## Security Fixes
+* Fixed [CVE-YYYY-NNNN](https://www.cve.org/CVERecord?id=CVE-YYYY-NNNN) (Java/Python/Go) ([#X](https://github.com/apache/beam/issues/X)).
+
+## Known Issues
+
+* ([#X](https://github.com/apache/beam/issues/X)).
+
+# [2.52.0] - Unreleased
+
+## Highlights
+
+* Previously deprecated Avro-dependent code (Beam Release 2.46.0) has finally been removed from the Java SDK "core" package.
+Please use `beam-sdks-java-extensions-avro` instead. This allows the Avro version in user code to be updated easily, without
+potential breaking changes in Beam "core", since the Beam Avro extension already supports the latest Avro versions and
+should handle this. ([#25252](https://github.com/apache/beam/issues/25252)).
+* Publishing Java 21 SDK container images is now supported as part of the Apache Beam release process. ([#28120](https://github.com/apache/beam/issues/28120))
+  * Direct Runner and Dataflow Runner support running pipelines on Java 21 (experimental until tests are fully set up). For other runners (Flink, Spark, Samza, etc.) support status depends on the runner project.
+
+## New Features / Improvements
+
+* Add `UseDataStreamForBatch` pipeline option to the Flink runner. When it is set to true, the Flink runner will run batch
+  jobs using the DataStream API (see the sketch below). By default the option is set to false, so batch jobs are still executed
+  using the DataSet API.
+* `upload_graph` as one of the Experiments options for DataflowRunner is no longer required when the graph is larger than 10MB for Java SDK ([PR#28621](https://github.com/apache/beam/pull/28621)).
+* The state and side input cache is now enabled with a default size of 100 MB. Use `--max_cache_memory_usage_mb=X` to set the cache size for the user state API and side inputs. (Python) ([#28770](https://github.com/apache/beam/issues/28770)).
+* Beam YAML stable release. Beam pipelines can now be written using YAML and leverage the Beam YAML framework, which includes a preliminary set of IOs and turnkey transforms. More information can be found in the YAML root folder and in the [README](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/yaml/README.md).
+
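To illustrate the `UseDataStreamForBatch` option above: a batch pipeline opts in through `FlinkPipelineOptions`. A sketch, assuming the generated setter is named `setUseDataStreamForBatch`:

```java
import org.apache.beam.runners.flink.FlinkPipelineOptions;
import org.apache.beam.runners.flink.FlinkRunner;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class FlinkDataStreamBatchExample {
  public static void main(String[] args) {
    FlinkPipelineOptions options =
        PipelineOptionsFactory.fromArgs(args).as(FlinkPipelineOptions.class);
    options.setRunner(FlinkRunner.class);
    // Opt this batch job into the DataStream API; leaving the default (false)
    // keeps batch execution on the DataSet API.
    options.setUseDataStreamForBatch(true);
    Pipeline pipeline = Pipeline.create(options);
    // ... build the batch pipeline here ...
    pipeline.run().waitUntilFinish();
  }
}
```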
+
+## Breaking Changes
+
+* `org.apache.beam.sdk.io.CountingSource.CounterMark` uses a custom `CounterMarkCoder` as its default coder, since all Avro-dependent
+classes were finally moved to `extensions/avro`. If it is still required to use `AvroCoder` for `CounterMark`, then,
+as a workaround, a copy of the "old" `CountingSource` class should be placed into the project's code and used directly
+([#25252](https://github.com/apache/beam/issues/25252)).
+* Renamed `host` to `firestoreHost` in `FirestoreOptions` to avoid a potential command line argument conflict (Java) ([#29201](https://github.com/apache/beam/pull/29201)).
+
+## Bugfixes
+
+* Fixed "Desired bundle size 0 bytes must be greater than 0" in the Java SDK's BigtableIO.BigtableSource when you have more cores than bytes to read (Java) [#28793](https://github.com/apache/beam/issues/28793).
+* The `watch_file_pattern` arg of [RunInference](https://github.com/apache/beam/blob/104c10b3ee536a9a3ea52b4dbf62d86b669da5d9/sdks/python/apache_beam/ml/inference/base.py#L997) had no effect prior to 2.52.0. To get the pre-2.52.0 behavior of `watch_file_pattern`, follow the documentation at https://beam.apache.org/documentation/ml/side-input-updates/ and use the `WatchFilePattern` PTransform as a SideInput. ([#28948](https://github.com/apache/beam/pulls/28948))
+* `MLTransform` doesn't output artifacts such as min, max and quantiles. Instead, `MLTransform` will add a feature to output these artifacts in a human-readable format - [#29017](https://github.com/apache/beam/issues/29017). For now, to use artifacts such as min and max that were produced by an earlier `MLTransform`, use `read_artifact_location` of `MLTransform`, which reads artifacts that were produced earlier by a different `MLTransform` ([#29016](https://github.com/apache/beam/pull/29016/))
+
+## Security Fixes
+* Fixed [CVE-2023-39325](https://www.cve.org/CVERecord?id=CVE-2023-39325) (Java/Python/Go) ([#29118](https://github.com/apache/beam/issues/29118)).
+
+# [2.51.0] - 2023-10-03
 
 ## New Features / Improvements
 
@@ -70,6 +134,7 @@
 * In Python, the [VertexAIModelHandlerJSON](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.vertex_ai_inference.html#apache_beam.ml.inference.vertex_ai_inference.VertexAIModelHandlerJSON) now supports passing in inference_args. These will be passed through to the Vertex endpoint as parameters.
 * Added support to run `mypy` on user pipelines ([#27906](https://github.com/apache/beam/issues/27906))
+
 ## Breaking Changes
 
 * Removed fastjson library dependency for Beam SQL. Table property is changed to be based on jackson ObjectNode (Java) ([#24154](https://github.com/apache/beam/issues/24154)).
@@ -77,9 +142,6 @@
 * Removed the parameter `t reflect.Type` from `parquetio.Write`. The element type is derived from the input PCollection (Go) ([#28490](https://github.com/apache/beam/issues/28490))
 * Refactor BeamSqlSeekableTable.setUp adding a parameter joinSubsetType. [#28283](https://github.com/apache/beam/issues/28283)
 
-## Deprecations
-
-* X behavior is deprecated and will be removed in X versions ([#X](https://github.com/apache/beam/issues/X)).
 
 ## Bugfixes
 
@@ -94,7 +156,8 @@
 
 ## Known Issues
 
-* ([#X](https://github.com/apache/beam/issues/X)).
+* Python pipelines using BigQuery Storage Read API must pin `fastavro`
+  dependency to 1.8.3 or earlier: [#28811](https://github.com/apache/beam/issues/28811)
 
 # [2.50.0] - 2023-08-30
 
diff --git a/build.gradle.kts b/build.gradle.kts
index a4a9a09e504fd..59161809f37c1 100644
--- a/build.gradle.kts
+++ b/build.gradle.kts
@@ -19,7 +19,7 @@
 plugins {
   base
   // Apply one top level rat plugin to perform any required license enforcement analysis
-  id("org.nosphere.apache.rat") version "0.8.0"
+  id("org.nosphere.apache.rat") version "0.8.1"
   // Enable gradle-based release management
   id("net.researchgate.release") version "2.8.1"
   id("org.apache.beam.module")
@@ -54,6 +54,7 @@ tasks.rat {
     // Proto/grpc generated wrappers
     "**/apache_beam/portability/api/**/*_pb2*.py",
     "**/go/pkg/beam/**/*.pb.go",
+    "**/mock-apis/**/*.pb.go",
 
     // Ignore go.sum files, which don't permit headers
     "**/go.sum",
@@ -198,6 +199,12 @@ tasks.rat {
     // Ignore typesciript package management.
     "sdks/typescript/package-lock.json",
     "sdks/typescript/node_modules/**/*",
+
+    // Ignore buf autogenerated files.
+    "**/buf.lock",
+
+    // Ignore poetry autogenerated files.
+    "**/poetry.lock",
   )
 
   // Add .gitignore excludes to the Apache Rat exclusion list.
We re-create the behavior @@ -294,7 +301,6 @@ tasks.register("javaPreCommit") { dependsOn(":sdks:java:fn-execution:build") dependsOn(":sdks:java:harness:build") dependsOn(":sdks:java:harness:jmh:build") - dependsOn(":sdks:java:io:azure:build") dependsOn(":sdks:java:io:bigquery-io-perf-tests:build") dependsOn(":sdks:java:io:common:build") dependsOn(":sdks:java:io:contextualtextio:build") @@ -310,6 +316,8 @@ tasks.register("javaPreCommit") { dependsOn(":sdks:java:testing:test-utils:build") dependsOn(":sdks:java:testing:tpcds:build") dependsOn(":sdks:java:testing:watermarks:build") + dependsOn(":sdks:java:transform-service:build") + dependsOn(":sdks:java:transform-service:launcher:build") dependsOn(":examples:java:preCommit") dependsOn(":examples:java:twitter:preCommit") @@ -349,6 +357,7 @@ tasks.register("javaioPreCommit") { dependsOn(":sdks:java:io:parquet:build") dependsOn(":sdks:java:io:rabbitmq:build") dependsOn(":sdks:java:io:redis:build") + dependsOn(":sdks:java:io:rrio:build") dependsOn(":sdks:java:io:singlestore:build") dependsOn(":sdks:java:io:solr:build") dependsOn(":sdks:java:io:splunk:build") @@ -710,14 +719,12 @@ if (project.hasProperty("javaLinkageArtifactIds")) { } } } -if (project.hasProperty("compileAndRunTestsWithJava11")) { - tasks.getByName("javaPreCommitPortabilityApi").dependsOn(":sdks:java:testing:test-utils:verifyJavaVersion") - tasks.getByName("javaExamplesDataflowPrecommit").dependsOn(":sdks:java:testing:test-utils:verifyJavaVersion") - tasks.getByName("sqlPreCommit").dependsOn(":sdks:java:testing:test-utils:verifyJavaVersion") -} else if (project.hasProperty("compileAndRunTestsWithJava17")) { - tasks.getByName("javaPreCommitPortabilityApi").dependsOn(":sdks:java:testing:test-utils:verifyJavaVersion17") - tasks.getByName("javaExamplesDataflowPrecommit").dependsOn(":sdks:java:testing:test-utils:verifyJavaVersion17") - tasks.getByName("sqlPreCommit").dependsOn(":sdks:java:testing:test-utils:verifyJavaVersion17") +if (project.hasProperty("testJavaVersion")) { + var testVer = project.property("testJavaVersion") + + tasks.getByName("javaPreCommitPortabilityApi").dependsOn(":sdks:java:testing:test-utils:verifyJavaVersion$testVer") + tasks.getByName("javaExamplesDataflowPrecommit").dependsOn(":sdks:java:testing:test-utils:verifyJavaVersion$testVer") + tasks.getByName("sqlPreCommit").dependsOn(":sdks:java:testing:test-utils:verifyJavaVersion$testVer") } else { allprojects { tasks.withType(Test::class).configureEach { diff --git a/buildSrc/build.gradle.kts b/buildSrc/build.gradle.kts index edd10ee108f6d..d99a1003c3964 100644 --- a/buildSrc/build.gradle.kts +++ b/buildSrc/build.gradle.kts @@ -44,20 +44,19 @@ dependencies { implementation("gradle.plugin.com.github.johnrengelman:shadow:7.1.1") implementation("com.github.spotbugs.snom:spotbugs-gradle-plugin:5.0.14") - runtimeOnly("com.google.protobuf:protobuf-gradle-plugin:0.8.13") // Enable proto code generation - runtimeOnly("com.github.davidmc24.gradle-avro-plugin:gradle-avro-plugin:0.16.0") // Enable Avro code generation - runtimeOnly("com.diffplug.spotless:spotless-plugin-gradle:5.6.1") // Enable a code formatting plugin - runtimeOnly("com.palantir.gradle.docker:gradle-docker:0.34.0") // Enable building Docker containers - runtimeOnly("gradle.plugin.com.dorongold.plugins:task-tree:1.5") // Adds a 'taskTree' task to print task dependency tree - runtimeOnly("gradle.plugin.com.github.johnrengelman:shadow:7.1.1") // Enable shading Java dependencies + runtimeOnly("com.google.protobuf:protobuf-gradle-plugin:0.8.13") // Enable 
proto code generation + runtimeOnly("com.github.davidmc24.gradle-avro-plugin:gradle-avro-plugin:0.16.0") // Enable Avro code generation + runtimeOnly("com.diffplug.spotless:spotless-plugin-gradle:5.6.1") // Enable a code formatting plugin + runtimeOnly("gradle.plugin.com.dorongold.plugins:task-tree:1.5") // Adds a 'taskTree' task to print task dependency tree + runtimeOnly("gradle.plugin.com.github.johnrengelman:shadow:7.1.1") // Enable shading Java dependencies runtimeOnly("net.linguica.gradle:maven-settings-plugin:0.5") runtimeOnly("gradle.plugin.io.pry.gradle.offline_dependencies:gradle-offline-dependencies-plugin:0.5.0") // Enable creating an offline repository - runtimeOnly("net.ltgt.gradle:gradle-errorprone-plugin:1.2.1") // Enable errorprone Java static analysis + runtimeOnly("net.ltgt.gradle:gradle-errorprone-plugin:3.1.0") // Enable errorprone Java static analysis runtimeOnly("org.ajoberstar.grgit:grgit-gradle:4.1.1") // Enable website git publish to asf-site branch - runtimeOnly("com.avast.gradle:gradle-docker-compose-plugin:0.17.5") // Enable docker compose tasks + runtimeOnly("com.avast.gradle:gradle-docker-compose-plugin:0.16.12") // Enable docker compose tasks runtimeOnly("ca.cutterslade.gradle:gradle-dependency-analyze:1.8.3") // Enable dep analysis runtimeOnly("gradle.plugin.net.ossindex:ossindex-gradle-plugin:0.4.11") // Enable dep vulnerability analysis - runtimeOnly("org.checkerframework:checkerframework-gradle-plugin:0.6.33") // Enable enhanced static checking plugin + runtimeOnly("org.checkerframework:checkerframework-gradle-plugin:0.6.34") // Enable enhanced static checking plugin } // Because buildSrc is built and tested automatically _before_ gradle diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamDockerPlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamDockerPlugin.groovy new file mode 100644 index 0000000000000..442b35439cae5 --- /dev/null +++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamDockerPlugin.groovy @@ -0,0 +1,325 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.gradle + +import java.util.regex.Pattern +import org.gradle.api.GradleException +import org.gradle.api.Plugin +import org.gradle.api.Project +import org.gradle.api.Task +import org.gradle.api.file.CopySpec +import org.gradle.api.logging.LogLevel +import org.gradle.api.logging.Logger +import org.gradle.api.logging.Logging +import org.gradle.api.tasks.Copy +import org.gradle.api.tasks.Delete +import org.gradle.api.tasks.Exec + +/** + * A gradle plug-in interacting with docker. Originally replicated from + * com.palantir.docker plugin. 
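+ *
+ * <p>Illustrative configuration (values are hypothetical; {@code name} and
+ * {@code tags} are fields of the {@code DockerExtension} below):
+ * <pre>
+ * docker {
+ *   name = "example/image:${project.version}"
+ *   tags 'latest'
+ * }
+ * </pre>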
+ */ +class BeamDockerPlugin implements Plugin { + private static final Logger logger = Logging.getLogger(BeamDockerPlugin.class) + private static final Pattern LABEL_KEY_PATTERN = Pattern.compile('^[a-z0-9.-]*$') + + static class DockerExtension { + Project project + + private static final String DEFAULT_DOCKERFILE_PATH = 'Dockerfile' + String name = null + File dockerfile = null + String dockerComposeTemplate = 'docker-compose.yml.template' + String dockerComposeFile = 'docker-compose.yml' + Set dependencies = [] as Set + Set tags = [] as Set + Map namedTags = [:] + Map labels = [:] + Map buildArgs = [:] + boolean pull = false + boolean noCache = false + String network = null + boolean buildx = false + Set platform = [] as Set + boolean load = false + boolean push = false + String builder = null + + File resolvedDockerfile = null + File resolvedDockerComposeTemplate = null + File resolvedDockerComposeFile = null + + // The CopySpec defining the Docker Build Context files + final CopySpec copySpec + + DockerExtension(Project project) { + this.project = project + this.copySpec = project.copySpec() + } + + void resolvePathsAndValidate() { + if (dockerfile != null) { + resolvedDockerfile = dockerfile + } else { + resolvedDockerfile = project.file(DEFAULT_DOCKERFILE_PATH) + } + resolvedDockerComposeFile = project.file(dockerComposeFile) + resolvedDockerComposeTemplate = project.file(dockerComposeTemplate) + } + + void dependsOn(Task... args) { + this.dependencies = args as Set + } + + Set getDependencies() { + return dependencies + } + + void files(Object... files) { + copySpec.from(files) + } + + void tags(String... args) { + this.tags = args as Set + } + + Set getTags() { + return this.tags + project.getVersion().toString() + } + + Set getPlatform() { + return platform + } + + void platform(String... args) { + this.platform = args as Set + } + } + + @Override + void apply(Project project) { + DockerExtension ext = project.extensions.create('docker', DockerExtension, project) + + Delete clean = project.tasks.create('dockerClean', Delete, { + group = 'Docker' + description = 'Cleans Docker build directory.' + }) + + Copy prepare = project.tasks.create('dockerPrepare', Copy, { + group = 'Docker' + description = 'Prepares Docker build directory.' + dependsOn clean + }) + + Exec exec = project.tasks.create('docker', Exec, { + group = 'Docker' + description = 'Builds Docker image.' + dependsOn prepare + }) + + Task tag = project.tasks.create('dockerTag', { + group = 'Docker' + description = 'Applies all tags to the Docker image.' + dependsOn exec + }) + + Task pushAllTags = project.tasks.create('dockerTagsPush', { + group = 'Docker' + description = 'Pushes all tagged Docker images to configured Docker Hub.' + }) + + project.tasks.create('dockerPush', { + group = 'Docker' + description = 'Pushes named Docker image to configured Docker Hub.' 
+      dependsOn pushAllTags
+    })
+
+    project.afterEvaluate {
+      ext.resolvePathsAndValidate()
+      String dockerDir = "${project.buildDir}/docker"
+      clean.delete dockerDir
+
+      prepare.with {
+        with ext.copySpec
+        from(ext.resolvedDockerfile) {
+          rename { fileName ->
+            fileName.replace(ext.resolvedDockerfile.getName(), 'Dockerfile')
+          }
+        }
+        into dockerDir
+      }
+
+      exec.with {
+        workingDir dockerDir
+        commandLine buildCommandLine(ext)
+        dependsOn ext.getDependencies()
+        logging.captureStandardOutput LogLevel.INFO
+        logging.captureStandardError LogLevel.ERROR
+      }
+
+      Map tags = ext.namedTags.collectEntries { taskName, tagName ->
+        [
+          generateTagTaskName(taskName),
+          [
+            tagName: tagName,
+            tagTask: { -> tagName }
+          ]
+        ]
+      }
+
+      if (!ext.tags.isEmpty()) {
+        ext.tags.each { unresolvedTagName ->
+          String taskName = generateTagTaskName(unresolvedTagName)
+
+          if (tags.containsKey(taskName)) {
+            throw new IllegalArgumentException("Task name '${taskName}' already exists.")
+          }
+
+          tags[taskName] = [
+            tagName: unresolvedTagName,
+            tagTask: { -> computeName(ext.name, unresolvedTagName) }
+          ]
+        }
+      }
+
+      tags.each { taskName, tagConfig ->
+        Exec tagSubTask = project.tasks.create('dockerTag' + taskName, Exec, {
+          group = 'Docker'
+          description = "Tags Docker image with tag '${tagConfig.tagName}'"
+          workingDir dockerDir
+          commandLine 'docker', 'tag', "${-> ext.name}", "${-> tagConfig.tagTask()}"
+          dependsOn exec
+        })
+        tag.dependsOn tagSubTask
+
+        Exec pushSubTask = project.tasks.create('dockerPush' + taskName, Exec, {
+          group = 'Docker'
+          description = "Pushes the Docker image with tag '${tagConfig.tagName}' to configured Docker Hub"
+          workingDir dockerDir
+          commandLine 'docker', 'push', "${-> tagConfig.tagTask()}"
+          dependsOn tagSubTask
+        })
+        pushAllTags.dependsOn pushSubTask
+      }
+    }
+  }
+
+  private List buildCommandLine(DockerExtension ext) {
+    List buildCommandLine = ['docker']
+    if (ext.buildx) {
+      buildCommandLine.addAll(['buildx', 'build'])
+      if (!ext.platform.isEmpty()) {
+        buildCommandLine.addAll('--platform', String.join(',', ext.platform))
+      }
+      if (ext.load) {
+        buildCommandLine.add '--load'
+      }
+      if (ext.push) {
+        buildCommandLine.add '--push'
+        if (ext.load) {
+          throw new Exception("cannot combine 'push' and 'load' options")
+        }
+      }
+      if (ext.builder != null) {
+        buildCommandLine.addAll('--builder', ext.builder)
+      }
+    } else {
+      buildCommandLine.add 'build'
+    }
+    if (ext.noCache) {
+      buildCommandLine.add '--no-cache'
+    }
+    if (ext.getNetwork() != null) {
+      buildCommandLine.addAll('--network', ext.network)
+    }
+    if (!ext.buildArgs.isEmpty()) {
+      for (Map.Entry buildArg : ext.buildArgs.entrySet()) {
+        buildCommandLine.addAll('--build-arg', "${buildArg.getKey()}=${buildArg.getValue()}" as String)
+      }
+    }
+    if (!ext.labels.isEmpty()) {
+      for (Map.Entry label : ext.labels.entrySet()) {
+        if (!label.getKey().matches(LABEL_KEY_PATTERN)) {
+          throw new GradleException(String.format("Docker label '%s' contains illegal characters. " +
+            "Label keys must only contain lowercase alphanumeric, `.`, or `-` characters (must match %s).",
+            label.getKey(), LABEL_KEY_PATTERN.pattern()))
+        }
+        buildCommandLine.addAll('--label', "${label.getKey()}=${label.getValue()}" as String)
+      }
+    }
+    if (ext.pull) {
+      buildCommandLine.add '--pull'
+    }
+    buildCommandLine.addAll(['-t', "${-> ext.name}", '.'])
+    logger.debug("${buildCommandLine}" as String)
+    return buildCommandLine
+  }
+
+  private static String computeName(String name, String tag) {
+    int firstAt = tag.indexOf("@")
+
+    String tagValue
+    if (firstAt > 0) {
+      tagValue = tag.substring(firstAt + 1, tag.length())
+    } else {
+      tagValue = tag
+    }
+
+    if (tagValue.contains(':') || tagValue.contains('/')) {
+      // tag with ':' or '/' -> force use the tag value
+      return tagValue
+    } else {
+      // tag without ':' and '/' -> replace the tag part of original name
+      int lastColon = name.lastIndexOf(':')
+      int lastSlash = name.lastIndexOf('/')
+
+      int endIndex;
+
+      // image_name -> this should remain
+      // host:port/image_name -> this should remain.
+      // host:port/image_name:v1 -> v1 should be replaced
+      if (lastColon > lastSlash) endIndex = lastColon
+      else endIndex = name.length()
+
+      return name.substring(0, endIndex) + ":" + tagValue
+    }
+  }
+
+  private static String generateTagTaskName(String name) {
+    String tagTaskName = name
+    int firstAt = name.indexOf("@")
+
+    if (firstAt > 0) {
+      // Get substring of task name
+      tagTaskName = name.substring(0, firstAt)
+    } else if (firstAt == 0) {
+      // Task name must not be empty
+      throw new GradleException("Task name of docker tag '${name}' must not be empty.")
+    } else if (name.contains(':') || name.contains('/')) {
+      // Tags that contain a repo or name must have a task name
+      throw new GradleException("Docker tag '${name}' must have a task name.")
+    }
+
+    StringBuffer sb = new StringBuffer(tagTaskName)
+    // Uppercase the first letter of task name
+    sb.replace(0, 1, tagTaskName.substring(0, 1).toUpperCase());
+    return sb.toString()
+  }
+}
diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamDockerRunPlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamDockerRunPlugin.groovy
new file mode 100644
index 0000000000000..5297c70181396
--- /dev/null
+++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamDockerRunPlugin.groovy
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.gradle
+
+import org.gradle.api.Plugin
+import org.gradle.api.Project
+import org.gradle.api.tasks.Exec
+
+/**
+ * A gradle plug-in handling the 'docker run' command. Originally replicated from
+ * the com.palantir.docker-run plugin.
+ */
+class BeamDockerRunPlugin implements Plugin {
+
+  /** A class defining the configuration of the dockerRun task.
*/ + static class DockerRunExtension { + String name + String image + Set ports = [] as Set + Map env = [:] + List arguments = [] + Map volumes = [:] + boolean daemonize = true + boolean clean = false + + public String getName() { + return name + } + + public void setName(String name) { + this.name = name + } + } + + @Override + void apply(Project project) { + DockerRunExtension ext = project.extensions.create('dockerRun', DockerRunExtension) + + Exec dockerRunStatus = project.tasks.create('dockerRunStatus', Exec, { + group = 'Docker Run' + description = 'Checks the run status of the container' + }) + + Exec dockerRun = project.tasks.create('dockerRun', Exec, { + group = 'Docker Run' + description = 'Runs the specified container with port mappings' + }) + + Exec dockerStop = project.tasks.create('dockerStop', Exec, { + group = 'Docker Run' + description = 'Stops the named container if it is running' + ignoreExitValue = true + }) + + Exec dockerRemoveContainer = project.tasks.create('dockerRemoveContainer', Exec, { + group = 'Docker Run' + description = 'Removes the persistent container associated with the Docker Run tasks' + ignoreExitValue = true + }) + + project.afterEvaluate { + /** Inspect status of docker. */ + dockerRunStatus.with { + standardOutput = new ByteArrayOutputStream() + commandLine 'docker', 'inspect', '--format={{.State.Running}}', ext.name + doLast { + if (standardOutput.toString().trim() != 'true') { + println "Docker container '${ext.name}' is STOPPED." + return 1 + } else { + println "Docker container '${ext.name}' is RUNNING." + } + } + } + + /** + * Run a docker container. See {@link DockerRunExtension} for supported + * arguments. + * + * Replication of dockerRun task of com.palantir.docker-run plugin. + */ + dockerRun.with { + List args = new ArrayList() + args.addAll(['docker', 'run']) + + if (ext.daemonize) { + args.add('-d') + } + if (ext.clean) { + args.add('--rm') + } else { + finalizedBy dockerRunStatus + } + for (String port : ext.ports) { + args.add('-p') + args.add(port) + } + for (Map.Entry volume : ext.volumes.entrySet()) { + File localFile = project.file(volume.key) + + if (!localFile.exists()) { + logger.error("ERROR: Local folder ${localFile} doesn't exist. 
Mounted volume will not be visible to container") + throw new IllegalStateException("Local folder ${localFile} doesn't exist.") + } + args.add('-v') + args.add("${localFile.absolutePath}:${volume.value}") + } + args.addAll(ext.env.collect{ k, v -> ['-e', "${k}=${v}"]}.flatten()) + args.add('--name') + args.add(ext.name) + if (!ext.arguments.isEmpty()) { + args.addAll(ext.arguments) + } + args.add(ext.image) + + commandLine args + } + + dockerStop.with { + commandLine 'docker', 'stop', ext.name + } + + dockerRemoveContainer.with { + commandLine 'docker', 'rm', ext.name + } + } + } +} diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy index c31482d577e01..70da837b3b34e 100644 --- a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy +++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy @@ -23,7 +23,6 @@ import static java.util.UUID.randomUUID import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar import groovy.json.JsonOutput import groovy.json.JsonSlurper -import java.net.ServerSocket import java.util.logging.Logger import org.gradle.api.attributes.Category import org.gradle.api.GradleException @@ -450,6 +449,21 @@ class BeamModulePlugin implements Plugin { return 'beam' + p.path.replace(':', '-') } + static def getSupportedJavaVersion() { + if (JavaVersion.current() == JavaVersion.VERSION_1_8) { + return 'java8' + } else if (JavaVersion.current() == JavaVersion.VERSION_11) { + return 'java11' + } else if (JavaVersion.current() == JavaVersion.VERSION_17) { + return 'java17' + } else if (JavaVersion.current() == JavaVersion.VERSION_21) { + return 'java21' + } else { + String exceptionMessage = "Your Java version is unsupported. You need Java version of 8, 11, 17 or 21 to get started, but your Java version is: " + JavaVersion.current(); + throw new GradleException(exceptionMessage) + } + } + /* * Set compile args for compiling and running in different java version by modifying the compiler args in place. 
* @@ -489,13 +503,6 @@ class BeamModulePlugin implements Plugin { project.ext.mavenGroupId = 'org.apache.beam' - // Automatically use the official release version if we are performing a release - // otherwise append '-SNAPSHOT' - project.version = '2.52.0' - if (!isRelease(project)) { - project.version += '-SNAPSHOT' - } - // Default to dash-separated directories for artifact base name, // which will also be the default artifactId for maven publications project.apply plugin: 'base' @@ -591,12 +598,12 @@ class BeamModulePlugin implements Plugin { def dbcp2_version = "2.9.0" def errorprone_version = "2.10.0" // Try to keep gax_version consistent with gax-grpc version in google_cloud_platform_libraries_bom - def gax_version = "2.32.0" + def gax_version = "2.33.0" def google_ads_version = "26.0.0" def google_clients_version = "2.0.0" def google_cloud_bigdataoss_version = "2.2.16" // Try to keep google_cloud_spanner_version consistent with google_cloud_spanner_bom in google_cloud_platform_libraries_bom - def google_cloud_spanner_version = "6.45.0" + def google_cloud_spanner_version = "6.47.0" def google_code_gson_version = "2.10.1" def google_oauth_clients_version = "1.34.1" // Try to keep grpc_version consistent with gRPC version in google_cloud_platform_libraries_bom @@ -610,7 +617,7 @@ class BeamModulePlugin implements Plugin { def jackson_version = "2.14.1" def jaxb_api_version = "2.3.3" def jsr305_version = "3.0.2" - def everit_json_version = "1.14.1" + def everit_json_version = "1.14.2" def kafka_version = "2.4.1" def log4j2_version = "2.20.0" def nemo_version = "0.1" @@ -720,14 +727,14 @@ class BeamModulePlugin implements Plugin { google_api_client_java6 : "com.google.api-client:google-api-client-java6:$google_clients_version", google_api_common : "com.google.api:api-common", // google_cloud_platform_libraries_bom sets version // Keep version consistent with the version in google_cloud_bigquery, managed by google_cloud_platform_libraries_bom - google_api_services_bigquery : "com.google.apis:google-api-services-bigquery:v2-rev20230520-$google_clients_version", + google_api_services_bigquery : "com.google.apis:google-api-services-bigquery:v2-rev20230812-$google_clients_version", // Keep version consistent with the version in google_cloud_resourcemanager, managed by google_cloud_platform_libraries_bom - google_api_services_cloudresourcemanager : "com.google.apis:google-api-services-cloudresourcemanager:v1-rev20230129-$google_clients_version", + google_api_services_cloudresourcemanager : "com.google.apis:google-api-services-cloudresourcemanager:v1-rev20230806-$google_clients_version", google_api_services_dataflow : "com.google.apis:google-api-services-dataflow:v1b3-rev20220920-$google_clients_version", - google_api_services_healthcare : "com.google.apis:google-api-services-healthcare:v1-rev20230830-$google_clients_version", + google_api_services_healthcare : "com.google.apis:google-api-services-healthcare:v1-rev20231003-$google_clients_version", google_api_services_pubsub : "com.google.apis:google-api-services-pubsub:v1-rev20220904-$google_clients_version", // Keep version consistent with the version in google_cloud_nio, managed by google_cloud_platform_libraries_bom - google_api_services_storage : "com.google.apis:google-api-services-storage:v1-rev20230617-$google_clients_version", + google_api_services_storage : "com.google.apis:google-api-services-storage:v1-rev20230907-$google_clients_version", google_auth_library_credentials : "com.google.auth:google-auth-library-credentials", // 
google_cloud_platform_libraries_bom sets version google_auth_library_oauth2_http : "com.google.auth:google-auth-library-oauth2-http", // google_cloud_platform_libraries_bom sets version google_cloud_bigquery : "com.google.cloud:google-cloud-bigquery", // google_cloud_platform_libraries_bom sets version @@ -740,14 +747,14 @@ class BeamModulePlugin implements Plugin { google_cloud_datacatalog_v1beta1 : "com.google.cloud:google-cloud-datacatalog", // google_cloud_platform_libraries_bom sets version google_cloud_dataflow_java_proto_library_all: "com.google.cloud.dataflow:google-cloud-dataflow-java-proto-library-all:0.5.160304", // Keep version consistent with the version in google_cloud_datastore, managed by google_cloud_platform_libraries_bom - google_cloud_datastore_v1_proto_client : "com.google.cloud.datastore:datastore-v1-proto-client:2.16.3", + google_cloud_datastore_v1_proto_client : "com.google.cloud.datastore:datastore-v1-proto-client:2.17.1", google_cloud_firestore : "com.google.cloud:google-cloud-firestore", // google_cloud_platform_libraries_bom sets version google_cloud_pubsub : "com.google.cloud:google-cloud-pubsub", // google_cloud_platform_libraries_bom sets version google_cloud_pubsublite : "com.google.cloud:google-cloud-pubsublite", // google_cloud_platform_libraries_bom sets version // The release notes shows the versions set by the BOM: // https://github.com/googleapis/java-cloud-bom/releases/tag/v26.21.0 // Update libraries-bom version on sdks/java/container/license_scripts/dep_urls_java.yaml - google_cloud_platform_libraries_bom : "com.google.cloud:libraries-bom:26.22.0", + google_cloud_platform_libraries_bom : "com.google.cloud:libraries-bom:26.23.0", google_cloud_spanner : "com.google.cloud:google-cloud-spanner", // google_cloud_platform_libraries_bom sets version google_cloud_spanner_test : "com.google.cloud:google-cloud-spanner:$google_cloud_spanner_version:tests", google_code_gson : "com.google.code.gson:gson:$google_code_gson_version", @@ -817,7 +824,7 @@ class BeamModulePlugin implements Plugin { joda_time : "joda-time:joda-time:2.10.10", jsonassert : "org.skyscreamer:jsonassert:1.5.0", jsr305 : "com.google.code.findbugs:jsr305:$jsr305_version", - json_org : "org.json:json:20220320", // Keep in sync with everit-json-schema / google_cloud_platform_libraries_bom transitive deps. + json_org : "org.json:json:20231013", // Keep in sync with everit-json-schema / google_cloud_platform_libraries_bom transitive deps. 
everit_json_schema : "com.github.erosb:everit-json-schema:${everit_json_version}", junit : "junit:junit:4.13.1", jupiter_api : "org.junit.jupiter:junit-jupiter-api:$jupiter_version", @@ -868,7 +875,7 @@ class BeamModulePlugin implements Plugin { slf4j_jul_to_slf4j : "org.slf4j:jul-to-slf4j:$slf4j_version", slf4j_log4j12 : "org.slf4j:slf4j-log4j12:$slf4j_version", slf4j_jcl : "org.slf4j:slf4j-jcl:$slf4j_version", - snappy_java : "org.xerial.snappy:snappy-java:1.1.10.3", + snappy_java : "org.xerial.snappy:snappy-java:1.1.10.4", spark_core : "org.apache.spark:spark-core_2.11:$spark2_version", spark_streaming : "org.apache.spark:spark-streaming_2.11:$spark2_version", spark3_core : "org.apache.spark:spark-core_2.12:$spark3_version", @@ -930,28 +937,66 @@ class BeamModulePlugin implements Plugin { + suffix) } - project.ext.setJava17Options = { CompileOptions options -> - def java17Home = project.findProperty("java17Home") - options.fork = true - options.forkOptions.javaHome = java17Home as File - options.compilerArgs += ['-Xlint:-path'] - // Error prone requires some packages to be exported/opened for Java 17 - // Disabling checks since this property is only used for Jenkins tests - // https://github.com/tbroyer/gradle-errorprone-plugin#jdk-16-support - options.errorprone.errorproneArgs.add("-XepDisableAllChecks") - // The -J prefix is needed to workaround https://github.com/gradle/gradle/issues/22747 - options.forkOptions.jvmArgs += [ - "-J--add-exports=jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED", - "-J--add-exports=jdk.compiler/com.sun.tools.javac.file=ALL-UNNAMED", - "-J--add-exports=jdk.compiler/com.sun.tools.javac.main=ALL-UNNAMED", - "-J--add-exports=jdk.compiler/com.sun.tools.javac.model=ALL-UNNAMED", - "-J--add-exports=jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED", - "-J--add-exports=jdk.compiler/com.sun.tools.javac.processing=ALL-UNNAMED", - "-J--add-exports=jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED", - "-J--add-exports=jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED", - "-J--add-opens=jdk.compiler/com.sun.tools.javac.code=ALL-UNNAMED", - "-J--add-opens=jdk.compiler/com.sun.tools.javac.comp=ALL-UNNAMED" - ] + // set compiler options for java version overrides to compile with a different java version + project.ext.setJavaVerOptions = { CompileOptions options, String ver -> + if (ver == '11') { + def java11Home = project.findProperty("java11Home") + options.fork = true + options.forkOptions.javaHome = java11Home as File + options.compilerArgs += ['-Xlint:-path'] + } else if (ver == '17') { + def java17Home = project.findProperty("java17Home") + options.fork = true + options.forkOptions.javaHome = java17Home as File + options.compilerArgs += ['-Xlint:-path'] + // Error prone requires some packages to be exported/opened for Java 17 + // Disabling checks since this property is only used for tests + // https://github.com/tbroyer/gradle-errorprone-plugin#jdk-16-support + options.errorprone.errorproneArgs.add("-XepDisableAllChecks") + // The -J prefix is needed to workaround https://github.com/gradle/gradle/issues/22747 + options.forkOptions.jvmArgs += [ + "-J--add-exports=jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED", + "-J--add-exports=jdk.compiler/com.sun.tools.javac.file=ALL-UNNAMED", + "-J--add-exports=jdk.compiler/com.sun.tools.javac.main=ALL-UNNAMED", + "-J--add-exports=jdk.compiler/com.sun.tools.javac.model=ALL-UNNAMED", + "-J--add-exports=jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED", + 
"-J--add-exports=jdk.compiler/com.sun.tools.javac.processing=ALL-UNNAMED", + "-J--add-exports=jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED", + "-J--add-exports=jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED", + "-J--add-opens=jdk.compiler/com.sun.tools.javac.code=ALL-UNNAMED", + "-J--add-opens=jdk.compiler/com.sun.tools.javac.comp=ALL-UNNAMED" + ] + } else if (ver == '21') { + def java21Home = project.findProperty("java21Home") + options.fork = true + options.forkOptions.javaHome = java21Home as File + options.compilerArgs += [ + '-Xlint:-path', + '-Xlint:-this-escape' + ] + // Error prone requires some packages to be exported/opened for Java 17+ + // Disabling checks since this property is only used for tests + options.errorprone.errorproneArgs.add("-XepDisableAllChecks") + options.forkOptions.jvmArgs += [ + "-J--add-exports=jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED", + "-J--add-exports=jdk.compiler/com.sun.tools.javac.file=ALL-UNNAMED", + "-J--add-exports=jdk.compiler/com.sun.tools.javac.main=ALL-UNNAMED", + "-J--add-exports=jdk.compiler/com.sun.tools.javac.model=ALL-UNNAMED", + "-J--add-exports=jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED", + "-J--add-exports=jdk.compiler/com.sun.tools.javac.processing=ALL-UNNAMED", + "-J--add-exports=jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED", + "-J--add-exports=jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED", + "-J--add-opens=jdk.compiler/com.sun.tools.javac.code=ALL-UNNAMED", + "-J--add-opens=jdk.compiler/com.sun.tools.javac.comp=ALL-UNNAMED" + ] + // TODO(https://github.com/apache/beam/issues/28963) + // upgrade checkerFramework to enable it in Java 21 + project.checkerFramework { + skipCheckerFramework = true + } + } else { + throw new GradleException("Unknown Java Version ${ver} for setting additional java options") + } } project.ext.repositories = { @@ -1252,7 +1297,7 @@ class BeamModulePlugin implements Plugin { if (configuration.shadowClosure) { // Ensure that tests are packaged and part of the artifact set. 
project.task('packageTests', type: Jar) { - classifier = 'tests-unshaded' + archiveClassifier = 'tests-unshaded' from project.sourceSets.test.output } project.artifacts.archives project.packageTests @@ -1500,27 +1545,20 @@ class BeamModulePlugin implements Plugin { options.errorprone.errorproneArgs.add("-Xep:Slf4jLoggerShouldBeNonStatic:OFF") } - if (project.hasProperty("compileAndRunTestsWithJava11")) { - def java11Home = project.findProperty("java11Home") - project.tasks.compileTestJava { - options.fork = true - options.forkOptions.javaHome = java11Home as File - options.compilerArgs += ['-Xlint:-path'] - setCompileAndRuntimeJavaVersion(options.compilerArgs, '11') - } - project.tasks.withType(Test).configureEach { - useJUnit() - executable = "${java11Home}/bin/java" - } - } else if (project.hasProperty("compileAndRunTestsWithJava17")) { - def java17Home = project.findProperty("java17Home") + // if specified test java version, modify the compile and runtime versions accordingly + if (['11', '17', '21'].contains(project.findProperty('testJavaVersion'))) { + String ver = project.getProperty('testJavaVersion') + def testJavaHome = project.getProperty("java${ver}Home") + + // redirect java compiler to specified version for compileTestJava only project.tasks.compileTestJava { - setCompileAndRuntimeJavaVersion(options.compilerArgs, '17') - project.ext.setJava17Options(options) + setCompileAndRuntimeJavaVersion(options.compilerArgs, ver) + project.ext.setJavaVerOptions(options, ver) } + // redirect java runtime to specified version for running tests project.tasks.withType(Test).configureEach { useJUnit() - executable = "${java17Home}/bin/java" + executable = "${testJavaHome}/bin/java" } } @@ -1560,13 +1598,13 @@ class BeamModulePlugin implements Plugin { } } - // Always configure the shadowJar classifier and merge service files. + // Always configure the shadowJar archiveClassifier and merge service files. if (configuration.shadowClosure) { // Only set the classifer on the unshaded classes if we are shading. - project.jar { classifier = "unshaded" } + project.jar { archiveClassifier = "unshaded" } project.shadowJar({ - classifier = null + archiveClassifier = null mergeServiceFiles() zip64 true into("META-INF/") { @@ -1575,11 +1613,11 @@ class BeamModulePlugin implements Plugin { } } << configuration.shadowClosure) - // Always configure the shadowTestJar classifier and merge service files. + // Always configure the shadowTestJar archiveClassifier and merge service files. 
project.task('shadowTestJar', type: ShadowJar, {
           group = "Shadow"
           description = "Create a combined JAR of project and test dependencies"
-          classifier = "tests"
+          archiveClassifier = "tests"
           from project.sourceSets.test.output
           configurations = [
             project.configurations.testRuntimeMigration
@@ -1639,7 +1677,7 @@ class BeamModulePlugin implements Plugin {
       project.tasks.register("testJar", Jar) {
         group = "Jar"
         description = "Create a JAR of test classes"
-        classifier = "tests"
+        archiveClassifier = "tests"
         from project.sourceSets.test.output
         zip64 true
         exclude "META-INF/INDEX.LIST"
@@ -1794,18 +1832,18 @@ class BeamModulePlugin implements Plugin {
       project.task('sourcesJar', type: Jar) {
         from project.sourceSets.main.allSource
-        classifier = 'sources'
+        archiveClassifier = 'sources'
       }
       project.artifacts.archives project.sourcesJar
       project.task('testSourcesJar', type: Jar) {
         from project.sourceSets.test.allSource
-        classifier = 'test-sources'
+        archiveClassifier = 'test-sources'
       }
       project.artifacts.archives project.testSourcesJar
       project.task('javadocJar', type: Jar, dependsOn: project.javadoc) {
-        classifier = 'javadoc'
+        archiveClassifier = 'javadoc'
        from project.javadoc.destinationDir
       }
       project.artifacts.archives project.javadocJar
@@ -1915,8 +1953,8 @@ class BeamModulePlugin implements Plugin {
           def dependencyNode = dependenciesNode.appendNode('dependency')
           def appendClassifier = { dep ->
             dep.artifacts.each { art ->
+              // Note: dependency artifacts and the Maven POM element are named
+              // 'classifier'; only Jar tasks use the newer 'archiveClassifier'
+              // property, so this block is deliberately excluded from the rename.
               if (art.hasProperty('classifier')) {
                 dependencyNode.appendNode('classifier', art.classifier)
               }
             }
           }
@@ -2162,7 +2200,7 @@ class BeamModulePlugin implements Plugin {
       def goRootDir = "${project.rootDir}/sdks/go"
       // This sets the whole project Go version.
-      project.ext.goVersion = "go1.21.1"
+      project.ext.goVersion = "go1.21.4"
       // Minor TODO: Figure out if we can pull out the GOCMD env variable after goPrepare script
       // completion, and avoid this GOBIN substitution.
@@ -2210,7 +2248,7 @@
     /** ***********************************************************************************************/
     project.ext.applyDockerNature = {
-      project.apply plugin: "com.palantir.docker"
+      project.apply plugin: BeamDockerPlugin
       project.docker { noCache true }
       project.tasks.create(name: "copyLicenses", type: Copy) {
         from "${project.rootProject.projectDir}/LICENSE"
@@ -2222,7 +2260,7 @@
     }
     project.ext.applyDockerRunNature = {
-      project.apply plugin: "com.palantir.docker-run"
+      project.apply plugin: BeamDockerRunPlugin
     }
     /** ***********************************************************************************************/
@@ -2241,6 +2279,9 @@
         }
         groovyGradle { greclipse().configFile(grEclipseConfig) }
       }
+      // Workaround for the spotless groovy and groovyGradle tasks sharing the same
+      // intermediate dir; needed until Beam no longer builds on Java 8 and can
+      // upgrade the spotless plugin.
+      project.tasks.spotlessGroovy.mustRunAfter project.tasks.spotlessGroovyGradle
     }
     // containerImageName returns a configurable container image name, by default a
@@ -2403,7 +2444,20 @@
     // TODO: Decide whether this should be inlined into the one project that relies on it
     // or be left here.
-    project.ext.applyAvroNature = { project.apply plugin: "com.commercehub.gradle.plugin.avro" }
+    project.ext.applyAvroNature = {
+      project.apply plugin: "com.commercehub.gradle.plugin.avro"
+
+      // Add dependencies on the custom tasks defined by BeamModulePlugin.
+      // They are defined only when certain flags are provided (e.g. -Prelease, -Ppublishing, etc.)
+      def sourcesJar = project.tasks.findByName('sourcesJar')
+      if (sourcesJar != null) {
+        sourcesJar.dependsOn project.tasks.getByName('generateAvroJava')
+      }
+      def testSourcesJar = project.tasks.findByName('testSourcesJar')
+      if (testSourcesJar != null) {
+        testSourcesJar.dependsOn project.tasks.getByName('generateTestAvroJava')
+      }
+    }
     project.ext.applyAntlrNature = {
       project.apply plugin: 'antlr'
@@ -2414,6 +2468,17 @@
         generatedSourceDirs += project.generateTestGrammarSource.outputDirectory
         }
       }
+
+      // Add dependencies on the custom tasks defined by BeamModulePlugin.
+      // They are defined only when certain flags are provided (e.g. -Prelease, -Ppublishing, etc.)
+      def sourcesJar = project.tasks.findByName('sourcesJar')
+      if (sourcesJar != null) {
+        sourcesJar.mustRunAfter project.tasks.getByName('generateGrammarSource')
+      }
+      def testSourcesJar = project.tasks.findByName('testSourcesJar')
+      if (testSourcesJar != null) {
+        testSourcesJar.dependsOn project.tasks.getByName('generateTestGrammarSource')
+      }
     }
     // Creates a task to run the quickstart for a runner.
@@ -2530,17 +2595,7 @@
         "java_expansion_service_allowlist_file": javaClassLookupAllowlistFile,
       ]
       def usesDataflowRunner = config.pythonPipelineOptions.contains("--runner=TestDataflowRunner") || config.pythonPipelineOptions.contains("--runner=DataflowRunner")
-      def javaContainerSuffix
-      if (JavaVersion.current() == JavaVersion.VERSION_1_8) {
-        javaContainerSuffix = 'java8'
-      } else if (JavaVersion.current() == JavaVersion.VERSION_11) {
-        javaContainerSuffix = 'java11'
-      } else if (JavaVersion.current() == JavaVersion.VERSION_17) {
-        javaContainerSuffix = 'java17'
-      } else {
-        String exceptionMessage = "Your Java version is unsupported. You need Java version of 8 or 11 or 17 to get started, but your Java version is: " + JavaVersion.current();
-        throw new GradleException(exceptionMessage)
-      }
+      def javaContainerSuffix = getSupportedJavaVersion()
       // 1. Builds the chosen expansion service jar and launches it
       def setupTask = project.tasks.register(config.name+"Setup") {
@@ -2643,17 +2698,7 @@
       ]
       def serviceArgs = project.project(':sdks:python').mapToArgString(expansionServiceOpts)
       def pythonContainerSuffix = project.project(':sdks:python').pythonVersion.replace('.', '')
-      def javaContainerSuffix
-      if (JavaVersion.current() == JavaVersion.VERSION_1_8) {
-        javaContainerSuffix = 'java8'
-      } else if (JavaVersion.current() == JavaVersion.VERSION_11) {
-        javaContainerSuffix = 'java11'
-      } else if (JavaVersion.current() == JavaVersion.VERSION_17) {
-        javaContainerSuffix = 'java17'
-      } else {
-        String exceptionMessage = "Your Java version is unsupported. 
You need Java version of 8 or 11 or 17 to get started, but your Java version is: " + JavaVersion.current();
-        throw new GradleException(exceptionMessage)
-      }
+      def javaContainerSuffix = getSupportedJavaVersion()
       def setupTask = project.tasks.register(config.name+"Setup", Exec) {
         dependsOn ':sdks:java:container:'+javaContainerSuffix+':docker'
         dependsOn ':sdks:python:container:py'+pythonContainerSuffix+':docker'
@@ -2826,17 +2871,7 @@
       ]
       def serviceArgs = project.project(':sdks:python').mapToArgString(transformServiceOpts)
       def pythonContainerSuffix = project.project(':sdks:python').pythonVersion.replace('.', '')
-      def javaContainerSuffix
-      if (JavaVersion.current() == JavaVersion.VERSION_1_8) {
-        javaContainerSuffix = 'java8'
-      } else if (JavaVersion.current() == JavaVersion.VERSION_11) {
-        javaContainerSuffix = 'java11'
-      } else if (JavaVersion.current() == JavaVersion.VERSION_17) {
-        javaContainerSuffix = 'java17'
-      } else {
-        String exceptionMessage = "Your Java version is unsupported. You need Java version of 8 or 11 or 17 to get started, but your Java version is: " + JavaVersion.current();
-        throw new GradleException(exceptionMessage)
-      }
+      def javaContainerSuffix = getSupportedJavaVersion()
       // Transform service delivers transforms that refer to SDK harness containers with following suffixes.
       def transformServiceJavaContainerSuffix = 'java11'
@@ -2947,9 +2982,12 @@
         }
         project.exec {
           executable 'sh'
+          // TODO: https://github.com/apache/beam/issues/29022
+          // pip 23.3 is failing due to a hash mismatch between the expected SHA of the package and the actual SHA.
+          // Until it is resolved on pip's side, don't use pip's cache.
           args '-c', ". ${project.ext.envdir}/bin/activate && " +
-              "pip install --pre --retries 10 --upgrade pip && " +
-              "pip install --pre --retries 10 --upgrade tox -r ${project.rootDir}/sdks/python/build-requirements.txt"
+              "pip install --pre --retries 10 --upgrade pip --no-cache-dir && " +
+              "pip install --pre --retries 10 --upgrade tox --no-cache-dir"
         }
       }
       // Gradle will delete outputs whenever it thinks they are stale. Putting a
@@ -3032,30 +3070,40 @@
         }
         return argList.join(' ')
       }
-
      project.ext.toxTask = { name, tox_env, posargs='' ->
        project.tasks.register(name) {
          dependsOn setupVirtualenv
          dependsOn ':sdks:python:sdist'
-
-          doLast {
-            // Python source directory is also tox execution workspace, We want
-            // to isolate them per tox suite to avoid conflict when running
-            // multiple tox suites in parallel.
-            project.copy { from project.pythonSdkDeps; into copiedSrcRoot }
-
-            def copiedPyRoot = "${copiedSrcRoot}/sdks/python"
-            def distTarBall = "${pythonRootDir}/build/apache-beam.tar.gz"
-            project.exec {
-              executable 'sh'
-              args '-c', ". ${project.ext.envdir}/bin/activate && cd ${copiedPyRoot} && scripts/run_tox.sh $tox_env $distTarBall '$posargs'"
+          if (project.hasProperty('useWheelDistribution')) {
+            def pythonVersionNumber = project.ext.pythonVersion.replace('.', '')
+            dependsOn ":sdks:python:bdistPy${pythonVersionNumber}linux"
+            doLast {
+              project.copy { from project.pythonSdkDeps; into copiedSrcRoot }
+              def copiedPyRoot = "${copiedSrcRoot}/sdks/python"
+              def collection = project.fileTree(project.project(':sdks:python').buildDir){
+                include "**/apache_beam-*cp${pythonVersionNumber}*manylinux*.whl"
+              }
+              String packageFilename = collection.singleFile.toString()
+              project.exec {
+                executable 'sh'
+                args '-c', ". 
${project.ext.envdir}/bin/activate && cd ${copiedPyRoot} && scripts/run_tox.sh $tox_env ${packageFilename} '$posargs' " + } + } + } else { + // tox task will run in editable mode, which is configured in the tox.ini file. + doLast { + project.copy { from project.pythonSdkDeps; into copiedSrcRoot } + def copiedPyRoot = "${copiedSrcRoot}/sdks/python" + project.exec { + executable 'sh' + args '-c', ". ${project.ext.envdir}/bin/activate && cd ${copiedPyRoot} && scripts/run_tox.sh $tox_env '$posargs'" + } } } inputs.files project.pythonSdkDeps outputs.files project.fileTree(dir: "${pythonRootDir}/target/.tox/${tox_env}/log/") } } - // Run single or a set of integration tests with provided test options and pipeline options. project.ext.enablePythonPerformanceTest = { diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/VendorJavaPlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/VendorJavaPlugin.groovy index 061ccf27cce28..97d96e6cf1ebd 100644 --- a/buildSrc/src/main/groovy/org/apache/beam/gradle/VendorJavaPlugin.groovy +++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/VendorJavaPlugin.groovy @@ -126,7 +126,7 @@ artifactId=${project.name} } config.exclusions.each { exclude it } - classifier = null + archiveClassifier = null mergeServiceFiles() zip64 true exclude "META-INF/INDEX.LIST" diff --git a/contributor-docs/release-guide.md b/contributor-docs/release-guide.md index f066047f6df2f..7855c59ebbda4 100644 --- a/contributor-docs/release-guide.md +++ b/contributor-docs/release-guide.md @@ -16,198 +16,206 @@ limitations under the License. ## Introduction -The Apache Beam project periodically declares and publishes releases. A -release is one or more packages of the project artifact(s) that are approved -for general public distribution and use. They may come with various degrees of -caveat regarding their perceived quality and potential for change, such as -“alpha”, “beta”, “incubating”, “stable”, etc. - -The Beam community treats releases with great importance. They are a public +The Beam community treats releases with great importance. They are a public face of the project and most users interact with the project only through the releases. Releases are signed off by the entire Beam community in a public vote. Each release is executed by a *Release Manager*, who is selected among the Beam committers. This document describes the process that the Release Manager -follows to perform a release. Any changes to this process should be discussed -and adopted on the [dev@ mailing list](/get-started/support/). - -Please remember that publishing software has legal consequences. This guide -complements the foundation-wide [Product Release -Policy](https://www.apache.org/dev/release.html) and [Release Distribution -Policy](https://www.apache.org/dev/release-distribution). +follows to perform a release. -### Overview +Please remember that publishing software has legal consequences. This guide +complements the foundation-wide guides: -Release step flow chart + - [Product Release Policy](https://www.apache.org/dev/release.html) + - [Release Distribution + Policy](https://www.apache.org/dev/release-distribution). -The release process consists of several steps: +### What is in a Beam release -1. Decide to release -2. Prepare for the release -3. Build a release candidate -4. Verify & vote on the release candidate -5. If necessary, fix any issues and go back to "Build a release candidate" -6. Finalize the release -7. 
Promote the release
+A Beam release consists of the following:
 
 ------------
 
+ - ASF source zips archived on
+   [dist.apache.org](https://dist.apache.org/release/beam) (later archived to
+   [archive.apache.org](https://archive.apache.org/dist/beam))
+ - Java jars and poms published to [Maven
+   Central](https://mvnrepository.com/artifact/org.apache.beam)
+ - Python wheels published to [pypi](https://pypi.org/project/apache-beam/)
+ - Go artifacts published to
+   [pkg.go.dev](https://pkg.go.dev/github.com/apache/beam)
+ - Docker images published to
+   [dockerhub](https://hub.docker.com/search?q=apache%2Fbeam&type=image)
+ - A tag on GitHub indicating the commit from which the release was built
 
-## 1. Decide to release
+In addition, each release is accompanied by:
 
-Deciding to release and selecting a Release Manager is the first step of the release process.
-This is a consensus-based decision of the entire community.
+ - A blog post announcing the release and describing the changes
+ - An update to the webpage to indicate the latest version
 
-Anybody can propose a release on the dev@ mailing list, giving a solid argument and nominating a committer as the Release Manager (including themselves).
-There’s no formal process, no vote requirements, and no timing requirements. Any objections should be resolved by consensus before starting the release.
+### Phases of the release process
 
-In general, the community prefers to have a rotating set of 3-5 Release Managers.
-Keeping a small core set of managers allows enough people to build expertise in this area and improve processes over time, without Release Managers needing to re-learn the processes for each release.
-That said, if you are a committer interested in serving the community in this way, please reach out to the community on the dev@ mailing list.
+The release process consists of several phases:
 
-### Checklist to proceed to the next step
+1. Prepare for release
+2. Stabilize the release branch / burn down release-blocking issues
+3. Build a release candidate
+4. Validate and approve the release candidate
+5. Finalize the release
+6. Promote the release
+7. Post-release tasks
 
-- [ ] Community agrees to release
-- [ ] Community selects a committer as Release Manager
+------------
 
--------
+## Prepare for release (~1 week before release cut)
 
-## 2. Prepare for the release (~1 week before branch cut)
+The following steps take place before the release branch is cut.
 
-Before your first release, you should perform one-time configuration steps.
-This will set up your security keys for signing the release and access to various release repositories.
+### Decide to release
 
-To prepare for each release, you should audit the project status in the GitHub issue tracker, and do necessary bookkeeping.
-Finally, create a release branch from which individual release candidates will be built.
+Deciding to release and selecting a Release Manager is the first step of the
+release process. This is a consensus-based decision of the entire community.
+Anybody can propose a release on the `dev@` list. There is no formal process,
+no vote requirements, and no timing requirements. A committer must be
+identified to be the Release Manager. In practice, most often a committer both
+proposes to release and volunteers themselves as Release Manager. 
-__NOTE__: If you are using [GitHub two-factor authentication](https://help.github.com/articles/securing-your-account-with-two-factor-authentication-2fa/) and haven't configure HTTPS access,
-please follow [the guide](https://help.github.com/articles/creating-a-personal-access-token-for-the-command-line/) to configure command line access.
+-------
 
### Create a new milestone in GitHub for the next release
 
-When contributors resolve an issue in GitHub, they are tagging it with a release that will contain their changes.
-With the release currently underway, new issues should be resolved against a subsequent future release.
-Therefore, you should create a release item for this subsequent release, as follows:
+When contributors resolve an issue in GitHub, they are tagging it with a
+release that will contain their changes. With the release currently underway,
+new issues should be resolved against a subsequent future release. Therefore,
+you should create a release item for this subsequent release, as follows:
 
-1. In GitHub, navigate to [`Issues > Milestones > New Milestone`](https://github.com/apache/beam/milestones).
-2. Add a new release. Choose the next minor version number after the version currently underway, select the next release due date (generally 6 weeks from today’s date) as the `Start Date`, and choose `Create Milestone`.
-3. At the end of the release, go to the same page and mark the recently released version as closed.
+In GitHub, navigate to [`Issues > Milestones > New
+Milestone`](https://github.com/apache/beam/milestones) and add a new
+release for the next minor version after the version you are preparing
+to release.
 
-### Accounts
+----
 
-Please have these credentials ready at hand, you will likely need to enter them multiple times:
+### Prepare accounts, keys, etc.
 
-* Apache ID and Password;
-* GitHub ID, Password, and Personal Access Token.
-* PyPi account and apitoken
-* DockerHub ID and Password with beam maintainer access
-* GPG pass phrase & 16-digit key ID
-* Access to Beam's Apache Nexus repository
-* Account to access to apache-beam-testing Google Cloud Platform project. The account must have permissions to start Cloud Build triggers. Required for Playground environment update. (E-mail at dev@ mailing list to request access)
+Before your first release, you need to make sure you have all the necessary
+accounts, keys, and access for publishing the release. The release process also
+requires a variety of API tokens, which you can generate now or later when they
+are needed.
 
-If you don't have a given credential, follow the 'one-time' instructions below.
+These are the credentials you will need:
 
----
-
-### One-time setup instructions
 
+ - Apache ID and Password
+ - GitHub ID, Password, and Personal Access Token
+ - PyPI account with beam maintainer access and API token
+ - GPG pass phrase & 16-digit key ID
+ - Access to Beam's Apache Nexus repository
+ - Account with access to the apache-beam-testing Google Cloud Platform
+   project. The account must have permissions to start Cloud Build triggers.
+   Required for the Playground environment update. (E-mail the dev@ mailing
+   list to request access.)
 
#### Apache ID and Password
 
-This is your Apache committer user name and password. You selected these when you became an Apache Beam Committer.
+This is your Apache committer user name and password. You selected these when
+you became an Apache Beam Committer. 
#### Github ID, Password, and Personal Access Token
 
-For some scripts, you need a Personal Access Token with `repo` and `workflow` permissions.
-They can be generated from this page: https://github.com/settings/tokens.
-See https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens for details.
-
-#### Register to PyPI
-
-Release manager needs to have an account with PyPI.
-If you need one, [register at PyPI](https://pypi.python.org/account/register/).
-You also need to be a maintainer (or an owner) of the [apache-beam](https://pypi.python.org/pypi/apache-beam) package in order to push a new release.
-Ask on the mailing list for assistance.
-
-Generate a [PyPI APIToken](https://pypi.org/help/#apitoken) for use during the release.
-
-#### Login to DockerHub
-If you are a member of the [`beam` DockerHub team](https://hub.docker.com/orgs/apache/teams/beam), run the following command manually.
-It will ask you to input your DockerHub ID and password if authorization info cannot be found from ~/.docker/config.json file.
-
-```
-docker login docker.io
-```
-
-After successful login, authorization info will be stored at ~/.docker/config.json file.
-For example,
-```
-"https://index.docker.io/v1/": {
-  "auth": "xxxxxx"
-}
-```
-
-If you are not already a member of the `beam` team, please email `dev@` mailing list for help with any DockerHub related tasks. We are not able
-to add more members to the DockerHub team because [the ASF has a limited number of seats available](https://infra.apache.org/docker-hub-policy.html).
 
+ - [ ] If you are using [GitHub two-factor
+   authentication](https://help.github.com/articles/securing-your-account-with-two-factor-authentication-2fa/)
+   and haven't configured HTTPS access, please follow [the
+   guide](https://help.github.com/articles/creating-a-personal-access-token-for-the-command-line/)
+   to configure command line access.
+ - [ ] Generate a Personal Access Token with `repo` and `workflow` permissions.
+   They can be generated from this page: https://github.com/settings/tokens.
+   See
+   https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens
+   for details.
+
+#### PyPI account and API token
+
+ - [ ] [Create an account with PyPI](https://pypi.python.org/account/register/)
+   if you don't have one already.
+ - [ ] Become a maintainer (or an owner) of the
+   [apache-beam](https://pypi.python.org/pypi/apache-beam) package.
+ - [ ] Generate a [PyPI API token](https://pypi.org/help/#apitoken) for use
+   during the release.
 
#### GPG Key
 
-You need to have a GPG key to sign the release artifacts.
-Please be aware of the ASF-wide [release signing guidelines](https://www.apache.org/dev/release-signing.html).
-If you don’t have a GPG key associated with your Apache account, please create one according to the guidelines.
+You need to have a GPG key to sign the release artifacts. Please be aware of
+the ASF-wide [release signing
+guidelines](https://www.apache.org/dev/release-signing.html). If you don’t
+have a GPG key associated with your Apache account, you must now create one
+according to the guidelines.
 
-There are 2 ways to configure your GPG key for release, either using release automation script(which is recommended), or running all commands manually. 
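If you go the manual route, a minimal sketch of key creation (assuming GnuPG
2.x; choose __RSA and RSA (default)__ and __4096__ bits when prompted;
`<KEY_ID>` is a placeholder for your own key ID):

    gpg --full-generate-key                       # create the key interactively
    gpg --list-secret-keys --keyid-format LONG    # find your 16-digit key ID
    git config --global user.signingkey <KEY_ID>  # configure git to sign with it

This does not replace updating the KEYS files; the helper script below handles
that step as well.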
+Run the following helper script, or you can open it and run the commands
+individually (helpful if it doesn't work as intended or if you already are
+partially set up).
 
-##### Use preparation_before_release.sh to setup GPG
-* **Script:** [preparation_before_release.sh](https://github.com/apache/beam/blob/master/release/src/main/scripts/preparation_before_release.sh)
+    ./release/src/main/scripts/preparation_before_release.sh
 
-* **Usage**
-  ```
-  ./release/preparation_before_release.sh
-  ```
-* **Tasks included**
-  1. Help you create a new GPG key if you want.
-  2. Configure ```git user.signingkey``` with chosen pubkey.
-  3. Add chosen pubkey into [dev KEYS](https://dist.apache.org/repos/dist/dev/beam/KEYS) and [release KEYS](https://dist.apache.org/repos/dist/release/beam/KEYS)
+> **__NOTE__**:
+> When generating the key, please make sure you choose the key type as
+> __RSA and RSA (default)__ and key size as __4096 bit__.
 
   **NOTES**: Only PMC can write into [release repo](https://dist.apache.org/repos/dist/release/beam/).
-  4. Start GPG agents.
+Now you should have:
 
-__NOTE__: When generating the key, please make sure you choose the key type as __RSA and RSA (default)__ and key size as __4096 bit__.
-
-* To run the commands manually, refer to the contents of `preparation_before_release.sh`.
+ - [ ] A GPG key meeting ASF guidelines
+ - [ ] The key added to
+   [dev KEYS](https://dist.apache.org/repos/dist/dev/beam/KEYS) and [release KEYS](https://dist.apache.org/repos/dist/release/beam/KEYS)
+   **NOTE**: Only PMC can write into [release repo](https://dist.apache.org/repos/dist/release/beam/).
+ - [ ] The `user.signingkey` set in your `.gitconfig`
+ - [ ] `gpg-agent` with the key loaded
 
##### Key ID
 
-* You may need your Key ID for future steps. Determine your Apache GPG Key and Key ID as follows:
+You may need your Key ID for future steps. Determine your Apache GPG Key and
+Key ID as follows:
 
-      gpg --list-sigs --keyid-format LONG
+    gpg --list-sigs --keyid-format LONG
 
-  This will list your GPG keys. One of these should reflect your Apache account, for example:
+This will list your GPG keys. One of these should reflect your Apache account,
+for example:
 
-      --------------------------------------------------
-      pub   rsa4096/845E6689845E6689 2016-02-23
-      uid   Nomen Nescio
-      sub   rsa4096/BA4D50BEBA4D50BE 2016-02-23
+    --------------------------------------------------
+    pub   rsa4096/845E6689845E6689 2016-02-23
+    uid   Nomen Nescio
+    sub   rsa4096/BA4D50BEBA4D50BE 2016-02-23
 
-  Here, the key ID is the 16-digit hex string in the `pub` line: `845E6689845E6689`.
+Here, the key ID is the 16-digit hex string in the `pub` line: `845E6689845E6689`.
 
##### Submit your GPG public key into Ubuntu OpenPGP Key Server
 
-In order to make yourself have right permission to stage java artifacts in Apache Nexus staging repository,
-please submit your GPG public key into the [Ubuntu OpenPGP Key Server](https://keyserver.ubuntu.com/).
-You will need to use an ascii-armored version of your key.
-This can be obtained by running `gpg --export --armor` and copying the whole block
-(including `----- PGP PUBLIC KEY BLOCK-----`).
+To get the right permissions to stage Java artifacts in the Apache Nexus
+staging repository, submit your GPG public key to the
+[Ubuntu OpenPGP Key Server](https://keyserver.ubuntu.com/).
+
+You will need to use an ascii-armored version of your key. 
This can be
+obtained by running:
+
+    gpg --export --armor
+
+Copy the whole block, including `-----BEGIN PGP PUBLIC KEY BLOCK-----` and
+`-----END PGP PUBLIC KEY BLOCK-----`.
 
#### Access to Apache Nexus repository
 
-Configure access to the [Apache Nexus repository](https://repository.apache.org/), which enables final deployment of releases to the Maven Central Repository.
+Configure access to the [Apache Nexus
+repository](https://repository.apache.org/), which enables final deployment of
+releases to the Maven Central Repository.
 
-1. You log in with your Apache account.
-2. Confirm you have appropriate access by finding `org.apache.beam` under `Staging Profiles`.
+1. Log in with your Apache account.
+2. Confirm you have appropriate access by finding `org.apache.beam` under
+   `Staging Profiles`.
3. Navigate to your `Profile` (top right dropdown menu of the page).
-4. Choose `User Token` from the dropdown, then click `Access User Token`. Copy a snippet of the Maven XML configuration block.
-5. Insert this snippet twice into your global Maven `settings.xml` file, typically `${HOME}/.m2/settings.xml`. The end result should look like this, where `TOKEN_NAME` and `TOKEN_PASSWORD` are your secret tokens:
+4. Choose `User Token` from the dropdown, then click `Access User Token`. Copy
+   a snippet of the Maven XML configuration block.
+5. Insert this snippet
+   twice into your global Maven `settings.xml` file, typically
+   `${HOME}/.m2/settings.xml`. The end result should look like this, where
+   `TOKEN_NAME` and `TOKEN_PASSWORD` are your secret tokens:
 
@@ -225,463 +233,463 @@ Configure access to the [Apache Nexus repository](https://repository.apache.org/
 
-**********
+----
+
+### Dependency checks
 
-### Handle Per Release tasks
+Each language has routine dependency maintenance that you should check now.
 
#### Update base image dependencies for Python container images
 
-The Python base container images have static pinned `requirements.txt` that are
-designed to be compatible with our dependency constraints but also not cause
-runtime installs to occur, which slow things down immensely.
-These need to be updated at least once per release cycle to avoid
-out of date dependencies.
+The Python base container images have pinned `requirements.txt` that are
+compatible with our dependency constraints, and designed to avoid run-time
+installs, since run-time installs cause large delays at start-up time. Ideally,
+this should happen regularly as dependencies update, but it is important
+to ensure that they are fully up to date for each release.
 
Follow the instructions at https://s.apache.org/beam-python-requirements-generate
 
-Ideally, do the update at least a week before the release cut, so that any issues
-related to the update have time to surface.
-
#### Update Go version used for container builds
 
-Go makes security patch releases of their tooling. Ideally, we upgrade as soon
-as possible, but it is also good to ensure we are up to date for each release.
-
-This potentially affects container bootloader security, and at the least can cause
-false positives when an default-configuration scanner is pointed at our containers.
-
- - [ ] See if https://go.dev/doc/devel/release has a newer release. Update throughout
-   Beam. See example at https://github.com/apache/beam/pull/27900/files
-
-#### Update the Java BOM
-
-Tracked in Github issue https://github.com/apache/beam/issues/28379
-
-Ideally, do the update at least a week before the release cut, so that any issues
-related to the update have time to surface. 
-
-#### Investigate performance regressions
+Go makes security patch releases of their tooling. This potentially affects
+container bootloader security, and at the least can cause false positives when
+a default-configuration scanner is pointed at our containers. Ideally, we
+upgrade as soon as possible, but it is also good to ensure we are up to date
+for each release.
 
-Check the Beam load tests for possible performance regressions.
-Measurements are available on [metrics.beam.apache.org](http://metrics.beam.apache.org).
-
-All Runners which publish data should be checked for the following, in both *batch* and *streaming* mode:
-
-- [ParDo](http://metrics.beam.apache.org/d/MOi-kf3Zk/pardo-load-tests) and [GBK](http://metrics.beam.apache.org/d/UYZ-oJ3Zk/gbk-load-test): Runtime, latency, checkpoint duration
-- [Nexmark](http://metrics.beam.apache.org/d/ahudA_zGz/nexmark): Query runtime for all queries
-- [IO](http://metrics.beam.apache.org/d/bnlHKP3Wz/java-io-it-tests-dataflow): Runtime
+See if https://go.dev/doc/devel/release has a newer release. Update throughout
+Beam. See example at https://github.com/apache/beam/pull/27900/files
 
-If regressions are found, the release branch can still be created, but the regressions should be investigated and fixed as part of the release process.
-The role of the release manager is to file GitHub issues for each regression with the milestone set to the to-be-released version.
-The release manager oversees these just like any other issue marked with the milestone of the release.
+### Cut the release branch
 
-The mailing list should be informed to allow fixing the regressions in the course of the release. Issues should be filed and tagged with the milestone.
+> **Note**
+> Wait until the proposed branch cut day!
 
-#### Triage release-blocking issues in GitHub
+We cut the release branch on time and do not block/delay the branch cut for
+incoming fixes. This is because bugs are always being introduced as part of
+normal development. We cut the branch to prevent new bugs being introduced and
+_then_ we fix and cherry-pick any truly release-blocking problems.
 
-There could be outstanding release-blocking issues, which should be triaged before proceeding to build a release candidate.
-We track them by assigning the blocked release to the issue's milestone before the issue is resolved.
+In order to run this workflow, you will need to provide an Apache ID and a
+Jenkins API token. Your Jenkins API token can be generated by visiting
+https://ci-beam.apache.org, signing in with your Apache credentials, then going
+to `https://ci-beam.apache.org/user//configure` and clicking
+`Add new token` in the API token section.
 
-The release manager should triage what does and does not block a release.
-The list of release-blocking issues is available at the [milestone status page](https://github.com/apache/beam/milestones).
-Triage each unresolved issue with one of the following resolutions:
+- [ ] Run
+  [cut_release_branch](https://github.com/apache/beam/actions/workflows/cut_release_branch.yml)
+  (click `run workflow`)
 
-* An issue should not block the release if the problem exists in the current released version or is a bug in new functionality that does not exist in the current released version.
-* An issue should be a blocker if the problem is a regression between the currently released version and the release in progress and has no easy workaround. 
+The final state of the repository after the release branch is cut should match
+this diagram:
 
-For all GitHub issues:
+Increment minor version on master branch and set Dataflow container version on release branch
 
-* If the issue has been resolved and the GitHub issue was not updated, resolve it accordingly.
+This should be accomplished by the
+[cut_release_branch](https://github.com/apache/beam/actions/workflows/cut_release_branch.yml)
+workflow. This workflow will also update
+[mass_comment.py](https://github.com/apache/beam/blob/master/release/src/main/scripts/mass_comment.py)
+to contain all of the active Jenkins jobs.
 
-For issues with type "Bug" or labeled "flaky":
+The following must be manually done or confirmed:
 
-* If the issue is a known continuously failing test, it is not acceptable to defer this until the next release.
-  Please work with the Beam community to resolve the issue.
-* If the issue is a known flaky test, make an attempt to delegate a fix.
-  However, if the issue may take too long to fix (to the discretion of the release manager):
-  * Delegate manual testing of the flaky issue to ensure no release blocking issues.
-  * Update the milestone to the version of the next release.
-    Please consider discussing this with stakeholders and the dev@ mailing list, as appropriate.
+- [ ] The `master` branch has the SNAPSHOT/dev version incremented.
+- [ ] The release branch has the SNAPSHOT/dev version to be released.
+- [ ] The Dataflow container image should be modified to the version to be released.
+- [ ] Due to a bug/limitation in the workflow, you must navigate to the pull
+  request found in the logs and comment `Run Gradle Publish`.
+- [ ] After publishing, close the PR.
+- [ ] Manually update `CHANGES.md` on `master` by adding a new section for the
+  next release
+  ([example](https://github.com/apache/beam/commit/96ab1fb3fe07acf7f7dc9d8c829ae36890d1535c)).
 
-For all other GitHub issues:
+#### Inform the mailing list
 
-* If the issue has not been resolved and it is acceptable to defer this until the next release, update the milestone to the new version you just created.
-  Please consider discussing this with stakeholders and the dev@ mailing list, as appropriate.
-* If the issue has not been resolved and it is not acceptable to release until it is fixed, the release cannot proceed.
-  Instead, work with the Beam community to resolve the issue.
+The dev@ mailing list should be informed about the release branch being cut.
+Along with this note, a list of pending issues and to-be-triaged issues
+should be included. Afterwards, this list can be refined and updated by the
+release manager and the Beam community.
 
-If there is a bug found in the RC creation process/tools, those issues should be considered high priority and fixed in 7 days.
 
-### Checklist to proceed to the next step
+### Checklist to proceed to the next phase
 
+- [ ] Community agrees to release
+- [ ] Community selects a committer (you) as Release Manager
- [ ] Next release has a milestone in github.
- [ ] You have your various account credentials prepared.
-- [ ] Per Release tasks for the current release have been handled.
-- [ ] Open issues/PRs against the current release have been notified.
-- [ ] Performance Regressions have been investigated and had issues filed.
-- [ ] It is the proposed branch cut day.
 
-**********
 
-## 3. 
Build a release candidate
 
-Building a release candidate involves creating a release branch, running validation tests against the branch, filing issues, cherry picking fixes,
-making a release candidate tag, and building all artifacts from that tag.
+- [ ] You checked the dependency maintenance for each language.
+- [ ] The release branch is created.
+- [ ] The `master` branch is moved along to the next release.
+- [ ] You have informed `dev@beam.apache.org` that you have cut the branch and
+  are proceeding to stabilization.
 
-### Create a release branch in apache/beam repository
-
-As a final step in preparation for the release, you should create the release branch, and update version information on the original branch.
-This should happen once per release. If additional release candidates are required, they are built from later versions of this branch.
-
-The final state of the repository should match this diagram:
-
-Increment minor version on master branch and set Dataflow container version on release branch
-
-The key points to know:
-
-- The `master` branch has the SNAPSHOT/dev version incremented.
-- The release branch has the SNAPSHOT/dev version to be released.
-- The Dataflow container image should be modified to the version to be released.
-
-This will all be accomplished by the [cut_release_branch](https://github.com/apache/beam/actions/workflows/cut_release_branch.yml)
-workflow. This workflow will also update [mass_comment.py](https://github.com/apache/beam/blob/master/release/src/main/scripts/mass_comment.py)
-to contain all of the active Jenkins jobs.
 
-After updating the master branch, the workflow will also start a build of
-[the nightly snapshot](https://ci-beam.apache.org/job/beam_Release_NightlySnapshot/) against master branch.
-Some processes, including our archetype tests, rely on having a live SNAPSHOT of the current version from the `master` branch.
-Once the release branch is cut, these SNAPSHOT versions are no longer found, so builds will be broken until a new snapshot is available.
-The workflow starts the nightly snapshot by creating an empty PR against apache:master (which will be linked to in the logs).
 
-#### Use cut_release_branch workflow to cut a release branch
 
-* **Action:** [cut_release_branch](https://github.com/apache/beam/actions/workflows/cut_release_branch.yml) (click `run workflow`)
+-------
 
-In order to run this workflow, you will need to provide a Apache ID and Jenkins API token.
-Your Jenkins API token can be generated by visiting https://ci-beam.apache.org, signing in with your Apache credentials,
-then going to `https://ci-beam.apache.org/user//configure` and clicking `Add new token` in the API token section.
+## Stabilize the release branch
 
-* Tasks you need to do manually to __verify the SNAPSHOT build__
-  1. Check whether the Jenkins job gets triggered. If not, please comment ```Run Gradle Publish``` into the generated PR.
-  2. After verifying build succeeded, you need to close PR manually.
-  3. Manually update `CHANGES.md` on `master` by adding a new section for the next release ([example](https://github.com/apache/beam/commit/96ab1fb3fe07acf7f7dc9d8c829ae36890d1535c)).
+Once the release branch is cut, your job is to make sure tests pass, fix bugs,
+confirm performance, defer feature requests, etc., until the branch is ready for
+the work of building a release candidate. 
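One convenient way to watch the burn-down from the command line (a sketch,
assuming an authenticated `gh` CLI; the branch and milestone names shown are
illustrative):

    # Open PRs targeting the release branch (pending cherry-picks)
    gh pr list --repo apache/beam --base release-2.52.0
    # Unresolved issues tagged with the release milestone
    gh issue list --repo apache/beam --milestone "2.52.0 Release"

The milestone status page in GitHub remains the source of truth; this is just a
convenience.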
### Verify release branch
 
-After the release branch is cut you need to make sure it builds and has no significant issues that would block the creation of the release candidate.
-There are 2 ways to perform this verification, either running automation script(recommended), or running all commands manually.
+After the release branch is cut, make sure it builds and has no significant
+issues that would block the creation of the release candidate.
 
-> Dataflow tests will fail if the Dataflow worker container is not created and published by this time.
-> Should be done by Google, in response to the creation of the release branch, and docker images are hosted.
-> This should not block creation of the first release candidate, but should block approval of the release.
+> **NOTE**
+> Dataflow tests will fail if the Dataflow worker container is not created and
+> published by this time. Should be done by Google, in response to the
+> creation of the release branch, and docker images are hosted. This should
+> not block creation of the first release candidate, but should block approval
+> of the release.
+
+- **Script:**
+  [verify_release_build.sh](https://github.com/apache/beam/blob/master/release/src/main/scripts/verify_release_build.sh)
+
+- **Usage**
+  1. Create a personal access token from your Github account.
+     See instruction [here](https://help.github.com/en/articles/creating-a-personal-access-token-for-the-command-line).
+     It will be used by the script to access the GitHub API.
+     You need to enable `repo` and `workflow` permissions for this token.
+  2. Update required configurations listed in `RELEASE_BUILD_CONFIGS` in [script.config](https://github.com/apache/beam/blob/master/release/src/main/scripts/script.config)
+  3. Then run
+     ```
+     (cd release/src/main/scripts && ./verify_release_build.sh)
+     ```
+  4. Trigger all Github Action and Jenkins PostCommit jobs from the PR created by the previous step.
+     GitHub Action jobs should be triggered by the `pull_request_target` event of a specific placeholder file
+     added to the PR (`release/trigger_all_tests.json`), so no additional action should be needed.
+     You can run [mass_comment.py](https://github.com/apache/beam/blob/master/release/src/main/scripts/mass_comment.py) to trigger the Jenkins jobs.
+     Alternatively, manually add one trigger phrase per PR comment for Jenkins tests, or rerun the workflow for GitHub Action tests.
+     See [jenkins_jobs.txt](https://github.com/apache/beam/blob/master/release/src/main/scripts/jenkins_jobs.txt)
+     for a full list of phrases.
+
+- **Tasks included in the script**
+  - Installs `hub` with your agreement and sets up a local git repo;
+  - Creates a test PR against the release branch;
+
+There are some projects that don't produce artifacts, e.g.
+`beam-test-tools`; you may be able to ignore failures there.
+
+To triage the failures and narrow things down you may want to look at
+`settings.gradle.kts` and run the build only for the projects you're interested
+in at the moment, e.g. `./gradlew :runners:java-fn-execution`.
+
+The `verify_release_build.sh` script may include failing or flaky tests. For
+each of the failing tests create a GitHub Issue with the following properties:
 
-#### Run automation script (verify_release_build.sh)
-* **Script:** [verify_release_build.sh](https://github.com/apache/beam/blob/master/release/src/main/scripts/verify_release_build.sh)
-
-* **Usage**
-  1. Create a personal access token from your Github account. 
- See instruction [here](https://help.github.com/en/articles/creating-a-personal-access-token-for-the-command-line). - It'll be used by the script for accessing Github API. - You need to enable `repo` and `workflow` permissions for this token. - 2. Update required configurations listed in `RELEASE_BUILD_CONFIGS` in [script.config](https://github.com/apache/beam/blob/master/release/src/main/scripts/script.config) - 3. Then run - ``` - (cd release/src/main/scripts && ./verify_release_build.sh) - ``` - 4. Trigger all Jenkins PostCommit jobs from the PR created by the previous step. - You can run [mass_comment.py](https://github.com/apache/beam/blob/master/release/src/main/scripts/mass_comment.py) to do that. - Or manually add one trigger phrase per PR comment. - See [jenkins_jobs.txt](https://github.com/apache/beam/blob/master/release/src/main/scripts/jenkins_jobs.txt) - for a full list of phrases. +* **Issue Type:** Bug -* **Tasks included in the script** - 5. Installs `hub` with your agreement and setup local git repo; - 6. Create a test PR against release branch; +* **Summary:** Name of failing gradle task and name of failing test (where applicable) in form of :MyGradleProject:SomeGradleTask NameOfFailedTest: Short description of failure -#### Verify the build succeeds +* **Priority:** P1 -* Tasks you need to do manually to __verify the build succeed__: - 1. Check the build result. - 2. If build failed, scan log will contain all failures. - 3. You should stabilize the release branch until release build succeeded. +* **Component:** "test-failures" -There are some projects that don't produce the artifacts, e.g. `beam-test-tools`, you may be able to ignore failures there. +* **Milestone:** Release number of verified release branch -To triage the failures and narrow things down you may want to look at `settings.gradle.kts` and run the build only for the projects you're interested at the moment, e.g. `./gradlew :runners:java-fn-execution`. +* **Description:** Description of failure -#### (Alternative) Run release build locally -You will need to have Python interpreters for all supported Python minor -versions to run Python tests. See Python installation tips in [Developer Wiki](https://cwiki.apache.org/confluence/display/BEAM/Python+Tips#PythonTips-InstallingPythoninterpreters). +### Investigate performance regressions -* **Run gradle release build** +Check the Beam load tests for possible performance regressions. Measurements +are available on [metrics.beam.apache.org](http://metrics.beam.apache.org). - 1. Clean current workspace +All Runners which publish data should be checked for the following, in both +*batch* and *streaming* mode: - ``` - git clean -fdx - ./gradlew clean - ``` +- [ParDo](http://metrics.beam.apache.org/d/MOi-kf3Zk/pardo-load-tests) and + [GBK](http://metrics.beam.apache.org/d/UYZ-oJ3Zk/gbk-load-test): Runtime, + latency, checkpoint duration +- [Nexmark](http://metrics.beam.apache.org/d/ahudA_zGz/nexmark): Query runtime + for all queries +- [IO](http://metrics.beam.apache.org/d/bnlHKP3Wz/java-io-it-tests-dataflow): Runtime - 2. Unlock the secret key - ``` - gpg --output ~/doc.sig --sign ~/.bashrc - ``` +If regressions are found, the release branch can still be created, but the +regressions should be investigated and fixed as part of the release process. +The role of the release manager is to file GitHub issues for each regression +with the milestone set to the to-be-released version. 
The release manager +oversees these just like any other issue marked with the milestone of the +release. - 3. Run build command - ``` - ./gradlew build -PisRelease --no-parallel --scan --stacktrace --continue - ``` +The mailing list should be informed to allow fixing the regressions in the +course of the release. Issues should be filed and tagged with the milestone. - To speed things up locally you might want to omit `--no-parallel`. You can also omit `--continue` - if you want build fails after the first error instead of continuing, it may be easier and faster - to find environment issues this way without having to wait until the full build completes. +### Triage release-blocking issues in GitHub -#### Create release-blocking issues in GitHub +There could be outstanding release-blocking issues, which should be triaged +before proceeding to build a release candidate. We track them by assigning the +blocked release to the issue's milestone before the issue is resolved. -The verify_release_build.sh script may include failing or flaky tests. -For each of the failing tests create a GitHub Issue with the following properties: +The release manager should triage what does and does not block a release. The +list of release-blocking issues is available at the [milestone status +page](https://github.com/apache/beam/milestones). Triage each unresolved issue +with one of the following resolutions: -* **Issue Type:** Bug + - An issue should not block the release if the problem exists in the current + released version or is a bug in new functionality that does not exist in the + current released version. + - An issue should be a blocker if the problem is a regression between the + currently released version and the release in progress and has no easy + workaround. -* **Summary:** Name of failing gradle task and name of failing test (where applicable) in form of :MyGradleProject:SomeGradleTask NameOfFailedTest: Short description of failure +For all GitHub issues: -* **Priority:** P1 + - If the issue has been resolved and the GitHub issue was not updated, + resolve it accordingly. -* **Component:** "test-failures" +For issues with type "Bug" or labeled "flaky": -* **Milestone:** Release number of verified release branch + - If the issue is a known continuously failing test, it is not acceptable to + defer this until the next release. Please work with the Beam community to + resolve the issue. + - If the issue is a known flaky test, make an attempt to delegate a fix. + However, if the issue may take too long to fix (to the discretion of the + release manager): + - Delegate manual testing of the flaky issue to ensure no release blocking issues. + - Update the milestone to the version of the next release. + Please consider discussing this with stakeholders and the dev@ mailing + list, as appropriate. -* **Description:** Description of failure +For all other GitHub issues: -#### Inform the mailing list + - If the issue has not been resolved and it is acceptable to defer this until the next release, update the milestone to the new version you just created. + Please consider discussing this with stakeholders and the dev@ mailing list, as appropriate. + - If the issue has not been resolved and it is not acceptable to release until it is fixed, the release cannot proceed. + Instead, work with the Beam community to resolve the issue. -The dev@ mailing list should be informed about the release branch being cut. -Alongside with this note, a list of pending issues and to-be-triaged issues should be included. 
-Afterwards, this list can be refined and updated by the release manager and the Beam community.
+If there is a bug found in the RC creation process/tools, those issues should
+be considered high priority and fixed within 7 days.
 
### Review cherry-picks
 
-The release manager is empowered to triage issues, and accept or reject cherry-picks to the release branch.
-Cherry picks are necessary if there are outstanding issues at time of the release branch cut, or issues were found in verification.
+The release manager is empowered to triage issues, and accept or reject
+cherry-picks to the release branch. Cherry picks are necessary if there are
+outstanding issues at the time of the release branch cut, or issues were found
+in verification.
+
+Check if there are outstanding cherry-picks into the release branch, [e.g. for
+`2.14.0`](https://github.com/apache/beam/pulls?utf8=%E2%9C%93&q=is%3Apr+base%3Arelease-2.14.0).
+Make sure they have blocker Issues attached and are OK to get into the release
+by checking with community if needed.
+
+You are encouraged to ask the following questions to be answered on each
+cherry-pick PR and you can choose to reject cherry-pick requests if these
+questions are not satisfactorily answered:
+
+ - Is this a regression from a previous release? (If no, fix could go to a
+   newer version.)
+ - Is this a new feature or related to a new feature? (If yes, fix could go to
+   a new version.)
+ - Would this impact production workloads for users? (E.g. if this is a direct
+   runner only fix it may not need to be a cherry pick.)
+ - What percentage of users would be impacted by this issue if it is not fixed?
+   (E.g. If this is predicted to be a small number it may not need to be a
+   cherry pick.)
+ - Would it be possible for the impacted users to skip this version? (If users
+   could skip this version, fix could go to a newer version.)
+
+It is important to accept major/blocking fixes to isolated issues to make a
+higher quality release. However, beyond that each cherry pick will increase
+the time required for the release and add more last minute code to the release
+branch. Neither late releases nor not fully tested code will provide positive
+user value.
+
+> **Tip**: Another tool in your toolbox is the known issues section of the
+> release blog. Consider adding known issues there for minor issues instead of
+> accepting cherry picks to the release branch.
+
+## Build a release candidate
+
+From the release branch, building a candidate involves selecting a commit,
+tagging that commit, and building the various artifacts against that commit.
+You can also run verifications against the RC commit (verification will also
+occur during the voting phase).
 
-Check if there are outstanding cherry-picks into the release branch, [e.g. for `2.14.0`](https://github.com/apache/beam/pulls?utf8=%E2%9C%93&q=is%3Apr+base%3Arelease-2.14.0).
-Make sure they have blocker Issues attached and are OK to get into the release by checking with community if needed.
+#### Checklist before proceeding
 
-You are encouraged to ask the following questions to be answered on each cherry-pick PR and you can choose to reject cherry-pick requests if these questions are not satisfactorily answered:
+- [ ] There are no release blocking GitHub issues.
+- [ ] There are no open pull requests to release branch.
+- [ ] Release Manager’s GPG key is published to `dist.apache.org`.
+- [ ] Release Manager’s GPG key is configured in `git` configuration.
+- [ ] Set `SIGNING_KEY` to the public key of the Manager's GPG key. 
+- [ ] Release Manager has `org.apache.beam` listed under `Staging Profiles` in Nexus. +- [ ] Release Manager’s Nexus User Token is configured in `settings.xml`. +- [ ] Set `JAVA_HOME` to JDK 8 (Example: `export JAVA_HOME=/example/path/to/java/jdk8`). +- [ ] Have Java 11 installed. -* Is this a regression from a previous release? (If no, fix could go to a newer version.) -* Is this a new feature or related to a new feature? (If yes, fix could go to a new version.) -* Would this impact production workloads for users? (E.g. if this is a direct runner only fix it may not need to be a cherry pick.) -* What percentage of users would be impacted by this issue if it is not fixed? (E.g. If this is predicted to be a small number it may not need to be a cherry pick.) -* Would it be possible for the impacted users to skip this version? (If users could skip this version, fix could go to a newer version.) +### Tag a chosen commit for the RC -It is important to accept major/blocking fixes to isolated issues to make a higher quality release. -However, beyond that each cherry pick will increase the time required for the release and add more last minute code to the release branch. -Neither late releases nor not fully tested code will provide positive user value. +Release candidates are built from single tagged commits off the release branch. +When you have identified a good commit on the release branch, run +[choose_rc_commit.sh](https://github.com/apache/beam/blob/master/release/src/main/scripts/choose_rc_commit.sh) +to set it up correctly. -__Tip__: Another tool in your toolbox is the known issues section of the release blog. -Consider adding known issues there for minor issues instead of accepting cherry picks to the release branch. + ./release/src/main/scripts/choose_rc_commit.sh \ + --release "${RELEASE_VERSION}" \ + --rc "${RC_NUM}" \ + --commit "${COMMIT_REF}" \ + --clone \ + --push-tag -### Build release artifacts +You can do a dry run by omitting the `--push-tag` flag. Then it will only clone +the repo, adjust the version, and add the tag locally. If it looks good, run it +again with `--push-tag`. If you already have a clone that includes the +`${COMMIT_REF}` then you can omit `--clone`. This is perfectly safe since the +script does not depend on the current working tree. -Once the branch is verified, it's time to build +See the source of the script for more details, or to run commands manually in +case of a problem. -#### Checklist before proceeding +The final state of the repository after an RC commit is chosen should match +this diagram: -- [ ] Release Manager’s GPG key is published to `dist.apache.org`; -- [ ] Release Manager’s GPG key is configured in `git` configuration; -- [ ] Set `SIGNING_KEY` to the public key of the Manager's GPG key; -- [ ] Release Manager has `org.apache.beam` listed under `Staging Profiles` in Nexus; -- [ ] Release Manager’s Nexus User Token is configured in `settings.xml`; -- [ ] GitHub issue release item for the subsequent release has been created; -- [ ] All test failures from branch verification have associated GitHub issues; -- [ ] There are no release blocking GitHub issues; -- [ ] Release branch has been created; -- [ ] There are no open pull requests to release branch; -- [ ] Originating branch has the version information updated to the new version; -- [ ] Nightly snapshot is in progress (do revisit it continually); -- [ ] Set `JAVA_HOME` to JDK 8 (Example: `export JAVA_HOME=/example/path/to/java/jdk8`). -- [ ] Have Java 11 installed. 
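Before relying on the tag, you can spot-check it locally. A minimal sketch,
assuming the `v${RELEASE_VERSION}-RC${RC_NUM}` tag naming used by
`choose_rc_commit.sh` (confirm against the script source) and that the root
`gradle.properties` carries the project version:

    git fetch origin --tags
    git show "v${RELEASE_VERSION}-RC${RC_NUM}:gradle.properties" | grep version

The checklist below describes the expected end state.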
+Set version to non-SNAPSHOT, non-dev, on tagged RC commit
 
-The core of the release process is the build-vote-fix cycle.
-Each cycle produces one release candidate.
-The Release Manager repeats this cycle until the community approves one release candidate, which is then finalized.
+The following should be confirmed:
 
-For this step, we recommend you using automation script to create a RC, but you still can perform all steps manually if you want.
+- [ ] The release branch is unchanged.
+- [ ] There is a commit not on the release branch with the version adjusted.
+- [ ] The RC tag points to that commit.
 
-#### Tag a chosen commit for the RC
+### Run build_release_candidate GitHub Action to create a release candidate
 
-Release candidates are built from single commits off the release branch.
-Before building, the version must be set to a non-SNAPSHOT, non-dev version.
-The final state of the repository should match this diagram:
+**Action** [build_release_candidate](https://github.com/apache/beam/actions/workflows/build_release_candidate.yml) (click `run workflow`)
 
-Set version to non-SNAPSHOT, non-dev, on tagged RC commit
+**The action will:**
 
-- The release branch is unchanged.
-- There is a commit not on the release branch with the version adjusted.
-- The RC tag points to that commit.
+1. Clone the repo at the selected RC tag.
+2. Run `gradle publish` to push Java artifacts into the Maven staging repo.
+3. Build and push the Java and Python source distributions into [dist.apache.org](https://dist.apache.org/repos/dist/dev/beam).
+4. Stage SDK docker images to the [docker hub Apache
+   organization](https://hub.docker.com/search?q=apache%2Fbeam&type=image).
+5. Build javadoc, pydoc, and typedocs for a PR to update beam-site.
+   - **NOTE**: Do not merge this PR until after an RC has been approved (see
+     "Finalize the Release").
 
+### Verify source distributions
 
+- [ ] Verify that the source zip of the whole project is present in [dist.apache.org](https://dist.apache.org/repos/dist/dev/beam).
+- [ ] Verify that the Python binaries are present in [dist.apache.org](https://dist.apache.org/repos/dist/dev/beam).
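+
+A hedged way to spot-check one of them from the command line (the URL layout
+mirrors the dev repository structure above; the exact file names may differ):
+
+    DEV=https://dist.apache.org/repos/dist/dev/beam/${RELEASE_VERSION}
+    wget ${DEV}/python/apache-beam-${RELEASE_VERSION}.tar.gz{,.sha512,.asc}
+    sha512sum -c apache-beam-${RELEASE_VERSION}.tar.gz.sha512
+    gpg --verify apache-beam-${RELEASE_VERSION}.tar.gz.asc
+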
+### Verify docker images
+
+At
+[https://hub.docker.com/u/apache](https://hub.docker.com/search?q=apache%2Fbeam&type=image),
+visit each repository and navigate to the "tags" tab. Verify that images are
+pushed with the tags `${RELEASE_VERSION}rc${RC_NUM}`.
+
+Verify that third party licenses are included in Docker.
+You can do this with a simple script:
 
-#### Run build_release_candidate GitHub Action to create a release candidate
 
+    RC_TAG=${RELEASE_VERSION}rc${RC_NUM}
+    for pyver in 3.8 3.9 3.10 3.11; do
+      docker run --rm --entrypoint sh \
+        apache/beam_python${pyver}_sdk:${RC_TAG} \
+        -c 'ls -al /opt/apache/beam/third_party_licenses/ | wc -l'
+    done
 
-Note: This step is partially automated (in progress), so part of the rc creation is done by GitHub Actions and the rest is done by a script.
-You don't need to wait for the action to complete to start running the script.
 
+    for javaver in 8 11 17; do
+      docker run --rm --entrypoint sh \
+        apache/beam_java${javaver}_sdk:${RC_TAG} \
+        -c 'ls -al /opt/apache/beam/third_party_licenses/ | wc -l'
+    done
 
-* **Action** [build_release_candidate](https://github.com/apache/beam/actions/workflows/build_release_candidate.yml) (click `run workflow`)
 
+And you may choose to log in to the containers and inspect:
 
-* **The script will:**
-  1. Clone the repo at the selected RC tag.
-  2. Run gradle publish to push java artifacts into Maven staging repo.
-  3. Stage SDK docker images to [docker hub Apache organization](https://hub.docker.com/search?q=apache%2Fbeam&type=image).
-  4. Build javadoc, pydoc, typedocs for a PR to update beam-site.
-    * **NOTE**: Do not merge this PR until after an RC has been approved (see "Finalize the Release").
 
+    docker run --rm -it --entrypoint=/bin/bash \
+      apache/beam_java${ver}_sdk:${RC_TAG}
+    ls -al /opt/apache/beam/third_party_licenses/
 
-##### Tasks you need to do manually
+### Publish Java staging artifacts (manual)
 
-Publish staging artifacts
  1. Log in to the [Apache Nexus](https://repository.apache.org/#stagingRepositories) website.
  2. Navigate to Build Promotion -> Staging Repositories (in the left sidebar).
  3. Select repository `orgapachebeam-NNNN`.
  4. Click the Close button.
  5. When prompted for a description, enter “Apache Beam, version X, release candidate Y”.
- 6. Review all staged artifacts on `https://repository.apache.org/content/repositories/orgapachebeam-NNNN/`.
+ 6. Review all staged artifacts on `https://repository.apache.org/content/repositories/orgapachebeam-NNNN/`. They should contain all relevant parts for each module, including `pom.xml`, jar, test jar, javadoc, etc. Artifact names should follow [the existing format](https://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.beam%22) in which the artifact name mirrors the directory structure, e.g., `beam-sdks-java-io-kafka`. Carefully review any new artifacts. Some additional validation should be done during the RC validation step.
 
-#### Run build_release_candidate.sh to create a release candidate
-
-* **Script:** [build_release_candidate.sh](https://github.com/apache/beam/blob/master/release/src/main/scripts/build_release_candidate.sh)
-* **Usage**
-
-    ./release/src/main/scripts/build_release_candidate.sh --release "${RELEASE_VERSION}" --rc "${RC_NUM}" --github-user "${GITHUB_USER}" --java11-home "${JAVA11_HOME}" --signing-key "${SIGNING_KEY}"
-
-* **The script will:**
-  1. Clone the repo at the selected RC tag.
-  2. Stage source release into dist.apache.org dev [repo](https://dist.apache.org/repos/dist/dev/beam/).
-Skip this step if you already did it with the build_release_candidate GitHub Actions workflow.
-  3. Stage, sign and hash python source distribution and wheels into dist.apache.org dev repo python dir
-  4. Stage SDK docker images to [docker hub Apache organization](https://hub.docker.com/search?q=apache%2Fbeam&type=image).
 
+### Upload `rc` artifacts to PyPI
 
+This step uploads artifacts such as `apache-beam-${RELEASE_VERSION}rc${RC_NUM}`
+to PyPI, so the RC artifacts can be depended upon directly by consumers, for
+ease of RC verification.
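+
+For example, once the upload completes, the RC should be installable directly
+(a hedged sketch; substitute concrete values for the variables):
+
+    pip install "apache-beam[gcp]==${RELEASE_VERSION}rc${RC_NUM}"
+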
-Skip this step if you already did it with the build_release_candidate GitHub Actions workflow.
-Note: if you are not a member of the [`beam` DockerHub team](https://hub.docker.com/orgs/apache/teams/beam) you will need
-help with this step. Please email `dev@` mailing list and ask a member of the `beam` DockerHub team for help.
-  5. Create a PR to update beam-site, changes includes:
-    * Copy python doc into beam-site
-    * Copy java doc into beam-site
-    * **NOTE**: Do not merge this PR until after an RC has been approved (see "Finalize the Release").
-Skip this step if you already did it with the build_release_candidate GitHub Actions workflow.
-
-##### Tasks you need to do manually
-
-Verify the script worked.
-
-  1. Verify that the source and Python binaries are present in [dist.apache.org](https://dist.apache.org/repos/dist/dev/beam).
-  2. Verify Docker images are published. How to find images:
-    1. Visit [https://hub.docker.com/u/apache](https://hub.docker.com/search?q=apache%2Fbeam&type=image)
-    2. Visit each repository and navigate to *tags* tab.
-    3. Verify images are pushed with tags: ${RELEASE_VERSION}_rc{RC_NUM}
-  3. Verify that third party licenses are included in Docker containers by logging in to the images.
-    - For Python SDK images, there should be around 80 ~ 100 dependencies.
-      Please note that dependencies for the SDKs with different Python versions vary.
-      Need to verify all Python images by replacing `${ver}` with each supported Python version `X.Y`.
-      ```
-      docker run --rm -it --entrypoint=/bin/bash apache/beam_python${ver}_sdk:${RELEASE_VERSION}rc${RC_NUM}
-      ls -al /opt/apache/beam/third_party_licenses/ | wc -l
-      ```
-    - For Java SDK images, there should be around 200 dependencies.
-      ```
-      docker run --rm -it --entrypoint=/bin/bash apache/beam_java${ver}_sdk:${RELEASE_VERSION}rc${RC_NUM}
-      ls -al /opt/apache/beam/third_party_licenses/ | wc -l
-      ```
-
-#### Upload release candidate to PyPi
-
-* **Script:** [deploy_release_candidate_pypi.sh](https://github.com/apache/beam/blob/master/release/src/main/scripts/deploy_release_candidate_pypi.sh)
-
-* **Usage**
-
-    ./release/src/main/scripts/deploy_release_candidate_pypi.sh \
-        --release "${RELEASE_VERSION}" \
-        --rc "${RC_NUM}" \
-        --user "${GITHUB_USER}" \
-        --deploy
 
+**Action** [deploy_release_candidate_pypi](https://github.com/apache/beam/actions/workflows/deploy_release_candidate_pypi.yml) (click `run workflow`)
 
-* **The script will:**
+**The Action will:**
 
-1. Download python binary artifacts
-2. Deploy release candidate to PyPI
+Download the previously built Python binary artifacts and deploy the release
+candidate to PyPI with an `rc` suffix.
 
 __Attention:__ Verify that:
 
-* The File names version include ``rc-#`` suffix
-* [Download Files](https://pypi.org/project/apache-beam/#files) have:
-  * All wheels uploaded as artifacts
-  * Release source's zip published
-  * Signatures and hashes do not need to be uploaded
-
-You can do a dry run by omitting the `--deploy` flag. Then it will only download the release candidate binaries. If it looks good, rerun it with `--deploy`.
-
-See the source of the script for more details or to run commands manually in case of a problem.
+- [ ] The file name versions include the `rc-#` suffix.
+- [ ] [Download Files](https://pypi.org/project/apache-beam/#files) have:
+  - [ ] All wheels uploaded as artifacts
+  - [ ] Release source's zip published
+  - [ ] Signatures and hashes do not need to be uploaded
 ### Propose pull requests for website updates
 
-Beam publishes API reference manuals for each release on the website.
-For Java and Python SDKs, that’s Javadoc and PyDoc, respectively.
-The final step of building the candidate is to propose website pull requests that update these manuals.
+Beam publishes API reference manuals for each release on the website. For Java
+and Python SDKs, that’s Javadoc and PyDoc, respectively. The final step of
+building the candidate is to propose website pull requests that update these
+manuals. The first PR is created by the `build_release_candidate` action; you
+will need to create the second one manually.
 
-Merge the pull requests only after finalizing the release.
-To avoid invalid redirects for the 'current' version, merge these PRs in the order listed.
-Once the PR is merged, the new contents will get picked up automatically and served to the Beam website, usually within an hour.
-A committer can manually trigger the [beam_PostCommit_Website_Publish](https://ci-beam.apache.org/job/beam_PostCommit_Website_Publish/) task in Jenkins to avoid waiting.
+Merge the pull requests only after finalizing the release. To avoid invalid
+redirects for the 'current' version, merge these PRs in the order listed. Once
+a PR is merged, the new contents are picked up automatically and served to the
+Beam website, usually within an hour. A committer can manually trigger the
+[beam_PostCommit_Website_Publish](https://ci-beam.apache.org/job/beam_PostCommit_Website_Publish/)
+task in Jenkins to avoid waiting.
 
 **PR 1: apache/beam-site**
 
-This pull request is against the `apache/beam-site` repo, on the `release-docs` branch ([example](https://github.com/apache/beam-site/pull/603)).
-It is created by the `build_release_candidate` workflow (see above).
+This pull request is against the `apache/beam-site` repo, on the `release-docs`
+branch ([example](https://github.com/apache/beam-site/pull/603)). It is
+created by the `build_release_candidate` workflow (see above).
 
 **PR 2: apache/beam**
 
-This pull request is against the `apache/beam` repo, on the `master` branch ([example](https://github.com/apache/beam/pull/17378)).
+This pull request is against the `apache/beam` repo, on the `master` branch
+([example](https://github.com/apache/beam/pull/17378)).
 
-* Update `CHANGES.md` to update release date and remove template.
-* Update release version in `website/www/site/config.toml`.
-* Add new release in `website/www/site/content/en/get-started/downloads.md`.
-  * Download links will not work until the release is finalized.
-* Update links to prior releases to point to https://archive.apache.org (see
-* Create the Blog post:
+- Update `CHANGES.md` to set the release date and remove the template.
+- Update the release version in `website/www/site/config.toml`.
+- Add the new release in `website/www/site/content/en/get-started/downloads.md`.
+  - Download links will not work until the release is finalized.
+- Update links to prior releases to point to https://archive.apache.org (see
   example PR).
+- Create the blog post:
 
 #### Blog post
 
-Use the template below to write a blog post for the release.
-See [beam-2.31.0.md](https://github.com/apache/beam/commit/a32a75ed0657c122c6625aee1ace27994e7df195#diff-1e2b83a4f61dce8014a1989869b6d31eb3f80cb0d6dade42fb8df5d9407b4748) as an example.
-- Copy the changes for the current release from `CHANGES.md` to the blog post and edit as necessary.
-- Be sure to add yourself to [authors.yml](https://github.com/apache/beam/blob/master/website/www/site/data/authors.yml) if necessary.
+Use the template below to write a blog post for the release. See
+[beam-2.31.0.md](https://github.com/apache/beam/commit/a32a75ed0657c122c6625aee1ace27994e7df195#diff-1e2b83a4f61dce8014a1989869b6d31eb3f80cb0d6dade42fb8df5d9407b4748)
+as an example.
+
+- Copy the changes for the current release from `CHANGES.md` to the blog post
+  and edit as necessary.
+- Be sure to add yourself to
+  [authors.yml](https://github.com/apache/beam/blob/master/website/www/site/data/authors.yml)
+  if necessary.
 
-__Tip__: Use git log to find contributors to the releases. (e.g: `git fetch origin --tags; git log --pretty='%aN' ^v2.10.0 v2.11.0-RC1 | sort | uniq`).
-Make sure to clean it up, as there may be duplicate or incorrect user names.
+> **TIP**
+> Use git log to find contributors to the releases (e.g.: `git fetch
+> origin --tags; git log --pretty='%aN' ^v2.10.0 v2.11.0-RC1 | sort | uniq`).
+> Make sure to clean it up, as there may be duplicate or incorrect user names.
 
-__NOTE__: Make sure to include any breaking changes, even to `@Experimental` features,
-all major features and bug fixes, and all known issues.
+> **NOTE**
+> Make sure to include any breaking changes, even to `@Experimental`
+> features, all major features and bug fixes, and all known issues.
 
 **Template:**
 
@@ -755,34 +763,46 @@ all major features and bug fixes, and all known issues.
 
     ${CONTRIBUTORS}
 
-### Checklist to proceed to the next step
+### Checklist to proceed to the next phase
 
-- [ ] Maven artifacts deployed to the staging repository of [repository.apache.org](https://repository.apache.org/content/repositories/)
-. Source distribution deployed to the dev repository of [dist.apache.org](https://dist.apache.org/repos/dist/dev/beam/)
-- [ ] Website pull request proposed to list the [release](/get-started/downloads/), publish the [Java API reference manual](https://beam.apache.org/releases/javadoc/), and publish the [Python API reference manual](https://beam.apache.org/releases/pydoc/).
-- [ ] Docker images are published to [DockerHub](https://hub.docker.com/search?q=apache%2Fbeam&type=image) with tags: {RELEASE_VERSION}_rc{RC_NUM}.
+- [ ] Maven artifacts deployed to the staging repository of
+  [repository.apache.org](https://repository.apache.org/content/repositories/)
+- [ ] Source distribution deployed to the dev repository of
+  [dist.apache.org](https://dist.apache.org/repos/dist/dev/beam/)
+- [ ] Website pull request proposed to list the
+  [release](/get-started/downloads/), publish the [Java API reference
+  manual](https://beam.apache.org/releases/javadoc/), and publish the [Python
+  API reference manual](https://beam.apache.org/releases/pydoc/).
+- [ ] Docker images are published to
+  [DockerHub](https://hub.docker.com/search?q=apache%2Fbeam&type=image) with
+  tags: `${RELEASE_VERSION}rc${RC_NUM}`.
 
 You can (optionally) also do additional verification by:
 
-- [ ] Check that Python zip file contains the `README.md`, `NOTICE`, and `LICENSE` files.
-- [ ] Check hashes (e.g. `md5sum -c *.md5` and `sha1sum -c *.sha1`. Note that signature/checksum files of Java artifacts may not contain filenames. Hence you might need to compare checksums/signatures manually or modify the files by appending the filenames.)
-- [ ] Check signatures (e.g. `gpg --verify apache-beam-1.2.3-python.zip.asc apache-beam-1.2.3-python.zip`)
+- [ ] Check that the Python source distribution contains the `README.md`,
+  `NOTICE`, and `LICENSE` files.
+- [ ] Check hashes (e.g. `md5sum -c *.md5` and `sha1sum -c *.sha1`. Note that
+  signature/checksum files of Java artifacts may not contain filenames. Hence
+  you might need to compare checksums/signatures manually or modify the files
+  by appending the filenames.)
+- [ ] Check signatures (e.g. `gpg --verify apache-beam-1.2.3-python.tar.gz.asc
+  apache-beam-1.2.3-python.tar.gz`)
 - [ ] `grep` for legal headers in each file.
-- [ ] Run all jenkins suites and include links to passing tests in the voting email.
-- [ ] Pull docker images to make sure they are pullable.
-```
-docker pull {image_name}
-docker pull apache/beam_python3.7_sdk:2.39.0rc1
-```
+- [ ] Run all Jenkins suites and include links to passing tests in the voting
+  email.
+- [ ] Pull docker images to make sure they are pullable (e.g. `docker pull
+  apache/beam_python3.7_sdk:2.39.0rc1`).
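+
+For the legal-header check, one hedged approach (the file globs are
+illustrative; adjust to taste):
+
+    # List tracked source files that appear to lack the ASF license header.
+    git grep -L "Licensed to the Apache Software Foundation" -- '*.py' '*.java' '*.go'
+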
 **********
 
-## 4. Vote and validate release candidate
+## Vote and validate the release candidate
 
-Once you have built and individually reviewed the release candidate, please share it for the community-wide review.
-Please review foundation-wide [voting guidelines](https://www.apache.org/foundation/voting.html) for more information.
+Once you have built and individually reviewed the release candidate, please
+share it for the community-wide review. Please review the foundation-wide
+[voting guidelines](https://www.apache.org/foundation/voting.html) for more
+information.
 
-Start the review-and-vote thread on the dev@ mailing list.
-Here’s an email template; please adjust as you see fit.
+Start the review-and-vote thread on the dev@ mailing list. Here’s an email
+template; please adjust as you see fit.
 
     From: Release Manager
     To: dev@beam.apache.org
@@ -801,11 +821,10 @@ Here’s an email template; please adjust as you see fit.
 
     The complete staging area is available for your review, which includes:
     * GitHub Release notes [1],
-    * the official Apache source release to be deployed to dist.apache.org [2], which is signed with the key with fingerprint FFFFFFFF [3],
+    * the official Apache source release to be deployed to dist.apache.org [2], which is signed with the key with fingerprint FFFFFFFF (D20316F712213422 if automated) [3],
     * all artifacts to be deployed to the Maven Central Repository [4],
    * source code tag "v1.2.3-RC3" [5],
    * website pull request listing the release [6], the blog post [6], and publishing the API reference manual [7].
-    * Java artifacts were built with Gradle GRADLE_VERSION and OpenJDK/Oracle JDK JDK_VERSION.
    * Python artifacts are deployed along with the source release to the dist.apache.org [2] and PyPI[8].
    * Go artifacts and documentation are available at pkg.go.dev [9]
    * Validation sheet with a tab for 1.2.3 release to help with validation [10].
@@ -814,7 +833,7 @@ Here’s an email template; please adjust as you see fit.
 
    The vote will be open for at least 72 hours. It is adopted by majority approval, with at least 3 PMC affirmative votes.
 
-    For guidelines on how to try the release in your projects, check out our blog post at /blog/validate-beam-release/.
+    For guidelines on how to try the release in your projects, check out our blog post at https://beam.apache.org/blog/validate-beam-release/.
    Thanks,
    Release Manager
 
@@ -832,28 +851,43 @@ Here’s an email template; please adjust as you see fit.
 
    [11] https://hub.docker.com/search?q=apache%2Fbeam&type=image
    [12] https://github.com/apache/beam/pull/...
 
-If there are any issues found in the release candidate, reply on the vote thread to cancel the vote.
-There’s no need to wait 72 hours.
-Proceed to the `Fix issues` step below and address the problem.
-However, some issues don’t require cancellation.
-For example, if an issue is found in the website pull request, just correct it on the spot and the vote can continue as-is.
+If there are any issues found in the release candidate, reply on the vote
+thread to cancel the vote. There’s no need to wait 72 hours. Go back to
+["Stabilize the Release Branch"](#stabilize-the-release-branch) and address
+the problem. However, some issues don’t require cancellation. For example, if
+an issue is found in the website pull request, just correct it on the spot and
+the vote can continue as-is.
 
 ### Run validation tests
 
-The community is responsible for performing validation, but as release manager you are expected to contribute as well.
-Before accepting an RC, as a community we try to exercise most (if not all) of the tests listed in this
-[spreadsheet](https://s.apache.org/beam-release-validation), and those are good validations for you to try out as release manager.
-The goal of these tests is to validate that we're able to run basic pipelines from a variety of environments (not just our CI environment).
-Since there are many tests, we recommend you running some validations using an automation script.
-In case of script failure, you can still run all of them manually.
+The community is responsible for performing validation, but as release manager
+you are expected to contribute as well.
+
+Before accepting an RC, as a community we try to exercise most (if not all) of
+the tests listed in this
+[spreadsheet](https://s.apache.org/beam-release-validation), and those are good
+validations for you to try out as release manager. The goal of these tests is
+to validate that we're able to run basic pipelines from a variety of
+environments (not just our CI environment).
+
+Since there are many tests, we recommend running some validations using an
+automation script. In case of script failure, you can still run all of them
+manually.
 
 You may need to have Python interpreters for all supported Python minor
-versions to run all of the tests. See Python installation tips in [Developer Wiki](https://cwiki.apache.org/confluence/display/BEAM/Python+Tips#PythonTips-InstallingPythoninterpreters).
+versions to run all of the tests. See Python installation tips in the [Developer
+Wiki](https://cwiki.apache.org/confluence/display/BEAM/Python+Tips#PythonTips-InstallingPythoninterpreters).
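+
+A hedged sketch using `pyenv` (one common way to provision them; the version
+list is illustrative):
+
+    for v in 3.8 3.9 3.10 3.11; do pyenv install -s "$v"; done
+    pyenv global 3.8 3.9 3.10 3.11   # put all interpreters on PATH
+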
+> **Note**
+> The community's validation means more than just running the tests
+> that we have already run. It includes users trying out the RC on their own
+> downstream tests. It also includes double-checking that our human-language
+> instructions actually still correspond to the automation that we have built.
 
 #### Run validations using run_rc_validation.sh
 
-* **Script:** [run_rc_validation.sh](https://github.com/apache/beam/blob/master/release/src/main/scripts/run_rc_validation.sh)
-* **Usage**
+**Script:** [run_rc_validation.sh](https://github.com/apache/beam/blob/master/release/src/main/scripts/run_rc_validation.sh)
+
+**Usage**
 
 1. First update required configurations listed in `RC_VALIDATE_CONFIGS` in
    [script.config](https://github.com/apache/beam/blob/master/release/src/main/scripts/script.config)
 2. Then run
 
@@ -864,7 +898,7 @@ versions to run all of the tests. See Python installation tips in [Developer Wik
 
 **Note:** running the validations requires the ability to do the following in your GCP account: start pipelines,
 write to BigQuery, and create a cluster of machines for running containers (for x-lang validation).
 
-* **Tasks included**
+**Tasks included**
 
 1. Create a PR to trigger Python validation job, including
    * Python quickstart in batch and streaming mode with direct runner and Dataflow runner.
   * Python Mobile Games(UserScore, HourlyTeamScore) with direct runner and Dataflow runner.
@@ -880,22 +914,25 @@ write to BigQuery, and create a cluster of machines for running containers (for
   * Start a new terminal to run Python multi-language Java kafka validation with Dataflow Runner.
   * Start a new terminal to run Python multi-language Java sql validation with Dataflow Runner.
 
-* **Tasks you need to do manually**
-  1. Check whether validations succeed by following console output instructions.
-  1. Terminate streaming jobs and java injector.
-  1. Run Java quickstart (wordcount) and mobile game examples with the staged artifacts. The easiest way to do this is by running the tests on Jenkins.
-Other manual validation will follow, but this will at least validate that the staged artifacts can be used.
+**Tasks you need to do manually**
+
+- [ ] Check whether validations succeed by following console output instructions.
+- [ ] Terminate streaming jobs and java injector.
+- [ ] Run Java quickstart (wordcount) and mobile game examples with the staged artifacts. The easiest way to do this is by running the tests on Jenkins.
+
+  Other manual validation will follow, but this will at least validate that the staged artifacts can be used:
   * Log in to Jenkins.
  * Go to https://ci-beam.apache.org/job/beam_PostRelease_NightlySnapshot/.
  * Click "Build with Parameters".
  * Set `snapshot_version` to `2.xx.0`, and set `snapshot_url` to point to the staged artifacts in Maven central (https://repository.apache.org/content/repositories/orgapachebeam-NNNN/).
  * Click "Build".
-  1. Sign up [spreadsheet](https://s.apache.org/beam-release-validation).
-  1. Vote in the release thread.
+- [ ] Sign up in the validation [spreadsheet](https://s.apache.org/beam-release-validation).
+- [ ] Vote in the release thread.
 
 #### Run validations manually
 
-_Note_: -Prepourl and -Pver can be found in the RC vote email sent by Release Manager.
+> **Note**
+> `-Prepourl` and `-Pver` can be found in the RC vote email sent by the Release Manager.
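+
+For instance, the Java quickstart validations below take them as Gradle
+properties (a hedged sketch; the task name is illustrative and `NNNN` is your
+Nexus staging repository number):
+
+    ./gradlew :runners:direct-java:runQuickstartJavaDirect \
+        -Prepourl=https://repository.apache.org/content/repositories/orgapachebeam-NNNN \
+        -Pver=${RELEASE_VERSION}
+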
* **Java Quickstart Validation**
 
@@ -964,14 +1001,14 @@ _Note_: -Prepourl and -Pver can be found in the RC vote email sent by Release Ma
 
* **Verify the hashes**
 
  ```
-  sha512sum -c apache-beam-2.5.0-python.zip.sha512
-  sha512sum -c apache-beam-2.5.0-source-release.zip.sha512
+  sha512sum -c apache-beam-2.5.0-python.tar.gz.sha512
+  sha512sum -c apache-beam-2.5.0-source-release.tar.gz.sha512
  ```
 
* **Build SDK**
 
  ```
-  sudo apt-get install unzip
-  unzip apache-beam-2.5.0-source-release.zip
+  tar -xzf apache-beam-2.5.0-source-release.tar.gz
  python setup.py sdist
  ```
 
* **Setup virtual environment**
 
@@ -1103,25 +1140,17 @@ _Note_: -Prepourl and -Pver can be found in the RC vote email sent by Release Ma
 
* bq head -n 10 ${USER}_test.game_stats_sessions
 
-### Fix issues
-
-Any issues identified during the community review and vote should be fixed in this step.
-Additionally, any GitHub issues created from the initial branch verification should be fixed.
-
-Code changes should be proposed as standard pull requests to the `master` branch and reviewed using the normal contributing process.
-Then, relevant changes should be cherry-picked into the release branch proposed as pull requests against the release branch, again reviewed and merged using the normal contributing process.
-
-Once all issues have been resolved as in the `Verify release branch` step, you should go back and build a new release candidate with these changes.
-
 ### Finalize the vote
 
-Reply on the vote thread to close the voting once following conditions are met for the current release candidate.
-* At least 72 hours has passed since the voting email.
-* No release blocking issues have been identified.
-* Voting thread has at least three approving PMC votes.
+Reply on the vote thread to close the voting once the following conditions are
+met for the current release candidate.
 
-Then, tally the votes in a separate email thread.
-Here’s an email template; please adjust as you see fit.
+- [ ] At least 72 hours have passed since the voting email.
+- [ ] No release blocking issues have been identified.
+- [ ] Voting thread has at least three approving PMC votes.
+
+Then, tally the votes in a separate email thread. Here’s an email template;
+please adjust as you see fit.
 
    From: Release Manager
    To: dev@beam.apache.org
@@ -1146,9 +1175,9 @@ Here’s an email template; please adjust as you see fit.
 
 - [ ] Community votes to release the proposed candidate, with at least three approving PMC votes.
 
-**********
+----
 
-## 5. Finalize the release
+## Finalize the release
 
 Once the release candidate has been reviewed and approved by the community, the release should be finalized.
 This involves the final deployment of the release candidate to the release repositories, merging of the website changes, etc.
 
@@ -1267,7 +1296,7 @@ Use [reporter.apache.org](https://reporter.apache.org/addrelease.html?beam) to s
 
 **********
 
-## 6. Promote the release
+## Promote the release
 
 Once the release has been finalized, the last step of the process is to promote the release within the project and beyond.
 
@@ -1298,7 +1327,45 @@ Also, update [the Wikipedia article on Apache Beam](https://en.wikipedia.org/wik
 
 **********
 
-## Post Release Tasks
+## Post-Release Tasks
+
+At the end of the release, go to the GitHub milestones page and mark the recently released version as closed.
+
+### Update the Java BOM
+
+Google releases a BOM that pins compatible versions of their Java libraries.
+After the release, try updating the BOM to the latest version.
+ +To do so, create a draft PR and run test suites following the instructions at +https://github.com/apache/beam/blob/master/contributor-docs/java-dependency-upgrades.md. + +Triage the test failures and rerun any tests that seem potentially unrelated to the upgrade. +If there are no test failures due to the BOM upgrade, request review and merge the PR as normal. + +If there are test failures due to the BOM upgrade, email the dev list and ask for a volunteer to take the update forward. +It is not your responsibility to fix the BOM issues or to find a volunteer (though you are welcome to take it forward). +If nobody volunteers, that is OK and this issue can roll forward to the next release. +You can optionally use the following template for your email to the dev list: + +``` +From: Release Manager +To: dev@beam.apache.org +Subject: Java BOM Update X.Y.Z + +Hi everyone, + +Following the instructions in https://github.com/apache/beam/blob/master/contributor-docs/release-guide.md#post-release-tasks +I've attempted to update the Java Google BOM and have run into test issues caused by the upgrade [1]. +Since the Java Google BOM update is best effort for a release manager, I'm handing this piece off to the community. +If you would like to volunteer to help, you can get started by following the instructions in +https://github.com/apache/beam/blob/master/contributor-docs/java-dependency-upgrades.md#google-cloud-related-dependency-upgrades +otherwise this will roll over to the next release. + +Thanks, +Release Manager + +[1] https://github.com/apache/beam/pull/123 +``` ### Update Beam Playground diff --git a/examples/java/build.gradle b/examples/java/build.gradle index 2e262e8de795a..a43862ae801d6 100644 --- a/examples/java/build.gradle +++ b/examples/java/build.gradle @@ -54,6 +54,10 @@ configurations.sparkRunnerPreCommit { dependencies { implementation enforcedPlatform(library.java.google_cloud_platform_libraries_bom) implementation library.java.vendored_guava_32_1_2_jre + if (project.findProperty('testJavaVersion') == '21' || JavaVersion.current().equals(JavaVersion.VERSION_21)) { + // this dependency is somehow needed for compile only under Java21 + compileOnly library.java.kafka + } implementation library.java.kafka_clients implementation project(path: ":sdks:java:core", configuration: "shadow") implementation project(":sdks:java:extensions:avro") diff --git a/examples/kotlin/build.gradle b/examples/kotlin/build.gradle index 98258401d5881..829aefd447452 100644 --- a/examples/kotlin/build.gradle +++ b/examples/kotlin/build.gradle @@ -80,6 +80,9 @@ dependencies { for (String runner : preCommitRunners) { delegate.add(runner + "PreCommit", project(path: ":examples:kotlin", configuration: "testRuntimeMigration")) } + directRunnerPreCommit project(project.path) + flinkRunnerPreCommit project(project.path) + sparkRunnerPreCommit project(project.path) directRunnerPreCommit project(path: ":runners:direct-java", configuration: "shadow") flinkRunnerPreCommit project(":runners:flink:${project.ext.latestFlinkVersion}") sparkRunnerPreCommit project(":runners:spark:3") @@ -106,10 +109,7 @@ for (String runner : preCommitRunners) { "--runner=" + preCommitRunnerClass[runner], ] classpath = configurations."${runner}PreCommit" - include "**/WordCountIT.class" - if (!"sparkRunner".equals(runner)) { - include "**/WindowedWordCountIT.class" - } + include "**/kotlin/**/*Test.class" forkEvery 1 maxParallelForks 4 systemProperty "beamTestPipelineOptions", JsonOutput.toJson(preCommitBeamTestPipelineOptions) @@ -136,4 
+136,3 @@ compileTestKotlin { repositories { mavenCentral() } - diff --git a/examples/notebooks/beam-ml/README.md b/examples/notebooks/beam-ml/README.md index 77bf3fc99f155..0ae937e9e284f 100644 --- a/examples/notebooks/beam-ml/README.md +++ b/examples/notebooks/beam-ml/README.md @@ -57,6 +57,7 @@ This section contains the following example notebooks. * [Apache Beam RunInference with Hugging Face](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_huggingface.ipynb) * [Apache Beam RunInference with XGBoost](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_xgboost.ipynb) * [Use RunInference with TFX](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_tensorflow_with_tfx.ipynb) +* [Use RunInference with a remotely deployed Vertex AI endpoint](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_vertex_ai.ipynb) * [Use RunInference in Apache Beam](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_pytorch_tensorflow_sklearn.ipynb) * [Use RunInference with a LLM](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_generative_ai.ipynb) * [Use RunInference with Beam's windowing semantics](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_windowing.ipynb) @@ -67,8 +68,10 @@ This section contains the following example notebooks. * [Remote inference](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/custom_remote_inference.ipynb) ### Machine Learning Use Cases -* [Image Processing with Apache Beam](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/image_processing_tensorflow.ipynb) -* [Natural Language Processing with Apache Beam](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/nlp_tensorflow_streaming.ipynb) + +* [Image processing with Apache Beam](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/image_processing_tensorflow.ipynb) +* [Natural language processing with Apache Beam](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/nlp_tensorflow_streaming.ipynb) +* [Speech emotion recognition with Apache Beam](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/speech_emotion_tensorflow.ipynb) ### Automatic Model Refresh @@ -77,6 +80,7 @@ This section contains the following example notebooks. ### Multi-model pipelines * [Ensemble model using an image captioning and ranking](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_multi_model.ipynb) +* [Run ML inference with multiple differently-trained models](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/per_key_models.ipynb) ### Model Evaluation @@ -84,4 +88,5 @@ This section contains the following example notebooks. 
### Data processing +* [Preprocess data with MLTransform](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/mltransform_basic.ipynb) * [Preprocessing with the Apache Beam DataFrames API](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/dataframe_api_preprocessing.ipynb) diff --git a/examples/notebooks/beam-ml/automatic_model_refresh.ipynb b/examples/notebooks/beam-ml/automatic_model_refresh.ipynb index 67fe51af12530..cf05979c5b337 100644 --- a/examples/notebooks/beam-ml/automatic_model_refresh.ipynb +++ b/examples/notebooks/beam-ml/automatic_model_refresh.ipynb @@ -1,45 +1,57 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [{ - "cell_type": "code", - "source": [ - "# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the \"License\")\n", - "\n", - "# Licensed to the Apache Software Foundation (ASF) under one\n", - "# or more contributor license agreements. See the NOTICE file\n", - "# distributed with this work for additional information\n", - "# regarding copyright ownership. The ASF licenses this file\n", - "# to you under the Apache License, Version 2.0 (the\n", - "# \"License\"); you may not use this file except in compliance\n", - "# with the License. You may obtain a copy of the License at\n", - "#\n", - "# http://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing,\n", - "# software distributed under the License is distributed on an\n", - "# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n", - "# KIND, either express or implied. See the License for the\n", - "# specific language governing permissions and limitations\n", - "# under the License" - ], - "metadata": { - "cellView": "form", - "id": "OsFaZscKSPvo" - }, - "execution_count": null, + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "source": [ + "# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the \"License\")\n", + "\n", + "# Licensed to the Apache Software Foundation (ASF) under one\n", + "# or more contributor license agreements. See the NOTICE file\n", + "# distributed with this work for additional information\n", + "# regarding copyright ownership. The ASF licenses this file\n", + "# to you under the Apache License, Version 2.0 (the\n", + "# \"License\"); you may not use this file except in compliance\n", + "# with the License. You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing,\n", + "# software distributed under the License is distributed on an\n", + "# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n", + "# KIND, either express or implied. 
See the License for the\n", + "# specific language governing permissions and limitations\n", + "# under the License" + ], + "metadata": { + "cellView": "form", + "id": "OsFaZscKSPvo" + }, + "execution_count": null, "outputs": [{ "output_type": "stream", "name": "stdout", @@ -47,62 +59,79 @@ "\n" ] }] - }, - { - "cell_type": "markdown", - "source": [ - "# Update ML models in running pipelines\n", - "\n", - "\n", - " \n", - " \n", - "
\n", - " Run in Google Colab\n", - " \n", - " View source on GitHub\n", - "
\n" - ], - "metadata": { - "id": "ZUSiAR62SgO8" - } - }, - { - "cell_type": "markdown", - "source": [ - "This notebook demonstrates how to perform automatic model updates without stopping your Apache Beam pipeline.\n", - "You can use side inputs to update your model in real time, even while the Apache Beam pipeline is running. The side input is passed in a `ModelHandler` configuration object. You can update the model either by leveraging one of Apache Beam's provided patterns, such as the `WatchFilePattern`, or by configuring a custom side input `PCollection` that defines the logic for the model update.\n", - "\n", - "The pipeline in this notebook uses a RunInference `PTransform` with TensorFlow machine learning (ML) models to run inference on images. To update the model, it uses a side input `PCollection` that emits `ModelMetadata`.\n", - "For more information about side inputs, see the [Side inputs](https://beam.apache.org/documentation/programming-guide/#side-inputs) section in the Apache Beam Programming Guide.\n", - "\n", - "This example uses `WatchFilePattern` as a side input. `WatchFilePattern` is used to watch for file updates that match the `file_pattern` based on timestamps. It emits the latest `ModelMetadata`, which is used in the RunInference `PTransform` to automatically update the ML model without stopping the Apache Beam pipeline.\n" - ], - "metadata": { - "id": "tBtqF5UpKJNZ" - } - }, - { - "cell_type": "markdown", - "source": [ - "## Before you begin\n", - "Install the dependencies required to run this notebook.\n", - "\n", - "To use RunInference with side inputs for automatic model updates, use Apache Beam version 2.46.0 or later." - ], - "metadata": { - "id": "SPuXFowiTpWx" - } - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "1RyTYsFEIOlA", - "outputId": "0e6b88a7-82d8-4d94-951c-046a9b8b7abb", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, + }, + { + "cell_type": "markdown", + "source": [ + "# Update ML models in running pipelines\n", + "\n", + "\n", + " \n", + " \n", + "
\n", + " Run in Google Colab\n", + " \n", + " View source on GitHub\n", + "
\n" + ], + "metadata": { + "id": "ZUSiAR62SgO8" + }, + "outputs": [{ + "output_type": "stream", + "name": "stdout", + "text": [ + "\n" + ] + }] + }, + { + "cell_type": "markdown", + "source": [ + "This notebook demonstrates how to perform automatic model updates without stopping your Apache Beam pipeline.\n", + "You can use side inputs to update your model in real time, even while the Apache Beam pipeline is running. The side input is passed in a `ModelHandler` configuration object. You can update the model either by leveraging one of Apache Beam's provided patterns, such as the `WatchFilePattern`, or by configuring a custom side input `PCollection` that defines the logic for the model update.\n", + "\n", + "The pipeline in this notebook uses a RunInference `PTransform` with TensorFlow machine learning (ML) models to run inference on images. To update the model, it uses a side input `PCollection` that emits `ModelMetadata`.\n", + "For more information about side inputs, see the [Side inputs](https://beam.apache.org/documentation/programming-guide/#side-inputs) section in the Apache Beam Programming Guide.\n", + "\n", + "This example uses `WatchFilePattern` as a side input. `WatchFilePattern` is used to watch for file updates that match the `file_pattern` based on timestamps. It emits the latest `ModelMetadata`, which is used in the RunInference `PTransform` to automatically update the ML model without stopping the Apache Beam pipeline.\n" + ], + "metadata": { + "id": "tBtqF5UpKJNZ" + }, + "outputs": [{ + "output_type": "stream", + "name": "stdout", + "text": [ + "\n" + ] + }] + }, + { + "cell_type": "markdown", + "source": [ + "## Before you begin\n", + "Install the dependencies required to run this notebook.\n", + "\n", + "To use RunInference with side inputs for automatic model updates, use Apache Beam version 2.46.0 or later." 
+ ], + "metadata": { + "id": "SPuXFowiTpWx" + }, + "outputs": [{ + "output_type": "stream", + "name": "stdout", + "text": [ + "\n" + ] + }] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1RyTYsFEIOlA" + }, "outputs": [{ "output_type": "stream", "name": "stdout", @@ -110,41 +139,128 @@ "\n" ] }], - "source": [ - "!pip install apache_beam[gcp]>=2.46.0 --quiet\n", - "!pip install tensorflow\n", - "!pip install tensorflow_hub" - ] - }, - { - "cell_type": "code", - "source": [ - "# Imports required for the notebook.\n", - "import logging\n", - "import time\n", - "from typing import Iterable\n", - "from typing import Tuple\n", - "\n", - "import apache_beam as beam\n", - "from apache_beam.examples.inference.tensorflow_imagenet_segmentation import PostProcessor\n", - "from apache_beam.examples.inference.tensorflow_imagenet_segmentation import read_image\n", - "from apache_beam.ml.inference.base import PredictionResult\n", - "from apache_beam.ml.inference.base import RunInference\n", - "from apache_beam.ml.inference.tensorflow_inference import TFModelHandlerTensor\n", - "from apache_beam.ml.inference.utils import WatchFilePattern\n", - "from apache_beam.options.pipeline_options import GoogleCloudOptions\n", - "from apache_beam.options.pipeline_options import PipelineOptions\n", - "from apache_beam.options.pipeline_options import SetupOptions\n", - "from apache_beam.options.pipeline_options import StandardOptions\n", - "from apache_beam.transforms.periodicsequence import PeriodicImpulse\n", - "import numpy\n", - "from PIL import Image\n", - "import tensorflow as tf" - ], - "metadata": { - "id": "Rs4cwwNrIV9H" - }, - "execution_count": 2, + "source": [ + "!pip install apache_beam[gcp]>=2.46.0 --quiet\n", + "!pip install tensorflow --quiet\n", + "!pip install tensorflow_hub --quiet" + ] + }, + { + "cell_type": "code", + "source": [ + "# Imports required for the notebook.\n", + "import logging\n", + "import time\n", + "from typing import Iterable\n", + "from typing import Tuple\n", + "\n", + "import apache_beam as beam\n", + "from apache_beam.ml.inference.base import PredictionResult\n", + "from apache_beam.ml.inference.base import RunInference\n", + "from apache_beam.ml.inference.tensorflow_inference import TFModelHandlerTensor\n", + "from apache_beam.ml.inference.utils import WatchFilePattern\n", + "from apache_beam.options.pipeline_options import GoogleCloudOptions\n", + "from apache_beam.options.pipeline_options import PipelineOptions\n", + "from apache_beam.options.pipeline_options import SetupOptions\n", + "from apache_beam.options.pipeline_options import StandardOptions\n", + "from apache_beam.options.pipeline_options import WorkerOptions\n", + "from apache_beam.transforms.periodicsequence import PeriodicImpulse\n", + "import numpy\n", + "from PIL import Image\n", + "import tensorflow as tf" + ], + "metadata": { + "id": "Rs4cwwNrIV9H" + }, + "execution_count": null, + "outputs": [{ + "output_type": "stream", + "name": "stdout", + "text": [ + "\n" + ] + }] + }, + { + "cell_type": "code", + "source": [ + "# Authenticate to your Google Cloud account.\n", + "def auth_to_colab():\n", + " from google.colab import auth\n", + " auth.authenticate_user()\n", + "\n", + "auth_to_colab()" + ], + "metadata": { + "id": "jAKpPcmmGm03" + }, + "execution_count": null, + "outputs": [{ + "output_type": "stream", + "name": "stdout", + "text": [ + "\n" + ] + }] + }, + { + "cell_type": "markdown", + "source": [ + "## Configure the runner\n", + "\n", + "This pipeline uses the 
Dataflow Runner. To run the pipeline, you need to complete the following tasks:\n",
+        "\n",
+        "* Ensure that you have all the required permissions to run the pipeline on Dataflow.\n",
+        "* Configure the pipeline options for the pipeline to run on Dataflow. Make sure the pipeline is using streaming mode.\n",
+        "\n",
+        "In the following code, replace `BUCKET_NAME` with the name of your Cloud Storage bucket."
+      ],
+      "metadata": {
+        "id": "ORYNKhH3WQyP"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "options = PipelineOptions()\n",
+        "options.view_as(StandardOptions).streaming = True\n",
+        "\n",
+        "BUCKET_NAME = '' # Replace with your bucket name.\n",
+        "\n",
+        "# Provide required pipeline options for the Dataflow Runner.\n",
+        "options.view_as(StandardOptions).runner = \"DataflowRunner\"\n",
+        "\n",
+        "# Set the project to the default project in your current Google Cloud environment.\n",
+        "options.view_as(GoogleCloudOptions).project = ''\n",
+        "\n",
+        "# Set the Google Cloud region that you want to run Dataflow in.\n",
+        "options.view_as(GoogleCloudOptions).region = 'us-central1'\n",
+        "\n",
+        "# IMPORTANT: Replace BUCKET_NAME with the name of your Cloud Storage bucket.\n",
+        "dataflow_gcs_location = \"gs://%s/dataflow\" % BUCKET_NAME\n",
+        "\n",
+        "# The Dataflow staging location. This location is used to stage the Dataflow pipeline and the SDK binary.\n",
+        "options.view_as(GoogleCloudOptions).staging_location = '%s/staging' % dataflow_gcs_location\n",
+        "\n",
+        "# The Dataflow temp location. This location is used to store temporary files or intermediate results before outputting to the sink.\n",
+        "options.view_as(GoogleCloudOptions).temp_location = '%s/temp' % dataflow_gcs_location\n",
+        "\n",
+        "options.view_as(SetupOptions).save_main_session = True\n",
+        "\n",
+        "# Launching Dataflow with only one worker might result in processing delays due to\n",
+        "# initial input processing. This could further postpone the side input model updates.\n",
+        "# To expedite the model update process, it's recommended to set num_workers>1.\n",
+        "# https://github.com/apache/beam/issues/28776\n",
+        "options.view_as(WorkerOptions).num_workers = 5"
+      ],
+      "metadata": {
+        "id": "wWjbnq6X-4uE"
+      },
+      "execution_count": null,
       "outputs": [{
        "output_type": "stream",
        "name": "stdout",
@@ -152,18 +268,28 @@
        "\n"
        ]
       }]
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# Authenticate to your Google Cloud account.\n",
-        "from google.colab import auth\n",
-        "auth.authenticate_user()"
-      ],
-      "metadata": {
-        "id": "jAKpPcmmGm03"
-      },
-      "execution_count": 3,
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Install the `tensorflow` and `tensorflow_hub` dependencies on Dataflow. Use the `requirements_file` pipeline option to pass these dependencies."
+ ], + "metadata": { + "id": "HTJV8pO2Wcw4" + } + }, + { + "cell_type": "code", + "source": [ + "# In a requirements file, define the dependencies required for the pipeline.\n", + "!printf 'tensorflow>=2.12.0\\ntensorflow_hub>=0.10.0\\nPillow>=9.0.0' > ./requirements.txt\n", + "# Install the pipeline dependencies on Dataflow.\n", + "options.view_as(SetupOptions).requirements_file = './requirements.txt'" + ], + "metadata": { + "id": "lEy4PkluWbdm" + }, + "execution_count": null, "outputs": [{ "output_type": "stream", "name": "stdout", @@ -171,52 +297,33 @@ "\n" ] }] - }, - { - "cell_type": "markdown", - "source": [ - "## Configure the runner\n", - "\n", - "This pipeline uses the Dataflow Runner. To run the pipeline, you need to complete the following tasks:\n", - "\n", - "* Ensure that you have all the required permissions to run the pipeline on Dataflow.\n", - "* Configure the pipeline options for the pipeline to run on Dataflow. Make sure the pipeline is using streaming mode.\n", - "\n", - "In the following code, replace `BUCKET_NAME` with the the name of your Cloud Storage bucket." - ], - "metadata": { - "id": "ORYNKhH3WQyP" - } - }, - { - "cell_type": "code", - "source": [ - "options = PipelineOptions()\n", - "options.view_as(StandardOptions).streaming = True\n", - "\n", - "# Provide required pipeline options for the Dataflow Runner.\n", - "options.view_as(StandardOptions).runner = \"DataflowRunner\"\n", - "\n", - "# Set the project to the default project in your current Google Cloud environment.\n", - "options.view_as(GoogleCloudOptions).project = 'your-project'\n", - "\n", - "# Set the Google Cloud region that you want to run Dataflow in.\n", - "options.view_as(GoogleCloudOptions).region = 'us-central1'\n", - "\n", - "# IMPORTANT: Replace BUCKET_NAME with the the name of your Cloud Storage bucket.\n", - "dataflow_gcs_location = \"gs://BUCKET_NAME/tmp/\"\n", - "\n", - "# The Dataflow staging location. This location is used to stage the Dataflow pipeline and the SDK binary.\n", - "options.view_as(GoogleCloudOptions).staging_location = '%s/staging' % dataflow_gcs_location\n", - "\n", - "# The Dataflow temp location. This location is used to store temporary files or intermediate results before outputting to the sink.\n", - "options.view_as(GoogleCloudOptions).temp_location = '%s/temp' % dataflow_gcs_location\n", - "\n" - ], - "metadata": { - "id": "wWjbnq6X-4uE" - }, - "execution_count": 4, + }, + { + "cell_type": "markdown", + "source": [ + "## Use the TensorFlow model handler\n", + " This example uses `TFModelHandlerTensor` as the model handler and the `resnet_101` model trained on [ImageNet](https://www.image-net.org/).\n", + "\n", + "\n", + "For the Dataflow runner, you need to store the model in a remote location that the Apache Beam pipeline can access. 
For this example, download the `ResNet101` model, and upload it to the Google Cloud Storage bucket.\n" + ], + "metadata": { + "id": "_AUNH_GJk_NE" + } + }, + { + "cell_type": "code", + "source": [ + "model = tf.keras.applications.resnet.ResNet101()\n", + "model.save('resnet101_weights_tf_dim_ordering_tf_kernels.keras')\n", + "# After saving the model locally, upload the model to GCS bucket and provide that gcs bucket `URI` as `model_uri` to the `TFModelHandler`\n", + "# Replace `BUCKET_NAME` value with actual bucket name.\n", + "!gsutil cp resnet101_weights_tf_dim_ordering_tf_kernels.keras gs:///dataflow/resnet101_weights_tf_dim_ordering_tf_kernels.keras" + ], + "metadata": { + "id": "ibkWiwVNvyrn" + }, + "execution_count": null, "outputs": [{ "output_type": "stream", "name": "stdout", @@ -224,34 +331,17 @@ "\n" ] }] - }, - { - "cell_type": "markdown", - "source": [ - "Install the `tensorflow` and `tensorflow_hub` dependencies on Dataflow. Use the `requirements_file` pipeline option to pass these dependencies." - ], - "metadata": { - "id": "HTJV8pO2Wcw4" - } - }, - { - "cell_type": "code", - "source": [ - "# In a requirements file, define the dependencies required for the pipeline.\n", - "deps_required_for_pipeline = ['tensorflow>=2.12.0', 'tensorflow-hub>=0.10.0', 'Pillow>=9.0.0']\n", - "requirements_file_path = './requirements.txt'\n", - "# Write the dependencies to the requirements file.\n", - "with open(requirements_file_path, 'w') as f:\n", - " for dep in deps_required_for_pipeline:\n", - " f.write(dep + '\\n')\n", - "\n", - "# Install the pipeline dependencies on Dataflow.\n", - "options.view_as(SetupOptions).requirements_file = requirements_file_path" - ], - "metadata": { - "id": "lEy4PkluWbdm" - }, - "execution_count": 5, + }, + { + "cell_type": "code", + "source": [ + "model_handler = TFModelHandlerTensor(\n", + " model_uri=dataflow_gcs_location + \"/resnet101_weights_tf_dim_ordering_tf_kernels.keras\")" + ], + "metadata": { + "id": "kkSnsxwUk-Sp" + }, + "execution_count": null, "outputs": [{ "output_type": "stream", "name": "stdout", @@ -259,31 +349,32 @@ "\n" ] }] - }, - { - "cell_type": "markdown", - "source": [ - "## Use the TensorFlow model handler\n", - " This example uses `TFModelHandlerTensor` as the model handler and the `resnet_101` model trained on [ImageNet](https://www.image-net.org/).\n", - "\n", - " Download the model from [Google Cloud Storage](https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet101_weights_tf_dim_ordering_tf_kernels.h5) (link downloads the model), and place it in the directory that you want to use to update your model.\n", - "\n", - "In the following code, replace `BUCKET_NAME` with the the name of your Cloud Storage bucket." - ], - "metadata": { - "id": "_AUNH_GJk_NE" - } - }, - { - "cell_type": "code", - "source": [ - "model_handler = TFModelHandlerTensor(\n", - " model_uri=\"gs://BUCKET_NAME/resnet101_weights_tf_dim_ordering_tf_kernels.h5\")" - ], - "metadata": { - "id": "kkSnsxwUk-Sp" - }, - "execution_count": 6, + }, + { + "cell_type": "markdown", + "source": [ + "## Preprocess images\n", + "\n", + "Use `preprocess_image` to run the inference, read the image, and convert the image to a TensorFlow tensor." 
+ ], + "metadata": { + "id": "tZH0r0sL-if5" + } + }, + { + "cell_type": "code", + "source": [ + "def preprocess_image(image_name, image_dir):\n", + " img = tf.keras.utils.get_file(image_name, image_dir + image_name)\n", + " img = Image.open(img).resize((224, 224))\n", + " img = numpy.array(img) / 255.0\n", + " img_tensor = tf.cast(tf.convert_to_tensor(img[...]), dtype=tf.float32)\n", + " return img_tensor" + ], + "metadata": { + "id": "dU5imgTt-8Ne" + }, + "execution_count": null, "outputs": [{ "output_type": "stream", "name": "stdout", @@ -291,32 +382,28 @@ "\n" ] }] - }, - { - "cell_type": "markdown", - "source": [ - "## Preprocess images\n", - "\n", - "Use `preprocess_image` to run the inference, read the image, and convert the image to a TensorFlow tensor." - ], - "metadata": { - "id": "tZH0r0sL-if5" - } - }, - { - "cell_type": "code", - "source": [ - "def preprocess_image(image_name, image_dir):\n", - " img = tf.keras.utils.get_file(image_name, image_dir + image_name)\n", - " img = Image.open(img).resize((224, 224))\n", - " img = numpy.array(img) / 255.0\n", - " img_tensor = tf.cast(tf.convert_to_tensor(img[...]), dtype=tf.float32)\n", - " return img_tensor" - ], - "metadata": { - "id": "dU5imgTt-8Ne" - }, - "execution_count": 7, + }, + { + "cell_type": "code", + "source": [ + "class PostProcessor(beam.DoFn):\n", + " \"\"\"Process the PredictionResult to get the predicted label.\n", + " Returns predicted label.\n", + " \"\"\"\n", + " def process(self, element: PredictionResult) -> Iterable[Tuple[str, str]]:\n", + " predicted_class = numpy.argmax(element.inference, axis=-1)\n", + " labels_path = tf.keras.utils.get_file(\n", + " 'ImageNetLabels.txt',\n", + " 'https://storage.googleapis.com/download.tensorflow.org/data/ImageNetLabels.txt' # pylint: disable=line-too-long\n", + " )\n", + " imagenet_labels = numpy.array(open(labels_path).read().splitlines())\n", + " predicted_class_name = imagenet_labels[predicted_class]\n", + " yield predicted_class_name.title(), element.model_id" + ], + "metadata": { + "id": "6V5tJxO6-gyt" + }, + "execution_count": null, "outputs": [{ "output_type": "stream", "name": "stdout", @@ -324,28 +411,17 @@ "\n" ] }] - }, - { - "cell_type": "code", - "source": [ - "class PostProcessor(beam.DoFn):\n", - " \"\"\"Process the PredictionResult to get the predicted label.\n", - " Returns predicted label.\n", - " \"\"\"\n", - " def process(self, element: PredictionResult) -> Iterable[Tuple[str, str]]:\n", - " predicted_class = numpy.argmax(element.inference, axis=-1)\n", - " labels_path = tf.keras.utils.get_file(\n", - " 'ImageNetLabels.txt',\n", - " 'https://storage.googleapis.com/download.tensorflow.org/data/ImageNetLabels.txt' # pylint: disable=line-too-long\n", - " )\n", - " imagenet_labels = numpy.array(open(labels_path).read().splitlines())\n", - " predicted_class_name = imagenet_labels[predicted_class]\n", - " yield predicted_class_name.title(), element.model_id" - ], - "metadata": { - "id": "6V5tJxO6-gyt" - }, - "execution_count": 8, + }, + { + "cell_type": "code", + "source": [ + "# Define the pipeline object.\n", + "pipeline = beam.Pipeline(options=options)" + ], + "metadata": { + "id": "GpdKk72O_NXT" + }, + "execution_count": null, "outputs": [{ "output_type": "stream", "name": "stdout", @@ -353,22 +429,49 @@ "\n" ] }] - }, - { - "cell_type": "code", - "source": [ - "# Define the pipeline object.\n", - "pipeline = beam.Pipeline(options=options)" - ], - "metadata": { - "id": "GpdKk72O_NXT", - "outputId": "bcbaa8a6-0408-427a-de9e-78a6a7eefd7b", - "colab": { - 
"base_uri": "https://localhost:8080/", - "height": 400 - } - }, - "execution_count": 9, + }, + { + "cell_type": "markdown", + "source": [ + "Next, review the pipeline steps and examine the code.\n", + "\n", + "### Pipeline steps\n" + ], + "metadata": { + "id": "elZ53uxc_9Hv" + } + }, + { + "cell_type": "markdown", + "source": [ + "1. Create a `PeriodicImpulse` transform, which emits output every `n` seconds. The `PeriodicImpulse` transform generates an infinite sequence of elements with a given runtime interval.\n", + "\n", + " In this example, `PeriodicImpulse` mimics the Pub/Sub source. Because the inputs in a streaming pipeline arrive in intervals, use `PeriodicImpulse` to output elements at `m` intervals.\n", + "To learn more about `PeriodicImpulse`, see the [`PeriodicImpulse` code](https://github.com/apache/beam/blob/9c52e0594d6f0e59cd17ee005acfb41da508e0d5/sdks/python/apache_beam/transforms/periodicsequence.py#L150)." + ], + "metadata": { + "id": "305tkV2sAD-S" + } + }, + { + "cell_type": "code", + "source": [ + "start_timestamp = time.time() # start timestamp of the periodic impulse\n", + "end_timestamp = start_timestamp + 60 * 20 # end timestamp of the periodic impulse (will run for 20 minutes).\n", + "main_input_fire_interval = 60 # interval in seconds at which the main input PCollection is emitted.\n", + "side_input_fire_interval = 60 # interval in seconds at which the side input PCollection is emitted.\n", + "\n", + "periodic_impulse = (\n", + " pipeline\n", + " | \"MainInputPcoll\" >> PeriodicImpulse(\n", + " start_timestamp=start_timestamp,\n", + " stop_timestamp=end_timestamp,\n", + " fire_interval=main_input_fire_interval))" + ], + "metadata": { + "id": "vUFStz66_Tbb" + }, + "execution_count": null, "outputs": [{ "output_type": "stream", "name": "stdout", @@ -376,54 +479,38 @@ "\n" ] }] - }, - { - "cell_type": "markdown", - "source": [ - "Next, review the pipeline steps and examine the code.\n", - "\n", - "### Pipeline steps\n" - ], - "metadata": { - "id": "elZ53uxc_9Hv" - } - }, - { - "cell_type": "markdown", - "source": [ - "1. Create a `PeriodicImpulse` transform, which emits output every `n` seconds. The `PeriodicImpulse` transform generates an infinite sequence of elements with a given runtime interval.\n", - "\n", - " In this example, `PeriodicImpulse` mimics the Pub/Sub source. Because the inputs in a streaming pipeline arrive in intervals, use `PeriodicImpulse` to output elements at `m` intervals.\n", - "To learn more about `PeriodicImpulse`, see the [`PeriodicImpulse` code](https://github.com/apache/beam/blob/9c52e0594d6f0e59cd17ee005acfb41da508e0d5/sdks/python/apache_beam/transforms/periodicsequence.py#L150)." 
+    {
+      "cell_type": "markdown",
+      "source": [
+        "2. To read and preprocess the images, use the `preprocess_image` function. This example uses `Cat-with-beanie.jpg` for all inferences.\n",
+        "\n",
+        "    **Note**: The image used for prediction is licensed under CC-BY. The creator is listed in the [LICENSE.txt](https://storage.googleapis.com/apache-beam-samples/image_captioning/LICENSE.txt) file."
+      ],
+      "metadata": {
+        "id": "8-sal2rFAxP2"
+      }
+    },
+    {
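+      "cell_type": "markdown",
+      "source": [
+        "Before looking at the image itself, here is a sketch of how the pieces defined so far are typically wired together. This is illustrative only; it assumes `RunInference` has been imported from `apache_beam.ml.inference.base`, and the full pipeline, including the model-update side input, is assembled in the cells that follow:\n",
+        "\n",
+        "```python\n",
+        "# Illustrative wiring of the main input branch; not executed here.\n",
+        "main_input = (\n",
+        "    periodic_impulse\n",
+        "    | \"MapToImageName\" >> beam.Map(lambda x: 'Cat-with-beanie.jpg')\n",
+        "    | \"ReadImage\" >> beam.Map(lambda image_name: preprocess_image(\n",
+        "        image_name=image_name,\n",
+        "        image_dir='https://storage.googleapis.com/apache-beam-samples/image_captioning/'))\n",
+        "    | \"RunInference\" >> RunInference(model_handler=model_handler)\n",
+        "    | \"PostProcess\" >> beam.ParDo(PostProcessor()))\n",
+        "```"
+      ],
+      "metadata": {}
+    },
+    {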
5nbse322N3055qH2K/wb7X/oODo4PIoc1h0tHAMdGx1vEGi8YKY21mnXdCO3k5rXY65vTW2cFZ7HzY+RcXpkuaS4vLo3nG8/jzGueNueq5clzrXaVuDLdEt71uUnddd457g/sDD30PnkeTx4SnqWeq50HPZ17WXiKvDq/XbGf2SvYpb8Tbz7vEe9CH4hPlU+1z31fPN9m31XfKz95vhd8pf7R/kP82/xsBWgHcgOaAqUDHwJWBfUGkoAVB1UEPgs2CRcE9IXBIYMj2kLvzDecL53eFgtCA0O2h98KMw5aFfR+OCQ8Lrwl/GGETURDRv4C6YMmClgWvIr0iyyLvRJlESaJ6oxWjE6Kbo1/HeMeUx0hjrWJXxl6K04gTxHXHY+Oj45vipxf6LNy5cDzBPqE44foi40V5iy4s1licvvj4EsUlnCVHEtGJMYktie85oZwGzvTSgKW1S6e4bO4u7hOeB28Hb5Lvyi/nTyS5JpUnPUp2Td6ePJninlKR8lTAFlQLnqf6p9alvk4LTduf9ik9Jr09A5eRmHFUSBGmCfsytTPzMoezzLOKs6TLnJftXDYlChI1ZUPZi7K7xTTZz9SAxESyXjKa45ZTk/MmNzr3SJ5ynjBvYLnZ8k3LJ/J9879egVrBXdFboFuwtmB0pefK+lXQqqWrelfrry5aPb7Gb82BtYS1aWt/KLQuLC98uS5mXU+RVtGaorH1futbixWKRcU3NrhsqNuI2ijYOLhp7qaqTR9LeCUXS61LK0rfb+ZuvviVzVeVX33akrRlsMyhbM9WzFbh1uvb3LcdKFcuzy8f2x6yvXMHY0fJjpc7l+y8UGFXUbeLsEuyS1oZXNldZVC1tep9dUr1SI1XTXutZu2m2te7ebuv7PHY01anVVda926vYO/Ner/6zgajhop9mH05+x42Rjf2f836urlJo6m06cN+4X7pgYgDfc2Ozc0tmi1lrXCrpHXyYMLBy994f9Pdxmyrb6e3lx4ChySHHn+b+O31w0GHe4+wjrR9Z/hdbQe1o6QT6lzeOdWV0iXtjusePhp4tLfHpafje8vv9x/TPVZzXOV42QnCiaITn07mn5w+lXXq6enk02O9S3rvnIk9c60vvG/wbNDZ8+d8z53p9+w/ed71/LELzheOXmRd7LrkcKlzwH6g4wf7HzoGHQY7hxyHui87Xe4Znjd84or7ldNXva+euxZw7dLI/JHh61HXb95IuCG9ybv56Fb6ree3c27P3FlzF3235J7SvYr7mvcbfjT9sV3qID0+6j068GDBgztj3LEnP2X/9H686CH5YcWEzkTzI9tHxyZ9Jy8/Xvh4/EnWk5mnxT8r/1z7zOTZd794/DIwFTs1/lz0/NOvm1+ov9j/0u5l73TY9P1XGa9mXpe8UX9z4C3rbf+7mHcTM7nvse8rP5h+6PkY9PHup4xPn34D94Tz+6TMXDkAAQAASURBVHichP3Xt2xJeh+IfV9EbJs+8/hzvalbt3x1tQe70WgSAAnQDClySGo4miXxbV70oLX0b0hr6UEPkkZrRmuRHJrhiAAJEoYwDaIb7dDl63pzzr3Hn/SZ20TEp4cwe+e5RSq7+p4020R88YvfZyM2/rN/9zsMERAAgAgAABEBAADcH/t39Ut/MCABIBIRgL0OIhIRIgIRIgAgubPMO3MUoX1P5kJE1V0QQWtg6C9KRBcaYP+PSEQEhHChnQSABAQEgIgIQIT2eEB3O+1awMwJ/nxC00CC//8vc1bVPgJ/Ipo2ABARs6Kyn22/AAgIEY1k3O3Inou+D/Y3K9gvawTCRVEQuS4wrLXKtsi33bZKmxPJDgfU5I5V27B2Cd8OTVZkHhdE2ouydiBBbSjt+1Upk/vB/BVGMmgHnGzLDODMsPqPtfFAspgjcPK4iGvzHpFIuzv4MXCDY2/nYe0HAAEI0c4D/6sTKVENjgTMSXlFHnYc0UCVABmiF3c1Ydz40MooVB+qMas65GaWux4gkiZEtOCwLQRw6ERE8211LS8tRNDmW6yBD8HInqpbmIbY8UIrPCsef6wfXyAgI3QAIIsgK2QkM9DaSd53zQqPwLcEEQE0EFY0U8nCDiAB1qQO1a0MTJlFnAcUMjKD4MZ/RS5oIWxELTwy0FEa1DBOfgo5jJKdI4YgbfcJ7YXRCsl1zw6UQ4oXMZHrC/l54W9UcWrVNqBao9HPInBU5KYu1eRMrrNYURzWoWbpybUKyE2SCql2kLz47Im0Ipn6qNUmT4XBSuLVqDg025G9OPqORGsaDP0x5KBG6IjTH2PbSVBrl1Vz4JkT3fRZuZvneys4cNIwI7cqHKxRVG1WVw1ZYU2wDcKKIiwFoMEnICPSgCunIyKrtbDSUQjAABkBA2CrOp4BMsd/tdlee2deBEjEEMlCzoPeAdad4KYa2RNrbbBvCMxVGCIAMEMClhgJALVlZXSzYHWeIVYD7fSv/89+rBG5GQD0TbCnIFZIc+A17SVws6zqnZWQ66vnJY8Nx4U1jWwJr5oDQNWguFPs0NQEVaETERkgq3Wypp+djqxdzmCsrhsBfAvAmW019UzGgvAz58I5Ncl7QToGcx3XRhV4kwYAPMUiYk3otoNsFT1VOwmtfMndiqG/rMcSrp7lB8sKjFzH6iDwl6gI6UuuUGu/w0DtG4sL5i5DBscAjKzSR6ia6AYVEIChRTw4c8LJDo2OxNW2WCPJcXBl7dSGhyGaMWJ2grguERhTyCLP/0Z+9Kz0EYDVeL4aXrTWQdVpOzxeSismqYGPvVmNxqoJWZuZUB+Xumirc+qnrwiGKmvCWlDVvfwkdV/WXBwPGnO+QZmG+ssSn0Ukq+upisIqAgGOyFYnWgUXrJ1iIeJAzBz+aq32H/3twLLIxQasYhrN6FUidqNnNaZrngZS3iCqAYgANGntNQTWpofnYSBk9tJ1TNSuY8zIWjdq4+UsGqiEazkU681GNzLOdKumnZkJiMzCnGEFBqjo0A+Bx5GXniNjWuFdcy5WcPFS8K/qSwcuzwf1Qb9gNKxcyp5BNYVPABqcMVqHZiXBmuKtdcga60YoosYOq0aVtfB8c1bHxBmK5sW8MYe2gXXLFZmf+mS42YKPVgyl+ssZtbY1K9Kxk9fbwn4QrMeqrZvlrSkvddMCc5gb1pp2JyIEBpW5y6xBaZFMTpOvkqzzCqp56uRLTom4A9C2k/nW1k6z56wwaG0M7W211hZUXvwV8/pjV9w4g1qnaMFRwoXxRCIC7ZtcXRj8vZz/QLXWVoEC8AyP9U9o5zwQMYQqMOImVTVBViaDa5aAyouvjCBy6s9JcGWWgcOhJ27vXWG9LW7swPlADJCM2UtQ98l8c6GmdiuIr0qp1iEfgbH2KzmWZYCOL11YgBzHOF+ypqexGjE3sM6tQCsEz9++PWYK4IVGec+ZauC2SHFNNW8RVnmoenl311JZxY6VqnGMwLzbsaKjKvb1DSATs7pgV6LDiGs2eLD6sSB/oO945dza+9QvB9oaOwBE2uk7BjbmgH5GW22+QtXoohOWHpnviTV2KvMIkIDXBsTadVS11MCXMXZBz
I7SsS4JRLQxCERC0CuTpzY2F6/jfqkrW/uNu4kdQj+RUJtfGTorwnXLdhGsCrCGkJ916K4LjsTRqW/ndtSaWtOkNaPMDicyrNxLE2IxHffidFJG31Nng3ioOWPSXdtLYGUaW0l/Cda9MWrOXiHZVWiRNtY2klUg1S2qLoDzkwDAxZjQAcO1zzRF+wFBZIjcdquunA3EaLWNKz8iAoiqM/a+tckH4AXqNYUNctcutjKDa7EJ86WdIXZeQCVuhAtT35zAvNCdiC2B+fiEH+GaO+mJ34LVaQFnTlrNhZVycghw4q8UGkPSNrLt4vxewyAjy40rto8lW9MjYoZcrUK0POx64Qfcwh2p0qoWqz7wSXRhUNz9nD1qcWdMCh+/gy9LW1zsdKUhLZPVLuvGzjWz+saLxKgCN6CWC8kHRKsGG8TrSuujuxR5htJfpkdsX4RVjlXTyPN+TQtaIjF6AgCZ0eyvqoNX57E7wrET1rBQYyQHYAepSjDoEj+V2nOdrIte12TvXWcrympkVsOQWL+5HWhy0RwN2t3Qm6FARMZjW417VPepUbQzvmoWoe2XYStHMdWEdKoP/HxzXfSZOh/5r+59AbXGeKpM/BWXyd6uzvUrP7tb1O0qrExFGzdwTcVqNvmsIfPsVHXFslLdEvDviFZasNJUcEThCLHS7g4iNcohx4jM36PGczUX114daxqhjkj06SqyQR8A67QyP0iGh4x0GVKlwoyRt8oPNU0F4IOFYJzy2pFkjRQ3jk43oTPprMZyagGt+nQhN9+FmhTdl4DMRWqrcUArTJ9W8m11t8XVy6ATpj/fDS0iAOgVR57cXK6NN9p0gxuzGgM64gRwwS+y2qyKpNpZ6uaSmYy1/tbEYBR0TSPZ3lZQrOYB1c6n6gh8BZyI9T8AAKKmD9AzXO0M52QAAoB28LHIsdqkAqBpen2ae3qw/OS8GXsNp0BruRDbM9TE0Sejq9/QBSMd0Xg95Nvm3WWnRV2a2ArJd5+qP479UFfBkUoEULu0U/KV6oNVcw1rJwEAECGgrvl8tr8rg+0ZF6imZ8gRmLt3pcfrkwOq36nGpeD6i0RU5THtMNiuX3AELjLlBS1fM4tesQqs0q4JrlID5BSBgzV4lWHvYnNGK3RGRKI+XLUqA0AiYnaO+ytWt6GaCqvBxxmgNdQ7M7QSi0sokzsbYWXW2ps6m94lVw1xIlgHq67vap1y/1pjzBCzs+XAWn3kZvuqiLyh4gRIVir22uTi+37G2hdDIPIFE2TJsqYM0TceEcEk7ldVTjWTPStWVloNgx6vaO+8gpJVX95fuvrtohPv54YJGFXM61T4xUvR6qe6mbBid5jGEhEgA9KeJmoNsoKpCBdqdzd4Jc3sQZ55EEyG0waKzYT2YPNE467FAH1I2SPFyxSppmfRKkHjV+Mrwje2o+VEBLAgRjchABCIISESrBSIVPzmbk4ezXRhSNALcsULr4ubAMgOGNQNa2/DOt1c6acVVWeu4SI8K/UfAACga+G1uv706MRaY+oGmLt07Ve40IdX4yI1qay6VmiHsQ7ui69Xvof6ePnbXRQjuFFAQOZu6rujbatthQg5KazC3JkBorqgs3JYfSJQpW78Bew/lp5qsTZnXTPP32hiYisT0ZCiV1Xo/JVKC62wEyEAdwl3G5ZAJG2igACrI+rnIDilie6CF+jKcZj3Ro1N4NMxrr9OHVRcUaUdnHlDfgSweuubVOlAZziad1QdVJnsTiDOu6prrsp/8a6tlznW7lgHUDUPV5FVfeUUXV04vq/1l6eDCgnk4r71bjuh+p77QIUxbCofAPwQ2Pa493buM0ThjU4TSK9ZPwSANsRJxrolBrZMxjGGLZmstDhYlFteNQ3yjXDq3N4OkQi0uWzFIC5XAd79RjRBeT9dqSZa329wISrHkxV7XeRKsLzsZpqLuPtP1WiaD1UcwqGEHLH6SVYB1dl3KxNtldouxHesjGslIljLHrhv3Mx5hSYNXrXSF+9Sh1pNBAgVlNHJz7d2BdarMvOjAwCgq7Pc0XaukfMxLWM7Dc0Y0+TI04q7GlU7ALUbE4DQldVlhepiv15X2hGx3qizlS4YIzWHtKJPcl0Cr+UulJbWJ5q5lyM/p4Yr0DGHhopRfD/ANm/FyfNdcCzlm1u5q/U2+0m2ah3WvIXKlPDMVrXKIG7FFFiNDXqS9WlPp838B6w3x4p21TR0XhSsvrT+EnSuQtlpFSJH5EAARBoqVodXX+hqzcw4sKrOrvJg3XBVpzhDwt6W7DiAK1oCXAk8+SGtcTMAEYmqDxVMfdKwrsKdTVLZ4dY3NCZlBdlKy1paR6xoHWoq3BzPTFn7ilqtTRKn9SrFWnWHfMvgIl+B9wncEXXZVxMSHHVZOXptZOMHqwzsu2475+ewvzr6fA1WCsuOjFN4tUbAynS9ACnvwXhKqyPVf3+RUJ20Lxy8ckylD6zYvM1jBeJrAF2g0LIMVQDCWu02eYr2sqhg4Fy7ipaZayf6Y5xgrEIC1zXjqDvCcgMGKydWB6xOdKN0kBGgdm6MA7EZS+ZPNO4XVnks6xIBkC05XXGi7d1ddT06UTnh2pwhWfJ0QjJmADq9VXP+6yrWNMrXwmJFcUZq5q09x5kPXiN4saD/tzK1jOZf8f8qrJi7uuGqtK5v2avodBem+psa7mus409xDm/d+7FOur+i18v+zFq62rTvS1tVNchb+eiJAXFVesYFcpFU1/PVqxHU7Sn72c8KAGSrVrVRWlUSuebMgbeQ6vaMMSjc5LOpT+bTJOTQ4sRUHzQvAxehA49R9FekSuExqCpMnRFoNbKXOpqlHd4CuOBmOlOdKm/Dj6sbt1VAeCUF1vKs7uQMLNdARMYsN5ATXcUdVf2/u8KKIGtDgBVFrQ7NxdJPU88AlqFXzMqLp1eYISedSpWBo7bqfJfErqYWwAVD1pv7VHuZ08kreGfXkDXDmL+NWbTkmmjzKibMY7GPCEAMrRqtKQV7J6dZai3yxzicWHFr2yUbNTAFqEgucWBZqqYpVl6OwKA2+Rzuq3nsZ4G/O1q+9DyEANoPk9c1XjsjIFQrCMyksUq58nFq96zcoMq2Ii9Uj4aaiFYzPW4OuLmHYIN3hrMtj7tX1aj61VfE5DBR00O1H1d65tpbI3iXP3Etqt5ZavcGja7BvUbD5gQLxAopXmqreqreNKehicjHeKAKNiIAOb6qjR4BIIqa3lztK75C7oh1bq+pM4DKlkJXEvKK0VeLOjDjWtSq0+sHoHWqEQz0ybr5RizMpkur0fISJHdkHVTgRtTTrXtPzltGJw7bPquf7aT3g1ybLOCMbrJXMHRkqyJqp2rDWL4idpWqzBfeNSR/WSunFZRWCueC2eoU1MUTVoQPlnHcLPT5sIqGWDVHvYnpPYG6MKu2X3R5V039WsPJ1toCgY2Dej0HFVycTnLfoikWqfcWoKYyyY2aw79BgWuxuawfbj+UK/asuZsCDVb5gvcevI7Q9SF3bSOqFj35yFRlezhp+EY4uYM1RK1jUdnT5LtGADWZO6MC6sPjpoO3n4zxg+B8
BqiKOhyuSVu4VVdHrLwmB0E3AyqsYjXwFt8rKc1KJhUOakPmD6op+BV68KKg2lkXbk1ElRzd5az/pPUFw7F2DPnTa9qmIkxwQ+ksnKorPgLgZr+TCQIAc0FFEK+6eBeYyePN3xMdD1XyWJ3TlaJ2ROeNB3S6vFrJ6kspbHIfyCwt8oyLzkqoId9cnrmWubkP1veqW07Wd6rG12YTmF/uUo2UBQdU2DEKyPbb9qVmjFVThhBqJdhkKrOp0hpA6GJS6JgCnbnm4bhSmuS5ocaaVB/kVbCuGp2VpF6Fuzf0/U/oxnDFKnCX9YFYkx/xP5gBrYjDOXHeHvBWTsUP9eSGE40DazWVrTpCFL65vkHsFbyaUarPXy8uN1fAfnthqiFAtc7QDr5vnGkOA7xwRs1yBfCAqheNmFt7oiIA5iI/tUOs9C1sa2e5X1eaCbU2vaJnKwHUPtcmEVVXIQACZMwMle1Q7farI+Mv6zFa1XTWKjw9bqi+DgEv5NBXm10P5l0k44u2WwWymmtfb6qbpWiVD67Axk1rL4NV2brEksGcFxiBcwhXNYGLJiMC1JwG31Xj6fjjV9EKQKSdarYH2JynP74mQavk/Ky3NYJ+7nvmswxVK5jytrR2PeeA9WXQxqm22fkKzmi9AQSnVmoRUqrE6iRCta5UMnIcDPWWGvlWysoitBZPcCxoLobOCwNEs1uBu5rTM26kV6jXt6Cuyv0QXtDvK15szZFftTL9l5Ukav6WVXHkvNzapPPjWHtfN0Jr4jMjiL7pVawJHAwNFIicU26vsDJ5bBfd5HJLPjzmvKVQV+JERKSNHL1KcwNPNYvrQm+shvdHEgGCttFL5x/VMOqrRTx32gahNxaMFFxAEaz/zQCYn97GIHIzAW0b3ayo1I2b/dWV0V2V3Kx3CqNyLiutbgYXXQS2ukI1qH40fUiaavfy08DwsSdCN8ArPOSIFi6+XtXydVr1JoTx2KpBraz5utKujeBFVK5onrpercbFCMWYs1Zr4kW95ia87aphHkCwa/otnfkyBwtQ5u21SklYFq1nUXxrPIUS2dhKzdbyU7Am31pNgI8Je+WOZOHlpoanIDQ/2Wag5S9mDE1HUWbiWkI17VxR9DVtgIZQWPWxzlXuSGcuufCi740RKlVaE2sNtgNjruLfO4cLyIT00NGM722N3V/hEg9716RVZf2K4qrmHmKVz3SvV/Vv/XVB718wlpxfRX5kXm2qlfwFB9S/BRsTcL5TnczNZVfsX/NXGA9BW4fXmHLerDbmgzGlK/PIX1o7oKK9v+dtu4DTfVztSN31czLWCGgKNwlWnA2oT/AKwU412C5hRf0rwnNmqFuLvKLWqU4GVFWSr2b/rPdDNW3kysN8gsHalq4S1BqxHtG2Zdqrb9+1uklQp6gap10A1Kp9UiM2x5SWuhyW6p7QhVOsUn6lDLQaKYDqQr499RGpCbyaX37+umnvRw9sl8n10hhj5Gax1zaeGZFZ3e0miOk9EGiqxo/ckLr8kLkc+WSqKVGprDqvx4jcR2N42LbXZOwNn8qgIbfiH/xccxkG1xgCIiSbx3fGluPv2pSw3GML+Wvz8yJZ+Ex0pTRqNoDlL0f4Fz1ia0qYA1cBRH4CVzrolevV/oPK0LDTG9D99GUp9S9lshqPejXoJeKPctPnFXSuaH+yd0H4khs5RHmrsmq/+b7SMx4b1jAwhqEG0tVMAvSmEgEY9S5s+xEAkEgTugSvAbC5AQJpS3Lmsyt7RABysaGqEAkA0OYjagrS6keobghAzLhsrj++BMMfi64prjyOnEfFfDoObHfdncmbMk6MjhGhkik4drkwPABQHVYzb8zNwXrXNZokr7PsjHVhaAAy2zq69qObnA43WFV4+lG3zhNhrba/dhBRbbLVSGTFE6pfz+uFeocuzlCopFGnbK+VfLXkyglOvFgNvVNZda/DKQy7N1g1xGjULOnaLCY3ZPYvc6rLmHBAREgayVl0hibde/dRW7+HnIOEgKTRagyjzmrDa/4lBAIN5O/nceHJ1azl8GLxzfSdNE10BqsLzlnxomsfeHQ6nxvQmL61YanQuUoOjmepJi/bWu3bZTdQQ388YlV7X4WpEG0tgTEVnJIy7XQ2ghWDd7NoFVXkzsRV5q4gBMAY88C6cMyXMN9/5oXO6fTvjWK4cIBv8IV/fQO8T+J0JPiou5uermukLXVZmK6YdkREpAVWBKZtNNNaPnbDLbIkWs2sFceWkECb5cg2SoeWVhj5CDgikptW6NzeWtfQB6zrc86ebaqkzSw2R0NNIVZIBANp9PUdXiSGcR3fO9FDNWntHIOV+zp2cFUEPqFZ1xYmt4AVKVJVlI/1YcKq2Ssr5qqoUoUrK8RatmPldZHsoTZGr77oYkLyP/eq3F9HmXVevHCwvxdjTGtdk+bqbLcdqc6xGs9OWfM1AGlwYncAsjAQlXNX67PxGMjxW4267a8X+1YVR5oiOiRwaWhzTbKmkRGXsynAarAqfl9lNbyPr62B6MwKBzg3uwlMagxsUt21xR5B9urmpu52nupq4rR2e/UZ7PwGBFczapvukr727qYozDIk+Ru7s+uRf2b50Q6CBza4eCrYyeQA7eysVzyY2rh4WNCrduorqL3gZlUm1n/B5kFXwfAKWFeuU2uDwV8Feqd6nEjrtsYFPNlOG+AJfw83HOj64F3WlXaQb65T7uDu5t9aB5YIEN1ax0pYRITuf4AE2hp3hMhspM5ONa+pKx/IrSQhNzENrZmW+CVK2ql6oz7s8uhXyn5fHbkaFV3IirkRIjdCteHzDgB4ovUwtarK31Q7I9NJzImOagXOXl0a/sba5ITqVuD7Uj/Rf3Q2d2XJOBOsxotVrOG/+NKuDTUQ/5csCqrnzMjSkLZCtA1FdNuEW1E57nOWEBARCdAK3C5CRiGx6sp2fS05Brbt0y6uY+To4gz1lqJFp2tnFTfWnoXRF/LYZpvF42jNGPSDh4jgksCaKik4/8iBF4zjh8Dq0TcAQDT7770KTXJtqA+w7UG1pIGYLaomyyNQG1lcGY2aLVSjXqgUIDk8e15lUN3JN9j6XNWYr4DDHWbbUNUVOIvK//uqx4NV3Z1VIl/Cu5WAnMfpf/dBPa+OV+VpNZx1sqGqqGXMCrhCtuU0gx+HFjczAQBA6No5XozV8BIoZ1hUVGr1lzOnzEy2MKvW/1SD5jDqsOgCjE58bjaTE1dFUYioSXsM2oll/Vj0Q2VdJOdmVz2og6PqWPXOXsRbVDbTa4KmDvHAKhl5bK4UZa7MTfd/x6P1e7pgRk0uHjkeOm4CuJiMNVIc7KjWIW+AuS9r7rMRC658dBi6CKxqvC98U+t3bfRda10Gvn61Sjms/CE/92qsbXFRc3D8SURWxZOu8YJrvXFINKHPX/kZX8+21e1oo6S8biS7gJ385gLoM+4Ort62cSqwYiKn1ezmu5os3drHcph4PjMhUtMd26zKEndAMZKoPlWK6aJ+9EDxuoicejJjagREhHa5N9Z53k4MN5o1xnE/fhkJvEpC1cvVfLmTatErj3U3aq6/7nD
vaNYogJxX4MDiW23TMPVz6yLyUvpyI9VpMts28AY6WIm4G1uGAoDarsrmFj4c5wNB5nvh5oibr34uVqzjw2PGHjJCQQDS2ulY1zr71k0yMl5eFTsyAQVA96ZqLAFjCGAel2FxAGBdHgsVUmaQ7MukZhCBMQC77WhN7lWays/4C+7VhbEEWEVK3TS3Ss4vgXCArcw7gBUGdy4Ueu6vq1fn29aYg1ysEd1MA3D7/FTfVA2tq2T0zqXjBt/n+sUJPCtYE2+l81hxR/1enjRIVxu0ebl5G4msR1gz0y9afpbinAI26t+FDKsIFPnpBHbZcZ26jNTMKDKs5GAeplTjWqiAau0OvyDVwMrcQCmjmD2JuPwokSbyMkEAZfvpLoHM7UNid+Wyjq/W5lvGGCIjAoaGMO3mPtqqZGdqVO5dXffXMPHlH6l2rMUSVOJmfjZ4jqX6mVYtMqpRu6Nc9EJ0NoNtqnc78BWjfqVtVq1Wprj/FT0OXqFS27Ia8znlAwSEjDkt5OC7cpIfb3BNttxYCbbePHKNdicanVyFhowEELVNEFfnVHdEBCIBXBipIGOOqJzRgd70NjF0Aqv+DVdbY8BckYiYk5fjXSs9czuTDTdJ/yrLhCC1BkclWmvrWiETnHPBOWOCMc4YcqZBo1ZGFsgYETFGjHHQhNzOZmYqQ1dTl25ymIhgDXernsGFjysc7DjLO8jVWebAGlW6UXLvHI5th91Y+AHxs8gw6KujXqnyVcX5qkJ3N1nRCeiH9EsDogY32kUi0ZGoOx1qgK5+81h13Fy7v7ODoZo5rArKVcrd2Xg1S916jE7zAAhVKjsa9kwCk4i3ZgGRX7pOdrJ6Oge/vgpA24y+9X+UUjVvirS2q9nMUVpr86/WWmmSWhtLxQCUMRaGYRAEURhGQRAFQSg4h4BAkyzQApRzLoiIkJGt5ajK1bwAqllZCdjHny6Mk51etVH+Eu3nTqupafexUqsVBH1iE3zLLiDEfFXxtgdkjZc9MtDbEDUurJuMF3SCw0s9gfwlx1RMVwnrS14XVD84f+kC6D0xO4vDuNB+LrHaQ+h07ZJGnuQmhZkFKObzBQGYImQyiXUiItCkwdWAGvCBVdlau9JQTaS1VlprAiBSWkmlEUBrJZVWpEmT1koRaa2kUlIpg0JFpElLJUmR0koRKSIA0lJqgEacdJutfrfbbjTbjYRRzCAEhgSaVElaA2OccwLiEIBWZtg0EUPGnJTr3i6Ck5gdBled5ATqe+fH78IoO2u3JkkEb+pRzWq3I1LdF83WHZ7DLcOt6DSXU6lFcNwlPNjJq9SaQqysuaovuFJtWPObLvo9dSL/z70ukHfVPKfE607eRSuFmN8qjaxtYztqL0FeKCst9d0nIHF4eqqJSJPSSlkgWn6TSmvQWpOSysAINUklldYladBaAWmtldIGjUprTZq0UkoTkDKRLQKlNQNCBK0tzxkAmC3kydpwIKUsZamknAZhmWUcMQ4CGQQ60GQmBihd5KA1IKcwAGCIjBEn0pqYzUe7YUXPGZXzawMBUE/2XBiPCpjODiQCt9ae6JURssis2JQcbNxlPOGhHQJ7nDXnoX5B0ySsobbGlNW4OUPMVxf4eB9Ahc7K79b/GSS6pNqXqH1ErbU1TAl8s2n1YzVZX+FRqBSYUwhA7umL5KXh7KKaTJ0yNH6c+IvPPlOktdbSlN0ppbQyjCKJFGnUWluKNY9DJQJQiAjELVUh5wztw7IY5wwJOOchIuM8YAw5RwDBuODmxRAZ40wwzu3WJoyIpCxny2yZLTVQGMWIjDSVSinTJIZaS1mWWkrOOJBGAsZQK8aY2ye6St/UctneC4aawCqIVGWaflCdYe1iG17KLvtSw2ltABwTOEh66kBnETjpV1WjFg2OiN2Yr7QTwJWr1p2h2hnWliB3NUMzVvXri0bzRRhdSLsbO9Gk170cTHPqhilU0Ux78cr4uKCAPMOS9dWNsQgMzVbylqG1k8bKfBMvjl8CczAB5IiaMYYoAELGGBMUMMZQMM4Y45wFXDDOGWLAOGPczFHOOeOcMyY4t941Y/ZXxgLOOGMaUDDGjW/ODHsyhmD8dNJUynKZ57PFYplnSymBMQVEQEprpTSg1qSkLFVRaMaACDnnSmhOmohpAm5IBd3WobWcvNfm3vyq2z6vDJj7vsaL5ELCZljoyx2O2mesX9iF78AbBubZZORrWQzIqhv40h034RBXblG7tYWUb3HNVrnwpmraqnnjZ9wFB3/1xEp3V5ijCtNVBM3WuGF1M3Q6xFbPMXc3++1FjVO7r7hz5ZrhM4aMIWOccWSMc8ZQMAbIiSFnDJEJhgyZ4Cg4BxcEMgJx33BEYAZ8BnsI6DeEcfEiRHNBZIiu3Aq01qWUQZgJEbClKOczBQSaSGkg0kohggZSUpZloRkHRC4DCjRpTVoR48Ye0mi8Om8K2j92eBkD68lZ4dVs1FcGcXVcycMCCejL3eGaWYn+LuDcT7COl0v2VosPnS9sdzJatRTrd8cauMz3nv/cBHIHuxZdPGPloyM/589e6FCVWa31EWyEa0UCDmtE3sjyxIsmaoPIjD4yc89mIaqQ5Uoc37dU3L5+gwA5IgCabClniGiZzizfMTFH5vddAIM931rkzGysY5G5kj90tQ5+3yKDTdd4sIar1sBYqSQXQgguEECTIq1ISyk554wxjVqRKqXUqBFRBaVSkmsbuTKz1xqZlUXoYt0AAOZhzL6itmZMeRtA/2ez0s7YW40xoR9cIlctZe4DaJW7T4IQ82ZFzfkwDXPjqX1hg8U6OUlWBkTVYIsEK9tawyqGBHv9L+P7evyoupG/qZ9Rq1rF0179ItafN5dy85Tqixc1gLaocUOENhpei4DU5oIhXjHoDmx00LAdmKQ+GvZjCIzZ+mDmaceZHxo0kOFK8M5P3QV2g+aF5QrJKj0LyDiBZgDEmOCcM64ANAERlaXUoTIxKamUZqQJlFIEWnCujXlKittZa29V1Wq6+2o7kljjpdVaAagNTDX0F1fzAFj146jXkl39flj3GzQhQxeItvnZuifrTMlajKC6mbtPjVNXScvGfSpqdwL31ii5ca3FLC4SKq7gr24ZrDRnxU4186xmkddVv+diJ0DzzpxONV7zt7GGgTvTm14EACJNG+TN3hW7yTGmITxrL5uLaCMTRjasiojA7EN/q+fOVdMCAVETWE/bJZycyQyaMaUUU8r6UMA0oo1REZlgFjA00VklpQaUQkitAyLQJsRkfR2sZaidbVPPeRos262ia4NtRV8HQT1iYgiSwC/EsMTnj6xsxBqNoRegCyMgoDeOLbNa/qxOdjTvFLSBGtR64gas1pTarb19XKnvFV6qRhhr+W+HVLTWkY9ouQmwMgeqnK6bRN6aMrVvnuEr46DiAMeZq1wMdevLhxcED5gpF3GGA6GjT6uyjUK0T1a1IvZPtNBam0chIkNknBt2dKG4+hYPhIxbBwa9dDSR0hq15pxLw9wAJnVEJJXDCJjnfJLW1h5FZUxPE2PV5GRBBLbFQCuKnhm9gs4Q+LJXHWc+vI
c1vYHgRg9tssN71nWCcQOBhCiJSqXnpSIgqfSoKENr7iNn0A1EzFkoOBg5eHuUIfjgqysaomowTU7O+kW+M/WQ5EVQojdzbWFEld+yqtweq0kjOi+whuVagMny3MrtKiaqGyy2yhGs0jWttRh0ee4LM6ZSTGYmCIYINoJIYPdpRLNFqDMWV0pdjOCYQx4xxhCJMWRMIPpKOMM4HBxKLHAMl7jhRBeCI1JEzMSLrIvFCMHYpkor52QgIhKCVlqZ8KvWSilhph6ZJ0BY9XNhlBxp2VX2tW8qfq0FvK0qv2DZ+fY6FeDlXrsZESCWAOeZPCvkSVZIpV8ucwYwLItny+z1VmstDJ4tFuOs3EjChMHlNHq73xrEETP13eiKtrxt6ywQbxxo10nH5Gj+rOhlwzfgZ5EjtJVwmGu6m5+e9cnpzBqyqPK4qxiq41Gyyr5S3ZWfamdD9Wgh8MWu9taVavelw0RAJAQPHMd7i4V57kRE6zbUuJe7xI+ZWWicJEQAE2JyE4wUQ9SARJoZHBO5XZerdhs4Kq2ZQSgiMiYELxhqK3NGWiulgKEpvzD5KSWVVgrIZk7N0xlNAQqSsTeo7tPolUyF92+8PVl5AKb+1wxUXdlVgvfoQKdSwT5kWxOM8vJgoX4xXe7n5Wtp/GJZ/tHB2d+8vMMAn40mCpdf6YZnRfHT4fCG7r3RiP74ZPiz02ES8K+vde902pHggd2f0sKKKhihtTOqSWOa7cDg2mg1qOuy01dud0jfo5ozvmKlIK5eCirU1V5klXJl9PgfrDFji9uwbhzXo1D+3qyGh4pEAYTZU9Bk261zZOjT23FY2UJeZbKqQtR776ZPtrWmDWRntxWDcTYrNVrzlxmiAkCzAYrRAwRaa6k12YpVICDiDBC1Mll8pU0+n+z/TaQLyC8M8ZrL7Thfk2yd+KoPq06r0QMXvqyMeDMBEM6yMlMkNQHC0bJ4upTExJ+P57OiuJWmz5fFrVYzFfywoGku201xuZFKUqdK306SqVInhA0hNNGnw/lZrkNEjriRRruNOBGstuUuOKfMzpyVgIVtk1epNbeGISIoRUopzrkpaLzocrkhAa8bV3/03zqL3JUXrMz66joGZGgnjtM4hndr6HQ2qzuiNmbmhsLoOO2SMM7kQl+fx40NxGqVLAxtOIQ8PNEiEam2PR6iCSK4XtSMPMttruFgzWAExhhDBiYuD9oSu5tohps1aa2Mc6Q1aCCNNZbxD8y02sr/BRfPcErJDzm4GpsVNPgxc+LzysuVYEMm6dks35tn+8v8+TK/mcYa8EVRPsqKrSD42qB7fzrth0E/bq7F0X62vNPpXG00/uRsuFB0NW48Xi4Zsq93B52Ac9BPZvMfHJy/2UhezmdLpW42G391e3Cr2+LMrMujWqOdJ1U1zprH9aYSQFEWk8n45Ox0Op0mSYLILu9eardadeuxQsQF9e2MGPQ/ei70hFdX4vaCPodXv6iuyc+TKTqTtzr9glkmLNUwZ1YhmuyQUwJ+dMERKJrV6wx8N4CIbLmxScjVXDbmGlRvMtSnSQVy5meI+wxSKqmUIs21tqkBBBOd12TqWWoPEAULP+fLO6+0GkbyITHw2sTGSiuLvVLfgC6GVJ2liealmhZqoek4108X+cez+b3Dk6+s9UeKSlWOpD6dTIjzb3aai4D/fDonhC+U/MHx+Vf63QDhZZZ1gvDr/cFhtvjXe/tNxt/ptglxK0kvp8nxYnmz2eQIZ1n+o9PJ80JLwMtJeKMRBojCZE2AGDIC5986Kw2cp6K0yuez8dnRdDKcj89OTo7LUgVJIhiWs9Mbd95tNdvWG6gpblzVHjXYVuq/ApeNDVUAtiIj8FEj8H6pw/HKZKjF/ZzkLZz8FBAm2ehs4Qo3zHozoLX1ujz5+1VfjKFG4D6rROgvYAK0WgMwby7bhyvWnWXTDJsaJRKMG4s2YJyAFJk6FucEASEyYIwIbBWVUkbFO1CRBmKakNlyXnebCl7WiDRSXrHNV1Q3+GnphpAQl1KNc3mwKM6LcizpSV40uXhRSAHwm1cvZYg/G082wnCsqRFFqeDHUj8r5PU03Y7CoywPpLqWJNea6UFRTkspEa63Wv/w+lVQaiELjTjRuBbHaUSfnJ0d5vKtXvtUqqcno4NSDkT4rU6cCNZLEs44J1VqnQq2mYTdgHNnr2uCIs/zxXg2PJ6Mz8siAxbEcXrl6g2ttdIglUSZjw+fNa7fFSKoOMKPBlgoYW0hHq0ypEEsWTbSK9cgiz2H8HqMz4q5fpzHpfeuagreFCw7T8H+4w0rj24XtvOmmLml1lozEzdyKqHiWyJyQVoX6vMa6cKLuTCK/cgYmBQooiyVUorsniXGLjXsTsYGVUq5v1KEIeeccVZdx6ooqpQ7IFAtC14Tra1Xssay5RJNOJf6PC+Hhcylmin9YpFnSsecPVpkzxbLd9qtsZR3koRINxnrBPyoLG7G8d1BNyf6s9PhdhJfTZMux+Ns+cs76w2OPz073w6CS1Hw+Xg0kvp2q7WdJjpb/M9P9jbSxq2WjgVrJqkupy8X+bv9LmbZ3Wb6YDJ7kKnNNJAlU6DH2fLPjk97Ufhmu7mbhJtJ0Ap4isCWk3xyfHz4YjqbIxNRnMg8V1KJMMrzXBPkpQwD0dVKFpkBKFWYpAofiLWUhFd4VSmfobP6BIcqfIL2gZzWACCP75XsSC18BhVtgz/AkKbQCOgD12h4DsF6sgZAFdTJcgmgjwZXNmulBtBW56DddsLYqG7WVOkytLF3v1WYMRy5iyAoIKW1sVBtbIvZ+aC1zmVJi3khNVssgY04E4ssWywzpZSWstlu7Wxt97vddqtFYAIz5KkTELkLJeZKj7JyWsqlVO0wGJdKk95IotNc/fTkfJiXW40kZmwk1X5e5poEw6KUSymjMPx0PDs6PV+7uvtut3W0XA4Q16MgQfpkPOmF0XYcxYi/82Tvdr+znSbdMPxoMmkwvp4mcyn3hmNVyLjbTYVYRsnNgD85PN6N49NMjYpiLU4I8c9Hs0BTP2ncHgx++/GzQZz+rZtX8rL4i+mCR/GlNPro+OQjxot8eVvQr241OpBPxpOSRKc7KJcLqSQRLPNseHiU5wVyzrhotdt5USpdZ74Vf4nsGFUj4o+oRxRW8+ZW9TtF5wvoTa2f110VF1vDb8Utq13QfS/cvr8e4PalzdIf+zUCaO0XalS8XNnYzPkSFT2hjUFgbZcRtmIQg6kX8QEG43wwxjmwqu5Fa9sYIgSbONCkkUgr/dkXnz978kwpmeXlcpmVZQmAUqoSIGk01zc2vv7BB++89dZg0I9EoAlM6RYBHGby49FiUcqCKNT6YLmcSxkxcaR0qakjxHFe7u29vLo5SJLk0TJ7NptvxmEvDDnnL2QZhdFQqs2Qp+s9xvl/Op+cTGedZvP9XvuHwzHXcq4UAd+bTlRRrsXJN9b696ezo2V+pdFgjA2i8Eaa/Nns7Acn56+1Gg/my93B+neuXCatn0xnqMVGI2VIf/Ly6JsbG3d77YSx0e7Wz84nT+fLUqujXA7K5R//xV8U5
2fddvKd3f6vv7aN+WiumAiTBggpS8nEYrGYTmdZXsyzPF8uAUBpKKTsdTobeU5Nb15axemw4j0IN96v6D6bgrH8WLcNLQq9f+KDfQiepGq4ruH1IjqJEEB4uxacpqOac2R6YJxDi1MA+8B2RN8Td4qzPQHcNPLTjlbSW1iFwBnachPS2t4XgTPGEKV57hECEGmluOAAwETAkJli3TSKv/bee99+/71QYLZcaFmGYaiByrw4Pj394uHTn33x5J88evwf/uAPdi/vrvX7nW43iRMWp0eK/8nZ/GQpsdNm3c41rXdb6SBOzsvys8kEcsmD6Ea78e3XbxZaP1vKe0/2W3H8lUu7PxsOl3lZar0s85SxD7bXP5tM9ubLsdK/sr15UpY/n8wWSv/1zY1mwP+XJ/s7ofilqzdPsuLH55OlUkEYbUfBh+ej02X+br+FAf/sxcH6jau/tt7544PTf3F4+pev7WrGQ17+dDj8eq97o9V4vszeKPX96fgP7+3dubqNoH/2+PFkPNHD03gyvMKKv37r8ru3rz1/8UJK2eoMgjAOWcCV0oBFUY6n89FkEogAkGnSSqv5bCGVRO72jXMPr6+zoBkIH4r34ZaqsMIhEcGFgVfY0PmfgATKfOswfbFsqrJXa34/uYCa8EBHRKNAnaGADswO38ZZcfmsGt5NyNcarc7WrNL/jke9rWubYeLhPh1qVAZHxl31HwEoTcr4Ssb+AEacE0NyMVBEFjbbrThWOJxOJxzCVqfNGe+t7bz/3ge/cvjyP/zHP/mLB3ufjsZRFERJHAjOkJEQDalDScsghptv7DX6nz8tO+uD1zd7b7Waw0Qj4wutn07mo6xAwTe31iLGPz4ZZlINWo290SgbT1+/ujspJSPsR2JbiJOiGEupFf2d7bV7kzkCAuc3eu1lkf/7oxOYL5uE37117c1ee/Li8NNHz9/ovnW91VwsM6XVsNC3e53z5bIdCEySHKh4cTxpNG/2eg8nsz8+OT04n6go+s7Wxh/d++Lw44+bctmnrFku/8Gvfvfa7ubRyZkIEuRqMpk2GzoIQy7C9a1eb1DOFvnp2VmZZY1GEnFEwlmWl2UJvmbF/TFFyh6aKwxnQ0s1a69W8kI2a+n50cfIvQvv4vn2GBeKX3UD3I0qdJF/ErZN4tjISh3f4AKUAJ7MwNsnKzwKjvap7sVVN3fUT7WPAE7BGA1vU53IGHKGDEmTVtrsUmviSSYWZbP4WmtSSslCU5o2tNbHpyej4ZCIAPlsWfbXtv63/5u/8Xe+/0EvjXSpymVeFlKWCpVKApZyakzOwp/8UfqLHzQoG8/mP/3wi5PzaYzseLGcKV0w1mmlJVDAg1Gp749nu61mNwzm86zf6dxstRZSf3Z8dr4selH8cDh9r9O+0WpMNTyYzv7wky+y83FPhM20MRCiuP/gW1uD1zvNo6zsJMm3blxZSyJkvMgKPZoQgx8dnrJF/vx0xBgfxLGIo8eTGUh9cD7++fODnX7n8lrrf713//H9BxvL0eXl6QZkf/d7X799/dLZcEzIC6lyqaXWIk7iRrvV7THGo7R56dKltUFfkZZlQUBKKhHwOEnSpGkItD5YdVDWIy1+lGsaGS06/UhaU8GHP8wPzluqxanI+TIWMzVAmFONKWlDjT7s6ljScJsxSohcqY25LlttguvgCuzq8wEATMUeVZEsHzNykclVZWGSSQatZqWUTxeZ85BzYEiuyomIiLQGCOLm1sZW2mwWZTkdj4UIGs02D2ISjV/61rf/q19+v9+OC6nyLFdK5YssL0ogEEEQhEEyPmn/7E/ig6da8P3ZfCHlWhTebSRrUYCE+eHx8Wiy00rbafysKJ8u8tuXtjd6ncfzvET+j+7e2mq3fvj0YLfRuN5uX0+Te8PJ+eno9uXd/8PX3w04+3S6yB8+SvNlnjYbYfB4vvizFydXO+1C6d1mM8iyX+y9RGDv9zvnWfbnL4+HpfqT/WPJgv3R7F/9xedBFP/GzvpXOw18+fzwP/77wd5nV2LVbkZ/6f23X7t5czKZZ3mZlTJJG51Od2NrhwXhIl8OpxOplSyy5XIZxvGg12EcNcE8K0ql02abB6FFnTPT6tyJF1iqBlX71423c3TAsaP5T5N9KqXVzn6sL+LjwsVdXZVDPQiD+9r/apaEK10jUpaZHeTRwdZAyVycwFbemeXv6PS7bZq9FgBW6+YAwCxPNv4+AXCzWIQxk3GVSiljJBCgJptFsLaBJtKgyGa9GCadQbcoRsPzuNWJWx3ORZnnRCgl3X3znUYj/eGH9754fjJfLAVjXKowChWRJCBkLFt2P/qhvvra6Pqbe6XaXOs90jCV6mw4ub6zs9lMZ0WxjthP4stpdH883R/PfvXapWEhrzTSPzs+H8Qhj6OhlM0o/Pzo5FYzvbOzmRBcbiTHUuk0vvz+ezHC6WKpBH+a56+V8nQyHc6z9NKV0enZv77/bDMJi15/VJb//smLv3p1a7GYH7caj6dxJ4nf3N0a51mM1B4eru0MGo1oPYq/8vbbZ+enWlEUJ8V8UZRqs7++WCzCMMnL5Xw6ZWQWIOSqyIi04EJrUlqv9bob69uMcY8LeqWi2dNqhVTnTtiojVvg4egTKgL0lmC1SLFyVKy3tMpnHrXW4fLeC5FA5ySZiCZaRV7FjKByyVfAjz5XSWSr5SurwrXbTSi0hgn5yaSJfATU+PLovHizqQgjsLuEABn+RCBuwlCMEUMCWwSq7R6inIfRzuUba+vbwBgQlHmGZu8dFhBrXL319s7ulSdPHv7gF/efHg6JSAOVpJVSoDUxZEyw5w/ak9Hk6t3DNKFlWU7nDOnF2SibzDe2N2Yo1gi+GE6vd9v3xvOzvHw+nl5Pwm6aTjQ0EQ/mixKI0uTxIlsMJ3/88Nn1tf5RUc46a41AnGbLH7w4FEm8EQanZdlJ40+OT5iOwjR9Y2Pweqf1cDT8yXiaPTscb/S/c2nn/mgssvmTZ8//jFQrTXZvvrb4eLPTFDHAW6/dnExGRVlGcTNMmlu7V4MgQsYG61ukdZ4tVWegSZ+dHIVhNBgMnj2dSSlLSbnUrUYzbTTBaS1HiBVisF5O6iHr/1S5X/LHw0pxI9aQYpx3unjNV10lspVKzgy1VxZmkZnZVttSofV2NBGzsSJvWyKYQiY7Xep+FIDzlMB13J7J7PNAEMyWoOjMl5olXrMJqgirJgIyRSGkAbjLNzKw6U4FpJQ0G0Bo0JwxHoSNIFCl1EpiEAAAaaW0lnEhyzII0zff7W3vXPr43sM//cX9WVYKzoHZjf+AQDEmhsfd+bhYH3Ru3u5t9I6VHp+NFkl4mhdX0+gP9g+CIDgATNP0pCieFvn/5+mLnAevt9KdJJqXRab18nR4ab2fav2VW5f/zcf3ck3fv3ntvbXeVhp3AvE7v/tHrV6/tTb41lpv/+x8/9GTN7a33u20fvL85aX1XqIndH58NtqEnfX1JAl2d+/tHd7qdw7m8x+fnq+laSuCdhRtrq89e/6Mi2iwllKWvZwvNrZ20iRhnCMXiFiWeSnLOE0n08lskbMg
lEVRSinCYPfq1bTRrFtZjlIqrBBUQEQHniqcVOGsFj4iAl+D5QHnrghYjXi9ntmPexW0XHXRBAOosoA26WPylszkOc0P2mhZ53FXTOlor7KCtc3jOyOaTOBCE3AG9ikNjkqNLLwv7xIRIJAxxjQAkS5Lu+rDGMIMGXAOQGSynVqZzLwpSWYMgyAMw6gsC600ApRlQUoiY4wzpbFUvNW/9Je+uXHtypXf+dOfPD04C8JQSqlLCWADEIKx2dFxfOX6r1zZ/pOjs5dPnl+6vPsP7t4aBLwTit/9wY+//s2v7AXil9d6XY6//Xs/uHn71q/evLSZROfLbKzkj5I444It80QE5csDJnV+5fLpMr/daYYigDC8tbN5rRn/7pP9NIoHzVQge3e992I2/539I5EtMUlUWeZET6bzF1mOefbpwcmdy9v9gxdqMh0TXt3aIeRKY7/bHQzWCHkUJ0naCIKAALLlYjI+f/r0yXA2LfPl/t7L44PDIs8E50zwt95//627byPjX+I3rGh58t84twbrP7n0v/PcV71vMFrRee7GOoSV+tHavaqleStzxrpDfomE+c8kkJSDhMsTAXM7Z6IDZtU4e4/6dYlIawI3cQAdRRH4HXZqxGlX5ZkFyWaxMoIpgzfAJLNRjrkqAuOAqLV2G+5ocy3SWkolpVREKIQCKlSptVKIUmsNCEEYJg1AXkjc2bn8N7//S2/cugKchWEYhIFdGsiYGGyEb7x5OJysJfFkNIHz07VG/OB8kkt9KQxZXo4KdX50KoT46b0nSZ595/rOw/Hsf7n3LOIs04BRPJ/M51n2yWiqW+13b9/49u7G4XT+py9PPj6bRIO1uNe7M+iCkh8+3jt/+ODl4ctPx4uvbq31QWWHR+XLl51G+nKZX+40Xy7meZErhi/Hs9cuXc2v3N4b50sl86I4GQ73j49m2ZILxjiTspiOh2cnh/svnh+eHi/L/PHjJx/df3h0epoXRVGUCuDt99771b/y6420WaGz5q1QxZAmSISkfeVXVTHngt6Ol6C2kIicQQguUF5zhWHVzLW39HrfO2fevdEE1kmy8we9c1Zz4oyuJ22YFarl9cyayWQdcFvN5EnQGwcOmQjKWZwVrP288QFWH19AxhHNZk4m1VnNVcZN36T26XhZFEXAAwh80Ax4GE3nhVaSEZVFXpRlqbUQIgwEY+kyW66tbf+176azP/iTvWeHTAiBTGU5aJLd/vuba6WCvWUxB7z19a/95ntv/mj/6CAvf+/B81/6+vvv7qwLwf7s+Pzw5OybX33/g8u7+5Ppp0enaRz/9MMvysn8O3euPpjOPn70vNVIw36/1HSp1/7dR3vroeg1k79398bz4eRXblyJovAXxYTC+POz4U/PR7fbHX0nGvW7T8az5+WLcDQuk1heuvrkxREFZ4Moij/49otO74vpYePk5PHJSXl4MCuKa5evdjq9KAgF50UhgfGk0QrTZtruPnn6ZO/Z86NFxqNo48135O23KYjspgw2W2O1qkeJ2yTuIpDQ4bHyeIynYcfx1bOoslj9BHCQAq8w60Reu4r3rEVNv3vYoAM3ObSRS1qBrUElAPug4iq+5U60H8nRpJ2E7gD9ZR4cY0wp5cwHZMg45yCYLv22ZJaABWOMM20iZHZ/KEVEoHUhS8pRA0AQBoFggne7fQIqsyyKYqVkWRQSSGlFHCLOVbZoN5pvXrt8eHSuSsU5E0JEb73f++Z3Z8ienZ0/G03V6fHVW7cSxiTn/+r+0ybAd67unGT5jw5Op5NJ7+zlla++9YuT8zu9VtpI/69/+vO21v/Hb78fCnYljf/li8PtOPpLV3YDoJv9zheT+Z+dnK8R/+nJ6IeP945Ozr/91u32G+88f/T09NHTq/1eS2nJeLC+dnw8/MrG2no3/eHpmItoXGSUZTwKW7NpHqSfdbaXhyezklGmP/zi0eP9l+04EVHEBUdNQoggCKRSs+l8OZ/P5gsVis6tOweNtecvT+9ubtzptokqOxIvQAds0bNTxwR2IaTVZl6Tm/p861v4wn9vC1TDXvtYMyQu+PJOnVb+u6Er4VyiFXSu4tU8/tXu+Wkeu+J8M9srH0+rB9LQl8LXwhb2mNXb1Btt2Q/ttiO+np501VOzSZgmUqQ1kSJSWmktzaxSZZkTLAo5lZRGQRqIRpxEYSiVIq2zMl/mWV4Wi2w5G48GrealS7vRR18sdC6JSsQrN26/s7a2HgW/V6ifffrZ37515Vt3rs/y4loz+eG98d2ttTQMbkbBt9Z7P1jMkktXbm5v/+zobA30tVbj58PJBsCTefafHjzd6nfY1StPXhz/84/vh+2mPDvvdzu3+90XPNBad5upSkKlJGXZdn/wl1+7EnP+dq/5//rki48++jxsd7qN9O1ea6PX+6Nf/OLlF/f0u+9/9fqVGxH7BZMPT+U9GRXNHb7VvDzo6pP9h88e6dmcsmXQ7QftTsCQFblOGjpIZKK6V2+xze3ifHI0nf+Hx88Hb9weRKHN3VUJnosq2EdtnBFKDn3eUvS1oeji7WiTiXXMEUF9/Z0lvi+p5/8S3iISjrpNjGllJXj9Kmj2fbJGJHFCRNRoV4m4xaquYS7HQKuzBJz6tu/rFnFVr4cm1RkwxhkDxkyg3tZ6AjBAYynqUmqpSVkzlDQpIk5EQEqrZ4viRU4fjo6+1e8ca70WikEkEo4NHgQRB025VjNZtIke7b/Ms0Igiig+275+XyT3Hr+4FIpmp1X0Nvfj1h8enH3+4vDNy1vrN6/96MXx/XvP9Hj62nqvc/Pm3icP/vjpizIQv/34RdBItzrNNrDradR/6+aPXh6dHw05gztrnTc2+3+ky0ir6dHJV9fX/vLO2t1OQwL93//t758+2fvub/x6X4jf+/mn7//lb8Jsyc5O4/7g4XD06U9+/o9/4/stpSGKQtAfH5+Fm4PF6dns8/saUW/vKKX3gkZvsD3OKNrlMeK4VFkpRZr0moku5HyZdS9vnyslCorb7XLvxdPp4hcn59+9tMVtTBG8H25QZiOXThGjz3xXbpLRcuTYrgYRYyNcMOEMImrq/kvReeHl4gAgvCdk5kTd1PCAM61htr3Vii1m9+02WsEbl4gAJururQUfob3QCB9MZYhS+4fQGLGgN8G1RSFppXnAiXHidstvk+w0u0WadZ4AUEq1LPWS2EjqAtnPRrMNwX56OryWxO+t9Q6L8koieo2dU2qPqMiam+9/99ep29dJ40qURCI4ni3+4vlBvMh22o12EAgOT8+Gs9PzvNv9+2/d7At2rxUfzxeT4fh779zeTqK73VbIdh/PF//0j3/y1TduMYBbzcZrb9w6/K3fPz487dy8cqvV3H3j1j/9wQ+fffT5L/+jv3+yyH/nsyd/9yuv77aT4WL81Uubd/utjzb7f3R0PnyxH4fRP/zG+yWp//Hx4//13uOT8yEfrH/3+uX/9PnDPwQaPHkM52fq/Q+SjTX56Gk/ChcZUiFnaTvrdZL5rDw4WQQJRK0tmg9Ph0W3y0s1PDjevHa50+0eHpz8OE06YfD+xqAGn5p
tZkfcfqtJGVPO6l+LHLt7j1PmGvz41gKcFWdhnUJrnEU2RLpySnUWgHsON/qZQC6uUKc6e1Gbf3W7c9U4b+VQcCit2b/abeC7Yu448mbVe+IMGWMcmbYhU1TalYuA2amXkDFiTCOQtIs7takc1ZqASGsk6gbIQA9CETJ8Iw2/PuhsCEFF8e+Oz3/rowfPxvlBRvtK/D8fj0927m5+8K0fp4PPWPy9zfVf3Rj8tWs7t0JBp8O/fm3ne7sbf2Nn/VvXdl6enuHZ2UYgbrebVxrptSSmh4/6ef52v/One0eHk/lbvU633fz86GxG9O8e7k2z4u7Na6zZ3Gk1vhhOf3hwdnNnKx6fqPm8FQVxt/l/+fnnj18eX711c1jKw1l2Y2vj9x48Pn3wRSvkDycLhvyDOzc//dnPijDavX6tH0bdRhw8ubd8+UL1ByyMmUaxs71/eJyfDyGKIc+ns6wcrHciwc+HmaIiiUPBTk7PdbsVhsHJ+ShupE2G2WL5o+cvJ3nhuIB5tDhWQmujGV+35lEAEIB23jmRSZsjOq7FOor8WPuPGsgZDB4EUB3nj6zhiLkIwCocaxetx49s8BI8z5nWo9bVkmp65ZZQY3W7i1BNibglm7QaRmCCm+0btNa6MFtCutYgIGPM6HRt/CSllCliMUvsEQTovuAfdNtzRU3OTws5HU42onAjivTp8OnB8Rvdxgfr3WA8+fMPP8ey+DubvQ86zX/6F18cLbPrafLffuu9S5e2/ulPP/39Fyf3zydf3dncefv1MghGhQwYy5ReRDEUy2eHxzd67buDzr85PD3Pyw+uX7qxvX6rlV4dtH9wcJakabvdRsCvbHTbcfCLpeSt5uPHTxDZB+u9FodZEG698VYnEA9H0504utrr5m99Zbl1WYH+83uPu2F89a03eLMzHo1+609/ErQ7jIvpxra6dpNzoYgKWYrTo+z0PBusi80NkGq+zKnTjjiGAStns7DfbbQaAaLotAVgpmHz0hZjLOS81PoCHlbYyBR5uPo7qP1nQeyeweUvYQOT3uV3cNK1KgtvIzjPujIQ6hjDGvz8PgYrYVj7lVuupMEsbwezkzdzS+bdXKP62lhyzYSay1WDuw2VoSvy8wUo5ktmQ7LAGeOME2Kp/SI552khklnbSaSUIqXIxURNcNRs6LyUshkEy7Jci4JRIZ8uSwqj97vN6NqlPaV+cjJaD8NvvXY1L+XhPHuz29puJE9OR/uzzOzs/+2dDRTiu9uDRIhJXkrGkzReiwJFdK2RnM/menQ2fnz/3z15OVb6H9zYPc/ynz0/EFm2P8/u9DrdQPzw6Yu3ttcJ4MFoPpplc2D0ze99zJOPT4f3J/MsCPibb/z4fPI//OLeh8PpvclsbzjS62tTJf/oT3706cPHv/vDn015fE44kTrttN5O02+8/0H/8jU4OysOXs4fPFbjGUax7K2p6aIoqNWIwpPjJbH48m5jNhnPFhPk0Ou24jAWotVrM6XGy4IYC8PggsazKMMKhmbuu81T/ZA6Jl3Vy+Doxptz/lcXQAfvalV4ALdxQQ0bmgg1OSIkYU09gytnEtdhbWKV9V1KqFYsooEYcG9rugZVvFiDOxprw+zuQGYD35oZ6tHsthdlHJHbBeCklXI7PwMCQ8Y0gEkjKa0lUWgr87RSSmp9nqmXpWA8OMrKlLMn84w3kmYUbiXxRr9zI4mutxp708Wnhf7gK2++M+g8HM22mumVmL34/PPi8ub+bLkWBn/tzpUfHp3/+uWtJC/WABln24342Xj2jd11CSR/5ftqOFwq1Q+DTiAixGBjcF/Rdl7uTedzgGBtcIrQEOxfPno+kXK5zMIo/vruhiL66qCzPzx/+vBJp9//W3evbSfxOM+CNEpGIxWKxt3Xfv3aztPheKrViIoG8J3r14so3Iyjm7duHokAGbvUbEqVv5zO2OmIQGLajHudxXRWnJ6VRyetyztJuz1bLEe5ZL2WOB8WgGvNdO/F4fpr1zXRIi/7UbjKSMYvd0Co+ezmLQJUxW0+mujRWSMjuPhaiZK6uVABwwcuAXws3T70lXlG9VPBX9/qc7soA2yWvE6HtVv6r7WbRjWvfsUA0P6JzbXr1IIJAGbzXUQT6zSekKkrZEDc1tvbpW9aa2kwqZRWsijL+WIxXMwl0W4ScaBBHHHGp1pfX+vtpDEHWA8DTrop+EkhoSx/fbMfMDzI8i/Gs4OiPC3yAPF6u1EofavT/tpG75Pz8c128x/eusw2N39v7/jH55P/8ZNHEeNfv3YVrl7/0dl4f5H9Pz57/J8OzkINnTQ+zPI7vXagFQ0n393dvNJM3lzr/sqV7a2ylM/3N8KwLGWWFXe7bXj4sByNOeK/f7T3clnkivD8nB8c9ZrpRhB8Z3tjlGf8yePZeDSaL/74/sNPDk9CxtT5SC6yNhNfu3ypV+QciXZ3eKez1u50Ww3otIMb13ivv7G5HirZioM4DHcGPcqzRVk2Qw5Ex9PFy9lcO78CaqNvi9PIu0CWusyxhggv2G810iX/zYUIY3V87V42OGWVradxz9JmxD2KvYdUXQ6rq6FJ5+g60SOAezpRlYKqgkc1WvXtcwFfwkoidur54xCRM/MIGrMdmV0IL5Wy9a8EnDEE0EorqVSpZFkWZbnMs8lkfDI8O11kPIi7YVCSjhljyP7qRv+vb6/1w0AA/M2dtW+v9wFgLQq+sdE/LMphXt7ttbZD8UvvvrW/feXfPNr7ydnkrChOi/JsWfzgbPI/PdzvpBGQ/snJsBEGC8Y/PDpLSV8PxC+vd9/tt7+10f+DDz8Tx6e/sbv+w8d7y/Hk1y9vlk+e5tP5J2fTm83GdhB8cOvqZDwr8+J2p/Hw+GwqFaSN33jn7tfWe4M0+rdfPM7ORpmCb9y5+c217j//6N5plpcoFBfrg36nmawF7On58NHey7VGozsanp0cP11k0dkZFAUuFtn5+dl4mknFsiwHGGfZQaaS9XVotobz7GWYXrt9c8nErZvX1pJwt5X0IqPlSVtr0ngw2i7zBk2g3ZJij0BFpDwF+kH1IKu7H1jzklcIjZzbpZ3i95lCR0/kaqAIiDlr44KfZC5H1RsCZvi0Os6jHLyn5VEI3qZkrB5YRbtDpMcw1qEJlW1qK+cRmV1YrEkTSU3SuvPMhKC01lLJLM/PxqMnBy8/fnjv40cPh7liIiyJSsJc67lSD2aLmZS/fTw8XGQbcTSS+ufDWQDw7qDz09Px77w8+1dPD59N51/ttb/a75xr/ZPhBJXeisO3+u3//vbu+73GP/tPf/FeFPyf3n3tb1za+Md3rkxL+X/7F/9OHx0nnO8NJzf7HQj47d31QRzuNqL/90f3//hwOMmzZVG8Nmg/HM8ejmdrnXbUTg9m8xzZMon+/cf3UZaPTs6fjmfrrWY3CbOz4+DJoyxtLAjTfuefffYw23/eyhYTLl7MlrLdUwHb+/iTrFSD996TcfRk78WSC9XvQ6PByuzs2f6Mc0piNpnAoyezrJCjcf70ebn/cjhdCCFUUUitx6UMhIgCAWgtf7LLZv36xYor/P
qJilSdF1FB1J3wqsNxwQdy7pV9TIyJZnmqskhyH5AINAnXjlVnitzVLKCdIvbGgHHKHPfVjNLKgibfW5dltwu0XKmrPxgdrGsbhJvtyBEAzENqtNZIxICQiAEA4xRw0lpqtciWo/nyfLY4H03yxTJoNvnmtZ5UgWDnUv3obPi3Lm19uH/8Vzb7gygYST1WWnC2UOogL+fT+W9e2mgG4miZ/dbDvXxd/t2rW7nWv7139K9/+tEO0d/5/rc3InG33/nnjx4zJS/1Or/38PnfuH3laqcRhCxot17rtvZOFSP65huvncwXB5P5u1d294Lwn/34I0a0zPPtNBZB8NPTEWolFvP3d9Y/Ozj5leu7x+fDnxzsiyh6ucxn09n17fXzkyM1WNdpqvLia4PuT4r8cHJWdgd6mV1VmifBpw9eBpyJZmPv+Ly9vk5PH8oXL4CFEKei39WnZ3g+Ut1OO42LvGDZfCFl3EzLZSbiYDmdtgIhlWbEpFKzotRE3EACcXVraXRekUafcXLf+iG/wCweQnVN6N9UC57qUSF7G0ddTrVaBBIQWIBaiHl6rkGzNkuICIhh5bG7s8CWz9VMZnKumdtEh5zpXQEYa1tMoTufyO6IgnaFMJpdwH1ZiKFWAFSKFkWRLbNlKeezxXQ6L/OCNDUQR+Ph2WLRSNIbSbjfbIzzfBAFz2aLv3Jl63/67Mnv/+hn//vvffN6K/39e0+ePX72j3/tO39yePrtrcFoNP3dp8+3W43rrTQGKtJkPjwLArE/XbQbydvvvrWVxHf77YPR5D/uHyHnbDF7rZUMorC52f/5yXC8zJ49ePy13c020D+6tPGHQL9IgkfjxZ/uHd1KwmSr/zuP9qjd+3g4+9EvPr3/bG+xu8Nff+OzJ88PlTx+/mLz6mWxfamUcO/ho881KME7a2tFo6tPR/zzTx/tPWev3YZGozgbrbUa5Wx2Pp1yTXpjhzY2BGJZFuF0BFKFyKaAvWa82Ntb9vsyinvdjp6Nz4m2+p3FbNHvtYHg5XRxs9dhjPmxqVwiAL+U0iBgBbl1LluFKdbR45BX94+8ljdHO4YCJG2fJu/oi9zNhL9abSG8jxSAe76XW+RfD7JahW51/mqzrVaoJ2a9Y+jSAMb2dqf4qJPT3eDMU4EImqSUeVkIjkzKpSzOx+OD4Xg8nJgqJSklaWKADEEvs/zk8EW73+hvXum0302jaam6SfTZePb+dP7Weu+jZ/ufn493uu3rW4N7z/eeHp3s9rpfnA5Zv5PleRoGL+fLWSEL4NMo7SNdWuv90cHpkdTH9x78xuvX39teezlfPp0u7rz37pKxo0UWM3yr1wKiA33zt1+c8Cx/vxn/6rVL7W73Z/tHyeHZt7cG7/Y7R8vsoNduBuK/+85X//jgJBHs8qBbIF3rtr9zZefT8eTFcDzY2exzHCKPkuR6HAaCnutd7LYkIZUSAoECjj75JGykFCdquYT5XNy/B2EMl7YDpeXOjiAqigKUZlqzIOw0kjTL5kW5jONCiLVOGIaCEANe3wUJHJV4Zeuw5FVijfLqoKzDl8APtz/Gesyv8i6aZ2lbyJstUQ06tYsYESIKchvUuFbaIuXaRKkZumAJz6wYJjCbjqH18WstqJgXAc2CDSRA9E924ASIdkMHIw8fOK7NV2SMCeSAUBQFap0tF4UsT0bnJ6fH0/FUliUQMG4fOmsWN4HWw71nZVHwN74yThJkPOH4TqvxdJn/4dn4q93WndvXP53Ov7I5eLoUjStXPjse/p/fuPlsuvjTk/FiMn90Nvz69nojjjd77Ww+eTmZXY3CX7u0MSzLB430//v08O1e82ojaQveiG79wcnwlqT3uq1L7WYzyvqM3Uzi33zrllTlH3/+8MXp+dvbGxvN5NHZ6Z9//vlH9+5f3hzI61f+3cHJ7OQ4DgTb3u13msvx+Wd7T54/fNhKkmgwkGkSZhl9cVT2252y2BRx0oqXo5E6HeciDDYGRRgiF4HgshHxQBAXGkAUWUFQHJ9AGLSaDX52StMJIuYcIlmWp6fTZnuWxkkoGpwFgsdCcES0WUpPJaveTy2O7YbEbe+4aoZqupjHNuPnnSTnU1viM/FMUw9l4oh2h1BA//gd24R/9lv/wSwPAkeKZqGFybdWUX1to6jmxuaxG0ZBMMadUnahAADSimwUyHl5aLfXNUYCQ0TG7N7J5gwCG9Qsy0W2HE8n5+Px+XQ4GU9kWTICJcs8LybTyWQ6zpYZAgRC8EBwzrTSSmsETJIYGUPOB5ubV+++HW1dL0SERP0oWgBqgLIoIiH2lvl6FGrGJlJdMo9u4Ow8K07GU57ElxrJIAoXpbw3W76cZ3c7zff6raXSPzuffDGe//1r22tp9Mnp+ZPh5FK/myAKWfz48wd/sf/itUGn3Ujm58Ozg/3xwUss8ka7qRtpORrHHJMyu7Q5SDvNaVFmo8nLg+NZpuL1tZjxG1e2H9z7gqTSWqfNxmKxWB8MeuuD8fERaRqNZ9liITgvEHnS4GFIUi9bHb51ydS/8bgxyTIkAMExCBLBaTZfMCaSeBBHyzxXgI1QpEkYiGA7Dtth8Ob64Eavw6CSf50eyVdlQAWyeuiwBuNqcQRetBUvHukvaBCozVIgswlZdbvaM8SIRD2F6FW4Jm3dbfsYdq38VvBuXtnZhUCgmd0s20ZMNWkCrbQmIjSPmHHARQBVm2GGeJXSRk3LUipNUpbz5XIxm56en52OzmfjUZ7lpSyLIi+yXJYlaQ2c8UCIQACilFJpigKRNpIgjgCx0+5sXrume1tLHp3m+XC57AueBmEYJS3BR1KlgqdhECBsRkE3jv7j4Vkm5ZvN9N2d9eeL/Omi6IZBrmQC+o1+eyMKfnY+3B+OSqW/2u/eOzn6n/f2954/S89PppvrOk17XDenk6/K6fTzz5+PJnme72yubbZ4JJqv3bp67/GzIgXB+Vff+WpZSk302cNHe/svl1mezbOgEe5ub1watIOrW6dn40GvNRpP2kn/+u2rf/6DH22u9+OQt9bbx6dqluV6mXOSLYgKQjocZk/uMQRgjHe6zWxJpUTGIQyJCyjydrMVrK2HYSDPz4QGHcWzKOJpQ29uta/spmGAXnnBRQiiz6pXg7USbyGXOvIj61m2ArQ/y0HTRF5tZZ790u0pV8UQVl5CSmlvbc+ynoqpoWNIiKjcE2CV1syt4eREyu4Uh9p562QedAB+vZBSSkqzua/WpZJKSgD7pFqyTwWlvCzHs+loOs2zLC9LJaUs5Gw+G03GZbZUZSlLaTcE1YQAnDMRCMa5JI0auGBpGvabKQ8FRvHm2nrjymt78fpPDs4wf9nncEmNh6OzrNnLm102HZdRkwAeI1zdujRTellmAOzW5tY0n//kYP9yq/F+v/v49OiTg0NdZO/221+MRgfTKVvOscgeB2EjYleXc3r+LEK8EzRPz44H7dbbb91+vLf34ejs8o3do9Pz/mBAUr37+vWP7j86OD67fuXy6zevPN9/eT6f7e7uChEwzsusiMPg6s5Gq9GMEAJkoRBa6vffeW9UFE/v3Rv02
u1mmhfl1UtbGpDOhtcubWV5cXw+4owrpbjAbjNdZDnMRpyxrMg5w3wx4QCMcyoWej5UaRKSzhfLQlOcNhZhYzIcz5b5WVZ8bXv9jV6bu/gLw2oLLefgVkuIVuJPZC3EFUh5C7YCcVX4bBKe5kTz/ENy+ScX90RnG7odmEkTkVBaItgHWDGG2vvfWnNEs7rKPT1bgdmdnuyGvyanaUpFAQg0mOL2UpayLAslVVlmebbMllmeZXmelfksW0opEUiTlmVZSCmLcr5YTheL5WyuitJuCWqeO0vE3Tp50sC0RgTGUAjOuV1HH8VxpxF306TdaoIQFARs4/LPoXPvi/tXhnuXIiXzJSjZYkzIZXt5Op0uWFEGAZMaFiePG1HcScJ2q4MHI5VlrxeZOi7uPdBRlO6enRXnpyeCzUaTq4MecX4+HCPATKrRaLLe771x5/Ynn95b3+xvb27MlpmSspHGh2dD0nBtd/unn37623/4n5SWTHApi2VRPHr+8uXhyecPnvV6nWKRkVJhGm31exu9XgDUabeOTs+nWXgyWzz47LN2I+m125e3Nw9Pzg5Phy9eHAVxyJCtd9vHJ2el0o1mYz6exEkcp/FwNI0QW712WUpcZKVWQnCOoMsim+luI02bDS0VIgSQqXJS7D39Qspn08VfvXn562tdtgowS4dWQ37JEmRwURrHrUj2gdeV1epVpFf97mrWojDhKyRbSufKUip/yDNoCQACEEWgNBBo/yBRbeo2NCmlbHDVTh2zolKhL00iAAClZVnKoizmy/lsNpvNZ6Usy6KYLmfTxVIWWZnn88VimWdaE2gt8zKXUpZKuwfOIENun2pjFijblDworQk0IhcoOAejLBiLwqCRhJ00bjcbcRxjHMet7iLtHcyzG7S4kuhyPGZaFYTEWTfBSOswwDBOwig4G8+CfK5UTjLMF7OlpkajIUqZRmK+mE6OD3Y2NkaylU2nd65dns3nnWa6v/dyssw5MMZYp9v5gz/5s2aa/Nrr3yLS//K3/8Novmg10q2NdRHHAYNOKI6ns29+8FZAsNDqeDjKtC7yXM7noGmt32k1kkYjPj4bCeQ6mz1/efTe++932q1///v/sdtMMylzonkuMQyhKN9+87W8KOaT6cvxZDbPOt1Wr5UEoJeLZRSF/Vaz12k1kvj0fBgGQVEWy0IWUqVRAKRJSS1BE3CBXKqCyTjSfDaccv4XrcblNN5JogpQKxissaFLRroPFX3W9h686MmjLdp3gDZWBVK1nY2PbAISahO5r7xkxoSxQIgbl98WUxLZBymCW1ZhlvdWZgq5DTdriUopZZZnk+n4+Pzk7PxkPh7LItdS5UW+yPIAmdRqkRfLotSlVFIVUhERN1vbMcY5E8yn7BEQTWYTNSEQR+QcQs5LAKlUzMNmFMVR0IzjOIyAYCFlg4s0TuaM3e6kR8t+fjoJgiDP1SCNoyDoJMlkOonDcGtj/fh8FAouhBBax4jLPG9FkSiyzfUNEYTzZSnl7MEXD7bWN7a2t8psqaRaLLO0kbxz906/1/3wi3s/+NHPVFn+5vf/0nA4BMYODo46ve5av9PpdS9vbGitHj58HoRBwNjZ+agoihfHZwcvj5SU/UH3vbfu7G5tkFT7B0dT4vcePSkWs1/79V9f73V/8eM/DzhLk+Qsy8Mw+OLJ8wihncbns/k8y5IwjMJwrdduN9Px+QgB0zgSnIdROJnOkGG71Yzj+PR8hJDzKJRKkgYlVZrGhdRmG2ulyuloWC6L+WjO4ni6u0FpjK5g14MTfbWaNzXrxOjIk9w+oy40qV2U08cjrdfjntMBmuwGCcavB7cxGWoAl3x3XpsWBv5ak0b7hHcwViSCBs0INZFyRUNKSSK7gI4h5lyAtSeoLEtZlnmZn4+Hh4cvp2cni+lMlbKUKtOqVIoRlER5UUql0S6yJ0JQBESaA9MAGkAAlQSAIAAER47IGBM+ikEEQAEP4igIo6ARRUkS8VBoLngYxs3mUbr2uxOVsPmNiG80wvOJ7idpu5EA6aPTMyC9PuiGcdxpNvIiX+/3EPH5/kvGeWdjrSjKo7OzxTIDxP76xrVr11SeSa02L106Ozl+svditlhub25c3tl+ur8PSEqqLx4921objIcjALx+4+pX3rjz4aefvQTYXuvPpjMeBVmWX7+0+/OPP3vx4sDE7zhjr1290u92P/nii9licZbJl0+efft73+l3upgvRcB7G5vTvJiNJ/PpvNNIllqNxpNlXiDptX53o9+5vrsxns6KPF9mRZbl/W47y/OAB6BBar27tbG1Nnj0fP/g+KwZx6WS86IslGokcRSKvJAh6AhUPhljL5Bau0eZO2Ss+jpQ24XG56Vx9RFt9lcHYiQDwirxCWjMTASt/fJfN/7WAYIq1O7PQgAU1jTVWqPSQFJKX3nKAEgIEzMqykIqleV5WRaFLMsicwuFCBBLpbI8K6XU2aLIFrPJZD6eyaKUpEutpNK51mUptdJSSjCLQ2zk0hUWasUYAHGJKIkYAucsCYTg3CyOY+bZx0ARMhGIOAyiKGo1m2kc5UBBGK5tbNDgyk9l+mL/SWM5/d7lpIk6C4JIcK1pMp1NZ4s3Xruxu7P9/OXx8xcvACgSnAiajXR90Ot2Wo+e7J0NR8dnw1YjVYCdXu/uzduz+fzTz+9P8/LF8XC93WilycuDw0iIACAnGk6ng7XBeDpdFsV0PN0/OrmyvfXhvUcPn+31d7azxWyyWLx1++a1q5eeHBzNJtO007i0u9VI0zCKAs7zZbb3+Pna7s5wOnv++OEH77/7jW99689/+vM//ejTfrcZtVMgkpJarcbV3S3OUANs9jvtdmuZ5VKqUspGmkZRIAIRx7FWMi9VnhetZrqzPsgLeXQ+jAMRcM4Qi1Ii55yxXEoFiKBQq3KxfDlbXmulfFWlYw2XUENh/ScPJheJtD/bA43z4+qaq+oN0ohu32W3sSi5oqpVU5eAQJgrS01EpEgXZVmWpVKyLAsOKIKAcU5EWsk8yxZ5Ns8Xy/l0OhmPZvM8y4hIA+ZFCVpHDJhSAoAxLMqyVNKkJ0mTlrKUCjUhgEbQRNzlOYmII0NSqJEYA4KQs4CzNAjSKNKkF3kBpEUgEJgG4pyHYRAGIk2TTrMxnc8ni2UUha20wQc0EOLNG5dffvHJ2cFep5220uT47DyNwlazubU+2FjfLDVurg0Q8Xh43u/1sqKIw4Ahe/7iKI7CVqs9ycvj8Uwg3Ly00wjZoLt9Np09+vQLu202UiONbl/d3b919Ucf3d/d2uAAHDGJgs8/f7C+sUZRMFvmB0fH77z/Xgh6d9A+GY4eHRyR1qR0Mwg/uPva9s7u6dEhZ7hYZFEUh4y9fPLkm2+/OZ8vijIfnp6urfeOjs/W1npr3W6v01GkdjbWpVKCiwBJK73W743Gk8PTYasZa61ms2WeFXmeh1H08Nl+lhWB4K1GMo/DeZZLwEgIpZQmSqNIKlUQKQVMSV2Wv/9k/7VuczuOtOdI7Vc9OgjWV1vYx6rYWOmq8iewD4Eh/xBEq/cqe7By7cHZojXn
DNyzu215gEBkZJ65XqpSyflivsiW2XIxmc1Aa855EAaIyBFBlfNsOZmMFpPpbDafLpZZlhdKm4amnLMwCDjPQZOmoiikCTMBSEVmgy6bbQVAjvYhBwTIMGBccKbRBF5ZFIoQkTMslMrKclkUseABF0EQEJIGUAg5QBPZYpnNF0upVIJMypKVy+3GIKdg0Yjzs+zxZHQ8GseBaDXSjbW1Qb+viRaLRb5c7u3v9zutMs8E0Gg4C8OIAeRZnmfLVIg8CsIoWd/a1oTHJ8fZbNzttUfDYVmUYRA0m+12q722tsbZg/l8gchK0pzzLJs/Pzi+vLuZpM2rV5KtVnN9vT88OX7w4vH58Wk2XQjO37x5dWttsJhOy7IMgiAMuS7L4XB4+85NhtTrdD7++NPZfJmEwZ0bV9+8+1qRl81m4/TsfDJfXN+91EyTxWJeFFlR5M00aaSZVroEtczLl8fn82VmiQxRcB7HYTOJI8aXpczLJSKmOpJSIWeNOC2UEkmiFJ2PJqO83E1iiw/wGtmqcxPkAXBL6WoZbKOj3bN8zOoL7crsCBHJFSNTVaVELvtvyXole2V0qtulg4iEJm1L0ZUsyqIoisV8djY6Pz87z4tMgw4QQsYF50A6z/LFbJ5n+VLKZVGWUhFAwDAWQgDIUpZKlVqTUlIprZQiUkpLbbOnAMamRPNoWq21WQAPnCvGjD2Sci4QNUBWllJlpdICkTNknAWcl/bxSCiISMplKYGglSSDwaC9uf2UtX/3fCqnw/fzUTPkJ8P5bJFRGj05OBr0unEULbLFaHTOGV7d3Tg+Oc+y5dbG4Ma1q51uf75YTqbTHaL5fPH4+Yvz+WKZl5sbW8D5YG3t4d5LIhJCcM7DKFkqdWVnfa3Tevni4Hw8vHH1+htvj1/u7R/s7W9vbrz/ztvNZnN0crqzsdOJoiBgj56/DJP47ddvfv1rHwRxslzOwygw9f8sEMBwdnraa7WIi/7G2ht377QajVa3vZzOF/n40uXLN65fX2ZZyHkjbTSW8+l8JpV6tn80WWRFUc4Wy7KUSmkbrAEATVLpuVJBEAjOdZFrgABgmWcMIEzifDbXPEz7g3kUJXGUCqFddKcK3tvMYG2jjVoyp+7gk4vuAwJoDW6xsg2tG9iRKbcgALelPKBGV0f8SsDfx/YFIiolS1nKspjN54v5fD6bz8aT6WyilkuulSKSnCFjUsM8z8s8L5WWSmmtOQEiJIxzBKUVIJIGqaSSqtSkpDYBeUQMze4TyMCWh4LZlNY8m9Nk581i0dI9d8asfwqY27fJJHwJEICTJg3zPO+kjUaaNBuNKAohavTaXRyfyNMTORo9PjvMizxIYgVwY3vr1vUbSdJQWidxEgVikWXzLNte722ubwzWt4KoofT5YpGJMCo03r3z2vOXByfHR8046ne7rTRpthpnp8ODo9PpbNbvbybNzqA/uHNj96cf3X/+4uDWtfh73/zmk63HP/rFJ6DZ22+81Ww05WsyYKzsd5+9fLm+tRlE43feegOIsmXGGD84OjobjstS5vMFIF55685ar3d8cDAdnl3b2TGLsLWSN65cKvP86PhwMp4ss6JQdH5+Jjju9FsMdTsJIQnXuq2iKKeLbJ7lpVJKKkQw0z0rinaa9JtNwwpFKRd5vsxLGXDNGJcq7SffvX5pJw61UlWu6BXrE2pOknGRwBKp3/5I2yXJaBeyuWwmeZiaOJTP2ZPWzDpolUWLtQ3hDSsLrXUp5SJbzuazk/Oz8Xi0mM0Ws2mxmDOlCFFqPc9yM8OyvFBSEgBjTJitHAAJoJCq1PZBpdzEoZQtkUPGwkAEAdeMmXVt0ixf18SQcbMPqNbkHjuqiEqAkEHAGRMCAZAxpfSyKAutBeMEUBApUjFApOSg2Y2TJFeywdi4KEUcJoKdHp8kJKMoTuJod9C5feUyY/zw5Gh4fjYaTUqlwoALzqezpdLAg0QE0WBtu9vbmC/mWSkDzoXgw+FwNh1vbW5eu3R5b//Fw3uPs7KczTNCAk2Dta2rly//7OMH9548H4+n3//er6RhuLm5vrE50LJc6w8QkQueL9ud9kCww7/yy7+UBmI8HDa7nZCJsizmi2VeSgDY3lq/de2K0qqZJts77+ZF/mJ/LwyDQb+7zPL9/WehwHYj2dvfe/D8aDpfvP3atcvb671WNF8sc6nDMN7Z3l4sl5/df/T4+UtZlou8kEov80IWUsUUhKKUCgA0gCJQIqZGlxifa4o1dQLBkCqv24eWHFJ9vMnkkmw9vFXYJvLoiNvbl+DQqXX9OUYm6e4rSyrCdHUkzmytdvEUWqqiyCfT6dHZ6cnJ8Xg8KpZzyjKlFUPkyEops7LUAKXSpEkAcM5CzgCYVkoTFFLaPecEF4HgjEEhZV6WUgJpJMoBCCEOOTGmSlkqBYicc84YMvPgTRfGReSBMOWljLGAcc6ZMVOkVKVWjHFimGsCwEaatJqNfrcbxkkhJWTTgtJZrqVI+OBSSsvs5GWe5be2N+IoODw+/OiL+8/3D5ZlIaVuN9JGFCRx3Nw/EFEjjeOsKPO8BFCNMHjx8mCjP3jt2nUiPZ/P0yT94O23tJI/+/BTU8NbFosyX671un/7r/7Kh/cefvT5/WtXLokovnvrxkZ/kERhnMSMMc54HEVf/8Y3ci1J6vagNZJqeD48G40Xi3m3074bh5e3t25ev8IZ29/fD6KoVPKLR0/2X7ycL5ff/cZXEHUzDZMk3hh0XhwczMaTXr/7/hu3Nza34jjpKsVFoJQSjMWd5nq/vVhmjSQ6PR8fnY1Kpcu8GI4mCICCAwAyBkzwuKFEQI32RERc6ZYQdaa0oXVjHDrutBCygNKueNRp+1rRKLkaJVDOeaqVnoB90AC5bJF2BRmgNdmqKp9lNTboZDo6m4xfHh+fnp7MxqMiy3RRgJKkiTiXpAuptCJNGjVxhmZHGmOjmOdoEwBnwIUIozDkopBykeVFURrTGoGAtJQyB+DIlCaOLImiKBSylKXJjUoNSMQ4cIYIkWCMcxGIThIrgKJUpZQaKEIRBEGBwJQmZI0gEIhSK6bkMlu2kvhrnfjjvLgn4vHNN1U5vy2gODncOx91uq295/tP9w6yUs3meV4UZ+djxlgrjbVWy9lw0Gkdnk0Vss21te2trTt3bkkpJ7NlXmgmlFSy2+197b13yix7vr937crlIAyJdCONte589xtf/Y9/+kPB2WI66Xa7jSju9QacQZJEnHEA2Nzcvnvz5tOnjz+7/zCMo/lkqkm1GunN6zeDIFhmy70XL37+0Sc33rj7+qXLy+n4z3/+F8+PhhGot1/f3d3cSONeGEbtduvsfKy1/qX333znrbd5EDMRj0dDAAaolC6Pjs8+ufe00UhbrdZskQvOG3G40LooSs0QlQICBIXtpg4izYVqd7e31v/K9Z231zrGdkVbiWkxQ3YLGQI0z7T0vrvFrtP3jget2kfn/dfj/LVNGWv2g0/vO+MWwFoI1j8BBPHR558OJ5PpZCzzpZaSa+IIKAJAVECylIjEOSJxwYg545e5QDtjLA4Dbda1KT0
vl4tllmW5if8zzgQyY5dIQM4w4oJxnkZhIESBqPNcmVAYY1xwwZmZe4ngjThinC/zfF4UUlHAMYmjNIkyqRZSKg15URZFOZ3NE0IexXGQrCXRb261QITPs3w8kk9Ys8NCHE32T4Y8ihfLfDqbc87jMOitdTuN5LVru2+8dqvf64og6Owd/JPf+r0g3rt77fIH7761tj746Onhzz96EPHiv/lb39cgGu3OX/r2t2bTyWw2XdvYjuJYnqs8z8Io+d7X30ta7aKQO5tbrV4/SVPOuSu+hm63u7W9OxufHR8dTGazZhJvrW2sr69nWfbpJ5/de/pccNgYdGfD04NnIo6Cv/rLX//F/ecaYHNtjTOBjDUb8f0Hj+492v+V73zre9/9Xqe3rrUu8mI4HMZRMFsUw/PR/UfPJ7P52qArlUyTqNVI0zgadFpKU1GWi6KYzpeax7y7RlECLJDA3t0Y/PLOhoGYtRstKtHuZmM4VWuwTwM0Vpgr/TXVbX7rMWdZevhWVigg1OMADt0O0DaGiqu75Zvovbj/+KHKc6F1yJgABEQym4JzxqQSnAkec7M1KEFeFHlRaOeREQITQSAEcpaXZZ4XeZFneVFK++wmJAIOXJg9VpkQvBFHURAqrRExiaKAs0VR5lJKAhOnIgTBWCMIQiFMrACIkKEQXAiuCMwDaqSSGjRyliRxHMfLPBueH7W77dvt7W/IJo7pxTI6ba2L6ai1PMry7Gw0KxR1uu1eM9FSfvP9N69dudzrDcIoVqoMg/DWrVvXdj/++N6j/SQKv7j3lfDNm1e2fvzhgzQQg8GAsRgAdacnZfHk8cMvfvzzr7z3/tvvfWs8OhsPT7O80MA2tna7vXXGeBTHrljWJpL7a+tJowUMtVJbmxvr65vno/Mf/vwXn3z4edzpNNstfXjSCNgkZHmccM7eu32Jc2SIUhX5ovz4sy9++tHDv/Ir3/2173+fMT6bzQBIiCBJG0cnJ7LIHu+9fHk6FEEwXxZxFANgEkdm6/RSaUCc5AWLG9jdhCBm7a7iYavbeWOtIwxyXBQdXRTI1GqC20jGZIdMYQcZ1BnvxkGTLE2iO9aF6O3WjOSNWqt7fTmfy1c5UcFKuTKgkIulAC3sIzq96kapCYhCLjjnJtOUF0VRytIWdiDnLIrDVpoKzhFQMKGlmtvqT9Jamx0cgEEYiEgYpS0EF4o0ADDERhIpHQJbUgakNAcQCIJzzplGKksZMJYEIZGWRIr02XzeCMIoCnkgBOPdVuvmlSv93mCR50maJnES8iAW+m7KZ6rxbrf127qM8v6VsLXZagh2QqQ2W+nl9X6v19tYXwvDSBEAchEEhBAE4d/7m785/2f/6uGzvdl8cXx8evvGlW+8sb6zscbsA/hYFHGZT8fDsw+/eNpb382z7M7dN9e3Lj1/+jiMk6TRFkJEURxHEeecITJAjaSkCsO41e4209Q8u0RJeXh0/Gz/QJLO5nMAGBfF6Wi2Nmhv9LutRhwHvFR6scxHs8Vksuj3Bv/dP/iv775+lwfxYjpSslwu53t7z/YPj569PMjyfD6bTybTXquVRqGUsihlo5FIqc6H42VZLotCKSUaHaW1DkQep81u52/fvf56t1FKiYyZCJHDUBWHh1qwqbI17XuPTct+9pkYFkMuFK+dEkdHujaw6jKb6DwudDEtF1k3jRECNDdBHLfujpmNQpQWDAVnGmhZFFlelEWplEY7A0gI3m21Ntf6RDQcT4ssF0LEQaCk1JqYyQYwxhnjCAhIWiFx4+4IzsNAMMYLVWgixpkg0giFVpyzNIqiOMzLEhHSJEKAolRZWRaktFLLPIdAsCDCMARCSbrRaCCy8WSUL5dbQqRB53Yz/Xg8XhMh27n28/sfbu49v7LWu355O0G2vr6ZpOn+i5e9bqvXG0RRLMKItC4V9QYb/7v/+u/8i9/+nU8+u392cn5wdHLzylY7YM+ePm61e81mK5PZx5988uG9p61WtyyW//b3fvzg4cO/9/f+wdUbt8+Oj0LGojjt9PpBEAjGzMIBk0ojgk6332l3ZFmkaTqfTfdfvMznyzAMpVSz0QiI5kCn56MHfF8wxjiP4mi9371x9eqv/co7b73xVhSnJydHe3ufnp4Pn+/vLcsyWy6fvjweNBMGsFgsu83GlUtbjTSWspRKDcfTk8m0yEulZBnEMNiUSavIchGlcx6sp8k7G/0AmfJbHxlXxlKd5UsXUoeaCieya9vs4g1vkvrYqFnDSVjVgRhAe1fJl0Q5LrbBArJxKLSPWAIgIKGkVIgISEiC8ZAHjHMA4EwTQVaURVlmRZGV0i38RSaY4DwUIuAMkTjnQSCyLAsED4MgK0smlQlbcIaCSJaSNEWhMMF5xtBEeUoli1JyZGkYlkIXWmuiQARRGABjQRCEnMdRpLRWtBQKAwxaaRJGYa5pXsrZdDocnXNOSlMcJePxKA7EcjBYUDJXwcEie342Srvd3btfhUc/bTfi3c3NWZa1u/1mms5m8z//6UeXdrc21tcbaSOK4m5/g4uw1Rn8t3/373x+//7PPv70+YuDn3366LNHz9vtZpqmoRDn56PjszGxoN/Lnj1/lufZaDx69+13Xnv9zVk05EHQ6nTSNDY2iR94RCzLQmmVNBp8eEZaTaeTg+PTQukwjERARZEHQdBK404r7baa/W4nTRs729uv37nb6fbm8/mDh/dPT09Hw7PDk5Nllk9n81lZAuPrnSZTajiadFvpG6/diKPw4Oh4/+B0sVwi40iaVIlRqnuX8iDWRKwVLqOYgvCDrUFDMLtEAlxhkSYA0OjyR+i8bOeMg9PgBGZz5QqeF7LzVsVXBSTg0KixngL1J5CvKfF1JNYJE7lUGiBENJvGRoEIAqGVLgoqyjLP86IoCynd/olIDAWKNArDKCzK8nw4jqJAK2X6RESkNAdExs3m3DlpxnjKeBSEXAhkGHBu1tAFXGRZwRDTOCq1XshSaR2HASIulzkHxIRzzuIo0gCKdFnK4WIZKcUAGJFZx7LMsqIoAVicpmmcKk3rrCCt3mrGzy5tb8fJrTTak4vR9LCb55PZdDqbbm3u7u7SycnJTz7+ott4trnWXet1WieH3e6A8YCQ3755+83X747Ho4OT46OT0/lyeXR6tpgvgzDZXONBIDhjKkrCra2drc37D+/duv1amqaL+Xw+m8RJHLrAjYkYSqnLstRKN5O00+loXYIqTFkjB9AIAedX1jtvv3b19dduDza246ShlJ7NF188vE8ERZmPR6OyLONAbK/3F8uciPphGAb85eHJ6WTa6zTfvntrc63/yecPv3i0BwzjKGrEkVJqHMezsCUBismIdbq6O9jttb9zbeeblzZMEZyBoS2udKGjClPkYkuVUehodCWf5DJGfim5NxGoxppk1qXZcGdVPuLpuYZUv/xJMGQCgQPEIoiDUDCmlVpkxSLLZF4URVm65w0CAeMsCEQSBlEUxVFIQMu8WGY5YywKhFSKI3BEYkwASKlyJRljwOxDwky6qCjKQpYI2GrwRhopqZFhgLzFmAnpGUMCOQNEwT
hnjCPjyErGllmeSdlqJN1GI4lCxoRUMJ0viIvu2nocJ9lizri41GhebTY30vgX87IkosHOsNFqqglNZ4fD8635NIziwWDQbaSCY16o+09eaHqOhBpZVpaMi521QavZGAzWu+3Ozsb6+3fvtNrtKAyBdLPZ5pw/e/rwj3/8IRAxVKAKxrlWeb6YzecN3mpxt+rVpCzKImdcrO1cSRuNxXQEpJMoVFopJYNQ9Nrt7fXe1uZalKR5nj14/HRZSMFwNl92O61eM+Wkx5MpAyqKcp4V/U4zK9XR2ehsNNla79+6emm5zP7khz8fjqcgBBdcE02XWSGCIu3LuMWJxSJcNNuNVvPvvfPa24OWts/F8LFO6747l8ijzmpdr5LNr7YcqQqLmrVrNude51FwgSpnD9i4fc30rEHfpuy9WUEAIBKzQJMzxpiUSipVFHKRLfOizEupTZEHACAILsJQxGEkhACGZu2Hto/owjAMmZQiECIQZHZFNAs47QNLqCiLrAQkEIwjw6KU56NpFIZxFJrWFaUkoHYYaKK81JyY4BwZkq1W5rIsA864EAqw4Gyt2RgMBgQggfqDdWBcI2s1mygEMZ4myfsxa0fF42UhARut9uNRHNJxYzhuPn/ywZvv337tjSAMHzx62Gs12u32+WR6dHiyKPIwCEopi+VCKl0qNV/mUlO/09pe61+/cmVne+ta2mp1O1euXH97OPzxZ49PBZyeHDTb/SCMhAi0UmUpMQg4Z1KZ5zyiCAIRBDQbHx8dnhy9jKKw0UjNyqxWI71yaTOOAkXs6fO90TSbZ3kplVQqTeL5fCYJDk/Ollm2zItBu1USHZ+cTsfTNIluX92+cmn7fDR7+vKIEBuNlCMCQ5HEQ+JjasxEpDUIpEyEZdL4jWs7d3pNpezTZRjaLb4McWinbpHQrn90j0YgW7hJFd25Pbh9QZOPENmNh92xRABGszNCMFstmZXyPk7lZ4UDNyJp8/RNJoihCEQzjUqlZ8tMKkVSyUKWZVlK6ScK54wLlibJ+qDXTOOiVOaAQDDGWByFYSgAKImjspSkNZAG4L7RpZRJFEVhsMxyAkriWHBuqvQ5xyQOT0fTWbZMomhRlouiEARJxBhH8+iPOAozKbks0yTBQDDGm41G0kgbjWar3Vkoef/h/fW1tfDS1SBthWEYx2kaJw0ukijuz7NhHh8tM6m7i8uvNYvpXM2/ePTF9cvX37j7zs727pNnT6aHB1EQ7GytS6nSJELGxpPZ0dm54GJr0Jgvl9li+ej5i+f7L/u99uWtjffffqvT6TaaLUbqP/zpj+JQ/Pqv/bUgbDTa3SCMyrIkojiKzJOdGGNRlJRFLou8LMvpfPH85cH5eIqcpa1m3Gwo4MDD0TRrNptbm51SytPTsyd7+804VArSJO60GuPZfDJfjMfTVjPZWetfeusOIpVlOctV1Om8s76OnJdSglRBFA7ny2f7w6kGPZ/xRhMaretrve/cvPyt7QE3qCACArMPmNFt2kXTbVCSmXXrZLFrF0Rax4WY21KZ/JcubumIcBVyaJU3+WfBgLbEbbzp2i4SCKBtCh1ACx9r1ZpAkyxKVUpTzImamA07oeA8DII4DJMoSpKYIDfPfmPIk1CEUSilBC0FxygMi6IEpavNShyNB5xJIRhiyJmJkHHE5TJfLPNlngsCXcrJZJpJRQBSSq2p0UjjCKMwCAMRl4EG4IxHSRInaZw2lmUZ5jko1Wq1ev21brvfSJsiCKIgYpyLgG9E4XY7PZ5l/8P98U/Pxq2g0dncGBfT2fDw/NGD13Z21vtrX1vbHA7PHjy8d3Z6XBTli+MzZCwU/Prl3WWWCcZuXb00mU4WWW60Uhjw0+GQiVBr2txYOx7PHz7f/+rJ8fa1O0EYg6sSV0qZrfUZwzAI0rTR7q21z06KstSAZSmTNO11O0kcD9YGu5ubWlMYiIPDw363szYYrA36Usmzs1EYCNB6d2Pt2tbabL6QSm2tD6I4ZoyHTZamjaTRjMJQCBEEotT044fPPj3NZixhACXwZdJs9/v/zVfu3O6kUmm3IsOqVJfStAkkF9m0jyFgTvGiY04bM9VOudtfccUUdTFOs9bXFX+a/UIQzO7AJrBktxAnv1sjeXsDzHVQJEEQBFyRVrJkSAhUSqm0AgNsRMZYGIpGo9FKkzAMlFaT6VxrzRCSOOIMw4AzhELrUirQmnMQgmnFUGvBuUkpcM4LKfPpjDPebqSc86Isy1Lm2jzNjIBIa5JSMc6QQBItizIuyjhWSimgoJMmnPFRlsWcN4MgkGU2mpwul7rIhRBr6xth3DBPQfaPQDMPA+GcDxrJN9Z7B4tsP5fXuz2gzo+1SJbTfJKvzZ53A7bd77/15lvHh3v37z+4cWmLiWCZZS+PToui6LYay+Xi9du3AHC+WDKkJEmbrc50kSuFUbP7/lvpjZ2dwfbVKE6iMOQctdbmvsiACq0BASAIo0arc/XGbRGIz7/4vMwyFoSFNlngQkkVCN5I4sFgsFjM+73BxtpgPJ3O8jLLsyjgj/YOllm+3usMep0gjNJGK24kcZKKMIqCQARBGARpEP3RvUd/9uwsU6AZQy0LQBknX9lZ323EZSn9okskF9OxhGdD9I7rwJdtvLoLQ5UNclamJrdpElZ4dlFL912tGMXfUbsKEp/uJxdh9ZcXaRqbBdFKk1IkpVZKAyBnLBQMAJCzRhy3G400jQMhgkCYwATnLBAcEaRSUkoE4AxKRRwxYLwAaboXBgHj5rEIyBA5Z0IwAiqVUloLxgRjRFSWUgNJJakEHohIiJBzEy4lIE2UxkkUJ+W5WmbLWbbsNZvpYIDIsjzvdDqETKkiYFwwIXjAAyGECIUQXCBiGolfvb7zznr36WSeMn5/NHsskVjzWr8/oTxEdV4qmc0VRrfu3M2y5fD8/PD4NA74Zq+fxmGr2cyyRac7SBvNQATIeRhGZVEIIQbrG+ubu5vrm0GchoGIwsCg02wdYB5Hhpo0IwKKoki1ujtXbk6X2dPn+71Wc9DvP3m+LxgOh+e9brcsy267vTEYKKVe7O+dj8ZxGGQMO+3219/pLpZLBGQi6G9t9drdMAgwEFyIkIkwDE9mix88ePCn957OxtMoSYK0PQui9W7rl6/tfP/yugCtwe164BBWcWkNg1U5nMv0XMSo2+HLsW3tdBsBUB5gRGif82qPMgVw1oNHxtzTFldaUd2USJSlYghFWZalKopSSUlaAxBnLOA8CAQXIgpDHgiNEEVBHEetNGmk0Xi6mM4WjSTkis2KQnDebKST2VKqAt22KUrpLC+5wDAMm3EiBEcAKdV0nmmto1A4W8fk1YAjk+i2M2VMCM4519rmscIg3FlbW5ZFTsS5oEBESdJqtjqdPiCejofnozMUQgRBxCJu8w22poUxdrnb2u0081Ii4neW+bSUgzQBikZSPl9Ot6L+7e3rnTjI5tOjg30M0pBjK42iQORSLubL0+EoEOLq5ctJo5PEyWQ8bLQ661uXe4N1Yedq9fKCNirerOIXQjAAQtzY3P7at/4SkCpm492NQRxHj
Uaj1UgbaQMZV7Icj0bLPAsCZsyDRqPBGaSNZpAkzWY7abYE4xwZFyIKglme//aHn/3kycuTyYITYRyp9c0pD9rt5n//3s3X20mplNlmwCfdq4Q4QR0ZxrmhVx7ZUYcsur0U7ctlgcgyY5Vk8rGnyktf5Ul0G+RSVUxSJQQsgy7zXEpVFEUpldndXQNxxIAzxtBYB4yxgPM0EEkUCs4JSCqSSgWCM4b5oljmRRoFSRwJIUaT2TIr7OITTZokAYtjJCLSVCqV53lRSkRMQtFI4rJURb4EAEWglUa3mC4RnHE2XWStNBGCL/OCcx7HsSY9nkwyrUnwZpJmSi2yBRCAkkoVSpZSlrIouQlMuIcy2NAbYhQGdze7m610nOVlqf7J88OfnZzvRmEqdNqg87JoBfHV196+eecttZxl80mRL7gItKKz8ZiLsNVpB2EcRclWu9dsdZqtVhIFRHY3NT9y5Ha7MMoO7eNNkDGWLeZK69u3X0uS9MWTB0mcaFVMpzPGxXyZtZqN8XisVTGbjJZ5SURRFDXTWCpM291mpxuHIRdhFISc80zKF5PpD+89/MGnD2WhkzSlNC2TRrvVvNZq/N07V262wlxKR2UWMpqIedq6sBWtz0z6b5zC9Y9hJecYgUelx1dlJfhtkMDFmHzk0yWrajEpD0p7fMXmKMwazbKUiohzzpnJypvHwHDGWKfZ6HU7jKHWWkq1zPLzoWy3Gv1ui3OuNCVpI4rj8WQ6mmVpyFuNJMuK5TIDIrO7PCJxZER6Os+WeaGUCoQIuQBAxrjSpVkehVpLpYxzKBii4IJxRRBFYRRFWmcAtMwWw8kUlN7udiMRnI9Gs8WCtO51uuvrWyJMSOvFcg4MwSpaBAROYJ6hDAAILA3DJAg3W/E8K7+fF5+ejz4cTr53aTvH8P5k8uenZ3ca6dfWumtR0G302o02B5RKNjt9IijypeA8jJNGo9Fstkw4QrnHibo13NVScbtyABFAm122GQBqzRgDoss371y5+fpyMjw6Otg/OjkfnXXH55PpjDMYjsen5+Oz88mdm5caEev0NgbdfqPZYkIwwNPZ/OcvDp+eT6bz7Gg0K1mkG0L3BnGv/feu/f/o+rMlyZIkSxDjRUTupott7h7uHhG5VVZWVVNXD6iHgBnQPIBo3gEC5g34BvwM/gGfgEcQDdEsBAwGQC9V1VWdlZWRkRG+26aqdxMRZsaD3KtmkdUwD4rwMFPT5V4WXg4fPvzVby63r9oqIEiS9eaDmZZJtMXqcO2jL4sCz5s5F2P5qWtdqypbS6G14F8fDwa6eoInU376zvk5f5oUnL8DzxOJ818QnIhIFjNTs0DUhjrllLNUVajrqqnrF9f73XYbY+z7fhjHw6mPMaOmytN2u62rqmubKoTHY98Pp6E377jUirLMlhISZRFEFDNEcMQIqAgFUlYDdq5sMiRGWUIFRrXDNAHYPrZVCNuu8T48nPrb02lb1QTWD0dE2m02ZspIJprGU0ZsmdM8ReYybksMZeaPCNakHAHRgdvU9F+8vm48/4eH056pYni92VxO+RZgBP9/+3h/ezr8V9fbb7rmZ7uNxWkae4QyEs1t1zVN45gZ0NCyloKddP0qZZ/kDIgqkkUkpTgODjE4V4y18lVVN5dXN9dffX3z6cPf/Nv/+e7T+2GaP9097rb7f/WvfnE6HrZtuHlx02yvBoPvP36eRFOW//jpy8e70+NxyEiz97rZpN1l1TX/21+9/a+/voKyurTcZX0yBV3ReAB4GkxbvSKep92f5G7OXPlnCOjq7J5DRmuc/4krPLtMePZry/fXTtEZXIVnwef54x2AiSkCsFkZ4wRGw+S93282222nhnePB0fETHlSUfEes8rh2BvA5QX1g51Oxya40XHfD+OkCxRa8ClA77z33gePzI6dihASe1cXWhoiOTYBImYkyQIAjGgpZQBm6scxZgnBv7i63LTtpqpCcOTctt2GUDVVU9WNCyFUlfM+isxTX9eNI1LNBm7dbA6FUbMm6GYITLRv6v/q65f/xVdXWXXMcjvEl7X/3eMJQP9st/t3D4f/6x8+/+cvb/5Pl9dX2yrnzM5V7abbbB17Wq8yEeGzNU5FNa3cGCJWMBWdx+n2y6fD423p28XH++bVW2YHgCbK7Jqmy+S7y5ff/Hzz1z5c7ff7/X6O0zRNYvBpmv/m999/Oc1D1MeYT1GOx5O1Lex2v7y5+K+/+UoRX7Xh266SmP/UJnAdVl/xyZ/8+MlOFgqIlYJqJcHhefao6CSWeLBu9LRzzf+svD9nq+dS60986tkuz+ntT9/RU4MU/+pf/2c5JiYyBO/c5cW2bmoiRMNhTrV3SbKZtHUVHKuqc9y0TUwionUVNm1jAHePh3GY+2Ho+2GcYoEARQSJC2Llq+Cdd84BgKkiADM7z6I6z6l4NRGZVdXAOdfWFTNVzjd1aLrWISFTHcK2bes6XOwvjN2YIhEz8367vb56udtdOKJxGoi56XZVVbVt530IvmJHRMjEZwNVW+5ByYzLPTKzJDLFPMQ4xKRAd1P8/eH4bVv9q9cvGMQMkNAAq1D5wgssI6qIolo0L8qstYquECOISBynT58+/n//5t9++PTx19+8veyaerO/fP1t8JVzzsyyZCISyTnO89j3p+NxOM2nByb7YdB/8/EYqtrU/vD+8w+Pj7jZNCFQ8JuXL9/suv/ml6/f1F5SFFEp87PnvO7ssZ6F1zOQRM8IdIsRPWHvBbSk1XHqc2dXhnIXaPOnVCZ4eob15VZ5nMX6n9niUm+cS66fBv6C3jsCLLl9XfuuK+0ev9+0TdM8noZxGAwElFUkqjim4KpN2zZ1gwgx53kaJaUUo2pi1F1XNZWTnOc5ncZY3piqxikmSiXfR3a2Fl5N8KYwx5iLwiIAOmbHSOSd984F7755cSkKv3v3IaesKlGaUNUxZwDYb3fsmENQYjH17HeXN95XROwcEzsih+eW7/r5yxxp+aYhlk0CpYBlotr7XVPFlGeRbeWvKhdTfJimi6aufEkVClP+qVwoUr9cemNyFrOyJTE1SPP48e//X7d/92/VwH/zutle7l++9t4jURl7LLO1atb3p9PDl9PxkCRfv/rmwxDf3304jPnd5499TC+ur/7Ft9/sN81fv9i/6OoXTbP1bCoxRhF53tdZzcbwJzd+ocqXHy5nSHUZdysx2sDOwNCyz1rPiaeZrH3IpTyHM5z6HMuEp9rombU+Xf9ztPmJZf8zVwoATiQnySBgYESsBsd+fDz0Vxe77Xaz23aS0zBM0zzN0zTFpKpJtKlHQ1DReR4rR7vWU+sB2pRlnOLjcZxjVkAFJUTnnBGqaYxxRGyauqoCIqpaElsELYiyqWNum7oIh1TEjsv7mUqqAGYxZdPps936ENq27XP2RNL3OUvgV5tu50MFxETsXXDkaBlQwMUsVw+6gEErOL36FaOl9c8VEHEOTJvKl4iDVPaewRnVFhE7a/QjYvmkZjFGU5Wph+lR5lHjIMPjXj/u6/z6z//zb37zL0PbMTsEMtOcc3knagqI7XbXdJuL
sf90OPy3v/3+//39+3q333399n9/uctgPvhf77tvL3YOLcdoBiJalpg991tncOccggFWJQZTPa9dsfWzI61UtMKvo3WebEWOFttRe25ATz96+tLnNvccpTprkDxXznkOfP4JDQ8XSVCX8qKxkFIepgkASgU/9P23377pmqv9xe5yv308HL58kfEY72NOd0cz846DZ+/Z2uqi6rZdE0J4PI2H4TaLFIUwAiBmdmwAqiYqZWJE1Zo6AELOOaU0pVQkx5tQX2833jskSillk02oTsN0HMZ+HMcs+67dbToX2NfVwzTaPF1d3wR2TG7OklL0VeXY+TX4nj/tchWe2eh6J88/X442AhoCOfTK3rkCo5R9dilLVq29R1wWlqrJOW1YdH2ZQwiZSMbD9Om3px9/9/7LXR/zv/njY/fmN/+rv/5ft123CAIAGigVGFi15ImE+DjFv789pZR7F/76z3/19dU+OPdmU99sujYEzy7OUxIlHyQmXbEtldXPrUofsPpme/bZy1b1J5+3atTDsytiyz7jc320xnAzwJ9UXWcwAM4z9ef04lkqCQBL5loSvAJ0rK+x/tbyeFsxWlM1RGeiTOSYpKzEEgFEXwUzBdE5xpRTzHmeJmJu60BE45ymOXrvura53G+JcUpJhxiSjtMEZUAewExs4cIiOzYzzJhyUtMpRgCrghvneZ7nLALMRJTATjFuEKtAiuCRN03Tj9M0z/00j0kc04urfVVV4N1ls/E+7Npt8CGKfL6/q6qmabeeHRKKqYoxMCIy0Pkq/gnA8bxNcjZeW7zpMhZgBqpo2ZjAg0s5JZGCyzt2pfnMzFYmxBEBkMi53Qv38/9y5lcP+fcc/P/mX3/9s5//AgBNFRxIzmpFLVBzFjMTk2FO/3D3aEg/TvGvr/f/y59/TQWmAphE5iwofUQKvvLBm6mrUEcxMwNDwpVe/FMYp9xwNABewaDF2tYEAJ9V6XbGJ88t+OfX6yz6dfZ/9icQ0k/h0jPXCcpg8bNffMqPn52f9VeWH2HZ8uEdt7VPYqfTwGaGVFV12zZZdZrGmCXGRGg5ZUKovGubOmZzjKI2xtQ1gYiGcYoTxJRiytOc5pimKABAqsEbm9ZV1bW1qU0xBc8IeOwnUa2r0BJlNTXrnHeAKee6Cl9dXY7T/HDozbQO4fbYm0jtvahNKVUDfH/3eJqnfde9ur4JVT2mNEyjmb568bqqayT2zpV1dZIVmJAUFjbikxN9ss4ycrAe65VSbss1ZPToTDVlISRHJiIAEHMyfXq24reZCMBE1MjdfPvr/auv52EAg/F0ZOeWBjRYTinOkwHY0moWVf1q01SM/+U3r4go5bTyiC1QYdMyqsVpBEQXAjnPdQ1xKpsqFmVqhCXXXM+gLYNrslok6MpFUoCVybSCSvj0uz8x8sUB/7QGen7az50nXSdCi8iSrvSZf+Yd1uPzU1+7Pn+5I66E/zHKPM9JBAG9c+wYAULw3vuH4ynH6JhjKuFYg/cGIIRiMIxzP4ybtmKmrJYVH09zP8aYxMBKnatmKcasmiQTUhGzGeY4zjMRBl8jUZaY1Vow59gAKu+C9+OcssygOsWoYNttVznu+8EBTTDdno51VZ/6McDd5X6bU/qcZ5QkOb1587Ou3SCgmGpKhAjgGdBIi6h5cRN/0pb8yd9XCHAx3GJUSyWUAICZk2QoU+OmKUvKOaYcswDiRddsqkBoKpkBqiqkmFRyShEAytCgmJUiScHMhBBaz7vGm9nYH6umY3YiYqZEzAgqeTktiAYWx9Fs4KqqNjs5PKhKoamVxPOc0i1TFiWs/0n+95wyt4AbS2IA//zrmRn9NBNdje9pz9ZPSqP1kPynn/L5v89/ef4qLqacsxiAqjCRD2G/7TabRszu7h9TakwtiyCAqExTfDgOjOS9qypfV5UBJDWiZrvZpKxTfDRAVSOipmlg3XNvavMcU4zBeefdlFKMkRCDrwxwmOMcUx1C19R1HXKW0zAOc3JMSKhIWa1mDoSFGHCcJu/4zeVVUzfvT4/34+Cb6mevvjLmKcXD2L+Ic/SO1ZUYXSSYy709ry1dGymFX4uwEnDWS1pu6PMEoCwsKQ4TdV1wSoimwoxmnFUt5ynGL6ee2P3m5XXwFYdwru6ziKjYIseiAkk1g2RclpOjipRIG8eTbzbsHIBJnJmdcz6ntBbLiIg5x3gY0bmq3Uz9oUDITzf7nGqDncP6E1b0nAXyVPuvlfXT7z5Z0p8Y0/l4P9n3swv4E2N99n179rpPYf1cUK1GD4tLB7eoQSJ67+u6Ct61XV1V1TzncZol5xB8FeoquHg4nYZJsjjHAJpFUspV8G1dEWJKaYoppdw1VYoxiTJRUwUgGscxQraUVCwDFpkxQHDeOe9hcfJIRGoQYxpjUpFN21ShBbBxijGnnEWdtCH4uvrx3YfrrmWik6S2aVTs/vHovPuLX/z6Zbc9jv0/ffcPX7/52f7iCpC888ysYJBVCQtCdMaqEZD/tMW8LHt8usSLk1lqj+W/ts5IFAo5oGOuAUBVVP7N3eGHMX11e/jXl1002FeVI9xXTGZieppmBCPNc85q0PJyYM7VNxJZzpojgnnndZ41ZyNi5zUnWyoMW1DXcc4p1t12Hk45zmcDWe7xWiTBuj4LdNlZ9RRe4dkne2ay+J+quP/ETS5G9vxsPy/V7Se9ejv/FgA8yTTj+QiVN4DP2vSufKtMZgbvEbE/DcfjKGpMOCC8vL7Ybbo5ZyTynsvT5Gzeg6jGlEx1inHTNuQcEW26RtQAUUQNQHImxMp7QkwxqSqTg7KHE7BspSGzxlPFfOz7snq3qiomFlm22agIM9VVcMEzQMscQuhTvOz2L/b7u/7YbS7fvv4659Q/3k45TYCB3t3df3n56u3F/jrGRJSpiEgWimpBU5jOF3ptsix1/p/6jKcjjcvC6me3BwGQEM3YqPb+b+6O//Oh90T/+Hh4xfr9FBX5v388/pfb6v/w9gbRpjkxU60xiCA7pIAKBEiAyCQigIjOaRZEyoAUvOUMZiqZvJeUVESl7K5SAMhxHnKuuo2Z5TQv1c5P4/jySc8+71nyd/aKf4KcP/eUPwEvn2OZy3qPhS5z9n/nFupT8vDc0FcsFs7Txk+3oPxweRtODcCUVFRojlHVUs7MXFUB0AFAVmWmCh0SNnXtKM4x5ZwIzZTBOyIGoKyGWUSECTdd44NDs3GK4zSnqGZlVcgCthXV3PJeU85ZNXhfVX7KmR233hPgNM2gIqJTTF1dt22zbdu2CneHY+29ZiEEVL3aX/zFr//ix88f//7v/v2ubX/17c+96qfbL/3D/as3X19fv5qnYSZyzjdVretlLUwSXdZFPiVIhShz3jyGz6KkWfETVnrramWcwyGAmBVWChL8zcPx//JPP/51G+7NDlG/3rV/cbX9NMt71e/j9MMw/aL1O4ZCMDRENCWwlShgZlbu9NJKIFLJ7JyAGJS9P8Leq6lGUS1NAUNEVZmHU2g6U805PYvuTyX2k6vDn9aK/0nk/HmOuJjPfyIHff7M9uxXllPx01ThbNn
2TGvkWSkKz3xBeaQ6RhOFnDVJpDkBoqFtgru62LVNHTyHEGLO85xyygjqHBfQNKs5gizq1LZVqOsq5ixJh3E2haapnHMGs5mVTiCYLSo/qp658N3LB2UiAFC1yrlN2wKCaFl1h0l0ivPN5cXLywsxm2OapjmpxpxCcC9vXjSb7T/8/rcPDw9MBETv775c7y9ev3rddFvR/Ic//Pbm5evrq1d1qHNOMUUicqX1icDEBaImwmXh5wqYn1O05YIut2BZn6cqktMiwAQIVNaRAiK83jT/x29fvQ58Efz7ab5pm5ppV6X/c3s5ioGJrt2dxVkgoxkSnVsva263JMdIBGbIbJKt1CIGRFSq/udmJDlP/bFqOjU9Q/e4OrY/MaxzWY3PCEf/3PLKd/SntOXnR/d8ifSnRmk/bWL95JntKcdYar4/qczMEMpaV3DnF1109wi9D0Qcgt/tt4FJVO4eDsdTn1M2FdWi5oPM1DbVHGWe4/3DQbRjQkfo2naeYozJnOaUwZQIELng82qaUo6iVfCOfYwp5VzsJYvsmrYK7jTNlXNNVT2cTuM0eefrphHVEHwWNQBP1Hbt22++3u92f/junwjx1cVV17ZieuoHR3R9efPhy/v+eHzx4tU4jff3n0+Hu+BDaNqqarNkMHPOMxNTEThjWIC6Mz0NaCmI1z2OCudpXVVBRCKGBa0D1AWp/qpr/3c/r2LOIvLtrs0iamaKBNCSLroyZb+UKi2eA1cMdgUOVo9ipoisJsxcFqmBQc7RsFRsT1VR+YuKTMOpbrppONlPgzucyyBYq/pnEdz+U2X4swACf2Jnz//yp988pxPPDH1F4J8euqYNcnamuLQMbM2jABDc+vaW9xB8aJqanTv1Y9vU3LUx6zTFOCciZOcsC6oF75goxtSfBhGt6goYL3abtq6c901dDcMwxwQI3jvvuESWmJJkFVEtRSuloj8OZoTIxMMcYxZX+kDEu64jJMd0s992XWuAn77cmhoEt7m8PPVj//iYRXzw1xcXTHh7f5hVfFXffvngiH/1q9+4qvnx3R//zbvvX7/86te//ksFNDXvfZYMgN65um7qwGAoubD40QyQjAgXPbbiHmRZAl0KbVVl9lg2qpSiCklsWUdmSMSsYAWHN8kgGQqOCbIM+C5ZAcIiH702s57d0eI/EUFFDQnLqkIEKEGc6LkHPduf5jyPg6uqOI5PdnNO+NZHPuXW///tcgm16/88fy148r7rz/75V7HdgsuuvBRcCqeVFbA8/RpU7CyKvLycqyqfs2jZjVTOlSoaSJbbu8eYEhEBYt02lXdt2xDRPI2HY386DZIFCduqqqvQeN9WdajCNM0q4p2TnHdd470Dg893j9OcVMqlpuBdCL7yzsxSFlObcpbj0Xned93Fxa5tmxhzziCqlXfsuGnqw3E49H1m3G12P75//2q7/ebtm8/39xfdZrvpTv345uXLm5tX4zwhNGZ693D/3bu/ufv04e3bt7uLq7svn9i5bbcLdU3OO18hoimoWRkUJuaiRBy8N3zyNabPTz5kk1ITMJKBZRUEICRFxKJVbFr0xtY7sCwJBwAgWgzCAEyRGJjOFrkUYmaABEiw2C8qgErGspDKVERyTshsa+b3VHQDGEAxXx9CnOcla1yhh+dJ5BKF4ZmbLAH9mcnCszLo3Jz8yZIk/BPztBU7sNUfnst8xKcmp9kqg7OEnnOGQ7gUS6aA4Lz3gGh6pt8Alu2DoNM0pRQ3XXtztd9tuxBCU4WU8+3dwxzTiupp8ME5J2qH44l6RMKymogdq9g0pSmmw3EYp5glI1IdQlUHJEQEWYhAmFXnaa4tBBdP/TjOaY5RRMukxDzHUz9mkV99++0k+o/ffQcASNQPQ/But93WVY1I7WafVU3leHzo2s398XD7+cMvfvHLN1+9/fD+B0kpSmqr+vLi6ubm5eWLr3KmmCMiiuSStIhKW7dmy+7dP7l5VpryOT3P/XPORGRPHnC5E8GxY55jMY9VMw5gqYfKUB0TIi0M6oWvoaZGCFguEK76mrYM8Zb9vJJTGbnS8zv7aUSO8xTqmoj1zDwqkO0/QydW612LFvhJunnuaCxpYvGChGtg/6n80tmnmhmu45qr08ZlKRIuj1yxvifnamsf6+ltoWu7ulz1OeaSJprqtq1D8LcPh2makXC3bRihDc4AjqdhGoeS5xJzFcK2a4eYYsxNHerghnnuh2gAqtYP8xxTznkYx5SzAniCUnk6IAbKkMt+PAZMoimlw0mGYfSO67q+vLzo6qqua+e9iDrvFPHh/g7MJMs4Tv0wusrv2o4QQ1XFnIbTQXPabvauasLh4X/x1/+ZIf0//6f/cRrHum1eXl7P0/zv/+5vfv6LX445vXj5FgFOhwcFaJsu5bhpd6aljWmIBPh86LtsQxEwOCvbqMhiAbgkfLisMUFCzIpMWSEX+KZw+emcz3FRElyW/5bKHckB6vMArypLhgqmAMV9iirquvZgNZaz5yv/TvPsQtCoiya3mf6pr9OnxPeMjuvq+nBFJp8kZg2QymPO5rt0n3A1cQOAZ0UPnh21mpbjtiAUxZSfoCU9E/cKVLtEfbdtAiGa6mmIc4wiQgRq1rX1OM0jWOUYNR9PhxxHx6wpgWkd2Ll6jknEbh8OKQsRT3MkRFObYxYRAyiiWVNMKWezMpkEWS0vxFoBM0dUh2rKcY6gak1bFWYTMgfvnPPOcUppu9lcXVz88cN7NGDmvh9yzlHFJPXjeH152bXd8Xg4PD68ffMN180PP3y32+67uv3db//+ctMN3h2n+ePtF0d0eXGlCGL46dO7dDoS49ff/ll/ug9VVzTzUYt5aBlqOk9xmC5pJYADMFERlRKZVcW7gIi67ODFcqOXmT01tFX0dTW+NYiDwbLaGaDQn/mc262RDgFW9TmRnJJIXlb5LNT9n3j6c7sop0RMGvMadRcLXbHes/tciO5qRgC2KC0v7Bo4O0GDwhzFp3pr9ZAGy/rDAmuu45rL1g9bX6BYvhoiLU9sBgiGeB5GWf1qWbMEjtillMtnzCLjlBCyqJU6pq5CcCiSEbya5iRMtN20O9B+nB8lDykP46yiVRViygAQvKvqKqUEgDGbyawqRUFHwdCU1yn4kp+54EUFVStedm9Xwfvg99tt3TaO0Mz2uwvv/e3d7b5tQXTO6dgP/TgR4nbTVt61TTPM84/v37VNQ879+Mffd93m1YtXd7cf97vdfnfhvD+ejuPQf/funQb/6tXb4+HxH//hb9M8/+qXv9psvzDi5usLW+jwGgBKwnQe1zSzLLlI8SAWxExKTV6mPNTUMZuirtt3yx94qpiXe1lo+biMJwJqWT9ugIrIT9j1spUFz65pdWUl2cgiWsYTnu12sXKiYDFBU1Xisk3oyX8aPmWOCzVkhYMWuYViOSXaLvSi8jA8H4enHGEtXlbDXtBiWIujMztpSTRxbW0iAFKhv5XsG5ac1QxBTMGgsMUwpTzFPIwppaRmpdSum6YJbs7qxFqgqqpTzqhKYCnb4TQ9nsac1TEpEhFLlj6mXIfgPRIP45
xSyqkUFQtrwS/zzMhEazqiKaYk2QCaurrc7oCgaWoAPB5PhHh9eZEl398PouqcyyLBe+94itEHf+yH3/34fkr54vLCVC8urv7p9/8Yc7q8vLn7/FFNAen7H38AhJuLSzP75S9+WW13t18+f/jxu6ZpLi4uI8C7d38M7FzVtE2LzE23sWZjZt45Ji7WqaYxpUIzWLM+IEIClGWUv2x4fAq7Kc2LldiiGgyICFRi8AK7qqy3r9w/QeSnnK/cZVNb3wPiUy/2yb5K/H1WERdvVbykFZBVZdGkJVp7Rc9tbX0xfIrQa2pamKZPFVUhNNia/xTLfNYNwqdGkT0p6BWPWV5DdS2zSule0tmnrHRx6mbm5pjAICbJWbxnx6hmIjpPc+UdVt4MuLCO2c1JYlJTGcY55cyIGUBFDDCmOIxCCETglqkIK8a3MM8BHbF3jpmYmZ3LWaA0UQC9c9McY8oxRwyBs+Q0A8DlxRYMPn6+jTGG4AGw8h4BArHv6ix5irHThrxDw8vrm4+fP8U4v337zd/+/d+Nx9Nm2xnCOIw+BMt5f3n1+uVX96fTH7/7XV1XQz98+fLl17/5y5TTOM23/+7/4xG+evP2xatvDv5xv79SCYSYVyGgkuQtbgnRyrYcRFEEQJHs2CEgM+esxWmBKgEoMZqpChEXB1xIXmaiOWnO5D0RQ7loy7xlua1lcGjZdyo55ZSW8FeOBDwVFksNshbnZ1DIRBbB/GJuqrAKKp01vNbzsox+PquknhCf5VydE8dzYbg693ORtuTThmWOdAkBq+9dnhPxfC7O/twQwBTPTXkwxz7klMuZ8Y6JfEoiElU1paRgnjllnWOifhBTUTVVJuyampGkH89oHKEhUk55xsjOMaIZJpEygwtgIpJyxgLBSFYzx+ydA8CSpKac5yxM0k9T63zd1I/9+HgcmjrEnKecdm0nWaY4dV3TVvVp6KcYk1ntQ1XVHz59+u4P371+8+bd+x8Dgdbhy5fb4zDGlK4uLm5uXoSmiXP89PHd5Wbb1HWeY319w4C//aff37x49bd/8+9f3Vy7UP3dP/zdX/z6L+ebPlRN1bSIxMwhVLR0vJQdn+//Ot5UdAMyswOwLJJyWu9kyUORic20JJqw4FDFnSiqAvPKGIH1flrxj5qlCFLM0xjjbGoGhkaL5xFd8wAzKN3bpRhbK2wrKxFWwGyt/JDOQfrML1JYhOXxTBxerPVcSz0VW2cY9AwB4NnMivktaTTg+owAZ3r0E3awAvzFrksaUJ4QXdt1x+OxFGwpSXmrTCiCc8zTOHnuxjmmlLabpgqOmHxw6r2oVlVV1ZXkHLOM05xSVjUkMjBRyTmrCiGWJxVVAJQ455x8Dt45dIwOF5U8R66umIgJGawJoWKXY+rnSbKodE3dAKOqzpr7GBsfvHdJNac8TdPd4/HDw+nv/ubfv7i5dsGneQ7OHY+37z5+BsQ6eCYapylnvX+4y9MUmsbMfvbNt7vdxfc/fG+it18+OUQR+b//9//dz968vf386eOHHy8vrtpu02z3l9evgg/2VJlClAwAniBpzpKYHBhkSSK5jFUgErrFJhQFiZdweXYPhEhc8FPNCZDYLbtBwNCQSxFpalaYpGnOklU0a8Yy0KxqJuUtrcjmGoqX/yx3XmHtkRWzL4iCySo3QmVH0rMVXUuye0aFDADLiponn7omjKV0On89++vK+i5+8qy2eH6Y6TM4dUEfdHXYAADgXl5fmsg4TM4bO5UsKWY1IGZE6vtJRDdtzVVIKRECCQpReWNE2LWBsD710xxz8OgcI2LhmKYsVoI7MSFhSkW3PIuKzoBQOzZRNW2qGj1TFlhTEwLo5ylnGebIiI/9OKVc12Gapl23ud7vg3Mh+K5rs5lmOfXDu8+fVPLP3ryZ+z7PM+93v/j5z/e73e3D48PD4fv37yNY03af3v+432xjjI7dj+/fnaZ5mqdfffv1x0+fLvbb4+Hw4mJfef/+4wfHdDw8MuLNzQvLOV1eV1VjAN4FE02aHbEQZ82mqiClrBbNZ0wRlq6IIdHCMTc833qThVpK7CRHS8kQFRwVfsBaPpeSSCTHeVqTJVBTNCqJqYE9MZUBDHTF/FeqK6LpQrrGRfprjcugVlQQbdnDsVq4rdH8yaDU7Mw8xuVUrMZoAAAKdtYggLM12+pX7VlDy85w/pKYPund2E8M3PXDmLJ47wAw5xycY6JpmhHNOQKAnHWaEzOfevXBVz6ktLAumqZqKp/FQuVfXu+KUZ7GeZzigkQQEaGI0uK6nxcQsQ5BTeMsaujUERKATTE2REwcswxTFMlKdNbDceyOpz6myETe+Vc319f7C+f83f3dzX7XEL37/Onjx89dXX9NvO82X79544Nnoq+Cv/7q9b/727853h/2m+7icq+q+/32TVV9/DiMw+C9+/Nf/uJ3f/je1Mbh+Ob1W+/dH77/Pkve7XaHu89//MM/fvXV21DVdbclYnauabeDSJJYh2ZJrtZUzJ6xKwzQ+VBcqaiZChgsfA4AAEUm0lUxgKQUX7AgB6o5l7nNlauiALrkkiq2SM2fsaaFS28ABEuHfXGLRYV2ibrP5RdtdbdwLnpsIcoAgKEaPBU9qGC0ppJrNQ7FjBfuz3OEoER5BSRAfXLpq9z4OUeys5ozrP6/fAw3p5QlL5mAihKGyiNBirnsPDJQIvRZ6qbyvmLnppjnrJ4UoSrLgOq6Ct6PU/z45Z6Jm7qOqdRGwMw5C1iZ6CU1LZ5D1KYYg/dokHJCRF9z27ZVFdqqhiW82ThKzoominjqh1I4b7t2s+kIoAr+5YuXMaVtW2fVx+NxmqY0zxcXF/v9TsGY+XK/u7m6urq8fv/p83g4jeOYyvQSIjPfPd4fjkdT/dW33xCiiDw8Hl6/vLm82H+5vWWmHz/effN2bJoBTaahlxQPD18Acbe9lJzqpkOirIKIiIxUGCRWEr6zUKgVEQQELHPlCFDUhnSBFYtmTvFJarks1pWcJGfNKeU0p0UxGBFVwExtAbkWrpGB4TKVtqCturi/kmguosVrP9UWSsC5vwolSVzWKZg+ddLOmWxxo2gA9KxFCSuqD2e/f3aF51T1qWlUMktcIfw1xqyyPKtnXio+QNc2NQMM3vXjpKpgSgjOOREtA8kIBGbB0aapxznOMYJBTjmBNXVu6ooQx5jnmEUtpkxmITCgiZbN2yriRAUkkyoYnrnXMSU1YyQQUJGYU1a5dpel7eoc++D6EdQ0CUA0MCOm4EIUeXg4pG13HMemP+23G2badJv9fv/u/TsQ9d41VfDMMUUD3HQbQ3q4v/3zX/383afPn7/cEyI693A4uo+fReTmcj/N0+Px0NYhuMu6rgqmO07zpm23bX08Pswxtu0WEKrgRfI8nZCpblrvAwCknAByQXKLTFcp1WHp1WBeEH7IOcOKqpSVZ7jeYBWFs+tVlRjnNOcUc86Sc85pwXdK+DRd88mSZS4pxNpKOtfpdvZYxZ2XDNcWMPQpzyy1O61Li5dAfi5h1r75Cvg/83bnBy4fakl8n2eoZ7D2J+XQ+hJrqHkq7
lf/C65t6qaqnHdlGKJoVDChGaQskkU0xRQBAMmFyoNqzFlzruoqqw1z9s4R4ek0MHNThxRT2SxjANbYFLzcH2QWWMDAp/kVMyNE9AgAIoKqWVQVLrab7abz7Jhy5TgwE7NjIoBd1znnHk9HMUCDz59vD8fDVzcv6rqAtm6/2e67TkRPp36co3fsvdteXH1490PbNiGEKc4Xm+7hcDwceyQCkctNV4fq/vEQvM+iZvrh05eu3f7+D98/PD4iwHd//GHT1k3TgGmOU1XtiB0Ano6HZnvhfAVYHKLmZM75pde5msqSc6qmnBaitCgYIDGVsXNVMy3cZFAoSnwqZTvIlGJUEQNVlbLg+sl4ylc5AWujG88LiwtOuVZPz1ua6xDdancGRsumw2clzLNhkTPiDmiwHBJYnxEBlzO5ErWW/AFxwUefg/p4zjaXRPWMiRWaCK4VfxnIdwgYKj/GOQRP2MwTD+M8z7OqFsQPzAiJmFJKpVt1GmdTBaS6aVmAHAFQzNliQiLvHSMGRwaYxLIXcmyTCZxR3+X9qoElZRVmt2areZhmDJ68I+eAqKqqrqouLnZJpAm+Cv7Uj4YbSXI8nk6E33792rHLOX/6/JmdQ4DPd/fjODKzAjim3WbXHw/7rmubGsF2m/bvf/u7vp9ELTgHYA+Hx3mevPfH03BztVcVIgJNc5xVJSV9eDyo6X5/pap395+nmK8uL8dpHMbeOc7TWLcbKrJTgCKZ2BXRKGIHBqpZl9TTJGV2zN5pXsY1i12W6geITKRMHeWcUoqSs4pITstocOGWPsGEz4J8ufOEQOuylzNgeW7hrKZ3JhQtRlh8KoIZnFuQ5ycv40IE+ATWL65vVRhbSiQ711SI+IT72/qEBbh66tsvD17fJpgV7VIDWPcuA7i7h8cXl7vah8izCSPzdtMh4jQnYfHgiLmqAiL1w9SDOWYfvCrPcybTTVsz0ZBSXdcxikj2zJuuEZFhnKY5TXNyzjFRybtyVitI4HI+IYuAgXMumxkii8qcBjftd7vdtkveXe33b1++OI6Dc3y13zl2D4fT6dRHScfTMCb5cHurWU7zlHPe1LUB5pya4Ku6mlM69KcUZxWbU2yrepymLNpt2kbyxXYzTdP9wwEM1OzXv/r5m5fXovrjhw/9ODimnOTL3UPK8g3xZnfx+eH++x/et93xuh+OD/fj0P8qi4lImhGZiFyoyHtyys6LZsrJl12jOSOAqgCgqjIW78kGqqLE3gBVcrFByTmLzNMoOYmKgiosY/gLYgMLfWNJDkzXOgNFtRTrstAAFoB2NaMFzCle8Mx2IwDQ83DSmm/YatxmaKBriVtwAFh75etbOhf7TzATLm7IbBlBO9t9yS8AziO25av0QRZ/utRPrj/1cZqK0qepwhzZcagqdkNNVU650CXnecpZidk7V6ux9yr57uHYT9FUg3dVFarAwyRZVdSISAEAiZkZ0TmHiIVejkWNSu3MNCvnuPTysmQ1lSyaUl2F0DZVFbKKY3bEaMjE2+2mbRvN8vv5h9M4xml+PB2naUazT6qO+XK/e3l9dbXfV1X17tOnH959GMZxGqcpxs1mc3Ox7do6xnT38LjtWlV7PPX/8i/+7Nc//yZJ/nJ39/s/vMtqwXERsDud+v/4u9+/fvPNTHA3zH/8dPtmnOd++N0//f7h0P+Lf/GXu92pcd774EPl62azu1yKJMkSah+qMqFR3F4pPBGBijwTqJosBZNIQftTimqaJUtKpcu5AqIFk8en0LuEzadypfjFUjTj+ohFDRPXqLo6UzrDkKuV6FK2r89cCCCLn1vLdizmpE91efG0uKpEPOWWa1qwUpFLurxWTctvnmcVzxZ+fofucOwRkZ3bbjehrjeAMaWYkndeJINBzjmmVAZ3QEUEY85d8ORDyhJPvWNOIkDUNaEKDgBENKoycxVIRZBwKTOXiqFcbQAQQkQiJSRgIlRRRqir0FbVNM0GVoUw0BgcAeAUo6l556qqIsLD6ThM09QP4zAeTv0cIwIQU11Vl7utqX748tkUvOP9pr29uz+eeud407XHcVK1u8cjItZVYObf/Opn37x9lVX7/jSOU9s20zSpalWHn3/z1ePhlFL+7rt/+vWv/+z1zXXvXR5Or66vQWZCPTw+quTBeQS4vLpxOTG7qttKnGGdDXI+lFOoJmBYpmZIlZgRwETLP5JzyjHnJJI155K72iKV94SFLzQ5WPpOVhL6c1xagRxcCpW1Vl6yghUtX/PjJ/LA8s3FQa6Gu/yIFnMtAfgpjUQAKmqMT4CnmgEtT7u8UTsH8lIuPWvCrVIrT19rgQ8A4C4udmYmIsd+qELYdM2ph6oK3rmc8zQnG0a30BwppVy0FOdpZudCVQXvCE2TpJTDflNVVYzJzBoXqiocjv04jEzkvXPOaZaY5QlaKDnVgtkSoTMEQyTmdrfpp/n21Hsam354PBx98HVdBWIi6odhjvHj7e2HT1/iNCPAZtN12K2QDc4x/fHDx92m27TtOM9meLHfFRIwGXji+2M/TXPbNsMw5ZxSSh8+fd51fcy5CuHNq+vjqf98+2gizru//LOfuVCJ5q3XVz//um1+8/B4C0D7XdvUVdd1VdUBkqo2XRdCVS6tr2qRhEhZMwiBmWp+DkOrgmVcrAELyJdzjDFFyVlywem0cMll1YkodcxSzdjzruMTxoxr4vjMFa1hdGXCwTlJffKAK3wL51D/pDuiazuyhLtzeSSqRnp2eM/B1vIiq09VtIXpjE/nxOA55o9wttq1HWauqsIcU1bLcwQDFQak66urOSbJqZqjqmAi7xiJYZxySmXpAswREL1z45xEJGdhpqqqADDnxIRzyqdhGueUk4TgHbsZ4notjBANsaDRSzWmQkQG1o/jdd7dXOxzzsfHw+HY18G/uNzXIUwx5SzjPKmB937bNBLCzdXFVy9fxpSmeQ5VuH849McTAoro4XgChMv97mLXfazC7eHYz1OcYxX8zdVFP8XPd/c5JQEL3n/4+GXTdduuIYK2DteXm4vt5jRM96fx5rq52F82bRecC1Xz6tXbaRpD1RFR1zXeVz7UWYS8D3UDCCKJfe1DU6aZJMUzplNOJROtJfJiZCnFeZ5yTinOWfLC7Suw8XqMAQHLHCkUKchFXn7BiZ5F1hX3gZWuX6x3VWWCBQJavO0y6AwGIEs9DmcjO2eZZ5s7HzMzW6ux5bsrkADnR5UibMlxVdfE5KliKz+n5+gVLpwqBXBfvtwX6K4KgRDLCJvkNKecY0pZnQ/OOWYGM8mLuLo6ZmIQ7U89EGYRJsxZ5zQwYfC+9IHGOYpoFoljTHOUp+0CS9auxXmUTTMZkAwAR+2/Tz++fHndOj8hSZ7NLGdFxNMwHIdBTV9eXKRpnqa5qcOc8vtPnzdt5Z0jtcttGxinOToCIqqrSrMchzGrff3VKwNQyXUIj4c+xlxX4WGacpYvtw93D4df/2JTRMlF01cvLt+++urz7e39f/zu8XBwRDlLyrcvbubLy6uUzYUgpmIIgM75qm7jPKZ5qrqtgcV5cL5iZjPIKSKU
FbumqpqTEbNzCGgGOc4xznGeYpxFpFTutvjNglCVuLoMnVpxoIqIQLROL2HZ/XkGbQyfxM/PsXaxicV+Vj+p68rG59+FMxp1xrJWu7GnX129/9k/r8H8jGqd04EltVv94zn9XTz36pLPlo2lSLp/OLRNBUhZlByLmpo6orZpBrOqot2uVbWc0jRH0QBgCuYBJEvOqeQ+KiohnPpxTtkxbdraOY4xoYGYzTHGORaWl5oRoq3Y7FNubgaqiECqHjGLHA595R2IJJE5JT+ObgieMHjHzJu2Gab58mL/9tWN815UxxjfffycYu7aZk4JweZ5ZuK80cv9znl3Xe03m1ZE+3749OXu4dg7x8H7m+vLeYrvjp/3280U09bo4Xh88+r6+upaALJq3VSmmtIc4yhG//j77/D7P75586aqq+DcjDEAzIC82YWmi8OJ5yk0XZY5zgORK+PzKmIay9U3tSxJRdh5IATElOI4DnGeYs7FenC5m1qwQTUjpNV/FUcMJfIyGhHpMhp65rydgaTVONdBElsodcsD1yQBV+7egkQ+C/5P7lCfynA4t9SfQU0FSyhHyM5euCQNYmAL4HD+3urjn3lcRCwCz+XzuJzzMFNTV4QUY1K1ENz+4qJtG2Ym1N1mk7I8HI7kfFXXh8MRwBLxbHNK2cxMFh6dqG27pm6qlPIwziK5TNHTAh0rITgmM1DVYqnlzZZ1RmoGZRmI6q7pri8vMthpmmIWFT0+HEikaZqL7Wbb1Z65q6td11R1NU2zmKWY2rpJHO8eHvt+KNhKVVf7lKtQpZSHND8cjllknOZQ+b/+q18Tu3fv3t3dH+aUuqaOKZ/6sR+GaU43lxfHfvhyf59S8j7c3h8u99s5SYrx4XD6/t2H+8fji5c3F5vN9eVlcEFyStPgQx1cmKcBCR25lEQtRxNids6X1vpCvlvmmRSZUoyScwkvjssoMzAZAIGKSjYreBMCMpArK1TtfEtLHFomTc7R3tbYuth7yQfWNvqS9i1i5uXYlEJ29R3FVlbyGwKYLK5x0T5eQACzsoDG1mhNhnbuRuFakRlgoRCUiqPwVdYhu5KcnNNoOD8MwJlZnEYH5p0rWgSqELxXEQCLMQ/j1DT1brOZY5rnOE2zmbJXAxQdC26SJZ81VVSUEERV1VKSmFIxU1szKaRS1Zdc6Xy9lgyKCrFScn/q1UxyNsmaZeyzY2q69hTj4dOn2/uqCt4xA+ChH+7uHx4eDzkmBGDvysXt2padG6f5+z/+yLxsjClMqz//5c9evbh+9+HjtutE9OXNRds0x37YbjY5p4sdiOT3Hz7fH45NXTFxCH6Omcl9fHy4eziM0/xPv/99jLHfbhxgW7fBeVVNcSq8u3kYoKpNNUsyU5mn4AM7j4CGRkhnEDFN4zT2c5wJlIJDMC4SlQAm2cSySrQsksyQ2NkSuBdsbknxCtKJiMXLrpno+bqW0snW4I2rB3sGkp9L7OdpACwgFyCu436rokLJ0Jb/eSZsuy5OXvnNuAjrruG7hHRd84zFwJfS6uyKz87LYUGFUqpyruttacE/HE+OKcZ0PJ2mOXYxHvspptzWoW3rUHlHNIzzPcI0zRqFkGLK0zhpzv2i407OOchyrtPVQAxMDVTWDKN8FDhf2uINRDVOcYDBED1yW9UnHUz0dOoR4fLiotk0fYxzSpumxhHuHx4fjsftpiuicG3bjvNUlj/d3d3NKRFi0zRmqmKIuOmaH9+9/3J/3zVN1zZNU7+8vnCOQ1Vd7PZm+k/ffTdHff/5lgjfvLz2jkNgJP/4eArOgxoZSMy3n28t57ap9/ttXQWwMig0W8k0cyRya3RGydHMnHOEfHZRojnGKaVZJRWGPhESIROqKJhoTpKipBlVaQGmzFZl01L02KIYa7zQ5sH0+TDqkvHbOadcSylEOI/4LsTNNXdcy6IVxcSlsb7a8FLvrdXQalII5yUjT8nkGX1aC6sSNLW8h7WHUI4BPU9YAQDRVVUoW89MlUodENPj4fjier9p64fjaZrjbtshoedl5crFdlN5R3g8nbyKmBo7z46z5Kyy6dqmrQlJ1UQtx5whQ1EgkDPndb1F5Qroyj1bpNUWzAERxKSufExuzDPEeDyail7InoJ37ACwH+fb+/vdZvOLb79x3h37/vHhMAzTaRglppwTMHn2cV42iLJ3SPTjx89fv35FiA+P8a//6s8v99txnrfdtm0aM3375qthmP/ww485yeHYbzdtXfm23d09HKYYv3r1IqbkGLfbdretm9qlFEXEOVs6OqXmMwU2QCgk7JJpmZloJnJLLqdqKqYCmkspC8CgIKo5JU3TPPXTNKlmACAkYkNEIzV0WhpAuojIF20INSurpwDPFQwUJfZSbi9bFQCWSgB04evgYnDnGuicGCxhzbQoQeMZJTzDorbknraIvwMtNXjJMBcm6Yqwltuu5ZlLfxSXE2Nr6xPOR8AhQJG9iCk9PByy6ByjY3e931Dt6+CHcZpj3jTN3f3jaRhNtR/GTdfKwrsBZhLRnLKIFNLUpmvbtp3mKKLa1OWNapaMshSaz5gLiFhS0sL/MUSYiySONW0NhQyKVD5MznI89dM0b7q2auq7xwfLUp7l3fuPx74fpinO8zRFyQkVyBECDFNvIuxcCKG28Jjz61cvguMvdw83lxfbrjv2PRiEEMTADC4url9c0bsPH/7w/buPn28/fbl7/dVN225fXl89+Idv3ry5utyd+tN+t7u62O/3F03TMTM8u7trI7d4NTAs1Fheot7iXiTlmNI8T4PmCCCBUAkFUFVTitM8zTGKKpqAWRlkgqzeBSVRK7RxBMDiYpTIMS1usHBSy0/hp2o2WCLVApESLCZki7jf01dxgGfwH5ah5CUZhRV4R4RC7afVny5eeWXzrYo9oAhPNCszxDO7dC29znDX0ogwV1c+JhymGLOcTj0RBe+cd5/vDs6dALAKYbE9xMq7LGoG/TAhQlNXkiXGJBJTSipCRNM0/uEPP4QqOO+rKtQhVI4fAFJKwXyClMSYyQdvBiK5iLMvufQyLqiSNKt657pNpyLOcVkYXCj6GezUD8McvePgXRXCMExfhrv+1BfB0VIpm5klAwAi8t674Nn7Kcab66vLy4vH43HbNt5zP/QAVldVERA9nfrNZkfO/ebPfqkqp+Px05f7FGOM88vr/X7bbLfbFzfXx9Oprpu6akJVBx8KVAfPv5aKBRdqE5GBETECIaKq5JzSNE7DaRgOKBlNlAwMRHLKaY5pSsuwHq2VSMlCk0cgBWQDVDVEcs45ZkQUNUYzKFL1CGcwfJlORrVi17hCAmcbKpXKgrgv9m2ABFqmSZ+sfCnRC7BQyCZlBmABws7wPuAa4RGL7hWU8aiSDyMY8FJAL2+3GPg5IzAAZ0jEzjlNKY0pEbNqBYBjnszMOb/fdo758XAcx8kH75xjxL4fYozOORd8VnXqYozFf8SYRWWc5q5r99sNIk0pOuamaUacsxqD1HW13+1EpO/7mBIIruSpBYBDNTM5HY5FWGae5mUAHEALImWJRXLE6H2Y0xzjNE3
lUBZ421YADgm9dyEE7zlLQqS2ae4eHtGgl3Ga59cvrq+vLh+Px4uL6+1mezjczfOYstvvL3/182/jNLbN+8dTXyCI3c2NITd123Z7ACPi0ujLOS83dVn+fTbLsscBl5mPp7TNUpofD7e3t58kjpAjqjgw1JzFprJ52kAAkZgJiR2wA3Jski0hKnPhvSPzqrhZcgRaQCiiMv1WOA+LEdDqHBcLWRuU5dqKLFjAcsitNIBWsGhFjojwJ2b0VKcvhlf+FITnjCzo2u4ssZtwmUQ/Aw8r/Ln2HswAwJUqpq4bQxz7QWOa5rnsCTbCbdfmXMUYqyqI6BzjNEUmSCnnnLNoXdelSxRzjlmKClfpMLHjKaby5bgMLmdiUtOU8zhOTVtvNpspzillSWmeY2k5AJGYMVoWmGNk50SfWhBqCgKFCI2ISXJCLnvEbf0qRo4AzAxExFRXfo5pnueqqt5/+Nh1m66p1ezt6xfMZAbOeSJMKW67/eF0PPWfr69e7PeXqe1Cu/ub//C3265pmrqqKnIVs/fOWyljVwlZK3xXeMJ3ibjIJgMRLNW3AnAJ+ExODWOKp4fHNE8oCSRbFgETRSAUIGXnmBw7YGVnzgGgslNmR2zeO2ZHVNZSo5qgQtn6DQaiBgDFDkqeVGyjEJ0WV1lo/kvpsoA+a7fp3M2Bc0JWCBVl+zzC8wnntY+12nGxxMX4DIpYry47P+BcTj0dl3VZyvmZiz92m+1mGMbgXV373abtx2nsRzNj79i5rmsN8dOX+xBcCKHt2hxjylKchKj1w1BOmveubZuUUk4JkIo+o5l1bTPOpKJoggjBOUJUUSYMzicUyskxo0HOIks9alBwXVRMqUiAr5XHCqcgEhMTmYGUr6KRuyLNpcvimJ3juqpSSnNKRpSLDDT7fhic4/1u808/fLi+GF6/vL5/uD/54367HafheDzst1vv64fD8XJ/9c3bt1Vdh1BhMS0kMyNiRDaRdTYI1y8oeQUgItHTfYClE5NklhTH8SQiTdXmNinQPPQx55yWvrshKRqiOmbvzcgoWxUMiUmNSJ0zREMEpmWrYVnnp1IolUtqsTCTDEUXUJwJrbQ0AQqdTBdjWqr2pUBZYKIzBL/8t0D/+GRb68+xwE1rG+n8W2t/vpwJwnPjdkVs16przd/h7IZNzeWcg3dNXTvPAKTwOI0TiAXvEanw5It8vZltKs9NdewHxOrUj9M05Tkuu7+c224753x/PB1Px5QTR8eurFxlJhYil5JlSWZECITTVFSbivSIOeaVPLMc3DKbz6p1FRAwznNSQUDHjr3zzi3D+yKmiiXpXvsiZZ0XM4vY8dQjknO8CF0RH09HU+u23cfPXx6Pp6EfwaypQxZ5+/p1XTUA+PhwD8j9NKvq26/eYMkg2RGxAZZhYCyrjvGcQS0V3QKcreopiIiFSwyQ8jScDsfj4zCcSvkfQiVSpG/IKKIsGm4lAOecUxbDDMijS84xO0fEwYeUXF1VWgVRVRe8d6svt3NvxgAJQeFp9ZGuE54GZdoTABbyUSlnVxTgGTj67IytBrloTZbwX6buyaiwLM4oTXmiUi+qAZ/j+YKDL667nOpl09cZDTVTMDcM4+V+d3mxH6bp0I8AEOp6HkYRIbY8ZzFlwipUwPzw8Ng2NSzPguycD5aHcZqmk0o1VdeXl+2my6ZxmmKKNFMIgYl16d4zB984NjNQjbnMziuY5YV3boBIRCKLoiQgECIQIyAHj+rK+u4SrpjZQJkpxlTaWk/HVoXJkXOBnUimBc5TNJCUsgp7j4D39w+FVgIAbVNnydOU/uLPfl5X9T/8/vsXVxddt5Wc5mm4uHyhgGZISLZQLSyLQEkkzplTwcdwEbl7MtBS7ZqJ5HEaHw8P0zRIWnOglGLMomjkAIyxiJRp0dMp4AgSZNGYiDiFEJiJjRVMDQiZCEWtnINyBdbeh5mZ5nOfU3Hl4S4AT0HG1ppmgTmXuaU1Pq9/oXNnaHGm628uHxlMQdath7ikp0tqscKquHbrF+9Lq/Nex51/0iZ1bV23TV2Gg2vvQURCdoRV8N4HVRnH6XTqcxqqugohlAaQijBzW9fZ+5Qz5wyIaY6fPn6qqsoHj8SQc8556HvnPRGBllm2SsxATSTHlJJkWxp5Aqti2RIW194SMzsiAUMjzwucR0TddrPpmqEfHh+PAKl8BFyPLQAyc9e1Taicd4QQY4opDf2QUyZEx06ylCGklNIwTqqKCN/98V0InhC+3B1ykr/6ixdd10hOMcW2u4BlldGCv5TUU02Zym5EMCvq+7S60dU4kYrcvHceF5AX1ExMk+gU8xSzKx0OAgUC5lLiOEQHAMSGTMSAWOAR74N3ntGZgUpWAiQnsDQwF3LQk5kiFEp4wSuhvJt1+LiEeIIiTrhy6Qvw9CdPhavBLvG4YOjwfBhkSQ7OOaYVrAuBVO1MeVk9CRhaliUxXds1CwiFAO50Og3jVFWhpGuA4IMn5pxzSqMPjp1j5yRLjFFVYyQEqOtAaCKSVcvAmkiWLPM8j+M0znPZRpTmmHN2zjFSUSZiJkRi76qqUgBOCcg0ZStjuGC2rLJ6ymxKsKtCUK8i6gCRuW6qtmmc91VTXyD1Y5jXKr74g65rry8v2bGqbruOGdVgHGfv3IFOOUuZKwcDEzsdTz74/W47x/jp9g4Ad9tOlU7DPE39zc01AJqh5BxCXZCgM+RpsHTY0bC4f1ySv2XestxGJFpq0zINl6OUoWRywQOYOWZYKjwzMzBrVKSUzwAA66aaEh/VRCQCZBEimh3XIYQQmBlpWaRbxgFWi1m9KqyHo8hKWonVpcO8QHLPQvozJVsAU80ATFTCMaxaOgvsClC6LeXt4dIseHqGwlx7KreeJHDO2Wtpi0KR/St+1h1PAxOlVNd1oIR1XbdN/Xg4ZREVTSKIGOqKgKZpHMdpnmd23OWmaZrC09l2Tds2p364u39ERPbujOIyMTKpyBCnsv7HETtmcux2O+9YqjpzFoNxGJd+nFkWwTUyAqD3vttunPcxxpyFwAxMVU/9oCervKur0HQtgKFaKZUMoGqqTdc1oRrGQVRR6auXNynLp+C6bXc69sMwlfbPlNLh8egrH95+dRrldOqnYazrutl0VxcX7z9+2e8urm++YhcQIOdYIratze51P5CtIX5B6UusffKhBoAgItM8HE/HYZhMs2d2zokCsqKCSjZAIvKOiYhpDdCqWXROOueckqSkZaifHXt23vlQOQAEJKfmPAIgE6rqUtEXGUe0M8t9yYJWMoQuNCV8yl9XJmTxEPhk5nYmSZfIvSp2L2nl2fqWMH6ePF4h0jUleMKVCjRTsl4pqmx6dtfgYoyFPDuNIzG/uHZNU1dVDYDGOsfEjJcX+7ZppmmepvH+/nEch5zySQfnuKlrBX/sx9MwZlVyblNVOeXCCmXvETHlPI5jnOeUEiLmlDWlTdddXF5cEt3dP07TjFjU/J8CQ/nQzOyrOlRB1nwsiw7zlFJm4q5ti9txhJuuDcxmFkplxuyD2293zNdgNs1z1zRf7u69921dN3UlYo
+Ph3mamCnGrFkOh1OOiRD7YUw5i9nrVy+ato0pppR8aFyockoqWVWLCvOStC2hcyVglJoDCYmw/FmRxSxpmqZpGud50pwyEzMjgGdCY2VCAMfkHTMioRGqqmQpUgk2RxmneZpTUkMi77hyvmsbH7rF+NY3Y7bARes0CBVDNANaS8lzwrrkhmsyivTUOoJnDm+trZfwVip9Lfsd1rGTUhKVfSlECGtdf75Q53xjQbbsHNwXX6uLR14AcWeqQCg5IyEjz3HGExX1wGma1KzyjWQZ57mq3HZ7fXmxu729/3x7P/QDmE1hqurKee+8++rFjfcOAI7H0+HYRxEHgER1CLvNJub0+fZuGkdVaZrGAMZ+IMdF/kpXbwRqsAKZzByqwAj9sTfQlCTFGQAIIDCHqvKeYVHzQDLwTN57JCoMnKaqEGyepqap37x6iQgpp5xzjOnFzY3m7BnnubnYb6c5fv5y//n2vqqqEKpxnlXUIXaBX7140XU7YlfKSheCZsqQACBLWsH5pewsi2myZAfOsXtyKsVPIBCiqmURFUmiKWVCgDIpgcBEROgIVvKxJs1lNE2WYZCUYoxziiKGaHW9aZqubdqm9j5474nZMRNS8VNJFBEcczG9BWrEMsX5NDEHS3WyVtxPE55WcgpbqylY4UnEFapcLLeUtlIaEfQshVgvzrn0WZ5HVns/5xKm5b6X5sGyZcVpFkV0zvu6IqTDcbh/OJYyuSBH3jkwSbMeHw/eu8uL3cuXN8j88cPnaZrmGHNKoQreezSrm1pEnONQBTUlYjAz1aqubjZXSPTly21/OhV4KIswYt00oQrjOK54w/IxHFHwftO2zrlhGKdhLBVVoTUG7x1RkUuAnCvvqxBCCNM0IlLXdV3XMNE0z/Mc1bRrm65tX1xfVp6dr6qq+nJ35xjJuYvd7uHxUNfh9vYRifb7bZmrRDQi9M6FqsaV/6U5mRkzgREbq503Fa29uaVaV5HM4IAYgGDBtJGQQwhV8EsdgyQqKcWUpTgSzxgdES40ZQQtLKEi4EpgnjARAFBVN9eX+6vLy65r6yoQO8eMS0wH0Gc7txft+iVNVls2/gkAAwAUKjSoWqmTnhzbiggsQX/NX5dLYWZarH11jUimKmvgXrzgs448IJTXWoGwdRh55Vkv4JSBrWssXCpnkWIScd4557qma5tWVUSyrZV1TGmcZxU5nfrdbnt9dQFmX77cl0gXU5pjmubYzrEKgZkckyn64FW0H8cxxg+fPhFQ1zRFZX2aJnbOORdUDWzZBFj6WgAIRs6Ftokip2FIMWph7gEQEhAZQFKriGrvfPBqdup7s4UC1o+jqrZNNc+RiB4PJwO4vkhVVbdtJ6qq0g+9qe27jhGPp1PlfV1X7Fzwvu8Hx1RVNTuva1Ve9GIRUFUk5wWct+X2LJo0ZT0Nu2X5UWksFXkDREQUyVkkppyyppyhCCESlQEFM9UEEyzlFyM4QiYEYkBgIiarA3vfuarebrcXu31T184xlm7S2iQoyh9rzAcRMUJCgpVTVMBzpsXmVBUBDQt+hmuGWlJDKDXeikYt7nRtiJRyTp87VFxnoJ8wPztDK2CmgMtIdHlWXeP+uV+1Ml4BAEprfUlmEdAxe+9TSkmSiuYspfh3TMGxmn78fHc69TfXV6pWVpoSczkxkvPpeBqY6rpGIiaqEZHRAIa+Tyk5dtc3N7s9jcMIZkRUhVCc5ZKp2DLwh0xMbGJjnHLZnarGRIwEjN57JvSePfESIAlNCxk5jcNoiNeXFzFGJAzB13Wlau8/fqmCB4C2qXbbXV3V9/ePP7z/xzcvr8tNuNhtx2l++9WrLHkcxhB8FSrnvAEsx7WEJDWRnKVMXUqpLXipaNAWZSxaoPD1sq9RTCTnQn5bq3oDQueY0HK24hRK+1TAMqJzzKxIXHzUZts1TVe3XdNsmNgARK103WHFtmzx6LYAQFjE5stWkKWTtLxyGRp+FnlXZZ3nRmaotLI/zUpEWEqcxSjPqNDaKQV4srznF2F9agPAkr+u8PCaBz+lHWYG4JquNTMmBgQkjCmZQRUCGJhomTrJEhM+QQ6H0zBMMyM758ocZhWCimYRx7zdds7xMEXnnJg4dteXF3NM4zSaQRUYknG3IQTHTEybrgXV27uHRcC/pDvEQCR54d2pmiOq6trMFDQE31Y1EBWjAUQ2E8RDjNM8qSgifkiprarNtvPeV1U1DFPOWQ3meXbOp5y7pn5xsZWcfv+HH0vNXIUQk1xc7HebTYqxbaqqqgpSAQCqYklK/00k55xNZalGS/FROptES9JmRcjWFJQIkDCnPMd5mKYpJjNzXGyFAMAxxag5WUqiIqBSArT3DsuoFmIIoa6but003ab0uuaUSyuOEN2yFIWR2GwVU16rt5I5rdUPIQCcx0oLE6kc86LcXswUccF6V31aW0eS1gpqrcbObSsoYuWGAMtQgC2WVox1IR8uQrmrehmeSSa2NEp1ZUUBOO89GjjnRBUQnOOyAfviYi8q93f30zQXvFBVJWfvXQh+nqQAwq6cfTAlc94XHlOZD4wpxZybGqsQ1Gn2vq1rAGDm64vtHNPd4+M0joExeD53BQ2AiOq2qdqNSpJBHTtw3rFDQgKoHTNREiEzds57n2IcYoICkZgCuyx5enione+H7fHUI5GpXWy3vGmrur7Y7w6nU07p4eGh6DOOczycTkm0qcLxeCTiU9+XzNI7t6ZlRXRDRUUkiyQwO7c0ccXt1wTLcBm2XMBtUxXN8zyfTofj6TBNk2fk4IJjIlQmBJ3mFMVkTmaKCHUVgiNfusXOhaqu267bdCE0YDjOsZ/GApYRQ+V929SVeS5COrbgU2fnhYigi8hCSaiLuxUtK/LQAAqlgdcGREHH1kFkNLBz1X8GgYvpPqt2llC8+kpb6qTzFuc1A3jiJp6FzVY28xktRURXVVXOuWkrIhrG2cxc8Ju23W46M/XO9f1wOBxTSrTsldK+H4vYFTOHEABBshhYVdXsuOxUMFNGaOo6eBdTTDHWwVd11TYVAHZN049TTDFOU8r58nKPf/zRkhaYm733oUIiSUrO1XUNZR5a1FSkrNIgMoQ4p2kcUxYog9VgubSuRQzAN/T4eHg8nqrgzSzn/On+7qub67sqTCldbDdTnNnxy1c3CDCneH9/qCp/fbkfpth1m03XFEJCqc2hcJeWFa6GAEVMmcjR+U6t0anYZXERJWcpGfYwDqfTqe+HYRiDI4SKqWJiJqq9l7Y2kQkkpVwElACM0AJD8BQCegZSmcdhmPNpGMseFeewCgwMmjkvK0aX3cl/stoLDASFkBQAlzZBUc43QaRnB2ydVILzx1JVe95jx2Wnk1mpulYlvYXdaQBl6mqZiDz3mFafuuSwazBfsodSjZ1fBQzcfr83zd65lAXnhACXu+1u2wGBGXnvmqYuxA4wkJTGcSqfIaakoqYaqsCOmcgxVcGbOTFIMZqoGey23a5rDsNoprvtlomHYTicTjnn3abbb1tVu9xvf/bN19//4YcSetA5ruumaUNd5ZQgSxZhIMeWhQpqLSlLz
lb2GqkhAJbtOUspBT6EJMoEdVWpKjGdTifn3f3D4fb+4c2rF3cim7bedt0U0+Vuy4QpppRyU1ebTVdVvGmb7bYtTQFmlsJaAss5F+9C5MyMaFkqe3Yr63V/lk2BmRbZkDnnWPRw1HSOyfFCzchqCOYYPXPZzEmgKiqE6ha37fM85jRFvevjnLIVAqggKpiKqqhktXPBsyy3KD6Scdl/XogjxfEr0ULFQxJEJHRET+gYPstDYdEQXz0gPCeL2ApqrqAbFDirLIFeGSnLA21F9Q20KE8U1kXpmp4helMDAueZfN2WOayri13BgcfpbrvtLi/2L64vd5sup/THD59jTKfj4fGRzCyLDv2gqnUd6roqSbMP3gCHfkgpzTGpaE65rcNXLy598HeHUxVC27bf//h+ntNm23775qZr24fD8eF4MoC6rlJMyNy0zdXl5W6z7cfx8Pg4xZTjrCKjiuZsa5timV5acnIzBTBBRCRkYgQUVXIMBC+vb64v9uW6DOPonDOAcY5EtNu4oR+2XUvsri53wzAZyPXl1W7bimRmMtUsUc0hoKioSokeJcU692bWlsgaAZc7eq6SDABSzvM898OUUkYERkCElHIpTHJKKSXJsuKV5lBBxRQkGZkKaB/jGGXKEAUNzDnHTLVDLmurwUTVs8Kic3jev2oAoFj2WayZcoEtzRCBkUqWSEAGkBHLB3ySOT1bGBaCvRGCyaq2gFhWJC4Z5EqJtDUrhVU3qlyeJROFFYGCFTpdW6ELpI8Iqm4YBvY+54ymTVNf7rfTFE/DAADOuU1bi+r98dTUgRGOJ+y6lhCO/dDUAQm994u5gOWcAVBUc1aE5Soc+/Hv/uN3dRUuL3ei+eFw3Gy64/ET4+by8qLttnMW5/zt7f2CZRC2TXNzdRXqKkp2jst1AijZn5oIwJPw3zIys+xEI1vYT+g8Bx9KzgAAPvhffPPm8dSf+rEKXlVD8Mdj/+7D59Mw1nVV1/XjYSCCUz+1zenm6pLYqaGa5pwQpOjDSE5lWXwxvTJBIWKAsJZTCE9Ts08VLgOC6uF0fHh8HMbRMXpytmx8N1NNKaeSAjJ7RgITQ2L0rmxN0WGYslq2wv6h4F1VcVfXPnjvGNk5dmKoBkxlXLgEFF1bkWVr3mo6q6vLGYAXGMzMSnOfzr0HW7n3sJZ9z5qU8LSgttjb8mC1Yr4L/LomrE8Zgy1lOwAA4wLJLWnnuco3AwB3d/+w6TZ1UyFSFu28u9l27dCkGA/HU6mKzFRFiHC36fp+VFMfqpJalFXe7JiZ4xxjEgRyjpwLRNS0bRHo+nz7cDgNL24uQqhev3q57br7x+P7D59fvLBt12mjl/vtl89f2LHfbC6/enV1c01IKaY4TvMc8zyLyOIP1kjxFDsQC4Nsyf+ICoQxp8jOVRjGOJ+G8f2nL01bf/Xyep5jGQkZpul0OmXRlHIV9NAPY9+/+eplynLs+8v9pffBlk2F8RzwVGXN64u/hjLHLuukh63wYBmjLc5DVIjIsYNSPs6as6uzD56dK9W+sYHzbCulgwAITAxMdU5SVoUTZCZ0zgcOm9pt2hCCRyIpI8tAWVF0aQqnnM/+HlciSzFTVaJis2vtwsxY8Jx1Azetqq5gcEZZl9q+2Fk5rCsmtQaQJX1d2vFrXnvOD0qHae3+rzmnlbwWzhSAAgE4RIwpMaP3PKf8+HiQ3KkaOvd4ON3fPVzstyEEBdtvWu8dO5YsOethmr1j7z0i+uB3m/Z4Gvq+B6BpNmZXVSGlHOdY7KYfhvoYXr9qHdmbV1fb7ebdpy9ZPn395vW267569fLz3cPx1Fd13Xjf1cGHahz6j2hzjDEWreEzjF/QNWLPBTMmIlMpooMqYiKi5h07A0Qkg9NpyFmuL7aaJKtuu3aSVFfe8/7T7f3j8RRCeHl9daxCSokRqhDmOHehcezmec650OaLyt+512K69GOYtJACFHGhjBGWaaSlri1Oixm99545qUrOmZAZUYAJ0TSJgCktjGcgMFQVkZg1GywUIjMkrIkdagBFzRINwLJhQmbnEZ0ZpKzTnOZ5nqa5GMfCGyd2jokZkFxZH+icqhYBVyIEoLN3LB+yMAxU4Tx2b0tBsxhSacevQFKh7q92CWVIfw36pa5XKLgBLV53wT9X77OKmxgAmquaJqV0OA2qaip1FSTrpmu2bcfE4zhOc5xTUrUUZ88MAHNMYsbMxBScA4QQfEqZHG93u5TSME0ppeAdmA7DmESmcWy7ZrvtRHWc5inGTdt2Tf3p9q5pmov9/vrq+uu3r999+ARMofJ1XV1fXV9fXbndxvi3h8+c+n6eJhEpnsuHyjHbKjKMgFkFwUA0RpBShaQ8i6YUTye32W3die/u7pu63m/boW8Ld+bVzeV+t5li3nTdHKdf//LbYZqiSDkEw9B37YaZNceYo4oyoxHTmkURAREvgt/ETA4KOYCWYdqCg5aStyT6wVMVWDSDFehOsJBcEZkwJpslg6krZsrMzlceagQzUEBH4GmZaYpqGrNaVjMgBjJnRIwiNk5pGMd+mPphlJQBrMwXOOaq8iEEdhzKDKRzxKyqwTtjXpC1QouHwpVHXuc0114RnuuedWPzMi1ccP6lolqxzhKun0DUdVxOn1WSxSJlzecXFMzAtU0DdfV4PPVDjwop5XGKt85tu3p/sa+rKqdU9qjePx4QoK6qzaa9vtzfml3utm3XTNNUenebribCH999FBHnXFZrm4adi/MsOeck4zTvdtvjaXCOHKIjAKQ/vnv/9ds3N69fzznNcxzH2aY4joOjFzeXV5f7i6tu8x9++49fPnw8HQ5xHDVnAHDeNXXdVMHMspqKOCYDO51OAgCZVLKplMS/ClUc43GeEZEc3z9WF9tNEmnbpvIupkzMnum2H+I8v371YpynOaamcobSDz2W/cEGZpbFQJSf5jYL/5KXze8/BfQWwOlZyVLgd8ccfCBUhwAAOWsZYxMAYvKrriwxe0fOccWEoFDm1s1ETZGQyJCSARM7Ju88MhsyIuWc5jiP03Tqh1M/pHkq8bocIu+9d857R0zeOb8sPfdVVdVVVVUBQIkcEZ3ppMuuxRW0KslWaUrpOfVZu6ylT7mw6kzLaJqt4MaSpCOWPTq0WnlJBmytLBhpGZqL88zOeeeD95IyMYlpnud5nh+PQ1WFtm32u00IPsXkvNt03W7blvp9t9s0dXU4nD59vq2bum0qM1C1qqqc45ur/Xa7yWIq0m66z1/u7u7uL7bdy5dXMemU0qaprrftw3H4fHt7c/PCeb/dbcVs6vsvnz7uu2az2Ww3u7/61Z81df0fu/b9Dz88PDzmafaEvq6NCiccm8AhhHIMtt2mrrKUPwCqxgDTNI/z7J2rQpVFjqd+GCdAeEn0+e4wTVMIXkT7YVgmNAkeHk9vXr2s64a5SE1JycsK/aWYGjM5ZMJlCKRgpEwOyia4Qr0lOkMzi1cycwx1IAACMFVLOUczptJ5J+fJOy6rugkNzUiTqZYOpQAoEDISOWQXgg+LPiYhUTZOOReGSE4pzlOcxnmeUaVM0TEz
MlNx8lwmI533vqpC29Rz3bRNs2lbH0yJjblIQiwZZylo1rrz3EfiYlhqpR6Sxa0uWJusZq1rQ2lpMsGZSrKSa5/lb1r6+4DueDwhESMwsWvctm0FIOUiO5z7YYwxxhjbtmna2jMHR455Ur25vmya6njqvWNAHMYpHE9zTOM0Vd4H5yRLCOGXr199ur33pz6nfOqHz3eP37x99e3bF5++3B5Pp21X13X98PD423/87RCn4MPlfp9i8sTHx8Px4b4Nzabbvn756svD7Xw6BaJxnsFMY0opiTko/e2iCWaGAMxUheCCr9pWpvnxcDyqkBQyvzla7o9jqrwv7uHh8TiM8xRnRjyehqv9lphSlm9ef1XX9TkKFW9ScileSh+ErAbiEFUZCpy/gNNnhKlwiCyLmGRTUYPKMyEpWBbNKWdREVMxIlRzxFx7xwRQyHlZRFKRVzVidr5yyExNFXZt40MooLkYgFjKBAAIKlkkZclJcwaVUrcbrPMoi4myY3bBzXOwMl9F5BwbgA/IRCIFHi3EDlQDybl8PCIs43dSahxYkgKzsjF3CdxSli3COa+FonTG5xbXOq+9QgV2/pGZuXEYAJGYQgie3TDNarrdbrY3V4dTfzgc52lOMSJAVQUkvH14TDm/uLnqNpsY5+1207WN9+724RhT7vshjhMBsOOH07C7SCH4n33z5re/+66qw+HUG8D7j7evXr64udzPMY7j5MjGcfzhhx/atlNCQuy6hgDv7h5MNQT/6tXby93u9c2Lx9vbY99LSnmeQbToOjvvyHFgbupqTpnBCLBq6zevXjh2X77cMeH11X4cp34YPLskeY6JEczg9vauqirn3P39PSI1dX0/nNq6lZy3281ut51S9N4T01LWApDZuTsPBqqCCCLFUz6BjrZ2QWE1U0JidlVV1VVIKaJpYUMyITpCsIJSSFa1TEgelsoJc0oxDSklEQNgdlWNVeVqh23FPriq8kQIarNoEiO0lLNkycXsRVXFRNZeF6KqFE4goiILUc5sat67kKosOcZExLR+TEQsi6zxzPVcNywv2czyuHMHH/AZc54WT7oQmYlWqGrZMbLSROCpcaVLegSA4BAJCb3ziNiPI5h1bVv2Zu26jtn1x2PKGZBijGAuJQnep5Rvbx9Siiml66vLqq62G4kxHQ6HpmtD8DHlNM+3n2+/a8Lbr17udtuPX+6r4DdtfXc4PRxOlxcXVQjTOBZN4X6cRQyZRPXF9bUPPuU8zfHx8WG7u9hu9t++fptzDN7fO344nXKWGqBI3Q7DoBBdCGKmgMwURIfTgIhlXoCYri52++1WVac5TvMcvD8cj48Pj23bXlxcIGJKkdgR8uPxgI6TCjNt2rr2IVA4RyWAIvDLS5GEi6oblcGeEqnUgIsM1TJxVrIrIvTe+xBCCDmnMx0MAIr/K6BjykaQQRTAQAQkx5SnLGU1cPBGjJK9SgbLYNmUDagkuWiGpqZaCJAp5aLOvlKiYSnenpYciKqiQKmAVFVWPcKy+GaF9E0AHRMRmloRn4BlkGON13YWMoFnlNAClC4VUklyFBCXtc+2etelNFrnR8r/GQI6XwcAIGbnHDtnqqZ2PJ6yyPXlbu+dSsZproIHw2GMTPR4ON3ePRpCCN4xl/H2yntE7DaditZ19fHj55zyoR++fHkIobq82P3s69cPjwdTy2a3d/ffvn2bLvafb+9ubx/6YR7nuQqx67o5p007brt267oQguU8DaftZn95+eLPQqib7vu20/cfjg8PMs6FA2pmQ8o6TioCiOT9cRjv7x9LJOo2m6YKpbk0DsOifQI2z7OKxhinadZlcasgsyENw9jVNSGNU9x0mRMqETOvedgCvkJRpyPjJXxhqUNwaTItzFwod0MlpZhzLuMoomKS5yjDNEvKK4Vy8RyaJBGoWhJRkUJOMTMiqEUNyPvkfHRjJHKESOSyLbRTQkOErGpgxX+W5I/PZwhsAXnKxyAkdrSo1xqYZpGU85KflEX2RLAoZ67wvgEuumKll1foJvYUOgrAtLTdbQWsy4UyWqceoDSl7Jwi4DpEurhZV3QZGNkAVHWKMcZYBV9VYZxiQcjalolpTqmsRCrn0nlXhdC1TRbxzg/jBGb77abwmolwf7G72O+qJjgmEdnvti+uL999/BxjOg3j4fD48ubm8Xj68d2n4/GUUp7GqdzPh4fH/eUFImiMh8MRAJq6vX755sXli227vby4cm3zH/727x8fjjnmMjBUSAnMpGpxGBBwJHREoa58XdGEVFVTjKdTryLe+1M/ZjVDnFN+OJ00ZRERyuyYEVTy8XjabDomkiyZkI1XqJnKTA8qANhZN6SwnEiE2T2bqi1egUpDf44xxjnlbAaO2EiL85mTTHNcFIDBGNEzBbduvipebuUKC6AiipmKxBhHYgB0voBfCGXQouz/hGXAlxFhHYsuGQcTQZk/sXUAuTQ7DHJWp1r6SYgIzCbCAGVG2Vby3mqRZ/BywaHWLtOThN1aDOnZPMEgi66ud5F5MgNFKIt412lPUDCHZo6dc26a52mezdQ5Zu+K5pZ3DAApSco5WEEASUQNLHiHAMM4dm3T1LWI9MP48vpCzT58uq2b5s1XN23X7rcbVfvjD++7Tfvt29cXu+3d/cN+24hkNQvOGeAcY5qjqkqS0FQxp83n21AFU5VNF1Os6qYK1f76Zddtfl7XL65uXlxd/3f/w//j/R++xwTMDAbeMRKlnBAcmJFjDj6Kfvly54O/ubz0zAUR1Cwpp8qHnCKYaYxFb2xOsfNt1zTZ1HknotMc67pwTRacE4nMsNC0zYCJHLOacWkdFTdYeoLPaiRcun4mlj0B1U4ylqctNYtzLqdU/B0wMoKpeUeO0AC9Z1VX8FXvKHhGIDFMApANBWsHXApn1Zg1xmSSQZXAmACZzIxLMki0IOQlA8bSrSmsYU0irmi1qoFZFilbfYnQlMonRESwUiyspbgVd1mm5J7mjHnBbhefWnrFZcSq4BoLy3+1Wjgrh69oKyI6FUkpMVFb18xsZpXzxIwIcZ6D7+qqApvMrEwbIlFdhQJ7qVpK+XQaVVQBNl272XT9MKScry92bdPkJF9u76sqPB5OADDNcds1Kc3brg2e+1P/3Q/v+34owI2IxDir5FylL19ut7utqngmZjw+Pn5pPnKo9hfXdagqH/71X/5L7/x/+z/8j+9+/wfpe43zlBOxI+fIsqpYsphFzbxzdVWJSI6JmBxzVmFmIKtDJSLsuPQr0xy9d03beu9e3lxdXV7UdX1GAgUEoAhkoiqQWrFXOM/TFHIGmKrxsgeqXPlSk6BnrrwfmSVnXcIYIiJTGXgBoqfGtJqBISMQsQAqY1F+V0UzMqCshMaVq5grI1ZEE4hJxzmOc0opS15asqVmh0JtWXyoYdlju5CqSwZb2HIqqlky5YJJgRXxRirJI4oKIYKVDW24wqAABaVHW/v4RebgCdXHpRFma5vDZNG6XzT3cGm/L/ZdcAMnYpjyBNA0zcV+t+0aUe37YZrmcZKmCqPpHBMilgaDmbVtc3WxnebYD9O
pHwhximmcpk3X3d4/Hk99OWgiYqpTTOM8A2IWAdPLi0tEmOb5y/3h/v7xh/efU8yExMxF2EPA8jSNw7DdbT27VXTTDodHHz5WoWq3F8zUtdt/+eu/AID/qa6///6P0+lkScoOVslJRbSkRwgK4KaoIsH7y8tLFbl/ePDOtU09V5WBtW0bQnCOh2FUkbZtmrqqq1AFf7HdOO/HccwplsLVIXrHziEXljygKqgYOjiDfcVSCQUQEFdhWwAids4D0BxlTnmc0zRHyQX21sLsoDKqU5TTJCdznm1tOoIYAAExsAIahZJKEgGwiKacj/30eBxPw9xPMeeMYExF8Q4IkLFMrBQneE5OIOsSAVQsZaUsRLlw82BJO4rntVULYrEkfd5kX7nJBTXlc/WOiCvzb4kqixkuoiVSvOw54KyggKoRmRNVVGW1nFLOeY6pwNCErKYfPn6JOXdte3190TZVGQgvcR8ARTITEnHsx2EYS8ibpjmL6KRVXe02HTEP09i1NTNPU5znJCIPj6eY8u+/++HUj8TkwK/NCairWk1Pfd8dT69eXjvvUsrTPIUQ7u9v67qu6oZdS4Sbzf6vf/NXu83mP/zHf/jdP313eDyMj8dxnrzjFFNOidi1bRu8F8nDlASwypkAtrtdW9XOk6myc5cXu912o6rzHMdprKtwud/VdV2CoZnVdZOZYpxzkR0yFikkUcgAZtHMiJDYOSQAKIkQEtPTyKNpwT+N1CyJjnOa5jTFmGKOsUhUWfkPIhAgM3lHCZSX51hLKNKsmhRb4OBDP6akAyGA6hjTcZiPx9PpNEzjVAi7YKXiXmaV8Lm+kkFxq4Rn2P1JwlJUEUnJcEXaiWjZu0aA5/K9IJq2Up/WLxErsSAvMX0ZIzyPdzACPBtFWrNYK52Bwo8jBWcqkiEjeO9yln6cJOdxnGOcY0oIcLHf3dxcXu53zFhCUzGlrg59TzFnQk051XXlmad5VlMTTWbznNqXTaiC907Xaax+mO4eT/ePx3mOwzghqCOXzZwr6xyEENqmnXM+HI7bTeO9c8xf7u6nOe62u+3mmGIMdYPEnvBie/EXv2xeXb14/erl3/7933/84X0YpjTNRIOvKt+2dVUVTKPIJ2Uzk4yio1mLVfCha1tmRsBt13318kVKkRDrqmqaJoTKAB8Oh9OpF0Mt60cyMBMjAljKgoDOeUA085JzKoU8Cy2Rs3iKJX4BQBYxUVWNMZsYQREIMUBgxy67Ocac85wTZYiJvRMm0rXqKjFymqkf4zCnOYn3wxKzAVK2aY7jOKYYJSXJmVau+1KLEBLAcgNxUaa1VYhgsdJ1sYfZMtJbyu9zik1Ey1KFIvVTxIYX4tKahK/D44Qrrrmw+59kSdciHp6p45QsFlZzBQNz5Yp67w2ggAumVrpH5NzVfvfi+qJpGu9YAWKcDscTIg5D1zR1COHqwo3DFLyP83zq+3GcmCirEmKc53EYr6+vjgD9MJpZP/Smev9w6PthnucqODB3sd8C4uPhFGNSkZQyEm67dp7j/cOhrjwTeucOpxMRD9MwTUPVdoGpkMSbugkvXrVNc7G7+Mfr33789OX+/jGXESUwIt50zaatifBwOFqZaidqmsYzg+mu28CicMRNs7m+anJOzBR8xc7N80SApjpNU5aMpaYWJRQgAgAkdEzOPfFxEU3VBISxeCtEOC/h0ZTyOKc5ZlXNZoAUPAXvRDUX+V9YFJ0KMxpMMz1Rh8tWxCzqsuacx3Fynpl96T5m0RijpKSaVaR0DhhtzTqXCkYLYPlUfy9qVnregQBwFn9U1TLnU6xcoQgzCSAScVEgxHUhaOEenjU4zkNwKwj7BKWdlZ7X7xReyfkcrkUSgLu8viyJ0zhNwzDUIQBAjDMivXpxc3mxTzn3w5RFihzSPCdEQCIXwnZTT+N4O47jOB8Oh2me26ZGtDLp4hz74K8v9oR4PPWEMEcZp2Pf90M/DMO42bQF5ri+vCjFLBM+HE79qd9uNvv9rh/GOUbnfcoCgHf3D0i46bZtu3HeEy4pOTm3313+5Z+1b796e39/++7jxz+++/H27m7sRxCZ4uyDe7nb7OoqNM3N9Yuuazdth2bj0JeCwBFXzhNRCKFcl3Kx5mk69sfj6RhjArAijZAKW4UBmDwXgkhpHZaq9KnjBypAvCTRZqo6pzzFPMecRXPOIooGa3O79ACXDmT5pXMxa2eUlNCxc75M2RAgRlGNySTnlMq0foGZFhmwFRQq70nO+jTPxjAUlh6DrIVSCfZudatgpgpFuhEWCYYyUmIASISqi0bVmYas69rLc9FjtkxaP0sGFnB0JSmYnhforGQ8h4BznEVVRQkoZjHTum6cc6Y2T1MuG/tirirftU1p5DLR8TikmCTnOaaHh8M0TT4UnVtYHAriy5urly9eTCmZWdNU3ofTMKUsh+MppUxEu91GVA+nHlfN4q9eXn8AIIJffvt6mOLd42PK2RGrChI/PDz+7vvvQte+9Z6ajnEhDDlHznVN017tr169evPN19+8+/DuD9//cP943227b796/ermRdM028120+0WEA+5LKlMaZKcoQD1gDnFnCIimshUzSI6TqPKIqirjOyYEIiQbZ3bRCvoI7GpKS/PvwCB610yBHLes3PFCJ2Dog6dRUubsNgnsitEOzAoQmLl8UXVh5kZSU1zloUwkXLKWUVKP6Cg8aWuIgIuk+eFf7EilbgO9JXcgxa6arHWkgoubQJf6n9bXKCaiepKgNflZCmULGoRTVvU1IqZPXXVAUB0ZZnimsICrMjU8m9Yl9CVpMGBQfB+nGbnedNtQggiEmPMIqdTP05T2zSd95uuqUKY5nma0xyTY/PBplm9D1nUwPYX+7atNMvx1AOY926aYZomZmJ22+2mqYJjPh57WE/vMIzeu8vmYp6jiHablpnmKb68uZzm2TFf7LY5y3EYsggCpDmq2u3nLx93P1xuLr0PhStUrjISMpBv27quL7b7b15//Zs/+02McwhVW7chBGbnuMyEcYlTRR8hS1X0wMqMmk8+zU5FlWVj29ev3w7j+HD/YIsXRLd0OYjo/1fWlzVJciTn+RGRmXX1NTM4uFxSF2W2K5ErM+n/v+uND6KkXRDkClhgMcBMd1dXVR4R4e568IisWglmAHp6uqqzMj38+Nz9+1idn1zQWEpRJAmhdlP8OftWk5mllOdlllI8x0VkKRKCqFoRLSIkAkgamIOKKVyHs91HQy6FAJMlU3XuQdEqOysiCEBemTumBQQI7LI+jfXzNtK669JqGSqqwaskqG2BZlIu0MhgtdejToSMykxaQU5rkRnQ84HW4L01U6tbRvVv1x+whoRWd65wfR1AcPVSJNoOQ9/1S8mlVMYbVWPmzWZ4erjfDN1lmqd5MdVSihO/Oxf9MPTvnx4B0aSM05RTZkZjTql8+vw6zfN2Oxx2m5KLiuRSxnHsukiE0zTNc9KisevLPG/6+Hi3/+Hjs6o93t+fp7mLHSJGotp5Q7hcLudxkpxU9Le//YeHD18hddxGN7w7gszM3HX9brev9wXqsHgtN8
2s0QICGHpbEsHQQohFjYOYZgQk5qHv7w53aUnTNJqpAZmq6/ENnfkABBiIKl5LYOe+8yfjqjAyT+Pn1+c///zL68txGqciDjeKk6aoqKhTpNZ3qdRHVbLFAA3N54mMauTVai6ga0mCzQIRkZkMkQCL2G1W556p9i49eiOymvhIlROnmgDwWtiLmvtJIoA2fK2qa9LpDrH6TjOvnHTd8GwZurYRE2g+3tYtu+ZzYXXYZgAQ3o7HGFtLvZRh6O/2OxF9O51yTiJlnufz5ZxyPy/L29u55GSAOZOo7Laboe/6vgPT4/H0w48fc8rMpFLmccIQlmVZ5nE/DPvd9vPnl6UUr+aJKWIU1SXnl+Pb0+NDCHwe568/vNvvt88vx83Qh8BLSkSwpDROc4yhjzGGcFnS88vx+z99/3j/sNnuAj8iYOCay9hNWIkc2u1YCwJoGb0nr6xKCFiggLgcHnMIkhEJQQARh34YhgERiVlFVKGYGUhnUkRCjNbc91ooYcXFEb1iMjVVkTKN8/l0fj2epmnKjc/H7dLLlrZ0Ve0T2qav+yesOiG26mFQm568AvBQc8/A6ImWeJG+Tk4BGJDnkYIGBmKgIkgC6rMr4lTsFfRZf6kBqgGajyJoLanEqAHvZiJCiC7S0owNWhy4LjJdO24tC/KkuDXuYW2YAkAIxKaaVZGZCWMgKdkMDrvtdhgATIo8v55jnLxNstvtzOWMnE4CcdP3IllVY9+B2WWap3EG02GzSTl/en65v79/ergvuXx+eSPip8e7nPI0L0hoqpfLqGp3d3sV/dNPv9ztd2h2PB6ZaLfbiGDX96/H03gZN5thM/QuIvrp0+efPv707sOXsd/2Qy/1Pq5OAqhlgQAVAGlnst4XJTJVInMw3b2R184q6uALEYUQu9gZQEoZfBSXsYAtuVAoUSSEQOi7H0De+AarDIZmagqmueRxPL+d3pZpKsuUl1mKz69Wxl2/t2ujUOvwbt1+YGifytvooCsg4+nlLcaNCIM3JwFc9kUdpScI5DcBtXLZeHPK1LCospqoUx2YqIooIXmI57oMCKZeBiliPZFXG8QmdLUS1xi0lYL6ceg6GVoN0LtcazIDdSTP3w3MLABiKYUCMWEfY8kyp0REXYw+R7Lfb0MIOWcA6AIV0Wle7u8O758ei5Tz+XyZplJK3/dfvH/69Mun17eTqCEAMavZ5+cXJuqHoeviZogiQwj8nN92203fdy5DP47jMAwcwsdPL2/n0VSnRZBOSLjZ9M4abgDjOKtoiNHMTufL9z/88PDuabPdc3hX0y5ne/zLzHw1WWtbyv7hGzRt0HSaPaAhAsWAhCKEIhx1v9v1fX86n52d3gyJTYXqHp96YlmI2Ut1NrMmpuYdRSnldD69HY+XyyWnZCpoyqAG5gsRRlghyfbItbJU1b0BbGABIJhhRTRbp7FB78hM3n8l8uFiNXBQBSITNXhStdgKdXp17WqctR9gzbZuzknzSczV7RVRpnUx8KYcXKWMW37piBys43kV62xYl7WEWGsIFFXPygAsUOAhUAjBzJ7fTmCKhkRUYjkcdtutSydE3G4A9Hy6HN/OMYanh7uH+8Pr8fh2PuecD/vdbrv55dPz+TyaqsvuDl3UUn76+dM8z09PTyHGIroZBgBMOe0326fHu58/PTNzP/S7bT+nwkgIQEy7GBDhMs4xhE3fxxhUShEdpznkEiMvRb/7/gdTtSz/4T/+5v7dB7AOIzrFu5/mFQJuRnnNyn1F08lCEFthQYjAaOjxrmZViJvN9v7ubhwv8zyr+Bqw525WREQMUJlNxYRVVEkNyZrFGwAUkbQsaZnzsoCUgCaEhAwVbgQDqNTlqutz9cul1nQxrOhhhSuxwuBQh4YBCTvGyAhIapDFzFtchMHFnRAV0GWDwMWV3DE2tQ7fVwb3kQ0tU1M0VFUAAjK6mZzzH0b0ee3q9lazXuv0m8B17ba3L2ogcNIGz3Bw3ZtFCA93ewMspcw5iyoDdF1AgBh5uxnAUEUCD10Xp2lKuQxD/8X7p4e7nWp5PZ5EZDP0h21/nufTecwixIQIMYau786XUaWMl5E5/NXXX0oX53kexzEQbYZutxmeHh/MIAa+vz/Qeco5dTEQQgguRCmXad5vN/vdxic6xmlx+DCGYGAfP/5cyj9eLpff/Pbv3335NeAWQsWr1nB/a6YtiGij3FAEcHYh8fBmLh0qJTuvTlHVwGEY+hACE5takYLkQhlkZiLKEQFBzLhKZNwsjps6WsQhumvSym5cGVyl+W6wSuB5kz+1SgKguk/w1LM2LeufwQDBZ1EdlzVA5y8lAmaMTIHaOryjN17yAAIYke8V14gjaiKmCiLAbP4ZDE1rBQ9QkInwirRjQ46qoa833k1wRVrMfbOfqNZeu75LS1bd8B3+N8MwjbMSDsOwDSGGAACBedN3+8N+6LpSJCX5/PLmyf7Qd/f3u/vDYUnpx58+vb6dnAT55e3y/PnldDqbKiFRF+72u76L4zQ59QO0RH5Z0ul8GS/TF+/fpyJdjH/7619N8xwDH3bDPMNhv0ulmFpOi+MvXeDArKrD0HdddxnnlAsiRGJDez0ef//NN1nKfwZ7/9WvzIYQAt2YI1xDjyG6Fps3bNVVNNR1F1TNtOSU05KXRaXUghiBOOx3hxi7k15KKWoaiVrCVyFPJ2VgDkxc/UR1f6amIcT7+4d3797P8zL3s5QCKqWISMEiRQTMyCmTbgGgmpkhNoYSJlwVDQzASbrALBBFxsAElZRBTTUiMFEgjETtfQEN1FX8GgZaTcopSwkQzKVOTU3VPKCDl2a8+mtPmB1sa7lTm51b+5a+Ue8zTOhQEa6Aa2WE8bH566vWFNU5DsDCOI4xRuw6ZBpih2ix67549wiI85xqbtvY8D03mZd0vlwu49jHsN304zT9/Mvn09sJEZ32qO/idrsRKYgQ2CcmZZrmXBVXMRV5PZ7u7ndE2HcRwAihlDIvqe/C4/1BDVKKx9NlSfP5TbvY5ZReU9ptht2mI4J5XpJACCwmb6fTH7/7bhiGjsPjF19Rpb+sYkt2Q0ForRIyAxHnGpOa+bkWr6mWoipLWtCLAyJk2u52T48Pb29vS81ZDcCHmdANXW/ZCsxUFEyaIg0S0n63+/qrL4ksLYsvNqioqlzGaZ6XeZrnaSpLKqWQKoAFdBQIGI3b4g9C1Qx22fM6gk4YAwYmYgSArCZi7JA/YSAicn5iED+S9VKB1ll2aPpETeLDTyY28NIfELtCcsud6gWtZnU9sjUFsAa/39ZSUAv2inf5oamG5ZVuS3v9r0M/9EM/IMA0zqJyt993MV7GKfbdYb81Uzf81+NbSnkz9OM4//zLc8o5MMUuppxTyjHwbrdlDt5q2++2fcfjuLgmZwisIs/PL0S0rmPPKQ1Lt+kjaNkOXZFSSpmmaZ5HAnh4uNvvhqGP07y8HM9pOauUlHJKadP3wzAMfX8Zp2UuMUYA+/zp8//I/6Sl/P3v/uvDh6+67trMWH1nM9BKoVNDjohlM8JVAAAU0klEQVQrvZqolmyiOaWcZ
ibKOad5jl1HHIZ++PDui7e3t1LyspiqNXHJCg/lIlEkWhsG97holVWGiGKM2+3m4W6/LAEQCTEwRWZ3iCmVcZrf3k7H49vlfJnGUdLi9IDNkRpUGseKwjuvCSEGxuCDz0h+4hyAgNY+1iZG6KCStQwB0Nj/Cq6dr9XkzEzBGKyIsoFVeWFTA18guX5OJCKHe625hVoJ3DyCa24KrU/QlBoqCtaK2mazAIgY+mEAAG+YbTeb7WaQUhBx33XELAJD3xsYB96Fbd91x7fTOI5mlgmnKQGCSjFVV84khN7FN8w3zDGECICvxzMAPNwdfEp/M3RoWkqeTYjg3eNDypazT/rJp+eXnJM3n97d7fuue3l5ndKSMuVctBSVMgzDZujnJaWUWIRDPL29/f6bf0bmf/gv/+3h3ZfO/nzrO/0fUZ/WUfBSybF7KVKSqJaS1URUfTt5LsnA+h4Dh81mc3d3fzydVM2joWuEurn7ZISocuU0WNP8+gUhdTEG5sLeyHTRTQ4hEtHhQO8M8ldf5JzGcXp9Pb28vr6+Hi+ns6YEph5IoaV+DUqjwBhDna9KIimrgSFSdAYyX1ABUDVXAyX0IU5AF+9qdYqDkQAgarG6TwfjIXBYz7rBKg2vWNeDV/j1mne2fytE2tgc1vdABGCsaBo2LMKal8WbWxfSkjwJ22yGu/1OVHNKRLTMy+UyLkvaboa7w/7x4T4wPb+8juOcixRnU0FERBFx3h816yIPm6GLQUWYYxchMJqJFEHmYehcE5sQRcsyT8qMaGPf+XOMgU3AVMZ5joGYsKiGEB7u73IpqjbPKiKXaU6lDH3fdbH49EaBjPD6+vLNt/887Ha/4e7w8BhjXA+iNXTYP3xt/YmoFKtTOwUAfcyAiUwEwPq+rxQHgfuu326Gvot5SX43vdwpIszk3SMRp0EXqgU2rc7AzFJK85JySj73TQhMAbFGW0Qahj7QQR7t668kp/R8PP7y8y8f//zx9eW1LKkOVACuOyRdpMiIxISYRKesZtYxxUAxUDWOlte1C0aDRo8P5hu+qwFds1t16/GMYuXmNwKDtm/kS/FNCWSt3GGt8lcss/72ViN52ipqDbwGaA/Jv1rzB0MI5/M5MHd9L0VO5wsTiRlSEpVlXsQgMOciqioERHQ4bJeUx3H26biUMjP1fURERo0h7LfDdtOfx7mIhIAxhi4ENT0cDofd5sePn0rOgYmQci59F/vA87Lst/22C6UPS/bDDSKiJRFGUNh0PAz9ZZz6LoyLOgBNYF2kwj0nzKWAaWQoy/Ttt38Ydrvf7H7Xdb3vErWehqeL2cSk5JyWZbzMy1RymecplZxTulwu0+U8TvNlHKd5STlpKSpKIbhevKn1fTS1yBzIwRfMRTopMXSe4NYUoiVStgY0oixlmhcmCrEO+/SddV0k32NzbXeiXb/B3e5wODzd3x/22z/+8fvnnz+ntIDnnWgGFKOTgRERLWJzElGL7AeMAyIhFKgmogBeqRCBKDQYx9bGWgvX0K7XmhHXf1SNqxRj7QtVf+m912piCP/PxmBNWf3+rzm6451XIKBG+YoDXAslAAsi4tG5SOm6oKq+u+XEYPu+N7Pj8Q0A+j5uh76PztqDSwrjZSKSYdOHGEwNI3dDT8zTtKRlJoTtbui76NFov9vMaVly9hnJ2MWHIW6HIVdRoqHrY99Hn7rBSgBquWRE6EL3dLc9n08FYb87FHGiIjrs+t12k4ucx7kU3Qw9MlrKnz5+/D/dN3eHu1Jknqd5msbxMo7j+Xyaxul0OU/jNLrk2zy7Y1MEyaWUDIjEDKrMrAbe+0OEEHjouoe7u6en+4fDduBoToPp6aC2Hnq742YOULuMhnEI93f3l8tpmWeVUkrxOXaHG82gw87bWqoCGfqu32w2zCEXOZ/H6TJKya7KbIDR1eh8lUp0TiWLej3KgYNXRoBkVgB1PSV1J7giTf57nV1RTU2JfdXMTGpC1BDZFnDV1h+t5FRt2xOc56FBntU8bd3ywCv1/FpI2fV7a+SH1ZXXEN910flPOAbRK6Y2dN3Qx1JkmiZiGvqeENSsFCHEh4c7MJgOyzLPuRRABAZVNrW386Qli8pmGDbD0HV+t5AQUi5oQkxD3x12w9BHM8tZN32362PH0IG8XfA8zmYKpqbFFKdc5iUzcd/15/PRpyoQaMrLp9cLM5nBksu8ZFH10XH77/8YkBgxO5qj9TarFxzXxAjqkSfutwN2Xcm5C7zZDGYQQ4yBY+RI9OMvz7mUKRc5HqeUXt82D4fdX33x2K0TVQamUkphLoGDEjuICC3CMtF2s/vw/ot5mY8vn6WU5Gv12lAEr2gYzCDlUkQGHQKH/XZz2O+7vgNE393pAm0iIyIB5iznJKUIB45MHVMM3LEPF4MaYsVcnSyy1s8ebREAQdFFPAy5Si9ALZEcuIHKxlvb5dV0gAwIkW59J9xaFzTErNke4K1vrT93BQpuav+bFwJAIGIzW1LCXJjR5w4J0VTO55xTJqZh6D3zm5dEYBy7/XbbdbEUSWl2AF9FUhFULVIA8eFuv9/vNpshBEKRcZ6nSS7jlHMhMJMyT/M4TiaSckGiTy9v05J1WS5LHuekRZFIAXIWABAz8WxJ69SPj3m120yE6BpnDdhBW7sUBsgUQyBmcNhTxZ+CAAQwJCZmy0UAus1GU0pzssBzLuxxvGgMrKWAWVablgWILks6nS9/92//+uHurqiIlFzIAEMouWRiFqnL5j5LogYU+O5w//WXX0vJ57ejw3ArDsUUQlBELKLTNOWS+3642x9UrY+hC9HxUCbqmAlRDHKROUkRDUx9oL4LQ+AQAiCKT0itjsjvk88p+7QIgrpUMQAYtLUNU1UPCBU2p1r0GxgBrrdX0XvuWOEqwLrCVZVr3LPC+vv/vy9uc8/qpKtZ37hY8CIJwYpqYy7H0MUuRwQTNSbaxk3OZV4SghHRdjv8+ssP7x7vmclUcu7PfZ9ymqZFL+NlSSllM8spv7xeFKGksizzkkopJeUiClbhx3ppdX6mHqu6boitaXbTCmqwBDa7A+v7jpjHcfJDSE5I4LEMcf/01B/2m74//vSxlLx/fNrttv12+Paf/rfj8GQmZqRqpkKBCEEEkUrO5CR3mgWgeNXffq2ITcvCRJ9Tmv7Xv/zm7/72w9N9kQKZkIJHM5FCRCLA7PHNPwrGrru/fyqSfzSYLqfWLii5MC5zylkNRORymVLJm83Gw/C8LFnFAALjEJkIFUBFp1SKKBP2kTdd6COHwNAcJCGwuZKx943AsClyuHX6NEkbdyqiQK71B0T1IBOCoSkh15ebI/dOzF5dI1FFNFuqXQe5V3d6Dd14PRPXx3vbC71iyf6tUHLy5Al9Y8EZ9IoQIjEjwGW8iKgUQcS+6/a77dvr6XQ8zSm5bsU4ZxE5T9M8LUWtqFopVmn1aibTnH514h5dvPVgNXmutSZWYgH1DAZUCakWmW5+oqYCxIioCiLZcy4T9SBpRZylo8zz01cfdveHy/PLdLmcxwsEvFxO5lNkhABYZUGKACr5vTUwU9IqKaBt
rZ2Q0JCYDbGIFjUEyHn8n3/419/9p39/t9+JFNMiJQsHQhLMAIgoCGrkhIcIiH3fPz68E7XnzyG5uA8REqVi83yelrTMaVkWIALDE5/Ssnz85WWeZkJwvnA1SKJLKqkoEXaB+8Aup+SR3Wyl7qqYe+X2rF4R18UkA59RMbvRj3Oks9qKGxgSVO7/6u+rHdY/tI3N6voaRXRD8HHNAW4zgBs3uXaQbssydzSuduqqJb6SYmVJvl4j1mZbmkTZZZqej2+ilfy/fmTC2lpoJQJWVj4lbAwpWHtdgCDSph0QsfaOgYmM6s42EiNCSdlXPZG5nTIjZslFTJ3iAwhFPBkidRUT36pBBIBlGv/0zbcqlVQ7ny6fz2cRicQYgvkcbt0R9t1bD7VcJ46d1c33exAICAjNVPyB+Z0xe7mMv//2T7/7zb+LwdKyeoyq6mCmzExgQAZWPQ1z2O/uum5AxMDRZ2tSyqfz6eX19Zfl87RkMNAiry+v8zgdT6cyzQGRyTvizvUiRLSJFAPHQFWLEcBXLg3RAVkD1FaA0NWJAiBBZchBbfi8udxUq+9xHQiogZtW4hv/ayc7gSY5fq1+/H0rHEa1adRstMX0vzDF1XGu7+/1QvCRQZeq0gYZqpo6S4CPmdSBNChiAE2/o5Ua0LABu7p09AYxUWMd8DlwIkV0CW0MoePgFYK7F4C6mswtlDiJMIhKyVKkpkn105m07BsQjcldgVNZYtVGA1N1/9RSPehCrFmA4xwtDtXM1dPbGLz1ylqRca2llvPUARMBB4rR1VOeX0+vx9PT450VQsoGUKT0XSche6EVYgQItfAQUZG+73fbfQwxhFCLNpGnp3fv34/v3j1/9/33f/rhxx8/firLwqBsQqaAoICmNmXJRZgxBu4jd13sYqSG5DhCpAZSR+zB5wZ8mUIrY0MNWW4lAUGrAyEidI7bhiBDS5QNXSCs3kqqVdJNZolVMQLEqq/0b1q7t6s1ri7zLxwnrq+oxGMAEFyVR1Swvq7CqvWtoYqGae00GFRx3CpgX/1oiyX+cg4hdhGRvOpUNS3ZT23d0DbQItOSKmq8TgqQ3xcFAAKqd0FN/fIIpW2ZWz0O6IeePG1vC0cYWK4pbFuaRIC6OuZsa2ae+SMGZKe1r8hKQmLWXFwecm1DOQRpAAURuZBqiB0xTSn//Hz86sOTc9XHGGIIoiZpURWLilh5AXzXMYSw6frA0Yc369YDABKGwLvt5vH+/unp6Ztvv/3lp5/yNEYCBhKDopZLyUUNoAu06UIXA4fg5Q9jZRfzR1H5u7V6eh+sqv6krntAnbOzSqy/XgggNk4cvHGlNTez9Ut/4Zo1ErVhwesNtxpTbQ33t2DcreOE1pLFduGIEJD4WgFUC7WVuKyaf8Vi1y0T0yY9iIDrgun6i31QB8FW2vaVMkpbc78+b0Q0tDVNrhJEgDXBUKzTWw51oI9F+bVRQxCb0/ZLVIXKKOd4HiG6rwVfloPKCOzllBYLTP5snTLJ3bKJBGJPNOv8uidrqGbq6bMVEcwAQcwuWTwKeoOVQzA1p05ELGZKRBy6GHo/vYEjVa5Or63FzCU+LIbw8PBwOBy+/uKLf/3jv3z3x2/H1xfJUkRTKbmYmXWRQwgUmCudrgJQMdCGuzeXYT4jpw3P/EvPtaZ+BFUIgpl8lg4JyYeu1dwH11vdkKgaM2m1O2gWA1Wy+9Ykrj1QuO4bXd0qrtkqXOslNQMIfoIcFHA/4Y8EW+BTbaOyiCvBxNoZa1F9nT8ARwOaL/KOAbpktyGtVJrNE3qeBC1/bR+hJkHg5Fa3rn1NGLQ11xBx3QX09EDluhpYf+paollTP6nfN1W1AoimRlWWDwCsqCATuC6Lj3+hVdm06h8MDdCMmNOcUirUU8qFQzAtuRTnIo0x9v1mt9v3/cYNlOoGRXsHADMrLu7BrhGDzPzh/Ye7w/7Du8dv/vD7H/70Yz6PToDXRRpiiDF2zA5OO/zu8TG0FhGs8xCoq/34HSGAutJqoADMFJidv5KZXfpbnaaHzNePSK3uK5iBmrfSDUypqsyoGQFBW5harRXbI7uxSAODdZMJrwG/rot5IxqJQDVwDKZa1XEBWy9r3SgADq3DBUbEiOsZrWRU9QJW4/c82we/Ddb2N7QawVq6oLCOfpEUBwV9VNHM5Si6vjvsp5cXLeLKj+t5rElxjVGK2OgrbnqMyEziCJGfKN9Qg7prVtMEBEABIzNP9LniDz6dWdc13ZzNFOsX6MvhaErABJCWdJnmGNgAci6qMs9zEYkh9F3Xd7ELMbDLDVQc11o9a35hKgZiVlFFzx77fvvll7+6jOPHz8fl9SSqMfIQOUYOTB7Trfmu5ubqLVh5ofysqvojaoPrHsF8x44CkEsmVTkGf8wG5s7K6tq7Qht2r12g5tFAa/sDKmiDayMNWm/dh0wBoA0A6C26szr9a0dU1cBC4GDkV9+yzBXz9yQFVwqK1ZFdE4LqJAFahmotCBgYmppJqb/MBZkBEZCYQgzU93le+s32/a//5vP3352eP/nkm98/DhyGAZlXuoHmQ/2Wo62p/urHGxeeX7yWQmui0k5KayZXdwrY+hmInjzUil4rhQG1zNMqveD67mCEqqiqhJRULuOy7bsQ9ZxzkYKEfey7GBFRREQzC4OBS87VigRJwVRFck55RgAIaIDcVkSRHDp93O52hhRi6ALFyEhciRPBRF1WGOlKKltp3hXWR2k1ufSPpVp/DpGIkchtE4m9dFQzMedchIpauas29CE8n1wz5JrOX2MgoK9IA1R2xTYnbpW/pBXWWjmhK2TQ7iy0+RJ/r9CYxFo9ezPLX61hHUg0xXYckdZThboiKus0YU05tCqPQisIyTcfgTnsHx9ou3n788+B+fD4eH55eXv5bG1AC8yAKA5DWhYTqQOD1bdd8d0GSHjMaFnP6rO19javiVe7EbWkb6mtf2y8Ge+u1r/CBgBonrbadZxOGwsWqojlnEpOl1HUlJm2my1VsFZLkZQyGDEbMleCFwaj6ziJFEHweX8hysxBW7L3eH//b/7mr89vx+V4JIDaO1YpAMVA1Vu8GJmtldV+gNvYKtQWsHtvtSKVS9DpJzgEInZtEiBSF7kz8HWkla+UjMzZmKAen+Y/cS1IoO3lwU0BtP5XVrTJy6D66JzpxLHv+hRW6wsVWGnfamlus83V/66bMrjaH4A76ht7lurq6oPFa+FVk0VCNB8QE6MlA6CJ5mXG1kZyX4vMHAIA2rI45o/12lrN6G/rw93tTNj16AEAEhN4U98BPzPTumpOgOZdaPVHxL5eiTXpqEmVrSxbjfN//WDr/xTBEFR0zmVKScWQMDCbWS4lFAakEDRnAcgBkFwD07nNlCpMiBS7IS2jSVYzJEk5OTMCMyPS4/3h3ePdz9MIIoC1/WMAYi7dC4YE4Pp0gK5600YLPexXIohKKA5iQOs4FDMyM3GFPG+Ki3rP1JiaMmqlGSOX4VLzHN3M72ZlnKwv9lu5GkmN0c3D113VenbcsNHaipQ/7v8LuhRpugWd6nYAAAAASUVORK5
CYII=)"
+      ],
+      "metadata": {
+        "id": "gW4cE8bhXS-d"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "image_data = (periodic_impulse | beam.Map(lambda x: \"Cat-with-beanie.jpg\")\n",
+        "              | \"ReadImage\" >> beam.Map(lambda image_name: preprocess_image(\n",
+        "                  image_name=image_name, image_dir='https://storage.googleapis.com/apache-beam-samples/image_captioning/')))"
+      ],
+      "metadata": {
+        "id": "dGg11TpV_aV6"
+      },
+      "execution_count": null,
       "outputs": [{
         "output_type": "stream",
         "name": "stdout",
@@ -431,43 +518,39 @@
           "\n"
         ]
       }]
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "2. To read and preprocess the images, use the `read_image` function. This example uses `Cat-with-beanie.jpg` for all inferences.\n",
-        "\n",
-        " **Note**: Image used for prediction is licensed in CC-BY. The creator is listed in the [LICENSE.txt](https://storage.googleapis.com/apache-beam-samples/image_captioning/LICENSE.txt) file."
-      ],
-      "metadata": {
-        "id": "8-sal2rFAxP2"
-      }
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "![download.png](data:image/png;base64,[... base64-encoded PNG image data elided ...]
3mPtqqZGdqVO5dXffXMPHlH6l2rMUSVOJmfjZ4jqX6mVYtMqpRu6Nc9EJ0NoNtqnc78BWjfqVtVq1Wprj/FT0OXqFS27Ia8znlAwSEjDkt5OC7cpIfb3BNttxYCbbePHKNdicanVyFhowEELVNEFfnVHdEBCIBXBipIGOOqJzRgd70NjF0Aqv+DVdbY8BckYiYk5fjXSs9czuTDTdJ/yrLhCC1BkclWmvrWiETnHPBOWOCMc4YcqZBo1ZGFsgYETFGjHHQhNzOZmYqQ1dTl25ymIhgDXernsGFjysc7DjLO8jVWebAGlW6UXLvHI5th91Y+AHxs8gw6KujXqnyVcX5qkJ3N1nRCeiH9EsDogY32kUi0ZGoOx1qgK5+81h13Fy7v7ODoZo5rArKVcrd2Xg1S916jE7zAAhVKjsa9kwCk4i3ZgGRX7pOdrJ6Oge/vgpA24y+9X+UUjVvirS2q9nMUVpr86/WWmmSWhtLxQCUMRaGYRAEURhGQRAFQSg4h4BAkyzQApRzLoiIkJGt5ajK1bwAqllZCdjHny6Mk51etVH+Eu3nTqupafexUqsVBH1iE3zLLiDEfFXxtgdkjZc9MtDbEDUurJuMF3SCw0s9gfwlx1RMVwnrS14XVD84f+kC6D0xO4vDuNB+LrHaQ+h07ZJGnuQmhZkFKObzBQGYImQyiXUiItCkwdWAGvCBVdlau9JQTaS1VlprAiBSWkmlEUBrJZVWpEmT1koRaa2kUlIpg0JFpElLJUmR0koRKSIA0lJqgEacdJutfrfbbjTbjYRRzCAEhgSaVElaA2OccwLiEIBWZtg0EUPGnJTr3i6Ck5gdBled5ATqe+fH78IoO2u3JkkEb+pRzWq3I1LdF83WHZ7DLcOt6DSXU6lFcNwlPNjJq9SaQqysuaovuFJtWPObLvo9dSL/z70ukHfVPKfE607eRSuFmN8qjaxtYztqL0FeKCst9d0nIHF4eqqJSJPSSlkgWn6TSmvQWpOSysAINUklldYladBaAWmtldIGjUprTZq0UkoTkDKRLQKlNQNCBK0tzxkAmC3kydpwIKUsZamknAZhmWUcMQ4CGQQ60GQmBihd5KA1IKcwAGCIjBEn0pqYzUe7YUXPGZXzawMBUE/2XBiPCpjODiQCt9ae6JURssis2JQcbNxlPOGhHQJ7nDXnoX5B0ySsobbGlNW4OUPMVxf4eB9Ahc7K79b/GSS6pNqXqH1ErbU1TAl8s2n1YzVZX+FRqBSYUwhA7umL5KXh7KKaTJ0yNH6c+IvPPlOktdbSlN0ppbQyjCKJFGnUWluKNY9DJQJQiAjELVUh5wztw7IY5wwJOOchIuM8YAw5RwDBuODmxRAZ40wwzu3WJoyIpCxny2yZLTVQGMWIjDSVSinTJIZaS1mWWkrOOJBGAsZQK8aY2ye6St/UctneC4aawCqIVGWaflCdYe1iG17KLvtSw2ltABwTOEh66kBnETjpV1WjFg2OiN2Yr7QTwJWr1p2h2hnWliB3NUMzVvXri0bzRRhdSLsbO9Gk170cTHPqhilU0Ux78cr4uKCAPMOS9dWNsQgMzVbylqG1k8bKfBMvjl8CczAB5IiaMYYoAELGGBMUMMZQMM4Y45wFXDDOGWLAOGPczFHOOeOcMyY4t941Y/ZXxgLOOGMaUDDGjW/ODHsyhmD8dNJUynKZ57PFYplnSymBMQVEQEprpTSg1qSkLFVRaMaACDnnSmhOmohpAm5IBd3WobWcvNfm3vyq2z6vDJj7vsaL5ELCZljoyx2O2mesX9iF78AbBubZZORrWQzIqhv40h034RBXblG7tYWUb3HNVrnwpmraqnnjZ9wFB3/1xEp3V5ijCtNVBM3WuGF1M3Q6xFbPMXc3++1FjVO7r7hz5ZrhM4aMIWOccWSMc8ZQMAbIiSFnDJEJhgyZ4Cg4BxcEMgJx33BEYAZ8BnsI6DeEcfEiRHNBZIiu3Aq01qWUQZgJEbClKOczBQSaSGkg0kohggZSUpZloRkHRC4DCjRpTVoR48Ye0mi8Om8K2j92eBkD68lZ4dVs1FcGcXVcycMCCejL3eGaWYn+LuDcT7COl0v2VosPnS9sdzJatRTrd8cauMz3nv/cBHIHuxZdPGPloyM/589e6FCVWa31EWyEa0UCDmtE3sjyxIsmaoPIjD4yc89mIaqQ5Uoc37dU3L5+gwA5IgCabClniGiZzizfMTFH5vddAIM931rkzGysY5G5kj90tQ5+3yKDTdd4sIar1sBYqSQXQgguEECTIq1ISyk554wxjVqRKqXUqBFRBaVSkmsbuTKz1xqZlUXoYt0AAOZhzL6itmZMeRtA/2ez0s7YW40xoR9cIlctZe4DaJW7T4IQ82ZFzfkwDXPjqX1hg8U6OUlWBkTVYIsEK9tawyqGBHv9L+P7evyoupG/qZ9Rq1rF0179ItafN5dy85Tqixc1gLaocUOENhpei4DU5oIhXjHoDmx00LAdmKQ+GvZjCIzZ+mDmaceZHxo0kOFK8M5P3QV2g+aF5QrJKj0LyDiBZgDEmOCcM64ANAERlaXUoTIxKamUZqQJlFIEWnCujXlKittZa29V1Wq6+2o7kljjpdVaAagNTDX0F1fzAFj146jXkl39flj3GzQhQxeItvnZuifrTMlajKC6mbtPjVNXScvGfSpqdwL31ii5ca3FLC4SKq7gr24ZrDRnxU4186xmkddVv+diJ0DzzpxONV7zt7GGgTvTm14EACJNG+TN3hW7yTGmITxrL5uLaCMTRjasiojA7EN/q+fOVdMCAVETWE/bJZycyQyaMaUUU8r6UMA0oo1REZlgFjA00VklpQaUQkitAyLQJsRkfR2sZaidbVPPeRos262ia4NtRV8HQT1iYgiSwC/EsMTnj6xsxBqNoRegCyMgoDeOLbNa/qxOdjTvFLSBGtR64gas1pTarb19XKnvFV6qRhhr+W+HVLTWkY9ouQmwMgeqnK6bRN6aMrVvnuEr46DiAMeZq1wMdevLhxcED5gpF3GGA6GjT6uyjUK0T1a1IvZPtNBam0chIkNknBt2dKG4+hYPhIxbBwa9dDSR0hq15pxLw9wAJnVEJJXDCJjnfJLW1h5FZUxPE2PV5GRBBLbFQCuKnhm9gs4Q+LJXHWc+vIc1vYHgRg9tssN71nWCcQOBhCiJSqXnpSIgqfSoKENr7iNn0A1EzFkoOBg5eHuUIfjgqysaomowTU7O+kW+M/WQ5EVQojdzbWFEld+yqtweq0kjOi+whuVagMny3MrtKiaqGyy2yhGs0jWttRh0ee4LM6ZSTGYmCIYINoJIYPdpRLNFqDMWV0pdjOCYQx4xxhCJMWRMIPpKOMM4HBxKLHAMl7jhRBeCI1JEzMSLrIvFCMHYpkor52QgIhKCVlqZ8KvWSilhph6ZJ0BY9XNhlBxp2VX2tW8qfq0FvK0qv2DZ+fY6FeDlXrsZESCWAOeZPCvkSVZIpV8ucwYwLItny+z1VmstDJ4tFuOs3EjChMHlNHq73xrEETP13eiKtrxt6ywQbxxo10nH5Gj+rOhlwzfgZ
5EjtJVwmGu6m5+e9cnpzBqyqPK4qxiq41Gyyr5S3ZWfamdD9Wgh8MWu9taVavelw0RAJAQPHMd7i4V57kRE6zbUuJe7xI+ZWWicJEQAE2JyE4wUQ9SARJoZHBO5XZerdhs4Kq2ZQSgiMiYELxhqK3NGWiulgKEpvzD5KSWVVgrIZk7N0xlNAQqSsTeo7tPolUyF92+8PVl5AKb+1wxUXdlVgvfoQKdSwT5kWxOM8vJgoX4xXe7n5Wtp/GJZ/tHB2d+8vMMAn40mCpdf6YZnRfHT4fCG7r3RiP74ZPiz02ES8K+vde902pHggd2f0sKKKhihtTOqSWOa7cDg2mg1qOuy01dud0jfo5ozvmKlIK5eCirU1V5klXJl9PgfrDFji9uwbhzXo1D+3qyGh4pEAYTZU9Bk261zZOjT23FY2UJeZbKqQtR776ZPtrWmDWRntxWDcTYrNVrzlxmiAkCzAYrRAwRaa6k12YpVICDiDBC1Mll8pU0+n+z/TaQLyC8M8ZrL7Thfk2yd+KoPq06r0QMXvqyMeDMBEM6yMlMkNQHC0bJ4upTExJ+P57OiuJWmz5fFrVYzFfywoGku201xuZFKUqdK306SqVInhA0hNNGnw/lZrkNEjriRRruNOBGstuUuOKfMzpyVgIVtk1epNbeGISIoRUopzrkpaLzocrkhAa8bV3/03zqL3JUXrMz66joGZGgnjtM4hndr6HQ2qzuiNmbmhsLoOO2SMM7kQl+fx40NxGqVLAxtOIQ8PNEiEam2PR6iCSK4XtSMPMttruFgzWAExhhDBiYuD9oSu5tohps1aa2Mc6Q1aCCNNZbxD8y02sr/BRfPcErJDzm4GpsVNPgxc+LzysuVYEMm6dks35tn+8v8+TK/mcYa8EVRPsqKrSD42qB7fzrth0E/bq7F0X62vNPpXG00/uRsuFB0NW48Xi4Zsq93B52Ac9BPZvMfHJy/2UhezmdLpW42G391e3Cr2+LMrMujWqOdJ1U1zprH9aYSQFEWk8n45Ox0Op0mSYLILu9eardadeuxQsQF9e2MGPQ/ei70hFdX4vaCPodXv6iuyc+TKTqTtzr9glkmLNUwZ1YhmuyQUwJ+dMERKJrV6wx8N4CIbLmxScjVXDbmGlRvMtSnSQVy5meI+wxSKqmUIs21tqkBBBOd12TqWWoPEAULP+fLO6+0GkbyITHw2sTGSiuLvVLfgC6GVJ2liealmhZqoek4108X+cez+b3Dk6+s9UeKSlWOpD6dTIjzb3aai4D/fDonhC+U/MHx+Vf63QDhZZZ1gvDr/cFhtvjXe/tNxt/ptglxK0kvp8nxYnmz2eQIZ1n+o9PJ80JLwMtJeKMRBojCZE2AGDIC5986Kw2cp6K0yuez8dnRdDKcj89OTo7LUgVJIhiWs9Mbd95tNdvWG6gpblzVHjXYVuq/ApeNDVUAtiIj8FEj8H6pw/HKZKjF/ZzkLZz8FBAm2ehs4Qo3zHozoLX1ujz5+1VfjKFG4D6rROgvYAK0WgMwby7bhyvWnWXTDJsaJRKMG4s2YJyAFJk6FucEASEyYIwIbBWVUkbFO1CRBmKakNlyXnebCl7WiDRSXrHNV1Q3+GnphpAQl1KNc3mwKM6LcizpSV40uXhRSAHwm1cvZYg/G082wnCsqRFFqeDHUj8r5PU03Y7CoywPpLqWJNea6UFRTkspEa63Wv/w+lVQaiELjTjRuBbHaUSfnJ0d5vKtXvtUqqcno4NSDkT4rU6cCNZLEs44J1VqnQq2mYTdgHNnr2uCIs/zxXg2PJ6Mz8siAxbEcXrl6g2ttdIglUSZjw+fNa7fFSKoOMKPBlgoYW0hHq0ypEEsWTbSK9cgiz2H8HqMz4q5fpzHpfeuagreFCw7T8H+4w0rj24XtvOmmLml1lozEzdyKqHiWyJyQVoX6vMa6cKLuTCK/cgYmBQooiyVUorsniXGLjXsTsYGVUq5v1KEIeeccVZdx6ooqpQ7IFAtC14Tra1Xssay5RJNOJf6PC+Hhcylmin9YpFnSsecPVpkzxbLd9qtsZR3koRINxnrBPyoLG7G8d1BNyf6s9PhdhJfTZMux+Ns+cs76w2OPz073w6CS1Hw+Xg0kvp2q7WdJjpb/M9P9jbSxq2WjgVrJqkupy8X+bv9LmbZ3Wb6YDJ7kKnNNJAlU6DH2fLPjk97Ufhmu7mbhJtJ0Ap4isCWk3xyfHz4YjqbIxNRnMg8V1KJMMrzXBPkpQwD0dVKFpkBKFWYpAofiLWUhFd4VSmfobP6BIcqfIL2gZzWACCP75XsSC18BhVtgz/AkKbQCOgD12h4DsF6sgZAFdTJcgmgjwZXNmulBtBW56DddsLYqG7WVOkytLF3v1WYMRy5iyAoIKW1sVBtbIvZ+aC1zmVJi3khNVssgY04E4ssWywzpZSWstlu7Wxt97vddqtFYAIz5KkTELkLJeZKj7JyWsqlVO0wGJdKk95IotNc/fTkfJiXW40kZmwk1X5e5poEw6KUSymjMPx0PDs6PV+7uvtut3W0XA4Q16MgQfpkPOmF0XYcxYi/82Tvdr+znSbdMPxoMmkwvp4mcyn3hmNVyLjbTYVYRsnNgD85PN6N49NMjYpiLU4I8c9Hs0BTP2ncHgx++/GzQZz+rZtX8rL4i+mCR/GlNPro+OQjxot8eVvQr241OpBPxpOSRKc7KJcLqSQRLPNseHiU5wVyzrhotdt5USpdZ74Vf4nsGFUj4o+oRxRW8+ZW9TtF5wvoTa2f110VF1vDb8Utq13QfS/cvr8e4PalzdIf+zUCaO0XalS8XNnYzPkSFT2hjUFgbZcRtmIQg6kX8QEG43wwxjmwqu5Fa9sYIgSbONCkkUgr/dkXnz978kwpmeXlcpmVZQmAUqoSIGk01zc2vv7BB++89dZg0I9EoAlM6RYBHGby49FiUcqCKNT6YLmcSxkxcaR0qakjxHFe7u29vLo5SJLk0TJ7NptvxmEvDDnnL2QZhdFQqs2Qp+s9xvl/Op+cTGedZvP9XvuHwzHXcq4UAd+bTlRRrsXJN9b696ezo2V+pdFgjA2i8Eaa/Nns7Acn56+1Gg/my93B+neuXCatn0xnqMVGI2VIf/Ly6JsbG3d77YSx0e7Wz84nT+fLUqujXA7K5R//xV8U52fddvKd3f6vv7aN+WiumAiTBggpS8nEYrGYTmdZXsyzPF8uAUBpKKTsdTobeU5Nb15axemw4j0IN96v6D6bgrH8WLcNLQq9f+KDfQiepGq4ruH1IjqJEEB4uxacpqOac2R6YJxDi1MA+8B2RN8Td4qzPQHcNPLTjlbSW1iFwBnachPS2t4XgTPGEKV57hECEGmluOAAwETAkJli3TSKv/bee99+/71QYLZcaFmGYaiByrw4Pj394uHTn33x5J88evwf/uAPdi/vrvX7nW43iRMWp0eK/8nZ/GQpsdNm3c41rXdb6SBOzsvys8kEcsmD6Ea78e3XbxZaP1vKe0/2W3H8lUu7PxsOl3lZar0s85SxD7bXP5tM9ubLsdK/sr15UpY/n8wWSv/1zY1mwP+XJ/s7
ofilqzdPsuLH55OlUkEYbUfBh+ej02X+br+FAf/sxcH6jau/tt7544PTf3F4+pev7WrGQ17+dDj8eq97o9V4vszeKPX96fgP7+3dubqNoH/2+PFkPNHD03gyvMKKv37r8ru3rz1/8UJK2eoMgjAOWcCV0oBFUY6n89FkEogAkGnSSqv5bCGVRO72jXMPr6+zoBkIH4r34ZaqsMIhEcGFgVfY0PmfgATKfOswfbFsqrJXa34/uYCa8EBHRKNAnaGADswO38ZZcfmsGt5NyNcarc7WrNL/jke9rWubYeLhPh1qVAZHxl31HwEoTcr4Ssb+AEacE0NyMVBEFjbbrThWOJxOJxzCVqfNGe+t7bz/3ge/cvjyP/zHP/mLB3ufjsZRFERJHAjOkJEQDalDScsghptv7DX6nz8tO+uD1zd7b7Waw0Qj4wutn07mo6xAwTe31iLGPz4ZZlINWo290SgbT1+/ujspJSPsR2JbiJOiGEupFf2d7bV7kzkCAuc3eu1lkf/7oxOYL5uE37117c1ee/Li8NNHz9/ovnW91VwsM6XVsNC3e53z5bIdCEySHKh4cTxpNG/2eg8nsz8+OT04n6go+s7Wxh/d++Lw44+bctmnrFku/8Gvfvfa7ubRyZkIEuRqMpk2GzoIQy7C9a1eb1DOFvnp2VmZZY1GEnFEwlmWl2UJvmbF/TFFyh6aKwxnQ0s1a69W8kI2a+n50cfIvQvv4vn2GBeKX3UD3I0qdJF/ErZN4tjISh3f4AKUAJ7MwNsnKzwKjvap7sVVN3fUT7WPAE7BGA1vU53IGHKGDEmTVtrsUmviSSYWZbP4WmtSSslCU5o2tNbHpyej4ZCIAPlsWfbXtv63/5u/8Xe+/0EvjXSpymVeFlKWCpVKApZyakzOwp/8UfqLHzQoG8/mP/3wi5PzaYzseLGcKV0w1mmlJVDAg1Gp749nu61mNwzm86zf6dxstRZSf3Z8dr4selH8cDh9r9O+0WpMNTyYzv7wky+y83FPhM20MRCiuP/gW1uD1zvNo6zsJMm3blxZSyJkvMgKPZoQgx8dnrJF/vx0xBgfxLGIo8eTGUh9cD7++fODnX7n8lrrf713//H9BxvL0eXl6QZkf/d7X799/dLZcEzIC6lyqaXWIk7iRrvV7THGo7R56dKltUFfkZZlQUBKKhHwOEnSpGkItD5YdVDWIy1+lGsaGS06/UhaU8GHP8wPzluqxanI+TIWMzVAmFONKWlDjT7s6ljScJsxSohcqY25LlttguvgCuzq8wEATMUeVZEsHzNykclVZWGSSQatZqWUTxeZ85BzYEiuyomIiLQGCOLm1sZW2mwWZTkdj4UIGs02D2ISjV/61rf/q19+v9+OC6nyLFdK5YssL0ogEEEQhEEyPmn/7E/ig6da8P3ZfCHlWhTebSRrUYCE+eHx8Wiy00rbafysKJ8u8tuXtjd6ncfzvET+j+7e2mq3fvj0YLfRuN5uX0+Te8PJ+eno9uXd/8PX3w04+3S6yB8+SvNlnjYbYfB4vvizFydXO+1C6d1mM8iyX+y9RGDv9zvnWfbnL4+HpfqT/WPJgv3R7F/9xedBFP/GzvpXOw18+fzwP/77wd5nV2LVbkZ/6f23X7t5czKZZ3mZlTJJG51Od2NrhwXhIl8OpxOplSyy5XIZxvGg12EcNcE8K0ql02abB6FFnTPT6tyJF1iqBlX71423c3TAsaP5T5N9KqXVzn6sL+LjwsVdXZVDPQiD+9r/apaEK10jUpaZHeTRwdZAyVycwFbemeXv6PS7bZq9FgBW6+YAwCxPNv4+AXCzWIQxk3GVSiljJBCgJptFsLaBJtKgyGa9GCadQbcoRsPzuNWJWx3ORZnnRCgl3X3znUYj/eGH9754fjJfLAVjXKowChWRJCBkLFt2P/qhvvra6Pqbe6XaXOs90jCV6mw4ub6zs9lMZ0WxjthP4stpdH883R/PfvXapWEhrzTSPzs+H8Qhj6OhlM0o/Pzo5FYzvbOzmRBcbiTHUuk0vvz+ezHC6WKpBH+a56+V8nQyHc6z9NKV0enZv77/bDMJi15/VJb//smLv3p1a7GYH7caj6dxJ4nf3N0a51mM1B4eru0MGo1oPYq/8vbbZ+enWlEUJ8V8UZRqs7++WCzCMMnL5Xw6ZWQWIOSqyIi04EJrUlqv9bob69uMcY8LeqWi2dNqhVTnTtiojVvg4egTKgL0lmC1SLFyVKy3tMpnHrXW4fLeC5FA5ySZiCZaRV7FjKByyVfAjz5XSWSr5SurwrXbTSi0hgn5yaSJfATU+PLovHizqQgjsLuEABn+RCBuwlCMEUMCWwSq7R6inIfRzuUba+vbwBgQlHmGZu8dFhBrXL319s7ulSdPHv7gF/efHg6JSAOVpJVSoDUxZEyw5w/ak9Hk6t3DNKFlWU7nDOnF2SibzDe2N2Yo1gi+GE6vd9v3xvOzvHw+nl5Pwm6aTjQ0EQ/mixKI0uTxIlsMJ3/88Nn1tf5RUc46a41AnGbLH7w4FEm8EQanZdlJ40+OT5iOwjR9Y2Pweqf1cDT8yXiaPTscb/S/c2nn/mgssvmTZ8//jFQrTXZvvrb4eLPTFDHAW6/dnExGRVlGcTNMmlu7V4MgQsYG61ukdZ4tVWegSZ+dHIVhNBgMnj2dSSlLSbnUrUYzbTTBaS1HiBVisF5O6iHr/1S5X/LHw0pxI9aQYpx3unjNV10lspVKzgy1VxZmkZnZVttSofV2NBGzsSJvWyKYQiY7Xep+FIDzlMB13J7J7PNAEMyWoOjMl5olXrMJqgirJgIyRSGkAbjLNzKw6U4FpJQ0G0Bo0JwxHoSNIFCl1EpiEAAAaaW0lnEhyzII0zff7W3vXPr43sM//cX9WVYKzoHZjf+AQDEmhsfd+bhYH3Ru3u5t9I6VHp+NFkl4mhdX0+gP9g+CIDgATNP0pCieFvn/5+mLnAevt9KdJJqXRab18nR4ab2fav2VW5f/zcf3ck3fv3ntvbXeVhp3AvE7v/tHrV6/tTb41lpv/+x8/9GTN7a33u20fvL85aX1XqIndH58NtqEnfX1JAl2d+/tHd7qdw7m8x+fnq+laSuCdhRtrq89e/6Mi2iwllKWvZwvNrZ20iRhnCMXiFiWeSnLOE0n08lskbMglEVRSinCYPfq1bTRrFtZjlIqrBBUQEQHniqcVOGsFj4iAl+D5QHnrghYjXi9ntmPexW0XHXRBAOosoA26WPylszkOc0P2mhZ53FXTOlor7KCtc3jOyOaTOBCE3AG9ikNjkqNLLwv7xIRIJAxxjQAkS5Lu+rDGMIMGXAOQGSynVqZzLwpSWYMgyAMw6gsC600ApRlQUoiY4wzpbFUvNW/9Je+uXHtypXf+dOfPD04C8JQSqlLCWADEIKx2dFxfOX6r1zZ/pOjs5dPnl+6vPsP7t4aBLwTit/9wY+//s2v7AXil9d6XY6//Xs/uHn71q/evLSZROfLbKzkj5I444It80QE5csDJnV+5fLpMr/daYYigDC8tbN5rRn/7pP9NIoHzVQge3e992I2/539I5EtMUl
UWeZET6bzF1mOefbpwcmdy9v9gxdqMh0TXt3aIeRKY7/bHQzWCHkUJ0naCIKAALLlYjI+f/r0yXA2LfPl/t7L44PDIs8E50zwt95//627byPjX+I3rGh58t84twbrP7n0v/PcV71vMFrRee7GOoSV+tHavaqleStzxrpDfomE+c8kkJSDhMsTAXM7Z6IDZtU4e4/6dYlIawI3cQAdRRH4HXZqxGlX5ZkFyWaxMoIpgzfAJLNRjrkqAuOAqLV2G+5ocy3SWkolpVREKIQCKlSptVKIUmsNCEEYJg1AXkjc2bn8N7//S2/cugKchWEYhIFdGsiYGGyEb7x5OJysJfFkNIHz07VG/OB8kkt9KQxZXo4KdX50KoT46b0nSZ595/rOw/Hsf7n3LOIs04BRPJ/M51n2yWiqW+13b9/49u7G4XT+py9PPj6bRIO1uNe7M+iCkh8+3jt/+ODl4ctPx4uvbq31QWWHR+XLl51G+nKZX+40Xy7meZErhi/Hs9cuXc2v3N4b50sl86I4GQ73j49m2ZILxjiTspiOh2cnh/svnh+eHi/L/PHjJx/df3h0epoXRVGUCuDt99771b/y6420WaGz5q1QxZAmSISkfeVXVTHngt6Ol6C2kIicQQguUF5zhWHVzLW39HrfO2fevdEE1kmy8we9c1Zz4oyuJ22YFarl9cyayWQdcFvN5EnQGwcOmQjKWZwVrP288QFWH19AxhHNZk4m1VnNVcZN36T26XhZFEXAAwh80Ax4GE3nhVaSEZVFXpRlqbUQIgwEY+kyW66tbf+176azP/iTvWeHTAiBTGU5aJLd/vuba6WCvWUxB7z19a/95ntv/mj/6CAvf+/B81/6+vvv7qwLwf7s+Pzw5OybX33/g8u7+5Ppp0enaRz/9MMvysn8O3euPpjOPn70vNVIw36/1HSp1/7dR3vroeg1k79398bz4eRXblyJovAXxYTC+POz4U/PR7fbHX0nGvW7T8az5+WLcDQuk1heuvrkxREFZ4Moij/49otO74vpYePk5PHJSXl4MCuKa5evdjq9KAgF50UhgfGk0QrTZtruPnn6ZO/Z86NFxqNo48135O23KYjspgw2W2O1qkeJ2yTuIpDQ4bHyeIynYcfx1bOoslj9BHCQAq8w60Reu4r3rEVNv3vYoAM3ObSRS1qBrUElAPug4iq+5U60H8nRpJ2E7gD9ZR4cY0wp5cwHZMg45yCYLv22ZJaABWOMM20iZHZ/KEVEoHUhS8pRA0AQBoFggne7fQIqsyyKYqVkWRQSSGlFHCLOVbZoN5pvXrt8eHSuSsU5E0JEb73f++Z3Z8ienZ0/G03V6fHVW7cSxiTn/+r+0ybAd67unGT5jw5Op5NJ7+zlla++9YuT8zu9VtpI/69/+vO21v/Hb78fCnYljf/li8PtOPpLV3YDoJv9zheT+Z+dnK8R/+nJ6IeP945Ozr/91u32G+88f/T09NHTq/1eS2nJeLC+dnw8/MrG2no3/eHpmItoXGSUZTwKW7NpHqSfdbaXhyezklGmP/zi0eP9l+04EVHEBUdNQoggCKRSs+l8OZ/P5gsVis6tOweNtecvT+9ubtzptokqOxIvQAds0bNTxwR2IaTVZl6Tm/p861v4wn9vC1TDXvtYMyQu+PJOnVb+u6Er4VyiFXSu4tU8/tXu+Wkeu+J8M9srH0+rB9LQl8LXwhb2mNXb1Btt2Q/ttiO+np501VOzSZgmUqQ1kSJSWmktzaxSZZkTLAo5lZRGQRqIRpxEYSiVIq2zMl/mWV4Wi2w5G48GrealS7vRR18sdC6JSsQrN26/s7a2HgW/V6ifffrZ37515Vt3rs/y4loz+eG98d2ttTQMbkbBt9Z7P1jMkktXbm5v/+zobA30tVbj58PJBsCTefafHjzd6nfY1StPXhz/84/vh+2mPDvvdzu3+90XPNBad5upSkKlJGXZdn/wl1+7EnP+dq/5//rki48++jxsd7qN9O1ea6PX+6Nf/OLlF/f0u+9/9fqVGxH7BZMPT+U9GRXNHb7VvDzo6pP9h88e6dmcsmXQ7QftTsCQFblOGjpIZKK6V2+xze3ifHI0nf+Hx88Hb9weRKHN3VUJnosq2EdtnBFKDn3eUvS1oeji7WiTiXXMEUF9/Z0lvi+p5/8S3iISjrpNjGllJXj9Kmj2fbJGJHFCRNRoV4m4xaquYS7HQKuzBJz6tu/rFnFVr4cm1RkwxhkDxkyg3tZ6AjBAYynqUmqpSVkzlDQpIk5EQEqrZ4viRU4fjo6+1e8ca70WikEkEo4NHgQRB025VjNZtIke7b/Ms0Igiig+275+XyT3Hr+4FIpmp1X0Nvfj1h8enH3+4vDNy1vrN6/96MXx/XvP9Hj62nqvc/Pm3icP/vjpizIQv/34RdBItzrNNrDradR/6+aPXh6dHw05gztrnTc2+3+ky0ir6dHJV9fX/vLO2t1OQwL93//t758+2fvub/x6X4jf+/mn7//lb8Jsyc5O4/7g4XD06U9+/o9/4/stpSGKQtAfH5+Fm4PF6dns8/saUW/vKKX3gkZvsD3OKNrlMeK4VFkpRZr0moku5HyZdS9vnyslCorb7XLvxdPp4hcn59+9tMVtTBG8H25QZiOXThGjz3xXbpLRcuTYrgYRYyNcMOEMImrq/kvReeHl4gAgvCdk5kTd1PCAM61htr3Vii1m9+02WsEbl4gAJururQUfob3QCB9MZYhS+4fQGLGgN8G1RSFppXnAiXHidstvk+w0u0WadZ4AUEq1LPWS2EjqAtnPRrMNwX56OryWxO+t9Q6L8koieo2dU2qPqMiam+9/99ep29dJ40qURCI4ni3+4vlBvMh22o12EAgOT8+Gs9PzvNv9+2/d7At2rxUfzxeT4fh779zeTqK73VbIdh/PF//0j3/y1TduMYBbzcZrb9w6/K3fPz487dy8cqvV3H3j1j/9wQ+fffT5L/+jv3+yyH/nsyd/9yuv77aT4WL81Uubd/utjzb7f3R0PnyxH4fRP/zG+yWp//Hx4//13uOT8yEfrH/3+uX/9PnDPwQaPHkM52fq/Q+SjTX56Gk/ChcZUiFnaTvrdZL5rDw4WQQJRK0tmg9Ph0W3y0s1PDjevHa50+0eHpz8OE06YfD+xqAGn5ptZkfcfqtJGVPO6l+LHLt7j1PmGvz41gKcFWdhnUJrnEU2RLpySnUWgHsON/qZQC6uUKc6e1Gbf3W7c9U4b+VQcCit2b/abeC7Yu448mbVe+IMGWMcmbYhU1TalYuA2amXkDFiTCOQtIs7takc1ZqASGsk6gbIQA9CETJ8Iw2/PuhsCEFF8e+Oz3/rowfPxvlBRvtK/D8fj0927m5+8K0fp4PPWPy9zfVf3Rj8tWs7t0JBp8O/fm3ne7sbf2Nn/VvXdl6enuHZ2UYgbrebVxrptSSmh4/6ef52v/One0eHk/lbvU633fz86GxG9O8e7k2z4u7Na6zZ3Gk1vhhOf3hwdnNnKx6fqPm8FQVxt/l/+fnnj18eX711c1jKw1l2Y2vj9x48Pn3wRSvkDycLhvyDOz
c//dnPijDavX6tH0bdRhw8ubd8+UL1ByyMmUaxs71/eJyfDyGKIc+ns6wcrHciwc+HmaIiiUPBTk7PdbsVhsHJ+ShupE2G2WL5o+cvJ3nhuIB5tDhWQmujGV+35lEAEIB23jmRSZsjOq7FOor8WPuPGsgZDB4EUB3nj6zhiLkIwCocaxetx49s8BI8z5nWo9bVkmp65ZZQY3W7i1BNibglm7QaRmCCm+0btNa6MFtCutYgIGPM6HRt/CSllCliMUvsEQTovuAfdNtzRU3OTws5HU42onAjivTp8OnB8Rvdxgfr3WA8+fMPP8ey+DubvQ86zX/6F18cLbPrafLffuu9S5e2/ulPP/39Fyf3zydf3dncefv1MghGhQwYy5ReRDEUy2eHxzd67buDzr85PD3Pyw+uX7qxvX6rlV4dtH9wcJakabvdRsCvbHTbcfCLpeSt5uPHTxDZB+u9FodZEG698VYnEA9H0504utrr5m99Zbl1WYH+83uPu2F89a03eLMzHo1+609/ErQ7jIvpxra6dpNzoYgKWYrTo+z0PBusi80NkGq+zKnTjjiGAStns7DfbbQaAaLotAVgpmHz0hZjLOS81PoCHlbYyBR5uPo7qP1nQeyeweUvYQOT3uV3cNK1KgtvIzjPujIQ6hjDGvz8PgYrYVj7lVuupMEsbwezkzdzS+bdXKP62lhyzYSay1WDuw2VoSvy8wUo5ktmQ7LAGeOME2Kp/SI552khklnbSaSUIqXIxURNcNRs6LyUshkEy7Jci4JRIZ8uSwqj97vN6NqlPaV+cjJaD8NvvXY1L+XhPHuz29puJE9OR/uzzOzs/+2dDRTiu9uDRIhJXkrGkzReiwJFdK2RnM/menQ2fnz/3z15OVb6H9zYPc/ynz0/EFm2P8/u9DrdQPzw6Yu3ttcJ4MFoPpplc2D0ze99zJOPT4f3J/MsCPibb/z4fPI//OLeh8PpvclsbzjS62tTJf/oT3706cPHv/vDn015fE44kTrttN5O02+8/0H/8jU4OysOXs4fPFbjGUax7K2p6aIoqNWIwpPjJbH48m5jNhnPFhPk0Ou24jAWotVrM6XGy4IYC8PggsazKMMKhmbuu81T/ZA6Jl3Vy+Doxptz/lcXQAfvalV4ALdxQQ0bmgg1OSIkYU09gytnEtdhbWKV9V1KqFYsooEYcG9rugZVvFiDOxprw+zuQGYD35oZ6tHsthdlHJHbBeCklXI7PwMCQ8Y0gEkjKa0lUWgr87RSSmp9nqmXpWA8OMrKlLMn84w3kmYUbiXxRr9zI4mutxp708Wnhf7gK2++M+g8HM22mumVmL34/PPi8ub+bLkWBn/tzpUfHp3/+uWtJC/WABln24342Xj2jd11CSR/5ftqOFwq1Q+DTiAixGBjcF/Rdl7uTedzgGBtcIrQEOxfPno+kXK5zMIo/vruhiL66qCzPzx/+vBJp9//W3evbSfxOM+CNEpGIxWKxt3Xfv3aztPheKrViIoG8J3r14so3Iyjm7duHokAGbvUbEqVv5zO2OmIQGLajHudxXRWnJ6VRyetyztJuz1bLEe5ZL2WOB8WgGvNdO/F4fpr1zXRIi/7UbjKSMYvd0Co+ezmLQJUxW0+mujRWSMjuPhaiZK6uVABwwcuAXws3T70lXlG9VPBX9/qc7soA2yWvE6HtVv6r7WbRjWvfsUA0P6JzbXr1IIJAGbzXUQT6zSekKkrZEDc1tvbpW9aa2kwqZRWsijL+WIxXMwl0W4ScaBBHHHGp1pfX+vtpDEHWA8DTrop+EkhoSx/fbMfMDzI8i/Gs4OiPC3yAPF6u1EofavT/tpG75Pz8c128x/eusw2N39v7/jH55P/8ZNHEeNfv3YVrl7/0dl4f5H9Pz57/J8OzkINnTQ+zPI7vXagFQ0n393dvNJM3lzr/sqV7a2ylM/3N8KwLGWWFXe7bXj4sByNOeK/f7T3clnkivD8nB8c9ZrpRhB8Z3tjlGf8yePZeDSaL/74/sNPDk9CxtT5SC6yNhNfu3ypV+QciXZ3eKez1u50Ww3otIMb13ivv7G5HirZioM4DHcGPcqzRVk2Qw5Ex9PFy9lcO78CaqNvi9PIu0CWusyxhggv2G810iX/zYUIY3V87V42OGWVradxz9JmxD2KvYdUXQ6rq6FJ5+g60SOAezpRlYKqgkc1WvXtcwFfwkoidur54xCRM/MIGrMdmV0IL5Wy9a8EnDEE0EorqVSpZFkWZbnMs8lkfDI8O11kPIi7YVCSjhljyP7qRv+vb6/1w0AA/M2dtW+v9wFgLQq+sdE/LMphXt7ttbZD8UvvvrW/feXfPNr7ydnkrChOi/JsWfzgbPI/PdzvpBGQ/snJsBEGC8Y/PDpLSV8PxC+vd9/tt7+10f+DDz8Tx6e/sbv+w8d7y/Hk1y9vlk+e5tP5J2fTm83GdhB8cOvqZDwr8+J2p/Hw+GwqFaSN33jn7tfWe4M0+rdfPM7ORpmCb9y5+c217j//6N5plpcoFBfrg36nmawF7On58NHey7VGozsanp0cP11k0dkZFAUuFtn5+dl4mknFsiwHGGfZQaaS9XVotobz7GWYXrt9c8nErZvX1pJwt5X0IqPlSVtr0ngw2i7zBk2g3ZJij0BFpDwF+kH1IKu7H1jzklcIjZzbpZ3i95lCR0/kaqAIiDlr44KfZC5H1RsCZvi0Os6jHLyn5VEI3qZkrB5YRbtDpMcw1qEJlW1qK+cRmV1YrEkTSU3SuvPMhKC01lLJLM/PxqMnBy8/fnjv40cPh7liIiyJSsJc67lSD2aLmZS/fTw8XGQbcTSS+ufDWQDw7qDz09Px77w8+1dPD59N51/ttb/a75xr/ZPhBJXeisO3+u3//vbu+73GP/tPf/FeFPyf3n3tb1za+Md3rkxL+X/7F/9OHx0nnO8NJzf7HQj47d31QRzuNqL/90f3//hwOMmzZVG8Nmg/HM8ejmdrnXbUTg9m8xzZMon+/cf3UZaPTs6fjmfrrWY3CbOz4+DJoyxtLAjTfuefffYw23/eyhYTLl7MlrLdUwHb+/iTrFSD996TcfRk78WSC9XvQ6PByuzs2f6Mc0piNpnAoyezrJCjcf70ebn/cjhdCCFUUUitx6UMhIgCAWgtf7LLZv36xYor/PqJilSdF1FB1J3wqsNxwQdy7pV9TIyJZnmqskhyH5AINAnXjlVnitzVLKCdIvbGgHHKHPfVjNLKgibfW5dltwu0XKmrPxgdrGsbhJvtyBEAzENqtNZIxICQiAEA4xRw0lpqtciWo/nyfLY4H03yxTJoNvnmtZ5UgWDnUv3obPi3Lm19uH/8Vzb7gygYST1WWnC2UOogL+fT+W9e2mgG4miZ/dbDvXxd/t2rW7nWv7139K9/+tEO0d/5/rc3InG33/nnjx4zJS/1Or/38PnfuH3laqcRhCxot17rtvZOFSP65huvncwXB5P5u1d294Lwn/34I0a0zPPtNBZB8NPTEWolFvP3d9Y/Ozj5leu7x+fDnxzsiyh6ucxn09n17fXzkyM1WNdpqvLia4PuT4r8cHJWd
gd6mV1VmifBpw9eBpyJZmPv+Ly9vk5PH8oXL4CFEKei39WnZ3g+Ut1OO42LvGDZfCFl3EzLZSbiYDmdtgIhlWbEpFKzotRE3EACcXVraXRekUafcXLf+iG/wCweQnVN6N9UC57qUSF7G0ddTrVaBBIQWIBaiHl6rkGzNkuICIhh5bG7s8CWz9VMZnKumdtEh5zpXQEYa1tMoTufyO6IgnaFMJpdwH1ZiKFWAFSKFkWRLbNlKeezxXQ6L/OCNDUQR+Ph2WLRSNIbSbjfbIzzfBAFz2aLv3Jl63/67Mnv/+hn//vvffN6K/39e0+ePX72j3/tO39yePrtrcFoNP3dp8+3W43rrTQGKtJkPjwLArE/XbQbydvvvrWVxHf77YPR5D/uHyHnbDF7rZUMorC52f/5yXC8zJ49ePy13c020D+6tPGHQL9IgkfjxZ/uHd1KwmSr/zuP9qjd+3g4+9EvPr3/bG+xu8Nff+OzJ88PlTx+/mLz6mWxfamUcO/ho881KME7a2tFo6tPR/zzTx/tPWev3YZGozgbrbUa5Wx2Pp1yTXpjhzY2BGJZFuF0BFKFyKaAvWa82Ntb9vsyinvdjp6Nz4m2+p3FbNHvtYHg5XRxs9dhjPmxqVwiAL+U0iBgBbl1LluFKdbR45BX94+8ljdHO4YCJG2fJu/oi9zNhL9abSG8jxSAe76XW+RfD7JahW51/mqzrVaoJ2a9Y+jSAMb2dqf4qJPT3eDMU4EImqSUeVkIjkzKpSzOx+OD4Xg8nJgqJSklaWKADEEvs/zk8EW73+hvXum0302jaam6SfTZePb+dP7Weu+jZ/ufn493uu3rW4N7z/eeHp3s9rpfnA5Zv5PleRoGL+fLWSEL4NMo7SNdWuv90cHpkdTH9x78xuvX39teezlfPp0u7rz37pKxo0UWM3yr1wKiA33zt1+c8Cx/vxn/6rVL7W73Z/tHyeHZt7cG7/Y7R8vsoNduBuK/+85X//jgJBHs8qBbIF3rtr9zZefT8eTFcDzY2exzHCKPkuR6HAaCnutd7LYkIZUSAoECjj75JGykFCdquYT5XNy/B2EMl7YDpeXOjiAqigKUZlqzIOw0kjTL5kW5jONCiLVOGIaCEANe3wUJHJV4Zeuw5FVijfLqoKzDl8APtz/Gesyv8i6aZ2lbyJstUQ06tYsYESIKchvUuFbaIuXaRKkZumAJz6wYJjCbjqH18WstqJgXAc2CDSRA9E924ASIdkMHIw8fOK7NV2SMCeSAUBQFap0tF4UsT0bnJ6fH0/FUliUQMG4fOmsWN4HWw71nZVHwN74yThJkPOH4TqvxdJn/4dn4q93WndvXP53Ov7I5eLoUjStXPjse/p/fuPlsuvjTk/FiMn90Nvz69nojjjd77Ww+eTmZXY3CX7u0MSzLB430//v08O1e82ojaQveiG79wcnwlqT3uq1L7WYzyvqM3Uzi33zrllTlH3/+8MXp+dvbGxvN5NHZ6Z9//vlH9+5f3hzI61f+3cHJ7OQ4DgTb3u13msvx+Wd7T54/fNhKkmgwkGkSZhl9cVT2252y2BRx0oqXo5E6HeciDDYGRRgiF4HgshHxQBAXGkAUWUFQHJ9AGLSaDX52StMJIuYcIlmWp6fTZnuWxkkoGpwFgsdCcES0WUpPJaveTy2O7YbEbe+4aoZqupjHNuPnnSTnU1viM/FMUw9l4oh2h1BA//gd24R/9lv/wSwPAkeKZqGFybdWUX1to6jmxuaxG0ZBMMadUnahAADSimwUyHl5aLfXNUYCQ0TG7N7J5gwCG9Qsy0W2HE8n5+Px+XQ4GU9kWTICJcs8LybTyWQ6zpYZAgRC8EBwzrTSSmsETJIYGUPOB5ubV+++HW1dL0SERP0oWgBqgLIoIiH2lvl6FGrGJlJdMo9u4Ow8K07GU57ElxrJIAoXpbw3W76cZ3c7zff6raXSPzuffDGe//1r22tp9Mnp+ZPh5FK/myAKWfz48wd/sf/itUGn3Ujm58Ozg/3xwUss8ka7qRtpORrHHJMyu7Q5SDvNaVFmo8nLg+NZpuL1tZjxG1e2H9z7gqTSWqfNxmKxWB8MeuuD8fERaRqNZ9liITgvEHnS4GFIUi9bHb51ydS/8bgxyTIkAMExCBLBaTZfMCaSeBBHyzxXgI1QpEkYiGA7Dtth8Ob64Eavw6CSf50eyVdlQAWyeuiwBuNqcQRetBUvHukvaBCozVIgswlZdbvaM8SIRD2F6FW4Jm3dbfsYdq38VvBuXtnZhUCgmd0s20ZMNWkCrbQmIjSPmHHARQBVm2GGeJXSRk3LUipNUpbz5XIxm56en52OzmfjUZ7lpSyLIi+yXJYlaQ2c8UCIQACilFJpigKRNpIgjgCx0+5sXrume1tLHp3m+XC57AueBmEYJS3BR1KlgqdhECBsRkE3jv7j4Vkm5ZvN9N2d9eeL/Omi6IZBrmQC+o1+eyMKfnY+3B+OSqW/2u/eOzn6n/f2954/S89PppvrOk17XDenk6/K6fTzz5+PJnme72yubbZ4JJqv3bp67/GzIgXB+Vff+WpZSk302cNHe/svl1mezbOgEe5ub1watIOrW6dn40GvNRpP2kn/+u2rf/6DH22u9+OQt9bbx6dqluV6mXOSLYgKQjocZk/uMQRgjHe6zWxJpUTGIQyJCyjydrMVrK2HYSDPz4QGHcWzKOJpQ29uta/spmGAXnnBRQiiz6pXg7USbyGXOvIj61m2ArQ/y0HTRF5tZZ790u0pV8UQVl5CSmlvbc+ynoqpoWNIiKjcE2CV1syt4eREyu4Uh9p562QedAB+vZBSSkqzua/WpZJKSgD7pFqyTwWlvCzHs+loOs2zLC9LJaUs5Gw+G03GZbZUZSlLaTcE1YQAnDMRCMa5JI0auGBpGvabKQ8FRvHm2nrjymt78fpPDs4wf9nncEmNh6OzrNnLm102HZdRkwAeI1zdujRTellmAOzW5tY0n//kYP9yq/F+v/v49OiTg0NdZO/221+MRgfTKVvOscgeB2EjYleXc3r+LEK8EzRPz44H7dbbb91+vLf34ejs8o3do9Pz/mBAUr37+vWP7j86OD67fuXy6zevPN9/eT6f7e7uChEwzsusiMPg6s5Gq9GMEAJkoRBa6vffeW9UFE/v3Rv02u1mmhfl1UtbGpDOhtcubWV5cXw+4owrpbjAbjNdZDnMRpyxrMg5w3wx4QCMcyoWej5UaRKSzhfLQlOcNhZhYzIcz5b5WVZ8bXv9jV6bu/gLw2oLLefgVkuIVuJPZC3EFUh5C7YCcVX4bBKe5kTz/ENy+ScX90RnG7odmEkTkVBaItgHWDGG2vvfWnNEs7rKPT1bgdmdnuyGvyanaUpFAQg0mOL2UpayLAslVVlmebbMllmeZXmelfksW0opEUiTlmVZSCmLcr5YTheL5WyuitJuCWqeO0vE3Tp50sC0RgTGUAjOuV1HH8VxpxF306TdaoIQFARs4/LPoXPvi/tXhnuXIiXzJSjZYkzIZXt5Op0uWFEGAZMaFiePG1HcScJ2q4MHI5VlrxeZOi7uPdBRlO6e
nRXnpyeCzUaTq4MecX4+HCPATKrRaLLe771x5/Ynn95b3+xvb27MlpmSspHGh2dD0nBtd/unn37623/4n5SWTHApi2VRPHr+8uXhyecPnvV6nWKRkVJhGm31exu9XgDUabeOTs+nWXgyWzz47LN2I+m125e3Nw9Pzg5Phy9eHAVxyJCtd9vHJ2el0o1mYz6exEkcp/FwNI0QW712WUpcZKVWQnCOoMsim+luI02bDS0VIgSQqXJS7D39Qspn08VfvXn562tdtgowS4dWQ37JEmRwURrHrUj2gdeV1epVpFf97mrWojDhKyRbSufKUip/yDNoCQACEEWgNBBo/yBRbeo2NCmlbHDVTh2zolKhL00iAAClZVnKoizmy/lsNpvNZ6Usy6KYLmfTxVIWWZnn88VimWdaE2gt8zKXUpZKuwfOIENun2pjFijblDworQk0IhcoOAejLBiLwqCRhJ00bjcbcRxjHMet7iLtHcyzG7S4kuhyPGZaFYTEWTfBSOswwDBOwig4G8+CfK5UTjLMF7OlpkajIUqZRmK+mE6OD3Y2NkaylU2nd65dns3nnWa6v/dyssw5MMZYp9v5gz/5s2aa/Nrr3yLS//K3/8Novmg10q2NdRHHAYNOKI6ns29+8FZAsNDqeDjKtC7yXM7noGmt32k1kkYjPj4bCeQ6mz1/efTe++932q1///v/sdtMMylzonkuMQyhKN9+87W8KOaT6cvxZDbPOt1Wr5UEoJeLZRSF/Vaz12k1kvj0fBgGQVEWy0IWUqVRAKRJSS1BE3CBXKqCyTjSfDaccv4XrcblNN5JogpQKxissaFLRroPFX3W9h686MmjLdp3gDZWBVK1nY2PbAISahO5r7xkxoSxQIgbl98WUxLZBymCW1ZhlvdWZgq5DTdriUopZZZnk+n4+Pzk7PxkPh7LItdS5UW+yPIAmdRqkRfLotSlVFIVUhERN1vbMcY5E8yn7BEQTWYTNSEQR+QcQs5LAKlUzMNmFMVR0IzjOIyAYCFlg4s0TuaM3e6kR8t+fjoJgiDP1SCNoyDoJMlkOonDcGtj/fh8FAouhBBax4jLPG9FkSiyzfUNEYTzZSnl7MEXD7bWN7a2t8psqaRaLLO0kbxz906/1/3wi3s/+NHPVFn+5vf/0nA4BMYODo46ve5av9PpdS9vbGitHj58HoRBwNjZ+agoihfHZwcvj5SU/UH3vbfu7G5tkFT7B0dT4vcePSkWs1/79V9f73V/8eM/DzhLk+Qsy8Mw+OLJ8wihncbns/k8y5IwjMJwrdduN9Px+QgB0zgSnIdROJnOkGG71Yzj+PR8hJDzKJRKkgYlVZrGhdRmG2ulyuloWC6L+WjO4ni6u0FpjK5g14MTfbWaNzXrxOjIk9w+oy40qV2U08cjrdfjntMBmuwGCcavB7cxGWoAl3x3XpsWBv5ak0b7hHcwViSCBs0INZFyRUNKSSK7gI4h5lyAtSeoLEtZlnmZn4+Hh4cvp2cni+lMlbKUKtOqVIoRlER5UUql0S6yJ0JQBESaA9MAGkAAlQSAIAAER47IGBM+ikEEQAEP4igIo6ARRUkS8VBoLngYxs3mUbr2uxOVsPmNiG80wvOJ7idpu5EA6aPTMyC9PuiGcdxpNvIiX+/3EPH5/kvGeWdjrSjKo7OzxTIDxP76xrVr11SeSa02L106Ozl+svditlhub25c3tl+ur8PSEqqLx4921objIcjALx+4+pX3rjz4aefvQTYXuvPpjMeBVmWX7+0+/OPP3vx4sDE7zhjr1290u92P/nii9licZbJl0+efft73+l3upgvRcB7G5vTvJiNJ/PpvNNIllqNxpNlXiDptX53o9+5vrsxns6KPF9mRZbl/W47y/OAB6BBar27tbG1Nnj0fP/g+KwZx6WS86IslGokcRSKvJAh6AhUPhljL5Bau0eZO2Ss+jpQ24XG56Vx9RFt9lcHYiQDwirxCWjMTASt/fJfN/7WAYIq1O7PQgAU1jTVWqPSQFJKX3nKAEgIEzMqykIqleV5WRaFLMsicwuFCBBLpbI8K6XU2aLIFrPJZD6eyaKUpEutpNK51mUptdJSSjCLQ2zk0hUWasUYAHGJKIkYAucsCYTg3CyOY+bZx0ARMhGIOAyiKGo1m2kc5UBBGK5tbNDgyk9l+mL/SWM5/d7lpIk6C4JIcK1pMp1NZ4s3Xruxu7P9/OXx8xcvACgSnAiajXR90Ot2Wo+e7J0NR8dnw1YjVYCdXu/uzduz+fzTz+9P8/LF8XC93WilycuDw0iIACAnGk6ng7XBeDpdFsV0PN0/OrmyvfXhvUcPn+31d7azxWyyWLx1++a1q5eeHBzNJtO007i0u9VI0zCKAs7zZbb3+Pna7s5wOnv++OEH77/7jW99689/+vM//ejTfrcZtVMgkpJarcbV3S3OUANs9jvtdmuZ5VKqUspGmkZRIAIRx7FWMi9VnhetZrqzPsgLeXQ+jAMRcM4Qi1Ii55yxXEoFiKBQq3KxfDlbXmulfFWlYw2XUENh/ScPJheJtD/bA43z4+qaq+oN0ohu32W3sSi5oqpVU5eAQJgrS01EpEgXZVmWpVKyLAsOKIKAcU5EWsk8yxZ5Ns8Xy/l0OhmPZvM8y4hIA+ZFCVpHDJhSAoAxLMqyVNKkJ0mTlrKUCjUhgEbQRNzlOYmII0NSqJEYA4KQs4CzNAjSKNKkF3kBpEUgEJgG4pyHYRAGIk2TTrMxnc8ni2UUha20wQc0EOLNG5dffvHJ2cFep5220uT47DyNwlazubU+2FjfLDVurg0Q8Xh43u/1sqKIw4Ahe/7iKI7CVqs9ycvj8Uwg3Ly00wjZoLt9Np09+vQLu202UiONbl/d3b919Ucf3d/d2uAAHDGJgs8/f7C+sUZRMFvmB0fH77z/Xgh6d9A+GY4eHRyR1qR0Mwg/uPva9s7u6dEhZ7hYZFEUh4y9fPLkm2+/OZ8vijIfnp6urfeOjs/W1npr3W6v01GkdjbWpVKCiwBJK73W743Gk8PTYasZa61ms2WeFXmeh1H08Nl+lhWB4K1GMo/DeZZLwEgIpZQmSqNIKlUQKQVMSV2Wv/9k/7VuczuOtOdI7Vc9OgjWV1vYx6rYWOmq8iewD4Eh/xBEq/cqe7By7cHZojXnDNyzu215gEBkZJ65XqpSyflivsiW2XIxmc1Aa855EAaIyBFBlfNsOZmMFpPpbDafLpZZlhdKm4amnLMwCDjPQZOmoiikCTMBSEVmgy6bbQVAjvYhBwTIMGBccKbRBF5ZFIoQkTMslMrKclkUseABF0EQEJIGUAg5QBPZYpnNF0upVIJMypKVy+3GIKdg0Yjzs+zxZHQ8GseBaDXSjbW1Qb+viRaLRb5c7u3v9zutMs8E0Gg4C8OIAeRZnmfLVIg8CsIoWd/a1oTHJ8fZbNzttUfDYVmUYRA0m+12q722tsbZg/l8gchK0pzzLJs/Pzi+vLuZpM2rV5KtVnN9vT88OX7w4vH58Wk2XQjO37x5dWttsJhOy7IMgiAMuS7L4XB4+85NhtTrdD7++NPZfJmEwZ0
bV9+8+1qRl81m4/TsfDJfXN+91EyTxWJeFFlR5M00aaSZVroEtczLl8fn82VmiQxRcB7HYTOJI8aXpczLJSKmOpJSIWeNOC2UEkmiFJ2PJqO83E1iiw/wGtmqcxPkAXBL6WoZbKOj3bN8zOoL7crsCBHJFSNTVaVELvtvyXole2V0qtulg4iEJm1L0ZUsyqIoisV8djY6Pz87z4tMgw4QQsYF50A6z/LFbJ5n+VLKZVGWUhFAwDAWQgDIUpZKlVqTUlIprZQiUkpLbbOnAMamRPNoWq21WQAPnCvGjD2Sci4QNUBWllJlpdICkTNknAWcl/bxSCiISMplKYGglSSDwaC9uf2UtX/3fCqnw/fzUTPkJ8P5bJFRGj05OBr0unEULbLFaHTOGV7d3Tg+Oc+y5dbG4Ma1q51uf75YTqbTHaL5fPH4+Yvz+WKZl5sbW8D5YG3t4d5LIhJCcM7DKFkqdWVnfa3Tevni4Hw8vHH1+htvj1/u7R/s7W9vbrz/ztvNZnN0crqzsdOJoiBgj56/DJP47ddvfv1rHwRxslzOwygw9f8sEMBwdnraa7WIi/7G2ht377QajVa3vZzOF/n40uXLN65fX2ZZyHkjbTSW8+l8JpV6tn80WWRFUc4Wy7KUSmkbrAEATVLpuVJBEAjOdZFrgABgmWcMIEzifDbXPEz7g3kUJXGUCqFddKcK3tvMYG2jjVoyp+7gk4vuAwJoDW6xsg2tG9iRKbcgALelPKBGV0f8SsDfx/YFIiolS1nKspjN54v5fD6bz8aT6WyilkuulSKSnCFjUsM8z8s8L5WWSmmtOQEiJIxzBKUVIJIGqaSSqtSkpDYBeUQMze4TyMCWh4LZlNY8m9Nk581i0dI9d8asfwqY27fJJHwJEICTJg3zPO+kjUaaNBuNKAohavTaXRyfyNMTORo9PjvMizxIYgVwY3vr1vUbSdJQWidxEgVikWXzLNte722ubwzWt4KoofT5YpGJMCo03r3z2vOXByfHR8046ne7rTRpthpnp8ODo9PpbNbvbybNzqA/uHNj96cf3X/+4uDWtfh73/zmk63HP/rFJ6DZ22+81Ww05WsyYKzsd5+9fLm+tRlE43feegOIsmXGGD84OjobjstS5vMFIF55685ar3d8cDAdnl3b2TGLsLWSN65cKvP86PhwMp4ss6JQdH5+Jjju9FsMdTsJIQnXuq2iKKeLbJ7lpVJKKkQw0z0rinaa9JtNwwpFKRd5vsxLGXDNGJcq7SffvX5pJw61UlWu6BXrE2pOknGRwBKp3/5I2yXJaBeyuWwmeZiaOJTP2ZPWzDpolUWLtQ3hDSsLrXUp5SJbzuazk/Oz8Xi0mM0Ws2mxmDOlCFFqPc9yM8OyvFBSEgBjTJitHAAJoJCq1PZBpdzEoZQtkUPGwkAEAdeMmXVt0ixf18SQcbMPqNbkHjuqiEqAkEHAGRMCAZAxpfSyKAutBeMEUBApUjFApOSg2Y2TJFeywdi4KEUcJoKdHp8kJKMoTuJod9C5feUyY/zw5Gh4fjYaTUqlwoALzqezpdLAg0QE0WBtu9vbmC/mWSkDzoXgw+FwNh1vbW5eu3R5b//Fw3uPs7KczTNCAk2Dta2rly//7OMH9548H4+n3//er6RhuLm5vrE50LJc6w8QkQueL9ud9kCww7/yy7+UBmI8HDa7nZCJsizmi2VeSgDY3lq/de2K0qqZJts77+ZF/mJ/LwyDQb+7zPL9/WehwHYj2dvfe/D8aDpfvP3atcvb671WNF8sc6nDMN7Z3l4sl5/df/T4+UtZlou8kEov80IWUsUUhKKUCgA0gCJQIqZGlxifa4o1dQLBkCqv24eWHFJ9vMnkkmw9vFXYJvLoiNvbl+DQqXX9OUYm6e4rSyrCdHUkzmytdvEUWqqiyCfT6dHZ6cnJ8Xg8KpZzyjKlFUPkyEops7LUAKXSpEkAcM5CzgCYVkoTFFLaPecEF4HgjEEhZV6WUgJpJMoBCCEOOTGmSlkqBYicc84YMvPgTRfGReSBMOWljLGAcc6ZMVOkVKVWjHFimGsCwEaatJqNfrcbxkkhJWTTgtJZrqVI+OBSSsvs5GWe5be2N+IoODw+/OiL+8/3D5ZlIaVuN9JGFCRx3Nw/EFEjjeOsKPO8BFCNMHjx8mCjP3jt2nUiPZ/P0yT94O23tJI/+/BTU8NbFosyX671un/7r/7Kh/cefvT5/WtXLokovnvrxkZ/kERhnMSMMc54HEVf/8Y3ci1J6vagNZJqeD48G40Xi3m3074bh5e3t25ev8IZ29/fD6KoVPKLR0/2X7ycL5ff/cZXEHUzDZMk3hh0XhwczMaTXr/7/hu3Nza34jjpKsVFoJQSjMWd5nq/vVhmjSQ6PR8fnY1Kpcu8GI4mCICCAwAyBkzwuKFEQI32RERc6ZYQdaa0oXVjHDrutBCygNKueNRp+1rRKLkaJVDOeaqVnoB90AC5bJF2BRmgNdmqKp9lNTboZDo6m4xfHh+fnp7MxqMiy3RRgJKkiTiXpAuptCJNGjVxhmZHGmOjmOdoEwBnwIUIozDkopBykeVFURrTGoGAtJQyB+DIlCaOLImiKBSylKXJjUoNSMQ4cIYIkWCMcxGIThIrgKJUpZQaKEIRBEGBwJQmZI0gEIhSK6bkMlu2kvhrnfjjvLgn4vHNN1U5vy2gODncOx91uq295/tP9w6yUs3meV4UZ+djxlgrjbVWy9lw0Gkdnk0Vss21te2trTt3bkkpJ7NlXmgmlFSy2+197b13yix7vr937crlIAyJdCONte589xtf/Y9/+kPB2WI66Xa7jSju9QacQZJEnHEA2Nzcvnvz5tOnjz+7/zCMo/lkqkm1GunN6zeDIFhmy70XL37+0Sc33rj7+qXLy+n4z3/+F8+PhhGot1/f3d3cSONeGEbtduvsfKy1/qX333znrbd5EDMRj0dDAAaolC6Pjs8+ufe00UhbrdZskQvOG3G40LooSs0QlQICBIXtpg4izYVqd7e31v/K9Z231zrGdkVbiWkxQ3YLGQI0z7T0vrvFrtP3jget2kfn/dfj/LVNGWv2g0/vO+MWwFoI1j8BBPHR558OJ5PpZCzzpZaSa+IIKAJAVECylIjEOSJxwYg545e5QDtjLA4Dbda1KT0vl4tllmW5if8zzgQyY5dIQM4w4oJxnkZhIESBqPNcmVAYY1xwwZmZe4ngjThinC/zfF4UUlHAMYmjNIkyqRZSKg15URZFOZ3NE0IexXGQrCXRb261QITPs3w8kk9Ys8NCHE32T4Y8ihfLfDqbc87jMOitdTuN5LVru2+8dqvf64og6Owd/JPf+r0g3rt77fIH7761tj746Onhzz96EPHiv/lb39cgGu3OX/r2t2bTyWw2XdvYjuJYnqs8z8Io+d7X30ta7aKQO5tbrV4/SVPOuSu+hm63u7W9OxufHR8dTGazZhJvrW2sr69nWfbpJ5/de/pccNgYdGfD04NnIo6Cv/rLX//F/ecaYHNtjTOBjDUb8f0Hj+492v+V73zre9/9Xqe3rrUu8mI4HMZRMFsUw/
PR/UfPJ7P52qArlUyTqNVI0zgadFpKU1GWi6KYzpeax7y7RlECLJDA3t0Y/PLOhoGYtRstKtHuZmM4VWuwTwM0Vpgr/TXVbX7rMWdZevhWVigg1OMADt0O0DaGiqu75Zvovbj/+KHKc6F1yJgABEQym4JzxqQSnAkec7M1KEFeFHlRaOeREQITQSAEcpaXZZ4XeZFneVFK++wmJAIOXJg9VpkQvBFHURAqrRExiaKAs0VR5lJKAhOnIgTBWCMIQiFMrACIkKEQXAiuCMwDaqSSGjRyliRxHMfLPBueH7W77dvt7W/IJo7pxTI6ba2L6ai1PMry7Gw0KxR1uu1eM9FSfvP9N69dudzrDcIoVqoMg/DWrVvXdj/++N6j/SQKv7j3lfDNm1e2fvzhgzQQg8GAsRgAdacnZfHk8cMvfvzzr7z3/tvvfWs8OhsPT7O80MA2tna7vXXGeBTHrljWJpL7a+tJowUMtVJbmxvr65vno/Mf/vwXn3z4edzpNNstfXjSCNgkZHmccM7eu32Jc2SIUhX5ovz4sy9++tHDv/Ir3/2173+fMT6bzQBIiCBJG0cnJ7LIHu+9fHk6FEEwXxZxFANgEkdm6/RSaUCc5AWLG9jdhCBm7a7iYavbeWOtIwxyXBQdXRTI1GqC20jGZIdMYQcZ1BnvxkGTLE2iO9aF6O3WjOSNWqt7fTmfy1c5UcFKuTKgkIulAC3sIzq96kapCYhCLjjnJtOUF0VRytIWdiDnLIrDVpoKzhFQMKGlmtvqT9Jamx0cgEEYiEgYpS0EF4o0ADDERhIpHQJbUgakNAcQCIJzzplGKksZMJYEIZGWRIr02XzeCMIoCnkgBOPdVuvmlSv93mCR50maJnES8iAW+m7KZ6rxbrf127qM8v6VsLXZagh2QqQ2W+nl9X6v19tYXwvDSBEAchEEhBAE4d/7m785/2f/6uGzvdl8cXx8evvGlW+8sb6zscbsA/hYFHGZT8fDsw+/eNpb382z7M7dN9e3Lj1/+jiMk6TRFkJEURxHEeecITJAjaSkCsO41e4209Q8u0RJeXh0/Gz/QJLO5nMAGBfF6Wi2Nmhv9LutRhwHvFR6scxHs8Vksuj3Bv/dP/iv775+lwfxYjpSslwu53t7z/YPj569PMjyfD6bTybTXquVRqGUsihlo5FIqc6H42VZLotCKSUaHaW1DkQep81u52/fvf56t1FKiYyZCJHDUBWHh1qwqbI17XuPTct+9pkYFkMuFK+dEkdHujaw6jKb6DwudDEtF1k3jRECNDdBHLfujpmNQpQWDAVnGmhZFFlelEWplEY7A0gI3m21Ntf6RDQcT4ssF0LEQaCk1JqYyQYwxhnjCAhIWiFx4+4IzsNAMMYLVWgixpkg0giFVpyzNIqiOMzLEhHSJEKAolRZWRaktFLLPIdAsCDCMARCSbrRaCCy8WSUL5dbQqRB53Yz/Xg8XhMh27n28/sfbu49v7LWu355O0G2vr6ZpOn+i5e9bqvXG0RRLMKItC4V9QYb/7v/+u/8i9/+nU8+u392cn5wdHLzylY7YM+ePm61e81mK5PZx5988uG9p61WtyyW//b3fvzg4cO/9/f+wdUbt8+Oj0LGojjt9PpBEAjGzMIBk0ojgk6332l3ZFmkaTqfTfdfvMznyzAMpVSz0QiI5kCn56MHfF8wxjiP4mi9371x9eqv/co7b73xVhSnJydHe3ufnp4Pn+/vLcsyWy6fvjweNBMGsFgsu83GlUtbjTSWspRKDcfTk8m0yEulZBnEMNiUSavIchGlcx6sp8k7G/0AmfJbHxlXxlKd5UsXUoeaCieya9vs4g1vkvrYqFnDSVjVgRhAe1fJl0Q5LrbBArJxKLSPWAIgIKGkVIgISEiC8ZAHjHMA4EwTQVaURVlmRZGV0i38RSaY4DwUIuAMkTjnQSCyLAsED4MgK0smlQlbcIaCSJaSNEWhMMF5xtBEeUoli1JyZGkYlkIXWmuiQARRGABjQRCEnMdRpLRWtBQKAwxaaRJGYa5pXsrZdDocnXNOSlMcJePxKA7EcjBYUDJXwcEie342Srvd3btfhUc/bTfi3c3NWZa1u/1mms5m8z//6UeXdrc21tcbaSOK4m5/g4uw1Rn8t3/373x+//7PPv70+YuDn3366LNHz9vtZpqmoRDn56PjszGxoN/Lnj1/lufZaDx69+13Xnv9zVk05EHQ6nTSNDY2iR94RCzLQmmVNBp8eEZaTaeTg+PTQukwjERARZEHQdBK404r7baa/W4nTRs729uv37nb6fbm8/mDh/dPT09Hw7PDk5Nllk9n81lZAuPrnSZTajiadFvpG6/diKPw4Oh4/+B0sVwi40iaVIlRqnuX8iDWRKwVLqOYgvCDrUFDMLtEAlxhkSYA0OjyR+i8bOeMg9PgBGZz5QqeF7LzVsVXBSTg0KixngL1J5CvKfF1JNYJE7lUGiBENJvGRoEIAqGVLgoqyjLP86IoCynd/olIDAWKNArDKCzK8nw4jqJAK2X6RESkNAdExs3m3DlpxnjKeBSEXAhkGHBu1tAFXGRZwRDTOCq1XshSaR2HASIulzkHxIRzzuIo0gCKdFnK4WIZKcUAGJFZx7LMsqIoAVicpmmcKk3rrCCt3mrGzy5tb8fJrTTak4vR9LCb55PZdDqbbm3u7u7SycnJTz7+ott4trnWXet1WieH3e6A8YCQ3755+83X747Ho4OT46OT0/lyeXR6tpgvgzDZXONBIDhjKkrCra2drc37D+/duv1amqaL+Xw+m8RJHLrAjYkYSqnLstRKN5O00+loXYIqTFkjB9AIAedX1jtvv3b19dduDza246ShlJ7NF188vE8ERZmPR6OyLONAbK/3F8uciPphGAb85eHJ6WTa6zTfvntrc63/yecPv3i0BwzjKGrEkVJqHMezsCUBismIdbq6O9jttb9zbeeblzZMEZyBoS2udKGjClPkYkuVUehodCWf5DJGfim5NxGoxppk1qXZcGdVPuLpuYZUv/xJMGQCgQPEIoiDUDCmlVpkxSLLZF4URVm65w0CAeMsCEQSBlEUxVFIQMu8WGY5YywKhFSKI3BEYkwASKlyJRljwOxDwky6qCjKQpYI2GrwRhopqZFhgLzFmAnpGUMCOQNEwThnjCPjyErGllmeSdlqJN1GI4lCxoRUMJ0viIvu2nocJ9lizri41GhebTY30vgX87IkosHOsNFqqglNZ4fD8635NIziwWDQbaSCY16o+09eaHqOhBpZVpaMi521QavZGAzWu+3Ozsb6+3fvtNrtKAyBdLPZ5pw/e/rwj3/8IRAxVKAKxrlWeb6YzecN3mpxt+rVpCzKImdcrO1cSRuNxXQEpJMoVFopJYNQ9Nrt7fXe1uZalKR5nj14/HRZSMFwNl92O61eM+Wkx5MpAyqKcp4V/U4zK9XR2ehsNNla79+6emm5zP7khz8fjqcgBBdcE02XWSGCIu3LuMWJxSJcNNuNVvPvvfPa24OWts/F8LFO6747l8ijzmpdr5LNr7YcqQqLmrVrNude51FwgSpnD9i4f
c30rEHfpuy9WUEAIBKzQJMzxpiUSipVFHKRLfOizEupTZEHACAILsJQxGEkhACGZu2Hto/owjAMmZQiECIQZHZFNAs47QNLqCiLrAQkEIwjw6KU56NpFIZxFJrWFaUkoHYYaKK81JyY4BwZkq1W5rIsA864EAqw4Gyt2RgMBgQggfqDdWBcI2s1mygEMZ4myfsxa0fF42UhARut9uNRHNJxYzhuPn/ywZvv337tjSAMHzx62Gs12u32+WR6dHiyKPIwCEopi+VCKl0qNV/mUlO/09pe61+/cmVne+ta2mp1O1euXH97OPzxZ49PBZyeHDTb/SCMhAi0UmUpMQg4Z1KZ5zyiCAIRBDQbHx8dnhy9jKKw0UjNyqxWI71yaTOOAkXs6fO90TSbZ3kplVQqTeL5fCYJDk/Ollm2zItBu1USHZ+cTsfTNIluX92+cmn7fDR7+vKIEBuNlCMCQ5HEQ+JjasxEpDUIpEyEZdL4jWs7d3pNpezTZRjaLb4McWinbpHQrn90j0YgW7hJFd25Pbh9QZOPENmNh92xRABGszNCMFstmZXyPk7lZ4UDNyJp8/RNJoihCEQzjUqlZ8tMKkVSyUKWZVlK6ScK54wLlibJ+qDXTOOiVOaAQDDGWByFYSgAKImjspSkNZAG4L7RpZRJFEVhsMxyAkriWHBuqvQ5xyQOT0fTWbZMomhRlouiEARJxBhH8+iPOAozKbks0yTBQDDGm41G0kgbjWar3Vkoef/h/fW1tfDS1SBthWEYx2kaJw0ukijuz7NhHh8tM6m7i8uvNYvpXM2/ePTF9cvX37j7zs727pNnT6aHB1EQ7GytS6nSJELGxpPZ0dm54GJr0Jgvl9li+ej5i+f7L/u99uWtjffffqvT6TaaLUbqP/zpj+JQ/Pqv/bUgbDTa3SCMyrIkojiKzJOdGGNRlJRFLou8LMvpfPH85cH5eIqcpa1m3Gwo4MDD0TRrNptbm51SytPTsyd7+804VArSJO60GuPZfDJfjMfTVjPZWetfeusOIpVlOctV1Om8s76OnJdSglRBFA7ny2f7w6kGPZ/xRhMaretrve/cvPyt7QE3qCACArMPmNFt2kXTbVCSmXXrZLFrF0Rax4WY21KZ/JcubumIcBVyaJU3+WfBgLbEbbzp2i4SCKBtCh1ACx9r1ZpAkyxKVUpTzImamA07oeA8DII4DJMoSpKYIDfPfmPIk1CEUSilBC0FxygMi6IEpavNShyNB5xJIRhiyJmJkHHE5TJfLPNlngsCXcrJZJpJRQBSSq2p0UjjCKMwCAMRl4EG4IxHSRInaZw2lmUZ5jko1Wq1ev21brvfSJsiCKIgYpyLgG9E4XY7PZ5l/8P98U/Pxq2g0dncGBfT2fDw/NGD13Z21vtrX1vbHA7PHjy8d3Z6XBTli+MzZCwU/Prl3WWWCcZuXb00mU4WWW60Uhjw0+GQiVBr2txYOx7PHz7f/+rJ8fa1O0EYg6sSV0qZrfUZwzAI0rTR7q21z06KstSAZSmTNO11O0kcD9YGu5ubWlMYiIPDw363szYYrA36Usmzs1EYCNB6d2Pt2tbabL6QSm2tD6I4ZoyHTZamjaTRjMJQCBEEotT044fPPj3NZixhACXwZdJs9/v/zVfu3O6kUmm3IsOqVJfStAkkF9m0jyFgTvGiY04bM9VOudtfccUUdTFOs9bXFX+a/UIQzO7AJrBktxAnv1sjeXsDzHVQJEEQBFyRVrJkSAhUSqm0AgNsRMZYGIpGo9FKkzAMlFaT6VxrzRCSOOIMw4AzhELrUirQmnMQgmnFUGvBuUkpcM4LKfPpjDPebqSc86Isy1Lm2jzNjIBIa5JSMc6QQBItizIuyjhWSimgoJMmnPFRlsWcN4MgkGU2mpwul7rIhRBr6xth3DBPQfaPQDMPA+GcDxrJN9Z7B4tsP5fXuz2gzo+1SJbTfJKvzZ53A7bd77/15lvHh3v37z+4cWmLiWCZZS+PToui6LYay+Xi9du3AHC+WDKkJEmbrc50kSuFUbP7/lvpjZ2dwfbVKE6iMOQctdbmvsiACq0BASAIo0arc/XGbRGIz7/4vMwyFoSFNlngQkkVCN5I4sFgsFjM+73BxtpgPJ3O8jLLsyjgj/YOllm+3usMep0gjNJGK24kcZKKMIqCQARBGARpEP3RvUd/9uwsU6AZQy0LQBknX9lZ323EZSn9okskF9OxhGdD9I7rwJdtvLoLQ5UNclamJrdpElZ4dlFL912tGMXfUbsKEp/uJxdh9ZcXaRqbBdFKk1IkpVZKAyBnLBQMAJCzRhy3G400jQMhgkCYwATnLBAcEaRSUkoE4AxKRRwxYLwAaboXBgHj5rEIyBA5Z0IwAiqVUloLxgRjRFSWUgNJJakEHohIiJBzEy4lIE2UxkkUJ+W5WmbLWbbsNZvpYIDIsjzvdDqETKkiYFwwIXjAAyGECIUQXCBiGolfvb7zznr36WSeMn5/NHsskVjzWr8/oTxEdV4qmc0VRrfu3M2y5fD8/PD4NA74Zq+fxmGr2cyyRac7SBvNQATIeRhGZVEIIQbrG+ubu5vrm0GchoGIwsCg02wdYB5Hhpo0IwKKoki1ujtXbk6X2dPn+71Wc9DvP3m+LxgOh+e9brcsy267vTEYKKVe7O+dj8ZxGGQMO+3219/pLpZLBGQi6G9t9drdMAgwEFyIkIkwDE9mix88ePCn957OxtMoSYK0PQui9W7rl6/tfP/yugCtwe164BBWcWkNg1U5nMv0XMSo2+HLsW3tdBsBUB5gRGif82qPMgVw1oNHxtzTFldaUd2USJSlYghFWZalKopSSUlaAxBnLOA8CAQXIgpDHgiNEEVBHEetNGmk0Xi6mM4WjSTkis2KQnDebKST2VKqAt22KUrpLC+5wDAMm3EiBEcAKdV0nmmto1A4W8fk1YAjk+i2M2VMCM4519rmscIg3FlbW5ZFTsS5oEBESdJqtjqdPiCejofnozMUQgRBxCJu8w22poUxdrnb2u0081Ii4neW+bSUgzQBikZSPl9Ot6L+7e3rnTjI5tOjg30M0pBjK42iQORSLubL0+EoEOLq5ctJo5PEyWQ8bLQ661uXe4N1Yedq9fKCNirerOIXQjAAQtzY3P7at/4SkCpm492NQRxHjUaj1UgbaQMZV7Icj0bLPAsCZsyDRqPBGaSNZpAkzWY7abYE4xwZFyIKglme//aHn/3kycuTyYITYRyp9c0pD9rt5n//3s3X20mplNlmwCfdq4Q4QR0ZxrmhVx7ZUYcsur0U7ctlgcgyY5Vk8rGnyktf5Ul0G+RSVUxSJQQsgy7zXEpVFEUpldndXQNxxIAzxtBYB4yxgPM0EEkUCs4JSCqSSgWCM4b5oljmRRoFSRwJIUaT2TIr7OITTZokAYtjJCLSVCqV53lRSkRMQtFI4rJURb4EAEWglUa3mC4RnHE2XWStNBGCL/OCcx7HsSY9nkwyrUnwZpJmSi2yBRCAkkoVSpZSlrIouQlMuIcy2NAbYhQGdze7m610nOVlqf7J88OfnZzvRmEqdNqg87JoBfHV
196+eecttZxl80mRL7gItKKz8ZiLsNVpB2EcRclWu9dsdZqtVhIFRHY3NT9y5Ha7MMoO7eNNkDGWLeZK69u3X0uS9MWTB0mcaFVMpzPGxXyZtZqN8XisVTGbjJZ5SURRFDXTWCpM291mpxuHIRdhFISc80zKF5PpD+89/MGnD2WhkzSlNC2TRrvVvNZq/N07V262wlxKR2UWMpqIedq6sBWtz0z6b5zC9Y9hJecYgUelx1dlJfhtkMDFmHzk0yWrajEpD0p7fMXmKMwazbKUiohzzpnJypvHwHDGWKfZ6HU7jKHWWkq1zPLzoWy3Gv1ui3OuNCVpI4rj8WQ6mmVpyFuNJMuK5TIDIrO7PCJxZER6Os+WeaGUCoQIuQBAxrjSpVkehVpLpYxzKBii4IJxRRBFYRRFWmcAtMwWw8kUlN7udiMRnI9Gs8WCtO51uuvrWyJMSOvFcg4MwSpaBAROYJ6hDAAILA3DJAg3W/E8K7+fF5+ejz4cTr53aTvH8P5k8uenZ3ca6dfWumtR0G302o02B5RKNjt9IijypeA8jJNGo9Fstkw4QrnHibo13NVScbtyABFAm122GQBqzRgDoss371y5+fpyMjw6Otg/OjkfnXXH55PpjDMYjsen5+Oz88mdm5caEev0NgbdfqPZYkIwwNPZ/OcvDp+eT6bz7Gg0K1mkG0L3BnGv/feu/f/o+rMlyZIkSxDjRUTupott7h7uHhG5VVZWVVNXD6iHgBnQPIBo3gEC5g34BvwM/gGfgEcQDdEsBAwGQC9V1VWdlZWRkRG+26aqdxMRZsaD3KtmkdUwD4rwMFPT5V4WXg4fPvzVby63r9oqIEiS9eaDmZZJtMXqcO2jL4sCz5s5F2P5qWtdqypbS6G14F8fDwa6eoInU376zvk5f5oUnL8DzxOJ818QnIhIFjNTs0DUhjrllLNUVajrqqnrF9f73XYbY+z7fhjHw6mPMaOmytN2u62rqmubKoTHY98Pp6E377jUirLMlhISZRFEFDNEcMQIqAgFUlYDdq5sMiRGWUIFRrXDNAHYPrZVCNuu8T48nPrb02lb1QTWD0dE2m02ZspIJprGU0ZsmdM8ReYybksMZeaPCNakHAHRgdvU9F+8vm48/4eH056pYni92VxO+RZgBP9/+3h/ezr8V9fbb7rmZ7uNxWkae4QyEs1t1zVN45gZ0NCyloKddP0qZZ/kDIgqkkUkpTgODjE4V4y18lVVN5dXN9dffX3z6cPf/Nv/+e7T+2GaP9097rb7f/WvfnE6HrZtuHlx02yvBoPvP36eRFOW//jpy8e70+NxyEiz97rZpN1l1TX/21+9/a+/voKyurTcZX0yBV3ReAB4GkxbvSKep92f5G7OXPlnCOjq7J5DRmuc/4krPLtMePZry/fXTtEZXIVnwef54x2AiSkCsFkZ4wRGw+S93282222nhnePB0fETHlSUfEes8rh2BvA5QX1g51Oxya40XHfD+OkCxRa8ClA77z33gePzI6dihASe1cXWhoiOTYBImYkyQIAjGgpZQBm6scxZgnBv7i63LTtpqpCcOTctt2GUDVVU9WNCyFUlfM+isxTX9eNI1LNBm7dbA6FUbMm6GYITLRv6v/q65f/xVdXWXXMcjvEl7X/3eMJQP9st/t3D4f/6x8+/+cvb/5Pl9dX2yrnzM5V7abbbB17Wq8yEeGzNU5FNa3cGCJWMBWdx+n2y6fD423p28XH++bVW2YHgCbK7Jqmy+S7y5ff/Hzz1z5c7ff7/X6O0zRNYvBpmv/m999/Oc1D1MeYT1GOx5O1Lex2v7y5+K+/+UoRX7Xh266SmP/UJnAdVl/xyZ/8+MlOFgqIlYJqJcHhefao6CSWeLBu9LRzzf+svD9nq+dS60986tkuz+ntT9/RU4MU/+pf/2c5JiYyBO/c5cW2bmoiRMNhTrV3SbKZtHUVHKuqc9y0TUwionUVNm1jAHePh3GY+2Ho+2GcYoEARQSJC2Llq+Cdd84BgKkiADM7z6I6z6l4NRGZVdXAOdfWFTNVzjd1aLrWISFTHcK2bes6XOwvjN2YIhEz8367vb56udtdOKJxGoi56XZVVbVt530IvmJHRMjEZwNVW+5ByYzLPTKzJDLFPMQ4xKRAd1P8/eH4bVv9q9cvGMQMkNAAq1D5wgssI6qIolo0L8qstYquECOISBynT58+/n//5t9++PTx19+8veyaerO/fP1t8JVzzsyyZCISyTnO89j3p+NxOM2nByb7YdB/8/EYqtrU/vD+8w+Pj7jZNCFQ8JuXL9/suv/ml6/f1F5SFFEp87PnvO7ssZ6F1zOQRM8IdIsRPWHvBbSk1XHqc2dXhnIXaPOnVCZ4eob15VZ5nMX6n9niUm+cS66fBv6C3jsCLLl9XfuuK+0ev9+0TdM8noZxGAwElFUkqjim4KpN2zZ1gwgx53kaJaUUo2pi1F1XNZWTnOc5ncZY3piqxikmSiXfR3a2Fl5N8KYwx5iLwiIAOmbHSOSd984F7755cSkKv3v3IaesKlGaUNUxZwDYb3fsmENQYjH17HeXN95XROwcEzsih+eW7/r5yxxp+aYhlk0CpYBlotr7XVPFlGeRbeWvKhdTfJimi6aufEkVClP+qVwoUr9cemNyFrOyJTE1SPP48e//X7d/92/VwH/zutle7l++9t4jURl7LLO1atb3p9PDl9PxkCRfv/rmwxDf3304jPnd5499TC+ur/7Ft9/sN81fv9i/6OoXTbP1bCoxRhF53tdZzcbwJzd+ocqXHy5nSHUZdysx2sDOwNCyz1rPiaeZrH3IpTyHM5z6HMuEp9rombU+Xf9ztPmJZf8zVwoATiQnySBgYESsBsd+fDz0Vxe77Xaz23aS0zBM0zzN0zTFpKpJtKlHQ1DReR4rR7vWU+sB2pRlnOLjcZxjVkAFJUTnnBGqaYxxRGyauqoCIqpaElsELYiyqWNum7oIh1TEjsv7mUqqAGYxZdPps936ENq27XP2RNL3OUvgV5tu50MFxETsXXDkaBlQwMUsVw+6gEErOL36FaOl9c8VEHEOTJvKl4iDVPaewRnVFhE7a/QjYvmkZjFGU5Wph+lR5lHjIMPjXj/u6/z6z//zb37zL0PbMTsEMtOcc3knagqI7XbXdJuLsf90OPy3v/3+//39+3q333399n9/uctgPvhf77tvL3YOLcdoBiJalpg991tncOccggFWJQZTPa9dsfWzI61UtMKvo3WebEWOFttRe25ATz96+tLnNvccpTprkDxXznkOfP4JDQ8XSVCX8qKxkFIepgkASgU/9P23377pmqv9xe5yv308HL58kfEY72NOd0cz846DZ+/Z2uqi6rZdE0J4PI2H4TaLFIUwAiBmdmwAqiYqZWJE1Zo6AELOOaU0pVQkx5tQX2833jskSillk02oTsN0HMZ+HMcs+67dbToX2NfVwzTaPF1d3wR2TG7OklL0VeXY+TX4nj/tchWe2eh6J88/X442AhoCOfTK3rkCo5R9dilLVq29R1wWlqrJOW1YdH2ZQwiZSMbD9Om3px9/9/7
LXR/zv/njY/fmN/+rv/5ft123CAIAGigVGFi15ImE+DjFv789pZR7F/76z3/19dU+OPdmU99sujYEzy7OUxIlHyQmXbEtldXPrUofsPpme/bZy1b1J5+3atTDsytiyz7jc320xnAzwJ9UXWcwAM4z9ef04lkqCQBL5loSvAJ0rK+x/tbyeFsxWlM1RGeiTOSYpKzEEgFEXwUzBdE5xpRTzHmeJmJu60BE45ymOXrvura53G+JcUpJhxiSjtMEZUAewExs4cIiOzYzzJhyUtMpRgCrghvneZ7nLALMRJTATjFuEKtAiuCRN03Tj9M0z/00j0kc04urfVVV4N1ls/E+7Npt8CGKfL6/q6qmabeeHRKKqYoxMCIy0Pkq/gnA8bxNcjZeW7zpMhZgBqpo2ZjAg0s5JZGCyzt2pfnMzFYmxBEBkMi53Qv38/9y5lcP+fcc/P/mX3/9s5//AgBNFRxIzmpFLVBzFjMTk2FO/3D3aEg/TvGvr/f/y59/TQWmAphE5iwofUQKvvLBm6mrUEcxMwNDwpVe/FMYp9xwNABewaDF2tYEAJ9V6XbGJ88t+OfX6yz6dfZ/9icQ0k/h0jPXCcpg8bNffMqPn52f9VeWH2HZ8uEdt7VPYqfTwGaGVFV12zZZdZrGmCXGRGg5ZUKovGubOmZzjKI2xtQ1gYiGcYoTxJRiytOc5pimKABAqsEbm9ZV1bW1qU0xBc8IeOwnUa2r0BJlNTXrnHeAKee6Cl9dXY7T/HDozbQO4fbYm0jtvahNKVUDfH/3eJqnfde9ur4JVT2mNEyjmb568bqqayT2zpV1dZIVmJAUFjbikxN9ss4ycrAe65VSbss1ZPToTDVlISRHJiIAEHMyfXq24reZCMBE1MjdfPvr/auv52EAg/F0ZOeWBjRYTinOkwHY0moWVf1q01SM/+U3r4go5bTyiC1QYdMyqsVpBEQXAjnPdQ1xKpsqFmVqhCXXXM+gLYNrslok6MpFUoCVybSCSvj0uz8x8sUB/7QGen7az50nXSdCi8iSrvSZf+Yd1uPzU1+7Pn+5I66E/zHKPM9JBAG9c+wYAULw3vuH4ynH6JhjKuFYg/cGIIRiMIxzP4ybtmKmrJYVH09zP8aYxMBKnatmKcasmiQTUhGzGeY4zjMRBl8jUZaY1Vow59gAKu+C9+OcssygOsWoYNttVznu+8EBTTDdno51VZ/6McDd5X6bU/qcZ5QkOb1587Ou3SCgmGpKhAjgGdBIi6h5cRN/0pb8yd9XCHAx3GJUSyWUAICZk2QoU+OmKUvKOaYcswDiRddsqkBoKpkBqiqkmFRyShEAytCgmJUiScHMhBBaz7vGm9nYH6umY3YiYqZEzAgqeTktiAYWx9Fs4KqqNjs5PKhKoamVxPOc0i1TFiWs/0n+95wyt4AbS2IA//zrmRn9NBNdje9pz9ZPSqP1kPynn/L5v89/ef4qLqacsxiAqjCRD2G/7TabRszu7h9TakwtiyCAqExTfDgOjOS9qypfV5UBJDWiZrvZpKxTfDRAVSOipmlg3XNvavMcU4zBeefdlFKMkRCDrwxwmOMcUx1C19R1HXKW0zAOc3JMSKhIWa1mDoSFGHCcJu/4zeVVUzfvT4/34+Cb6mevvjLmKcXD2L+Ic/SO1ZUYXSSYy709ry1dGymFX4uwEnDWS1pu6PMEoCwsKQ4TdV1wSoimwoxmnFUt5ynGL6ee2P3m5XXwFYdwru6ziKjYIseiAkk1g2RclpOjipRIG8eTbzbsHIBJnJmdcz6ntBbLiIg5x3gY0bmq3Uz9oUDITzf7nGqDncP6E1b0nAXyVPuvlfXT7z5Z0p8Y0/l4P9n3swv4E2N99n179rpPYf1cUK1GD4tLB7eoQSJ67+u6Ct61XV1V1TzncZol5xB8FeoquHg4nYZJsjjHAJpFUspV8G1dEWJKaYoppdw1VYoxiTJRUwUgGscxQraUVCwDFpkxQHDeOe9hcfJIRGoQYxpjUpFN21ShBbBxijGnnEWdtCH4uvrx3YfrrmWik6S2aVTs/vHovPuLX/z6Zbc9jv0/ffcPX7/52f7iCpC888ysYJBVCQtCdMaqEZD/tMW8LHt8usSLk1lqj+W/ts5IFAo5oGOuAUBVVP7N3eGHMX11e/jXl1002FeVI9xXTGZieppmBCPNc85q0PJyYM7VNxJZzpojgnnndZ41ZyNi5zUnWyoMW1DXcc4p1t12Hk45zmcDWe7xWiTBuj4LdNlZ9RRe4dkne2ay+J+quP/ETS5G9vxsPy/V7Se9ejv/FgA8yTTj+QiVN4DP2vSufKtMZgbvEbE/DcfjKGpMOCC8vL7Ybbo5ZyTynsvT5Gzeg6jGlEx1inHTNuQcEW26RtQAUUQNQHImxMp7QkwxqSqTg7KHE7BspSGzxlPFfOz7snq3qiomFlm22agIM9VVcMEzQMscQuhTvOz2L/b7u/7YbS7fvv4659Q/3k45TYCB3t3df3n56u3F/jrGRJSpiEgWimpBU5jOF3ptsix1/p/6jKcjjcvC6me3BwGQEM3YqPb+b+6O//Oh90T/+Hh4xfr9FBX5v388/pfb6v/w9gbRpjkxU60xiCA7pIAKBEiAyCQigIjOaRZEyoAUvOUMZiqZvJeUVESl7K5SAMhxHnKuuo2Z5TQv1c5P4/jySc8+71nyd/aKf4KcP/eUPwEvn2OZy3qPhS5z9n/nFupT8vDc0FcsFs7Txk+3oPxweRtODcCUVFRojlHVUs7MXFUB0AFAVmWmCh0SNnXtKM4x5ZwIzZTBOyIGoKyGWUSECTdd44NDs3GK4zSnqGZlVcgCthXV3PJeU85ZNXhfVX7KmR233hPgNM2gIqJTTF1dt22zbdu2CneHY+29ZiEEVL3aX/zFr//ix88f//7v/v2ubX/17c+96qfbL/3D/as3X19fv5qnYSZyzjdVretlLUwSXdZFPiVIhShz3jyGz6KkWfETVnrramWcwyGAmBVWChL8zcPx//JPP/51G+7NDlG/3rV/cbX9NMt71e/j9MMw/aL1O4ZCMDRENCWwlShgZlbu9NJKIFLJ7JyAGJS9P8Leq6lGUS1NAUNEVZmHU2g6U805PYvuTyX2k6vDn9aK/0nk/HmOuJjPfyIHff7M9uxXllPx01ThbNn2TGvkWSkKz3xBeaQ6RhOFnDVJpDkBoqFtgru62LVNHTyHEGLO85xyygjqHBfQNKs5gizq1LZVqOsq5ixJh3E2haapnHMGs5mVTiCYLSo/qp658N3LB2UiAFC1yrlN2wKCaFl1h0l0ivPN5cXLywsxm2OapjmpxpxCcC9vXjSb7T/8/rcPDw9MBETv775c7y9ev3rddFvR/Ic//Pbm5evrq1d1qHNOMUUicqX1icDEBaImwmXh5wqYn1O05YIut2BZn6cqktMiwAQIVNaRAiK83jT/x29fvQ58Efz7ab5pm5ppV6X/c3s5ioGJrt2dxVkgoxkSnVsva263JMdIBGbIbJKt1CIGRFSq/udmJDlP/bFqOjU9Q/e4OrY/MaxzWY3PCEf/3PLKd/SntOXnR/d8if
SnRmk/bWL95JntKcdYar4/qczMEMpaV3DnF1109wi9D0Qcgt/tt4FJVO4eDsdTn1M2FdWi5oPM1DbVHGWe4/3DQbRjQkfo2naeYozJnOaUwZQIELng82qaUo6iVfCOfYwp5VzsJYvsmrYK7jTNlXNNVT2cTuM0eefrphHVEHwWNQBP1Hbt22++3u92f/junwjx1cVV17ZieuoHR3R9efPhy/v+eHzx4tU4jff3n0+Hu+BDaNqqarNkMHPOMxNTEThjWIC6Mz0NaCmI1z2OCudpXVVBRCKGBa0D1AWp/qpr/3c/r2LOIvLtrs0iamaKBNCSLroyZb+UKi2eA1cMdgUOVo9ipoisJsxcFqmBQc7RsFRsT1VR+YuKTMOpbrppONlPgzucyyBYq/pnEdz+U2X4swACf2Jnz//yp988pxPPDH1F4J8euqYNcnamuLQMbM2jABDc+vaW9xB8aJqanTv1Y9vU3LUx6zTFOCciZOcsC6oF75goxtSfBhGt6goYL3abtq6c901dDcMwxwQI3jvvuESWmJJkFVEtRSuloj8OZoTIxMMcYxZX+kDEu64jJMd0s992XWuAn77cmhoEt7m8PPVj//iYRXzw1xcXTHh7f5hVfFXffvngiH/1q9+4qvnx3R//zbvvX7/86te//ksFNDXvfZYMgN65um7qwGAoubD40QyQjAgXPbbiHmRZAl0KbVVl9lg2qpSiCklsWUdmSMSsYAWHN8kgGQqOCbIM+C5ZAcIiH702s57d0eI/EUFFDQnLqkIEKEGc6LkHPduf5jyPg6uqOI5PdnNO+NZHPuXW///tcgm16/88fy148r7rz/75V7HdgsuuvBRcCqeVFbA8/RpU7CyKvLycqyqfs2jZjVTOlSoaSJbbu8eYEhEBYt02lXdt2xDRPI2HY386DZIFCduqqqvQeN9WdajCNM0q4p2TnHdd470Dg893j9OcVMqlpuBdCL7yzsxSFlObcpbj0Xned93Fxa5tmxhzziCqlXfsuGnqw3E49H1m3G12P75//2q7/ebtm8/39xfdZrvpTv345uXLm5tX4zwhNGZ693D/3bu/ufv04e3bt7uLq7svn9i5bbcLdU3OO18hoimoWRkUJuaiRBy8N3zyNabPTz5kk1ITMJKBZRUEICRFxKJVbFr0xtY7sCwJBwAgWgzCAEyRGJjOFrkUYmaABEiw2C8qgErGspDKVERyTshsa+b3VHQDGEAxXx9CnOcla1yhh+dJ5BKF4ZmbLAH9mcnCszLo3Jz8yZIk/BPztBU7sNUfnst8xKcmp9kqg7OEnnOGQ7gUS6aA4Lz3gGh6pt8Alu2DoNM0pRQ3XXtztd9tuxBCU4WU8+3dwxzTiupp8ME5J2qH44l6RMKymogdq9g0pSmmw3EYp5glI1IdQlUHJEQEWYhAmFXnaa4tBBdP/TjOaY5RRMukxDzHUz9mkV99++0k+o/ffQcASNQPQ/But93WVY1I7WafVU3leHzo2s398XD7+cMvfvHLN1+9/fD+B0kpSmqr+vLi6ubm5eWLr3KmmCMiiuSStIhKW7dmy+7dP7l5VpryOT3P/XPORGRPHnC5E8GxY55jMY9VMw5gqYfKUB0TIi0M6oWvoaZGCFguEK76mrYM8Zb9vJJTGbnS8zv7aUSO8xTqmoj1zDwqkO0/QydW612LFvhJunnuaCxpYvGChGtg/6n80tmnmhmu45qr08ZlKRIuj1yxvifnamsf6+ltoWu7ulz1OeaSJprqtq1D8LcPh2makXC3bRihDc4AjqdhGoeS5xJzFcK2a4eYYsxNHerghnnuh2gAqtYP8xxTznkYx5SzAniCUnk6IAbKkMt+PAZMoimlw0mGYfSO67q+vLzo6qqua+e9iDrvFPHh/g7MJMs4Tv0wusrv2o4QQ1XFnIbTQXPabvauasLh4X/x1/+ZIf0//6f/cRrHum1eXl7P0/zv/+5vfv6LX445vXj5FgFOhwcFaJsu5bhpd6aljWmIBPh86LtsQxEwOCvbqMhiAbgkfLisMUFCzIpMWSEX+KZw+emcz3FRElyW/5bKHckB6vMArypLhgqmAMV9iirquvZgNZaz5yv/TvPsQtCoiya3mf6pr9OnxPeMjuvq+nBFJp8kZg2QymPO5rt0n3A1cQOAZ0UPnh21mpbjtiAUxZSfoCU9E/cKVLtEfbdtAiGa6mmIc4wiQgRq1rX1OM0jWOUYNR9PhxxHx6wpgWkd2Ll6jknEbh8OKQsRT3MkRFObYxYRAyiiWVNMKWezMpkEWS0vxFoBM0dUh2rKcY6gak1bFWYTMgfvnPPOcUppu9lcXVz88cN7NGDmvh9yzlHFJPXjeH152bXd8Xg4PD68ffMN180PP3y32+67uv3db//+ctMN3h2n+ePtF0d0eXGlCGL46dO7dDoS49ff/ll/ug9VVzTzUYt5aBlqOk9xmC5pJYADMFERlRKZVcW7gIi67ODFcqOXmT01tFX0dTW+NYiDwbLaGaDQn/mc262RDgFW9TmRnJJIXlb5LNT9n3j6c7sop0RMGvMadRcLXbHes/tciO5qRgC2KC0v7Bo4O0GDwhzFp3pr9ZAGy/rDAmuu45rL1g9bX6BYvhoiLU9sBgiGeB5GWf1qWbMEjtillMtnzCLjlBCyqJU6pq5CcCiSEbya5iRMtN20O9B+nB8lDykP46yiVRViygAQvKvqKqUEgDGbyawqRUFHwdCU1yn4kp+54EUFVStedm9Xwfvg99tt3TaO0Mz2uwvv/e3d7b5tQXTO6dgP/TgR4nbTVt61TTPM84/v37VNQ879+Mffd93m1YtXd7cf97vdfnfhvD+ejuPQf/funQb/6tXb4+HxH//hb9M8/+qXv9psvzDi5usLW+jwGgBKwnQe1zSzLLlI8SAWxExKTV6mPNTUMZuirtt3yx94qpiXe1lo+biMJwJqWT9ugIrIT9j1spUFz65pdWUl2cgiWsYTnu12sXKiYDFBU1Xisk3oyX8aPmWOCzVkhYMWuYViOSXaLvSi8jA8H4enHGEtXlbDXtBiWIujMztpSTRxbW0iAFKhv5XsG5ac1QxBTMGgsMUwpTzFPIwppaRmpdSum6YJbs7qxFqgqqpTzqhKYCnb4TQ9nsac1TEpEhFLlj6mXIfgPRIP45xSyqkUFQtrwS/zzMhEazqiKaYk2QCaurrc7oCgaWoAPB5PhHh9eZEl398PouqcyyLBe+94itEHf+yH3/34fkr54vLCVC8urv7p9/8Yc7q8vLn7/FFNAen7H38AhJuLSzP75S9+WW13t18+f/jxu6ZpLi4uI8C7d38M7FzVtE2LzE23sWZjZt45Ji7WqaYxpUIzWLM+IEIClGWUv2x4fAq7Kc2LldiiGgyICFRi8AK7qqy3r9w/QeSnnK/cZVNb3wPiUy/2yb5K/H1WERdvVbykFZBVZdGkJVp7Rc9tbX0xfIrQa2pamKZPFVUhNNia/xTLfNYNwqdGkT0p6BWPWV5DdS2zSule0tmnrHRx6mbm5pjAICbJWbxnx6hmIjpPc+UdVt4MuLCO2c1JYlJTGcY55
cyIGUBFDDCmOIxCCETglqkIK8a3MM8BHbF3jpmYmZ3LWaA0UQC9c9McY8oxRwyBs+Q0A8DlxRYMPn6+jTGG4AGw8h4BArHv6ix5irHThrxDw8vrm4+fP8U4v337zd/+/d+Nx9Nm2xnCOIw+BMt5f3n1+uVX96fTH7/7XV1XQz98+fLl17/5y5TTOM23/+7/4xG+evP2xatvDv5xv79SCYSYVyGgkuQtbgnRyrYcRFEEQJHs2CEgM+esxWmBKgEoMZqpChEXB1xIXmaiOWnO5D0RQ7loy7xlua1lcGjZdyo55ZSW8FeOBDwVFksNshbnZ1DIRBbB/GJuqrAKKp01vNbzsox+PquknhCf5VydE8dzYbg693ORtuTThmWOdAkBq+9dnhPxfC7O/twQwBTPTXkwxz7klMuZ8Y6JfEoiElU1paRgnjllnWOifhBTUTVVJuyampGkH89oHKEhUk55xsjOMaIZJpEygwtgIpJyxgLBSFYzx+ydA8CSpKac5yxM0k9T63zd1I/9+HgcmjrEnKecdm0nWaY4dV3TVvVp6KcYk1ntQ1XVHz59+u4P371+8+bd+x8Dgdbhy5fb4zDGlK4uLm5uXoSmiXP89PHd5Wbb1HWeY319w4C//aff37x49bd/8+9f3Vy7UP3dP/zdX/z6L+ebPlRN1bSIxMwhVLR0vJQdn+//Ot5UdAMyswOwLJJyWu9kyUORic20JJqw4FDFnSiqAvPKGIH1flrxj5qlCFLM0xjjbGoGhkaL5xFd8wAzKN3bpRhbK2wrKxFWwGyt/JDOQfrML1JYhOXxTBxerPVcSz0VW2cY9AwB4NnMivktaTTg+owAZ3r0E3awAvzFrksaUJ4QXdt1x+OxFGwpSXmrTCiCc8zTOHnuxjmmlLabpgqOmHxw6r2oVlVV1ZXkHLOM05xSVjUkMjBRyTmrCiGWJxVVAJQ455x8Dt45dIwOF5U8R66umIgJGawJoWKXY+rnSbKodE3dAKOqzpr7GBsfvHdJNac8TdPd4/HDw+nv/ubfv7i5dsGneQ7OHY+37z5+BsQ6eCYapylnvX+4y9MUmsbMfvbNt7vdxfc/fG+it18+OUQR+b//9//dz968vf386eOHHy8vrtpu02z3l9evgg/2VJlClAwAniBpzpKYHBhkSSK5jFUgErrFJhQFiZdweXYPhEhc8FPNCZDYLbtBwNCQSxFpalaYpGnOklU0a8Yy0KxqJuUtrcjmGoqX/yx3XmHtkRWzL4iCySo3QmVH0rMVXUuye0aFDADLiponn7omjKV0On89++vK+i5+8qy2eH6Y6TM4dUEfdHXYAADgXl5fmsg4TM4bO5UsKWY1IGZE6vtJRDdtzVVIKRECCQpReWNE2LWBsD710xxz8OgcI2LhmKYsVoI7MSFhSkW3PIuKzoBQOzZRNW2qGj1TFlhTEwLo5ylnGebIiI/9OKVc12Gapl23ud7vg3Mh+K5rs5lmOfXDu8+fVPLP3ryZ+z7PM+93v/j5z/e73e3D48PD4fv37yNY03af3v+432xjjI7dj+/fnaZ5mqdfffv1x0+fLvbb4+Hw4mJfef/+4wfHdDw8MuLNzQvLOV1eV1VjAN4FE02aHbEQZ82mqiClrBbNZ0wRlq6IIdHCMTc833qThVpK7CRHS8kQFRwVfsBaPpeSSCTHeVqTJVBTNCqJqYE9MZUBDHTF/FeqK6LpQrrGRfprjcugVlQQbdnDsVq4rdH8yaDU7Mw8xuVUrMZoAAAKdtYggLM12+pX7VlDy85w/pKYPund2E8M3PXDmLJ47wAw5xycY6JpmhHNOQKAnHWaEzOfevXBVz6ktLAumqZqKp/FQuVfXu+KUZ7GeZzigkQQEaGI0uK6nxcQsQ5BTeMsaujUERKATTE2REwcswxTFMlKdNbDceyOpz6myETe+Vc319f7C+f83f3dzX7XEL37/Onjx89dXX9NvO82X79544Nnoq+Cv/7q9b/727853h/2m+7icq+q+/32TVV9/DiMw+C9+/Nf/uJ3f/je1Mbh+Ob1W+/dH77/Pkve7XaHu89//MM/fvXV21DVdbclYnauabeDSJJYh2ZJrtZUzJ6xKwzQ+VBcqaiZChgsfA4AAEUm0lUxgKQUX7AgB6o5l7nNlauiALrkkiq2SM2fsaaFS28ABEuHfXGLRYV2ibrP5RdtdbdwLnpsIcoAgKEaPBU9qGC0ppJrNQ7FjBfuz3OEoER5BSRAfXLpq9z4OUeys5ozrP6/fAw3p5QlL5mAihKGyiNBirnsPDJQIvRZ6qbyvmLnppjnrJ4UoSrLgOq6Ct6PU/z45Z6Jm7qOqdRGwMw5C1iZ6CU1LZ5D1KYYg/dokHJCRF9z27ZVFdqqhiW82ThKzoominjqh1I4b7t2s+kIoAr+5YuXMaVtW2fVx+NxmqY0zxcXF/v9TsGY+XK/u7m6urq8fv/p83g4jeOYyvQSIjPfPd4fjkdT/dW33xCiiDw8Hl6/vLm82H+5vWWmHz/effN2bJoBTaahlxQPD18Acbe9lJzqpkOirIKIiIxUGCRWEr6zUKgVEQQELHPlCFDUhnSBFYtmTvFJarks1pWcJGfNKeU0p0UxGBFVwExtAbkWrpGB4TKVtqCturi/kmguosVrP9UWSsC5vwolSVzWKZg+ddLOmWxxo2gA9KxFCSuqD2e/f3aF51T1qWlUMktcIfw1xqyyPKtnXio+QNc2NQMM3vXjpKpgSgjOOREtA8kIBGbB0aapxznOMYJBTjmBNXVu6ooQx5jnmEUtpkxmITCgiZbN2yriRAUkkyoYnrnXMSU1YyQQUJGYU1a5dpel7eoc++D6EdQ0CUA0MCOm4EIUeXg4pG13HMemP+23G2badJv9fv/u/TsQ9d41VfDMMUUD3HQbQ3q4v/3zX/383afPn7/cEyI693A4uo+fReTmcj/N0+Px0NYhuMu6rgqmO07zpm23bX08Pswxtu0WEKrgRfI8nZCpblrvAwCknAByQXKLTFcp1WHp1WBeEH7IOcOKqpSVZ7jeYBWFs+tVlRjnNOcUc86Sc85pwXdK+DRd88mSZS4pxNpKOtfpdvZYxZ2XDNcWMPQpzyy1O61Li5dAfi5h1r75Cvg/83bnBy4fakl8n2eoZ7D2J+XQ+hJrqHkq7lf/C65t6qaqnHdlGKJoVDChGaQskkU0xRQBAMmFyoNqzFlzruoqqw1z9s4R4ek0MHNThxRT2SxjANbYFLzcH2QWWMDAp/kVMyNE9AgAIoKqWVQVLrab7abz7Jhy5TgwE7NjIoBd1znnHk9HMUCDz59vD8fDVzcv6rqAtm6/2e67TkRPp36co3fsvdteXH1490PbNiGEKc4Xm+7hcDwceyQCkctNV4fq/vEQvM+iZvrh05eu3f7+D98/PD4iwHd//GHT1k3TgGmOU1XtiB0Ano6HZnvhfAVYHKLmZM75pde5msqSc6qmnBaitCgYIDGVsXNVMy3cZFAoSnwqZTvIlGJUEQNVlbLg+sl4ylc5AWujG88LiwtOuVZPz1ua6xDdancGRsumw2clzLNhkTPiDmiw
...base64-encoded PNG image data elided...)" - ], - "metadata": - { - "id": "gW4cE8bhXS-d" - } - }, - { - "cell_type": "code", - "source": [ - "image_data = (periodic_impulse | beam.Map(lambda x: \"Cat-with-beanie.jpg\")\n", - " | \"ReadImage\" >> beam.Map(lambda image_name: read_image(\n", - " image_name=image_name, image_dir='https://storage.googleapis.com/apache-beam-samples/image_captioning/')))" - ], - "metadata": - { - "id":
"dGg11TpV_aV6", - "outputId": "a57e8197-6756-4fd8-a664-f51ef2fea730", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 204 - } - }, - "execution_count": 11, + }, + { + "cell_type": "markdown", + "source": [ + "3. Pass the images to the RunInference `PTransform`. RunInference takes `model_handler` and `model_metadata_pcoll` as input parameters.\n", + " * `model_metadata_pcoll` is a side input `PCollection` to the RunInference `PTransform`. This side input updates the `model_uri` in the `model_handler` while the Apache Beam pipeline runs.\n", + " * Use `WatchFilePattern` as side input to watch a `file_pattern` matching `.keras` files. In this case, the `file_pattern` is `'gs://BUCKET_NAME/dataflow/*keras'`.\n", + "\n" + ], + "metadata": { + "id": "eB0-ewd-BCKE" + } + }, + { + "cell_type": "code", + "source": [ + " # The side input used to watch for the .keras file and update the model_uri of the TFModelHandlerTensor.\n", + "file_pattern = dataflow_gcs_location + '/*.keras'\n", + "side_input_pcoll = (\n", + " pipeline\n", + " | \"WatchFilePattern\" >> WatchFilePattern(file_pattern=file_pattern,\n", + " interval=side_input_fire_interval,\n", + " stop_timestamp=end_timestamp))\n", + "inferences = (\n", + " image_data\n", + " | \"ApplyWindowing\" >> beam.WindowInto(beam.window.FixedWindows(10))\n", + " | \"RunInference\" >> RunInference(model_handler=model_handler,\n", + " model_metadata_pcoll=side_input_pcoll))" + ], + "metadata": { + "id": "_AjvvexJ_hUq" + }, + "execution_count": null, "outputs": [{ "output_type": "stream", "name": "stdout", @@ -475,44 +558,29 @@ "\n" ] }] - }, - { - "cell_type": "markdown", - "source": [ - "3. Pass the images to the RunInference `PTransform`. RunInference takes `model_handler` and `model_metadata_pcoll` as input parameters.\n", - " * `model_metadata_pcoll` is a side input `PCollection` to the RunInference `PTransform`. This side input is used to update the `model_uri` in the `model_handler` without needing to stop the Apache Beam pipeline\n", - " * Use `WatchFilePattern` as side input to watch a `file_pattern` matching `.h5` files. In this case, the `file_pattern` is `'gs://BUCKET_NAME/*.h5'`.\n", - "\n" - ], - "metadata": { - "id": "eB0-ewd-BCKE" - } - }, - { - "cell_type": "code", - "source": [ - " # The side input used to watch for the .h5 file and update the model_uri of the TFModelHandlerTensor.\n", - "file_pattern = 'gs://BUCKET_NAME/*.h5'\n", - "side_input_pcoll = (\n", - " pipeline\n", - " | \"WatchFilePattern\" >> WatchFilePattern(file_pattern=file_pattern,\n", - " interval=side_input_fire_interval,\n", - " stop_timestamp=end_timestamp))\n", - "inferences = (\n", - " image_data\n", - " | \"ApplyWindowing\" >> beam.WindowInto(beam.window.FixedWindows(10))\n", - " | \"RunInference\" >> RunInference(model_handler=model_handler,\n", - " model_metadata_pcoll=side_input_pcoll))" - ], - "metadata": { - "id": "_AjvvexJ_hUq", - "outputId": "291fcc38-0abb-4b11-f840-4a850097a56f", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 133 - } - }, - "execution_count": 12, + }, + { + "cell_type": "markdown", + "source": [ + "4. Post-process the `PredictionResult` object.\n", + "When the inference is complete, RunInference outputs a `PredictionResult` object that contains the fields `example`, `inference`, and `model_id`. The `model_id` field identifies the model used to run the inference. The `PostProcessor` returns the predicted label and the model ID used to run the inference on the predicted label." 
+ ], + "metadata": { + "id": "lTA4wRWNDVis" + } + }, + { + "cell_type": "code", + "source": [ + "post_processor = (\n", + " inferences\n", + " | \"PostProcessResults\" >> beam.ParDo(PostProcessor())\n", + " | \"LogResults\" >> beam.Map(logging.info))" + ], + "metadata": { + "id": "9TB76fo-_vZJ" + }, + "execution_count": null, "outputs": [{ "output_type": "stream", "name": "stdout", @@ -520,34 +588,30 @@ "\n" ] }] - }, - { - "cell_type": "markdown", - "source": [ - "4. Post-process the `PredictionResult` object.\n", - "When the inference is complete, RunInference outputs a `PredictionResult` object that contains the fields `example`, `inference`, and `model_id`. The `model_id` field identifies the model used to run the inference. The `PostProcessor` returns the predicted label and the model ID used to run the inference on the predicted label." - ], - "metadata": { - "id": "lTA4wRWNDVis" - } - }, - { - "cell_type": "code", - "source": [ - "post_processor = (\n", - " inferences\n", - " | \"PostProcessResults\" >> beam.ParDo(PostProcessor())\n", - " | \"LogResults\" >> beam.Map(logging.info))" - ], - "metadata": { - "id": "9TB76fo-_vZJ", - "outputId": "3e12d482-1bdf-4136-fbf7-9d5bb4bb62c3", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 222 - } - }, - "execution_count": 13, + }, + { + "cell_type": "markdown", + "source": [ + "### Watch for the model update\n", + "\n", + "After the pipeline starts processing data and you see output emitted from the RunInference `PTransform`, upload a `resnet152` model saved in the `.keras` format to a Google Cloud Storage bucket location that matches the `file_pattern` you defined earlier.\n" + ], + "metadata": { + "id": "wYp-mBHHjOjA" + } + }, + { + "cell_type": "code", + "source": [ + "model = tf.keras.applications.resnet.ResNet152()\n", + "model.save('resnet152_weights_tf_dim_ordering_tf_kernels.keras')\n", + "# Replace the `BUCKET_NAME` with the actual bucket name.\n", + "!gsutil cp resnet152_weights_tf_dim_ordering_tf_kernels.keras gs://BUCKET_NAME/resnet152_weights_tf_dim_ordering_tf_kernels.keras" + ], + "metadata": { + "id": "FpUfNBSWH9Xy" + }, + "execution_count": null, "outputs": [{ "output_type": "stream", "name": "stdout", @@ -555,44 +619,28 @@ "\n" ] }] - }, - { - "cell_type": "markdown", - "source": [ - "### Watch for the model update\n", - "\n", - "After the pipeline starts processing data and when you see output emitted from the RunInference `PTransform`, upload a `resnet152` model saved in `.h5` format to a Google Cloud Storage bucket location that matches the `file_pattern` you defined earlier. You can [download a copy of the model](https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet152_weights_tf_dim_ordering_tf_kernels.h5) (link downloads the model). RunInference uses `WatchFilePattern` as a side input to update the `model_uri` of `TFModelHandlerTensor`." - ], - "metadata": { - "id": "wYp-mBHHjOjA" - } - }, - { - "cell_type": "markdown", - "source": [ - "## Run the pipeline\n", - "\n", - "Use the following code to run the pipeline."
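For reference, the model-update mechanics that the "Watch for the model update" section relies on can be wired up in isolation as below. This is a minimal sketch, not part of the diff: the bucket paths and the single example element are hypothetical placeholders, and the pipeline is intended for a streaming runner such as Dataflow rather than a local batch run:

```python
import apache_beam as beam
import numpy
from apache_beam.ml.inference.base import RunInference
from apache_beam.ml.inference.tensorflow_inference import TFModelHandlerTensor
from apache_beam.ml.inference.utils import WatchFilePattern

# The handler starts with an initial model; each ModelMetadata element
# emitted by WatchFilePattern then overrides model_uri in flight.
model_handler = TFModelHandlerTensor(
    model_uri='gs://BUCKET_NAME/initial_model.keras')  # hypothetical path

with beam.Pipeline() as pipeline:
    # WatchFilePattern polls the glob and emits ModelMetadata for the
    # latest matching file, e.g. the .keras file uploaded with gsutil.
    side_input_pcoll = (
        pipeline
        | 'WatchFilePattern' >> WatchFilePattern(
            file_pattern='gs://BUCKET_NAME/dataflow/*.keras',  # hypothetical
            interval=360))
    _ = (
        pipeline
        | beam.Create([numpy.ones((224, 224, 3), dtype=numpy.float32)])
        # The main input needs non-global windows for the side input to fire.
        | beam.WindowInto(beam.window.FixedWindows(10))
        | RunInference(model_handler=model_handler,
                       model_metadata_pcoll=side_input_pcoll))
```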
- ], - "metadata": { - "id": "_ty03jDnKdKR" - } - }, - { - "cell_type": "code", - "source": [ - "# Run the pipeline.\n", - "result = pipeline.run().wait_until_finish()" - ], - "metadata": { - "id": "wd0VJLeLEWBU", - "outputId": "3489c891-05d2-4739-d693-1899cfe78859", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 186 - } - }, - "execution_count": 14, + }, + { + "cell_type": "markdown", + "source": [ + "## Run the pipeline\n", + "\n", + "Use the following code to run the pipeline." + ], + "metadata": { + "id": "_ty03jDnKdKR" + } + }, + { + "cell_type": "code", + "source": [ + "# Run the pipeline.\n", + "result = pipeline.run().wait_until_finish()" + ], + "metadata": { + "id": "wd0VJLeLEWBU" + }, + "execution_count": null, "outputs": [{ "output_type": "stream", "name": "stdout", @@ -600,6 +648,6 @@ "\n" ] }] - } - ] + } + ] } diff --git a/examples/notebooks/beam-ml/custom_remote_inference.ipynb b/examples/notebooks/beam-ml/custom_remote_inference.ipynb index 2fad42bc0d9d0..6657a137d6b08 100644 --- a/examples/notebooks/beam-ml/custom_remote_inference.ipynb +++ b/examples/notebooks/beam-ml/custom_remote_inference.ipynb @@ -1,725 +1,665 @@ { - "cells": [{ - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "paYiulysGrwR" - }, - "outputs": [], - "source": [ - "# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the \"License\")\n", - "\n", - "# Licensed to the Apache Software Foundation (ASF) under one\n", - "# or more contributor license agreements. See the NOTICE file\n", - "# distributed with this work for additional information\n", - "# regarding copyright ownership. The ASF licenses this file\n", - "# to you under the Apache License, Version 2.0 (the\n", - "# \"License\"); you may not use this file except in compliance\n", - "# with the License. You may obtain a copy of the License at\n", - "#\n", - "# http://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing,\n", - "# software distributed under the License is distributed on an\n", - "# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n", - "# KIND, either express or implied. See the License for the\n", - "# specific language governing permissions and limitations\n", - "# under the License" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0UGzzndTBPWQ" - }, - "source": [ - "# Remote inference in Apache Beam\n", - "\n", - "\n", - " \n", - " \n", - "
\n", - " Run in Google Colab\n", - " \n", - " View source on GitHub\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GNbarEZsalS2" - }, - "source": [ - "This example demonstrates how to implement a custom inference call in Apache Beam using the Google Cloud Vision API.\n", - "\n", - "The prefered way to run inference in Apache Beam is by using the [RunInference API](https://beam.apache.org/documentation/sdks/python-machine-learning/).\n", - "The RunInference API enables you to run models as part of your pipeline in a way that is optimized for machine learning inference.\n", - "To reduce the number of steps that you need to take, RunInference supports features like batching. For more infomation about the RunInference API, review the [RunInference API](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.html#apache_beam.ml.inference.RunInference),\n", - "which demonstrates how to implement model inference in PyTorch, scikit-learn, and TensorFlow.\n", - "\n", - "Currently, the RunInference API doesn't support making remote inference calls using the Natural Language API, Cloud Vision API, and so on.\n", - "Therefore, to use these remote APIs with Apache Beam, you need to write custom inference calls.\n", - "\n", - "**Note:** all images are licensed CC-BY, creators are listed in the [LICENSE.txt](https://storage.googleapis.com/apache-beam-samples/image_captioning/LICENSE.txt) file." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GNbarEZsalS1" - }, - "source": [ - "## Run the Cloud Vision API\n", - "\n", - "You can use the Cloud Vision API to retrieve labels that describe an image.\n", - "For example, the following image shows a cat with possible labels." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "q-jVQn3maZ81" - }, - "source": [ - "![cat-with-labels.png](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAxYAAAGaCAYAAACSU9UtAAAKrGlDQ1BJQ0MgUHJvZmlsZQAASImVlwdUU9kWhs+96SGhJYQiJdRQpLcAUkJoAQSkg42QBAglxEBQsSPiCI4FFRFUFB2qgmMBZCyIKBYGAXsdkEFEGQcLoqLyLrAIM/PWe2+9nbXX+bKzz3/2OeuerH0BICtyxeJUWBGANFGmJNTXkx4dE0vHDQIIEIAC8mFweRliVkhIIEBsZvy7fbyHZCN223xS699//6+mxBdk8ACAQhCO52fw0hA+jfgrnliSCQDqEBLXX54pnuQ2hKkSpECEH0xy4jQPT3L8FKPBVE54KBthKgB4EpcrSQSAREfi9CxeIqJD8kDYSsQXihAWI+yWlpbOR/gEwsZIDhIjTeoz4/+ik/g3zXiZJpebKOPpvUwZ3kuYIU7lrvw/j+N/W1qqdGYNI8RJSRK/UGRURs7sQUp6gIxF8UHBMyzkT+VPcZLUL2KGeRns2Bnmc70CZHNTgwJnOEHow5HpZHLCZ1iQ4R02w5L0UNlaCRI2a4a5ktl1pSkRsniSgCPTz04Kj5rhLGFk0AxnpIQFzOawZXGJNFRWv0Dk6zm7ro9s72kZf9mvkCObm5kU7ifbO3e2foGINauZES2rjS/w8p7NiZDlizM9ZWuJU0Nk+YJUX1k8IytMNjcTeSBn54bIzjCZ6x8yw4AN0kEq4hJAB4HINy8AMgUrMic3wk4Xr5QIE5My6SzkhgnoHBHPYi7dxsrGFoDJ+zr9OLynTd1DiHZjNpbzAQBX/sTExLnZWKABAKc3AUB8MRtjXABAXhWAawU8qSRrOjZ1lzCAiPwLUIE60Ab6wBiYAxvgAFyAB/AG/iAYhIMYsATwQBJIQypfDlaDDSAPFIAdYA8oAWXgCKgCx8FJ0AjOgUvgKrgJusBd8Bj0ggHwGoyAj2AcgiAcRIYokDqkAxlCZpANxITcIG8oEAqFYqA4KBESQVJoNbQRKoAKoRLoMFQN/QydhS5B16Fu6CHUBw1B76AvMAomwVRYCzaCLWEmzIID4HB4MZwIL4Oz4Vx4G1wMl8PH4Ab4EnwTvgv3wq/hURRAyaFoKF2UOYqJYqOCUbGoBJQEtRaVjypClaPqUM2odtRtVC9qGPUZjUVT0HS0OdoF7YeOQPPQy9Br0VvRJegqdAO6DX0b3YceQX/HkDGaGDOMM4aDicYkYpZj8jBFmArMGcwVzF3MAOYjFoulYRlYR6wfNgabjF2F3Yo9gK3HtmC7sf3YURwOp44zw7nignFcXCYuD7cPdwx3EdeDG8B9wsvhdfA2eB98LF6Ez8EX4WvwF/A9+EH8OEGRYEhwJgQT+ISVhO2Eo4Rmwi3CAGGcqERkEF2J4cRk4gZiMbGOeIX4hPheTk5OT85JboGcUG69XLHcCblrcn1yn0nKJFMSm7SIJCVtI1WSWkgPSe/JZLIR2YMcS84kbyNXky+Tn5E/yVPkLeQ58nz5dfKl8g3yPfJvFAgKhgoshSUK2QpFCqcUbikMKxIUjRTZilzFtYqlimcV7yuOKlGUrJWCldKUtirVKF1XeqmMUzZS9lbmK+cqH1G+rNxPQVH0KWwKj7KRcpRyhTJAxVIZVA41mVpAPU7tpI6oKKvYqUSqrFApVTmv0ktD0YxoHFoqbTvtJO0e7YuqlipLVaC6RbVOtUd1TG2OmoeaQC1frV7trtoXdbq6t3qK+k71RvWnGmgNU
40FGss1Dmpc0RieQ53jMoc3J3/OyTmPNGFNU81QzVWaRzQ7NEe1tLV8tcRa+7Quaw1r07Q9tJO1d2tf0B7Soei46Qh1dutc1HlFV6Gz6Kn0YnobfURXU9dPV6p7WLdTd1yPoRehl6NXr/dUn6jP1E/Q363fqj9ioGMw32C1Qa3BI0OCIdMwyXCvYbvhmBHDKMpos1Gj0UuGGoPDyGbUMp4Yk43djZcZlxvfMcGaME1STA6YdJnCpvamSaalprfMYDMHM6HZAbPuuZi5TnNFc8vn3jcnmbPMs8xrzfssaBaBFjkWjRZvLA0sYy13WrZbfreyt0q1Omr12FrZ2t86x7rZ+p2NqQ3PptTmji3Z1sd2nW2T7Vs7MzuB3UG7B/YU+/n2m+1b7b85ODpIHOochhwNHOMc9zveZ1KZIcytzGtOGCdPp3VO55w+Ozs4ZzqfdP7TxdwlxaXG5eU8xjzBvKPz+l31XLmuh1173ehucW6H3Hrddd257uXuzz30PfgeFR6DLBNWMusY642nlafE84znGNuZvYbd4oXy8vXK9+r0VvaO8C7xfuaj55PoU+sz4mvvu8q3xQ/jF+C30+8+R4vD41RzRvwd/df4twWQAsICSgKeB5oGSgKb58Pz/efvmv8kyDBIFNQYDII5wbuCn4YwQpaF/LIAuyBkQemCF6HWoatD28MoYUvDasI+hnuGbw9/HGEcIY1ojVSIXBRZHTkW5RVVGNUbbRm9JvpmjEaMMKYpFhcbGVsRO7rQe+GehQOL7BflLbq3mLF4xeLrSzSWpC45v1RhKXfpqThMXFRcTdxXbjC3nDsaz4nfHz/CY/P28l7zPfi7+UMCV0GhYDDBNaEw4WWia+KuxKEk96SipGEhW1gifJvsl1yWPJYSnFKZMpEalVqfhk+LSzsrUhaliNrStdNXpHeLzcR54t5lzsv2LBuRBEgqMqCMxRlNmVSkMeqQGks3Sfuy3LJKsz4tj1x+aoXSCtGKjpWmK7esHMz2yf5pFXoVb1Xrat3VG1b3rWGtObwWWhu/tnWd/rrcdQPrfddXbSBuSNnwa45VTmHOh41RG5tztXLX5/Zv8t1UmyefJ8m7v9llc9kP6B+EP3Rusd2yb8v3fH7+jQKrgqKCr1t5W2/8aP1j8Y8T2xK2dW532H5wB3aHaMe9ne47qwqVCrML+3fN39Wwm747f/eHPUv3XC+yKyrbS9wr3dtbHFjctM9g3459X0uSSu6WepbW79fcv2X/2AH+gZ6DHgfryrTKCsq+HBIeenDY93BDuVF50RHskawjL45GHm3/iflTdYVGRUHFt0pRZW9VaFVbtWN1dY1mzfZauFZaO3Rs0bGu417Hm+rM6w7X0+oLToAT0hOvfo77+d7JgJOtp5in6k4bnt5/hnImvwFqWNkw0pjU2NsU09R91v9sa7NL85lfLH6pPKd7rvS8yvntF4gXci9MXMy+ONoibhm+lHipv3Vp6+PL0ZfvtC1o67wScOXaVZ+rl9tZ7RevuV47d935+tkbzBuNNx1uNnTYd5z51f7XM50OnQ23HG81dTl1NXfP677Q495z6bbX7at3OHdu3g26230v4t6D+4vu9z7gP3j5MPXh20dZj8Yfr3+CeZL/VPFp0TPNZ+W/mfxW3+vQe77Pq6/jedjzx/28/te/Z/z+dSD3BflF0aDOYPVLm5fnhnyGul4tfDXwWvx6fDjvD6U/9r8xfnP6T48/O0aiRwbeSt5OvNv6Xv195Qe7D62jIaPPPqZ9HB/L/6T+qeoz83P7l6gvg+PLv+K+Fn8z+db8PeD7k4m0iQkxV8KdagVQiMMJCQC8qwSAHAMApQvpHxZO99NTBk2/A0wR+E883XNPmQMAdcgw2RaxWwA4gbjRekTbA4DJlijcA8C2tjKf6X2n+vRJwyJvLIe8JunhrsXrwT9suof/S93/HMGkqh345/gvQNIG0qk2u10AAACKZVhJZk1NACoAAAAIAAQBGgAFAAAAAQAAAD4BGwAFAAAAAQAAAEYBKAADAAAAAQACAACHaQAEAAAAAQAAAE4AAAAAAAAAkAAAAAEAAACQAAAAAQADkoYABwAAABIAAAB4oAIABAAAAAEAAAMWoAMABAAAAAEAAAGaAAAAAEFTQ0lJAAAAU2NyZWVuc2hvdFxP3LAAAAAJcEhZcwAAFiUAABYlAUlSJPAAAAHWaVRYdFhNTDpjb20uYWRvYmUueG1wAAAAAAA8eDp4bXBtZXRhIHhtbG5zOng9ImFkb2JlOm5zOm1ldGEvIiB4OnhtcHRrPSJYTVAgQ29yZSA2LjAuMCI+CiAgIDxyZGY6UkRGIHhtbG5zOnJkZj0iaHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIyI+CiAgICAgIDxyZGY6RGVzY3JpcHRpb24gcmRmOmFib3V0PSIiCiAgICAgICAgICAgIHhtbG5zOmV4aWY9Imh0dHA6Ly9ucy5hZG9iZS5jb20vZXhpZi8xLjAvIj4KICAgICAgICAgPGV4aWY6UGl4ZWxZRGltZW5zaW9uPjQxMDwvZXhpZjpQaXhlbFlEaW1lbnNpb24+CiAgICAgICAgIDxleGlmOlBpeGVsWERpbWVuc2lvbj43OTA8L2V4aWY6UGl4ZWxYRGltZW5zaW9uPgogICAgICAgICA8ZXhpZjpVc2VyQ29tbWVudD5TY3JlZW5zaG90PC9leGlmOlVzZXJDb21tZW50PgogICAgICA8L3JkZjpEZXNjcmlwdGlvbj4KICAgPC9yZGY6UkRGPgo8L3g6eG1wbWV0YT4KC94jVAAAABxpRE9UAAAAAgAAAAAAAADNAAAAKAAAAM0AAADNAALkCKE5GNoAAEAASURBVHgB7L15rGfJdd9Xvb1+ve+zcDZxNKQkUpJJyhQpkpJFiRxLNAOOQjmBYceAEwSIk8AL4MBA/kmC/JMEhgM4MBIhsQDZghE5cqzFlixRCymS4iJKXGfjzHBmODOctad7en3v9Vvy/XxPnbp17/u9190jDsMmf/e9e6vq1Nnq1P1V1blVde+ODR2XLi+VtfX1sqNc5diEsAmwBYONzfAKGnI2yoYTQwgRMEEcIVvqNl470Nj/itUwCzHkJXrgklIsDkUGmRYk+MA/0QiDJnmEPKDIzVzHAkA0jpoO+gROw0nuJDnFfjXpnuXsElauZPbIE2HTrBGvTEyRJjyc3AY3yRPFVSLgJvgsvsCSMAlm4V0LTqWbxSbJk3XiTOGZf9VwFmEyhdj5ukzvrxHjZNITjhCuPXE9rLbBnWqSqFdTJOmMn5dZZW8MMzKilJhMdxKnoK5N6bC2iYpB8kixm7Az46qIouz4wcekXK6FFoL5caNZYN/i4rdM5XX69lm/HWmQ8OxXwV1dXS1ra2tl//793zId54LmFphb4MazAO3FyspKWZy0ZztwLL76+JPl6Wef95glG5os4saGh+hugEZtkxLu9npg9oONmD7SboEiOAzZ2ZIkneFGWd9YLyhJg7ZR427k1tfk9OgUfFUnDtC6
CBG1c8fOsnvnrrJ7V5y7du4UbEfZqXDXriFOmXYIN0I1psq32mKyLp7r6zSkq2VD5/rauuXTuSMjaIJn8tkpecjYuWu3Q3iTDr6iMn/so78ujlng4RDu+qckFWS484wC9eajWhP1dATOLLyBsvKZIPV1MeCiz1BHCU8pVa0Ej8Koz6rV7AIFfr0HwE9b9IwmajZtrEOXmfonPCR395gAs/inLOOjS+VZrZTZLRzBO/kgDDo0dEdClzEs+UTeZgxDqm2SsjcjWeAAS17gjesr+aLoRNlmSag25wa0u/asTFD5TXQ0xRS3shk0GGJkjXUO5CwTeYE90PjeQn8VfoAG3aAOdQ9K4oRSyS2w40obwdH0EKH5JPNJeQwWrIGjEoJZAiPVrsE7GQKuiJvwpa/QqkqmRxfQkjrvM2de1wUt0h494aBExGbhDPi9/K1/U9WGSUaZanyQlpnTjDEGKc4dY/BAnIyBCKe33YDUx3qCHh7xvnxxD01x+toceE3l0i/1R2AO+L2ct/3gm3vU1zRO35n1lmEvEBi60d9yLi8v+zxx4kSPNo/PLTC3wNwCIwvwEIL24sCBAyO4HYvPffmB8vBjT0QDrUZmaArBjbQbpC7DUNJ969rlQxntbDS2xN08K+JGDMA6DkWkVzWopwHkXLcjISdDIfAra1fKigpw5coVpYGvFw3jC47EHjkWCwt7yh4N8nEwdu0ELkdjN84GA36cgHQMYvBP2oVUsC7ea6tXyvrqSlkT/3A01q08ZaTc6TQEH8mQrF27ddqxEH/J3WFnIxwOHAucHpwK09qmwSvsRhwLpR5OAPCRJqWMw5Hx6OSyCwuouTbUxGwAIsk0gdg949GNt5R760yBZIapf2QEbeWgwDHqtOImeRYx06AYm7DxBZZaD0PBwCVPh22YmonYfCIr8jsdGhi7qNOsyCmB7IQNqNWGlXWoFhQ9Xaem72PTCyFxQi2uFZJ6NvtXQMInujT71PJmp48cdCZtzqLvWJA9Olpe6taMWfVKeFKZgAv24mgcLNOgemmoxkl9pN3AumJWQEecvMmxSk2PxE25kR4GY8BBrvKgh6+YJM9QJ/MFN0mkAyvibsugFUGYRYjGNdDx4GlxkVezkM9BELSZ0YdJbUykRGYN4OBopgFQro4nNGi7qfzB6RquTar5oCziqvqm30r/wT5CmyBVlcUHpfMIKKjo65yWL7lTwS4btGAmx7Goodzo3MtKmbNCFOjhlX9PL2VCnYFnL6unJm4sEehxkxO+g8wvBFUJVt7lnDBIKYE9ZN774+8ZEq9xbO5YvMYGnrOfW+C71ALbOhaf+rMvlq989TGbZtqI0zC2xlSJ1pCCrQbWDacuNJw1pVg0o1z7Rpu4TzkUzEoQx7HwzER1LHIalsYQp4JzRYP/ZQ367VisxqxFOBa75FCEY7GggT7OBc6GZzDsWMRMhmcXcpCvcIccC8opyXJkcCpwLuRYaEpnXfKYtZByxgEv6XEU7LzYsdgTjgUOBQ6GQxwLORmWAbw6GG22BAuF7GZnOimfMlY9bFPFAzuAiU+3mJ1U2LzmCzuOGJDURAtMA4oiA2bNVlmtg5KBYuysxkBSRuqObSpGZRDBAE0Jje0gU4R5T2SZMkwuDsHLSKMOjK2vvVZVB2wLpz6r8UWX1HHQOfmP9Upohipt8hRpUmfZbM/MT5JEIt3yIjJOKiW9QUcH8pKvi9PRNzpwgdfDNIpn8UPggNH4gN+YEAmcoS5DB9DyiHILV//NRtUY0BlmnoM80yppvtBVZqBlvPGvEeBZboOq0mmTrIDkCaOwmgmDy5R5lZbgEf+gaFfyWvmAojcPJeqxHW3iEPa2JI2OPW3KcLmaHSG0ESF51cfIvvDrKr7XoReQ+vSwjIdKnV3SHFXVKc8Aj7QQq4C2uqrMe9o+vp0+qRdh0oRK/G6qUsrrbQtuzzPpgE+PnDUJx6JqPPyoXJR2/02JR+nUJbT70PvfN8p9LRNzx+K1tO6c99wC370W2Nax+PTnv1zuf+SxaHybjYbOw81p69SHBpwusvbTpho6iqFJd6PtNlUw9UrTc51ZC89MhCNBI3hFsxMo7FkM5a1o4L98ZcWOBfkbokEWA34G+gt79vgcHIvqXCgvHAEG+Qz65VSoHO5UpDparq2HY+EZi4ljQaHATcfC8upMBTMWI4fCsxd1aRSzJE1eN2uBbP5ShxAQ+ijOE1/rBlxHdEGE/MVh+9UEZk34gCGg7DweAFVaOlr9Q9PLcR1JJw6urfZ6AYLKEsbhYhqFPZ+mTMOKSK9b1n/SZQhm8iRudUJdki2vx+/hxDMvilIt4wJREB2j8sAzwNtdk2fgBAF0aSOXbSQqc6q8yrzxqbgGw6cqUc1fsYcAuqYmkRzpNGDiBmP4hH4KFdeUZCJA7Lg5pkBn9zgdeo1GiZJ20Bl2rW47OeBHeaHpBtGpu6EwD55hg0GHZEVu2ocbYrBhx3OwzpAPK5gEewQpHcH1XpHf5Iq4jzfdJkx7HJZ4Its2GSkURD1uzwbeyR+cHi/hPT6sG5yiZ/0qo7uDmhkarpgQH/BHXDfBMasLVNGmdCO+iUPoygxAjzOlryRDWQSY4idND4cu01HizRWe+eDCI/mIkP84unsGPplB/YmgInUWMG0QN34Vq5eXhIkzdyzSIvNwboG5BW5UC2zrWHzmi18pDzzytVa21sa2dlQR/mmMheX8htTInJepbFTdYNP4klEbYeLkc+JUeMZCjkTMUuBYaHaCpU/MYmj/A3E7FuBoxmJDS6HQhOVG7KVojoUG+56tqA6H4xrg76qD/OZY1KVQaJWOhWcs5LysScb6ai6FijKnYwGfOFkKtSccDhyMdDK87AoHZuxYeIYEfTvHJiwpQ2QHJ8Nil+x4FPWBvTkyJJ6mNz4AHR6zZccnu7oTNBGMq/0rJeAmX3HqoZebdQcsKS2Dy0gTA4aLGMO3pxkorIxxk7/zUmclrgVuBttc+nIMaCqf+etiNQZdwOnlkr62I2tBvOLfZMnLuYminNl6bSdpRGxE25a69ZHhuCwqzVDLZBlNl0Sv1J3ZK2QaBF+4USbfL03UhBmkiUPUwkAOglb2dCwaObwhnh4CTuFm1RToCBJRGirbKS6zGXd0m6NZd87pRLnsFb2VRekev4+PcNAoVdwsMk2knBDYiR3xH/Hsytaz7mmbKCEYXjPRs6dpeF2kl5XgKF9KGHNI/N4G0CVWk18BU7yk72UlLHEzTHjiWk5nD9JTnKTNcJpvRbNoPeNpPAtkIZGZPA3qflTA+7xklbLnjkVaZB7OLTC3wI1qgW0di89+8f7ywKNfax3BqJC1waVzdaNIum9gG3IdfDg7ENyw9p1J7dTc6IKntB0LLT3y7IRmI3AcmKHAqeD0UiilV5ixmDgW6MN+inAsdjvcPXEq2oxFHdQzuHdPq3JsaKCD4+KlUHYqtBSqypByLmduBodu7FiwxwIHojoWOBc7OatjIXzvvZCO6Nk2eGcaJRTnSLtiKscNHS6BFWni1aSj/nCmYwGJZAwdXFC6LoOdr+T3cq1HzQ+
KpAPYa1ORMlAWvHuagWKgG/QZl/da4Clqq7Avx4CztWPRyxzwrzPmcgcNtw01VIPGaLZeLXtmBC5htRBg24YAywiiwa6kXR6DgqZWBhmBXq/11hvBxongiw7wRHZVRmhjXpGsOM7tNRclwoLdQCsWYE1ZDTp0eY22RQY0Mwg4YloxW6RDnRVNlojraRJuGpe+UWdd9vh9PEiSgRhz1CAS9ZooDRiABDeeAqTMMNlgm55t0jV2XaZto4zGsyFtjjRZm/BTQsdYOODP4ptYUPU4U9xZ8mbBoOvhqfksfr1sx0U7k75HTIYKtwAPGeCqYCm71ythGSbbHmfuWKRV5uHcAnML3KgW2N6x+FI4Fq01nVFKN4rqQLJrmYEyAnnQAEQtdGtgadwB1Uae0HsscCh0omQshZIjUR0LnAsvhVpZLqvaZ+EZC9Ht4G1V0ocBf85YLOzRPgsN7Nm0nbMVu9kPIQeAWYOcMXAh6BTsWGjGRI4LMxbrOWPBcivLiMESdINjEfy8edszFZ1z4eVQ6VgQxtKrDL2ZmzdI1T0XzWD0+hiWsD+kQztqXo9BbqZHjoXgzf6iSxwbvyaM35hPIp0eWXeNxwSVZHaYxm20ne4gTZKAtjuSZ+KkHpnO/Ck88/uw6a5Ib9LEwVbNSg2ZXOVUvUdgsCvA2VuULXVLXVNehsATJ2HTsNetx5/SbSUj+Rm/Vfqg/7heBn16fkGbnAhrgbty9/YBnPrBx7zSYEnb4YCb9h/xAd7oBp5o0A7410SnDsgNZRQxclBwDSzqOU4AI32TOUzIq8zAyTI6a4Y8eHfqtzJWFg6o36qEg+RvwuQpYG+Hhp86W06jDL5JO8nrdTbiJD9hhGmTlD0tMziZl/iEs45ZtLPwev2ulfeUj/WuQC8JpSI4OpsEYPY10cFPHZr+rq6G0fKTU+pPmLSZ14dzx6K3xjw+t8DcAjeiBa7qWDyoGYuh29xcxGwkM8wGdDMmkBgOZV7gukVunRXLmWie/Yo7NcIsQcKpCMeC2Yp4E9SyQ8W1/4HN216qVJdCocsuDdKvxbFgIzX41p/eXv84Fusb6Vgsx1uhkIFjIRkcohqcCjktu7Qp3G+DklMRYXUscCq85IpZCxyRcCwiHumUz1Ao48gInYhhueFoQ4VudOJhFLon2qzOksxKTJQycPja8TIwL5VP45twEUFnuQqz3q1z4tTQeSP+yU0ckr/CWbSwgL7PI84JPOVORM5MJk1muty2mHg5oXCwYLVOlHGQ02GoGKFXhTkNdzEL5jUYDzZTfh9OdevzZsWtj8VUWZKNFlkfm2iqPgE3sqPBZ5SZqjs/yrdZ/8EeyVE8dYw4jRLkdXyc1yNU+lqvVEMvYxO6uYXAegtZdk9jhba6jBiGbK4GR9J3gm1a066jxm8bWzeczZHh7kGSGFfemzGlS/3NGKUWcoDNIJwB6vn2tkk+s/Izr8cHbyt4zyPjiZvpKa+EbxVuh9/zTrwMk1/iEGZehipIK8u0DhIn74M0aYZZYbP4W/aAWG+m0Cj5EiZt6kqY+ffd+/4e/JrG6c9Slwx7gcDQy32x+r3562Z768zjcwvMLbCVBbZ1LP7EMxaPu4FsA5bWi7vpNd9po5SN5EyhJotONVipqyXCf23EHNZ4OBWxh4LvVfAmqBUN8peZsVDIUihOCsJbm+iLvccCx0KD/FwOFTMWvC2qzlqkA8DsQd1b4YEPqmlVFI6F3wx1JRwLv3IWGTgW0tVLoeCFU+GlT+FYeI+F0t7ELWcjNnLL2bBDgVMxOBY7ahz7+a92eGlPh7aXdAoThUkFM5jC1sP1U5NCxZiZNYSABhLLJLOBOn6NqPKZwc22to41k3pL3aE3OPUY8e4USf4T2iYfPuBgmwpEBidw5/XI28STpkcRFyXjGs7FkJvyoiDAXaIhacrA8oCxlUGwShxB6Dpw3hybpdtmrIC0MqeMagtyBytNqCtuliELAS/LnoEOPI7N+jcdKkYOmJMiy59sp3qFvSI3rQpt6mL+kZEsuAUmB/cB90eAyZ7qNSEYEB1LhhNBNUmAnjmhk/aI7LhOyzVT3gQYtqqyU/mKU0XbfCmPrCzXCEYlJsFExqzkLB5TPHBSRuInjuEWeXWhySNpp7wSvlWY+L0+PW7PH5zEbzgyL3UDXuY5VNqWz1AEo/zKwDideaPE43LDu+dv0h6lVnHqhBzOXvfIAx6xuWOR1pqHcwvMLXCjWuAqjsUD5cHHHvcgqTbHtMKtL5s2kJnOhjqNkmnnqzGOo3YGalCzwc0wXzXLq2dxLFZ5lSyhTs9Y4FhomRKvmk3HwvsslI+MXTp375ATgWNRnQteP8tSKELO+N5EDPJDrxjU0OtsyNHwjIVfORuvm11jOdQVPpYXMxaWAx85JrnHAifC37LQ0qvcuD3IATf2WnhfhXTByRg+0BedFHytj4yUYdqR0LbEEdLR50e82tZ1FD1VWtsEKlyrPeTUzsy8AsF1nZ1cT5t12NBEb7oaEp/idOyjMzfFWO+ehnhfpopuvlN4S3f3Y+JvFULTy2tKmUctbVdoR1shWsTsM9V+F4Ju0r+xHMu1HtQEbzHr7NfrnXqO8qdlTf4KleUjQDWjZ6h41v00N8tiJlmvMk7KTl0m7IJjJYZ32sIspkIgboLCVo1fVy5oO7S+iqRPoxhHeoI+pxFIuw6ngeE+yqjEHa5tVtNpD7C2tsns+7un7VXs7xnEJF90THtafOqpjDQDuInf80RWLy9x4OO3gdX8hEOb+D0M2wxim1SLqiZxfKtLUpDvsnGdElLQegyxwQ6ZNytMncnr9XadVQKslXk9PtmZJj9xKtkQVH17noBS1+TRCFr5FEmkITJDThCk/Pvuvbexeq0j8xmL19rCc/5zC3x3WmB7x0IfyHsIx8IHnZUireFUvOsUAmeL69A7BUIyEq9s1Pswp14J+ehdOBW5zyL2VizhXNTZiraBu85YsAxqt87rciw0WHc/oJAn12sshcKxkIz13MchmV4KJb3pUOxQyDngDVThXOBQ6DsWcmZwLNKpcCinglkLZiniA3kRpmPBDAg880xLZsdFmJ1PKCol9Z/VYd2F48PAzAEt4FwHaEoAGNCU1eXEEqHKL/n0+VvFrWsoZT1HTkzqOSFu5ZvAt0r2+s6i7fOnPMb4FLDaDsQaBaeDBosE2CaZiCyuY74CUKfAyeRQolFV4KhuQenquue3ZXkaQ0vohI3L1Q+OgmQW4RS2uUyzqq+XBIfp7A8GyGYABXt8pwH0h5hsut8GBoMRezoLhlsczV6C247KAKVVgNLgGlbhKbO3O1nTo89vcqZISvd4ZM/CneKYTa+oyt30tPFJ6UD/apNZfBNmanD1l+ULBle5ImaGzQ2eRTrRJeWDip5RioEQPqlPq4Mhu8VSfwDw6flmPO2QRD1NwqjoQc4Qm9KCD99ZcHQe2YR0d2R5Rk9taj7mGetvbspluVGUa+5YdMacR+cWmFvghrTANTsWNLT+G9pjFXiUUPO4xTHpcBILfDfedDo682vbdig0+5AOhr+0rVkL3gTFzIQ3bX
vGYqUsa4+F3xbFXgzhoGfMWAyOxR59zyJmKmIZVMxexKyFy9UthxIDF8tvhZK8dS+FqrMWbSnUhpdbsU/CS5xYEoUjwYyFZityr4VnMASPjeIxW+FZCsvL2RI2bcuy9UR+2DqtFJ0cMA7bq8bT/L3dqSPj1doglTDiPW4gDpCUYXiHSycdXJNTYswOrSNZJtKF/0GMyzebMqCNfjsk2KcdFJ9F0+dPWQ34nWK1lMaVzuB0IsYsNo2ch+yBd4X1TMIcMypioEfv5JEhuTPLI34zDxdrGITxE/RgqyJfW00Gcq9DyuqLlLAMUcniU7dqYnTIghPNbGtWccDgaGVNJPKDgfNbxSQs8Tq00FEZ+s8y+D4Et8ozXIhJnnITP4SNr6O8jnaMFakRbkVIGYk/CycUqlqpjFF30tOFSuWjGKl78utD8E1tEn7H22F3lFWEJRDvyEi27I6kr58sY4ZZxrwHkz716dj3HB1PGoRS3TZBxUp75G2QxCkv0w4lZJAz/MZGONeQmMm70rk8FkIJh1KmftCmzkESOHPH4hoMP0eZW2BugRvCAts6Fp/TjMWDjz3hhpzGkM+gZUcRpYuOzk2oGszWjNKZZUsKzSxTdPjg5skMRToU6WD0sxZ2LNpSKDkWmlFgSdQqb3BiKZT+GKjv1smH8Zi1GByL+DCe3wxVv8AdX8OOckRBQ+GNtnlb+zfkvMRSKM1gsMdCh/dYaFbEzgXLrHAs5MAQeraibtpmz4UdDs1UxP6K6lCQLweDE5vyh3zbt4b5ETPnCzY9+rFt2n6KQzo3aTuvq5ceN+uVMOsOnrGVPurQOgo21STxe1p4J0/iqR+0gddDN+PCM+VM+UJ5rcdIh0nZhzs2B1wpsXJXsjd7llPQViDrWZF6PQfcsaa9PuOcIdXzGaAMqtKKATWvXkGBowSBB3pmA+npsy4NSyTou7gIQhC0WYMdzHpWDOQmbchJWmwVcQeh4KAYvMlwpkLJT93MuuLjEPT6t4JV+RGEHOLEUhS4SQss44EUWA0X4u5wGav+gBstiWorB4NocnyA2+OnfTKfsOcPbo/Tx/1BPfD5q3LNu1e86jDl06uWtLZQzQCWemY41c11IlmZP5PnNuW1jHoXJQ/L4NIzI61j0HOcDS15WeyeV1AOOma6DxudgEnblx/cXnbS9rCky7w+PeBRqCiYVO4ONEhAhugScj/0/vnm7c5Y8+jcAnML3IAW2N6x+MqD5eGvPeFitafqo0K6m3Mz2QagfadXcbMxJ5mNsJvUrsUFzjnbsWCPRXzTgjdAecYC58JnOBexx2LVbXY6Fgsa1DfHAgej7q9gr0UsY6oD+zq4d2+FsjrZ37Gur2/zutlVlkLhXOjEsXCpVc6csTAvzVTsljw7Fsxc4Dj4TVE4FjgT6UgoXvdX5EbuXA6F3CEeVourVdrU4WFDzJ3dU4bYuT+GbhhoDtgrZ+wuKB2iIQk2JnYIroCjthXZ5mj48NOZ6TFJFdI0D/k9zmy6HmN2HM6b7JBlGmWMElF2V0DHINUUqIsOqbSdIK4IQh3Xqjt4w0AkaLe6Tnk2Omzclbiva+Chd1zhMS61tdWllk68OGrg+EAgynovRIauE2ZNpz5jQpMkvjfMSDqB0zKCu8tR1apoY/komfmmHXj0DnctUTBtQrpk5QELRxsbbCcIQPOvNCRrmVp5JziZD0XGG25lA3wKM74u1sOiHTPFLD4TtaodN/M1bavb4DncG2I/ZWSJAQ9a6VTpSacu3ChwyzzI+vyEA8t4wyGio4cPivR3MUhDDrqmVZoegQK7VpQ+D3iT08o667cA5nAkj6RNraaUiQdl4gaXEDbki4NsFummSI9aPnTv3LEIg8yvcwvMLXCjWmBbx+JP739IjsWTbsjDsaCYNOtqFN0uKk5DCUQDceI0nBWjdUB9R2AyOidYKcyDxpYzN2632QrNQrAUCocDZfn6tt8KhYORjoVgwMmHJ0/oc48FsxW8GcoORudYMGuRMwY5a2DFpSxaaWeHnAiWV4VjsbYcMxfxdW+KGk4Jr5LFccCh8KwFzoWdCjkS5DGT4dkJ5IVM77GoMDZ/N7vZfjGDASwPYk5V2yYcPQMeOgMfLJpYFScwG0bUUuC4o6u8EZs8CKMTDB6hhyUOzCexht/pP0FREh4phWSvTWAnn0hd2zU16zgHYc3objfBx1hNg55JxifiGy5wmE7Keq26gzceiEwE1eQmftILHXIAnSVBXXHsmJAzaJtLDYGiNr/CpA0iM3YULqlbcAzOhpMH1pi44Y8yxkaHynpbfyc2awE4ZRNvcpJXKOesId8Faqi9GQYL9MwG8oQmnq1SZaFHLzapRnWHPp0tNtVXEikMfh1yl0c0c/KWajp1+vR4rZzKB8X1MuXpjMhJu2bNG5pCezrDgidMG53lBIF16/KsF7J0pH3SFkmfeUbSJeGVCoiz4lqxlKjqdOWrulUUgqQBN/CDq/PSoA0UkZYEaXJMde/5j1CxSQVkeSKZOpIb1OQn36G2lV0ZzB2Lash5MLfA3AI3rAW2dSz+7P6Hy8OPP+kmMWcsKOnQMKqxrK0tsGxUM0yrJD4hZw5wMt9hbZzJZ0mTnQlCx2MTtx0LOxTxte02czFxLFg2oSG8lkCxDKq+cnY0YxFLonAI0DUdjFYWtfJs3mafxdqaZiqWhxmL6eZtz1pUxwInAqeCmYt4K1R1LATvZyx6x6I5NTKCdZE+01kLBpB04pia/lEm8lFN3xIVHOksTPZYFZqB66gSRBDXvu6AZN1Bx1I4H8ro8RJM2OP3OL3ePb4LJMCoLEaIeyVwo+zg9vx7Pr2sHk686/YH45GRGz9c9KpBp0hYBMR6VIBRqKeE14h120S02VZblaGxw1j1AHeE37IGx6LhOhID4RlqeLaN395aPe3E580Ud1ewkoxc9og4n8B08gX7XYLVF5M1E0I41MFgcSlfOZA/aGU3pSU7fPOBWxypnkolAJrAJy9Ohn3IFnJjKZyBVnUVpAYOeirZKMAJJGBp94AM14aTzEOF0XVUX6OcSMCjx2k8lZ3wVHcG+QASUqNFn1r4hIHoclQK8+zKCDhLFiHlHnQwWZYzDdgM3IiNNuuSZcm8kNHUNHikayIqtLgqayqy55tx+DT+iqTaQySYN3kwTYJOru1ViRvuBDWyq1YomsLSRuYXtmysU1ZfmIpH4KV+UuhD976vkbzWES8drjr3ZU25wLAHD/k459+xSMvMw7kF5hbYzgLbOxYPPFweefzrbn9pZGiDsg0NpgLUBjPbTcNrY5XIbqyrFtlwb3IuxJg2NxsyOxTNsYhXzQLLGQt/w0IOhfdY2LGIzdt6bZMbaS+H0iA/Zyz82lktTYrvWOAAaNCvQVI6FW5Y0ZuC6FzbwKnIpVB1nwVLoaQDioIfS5w00LJDEU5EzFzEvgo7F8rzjIVnKJAXb4bybEbOYFhudIzhwI1nLexYVJvazshHTcHCalapWphggGcdZLfrcppWF/HB5hzJJ3gGLDIDo9Fnlupr4DUMlKi/PDI/0sCtfWY32WNoZg98oDMO5
e34JyZyrPeMPHBctp4dQDOMp/ChSNVCgVF1cVjBkATAMdNnVtpmlm5g93b48+CYtgrdsC1Qqda1lE2nYVVvmPEMX7UHJMzikX+FAYLu4ctaWkicMu6Sl5AvPHCZhZtlQ3+kgLNH56J+N3t3Ka4ZO2Dcr5wcBBGTxYORSDMCQubPrscOoclv9oJxsmp8kNqJSFlVn4E2yhO6BU1e06Z9iblfkhaWphPPKB/2RmYqUzkp3ddz8ifchNtnTuLJdaQrwBEgiHzPJ30nf/pb6OUPNGJKgQjEo8fp48meEHjaKcoaxL0ptpUNk1o3RKeH5bb8QRYKDjIwRj0UTV1Dn8wYh1viNFYt0vRzWZsu4lcVMKbjUSFuW6q4XoeUSZbNnCKg7flSAcEUzPlSqGrLeTC3wNwCN64FtnUsPv/AV8sjT6RjMS4kzX78RwMb1zEOqWxgW1hhvWPR8tToEs8nJP7qtgY/fL+CN0Ixi5Gbt6/wqlkcixWdWq4EnO9d0AGw6ZnBzh7NHOQeiwU7Fbv0XQscimHGAufCMwjCd8eggjCQz9fNrq4OsxXstVi3DPoGDag0yPKMxSzHQrJjn0XstfCH8uxIhPyktaORsmVQ8820jaqBTDVudlw83Yosrn3XhsUDuUFli4AGPOkkAtLo04wReNnpOZUg5yfnSLjO0LPmEWQ9Jij1dZ4kjbF72WPeQV+FO1EpJW8qg+xeTtCOr7NoUDyGiMK1qFoSBdVkg22ykL1KiDAPgkCYJWeq2ywcs3KFENtsx4CGXtwLuANYcw0HQjrxW1I0HAc5DDgOK4T6vVAItMNxYKZiWb+nS7qHOa/ghOtvn/b87NXJRyQpok9dMk5MPkRZ0G9lv34/i0osGJ8XJOi3pDQnMijvTuGHDSuTLIDN5ItYwr0/KlxBM4XJKw+yk2RADQYVxTwbMSIqAXoJ01nJo4pOHPTOOFn+/VS+Qaur5foywjWrKmta3+aVehjxapdUMORsh22dE0EyUvamskzlj1iHbUZln+JXGeAENraESWtlUotNYc839duEVAHGNV8AYYeUF6m0TRJEaL0aXc3rgyQblVsICR8i3c038CYWpYUmiYJZWsCp1EE4IyxlNjIiiQdjZwzY86VQGGV+zC0wt8CNbIFtHYsvpGOhhnHUFkYPa2A2/NHMbjYFjX7fudCEplOReaN84TMzMd1jEc5FOBbMWsTMBa+bjVkLnAxeNwsvBt5jx6Ju3N6tr3EzU+HBUIQ56zAsSaJT0ECNXRa5edtLocLBwLHID5vZOUh+npnIvRZyKnKmglDn9l/gHmYo6Hx5uuxOuBqV7ryZXPZLm0elZKdExlALadNu9Ykrp2GA607N4LiQ2fFovaPBM/AF76SP+PWDCHCaXMXzcPc7YhA5PW3imr7qlmWzrtMyiGBK3/CTmUMGYhVgHSLRq+N7qSFNylpJkdXYTHSZ6gHJbF3GOo9wouC2M+xjGVMpq0pcqbMOV+xgMEshx0Gwy7pHOXEwXAZR40Azk4FjcVFO+CWcdfFjadMR3Z8HcRh04qDIHTF/5EGvQGXc8JvWFnFC7FjkDMZu0+3fE78tOxgU1FRB3yxUy+Jso8A5j7Cizd0MCpfAGdlEJIliG1c2LmtfX1V3JGRdtGzRTHlO09DBuq/j/qbBPu0YJQIK3Yhn4qfyjbhGkofyQ99AxAb977ixgX+1DxYxTfLoeDcdXJguY0a04SqvqSm6lJlA8gyTvJYHTTNwZd7l+z6oTHs5YE7pzDUZN0Uqzz5InB7Wx7t8VOuSXaKDdvqjY+rZ2xbspi/2TvJeTyOlIn09JSzC4O8WHqZl/laosX0yde7SWvnqN5bLM6fp+zfKbcf3lB9+/b5yYLEt0E3UeTi3wHemBa48W77wsY+Vz93/VHl5ZXc58cZ3lHe/4y3ljbfs99LlsrFcXn7y/vK5P/50+fIz58rqzgPl1OvfUt71k+8q339cDw1XzpcXH/zD8m9+76Fybuex8vr3frj8/FuOvya2uqpj8SgzFhpIcHRtrtpSYENLOsSMOrpk4wzQToXCbLQJcSI4elg6F56pYBBUZyuYmcCpYDZjRV/CvuzXzWrGQnsv1jSrwaCfhh7HguVPzFiwPGmPBk08XeVJqx0LnAstTyLuJVGCZ3no1OTaSK9cCrVUVuvmbeRKUePaKRE9b3nyEijvrwhHYteucC7yWxZtzwUfymNZVN38jW3DqYkGMvdX2LoYXP8xYxHDs75zHndofQ2g3xaHdScv+I2wzCJkGi42PdfEpZ6yY+0lgUseB/mZ1/MwvOIkbsPvaM1El5RlHvCsOJlPaJgQ+hKlfi2/J1Ac0+Yxg2WTmzjXG/bye9pZ+vf5xNNuaXwP9gXEgWAmYkUhcZwInAeWNOXMBc6FHQvBWe7E742aQB87HjgWylvihQiStaD78LBebnBIv5F9+o1gFmiYzVjVzxIHhjTO7m5lLuh+3aNwp0a6zJwwS8HyqCMLu8uxvQvlkPY04aDsUj68+jpRMtLKAJ62GNVF1RVcjsSpCQdQd1ZSnHRCq/UIBJbqcQRKxazUyrPsJGnI0FaCYFOZAK73tXA9+BXHxBzpCgU8kmfVpzHqIz1ewpNppkdhlznRo913oEimtRTOoGVltIU+fRmCxkxM1GZOxS3KHrxGNJ3dWtlDlUEHmwUFNh+pf/J0KPwGn5A0S8xm5/un4fS6dXyQYZwuP+V3aC3a51mvKrvJEWbqHURjewHjtvCdNCKa77EIe42vX3nicvnVP36lLK2sl6MH2BdZytmLa+rPd5T/7P0nyl03L4wJtkl95Yml8vzZK+Wn33JoG6x51twC324WOFce+si/Lr/3yI5y+xu+r9y5/8XywJeeLCu3vbP8+Ht+pLxB/sErj/9p+eQnP1ueWLuj/NAP3l4WLzxTHrn/iXL65DvLh//qW8uRlx4vf/Jrv12efeO7y/fveqp86o9Lufe/ua/cs+Ni+fKv/25Zf/cHypuO79XKhT9/2bd1LL7IjMWTcixqgzu0u7SGtYOtOvTt4yy1sjGmDSbOYIXWlXCWY5EzFls5FqtyKpilWNLJPosrSjPo9+ZqyWAg5NfL4lSwibs6Fl66Yafi6o6FN29rmdXaCo5FXRJVnRc6XRwKfyCPgRQ87cTEEijifNeCTdt2QJS2k2FnZuxYNGcCA+vE3v6zmYFhUSD16rQStUMj162tI1z6jAZskf7pZ9ZtdISVccc/o43Y3IeBSkrq8YCRnpWHvHYvcA+AS7l1JHyrdE9rgkpj/FQghSpvyidpCEPkWG6fjy5J38OvNb4VbZZxOz4UgUGcQ12YaViR57AkRyFOpe1YaFZCjgIzF14eJVyBhYNTEY4FJcxzRbiXdV7Q7+SSQmj2avnTURwLnfu5h5GrsrOM6qL4wgcHZqcUSseCGQv16Wjp3y97L5jxuGnf3nJq755yRA5GLK1iiRS/xLQ3NLqPpSP2SVvU6idTHMd2T5ya6SAuYqJ/H9Ylfh9twEuey1JxCIzXpSt9gkeypFTCO4pRNPH7uk5YvcEwZtBUfUYMMoEBEq/BMjIr
7DQTncss/qmHw8ZSuZX3qERb6JO4FN740IKrYyvHwnkpY1SZVsq07dLJbbJaZkTSHC2/FddKTbClZ9VvmpH0QT6btuF0eidsyi/TmZ/2NpyiVgSrgz0SoEi7L8Fp+iZORVRyvnm7GrEGjzyzXP7ZR06Xm4/uLn/jvcfLqSO7nfOKHIuPfOF8uf3knvLO7zswJtom9X//0Zny4NeXyv/wN27dBmueNbfAt5kFXvl8+Ve/+Ily8QfeVX7yXT9U7tx7qTz6R79Wfu+xPeXud76/vO8tC+VrH/3t8okvXiin3vvz5d43HS67V75RHvzUR8sffHZnefd/9XPl7le+VH7rnz1Y3vgP/3r5C0uPlt/4x/+2HP97/6C84+lfKb/4xdvKh3/ux8qtBzVmbe3Wq7fB9o7Fg7HHAvbR7qqrUSQb1K7PcRs6wFvLuUkzGmWcCcI8p2njaOCTG7iZrfDGbQ3qmZng+xXxPQs5F9pgjYPh71jIuVgXLp00nWLMUOBYsNdCMxY6/XE8BlD9Pguld8qaoX90Amt6YtveCoVj4W9ZMINxpTovzDTgUGhDNvxwGOxYKM4rZyXTH8fj+xXKZx+H0zgY7ZsWotUTY8utdgWPdNoSw2a+68GWHsyanWrDVxb2E5GRskMjhU2yV4u4UczfNJEEMdESMgrhmfIQxWH+kpl8+juAvFkHuMmH/KSdwvp08pniMuuTsAwTd1bY89wKH3ji9TgJyzD5g5N45PX5PbzHz3iG2C2dCpwEjenlPLO8iSVM2hshwLIyPGOh3wjOAjMWHKbVhWVOOBc4H3ukh/c+SDecinNyKl7R7+SCfic4Fvt0bx7bs+DlUPt1b+51HWofhn5rL2mZ4cs6wWffEh+d3Cs7M7txQPf9Xj3ZWBEf7gfk3Cyn4mY5F7foZBZkf85e6LfFDKL+232V960V18V5zh7bfIoHPuWUobna3rZzvcnSzmZYcYzoOzRiLTrKh1nFTGUqeh9UlAB19H1dJz64qU/+Psgzj6RFVsbJTNmCNVm1bGRzpCzAnqElFH7CzcOZ3I9QVE7JW5BZdjUmBL28VIKwh4OsA5lZxgwjJ/IgSrHT/MTLcJo/TRsPZr0eTb+I9Pok32zrjDGiTeLUNSh6uUnbbCiUgSol1NAZmTsIck1WsKE1y3UzoNmO9937nfcdi2+c2ygnDqjt4HVy13Fgsn/y6y+WF8+tln/48zeXQ/u2f5QK/rNaKvXS+dWyV09Bbjmmhxya4eDA1s++fKX8zp+dK197fqX87Z89afixg7vKPhqy+TG3wLezBZ74d+V/++dPllv+ys+Un3rb3eWE2o2lr/5u+X9+/8lSvu895T/4qdfrBn+2nD6vFQi331VO7VdhVp4vj372j8pvfXy1vOvv/Fy558z95fd+6bPl1H/xt8o71r5S/tU//li56+9/oJz9Fx8tu+/7a+X9dx/xioRvhhmuybHIjiFadDqKaA0znCrSN8x9HnC3r4Q1TmjHQoOe3sFgxsKzFhq4eH+FBkSxDCpmKngrlPdaaAAUjgXftZBjoQEVrUjMWOhVsxrwx3cscCxYssTSp3AsvAwKx6AbzNORUDyGSzgW63rd7Gp1LK7wVijPirgU4ViIFsehzVhoMBVORnUuqmPh/RhyOBymYyE6ZONcIBR75mklZDybGnjXm/Z2j5poNTPq9NCyWrxRD/g4UEFnIbJZHi0mWMoinFWvgSu8KiHxwE3a5DsNpzg9/6vRTnlN8XteDbcWPnUF3uNN9Wl0r1Gkl20RVT+2VOMqeE+EZiouXFnzeREHQ7+LK8pkGRNOhZdFeRN2LGFiBuPllVUvd6KGj+qeO8B9JnxmKl6RY2zngt+T6ogZhZsWFsoJOReH+K34cQVy1sqzS0vlhctL5aVLl60L9ykfnTyqGYljC3vKcU1bMO+BE74qufvF64ju/1OLC+Wo8o8yeyHeOBh7pcNu4TOgxf6+r/u71XCs0O4+EvWuctSXvJ8jR7igV7uZMu/jEGAasrlD2xGIjW4ikptidO+6npK88qUM/nYPKlT8vAcHHZtERaAYjk11P2S1mNV0Kjgmf3MSM/Or8hsREWVkESOWkpPjWJcRrRKJZXi1Z+grusoqy7yJNu2vjNB3TAP+tOxZrll5wDhGNCgR/4I7N+pgZOGQ3+jAQ3fREiQd1NMjadDL2CYAa3PNgks2x8CzK3NkQdrwMkJ/Nz2+Ex2LB09vlLuO7NCLH6al3T791Isr5Z/+u5fK29+4v3z4XUe3RX7uzJXyy39wxk7FgpwK9mHwsPADbz9c3v0DB8pLck7+0f/7wiYeP/sXD5e/9IMHN8HngLkFvq0s8OS/L//0l75WTn3gZ8pP41ioQ9944mPlV37zgXLhnneWD/7sW8stI4W1MuHZ+8un/+APy1d2vrv8zf/4reXAhWfLw3/4m+UT3+DNjlfKxYV7ytuOP1TuX/uZ8uEP3qMxQKwuGLF5lYmrOhaPailUHNl80mhGPMOp7GyYZ8Ezj5Bm1aHiNLI4Ei2teDoXOBY4ESiLc7HiM94EdcUzFtXZ0NPVqWPBHou9GgzxNiicinAs5GB4tqEO7OtshbsRlY0GCcdinVfO+gN5sRTqCm+Fki75lDA/eMceDTspkrW7dywkl/0U8XYoyVI8TuRW2Rpw9Uuh3Jlh32pjOqHWwVWD2vpZBx3MnSA2rTDbt6WGjg16cBPPHV1NkNfgqpOmD3Clp0cPCb16BlPs7dM9/63ura04TPF7Xo1GCqaNgM3EacjjCPzbsEKFTtqp3DHV1inzgk+PIv1IM1PBjMSSnIpLcibOy5M4L+fivO69S4KtqB4YznC1My4aHAdmLs7KuX52abmc129kQTrfsndvOaFBPjMNwNKxOCMnmb0WyLtpAZy9WhKl/RHC461O7LF4bulyeeHSJZ2XPWPCK253ywE5ubhYTslhuNkN0Yb2YbBEK/YeIZMZjcPac8G+CxwLlkaRPiD4HvFn5iNmCOOe9H0jPVCmuyuB+NZ0RJdmfwOSihLEMcQSEmHWecvPSP/YPmGQ1Ps+uURWRZDuHK536qGeyMh7Yaxncql0NZn3T+ZuFzZ+KbtaxUmpZc2kR3+MYIjus5VOm/Q0Ge9RsQUHQRU/M22g8QbqtEeGWeYMk4aw4aDowKJHsa0NQJH4DxgkUjB5jIgmiSx3iBgLmuqV/Aaa2fiYN9Su+dIPWI/d32qB3mqVZDu+Ex2Ll1aKHjgU781qBb2GyOcfu1x+5eNnyn8op+JH5Vxsdzzw5FJ5Qo7Ij33/gcIsxOXl9fJLv/9yeVozGP/tf3Rz2a9ZibMX1spvfOaV8uizy+Xvf+gmszu4X2/Di3Wd27Gf580t8P+vBS48WH7tF3+rPHnyreWnf+od5fsOXSyPffK3y7/9+Oly4sffXz547w+VU01DrTh46Yny5U99snzhhf3lh993b/mxuw6VjbXlcun018oDDz5XLm6wOuF0+cIDK+Vtb7+5PPPo01oefbC8/p3vLT9y5yHtk3Sr1jheb+S6HYtobENoNrzbCe0ba+KZptF1uobpRCQOoWc
sNPjxMiiWOymesxZ+K5RhsRTK37XwBm49y9UTXb8VSoP2PRrM79WyDN4GhZORsxQ8qWVZlDdP1+VIlCfPDfUE63wkT47L6vLlcmV5Sedy7OOgl1XX4dkG8cGx8J4K8YvlUCyL0owFjkZ1LmLzdi6Lqk6G6GK2JMNBft+Lp05p51bl7sBqXVijxJBtK5I7N+sb9jYv4w5DuOgA45qdqDmJDv3yyLqDR8bJC8rAUlY7el49fkNQBF4c5M/C6cve5091gEfySn6E0yNx4JVxcHreUxrSlpclVYETv+eReISZT7w/psOJtB0hJ04FMxUsf2KGAqfigsJzOs9qKdR5lkJRn7pn98p0C6oeliFhRZZAfePycnnq4uXyipxsXiP7PQf2l9ftW/Smat4GdU6/mVf0O3lR9zIhr53FqTghB+S4nACWOLE5G/1f1kzdaTkXL2rW4oLwVnVT8Qrn43sXy01yFm6Rs7DbvxNeYct+J83wiY7FB+yxQD7Lq04sir9mMW5SeFDpRc0c8mrbnerQ+Z3mcxLfkZQNQ+jg1qBceaTtoi4SOoRb2by/D+E4xgthUY9VcA3M2bZIGcqo92vwDET49TIafytfS0C5rnLAoy9jogNL6tH9lkAQO/4GK516jGgqU9uwo0lZs3CnfIKsF57U24fJZ0uscWW3+6C/CZBaq8D5yZNwlu5TWVlPg0UDI/lswm/CxjkNXwql2uaZ94fCAYf7YzhGlmsZRDa+I98KpTG+26lW1MEU28bYQ/H7Otmg/Ybb9m6LOyvz84/KMfnEmfJffuBkufOm2OA932Mxy1Jz2Le/BS6Vp//kI+V3P/1UWd57qBw9crDsPv/18vALB8o97/mp8sGfuKfkvNvK2afKA5/5WPn04xvl9e9+v2bkbimL/Y9P49qVC0+UT/zrj5Tn3/Sj5finP1me/Z43lFMrXy+PXXxz+dDP/Ui5/fCi+/FXa5dtHYsvPfTV8uiTT9UGUk2ylIvGGy1J99rSt0WTmWGvFDByM1QkOkuFfupKujvT0cCpYAO3ZyschnPB8ifvr9DgyEuhFOJsQOfXwUo/b95meUfnWMSH8ViWwayFBvQK07lAX5dJxWJgjmOxjmOxoqUgcixW9CR4VTJixiIcC/ZYeGlVOi04E5IXjkadsSBPZz9bscMzFrH3YlgOlfIH26aNM7SOtn2NqQ6iNkjHgZ3Bd21QFtnER1SP8kjpQp5gtSbMh0xngxGMTGrEiI2v8KgQoyse9MklMxNrQh7KtLof54oX+lQc8mbdW8BdXgqTRy++gqd8+nSSESYX2yVpyUg9RnIQlPdyln3AjVyI08rE4zBr7nklGXrkK2S9n4LlT5rOv6yqYxM1jsVLmo04wz4LOR7U0sk97I/YpU3XOz3DgaPwpJyKp85fLOf1FjMG8d9z+GC5/eB+z1qwEfuceJzWLNzzly+X08K5IJqjzELIsTipgf9hHHCVk9/kK1eWyxmdLwsPHVh+tai3nR3XDMgxORgnmJ1TAdc9Y8G3M/jWzLoaJC0jFMxvsZKuB6TjcS2Luk0Ozi04GXJKDnhmD8c/HHPcV00UeuYlrDPYEhvWmyqzIsyKqtDNFo6M7o6eYIpB5eF7oa/XipmB0axIQrqw16PnMVIcJAH6/MqiZ9uyeyB4ee8Fl1S7clDQCInKEvUEoX840OzY6zxwcSxFT1Hy9zL9DSYc4j6vj5MXRUjuQOJo9VZNNLUT/JMX7XLj0Cno/M5GIkj2ozDvhSZzlg4jikj0ZQSS+sxAHbVF4DV9hWw+qVrNQJdQd6Pcd++9s1i+JjD61ixXhr0gYOifffGr/fI2D0v4bV/v8QdfOl9+98/Ol/9Em7bffNfiNZHzWtoXX9EeMjWcT2m24hP3Xyj/qRyTN1bHZO5YXJMZ50jfjhZYfrE8ev/D5cnnXynLew6XxTMPlIcunih3v+295X0/fKKw0nDtwvPlkc9/qnzmoXPl6A+/u/zEX/zecmyyt2lt+ZyclF8vv/P0PeWD710q/+J/ebp84L/7a+X7y6fK//k/P17e/nfvKz980+Fy/a78YLRrciwSfWh8NBjvG/FEUEhDNG10M0176vwamoyGq9IkLWE2ZpsdC5ZDybnQIIYP4+VG7lgqFY6Fe10Ji43aciw0wPKH8uqsBQ6FZxk0qIklSeEktDKpEWRj6waOhfZZsMciZiy0JEoDMXSjnbRDIAfBzgpLnnAgkGHHom7eZvkTcOd1Mxa9Y9HNmMiw7jnRJU/s1HQjzp8b6ugm+zxwfdR8WXyoj9ahBR2FyDppZMhtLGpM9bHlUXmQn1hQDVwq5RY8Uves+4rdgt4GAMGbdYA3ystCJLLIUlaCtgpDguyWCBkRT5crdYhKqHhCMp4wUnbmCz9YVEY9HDLly4fQciI2aettTCx7kiOxJKdiTSMplj/hUDytAf5pzUSwgZvlRLdrk/StGqQf0RI/ZiteVP4T5y6Wb1y4KB5XtPxoQY7FofI6zVrgDLAngxmLF7Skj2VOp+UoL8nROC6n4hYtb7p9v94Lz6yb9OdNUGflVJzVG9de1rKpC0qzFOoQMxa6v3FA2FNBiVg25Q/u6beiwpRFQfmYJTCcHYp7WI7EbdL3rv37vcH7qHSLj1aGg79bv4H80B7mYyCCrYlDz1mNSEaLE+VIC0dqfA0uwAZsUkkT0JqXdQvC5GBQO/Oo1UrZObgadbgY3vSeISNZj7ISGNTVCJnowo4ofwO0UXmw7KyVPXmmzol0TSHEmwn731XKT3bTdI8bOqXNFBKdpV8ntjkWEzUmSfHaBLFKHatUscm0rrPJdP+lYkE2LdfATOyEO8pvunRtQyWwOOXnvfid6FhQxrH1emttHf+SXjP7Lz96pnzwR4+U97xp+zc/LV/ZKL/CG5+eXipH9u/y9y2WVja0mXV17lhsbeJ5zg1nAT3gW75UzjzzcPnsx/+kvHTsB8vb3/PO8gPHd5f1i8+Vx77wmfKZh8+VxTf+WHnvO9g3MSng+kq59PyD5bd/+8vldff+fHnHwU+Xf/Q/PlV+Ro7Fm3d8qvzC//QtcSweKY99/amJZjSONBPRVGQDCiwbVGAJJzRcFAnPxhTGxBu80pGmY8SpmOVYsOeCfRY4FUsa/PCGKDykVc0uJK8d6oF2abDigT6ORXUqcDK8JwLnQo5FzGAMS5GaTnUp1Ib2WYRjITnazLrKBm532rFMiNkKZMSSJ5Z34Fgs1FkLbda2QxFLn3LWgtkKv6ZWYf8WqLShDGabZZowD8NqQlgJzupw2vCalfYlI0A1oyMlD7tx9Pz7MUSPbtxkI90YcFKTlUWvlXnCp9ErgiR0bDCok7iLZ7kzhFmPR3rWESWhLJE7lhRl7Pn08cav6pm8gKe+ve4MdBqOI01oY8UQb12P7chOnTIT2bn0iU3S8pwEAABAAElEQVTazCiwn2JJyDgVOL58j+JFbch+7JLewa5lScw88AXsu+Uw3LF/sRzXPcaG7OeU99gr52OztWiOaiB/+6GD5dS+fX4dLJu12cD9vGbfntWMxRk5Ihvi9Trl36FZjXsOHv
AeCxz9i3Lcz2i51Gk5ITgWrwiPvUBeNiV5diqEd9m40ltOBG9vW1ApD2kgu0vOBr/Rs6JlIzhf675Jzsv3HNhXbpW8Y3Jm+KZMfLE7vvzNBm+WUOVXvFkmhZMzvFEqLF1vN5twas+06zjsKknRWkOOYH/S/T1CTYUkZSg/D+NUYoNbXDwqmvmlUgozamHgCy/uNyIRn3V/97CUf7Uw+AYWbVSmeQBifpJvNXUJVUihYy0IpF15g9P2V2h7kjErS7MegRfpkbyOfWf1sE2rhaF2+L0B7jRuHII7ScVqYiy3gUPnysT61ELk/rnGtEZ6nbFr2nYKz3SGVqUzEPYZ9FS85QHdoRmL909Fv2bpb9WMxastwPNnVsv/+usvlLtOLZS//VfiLU5b8fo3nzpb/lRLn1g29fpbYtkTr5X9pT94ee5YbGW0OfwGs8CZ8rU/e6A8/sxL5cw5zVrsvrm88W1vLW+656ZyYIceAD70ifK7v/mR8vnzp8r3vfVN6tejeDsWDpUjd/xAeScfu5BjsXT6sfLprx3Qx/VuL7tXnykf/d9/rTx55z3lptWnyqMXf7B86L63fWuWQrVGUnoS33DrHvFZNZONZYbGMV00qcAzr5+tENBwYM2x0KDEzgWOg87cZ8HyJ75f4RkLxZmx4DsW7lCZe9XhWQm9gYllSX46WsNcvhSzFgzyWQ4VnW/ttzRYlg786SN5vGrWMxZyLNbkxHiDuPjzJJCBXzgWOCw1rie6OBe7ODVYSofCb4RiBoNTuNHhD7JjAFU7etkLW/enC6VL1kd0t9WmwCsC8FpF9FxJFqF4ckQ91rwOxbm6OOzg0GSdJT0hiGO00Mp5/aXqEXoFjlXpiHv+SZplzfQsnMwjDHYDUyRNeYzwpVdvC/NHMQygMOWZB7iVOEuJpCbN+JHmY3R+ZbFoNPkQs3LGhQc66f5RyJe0mR24hFOhwTn7KS5rhoJlR+yjYC8C+xueW75SHtZMxEssxxP+ES0tulvOwO1aXsSXs3k17DNyLJ44f0EzeXrdomhvUd4pnQc1W8BD6yUNNnl17HNyGF6QY3FBv5+d0vNWzVTceUBrNQ8d0AZr3a9SDh6vyClguRQzHOfkWOCon9RMwzHdv2zS5g1UZ8GTE/KydOS3x1rOY3IODssjWFDcv2PBWV7Fx/OYHTms2ZNFfgPKdzF14bfExm6cDV6Bi4PBhvO9vMlNebt0Yq+YI6nVI7qs29GgtMKz7mRiH1VcJLh2gKzPzGz8TFsZEFBxNajQ4QZIQcYIpE5Ekxd6KWeKn3QmDcosQ8uSUOuWetQM8BKXMNtVyhjtCu1bqBqcIYwYNmww+DRhV4/0v52GXXVzupYxdTOsl1eJkNrkEql0ZGcdE++wpHNoDSzjxtmCljyOYF+licVAK1gFB+Zw7cs5KsuAsmUs8a3txDYpLsB8eft9W/L5Zmd8uzsWlJcN2A8+tVT+6nuOlh+5Z7yB+8x5tZdLa+UOOR7/5Dde1FshN8o/+HBsyob2c49eKr/6ibMjx+JXPykH5JFL5b//67dq72W760GfH3MLfJtb4MVy/0c/Vx5+9mLZddPd5c1vfoM+DnnIeyK1xqC89Nj95Suff6Bo0m58LB4tp+55e/nLb7l5DCeltvLy4x8vv/O5p/Qw8WC5+x3vLW99zTdvP/TV8pj2WHBk4x4NrCENZoTukg0pIOJJSyNK2mcfT5hCCso6bQYkMSjBsWD2ojoWGsjwJBRHIpdC5atncSzAzT0FDIRwKthwimMRS6JifwXOBIP7nLHY2rGQrLoUin0Wa3pyzFNeOjNev+nlUAzGNBgKXsxY8KpZZi0W2owFb4baIRycCr8lirjktzdCoY96F04BfWK3PLFl2rGF7lija+KazSQd5XaOhfGQgd1hHCw6+sqswkHJI+s2dQDe0FC7cRn0CRmB5WvFsx7JGD7Uf3f0MhI8xUk4YSfB4NTFcihvd/R8kJNph4nrEK5WWE+lx/oZnjaEvfDlE+j+lbMgByG+kh17DTxrIdzq87qKqet0LFj+xHIjlj2x2XEN2Tq5h3E4XmCZk97OdFEOBHrwytc75BDcrD0LbLh+UY6HHQY5vyypPKz7DMeCJUh80I43NzGrwVKq5+ScvCS8y3KSF1S2WzWLcLuWKDGbwIfyFnQvsm/inHBfllPxvBwRllqh77Hd2h+hfGTgWJzT7xLH55weAGA7Xld7VPqw/+JQdQ64z8BnCaI/UKkyUbw8eNpf70TJYOO3llnJoWBWBqeIL4Kz4ZvlXzg94EBOveWrBaJmsn743dT7qeMNTVA60i7gTuGpz3BzmxFCjQpJShsiACs0mMK4+0U4Wcsa2jT8yPK1v+/zvmzZWZ5eZ8kEL890KtDE6uoCz/Z7cBG4VEiGCIFPE3b1SP/bGWGLp49qj1E5enlCSokVdUSH9ZIVGaFbXNE+aYOoXmsBejuO8uFTyxlmHOs6xSXdl3NUllnIHcz6pUEn5aYxyHY6dZ07Fp3xFGW/xC/81kvlghrFv/D6fZ6N4CHD119YKV/42mV/Qfsnf+hgYe8E6ft+7Eh5w+v2lq/qw3q/9blzGidsjByLj375Qvn3f3quvEuvoH3zHYtlUW+/uE0f2ZsfcwvMLfDNs8C2eyy+qA/k5etmafiy8aOxjwY/QncNXa8w4EUDnuq2jk+AjPehOzXxaY6FBjfNwdDAhiVPft0soZY9EV/2zIXiwHTyFAYaePFKWfZWePO2Bih808IDGw14/GE75XtGgUG9zqa3ihVvhZJDowFTOBaxgXtNT3w3NACEv22iRo6ZCH+7wqHidizYwM0+i9hf4Y/i2akI5yJmLSTTcnEq0KHamJ5UZ3MysDawGmbcaUPrJeug0huqTo3utx2VD/wpQ+JkPrh02D6Es1UnOhMu/NStcki25tM62arDFCeRk0emp7IyvS2emFMOcBI/+RECS/oep4djBqzB4Wvay5C4gB9LwUDS+9NxKnAQ5ICyx4CN2Fqgx4vKNI6QI6F8nAkp4HsOm0DDvgnOy4qvCO+K8PUArmjXgh2OV3SvM9DHaWYmAAeA2YOjOM66b17WfQkOy44O6L7m1bEn5XywsdtOgByAlzQD8dzSimc2zsix4IvdzA68Tg7KrXJC2FjNrMKiYNx75/R74rW0L8ixWMGxkC77xZsBPgcDWF5xyx4LyshBHo7OCenHNy14xSzyOfwKXeHz7Q0cHeDIgS+/W3itCA4PnAleW3t0L6+s1ZfB9QYqXoXLZm9OFdl1ol+QqOt9rFgehmJnHxlGqt3fNdeBCMZYXWbLUKSWvcut90YPibjJmg6Q1l+iYHnvgYkduR+ANTi4sgeHfzeOOeGCN5UAiZYlPG4ra5tp9OSnEBvH7yFDwWRPjibTqfEF3td6tN8OxkcaelXiIRZ5wzX0J93oScw6sIn1qVyVDgHiLngrh7JbXHyCJhgmvIfNEpWwxEs6wxHvMibWECY+kBav9gA2Iuvsk/znjgVWGh9sxPZrYr+hBxwr8Zs4qg/fvf0N+8tfklPB93HYtP3Lf3imfF2vnOU4dXh3+
YC+UcFSqP/83hPle+VscFySg/J//c7p8g19LI8DR+R6vtxtovllboG5Bba1wLaORb4VKjgMTWI0gl0nKITWiCqejWQPyzhtMvEM03FAhuHqTOlogZMmZHDvgQfOgwZPdiK0UdTfsxBsyUui9F0LDbzAUy+rkyekY8eCV8+mY5EbuD3wVwfrTpaOCj24KCpNJDv3WIRj4c3bGiyimwdFDHIYbGkQxIwF37FgRsJvhWJQZeei7rUQXsxWsBwqZiyglcH0z2CJzj7sisNhOLpwGGdsc4PdVVljow2wTE7yahkpoHNUDhVG7If6TUrzEpyycmToxIzLVjxMG9JsXHOTOEskUWX3GiSvXmaLC7/H7eGhVuhLvPEhHpkRq2VqoFcZsfnEOByH2IDNsqbz1bFgEI0jQb52KXhwjXNBOmpAswmyDd+suKB7HeeC/RXna5pvVuxUmr0Ge0WxS4/3eRMTm50XdPpVs0qzf4LlUwwVWUK0X/fXYb2NicE5jgdfyGZZE7MVT2vmgzc98Raqo3rVLDMVN2uJkp0Q7j/9UScvyqlgQ/hLciz4HTJQhD+DWAb2zGyE86DZQAA6mFFgOdOi9YsZBiqd2Q32jryk3+slftPCPSg8ZjVwhPgts3fkovTEsVh0npwKlSG+h0FcH/tTGqeDt7ohi9M3FcJd7bqXiU8O0PKuyP0QI5QuH3i7pxRvd9uEcUuK8ayBs2HKM54V2IyVOlkX2de/eyesRAUnk8iwE4FE4XNEe6n6l/MYD1bkBNKmuY3R7JPqCx1oX7y3S2E+SOH3kb8RM+su8B/0Uww1KAdHZtQkcPAdZmbiBIWgCahEghPLcoCW/HsY8JmHdQmegZ98BauiUi/TW1hwGvFPMrJElzoE5hZX848SJX7Pcyhr0sdd1Kkws9xzxyLtNTs8LweCPvLAYjjFU6wLeuMFv49D2sDNcUXt6KzvVOCIUG9X+5r3lP88PbfA3AJXt8B1OBYwixY4GtJxh9QaVf1Y3XjWTg+qlkc8T+X3zkPiJQwa4pzNscCpSMdCnahnLORUNMdCgxb2P0DD4IFOFEdiQU88/fVtxe1cqNP1xm2HMWuRjoX7C3RUIRiqrbF5W0ug2GdBaOdFOtB4xXKq6MDtVOBcVMdilwY/OC27tXSkvRXKX+GWsyG9/F0LQp2M0tKxwLac7vgxiuIRVHvXtIH1Ui0+Sg35WaKAZN2Rcg71pDPgA1WLoQ+44HWhE91lS3poGh6yajqK1XKQ0oPglzJB6uO9rB6ethokiqeYpvyBP8CENhWuOdJTwgYnwa+K1T3hNzqxX0KOBTMXDPgZ+PHtiSXls9RJgcvKEl/G4wz7wGPATT6vk+WbFedEj2PBRmkG4Lz9iW9D2JkQ3LMjomHQjk3Yo4BDwX3JQJ/9Ekc1GN+nOM4Nm7Bf0EzFMzgW0g+92UR9pzZTM8PBYAhedlKkI7MgLJ+6rHOX6PnStGcopBcHS5Zu0vcp+ADfIctl4Ips/SaEj41ZUkV5WUp1Ro7/s3JUzirObMxxzUKw8fykzkv67TJDclYnxsFp4g1VlBnHgtfVMgNzQiFf9F7wtzD0O5YsW1D8OESqIwbSjlZIBuRxNOciCAyrLCKO/vXwndnhJZww+YwHkqmFeFQ20/u559HiwuV+bfexbR4M4O+YYF4aysMW1dW6wlXdK9ku+mvgYrFbDpiXWmIfVRdOBW0Uy0LjI6HxDR/08jkYaFAH+UPKZQHXR2Yo2WAV11SZ3+ghT2DloTxi+RtOPpnuSGdHrYt4VrbJvQfA0/wGkc4GZnz077kLmHr04E3xpFdG4g96DyVNumzdXF4BB1zowQot5o5FWmwezi0wt8CNaoHrdiyyEaUhHOLj4rvBVsPL0RpwErSgNMg6GWw5b4uQAQmDd57A8bYZhwxIesdCgxA2cONYtH0WwmEJgdj6aSadqT+QpwFWe+WsOlvPWDAY4mQgpLC28O5wWAolLaVnzFis6qmtN3FrcMZeDpZD+amgaM1D9NFpM0vBOXYs7FzIsWDGojkWoulnLLAnAwDCdtpspDVAkPlsc+yoY9w5BQy4Y2F+khWQwYDXuj8ZSxICVbx7UhsSFsAxqo4MndDFOtVwmgfbMb+kEhyRvewha2YM3rNkpczMg69LI8GWMYOby1vLMyN7M8h2yfJHdpaLAf4KDgHOgJ0K7ZfQoJwBOq+RZVDPAB1n4Yxmuzj2ih/OAg4AT92BLosPG6wvKnxFuHxpmxmPg7pPWPLE4J8n+Pu4X8WXNzydkZwzCmUYL2ECDztwT7PM6aQG/Qd1L2IP3gL1Im+EknNxXvypdRwPvqLNLMNFyc6P5uEIUS5qb5/48TE+7IyDgFzPpMgpZjmWT35flEcnG6934FDwe+UputhAd1aOxTPs75DOzM5Ad5P0u1XnBf2+XtbJfg6cNDTGcaHsLIXCobh9/1696UKv2NWbsPbLDnz0kvXWHFx96pLx0Y0H0AdlagnM1o68jzIkw/fJDJwAjTKCj0EJp42r7AVKvr5PE16zCZIqQeAnjVq0cNKaI8HST9WDHngssV9GG/dpA70JXvcJyzG9f8t85Yzqvtgrey8u7nO4Rzh8v8d/MoLLWQWjX8rt1Zzql3oaPxOtwFHesO/AL9FGYWWMDiO5godetaUS717PnofpsjLhhx5VedsbZMFTPfArV7NpfJGRfJwTl9SLlNl3eYkfOIPFUpZpxDPpRryarKD70Pvnb4XqTDuPzi0wt8ANaIFtHYsvPqQ9Fk983cXKxjNCmkg1lK1RHJc8G06aytaYKt7gtcXN2Qmoycu04wxMNLDBoUinIsLYxM3eCjsT6kzDsdBSKN4MJfx1DfoRwYCNp3MLbN5WR5qORdtEysBEZzoILg9l0j+dDs6NZyy0edszFmzi1uBoXQMrBk1BF44Jeyy8HEpydnH6ySAwLYNSXrx2FqdCJzKBecYi91ngVOA8hF3RhZM15L1e5LeDQta0cVqGwNFPBaSSdJQBn4FDBnsGGn2tqyC4vis6pYjsxOHbYFLIOiVAOZK8WUiCGt5mlBEkbSJgynU+siuvsdwR9cwEdIgPc8CVj8iF07CskfNleRBsYj6rmQCcCwbSvEaWAfhFwV/QgPq0nFK+T7HIU3jdIzfpZAaCtx7hWDCQxxlZFt+Lomcztx0AwTUM9FIlP7XXQBt6XkOLw/Ki7kn2OLDRm5mDdepP517dX6f0lB8YB6+OZTkU36bg4M1Oh6QLS6dwgE5roMqsBrK5rw7qHsVROSVZi1qLxe+Sr3dTjrM4B5K/TzgsUUIvHBmWMPmNTrIXsxaebZQsvqHBm6b44jfLsKBnNhGnh/0Y/PaX9Pvli+F8wRtbYBMcbfZ8HNWsxR3aB3LngcVyh5ZuHVtkWVQsbaQs+BfcOxEqDVC0tcKUYMBoYAtJRa2CRu22ZNyYAkHiHNMaI0lqTtyxQT3cbYleuVbCLYJESh1SjNLZFjI7Qft2Rba/vHSpXL50qVy6TKgN/YqTZuaChxoLqgsebJhWduS+2L9vfzl8+HA5eux4OcBrhTVT5fZLebFPBXOl
1rP1TDUxSKJGiXu68W847eq2YFI+pPRthKVSZmc4tenS2iXlINW4uphKShnmi4FBX5VNLYOmygEj8QO7AmoZjSz+qXtNm8a0IZNo4CQCACNwGY5UokE6fMHmjkUzzDwyt8DcAjeoBbZ3LLbdvE3D6+57U9FbI1xzsulkliIPcPpz6lR4HbFwRo6Fn9bFEih/eVtOBfst/NpZfSzPm7fVuQavnLFg83Z1KtTZMphhyUgsC6iDfJ4A0ykpdI+pxp9uR65N3WPBbAVORXzHYk0yeDMU+J7t0AAul0LxRigci7ZxW18qprPPt0HF7MZ4jwWdK7xwLGJmQmlgOnPA5I5Maes3GFEx8AgmPZZM3SAtEoQtOVSHkVtSvIwDoKuzoL6Oq5mktMq9D/r8Tk5SNEkCJKxDczY24hjdc80WrUTGiY5euPCbZAXC5muiETLQRQ6hxmsejDMAXtKo/KIchlfkULCBGgeDj90xA8FA/CVtmH5R36Dg9a2rKsmJfXpSr43St7KkR/WOusxqpCOi7zppcC+nQk/7WT60JL7UAzMVt2jZ0q16as/Xq1c12Lyg+9D7FoTPzAD8YMigEYeBpUYMzOHPngmWGuH0sMEah+OofhNYkP0NvLIWJ4WZEpzLE3KKT8mJeZ2edOMAsM+DGRVmPZ6/vOyZD2pmv+7xE974rQ/naSaBDdsHdOOCz4CY5VXoShku66ULF+VU4EDgRPG7Y88EMzjMNC7LduwHYbM4dMjjbsRpuVXLrm7TbMXtci5O6psYh7Sxm1kLkRoHXrH/hBmb/O1QaTp9GX4rlDmOOsCULj5qAM9MVxBChqPxBJQZ3WC1QhvtQLk5lrwqMoHvM9mBhyTsnVjR93qYmViSM7F0+WK5dPGCHAp9CPGi4nIqljRjwYMYlmLuUX3RngQPbZLHsdDrhI8cOVaO33STwqPl4AFtelU9xZ6L6lqoGPWXv1lHdAJadYwijxKVJmDJJ3+X/E4zXhEdGN4DJCC5jsA10f9usbpxVXeErS1whiA9I8nPWoJVr4sfNmT9k5lWaAI6/OSZzPJGGdF3SBk1X7POWA17hA05Ft95X96eFHienFtgboHvcAtck2NBg52NdtijtaoT+GZr0YBn0+nGvDbEhisPJyCdigzJi7XZkd87F14KxZM7DZBwLvzaWT2BXdagiRkMcHFK1HOoQ40ZiT04FOpE41sWOBbVEdCgJJcxZRkzzO9YtLdC2alIxyKWQoGbsxZ00iyFYolB71ikQ+E3R9GRe6ZCoeJtxgKHQgMBL10g5K/aHHMRb0eFk+47WXrNtLPzBgLRR8L5upA0T9mIwzWUSJZtcFyw5as9OrVDKk8zs5PWQEOJUXcvfZo0dKv0Wf6grfqSjc7Yo/EcKwqvViyyGvPQAfrknZRxv1ZE5RPjuy2ow+Ac55iNxyzX4QUlzFZcrI6FP24nRwKngJkKZhNwKp65cKlckGPBE+UDLOk5tL/cpqfuJxWnkPDkDU28Ecn7MJRmcP+KZj/OaKaD18KiAF+9vl0fxXudBtcsI8LxvazfT+yF0CyJ6oq3SOE4ewO3Btfsz6CM5OE08D0KnCFeB3uTZgFYiqQfoPdRvCw5LzDo96C+1NkKHIs95ZhOliZRZ96noQFtfDNDT8n1ezqqJTan5Fzg8IDLTAibsHESWDqFY8HeEL3dUUXBGeNDgPoNq9xUILMr5EuCSiD7opMG1CyfYraDOYhj0vmUbHYzjhXf50AnyaH+Reon75Sbb18wc4Jjg2OO7dp9pvS4zmtdCycPcOHHEe1UxC2oRoH391NFt30qimW2O9r8A2ssHxmUL2Qhm/sh9k7Ibmrbrmim6bKcCJyJy5cu6Js6l8qyHAxmLJitWNJX1Fc0C6RGxO0KDzUoQOgYztte1c+BQ0fKsVM3l2MnTpYjR49pSdReOxduh4SPdlxNRzyNUHVTMBxZYEESzxbk0tkyCcBJvgkbhVV+s5e1GVINV7xD05CbZUTsSI9aOaiSqmZ+41UjXnZbdYb3Vnjb6j9lOkknLbzDrGkPW03YEc4di4nh5sm5BeYWuOEssK1jMeutUNkoRklnNcJu4pshaFDdZBLWxjtfz0k6nYkMAz9wgfHEMx2LcCqYsRgcC5wJL4WqsHVmExiISCizCTgOOTthB0Np1oLzxijg3g+hkAbfsxYUkH+FDNzWefsUX/deueyP5PHl7XjlbLxVwp0yMuQo2LnQjIU3cKtzN++6r2InAy05Fd5jQVgdi+ZMeACYDgbdG0fY1zbXxTDCGiddTWobA5/RFRvf3JQflUFKBnLFEB8f8GlHChCAuhnlNaSwVyajDpUSm8ZplixnBoazq6woQ6OUzOSs0DgB8DUzOz077OuKWocqK9SNTc9+RawG4+yjiNer8hamOO0IyLFgfwWzFksaSLMJG+eCb0+8oCfJL8ixWNd9ul+6npJDcZs+RHdSg3DuzyUP4sO5lgh95kZvhFKEvQ68Xvaclg5tiDdP9E/oif2detKMY8GGZpwbZh/46B37LFhmxT4JnuIzsD+he4zBOjrjfPDaWGZVoDulpTC3MAPAshndCODgGL0sPmfEk5kQ3ih1SvfxHcLh9bYM2tdlH15B+5wGtE9eOO+3X1FBhxcWtf9BH+RTufh2BpvIcSzYG8CsDfy5d9hQzr4S9mBc5LersvHRSxx0v+VJ5ectVlQDe6vOSydmf3DqmLVAJz6wd5D9Aorz+0YGB/OnOBQH5FEc2hPOBbz85ijuD8nPe0bRdvCrMQvjAO4ynQd/wTow93gVaz6ZBWaP50wuVT5RcEnGEZFgpztfdb/Gq7R5UKIlTytyGvg454pnTNUOqS1a1+zsmkL2VyyrTlfA1b2CfflWzi7NIMWDCqUFY98F7RTwPfsPlkNHT5Qjx06UgwcPadnUotslt30zFM/fu8tbNW6wlq4loShDwRQd2gtoeh6VdBQYJ+tilJOmlgD9p/wJyph/VnCnzxQ/+aBX6pb12HCTjwCJkyE4yaPh18hmnKGekyWqNXn1KdF3g2NxZvmC2ktauvkxt8DcAt8sCxzYtVcrCxa/KezoV7dq265FwJ/bsUDIoEA0nl1zOWqMs7EFy2dt0IHniTPB4ad2Gnywl2FVJ84FyuaMBQ5FLoeK/RYxYwF+fGeC183iWHDWQb+MZedCcC9dUhpnIGYd4skuZaHh52SAu7ahZRl1GdTKsl45q058XYMdlkKB5AFR5Z/OhR0LHAx15MxUWH51JsKxwOlQxamzh745F23mog6A1O2kPlZIdol05gsgQzaro7RBCXHSNMRcT86q+WO0QA7EocOj97uGI/VyPQrfdY0dk3YWm5rJM0gRGBObd1SG1WIFJ+MFoa+ZeY16BpPxNVXLMDbKMkMRT85xKFY0AGZ2gqf97FBY1aN7nAuWLbHXgcEvJ9+R4OvZfIma/QSnNeNwSc7FPpWLzde3an07T9sZFDN4Z78Dg+s9PG3WvcBeh5fFg6VCfqIv52SH8Pgo3Uk9qb/70EEthdKH7ORYsLH7op7q84YnZhvYHM6maJYi8aYmljDxPP+ScL6
hL20/5+VY2iMk2C1yTm7T9yvuki4MyBn44wCcVRn46B0fvOMjeyyFuktOCDyZCdGvy3rxgb2v6wn6S9KPJVxH9TFI9GLGIpY2UZ74LTHj4D0TKhubrnEQcC743S7pt8QbofgWhswm3eJ1uTgELGvizVrMXuBY6Jei30sMklnSyEwJA2iWVIm1y8ESr6OScWKBfSxsYI+3VO3QwA167lN4+H4VTRy6A8Ug7ip+JwklVAbMORLB6QqswSgrE0EVV+5PGBMIgsYm1WVo+9TOySbs56KtWdGSpysKcSZ4YQSzGMxubOhhx7rwwKU9pF0k35xtk7ALtnHb5gJJHrL0oGNx/6Fy8Mjxcvjo8bJv3wEtndJ+C+GETQblHeuMgcbg9AdlSEiUx9cexXHIkL/dAW+3GzOQqrUwnI+pHgC3og2K7a/QZjlGmF15k3+GkTWDSjo2fcVs0JU6oAADjWO+QKSlUPf+5ZH41zJBn5q6ZdjLA4a+9MmcOLGcJ06c6NGuO/5ff/z/KL/8yMeum25OMLfA3AJbW+Bv3fkT5e+94QNbI1xHzq233qq3qS5cB8UY9VU4FjQ2AxManziHhp38vqHKhpjQDZXIM56cDFfjlXCn1YnaoVADmI6FnQsNfOxYpHOhDpZ9Fl4KpcHVhjpg3rnP2uJc6pSbq73HAoeiO+NjeVEOGn06DDeqO9SgSgcci3Qu2oyF5IAXjkU4KJ6R0AAsZyyGfRaC2fmo+y3UucfrZjXUUeePY+H9FY6nPXt9UCgslfZOu7XKoB4akg3cUKyoUi5TVp7CMf6A3vABJX6XTXTEy7191Vd6QkLHWlU2JfVpOqRWHNLdrWSaLCj8x7lBzcDKNiA5kmBAXLbQuaFXoSzr4Qh9Qxp6MlBmoM3AnYHtkka8lzQAZukTA/811ZeyHMfJYOAMLidfyeap/wvMNuhc0rlb9y+vTWXG4WYN5vlQI0ulvqFOmgE6ewkOyxHdLzgD6+d1X/OkniVQvChgp072LJyUE/D6Q4fK6zTrwYfs2MPBLMMLcnZPCxfHgj0JR9Ug3CJn4Hu0D4HlQDgoLFt6+vzFchZHRba9XbMmd8pJ+V45OhiAmRb2hrAEi03ZF5Q+onv5FjkKrxeffRqs8+Sf2ZtL+r3xBic7K9Kf8h+S7rdogHpQ5bAzL1xmLHAuMDdOGs4FsyksX+Kr2vym2bB9XrywFW/SwtliqSLf1MAeClwG3oLFUjE+IHhe+umHo2VHqge5C5SZ5Wnod0x8WeL1ukU5V3Is2PPh73NUXcDxNzAYdOuGyAEgtwIOltTl5taFI3JpS/LIHPJafMgOtJoxBbffkjLg7HzxjpdU4BzIUeCV1ktyKDRbsXZFDzLUrjGDAQ560V4wI7uuNpC9F46TJt+HyiS7GLfiUx+4bDxwWdW5c/fesnffoXL42Ekvj9qrjd3eK+ZyhwxYUYwoSlzTVuRtdbTfuXhlPHGTXhISNDvE3taFWzPKFe3BkJ4Skp+407yEJw/yE5a4vUZpSfJm0UAb6tGWDZRegpsMs3634JFoA7msI773zR2LNM08nFtgboHrsMAN6Fhk40lDmiWNhpyGNc6ARz9A0xyI2YBnY02a2YiEQ0V8evopieAMPvJcVUeKY2FvSIMlz1jQGev0Bm46YQ2OmLVQT+tBfyx5ihmLdCaac4ETwJNPdbztiV0tIH1zfn07lkJ1m7clm+UpUi9kaDDDbAXLm9K5aPssNDjqnQqWI8RbouJVs+lQDBu3MZ2eqja7yo41jq040uaRGl/ptG35Vk81v5bLKRTvjqytIOkJw/nIuko89HHhk4dJQm5yZhAx2gNSceE16ozRJUWabyAmKEWkE+DRmDKRlrIaPQDzqzaAuC+rbYBmHCGBOGMy7kmeijNwZgMxm5lxKJiNYKZiWWk7FaJjMMtTcp/mFWIY9D6vwfFpDZLP6z7l2w9MTB6QXJYA7ZczwEwFsxxnlf+MZjL4dgM6npQjcFD5OBbPafDPvoTLuq/5IOMO8eSVryfklNxx8KCck0V/44FlWLxuFnxmGXBs9mpkfETLl45rgM+XtEmz5OhxORZP6cSxwOFmOdZdOBY6eSsVjhTLr3BQzmi51FnxOy4+bDC/E8dC+rPHgpkElifC02+Y4um6bMNSLU54s9SRj1LFLAz1xJKyWDrGK075TbKnYkOzgcxGMnPxiviwn4S3Zy3L0NSO5ckmyD6ofRPUER8QZIM8S3twLnZoH5Vfy6s87quTwmffyG1yLo5qyuOg9OAtVXYoLDs2i+PgMAPDITLP7Cg7HIt6bwBHj4SDTZoz7yvHQax3FfdTu/ucOeBmG5c3rveRyYHDefCrrDU7sVb3crGvYl0OhT96JxwVz06FHSqlaeNydheZIHgvkKLcQznIxSa0b1aRmjAftVdyAg8cOlr2HzpW9h08rPZL7ZKQrDvIjhNkIRCCGEuLxBbXKU5LV55RmODb8294E77AZ+FN4RMyJ3ues3i4LroiEo0ScpU1Wvmx72YJQz6EQqg42cqYovLodQEOTooGBfL77v3O37z9dz7+C+VfPvpHNs38MrfA3ALfHAv8zTt/vPzde372m8LsWzRj0Zo/+hsdmY4y0LjWttOAaEADh3jfHjstGIMEMJwHTj3T6SBNx5kfx8sZi5z6b0uhqmNBeklLBtggCy5P8hjk4DjwUTwvU3I8ZivsXOAMMOjg6WU9s2hu6NULr+kpKUsROOMDeVqWIFkbGnyhOw4By5qaY2EHg3TdYyFHouUpHns6csYinAvsN3UswqbRsWXntVUYtRBXbOrBAcm+msYVFMjdNVB7gqi15DWqR6ExQAkMxStvqAMWdX4tjsWggiht9ODRa5I4di6MNuQ2hwMk3TNWoOlT0+SZZHBGlON7TmNxz1Cw78DLnDRwvay6Zc8EzkXORPBUHseC5U+82Ymn5HzrgQEzb2LykiM7FgzMtSxFeYd1bzFTcVj3HfcYdmLBCjMCvG6VpVIsP+LJOR+A46k/5XlJMxA8uUc+Mxa7FDJoP6TZg5v1dPm4wkUNrJnpYDM2syP8blhedFjyeHvUPuUzK8CsAR+4+7r2eTzLG4QUZ7/ELd4EzkZwzaAIH/oL+i09L8cDR+V58WSD982SdYccC5wi+LHUCFycCWZVluWA8DvFiQKOc6bn6B7IM5hnsMovHdthL56ms0RxkbJqgMzvm70UF1XmM7xtSsu1zi7rtyxbMpO3T7MvR2QbPqbHXhDqgU3x4qw6leXEi43pzAAxa3FMvz/ehHWL8A8JZa9OllRJDcvFydinfJwW9FC2dcR5aHeHkJV02nS64FDxjQ5e0etyKZ+DfHDjxqu3n9MtI4DSr20SJq5y2GngbXYssWTJE46F0jiTzF6w9AknjtkImc9l5R7CacjfI/blIB84deBT8dQGGpdNqLE8NNqnRe232Gfn4qheOsHrZ7mLdVSepqtxwHkg+2pH4mQIPvx8JH0np8cLpOFKXqMVOHGxSrZPA/Y4lrg9PRgJH2NHXU5LN1W7p2l8s0wiDvqOi4qd2QNtrdMKQAY43w2OBX0q5/
yYW2BugW+eBTSadF/2zeDY2rVXyey6l0LR9KZQwmygic9qgMnPJtbx2sISTz4Jp0NscXWoxD1bQQergUzbvK2ON94KVTdyqxPOGQtmL3gKuq4RHwOAdCD88brqWADj+xaEXirFAKOe1on+T6NiaeDBgB0LPUHklbMrGvysadDFjAWdGk9gzUODleZAKG4HwjCWhtTZDJwNnBydO3lCKPne/M0AyfartpUuaRvqNfOm8VFdNCuDRU4eig2JBDrMenFxOyTs7kMZDHo5gCTcHXqtbFBjcKNIJatB44j+SWtm9dKXMYlDNBy2ULrKJTflgJoqO9KTJhIw2Tn5U7drukdYv+89FAqXlLZDkTMVguFwMIvB7ASDVwa8fMCO5Tg4F+jJa1v3ijebttlEzZuavCdCT86PMyBXfUMPnE3dOA1824JZDWZC0J9vRrBkhwSzD+Aib4fC3QoZHDPbwVKjA8LFeWCgDh7fumDAz1uTcAYoJAN8lkqhIV8Df56vbcuRwQy3yqm4RcuqjmnAjly/olZw3uD0jF4j+7RmUp7WEieWQvHxvNu0t+OAynJQaTaF8zvFbujOFzI4ZVnZA0eMZWHxNF3iXUd8V2NFHhN7UKgDZgsO6v5nxoKBM44Fs48X9OT+tOQza8EHAq9s6JW4kn9C36zAwdkn83Af8bYoHBUPK8UPZy32u8i5UppX4x7XuV+/Tb4nzjN8rry0ge9u4FgwwPbvShbZpd86TrCG8LYbuL5dFO5UBpvW92nGhPIf0NIqNoibXlyjxsCv7ZyI6y0qag5xks4cOFEe/CtcxYFQm2KnYkXfoGCmwg9GWOIU7Z2XOYkUcr90V4z9WxPAuksv9plwxOxHdUKUP7SlypON4zcrJ1hl8BvydB8t6E1R+w4e0XlUm7j3eSa1/01axrgwtj/yqIceF9j0QCcftgkWrQcF0sH1ajyMN5HV2hKxpA3Oo8E7vpR9loxaWybdikfyva4wiiaSFsk7o7JB38zLMO4ZzPLdsBTquuw5R55bYG6BG84C2zoW+YG86FvqoNddaG2KlTE067PLno29m1C1nKQzrhY/iASLp2zqGBOnhtnJpmOxqgEIDsYyzoXi+cpZZixyn4UdC3UozFiw3IEZC78KVoMNBv8MbK7mWKAlf37drN5g4U7fA4GYseBJMt0aHT3OQXMqNLDzkigGLzpxJHAi2jIppXPWIjZyK58nhdiCf4cMV2rcMeKcncMBQEc+sWRgBMi2TRqHgUe0Wj4QOxww+pqkDuLw0K3h93VZxVte0ItCZIkDfd+hE+/zpvnkWX94RKauVY8aAOYYShTpADC0axTOcJr7qINTNMrHwJhXvC7LiWCpE04CA18cC5wHBuUMXn0/igN0PBlnD8WLuteYcYCG/QB8RI63IMHzkhB5cs6biU5q8HaY+0D34GXReQZE+fCGlg/RESqpGQicWy0ZEi0zETh07I9Y0Mn+gqzbXEaEE4JjsWR+6/qIXQy+cQR4GsjMBK9qRV/e4vSCZgJWBOOtSncfOaSN2/v9NetwKuL3x3IoXo/7AjMHGtzHIF0zAAzqdV/zPQye2GNR6ktqablT7G0gscyLDlQm7HpBNmI2xh/4033LwJjB/4bKyWwFXwRfVHkZfP5/7L0JsGbJVeeXr6revu+v1u7qltRCCwLtAonQAi2QkBCIZQYcNrYIGzThMTgARzhmmBmH7QiHGWbsYDxjeVBgQbCYxdKAhBZraxAgIYEktLTUe22v6u37vvj3O3nz++579bq6uofRRLferbrv3i9v3syTy808/zxLqlblN7shoOIdN+ATgPmOtidDxFXFqZO8zD0MvUnLeiOIOtNAPQMsDeFdUOgFAPieQEggYN06Fmjfcgp67NmCPQ8lIQIQ844TWkzXb0s1MSU1ggq9XblRX492G4RhwhF5BWvvB+hBmvV+T7IRZjljLHPhg/Jtb64GmNgLMJElonqC0gbCM2wmSlrQG0eVhbS5oOFiiGNCzsI2gZG3XQyAduun5CuD7YMY/yiHu3C3d3Slju7e1Nndj80FKnHUdZ32+n0m4Mn9tYcELZJzG68G2cYzsj9qR+NnSdBnxKuPW/XxpdBeDyvERD7UTxyFsCqDA/FzjPhbDy9p1x6TdNXupGO5PSJp/jTfjZx5kuulQQI3Fsu+/c1gvB2Vc/znuAaOa+AZWwO3BhbVBnkOgHkwdagsp7fe55C4OfSnOaDmB/6OIddrGYh5FOH8lolzBbP89noTsID5KMBiixXOcD0rwDgCWMicyfgJLJqb5AkyAAIwSbeSWEip9OypB14HFiG5UFVBYCHzkVWpAiQwYTtxlzPC+O0u3NpVZKCBlIJ7n+k1SpCRJRcV80CdxmTptarfyIg//i5h5T7i+sxIuTm8axzluQFV7ednVdq11sz5RrxqMqT8JT9u4j3bpBxFDamRbW5cM8qH9JbIh66NdKtw0zULk4/Xy4slrdr75VEJKvFJoZF19CUeZLsJV8VzurmPqfZUAQu4L/dSUGohQy/zv869dhCut5q2qVoSmV13p9bo2o3mTMOV9wawIK6M/i4E9sLMuqN02fVahtfVfAupJEJJg0y/7mFlcJV4lDqxJ2TjZZhwmUd+KzkpwERdesN3eCAQkbpsW9AWDLvt7C7X0qi9h7YQSgFaSWMQmi729WB/0YnbVkGtDKj9PDOzAgKNvTUKN1+Z8FAbkjmHdqmx3b13JV/bEdWESIIyAfgtp+CAOpoOkIKxeDDBxoOFJz0BijuHq7ZlHqpgKf0QkOiFbZ78Q7rBUwGb8ZTG9ECDYIT/TdBHvuYtIJyjrAsw5zRfSBbGUOEKqQzvBu3kHZLJkK/kviHzLWuu1EJj+WXGFD1y2Rdsf43PB2hjdxfXq1cf3qYEGP1c9TjVDvoTOEpT/oJzX7EvW7YYyyyXUhalFBpnq/qElEKXsbqO3SZPx7ItvHsJTneJzyWnSZ1JM2TmgnIx3QAVhNtnLL/52IbxffJbwKGcIiRC1E1ILYjjuNeuKh1100EfCHUovER1dGNngWcv0y1H6Y/l95O92g+lrRzS/XhHI1pVb7nAzdj5OX9LxEOJ+X1G2atXyshzYMwrxNg45Si3Jl2el2e1a3l2VJ0cfnbwt3VQiC7XWsLRmvm3pPzAN4GNRb30x/fHNXBcA8+8GniSwKI5kR2uisMDbnMwbcZ0WG2ENwbb/NxwJ78CLDKzcxBYNIy3YUJC9QkmIsAFk3YGFllFyngaN8oEucIrsGiHMXDzujiZXJ1gPYsqVPGKEuVwsuafE3MAi93sNz7cQKIiEsaWME9OXsZXMhISCdIvkokCIgQVcQomBBkyCuZreAAMacjgQn3xfOR0S52WOvP3TWdtYqrfVgnl+NV8FumUiTRor2LxPIKr9Et+Xkv0QotvGB5t6Q8ixLNGHrJj+bB+PEoaObSKX37UriVfgwK0lDTJreRfUvRap8P4QRM3UlAkEtpN6NlpG25Tqxh/xzMZR17IHqAMA0xwuq+ETK3AQiNk8wmmkXu9Jk3DcKsqJDOstyAZ9X4YMpnvDdJfJcz1b
pngURhigYWrz4IQ6VNCIJDQ65JpuWeEEhKlA+45IWDo4ZSZVtrhirslk9HV25TMs/tleMgEytTGO8RTdUmQo8E2OjcBROb4RpQAuKt2F3momqVdRR/9T8md+1Io4bBFlYzEN2gZOOMbJB/rS+mNDLtl2JNW3u1FIqBbW1fvfaKURObcze8mUb26wvno6now56of9WMwHBJEaLYeZNC1AQnpDe8E8IIWjchtM6ol1wl1GXtqoLqV98jA7W3QA9VE8ms1z0VsM6b1UgXt7ZRvDCP3QZjobu7JhljGzGDJb9Zqsj/4RNa9GJFPA8IEZYI/+90wdAsSBRbdqERF2wDKdGurapR2G0o2sk0J7UKKps1/6KMcjEfaZbkvxQ6b23nusVixy7iiBHYDAOb4tUV/iDqHJsvlmOT45HhBZ4dCaOVqyiHl5HmWWNBW1JfS1bDHIIp0m1bYsFiXloV/jnmCii76QICLru7UgbRClSgNuiNdcvAo35w55iPfWa5cuhz6eH+D4qCXGBVNxs3jSlVBNkwuWcSJSsuRgt4S32scOfPyK9Lyh2Ur9JpXOaLOyo9bXJtjT+3lBgWZZjI4MoXybiP/Klb+DQW2XdRDPe2SVElT+o933i61cnw9roHjGnj61sCtgcX9X08PPnopSucgmQdKB8I8GDpMNobFxxl0fbkMvMZv3JcJJ0eI8DypwuTwrNyHxIIJs0guVJnYZqLehFESVMSO23hPyXtZZGARPt+ZTAUWoQ7FZNoGA5R1iwEX/Pbes2G4Df0yaq5sOoHknbcrYMHkn1WhdAWpdyi9TxWJhSuoMBamxRmuZisQ0QgrIEI3s+Stq9kMMLjyOwMLn2XgVuo5X3OdRV1DV72+Y9WyEUJFBuk5RqnnaLdGnEMTWyO9kmpOwybJk7OsWPWMNin05Md5yi7Pc8plAiVGpM27jXaOEvhqptNrIYe4zaNKiffMvWSfY9gH83sRK+KQDGFhNAy/vQPHGXr+tL/MOOYFARhk5uOk/2gzkdn8zCyHahLxBRTq73OJo7hLNUtb270plABM0/4yjHpqGgNUaAytE4AV8ligr25CnIBiAkZUw23zkzmXePdwEOBoYzFJWlP0I/ee0GN0N8/6AJ49XmHWZaZ1K9sBM+uh6pSg5gZ2CAIMV/4FH67+y9iGBIG+p42BKlQCBvNZoUxLVIQARAmDq/+WyXLLQAuYrE/zC/sB6PS35bcVtJlwEz735xB8qc4kYNeAXGDhTtv9MNgy0zaq3+jl1bX0yMpqemB5mTBcx5LnKAytIMxyKuWQXo8AWtAhqJNm1cyUnKywgm9jSFMfAOEckpZBvGcJSuYpf9h4EN+NAEt5F5HOCFR0CRze2HhXZtk0LH/socEzDd21uRBYmKcMuFKeRdrWncWvh2E9jDoV4V4eA5zWddhb8J4SJSU+RS2qg6KoJma5sHIh7ww2WpDi7COZ2EVSsb/NCaDYA1Do9ckds9cAX+5zovqTjRLjAnT6rdUXHLJ7WeiBRvtTfJnEy96cskF9SERoSyUeRUUqACJl8zDMMa8Db2Fd7DnSBBaoQ+kZKgy4c945fm5/7+0PHtHE3pQAw6C1HGXcafw2YhW3Ea/2biTYfL0xXNSCSlL56oPqfWuh3PuwkX6O2fjbzC7f+TfSL6/z3cYRD6r7CGhSke/4y/9SxnK9Od+cRnNshNKSR84p8o+wSLiKyf03wwZ5VRUcX45r4LgGnqE1cGtg8dWvpQceuxxFd+iLATQmkTwQxjhcVczNg2uzxhxAy3DduK8PtD7nLGDCa/yWQWBSdKLMbhfVP86qUEVC4a6zgopNVAq0u8geVZiAYVIKsJB5VxWqYbgIc5VBBgwhk3MBFMGoUz7LIrPKOl/8C7sOmIJQY4Dp2AXQ7MD8WKhQsSCNAAhM2iGV4CqoaJyqPIVUAvBA3uFuFpoyE5F/x8okDFDQQHoQkSc/qrHUnTXqPGTdxHPiCJxKRH7aSvHMuOWwPL5309EIrz0tt+bB0Xiz/DYTjmaLlheqMF6LNwkuT6pX4z3Dcsr5WlTTc+Ty1L4Q0SNuob9czRteMPcXoskWh8SBfhIqTVxD6sB1m4YsHp82eUkGTmbLnGS3lEyEByaYZnexVrLg+rtMooyodg7G06Wp3p5kerVtaCe8l/bSJWsHbaqB8iJxlCq4Yq704AzMsB6KLL/Mq4dxrRnd2LpfxST9dx5mUGZUaUM//UTDb5lvV8O1MdAN7SkIkamXYb8MMzrFKrcAQlCha9nMOuoFKYMSGWkzFmTJaHsW6YaG3jLx2mDo8tb9I6zbQfTu+2GgVT9SBcjyGy6wmKTfuxneMgClHTq7KLdG6QIg3dsOI03QLayMtQbP15RWACq+urgUXrbMu5/0h1gVjz07iKukw+/PmlH9SvuGbfLSLkM3vEuUUS9vAoleQMkEe24IUGxD1bukq4tnFzAu1/ZBew89VWU7Ge1dsmqawMWyaX/ijuNKhgIAQKv2GqrCCVLMfwGGX4nFNFelJo4hY7Sj0gpBicpbSibEekqGArDQNqpo+dvTvpOf0Y9Q7ToBmGjBOBsYQ0fIHp/W2LBwBeC1iqcuJRX2CceALEXNYOgk7VqkmC5kxOZ5XB17AlwInmiL2NuCMkYcyuF4aXp+QrnX5UUBx5ZW0uygDjqpz07ARacSiwOqUJW9lylQjvKdm5aH300cJYAfxvOIcSnuDv4p4SVeEGaUkli5Gka6Jl0PMrhx+KCWd9BX/W6k34icb5rRm3c+8buMo3HjLwNvzj2Pqz7lefXeTeXKicXf/Kekw1sH8qiiGFZFKePssSpUrfqOb49r4LgGnpY1cEtg8fkAFpcY+/K/GAWZRMoAXoZNS25YGTzL81IjhjsWl2s9PO59Xp2K7ANgMOEbliUVAgsmzAAZMH8Aiaw6AKCAAdmA0fdUPapMru6M3QJTKePvhB1Agkm4gIt2GIU2JtkCLKS5gIQoC4WTaqHFVqwwZmCxC4OVwYvAItYNMzCR6fAkj3J10g9vVBEmqFAqUU5BRDkzuMk7cdfARamo6mp9mKf14yGd5pWlLLZOswzVKzkeDEU5StvU246EyuOo8/hRzXklvvk2jmb0yLPxhDjex2/i1KLl8PLAeuXeJPMbOeWgkpeMlvuA/cIyNQGcaWqkmqUAAAoScdU5GGjAgYAiVJ8IU2NIhts4Mvsyj4IK69HV2w2eKwVQHUlm1PckSmZRhlWvQh5r9DsZWfdPUAoiY60kQqlEN1IoVaY2oGmeFfZ5+yB5y8ieRxVHplTQUNpMyYIMnsyuG9pdIV13zVYNSVepsSpOejLTsX8DfVSphZvKSftVjLAnYUrneE8JgwyvDHqUG/JlmGX29Rol9Zbf9wJMka7PtQ3QhkKw8ChuaFUj0sZDiYKqQxoqu6LfxdXvQ/exj6yskPdamoPZHyae5yAgwbJZVwISd+dWuqIkYIEV/8u889W5+TS1ukEeFUCAqR3gHMQrVTEKl1CBhKpU27SFbemGhKo2ueO5oKSLd/QQZb9RKjFP+vYXXdE+mz05xliFt1ymoZqa9itK
l5RsCFQuoPpzJ8DkIvuACA4EP3Z7gaQgS0NzadDQXe9Z/rb+9JrlJoGCOOnI/dI6FaCqXoTKEvWrO+AOAQblbyOsm5d7aNQ+Ntns3N/BvmWbb5XRhIWRLehfWlpKywALJRYCgzJWmJ79I75r6lMi7a8ulDi2OWDrEc+r0jnjOgoZJ+LF1bHTrmxqHNASbrXpr4557dSlEotO2qBTl7NdfakdI+6wsSCO40jjsJLiMEFuaj+rB5CYA83/KR8mUXv9cFolj5K+vw/HKc8OhDfoN30yKPRXP+OdQ3Q3yl+jKcap2ruPR2s9u1idMp+IXCtczrSiJydakj423o7KOf5zXAPHNfA0roEnBBYPPpaBRYzI1ajpoN4YfKvCHx7463XiQO+w8zRmpwAAQABJREFUWq4xwPu7ihQTgXE4m8Ai/xZQhD93J1aYgJBYMLEWYKGhaAYVqBbI1LmaZ1xOV8PD/oHJ15U67Sv0CiNj2MZqcHZBWyQWMGKWqzqdgFSHCr38SnVBX/O7MB27MB9O8tJrPagKlSURXJmYlU5kRiFLMFRX8Gxx0gZMZACR42ZwIROYmYBcJ2QedDQBQdRN1JHMb1RmNEnkY9kok7TLCNbVKazisuoZ98SJq39ILy5VWPyIgEgaiiQjx6+CueS2LKF1uqMdjehRIpCFzKAMWfzj3mwNk9nNDHdm8gRI/lNNSW9NMpjq0quS4x4C6ufLmPuOq9Z6ApKhzleNrgEVvCfTzmuZBG68j3eq/KhB2lX3p9gFkIcG1KYjkxn0kJ+06OLV1V9XsmVkBSiq0XXQj1QF0i5CAOCK9woM7Aor3Xo48t1BQMV5GNkzMG+9MJtWh/UjMyo90rgOIXP0VyUmtn4PaSotoIjQQaQot3r9AFTykcEXUCyqmgetAhHVoawnmWjBkavpSktUeZLpNw37RGxwR/rZxoG9MkhHYHEZsGD8YZj2cZhzy2MaQWRc8t4bggrtFzSOvgBzfg6vUq7+20ONLxCxjPYX6bfOpnjnAYDFpdn5NA8gspQ9vNePu9tR0tBGQ+NxjdKVFvl9+67p2TfcoNDdtk3Z79b6ttyqSQk8lCYJaAQMo7rEpcybSjx4bj0pNREI2TYTAItz5HknGwKqxqRKUxvEm7dSDg3GbQtVw5ao31bKMkB6Y9SLQM32EYAIPgOkWVZOaVY10zEqXAIT2EMPHgZUjCNCGWMH8G4aotUlCtSisgrURlpmF3TVOO3fbeRhGxEhVC0Vx2nw3kK7cBNN0fi2iO8YGJJagJdqZ3m8M317Ngf1V1Sh7EaxR47tKuhDFU1QYTtoZ9ElsOgpxtu0p2NUlCxSivbMd/41NY8aU0956/EbdOaIT/lvSac+/phYCT+ccInnc//VaToct/yO0hD/8Y5Ik8d5NGnGsi+YQz64byThOFbCeVrCCTpMt78jJn+8lqjH7maraj2+HNfAcQ08bWvglsAi3M0CLGLUc8CsBs3MJsWw2Cj4gQG1EZpvyqCamcjmIFsG03juhMAZwIIJ0rg+VwXKsAwwBBasaAIgYh8LGAHdzW6EGlSWWKgmYKEKsAgGUYYKBkRpRfPMUozDjHiUw6JR1mJnsYP6wnalCrUDc9UAFnB/TmAy7ieYtGNShiHIXp9IH2YoSzDytYXfYWNBfN+kiHGVBvOVGQhGhTJoQxLlqJiFUjfZpWSejGQ+2mDuVGno5vS+DQAjyPAMyQlll1urt0/kZdNUdextHFX7ei8oKy1sfNvCs/wtz4wVzw6nVcV09d5Tpl2mMH7zgkx6NpyW3czMpKDBvGSq1fuXiTOewELvO6qiyBBadzLm2g0scGrs62rzhvXGe6oluWouGOngXosC+4F0mja7kCCtkFEENEBTbHpHejJ59jopcqXfN7ZhbtW9XyN9mW7dFNOI8VyWV9o2eLYGqNhgZd7+ab0P9nWnC6ykn4N5U6pgWpbfDeW0bVCtyXw1+LaMMtPabLhbNZEDJCllMY9w80re0rcOAxsAh7im63tKU67DROvNSEUr7RG0+7C+lFD4vtKH2MnaMvgOTO0UYEnXsndA43lOd//OjHL20CRQs37cUXwGKckCcVd472x3D3thdKcRQJNpKylwRd9D2nQ5G16aeOfq4mK6DLCYQYVrnbrsBFQMwuBP9PVCj/t7YJTO92x8vULJ0Nt2flMallPkUEcKlT/uAyBCkw2pu1f39BgEEIXERDsP3tC1r0DwCsz7FPWyDLgY5Ps4AwC5s78vjXRkCYSqU4IUVcKmOd1oUOmRIGdQUIE0ZEKmn49hnWcL2EQIHM1e6YW7i/tdOgZp+9BGWZRUjFGIs+0nAZWoqbEHh5v07dPu4f2J+lNisQF9FIHFBtqFPHgN+wvGM/JQ9VKgogctPgiuuSKsizIOZGDBrufQte2YR/qOeXYexxEQW/QjfnELWKBOHXu0Netik8Ve2k9g0Y2koqu3L1zOarxt3/Wd+tEcO6Qgt0njhqj1+NL3d3HkdCzLzakdlUeMUVXefsN1mm5OoRlyVFolrFnuZvxa6euBMZYf9SzoiDI4v+VX8lUaOQ6V7xhYHKjW4x/HNXBcA0/DGrg1sPgaxtvYWMhkNkZ4Rvr8z9LmiTXfNeOUgTnCnRjLgM/VpOIso6y/DfdkxpZxKFILw/Lus0yycFXZw0lWhcrAIksu6jYWhpfduaU7gAWTpV6hZLozsBBUFH1mJAxMwq7yO5E0JhPpliEgEY0iG8AC5moHxkDJSClXAAsnZCbvYOJJz8lc5iMmau5deVRi4eRuXjKZecM/dLBbpCUDAZmDVVZ6FxeXQwdbyUzTP731BFdDuaTTHb87ZRJYhR3o7089vTAJ/O6EQeyAWWlv78hMi4wa8Ut7NK7U78GDONaBGVSPGvMeN9V6aMyFxiqRZJDiNDFfrV6yLWUWXdGX2ZPpVx1JQJH3csjggVoLpl3G3faSPQopBHVk3BOkI7BwNV/vQlmVKAWocJfoSYyZZ2FiN7hXR1/pkRu69XEO+Q6nevXWQZYUAEqgI0spILii1zLJwKrxbjxlC0pG9LjTQr0r6ZJpW+D3imUgjrS7L8EWwGJvZYPC4+EJhr+/vyddwNj4LIy03oOsC8u+ChOtmo5MewATaIsdnQEAMtWSohKOBteq4yhR6SDPNvqMtNksJ6kL1ZVk5l09n0QacIX+MgcNsve9MPx9MNuqY7mKb70pqfB9wZv3rszLUKs29Fz2tbgDpnuI+IIkPTut07+1KYkTOpa1xQBUr3IO0sf6K8nDCMz3GIz6BBID7TLcE2SedhDkLME8L6+vpSnsLBbwDrVEWBvvDcLUngFc2NZKa66Tps/sHyeoVw2fAxRTj0pCTNdvWWBl/xR4dHMVZAqs/P7sO9aH9iN+IwKGxxaW0nXAxSK0d/ItjPOd3DHYly4C+DTGtl8t03YCsxus/l+l/qwXJUCqsI1SNr1oCRzmsYWYwWZEqZRk2Bfc20PQ4Rh1kve6sacYokNc7GkDVHakM5ztLDiE+hPvbXD6zSsFI+sYD/QOp12FHsYy+CB/7Ds8BXlxUEa9Pflb4Gr5tq0
vaNeRhDYaghHtUSJd2jnGIa5FraqMPS6wFGDRTVv0ACq6e90krx/Q0ZHHK/I7fOSgHA6lUQcRh3LYjuUoY6LXMuY0nvmW5T4Uv/67xPVa0vLeOJ4lrFx9duAg/fw9Gz8/KXHL1dB6nlEa3+MwTvldj1PSKnEaJY4HVb04Nlc1Yzr5/Ux3M9w8TEUa8jWeGUjAMbAodXJ8Pa6B4xp4utbAbQGLMhnEOMjg5798NAf6oyqgDMwxWFcDrfcx/Fajq2OsYY62BVhEHCZRB/gD0gomTlfmJDr2r2BCDYkFzFrZHM/fAS6YgKVSP/z6z1e3OECFakNM5OG1BSYkM/QwKMRxEm6M9rwsbXswDuEW0o2sUIXSK5SMpHYWuoT0kLGRyY/3TcOTR9K+Dc2b0OkpYxBlJW13Bs/SF9hoym6dCnCMIwOiUWd4i6E8vhM1Xl3jN8yWIKWVVca88ohqCavA3awmyzD0ATKGBgfT4MAg4ELj8QygpLW0S6Qp/RajKq/lOXxIW4CFKp7Py7vey9zqzlXDae/L9Kp0QiZNhnptm3LRbqqTyNArkZB5lnGPVXXo0vBY2lyhDSlCpE2dkY5qKz5392nrWeZ7kTSvsBJ+dWklzS4th1GrTHd3dyfMKyfMYT/tojGtzI/kuxotY6+HJw2arUuNlGVMI33qycMy7MAoWAZfNA0lRNplXMeWYoF0BAeaHu/D5HmeoE+odtJBH+vr6UxnYdYnVDuBBlVuQocfZlvbAevAcgwAAPpgYLVTsB8IVmS2sxcm+jL3SgWy1CKDVYGI6lG+r1THVfmpBZhewJWHqkbD5H2WOnCTOIGV7WJcAZN9XFUhVYaUFJwj/hlUt7R5kA49LEnLEu2l9ynVpubIYwHAu4BdwD7OCLq0s+jrS+e7sx2JXqEwTQ61soUtpIrQ7UJAC228TZlDqsM34GaRgvw+QExpDw3iZfCVDgmdYvM56qz0WeOpEuU+IkE/AUpr3ENE0GTZ/HY0eNfNrxveudhwFVqnlFrQR8g49bFCf3qgLz0HIKXUQrsIwZXlu45UQ4DjJn2qkGkUz97gYduwBThaw15EuwjBzz75tPO+KnHWVSv9qHdvm00KTyD5QTVrAFWvblSzqJPYv4J6doFgB+maAMEuFQf5u+DguMcD6py+RJhxigQi3qcuHS/i3rFPiW2Mc947FvIdkb6LEjFSkEb5xv1SYzHDhQv6jGpXqkIFqFAdDWDR0zeQOrm2Ab7CyQQ0NY6qv/i7pOlYEOOYgZTF7lKeRZDv1I6IayRiVpd4r0Rpxs7joOGN9KtIjfStM9L3nUYcfpfnpt9Mjx83BeQEG+8anzhG84jwBv0lND8rfyXBw0vOq8SDrnj3AAURt/6nJF/SiUSqJI6BRb2mju+Pa+C4Bp6ONXBrYHF/llhYMAfBmBa4yWOgf28fWJTKceAtk0IZfl3NjfCK0fO+bPYU0gvCnTRdkSvAQsIFEU7YsadFgAtBhZOuDGtRsSnAQjWobGfhCqGnK5/hhQWGVSZGRs0JqtDVUIWCaXAzqwAWlcTCiVzaPHwve33K6gamIfMQ0gdWS5dhbBZhcNZh/EICwcMoYzAQ6kjLVFQTFWWXflUdBEiRB2FWujWf697f3JOvaleuQrYDMFQz6WTTK1Wj+pBgjA6PpPGxMVbPYR4AG3qD0SajgIvcgiXNPEnaOqQeR5msq585LBMS98aznFkiocEtDGW0XWbMA3AQQXCxwTMBRoAM2lE99mXKKeNeXHhmNSAYLcqW1ZEyUDEOeCGv7NNOygn05iRTPAmouMbK9DJXEVInzPEIzOMwTPUADLtqP7mcuRQyy2UlHjYuJBm6d5VxV1Woy74AgFBqIhhQYmI5BR+yPaoFXYMZlfmXUZc9PsH9SU/pJJ7SMKVGxRg6JAS8o3rOPMzrqswgcZVsnO6BCUXC0E/e1lN4n4LGGeKqdqNhsAy03qSAXfS1LOHopN2t6xXLg8RicXklbcIga7MwCKg4zeq8K/MaNSu1ECSZr+5nVW+6Tr+ch5m2DGMw3NooxA7XgjHosk2UmOh+1RX9OeLP0X/n6MsrMLmq8AhgzkP/CO9YPyukpSRoGYBAtQfj3kfDtVNHlsOV9mAdeSh80xBecKZxtTYT9helfBqraxOidMVeL6gwziz1IRjapi8NUb9ugDdOX+czDilMqGXxPhlRLtSbAEPT0DtD/bhxoV6Weoh/kbqxXgR8buJ3HSDvucjY4bvaYHRAqVKDZcDULpKKbc6t1ZUAvCf43lp5Xxp1M9tD3FEMLC70tgMqupGMAPa1q4HuPBY5fll+xrWqP5X7kNBSNn/HFwj9RInvPo95GUjsEqc4rwiQQtto59WUVvCbuszp0OepukiRP3mDQoCFQMiFCOrA8aCXxYc+wKHAogupRVt7JwbcefftxqggXSZmS1TXvMiQv4tYlIintedRliqQS47Jc/4ddUhnprb2NAfWAprpN2KTTy7vwWcHX+LXobTKO410IKtQ1nh26J2SZlUF/Gy8UR7FNRfdlx8nAZ9UjyKtEq1K7hhYHKjO4x/HNXBcA0/DGrg1sGDn7Qc03nbg5U/841oGxubgevslLwN3DL1ODLxqmKdMdLkv1wPAopJWCBqyylP2CrUJ0x/7WQTTAePhpEscx2pXqgUPSiyK7UETWGSAEeCCOAVYWBopEzaEKlQBFriMzBILVBpguKTRw1VqgUWAE1dMYf48lD4swdjMLSym6enZtMB1HQZNY8+YR3hdACVDEAxHVQ8ma9oBcaLepcejmprj5WiUYMJtGxlKmYJszM2qJOomw8PDaXx8PJ2ZmEgjQ0OhLqXveg3ZS1lN0ToyyQB4UfLIrMFI5F+5D3hvftID2TDDeU+GRVbxF1mpVsUpw60muJA5t81U/xIYytzK4Gs0az3LxMoUSsceXIst5+q3zL1rsDLhPlOyoTrPFhkobXCFex6Gbw5Vm22YSMvfC6N7egBQxQq8rkmVUMiYSq+toqqLm8ZJgy5B9frjJnRKLMIwuwYspFHJgvUijda/EoUZ3jV/GolN57BzIXXVmCIGv6XRtLqhx70eBACzMK4znGsABvuF4E7d+vMw/0oW1OlX/UvQMwV9U+5xwL2ep2T4rR/zdk1f8CI9MtuhckSfUhWLjhSgZoyV6LOk+6zeLiQmGI9TDzawbmU1vhYoPIY3qGnfg+EdAVgMwSgrBVClaRig0M29tiWCm2Xi6y1JFacMMpBA0CZ+TwIjpQ+qmoXkAZrcJND6dDO7MdzA6g63D0a7k/oWXNjfBaMCy1D3sj6hQ9sYv0XVlGTMlRJZ9wI82+0a9E6j9rbEKai4AGN8z1B/3jSP+nOhAHY++pk2MQI51cP08qSELO+Kju0EjPU4wMT9PCZ5rstd9wcxL93G9pP3rqACIDG/sJBasBNpoZ/t0SZ85Okk9dPG2Um8Hhp9EPB0Rx+AZbgv3THaDwika9DPN8mXaqJ3uGAhZfm7cUxrLpQoxfCL8XvO36D143PpCU9QlEW7MvuN44TP8phB3wawOhY6kOfx0zGUpKg7v608jvKb9g
gbC8rdJahAstmHhLO/rx9JZ3/qUhUKFbcDu29X6UBcHOW7l87IxFA/rNpR8j0cXoty020ZRxsPIv3GryiD6XqUa2RrQTlirKyeR8ChPzelX70T5TAuSVcja5SrFKkRVsUveftKKZ91XI4m2YY1w8tzr3VacomqpxX9x8CiXlvH98c1cFwDT8caeEJgoVcoDwfV5sBaHxLr4c0qqA+gJbRMADHkMik4ccYQ7LV25gkyh5VJOCZiJtSQWHhlIi1uZzdgArIqVF7lL56hzFeGVMY/dtxm5ddVYpkXDasDYMCQxNWJl9MJ2EN6wniUJbkdvLnsuFNuSCu0sQBYMNEXhiADiyz1EFyYnmlRYWmDiX8JpmR2FmCBh5w1VkBVX1D9QSbLNFR30XWkp6uSMg88hO5c7xAlRRHuI+8r2JHrL2ZGwUWm36tqDzIQfag4jI6MpLGxUaQX42lkZAjmuyc2x3L1WyY13osyV+ACuqQ9AAfXA0f1G1JhCvNmZjc29aYjowhDFKTbtpn5l1ZVS2QqccxqBtVKtcykTDusMlmYF0UOJnyV7Jetl2rmd6VeyYNed4wTalQwiqf8QT9oQfpxijpUj1yGb5hyD8j8wWjq6UcGfCPqtfIoRHz78jB9QaZXtSLpjt0ECNc1Jz8D/MiMC57U5Zf9M84GDwU54WZVIMlv29MV92ARieuu1qpWeSwBugQWeifSQDu8SpknaQzD0LvJnJ6jXPEXWMzAxC8T1xXvcZh2DZMFKwuU1b0ZzEM1niUY/eWN7G5ZoOEO0AIQXbmqgnUHakrDqOxYPnuMdgyqNk3x3mVsHgQK9jU9JhnPlXqlG8ZXjcdvQFU1HQloYO3u3XOocelWV8Am+OsXkECfnqXspUqX9LKlJMLSm84ZGXnijbQDvq0jngtatIMIYEGZbdNsVwOllM1vKOigXkxXo/crfDtT0D2HkbySiotI5V4wOpw9QpGPkiZtejx1bzvDuKAk4ipSIg3HlZDYZUbpG9brCHTfqJ5fQRrj9xDG49hKbKD2tLG4kNYBFrhwSif49k9Awwn6S94TB8kKBRwFRZwFwF0cH0oTA71poAsDeLzIxXhFfhluWgLHSa6UzW9C6WsGCFlaEWMh7R0LJtRLSCshNoMM+iBjToAI3ovxofosHQfzqWqm5cvjhNk4RnjE2MrV77wVdacuJE29eILqr4CF0s2unoEAFnqNyuNBlUGkcOgPjw4w3eSUD8vYzC8q2wc8bs4dVdTqIm1HHSV+0E6c8juoijrMb5X3fV7uvZb49Xvf8PfhsCqlfCl/qzIeJq+RD2THKFyjxVdNO54EDYZEQj7wR/U8bmPMy3f5rzGOgUW9Ro7vj2vguAaejjVwa2Bxv6pQlxwn43BQzQN2DO+NsJhmagO7D/IAG1Eaf+oDuvcxBHPNk2oe8CPcsMYkmZnvmDxhDuMqU8OEmvWMsyrUBszYFpILJ2SBhX7fTd8JMFa7K4ZAF7PZcBsVIpiXbGvRBANRPssS/2TiYALwQ68B9y4SC/ex2FZyQX4xwZNHMPIy6aSXgUWWXsigymyGzQQ62ivqaPO+utL7lCWXNRtkaosRq/oyDtJO+aO+YQYEO5bF/DQkduVSxkSViA2YPFd/BSWu+FZN5Uwe4EIVKY27hwaHABUAjPFR7C4GQg1CfWtXzVURU9JhPfleTJjWG4yazJatndsqSxF2+LFKflMwmZPrui3dCmDhqrMTqHFlqVwhFhDIsA7B+HWTnivZsRLLMyUZeT8Ar/mUKV0mnRnaUOmHiekzJxsfw9gTpgqUTKKqLF3wap0QKBMrEJRe90lwEzbVaoIZhiF2pX5d5ox+I4W9MLoTxNP4OKQBpKcOv1IB6111Dw3OVTWSSbYnWS6lKAIe95AQmIzzvnnLzBZja9NQrcuyk2zo8YcbWhLQKLgLhlYDc1WSMvMMICLuPHkFEKJdfVFD5LMaYpOXFOi1yDhrMMmnrIfo85VNgJIJ0hVM9cEcakPgu4YJgvTKpXRjGjBxlXqQoVY16pQ0hRoUdBHPsnMJIBVSHH6f5F2lPOZ/g/fdf0MQYPtYd25OF8DBb4yXZeqVXihpsM7cbVupi6c7VRtHw+k1wIpuafOpDQoAjDyWSHuP+usiPcuh1EYFuBsw/6o2zaLapJTkHKD5HoDymOWkLSy744bfgxIpgchlJBFKJOyzFCskPePQMQLN2t9oV6Fb2mtcXSBogWa4+LS1OJ92kITto2LGR4tROXtRQIPjhfR30ui6kj03gAvbkYF0enQwdUOrNjeCgDD6t/DUH6XlG3Zs4BujLIaFUwry8nt2HMhqToxp5k1dO76Yht+6AETph+PIJt+cYabh9+q35HihgXiMWTwr4xK5E0a/BbAQDRr0IteRusO2AmDBuCCo6OtXFWoAu5GucEf7hMDChKNceVwofwn0CUceA7jEvTXweIdlP+qIcbh6cDhOpEb+HjkLC8fvKi3jl/fr9xH/cfKLxHJq+ZbkpPuW0XPMQ38pOy8dyFdSq2Ia7iG5BseNV4KttWNgYWUcH8c1cFwDT+cauA1gcTkPkpTSwbqcFtoxsgzg1TDZqIvmABrDZ4SXsPxunnwMux1g4eSqCpEEew3dZe9hwtwkT4mFkosMLLBNIH6hzwk4VKEa4EIgkfciOApYxBxl+TgzsGB1UWChncUGEgcm+boqlHUQqlAyDzCzMumuuIZBN1fpkJ4d6JMBQE+C1WhTzzkIKFR52EUvPHbV5d74McGTJokSV2YEBgRmQ/uODRihFdRZFrDdWEY1ZAWpgQbUoXZEvGAmoCtLazDgZYW3B0ZicAiDbtWiABc9qEtpxKkhrvYXqkgFOKIMoTphuShLWWkl61BZWYGxVR3o8tomG6dtoK8Pk6q0gnxzuSgZTMw2ZXdjNw17x0l7UHBBeqqsFLhimdx7QKNubQz0trRIWtdhrFYBTXvk4/4FASygK6Q71IagYQimvh8GUeBiTco8W6095CXjb1vMwqyq8jNFXS1h4G2ETt4Zxu7kjLYFpCFDGrYKxFW6kdlho7L6Dl3BIFP2sP0gfe07BshjAnByJ2pMqlRprDwNzRoCL5KGHp8EHLGCT7rUaNgunCe+xskCFFWtlGyoRqSxenF3KmjsgyZpu8DqsuWzbHouUnVngTx2qRu/gTA0dgUe1adB2pCoxIdphr4R2lUphl+aIElwFdIK0hCgmKbelfR8pF2Fxs9hD0NDu9FcL4DTcL1aqQrl6r+Mul6xBGFh+xJ1wao9dN6NwbrqRfbRUJkCJKjC5G/b2037RlGdUu3KvqD3qfDoRt3JWAt83FvjOn15g/guAmgc7h4b1sci3908NOj69iSe1HQhe05bItJTqqNLXaVa2o1oP/J12vsy7S7dfpM9tLtSitPUq6pZynHcn+O6khD6MANR2lMSubaaWlgEOEF4C3kqOTqF5FL3tm3Q2cmmd3007oUhPEyND6eLp0eRDnYEWFjnHd3J+k3HJ047h2TQBQ2Y+mzDQJuQZiye0AZeY8EESZrAwrHFccwzxgbS0j2tzhzWodNvwMNxrYy/L
kTEaEWmOjtwTM2H44ajmBCDcZA+IbDoR41MiUUv9dfbPxh2FgIL99upA4v6mG16JT9uqvS9lLxqQbcIrsc6nH551siHgMeLc4CGRnlLCuUqbU1aj0yLx83Xm2Wxfku479VpKqnffPV9x1+uVR25UWGNgnie32u2n79952333psffQP+OqeWMpVrPVvDpMn+4ym49VTF9pv5mF3aSQ9MbqYX39UFX9Fs2W/mOjku+3EN1GvglsDi8/dX7marN/yEDg5AERJPS3gZjGJgred0xH0MWg5cDsKc5Z0ykMVqXDWoOQg6yYakwn0lYHwCZDD5ugrdABZMyoZr7Gia0hO6xTAW4XIW5kKmP6QVFRCQAZWhjgmV+L7D/5gu9Qq1AzMR6g1O+nqGYnDVM5R0ehT9ZW0rMlCpgIXgBUYrdtt2NTQmd9VqxAr8DstHyg/YiBVH8jHNYBJgPDKwgJuBcZWYmLL4I7OtYek6K63LSEIWWFWdx3h5bnEVrz0Yi8NIaZdgtTqpulIZQApaOljd7RZQwFyoEqEbTkFFB8yZO/LGXhgwcoKM8JgFowG6gAaYTtJapVqXWP69jj3FVZgcXb0ubWQmNVgXSG2BaWshLyxFo15V/+mHMRygDXQBq+1D9sQk8239usqcjYqVVixy774CuvaUSWshLNTLeH+X9GXGe1kFH+ts7hKtHYUMpZIPV9H1BCY9emKaXWaVG3esazKL0CLIGh0ZZMUb42ZW621r93WYpl1VRbIfqvok4NDYXJUtAZPM8S5Mgivr7p/guxoCC25sW1f1r9ImSgOUclDzNkCAvTbAhO5fL8CAqrak3YV7JkTfI57AwjIvqKbFsyHKmoELrl151y6gTcIV0r+qfQSnK9cy06cph56dVP+SdvuvqlMao9tD3Vsi76GhYTVAC/pVu7JHDRBHT0pKR/QeJcBzs7jrSAXcC0MJizuICzSsn0eVGECn6l2CmwBMxDmDp6Xn4l73NHWi9GWGVf5ZmHuB1gJ9RDWwTYyuR61zaD0LYIjNLUl3nTILrrYCXOD6lbhKxOi4YfOhpENVK58rZVFNTCN61Rl7aUvrSSClS14Biv1/kja/5jdB/mvWJ6BnDFA3QVmUnPGRUSY8R2n/xPezSL3GeMG3qNpTO/YUJ2lPvVoJzVqJ3wYc7MDeahCEYVp3nx1L58ZH0uhgf3y367yzil3GJuV2YcD6DecK9NUMKlQzEqjLsOX6N0/bzPFNSWu4jSXPPDBjA0NZta9wIWGNNlnjKvAsY6XjlmmUMdQ8PcpzUg+g4XfhI1WdujHu78doexAnB/14jevj7O7DexzAwoUEFyMcN+KFnJxJBt1xEz/yXXmce3rOvdxHGuZ76Ch99FBw/KzHbqZjuvlJ/K1HqiUS8f3zOM+NWuql8RrxmyQ6YjRfL99v4z3arRzNuxISsaofEiDFVSzFnwcOnhGllhwxMxP/tjceA4t6VX3qq6t8fym96lu668Fx/9iNrfSlyxvp9S9kk0c3i/kGHdL0R59eTD/7A6NpYlDlyePjuAaOa6BeA7cGFhhvxz4WjIsOjU6IzaPc5+vBZznWTYN48+W483mAhzKyc40wGKY6qJDZLsAiwEQFLAJkwAR5Dc9QMNvFY4rxTCuYLBiO0IuW+YCpFVQcBhYy8YWRzyuBeWoId7MwK+H+UVUrmA5P7SwOAosMTgQXDfuNABYw6OSrv3qZVdWjchyBhdVA2aA1NsSinBl85EmnSA0ysGjOlq5Iqk7lyuYGtOiadgnmaIaNyG7MLqapucXw3a8nJhb/G20XdQHjEJsFBpjogPe3TjhherVPaOVUfcqVS+nU+487AOtidJtV4nUmyRXSnIETnoGxnGN1eYtMYhKlbBGXFeEWmL19/OJbBlfuuyhbNwyLDHkHm4a5z4R7MXSCsgRa1qXqTa72z1EfegBahynclfkmPxmofd7VLkEPSTK9buimu9GyKh+bzpGGBuACKw2s3Y9hBb38Fepnm9O6FFiMAyzOwwyPwWiqjuVmcTLO5i9jrxqTwCL2GyEtJQNO/kqEFiizeSohOMcKvKvlxl9BqjUF4ydDraqQ8a1zGXv7lgbR5wB2Pdzbv6ftu6Srmk5WL8sqXual+tM4bXGXgIHVfdWZVJ26AXN5Feb+sl7GYEbd1+M84OYCjK7SBaUQSllUl9LAmv/RmVxYc+rVXesygaZlf1RaIaPtRoJtNIQSmykY80dJX3sLjaT1LqUtzBL97RrSAmmYZ/V8E4mVgLObth5m9fsegIV0qF5WvEndAFTMwrTPeeUcCY9VbFRHmgIj1aCUpLhPifVumFKV8BgGvQIbQYP9JoNHACg0LlO/GkV38szN/Qbtx9TrCu9O4Wb2Et/AnHXEN3DCeqe9z6K2lAEN7zNWzCCZuIHd0yI2FJuUR1XCE3zrrXxbnXxXpwAeYVeBhLGVcHoK4DixuV83RtpD6fwEaoVIvjqgcZP4K3yHAv0dwICjh23ud69LaFWQTp0SWPg1lCOPdyFdpDwxzpFPqDxSD9s6paBPZlCBNy7afZXT8c7x0CPGLOj2CBsL8vXIId7RB2hv69U+d4rv2IUFPcUNDw2kAVQk+waGUjdSi3a8QkmvfdZ/vts4+KaqbBpBRqnFqOXZjHIgAsFlTjCPxzuOSlM23aPxrNxQpjhILqf4+OnG+yV+fiv+1lM++HbzV9AdP3NY80lJqEFQFSDFxIrgm2PXShLxrW9Je9sbv6ck+B/8ah8q7VCu9UwNa8zH9J//GBKLX/l/pxiyW9LPvm20Tlrcf+wLy+nDf7Oc/tu3jaUxP8xv0HEMLL5BFX2czdO2Bm4fWPBxN4fHcleu8i7N+8O1USaTo8Kd7OrP6wOZk2H5HcCCCTUDC5hqGWt+qy4gmHBVLxtwFxWCClhAdTDyMBfuYxFG3DCjoYLC1WeuGjdAhfeWleI4Nci+7MJYaMC9C6OxzapkeIaCkRAMeNQlFjLA2XZDqYUSiwwsglHnt89LHLIiE7weQb/lk8GwHgs9QQdxvLrCmo9Sz7ne9rH/kPneQpKyOL+Qrt+YSpeu3UhXZhbCyHUdyYJ16KTlYfpRVhhjpSmhtgUDGYDH39ILncbJrChXfhMx7bDaKrjAqSmefzQ21mOPzCs0US73KIDLSnswejuoWCUYlT0ZKRibEzB8J2F2XdtvQxe9h5XtfphsJRjdSDjcp8L2VkVHuwhXzbU70djXFX511He54hA0fucN4gAp5OvparuHalV6qFIVZ4GrzNYejOseDP8ezKPVoF3J+PAAwKIv9PNdCRdYaOwrM27JXUEX8LjSa823c7qIrvRCb1Z6exLcqNaj1MA226KPyCjHfh3kG8wCfwU4GnzL+J6m7DLzSj/m6E/aKyg9UK1ImxbzVe2rHRA2iCTkboBFH+0TK/rQYtkEL4/AxLrXheF3AG7Oa2dAPO1P3JPhBuWZRXXK1ee8M7YqY1kqJ9urHYjqZGPWP/m2U1YuSF12QuryEPYF1vswwOIOVrZ9LrhWsjMpiIVpl9EVcJ6iDjphVi/S7hcBDnf3d4ekxz0iVDFy
80Jdv06urGPcnHfAvhtQZ79WEiKtAh8lHR1cbQslTYLDUdrKtrbuBEOCTyVT1hmPCT+Z7gakjALyBKnTgJ7LuB9+EC9sK9Bog3dRN886Mxab440jKZmGnmuUYRLD7MX5+bQOGNgnzx3qRyliG3l0YUfVxpji3iQn97ZCBaobFagJyqbq07POnwFU9IZnLNUX10hzVTsq2kWA4DeeFxT8xgDsgnXqqthYuIAQY5v9i/jSSdGjf3KJQymmzNw6+2gsA350AuECglKMvDu34wXjFd+yhuumk7/13JdMjwgRHvYadGDpysCiP40MZ7VIgUVP/xDgB3ez1HUZy+2/vJ4P6Hu8ozzKcWul8AGn42hJU/o8ym/vjwqL8FIbjQwMbR4usMQj0jTvg2nmePUwQ0pezVTyXZVStEHzWSl89V7zpzUT0Uw/p5nbs/mulJUXuBKv/LJGPHyvqo5GesfAIqqm8ecYWDSq4vjmuAaeNjVwG8DiUjAAZYAu1zJoNgbGPKscWfAymJerkcqA7BAb4Q6y1X2RVgSwYLKMlTwY7wa44D42woMxy6CCK/cBLGB8Qj+Z39JmPqqFNNzNCjCYXBtSC+4LIx8AgQk6VAGYBYIe/gosdsMzFGkDKAQXSiyUGhgp+4lncpeZEEwwOWepRWbUY3ddmFCZCpmL/DyrtwgmNPYUJAVQIV/zPwXDJLMvgy/DqbG3M9OB6ck6gzYPV3v3WDFfXl5CcjGbHhNc3JhJ11m53djUPkWmI6ISmVRIU3o1IvbqxnLSp12Iv+MgfrwDPQRiY8BKO+/uAipYq4aR1mg5M9wy/i0Ai30YvG3cVu7gznIfTzOtMJytrUgvYPQ3YEQ3YPwsj2pXnTCZXa60o9KkkXEH5dV4W738xYqxzLSijBL9y7ZhAzzqRCbZFUFLLyM+CuOmq1BVl1zN182odgBtrPYLTjBMQUWFXZNhRtzAbgJmWRUiN9HLaj4aoMOIk6Y2E2F4bdtzxkow6SoB0gahhTpyFV0DccGF/UbwoXRDUCLTqARC6YGqRWVjN5ljGeBB+ocSD0GMHqC0X9D7lJIPgZbepPwmZLbdDVvvU5bR5lOioDTnCuBCY3GZ/zthnLXHsNWUCug6NVTJeEPGu5PT+jJ/6896U1qhwfM5QIFqUEpE/ALd3+ISKkQPzeFmlQwHkO5cYGV7UCkUz+cAX1PYEcwDFFaJu2mJ7fOUawxatSFxAzr3oLCu3AzwBvFv0CaXYYq7iTMK6LwTUNdD/koEs/pZ9iJlG16HSReUCCD0bmU9CxyV0AgcBRdKLcJFLTS6a/hp+tIIZZnlW7oM/V+fXUgLXG37QZ49Z2IknUeiMoBnqivQ8RiA4rHpGbw/Lca33EK69qWTXNv5rruQFrRxnuCbOsWpk4AB+uldZyfSRc5zE2PUpd2Kfg29MvuxoktbyuyrUuRp/7VH+C9/Z/n7MlRgFAsrPqON8neXmfAYH4mg1EJgsYgan65v51Hdsh8LLBwPaRLGE8Yz+o3flcbgAhL7pHmYf9hlEVdw4VjkJpoDlcRikE00+weHA1hkG4tCc0V10E8mHtUl/2j+jfGbeJFfFWxfanHBoRy1W+OXeSTereLUw6L8Zuj/enzi1pKKZ5EXoUFq9bSebiGheT1YEJJvHo3ED5anWfgcwTz9n/P0dUJq6eRb4xC/iuSbzfg8YuzX/sKjlP1t9x5LLKJCqj9PFVisbe6la7PaKO2nM8OMq13VnFZL3DiXp/l+MZob7DmZzo0iUczNUYvFfkkRbxtV6hYkn63p019fO1IVagsp/pUZbAOJ309+p1yZ4hjCfVyHK2fVsbCym67N4SGQsNNDLI7VnpU4T0Tb8jqLK+u7vN/KHM8Yu7CT7ppgHj2inCXN4+vTvAb2UEOenEwz86vMgSw+9uvpcxj+yeVaD/iOtcU0O30jTS8qgadv9Q6niTMT8BU8RY13Y2EyPXZ1IW214NTn9J24SHfJ9O/+uCWw+ML9TRuLMvCVK0NhUFMG02b40UQ60NcHe+PnSZX4tWcFVAgm4rlXTifRABZOkDBUWWLRBBahCsUkr+QigAXMl7SZT+wrILPPpKrvffXdBRZeDzLwmZGPslA8JwcnEE1xY9db3EiGxAJwscOpHYeTg8xaGGiSpsyBeSihEFy4KVdIKwg7YTjAItOgLjOsGsBAVQfpdkXYsprxCc6wc4DGwujLOIQkgWexEk/e2V0rQATmMDZ2gwlaU8VjejpdBlxcunadjuiuzEh0oNdV9zgoX4AW6M1XgRHlNz+ZEmgLQowPoyQQsc1lSg2SCSO5YKZdbd8DbJBQ2qPcuzCZO53dab93MLWyGtqGj3zfW4UZVSVJXXKlG6dgiNtZke9lFXkAlaR+mHy7lcyjzL5eh3TvqpTC9X/ry5V8jYxd3ZZhl3G3X40CXrTb0B7GnbhDlYY4PTCbFCsYrlUYMgGY6kPjMJlDlXGzqjgaDutNSPsPGXAZb3oQAAHgZ39DCrBBG1l/PTDPwwKEAENMWMQ1XIbX6YOf8dsyyAwXd7VKVkL9y/RlAomoXYZ5W44x0tQQWWNtfwuSrOzwMFXRZJnn6SdKdZSm6KZWYKEdhozjZdSAruKOdZY4O/Y38jgF89JGXD1ABciGjn4kJzLiZ6l3vWsJLPzervD+w/OL6YGp2ZiU+0j7HGo/ZyhzG3nprnYaYL1AXa5zKsEBkbIxY0caoa+78ZyG3BroewiuZkJaATOPJEHVJYHFXew1IoiyrpahVU9S5m83uw4AVdVqBVBi3AFoEHjyiOrIoE1Xte7zocqZXsAGACyCRO0vQmqhRIU0VPHSHuJb8No0zlVA9ygM+iNT0+mRa5MYaiOtCEBJ5UBHKwsIbVy7GIRbUbk8yXmK34Lf04MD6Z67LqSzeFYboP9s8d367crol3HLxRAPwYVgW3rDmNq+AS1+yy4s+M2RI+/xh/aJcPqe310GBZaW+qPPbVLfAovZubk4ta0y3PHQtP1uVWkUYDjouUBhXvkb9ju1D+fFBW0+lFgEsKA8AosBgcXAMPZX2XjbRilsdVCRSQl6guh8d+Cv32DJzzL7v6TRiFhLpz4XlOe+f1R4JBatn++MX5KK+o28rPMSmlN8/LRKjiVe9fvA63XqbSSPZoSctmXOTyj+gePQz8a7xveV8n4zzZzQMbA4UI3pqQALVZX+5LNL0TZ+jqoCvuYFPen7XtrXaME//qul9BfE87t1EUTHJxMDremnvpdvwV0vq+NvHlpPf/DnCzEWGjTShxT5dHv69NdWD9hYPIwx9299cp7FzdwntlxZqo43v7wvveZ5PSwe7qf3YZvx2QfWAkxssDLXjreJt796IH3rnZ0lerod2j7010vpM19bS/e+pC+9F/rM7TXP70lvfllfI53jm2dSDWDXeOlz6WMf+6v0yA3UYdFUOdF/d3rRK1+dXv78syxKMnZuzqcrX/lc+rNPfzE9hoOBE877fWfT87/re9N3PX8wnVi+nh745HvTH31pORa92u+8N/3k21+QeltQR748mfbHTMfFx3//erslsPhiBSzMpgz
a5ZqzdiKo7oKYTJGDpvFK3IhCmOF5QI0E829fr8L9yH0eoIIRocSvA4sCLsLVLIxbllQIMLIqlACjCSygg38yia1FUsCkXkBFAAsmelcLQ52Ae5l36S7DglfYdxhWVXMysBBUhMSC/GXwjS9jUFYdQyIh4yxjByPlKmFZwSzAQpAjc+n7m6gxbWofwiqktKtasU/axfahFaYpe5kCqJCP9VVWJq0z8895wrC4SgIzvA7DdOPGjXT58uX06JXraQpwsQi4kHkWCPhPyUgLTMkBYEH6YSgNbYbboqZfzmiuaDIYPNPiDJ/8pBmDOHFVf9qF0d8FUJwcGk2nhlnd7eoJRnAZpmgLhk8WvIVytbBTeBfAop/TTe3U63egl2mWaV0jUZlpgYWAxxV6vQvJOKku414Om9RXH2parupbH3PkoevZdhi1flarBUWqVelVSPbUjesGg8mETtKW8VWio2pQPypagjQlJNojhCtUmP8NGG6lY8YZgSkegVaZWRnykG7QUZQmuEpfXOgqURAcCVQEQ9qGWJ+mKxCxTrWHcOXdHqf9xQQMv6pPls19Jtw3QlY1wDHpqA6kGpDASSbZPTDuVvLDvV6WHsJIXfep86ZJWvvE3Ue9CeJFgaE2NgAIOM2eCxOoUOkmtg+VK20clLgorXhwZi599eqNkBh0YRtwGgPlu7BP0C5mkXxVXVrgdKfvjaiTk7F3iEBFsKVXKuvPNrJ+BAHXBSww9F30+9Mwts9hYzs9hNkPlda4o7YG79qj3AB8qmq1AMDo70KqhERiGOmSdieW2f6vdMl9OawH7V78zlULEzyvUd4VToGekqWzvQAL8lPqYts8jKTiUVZ+Ll25mlqQBpzku9OhbwtptvKdtwH2O09orE0vVYJBo40h4bp49nS65+470xCr/RQrrWmoTT3sUK9+3+E9jfKU8cvvzENDbsct21vvccX7GgHxnFcCTLjwYDouUhjXQw9w2m8oiZydnUszM9OhFqVL2uy22u/Y7x+wTTt67yFr5FjmIajQcH4TOgU73drtIFUaBlgM6SFuSInFCMBC6SJfCONCOSoK7azRd0t4/VrG9HItz+pv2M6lTD6vxy3h9bCSxuHrgTgVcdayt/X8ynsH4kdgbpPyvHnNiUlnOXJ6+Tdd7gj6febZqKXyqqWtnjWC4ibKamKHDvMy72OvUAcrRmAh0/8j3zlw8AG//vrh9WDS6zYW92PM/esfnUv3fntveu0Le0N68Mm/XUl/9pWV9Pe/azC96K7MwP/OffPBzN9zzoWnlvSlRzfSb35iLgDImwEgHnPLu+mX//BGOjvcln70NQPxzX/siyvpcw+uxfNivG1z/s//zw1Ax8n0n3/PcLT8h7D9+NSXV9J/we+7TuPEhDzeD5j5y/tX03/yusF0z7mOdH1+O8Ievr6VfuGHxtIAUhOP26FNYHHflzBsp5u95RX96QKSFDM+NiaPKnzm/dm+nD7+nt9Jn03PS6/77lenF/ZdT3/+xx9In996dnr5G96QXnVnR9p89C/TRz/+mfS1zpemt7/lFen02sPpcx/9UPrk5J3pR/7BvWlk+ivpY+/5VBp4xzvSq/b/Nv3Ov/yr9Nx/8s70ss2vp/f/1l+m0z/0Y+nbRrG7bQ7/T7kenwBYPJAeunT5iMTLQMpwGLf+OTjAlsmivBxDKV9gGei9Glb//UTAog4wGqpQMm9MskVi0QAWhMtUSVkwOJWkIEsTssQi1KFkZJnsg3GH+ZMJlfbGBEMCsrW6gt2qgEW2sUAVCoZE+p0UAljwfqw8whwUcHES5jZUoQQS5hPPKmkGjKI5ufooo7CB2oNqUa5Qakwq3aoMudFdO8xgGx5dQr3CegtGPq/WW8eChKw6JUMHzSDa9dVlNuWbTZevXEmXJ6fD7mJxzXyUjOT2ykAKRr2oQsG4hUoUiQi2AnyQfrRzrMJCMz9sq9hTgzrOjBTMsowh6YZdAnEFF3AsbGQwlPY4t1o70jrh4bEKBgexBSf1j7pSByoqfZSxF+ZPt6C6ZLVmlRgIMDYhVzevu+Qto+6pJymlEGswmB0VbTs8l0kVFGksPQxQsJZDLYlwQWZua1hJ6sHV5sWl1XBBO6ArVAyh9TwloFmCITP9dVRddmB2RU5dtMdpdNP7ABaqVFkXSkqUHE3RZjK6MnBKspxMOmlzJTEBeqhHXczKaIdKFe/GHuzQLmA6B63nOd0VWuNn07uEGpFAgiQpRQYlAjpdjuopSS9H98D0Csi07XgY24grqJvNQvsu/XEb5nwHD1LJk/y6AXLj0H8eG5MJmXXqyA3uVEOjRdMVgcX0XPpbVjBUN7LfjWH4fLEf9RmAww5EXKNOwtAdYLFHnhpWn4FZPQ0wUr1JSrVvsYxKLvROtUofv4SNQD9l04PVcwBnlkHgpsco20yXv9rtTFOGWXa7XkDC0cNGbmMwwOcR+bqZoVOve4LEhnow/YKReduJtALosmHgLukJdluIP0R9uoHds5EwqPK1CoB/cPJaunb1WppBYnEKpr2VzS9PAShOIq1opQ50MNOO5ydV0XRf6yaA5wFXz77zfDrNLvYaa+sVbpXyuBAAOoepx7sY6dPw0IF0gHCBlwOsfcO69/vVMYLftOOEfdRxM4+FZESck8Qpdk72IfNxPFgBxCwCzJRazMzMVBttqrqXxzf7mt9RVut0USADO1IN0LyBrZVSIdUZHU/66TPjw0MYcAMsQhVqmLbWxgKwDx3lKPflWsZrnx8V5lhQwksa9bje5zTK3FF+12KbvcPTExxH0VJeiTqlLTxuoof0fZ5LWdFRqSQ1xn3eq6cfCZU/FX0lbkknP64Tnp9UGZW345qf1OLG7X76gXvfeCDef8gfBeyax011VIVZBwUo/8cy3p5aVG79+EcdWPyrP54JICLTXw6ByT/7reu4hm5L77j38V3l/q+/P5WGAAclzgeQetz3pZX0cz8wlsYHHdfy8bt/Op+UZBRgMQN9vwwA+nuvGUzfdncGLktruwE27n1xX3r9t/agbrWX/sffvZFe/pyu9FaAQDmUdLzrQ7Ppe4n3WuI93nGYNoHFxwE5RRryeO8dhz9DauD6x9K7fu3raRB31K97yV1phIF//vPvRZo2nfpe+Ib0/a+5kGY/8yfp45+/lrpf9qPp7d8+SMEX0qUv3pc+8r7H0rn/7B3ppenz6f3/9oH0/H/04+kFGw+nD/zy76fOn/6v07P/8v9M7+3+kfTO110I9by/ixp7SsAij9eZAW8OSI6M1UDqXW1yKoTWB/u450EZvMtvBzHvy2BW7h0EDYuNpJggQz0Fxl6AUaQVqqqEe0bCnNhDmgBZsMehfiSoCGDBpC5z2YpqR/5teKUSxGQv7TFpVMWR4doBWAQA0N0s6g9ZYmEeTFAyBDDIDXBBWpEujI1AIkssuC/AApATEgwYEcGDzJlpuw+HXp42YCb1hy8j6wpiF3rmHZztrB7rvUlVCSfeqKuqciVVOuAnYCq853mobuGaFJ27q6hEPXZtCr3TJVxrkg8rmNatoMj3BFQBJKSJBOKs6DOOR8
F49IAjwB4YyG0/Sz4Ksd5XYJgfFO0HBMs1yr5EAbiw3Lph+OA/4uo4Z1AalBHyvuSgP8uOrczn02JsZd1JlUf3Ilu5nhsqikBJBvZOTzEKJTEBcNlY/RxghmRp2BsaFaNUnaV1jpN56GEiDJkCowx1G9uoAa0gVUkAZpm15/rLPSKW0cjAJueWOSEsD02P3RsC2QVBzGnuHS2X6kDT3pGbxicasxKCdOBlIegbHxG3QV2w7RUX3qFqTgc6+8mu4jKVifnACAn0ydZ86kwcuX0/PU4QzEzbKHITC3qef1+w/CVuIKhOu9Vy9E3neRQvwmtlsTkIsl+5jxFgbbSDt0jXoCMOz2UOkGXulmkXps8v0ssUkuDZ7B1WQfKnZz6eboWHr59t0gnq1Hj6UOYokY8VvQqzH/Q+79Jv2jrcEJBu9x6tXRSF9OQmjGcCYAaVElbod5YJt7t4OB9Q4rLLuQE4PsSXi0n8HCO+xjOk51p3ZIj1JLXcxuQiy2IBa7PP/amPCgYx9RVJRQteSeh50F/Snx3SAvbaA2lYjQRvu/GcLj/KArZQl9K/OX9j6OKaORKx2TgBxmcSBswkjjfVf9MLx9cX8OQZzaIbXmoa2Ix43PopNln7NDtP9wEIs21MLaURf0QwBC7usxSQv94qPt3BIAnjq6lbmYw7HlXf5bYHXM1XFdnh/L3C1AL2SgpC25vI6w5BP8LcSCfRRaFVw7b4q8VVWMH7VU8aWqX5Ugv1tMxpUeqyXO+ZS/Obm5lxLKmXyZx/faU+pJmtI5ft27JPrOn4/nVtKUfqtdUtX3K9HdLGYH4We/WsivNfmtvvzTD88mY0f8nTcIhvcTxLP4LcjH3/xjPakfe4svF7EwYN8//NdTafBUa/r2F4+DD3bTz39mMd0ZX08DxLI4kFi81Z0+OP+WPbB6I/27f/bT6UH/+9IH3v9uFhDBfi//Qvq3H5tIh577uvShr7+Wjkx8Jn38lz+Rrm8MpW/80Nenofa59Nonfzn98mdW07N//i+kD/bzjtpBlXr85fTvPnwrXf02CMThX0//6Pvvpm/8/j+Xnk2/ln7gH4ykr/qbfzw9d6oj1L/fsl5vkOBNicVnXtnzCpVfOlUu1WRaJsn6ydUU8VKom9gfP19eWLW0fCkvpHKufi8wM7s9YlGRCsCr/uYlE0EuAJaSC423JRZKLvRYo+SiqEIJlFWF8mWeiUV+sQf4B7QqZVBKUPv4OqGh0WT2O4VYKLEIYoHLWV7k4b8egBhpJQYAGe0o6olFrGQGOPX4nlqU3xuRWDRALhp4yQu8a+VbFwkG9SovW19WeoEqxGIFJIVGdazCC3YfsAr+iLZ3ATrOA1QPUQ+9/9zjuK5O9ZKke8segLuvPmMDaGsgsdBwV+9AJwQslJlfqBCyeLGyB1S5Gu5LQnUp75P9vYa9xQZlGsW4jeONGLjP4rlnagqXoqjErKtXRDtsq3YGrYC4Jgxad3Avuw7Q0ph5jjYsybOxNVE9pJNPBGujftOs0s9gC+HLXT13jas3KGcVYqVkoYH6a+jbAbjqA9gfBzSpWvIAQqMtwSIEoI16Crq1z5DALdKHC9yzVap2HELRS77nUeNRHUr1nzHBGS8J1eNURVH1ZJ30umf1OoP1uXrcRpoj1EUvVN2UK0k4x0cyux7jExUxASdpjBpu1OkppAZLfNapn9KhS0gNJCUSC8e0nn8CbMZ9R7RIXtrPvDY1m15DDejOyAPIGv3BWGnHHuACoPo8koFBgONJ+sby9GpmFG4/3uMxwPgIYP7urTtcC6FpIaJ3X1+6PNCPX+xT6RkImfdzCruVL0zOck8JIsU40A3vabxFqZpl4L+XXn013bt9J62MjqZG1NY6zwyk80OX0zuxLxkAeBt5e5r87+HK9+UxyAf/ejn+3GA/Y49+pR6fwzZicmQMgIyXKYD8OeKOXDzTSx+cDPKolGacVfeb2OeMQ0QWIWQGPDyDX25J1CzSkrEHD9Kdu/d4JiHBqEGd6OtNfZArJT3LAO8HjBnv8TbE5QgA/hgDtgOatonEYvXBWHqE8fcWYy8TiqNph5V9DbZ3lX4wHhshhQ2oe+0EsTicjuHCuAOJkMRiEUKhJGUbaV0DJB5xAszbD+PEa/mtqqWB95Sk7XCvtONZgwhvcM92KIMEQX59tr3XRoc/IpnR2xxjZJXytxgzqmYdYVz4TEist5F+rFInY5PsODYlDhILzitZVEKiJ7CIF+MzRz1Ul2qHnJzBmFuvYzpAOM2nC3UvyYXPs5LMmOecYHzyyStvZZ/Bcx2sZkTkjcvj2nyNRz3y+PbYMfIv15eUzi1ukTIqU844E+1tpWr7cqxLv1f3cg1X1y7KVz0xTUle7fMldXXieKlHfR3r6xHnq7JKWrN7XZqqjNhV6b8SiYXhJFpgrvXtr2/6k74bbO7v/uhEeu+VI7h5zd6gHk93d2Ij/ZMPz6QPPn8s/ZEXjqVfAMz/0ucW0//wZ3FRXueGVgLyEjYW/+P3ZhsLpksG9J8AAEAASURBVNL0t//5g/S1146mP/pidgX7Rtf+b/9qCjWs5vQXv9E4AXkzCN7/iaepegnFrxLD4iOfXgCL+L5oSC9S702+T0M6/uq3ZuNticY//cjs67xWlXwP9gc98MY9sJzu/+ZH0ydemkzrzcRTOrSb1tCU2OkcSu988d3p2cHjqXnzYRp99dPpN37rdprZPca7G1VtDDIPX3hv+qY//EzqFs7tsOD+6H767L3D6d3v7MPT4Hj69R/8yfTSUWyVtqbSZPOL6U/80Xeguog6+BtX5i3PvAWxyHEsnBAyqK3Lj0k8Joq6ybycLcSidl2VpjbJMomG7nDdtWWSd18+RWLhdR7LxKJ4hZJcZGIRkgpe3hFtm70Br9YA/qpDSTgijoUTN/99eRq3IusruzotsciShQD+/N4P6nM7S1V3WPGMyN96hYJYGHlbdahtALs2FtFmyYD5VPlqvB1qDACOKCMIB8TCFzo2BEEsSNMAoN5HLCQUdSQn+pWK7IBQJRYabmtjsRrEQsFXCgIxyYqmwb+6AXiDgAeB8UOBGiBWguFqvOotnQAOXYnqenWCFdIWyusFdA8BUowf4EqmKjRh4+BszDW+IFxZdQVeXXJVxTa4DzniOWv5SDL0vdSMutjSwqO0iD79IkB1F686zazQHoJQdLBi2oghrHYV6xwPGwvKfki9VNvRdkSgZdwD40AIiscxMtatrOpUrQBng8WZzlV5V4hU/lHicJKV9QHAtXYKqnGNAlzHAJjTRJFe5UFs5QapdmJ8AmAjhrWoKbE67TUXTmBrgESjjRfDKgT1PsRCiYjqMt4X3cvO0lcGIAvXvrTd6NLasNhfAjSJq0Sh31V+ySVlrNHnoc9OWXpj0nD5tam5dG98OlRkOumLpy+cSU9jn3EKicMK5ajmoSqcUgvb4SaxuY3R9GtIgV65PUK/Loa6UhsekM6wmn4OUH4BNSyNdJX2WDeJjYbkGoKPGgUdYnHn5i1sQ7gnkLojZwfShf7edAUbiKcgED5nk7T7cxhpz9Bfh6j
zM5CeSxh2KwEytsUrr72W7t++nVaIIZE6e1Ln2cF0cehyehf1Pw/wPkbZeq96QBnXkTbMMu4k7gOck+DPAshvYR8xo9SD+9OMwfVTTw2laxcG07PkYdsf0s8jGnlDQqz3DCpT7dS3EwLRg2ej5dnpND85maaROuxCkNpQXTp+GnKFZKQD71w+83fxYKV3qlX6UaN9tCYA7pupeWoibUyMpwXsRDYZj9vHkHJAznYlFvyGRcbYbqEOEot12tEMAD/a0xXSIWavUP9bgdxsIbloIp3G+JILx7JSkgbyUFrQSn828XwoTduCDKxzb/Vwts2zoA1EJiS6aIZwc81xiHkjREXbjRWlHJDXLcZTK89xO2SknWt0PbvO86rKk/OoRtqtPFPG2lBVCtFgSIKUeOgNzZglTbT/KOfPIBGSvBoRvpffpyFSjhcXFRy7MdScKsl3/1Z+57G4/xy/OOwZHT3kb6Z4ctryfjCF9S9bSS25qCcv5XzsqeO+rVxed/iN6r6/bvvbWLKty6ZGIMzvScetR3kvlvOlOs6Vb2crdf5KJBY8znlcvZ0OIa3AvBnGW/r2SZebxrgVjlu7PN4F/K7fLN85RcBfNlWpnFfLfX+ja11YsgY+VvWb5T4ezds8H+FF5QTeHczbcr2vZf72+iddV5/vwfeDHnjDHticT8PXr6c7o7Oh/t4CGbhwZSid78eTYjU+d9Yepsl7N9Nrtyew4WxJx7rPpaGnh/Aa9QYGTozP9dHfSh9/icW2bVSx3/m+9HR/xkBvWI8v4sSbE4vrN9LN4fv5ia0y41mpTaYecmItk2uVpPbbxzjOVye8tmzx3ae52nwAyy+/10iF33lZeawQi3B7ygtTFZmIvA2IKJG3dX8qsfC3Egv3TyIWupkNqQVgyX3+nSUWWWJAu2LiyZOPk4u/VUvZ0viFctY3sirUJgBAiQUVjNYEGQAMKP0oRMJI1OW7+2xrwTHBKABbicWeKlRFKABE9dKLYmeh3r2ekpRYLLrACbHYcLWf/lAVSmLhCrc2FGcBDRKLReo2DflRx9v59RD3zdXPGYDObYDkfVRfZKg9gJQrgI8ugIer5hIRSYk6/RpzH+eYUo2TABzduToxW5YTsCu53jd7CV2oIFzGt2gAnEdsC8CUuuqCrwWOLUAinLY1FhZ4a0uySdt8OQguHD9+n6SOYxgtj0MQdFebSRsAzrK4xpXabsCSEo5O2qykRiNuY3hMAPpcZb83jX0BIDdch1JGuPWFUBwWWGGfcBlAfhFi0aekg4LXAGcTgF4J2hp1EJyN00/jGk9DcGzlESQcp1mJv4j6zwASimO0TWmQYF7ydZw6KPXZoJ2qV7XSTu147gDYX0EicH0EVSCAq0blVwb70zMYJveRnxIYAaEvJNWrjHgtgZOg3NHOYnI6vYqK0DztMS5II0bPJ7u7Uz/k4sKpk8TyOBKxCjroB58pY3dIIG+R/jZAfPjWbdRpiL/BdV2XL0cwQSMzGyPEtqsKdRdXubchPxLzftpnVHK9ad3EgPzGzZtp4g55jNzDor07dRFv5Oq1a+m954gfQj6CW2OASMLuIXW4BalzHGXjZIgwhHMWacHynTtpi1X3Zuw0zmKjcQFVpyGCDBrUR8I4A8C+AaG4Dzmd5B5K0lRHO8KY3pieTusA+1VcxqZ2bDLI4wjk4izt1+uVtjYjqH5NA77nuR+NtK2NRYE27KJap3FxOzWVVvBYtoEq3jZxU3a6utMupKUR8K5qluSwGQDfAIBf5b7vQBZaWe0/BgG1j7bIZwVJiupQ+N7lgUICwXXGj5AI+F0jcgm8d6HZsVpJLZTSmN8OaYJcmDfflSgchxg3M2/pQncZcrHGM6v6VCNtdqweoX7NSk15JlSXUgVTcqK6VxvtlpArh9Q+Q6nUKnuJjOpU7TwjuhvuIi+N9I1noVvmPvvG9jrfca2DJvZ89cjezOzJ/VuZ+0sa5+l8bckhgyuvKmnrczD9vnPV75JfDGBrUWVnHvkKUuQv9dntz4tfOX8S+t8LatUin8iA+pa8H8upPnuTlLqW47WsHrvu7f4s+XrdVyKxsL/+Q/XV2+3bg/QHPXDQA7/3PfCmxOKzEIsbEgs3JuHahFpmYg4HcXjsd6Qv56ofMTF7rPodeT32ZvCYk2xWfdojF4VYCFrLJ4xPWQkMo21ergKgNT5ZaiGxUD3HOAuoR3G8eGxSYqFeepFYuM/fJRgVseAlqxpJuGKsXrbxUoyXWkUskIisK7Hgo1codfjDlz2NKFKGMOCWtPiBwGT1p0pqgXpD/i2xgGgAtjOxYA8wCzuLal+z+4BouMEpsioUX3Q1u46FJ/AyDLMfQCrGABS29zTA8hxgQqnMIkDP4G72v8a1rqarYmUch1uA5XsAN/3wdwOGLrvqDPDQReYD2jWPHYJg+Rj94+r8Gew2tMMQiEgoNiEUOVCfRr+uksJ2uI+y6A7K0Rj5CO2lkUGCJAoT1PMh+R4iD4mAalmq3QS4cTw53visUqcxwPMd4jEMo1qzCAGSWIZhO3U02Fs3UoZ+pSz8tiwN0pV26PJvjjJusHKtMfEIth4bqJgwIMJVZxtGrJ0Y5J4/RWA3iMV5jLNVlRK6qLJjhG4lKEZ7lgLptvY+eU1g32Bcg+OUOxgr/R142snuTR/SNu0q7INQ5QL0ZbKltKcRI9wWIk+vppvYKrx0fyJNk1cjdTyLGtFViQXA3PG/ChCnWwGardiFoIZD30gQtNG4iTrU9ZHRNINK1CrSoCZAZU/v6dSPOpTEogfgaF9K2MpzJ+W9N0/sCIjFrRs30zrAVRWaU5eHKLcfT1qdGPnjBYkLtKWYpI536G/bo6vZswBSicEIfTB862aaHb6bdsfuE9Cuklg89VR6B/U/CzHSxkI1nCWA+TT3+RZ1ts1KIVxt3yRa9TpSg637I2FP0HAUaQNSj27UsvpOkR/tPUz9edrTbYjXCGRqDCkPegYxRlXb2cVlrOpFYcDbgaEZUocOJA66tO0AoDfiaGCMfpqGFASpgWyZY9syHrymp9IupGIdUrCBFGQLYrHb3RNSi0a9XDG2dVxgjAmNt/WyhMZdqC7pctjyG3z2lVhILCAvBtMz9sQ2z4jEYhcpVBiUc5nPr+p1zbjn3aQ9irC3efZU0dKFLQ9+qCsdJo/jh5GW0Ee7SiwoV6cDjiVJiC6Q2yhfSZRSiS2ec9Uwfa6aIBwtnG+F/Nn/DYyfNYjHsupXkjqeW90cd0OMNOL2+ThF+gGemwGegyPUPdzPUt+yOXYqqsHojxm7nKrtC1nwfD1ALtdxp3yUYytpaxfzxVOWU9t87uNodSaufZxY5AxLvrVrqy+lnFp9qoQ53/rSzCfnVX+0Pr98Ntexll911RtdU3/9W32vz9O0X4nE4q364OD8QQ8c9MBXVg988cSCdpdJsEzcZW+X+P3x33HcP9XEXn8+H64m9YqY+KueVAjILFNiUX9c0FwMt/UKJXHIEopMKPwuoVhl5W8fsSAvX3ih/gSQL4HyMrngxfo4sQAMRJ2r+vldiYUubze2lVjwYgdgRI
A8ANgu4Nu2hk0EYEIg5gp1SCt4kYeUgr2Ewki+e5KLTCoijoUEA6DhSrz7ICd+B4gE0bEOlKEq1KofSYWqURCLKUDwCCB8GECh96YzAIiLABGJzQJ10/5CUCRB0BZB485xgMkt1IwE7U30qxGSBzAiVi97iRtyn/MawW4jYRD8nweEGETN1XjvjWBXKYPgewEA4yr/CuX4vjaewgDEph8iojG46kMzgMNXAZragRhvw3r0U+YgaY1RcYy+UcUqMqCuGh1LQm7MLYR9wdQkgBCVEjfdkvajvjMIOVBqIJDXq5MgTgmIUgINpW+w+v7yxAz6+tN4ISJmgmCM/jzSeSL1QCquAOqvdOKGU6kD9XQT2GsA/xDDbr1P0cUEVltMY9iLjIxNhgpVB31xAReqT6HXb58I2HR/G0HlGA/WR2KgmpYjR4nPYfpwinN3IUifJ59xvCytLa9hUH4sXcDbkTYCBkB7RP8ouThMX1zGFaxGtwK/EYDiTerw8siDNAVJWAE4GxH6DORgAHev5+mLWG2nrIBo9J8qLkcYA/O4BR5Ffej69VfTIiv+ujjuunAxPX3pYrqI3YKqMfa7K9xz1HEEMqC72iWep6O0g4GH297l9OD2rbRw/15qZOW/uRvdTEhB/8XL6Rx1P00fdtCHgi6lFo8AwCOQn/vc8/sQhHXtApBYND6cSQ2QC1YGsG0gUvbpM+kw4L6de9ILOVBd5yR2L0o87iGxuIf0YZOxReAUiBh7bRu4h7g8Sk2Mgw6IhcENjbwe9x2HAg+Q0Bj8UDWwJvJrQz2vjbKbUKPafkRwPEjmpsQCacfuyR4GLMH1sPdoYjy2kbdqc0Q8CUNwx4PewJohX/o8a+K5b5BUUL8gOEjjdnymSLPN/dshXUQSNx/6Q3e9LbR9EwnLKqpd2wwoDcZD9crnHGCvil8HbW5FXauBe7Vq/RgH4f2OPBsgHpKL8Irm3KiqJ+PBsaxqYSP1VuVLVb82yT7PyTJ5eD9bIFvt9NUJnjOlGqo5GsviPITxHGSwg7pk1cYMor1/PsN5/vMHPxlLZfN4/e8M2vfSOOLjWjOJLXKMb3Ed17vtHd27NgqO68rZktY8S0mmjyxe92d/3UhUJYzZP7Iq+ZZzexmVNuW679VpX55V3WtpX1eD1x+opX2s3+pTWsaf+JYP1R/6Xf3uO7S0s+zrCyxtLot5uo/008VzcrAd9MBBDxz0wBv1wJsSi89UqlBlgnlSJp4rn3I+z92+WtjeYPZ/Up5O707AAudCKuJ3jWDwUqnOBbGAVLjPhttZYhFuGCEVEc8ibCz0RqR71Az6NW4salDaPhSJhWo/NVUlXrpKDQK4VO2rGsObUFUdwPQ20hANliuJxRbSkYiEDPiSBAjmXKms5QlwCMkEAEP1pxw0D4JBHUoMixwYb49YqAZVU4Wq1UNiAR7jj0B9daeRwHIQCwjGPOB+DGA7ykqu7TTi8SCg07YIsDXQBpvmGBaAD3oEYoE0AHCkoWsL12iboHHzBteY3wQgfBmPTI1c28+q6QWAvACS6gSp0L7CdXHtIIydMfJoJUCxfdEFqRkA8J8F7An4VStSTesOgGcWoKZRn8d7ITHnjkiCMCYFoLlarJqTREr9Wdapw95BV6s3hsfSHAB1jfwPYVtwrqcrDbFKr8RBQ2+Jk1IPbRNcuV2jDfcA3zc0JEb9aAqJwwq2CdxAVn/bkVigPmMegNnzldSjlTIdg7pcNdDbNGDWTYN3bRNu4eloljxdde5BYnEJFaRL9Iurv0orHtC++4ByjXE1gu+WYLDXgFjyqvrZMID3cxCLB5AdV5U76CuNlwcgOkZFnqacKcpYxuhcb1F6mjqLqpbk7R4r3p/BjesYNhpLgO3D3ONzqCpdwMZCguP4kOgt0t+qYTVxQG9fqlQtLTxM1197LU3ductq+EZq78O245mn09ULxIiAwPh8SSSmGRfD1EFSMA7xiQBuglVI68N7d9MaHqGaAPfaV3QPDKS+M2cZyxgf0+9dAGRjJqi6KNHUfmcSgDvFfVucQ32IT8v8DKB8IVy0Nvb0ppPnLqR2iEULbTnC9doAnAEET0JkRrDJuDExlZYhdlv8bmDlX+KsdKG551Q6DMHrghCcgdQMQi5ceBhdWEgjk9NILFApYly20fbW9RXKJW4G4F5vbpLzbdSodo5iY9FODAzya+A+NDIOmwD+EjnVCI0wvkEeelyCJYQDg1ae/SbAv4bbEXWecetD4by1yX4boK/LZNX2JAy6BG51roBUrKJCFRJXrtkmjdIIVZW0yfA+tSGxaLR+tHWLe6SDgE3qohF3A3kd4XlSlVGj8DXaxy0LmyEjwzcwt+h5TqlkA6RwgzwcC62QiGNKJqi/83Ij81g3fa1NzEWImepRh+L5cT4vgN853NTOzG8Asstxk5Xv8YUynGxq2953c/OX7wA3x5yfsnk4//T4Xro4X5cuMsl/yqXVfn99y8lSnr+lJ6VGtbLJu9SinNt3TSm7qnfJt7Yv52sH9vqsHHrSe89z5fiBxKL01MH+oAcOeuD3aw98UcTizRpXJl738akSl+PVG2J/FmVirpvIY0KvJubHiYUTfxANchE0+L0Qiy0ApoG/QmIBoXC/DggwAne4m+WY53WNKuIScCtF0NWrkoQ9YpFVlJRmCJDCniHIRW5Xfgvx4uGFrMRii1XFsLFgZXSTj8RCAEBBAbQlF4KS4hkq9hWpcHUyiIWSC8BAEwbcRRUq7AcKoaAuQSzIq9a/9J3AUUNe1TPWcDWrjQVUIjzgKBEQuOrF6SQg4hRARNeqehQypoKyAF3CKkFQu0O7i3sA21GALg0LCcIhQCGQKU3Tb7MrEChAYRNguZvr+lkR7wNIt+B+Vdxgf2pPsEafqGp1B+nHDPnp7eg47TS2gx6BTmJQax+q6z8KWFKCYtwNDcaNZNwF8BsC1J0FAAnGtXEg6wAnkj5XnSeo4w3A+CgSCCM4awDbQ97nAPXPAqqN36C0wjHSTtnt5Om4mhIkA9JfgVyMQiymWQE3NoGESH39k6gBCcovko8ecyQ7She075AkGIFanX8lAbuMp/sYTxubY55+aQPQDqKCdAUD6KeRfCi9UYXqBhIJvSop/5BY6Oa2h7yVYISHJq79DAbSI0hSlsj/BKB2CImDqkzqv8+Rh0bUE9gntCMdUhJwAeDvmDL2xedQ6xrm2kdc24EajNde6sHdLKBYg/uHlD0OKTH4n8+KKmldSI62VpfT7Vu30+SNG2kFoN7a2ZUuXns6DV2+mK4htdB+QDsR46DcgWxKgO5TB6GYMRZ2Abxr4xhMs/LPGn0auHgpDZwbRCXsVFqgTJ9dx5aesRwf45CTSe7VJGNiGkI3j23D2vRUap6ZoqAVVtnxWtZ3Ng1cHoKkQRK4H+tIx1xR74NcSFwnuO46xGIKQ+4V7l0DhEZ1pUPct/azZ0O6187z3Mk9P8t4E9DOQa7HISRKmRYhAM0QgWaITDPEYktbB/poHYnNNoEYdzTeJsK7HrbCUxPPjIBaQh5GmNzzbfpDyaQG7HpG0/6iifojT
ojnqxFpAs1lfEiYmSPIe8dj1ElicZhPG4sRCSnHOupvEauC8jYhIDsAfNWmmnlG2slEVagw4Kae2m4ZM0bpoipWDbRbIiWBa6Uea4wj1cH0crbNtdpu7PK86KGskbpsS4poaxt1Ocrz5fPh/Ol96ua3xOIS6ndKiJRqqa7nfbMt/s37+MHz6NPEUZ6N8j2fcXTkc+4KaK+9A0qi+tzIo+RtXiU/Dte2fNzyPFRORAE5zb7jtcv4Yg3q0lWnSt5VTfcusF1V28q5Ulokql1YznK0HNvLJX+r8imHS7vK77faHxCLt+qhg/MHPXDQA/+x98AXRSzqJ8fHXxblt3sn47J34t03OXOublqOfin5erx8d4Ivv4NM+LvuE2pRAOBMLLLbWSUWQShin6UV6xWxUE1KYqEHKQEnvCDAuuQhJBa++HmhCl6VZISEwRe4L9jHiEUWwwMaNEwOYpElFjViweqw4NyWh8SCfJoAAJaViYWriflTTywaIRgab2dykclESCu4vj5+RX7h81Knb1lsraQWmVjocnYZgLEI2xC8SyAEeK6Uu+Jv9GaBn4afAgiBtxILpRjaWdzFKHeDNAK6Q4CMRfp8ImJAIPFhhbwRgKYrzC7Vh3C7eRJ7BL3/HAGJKmFwVVRC8xrEYhJQvQwoVgJyHBDUo2EwkotO0gtcp6wL4NWgfUoljG+hNOA8oPIyK+4aUAsuNWB2NAiiSIax+la6i669hsWj2IU8Ih/7sweAdA1ioPtM3awahV3DZYmGxtMPJSUA7Fus7mrIPQbgXGPleBfS0EQ/avQ7iI3CeaQPg7SrA0KiNyRV2Vy5nwIY23+qszRCoGYBuMPEYRjXqxPXnxk4k66cH0zvxrPTIdIY6foG/amh9xKA9DD93Y/UYQC1E/OXdEqMPvtgKt3BQHqBNN0QlGsQk4uQB1WAjHg9w30xGJ75SXjPIbHognhtUodXp+fTLcjF/MJKRFV+ZuB0GjqNO1b6TtKpKtNdrr3P+WWuP4I7r5OcM7Dbg+HhNHfzRlrF5qIRQN03dDVdunwpPXuO+lOOQNSggdpzDNOGEe5nrNhTzwaMlhPgXANjSdnVp66mIYjFUE9XmoI8WG/H6SAAX5WkKSQ3DwT5qAFNsXo+hQrWMuSiaWqSG7oKmMZmoP9seurqlTTQdzp1UkeN9He4/zoJOEZfLACOX5kmJsrUDEUDypFGtAHEj2GPcRyDb+M6GNW6gefvFONMEL/FA7BIuQvYKjwCzO9S54SkpOHhPIQRl6886xvES9kMo22C4R1GWuGzS+UlD7vco9joS7w/BKHapg0Sf+lqE8fCzSzpdyUikAPbvUs9NlGfY0SjGsW4hfzqGtk4Ekew+2hcWUrbjOEt+mqDNq5TD71SNSBpaYH4tVNeC/Zb5m2wO6UhSv8kuXow01D7MGnbyc8xatRv50mBt3MiAYkhINgYMU58/nZ5FpQchbtZ1J1Ud9SzWhALyJ+qUBILietRCGzx9EZTIs+8jx97c7RneCBrc3Y+HX/3jkF2Is3eyXgj1GfoQ23/+pS7Y/OQbcnH8vG9Y3HUBLE5l0fS/Kc66i5n5txR6mOOT6xzqUNVgVqWZpMLJjtz2r+Z15tupUEkKnV40/TVyQNi8cX00kGagx446IH/mHvgzYnFK9ndrA0oU6svByfV8qmfNOMYacve6/zu5sRc0sYk7cRbNzmXc5HWibz6+AKUEFi+x4q0oniGyhKLbLwdBtu8iLPrU6QXgB+jQ0sufOnuAhzIJIy3BfHZKxTEAiBRJBcRtA7wIKlw5bN4hvLdZXWtRyYWSEnwCrUREgtUHAAUO4BPQUAjID9WO3nBh0SC/IOwsNoYe8BBIRYSjgZIhWpQ1ilsKii/GdezRQ1KkpM9QpX+rKQWgI01mmTU7RW6U7ezy9pdCNSpayYQ2bZhAUnGIkBF2wPBt2DXmAy6UJ1A3eXVyZnoI111noQ8SCwMtDe3BMCBWBA6O/z6dwFCzuK55yruR3sA8kcB4azzAvJxZ0r6lzGyHp9dSEtc00JfnARQ9wI6NQjvxYZCsvOI+qmapY2IBtazlsMH2QrRp3H5Ctix7rqH1NOUZES1MonQBCvgD6ivKkrjgLNFQCVJWCFvx1Urqk0AU++ShtxGHHf1X5UmycEoJEHpzH3qNo9a1TbA2VV4icWZHmIwQE60szgBwDrMCr83XPUuQfaSpJHfqsE9ZAV9YuR+miFGwzr5GrjtaVb833NxIMiWq+ETkBbVyyZRIwpSovQI4PgUbeuMlfjt9Aqk4ubMQ4LYQSwgR08jcVDtyT4S/Bp/YoZzBsZTT/44ALUXqYeQ9xbSl9uoQ82wb+XeXyUi9RXsPc4Rb8JV5xXS20e6zJ3mHitJUdS1xqr9HG5it0bvIbVZYHmcmAZ4ZLp4+XJ6x6VzQSx85pRwjQNavd5gd0ZyXgGk79D2RuwTWGdPR1FFewpicfXc2TSEJEvjbG0R3E7Rj3qHmqfuYwDyyIf+ngLkL/NpXJhPDdTPMd+KZ6krl8+ncxBWyaFG8pLcDp6TAWxfhM33IBMjup3lswwhontSG16cjmPwrbRwmbyWULVSLewQ46AVghGu7Dm3icH2MjEvNqen0s5DbDUwvN8A8K8fIjbHSfzTA+yNti2o9mP/bTCodAxh3iiW400MT03MJ0orVEFSVUn7EL066U0KJB9SA+2stiGxu4xtbTIMtKfhdBskqY0b18x80UgfbtGvG4xd1Rm39UZFYL/D9Fk7+TczZ+0qBeX52aEOfjYZfxILDcT1DmVMC1W1XFRRaugzuMU4D/fQTFbaYUjsPM7jE3YXqjpp9K0xOKMhxtw5xpvP22nJSkUsWFaJa0jyxM05vX6+LomcGyPj6gDJnG5rW8k10tWlyTU0rWSkJHe+L9/dU2btJ+nK96rIvevKiVyVnEe+Ml9T5eKJ6iKP19pTX+m6TGvnSft4+2t1qYqOEuorX1eOSerzqi6p7Q6IRa0rDr4c9MBBD/w+7YG3IBZ7AfJsX54feT3wpRAGj5eJshxzon38u2mccMuk676k4auZ7L04/F59dK0qwPR3iPBBFIVcZM9QqkLtEYs9lahMLDK5qIgFL/F4CatqJHjXxoIXdRg7Btj3GCRAYsGKZSYWgvqq7eytdxCLHYgFq781YiH44KUteWkA4AuEXfGuSSrMlzJy5G1JjWpQ2l2g88y5QiyCUFC+9cueoTLJsRL2VxhwWweAlSvTa+BdCcUK5EID7lXaqNqO98DVZ9u2zMroAgBJj00ncGd52HKp2yOAiIB+ilXpG6j2CFh7AabnUe1ZAbQMA4pHIAorrL7vIrHA92XqBoScQ2XmnayQ9wHgjqoSxXVKD8YAVJ+HVGhoO4fKURP35ThE4dQpiIU2CACnboC+qiKlnrqElViMATp143qKVXDBjl6e8gqqfaknL8EermddARe0s9o9RXka5y5w/WEAajfSkeMAPNWYtBXpg8i4ci5AcwV/AoIwAhEwvsIsKjVb5NVMR3agTnUWwjSIrvlZgLuSDu0zNEzXxmSScmyf0dBd/Z0BYE+P
jhEHgT6DrFy8eD49PXQxPY9XpA6uFZjPclxgP0YfanwuODbfZyFY/RU5uA0puIVRutGxTwA+n4HYnHfFHSYl6dO4PYgFeUkUlMIYc0Cp3S1Wve9xz2bJQ93/C2f60pCG5F2AbcphxAZplJRMcd/mSPOQtszT7nmCw6UH9yEIGB/TV70XLqXLV66kF4glIel07GibssCqvIRMT1CTjJFZyOUSAH+H61TD0QvTU0NcS9nW+xHtMLCdsRc03I84KHyf59lQKjYMORnlM4/kYgf1pF1IXhMd04jk4Dx59GK7owvUB9rQKGXhnp/BBkaPag8Zv3OA+QVUf5apwyLAHAMNSCGgnnGuB6QFjNpdCAjXq6hJHTtKHfjdACFYICje2sw09hVILHi2N7BXkFjsQip2UUPTOxPKehCL7Nxhk04wPs421+7Sf4VYNEgMIFxKFHZVnTLuBXVo4Flo5FnzwdyiLjsQgYC/qCAZM0VvTbqdbUIlqwGihR/YCGS3ybO5jbSqCVe22k60Kwmhv7Yoh8c6z4PkaaDPbcrTFe4RiKkSLG5djFGfP++a40ApnyWr+qRdjM+N85njuZULXJBxPnV+C+cKPB8ab0sEvfchsYjRQ7+xfztbbQaPST7PmTwKb7rZBseqm3N8/h2/+B2Hqz97xGLvW7w2aon2rq0dijys115bqkzN/EkXeGl9wXVprJ+b83D5Hgc8Vn3ZV+WSD+nLeZNFmnKuuq7sDohF6YmD/UEPHPTA79ceeFNi8duvQCxQ+XAijY+tLN/Zu5UJNr8U9qZPv8WnShdp/cNmWidX86xt1URbn5/fg1hIJrymkIrYZzWoEP8HsdBQtfIOpStYVv3WeDlnOwteygCzXQCttgeNrFhmYsGLVGJRfTSsVYqxRywkFbRdcBKNyfV2FX0LYhHG24Cd8AxFWVltAVBP4vDoJLkwvyAt+4lFJjCZWAhIJBhNSikAyFE+14YxN/sssfDlZD1yz9KUUI8wxoKkYgXjbaNv6/PeVUtThb0IYOQRIOcRwMygb9pcuHKpepRgb5rV/xlA4zBqNRp8niP68rODfWmTOt0DFF9H5WYOYqE6FJ0Zrl1VGXrXYG+Af71DScCMRq1rUYnFLVSExiZnAI+rsQrb2QVg7sFzE4ThDCvSza4YU2/r6apqGDxDEF7FfkLpRC/qUEOU0Q/YUfKgFxvvgYB1HDIwBrGw7hGzA+JjIDdB1QnA2QlAknEhtCk4xbVDqBa5AhuG46iojAESHwDqZtVNB6ypGS8hOUeZgwC7PlRIBOYGU1qiz+4hddCAWZUxVdoch0aEnoVYrE7TZ6xMXx1CjQhi8TSqSJIhx622DaPUU6nPKGUacVnpztP0wTnIj16HwnUr0gBtJvRG5fWSKleZde+rXYl9OoW9im53VWuTXKjqdRNiMQmQfki/qdrSi6eWi1z/LP3cG164ANv0lxKpGfLQSH8UID8BaJ+bnkoNEw9Sg+pB1O304Pk0NDSU3vPs00GMJHJ6dFJCZB20k7hLmyV/05CLVchJM/19HPB6ceBsuOs1kvMiRFXyqlewiHBOPxr13fs8b19S/l36fYpxYXwFAi1kF7Lcg9NIDjohJwZxs4w17lELfXsS4NsGmdpmnDDMA+yv8szNQl5d8fdZaWa1fZuxvYILX12w4tIsNSN1Ow6JE4QzFHBTTPR3PjsYrzsPGN19E6mFxEBXsaoYNdGP9r1ExngULGWgSoT0gLru+HyTdyMRtyNuBf2iR6ddYoHAYFBjIgaIzzBzg17LtugfyaSqUA3k18TYaWbO0KNVA/mokqSE03lpEzuTFvI4wnhvo5xd+kX1Sq/z2XdTnVMio4H3YerqPLbD/Y1nnTlBKYu/Jb7OT8bF0B5HYtHKmDHmjBIOF2XM0edOkuozpl2KcWAMeqnNk+edZpxvyuaY3jdfVyc8VubsDJn3X1Ouf/0+4HVdnjmfmN5IXB/MLpdRCoyT+7KjarHla/fK92CuW5UgUlXfvagUFsfr/pQMy6HH0pnnXptzoiiVdKX0KMV0nLb+5bzX1dJU583BY577zm/9Nn/+nmy+O8s9Lfv6gnO/7y3o/YfyCjWxMs/7SLutg+2gB/7j7YF11N2/HNvJ1mMEN8ZWkLn4y7Vpf/ykOeGLrc/bJxbm7ERZN9nGZFn3u0yi7uNTnXOSrU3Idek5WDvu+ZImyAS/3ZumSCw0no4XKBOjwe8iloUifj7FK9QGgEBiEb8BNZlY8Konq5Am8JIW3Aep4CVc1KFqxIIXcVGDUkpQquurIogFdhYSC73LbKJ3vskqo8GqVIVSbSmMsHl5B7Hwhc3H8rLrWYFrkVgIOgDnfJoaPa6EovrE90wsVK2yN+13/0kswBEBOCUWS3iH0jRbwlCtlQLSuYb089RLdRxVIPoBE6p9SUhuAcjvYxMxwX4ewCmgvtB9PL0T70Susk5iY/G5qfk0zqr4Mmn06X+c68+yivzcmVPELCDCtcCfegqABL8aLX9hdDK9en+cAGLYMVBWu1IO4zSc6iK6dQdgJ6/Ia8Rs/VyNnwOM3YRYrAC6jEA8NNAXQesGWN3Xb79gV5IpeBWw3wPsjgDepyhvDsAr+OsBSJ7pPhk2Gm0MPFVPugGqEilB8qzXArDHALauxNuJqlydJo0Rty8AsvoBWHpPcqBo+zAKqRihLMlI2LbQxjmA9RTqRMvo/R/hXj11ZSg9d/Vyemd/T9TVVeF56qbRslIcXQCr+uWq8RX6bAAQ6urxHUjFPSQ7j8hfWxXJVB9t8E47ziRfksJRyNQCfX8E4KrHKMf/Hfp2ASmShKodYGxMjdOA0/OQFqUuSk58Rh6x8q8q1xRgX1I1TdsfYfxsHIhNPmtIH05qwH12IL1w5XIYl9vXq9gkORZVbFLadYs6agB/H4JhwDhJ5mHGYK82N6zYG5TO+6nEYplznfTrKUB9H4RNRUYdB9yDVIxw7+z7ZZ4XSc0u5KCRcXaUfj8ksKb+ixBdo10LwFsZb7p4beDTBKg2ArrtWkYlahOigjiKyN3Hose2UDHaxn5mFxJnbI82JCrNAn6kIGvMBy4ANGC8DkpH7c/FCgYJz74G0aoY2abQ2+e3we0aZCTUvRF7LUlQA5IqRCVZPYkFgJ2jRurOAfWM3u4iRYxt0rrIYSwPyYDPBiYuIcFrQgKCmDPmAsefcWA2uK6J9h2izBYkObqv1b7C6N1NfJRoukX8HZ4FnSVwYcx9qlIxYcRCifNmniOzg4o8zzWEPYaLJkryNPSWYPhM6b3L4Hj99PlJ2hFey0gT8x3lOc+UrczJ9fN+OVf2JU35Xb8n29pW0rmP+az+ZJWqpKld9LovtLtWPzP39/6N7NnK8bLfn+aJv7jwSalL2+3jJ23WoqTxfGnDvmN1F5bzdYfSd37oW+t//q5+/3IRi7/+Kz+Q/sXNj/2utu0g84Me+P3aA//JuQ+kv3L+gyz0sPD1Zdr6UDF2cepL3b4oYmHm8QKoSomJ8rGXQZk8y/vjdemra51Ma5O2eeTZP86WiTbSmI5PIRfxnQk9jLfZOymq4vI4sVB
SUfvwAl8DwOglSgNhV/TcBNy+dIuNRRsdKMGQXBRiIamQMeZ99ZKluoKkIrHYhFgUchFB8gATEgsuCrWZMPQEnIV0IvLWtqIiF65u8lEdKkgFQCXbWCgBQKIiuaiIRTEiN9/crxWxoH+MwL1EkUuQC6Nv6z3G/5lUZHUpV9t1zypM6gXAtFLWCn34BdRwbmIIPQap2ARU9aA6dB7S8OwpYinQJw8Bta9wfgQphN51NARVGtAFCBw6fTJcybrC7+q0Rst0WEgSXsGF6sv3HqQHEAWlDK0AxgHUp57GJerVUycjD9W4tAWxHrMA33EA/D2kJrpzbWFcDA32h0H2BYCpgd7cJBbeQdsj0H8Vb0WjtGEa0iOxkPBcZtV+CPLSQh8IwDWiFyDbb0vURXB9H0mMqlSOhyMc7wVgDSEpUKLiar+ru463RdKqiqR05D6AeJk+VNVuGfA6PXwX9ZqpGDcDFy6moUsX0rMQMgMLCszW6DulJAJqXewapM1xr6ejLkCo4OQGUocJVt7XAMjadVzEs5RRwNe5zjxcbVZlTZWqCeqsFMMx6tif4lo9/ng/jLh93PtAOzR4Pg2Q7iA/+0v7EvvLoIR649LAfpXnYcdgbRCMRe7rUWI59CMxeMdAfwRI5DaG6pjrNdpmKPkYpaxx+sJ8jOqstKeZNh7juWlHrcg4F6qqlUCJBlDUpsQo4D6n84DlEQjgDP0Xhvvcm13ySpCuBvrX+2XsFOM0rFLWDuMhYlV47+gvjZubcM3aQDnc0Ag0pwRBY+lGVtwDY2pcjZqVpO0QxKIZwtXItXp6UhltU4cOLAIo1ciqTdBwnsdd6hrkgrziJvF8NHCfQmLA2FM60MS1zeS9DbGSOG7R5h1sI+I6ylOG4L1ROqmjiPhwH00rgQhvdCxGNHLMQJS2NeZMzq1xvV6dVMMyfoV2HRIOy5fg64pWCasE3o/qXY3kK6lUZU5bDhJE1ZXGOEfSa3muIK0R4pX6SSzMV+mGak8GUtR7lx/VonSP7JjzaSNHc+Bv3nweyhb1Lj+qff35x07lzDhYcjOrnD7nGXNaOUm6uqJel1U+sFeXvVz3J815lHRlvz/NG/1607ZUFz0pTWlCrX+oRK1k71Hd9qTrPX1ALOo66eDrQQ/8AeyBPzDEwkkwJn9uckyadZNkmUTjfHXcKbR+Gi1pHCNvNKGWCTjOOyHzkVQ8mVgA7n258wLXvkJ2FFILfksisoeoPRsLj2XPUMBS8gxiwYtYVaFCKjKxAPhrjMxxgV0B9LlttglAz5JmtrEQZKonjZGl/ukBQ0oswuVsda3EQFCbPUNJWsw/k4tm1TWCXGSJhaQi7CwAOqpEZakFL/kgGXvSk1wL6kFfwilqxGIR/KKNxY6khvINsmbfaW8xB2BylVf9aVWhXL1cAWB8buZRuo7x8D2IRTNpNZw9h7HsVQC2kXgF/TdRBRqeWwwvTAKZFtp/DNDaR5pBVFcGWSU3sJrSDmMFLAA2b2KU/HkkFq9BLly9bqHMXgxzn8Ul6jMYfWtnESu71EG3qPcA7beRnIwSZ0JvUvL0obP96RnIyyU8IVlvjaZV+xBou4ouYP8CalC3qf84+x3KMcbG0/3d6XmMoAVRK4JYSIub0gbd245DNEcAsqo3bXBNO3kaR2OIa3Q3e5K6ahugka4qPRpta5+hhEQj6AUA8hbGw3N3bqdFYjlICjv7z6b+wXNR5+MAYFVPJKDZWiEDcz1yCdCVWgjeVBm6AamYJYjbLmNHFbFLg2dSZ2dnGHMrM9GYVgN5DcBH6Zcx7oWSJsf3EupCbex1YfsUHqkMfFZWnJVqKImh1UTRzvFFxmizMU5iLAh2+biq/oiyBcXathjoT4mNZMz4Ew+530paNEafsy+5BgoXREEbggZISwPntVOJtkp8GVs+P10A827qdKq9LS1DRFQ9G0FCswbQ3WBy0DVqBJTknEbc2hUIvgX6sQAAaTf6tM9rA8/KLnYIDRo5QxgS7VPNTmNqo3Gj78S8RHN5FlsZJwYq7CStkjlVkJQa7OCRSUK1APlYQuqgC2UlA2EnwViXsER96N/YgmwgxeBeNmFHZBTuJgE/dQrbLp7XMNqmDGNqNGxAYvmudMXYKw2kZ9Uj1KeKZyelLTGvUFmD8Lm40Uhx2vKEpJM+3obwCf4NshdSSsaAdhW619VeSg9s2p5YH+vrtc4D2xyzE8Iejfvh/Ml/2mdQRhdOIC7cH6UoEoujtOsU90dSoc2Pkj3HjNIMuzJ//Ju3Mm/Xz+X156L/OWCZtW3v8rpDWeUp55cTl2trifiS88kZ7J3P6UsZe8dJ78VKoNj03ueW0+XvdQfiawyYkpFH6jIrbc0Jv4i/5GMp9c2NfqryLzUo+305louqkwfEYl/vHPw46IE/cD3wB49YMPnWv1ge/15+O1eW7/UTuMfLpF32jhrT1ibdapL2fNhUsA/xvr/97guVfRALQQ8v7xq5kFjwco6PBANwEtILvutlxmtc0YtVbMC70oM2XrCH+EgsQppQAaNCKsKAu2p31J8/yD4AW5KYLLHYQh1KYiGAEBT5albSEaoIlJHtLHJ5uQxJRV7drMWwMB1gJdSi6ohFgA/JAmCg1qfkL7EI422+LIMTF8BXEgvdZApcBSESC9WTHgKIlbLontJo0BIqjaf14HQdiYQqOQaWcwXTgHZXUC1Rl9819lsA8LtIBvTqI4FztVW1jhOquaB+cx67hvOAaV3JhrEuZep16VVIwssjE+Hi1rgdPXgNErw/haqV0biN56AO+DL9dRvA/BK2HDfuT0aU43bq+dS5/vQc6lOXITrqfS/Tt4JxV9AlJeK/UcDlLcjRPVS1vOeXUYN6HhWt5yjDVVe9Ks0CypWMSLAMczcOuFYVytVzdeEPM376aesF1LT6MfgVnBtHQ0lAiPO4TlIwhlThHipJk6xYq6u/OnwnrU+MszK8m9q7eiAXA+nMufPpMNISoqGFhKhDgE1ePfwW8wgCVwHk2rtMAaZvQSwe4T61ld9DGG4/g1emHuxR9KSkVyhXlc9yP1QvmqKu96izalkT3AtV146SXz/34Nr5ASI2432IflHaoZqLeviubmtjoQH3MJKJGWw1dAcrMPY+Woa2FI6rTlb2dWcbXqkAsxIygxgqaVEVSkPyZa+hYyWpW6gb7dIWHoJw86rb3kbK9FnqYGxohG0U6XZW/B8ilZhhtX8YMmQshh3ubzzvAHIeTAYwtha0TYNmdBZZqQfom4YVfvPfVYKALUMLpKsJCZbnDVq3Aynh4UeCkVWFaBhjI6VOJBWnUM+SKEsyJHPa7qxARGYgFKOosM2iSrWkFM65h/yVgjCZhLqfkrkIjsl1rAQEcYn4FfSv5EWCrUpRMqCe1zEmJThRb4hPVjtinkFHt1HVJuqpzcW2ZMW2VZKI8ERHOyUg2lXozlayG25ltc2guB3KlPwYJ8O+PcT4PEa9qFXYaYQtDMUrtXCu8sGQcMRiC/dJ97MtXHeYj26bqXg8P44PyV8/qlBnIRfZVTNzIHlQo9icx8o8vTf3VCerneet59
62N5dnuF1/hu/Up+Tpj7iWY3Eikpbrc6b78/baSFT3pxwwff5ef00pq1Rx7/pyXb6qvn3lmrpCqCc5VBe/7nwpsJznQst7qzzr8y/f/yAYb/9Xv/Z/p39551dLkw/2Bz1w0AN1PfDnBr42fd+59xPf68unCtXb2xuL4HXVeltfAzuxWNiuY5S6jcXD3d1ivO1E6iRZmyyr7/mtkCdRv5c05hOTat2EWyZ2z5WJueQbx/zjRllO+Z6rEQu+Sy489iRiES9RXs4atfopZELD7fxBYsHLtkYsaIngWwlCIRWuMgcgAPhkO4cM5mPVsNZeKkZDJBauoK8hsdAr1B6xQOEiiIVqz6pbZXWmiGPBSz2Tiqz+pORinyqUKlEAHN3dhuTCa8kjPhKLqn+jb6iGxMKVyhX+LEIqFsRgQAKDfCmt0COUaSQWAkPBoCpGJwA3rvob8+ILSAluoEp0D512X+fqXPcD3K5CLFRvsqxbAlrIhZ6YWO7E6FjXlTmKcw+rnGcFJuw1PhbwW55elyQrN7HP0EvNEco8DWg9h8qKRtK9ABqNnDUgNfr2A4Cv3pFeujcGYF4MVY+ncZ/6TB8RrSUW9MMywE0jb0mCAFqgKCkxCvUwBEm3uH3YLzwFeXkOdSvVOjZIq03BPCDZiNULpJdYqEY1RZvWWblup2/OobJzGU9Y/RATx6nemyRWrjpbfyNRBzi3HwCjjyAWa6MjaVNiQVyCY9gonDwzmE4PDWHIeyytcy9XAbkG/TtDWy8g1TGat0RwibpMU/4Yed1GnUnvWa2M2csYXT9/AXLS1ZnBP/dH25BeQJ+2MRKEce7BbQD4Xe6XnpOa6RNJyxXc/woS9bC1Sl4MhZDqSCDmIRNzPOATgHtJjappPmGukDsmJNqSYNWZDJJ28aTBDAG1HJdILJCfHplm6EcJkbFHlBgZZ0Iw7aeFgXKIe2z8iKOsxBt1W/uKHKBwNyQwBsiT3EAFon6xuEwZAeKpW4KwSSyUgujVKWwruA8blGsMB/y1psOQ0zbGWxP9alDBVdIL2l31d0xL1pUKtDN2e8jjIiRPGyBVjNw2IS1LzAn3pmbSBNGvVSczSnYQHZ4XSUwQFfqmBgq9lLZJMMLmgj7YZTyEdEM1K8amJC1+S3Aw6I4o2jwjzUhJmll0UGXLQHdb5CGBaeAZ36U854BY0OBJbaReuxCL9YpY7GpYTr01WKc2oa4VRthcL7Ew8rZSRtuk+pn3PAzE2TfyWxXRNfrS+DJ0TtwX1fQkJD53qkadoP6q/vkMO86OUh/HkE2OHnMCqLZaf5QD7Ms8TlX2bYwoz8b8kU/kjCpBQhyi6vRxVYA/qq1KyS/fJ+Xo3r4u6b4rco338vHakrZQlZJLOV5fx/r2lXaV9FGNugwfP1+r6F7Gcemb5Vnyfnz/B4FYPN7mg98HPXDQA19ZPfAlEYsyYZa9XeL3+t9OtnHMk49NuPnQ3ksgfvuHzUk7Ju5qX9Sh6omFq4Wh58xLVf1t1ZyUXGRised6NttXrLMoWhELyIDuYNU/DmLBC15iodGln/3EYg/Y2xbflfFuYS+52fr/2XvT2Mq29DxvcSaLM4tVnIpVrLnu1Pd2t4ZOq9WK1JIsq+VIsiU4gw1DMWwEkoLEyZ8gQPTDAfInQfwjEBAnkGDDCiIFhluyLHmQ225JLcnq9HTnGm5VsYpksTjUwHkm8zzfOuvwkLeu+t5uydDtnF11uM/Ze+017bXXft/1TXwkFtpZaMC9p9QC0KEqlO0u6laqhfjCVxIRxIJyirSiuJsNbzKcL8Qi1KIkFhVJRSEX0T/2ER+xlsHZBNdP+PJ0l9Vq0hvkT5UGyYU69upgC4xdnVZS42q8q8Er/H4DYnEHUD5DEDVfvpKJEQK4SSwGAB0CTonFfVb95wH/LVyjZyMNg1VPMrL3IGDyFIBFaYVERmNj1W6mXFknX6UNJ4mNcQZd+2FVY0graA+3lnYom6vhkpGv351Oc3j2AfGyCj+GWhPRpPHq5Mq79gUSC1WgHE+qOkkANZCeQuIxDSGxzRqifgRJh0BbIvWY8wa507vSHNfOArQF2UvU8QC3pZ3ozl9BKvDc+CixLAZQl8GOxFVh6qj0QrWUdY7pMlVbByOURywG1KC2cWGaHi3iiaovDY6dSUNXr6VtgOUabVZta4h+GqfNFyBUxsVQTWUVz1S6sLV/NJ5foN4sL2PU3p1ewV3tOWwsLNNx7xhVxcz7uEb7JRd3ICR3AOEGrjMwXCdlnQVAn+nOBtsSBwMCPubcI+7dslIK8jeSukDVHnf8aHMkcW8ibyVYqr51cy/HIWdD2Np0c0zwr/qWxOIe0gbJQZBU6mWcBphKBGHrBHj3I7nqBqQaNLGP8Sfh0TZmk2f0EdIFSdEk0gLrovTINnkfvdcNkK0GVuwlKYLiTtSeOpHE6CVplWs2Xe137NLONsaPD+IS6Veozzb92ATJUUrTSfo99m0QO+/fNfrS8alUSVJnu239DKpsU0iKpomAvc7xUCPiGT1gXBg523o18tvV/QbqL1BWpeiAMR7EwzEouaAfzNTgjvblAeU24OGpAYKm4TXDJ2wztCUxcrcEJm/k5XWkN3heexskg7Fh/Iod0tEb4ZVKEoIcJIiVThCsk8C/0/tFfVzPyvNeVldrhqzrhUsbIYnMOuVKZO27Lp6HXsainp/MQ1e0PRJ+pJSqvxkHRhVHn0tHSX4yK9Vld3R+d6En96YpyJ72lN+HV5Yez7nYi+Vc2XsNn8q1JQd/hsSokndOE3+jHjk/LytXeMQ8a3+XVJ7Jxy2/1MkjR6+3TO/L0TxKTWvP1abxeNlqj5dj5fyzzpU0tfs6sajtjfr3eg/Ue+DD2APvi1iUhjlJllejbxN/l4kzptfKJOvU7O84Xy5+xr52si3TucfKcfdBLHhpF2IRx/it9CHUoSQWgJVaYqFajGSi6m4WYHQoscgAuxCLIBW84CPyNi/dAP28eI8Gp6M1to3/B6yyWSdVi7Z2AQFBLDAIBTyF0SV1se1FYhHSCsB+NX5FEBh/Z4lPTW+lAABAAElEQVRFJhV+5xjpqqSiIrEIVShe9pKMADiCTvIXmEksVvmzQLTtefabuHHVc1A/oMb6m2YD0LVKX4TEAjAi0LItS/Tbq09Y/YZcGB1ZzKP7yXEkCtcA2v2u0tKQe4CtyZXs4rWV38aH0A5BYN8HiNNYuAvAYz013n0IOJwkP73/bABq+kkzClC9CPDVoNfgbx0AI6UO/OdeA165n7pcfWPqQbpPbIZ1AOMENhkvEQn6GrEZeiE5EiOJhfEhVClSgqOe+BpAUPeldwHoK5wXJGsrIJGRRKkqIsh+SJr7SDUecL0kYx2iILHooW+eQ8rxChKSM6dOhvtdV4EFaGfx0CRIk0Rqq7DIPZ6jXaoiPUJqsUwk57WFeQBZexrE3evY+QvpgLruca8cX0MQKleEBerNHLMNG5IbSI39cx1CZF6OTSNmv2i96X9x0Dblec9U4/HmAGUJLKjROsbkkAVjU2ySTxPAfBBCeKGfeCHkodRonvZJgu4iB
VKFTAmDq+jq2TuO1LPf4j4preggb42+uzkvsHSFX7W2XtJKRCUquoq9gyes+0gcFqi36kMy20ZIkqpGpxB3jlJ3VW0MwOj9VWLRhpSmSHsMtjcLoZKkSCTt4whYKcHht6pUWXUQaRHtGEAKZpTxJVbxtZURx/YBgI0l4j1/zPFl2qitRAvj3/r3Q86aAMiqZLl4IKHzWfDJsR2OZ+/LYzxDzUKUpiCxxifZ4ZjqSQeMV9WpfM60gYi5DnWmEr1acqItiN6s4kOdQlrhcfqY8NaQCspHQtVIX2qns8/4yobiSDpMx/zADQ+jc5h7SENasHFop88l5w30j/cH2UmQi5A++czT50pjJM+6hVZioUxR9cst6u2iivYgutftZAxKdjap5ypjRNsij/dBvFRBVBKm1ELy5X0aRbqjtMrnyfEmALc9zt9lLrZ5Za53gPrcHt1qD0RvHJ7mp8SrXC9pyPlyDX1U3aK86q/ql8itZFlbMOmpSs3G75KuetQyjiTKl3Ds6NF8wfH2Rnb+eVbiahlHrz3eb7XJYgxY7/fY6sTiPTqmfrjeA/Ue+ND0wPsmFnmCrbxwbB6ToxNoeVnEIf4E6PAHmy/m2vNxsOZP7SRe5u1yLPZM/oLjQipq94KuIBa8UG1EkVZIMiQVqkNliQUuZwEDSjZUDfEF56tTgJGjbSupUJrAyzb2WbIQUgJe4u59WdlySYUbcCGIhS5twyvUVg2xADS4CXwFSkFUJA2uTgJeaqUV+VgmGB4PGwsJheVCElRrKsSirOAJBoU+LhirArECoZBUPJRYAKdckZQUBNAnkSvduip1rc7Aa64k84qPgGmvQyo0Yo6o1wBSYxEY6Owqq8WCcu+lXpTuQiymIAuuuAs8lTx0018Six4+LMyGKoYG4bOAHFfVtUcQdOkF6SJSkEuAmiAg5Gkv5nsg0EPSwnVKFt4kkvUk5GLp8WOiaHfhGYpo0kR1HgIgCohVg1L6oLoPTQhg5DjRBsFI1wu4dpWkaPCsescI9VTqYNRq1YFmqJPxHOb5KLHYIiZBO9c+jxesj4yPYJDeG5IPwbherl4cOhlg2QoLbvUQpeRGtTCB7RKGvGsAVElWN+BsdGgIQ1kkJY4t2qUEqB87A6UOPj+73A/rLrExSvkt8jFGhCorA/TnJYDxKMDY61chSRtx3+gj6iKw1FZmkf7VqDqAJ+RDcCkJuMx9M9K45Qjgde06yf0NMExddFvqpqqewdbEc0qCVHsagFBaV2OSSPxUI9PzkKRKw+1F+k4PVpPEM3lIeyMIJPfMMXkK1a9RCNgliIWkwr7wMZHkqP6lfYqB8bRrWQQ0r9N+jaiNY7HLRwkftBkDf0gRdWhi7Cpt0tvYWe7jY0imkcmNp3HCMUkbVni2dU+sxGKHfTPkuZt+70dC0Ym0TY9LtlO7ESVsrsK7Zdsqxj7qiw+p0zRkSc9frugLyhvoT1XAtDPo4hkQfO9D5tatN+U7x+hymRvGJ7srjgqZOdcpgdDNrSpbzdxzicWWXp4gVZKLPa4/YC85OeAe2ZYDDNMbueetjNc22m0cDNXUlMgqSQk1J797Z+1f6qkaUxALbyJ57TCetyUk3M92+s1YIG20aZs6rtE/2lN53GCVen9SuiOBUH1xkDKHcDXr/VLapG2WpMwuic1KVrYyl5c5uhw/3JfE5eLDMyJzHtVKvuV89ACnqie4QDLjdfEnMojUtRXyVG0Wkco/HCzH41glj5zh4ZHjv+PM4Z8j1Tk8fDTrmuPP+mrJpZ9qq1Q9Vm3P0avrxOJof9R/1Xug3gMfvh74YMSC9sXLhUnRybL63Uny2GRdzpumdmI93kVloi2vkfLb/PwusZAMHJdcFGIhuQgDbl7avvgPo3BnV7ObSBSCWHDOawTmVWKBhCDiSvCidqU7iAarnRnkZ1JRiEWpt+tc/tPOYnsPScg2PutRg9I71B7AT8NO632UWBT1pyypyGQDQmNZRXJRkVZk+w5e7RViUSQf5aWepSVIK2iHcQ50NbsAzp5nT/guwF1z6gXgaMi5AZB1dVj7Ao2zu1F/GAC8KVmYZ7X5bQiAKiqrgCsBmGD8AivmFwE6EgtvnFGj7wDIJwHAgmKNvzW+HoZgaJisgXGsxAPu1MdXYnGXlXjtMgg7ls5DKq4BPCdYldUNqCvHK6QTN6iSoVtU1W00Tn7rwVy6T3yIpbk5AtxhuDx+Jp2fOBd2B+30k4aoBnsTnLrOKQgO/XnaI3nQbe0TV+Kpt3YcRvrW3amwMgygBbdcLzlRSvMEYtCIepxRwV/AePs07b+P5EMwrJHsK9h4uMJLdni3wpiZfjLat6pejwFygt0N7rlA0JXkfvrN/umm7sh7on2t9LlEQbCdx3MKYjDP9fepi4RoFQlEK2NzjOtP0089tHUZwBiG1oBHXf/aXo2gH5FWOwdV+FRncfW6j/F7mX4+TdmqqNwHcOvFaoq9IFTS7Dha4/o16r7Jvp20/YDNUVSfBgH13m9XsAWckgOBskRBEvuIuryFofltPFg9RH1on3ozQFMD15/GJuQsn+dx9avBu/YOEkCNt+0TVZ/egpTcx3j7kYSGf0r2Nun7TdTKJAAtuLtthdTpalavUGMQnQnG2BXA/SMI3BwfVeUsc5v+UAryhN/r3E9VjJoY5x1Ezu5FVWoAcqFDhnDzSp+5gNDGsyVZ0qOS42WO9sxBvGawsTFiN7NCuI21zdotIHNIpyEHShEODlDlIs3Tyv1QehG2GJTLZdSJe0sZSl1cCGhBFaoHlTolBJKEtSXsOHRTC7kQ/O8D9vUC1kh+FJX2eVYbcKPbRFubGH8tkLFGpTmMK59xyYgSLe+/5Mf4Gs4btkPVxGbnHfpnG6KChUpqgSR0Mo68F/aN0j/dJLdAeHoxaB/k+TZOTHiXIo0LBapDnea6Hr5LOlyUcMz7yfOx3/JiiW32HubGe0vyQoHnozsOT+UMPBEbV3mOLc9lFSlPXFXNzbORJgLklfS01S3KKpnEEf5U0rgrbxrrV1urSpKa9pSL373PtSpXHD2fa3H02Hv9Mofcd6VFOWU55q8yp/vd4/6uEwt7o77Ve6DeAx/mHnhfxKJ2ArSx5bf72u/lzVEmyegY01SuEWzXbmWSdR8TcTnpy8NjfIJQlO8SDD6Sidj7HWAomciSiuwhalt7C44VUqFKlOeVWphnI8hTwKE9QthW8DIV7IfLWfZKLnIMidy+I9Xmh0DENcSdisRiOyQWrEgCHDSatO62WjuLqpRCiQX5hutZgEGOxu2xyvEKscgqWFlaoatZQUuQm+hF8V4mWzleACCOrlrCwnNpH2NlyEgr1wgI7U+jYavqsQboF4goZXD/FKAyRVAyA7c9YW/fDOBS0xXiC4BxjbEFmnpt0sWqxMIAabqG1XD0DKDpAmmVCGioS5WCvAjaZ8jvHuo9D0lPdIF0Rfe12EmoEmXfr3IP5gDHkiKJxSh5KGV4ABF5Y/pBWoBY7CzMIQ1pSaPj42n83EQ6DVjUK88W6Er7Ae1GLNMVcdujColt1hB8lba5si/pGKGOlwdw
O8o926IfVLdSgrNKex9R11lAlyBVjzjq47tKrh2B6lb2s3EtVA8LWwGuNzjfI/JW2mBMBwMPSqp0kyoU815JRKx7N2VKxCQHGp4L1gW3GhZru4JFQXrI9UpQrNcu/aYESFW109SjKaRj+V5rp6PNxjy2LrqelYjpeUkgqPpSF+2/CmBUOtTIdTewm3lHA2/A5knqr/tZy5+BHMzhQWsZoCwIHyO43SUIgeAykwI8GdEGmZkkvJMxocqa9+dVVNRuIFGamX4IOKb2jOWDnl5ANCpgELPnII+DECCJhbYgSkF0l0ux6TUIyS2M+SWxrR7Do9ouxGKD+ih668A1bOfpwVjtlzxql2LAwueo4xrSCUnnHdSwdPurtGKd53uVMbYlKeA+BvmGHHUgPRmAoBhTQ9lGVrkKSI6kDnU12tlP+XoTm1aKQp46XqDI8NZEwwHruFO2Dqi29UPyDJb4AMnWPOBdo3snTIP6haG5fYXR+wHloRcZ5LGJwJd9OAHoQgWug9PrK8shGZNcGP/Decm5Sylftr/iOYMUNXHfDQBI56AHx1wiEeE5CakOv22F8TaMEi65UMUy4nXQH4glIligCwkGEmzXRoWxpJRkg3MSPd30dtM3p+nTLgiTUhylhT7nqkONQ0aUXtl3IXXixjlb+5y5xRxemYPsJ58P722c40sk45jp48rKuZyAv5Vz8Zs/1fdGOWCSmu85fT6SyymFlUT+5nz1okodOByufiNZfq+UK/Kh6gVHDh/9YSZWuVJ+5eQzryRNaUvJI9LZXg+YR+moyu+Sjy2obqYh7f8fIm9X21z/Uu+Beg98W/bAt0Qs7JEj5OJ4F1UmVCfQ+NROsJW0ZfJ2si0TbkzGTsx8nkUsCrkoUgsbUchESCyqxAKpBRILA24JniUkWWKhPjcvZsC8q3/ZqFqjz8PfRQUpu5utVJb6294gFgCALQx/t4lhEcbbkheATyEWRWKhelMQCvIuJEP3q1lSkklFGHEXYgGIzIQikwvVTWpfXOCeUG+RWKg7vQEI3ACmoBCSdiEW0S4BDx2uRxiNqQXTGjKrimR7FgG6k4CW+wCdJQARHQO4aE3nAHTnAdkaZQvMJRZKACYBdrcgCgLGTsD8KODD1eQxALBgxMIEcer2TwKSp1gNfgr4HqK8qwDOi4B7db6ts3XRTsD0XqfhqC5kBfRvTs2k5ZmZ1IDtwgB1mTh3Nl08fz6iO+vWU6Nfr5NYuJrr+BCgqcuvKo/1BVtBLFT9Qq+cNuijX9LRSFoBvqBcj00GjVPysMBeadV5JBujtMXVYYmFRq+u6qqjr5eodgAp2mYQkV0C3mGnwfnHfN+E2DpeyZZ7of0OxrX0fz9lOj6Vrjymr5W6SSyGWZVWl904Dg/I4yF5GRnbeBDeH8sTBPcB6k8gkQgyzr0zOrmqXPdw/atqWz8AcgjwKrHzDpyjncPYcggoJRW3kLwYH2RAMA0Q7eX4FC6Ap+aROuCJ6gRqMUOsyl88TaRyyKQA0z6y/+xfwai2FuG9C3D9NYIdvnl/Jk3evZ8OAKuqye1DLNoB4EMA6cuoQglQ5UN60NLmQULTz+cmtgzvoEZ1mzo1oxrWZJ8B1jfnFtUPSycwfj9BAMVmXMqqljRGfS8AdDWqF55pW3MfIqAEZgEisAKBXydI4U5xUeuqP0bzTRiQ9/b2pm76RkmOpFFipzRrBGA9Rh+N0WcGR9Ql8iQuZ7WNospZYsE9MSbHCe7lOdz+jkIs+4nDMYtkY5b6PmRcb0Iu9mifNhb7kIrESv8BeSuxaOQ6Y9D0AN67uRfWYQd7jm0CEW7SZ7rIleCqnmb8ExcXWslDiZTfG/Hj3EwHamexR1vXIT47lBdxMMgrAvnRN8a1ML3SpTbGVwN5b6uWx7O+7znqo+QBcWra4voIjqkkA4nFKVTFHH/OB84lElPVoLRHUYUw3EAzfoMo0y9l83mqbtTfPvMhy/Qjn6kcrSQ7ckUcK3O9P8p1JVVRM41MTWAZfI5slcS182G1Il5Sznsw6njsejM7nueRAo7+8OpSP88cye1YPqVOkaZy7l31r2Rfm09t/p6uE4tKJ9V39R6o98CHtgf+VIiFrXdirZ0ky0Rb2zOeL8ePTMCVRB6LPHwpVD7vl1gUiUUmGKgoASo2AP5KK9Tn9ndIOgB/gjWNaZVYhFoBwKSFF7GqUMXOIlYTfcECxKuSFipn/X2B7UEsJC3Z3Sy63pSzCwCyHF9eh8RC97VIKsg7SyskGofEIo7z2zTF1awrsKpVxMfy7B/+uPedJTCRVLBWyYcoxJCKnQbUP7iGCocuuQBXIG60aQEH2BugQ57sF/ktAbiHWo8g3L7WwHoC0HXWFU/arP69uuYrXH8fIHYLiYIrxp0cGwHcaIehxELPMt60VaQi6v/fBnQ9ABRtsFp6ibxUgzKQnh5qXHUPtSe+P4HQ6Jr2JADU9ujO9MaDh2nt4cPUjFH1SUDgtfPn0nMXJtI48SWaAdnaYugyVaCklygNs1X9Uc1LQOQKuWpH66QR0Es+BOfZw1OWmKiKJUhXbWqacfEAMrQGmj4FiJYwufqvKpRShCVAWTf5nQIwCzKVckmClHgYPVppkFILgXiQHvpAEAukQy0JQ1qIgivNi0FUtlGzAbDSbyfpa+ONaMAdkbC5XluQE9w/pUqSAL1naRgfboMpU7Wr+xCL6wQEXKSPlUQYzDAMtinDfhyEHEr09Dh1F9D9DippGuqq/ibof7j4JE0+XEx3kDw0AGh7AZlnhgYj2rrGu5JGV71t/xPqPEI9jeKt0bTE4rV7U+nmrTtExV4OFab9jq7UdvpUOsXnPBIHbSOU4Fh+tt8wLkpn2DNM4h73BuQiYjhoEA1Y33k4j83Cbmrr7kkdRB1vRXIh6R5iTI1DWiV7ksMtVIM0vr8N0H4AiH4CQN/AZewu5EI3tfus+CcD6EEqTpCHNgYSLGN6aEOxYluw2TlLey/g9vgx19+nLjfnFiIKd9iF0UdueqVq43MWYjGONGcESdAipGKWMflgaSltICnZZnwrKdlHhWkfAnTAXnW4kITyHHfRX7rdldDu69yBoHfaWmxx35RibTEOGnzmaWcb5aqWJilWOuSz18zCSKK9S3ivCiJD3qEGRXptUELaKbEg/w76Zk/SQj8Yt2WH8nUg4NgzJohSVKWDTSHRQSpBP9mnRjr3WbCfTvHcG61e0isRdJxrjxL/nDAqm5I5qh6TkH12eKakiJOVH4dn45o46rye59CcUUwdfK2kqF7CnGdCj1dOVb9U6hPzcLmscqwkdcY8cn0lX3f5eM6tWlwuyZPV95PVNb93pYkTpSR/HG65Tvlc/I02HJ732/Era/P3fJ1Y2Av1rd4D9R74MPfAN0UsSoOdSP24xffKCUF17QRa0jhxl+3wW3m5VM5U0vgCqJIKwIrf45jf+UgSisTC70VqYZRtjbclGBuoEGhfIchVkhDEgmutcahCAeRaXDH0RcunSiwkAgBrpRa+7Es73ftCdXVNreeqxAJVqB3Ayo4rjQCl8oIKAkMZxd1slVwAKixLgNB
E+SHRQIUiIm5Tbom6nQ22c1/a9mg/dQ9iQSsITQah0LBXUuEqOKua0fdG3gZg8B2sG6vqtlkwYZ2WOPgQYH0HoCOw1b2p6idKLEYAp0p1XNG0jwT/ruzfAVAJkFWZcdX3MkBNqYabEbpVu9KWQ2nEI9IbEOxFQNw1wJmejjRG9n5ZJ6NHK7nw431d5pwB4yZZ2V7GcNvVbD0VXRsfTS8SVfoyqjbaFLhJFvR0JUDTbsKVdesviAtDXfpPr1FLSCeUbihd0l+/Rqq95LHDavkSbZ8B1E9ST71ePSJtB3n30d5ByIluRG2Tdhi69RS0Gzgu+o90uvNV4qDalWpR8xIlytoClAus4TLpFPf2DORI4KaNwgyShg3AqOpKAwBPy7ijcTXptYs5jdcsJSSqoahOpMqOxEKgJ3lSvcv4FW/OLBCkby0C4j03coqVdVboISUa69oHGuA+ljDZNvrU/pKcGKPkEWB+iqjo79x/EPr+rYzDXuwjJBdnkV6MU1/tNzTmn8aj1EWA/Vnu4WliW7z2YCG9irTijRvvpD08Yhm34QBi0Ts6koZHhtPF0VOQOiKvU8/bqFsp+dB97Rnq57P5lOfjHqvqyqk0Zt5FKrUPsNeWoKENw+XhoXQCMN+tgTHgWWI1AnkyerfAU89fk1yv+tqCxPXBLCpzC4inKAtgj25XahoYwAgaQ2jK9ZnaZR7Qc9YG6mMnGYe2UemZxvkz2H3chMhuoeJktHqCcADEWXSgv3SRPIIUZlT1OPpgFWKwCJmaB+hvQG7Mc5sxuEtZ+4DxhF1FXMt1Lg4orTkhYUDqpNc4Vcd2KUd7DsergRp5+OPZN3idxEI1OiemTsaNcppW5q7HT5+G/ZP3xAB5Sjxd+FAa4Ufy0ObcR77rEAsJF6bh4fbWsdBkgEH6XsPvIDG0sQfJhWSTYc18mAmtfXypQiwMfFlcQZOEKw8JRHXOpj6q8+WNPb9jix1/PMQBRnJNEo9Egppd/h05Hf7JWUWepWxHQOVaM69utd+rBytfKldY15KsUs/IKY6XE9UqxRxbPcqF1e/kWmpQbW9NkeUd5xxdtvK9nPN4zOMlAfvoXwqJdwvX1m0sajqn/rXeA/Ue+FD2wJ9ILL721o10AzBRttoJ0mP+jk/+UZ2Eq5NxZUY/nNhN6MSbUxxOwR6vbEyuHo8JmO+FUPhbQlH7CXIRBCO7nNUTlDrM4XqWvcRCUqEq1B5EY4+XuvlZugBbIFRiWJQ4Fr64s/RAUsHLGbBX2uheaYX1C2JRcTe7vZklFjuUtUe5FJHLACw0F2IhUQEMhFtZQEVWhcrkoqoKhRpFuJuVXAh2yAWcU+0D6+6H9UwIBW5DUX3aa4QINAFMICYCJKBEXGPdA6yQh0Qk2sw5a6/3JoHwNCo6kgFd554BXAwDyHQna4yDIF7UwxV37QoeqN8PgNGmwdXvc6jPqKOtBEIAb6A+Va9UMVoG2Ort5wqg9ALSCr1NCfZtjxIgQad1cvVfr0OujmuTYbC7BbwOrQLCNJm+hEH1CwR/uwaxEMzbI9bHFVjLlVgofTBv2yeAtn47tEfpiTYfruJ2ca12BNpcCLgsW+B9O8gFhryUz8BJnZCOQYBeGwBsj2skBPpZOsl9GwdgKklQXakHIG897DvVxGa43v5UfUQi2sank3GjzYpkRtWlO4DtBaQ+JxgTXYJYxtIcxKIX1aBhpBjnALz2h21SfeckZEhpwQgkTkPWZdpitPDXiGYu6HfsvnB2OF3Cc5USFVXClDDpzSoie1MnyZN3XiCpqtgcoHgSu4ZbU7NpEyDqQFEffwxQP0E+2qPoqvkR9bwPAbkMoTiPK95R1KVen83E4rXrt9IugDeez86edPrMaDo3NpyeQ5XJZ+oJUp4bRFH3nurVqB8pQSs3bou+NRaIUitX8HchkPtz87BeXAR3dKaesdHUM3gy4lc0cG87yUvnABPcew3gH5HvPVbmZxmDjyAm6zMP0tb8fNpHrUtbj5aTXIuL4g5JBvdOu5BtCOoObd5lbPWcOpXGkKpcwLB6k3lh9tGj9A42PZIE1a+0XTDwnfewHbJ/GrI1BLFQPWgDdcenSCwecY0qWFu0zTG409WN1AKSaDA7xrQLAi4USLg7eY7a+EgsjHFjZO1NVKmUsm34bHN/lRq5qBHG+daBOktEtWNCuSot8Swsc61jXNsJyVLYRpDWse5Y7uCeNyGZ0J5iib5ZhSQrEemwfPpfMuyc4bwjgVGyp7cr6+v8aV0lFhPcJ22NdO3see+lz5Sf8je++of8HL+1W/zisBnHLg4cTROFxsmadHyl5UfyM/tnbiU7JmHHn9uxangkjquTF0n4U03D97jKEyZ7RjnlUORSvTBneeRvZF45Upuu5viRvCpJPVbq7qHad6jH68Si0lH1Xb0H6j3woe2Bb0Asbqabk+hU10yWToRutRNi5UDNS6hMqSVdpHBGrXxxl/OJvGsmZn9HKveVjy9Gv9eSivI9pBW8ICUTEgtVokItSolFqEGpClUhFqRzdduSBe6C5zDYZpWwEItMKnL07VBP4AUebbXG1DMTiyyxCLUrvELtADz0DLXLy13JiCv+JI6Xs16eQlJR2Rc7i2K0rdSiWWlFSEmypCQkFtTPmlpfV/oLqYj4FeS9i6RiT6lEIwAFl5UCGlfBPY9b/yifVop2+JF722/2rgBWEvAUlSHtDUxuLARXyHsAK0abFljo1cf6R9wAAKC68xwOT056XbIfNwA1K5EPYJ96qqpkeu/PeBfAuNhh0CUhAULdJuxcBER8rMsT8p5FOqCqz4wr0gCkHfIZg7xcYaX5eQCv5MDy7Fq9WqkWpQqSEgjJyQ5sRZUoVTgkDtpBzKDqtbCG1yCuUwKgh6pR8tQrzgL36h3I0p0gTKjKUW4z924AoKWr0SbA1ip1UEFGInGKa1xBNxig3rW8H6pC3QW8h80I4E/SYVTyWPGlnpK1PspVjekdAPAURMJgb6rgac+wQZkTgNfLQwPpIpG3NSjXle0sdRqm384i6VEdSFN8CYfk69W7D9Jt7CSMQ/Hc2VGibp+KmB1UM+6NdkJ6YlJtTWNp759gVTuRGYjJbQjDdYjFKqpABxCYVlbbzxAr5MLwqXQNt7tGfn6MLcR97B+s2wR1O0OgwjceLqTXJqfSq9dvpl3yUZrWNHAyTYyPpMuoMT0PEHczKN9NPEfdIw/d4qrzr/G7xujeK3whhRvUfdSK9ucXiEGxh5QCNZzxM6kP6YkqO48hmHsQw14A7nOU3UU/LkMG7gLMDS6oh6bVWQMUzqc9Ayr2D6ROiEPfyBBB+k5EIDrtMtYgAttIGVSXOnF6CLJwOp2lntpEzS8upslpCBbppLxKEIziHcSQvj81MpIGGXsGldtiHllZXkpPuWadiN2qNGnFE8QC2w7cUNEqKAl938hzqMqhqlBtSP+ytzjsOCAWOxAd1bO2WEA4oIwG1Zq4ptlnl/GgJyufOd3AOsY0vF4JsoDSo8SCcz4FDaTz+XTRQhsYAzp2MPY3SCu5WKbvWqiTjgLCgD7UPxmXpOujTN
UFeQhDYiGxkJiOIykaRJqhlEmCav5u+e/Rb9a1vAciUeUPt5h+4I+fuLBcnfde5xZZV77n9CW5hCGSPPtPyZM0OV0Naai9ohRLoiN1rfyOpJX2HS+wtvjjbSzZen1pS25MTeGVBhzJp/Z07bV8j3cLdTFv86wTi5rOqn+t90C9Bz6UPfBNEYsywcbLp0zQNL92Ii5pao/ZQ4fJD1edYhKumZD97SR7/FPIxPH9LuA24lhUiUX2dLTuah+kQjuLXSUWgmgAqdN4vJgBR0ooitQipBWAQ1/YWT0pSxxMa71tS5VY8GLO7mZZvaxILMLeArBmvW1/lCFhIM8sqcjfC6kox7LEIpML7StyDIsMXqvEgnoLqgVBu6TZh4zoA/+gCY8yQSwwYKV+Ghi7mir4lmDoZcfKCxbaaK+YwnT8hFBkYK9RtWBYY1NBkeTEzTaYVhCv3YLGsPwMoNgrICITgYH5bALsBbSCcW0g7AG98LjKr4qP5Mj+CNernBXA2VYcWoWdh/EhZpCgzAH0NC5/CsjuBFwpRXlB70yAJFdgBenLnLcs6+bKrUastsF+J+uoo+pJ98lzGmIhUeohzRWIxfOswiu5eEwedwHw2oRodL4GqUE/KIyueyEfHdgXmKekxI9KX3qw8tMPQLOHdOOpVye9ZdnnkojoD+vCeY3bjfPxFMB6BxuDCEYImdgF+DUBultpwwtE235+fCidR01H1aXrAPI7fMKjFe1+8SSGwPS1bVMa9LW7Mxi5P0yzSEAunRlJVyAFZ7G12IbYqPblSrkETlUxY2Q4YFWbUY/+MeD2LnYWX4ecPAb875BfJ+o8VyAHV8dOpytIJ3Y59hjicA9Dbz1P6ZHrHG5334LMvHZvOr1x83bapQ1tgNDB0bH04lmIxdBJgGmugxILo5O/w2r7NJIPY4Y00U966jlg7O3xLAig96lLE2l6uX/Dp5AkIPXoQr1ONaF3kI4sk4ek48Vzo+kkUi+eqnSDPtTwWrfGSiu2Afl7qDQ1owLVwadncBBiQRA97o8kdw3pwiaqUjsQiyY9WGGrM0g/72PvsILh9jzEJtxZ07fh0pp0SixUpxo8O556UYfq5H6vOY8sP00bSiywAdpm3ITEENuQA4jFPqv8uJFjoAD7WSgwQKDuZtu5dpe5Zx9SpK3FHvlvQ4p8nva1CQmpBepNgH2fNuc1nRyoGhf3kflsg/5cYyzreUpPUCoERswKVLEY1sT2gDRCOlRhUnKxgrTvKfddct0BSeuhP3yuXVTQzkp7I72oScCzhCSryenqWIcA3ZIdxo/POf+PbflIzG8mqGyO9dh8/vKXyt7dYbp8sEIGIml+TuK498DUlXyzGmjliiP5ml9tPXKa8rfMvdVivZY8vcJzfmKrqX+5Nvbl/JGDh/XycDUPvx9LFz9LHu9VRrmmpCu/2ddtLGo6o/613gP1HvhQ9sA3IBZHVaFsoRN/eVXkebP8qpyr/DTV8fP5+vjrn9iqEzOTbEz8HPVYmbzjWOWcL95MDip2Fvwux4r6kw0qHqLWkSQcEgtUflQZ4BrrJoBQXegoscgr9EottLEIguFLWRBsY3KDAsAJ43Q3u4OBeJFYqAqlnQWFRNuUiqhKdcQzlMQF4BBSDPWsBejsGy2T+hSDbcuy7UEsQBCqD5mrnz1JBaucDS24nmyGWPDdMrxGQCHIVd9flSftGax7O2BBwOPrW1IheHAF1Hw1PBb4a1dRQIc3oRCMars55vXeYlUsBCcSBOsdUhDArYa/IcEgP93JxuqtbSZ9BjestHKd6hZKhKxvjqi9HSvcjwGdTwCUuqTVTsOVWA3KjV7tCqzuapcoQyNppSuqOLnqugXQCnJB3TogJK6WK0mQXGzTSI2yL2BD8Dwg/Axgq6gx3SUa9zz5rXHfDJA2ACjtB6R1AbBsq5vEZJl8JCcGGBxGiiB1ingKkATHk25JNV5WWqN0QTU01af0tqTNie5WbwGCVWPaYOW9kTK7AaEfmRhLz40PpzOoorwDoXgTycYt0hibQte/Hzndl8Y4103eSplenZxJr92dTtfZj6Dacx5APjHMKrxjhTq54t0DuZD0qVKlBMn7pFG3akgPKOPrXDsNUdDmowfpzPOQm2tIBpRY+Hw8RepwHyNx02tcPAjBuQ+ov41NwvV37rKyT1/gwekMMUY+CvC/gEcovU8Z+VyjdwP03cUOZApyoG2JQdwOIPbxZJd+BXC3QrhOQb7OowZ1EXLSBtA21sXbGGYvUE90GYO0nIJYtJH/bSQZRvB+xL3aRXqwDzk4QNXpxOnTqRtS0YP7XEF6I31Hk2PFf5W+XuGzxzPWSv4nANsEwAgvSutIBE4g4WglvaB8nfyMB+EM0Y8EpQvVrHZW8jdQ3dpGFWrHSOvEWFGqs4v6k16x9pH4HJA33R/No9iYUyQVbYyzPdrJpICaFfMCBEWXsxuQWhhLGFlLRIyK7jOqCtg+55yTTiDtwMSf67ETU9rBfdWw3boaB0SJlWPe9vZRll7UurjnG5SxDIFZ5d6qYtXDGBrg2fH50/haIq5qnGTc51FJ4IBjmrpq16Q0L9zNOqptTGzOzX7JBzzu70ICPBPNz4lIFYk9fDRNHC7nyj5fXJ41y8jlVguPfMr7oDbPSnE5g0jlnzw/5apW8jhMGHNqJK00rpRSrU1N2mqWz/hS0v+J1x92YO6wSj7l2vhJebX9WCcWz+js+qF6D9R74EPVA3/KxCJeC9UXw7NeELWTaOmpmGiZYN1XiUTlZPU35wuJiD2gx305ZkOyGlSOYaFa1BrEQhuLIxILgKkF+SJvMu4DoLRILEItgRdv2EVU9oLfQiyi7rxJAtzzt0gsDJK348ufsnzpR2Ri6m/68LEP6A8yAfiMMo4Ri0aIQo62zaseMlJ5f9O2CrGgnRIApRXaURy0oIbUimpEK/EO+LQosaC+rv6brhCLNYCKRtUChdDHJo32CNbfuhk4zdewqkSqFal8pW98CYgvzLKaq4cfV1EFqxoDhzcmX4ikEbTaRyUfYz08Bhxqc6Bxs/YGgheBvSumlis4yl6nIBbUZ4O0rrArZQn7D66bBqA+WQfAkVcveQRwAiBpnGpaVYnOAL6GIR1KBfR8ZRvc+tuaw/haw/QgFtRZQKVq0iXUizSglgAoITBCuETGFXGJVT91VbdfdRRbZY6mnYIY7EAiJF6jAH37USNgCY7SAO0k9Ky0CDB9DKjzvp8DEAu4lVi8DWF4GzWkSSQBa4B0Izj3s8r/kYkz6SrAXl3+m6R5kzTvoN7TQpnDAM4XkCI8f2YojbLSLiN8/d5MevXmZPr69dupl2NjEIuL42MR50OStkO9NPjWjaiez1YZj5JTV6q9308gOa/en013UW3SbqOXleoXIBXPIZXQZSxFhAH2NMDe4H3r5Ocz8pTxPUc8inuoRvJApT69LJ2fgFiMpfPYQVie+anOpQvde0g1ZiAXM+SzD5DXSPsAVSjVhhS9NbCC3864G8EF6rULE+kieUiypynzbSQNs3i0UuVojH45CUHr56PhtpKteYjN/mI23G7gW
M/oaOqFZHX3IY2g3+17iat2VQZ/fELdN7YxcqYtgnevaUCKINAe4Dq9SLWix7OEMfgqZW9QhxOD2GsgsWgj9oPqTHqxOnj6OK0g5XD87+gJCi9UCaIS3tgY97qc1Vg6FiXIswVQb0wKVh9SE3Ux8J3RsNfpo7DpgJw0cF8kFj5LjdRtj3MSCImJHs0aKHsXYoTsKdJpmK772S3G4ibEQtezXUiP+hmDStIkIqsYuS8zrnXRrIeqYca8RML2+rzm55lnkJvtuNBr2CD1Dc9q3GvTWJ+Y73gGuIzNPx7Nxz0Wv0hLwjxvm6aS9vD6uCT+5Hz8aqJIyD5f69HjZeRjHI0LD9NbZL7OfTle9nEmzluv2CoFR4pSCc5VzuY0/D1yvnK05FqbthwzybuOl/zj5NHyPVR7baVjPRx9bTt/6rM/Fr//ffxxkS7f41z+8TI9Z53Ke3aLseXnJPZM9a3eA/UeqPfAe/XANyAW2cai9uI8VQoj+VYmbhJICzzmeSek/CkTVmWCrc3IayqTsJNtmcQ8Fp9y3omtZnIr3w2MVyY8J0gNs5VIaLytAeqmagS8lDXU9KN0wSi2pnN2t6a+QPWME3YWvKD1DOUqvivw2R6gsg+wkttkE1Q5CYkFgb6yxALjbQ1Sg1gAMKiPm20Kg07z5CWffddDLpROAKKaKNt9IzrX2SuMa4iBu2gzIAOgsSvYAIAJ6INUKJkgUnFTu95vOsE1mVhYjvUK0kP7DCbnCrcqIQLmMOSkHaoqrbo6yzltMiQLrQBQAWXpe20FJBhBGtirStNJPflaVYmyPmUTzEaPkolkRGnAAgDpATYOAu8m0nYBkuxfDdhdURb0Gn9BuYCSByNw6yLXehkfYgZCoRqNKlHWK/TL6UdL1WB1EPLwPGD+FMbPSmM0IA/VJSpp9GsNuJcB1bMAUfPEuj3sDIyboYG65GRyaTVNA8D0VKU3JUFWGHpzvfVx5dX+XKUud1jBNyr3U9pk3AtXnh1Pev+5Cii/hIGzBGMOUCr4td3jkg3IkPfvbaQQbyAluPVgPq1CIFQPGsIL0kvnz4R9g7rwt1FTuv5wMd0C9Ktu1EO9JliFf+XyWVSFhkJ68NbkdHrt5t305s07odpz5sxoeuHSBDr9uHqlzXOqwZC3ZNIAehJopVNBLOl743NcRyKgutES/duP2o/E4hI2HsazcDzYNw9pw10IkLYv0bcA43VW7Z9g8GzUaMfdwNBQeuncmXQFick53ApLtp6Qv+pKt5Au2F8P2O9DKvaVWJBveBPivjSgjtRB3UaQCrxw9XK6Sv/5LExCRq5zX3RZrCcxV+G9Lz2SSq5fot8XiAtxsLCYGjEiJ5R4asN+oh2JRTuqTgdKALg/klejaO/zjG4yH7iCrw3CDiB9nzo1kJflaZdhfIdWyliGPKwTKVv1KZ/RFqQy7bTLUdfI872PN6wV1K+0S9qj/ft4ooo4FpQneTBwXiN11taggb5vMr4FkbsbSa/am64VnCM2HTssBmjAfYCrXIM/ajehKh4PeqhItUD4OhEzRLlIZdYZg5IRo3M3MJ5UY9vmWdb2ooNjugQe4BmTyKzTt09R11I9q5Pzusz1/mvn4pKBDg50Vas0UecCkl8DGvZQH59zToc0I+Zkxj+NiGeQ2xVb3vmXT+WYz2XMCKR1i7OVc/6uHPYrm2mcTz1eiAtH/sT0cWHMqflb/pvzJT+/kEFNFrXJct08UqmIbXMr7534wR/rU91Ic+R3OVGTpuQTV9Ucj6SV+hzPo6T1eLm+7OsSi9LJ9X29B+o98GHtgW+KWAi48sbkXJmgmZI5lF8WTu9lonSfk7x7yq9OsHFlzsHJNj4cq36vHCtEQnIRZEJy4Use4OjvsLUIYoGbSV7SEgrdz26qrgTRsLHZHaw1dbVdg8ZsuN0GscjSBAAJL+QsaSjfJQiH9dfdrCB+R2KhZyjUDw6JBZF1BQmcVxWqEItGXvqZTEgyANGSCz6CINWflFRIzYK0AMQlTnpn8mPQNledD0jbiKSi6QSuT/l04O5TgKcqldcLgu1TP9pXbNFPqjnpPlPbBomC9gCqCa3QF1SuqiLltQImyYheabopSzLgSr/Gv+EXnzS6iNXGwXRx/zhme+0ey1AVR4mJEbLvo2ZksDlVZNxcMde9Zhv5eQ+5aamPss6xGq16kuNGKYfqMA8pQ4ISBIiS3HtO3/saWI8DND+CRGAQQiBQcvVWVSDbqLRBYqZrTz0yGXnc+AyuNEuY1FNXuqVKlXUdREXK1VoDh0l4etgrBaFqMVBctZ8G7F7H29HtxaW0BHjfQM1HMNkPAHzp3GhIFYbxguUq/wy2BUuATN23GqBPEGfQujeQRryJ29ZVDJfbqNsYxspKLC4M4dGI8mbJcxLS8Q7kYgl1FsF4NwBUO4MJXMt2sKr8JmpMb925l2buT7NyOJAunD2TXr56IdRdvCGqXS1QB1XSoLgAbI17s7qL+3XqrGTkjsSCduht6wLE4CzSiiGIhfdZL2Eaf09SnymAvpGvBcRbAO9tVvUbAMaS4hOsXF4+M5au4BHqKmpUjlFtX7x/BkmcQeowhR2FblaV5MHEos8Ey40YQ7eQx0kkA+cuXwrvVqpdWe6UbaAMiaHOELRvUX2HJwBQvok34uXUhMSiAcDNA572BgZTAwbczX39YRDdxP1znLUw9qXbzleSC70yraOOto9UIjxAaWiN0fcJyFsHKk8+c9vUbYP7s8O49floRhrQRuC6RupysLqc1iAljpldSQHEQ7BPQyAErP7SRw1IrBhcTC4816RhQDEmSUKfGvxuz4UO8t6H6KpCBaOxGCQoEhNUxriuEbLXgjQL32SpCenDvn3I+T3yM4K2xML4FM4LDdTBYIe9fPqYRlQ720BCJGk0rw4I5xDEIxvQUy16w4j1Eo0u5h+N+nVFnIkF/cF4yMQiz9vxjB/+oQAfiThw5JvHD4/6q3bLZ3zky1beC3mOz0cP3xOHZCOfObywvFdKPrGvZGyqw1n6MEVcXVu46XJhh4kq346kraSxju+1lfKOXPdeiWuO16avrUudWNR0Uv1rvQfqPfCh7IFvglgIf8tWIRAecPJlIvZf3uc05QVS+VUujH2Zrp24IwuOVslE7XfOl+Oh+sSLNQiFL1jASqzoSxoAgFliUUMsIBQR16ICUmxwjr7tC99VdIApIKRNIMKLvqyMN4Iqs+RCkpHbmd8zwhvqCUjYDWJB/hhv70oueKkLoIJYRJsogxe1dhYSCL+7Wh/fJRYQiiKpEP5IR2xLtIe27ABQwn0ue1Wc9DjTjGvOtm6AUGcvQBNyAbFoIK99Kpf70302ypZUaCgagF+QRDtcUVZVaQVQorqIq5YaV2+QbpFyjLTdDfA4SV2N6RAeauijIBbcJMG9we1UdYo6cUzJg4G9BC85YF6OFG0AOKNFzwOYjHptP+gOVVuJDeog+TtFGc8ZnRs9fqUzy6zCCkyNLQGUi/5poj4G8ltA5167hj7yGEOd6SqA6SSSC+ufx5mr8w1RV9XItBFQgiIp8XqB7hPA9Kp69PRv
2A9ADC6zWm+9NXAneYDYCCJGucVzlQTpdYjB6xCDO0odAKiNtKsfMP7S+fH0/PmxdAZXpnp00i3sY/bniE2hZycNa+9BNrSfeO0hqlCQhg7qdo42v3x2FGkH6jj0wxPye8C1t0k3x171o136egLQP4aHonba+/a9B+kuUoNVVuyNtXAZYvExiMUQHriUSmnfcQ8SM0V5utS1X4yrIHlz5d/AgTfI/z7E4inAvIXfw7hpHaKM06g3aRMCjg6SOQMhkFhMIXnYoM57EIt9CIHgWTLc3NufJpCYXJRY0Pbw+kRfG/tEA+t56j/F6v8T2rLB9wOONSJtaAKgN3GcBoXK0eD5iTR26mR4f1IdTvKqkb7uafVytMV4ELYGteA5O5CUYO/QQJ4UlbbwTrXb0wfQR7rAfWyivS2swishaSE/I7O7FLADSNer0/4jXN0yHvZ9BiEjGn6fgOR1oFrHwIBcIKUBzGtL4dMuEW40qB+A3UjWxovRxsL4FRpVM4lQPZ5g6hnEQnJBxQ7Iv4E8m1BRyjZJEAvmox1I5348PDz/jF0lGge0VRWzPYhMEypWrcSVaIOEYMWOpGQJQo19GM+/we9Mv09ZzncJO49ODOx7Sd/HfTtASqs3Ke2WLL89SAOG5BIWetHxrDqfARl1CqB0b7BCLCL+hvMVY8a0zncxp+SJhSNu/ijztbO22yERqAXJ5VykOJJHBvaRvw8cm7vDa/N8X37XAvtyLC6q/UMGpYhSq6Ony9l89L3yiVSVOkUHVDKprUNtvpFPtQ2HZZj+mWXkRkcWx9P4u64KVdu79e/1Hqj3wIexB94XscgvmMqkya5CH2rn3Wh7TKQkVv3CVG4182jldz7+XhO1x/3E6nvluxdWj/PC9oWaCUbe2wjVPzyubUVIKVwZjA+/fZnzInb1UyNv32K8zmN1WmKRyYXBpzSO1B4CElAjtQjDShpi+xogFOXfdlUVSgBo5O1svL1PfUgW6Yvko0gsNNIWTASpAKCoGrKH5x5JhepFgmvb4oq69d4GKEowJA7NqE20d0ImevFW092P/WdPSCwkKqxdRp/l9VkMd8nDaNiutqtTLYiQHCzRBwaPE7xJGgx4J/EI9SPAiIHzjCFwCuAxzGqmQFMXlRoaczikHbMAoPuoBKl6pBrSSKSThBDlGZUK77D5C24XyFMbBtVpaGoQGVdJn3D9I0hHFyu6lyEWE6z2a2hsOqNae70ZWXdBkBKSp9TbqNfqihsQ7gTl9wLIVeXoY+VYdSJXYm2n40Oj6yBr1Md4FjchBjem59IDPCKZxziA/gr2DS8hDdDQ3H73HjQDRC03DFwlG5X2CMjfgFi8fX8mPcXeYA+g2s44uTQOOYBcGM/BYIL3AfYr7C8jUblMgLhh9vfpCyNPv4k3J4lNB+VPoCr1UbwxXdb7EPfCiNBKO+4AxDWAXoDMaJ8xCJgUADo2p1CTmoNU7AM4R1hpvzQ2EsTiDMRB9S3r+oB+vQeJuol7W68XSNrHGqUrUXgVlawHD+bwjIRHJcjdSQLc9aOS1It0ZYD7rXqMqmHGL5mGWNwm9sUT+syV/IMNjLGpp9K2lv6T6TzE5jyuasexOzGwngbjSo66kCgpAZsEFM8Y+BASs88z0ox3pSbq3kjfHfQNplbUmLrGz4S9icbb1l/je5/lJcp5iDrWsqSE3w2A62aIRcsqqkpILRBLAcQ70y7G23tdPeEprZlnqZH6N9NWAfwB10iUBNT7SCM0wD4ghsaeUgPdVJ3oSi3YZrQj/emG6LUzljR0XoUAafS9CrHRo1Ujc0moJTEGjXFiuSQmhgXEgrHic9xIuib6dx8Slm1KIBfcY+vDLYh8bIfjjMEZ9h5NjNNm8ja9apq7xMZo4r62U5dW6nuAutc2JKdILEO8wTNsnBhVIw0M2IWtTR9OCfodq/TZCn1roEImsrDV0Djb9jPk2FdIBeNNtb9+7rPE4iSEpocxJqE+JBZe4x0pW+33cizPzfmXYJo6xeY8nuf/eJA5VvJyHvGY+9rsPRwlcNB/zwTmXsoWeZGuNk2pXWRvolzQ0UI8/h6b15tv9fpKusi3klecq630e+VVkybX61jtKueP1J9jdWJxtEPfuo+XuUcuLORR1NHWmC4OE5NoQLlzfav3QL0H/jz2wPsiFnmmZWKsTIbA4SMTemlYTJJMwP5zq500S5qyPzbNlsMxsTu5ez4+fq98qqpQvIzjOy9y96EyBICwMa7wF7sKV8QF5/7WReMO50wjqLQNzagjRDRbQHNWiZJc5E/YWAAaNcS0HZlcUKnKW0dVm6wKBfjHeFtioZGnXqGMSmxf2Q+ZWFQkFuYVn7LC7ks4u4iNWAwQgYjBIZmAAEko/G10Zu0AWtpxg9rtCuVg6uzpxzCTVWxWTRshRPaVi6ASMqdhV/znIQlKJ7oocwjAZAwGQbvndLca0Z1pezdAfBmwMwMQmwWQqPs+wLEhASZAQ1ArGXALN66sxk9BDCxTe4zTpBOQ9iE96AekKCVQzcg4ChICiZM2D97HMJCmHk+QPswColVfMzL3GQCYOvQaIIMXo73lJa8UQumLkocIfEYb9Gi0DIB3RbqP/MZZ0R9RNQqVKqUn3ibVrbLUQtsQVvIB9W9io/AOcRxUFxkfxhPSxHh6Bc9MAiqlHBIx77dEw7ZLuszLVfg7GHC/DbB/8+5UWsQ7kG5MtdG5iJ3BxfNn0wTxHBYgBK7wL/I5R7uuDPYRZG4g7A7eASS/xXHBtypZRjt/5fTJdBHQLyEyCJxxLO7xeUD7jMVhtPN2+gMFmNTMavoT6rCmKhAkdpx8r+By9mVIzRBAWpJonxlvRAnJdcjFAvfKsq70d4Wq0yb9/VXaMH3/QagF9dK4kXPjqQ9ioUSpl/GgrYhEQTUyJShvzy2mqdn59Bh3q0oLVLdpggS0D2E4jlvW88SGGIPYPKRMVcDsd+/FHuz6DqD4LtcvoH60D1hvXnqSmtmrNrQ/OJw6hkfSAF6hBpGWSJy00wh7HsbLOs/TJHYnj+hnFwaUGrTwrLWto4ZGX+3T3m2eh4Q61QHPgYPfvupkDHaiGrSK/cUafai9lQsCDZCShA0FkefC+5KxYA6wV1KdSNWjE6qCOX7oywMWDXT1uoSUxkjhiTIlQ0woSA4IItmNq1k/lKXEUPopeFcy0UB9NVjftS/IE5EHaThB25wVDkgHUwgS0koft9gWnjudDwSxQPrQzvPQgq2E5NWPY1JJq3PLPnMMs1hIMBLSjS4MzPsYawOoQunGV69QGtI3UW4nEjmDI3q9z5akQRKrsX0X7ejnuVUV6iRjUWJh5G/HfKEHeRbmAahsZFGz5R+OubzROvJ/9/asY7WpuL5kEYf9wTX5f23CfJbyIrnlPqO8XFp5C5n1kcyr+VXzqB7J1aitbb6ypiKlrWXPtbVtPuyLSpNKundnGqXWXuuBuipUdEv1z9/79fm0uIzK7AlVNA/SygbvE56DVy50pJ/+FA4ffJY+wPaF11cjr1cuMl/Ut3oP1Hvgz6QH3iexyODel6fTtZNh3jN5VufsfDy
/DGoe9pqJ36PV5DXNKanLEB69OAAAQABJREFURO/k7Pfa34KVWmJhGn8XtSEbkm0sKhILgIuAopAM1RokFqpOWelMLJBa+CLlI0DMHqIAkwDx8AzFC1ZiEfYVtjnanetlDSUW2+FyFpemxrKAwFSJBXWzj8KjlC9q8wG4hTtZgS952ZuCaFf+1ZdWOqHaR1Z/yiTI9tn2A0hOK6ukHa5Q9p/C4BRigUrUCdRJ1Hcnw5h4JRYoUEASdtI9AOYDQEsf5U8AmM6xIr1KWRKIBwD7XoDZCGBC0uF1C5IB+knD337OneSjhyi9DEkunNjD4w+2EwsALOd0SYvkQsmBIEXAsg5BekI+6vlrsyCQMc/SFt2hhpEvAHqRFWHtG3Q7qjqJ0gbdx+qFKhODLEXQXkI1nk3qoAHyE0Ds9MPFcA+qOtclAsVdRfJwFoIRxqn0tWRQI2qJmdcuUp7Gz2/fuZ8WMWAeZJX6IsTiI5cmUj/lC7qeUm+lHBILXdlad8mQbnvvo19/i9X+N7FxWCCewRYqJx2A0vPnzqYrYStxKgy9H0J4pgHEgwDUs5CGS8Mnw1vSPQDfTaUZ1MX7P8hq90vU9wIgWHUu1a3muF+z3J9pALHkQqmFgewaIYJN1G0LwKq6Syvj+BySAiUuL0IuTtKHRvWW8BpE7h736DrShnmNzSnrCsBzvPdEeI36GtKKe5AjXaiOImG6dPkiwekGAPLbIaXRu9Vz2F3oreshebxNm2/ianZWV6uPFoNYNAPkO0bPhCrWhdMSi47w5GTEbcnBBO1yvE/SZsnYA9SP9pBWtADsmwTqgOj908Ope3g4jY2OcD0qfoxxvVg55gKH00/29yzSjhX6o3EPF6p4k2pFahLSR+xPtgH3jagz6R1Jd8EavA+gMnYKQmc8h8dIHeZV4bLPIRYh6aB8hkTaxV0z4aiROiBlox/0ztahNyZIUkc7UgnmizWC2m0gNdjn0wghaeCe7FHWLs/hfj/G4sSxMDCec1gTkhYDJDZ5jyh3C2KjahOTC3qMPOv0i1IEPWO5uKEaWhvtbeEZkIQECcfbVBNkpJV6NFtPP9xv5yEaHXYYu9ynXcjMrsH5aL8BHcMrVCMSHNKvcM9WyK+F8dcNYdJmyE2XxPardhSqxUmcffaU+A3SfqNu++w5/8VCCtcY9T1vtjDPy3lirhz2GPnm7VnEwuuOgu/Ip1zCuXx9frdUSojy48Jn/CnlxZ77fXwrR5x/3UqetenKu6X2WE6bj8T8HO0q9S/15Hy1vYdXl/Q1zYqT1jFyoJ7uoy4kMl3OOfdNub5OLA771G//6+fmYyz81z9xKk5sbu+nf/KHSwTr3Eg/9l296VPPIzn8ANv/9GtzaWywJf2Nzwx8gKvqSes98OegB3bx1HjvXnqwsMS7gldJx0AaGT+Thk8i3Y6VIBZcn86nB1PTaX5ZOz4wQf9QGp84y/wO3tzDscfiJPhnMW014Enw3HPp+ZE/G4L9DYjFjRx5O6bBPEE6AeZ/gWXp7Tw9VqbP+G2a6ubEWvkd+9pzlUTxgiBddbKvSROg2nN8CrFQTaL8Fqz624bssKIZNha8rJVQbEIuNgD7kotQhwIohBoCINEa1hpvK7GImBYAW1+sfsLGAnCkHURpQ1SZi63rLh5fJBbbgB2lFga/ysQCEEMZvoCUWHht7AFOrlyaH6eCVKh6E8H9AI4SC6Usxa5iH/AVCe0P6iOxaJdYnJRYYPCr1AJwoQpX5EmrlGwosZgBfN9GJeYu4KaLY5cgFVcAsOuUO8kq9juAj0YIwDCgYgIgM4Au+Dp9aUwHJR8GLjuJCpJB9XohAgIRjZ8N0rYIaYGeZSNW6ihFcrVcoGJblVZIKlx51pWlAN0V+Wir4Ir0j6ifq/N6AfKIEoNeytOI208v4E1JhWBIac4jiJCeiiRKTylf0PhkbiFiDzTRZ+fw7nMN16cX8HCkL35tFkIliuuVWujp6DHXXZ9+mN4mgvQ0blNPQCaGUQO6evlCrJi7Uqvhs9INv59DPUv//qoGKSkJtSBUgq7fvZ8WKFvbnb5Tp9NFgP3l0SHUnoijwHXaj8xBQhxrGm4PAvy8XtWicG9L2yVK3ttr6Mef7+2MQIIauUtsNK6fAqCrVjXHfos67aHiIqiNeAi0twWweRrSchb7iGvjuHxFHeYk6kfmOUX628vr6Q0kFrp/VWJxEbB8FvCvq9LXsdGYujcTeWq/8crVi2kAdaAF7U8A8N3crxchaQyDUKW6R1veQfVren6BPn/ICvpaRIvuGKLdTGwanytxmhfIA5LXqZ9RuzWgVm3uDa8lRsTWylKoQTVxzEEmMO8bxHAcydEFPDNJKCQQ2orYbxpN30XKNEN5C7R/i+d4D3KgncYu/WIEkT3cLTdAJJTaOQ77sUMYAkyPnx6IwHKLrPZPUfY6e13GhrSE+yKZ2EV9qpFrCXARhs77RJeUDLXwLBjgTi9OAvUtDMX3kdY0EMVbl7JKLHZQSTzgWUTEke0eGLvaT2mn08C4QNyRdrl/B4xrV1WdbxzL2kiQeTyvGnS3QxYauV87jOt15gANvhshO61I/7S5OKAvJSTm3YTEqin6gDmC9ury1rYbm0Ni39OAVJBr1lGhMq8W1QORWPhs+Vx6P7XvDhsL8pNwKM1Q4iex8LsB8kJiUalzzD9c4+b1sdGOyC8OVI9GGw9/lcRlTx/YCWwx3+dvld9xNL6XP+Zf3SivbLncw9/leO3e/I9cz8nDMnPK2hxqSjpsYzXDZ9S7pj7VZJUvtfmWc7X1sazjdbGupV11YlF6Le+PEwuPrm3up//xVx+iEtWW/taPHLq+9bbMP8Uj4ZJONZrT8EB+9rxmi0ixj5Z30y/9q0fpdF9z+kuQEreh/sM0caD+p94Dfy57APxy8w/Tv/ni63h+ZIGItaLNTWxhr3xH+uQnXsZdO14Tn06l61/94/Tlt9FGQBrfyLtlr2kgnfvOz6TPfOdIal2ZTdc//xvpX0/zCmpCO6bvk+mv/ScfS/0NeODEy2QavwRWy4t632oXvE9iYTF5ynRiLKtZvknyKzNXI8P1yjEny3y4+jcm+/LCqJn8nWjjQ8pyTZmgg1gAyjwfxKLsOVaIRujGF2LBXlKhv+3iGco4FtleIUfkNi/gfhCLNsBvqEHxkg01KFYCw1VlqEFlW4gisYiGWP9KJfckFpCZLYkFZUgwNODWSFXWYLL4xDUSFK/NH3mHK+naAWwBVkJSASDd5hPqT+Sh6oNbeJeS+OARph1jzY4BiAX66V1BLHA7CxgI4kJphWAskOddgM0kHzXvzwEcLgMuDyAKet0xwrMuKU8COi4AbM+juqIdxyOuW4Fw6OO+B6DXAcjrAjQJTqyvUgsJwh5p9TD1BOAq/9FAVpsEAbPuWXVBa56jAB9XRAU0XhsqX1wwD5DS7sCgcMuARKUVpyE/VwHIVwDMkhGlBvag8TkEyDMA1xuA3FmiaS/yfR2Qv4m9gQHPhllxv4BK0lnsHDq5ViJkvTPZyVIhJQU3ZhfTm/em0+TkPQzJW9Iw+vnPXblIrA
RPReo1/+Ai5WsVooYy4TFMCCyXwtE/ni+rY0GcBiXdo/3xDA4piMLqvW8zAaDTBBa+dnUycSij4kFQN8eL3rWTUHbc6hJlWkFQKLUk+JZJtEPSi/jLEMi7YPMj4yegKLTphUV9Rlkl3Jtr7SSZsHV8FH0Y+X0daFaZEwuCKszrr1JHjUPxgVmC2prpSheHKSLoa3MUIHnfw1LDbtbDzLqi31y25EM7NlHTSGNR1tUlzBluauXrfBGGpw3YsxYffGbtSSlFrkXZVHYeyH0eM/DhhSjUjJzBr0x2XwZBb1+NRAWm2swO7a0Z92YHBsvS2Xq/Qye1ZIgKENxFHAgTYLR3H5KuNnPtJH+xh3J9f168C2/rSL/Rs2VlIDN7YTWBw8Ohh1sn/FjtYw8W0wbOthstez+qtqjipZruILAvYDLARzAiJGA2ha7W8Cs6ekaZEyzc+eglHHNoI2036hm/Q0Bpe2QwCqE/yGaC/dz44rWaDMfuyqQ6k2tg6gF6CC+NpobIKOupptQwqkxyP7tOpk0sH2kZYahQ/jaUoJyCj9QOmC9NJFcgdgSXDhVzlJGHeyVjVLI/Q+AEfnBsAqacvwmv880ooZ3CIPHj/OzuL7w9hcht7vLcYYL7izTwXDzrflWemAAMmzjPv6kDxktaagrZI2JFH2d78FQYNtT3NF/3elXWlBEzSX0bcvKC1wkz+Bbrh9RTVvlJ99XEbfw29VughgsgpZ1pO2f/o9hYE5mfodFIbCemR1KsYH+rD5mU5+Dx1473cq2JWOpuW347cZZYEeGvMLXix8WSQxfoyL5OUz+2E5RyV9RhjBUP4ug22tgAXjELQ47eZ4BLYfS0uzmAMgLmjj1dzKPhb8GI/WQM9OAGsvXsN2IbHQ5WwfQDrUocwnqMMfCcxd7d5Htp9P6h+W8I/gXOYag8bYWxfnjHfV8+gydWHqL+vD+zyX9sx0l4fJdapLxXoQsTZvRIbxIAey09VXFprkIPyto0G5DJIZIy5yX7/1hY89VajPHlpIV/c/nDnPRHtkf1cCFv/4pYkwAv7v2FLodelsx9nAg+HP9u7gidPpT959bhWn5fmpPvUPn5tI93xlIn0XoKZs3vcvASzegd2IGwa+HOnEwKZswPzP+2fS6z84fF7AQm9Mv/u2YxhMw8hj16FxtIdSF8HZowFYvPrtx+PT+3GMyYvXq3djdC9dH7fAYuLL6Z2vuzsN73pGuuGZT0o7wRHjD3wkveuDB9Lqy56WnvusJ6SNZWBZUWJR11vxJnpq7IF095v+MS3ecE2av/PedPqZT0/bT/9z+vgD/emm778u7ejAXX9dlPO9/LrA4q9uv52RlAGyGgUdKp0u4vBUDa5OIN4y18Qk52N/envxiGD5MiaJCMt9bSqqwhE5nsVkWU3SZUKNxE2DsJahyjqYRSdavxEnUoGFK2peyyCZtKt1zJ9Rj2C4mcSjJmVyLGfTrwrryVBRNN8zUceETZpZWpGZnZwP1zAPMsHq7of7VZgI6Sbz7qqmwMLfHM+c9LXFUKrhnhXrYJLXwng1NG1Ik9hXuHfFAYDFUVarT6CqNAeDNA+jMw/zs3Z6Im2Ym0m9OOrsb2pI3ahFtatTbZ2COVEigTTCH4yRzIt1iHKSp55lQmLBe6kjs+8qrLYVqkG5uiwYsN0NW+ogk6nqi0ausZpaMV3huUnakImMk7Q2npnaLgIB6SXoCjechA3aRjlgMKmXwEJm2v0SPAs4ZLjcKVpJkP3CLmiHlbau8su4ysQLCHq6s9G1kiCNUd0ZXDuDI6jZDGIUrEGzTJv2LJZDpnMOe4z1lGvn1k24gN0edgCWS5Uh7Stk9jQClo6u1O89eASm/xhMO1ILymUf1W5CmxGZ551IPfpZ9VeVSNC2B1WoPapCoeIj02wcV71VYRJYdEHnFlaAQ7JF/TRSFrQcA7y4u7mSExlfPXNpc9EK464UrkEmlD08VMkRkAgsNsAA2p+mWXU/hmelE8SfBHyMwuTrulV3vdnGgjpBd50FNGOfocH6pm7UswAGm1FXsv1V95lDVUz7DsugcbEqS+6ULbDQZkNgoV2Mjb6evtsFKNHexD0nxlGdaqaOGqD3sz/IZuwrBBbrAUYa1kc5ABaTSNgGkVjs3XeAXeRpa9rH70t1JPuEdLe/+Au1QmhnP5aRt58pPdAWSEZfexQN+d0s0A3z4JahIWp2gEKZdL9d3UC7SaTx/EYbkYTZn/zZ1+wTAgulaNZP26LYuZ5cwz5CEEM8y2PbW5ZgzOmYjhICH8ceGX8lLUqaHGuoFH1OtcclqZ59OcYh2tJvMcZX0nSs89sp4GKGcoXEzQj8eJ3tSarxrSxsWJY4DOD/GPvyGGiZ7NexuEEaSisEF7qcFf5bxAAXAgvBtwAwgIXggr5Hu6neJ6i4CICxE4DRTl3Ux63lm3PnfulJGV+rV7VxtS5IfNMRYylaBK/NN1Hn/LKAi/KupJvjOz5YyxUOni9/433JUpKtGHXFh1aD2LUEaxckaIpQoEovl4fQBIlXVdFyvlV5qzyczQzzWAQWe0YX00Ubcq1XaJ1H9GglYFFAgZ6RNC4+21HCnY8qlCDht7Cx0EXqD2MYfj7Hr91+JD1xR2bcjaf9wnpUtfRCVX8sX+k/1zsZ/1k+2p+/Bdu36ijSg/ORWBQw8v2oiRWJi+Pua94+yKLJqn93YKFk5tffeCRdg22LBt7lUI1KidDjFlgMfyy9/s8/n1qe9Zz07ddcnHpU/Bj8p3Tn2+9LJzZflZ7z/GvS9poG3kqqUIWSLHbNjKa9H3lL+sD009Mt33Ys/cWrT6Sbf/1F6eKGT6fXvfJr6Uk/e3O6qm9DWrJaKnEf+fmcwOJ//u7vJoGFQ0Jt0ObGobT8ivpQDIwOqryLCYBRMsJEeK/yUa6MF6tsPPYcEUsg7muqUEyKXudJxem7Sr+K4POQGHAuK3lZYuEqN8y2afHLDIDGwZnRd1K3Thko5HAmmetTlZ33tfKWfJywSTeY9HiWr11R97cGZkWGT2mFzLTMqYyDDIrElulVauEsokqE4WLVVS8s2AqsamH/ANShRlfhahaPUEPM/GPEMR1XjmdgvBdHh1PrzHjqwcVsj8bb8FHtlKkFpnkt+VvfsrOvTJUMmrOb9FE1y3paBGlg/nqo0aOQxqwaVwsqZJKsfDA/BC4r9NJDw+eTMK66INX1bCvASOmNbeNB9lGGrOdtn3ClOUuJfClzLXOf8xC85JVdV451UStDKDAI2wsY26IOY3tpuKtb243uu4CajYyw6kVtMPPahMjAqfMv8FFtSTUmpRYnsDUYxYZB24xGV45hEOdR41lNmpuR0ly8czuenTaFtMmyqveeXfHCCMMMC6iUPOzFEHvvgWOsguu1B4aTdDb34hUKULFrez87g2PvgUqJ+zXsO3AIw21UqGC03dPDPTuss9KgDQCLfoycN7JHgEzrBIy7m6PNwIxOwOCOAl6UlNh31ldApBMbBUGVAHoRxn89zF0rdWlX+sDP72QCydYx1KCGod8UccdQrZogLaUwgjTrFn0B+mucLyj
px1OVwKIblaUupD/ukTEL6JoUnMD8a7Niu4xRHjfHC2BBG+mSVU9XTfSddgCe9j2z5CMoU02rC8AnsNiGC1/BlmuWp6BjSJ1oh1HU0QYH8YwC+FsNHe1juq41fYGloErQII39pgRg9k37zobW9tiMUGmZUgm/1DVIdloAnGpFCBKkoUBNSZDhlGQUEMenkPspNLNPh0oTbe6eJ7GnBHS0fjoR8EMIr1JIOfIIl5lFxxb7ZHwbpGO/FnRI3wJY3NU9lzkbifvtaKeU86zGIvqXY0pIOqmjfUTHC6GSJdj2G+AbEeSatpHNW4ms+Rme2ygLheWaG545RoXEkLT9OP3O9MClnUW4naVFXHIQ0Ma4iHrWIlK0VbTnqpa2UIVqoH31Uia43Qxocz+LSzvbkZIyxplv1ITEqyPy9nrpUbypzR/xipflvfUpN9Wz+rARue6P6dfeR9zqpXU+21GFM565eZR2rG7iSTyvS7+WT+1tpn3EiTKbXknRQPbwcthevvPMq1K8uM43liHCVOGs260vfOx5hRpm/aErmwAU4pz3eSVgYSJ/hrcg95h47tVt6ZJNWdKqQfWXWc3/gW/vDLp/I8DCtFVv+tAXxvGK1JKuRlWnGfWj4bH59OUD0+FZyv0XPo1HI6UbT724iXlpdeT7blS0XogqlN6nPP4aV7e7j8ymmzDwbmMRUIPylTw/ReDqz3LQoScn1YhM95It69L92BvoylW3s+cDLE6g8vTqdxyPMtzEvhraqElbjdLdo+LRILFQwuNYpY2MBuKf+NpkuMf1a3rcAouZ3endf3lnOth7bbrhxqemy7pQg/vC3enOjx5PLd96Q/ruGy9PnWWMOZfEgq0JJg5/If3d+w6kq256Qbpy3afSH/zaA+nG33hpuirdk/7s1YfS03/qpvQtvbgbr++Q53l9TmDxGwCL191+2xlJ2rgenst1ARc+dwB1gIxBk+sSpryrDda8cHJ0KJa5LzSJuMb3FxNmBhWqP5l4nEydC8NEGsSPiZZzTLbEk6kvgMNITv6uBCo98CeDZVmclAOQRF6RqKk7HUTZDeN95EeYsKPwbJkj/zyB+9xJPqtMaHOQDaYlcKyGCixgGqMMpGxcw7tngm4oXfVdz4S+prUznV6HOhQG3MMYb09g6896azBHMpwj2AdMDZ9ITRMjaePpyXA5q+vZFgjZAuPkyq6F1duSjInAQiZZumUd8cxghZtWy0BdXLmVKQ9QAeMq3TXwlbHLdLL1Mr01mp12JRxmMxg6mB1XM5u1ESFvnwlkXBXXMNlntoVlyiAL1Q8BVsXk+i6ABsDE95bZzdjCqBzGWGbb58Goca99gYx0Ny5M3ZSuWe81lFnjd1VhZOIsg951Rlyxpv5KQJRYDA0OAbimA6isgwF1ZdtN6nqRfAgMNrPqrnta28a6n6aeSmk0CNejlDYWSix27zsMXbEtoA/YdrqO3YGnpx3s/9BCGewv2rocA9C4Qd4gkpNjx4cCXJi2EgGBxdatW8KblNKXScqnGtSYjCTMrEbtGhPbX1SVU8KhAbS2GQK2KWxe1mPFH8AC4GH/t20mqbuSmmHiu9naJGlOwySr6qPkKvozYQUoSlq2oqq0hQ32egAUG3Av2oYEQxoKADR+nwYgaaeiJy3BkaBvCKnFCAbmgre1gBOBj4ynLmeVrNnZNODeBD13bt8RG+61AlpV3zrFz29RcHASwKXdzmEkOu4+L60nbTNATKiB0efyBo0ZWLiZXbQ1ZVe6pmoSVQkQLTCQibbPRX8jD/MpthGCE783vwMBpVITAY7fv0fu69lFrBKa7AUNiY0e02gDyysAzGOBqQTLGN+0iwahdhR9WbsKvvPqJy2jfwPoo/87HtCPLZvMfAF6Spsso3m4IGLqqgTqHcq+5piVxyLGJeJZDkNZ/hiL6IsEsFjRfzXgDlAR0kLHM/o07zOwAPgQbl7j7Xgu6CC6iw4Ai9WACiUW2lg0SGN+awASfQEsOtLlgIvNGNc3mbcNUHfkO8fG/LCM/5a9HGfGKE/PDH/m06U7vx+P+vSW3i67iiyrfMn/4SUoo7zxLPNSyZanH69MYCmRKrO6NGrAgmcRrorgDQnUUmcyK1kthcvJvfgxCCym6WxNdK1v5nDXbRngX7ylD1sy5pPq0L7hXew18UVsIrRv8FC950pcl8qYehRPTu7J8ITtZ67Dnusdn3368Jcm0kf4KcHwcLjY2buOtDtCTUdQ4T4a2jp4qE4kEHk+UpTS3hqC66Z1grIa/2e+B/VQVgP1kuQK/U+z58XyY/k7jbZv/1Der8OwPe1r0vNwDasq1I/jyekiwIbH8njxcNmfe9nD4n0Yvbs/hOXV85UARWPp/4YnLY+z0eW+PXii+ujJ9NNIX7awF4jH8jzPFvdtGH2HZ6eXbI545c+v3nYkPQngdgvG8B5uYCiQGqXOfieXIDW6DI9eAjaBRUiEaY5fue0we3jgteraevuCkupj7TyVDt/30XTP5w6w0Ow8yFg+NZsa+y5PT7n26vSEfhaCalU+h8QCO4vZ8cPpy4fWp2+5opd9po6nz7zhb9OnG5j7T0Pz9memm7/7iSwi6cb/Gz++DrD4nZrEomSRh8o8SOfprXrjCzqBg7MfVBmk86cer/IfHkRszgEKDE8E176cGyNePPN5ARVMjDnRyCzCEjji8yQARHUfdg9O8NWkZ9rGDYaekcLV7LI7ryXJ0o0KHBAnyk8ky12mDGtkXgGAzKeEs57mwy+kFeQrYy9zIENtuWSyQ2KBnUIwG5RB9TDzMUzsWg3TtV4GGQZtbVtHWmhqTzNr29IYm+PNsoo4r3oPM9AITNsgwGL4xGBaO34ydZ8aT32Ni6mdjfKaQHfryN9VSlgQVnxlirJRtEyUZdQrjV6fsrtLjDNJNx8yZag2WG6ADpWKssaKvVadHNZfGio1UV3J9PPKcwOrx6xYU3bdvrrS7Oq+eZpfGKparoqxkllSEpF3csbIFxroJcpVfFcpVNmx3LaZfvYXyDfUyKCj0goSDiDTg5pNN8yrdLex7Csy4DJy06w6l70WZuCYVPeQmXOfhGEkGO7CvRbVrw2odnQr+SAtVZl6WGGXIbasMnyCGOsyQ10n8TKlnYWb3e3edyiYdogaak3bt21BnWoLO7t2B1iRYRTAnEQKcQIpicDiEPFUI6I6QWM9SW1hJX8HcbVtcS+IExgvax8xhgqO0golOPZDgYgSmT5cwHYBpmScR1AhclJo1tYEiZN9SwbX/Tk0tNeTlwbAusc1baUJwcRWbWk/UAVqYPuW2HVcyU+TxroAXJnoUBUjPc9KpkaQgrinhe56T7CvhruQm34jwGoNbSczPMOzBdsd2rrBXn9/f7poYADjbdSg6Atu9hduiqmV340G4ceOD4bEQluCaYCKGwaGCpzMM2GUnmgPoYF8duNsn9JrFJIJyhn1BkBZN7+3/E3pqcx9KtiMjr7ltZJEiRn7r9A2UT+kbX6HoT5Iv/KwT/vNqvoWKnkVUx/fK3XIan1889DesUQ7DPs8PTC+Mb8TAYTlEuyoHqn3L+
ui5K0sbMjkCxBkYCk2fSKPG34z0pJH0e9tU3ecV40puNUqXvR5C8wRwIIIjlPxn5M2X9IrnEdwbz+S73KjRsGEm+TNM7aUHbijPNw3VBILwUUAC+4X+KaUKPXQZy9hszyBxVbo3yx4IZ3Il/QdL8nKjzGf45kPLLol8CJe52v+Gr688mEVKr/wwbLDdihHLc3yoP5cS4iLKoPao1I6aRkFMuJSuvXJlDeRrUn5gD+lGKUM8bwEMkitUkbyLTQhi8glB843UQYTzcdjEVgoHatU+Us1z/ssyQQOGhqvdPApJXe/9ltqZzdqz/WHjLP7RKx0nOtdCS9jbzh1/h13lx++l1HvOMt7y6d9QxO725adu53vPGSUlx9ne6cBs+OGG+t5uLldPU3OFm95+tJSxt3yBqNOUeyz9WU5G12WP18pz+VhzN/qOjcup59lCR6ujgx+MkEvJDxlx23D1cf13vKWb3F5HR9z96fZU2r3/WnfYfaAYrpqbO1J2wd2sTiI2tIZKADe48g+nKJMp/bLn5x2LWmUPZwkEPrU4c+nT371GHtKNactV16dLuuDD12hjz888tmffH1gcdttueHseKQTDL7n6lfXF2K0deCPH+99V001ca2RmvHi4KXTj99UBghLzHstjCGYkC2A+bofgWkbPowX41zFJ6F4ziQeKgDEK2Ux1zltG5joZVIFFjITli+YFcKG2o4TOtemk+sXhYyKWFaPnGY+W664Nz4MsmkFsHCVkmtekx+r8zBNngPQVDQIIEIc1R9ceZUp1qPOunZ6QcuGNKutxeJaNq4SWKzBHSSbiME0HoXhHBwSWIykbtSh+hoXUite6NexcZolDN1u6unKsKov1tOJzfoJYjpY+c4bqbGRG+X2CFUjwsQKJ+VxjwhBhGpOpmG61l9JgKvX6qAbRyZI9ZK1rCS7Am76WdKR9dazpCHT29ViD8GBwGKsMo6VXqpiqaqidEK9dsGFm8BZjkXKOAsD56Ang9bcBIOD2o3Mqh6M3CzN/RXCfoHyGGeW+yGYeZn6aaRE4bqV9pii7ENuGMdKuQNcJ6BiM6v1O5E2KP3QXqMVFRvVqewASixMW2mFK/YHMKreu/9w2sv+FKrY2EeaAFQ7tvWH4fYOGH8HOttbZtzVfaUH7iVxHFAyzr0Ja+fRSjt0Y3uwhVV9Xb363E3wlAaobiTjK3NqJ5LOgsE+wm4EAAmeRoeHQpJiT22G/vY1PWqFdAb6Bo05n6Kcp+k3RYpkQNu5DxWo7bjZvRQvVtopuEmeHpuK1yalXGWFPdzCql6F7YaqZQILJUIT7EJOx4oyQlDUs7QhQcWK9DahAuUu5tsxiu9UogFNNTw3f79H+9dh9sc4fORIAAuZc70yTQNag8mFuPY5AYQbynXikSi+S9o72hoQ7LcX/ZRvO/cPgAV1M45ASbU0d7q2b5JUTd1J0Cd9bVvLotqRQDh/P1mNSa9eSk7sjwIIy+03rdvkWDgAqAiSNWAfn5iO/O0PfmsxxkAL45m+z4kef3gdYX0TL8hV18ExHpBmjEOA5DLBB8BlAvWc45Aefdd0/GOy/snnPAY6STsGWt48DlU0J7z2XQHY+Zbm+VY04pY58XuYJzyd8wxVKDpEbJy3inp3009VhbpCYIGUrYW2DmARlbMgS4flsYi1Iwpc3fmiFJxQReIdVYogPCtpcl5i0o0XEaPuVWpnnM4Ia/AoxFJJzritMi5z1BkJGTcXMh5LwTOPXMP6/HI6VTjTri7rwxirqkG+KGGqxB+LwMIq1up8JhEv3F2gwAUKPAYp8AiAxRtiUHCgd4CQuc9zRJ4wpUkMwDFicsV7GYAzBpJq8GxwQqyFy4ONIZ1EI47xTD/+mbKMfvUsnhOHswyx4Y0nsJI99j7SiHe852xYC+LALqgIRhfmRmbEPAgRuwNr1ClD4mQcaZNuqaulIOE4+bAAq5y26efyOIHLJMtACRZMy7wLsAhVHuqfc5YfY0URhnEtE7eMiiv26quvRR0ltaKv3tSRpnDaObuqMZ2GCZii/EMwiYMAi/GTQ2kdwKJreix1NbAfA16iGudQQaGA1i109WGcBBXSSCmOKiItpC+wUH1IN7eu7hZVDdsUfibqX6QLMugaMgcxSF3DVr0tqaZjuuHPHzDkCqvMr/UvpJIRKqu56qmrKqY9h+FsC5/5XobOcgjMlCrIyGkUbFk0xJa+1gfScc9GfJRbANCDF6ONqEPZbtZTBjq3GSuzMtjQSQPcMZg+pT3zNJyqNmOsumtr4eq4khJtDAawjdgGk93XoxvYNsqi4bogSOZa41kZyClsK9iFG7eoGnHHHgPUwzbUYHsABvoSdpwWXJm/qmKCm2OoTx3CNkPDZ+1e/HjcTE3JQAe2HX3sHeFeHDJo0lUPVtZf5jm8AVEQwZvG2ZavmXiuvlu2mVN4jlIKwE/9e/t30FWGl7zcQVppxRw6/YIUv4P4bqCZ6k8Cqksv2kE7Kg0QnNCHKb+r3QJH7Q00QjfdoCe0G8RWRWnQGKBFFR1thmxP1azsSxqEd0DTbiVAqFdFO0HTNlSl7B/2fw2z3bNjL25mj+IV6iSugQMQU2dBkH1YyYDfhAbgtncnbe3ChEBXGwXBn9+goEB1P9MUTFsGjeM17BeM2V+sWwEP0s1vMuhBaaTHwqIgIC8AxHfKtc4ElKppZyGAkJkvdhD2YQGU5bQsAuGQqhFPo2u9qNl+WX0p4EAeP6q/Mb7ZWTlq4xkfTqhZCir4VgJgOIYRxnYr6psZdJFCxJeVrcamSCyPgxlUkAbxvTZULMxwFdFoXzJBFcoFi7zqKXZagE7aWIAMMdpGJZBxAkIzWCFFoQ9uBFhcjMTiCQCL7Uo+BULQzzpEOS1DOXIBy13tXHtMhLqS18aYpYscoCJTLX65KMx6jMPlIefyvO5RzoeMH5aWz6IQZ5QkouYnFXVrheZVXficngs35lvlyPs8e1Q18YX51BXIJJaX29eGeSwCC+t24bhAgQsUePxQ4GzA4v8DAAD//zOYwgsAAEAASURBVOy9yZNlW3antb3ve/foI15EvCZTZZRhZTBiSg2KEQiVQKoBAwbUhD+gRlhJ1VBKNTUFMySVUCqLAWbADEZlGBNkmIEZBlIqM1//onP38PC+b/i+3z773hsvX76UyZCs3tM97ueec/bZ7drNWb+91tp75IbjH/7Gf1l+9w//sIwUjpH8lhuuNz7zc8PfiH/1VRkZHY3fERxG9cPR/Obmhic9+8/VYKNcx0aNo54tTMIZN+9qmHpN3LiNkVbC4mxa3hvfyAh5MB88xwE3ilIurq7K5dV1ueLqs+8MMzk6VibGRjnHypin8XJ63BihB/7Mj+Wt1+pW86X/kTJO2KnxsTI9PlEmx8cTR9K9vCwXnKZ7dX1Tro2DePU/OTFRpiYmy9TUZJnwnJ4pYwuLpcwulfOZpXJUpsrp6EQ5p0z7xLF7elp2jo7Lzf6bMn24WxaO98rC1WmZujwr45fnZOo65Ty9uCynp2cWvEyQpynSmZ2eKvOzM2Vxfr5MT02VCdKdwW12ZobrT
OgsWW5urssoebu4vChnZ2fljDQttOU+ODgsu/v75eDoKDSr4aetpaQzMTEWmkpXj7Pz83J8clZOzy5Cb/3PkYdx6AMpyjn5PDk5KVeXVzxTR1xPyPfp2Xm5vpZSlAiP55T9nHdXOJjvlaXFsrG2Wu7d3ihz5N86OyPc5TXxEO6SvJ+enZaj42Pye1iOTs/L6bn1YPyn5Zj8H+4fJNziwnx5+s6DnA/u3i7z0GecOrRury4uyBfthvRPoMUnnz8rH378afnws+fl4PC4XPDekpqXdx7cK7/w/rtlifik1pu9vfJya7u82Nwqz15slUPq7bwrl+1zElosLC2VdcqwvLKY9ndyfAI9ztNOQzDiuaHNTk2MU2+z5c7GepmcnKCd0uY4L0n/5OSo7O/sQmvq6pw6043rEfQwj2cnp+WSZ9uifUJajdMmpN3jh3fL+0/eKVOTk3G3YU54z3ur8PDgoBxDw9CfurQMr3felJ3d/bJ3eNSr20v7FJSYmZ8rC8uLZWVtpSwvLZTl+YWyNDdXNlZ4Xlws89TVOfnb3d0rL569KJ9+/kV5/eYN8Z+X0fHan20YtoUJ2sjsLGW+tVHWiG9xcSHlvaTNXFKP5jP9lPZqPR+Qn33q1XxbngXSnYbG3k+OTZQb2sV12of9/5o6ShXj/wZ6X5XztNXTtPlz6GWftd0e015OaMNXhG9jg/19lD5p5V+l3V4T3jZ3nnZiPZjHtGHe2w97Y4n3ST05qOOWmfafq+PXOHVrH7HvjpOWf8ZlHtopjYyHYL3DZ/ue4SeoZ+lT+2KNP2ngNkL/9rwaoZ8T6IyOdUkGr233jAU3swvlZmaujExNk++xQoMr41MTZY26fXdtufzC6lJ5Z4G6niSPybM55OA+Y6tl5vip/HXutbDx8lb+K2FqWP3kritgK2fc+HmLiqQ7eCQPgw7c6+PL7jUufrvw0r/50X9K1cvzQIT6y6O/5CSX6uJzy01zqR58o/faD73NN8qb7jDt//gXf6k9/qVf/Sa1PLTrYKKNHrY9z3wPGAfX1tYGvQ3vhxQYUmBIgbcoIM/keDHHd3jwGGGQ64DFf4s7Q2UbTLtBuHnO4JvX3PHOQVW2nO91jjou14fcG75GF/9+oP0A1mhrHPFtRN0QbayJ248sZxgrAox58uyInTQTrwETOAO5HyA/wpd+lAUWMBe+Nz7DBlSQ/jgAQ2bfU4Dh0T5e5qed7YMegNXlx7gEJlNj4wALPrZcdZMJkMmVcQmjkRiJSf/4C6iA+ZmE0R/nOspHvfBRv55eKBdTC2UfYHHMh/2E9A9hVo5glk9Pjsvk8X6ZOdrLOX1xWiavABZXl4ldxvIYplJGSbrJRE7CmE4DXGY4BRWegokVmDWBxuzMbMCEDFeAFzHJNJ/CVHn6wZPB3z88hIE7hKE8DfMigLIcvp+EyZCJa5SyWmTQpUPqQ4YpeZkIUzg5OQWdxynPSTmCITwWYFhHXT0JAmQia3lOyyFpyuiNEqdMpsz8w7t3y/LCQpkmXYFjzecV8QEmBA+exHsIg314DNAAYKQuBDS8u6Lxz0KTdx8/KB+8+7g8vHenTE1Pl3HAiwzedeKseRJoffH8Rfnwk8/Ljz78rOzCdAuCZFgFJ/fu3Cq/8MG75S7Mv/QVWLwAWDzbrODicP+IssKgWk80AZn3GRhngcUKjJr1c3ZaGVLpYj7GoK3g4Yq6nIWxu3/vLgBxkrY6SrpX5HM88e3v7pZd0tsH+AVQAAAsr/UkmBG4SRuZ1YBZwMntdeh371Z5/Og+DPgMgA3AQh/IGeAyUvaI9wAgKbiQFT+D/gLL7Te7lO+AtnCC2znM52iZoizzK8tlFgA0TfwLs9OAitmyNL9Ybm9slFXAhcz+3t5+2Xq1Vb747AvyewAYOi3XozJbgm3igfb2P8smiNyAngvUseB7HzB4QT9wzFhYFCBTV/h1ELP97Nk2yesF9JLI1oPAs4LF8bQ3AZlgQvpdU6fn9ina+BH96pCwgqcjaCfozUQE9WuatmuBhf3A9AQe0kQ20voUfAmkbRMXF04iVGbMl20cSSEJkwAUOKOUAwtH+olunJZfUCGItI9Z34Zx7DFe++YF10vykrgSA/eEFUhkooMwue/iNF7dBbWjxCmwuIaO5xTinHigarmZmimjs3PlZo4xaBKAQV+7FhERZpw+trqyVJ6uLZXv0F6fLs2XJYEF71KO5CE9nSx1JeZa+399qbuHv829H1ZmfbA87Y0UNkx9NlyNp0dVo0zZ+yEkS42rpVM98TuQh55bzVEeDWcY40p8Nct9r9w1kNh7lTjJT5emafxUul0MLX4fv+zHd0Ng0RFqeBlSYEiBbywF/hzA4g8zaNdBtH74WmkZf3NkEPajlb/urQEYKPuDL484ZTAlgADB+zqr1n2cjJD/hEnkPvhJqX4FAjIGMiDc9j6cpshjAib+Ll1TDxDg2Y/wNYxzBnZ8D36AG6BoUgufEyEZMQ7DVCaCGfQubjJfbmomElcDFlN8tMf4YOMtTMAljKTMemYvzSIZteyTME1h8gUWzrpzjsDgXUzNlZPx2XI4Nlv2ykQ5ICPHMBHOgpaLszJ6flLmTgAWgouTgzKO28T1RRmDUQqDBWNzBEPZZqMy6wmT0psBpWwy4gswghvOLCMhcWZ4dFSAILCQoXJWGObp4jyMo8yUhyApM+MgUcuTirKchGsz4XU2tzIxlYmtwEnG1TxY39IQIkDCyrw723vOjLtxCvxqoyRt3E9Jy/IcHMEow9iOE88is+GrMLHOhC/BBM3AhNsuzL9hlYLIaMosKl3Zg+HePzjiHgY5zBj1CMiUsVQa8PjBnfLB08flEVKHSWgzAmNvAxA0WPeel9eX5eWrzfIxDPGfffhJ2YGxlnGX6VTqc2t9rXz3/Sflnfv3AjRMd5uZ+Zev3yC12C47XA9kjGEKTds24Kz7xu31skrYeWb7q3ThKqBieXWVyePpck59HjCrD9IvD8jfCrPGMptnlM9+IIN8fAxoOaW8MMYy19s7ewAAGH8YZPMXYEFZpmC05+dmkB4sIP1Yg+FfQyKgFARga93Qpqu0j9JzfwTjf0R8R8QvMJdpfkOZXiOxsPz7gCWlSGOUY4Y4Bci2tzHqYZkZ7VUkS2vU0Z3bd8oq5ZHBlwaCi13AieD3nPZ7xqlkyH43Sx9QVmUbsa7mBFi0E8tgmQTolmODvC+RpnVvXpUGnlOf1r1g1VOGPOACQFPbn20QqRp0o1KpByRhAAulPoJQQbNgZ4+2Iriw/bXJCMcXaWObEHT4TjBruv7pz7xV6WQFFa3t+N7xImOUHWngLp3I13Gu41zGOOpDYKG0cZI8e9g/BDumoXTuCuInbsIbj3TwDIDwimueiT39kjanNFJQAXopV/i4oBsrtbhUMgGoUGI6Mr9UriemkGjwnvQEjrbV1dXl8t76MsACyQXAYpG8TTgGmk5+ux9omywNurX77p35+rlH82tk+m9BuvIa/men08XewnSP1vvPDNP8DAb9Kc/VwYmlHF6Nk9MrDaReefnnKWPzk/CEGQKLStbh
75ACQwp8cylQebifKbH4Z1GF6r56tZQMpA6pdUCU6a/3Gfcz2NYZpTrQ1iCOt+1IOOPgg+QZiYXPCcuY7DhtrEbsD2G9lelRUqFqQAUWhqlpJ+5eGg7u/DPAVx5SBgWGNe+rJ2GM6QlUZIgzQ8rHW0mDzI3p4INoOwbHD0eeOmDR8torQ1WpkuFTauFrkgywCKhIXmraltn4ZcKmYYqmYUoDLGCoRpAcHI3NlL2R6bJdJstOGSt7xHMEAwRHWiZhwOaQUCye7pe504Mye3YEoKigYgQmR7UlmXBP6R+aUS7LJLMShoNyzMAkLMPIqlqjisoM6TpLbhiBhcxeZkVhYGT6a1zwIoICGJq8A3Q4M9sYqV65BBAdDSMpoVxKRJwVdxZaujvLfZYZYUTx/Bm/DJv0runLIKqWggoTwMLZ9/0jwALPAoslGNbV5aWAImfBlb6YlsyoeZNhPVEVCuZy7/AAZnifGfb9zHjLCNrmxinvKDPXSiwewty/+/hReYBEwDqhYZIX25D1KPMI2IEu29uvy+dILX788Weo8BAfQEWaCxLXV1ci9Xj66AHM9DKqVxdlj/dbpP0MQGLYPZhy/Vc1GVo5bWEdxn6Dcxmm7QzwdAVN5+bmkWSgloUk5Bz/Wy9elguY+g2kG7c5ZwEcp6j+wBkHWFygCifdVP/aQ6IgkNlCPWoPicIFdL7uJBZzhBOY3CW9W0gszPMy6ljSwnqxz6TxpuwwsNBeBv0YgHYKTQVmO6pC7aMSB1jbOzimY9JfbMu0JzvcCPUoyF1ndluwdffO7QCLZQCGEppj6lEJgQC1gsBj6mWPoNdh+meQQghiImHkOkldXFOGE9J+A9N/SUe2jqTDGm1ggXRtu2H6ybfSCkWwJ+TZWhTc6l8JUZXI0Lehu0cYddSwrJNjpFwCC+mnNGYPEBNwIf3IRx0BUBcin9af9atal/m8yUBTgZAShbRnGw9H7gndxrc4ZmzJ2/poeA/qQH+ZbKEPTSGlquqSlI8/064qnbbHpmpFuASvaVi0flp1FNMt4xz9d0SQQtyCBdWfLgl7xVU1qBFU18YWlwEXS+VqfKpckqaSQ5noqELRXt4HWPwNJBaPAY7z9LmoallW855C1DJ3t195kSYe/Xx+pbfQLm8SvQXr/HXhfaoxde7tMuhoGPzrlPS6++b1664tubcSMbwvKG8Or4mTFNIO6EM8t+PnlbFfJstyU37lF/9uC/qXfm2TTyb0VfnUzbqy/XsOVaH+0qtkmMCQAt8KCvxcYPHfaGMx8NFwHO09+8DhbwamDESOqwyR3eDartVf/XDiOYyAH/g2u9bi5PM4MI538eufUyDgLOTEWBdPl3j15QOHaTPAB1jw2GY+HbT9l9mo0goAhXHKBMt8czaJhTOkFsrPQwZWB1fj4vTIZ4P3ybNxcE7AYGljIbjQ3dnWqPU4KHe0yAee9GSOp2DyVHeZAliMo4IwDnM8Oj1bDkany6ubifLF1XjZ5gu2y4zhEQzMGAzQDAz/AhKLpZO9snx+VBaxr5hUjUTmF4bqGObLWXSZHj8EoZnpka+om0lv8j+HlGQ1s9brqEKhShTGS+ZFZgtVIhlJmFRVX1RHkpbmfQJGQnr4QVLC4KxtVEZgPsbRkVedx9l7VaL06yz4FLPhUzCLzoorDTAfggolCqqtOBPtYTswn878Rl89zKe2Eer2AxAsG6pMhhcMqQq1sb4eHT7TVZVNetugzwE9hhNYCCi2X+8wi/8m6kIX2FpYgeZlfh51MKQf9zdWysM7d8rtW7dRs1qEV6ac5Emwo9TmHHUdJQLaBrwCIHz27GV5hQRCBvSIfDkbvra8XN5/+qi88/B+2YABg3sLEHpN+p8BDHZUK4JxVSoh81sB0CXSCsqBGtU6tgRX0HuERrbArLHPAgvpL7A4JB7byx3sQJYBVjeU8+LshPeoItEyL6gP6+yYfD5/uVlebu/AiCNpsP5gxq13VbY20JHXlsQ8CirmkCIEMIbuqHbZCGzvSGjsR7WuzzOTbxm2KL8g74ByRLXM/kp/GZX+MPQjxDOFrc0dGP+H9++iavUQ6cKtlGkMOsnEW3aB0CHMvJKLLexQaKwBpFWdjnvaj33E9mSFCXD2ABa2uVFm1wWWS6hIzQNWHEds76qNtfFGN9X0VMezLXpYn0qpTLv2zdruLOs5eVfyo8Ril3p9/WYv4OKAsp7T3sPIUzb7csLiJpOv9CmAjAbTxookZnr2e8rlkYt+OvfqmleRRnV3Dk7ptwKlqEJBM9UMBcNpj/TPgHnKYj6ayimhqOMKLlKFNUJTI6xjZ51AEVjccC+kJ+sBFdpakEgZpd2NL61w7UssIpGxbyI9uQVY/EAbC64PkHzNEhdDcfJrcknXMn/pcBzyaHXTrs1be9+uze+gP8vn/6BbC6/bl8PGvx5qpuI1t12gmsvBvPr27SNxtvLgtecbr/FtuXifb4tB9dudb+WzVT5eWjnivffTbkYAFkMbC6kxPIYUGFLgm0uBnwssfvf7AIs6jNaPI2V1wHVgbTN/dbB0kM1/Bv+Mrw65uDWooL+ENTwfNz+ifvSaW00mQ3ZHUe6TUPUnAx+JRffx9V1vsCdEhvjMGhGMhAMquGpgqb9ExbXZZgggGrDQ8NiZW4GLKjoehjFk4uG+AQvfeZjv+tE2XxpMVt1o3Uwt+SGSfGT06wcaZkemVnWOCRj8CZjuUc4x7RwCLGbK1rXAYrS8hCHcgZE6hOmZgCGauURagZRCYLGGXcXKyBXAAh1pGUqY1lNn+fF/BtNpnpM/0pSpyCwoeVBVZYXZxtuopmgYuwCwMC/OuptvZ+b3mEGWwVK1xgo0rFIiGTUpIpN1RpoatUannPSMV2PZmRkMwmGAe2peAIoJ1Costzrzzi6btxhRY6wsCOh9hHF3FlhgoX2D4EwmStWUHSUOzNIr7dEuZG15hZn+qrsvA66URP8ylzLYiR8m+wCGVJuALRjt11uvsbc4inqQ9T4Hc7TKDP5DGPtHMMH3mF1fgrGapD5GqEtBVkAFQOv4CNUiaLJDXM+RCGiQvbn9BhqdBFgoCdAI3Hg2oK1ldTb4EBq92tkJUyw9rwEVzthrbLyNVEF1pxX8331wHxsE7F0AmDPUxzzAT2mWnP4BDL0qSYKwJRhqbUy0Y7iknJcAiisY4wMkM/sHGlUfkq/XGFkDZGCKBWrSTJWhNRhCAcVtVOBWMttPPJaV5iqAuCC+dJLUcgWZAXkdUx81KACaIC82K0iQVJWR+aRZwLRfph+obqVa2WOkN+8+flxWVtZqWUjnEEmHxvNKQax7DdZVjzJhZ+dVy5vQtoA+ImNtuxGIKEnQBoImEsmHkiolN6oKeTRg4X3aK+3RsLY361q3jAm0b+s0Ug2AqrYzDoKqRVWpTzV6F4gKHG1z2k84c69Njsy8aWXiIG20L6Egazl67dmnDAWOfBySiQI0f21c1EEf9lcrw4s/AiJBq9I4+6ZhVYGy/6l+KPBLuCTD+NKLOUn1+pXxpf9LA/sJDkCSvrQCYDFKm1NSMU6/GsEu5pp+e41
65DX9YIT2Y13cW15ADWq5fJfFBjbo69O4CyxyEGe7fav8yVv10sr9U+/N4Fccg/5Cm85P4hmgo85fjsGwPTfj57kd/RpoLj2fAzTr6kMvptUFb2HNT0K1F81freQWcf/aJWG4fk6M2/++y6/8B99+YHHOeGU7Hh5/tRT441d/9leb4DC1UGBxYqY8nr1V5pACf1OONgn8F83v1wKLX/sNVKEAFvns9cfe7uPnwOutA6wvObtBUsYx463X5KwO8vGpfz9wXnNfr4nMWHDrH0kgr3T3I9akFvHFT42/MvGmm4S5OJPnx0U3PsG5r2mKZ6qUQTAg0yFTOg6wmOBD6nPLg3EbNuXJfUsvpSWemt8KVFTTkgFX5UIbkFpOy2JejVP95gYsoo/uLCGnq6+MAixGpufLMapQr7GteH45Ur5gdn0bFZh9VIEmrs7LPGpQCwCLlVOAxc1FWWGiUWmFjJpgwNlpmY6oZ5gwR8ragQvLpurPLZiD+7dvlVtr65WRgwmWqTOP2lHsoNP/BmZWcNEAhcBLiY40kQmVcZdRPiZvghHLVSUW0zGYdfWnWXS2BS3j49otQBeBG/7kQ2XslFpU2wpjdXZcaQXMHsAi9IKO0t6VnV7D0KueIjiT8V4QXLA6yTqGwStIC2RGjVhpgGowkYjAKB/DlO5ocAyjuP2K2XYAimo1pjVFGPX0H92/U94VFMAML8NYTbMizhh5Vu8/khl08I9g3A8OsQ1AarG5tROpxYtX2zEqF5AazzsP76JOdSfAQimNkg9V8PahlUyxdhHXMNPq8wsqPn/+Ku1yHnB378GDch/7jPVVmHCZaqUwqqdQJ9IjQI447PDSVhUgV626hElWdW13l/JRb1uAGAHAPjRzVSiZYGmzhCTmzq016ny1rCGlme9AjO+cCZdpVuoRCSJ05yZ1oaRAycIhgEX6q1bmilDavGhQr6qVtDQOG7ptYBGJyrvvPARUvFOePn4MgFtI2w9AsF0JEogzqnFpT4SFTgJXgYVl9J2ASImAki2lJYJmjYiXAHEC10noY19zlBGIGr/Mtky0TLlSFOlnxgIuuJel1mD8hDiVhAlwBKNmwH5s/3ElqB0kFqp7ZWUt3ltOB8sGLExH4CnAqLZbRGFOKIetuXeYPEfcupeDDLNvkkMmM+yrISIX810lqYILaJF3plklFXWsS9QJx4jKtT4bfyZTzItpmgbh62QO9OI+EgveRA2qAxbjAIsJ2j+ottwALK6g742TAdYLalmPaDcabn/AdYl2M0k8SbNLt9K6pVnz0ohh9htd3i5/LWvnu3f5sh9f9OLvfA366bLQC++N7+Nu4tWBPEgUHnDqO/dDtzhDr76Hrl5bCRI8UQ6ETLw1fN9fPPljXEmzAsSee+c1+cLxrwOw+M//t/+qfP/H/2uPBMObIQW+zRT4t5eflH/wnX+/vD9/5xtTzLssjON3+C96/BxgwXKz3/9+BkU/yTm6AdL7wcE37xnI/Y6FEe8+aAPjeD6ASgO+Cljko9GSqJH3PiTdY579wMr8tI+o7/x8ODD7sc/hxfSTB9937t3AbloBKOSlBwY6pr/N7lvcGo2MAydRXuvWi6r7WONe4wOkwNC8FR/PjUaZMYRRmohqRl0ZSdWQET7QAouRmXmAxUI5nZgt+yMTZQuJxWfMdr+AOdyBkZtAFWrh4qQsXxyVNdSglgEWs+TqBPUNmT4ZV2GQR81irTHpJSDQAFRGepnZ/XvM9N9ndl5phTPr1on5U6VDYCGocJUh451y5RcYi546FfHJ3LjqksBCGwiJYtmNS4Z3ntnkOVYEmpudR32HFYdgUmTyUmn4VQoRA2tmq1U18mhMoKDFUyZLEOLXWEb8DUyts8embX2a3hzARaNgVxyS2Y6qCLPoJ0gqEj/A4ghmeRcgIZO4v4chMtIPgdgRzzeX18zkY2MBGPiFD56Ud1l6daUDFuPkGcLAcMNUEocGvnt7u9gY7JRXqO6oDvVi83WkAtaxM+gPACgP7t2O7YL1PKmKG/HcQH9n57WXOKe+XkNfVZW+eLkV0CCoc1b/6dOn5SEAYw2pQq0PSSujDM25CrpuoI0rmWUZN5fCjQTkKPW1/ZoVqF6+wmi7Lr+6t3sQCZAARRuORw/uxr5ilTYwhRTJdAUWAhfjuQCgTEGPMfzDz0c1qIKKg0hZVPsSXOxjW6EUScAXexHajY1OMDCLtGIFGxNtVp6+84glbR+R/yp1sD3tUnaNwS2Ly8jaBlXFy2ptxAEpu/Zby66KklImDb5HaA/aaWhjYz8TqHtKmyyPDAi3HESRfqZal21JQMKIEaAygcre5ZVATQBKfVC3SitGKYPhBBaq0alSeAh4UiJ1gt9INWx7pKW0IAw+fr0KLuxzvMoYVO87N940ZlV/7chbA3DYU23vARYSQDcutoExaGo/sLy+UepSY+7TyXCxH+nCSo8GgAR9rc/UCCpoVGqhcXZsLKDNKO20AYvRJrGgP98wNgWAI4l8IrBAWqF9xSx0HyeOzL+TMftAMm3mu3J5WyvTcpp7H/kbeN/Gx7zsfgbfe5+4Bz0M+uvKHD8D8X6F9y7dfh20eN8K1s9qTbd7KQ1ztPxYXB360eVhMO8t/oTrfmyTLYjhB/374q+DKtQQWAy2iOH9t50CQ2DRr2G+Xzc3v47E4vf+qAGL+rKuhlFH1bCuGV1552jJoOs47GBZB0wfDFf9qyfsYKsKVa4+ByT4vouzfSgM1n2MXB4yT4blzKxq9d58JF1//Ktp9q/mM9H6k6TqR7wHBPyA+6HkzMwmzx5JlTgjtfCZsPm84OYHNSBJj+Yp+arAQkZcZt74BFG9dzII3amKwyhMw4jgAgZUYFFmFsr5xFw5HMFwm8Q+gbF5BvO2DVM1zizrHMBi6fKkrF0eAzLcu4JZf2Z+Vf1RhSh56mjbaGfak+RjGoCg8a4rKd1lll/D7RhUw0RWJhZmBCZaBljGz1li94YQUDhzLM09ZFSUNqhepFRAhnCOWWqZD2eRlSTMwfypxmP8MtcTk6h6MXsvnfxAy8yFuYfBNy7pKFMqMHGmVZUUZ6qTLmFkArMUaIAITDAMnYeMseDCFZVM0xls8ykTmCVEKcMu9NkBRAhMDqDlKeovGg672pL7O8iw3cHO4W8ALFxydhWJQZU2uJdFba+W8QywsreH4fLO67K1VZeQfbWFlIAZfGezlTLcvrMRVajb68RBWVRzG+ecJG/OqCuxOCE/m0hPXmH3scX1EmNtmUIN6B/cv18ewYg/wi5BekoX+1EF4pVJvkElx5pQ794VoayzM2wtZNg3tzbLsy++KCfWIeUTWKiqZvtfgBmMuhe2D+uADLuUfcl2KCAXVArMVHmx3YexBghod6AhfCRMg8BCZt9VoQiTPCau8dTFGnr4SiweP3yYMtlxBJOq6u0CzpQoSbNVAJR1JwjzvTQiE2ljPkcaYhlpDwIYgbh0UhXMejZd3d0Hw3IKJgVvqoxJOwll/urSsY4HlUm9wL9txLDjiEG1jZqZBPiStkvGthWholZI2zN8ZSxDNJtewEyTEF
YmvjLzda8cykBa5k9amm4mJwJA4qJrPYySu4yHnVMdG3Wj36nSZd+gD7sCU5Vc9ILWcNA+4yJXY+8BC0EFNI0tSPLiW1Kjzd/Q5i45hVNYUAHakBJhszKBKuAEkgs6b6QVN1yXAO23WEbYlaAeLcyW26qgmTeiIsl6mIfu9qcuKXJHi596+Rd3kL4tA4NpS4scvudIyl0e6nOcu5/BkIPuXdSESz0aVxefIVoSgyFqarh0/vD1lj/zET+DgfEbN67G+9fBeHsILAZbzfD+206BIbDo13CAxT/63m+U3xdYcGT45SfzevmIxCUvMsDjJ9du/G0GjQmcAbZ+/PLR5IuUDyEfuDwbXzfY5tpFnYE5EXRDdvzhyjXgwnz1BnEz4H/1myjyTqa/hiGg/z7UOPg4VpuNt4FFy0uSTrQ1VkGFd6ZQo8lvfcDFPMmoasAtwyYzkFlH3LxGpQEGW3AhgNGQcgSmYZQlZiOxmF0MsDganSxvEI98jKrGF+h5byE9GIOxnTs/DrBYBmDMoBIFlwqjzKw/zKW0JwlmcGtZkikyKqvhDLcqUItIEW6jPhR1GGbmNaw2L+bNmV1nclUnEVyoUhTD39AwEaXUMmQycS+3twMspMgyTIcb8Hk6kz6jATcMtTPRE6yHL7BwbwhzI4OmBEDgUsHLKXUBbWBotW24gJFUCuIGf5GCQCPbkiDGGXM3IVP1SuZJGxmNfQU0qtAoKcnSs5THGXElKjswsi77+ppzFxDgbPQ5zHxTnZFR22AG/LvvvcOSs+9gt3ELwDJFnioo1NbCw/j295Hk7KJqtMOqSxhvv8LG4jlSCxlR28waDLsb5bmnhbYS7rkhuFgAzFVgwSw46mWbSD22lAoBdKrqjnuNTJT1tXUkHvfKu0/fjcRHhpuIUz9c4rf2KxlknqHDFUyyjLKrNb16+bJ8+snH5RT3M043JzwDSMnouIfJPdTf3GdjHcbf+pUBr+2y2jJIb2fIrXdV65QSCCxi90K7kHHXsFn1oH33BqF9CrqTGcLa7hcBCrcxhlda8QigdAsQa70LSI3nADsQ1atsby4/axukN6ZO7VnmQb9KUVRJEjzxOpIJ26oqhNa1YbQtieobwKfa1VjeuodLlaQIUkmXs+5JAYNP+xN0ClZsu/MwyUu03xUYZlls2532LwKMqv4EsCGMh5MggkD7uWWudheqJgnwKgPvRo4uTpB3lFEhhXUWmwweVJtKXF/+TT+LIz91nMqkhP3ZclMvU9xn3wgr34Orf/57kBQlML0KcgSKAnRtRLSDsc/Er4CC84qxyA3wKrBgXEI9bnJpuUyxMlT21IHWdK6yQZ0+QFrxPsDiFpIL969QWsHonfhMvi2/2mWFrFXgx6sc7RvRnv//uCbOjhYtXeN9a/ymzDVtqdM/3n4aDN33Y9QJi+f+96wCgEr7vt+348M99VnjrfHol7zo1OU5LvHnqxrDr/7it39VqF/7P/5l+Z8+/WOLPzyGFPjWU+BvLjwsf//p3y7vzK5/Y8p6Cz5IHvYvesgnOhn5lRvk/ePfbMCiP/C2gTEfNFN14M6lDuCD93nBx8eBPn9e+SjLzOjmbHqb0R8sQD4MXxqAM/Di3yOvjCf33UDvi4zNdYBuH5caDekY1v9c67Ppy1hGcgGjXyUM9R2/xth9BAbK2Ny7d9IjKXI1vgYsZASiQsQHPNIQGGQrKhIRGGKIUE/cx2CUxgEVY7PLARYH7La9I7BgudBnzLZvM9s+cXpc5s88j9jHgufjg3LDVf3wzKYyBa1Odgw+YdItrB9FmQtnOudQcVmG8V5HTcW9K2QAsxoUzIU0kVm3xNxmJjcAAwBg47CRyJTUjeOqREBdfoGBDPM8zIYqRe6sLaMoo6+0YhZGZRqD9EgsABbS1Dypm574idv4bQ+ulOWMtBvavcmSn3tR33KDs0kYKldU2tU+QqaUOHJwFUi4ctAsDLwrOnm6elIYSGfIYZC33sDIIyHYxHjbPS0ERjKaljsMvTYBj2DosbNQt3AaKYuqWY1Zygww5VTt7Ag7i33sGdwx+iXxffp8M0a+gpUFgJurQqla5Q7cQL1ILFw6VhUm9frfkB/3gXDpVBlzaW67sAwL2CJoY/GQmX5XbBKgab9hm9Gju3CLkq0j+8MVANBVoWSwX79+XZ49e1Z+/KMflWOAxjX+VEETWHIb+yGXZ12jrCtsLqfESAZfBlj1tWYPY3+0Tt5As9fUsYbVgjHpKbMtqKj7PCB9gY5kzp3tApInaNduiueu3u8/fQzAupMVvOSmKg0BiEhslFgRIflSImPbA+DQX7QrmSW80q0AWCRU+hc82G4yblAv6TuQoE4qVImF7TfL1MLUCw6ODUv9n2GnZHsZof3JdMv4u+u6S+lKmyUkORvsy3AHsOVqVqZlW3NTxYBY2n4krNBFUGFd1YkB8m4MxC04FIxlh3Lid7nnuuM84ISyCUwEONJQOhimHRlnBt3aK+rMuvC0XwdY2L+ht+OMh+1XcO3oxKKwSafGb/Oo+TLNbKZH+rXfkGfCXAsqtCNK26og2nFIacUMkw5ZTIJ+PEqfvg2NHmO4/T7gYpV+ntWgTJ8//0nMzOTWfNUGmrvez2CZm6P5b+5efc5hfNKBx5BjwL35aWFbeMO1d7n3pzsaGK+PNQ1pVvPZpdk8c21x1viSg+pmvjgSoruPQ/dT39aHAWp0Ibq3DPvNX9JpD7ry/6v/4bcfWAzSbHg/pMCQAt8+CvwcYPG9SCzqIMlQ3BuD2+AMQTIw1tkwbx0s65hbB0qHYQfogAg+aH4j6j3ufDS992xMXAZb/GRQ13N3JJnuXlff+5qQGeh1E2h4xL2F1V/+ujByWd07w2RlKD/eMK91r4eax4TpwhpnylYLm/LlmQiqOx6I13L0gEU3y+jeGDL70WfHLbYGeJcBkPm7Qed7EluEiQUMkOdW2Rxvhh23J8pruKBPtbFgln4HRmcaycQsm+LNHe2VKZjbMYEFYCPLgjILKvPjykyqsjj7bzoBTLg706nEYgGVhhUY77qijqpK2HbwJ2OntMLSWAb3p1BdSB107RVkilKHxCnpnH3VXT+uBCSA0mhXYCEzrGGwezHMoT8/BZMeQJKwlWDSTHDhLG+dOZZXrIzaIcz7LrPa7vAsLZ1pF/y567cgoTG5MkzuKh11KNJcJD1tLZYx5HYJXytGyYurJW1rF4H60jPsDzTKdUba1Y5sPaowraE3/i4rGL335FFsHFTjqhsGmtOurpKeRr8YLiO52NzaIj42zPvCpWS1ZbiMxOThQ+wYYKwXmAl3AzNXxLpzl433aAMy1Ko/7apGBMiQebW9u6v0cpZPXaJ+YP5XVrEjWM6MvgZUkW7JQJKHMLnQw6uSH1eFil0MQEc1qD/94Z+yahO2KLxfxMjZDQCz1wDtaQkJwSKz86quBTCSZys0qyvBRFbpFSoyAUEYmG+zsR/xCsasfxlngUU23wMUqaImk6rkbYw25/4oGoY/vHurPEFiobRinrZtP8vMucw30gU7kAy6/d+xIqpteGobwzlD75HBCXorv
bC+rQ0lETL8AlPbkFKGG0B1xgPK7GHdKq1yA7+D2EiwjK4rHFEGAaXG+wJyva/CMN9G0nSX/RlmZ1hMgVSkjW1EcJIyKmUgrGddZQqwRx6zmhpltg1aN6pvqT6lBOuQsAJHVyUTgJlf+1GkCPQfG2h++Wljpi5RS+NtSkIG7RfWv/16mnSsT/uo2ZF20kXA8GXAYh/NO94HcPlsWgRUDcpdtW9om95TEcTH+MdkwJhSJNWhAHiq8Y3RVgQW7wgsOFfp4w1YSOt2mF/roB2W7c97ZMzHc8JbqK88iDtl7gOQFq55b8kbRT8n3ltw481d6J5nA7RALRL81RzU70pzJoIaRYuleuonZBJ9z6TUcuC1i9Hbzjl5bwGSNx9GhsBigIbD2yEFhhT4ZlLg64HFbwks/qgbC9soWMdon+LiQMyNw3Hc4qifeuMA64ew2Ve0j2I+Irh71c0BPuHzS+Rxa0TtRmMfc1uH7epaZ2RlEmB7a7AWlmv9q/GYVku3RcUnlY+3ZwUWDew0vwErxsfRyieLk480bt7rnplF4nAVJW0aVAvxdDlMmdfstcCzH3CZgKhPGCnAYgpjyUmAxfj8avax2LthDwskFs9goDZR29iFSZk+PSrTByxPusuaUVzHUYMaxc7CHERhnoxOAmKij0062kbI8MzkxL4ChkC1D5dqnUNPXVAh0yrz4Uy0S3CaJ2f3BBaqI1Vg4WZ7TkpXUFErQKYFMAKzZAOS6DLOSg80xnUFokUYYyUWk6pBqdIjsOBPv1LT9tFOySB1VY/JbtnOGDNTLTAStJiBfQCCYOOEfGV2GiZVnX1prOH2KukJLJaY6a/LxapS4+w1y80CLF5sbpbPn72ICtMe6jyWzfS1hXDJ2fefPMTG4kl5DEM8DX0CdKzk5Iy2iV8NzU+xDzg4YEftV68S30efPw+wUG1mFonQ3Xu32F+DXbOh/zXG4dOogj3EbsI9M6T1JgBnD2bTU8ZXRtNletc21gIqlpQmwYwvAjDm0XmP9ATap49Ao7RL6OISoIIKZ9iVWuwKLJBY/Mmf/knUrJzBXlvHhkF9eGkIUyuAULI0SXgZXRuuOyrbDgSl1ovG0B7WhVKQLU7pZ7uVcZZZdunfQ5hzme5rZhsMM068Smlccewh+2So0rXGClRzrLBljRufQFBDaQGMRtjucC6FbUOubCUguETaElBsnJTRla+UimV5VTxHRYr+4GpOARcYsDsp4KputnfbjEBFo2uXixUEKWG5AORdUp+u0qVBOJWRPrIOqFRacVdazaK6Nw5UgVbGbxupalM1/drnoT1pCDCa+t08TLjtxTIKegQkDYApnXJjQd0DKhq4oD1lciFXssPV99k3gvscVIjx2vcmWZVpmrYqsLCMHranJhHpBaF/2VZ8F0Ah7YyXKIGlgCvC2h6QViixsF9mUgf3LCJB25ukD03QlmMjhNrTLdTEXBHqfWi1ThuaMw925S6byQw/aZtcv+TcXue9D/altw7LjkPGZl/kva3Ro/NLmfL05bBx7f/obdBLQuFQvy79b4x+EmX96SLA0f9EMAAsBrJibhKLNx55Vz0MlquOctWLv827s189fzq2zHbXX/2lX+4H+ku+s702mrfrYJK6mVfbkqeA29PV+IbHkAJDCgwp8LMo8PXA4rd/s/z+D34QlYOqdlA/AI6BDjhNJSXP3dBZ39XkHG4dYP3QhXH3yql7pB/ed2cd5esH1tA4+1sH7navk0d9mXiIiTiVPFS1psTP4N2LF18J3oWLeyKpZRBYxE1wAYMi09DCJi6zkPRqLAEFhLfs7bsgLSyfDE4DFZEeyGzDtM0yG+zeDjL9RBbmIQaVxGOa0zCTSizK3HLZvZkqb27GyubNaHkFU+U+FkcM5uPHLM8JqBjbflmmABYzbJQ3raSCL7y8glkMjSmP+XaWcxpmRLuHBdLWAHMRZkF1HWfkm32FTJjMkyv2nGIE7Kx6DGJJ29lXmb3YjsCMqIYhLQQfNhyZafXmZVwEMpbT/RFcWckZ92mYxxhvMzMaSY0Z9ZB2NhQOGdZ6CLbUcXeGl3xQ5qx0Q5oCHiUPu0gKXJXIGWBtHlQ/sYxLlGuN1aFUg5LZU9og82petT9QpUdg8ekXzwIstBNwmVw/rEpFVA3ScPu7779bnjx5HPuQ5Itymu/saUF+K7CAaUSi8vLVywCLjz9/liVYVblxQ7f1jVXUmNjBmHoYo5G7idt7772bvJmfHRj1XZjWN8yIv97Zow2xkzTh/FgrTVoCZLppYYAFV3cVNy+GzUk7rdfKdMuIm689bDaeAyyUWGj/co7dzTKAyTgFF0otbJOuhmZfFpBA/Uh3srQtdNaeofYf/MABKSF6jQrZK6QzSl8EFqoXyawLLGSWI82CRlO0sw1m/e8AkFzKeANpxcryKkbjC6nrgAPaimURzAjexrjaDmxzGp9bhn1sYXRTyjeHdKWBBelkPch8Jx9IMlSLcxneMfqA/pWYyYRHKkGb3sOGRRDkLukHh9phaPjPVADx21enAUO3AyrIN3kXfAtmSCRpBMAiIcmVtqLamOCk4k2lXdgUoUa2SBmVXnic0w40Tred7rCk8CtW/9IeRVp5BFwIysmHYIlLBRiUS1oKBmTi0j/oIgEW1JsSHsG7p3XkkbGoK4/+bRe+M0yAhXnmTJ71rzwG+lRQgZoccSpxyjjmFQnFCIshZHUo1aIok8tMb1APD5FWfIDK2IbjCHkQWDh2Dh6m3452n3J0jj/lVocA3nY3A+Gbk0GNtUnLB+Pz3VcfdWz2nWGbHV7iIam8zYv81CiSBd5Axxxeea2P+txuvnSlP3UhiJi7LnzK2qIaCNLK0ZyStwSrnofAolFmeB1SYEiBbyoFvh5Y/M5vlX/xL39QB8tM1zn4OfjqVIGFHzdH1jaUx0c0B2RO+Ph0TJHMqYOtHyO/H031KQM3DnHjXRvYfXZYb1cH5AzW3bWO33VWyY9cD1h06URlJOETUfJojDj1vhYjRJpHE/G/AxVk2pBJD6fcdxlBl9kPOsCiRwcjrSpEmVnko5sduGUCmKmXWdFOYJ4PsnYEpuHsoUy0cTrbO6W6CKtCXWHE/fpmsmwBLF5cjZRNZ4hlDmDIRgEW12+2y/XWyzIDsJhnRahZpgxd1cZPG+xOnUHs6qMBizlmYVdYAnQZSYXnPIyDM+SuwETAAAd3qN7DQPwIKYEgI3UbRqoaofosPa1LZ0tlXtTrV5VJRsj3AgslM6rayOAvAywWUalwTwh1/cdg9i2rTI8Jq/KR9f+5mnd+wiipvqS6TGbUJS1pKRmpS8gyiwzgEOzEoJj7LL0qY84KQ6pfKSGpwSozbuyWS5uIZ+xg/WKTje0wvHafB4GRjLZL8Gq4LbB4zN4L2oeYT2dklQzY7sJYK7GAPoKcTeJRavHsBepVMMOq3FivMsMz5McdtJUUrCFF+c4H7wE41mNofkxYd63eBciZD9vSBOomi4DLBZi5RYDF+vo6Up+qCtXLC4VK3yAv5kfGODOJMo6AC5nylxhvf/jRh+U5+bJOrfvssE35psOQ
hpxEhBoZ4ExQ6f4i7g1h/9QOxCM2A7Q9jeUPoJ37mVQQWYFkVuhCcqHKjyo3tms3xbsFk34bac0t80/9Cyq03fBoeZVB1uZGO5hR6KO7wMLN/TRsPwJwmS/LJx2V+FHctIfWZgwjANVgW3WwSAlVxRIYEKd0UuJW9z9h35HXGO4jHTo5AYTRZomOdlIlXfdvr5V7G2waCLBw00HIQLu23VfAbFz1lDG3NSlxtC4YcwA6gjIldbU/0U6I3/K4RLLG7w1YCDgsEyED3NOOaX+WtcZX09EGJACGPJiuvdtw9mfHF+/Th8gnWej9mB8Lpn/bh7QSkAtgIg2h79IBARWACTZqurb/86wEA+Fo7gUWhbFolHqbyGpuqnmNRWLRgMVtxjH3sFCuZb2YnkfG5tzVn96z9NOpy1/PC8+OW/mvUVR/PQ/1pntVExt8l0gN3t0MvmtuIYrpDBwtPzrVAnQ0HIirC9eieSu8Ybqjpjz4++W09Mh7vXThGrBodEtDMk82AqIeAouOuMPLkAJDCnxjKfC1wOKf/PPf7gGLDHwwJA6EjrtZ5YQR02sdOdtvvHQAon7kGqhwMPWDk6F5cIDGoQ3XxubR/FT3fILC3CY8X/8AGvz5qRZUZJUSPJtWmELTSVpea56Mt5dQbru8NGc9espd6Ma98SVQdarAAheZSI/2DUoeVGuCEVJdISpQ3LvEq1KCBWcCZZJgWNX1NlziR6ViDMa7wGyds0Pj9vV4eX49Vj4HWGzDdJxmShOGT13zN6/L5farMnuErQXM5Dzc68QozAfqIDcyIjL5nH7IBACqvSyigrPGjPwKzII69oIcmSAlJZFWwIA5++wGZO6KLMMmCZzN9/0lBQ1TmAzDBMPcqH5kjUiCMF3caUuirUCYY5YDVSVpdXUd5rACC0GF6iNKAAQoVQdcNQ1OmSvynTRh6AQVNszmpnRCwKEtQTJHuqqryOSP0CZlfjJrDCBI2Yi/Vo/ATbUfVoiC0VOt5/mrTcDFdjaoc5M/wcMyOuTvs4fFd957mqVeXdJUplE6WEeWUVqYL1cdUk1LNaGc2GwILHaYoXZmum5qBsNGv5ihXl2B6zvfea/cYu8QVdBkhN0Z/RA6b7/eFVLBDCPVgmGbgunTgPvWbYyeYczdOK/ae9DKyaf0sP2aJxlNHa1vVb4OkC5sb28hRfkC24+XzJjvp2+4tKxG24sw/s6OKm3yPKPskfoQ3rqxsGFGB8opcx4/5LeukuSMv5sjnkXVyJ29adABqsu0sQ129nZ3bzcsFOTV5YbR4ydd6eFVtatJ6KK6ms+xSyAvp6jiabNTpWSoDfEnI62kT/BX1e5qW7mhvcdGhD6gNEFwqWRDwOzVsNb5DmpIgoqXLAu8yx4mGlQrjbMP1r7B8qn32NPlFmAIOjk7T6aiWiZwbfSG0GmzAca04czy489DCYx1UetDAFrrQ2mKO7W7tLBqUQILpSrWmcBBVS3LKmBLX8DdMS2rSnWSwqRPiEiRSCdjG1dBvvVlO+CxHmQnOfKVvRN6J17rGx9Z5Yz2BWLIMrLX3FdQQV80PstlnbjsNeOEYNOJEZep1sbiEWpQ30FicRuwutiAReKt+ehy0buYtxyUyfSt67h07k2KUDNX857892Lo33Qxpcx9VyP1/6tCJcUu0vpemvgf/12Q5FE3nqtT90IHj+6ih155uvy3V12Mneea0/qr00A8caReuDZfXqWL/obAAmIMjyEFhhT4VlDga4HFP/2d3y5/8N/9IIOeA2CYko5B8CPYNxrsxvz2MYE0DuR8AuvHD/d8BH1fR9XuUh8c7NvAnYG2i6e+7QIYp/E4k8yHMP7iVoGFzMeg1KIHLrpwvY9EL31z6EN/8HfQzxPX5KfLt950z4mbzGBmAQnQhQ7zEyNmmAdnPp29n+UMqBBYoK4zOlaBBZHXtLnCLaCewMcd5vJybKps30wAKkbLRxcAC2h9xjkm4w0jesk+CleAiznsK+bYiXsWmDPORnkj6KffwASqvw43Qe6gCcyOOuOrMHy3YfaWARbqgrcZYCUNbSOwA5hud7Y+ggGS8TUGmXyZPqNTuhLpAu4akKpeNSNzIUPCIS1cvUgmcJlVp1YAFRpRLzLrLnMpoKr1IRdE2Rs9CSfzlNlw0s0qRTB0pi3gkJGXsY1uL1fTyY7egCOZHhlRGU7tPWRC3elaYFHVoaqkwXq03ToD7SpWm6j2vEDa8OzFZtRVDO/KQBpuv//0MbYB99H/BwQGPFHn5lFaJF+VwT7SkFwpBYyjKkc7qFZtARJevEJliLQsoDPFGp6vI0l5/73HMch2ZlvGcxTaSeN9wIjqPc7iZ3la3FRRW13fYHdp1YjYYRr6VZ16wTShaHCWScDkYdnMn0blbuC3hRrUi81XgBY270MKtUT9Cy5uIUmwMjW2t51I00iHKFeNp7Zv24SAK0vC4tfyy9g6UNgePJvtjYy69jPuIbLqamOk5WpjruykClfbObyCxNpTdIvUgjKr7naC1EPQp26N/UDaHZG++bMdCIrmBcNIxIxHIJH9VrQJ4tnDWfy2QILg1mapX5cEfrnJRoSvdgIs3MPENuSSugL+FVTWnty/zYaR7OBOHxljYsDy2iYisSD9BoiqdKJK+pTyDB4Zi9JHGBMFFrRH6eeCA7YLgbvG/Y5PHrYRVaO0UbHtXxAmwJVwgk5ByAWnfcO4Y8tAWEMPjksVYCTK3jiEj+rQ/RI87+wbLn/snjkBFRqzM/Z4BlTw/gpgcQ349bR/uammiz7cY5nZJ7Sh96HRrVlstQAbo+ar5Wfg+lbi7aFlYiBrubVJdDTRq9+BtJK4t8BvX1N+nAwvbar//MZj4uWne+PgNECSvK0RdvT0wTbRspG6NL8eg0GrS9ILOOgCxH/37qcuLZ4vvUgukoR3Na2UBf9/7+/+R1/y/Zf3aP/q0bMRYCA535kv26FnxmH65dDGYoBIw9shBYYU+CkK+P11vPjK5Wb/6W//VvkDVaEcph1gOOtMmIxmBRYOi54OQjkd8vPf/xA6ZvmUQSzjaB1M+yO+nwEH2aRkbL1XNZa8CqiI9ENmXF+dN90aqFAtSkZIZsz0nB2v0Zl+jacXecsLV2+d2/OjEf94Ttp5rgAiwAOPMoWWX2NIl8s0XteXV1JRV0dCPYIP8jwfapkipQQylc54Rnc/4Voc0JL838AQ30xMl4Ox6fIMqYXA4vnFTTnEANj9CuDCyjUz5UxNlxlWg5pDFWkOYDFxjZoUy9BeATZu8FPVWdQhB1jAPDkbfwtde5ln1bFm0JuWXoIFVVkEFerMq4t+xAy0jaExOV47Xjb1L/Uso2UTOKmCIhMnnd1zwRljN8hbjirUEkztEm6qiQBCYFxC/9Daj1WVAmisaiMMiEC1RZCQyiAt3Z01dw8L703HmX2XZnWfjLQn68y6gDnz4yfwVP0muvkwRvpxVr82dFS+mMl3ZSZXdNoGZDizLPh7ylKzT995h+Vm78Aos0oXeU6bIF1XzGoAJqsSwfjKDLd8uTPzPgbCLj+rmlAM33Gz/biU61NAyxoXFXI
4AABAAElEQVRAa4Y8yziOUQcypxomz7Acr0BItR8qJQBtniU/F7FTmW/AIu0GgpghjrRMaF8/+jC/1JNMuntO7AI+NbZ2RaetrU359YAxDcpddlg4Ij0C2Ggv1nf6tTTkNP9ZphV369Z+pJQq4JK6MU1n88+wV7Cc1qsrgS0jpdI4vC0KIECyvVsnMjCGU1olsBC0CcJd8lbm27bnilOCFukbA33uPe6z6eAGO3Qbv3FEwgWQViKRtgI1TMf4tD+w/0XCAeB7uckSvC+3y2fPt6ELy+ZSJ1JvmjYkCLqFwfY7rGB1K/GjjkhZtTPRdieglTgECjZcy2m+qxoShLEdk5/uJmnavnWxDNJR8LmJnYWSHRdE0J6nqjJp+1MBhJIgJRenngE0AnrywCntkgaRpq0nPRLgyDMJJk1/ONpr3zkG+rIaZuOPSY1RpBUj1MsV9ZCVoSwv9VFXiGK3daSmARe0T4HFDPRcpE4fIq14CrB4FzuLlWnc0+9rmvW3y08yIQ3aaJ5sQfF6tPz51Ny6V3Xc5aH6kYoe+Pqyx+qat/mJ1+Z/0Llz8/JTcbR424s6htTaw7+ZyEnQjrbGnBhxD7D4GXnrUtV74qhhBrOgiz3Y9/546YcaqkJVmgx/hxQYUuCbS4HKb/0sYPFbv1n+4Ad/lFHRcTYMiIwCzGAY626Q9QPmnx/d+kHrD6QOoA7OXuvnhoj642ifcp1z75UfRcPqowufWW+ZI9LJi557/ZA6j9iTXCQv/VlrvHb5qPG2D3aGdRIKqCDFpN/5NRHD+RPAwXPKjVO95qUeYldRZ/PrsqvZLI4P9Bwzt+7rkJ2UwyCSd9ODjjLulzAyaD0xi81MKzOFZ+y8/apMls8AFp+cXZfXLv0KowNnUkZUUwJUTMGAz6IOMs9U5iSSi1FWjCqoR7G1MypRbJZHvDIw5mEJVSg3gNOOQAZaUCCwkHHZR599H/WnXWbeXTnHJTLdA0DmN3kLQ0jxOEIXroIJpR6qQ3kvvZUWyIgLNmSQXXlKcOHqUJbd2eRIFLr6UloS5pRZ2YAX4pWSopiAQuKVUbSd6U8GXuBgvgUUGso6M12XYq1G2jZkGULrVf198xOpA2nKgEZvHkba1X40SlYd6iUShj3KroTJje2ePHpU7rL3whz5V11H5kL1K1dd8tpUS8J4wlRrOCzgsF2fUU/uRi5Tr2qUalEy75b/EXGrEiWAOKXMFI5JYvJHebSpWKBuBJ7a/AgItUsRVLhcr0bOoR1+0x5JK/2JZ9tQY9zNn+pNrljl5nav2SF8E8mFUhpb/BIAx7p3FtqYZGxlagVWSkIaE9uAhXkXVMisp+1aP0m7SrAEC87CS2PLaNlsczE0B74oUbHX268sk/5U+XKZVuO1OG9Qv9t2Yz8kC4IZJSHuTm68tjjbzaMHd8t9mP+7t9ZT/0po3BRQ9S/zbJ4a02+7VMXJ4cE6++KlqmEY7XO+YUlgDewdO5bpDxvYVTwAtNwj3hWelYQRFU2wAl13M3cFKssZFSjC1UkBmVBpoV/Tp/YJl1W1HCfIn0DBfuWmjFsY6B8h2dFGR1AuPbVHMl79nfDumD7nZoAnSFSqsbVgTPBN/I6xnNZ5HbNaTyRpK9KfXL33qO0jEyoSWWKYd+wqBBUjtLtrN8Uz04IKaHZDv3XpWUHFDeDimjoyr0pc17EZery6VN5DWvGQ1aEWkFYIUNuYnhSNqzuSx+6+ubZrct5lv7nptZWojrE8NQfK4u1A9DVmHHtuPb/ctHsi7zHrA241cJdyLwJdK40TgZWbMPVqOdvRyubrQfe873vrZaPWVw3df53QvSAtTh2M81eG+1hUgg1/hxQYUuAbS4GvBRb/5De/V/6Fy80yKjoc8s2tDHX3scug2A28ARV8wGQMO6cQpfs05N44/FAa2VsDqh+Q6syr+Kr+E1f3EfOe6deaDvf5q4OxA3K+oeQzkgvyIfPiHhXV6LjGkYxZFpPo8mA+wlhxrX8mXYFKLQsBiP9tYFFzWYFSjTsz+XyI52R6YW6ypwPPrhzj7tAySZm1J1+mrUqHzN0FV7PjLLb7WdxMz5c3o1PlGepQPzq5Ki9OL8s+DNEIjB6cDkvMchLGFaFmsK+YYTZ96pQVo9jfYhSphZINwQVZDsO8oFEtTME6zJPLgcpYytQ4K6wBs+BCQ9MjmZswOTB3MHYCj4AemBtpJM0stXQPjWEcAwIoj4xvZUAtt+pfc5GQuPSs6++7ao9WGZMTdcb27AxQwyyuM7ky/Eo6NP5VGhF6wQDHaJlZYmljI5XJsi1JQxk9mVXvBRBezZ9+PHvAk7Ayf8m/V+imDYkGybG3UGoBY2s579+5XR6zl4W7RWt0LLCwXal2U4GFeah0EFReUh8u0auhucbIZCq01L7B/TKUihxTPlXE7rv8KpKQFSRHZ9LVQtE2zbsSHu1vnOmvoKletU1xSVbViupMeZW+pN0T1jJGDYpyqXal6s4ZZXMDP1et0hjaVbR2tpGiAJ4oBHYo/Z3JbfOCNg20bQ/0gpRXt0hcYPJpQgGRAlJtR6R7GCrz3x1VQlHBY1QlaEdKBoxHtls1uFVW7JpH4uCeJr2VnaDd8xcvIjn64tV2VKJUHzoFCCk1sP+6atPDB3fYcNDla9kJNOlTp9oUWZ/JA20LRliJWd2FnY0YAdwCo89YXviTzz21OUHND9pbH7dRfTLOJ4/uxyZEUJFdzonPfm57sV4jpRJY2P99F4ok0TzVu27soT+YJyUpAetIAd9g47ELYBfQRJLGRpKCMPuK6k4uS7sP/bW3EGAo+RJoKx1SXcq2LNBp/c/0vOcnSdsfvUudePXPjt+5meZboALwYDtVihE7L9r4NWPTtXYX1g2SpxHaoWGcOFigXz5ASvEuEgulFevmv+v3AS5JJ8n1fpK/3pN5aod3Nd81pzx2tNZHezN4V0NbJovc+fAj1B21qF0K7X33rlKmhgtN4g16DKSUiOO/0rjeUssDcfXS7eJtl0bnt94nQzV//vbe8dD8J3yXZe/jx/Q6P0OJRSg0/BlSYEiBbzAFvhZY/OMAi+93HwdnHztg0X3s2lfDD1gYea7OKtcBvz9otnE6g60jKP8yZ4lwkHh5xQ8R8JurA3IzVjRuGarMeOZ1/ZD63jSdI80MOn406BznI2qYmiHjrJ8bB3PTbx/tNhNd0zQe06hptY+S76rqU43D7MW/NzB6Agtn7N18zBleGeowOny8ZSacYZcB0q9pV1BRjWFVi3HlJIHFCMDiCGDxCgPuH51el88BFluonVwxuzkCs4b+TJkg/ATFmgRYTKIWMstu3LPswj3NdUSphTYXMCiqXqgasqYKA4ydalFTpCUzrxTAGdVsBAZTE6NR4hdUOGMcCYBMMPGEYQ+jY4kruGgAUgZDAOcsrDPGs+hguxGfzOgsjEhd4tT6GKkz5vg5PUdVhBV6ZKSsH2frF5gZzQ7QzNCHWWX2VPCQtiCjBbNpfTnj7cyxObEcl0gKzF9qTeBJGNU4nDG3raT92DgIcIP6mCou7ubsxm8vNlkhCsZbdbAl6H
OXZVKfsOu1+2GoG2h9ySxeqQpFWtUwF5p4D41kIrOKFqnbNmQIVevZ1Egc0CJwM6/OuD+6dxdgsUT6MN32H/I0BUMXQMZVg/GlTv1JUKPRdsAGtNFuRPqmLLbyrk/Q6sOYWEfaSwiajgARGnIfADCOAJrub7FHPR/h1tql5XLpY9uh+a/2FM7M17ZZBwWABemoOqf0yU0XKzA21Y6eENW4Uh+0D5csPgSoClZU63E37XmM0O+xp4U2N32wRhsjrx9/8kn55NMvyidsMuju0PYJAdsNp93WpYQfYAPxMOet0Mo23doejYMc0P6gjRKugAvCWNlKkz7+9Fn5+LPnnC+j9kdxKc80O4KvIwm5U94BWGhormqdNhS2KQgKXcgLeahSEds7btSZUgQaBE/8GVkNQTZol+RLNycMdpBcbW2/iY2H0grdBZFzgGcnHMyr9h7aXgjss19GwFidaLCdpc9BE6+REHdjbtK1PScXZrg7bBO5rVfrjg4QlTt2umNpWVTTVLfsgIXXK+o2wEJgjGRViek4tLCtOY6t0Y8fr2CHgo3FA/un7YaKqe2wS8c0a8KSjqM/Pvr05aOXxxZGDwnXfHZ0JbIAuZS1lwR+G927CLrwSbVmoEVUr/HGT7vi2pILjeKr1acvvxpY1Pommpaf7mrw9i5RxZ0U8l+vBMqr/JjHt/Kinzqm+n4ILPqkGt4NKTCkwDeTApWH+BmqUP/oe7+BxEJgUQdGhkA+up4O4z7VgVYGzlnrpkPsOOpg61mZUu8z1iZc7n2fj2Wi6VKo920pWgdkQUv9SNYPWgMOlbmobhVYeN/phBMmS76Sp8w28sI8D56m3c9bfdfK09uN248ocXm4JKOLcZp3S6JBpbd5hHmUWXAzulkYG5lFgYV6/qrtaNQs8+zsrZmUcTiHAcnsJCypeufu9zDJh30MYHEKsNhhyVmBxScAi89PMSaV4YKZJXBUEQROYzI07Gcxi43FElKL+XMkF9zfONuKfzOrupIGvOqUL8MczJBPZ2Jl/rZQ21FlR+kF/EsYGNWUnAFX3UdGT+Y9jE6ABUyV9cYZ+lN+60YgNggs5gEUGo43WgguplChUDVLsHWCxOIIsKT6kOGkTbWdkE4y1NACuskoyjSahupM1pcqRH6ZnWVXIqCqj0awMrcyvzLPAjkZXkFIrrYD6lGjZcFAXbEHAIAdgjtoq8IkGJHxf4LUYoPlUt1HQlUrkg64sKN4amyrBENJhvcxdCZ98yPzeM51FzUYN+PbBWSY9/uAivt3bkWSo12Bfpw5nqW+bUf2HdWf1m/dDrjIPg/QQJWubF5HvJWBpz36R1lbm4cYqR+BhapQRyxnu/eGekUl6hhgcYTdxYErVlHXgjPLI4CYpC6kifV8woy5ZQoQo81bv9q8yCxrN7EaUFD3YUl/sO3jJ/2HOrGdKK3YxVj8gLIf07bOSWuCsCsbtwAHD2LcbX/QzuIawHqBSt/HH39UPvr4s/ITwMW57a9rW9fEldW+MKZ+cB9QlvN26tbVx6qdQm0XsmRNgmUftN8JBt3I8CfE/eEnzwEur8jfRaQVSm2UVkQScv9ulsW1zRmHnTllSjupK0hVJp+2R58QOFLwnJS+0kCggQTFeuFN2oFgVXubza0daKNaIvWrJA8JlNJM2/QpgEObJsFF9mXp+lvGBNqH6eWeNM2DfbL1PSsxYyLp0RszptgOPGxPueoHWtNwOGnHAAs6SNxu7AvWQ4CF4ALgIagATDsGuTDDKv33NqpPSiruA4g2yLcbK9rvEz/lrVd+ve3STX76j3oZOKqnmvdaBsvUwtZo9KNbiwnX+l/j0X/v4EUe65jUeei9zU2X3xpJraNBDzUvA+GJ33yYfi7Ji4/9dBvt27VmoXsv3RO8y7/xdXl425+x176W9HzC31AVSroMjyEFhhT4JlPga4HFrwMsfv/7f5gBz+HSgTGgohtDHUEdDGVAZHzC7Prx6ZgTP9IyXHX1qDo4O9wKTvKRDLDwIUMssXcfGwfi7jT+gItupsy4m6F2Y670o7tXwY2b1LnzsQytM+pmV4Y5KgZcI6HwyqDfPtZeUxb8W57KvFiWGj7Agmw5e+g3pi7dmaxn9lk1DfcKmOOjLLCQyZSRcoZS421nRZ15bnmRGYNVSDnDQMpMY8g7Nj1XLkYnyx4Si5+cXpUPTy7KRwCLQ2Y4ZQJHYDCmWBZzAqbLMl/BoM0AKpY9zw9Qi0IditlgmUxBgXR0plTj7RXOWfJ0BROppCL7OcgIwuSE3mnJ9WPnevzJI1fpFlDGVfWblN8qwj8sbmhkWZ1h9hRQzAEulrHvUGqhepTlD8MPY3gCk3dwDGNFmQicvQDCxEMO25GGv9pqNKNm3UJp3ssAWn8y9c72HqBqol2I9aVUSAmN4EWVl6wUxL0z5QIYZ6BlmAVpzu6/YTZ/G1uEbRhBmTeXg31w/06ARXbwJqxAxnYRYGFYaOfKWbbrBixUJfPeMrpMp4a4Xzx7HqN4mcrHjx5Gr9+2tAODr19BprPlCJ3S3qZnkSrBhC/iluVnYfKUWBje8nuG8a+dLtVlvjxtu8Ypc68B9xsA05udbWizC0OqkTBgijyqAiYQUv1OECaNDJ/dralnVZDsU0ZunNqeWCZ3A9fWQ8DT0qvld38RNlcE3LnXRfZCEaTY7ijrFFKrZfbvuHPrFtKyxdTLJH1AqG4bfPH8i/LZZ5+XH330WdmjPZwhEcjeFtB5hDwqQ3hwf728A7B458Ft+pM7hEML8i0tyGzyChXSLgLQaHvWs9KTH/744/IRwOJzjLdtv/ZF9xV58uhe6vnO7Q1WVu1W3gpgJV/kXZCljUVT7QqQpO3b4qsdiS1fEGIbsE1UqYT9xOeXAIpNNsZzYz5Vq2zbLqTg4gn2E2ms5Mp2qwqU0gtVx6yXOmZWUBHJoeMnbbMCHPNgkYm0G+/aM80oEzXpnD0/0AdaaT+hfYVLWisdvcHGIuDC8Uh32tmI4w9qebOcK/TZ20xCPFicLe8wZqzSf+dtf6RryVsbMG2HMJtM//A74V//qGNr/zn5T0wEdzDx4NKiMXTG1/qmlre7rxH3Y293b6XR4iRMTasGFjJUldYWWT/F5mJGGjjzvuXPKJvvvt+OFp1DzUPfVwvbdxkoL2HMe3vX8jkEFoPUHd4PKTCkwDeRAl8LLH4NYPF73/9+Pii9wjHCtsGcUTsfeJl5mfhqzFslDA6qMoDOuqla0Ay+404EXjPwGpmjtgejbAZYP5zdmdlZP2gdwyM/4QdOwJAPHV4DAjpAYB6yQlPyU1WhyEY+zOrSO/NnfiKxSB55Nm3uAyiYybM8MikNXJgv/WhSGga7y7vZNue+M4xqRrMwYzFu9gpDEVsLZgZlEmWILJdxCLYsk+olqsFMcU4yaz2O5OJibJKN8UbKxwCLnwAsBBfb6q7DsN6wSpTAZRxgQVHLFQzQ5PFBmcPGYuUUdRcAhsDiHB15bQpk/GeZfVTVaBFGz5VeZCSd6Vdt5+CAVYBkBqFLPuj4tw7qzK1SAoFgH4RR1N4Rhk76U3alEYtz7
PANI6K0wnOJNN2kTYmEzL3MoGUXyOzsYtzKSj3adsiEKl3wCECDdi6NK+O9jBTB/JNM8kLWklcNXTWM3Sf/zvjKgBNN6kEj5UUNTUlfMCPIi6oLZQlDHGmDNGBGHwb0CFrIuJm2y6WaZuwsVMciX7bBrEYE0yko0ZbBTdA0vnWzPplE6ZVdxsm7s9Rbm1uJVwDx6OED6p96he6v2LxO9S0lWRsw3dq8CDhn5hbKAkvMzrEilIyo9hXTpO8Mv+0w5eMahjp1lO6SupJu5j/2EQImVLHevN4KuHDncYGF6k4y/geU+Yx8GI9n+ht5h3S9dFp6tf0K8AA5giZocUM6PbACqIjhvxIRJCXV2JleYtumXJOEWYCWMvMCC0HKlMwtdXhFng5YwWprWwPrl+XFFjtvs0rZOR3Kfj2On0nO+3cwsr67yrkRqZLSvbrHiIBfJraOGQLHqJVBf1fIUtXtT/7so/IJalAvt/cCSDTavovk6CkqUHexqXG3dsvmDtMux6qkQ/Cu7cwpgP2EdiHdlHSlb3RAJmMXfuVAAwRsF4TLRpPQ4gWrUb2hbx3RByPJBFRENZA+4rhiXLabtuSsO4IHoJgOtEuc5oV21CQYqmHlnbS1wiWi4IJbaeBPmzDJuKobaYFgMZTBMH0K6RjtKTYu0o2ALnFNo+Ud5ad9zqguKY0W58p9+s8Dzg0nRhgzptJWbHNJ3djrQbq6NNfkhQevlrMd7X3fZ3sTj/VH781jP2iKWl/gGPf6spUzTy1ci7bRiQgllUcNxU1ziGv96WIccOluiSfptMA8ezQg0PnKJW+Iu2Wl5c/+0Bzb2xZdL3wX71AVqkeR4c2QAkMKfEMp8HOBxe92wMLBMgOmA21uHLDrrLkfS2fQZcwqsy/z7IdOQNEZI3JfmVQHat75k39/OD3aoNwYHp8Hzrwn7SadqAADJgSGS3sKGaIGCpRayPASPGlFvcdZ5gCd+rEQXGgkmeTx18JOEFdsA4yDOI1EP5cCJG4aMPDDYc4tq2mZpqpGUceBOezNoMPoOtsahsg8cRqu7ruAVANQoSqM4EJgccXSkCc3o6hAXUUV6kPUoV7CjO+ictNUKyzzKNPdVxoPsyLU1MGbsniMSs/JIatEHbPvBTryMCqYHQTczMOoxkAaxks1Exkbl/oUWCi9UM9bpjv5kmgcKZ9lxDH1hbP1EbpTBq+qpkkjQYUb8bnR2ALgQoNYbS2UPGicPeq6+UQkA+YKQNn3AQbsJev8n5FPPEAPaQjQIo+uYrTOSkouk+pO3qkH8m0rPIcxP4AJfbZZ9yc4BmSYY2eq3SxwHjuPu7dXyx3CmnZmuGUK8dVUWvQrwy0TKlMX6Q6ltw3PIy1wZagpmK0AHsrZGEhnr09Qs4mhMWFl/qIiA2sugypQNO597Bq05xDU3GKlKetaGr/C9kKCuvfD+sZadP412p6eY7YYUDENwNCA3SVcVQdTLS/2ItBGcNHrD5QltdTVifkzL4KcffazUGKxs/Wq5hWwIZPsKkxKLdyrwjYdcE685sd6dDY9tkFcBTux8SAvgkIlHPoLSKFtaaNzIDglTo2V3Rk9kwv0nfRPxwPyPytQwxh+gfLNwcQHYJJnwa0z/e6K/gZVtC9evi5bb5B60M7tH8al9O/+7UV2xnZH7yXqsdp5uIu7h21VNluApH8lLLYTlwJ+CZ3/5Ecfl89fsNTsQV1S+DYrQD0G5LlKV1N3C+CnrLa/qB+ZL4CFoPOY3e5PyF/aFWlFjStjWh3XrBcz4XK/ggSlNu4H8wqJxR400c0xQFAhYFVyJwDyqEbySi048Wd/jIQQGqfD4ScSVoBcVjVjkLGO2xhqHG32vfZT2z/5in8pwyEoZhxyQQgGGO4FdTXPjlxXgA6XuRZcjAL+5wEV7nnyFGPtd7CruIvEcaEDFY4jjZluzHHSIKm0Qx68mnJS56GOjvHV89P3Xd17v5TbotMs+0ci0q2fYmXWu5gN03wbLmFNuHPlash+nBWIJshgOjgMxNRizDXpGd1AnEb4peDVbxey0anmtQvbBUjI7r5FWQPXPA+BRUfE4eWnKPDF9kX5ZPO8/DvfZU8sP2fDY0iBf00p8LXA4h8qscDGQtUj23HGQ8a/tja6rTs2D3x11H1WFSozqvph1AwDLrDgwxt1Ghl5P5C8y6CqP+IdHNTzESHeDM4O4Dn5qHNNDrhUP7wzfdxN11OAI8DoAQ3eGbdpOvOnOlSAhR/o5K9jmJMZvsOEl0GRufXqWRmBCiwsQ8CFcRKmqkXV3DdgMU3YqAVFWqHUwhnzqu8vbWQ8VecZ4aPfgMX0ZLUrkJGd5LxhJvF8ZLxsso/Fc5TPn2Hs/AUMyCbnDrObYa9hBOHdYqw9iqHuKMBi9rACiwkY35Hz4zIOg6uRt+pB2nmoirEMg6ORucyfDJjr+7+BCd5BbcPVgCxjSN1rsKl1SkxaARHQBVpH3aujkYBq2dlNdua9C7OsIbRgyZl3z6rKVJlzmScZKVVFPnv+snzMqj176JlfQk/1/pU2zMKQLpHXNTZdc4O5VZgdGTQZO2l4htRm7+iU8NpHKHVR/QbmDkB1w74ea8vupH0flZe7MHPoz8NMVVYAQNe1TcupSpPA4hyJhypOMnDW4zygIjtHw9xH957QtuFqo6H6Sp3NVlXGdN3LRH/j5NvDTehc3cjOpfsC9hqX+FGyIuCQAXY1qCXKZ/6s9ymlAhjvT6EKpzQjwKKrp9puatm9T/vvSmR6tkD7m+3RfMqsCy5eb2FAvoe9BfdulucytEpnlFik79AGbZsyxh6226igkR/T9xRgSD/38pBWqhjtMSu/i2G6zLOnAGoEkCsDrbG3fcYxQgPxaWfrAVGqA04DmgVqNKPQ2nZWV1K6YCEBdqlml+znMOXHMNkCMjd3fHQHULGGmhjty92jLb/twHpsh+1LQGTb0dD+NcvXfvzJ5+XHHz9HqrWf9rIOyHznwf3ywbuPy+2NjexsPut+JeTXPkkBkqeoQlGeI0CFK2wdcl4CNHgZkKf9tmOJxuama+9wqV+BlZI/7SZcEcxdv91xu+7tgmF+wDYAkjyOo8pYAXa1s4g6FH3CfuG4ZP063hl3q9sAR9K8pO3brjKedgQQbGRs6oCF4MJOPELduSEe6I7KxUBblSjjBKAThAkMQLDgQmABHRYBFnfY0+O7LMX7ZBlgwWSBq0AF3FpZ5Kbmyuh97g7iGnjClwc57xzjN471zdu+uzgMYdl9bHF33p2G6dU3NzrnVV9nKVH2/BiF8XDopeXVa/PDvA1HTaAHAHTi+PKz3np5473xtPi/0r+OHC2eRgfdDJnidXHqNngMgcUgNUr54eeM8zDUGwvj5W+9R1v+muP/+gl97+CyPL41WT64X8fir/H+jXv1v/yf++Vf/d+H5b/41Tt8I9OAv3FlGGb4rwcFvhZY/Nr3vhdgkZlpRlIZAgdZmRJHR6/eyow52x91DUdNB03/GMXzAeSDWMFFe67gQhLjlbM/TDtoh3kisfwlKYFN88PVPqU/mAFnKCc7UBFwYT4EF1zNSsAN
H1oBRT6+fpw5G7jJh7xGB8PsikIwKZw9YIGbEdV4ACid1EJGOMAiJajSDiUdMtmGb2vqR9+f5zCeARjMqsI0kXHo50emztI7kyvjNQMTMAZjd81KLnvXo4UxFYBxXV4w078JqPA8hJnNMrWscjTC7DHWsmUERmga+4opmOspVocaQyVqghnhyZvLMOWqIwVYsOSsu+kKLI5hfDQw3mETr503e9WIGzo1+kMW8lfpbl4FXgEp6GBrQzHV0cqZWBmnVZZTXWNpW42SLY+Ml4xklTb4Ue8YIBgjd8D+FAPnP/vJp6hEAWpwq8CCuMnrEnYhbormHgNrgAvtRJxdNl6Z9P3j8/IpwOLl1pswpecwZe4+fXN9zi7Kc+W77z4s7z1+EMlA1KDSmmpdWq7kBZqq2uTSogILOwMRxJZA1aVstCfTRbllvmWuBSPZa0FQYlhOw9k+3FlduhJ5wEVmuqnfKaRR57RBbUsMq/RokVn87JcBQ+dyta4Kpr8p7Gy0uWnAQvJXNa8K2nvAgheNrzJte1Fto9cwuEihMN7exX7kgCVnVTnaQzXIJWgFk6eUVcbffisDrz679Wx9SWPrz1WxBIX2QZeyFUwdACrc9M19OgQWBxppQ0P7+gTMshKxtAvCGXdWaxKsBDDW9m2/si154CXp+qSa3/4hjIFAEeacF1Flu3drGYDBCmO0Wf01KSl3+dNRtcDY09C/DrG5efbiVfnRh5+ikrQbmkvfhxhqa5jvucxKVbNIUFz+VlW1ABXito4Fjxrma4NzivH7CaeG4Lpn2KGP2++licy+KnGurHUk2AQ4KoHQ9kcD/kP6l4DYXb5V7bOfRC2P8crxUJDtsrMuPtDCqnbYGwNDWTKW8dSxzKZVx1IlsAKIC8CwY5uSTM+qMhWisIQ1qnS0JxBrNsHDgEWyER1A2ZN2rXTUDfKYASkL7ER+FxD/b9xaZd8K7qlP1UqpMaujd7TbNjbUF50rbcm/3hHnFqK5du+7cr0dT/PDNd7IbXdtb3zMN6N1AD3GT/NRr7aQdgymUcPb5uvbAID0oe65BRq46qd699f7fuxJOuXuAgzEFReC9LPXD+c44ZH0uZrHIbAISXo///x/3Cybu3VJ9v/0b6+VDx58NWD4+OV5+a//5+3U0ZM7U+U/+ztrvTi+LTdDYPFtqclvfzl+LrD4/R/8UQCFDFOTXFTmBsadj47MQWbXGBR1j2SB8dIh0wHTs4ry64dQ5j5gI+86f92Q7Ujv4NqYJ/kPmZA2e5evovHq3vkTQERPnY9klVQILAQVZIw0VHWKtMIPcZe2oCAf6S6PRBWmIYBAZhlmI8DCOJ19xZ9MWz7mXj07cGI8BE+a5kVwIsAJsOBjLbBQz1oGe5xZVY17x6J6AVghZnXWpauqUurUz2FMmmVn0YlmF4Syx34Wr5FcbOFvEybE8w3M7AGzx0oY4Ho4sadQJQomZ4ZVomaQVkyy/Ozk+VGZRMfeFZmcOVU9STsLyyczeQwDtIfEYtf19mEUZZKsL4FArQMKxiH9o6ZEuAXtKJB6eFXyMUW+Y7+hG6BFFR8Z0gooVEeTkezDQuOTIXoNo+s+A//vjz5CB36H5WfdD6LapMiImc9FbCSWYMBdrcmVrYzfWWYZpCNWlvr0Bfr5nK9Y2lOGLsDi6gyJxUz57nuPygdPHia8eSATpExeuDYmQ0bSGeqACmbxsxkeDKMqcjLzShKq5AVwQbgrgJwral0EiHCFQa/6+AITECCtSGY6/YCnAA7jgkYXXGX8LmGgBRWLgKZsfsc7jbWVXgVUwNQLLGTww/TTuMxLAxfmPW3fRmvD5LBt5o+rs9eueuXeFfsAikOAxb6n4AKpxSHSjGw4SDj7tO20SupqG3UJXKUMSmxMRwbY+NyxXFDxmhWnXmPbspfVxFBBo26VTEzSxmbYlTkbQ9LWAs4F0uQ9tg8AKKUZkd7hZqdJu3JcIQ6L4iphh9kN3k37MB4nnnUY3AXyY9+2PxuO/9RHaIGz7dVT901WY/qU5Wt//NHnMPvnoe0KkqF3Hz9idal7qMjdjv2Kth7WS20bFfRqvO5YFRBJ3db6pZ8BGi7pR7ysxs9crW+XtHXDRVfBUi1MFb/sBcP1EJq58Z1lD6inDIIMxxPbYN2DhPqCvkoqctKGG7BQEtyOVDNhPHSNhEIwwan06RRw5zXAIoMjaQBUxxhDtJ2gY0EwbEiME1q7ItQV8V1yvaHduSGeUgv72D02w/s3USN8j9Wg7gAslMnY1NrRbnPtvWiu+LKObI1edOaUlU4ZeKwHT51D8/ZWIj1v1ZNjVe8gvs7VVHrONcH+Y81R/U0+8uNzDdPCmrPG2L8VnVF1wXXXfz/Oem+5PBIjefTai4v7Ftybmqq++7GY50H/0uDv/dIv6+mv5LBv24c82nUwYd3MX1XBY+NHvjeea2t/dUz77/wPm/T9kfLm8Ko8uT1Z/pN/d3Uwi7377/+rnfJyp07wLM2Nlb//76333n1bbobA4ttSk9/+cnwtsPh19rH4PTbIkwEJsOAayQQfJT/IumVA6o2gDpvVrQ2YXutHtA8s2gpNlSEy8MCHh8cwZnwAAygY3PweDg58DtI+Z8YVhiJSApgVGaQwGBkslTJU6USkFR2wMM0sR08cbbBPDvhxLwpn4d1BO6pdMAHGp7+q+mR8FaxE7YT7Wj511CtzI31kgqJWAqMs8y2jrY56llDlXoZGUOHArs6+OXG501mYAIHFFHr24zDQFyw7u4/UYodduHfI/64qQDCnWwCAbZiXHWdSmd29gTGBwy1jMD/TSClmXYL2lBWizg6QWqCOQbmUNDhzLNAxPWdKjwAWRzCIMkau7iPDL60DhChH/QbyEYWelmmGsEuopKygf639hOpVAgkNPl3BaRJgZNjYBEgH7htDbF12EYYxklF1r4f/589+Uj5jhnkHgCPTNQOD4xKnC+p3Ay4EEzI8q0tLARjaW4yjOnbKLnOfv9ouH7H52bOX23zwMMZmNv768pT8TUViIbBwqVQNfiuggKGF+U9b6j7qquJIDz+ezlTLLDoDrQRE9sGZbTf6m0WaIDi5xoZDf7ERoB5cPShGvnxwjUNA1ZbJPYOxVHdeXk/g4KGBs7tqu5ztNFIB7QWcMTd/2thEHQ4GvKmQSTbp6NkAt/Gkn3nD0dqgbdtTqYiSCfezEFC4OpSb5u0BMNx1XNU3w9hfbBNZDpk0tfWwvgR43itxkZHInieo92wjZfJ8Q13JNNNcyoQAiJWYrDuXP3aZZ5cbtn24Qphp+E7gLGNdQafltQTQhR/LGGabZ+M8V3LEYdi6khJlx5NMv2OJZ2+M6Pqd71xlSVDx2bNX5QtsK6z3VUDFw/v3i8DC1akEdePkxfowTSlJhKFHjbuOV7YL61M3++gl/eqcU/Dhimm2gag/0VaOA4YA6YDzXdTE3PBOw34nNKIOBm1VBWsSi9QjI0CkFvRDQYVSC+0uTDeTKfY/8kZuzF7aovSSWF7Nm9IKJT3uB3NieuTrmjKNuLysxtr0kywzCx0hGCDCd+y8TdmVVqAQF2B
RVaYm2euG/SpQZ/xbSCweL82VDfqhcC3pJjdmoB4RFvjCyuMwj+bS/5pbf/nL+xaD7wgivb3prl0UvtH1S0c/TgPZbuxPHrbht47u2dftTfp6y2Mvft92PgzT4tGpn5lE3fmqflr2cGy3ehr00+7zvnnCsbkn0u7H12+VgbSHwGKQQqUILKYnWXZ6faL87z88Kv/gl2+XxVnH8f6xf3xV/tl//6r8nX9rsfzxD495P/pTwOLNwVV5/trJn8JmsWySuVLH4xbLwQn78LAh7d3VCUD6Tfn4JfZxAJond51Uqr5e7KBmuX9Z7q5NlDVUswaPrT1W+sP/yjwbpZKfz7CFWMXPPfx62MQ+eoHKLczHww2kmNN1omAwjp+XxyGwGKTW8P5fZwp8PbBAFUqJRZVMyFxWYOGsvMBCZrqNsBmfGT7D8Dj4Z7Cuw6n3bd+IntSAD2N2l3XQ7b4U7YOR9IijMg/EyP3AGB16+s7NmqbIR0+6QL4EO34c/HQFwMCp+IGPlEQgwB8Xfj27WA3CcxhwGLg2iyuT1T5kFkegEuNt804kTWph+Wp+BBU1P8YRJllmXqaNqwxbVC/4sJu+xHcm1PAy5LOoZszCxGrIK7C4Gp8uhyw7y3iGSlQfWOzA6LyG4duBkcrmZhi8wpmwgd5ZmUIfXKnFHMBi+uyQDfSO0JMeCViagtFzFvySwU1m5gRAcQpTJAOsWoaHQEqApTG+s6bXQWECpxEkCDMx7ryFHrbSD1f5USdfO4GoPsGwNQY4zFPIa0nla2p7aQxk9tGASf3hTz4sH372rLzcfB2mOIAMWqn6pPpINs7juoKahuDCjdZmoA8a7uUFkooPsdH47MUmdHAnY1ShrlSFmi6/oMTi6SNsPzD6hbbA0D6Tk8qnJdguuae55F51F6U22ihoS6BKjnr9ShJWWbFJKYMAyrLJyMpwRjUKqVFbKSrx2SYBkqeAP0GK7U/phDQynCtAqfLms3mQSZPRnQFoNH8ypK39OcufMphRjtYb6pPtuJbDq8BCmxGlN4fZ0+I1kgaMiZVYBFygxgSwsD9o1F6lWBrbu4CANhDUE+6qf7lksTPyWZp42z0/dsoOdVZVoOpeHK5elT0xaDfXMNvK92aQXihxUqqVTRKzPHJte7b/KtWhb6U0tRfWVmK/1ZVfyup4U6WHFVTp3s5ab3VskPmXmX+NPcVnz9lNHXUqJWC2lwfsIfLe48fl3t27AOCl9EWjSd1zI5jWaFu6DdJYIKF0Ksb61i/tQomWkplIWKBfdjgHnO+jfrXXXd1sUYDTDLGtO1cmU5IjgJu0D6Y9AqIAAtUYHmCRtOqqUCELZEhpLSj/AY2WnufWXi/I96DEgkekEjA8SJ1uBBau+oSqEw222ygPOtoGSf8SAJIlD6xrgaRLzNLHHq8tlb+5waaG9L9l3GsdJUctWzVjPn3pZc1bdazgwRbJMxnW1fJ4RAJBOepzz/Wt+FrUeZsPQ/VfR/YaT4uh9QeI8+UsJQ3bUj3Mi/no0uxoa7je0fOLSy+PNWs9X4P+ewG5acnErT0Qqv7/f+y9CZRm6Vnf91bv1Ut1dXX3TE8vs2pGGxJCSIDAQiBiiIGAQYBYRIyd45PjJSQn5+TEcRbHsY8NBoHICYEkHBxbYiSBRRCrQMBhkSyBLCEBkQ6LpBlNz0x3dVV3VVd19VZLfr//c9+vvmqNWmgUfDyjut237v3ufdfnXe7zf5/lNcHN0E91S97ftS2x2KQRdx1YfNMrDrcf+fkL7au/4FB79ecf2hLmnX+w1H77j5fbP/y2O9uP/uLcFmBxA5Dwr37zYvsoTL3ziU3nt/vF90627/iKI6Mmk2n//T9Zad/6V6bbw799icWNaqBTAIP/7KuPtl/4/cX2Bx9FO4DDlv1rL5tqX/55SM+H48d+aY7FlJ0Ak13t1ylPb94vfu7+9uoXH2r/mjI8PgCbvbi8+xtIXu4HtHh8JmXctrEYCP65eLm52B77sz9pjzxxqV2BXdt98Hi7+/772xnsEAunwpOy0Hn54nmclzzZltb2tzseeEm770gRa4NF1+Un/6R94CNPslfagXb0oZe2l93NgulfwnFbYPFPkFj8y4cf5rtUTL4DUzsCVyBrV+tC3Q4iPyrjR5+fM73yyo9/VoUZ1F09Kav+xPOjmTSSAJ8J8+PDJwOXvE1sSLDnsovfSgd0g9hXRS2XcTyUCHT3smus9MvQqULgYW5+gixyvlmkRZa4i1ViUTtId6bOFbeAC+NYzqG8AgyBRYELvt+Ei1EzH21BRZiJMBSl1iKwcAVVhk3gZJ1dsXTl01LJbKgK5Qq5wGI3jOfG7n1tBWChKtQsRtwXURVRYnER1YcFGJHLnKpDXYeJWuXcwL5iNx1rH5vkHcxO3EgtkFjshXPVzaxGwzs4BRbXWSm9CuMs83gTkCFoclVYpk+vTPvCDKIy4SooHIv0cUXzBMbZd3Lq7UdgEa86lF3G2NX3qEDRR2y3ormrzF2iYzhXm/DjT55uqPbRRx/FyPYTrDKfi5657beHlW1dzMr0Ci7iuparRuGuQLvavwtVlkswdB9/fBZgcQHmzpVjjNY3bmJAPtVe8OA9GHCfIS42K5Sr2lAVFHsA7U7beW/Z7G/pnwEW7IgMA27Z5lilX8HV78QOVqhmjsZF6bFjx6i34KLiRH0KptOdpGXmNco1TRloy1OufNdiwK3dgv2tbCk0Koe+hqcs0m0Sg+5JjLgFova/3ge7+pC0MW0PmSmSypH+zHO7t/d9nw4lOAt6h5rHyB0PUQvYXWgnoaRKtT7Tl76qm7nruWpflkN6OFaMr+ewS9Dj3OwcaeBGFXscGWfHbspMXw2woN+s0hd1cbybIThF2+luWLe/Si7CkNP/0z/oK44B24T/NkLGQNXGuUZpZJ2O7U4Logy1pq7k3+cTwZvG0k+eZ/8IQIUqVUpGztx1Z7xA3X/PPTgAwAOX9SM9x13U4JgTIn3wN20QQ3XeS0Xtbm4wnlQD09vTCu2rREH1rNSFcl9GOtEN2Rfoi7qbVXogIBG4WTXbTimhkstJ7ZK4xs6EjhCJBeNfaZtAJGONem22p300jRp610JMMc81hwIsiHeDMWp7MkpjL7HOvIGvXyYlQAZjUSIrqdBDlKpRqj8FWNiVHLc8FwSeQVrxwLHp9ny8Qh3DHmSSNt3sZUPrVPfbbK/eCX3C/00m3mb1WR7mOXd1+GwIn2t/ns5QP0bJGhJajXpIRCU+G6WWCFVOaLP1sbnUOEmCQ6pcKjpvDTCeVi/D8KwnZ8x+Px5+9MxSbCa/+cPnpDUq71hCo2eGMV/+f9drvs1f/14O+19vr34dz9hnltGx4fl0VaHe/8R6e+jYjnZIYflneHRg8Xe/7lj7337hAvs5rbf/9lvuDLlMynno+376fHvOyb3ttV9+pP3A22a3AIvFK2vtV95/uf2VFxxsp5B68Jlrv/zvLrd3fXi5fQ/M/fPOMFY4BBa//UdsMAuP8w1fcri9gOeCFZ8dhGs7NLmzffOX4oacqyDhPHYf/x3Sk4OTxW8ILM4CHK
6ppxjuLqAQLGExF86REmviQydekJ6DL2OPjo6U4DFL0JRt68LgmWTJfvJVFDp3GN6e/hxv56Okk3HvypfjTv8DyhaZL48ph87m7eVroJP7zpPOxuXj8fgQVTC4blMMc5/czuHfkXpC/f5brH5r4U9nduMOfhik3jCc0upqCsMKoiPSh9y5XfGUc2UR18Wt1t6+N4eyTG4aH/RzvUMrivhfttXIkONZ4e0vloh+k4sVxg88l2JH+0OGbujyEOHHpf+/mf+7O24UUvbi/44lvbLqv3gT9tv/pLH2wH9zytvfilT2/t/W9rv/vhfW3Dl3xbe9nT3dPkaHvgz9/b3vmrd7frvucH21ffaH6RA04daR//7Tvae1a9pH3b0+9t//qnjrRv/acva7eu+NP2H//3j7Sn/I8va8+4lkFWvX+Gx1WBxU+y3GyAhbXfk/qufbOdqh9uR7P92HuN4DAQYVcfwZ2PooJ12TbzQVeYwI9C/HkE1Mwr4ENtfDZWYg7o4Mmkyt1GNqSZ6KEjphU2OiPjuQMLBVFNWzLPAaE+gjVu+rMT72Ciz4FQQEznbn6kgegMo1Yh8yN4r7Di4XuFsoAKBCE3wVJTk1WiSGAlwrOrC61HYNjIOS+wUPAkLtPx6vuYZSnM8CfPLABX41GIUhjpo+EuOSu40CxKwS5mVIkLYUlBBbq8KnjlCt0ZIQ9/iMdnTvPH/3CVYSnG4SePS37kQR3FEwtGsGdnlw31ABOCKkdnFbIcbXVE2o72NO6PMHJ7F7slf4wVch7G5Ejb8k3QuR1eSIdC1AmBBfy6EY3FDazstBkzNTUbDtrwn3kbrqyxVjfCriEfPqeftS4gIGdkHb5lQqzAInUQd8rID0vPt6JWr19eQ7PlBs0KlgEJAYDF1wASBEJ55+h2mcpAFWEFbgqhARRDmVq3EwdlY1033PgoOiyHsavRWeeL26mDvL7Sc3cb+zcm647H2L37zQt+BBNWautuaW4KWFiPBWTnAQZqfk5mh22EedzSjqUNvhUgVhPjGvK0WYT9kwCKxWPsb+EcDJ7ldzY4pKwU+HU7DWjL0sq0A7U4qdvEaT6NU35ZfwU61n+1SelXhjI0jsz1ELhQ6J2HcktgWeCo5lfYntwrw7ZlOwhLKT+BvmZQaiqiXeJeDVTaCnSkfXBNmep/KPNeFl77SLpxuoCDdce5OUcAzc6tuGf/wbbv8JG2QF0/FxMoy6O4bz2hQnCixSK/0uOGd2osLtqOBTvSSH1xg0z7ra3k49atm9pNAIvrN64L2HZJZ0JRGtNjfG/5etRl6k864q//cI2fTiBhLgMW+pUnQ5zTeCuuXr+8Tvhk+zLmURjDeRjdJExc6qeHzVPoG71cdtvDj527WwWt/m1J8uShH/3OMD2c105DD9f99XDJUng19RE//HSX+NUx/oaQ/WU89wf9+H5KQ7gTP0M43xLP973ie6cOn+U722XnQ7+Ok9RNmuwTPD9TjYVKhtHA/jiJ2f2MAzMOfCocOHNPe9u//0/t7q1f3J7/FV+CaeGlduADb2+/+r5DbcvTX9i+9gW3tzX3/EH7L7/z/nb3pi9t3/LSZ7SdJz6R5Wbft+9x7WXf/w3t9nkSusDiLA99oL2FhQae+y1f1W5d8yftp3/4I+25/+yV7RmX3tP+zU/ta8/7u1/fvvCaze3T3zlmmpFPCize8Gb2sUgHUx1fBxF2NF1g6B1170bp6vmAK5g5ulsj+X1E177UsGcZ9Y5gTucWk42hczYORabIRLgpJPJDByfs4PDD50dbKVMhjudsiMXVDdIytwEhw70r9JcO0zCcKx0F52Ounxo5LWVP7zyN3w+9gk8m7uJXAcT3WRkKIch1+91MTrOlACPCKDCsZsRUO++N2VeBFXAQHBR63J9CYcZTwOPEcgGQX90aSTZvfOhwcwlKzaBKoGPXZ54FY5pqSJOnwvCqUZzRXkinaRgHESev3Hute7le96abg5FVXtftwPt6mv6G334McerCqQBCoVwtRoR07gUcnm6udxBzKYHFR9gY7C7W8le74b4VW6Qfmi5Rbmcpy93sO3ELvLqZ0eYNAFCLM6PN1AdBwhqkmw3kybkbmkgJMmIehbuiu8JsTKMoiwueCKcgDt6UcGXezLsjyOag88S6ViDSOlplI/81wbExWL8DHqC3zM8KHFoPdK/R5yqLVSNQEWF5KINKt7gnN3327IfpedoOlr/r/n3v0a/9vj+Pr95f/jyEJZpJ+cFb22zmUMAzhXc3xXO1qzMuICBA888w/CT/gApBbSZMU+fPAEJcsStzMowDDZAgz70uLD+1GG5kqIbIo4CZ+a/6GMEeXlpXNWsSRHRNiQkbj+VqPC4vnAnoxKNJWo00lrAjO+X5eNU3yzNllHZD+6M9egZYdFBBGYXnxpk2UX3IpBwsK94JKpxUbV2xnbtZpnuoHEXb5YZ3h6njHVScYO7RuWhgFcCsf/Q1tlWA1UX6AQil3gO84AENOu/sg6zTa6HHfGwHYF+3eUO7lSVmr2GOxRbahwshWHtr2eZp/THvHqbTy13e8ViHBT4+hgA6W479sK/wSBz68d+rfW53z11FbRI5uj8eJukPr6Rj5M3HyUGwRDRymrwbXk2ex37GaUjfkueeaSLvaSUr/nCM/V7pOZ4SlhT5T/574ualR4rHvOM6fIkqaNIZAnjpYanHo4f49WccXyc9L02Lm89HYGF1sn+fHTMOzDjwmXLgdDuAWdN7//BjbR9LDrOWLDLL6rbt8U9rz3rWF7Xbrt3QVp5icvZf/En7/T+9qx04zwA5fcqK+d3tCV/6gva8p15TGgiW/D934kC7+5G17bbHb28rLx5uf8Em2L+9gHx6nl33bnxJe9mLv4BNWZFdP1NSCfcpAQs7wy5YRkuB0KBQ5Ok7O1y1BtzQV/LHrYKYwEJBwkmVtb48wj6JKtxkF1pMa/pqUulVh86ngIUChsLKkHb/IOpnABamq0AusBBUKLRniVk+2gILhZoyo6iIQ5PmNoPQ7wfdw3zUCjoKlWospLk0DQIk6ZBRLpEpsDgD3Wou/MBkxBNBplYFUlNReymoFRHEZCQU+tQsRFMSYAA90h76NflBwCW9VWgnBBZZlQhBqJtqBDAYjvi6YOu9ZyaYQ2/eQftyUyjzl3R6emGF6fvGw49nyOHe0vPjpzsHYfrHseqAMocajBKgBBfWB8tQDcYp+HKIVaM+wcZgH8NU5COcB9j/wom58wj4AZtoJ9Zj+nQjZjK3ce5mjsVa6NbM5AzgQL7KazfXYynyzN+Yg5fueRFBzPKTl4RxQisSLDt4DxoM5g/4bAbCH+sJeajyLdCROmvWeBf+Du8Vjh0JL17AR8HcAPI6aIiQjHsBP0FgAdReFpaTvO5HiQrF1e4uH8dnd+/X3p68Lj96uO7en82TR3+e3ldZxh1edT4ERAkKAApOiFdjIbBQgyC/pIWaXVoy2wF5Ng7BggBOYJJTbWW0Rmg6cNO0yonXJ1jBy0nygk0YGcF+NeVewBcnCDQFd1c/wSTwE2gA5JpzstxN3cM6YBwud6s
2QBBbi0MYZfmbR+O1Xm1F2qrlWSDcnazV+M1hjrWWCfUCi7QT2+NQRsXvoQ2Semo+RFhmWjOoiVQz57yg48wZWoQWTze8O865iNsxAIagwt3Hrbdn5QdlsYY8px1D33loy/4UpO2md2vh5XrctpDPLfRRmxmQ2MC5DTBxDeBiJwsauF+FdbxKAWZAS+RUGTPUr7RM3XUbHalzy6tOyrPyZlweQ45zP/HeI9MBf5a5R3ee3E0d6v0Q/5Iw0mkc8VE/k2BDHvqr7ifL2+Loc+rgkH6nY4n/TluPK+npY6B5eF5KQcUbX8vCp43glnqRKAwp34vqTmMxgzc6+MPrSb6mySesj5Nj8DQkS5hJjPHik15e+YrvnQT5bN98rjQWPW+f7fzM4p9x4POaA+ePt33339f2HjjGHFe+l/Pb2u7rH9d272D/rKAABniPH277HnqwPXz4BJpwBrp37Gk33XQd35pH48yldvbAne3P7z7Uzlyca7tufVK7eQffzKuY8j1aTGP3qwKL1/z0T7bXg2ZKuC/hu4BFgYou0ESYzQe7BFv7dIUIBUmFLoX0tQiTmhjZ8SuIKqR3Uyh31J0chPWpgIyC6wAuekds7+vIH4lk7gRX0wqwAMDMk5ajmJoWmLaChH6NUzrVoARY8F6BMQKTAjI0mR+8xE8BixLYdS9GFaiIpgU3gYoaBVdyWaMAoyA6AIoaXbe062Plc8ygSL9MIGqksoRTBB/CemrXntVqEE78yElzRntNC7cIsaRRo+3DyDn5FMRF8E0YhBLS0zTKw3imZz3jENriIdwpP53N5a6fOvKZhYl17WCvQIb2ngpVp9BOHEZjcR/Lbt6FechH2CjsIUDGYcAFKo4SCBGmdiBA3c4k1ds2rGd1KPJMrOcQ6t29+ATATTMzbelX8RV2UvgG8ruBsvWcR+jKlfy6V4bgokZ0gXkIt54xzUHILGDpvgkFVi6h0ehCikJo6gbZU2AuEyriIMua2kUzRH0tsyeFa0FdjTZbzpre9PIO2KN8LKvia9VhBQ//4Gz4Lye7ANOvEWRw99pp61f99/fed/fx1brp8/iM3+En7kNY/UZjAZ81P4rGAmBxijk+7guhm/QHjJOf0urZjqqdJN1BMkq8xBdAJtgISDkZEymBxQkBJUDTXSjKFAjeABadj4OIndH44/oDVHjahtdStk7ClgbL7CwCvKDU+TAnWepYEysBqCBhIztRO7ci8ytoyw5epC2yTLA7oa910QOAhat3ZeK2ZaewruaEepP2UCUz3FefU0sjUw+pg25wp4biGHnx/qQTtMmT9fMUdJ1xrpFXgI9azIAf8mc/50phqzjPQ9c50naJWbUU7py9gzq/A0C9DeBh/d9EnjcDNDznrdeWAWXmmSPdY37Cm3Kr54kfHFM+FWLpr3WLPw/9VJhKY4h1mpgOozJOIH56mClRwxteGFPiIVynIfXW5x7B1Ps0D4NbDzv2+mjtYRLfQOOkfZDHTvc4noGwchr4av0KHwwzHD5P3Ia4O/XJnwlPvA/5HPwZn68uz0enlrdD2ApS7j267ktSPh81FuZrdsw4MOPAfz0cuCqwePVPvaa97o2vj8AdoQRBwknX/b53xB1YdBt/nwtUlFlRNBZ+/BEQDeNIdy0vqVmGpi/Dx46eNpMe8aO/CEEKQz73ntmysSPPWWAhI+EDgOmb4qk5CLDAn2HtvLMyC/4UXgIcBoHJ/Gjq5NXO3vh8HyGee90FE5pnaMIluPJLYb40mcoeCIySKmD2j5YUmwdP5JgSnPAb/46ocj/npGyXK0VQVSiq8KXliHAXquuDpRAsPeM5GwpYfTQ9o8G8F+z08vBzJ59086h7iPEf9+FTmPt6LidIHg79TA/z4uEvtaDKCDeBp+v2CyyOInztRRC7F0DxETYLuxdzqL2M7J6Gd0h0jEgzYRuB8InsHSGw2AzgNMYzCJKu/38UYHFYO3aulolAeyvC2E6EtB2Um2v8Z6I35SJKF3gIMLwi5eZcQfnUJGoFiBKmBQ4KpREBBp54b54s3wAL3/NO3oePCIjdHC1gEH5bToKNmLVx7SPhnedhbvhTZU8CPHV+Qx5pTfjIuwnf4wtO4Nbfj8MNr5dcim7ikvTlNAAAQABJREFUG+IUjI/DEvnkWfcpqCiNQ+ZEIDirZVBjoXlT0pdL1KXpSmSWUdWF4o88qmf56zyX84Q/fRJNBXMvTmX5ZM0FEbwRut0gUXMn01hJOVhW69eyyR1pa7dtu5K/gn7bnu0rfMKvNLrJ4TFM644uusgB2iz8bmHFpM2A040sx+rO6y5ra5tyoYO162pvkTlWXlJj0VeEKm3FCFhY1vmr3DngIbAVTBxEm3KQensQkCyosH7LG0F0BxcXcLOOmkcHStIvCJCo46sxwVpBOz9LfbnIqdncLsDEdYDq66n/Lim7lfq8ibo9z/u1aGusx1kIgSrTR+9T4DxTslX2oXmoBtAzPpaU/fCil1P3Nw2RSMt53MynHvJuHOckrvgfAvX7XvcGmvQ7DtvT9/po78b+J2nhP+5DfQsnBhrHZI/jXxKP4YeX+k+8Q1xjf1NfQ3o9wh7Y+j9JsLczXvp+SXwVYOKV1+Vl6jJJd1R+3c8MWHTGz64zDsw48FjlwFWBxat+8tXtdW+4Ix/5aCr44NdHv4Qxevx01AphdtgToZb7Ligo+GbkcxhRVAjyY6yAkNVi+Jjb0drtVpes0FrxKwhFc8H78tHZjG+EStO0T09aCiakoTbClZkEGJ0uQxm3I/5Oynayp6ZSCij27QoxMsJUFBBLwCmNgeGkU2ChqY7AyshqhFVNjCeCBPkUQRhfJscaJ6c0Sp90ZQlOgMQ85zrMnnRz2U3pNCM1Wj6MqiLcyhfTF2wF5ETbIZgYBF4E3dKQqMEobcakDHqZELb41K/GX99C3fO99Do+THRy9HclKIQeOSVtnArrfWK3Wovj7HFxAPOQBwEXdyMM3o0tuhqMBUab2XsbzdLqdi3mHrciXN2KKdQ2ywl397lw8vchBLv9mJns1USHZ6Ba2wmfrqfMXJ52MwKp+2asQ1W3FvsQl1vuAGM1zyuhxzBq8kL5QGfuh3wW7ZZjZfSyuS7ykvJ0zotXR+zlc5UB/O98H9wmPHc0nDQsNWO27pqG6fWj860/9+tyP929X4v05GISn5qWmrQ+pJNyIYQgaziSHu6Cvw4syozJpWYByvDZfR9yIlBbdwWAmubFjCigV46WCG7+pu3KdMgf9VzNxVkBSo8rk8IBBADLBSY6L6KdEMCsobDn+NnA6Pw5AEXSg3zjtB3WKmms+EV7CyfhnZoBtRrHqEf2HbanLSzJumXThraZcwN1wzbtnhbuQ5KFDwAX0SLaNgfgrvlbwIUC/FBW5sBysl9zwzt30d6vCR+gwk0hFwHKVXzknwZqvyVIVmtRq1S5Kh59GeGda7VCsAkdgoqL3J+njqwBDG8EbNzK3hQ3Qe+N0C6gmK6CJj2ecNnErC6VKDfeDz/4SZsdv4uH+ul1KH5G7v22v/e5amh/U6U7fRrd9bQNM61WBKj6oM
/ywi906drjNr1PRsuS90ZRsYWA8buxe14mJZNM6uX0Gf5WHBUP2RqxPgQNFJmzfpTfoqlcJ1QM9BjPKKLwy9DlXCGX0D7w6vu+65V6+5wc9gedx/06Tlg3aewDGJ/p5O1xnLP7GQdmHPj858BVgcWPv+Yn2i+OgIUdTAcW6RTtDBEI/CjaCXkqIGQyNR//PlfBkX1HeRXMEgcCox/oAAfirA7YD1J9KApMqM1AYCEN0/KTNem87Z7toRFSAgQ0EVLoJh0nmmoG5eZ4XQCq0F3AX4Mggv21o4p89JMmNCiwGKVmEtJZnWqBBGnNalCafOCXHGc+R1agQkDQ9EEzC+OqFZTK1lqB2zTcoGv9ACic4D3PKOo6TTYUQghrmuQ8eS0+IqDAx66p8d54PEvQnQq8EXIRdEvohQ8pj2HE3TKJEGXZlNBb1/rOdZMQEk9+vVa5etcPafMYCwxSO5QLZSNPnNStaZvL0R5hhPcRgMUDCGd3o7lw+VlXizqNH03CXCHqekDCDQCM7dYN6DSvpykDgcVegMX92rDDd9PZA/9upsxuIMwmzGUc8V41gAjBhBO956kLmpHM8U7Q4cRXAYt1S56GL/CmPpR+LIkbulO3oCtX/Ko1WpmRc8p0ABYxVxv4nzLATy1ZPGiIwvMSVlOJSHZSb4kzPJUWjp5OPU1/u5/lV2lffvQ4LsFv21PyEW/lN/WJcPoT+AU4CQDhb1ZxQxAWXAgsziFMq6nwdIdz24ES7moEY0FvhHLyCxPDx2glyW8EdDKbOmSmScvlaPuEcCdlO4n7GKDi0MICAAO7UJa1BT5QNmXiVmCe5IyP+H0+6VwGwKhCjHHKb3nZJ3OrVdDsT43Fls0b21YmO2/E5Mm5Fi7JXHS7/CyawNBvG9PEkDbd82F6MNU0S9tWJniPsLeJoGLv0ePRvLkYgVaaaTMWAl2Vcz5KEwP/5Odw8ooRDtoyaWbSNvfnTY/6vRHNxC40FU9hKdlbWfHpOnfSJg/RUBCs92wTUGFc6ezy41OO1AVoli9XOnrduazO4N125DENOb3T8bIwPYGJN2Iw6cG9+xdsJW7jyNt67rT0aLp/n8fvxu4VeafU+EjQgjJMvcx9OdXvOK68/HR+4GPlp+dKPkhfj4T3o/uBouFlD0sAAsXb4DnlKHnTwBUxTuVcYZfT7rsZsBjYO7vMODDjwGOWA1cFFv8KYPELd7wuwkmEfYSYJcCCbNdIWwkdfiQUEiLs81F15L9s0RWKEYYBAIb3Y6zpQEZce2dMl+ut6VzAFr77s/P1o91X9wmn7f057YhNs2tHHGVVWFcr4TKlEwGLbl+aanI35geuse+KMYxmGlXSgK4en4KEdDgKqbYiE7aZpOocAAURgZMTsbPyU8AB2goC9zBObjYvCl9zgAg38drATrvhBXyQL4KRmDJB59LVbohoELQQ8YwE0KDWRyBUgEJbf4GE8wACOHwe7sN/wndBWvMvz/wFXCjEFd94DP/GH/flH7vwOz/hTj2SN/35MfW3AKdmItqaX2SU92xbABioebiXUeZ7TrBRS4AF2gQS3wB9Ljfr6WpRCljGdRp+LzAavA+h8kEmgS84Ekw6exztRbshsJgjLJIy9UOhGg5RV4SHG1jofysjw55OiHVkXKi3kjjllXVE3lh+0lsCeQnm3ltgvf4qgBbAoIwsX3hfWrcy5zOu8B8G9vpu2DEfk4Y84hwfSWtw0H9/H36O/Hb3cVjv42685kNgAc9j4gUPIWZankPcphfhF14lDP7dB8SG72Rrd7h2boVLybpClGWohizLodI+NAUzjxGuuQosknd51O+pe7Y1J3Z3LYgTxF316SSTs48fX2Tfi0WABTt+umcLAOYcmg2BqG0m4IQ0BT2nqDMLR44FlBiXgEHe6+cUGgQBkdqozQKLLZva9i3Drun6oy6tBESsVmux1l3UARo8235sixBshc9ZdZe9V6DzGGDmAKDiAZaO3Yf53mHqrNoH8+5AhXQ5gTwDDPDReRju2SKvMvih1pX4LwVUzGdJWUGFp4MdOwEVN0Lv03ZuZtO7+WwKKZiGEk7L0vqXwrWIc1R77U/T6/I609+M69647iQV6wse/e1aB/2bfo6lVXRSJ/NOfg1HD4uH9Cc65+0Q3vvKRlJaEo91qB9j+nTrlCQ8cedIxPnhseivF0Oace0ul18nIYe866O7dd/S0bPes+m1k6C/+IlD9Q+dz6OQRDzEXJmv6Icweejv680SvgxOkzAzYLGEI7OHGQdmHHgMcuCTA4vX30FHqOYAwZkP4BKBic6zAws713ysvPIBd0TXD30mbSMYdEHY8H6QTdhVXvKR6f3ykEZARQTH+kCZdvrsgcHe14fRUWI1DDWi34GFmhJXEMpHgPeXHN3ms9JXjSqNRdlkR/A2XuOB7vpwuMwk5jnQqTBxitFcl5t0iVnpX4XQsQ5hZZ6R9JgzITz40S1tRWlZjEeBZt2godBMQx4UANAkS80KNuEDb0qTICM4+a+cy0uF2BJuS1tRgEINyRhYdIARgZiPeAGLAVRAi4JVzD/gRIAGbknKi/fLjss+/vETwuLT72bKhBuFQ/mihkaTJkd6jwMkNH/ah+ZiLwLhIZ7Pwk8PJ6dugH6Xk52nrKAu4FGB7QR+FuD5Q4Rxs72TPKvhuBnB8QZOzUcsFyfXHkYo9upqRpsBFtfOrWnXMDrsnIx1lKVmUnOkMaeWQ6FTrkKj9Tkn6SmYK6xGuIO28AleCS6yOhdCqgKqWifLKnMrBmG7BO6qg/LQP/89upZA3nRe9mv5WPrru/H7fr/8mrjxawUtU6ghP5aP/ymniruXS7VZ8myYAAoBgJvjMY9FcyQmytse5YO8JcbsCF1axgFYUA8Dorh2rU0muFPHwwd4JtBLHJRJCePOq6CM2PvixOLRdvzYEYAMWhHakmDGjQqz43bSrSWoBRa2OWm2bWuyuA6w6GCBQKOv3ua8ig2Y023etJHN+DAtFARJi8DCvV/QWHi1PcgYN3Q8Q96sg9VOHcBw5ada9WkB7coR6upJtG3n2OxOTQVBoolzp++z1GkHGi6QT4HXBU7rf/VVaGvgCxUFbcVcu0Bdu6g/0t4A7dcx0fx2NCtP3r6p7WZuxUbM+VgtEKqMgcPy9MRlqD6592HynPd6rqPXi/7cy91YLD//J2H1FCfe9PqxJO6lfpfEnUi6Zx8q7qLXiEdU8zphh7Tqbf12+nzq8XtdluNpkKSb1KZuFXjp8/ip523sNuLbQD1vwyEJMTccUJH0hkTjVj9Fq748p9T2kEPAKaEV4eC/HrrTlXhQqRCz9Ql6ZnMsOkdm1xkHZhx4rHLgqsDix5lj8QuYQtnhOdJbVwWUAgR2mI5D9Q4zV4VzPqoTYIEwlhVb+Mj6XmHHEdQAiyHOzrwef32sFdBJh5e6+wnyl9u4CRSMzzMCOx9zRwczUVyNgB96vxOcnc4AC+hxmcrYY3NfI/oI4oAFl7vMyBr3piMd5xByTjC6ejKna/bXqHuABaBBkyq1CTKC7CCQFG+MRwEsk0kRemKa5SjoAILUZ
KxDaBVcxJ4f/+FfiDauynuWRQ2wgFYEpwIMNaIe8y/cStDj6nv4XAJgAYjKn24KwKZR6SQteAP7hnR7KUyv8t2j6Mrd8AG1HGRuPaZ88BtwwVUTEZfqdA8LzaLcgfsoAmx26ua9Av4a6NCkSbrkG5uRIrjV7t3HCf+I5lQIoIsIdPPwZg+AQc3FJgVMBFjjewhhcL9LgOJvA+BxD2v/7wFcbMbPesLMUQ+2wPvNuK0HXLjKFAVYIAL+ZkK3gibCpQVePKn6G4AG37XTX7tW0xrBhUCQ8qZcO58N41njscWT8IX4Up9Nx3Tl1XDt91O+1rvl7ztfJ+49HiOw7YzaoSUxERpNm/epQ9wH3PgcwZ+2B7+62ZLai77UbjQbQ72jYFIvFdYFw32lsklbw700Z9YzNRa2GeqswvfQvrOcLeZRJzGDOnZ0oS0cOlAb6yGo68cdts849wb+n6auOHfBlZ/UzKlR3LRxQ03MZj7GWsrSPiPaFa7uieKgRW2OV22wNHq0EwEFAOMSdAkQqi6iPSGvp9SShDeWT7XXrEbmO2i4gMZNTdAZrq705FKypwFHLp0b/kKbm9wJPH0mqgAUgcUl+wFAqO8F/tK8Ddpv3FjA4hbmVvisqZ7mMjGZqcpQDdF7jqpF07oUAFJVKO8n9SFPBiC+4Z5cpb+rnx6XtcPDd4NPLj2MLyf3+pIx/Zi8sJ77cohr5KenPqS8NPwQz/K6rrPp9LCTiHteSGtExRBLAo3uh9tOYwicvs7dEjqNs/58N3015G0JFyqeKS+Kmp7UhLaBJ5VWhZlSXr4mfnnd+TCNt8LortsMWHQezq4zDsw48FjlwFWBxate+5r2i29gVSg6PAUGBUc7v3SKudYHqfetaioUSLwqgDhR16VfS/BF9OK1H/USPBDwhrjsePt9hFPTUkjx/fB58RMUOvCMcz5IjmialkJgH81fgwApgPDMd4I09S9p7uDsvAjBQDQFCgjEobCueUvmgnA1Pt0VGrJSDKYbJxBiXXPfkdiJKRTCzzoFTvLoBl4KGqGYBEu41xSsBNGYz5BOaXHWIjCtz6pQfUUoBTO/3NMPjxoAJG3dFO4UVlgGVWAgP2Oi4/X/Ye9NoDzPqjrPl2tkRGRERq61Q0GBFiDajAgCAoJiK2q3DWMjM664TJ8Z7dPtzOk5M3Om2zO2ChTgcs6ctvuM2mCxlK2o0G2piMgqhUAhyA42FLXnGpmRERlLZs73873vvt/7/zMyqqAptfD/In7/t9173333vd/vd+/vbbqs/FUFsI1UUA/R5Eol2KNLlINg+NeFo8waJOZ/0pF/8kPczvLXT1WlSQMufbWa24k29k5RUgI53+KcDARGJKiTlarAsOK3JkFzXbiwrayL1jkpcSel/B6Xosk6C/oMayv2y2BgNILyMDg+p/b4vC6Mix1Smo9o/tMVgtkrA3Gv5ME5AYf1NZuzAThwTMtppQXqrIY6QsGXer6wX5CP0WVjQgoigomRIsnXIxYyLtTWtKUvy3yQb8pPtk1zBOnD9Fn6dLqUVcbTJ73PIwxejyuABDcsxoPTxO9wnygJPMkQQ4L7JmnTJh6dwLCoxgXToTydSjjedAA+VIr7HP1H/aj1LynVNixqf/QIQe1j9F/6kMsSLUYXGK3AeFiSYXHyxLFy3733lJMnT5QV7biUI0UuU3xZgVdbs1vcnHYMO3zoULnumqvLgYUFfwjYLuGuYoiwJkRGPvWgLGTvZ43bZo8WncugkEF4Xu1Iv2Na3qJGIhY9uiX6sjR8Hwhvp9qcbap5OLDCyrs7ybhYO8dWs6s6EI8RDBlBkiPPte16dujG8wLtbeoD1JlD9GwUa/viixgd4oN7ckr9dU4jE9dohOIRGrF4tKZCHdKuUKwDYn3QNtquurz3hpR6C7YEBVpYSIkwBJrsIQlogou9ETd0oe7eNkJijICPRPJZgNwv5y6Xl7jgjcM4bprD848KJVyPG5WL8slveVS01pW0xDVMMtto5r0WCI2G4SoR80lCX9cIB30D158KYy/CeT8CAMUKUeGrR51rAxlG8b8PU6Hef/Qz2tDj/lFZTGITCXyZJIC+8HfBXTdzsDxx/rq/C6x80Tyw0RC645fqtjQsbvqlV2m7Wc6xCAWpfSHVA5AHdj60m0KFcqGXNC9cr6/QixZFHoUdBYUnLAoP6xY8YpF0xH3SQ/kJAwIFNRQy5wmZqQt0GnlWDnjAMwIQyk8o0HkwHQs8KY8uxmXDQjxM8dKXwsm8bRQSHuypcHv/efEayrqUeP3Bi40KbaG5JCUW3oHna+QeDAumyUiZ8B75NAQF6Z+vvFZ4JAOMiZyXz1dVjJoZ9toX7i59WTWulBxw8yXHgABy16tT9KjLwGdsA8ri2linQTl8NQ5DQnDiI6apRDptQRq0fZmmSbZ+o1Lk4jc9p1QcwrQDwkSesqL4jTSHIkwecLSdR3zU3j6Z24orym70GxoR2bKGgpEMDthbk9K3qvwV9YElKYWLUvLOoKAqf6dkM6M23Ud/Ul1WRe8uKX5/rV187tS1LCV2RizNCWaPDIt5yXy/2uURWk9zlRS7I1Lq9mpF9y4ZF9u1cPu8FOsNfU3nCzjnW+ys081oH77EeyoUCrTKY56+T2+mbLcfxtylMm1foJFBk0PIIvt3tm8VWfPc1pIFcMBkHB+XsldWONqB/iF42sv9JFrGMgbPU6WURthGBdOdVF+U/nXJa8OH22G48RWftkljXkXU/hj3luqtvur1CpIrdQ+jIgxw7vfWv1QePLGGylvMyghY0vqFYxqtuPOuu8rx48e83uIixpz6CLcLxrAIeJ0CleGhtrBvX7ni8OGyf+GA48jEdYBXGYPL3tpWO03pvoQf2g0eMSjOSSRLuk9PaTodh9mtqM7b1L4e/VP+8AzSc0iwMjXlY1iEYbOq0TKvq8LIUFn5PMNowbBQYR6V8AcM3Vs8C5DXBWirzzBta06G7H6NlDEN6ioZtmw1y/Q/bypAm3Gprtmc0aj8kl4dYAQNG2l9/wkKFbZ6yL5RqJ0FszNdCykQ2cGD+1ECPYD/YGDh87Jwla+sV8KZk04gmd6z0+ov/vt8p1fcvmzDUFcRYUpsuPSJOaf6yISRt8gPn/4cWJwKnzw6rSNDsJczz7/Ga6D7N2lnEjDmoBby98Gw+Ml3/mq5+dNvTxFM/IkEviIl8MKrn1L+9eNe8LCs21VXXWW99ktlfkvD4hW//Ivltbe8YVTp0AMwFH8pPHqa5rOahyPKQRoW/jIvRawZFigO+uOLP/O48aETtEIZIYzywMMXlRqfB7Z9BcKwiDgPZBR8DIu4YoEuCnwo8SjjIRZoMOWGr5MYHjmdBYUeIN5zfLGOXaWGefQoFJTNXHCMiyUpMRgWwKNUMFrB3G4MC6YloYxayYInKRmxLSnKaBgZKPsYLRyUx85Q8DG6ODaMAhegMjC805CS2ucXFUoUU6jYCtT4ivdGBS/GZmypPJRkH55H24jxvJAM9UgXwZrQvIBvcpQscOEJiPaJJMspadFqbkNlwr/ntOtF67pUwwIY2pvd
pMKwYLtZTT/RhWHBVrNnMCz0BZt1FsCzyHtOsmNHHdTtY5qmwojFf9XOU8ds9GmKjOCoL4fqHZR8H6V5+I+e1xaf2omHQ8lmpAPukCGxIYWXL98o2EzhoR12aQpNjATJ6FTfJY2RojA6wjik/WIqFPKsMqUPiZ8qNosBsYzKIZR25L+Zs1KPYl8zLb9U9BFyzeixbVjUjPFpUBh1HolQ3binmPLEKAUjFBgW+P7qz+iNYOIeg2M5ilOZ0MSw8KiN+htrFtLojj5XDQv6VfYv0GlXGTFxdoW2bNWIxdFjR8sXdCLo8eM64VMLuGnL3aLt8xt0f2Doe02H20FT2WRczM3N6ZyKefd3DBfzZPYuVsPirA0L7m2MBp4/a+LdRoUMipPLumfls9sY2zt79zaNLGxotCsPuVvBoFVnwrhgS+l19TmmZCEvr+eQwN0XuMd1YVSoU2iqFdPsdvksigUZDkylRF6svWDaI+etLGjaEwYFo2yMoo0aFdlf+hYlTXKvbS2Wos41ob93a0vVngekHO1mfAXEhx1tkwSV1EgDEhABmzCZdjlfcI2GYGiToKPfSjCKdk26AoENoskaMfDTjd8bfV4PE3db4CZMwzUbWTayDPqw5lDlscaSbPVFWcwFCrgkdzRMO0BrliLA1zKgXelzPzeeAqXBDfBRnrMrjYlhUYU18SYSeJhLYGJYrGod5OxIK+pddPHiK3/ll2xY8CD0F035hAdFRM9RxXnQ8xD1hYKBEq8XMLsgeSqUFD2UMCuSfDXVi56Xtg2FRlN09TC2QtSn6cGOEomSHadwDwoaygZKJPOtueJLqpR4hf0FFYWDF4Ue9izMxuBgehYjBnzFRwHPNwFwHmWRMgrPOQohVjSdQ4u3pYSeZTcb8Y7jq7mnxQg+lCKUjfqlW35uA4scMEKCnzB6gPeibxRX5e2QskN+TMeJr7fIkLItJ5XJV2WUNxTdqTbaEYpv4OZIRdTdU6cwLDBmqjyCB7Pf2itiKQXFeEdKFnbyaNt0aqEMmjei8EifGHfuMwLIfL7gOU0JCU+b0icY0fDJ3VL4MCiYB4/Ch1HBdZY+I1pMQeMMCxZkY1SuCI9RCw7j+9xpTbeRAciaGAnKi/f3S86P1hfjJxzYVx53YL5cqznuezVFBeNjXW2KgsvUGmTrL/NuD7UXI0n1QpH2RTupv9C3aDPLUjzQb9Jp/fgQV3LIAOOwU9w7WSWu4cZgSMsrjfcsJ/wqR+SvhDqARKGBJ3oYwUzjQUnGiOJAOnaBYu1DGBXqy8rPJu75uag+B1223N09M+vF0MgkRyqi/t1IRScH+OMe50wKto09rYXb98uwuOvuu8ri4inxeKHs03SnvTog0c8HdXnuY48G6d6zoVBlSxpl2shXnXgW0L5eZF77lMtTns/AULlL6jOs62G3p7Mqn+1hL1zk+YNBus3npbDBwLL61VlGKTRShoGLvOHbu20pz0WprXcyLKyPCJxRwWF39JUpPT+uk7H6CBmtnE2BoYRTS/sDxpTqw8jZDPcrzx5lp4KfvhEQfu0T3GuIkajb3i0QUPlLG5GXLtvM6dxQwjFu3x4J36fB7kAmu0CStT9STpfToXWpEWxFEDA7AW2eKmIPQzqs2LWMiPblVwjfX1nnoBn1JY2/5mqdlTvUkxGHWgbpKRYAIr3DN6GgnTQzt+fL3CchATT61CvLUngEp8KTnzDJzMSwSGlP/IkEHt4SmBgWWxgWr/utW+JFVR+OPCBRBvOhHoYFHUAPSSvH+KGk56hFKvAol1Z2eHlXJYFnrKj54W/apPOSxxe8jQrlpxIa5VJcjFiEsTAYFiic5sPK9DACAA8o2Fb09bIPIwTDAgdcGhYYQ0xvYkpMHKDn+eJSxpge4TUCKoNRCIyDWDAqZaNO94rpFlW5l1JPOT7zAH7gQWXbsJBBAm6MsIQhYgNDMH7ZiB/K4ssvF/Lw12PhsZDY289qegc0UfK8wNZlRJmh/IYC3PIsm1pjwrpc+/SJ1Jc/8k3nkH5oq2ioAIt4pic0cbeoYd2++ok+Q5tGAfxGHgaj6qmLkQsOyeNifjyLv5ekFGNobKgv4Fj0zVadtDNfp09IibxXCuQdS0s+4IyFuhgfC1qgz+nG12qNxSN0QvN12ubzoLb9RMljCo6/2vMF31OhRFt06bdSpa3Ixg5j0Ua0dU4nC8MiDOVRxYBRLzjUj33CYThlv46+G/WP3PjNdNZMGLbWNdOjTVBhOsJCjfsGHIVrlkcxlHARwx2jQn2HMyU4nG5Fozor8jkB2yM1TPMRLPcE/TKMBdFSW5yX3G3I6l7YM7uv7NK0P0Yscl0FHwoMrz4X/Sh4h5moA0o6/VcjTjq/YvH0oqdBLWvbWcqclVExq2lqHu2j39Iu7sf68i++sx7uhqYZtFgTEyMJ4HDfaYqT7gNGC+hbjFCcUj2Pa/rVkqY0Lasf0Z881U7znlj3w+5Q9JPYJUpGpvDy+cJzycaFeGdXpx3qR9vF5zZGK2TgqFDvMsYi7Bv3z5dHa7en67R+wgcywqf+eKrQF+inGMPkqVZubBuJgrvE1aSob+TSpshyK5f3sGEESvnuECJkzMSnf5CmRIKtKxknok43ofrjvApQ+TekAOmLw9QiMAUn1/qoklw0P/xHJMDAd0UjasT6k+kgmyJ4FTbhEqaXDeFWdgK6TEXMbyRWqk7LbHW2Doi6pDMHEWnAmTfKe2J1GAOgQuY1adT6gDMO/5Lv/+ERvIcyQj9PWabfl0cafPtdrfviy3VA3is/9Lvlj+68vS9qEp5I4CtOAt986PHlJdd/88OyXoe0xhGd4Et16B08LzYdsXjFL/9SeT2GBdT1gGl/PGx4SPI2luNlyUOoXXqzWolGqWbaQGhcNhj40uiFpXqYV2zT8I8S+gdZGBbdS988BDgvkVREdunLoNc0VCUHBTHz8IcpU/CjUQsJDMU7FEmUgICPEYs6FUoKCzRxVnyl0Hi0QjxAk1EPlCKmbzAFSsASQl68t5AHtCtvynfZ4GEcyOdrrI0LwszxR1Yg+WLhOIohW4HGQlUUKb4aY1h4mo7Kdf3Ejw0Y8JUWRgWKl8LUX37KwS9ZF1HbiwqqPL9YaBDVz2HSe9e9BWkj/VPJ2jcC0O3pn2hHUt1nlBY40YMinTTyIw9jEuOBkQsrffWLMgYG8XXl2eAUAuKhcMaOWOh9RnI6IYWS3aGYzoLxsE8L9A9IKWTh9hFdB7Sr1F4pg/r2rQJFi36IAuu6RF+kOxMPw5S+IngZjO4vjHBZluorqVT3coMj84VfA0qjftwrHrXQyxn6OJcjuCjfBRsu44aDtwA2Dn0qXaXiaOBQlv4oQ5eVAcmG0QkWUa+w+YBOv17SwumzUvTZ9pWzJmQ6e9oRU/MweKFi2QiPBdy7dAbL9N4F747FGgMbFn39JZOob9YRP0b14BE+mBJ1TqNDlL+mUZNtkr9H+9SXbdBQrSpLgvDFOgou+v55rQXxKAthrY/x6N02jTthUOzYXTa0qYHGnDwawY5hjFKc1rqbkzqY8bTWWTAqgXHBlrFrdd0
EGwm4vX3PyihR+cQZFcV44ZR4CaRsV/23y6i6oH7A2RZMpzyovnWtdnp6/EHOpZi1AauWtMEURKl5tJAauLUaMJkM+RFXwUnLVm7GYu0zI/A1MtLXEg6/9kH3IxMVVf07y3H9JIsKukzQyOuce6DhKrDz4nkRKfyCFbERfoClQF14hqngwZ4lEulk20UvD3D/ivw4VwmbftAfoCo/4+jQgRfQ6jCg+YqfrIGJXlJkVCALTNEFbEt1DbvYEMz7uaWogMplSyLw98GwGKnwJDKRwEQCX3ES2NKwYPH2a2+5JRSH+tTmsZy70fCwrM9nw/ilIjj8VO5RlIkDi4Llr41SLOKFF0T5jRcSLxro89LnChxFPW2qvp38onEZohsKNYo1CnwoOX2evzAr3SMWVelGwff6CnhT2WF4xFdpFJ0YRQgFPebRxwsDnoHFAPDaCiv4UjhE/7wI8d1LIKHcUQ/VQcxaqWfUAoPC+/LbsKhbZEq5isPvVC4GgOhLGMalcVhsm4aF1wGwSxHlioccmUmjAt5tVKTBURVAGxYKWy4xX8dht02VfUhCkRqP0PAb7RVxwvWdbV4TyjA1w3LgR24ct8GTVykBSpuj8HFwWk6P4gAzp/H1G0VZ+TGFDiriWu2OMYUsOKNgHYVUX9tRkadkcO6V8Ten0adZXUxfCVMRvtR3aX9dqfjj0/egy4Ux5v4gBRO55jknaaQlPr6d8DOcPiUlXcuNMuoFTsom02yAKF8ZJpkwxIKrUMaAdxkkKmxDQrx7m1n5YViwZkCGhRRtRioWNV1sUdOSuDA06Ft8Td83N1/mNReSLV5FwAr9eRkDPmhuZm+ZnltQGCMgpiWloUr5wYOIdDykYeH2EQz3gUdP2MlJbYSMYTtcyCPpUC8MC3iDh3MygnzJMIE3L6DlQ4WMibUL27X+Zls5JWNhWaMR7OzE9rPcGxihdx8/VY7q5O+T2hWK0TDfT+ob+HQ7+g6L8hmdYi0ShkWMnGnUTG2/obyLGBbkKa5Ny7zd8SNlVNy4f2+5cWGuHNGomM9iEd8q1HXLtss2pZ7Ul7jD2V8c634ie0ioQjJe5tW0pJM0iScI5VuepPVhgKojP/p64DWyjYgAW6LCTs9MMrgyrmC6Hoe0WufqDSRTsTfIQAe++OOeScf9lvVMn7zRogb4kRwlR98KP/HzuQOdkEPg91RcRm0ry5EEuaThiPl1yD+J38OTMYJTwd0GXT3F6MSwqLKZeBMJTCTw8JXAlobFy1/1Sq+x8MPe75JQavyy0AORh6hHK6g/D+D6pMeoAMeKa33wAu1FvHrBM2KRD9pAAd5ETKNNCZGqzsuelwxfEtOZnzYqgQEQRgUvoGAjymfeNoaF11+gVNqgCAOC9GQ4ecXYIB06aYhgROR6DJTXmE7FaEUs9CUNVdQKLwqdlJ9WN+pUlXvosWjbxoUNDKZbMfrAFpkxWpFTpvQasmLor7X1y7q3P2WEg6/GMiBQ7ijbRoWNIRkVaViQV+Uf06BidMZyM09iq+bzEnd6Crf6pI27rFe2hH3A+pejPrPS1iQlvKKbOsORQ1/SRRMzOpRTo2KkCENCl+bB+2sysEKxsg4qSoj8dSWekRLNtqKLmhqFEsouUldoKtRhTbs5IAWQHcGYkoIiQXkhD1VA/2bRPzUsmtEPqrHm/hayCoOsyk1ySln1cmRUAwe31AvlOo0Gys7LMDVOOGEEYBjSHK7t5LiT6uJQlZ+0jEsfVJ9hcwR/eceo0KYDyxqpOKn1DSc1Jek0p2DL0DivL/no6AcW9pX983OemiTkpvyztmJqZq7s0QLqGK3AuJJB2wzWHK0IrvwxgPJ1hRQlG4SLfKmPWio/SgCDAWIDQkYE4Zy+xRQNXxqdWFmWYeFrKe5JpqVp9GBDIxZnZUycXFkv9+tk91OrjGxdaGfUsHaD07KXNZLFdrPHF0+XM5oidY7tY8WJ7x0Zm9yD3H+MziALP2/E57ribB27fc+010hx/7JWCkPikZr69Jh9s+VK7zQmmfheUZ2ptwhQV1z4ND7/4TsDeP2TtKXr6PR9bAStg0laCZtx80E/zYTqNz7FSMsV8YE1cx3QpDcCCpiJ4CR+K9EGE/GIRh9thAjkFykFowpBJfgAPqBhhvvNUSUmzybR1anjtCKmJ4oD406EdoN3QVkY2TUnGai40GhlZ14W0fsV/nKwrSTR6PkCnvhLfuBHemoPafhvayrUQ1qpCfGJBCYS+FuXwJaGxcte+QpvN2tFSm9dPwjrg5Mnvx+Gwysp3kjKt2GhqjXDQi8HHtcxzQAlEcWOq9YfXzD61088wKGN8oiKYiVSMPkyQGnziIi/IodyjUEAf7CXil8aFTYw0rCQ76ktFR4eKMsswIMIWKGUj1HB1pGst8hD/sD1NCjmmyvMKxnFl20pvaBUygVKCzSsXFa5wW/gMg1KC8RlUHgRtnfb0ZQq1YV99cFz3VG6RJMwjq+mLA63z8gG9MFBBhgc5BPWZfm4HtSlxpMPhJz1FIz/7LuYkR+3d5eSvARHQcfhymOAijjyVEa2l4UbrdtAIhAQlr8VsmjrGK2qYckz5EF/kDIqulykYajmrlKsw1iUYcGUqEVdrKNgJ54rZVgckWGxXwrhHrUl8/sxLFBss794VEryofOQBu8IKef9R1tGXsOp9Yn4kBeY0Y+QLTWM/quQCFv5h4fuAqePE4aJ5lei4ihkav70I17D1TKQoeplo0LKOtOHzmlkYlkK9ZJOvz5+8mQ5IePitNajsIibUQTulcP7F8oBbe86p7UoKMfIiH4zNTuna2+Zmt7bGbMxquY+XvsrPJh/uBNuGBZIEIdcHVCuXM1nmhP8bWia1rrWfORuVYywsFUtbS0mREv3FXCaQrVdwyv0cXVy7fxUbFQcXVop9y9xToUOspNhwXSuaRkEs5oGhyFPoYx6nTh1upxSvTEutA+YplDpftHI307BpkHOlsZcjOJcEC67QE1p4TY7jE2r78xop6crtBnANdpp7Gr1p1n1L06Rl9BVLfGrutE5+fM/8YhFnuuvn9ZuBkvxhJxASWd8UAcl1P0T4naUTVkusaYoTf+9cz9SAri9y3Sw6at2IkcoKbqk+jOCn7z1vEBgrAzT5GeERxNsfCYfAQsneQ9WENMER1zVcoH1vQesEStNh7ufyk/WL9ChA0zijPqjeQITDcudjMjsChgL1vIan5vAR9FZ5ij+j/7gS0YTHsLYxLB4CIU7IT2RwN9jCWxpWLz0lTcVDshDUW1Klh6c+WDk4ZkvG15GftCjwHLxp0TSUF54iGNYMJ3F6yys3NUXheiMP2ahnYYFeXFFeaYpxQfDwVNUUKYpU+U4T2VTphdOGkZw/tKJEo5hgSIeinzwpLUMTNGglMovuHyhxIjYo/n5KCzgMG2JaVBcKDnwSJ04PThHYgwrZYQpGdA07/pCx7Qar6/QV1KMiil9DcVP2mxZaUlJSUkF1PyoHO84pTr3X4v5ehyjKflVHYMLWWDUVFk4jNGV7RO9PWRIeQHXvzDJG3ektZclmRWmb7mRfFWaett1YV
MeIw+e2xvdTAg9ncxzvuVSDVT1JUaJOHiPBbos8j4jn12lZI7pdOPt2ppWh+rJKGS9BesrpqQY0nttWKjNcNSLr9HIrBkYbgX0WtKQj8Ipp4pj5BpOeQVsVM59vgoAJdn1EHy2a1+vPmy6VR6kC7EtzBa3IRvTHe492gBYGzCM7DB9TMo4O0HxxX+Jhe2aBnX0xHEbF6e1sBmjgmoyAmfDQqMW+zQdinEW+jUnju+RYbF7mh2hmF7EqFf0P+qZ8sp+1urgFjRpV8U8w59ihsHwwQhXe7ErF2s9cutfFphjBHEQJa3DgvG9czJsdO9td/10nwmfHcPuk0Fx3+nlco+u0zrQbkUnZa9ZVtz37Aq2o8zKcJhV22MUrLHDmOgeOyNZiMaq8i/oXvb0QZ4J6gOc1D6ji75D+rRw52RYsFZnn54BXrej9Tr71JdmBbdD7cp6EX8kEc2soxJcdzwbHFUSVTTRpjXNvYUO1lzgNlilIzd/pCAcpVRo4WVZSuFetuvJKQH8dC1LZQ7pQ/5IuSC1LPpblq903X/pIhTPEQzOdKbf+BvgI388rlQnBYetLkmsMhL1j0Tfd5UMXt6HwWXSh1LWmgpEun+bXIgl/OahqHwHU3Eps8lR4caD8hs0sAgPHimp4qbvjPozMSx6aUzCEwlMJPBwlMCWhsXP3/RyGRav9UsN5SsUMB6NPKDjAcnzkhR+PYrAw1VKvlPqgzbnpnuBrhR4FHFGLZjqktNSeMjyvOVxPOorLYoLmtWAyHneNiz0MrOiozwbFIqjXBCOuL5ESrlAseTU7SldHrUQzBoLO6V0rPgLbigHgYOygWGBMSGjQooGhoWnMgl/j9c5sPRV87JRZqmLGOfFArxHDsQH7192T3G6Rz44+yKmQOV5FExv8siD5TXIk5eRjTTqp7LDaJDhIFlT3zD4FEcx5sIw6QyLhKHsdum9T+s4jiwdyzItYudFaPgFHjfyMoykSHdm/CATC4Z2c4SEwRmtxwVGgNHuFYd4RSGEQt6mSNW+I/3Scl+v24WidLLAnkXgfHXnyzPbfKIwMloxLfmjSNM946t8yjHkY3n3siLMn/2QGSylLDKc8d53uArCfVvA0cepZ9Rns3hUWoj8WwYhBcK49AmnCMmi5/peUodDcffaCinuizrp+oQMiqPHjpcTGrE4palQZzn5WjLifuZ+OHL4UDm0/0BZmNeUJ8mH9RS7tbZgqm4z640C3K/og/QxXZKJ+1uVTatLMC4+4TD5rUaPynQ7YvhopIK1E2urOhtG9x7T/lZlCGFYLLP+Q/XYLQV+fmG/tqXVdCwZN2ySwAnt7Px0x7ET5QvHTso/qVOydQ+rT2iPK91rNK54U9l8HKC9WWvDKAb3MGslFnXPnxb8imD26N6b1nNhr/rHwrTKw2hQmBFKcOZ0n8/rfiaNtRT0pSl1LPqVKuMrRl+jv7p9uOmri/aK+LZcja28gB76kiFSaCO4Ecm+hXjBvsRR7871/aRLbn0389OHJn2d/6SO7xlLrUy4rrkdr8YzbuAnrz1t81DpBGrQ4Tfhowbxa0Yq45SZqQN3CkmeDUeByhkZlXEIVMzMTL/Lis5a69a1kYvvpmw5Xn+ybvCeYZOs7bBZGv0SbsjzVWlVDp02MSyqUCbeRAITCTxsJbClYfFzL39Zec3rbrYSa8VXL/adeqnmi8DP71p1p/nBqXwrr/gBy/SCxMkRi3VphSiK/oJrPx62MW2ElwRx+flewIe+LivRUhBScUYZIhzKfHyx9JaktVzyd8lA4AvkrBZ3zuirIyMR4CxrS8qzmpN/WodpedRChRpeow3UOddX2IBRfIoRi2ZYMCKhdQHMCZfSxEsMmmmEsL7B7zH4kBLHQtHYDYcdeFi0HQu324iF4MJYiPr4C7Ho2WCrX4uh71ELXlHIouUP8gija6CFzPRf5QeXBJUGXwqTEr4jzovQ5vH+pdkQaSuD11+8SpR2xDmqH5dGROm0s116pDmd/Mjl133FRkMaGaGsosNxgYNxR1vYuGAqmeLwimHLF+wZyX8KQ1HhGM1CfuLGwgk2QlaRBouOw3GFyfyArvld3ki6ImIrmKvhVCpy5AI/L8tVCBaN607FQEcCDkCt0ctg5tuXMDwigOKu06lPHr23HLv/3nLffffpDIlFjV6c9foK6HFy+6zOkzh85Ipy4MDBMq9F3Kyl8M5jGBUaUcPI8E5Q6mdpyFoW9J9oSbMUvIgqDSHacR8Hz+Ze6VFPGX4areAckXMrS/JjETlrLNgCmDUQGBjgcx/tnd9XZjwda9bTl1hHcVow95/U2Ria3nTfohZna/enxbr7E3chi7iTBzoGbT2n6XBze2dlpMz6dPdzkvuK7tkZjV7O634+oPUSXPM6LZvD7TjvhoPuphm5UH/h3BqMCWgxIQ36YcjRxzDraJZaf5hwSqQRS4fMcMgEhyxDZo6O/PTp2f8CLXBHgEWndz1un550Gq8tE5r08+Q8/KZXUzcXLjgXP/CQdTIy9ETEdBpIDYgGrueNlOQpapD1SJ/itjIsKpwKNH8dn0nXxaogIMIlTo02HMXzhUNWYyHxKjxgtS6UkWGjUPEuv09r8qn54LUiatrEsEBiEzeRwEQCD2cJbGlY/NuX/UJ59WsHwwIlmz39UWb5qh8P1OHRmI9Jnq08cD19Sr5HO1CAFUZ5Z9cftn4cNyxS2eKZzbsZ+u05H0SlDKtsvs6jhFc+MARQ4m38KJ3RChSAUMiDV9ZKYFDMa879nOZIszsTtM5oYeeiDlg7eeasdxRSoTZQMCCyrow45CiGp0HpKyajDhgb8AffVmIpUxenezPdCuUkRhP0pVdpzOfGsPAJzzvZujNGQhg9QbZ8Xd2p6U02kMBV2jACEfLzdJ18HSFn4TRZVzmkcRLpiRdt0lCFa6PEvVeyRL6bOGiMu/5Fetk80CpuwicleLZTO2eeQiAg/noRj/xYW1EVU/pESx9wkD99i52hVjUC5TUvKO2Cp24YExgWjFrQ9iiKGBxwAkXzIXLAus72Q2ZKcVrKAn+zMPz26cTHHeVwpTHR+5mHnJKfTKtCQUQWUKY3XzwB412hJIONtZWydnaxnLpPp13fe2e5/567y1lNieIUebbkFaOeasSIwMKRq8rc/kNlWiMDO3bJ6FXfTqPCaxDUF11nyyH6U/AxWrvgxWMn5j83YQCWPNqRaVpeTyHDgqlQG+ta8cC0LDmmEvJAYl2FwN0/d2kXtO1qN83HkkGwrSxrBygOvqO9vWZLhtRdJ07qOlXulrHBIn6eHezg5M0ilM+UuT0ykha0QP2wpnzNcg/q3lJjl2n1gwMaqbhSh93laAUjHNyLPFe87on2hqG8ose4PtSL9NYOKs9pVMhZ+sHRbtURrKlOSfi+X5lehaL/pSO9xaDfIgkx6idtUgHlnsdRPn3PjjqQy7+u5K2mKoE6AZk5gNZwBWrZJhO0DNVQCATvLckkiSW8QxH3nQkAzm+DCIrBVidVPu+3KCv7HuunKjhmYDCvBMqvGWTXYORXrhBoRU4agAIz0CHlUjfwEvCXQ
iilIwq9jhvTnxgWm0ptkjiRwEQCDyMJbGlY/OxLf6H8x5tf45cRyi7KWUwLQrGXgpGPRZ6OfvDy6ObZyQM//FDIwxAgnRELbycq46IZFuDqJceDlulSKINWDuo7x/KEphQBGxZ+6Y8aFh5d8HSNMCr8ZREewREuigQK5V4dkrZXxgVTQFD6l2RYnOaLpwyLDSk4AaspUzIsoEkVvYuMmMB48TazKKbKJw8WbVSIZ3BRRjxdSsqLd5wRDU9jks/uM94BioWhXueBQaF1FzZCwmjbKYMjt/X0TjUYUdpaE9r6l+Onhp0WcqBct4nSesMCvEgXFgTi37RslAS1yIP8mDPOWNpm0c1euuNKj9kXMjT9GueHtq8EHVIk8yKbFzpg+EBkPIwGdRf3lzQs2KYWxZnRi1ScMGwZsWCuPkafDV0Zn+oavqw0wgq8wYv4Q1QRDHlmOOWBn1dARr02CycufquH6tIbFYy0ZB5FE4ahTFNA/6EIOk99FXzvsgQ8lcFX/RmxOK+RgPXlxbJ8/K5y5v47y6mj92iKkXZF0iJnFrxvZ7RMOz7NLBzSdbjs0ZayU1pP4REK5XkXKPqrjArufcuEfqcy4hf24AmpwSdXKHYRr3lkC6lCRZ3VPjFdSwvItRbEO0L5/lebiX/HwVAbcXbEqoJn9SHi1IpO09YI4xI7O6kO3Cd7OGNCha+whsIjj5r+hS8DakVT5Fh3saG2Yve1Od37BzVycVD3/wF9XDjARwaNYM1rhAKjgilOnKAdoxK0e1zUJw0L6uE2cd2jD1J5/sRI5CGM6lI+7vNd+ng+8exbhE2PQLqBZExPUrphakfty0mU5vfluptEX4HvdA45OdMqDPXiv4ON9k848ioVoSTWCN2M4Bs2EYJurUJAOZJU5AvUsRw6UX6UN9AAEZ6yXXRTNEacHpRNJ+U69GEVob6Xrm8DpyUrKi5l0EquFb8EJ4nJT5yglaW6WiPkE2WyK1RKYuJPJDCRwMNVAg/KsODBiXKKoo1hgZLur+oo7qp5vhjaCwZpKIPHaOCGjzqJUpPGRSpWfiEIGZ80n1MgEjzA2zcmvfCtMOvlDy8xShEGi8NSvkO5riMWhg9Y2LFyWddYYFhgZFCHZSkpZ5dXvbATvuAX5dP5MiRQjlFcTEPl+mA7jX6wkwwOVYp86kr5jFJwgF1MeWJUQl97pdjaIBIOuzqBi+9dpxSObWjZilay1dfiNC6Y0mTlX8qVX171JadaOZ5GU9QbRZlyQk7A55UGh9uJdlFAYEFTdYB303eNRn8ulz4KFbGRlyhJld9xWMrDWapVttHWIeea1N6+xJO2+0qVuUcy1J+Qv6dBKewv2SinKBfCg39GJtIw9ogEMhIPHrFADuLlEladaMlEPsLzPz8hL2ilo5zLyWqz9Ozr+J5KJ8PChoLiUHU9XfEaRvlBcVeaDQzF2VmpKUUqH6XJh9uhuK/rq/7K6bJ66p6ycuKesnTyPm07e07riTSaodpvn9JUp9mFMi2jYmruQNktI2OXNhKwYSGF3TuQuf/FGh7qGdKIGpv/KuNU1pB3C3d1aPUXj2EMxeGEvv+pt+oRV5eOXESD8yfOyBg6rgPv7j+tE7w15WlFRsaG8rm32ABhhvsZQ12C45TxszIqTmpq4ymtxVgW/prK5UPCnBZh79d1WEbFEU2LukKH283rOeC1E0zxVH2QIkZEq6twJfCsdMjftaTfhWGHLFxzfP4TvsoAZGTQp5OGy7QmI9IgMu76JIczIfpgj2Na4mEcApI29vUTWEMh8REgMTIdKKXx3+rU4wa8syrBS+gmKfwR8pVuTUTMzWUkmApeM99yTMgkWH17qjcMNX4iQBahgIz2jXCtW61f3w5ZSvqGF1z6mX45nEFmCVn7wRB1KKtGZGJYjAlnEp1IYCKBh50EtjQsmAr1G7/5GlcqRh5Yc8A0nzp1h6k6etBLn/XXvXzA8kAN9YiHuf7qk5N0DItU/vqvtMZRvvOEzTckaPgVIHyXz9f7NCq6MHk2OsRI8BmLNgknT9ukODDdiK/W0yj+GAcyLGLx9pqmiGjbS42iwDGGU67BUNS8UA+mWHnKkqZKgMsLxpdgmpHAFCqVkdtY5ghLjLZ0oxdS2sDhbAWmVU0xvUpfVXdzAJ6NC03Vgn8pgQgwv0gjSmSQoxIRFlRNM06td4bJSyNEWW6PoC1icpRCPB1tkbiZ9qB9v3UFDbkkqTRoNlfTKdevafIj1EAsWMeAkas0oGODQj4KNkaFR7nUr4Aj7q/4KoMq0WYOmFa0F8Xb2LARRt+J+pNuPgngSPdfRFt9SHMeIBEOcMIOGSHrjPw3c+S7PjISYgqQ6qNwjt4FEfHU0jAsdGdotILpQxfrgXPABR3SpaSzQ9mGvuqvanrfmaNl9fSxsqxrRQr6qs5+WLsoY3dKOz7NHrBhsXvvvtj9Sf3OC7XVJ23UUpl6IQccPpKGD+7fdL0ckHLfZuTZyXebYRTpyhEz6seaC6ZInVed2LGK6UvratNljUSwhuIerae459QZLdTW1rMijmGhs8E1mrFNO0fNlYPaPeoKbZd7QGuodqnNz8m4ukdnV5wSXY2JeIeoee3stD/XUzBqwVayTIvC8KcfYahRJ4XTwXu2Y7YX9ePiWeX2Ar7DSXho9JaAupEAAEAASURBVOGk6fQaqZJpWSPwmWnywVO0Q4TNRspWFBoueJUfWqLJX+GKqfoCZMDos7WsoFEjzq+siR64maMCasYQGmQVedFXEiwLIC5aKS/8RrTntSUGgRbNAH7SyfIC1HXv5JIFDLWvcA1c+JWfQVYkwVuUF6U1hIF/JV2CM0p+JDbIaCS5RSZToZooJoGJBCYSeJhKYGvD4uUv9VSofIdYgZeSZMPCX97HFX29SuqDmIcyz+Xh2RwvaL4sMzIQXytDUQIWQwKlMA0L4yoNeijR3mO+GylhehFf6v2FXr550xsKFY4wiiO4frnprQCcDQamT+grJQYS6zQ8LUu7xKzqYr97lFTSOfsilavkgfI8jUZ5jN4wEmGFXmHoMTXDdMnD6II/8R7rQeralMq36yR81mJMy6DYo114uKamZmTUxN76bfvTqhybD2qETOCFenO5DMqqdZaPI6+lIQ/+Oj3bCnWlZ4SxH3Av58bzmqKwCQIvdMoeddE/Mi1e+hUGjw5QHdjuDxgPCkR/GcKZBobRjA9yKCqk2eCQD99uD8HQV0I+aawpEWDwcdS/i0Z6ZA4gNV5lFfQiDb5STukH4eE3eUdJ9z2BslrDMc1J9VSaFidI6eVaU2U4wZrweqQbh6/+UqFZs6A8K+gKr64slnM6w2JpSestljQyt6Ytes/vLLP7tFj7wBVl/+GrfF6F11TIII6F//QrTX9ShemD4/UWR9EGMgDSZb2znYHBcT+1bmR5xr3Pfd7jsCsUO0WtcSl8Tvfjii6mNR3XGigMi6OnlzQNSkaTGNqpkYeLuudWVcZF39PaDlbToq6Z31sOyWhga9h8lkhSMjC1BbHuAbaJnWUjB+Hu0RXPsrg3MSpyxMfMj/3QVvQPfI9WYNgi
e9Jx6bueVQKZFhCWJTIyxlie6SdclV8VfkulZwUf1VdZ2RcRdMq90VIZsJMu8xm6cHLmpW/6QHcJFTn7MDmNDuGRelRZQAF+HO1oKW5cMvQPbwYxPCEluICKYy9go8yaHkCUrv+koCgu463iPa0EGcOJ5AGXeMU3thkNnNH6JmL4m+W1FhKhzfLBJB15TQyLUXlOYhMJTCTw8JPAlobFz9300vLqm2+OB399WIfSHlOhQskmHIo2fiizoVDk1+N8IfFYZoEthoVHKzpFkZeGF1uqHPx87IdRgQLOGoQYKfGC6FSmpSy4TL1orCiqDCX5tRMvBr3cFAcmDItQJuAVKPOjkQovKGcKSVWWgG+Kew1zMjZl+8A9GQhsSYkhEQdsMeUpjAkYYISBrS/hCloYImkM2UCBH6V5BGXPTJnWLjx7dBDZHp9rgWERyo6QZQwoDC3TC3+Ev5SF8nGGS/hxHz2xyqenZ8TuJ9usSxoJbpZ/2ZemWrO9XDsqPXyvNMDfaB4JkUY3JI+/UOgUcmchg/9auexD8tOoAAy+o38ILqAtkJQv7Fk+ZFeuHYyo0wjiWjphkMb8rEOfl2FgM9/1gU+UVF2sP4i1BjHVRsMP5eKGDApNb7q4oVPFdV3UiARpFzxqoX7LKIWUco9U2LBgkbSUc23nuqTtZU9IKT92+pwMCxW8a6YcvvKacviKq8tBLdyekkHLFCj6Z8ohDIpqmI7UFM6RHHKXwWOZh/LqO7DKwRDkReMYhyyPWFBXPQeyPSiLdRU+20KH5a1oCtNZjTSc1rSnY5oCdUyGxVGdP3FCPou32fVpRlOZdmqU76LuR0YuuKdmZBhdrZGLa3QyNqMXnGHBtEbuFVlnuhN1gJ448e5O8ukH5GFEMQKJw7hggbkYNe/ZXlkP+Mfgu3ARQzBkEP2L2nDhas+oNCKt9hFlQdNi8U/FAjYBFaoUasoQY6nBwMuAYT6zWGElDAW1JiHdFCEClxUBbygiYAzYJQKS4PLZ6rVRq/Uw6VqCqRuO1IpoANVTcotUpbesSIs4iD1atEWg1wzwjFLxevhOrtl+LqiW1WRjgqM/5BksK6vsFlfeVrhQCpYqT5W05fwAZbtclTkxLKrQJt5EAhMJPGwlsKVh8fM3vcy7QlE7Hnz50I2XcUwHSuOCaT2hcIcyEtNT4mseD/d8mYZhoZc3isWYYcH7xsaFHs/5aGYqy24p2V54yxQjfW1ECUia6atUv/jq8zv4zWZR+eB4jYh8eDY/yvcZFBg7Ni4wLKQ46C8MKIyLKI8pUNCwUQAtxfd4gXaMLqClYEjAN6MvfMvlQmly2RggjPJIkdnBIlHwGa2QQjcjg2JaC2e5WPDNVCpw2BUqviBT30HJ8ygNdK0UpTEXMlaRrpuVqRpOGeHztZ43JXJSFKkZHrx0wF3ehZz7/HzZWqb5UucFXem4NbNBhdjSKyy0MjtLHnrAkAdQKjSgeh1FQwgKSRIfGmF8gIfCMPCePPdlE8ZF2yMrjM9BVo7kTxTnWNaHSIbx+zKI59VIJLM1AXgMi41qWMQXdPVHjIf1lXJxdUmLCJbKxbWzulY8OsEIxXmNVJzXVq0XZGiwGJqbmi13uTjz4cSZlXLXsaVy/+I5feHfVRYOHS7XX39Dufra68r+A4c19a6OkNX6Bjvwq75V+Q4pJOfpS8Li2X+uS20M4SiDBrALj98Kj3KufN9rwFGG/qj7muqwpPURizrP4jhToLSugvUVp1iQrbpwwvqaUPaxdaymMnF4pdZoW7Ys0D+kLWOPaJH2lTIs9it/r0Y2eHbwgcLcySCwrPiAoPJI9Igf95vudfj24nGeTwpne1KRbB+modkIVL4NDfu1bq50lYPS0zVaVZ7E7cCtQIQqplKUmhldatQhURtA8JnFkl2zBuqBM2RUAOEge9rADq/yliyS7uwszqAV3uCZAWR1CdMTqVmMwIVc6WMCNIw4BaeRqvTtKS9heEkkr4YNhL6dkoX0k+QI+czEh3al2dqpz8/wJnUJ9FqCaFSuoxoVPuQbRHr6rQ9U+sR/7Id+NEt7yH3ewSm39PtCSYMn93W12So7uek6ePBgDzYJTyQwkcBEAiMS2NKw+AUdkPfq12nEQg8XPwTl87DxhVKrsJVtFO16oTSjkPmBpOet8WqRQtcaizAqUOR5gfurX81HCeS9wcvQB13pMY0RwFfHaSkH7MjEKEEaBsnLSI0qr+ZZGcmrlXvxiDLOg54yeGD6zINqVGD0ePqEaFAHRigwanKkxKMTpENH6SgsNgLEHy9Ftrik3Bh5CcOCRdrAcNgX89cdVnxauz/tkb9HvtdWaDqU/VyjgSGiC3wUHit5MgrSwMDggkfql37KI33kkuHmm0a8R3kJ+qVX4YAfd+Dhxn3S+rYljtssLXKGvKRl+My078ZvKYrV1AwpWoMSs9sQvzlFHHUeqZHJr/nKfOcIFjjS5MfUFuQRC/1pe/oKCZZAiCHCgmkFOb9mKjnrht/LgnheoOPG84MXFG4ptdKW84WO8XABQ0KGxcWVU+XiudO+zivtvL7wM42Ir+w+cZspRZ7WJwVcZz6cPLvmE6rvuP+0dknSgXja/ekR1z+qPOIR15cjV1yhMyLm6kLt6EvBmH8rv6SHgRWp+YuskZ2laxk6WLMznWjKxPWrOISpn+83+TwHCDNVkg0VTuuQvBMyMO7Vbm1H5Z/Q6AUlQYv+Pl+NhlkZDjimSu7Rfcmp2OzwNKfRCqY6cf9y37KpAhfPDhXm6WKs6YAHnA0o7qd6T2HUxYePMIKASZ496gIN1YEwjEX/UcBphubHPPODrFr7qw7AhYOGQq0L1XTjDMl5nyZWEIZCptC/kmL4kTvQ61MJG7yVG4FtGhIxP5SfPCbMUFQlNZqR9TCvZCW8oSOSNLM+ZroWCERLryUgt8FV3iqc85LHAUhEkq8uMeHG8zJdoFlS9tcOezS4CQ4A4GX90t+Ek0YrYRIXfzJi0cQzCUwkMJHAw1QCWxsWr3h5eY3OsWhD/TxQ9fDMFyQveK72VV8v7VibEAoZMolncLwQePly4nZOP0KBiq+W8VD3g10/emW3clDiUeCthDfDIkcsmMoQj27KMZ9SUNpLXryiMMT0pRhR4esksKHEMFKhr7udYeHdhVw86yxUtpQSn3sgn9GSNKAYsWCBt40NwZ0XG/CNURHGkhJUNl+Dp6T8sC2mDx6T4bAbY0JGxm4pr1Z2MDgEt1vKkBdyV+MCQyQNC+oZX5BRfqhX/Zpc01ubdO2D/DMdXgiruWqafADkeJmTt5lr+MCNwfQvxsTdLC3z8Mnv6UjUAyM0jBNIDBfRIdGhGoVWxJP3QfEA27zULMMCX3EyjiEbBm3k0V98hoH6cvvKXXlBYJBDXlr96tQQiVMSqtWPMvq69rJswDVAXuMplVb6ssKe5qRpUBe0GPv88qlyYflEOX9Wlw6Y2+Dkas7tUD/OU+Q9WiGjgoXax5bOaTelczpI7lyZ0iLtQ5r69KhHP7pcccVVZd/CgnaCmhKPnVERtRNX1C/6xaWGBXIPeQEH38C63zv
s1EpDOapbSIh7E0yyMCTYIU7rm8Q/i7RZU7Emg2pdzwXqsiKDCoPiFIaGpkBNYczr2qP7cEb3COeSzGq9FG3AgZhTajPyOB2baYbcMxjgGIjcp3yg4CMB28l6OhMGmUeHGJ3gK7rkYLwYufDIhJ5Xo6MSYQRFWhoXIYOY4qMwnQonvkIc5CsqfpwMw9VZGiG+lhICClxD6id7WKUcsI5kioEoMtENY/qV8kiG0jAiBtdh0kYjtA2clTGZvl9DI8D1W+uW7R30g3b0kyGFWoWMej5IAp7iSK95ncwikxzlBaNOaj9jsFmuqfZ5He4oB2ZLP8FHo0tgC5yEy/IyvgmVWreEoKiAmhgWg0wmoYkEJhJ4eEpgS8PipTIsXv3a1/oh2JR11ZOHYH4lb8aFXsjtUCm92K1M8KwULA9aXrYYEmFUVOOiGyEYHuyC11/gM2VIij1f+HV5xALFQApE8rBdIyQ4FBaMljBUeDEGDWAxBqxQSMEALxSa4IWvuz73AONC/LAbDTvOAMdXa07fZbH3Lk7jFQ/Q8XoPaDJ6ojhfOTfEBlMyMCx45aGUotxMz8xoqpMu+VPa8QmjYocOwbPB41eoEFWHHaLFzlAzLORmdyjOEsCwEP2QsRQl5K4rFMFoh/zCmvJqBofgcJmugMMYFpEuo6y+wBtMZLXfPp3w+AszAcnDuZ1d/8jJNiAW+ISGliYGX6RZauojQSnhAzb7A+CGowz+a1kBVTEjAmjLd0Q/9GErtsJDYTe/Th9gY3pZlbO4yTIUcB2oR7SByov/WgeiXeGim3LJ8vFJy/RePuRlWY031xFFVpeUYA6T2zh7qqwvHS8bi/fLsFBYp2uva5rQsnZ8OqutVpdW4pwKrRLQ1/9Sji7q8MfljXLu4o5y9XWPLNde94hy9TXXllmNVHBoHCN58MOIm/lRLZBk8Bj1oVUi1VwKDhM67rHkmXwbaGpDp3VKa9YXuuSZqsrkXsWoWNL2sEfPLNuIWFzd0H0lo1337bTuC3Z3wthfE122hZ3VfTKrEQmfqSPed9OhRRhjwRdGt41tSnOW60JefhRgwwQ+hsAXRhtrOxi9gA73E8a864vcGbmg3yhs3sfCkVZlIR4FZDiXLfrIhDRoq0CSq2yVrDD4Ndl5hq3pIA1o9cYFKsgYF/x0Tc6UK35xTgPebIzDZjx94Ab+B9rio5Y5BKAersOOStX0wXAJ5Eav8gxv4LZ0hUnLovr0LNc4FZ/8BqOw61rLTs+8deUlTI/b00w8fKd3CckXScavMAnSeBkSHMoyiVwCU2ETZmJYVIFU76OfP6fDL7VJhRx9cG56e7lqYVd55BU6R+rvgLvtU8vlQ59dLt//nANldk93j/4d4G3CwkQCf1sSeEDD4jUYFuKOebFpXOQXTn8550Vcr9jCs04VkmLg0QQ9DfiSyYsuRyrsVyMA5QJlfHg58bKJlwT4PhcCo0KXz3mQMuH1B8qLL5FxM/NOP6+XPsYFfOoJbr68QFqGAWdFAE85fA1l55k1fQWNqVBSXoRnvoTHlri83YBnKgVrKRi1YESBbWr58snFV1B2fxKgFDPxIXhkgzwwPuB5RguzWZDNiAW7RpGPogZ/8XJSTHUhz4aFlL1pwTMtaqe+ysaISBoXYRjx6vWLSGTw86WEPxgekS4Q85NwsInjIU0etHDwjAMuX349XWfqZzwv03s/YTKtp0lan59lbJbewwWb0TdaumRovFqTGrHn9JofMCFv+mFvfAYwwghJwA+ywQGbX+FJC6ND7SshJt8VNIRJywZLQQ8kAMxH0k3fRTQ6EYPHkE8aQCRQ3ziVWgrwubNlTQffrS2d1Dayx8vK6ZPl7KlTWpx9ppyScr6or/soxhiqnOy+rFGLC9t14vjcfhkU12nB9hVl3759Ut45AFL913VRmdRX5UR96EekwUxyFj78xHMgFNfWFsLseQ4hVuSKioc8ud+ZEsl9xm5sGEV3LJ4pd4n/uzVCwbawh6enynU6DXs39zltIkY4Ld33lbZ7Rv42omFUDojoHY7UNJoBfIxoXcZRHPkw1c31VxXpE17wjhEhGQDvGyXuBRsX4pf+MBgZaWhE+4BnWdh38f5p8hFNcyriApG7VDaRqvSWH7WCH+OYgMlGWQRJMwptpz+AcUm+wxkSA+TSuFIoKBhMoOpDezMMpVUeDJjlNgZGyaQ8kq0Eb3wn+CY8ANvDQSvp9TwnTMsT41len9fyO7p9WrJyid8aJPjJOlyCW+uQZUKHPpSuTyeN+MSwSOmE/6rfvV+bNpwv+2d36PlSNEWSjVYulhuv3VN+8FsO+Dk1ivE3G/u9P18sf/Hp5fLT33O4HJyPs63+ZjmYlDaRwN89CTywYfG61/qhbOOgfq3jDcNDMA0KwlwoXjYupPzzdZDRBF7mNh6kuDPlKIwK1lmEgmejghcEshEszrspKQhNz41Gke9GC6wUqCymXVEOaODbsLAyGC99RlAwBPZoQTSGAfzyYD+nL7ur7Omvr6WMVti4wLBQHvzwAIMjGwiiP60vpNMyKjj/widviyZfPb24uhoWnJJthUU+xoBHWjAWXDYLsnXSt5Qg6oYCZoWVUhS27FxHTe3Q6MY0xogMDA7Z28UIh/LCmBoMi3xTWmKi6VdnlRn0UvlFNqGAVVxpWKThwqttWRPBzRck4XR9unG7vITp/aSRsH28D/dlZPpmaaYtdtxPsr8oseOwL77pGQJ1W4IIffpIGAz9C77KhjqZdijPKPOMboFH29mwkMJJn0aIWXbyG3FzGPnAJFC0kNs6ZNKzm3CUFXxaSa8sUj4KLYstGbVY16naa8tnysqZU2V58UQ5ffJ4OX7ilAwLtmNdk0GqL/oYwUxz2qEzW7Q5wL79B8tBLdqe37fgfjX0DxgUff8OPGEgSQr8OC/84MMjFjBaXfAnOPhUmnqQf8kmJfIjj5PRmba1LINiVWGPEMq/Vwu171paKXfo2i/D4hotvn7swlyZ00ghJ2LLfLfceb7oNq5OvFu+USLlWG61TNqFC9w0LgB3WjUs2oiN0r3VbDUgEte1kdzJ89qXzsBo9XL7IKha18odYohaUyj/ZhYwOf8kZPOhGQ4fDOpg0pGshhnBdweLNNOnzkAGeuBsWlaWU0FAybLTb1nBNzJp9RGMKbj8BlirdSnthGhlKAGqjc1L6IzSyBg8pDOt5KPj2XyOxRMn8cEd4aXS7dMS5xIf2Eo/ZCII6I0DAiPY5HizMhPHMIL9sR98yTiVhyz+cFi8jWGxUzfvP/9Hhy0Hnsdvum2x3PbJ5fK937RQvv4xMw+ZfB4s4Q1NVYDHiZtIYCKBkMDWhsUrbyq/mYaFHpIoZBgJOB6ovWFBGsoXaTHlQEYGoxaCy9EARgqsHEFHygT08kHuxdqC9VvUfpQRtFjfIGW9GixOk2KQhgXl4sKwiAXhPOiB8/oMGwQyBKQUohxwGN457TDDFfPRq8EjnDB04ustdKExK0VnViMOe3X4Fif45oJtGxbKZ7oShgPGhs+y0Jdi8FBabGzI+GCxN3Xz1z3xytQw3ju8jpAl06
NQCGc4SZipU9ohihEMzrSAphdyS7bIM1+O1Hl4MdV0izDCUpcNm+0UPgD8gxlyc1si8+qyTUjPsvAzPeE28xOevMvB9+k9fE8v03tYwn59J6+qQgajOn0dglriWNaSeY660fey/sgl6EQd2VTA0/YEIxQqUo2KKo8qdHCUYhnBL3+tRcgjrbvgiDYxGJHORX1dO8st+wfA1CFHCbwtK9ODNHXn3NmlcpYzKk6dLKdPL3pb2TX1afqajVL1JbYwnta0p71z8z4nhb4bZUXhhJuMFYZ/ywxZIzTHLYKaHl/1SU9c8+f+HIIxGWOCF88NG9MKs5aCnaoWZQCd5UOD4Bj9Y6rTCRn7d2pL2X2616/S6dhftV+Ghe43DAsRir6f/OEL1zwgI/1RBooHsiJMHheGRW4xSw1xpHMvxf0pw133GO2HAcFieLaTRfZMjaJsGxaSexofHrlQOTBB2bmuImXnQvQTXLpAUXOCZUJw3IW8x1ODV8rBBb14BjtBdbBTPqNPOMvFIdIUh087IkmjEsz0oQCljOdZEnVkgjxxIZng2nO7xis3lm+DqW1hBHAqrOOJB6O6+ryk1XOTNJrf0epxW75o0tbpMgRsD58wfVribOlD+3I81HKhbbqblJnluXzBTQyLUWmPGxbkHl3cKK+UwfGMx8+W737KvhGEdSn59xxfL2eWL5Qj+3eWQ/uY1jjq6E/AHDvDuq1t5cr9u8o+jYj0jjIwFvbv3VHOrFwoX7h/rSwofNUBnp8D5NI5bZAhOtcdHp2aRZe4/9SGeF0XjZ3lygOMknaIA4lJaCKBByeB9RPlsx/5SPn0HUfLGc0O3D1/dbnhxhvLDdfqoFt3X300P3lX+euPf6x88u7FsrF9T1m48obyhK97Qrl6Vo+pDe22+IW/LO+5/QtledtcueJrn1GeecPcgyv7i4Ta0rB4+ateWW5+w+v84EyjwtNI9GLhJcxLl6+4PBTzsnGhuLdnVR4vb4yImHKklzZKW2dU5IPVnyG5Y33vQV3Ohkoo5SjWKOueWy26rH8gvrOegk355pGHN0qjfHiDj9hNCsOC0ZPzHqlY07zqVU2FwvfCVysUYTihnOAwjBh5wKCY0xaW+CzERjnznv+ijVEBbzFvm4cHLZz1wNgKQ4spHDs0d9x4FYatU3kAAc8Cc2hMa7ebaaZD6WI6lHeK0siFjQvkqcsvIWEFl+CHs9QoWvXk8kgFfsWjbZh2ZdkKJekIhFhLD2p9fmIg1qHUAX/IT1zguBJmM7yETT9hM977Sa+Kq/I68Gz6URG3fcel+Wj48GXFyGAqgr6s34pLH4q+LgPVdY0yDANc1kth+LVs6ygGkNC2EwI4wMR0I/yglfkBGHD8ug5JQWSyDvCTowQxNQvDXCNtjLxpfcK5lbO6NIqhrSBRinHwhbK8W9PqdtOX6lQ80nsHT61O+eJTHS0HKaQWgRDgJeC4t0JRHdJUa/MoHwRkpD9uI+IesZTPaOUZGQ8sxj6qKU9LilP+nEYT58UjvC3L2NilkZI5GRdsG8s0RO55s+b7WjyZl/BVgv6QOx8D4oOAjRhSqrypsfZti+lQCuNIa23D/VE/AritlMdzgot6pYwuyLDw7lucbq68aAtzQ5Xl9MN/REiIcO0LkdDli78GU2Ejib6ScD1y9JWgX2XtOgIT/awShFoNhh/xKI9w0Oj6Y8IHln4DdoAgPtBqdQyGI0f1Rlabuk4m5JvSWNplcRMenzIggKtlZZy8xldANJgWrYGsiaNjfJDmNidQ87IPRNJQz74ePQxwbhMHRut7CY+A1vtyYlhUgVVvM8Pic/etlV+99Vj59q+fL9/8xL0N4a/vWS1veMcpjdqet1GwpilTX3XNVPm+Z+3X2sV47t17cr3c/KcnbVTsllHBtCrei8//hvnyjMdJ+6ru3/2XY3rna13awV3lLR/ULnw1HXo/+NwDpk/SHynvbR9eKv/6xVe2Mk5q6tbr336y3HF0Tc+v7ZohcaEcnNtZfuA5+2VgcJLOxE0k8MVKYK3c/f4/Ln/ywbvLhemFosH8cureo2Xj0OPLNz79KeVrZDlcPHNn+dgHbit//smTZffBQ2XvxpmyuKg++FXfXL7zOdeX3SfvLB/5wzeXD5arypEdZ8rdpx5VXvDjzy5XbTtXPvfe28vFxz+5XDenj+L5QP1iWezgtzQsXvFLryqvveUNfrjykg2FC2UjbjNewigDPFC5/DVdN6m/BMp3ul4DORXKU430oraiVBUReIEaD1beE1bwlOB3BjSlhOdUoFyIvVPlptHgUQsp9pQZtPBFACVEDv6YmuT1EIJBIWi7QEmJWWWtBdMydOHnqeAwAG2mYM3pJF8Mi1n5rLPwugfKZEQGhQT6nq+dO07lV1MdyiVec73FLilQObKB7Kg388xRiOAzDYsZGxeaEiXf6zNYb8GoBQpWlSu44y7krVTLTbw1v7ZTbRvezCEtQGmnpBSBjJKXjnC2e59GuIcjnnD4mZdpGQdu3G2Vl/j2xdag8ASVSK/8jigKKJ0oHMEXYRzwBsN3fsBZoa59s5dxwMeXcMKUD7+0GzsPpah6HLjxPaJMK6yVPXs13NfEvMGneWwAMiJilCB4UK7K91d5G8MyMOriY0Yx8nA9Kgd/XqfDqBcjFb7HgteQdZSRdF0HfqDPfWrDIuUVfIUBEc8AchI3cYg7rDqg4HvTBvk8A9jt6YwWZ5/SaMUxRg2Vxn3M9rBHZnUuhe4tDHN2Y9qpKT8s1o5NF+j3ul+Uzn0d5eZzKKQVaXrWwBNly8dRQ1SKHeKhrm5yGum0IfwiC2TjtSm6l7nPlGzDghEKnA07lc1ibxsXOlskDQvKptT0LIOImT5BehfO3FZQc6eymwyV6xaBH8MaZfjJdJiDEr7TBI2vfmg+nK/QmC+Q5iIvEuI35AhAhoILuEyIyCWWtCnXTmXBVfSrSBr5rbxkWsPPBPmXxVWeawyNSmcE1jIIni6h2/HXisq0TBjjLZNdt648aF9CvwEH//C5GY89Xh82OvxXniaGRSdQBTEsEM2PfttBi/X46Y3ypvedtrL+U999uC2YZlThpt+5r9xw1VR50TMXdA9vKx/665Xyu39+qnzDY2fKP3naggl/TIvBPyeF/2k3zno0YmX1Qnn1W0+UOzWC8X/+U22/XQ0QDAvSWCz+AuFee3hX+ZMPnSnv+fjZ8sJnLJgmBMcNC9r//33z0bKiw3Z+5FsPlIMaMfn0Xavl98QHIyD/8nuOuD5mZvIzkcCDlsC95c/+w6+V9+98SvmW5z2jfO3VGnV775vKmz60Wq558nPLdzz12rLyqXeUt77zw+XeI88q//3zvqbsX/tC+ci7/qT82Ufnyrf+s39crlv8SPnj37i9XPPPf6g8ef3j5Xd+8W3luv/9X5SnHH9becOfni1P+yf/sNywX7pCfaQ/aNY2AdzSsHjVr/xSed1vvcEPUx6GVmZQhPWC9ZNeBK04VWXXhoWY4iEJb/ky8PQEFALTqA/n+sA2jOnwcBWOf0gIOrHIUsq7vyiGgpGGSzM0ah6wKPsmU
fnkdQc/PvwKnqzI51dpbW0pY+KcFLMVTdFg7QVGx0WNqKDAY0BMsSe+DApGK2Z0YRzskJJmJR/DQgoJYeSAIoVhsobCJyUIZQTFiakeU1LuWMAd6z10yrF4Bd47UiFP4aMEsnvUrK69Ool7Vlcu/vZaC5XjXbBUH7/AVJ/ehfzjJQVfzbBwGLnUFxiyrXIBH3nFj3wRBg+XLztHNvlp9GreJS/MDoe8pJc+2T1On96hGibzxuGjG0WfSpge12VQDpeBQ3KE2yXx+yt39m1BA2VlVnD0d3+9BqfSoSz+gMEhsXHenC64HDka+AseQhEc5Jz46YNPmPLx2wV/Xfw8U4r4mi5l14fr1Sk7qpT7mUfJ6KP0Axvq0V9pZpExLf1SXHP0XRsGXTmhXAc/NjoqHxAJOpVPUbE8Ky5hDAs+LKxoq6qzGilc1D2Hyr5L99lejU4s6N6a1/3BrmguW/fPBY3KcP+4L9vgqMYEdOG30q8xpcR9gVFBbdyt5dNC2wTrPq+M6N0AwXetN+2pe5BpZIz0MEUKQLc9zwSMLP1xuF4YF5yBEfgmkav2zVal2cLE83Kx7V6gEGQZDh7TKVQj0HfQPKoWLjDKjj4FbL23Wx50gAmf36H/wYMzSA7WalmOdnn08XBIL3DoN5Vwzdvc68sDPkvsywYm4z18wiZlw0BDVw9HvXFmaQDO0OX9imeArr7JC+njvPV5PeHkFfiUVs/nON5IvOMfmpMD8nrJhmFxv6YljbtnPmFv+bYnzenDQ0j81g+cLu/8q6Xyf73oymZsgPPvNbLxhWPr5We//6rsKuOkyu2fWSm3vOtk+Z+ff6g84khMacKwYDrUT2ltB9OhcIyA/Mzr7ilPevSM13eQNm5YfOrO1fLrf3K8/MA3HyhPuH4PIHZvuf1MeetfnhkpI/Mm/kQCDyyB+8o7f+3/K+/f9bTy3G95enniVaXc9xdvLm/64Eo58qTnlOd/45Fy73tvLe/88LEy//TvK//oaxjJO13u/Kt3lz958yfL4Rf/T+VpOz5S/ujXP1yu/+kfKE9a+2T5vVf9YTnyUy8ue/7zG8tnvu5F5UVPOqKpgaHPPDA/W0M8gGHxi+X1/+kWP2BT2eTt4GkCns6kCA9TXdIPpADgc6PXaUlWyFJBUXL3cpGa6xudh3IYE/F4jm0vg2mm7bB4m9EAK/JSMvzwpjzK0pXGRWwDy8iElO/65Q4jga+9KDb5yOeh3l8sJl2VsnBW875XNJVkfV1KhPiOcyowLFhQLYWHsyi05oHddjzNy0p+fK2GD0pAcVpVeWuUyxxtlQXf3kVKi7gxKnxaN1OpUJT0x5oOT70Sz0yrmmYthwyKeebFY1hovQWjFsyZ90gJCqJgvVbDskbaIfc0JqxAuk1CKUuZIdUIO2T5VxJSOuMBDa0IBSyyMk4g8zviyEs38sLMxDE/eUnY9BOsp5dpvd/D97Dj6RmPXhUUkHdLb/1AeQJCgURRHJS8qBfwVi5R/sAxqdqfBELcNE0vyokWkRyVnzy63mQP4nI/dXqXFiCRAN3LXVb6a31YZI4SHjsbxXQdvrRjILt8MUKfoX/QNxhhwydPRchZMgQGl2XXMlIOF+vuSTZ2gDar/FReHQplMWWTdUA1Zwtc1lOsSd6UyvOCLWP36L7aI8MaA5x7mjvKa11UB2hzv7gU8ZWjCMpwmmofcjIkqYMDxyyKmbhLleA64wfPDTrl5JHImO7IfRF9g/6BsaN7W88Ln3Iu3jA4cNTxEr+VE3U1AD9mKGMhKzNVaahhnJlgJEeSUgiYLj9kAMpP5tV0ecjXsIapoIDLGUqEfbfX/MxJNiKeeD0QdKEQbghF3H0uM8f87AsJE31wnMIYkqKGUJngJ66hqqxGMJI3y+pS2qSM1CbhKaeG+zLGy0wYyjT/BKrr6WZaD59p9ivvPc7EsBiRkEcsaJJ/qlEI/GWNMHxSIwDv/cTZcr22nP2J7zjktvz1txwvXzi6Xp77D4apUVD6xB2r5bP3rpb/43u1G163juL08nkbDkusn9DIxLs+ulRe8ryDnjoFHobF6vrF8i+021Pvfu6W+8o1mh71wxqNwI0bFn/2kaXyhzJynvt1c9rsZWjZo1pv8T5tTfuiZ+4vT7phuic5CU8k8CAkoA1Obv/98ge3HSu7D1xVrjy4rdz32TvK2fmvKk975lPLE68p5bPv+AP1sfVyxbf+j+W510NS51d94rby9j+4rez4zv+1PP/w58uHbr21fHrvY8vVO4+WT3x6qjzpG5bLhz/6mPLdP/zUcvVu6c4PgpMHA7KlYfHKX/7F8gYZFijOrGew0aB7ha/y/krql348XKUb+OtivCiA0VfUevFA4Mp3AHSaEqywX7sodeKYhzBKE45y07BAqbZiJHjoUA4XX/DJi92f4sA6GxaiwdoJGxdS9HH5gO996sKIxbJGK5ZZzK0RDN5ibDM7pR1pWKy9R0aFd4PSaEUuym7GReUD44B1JOy7v4bypfJ5rDCdijUZLMLGMIltc7VDlA0EKRUCko4VL04pf7tVBjtD9YYFU6LiVO5hfQeKIYLgz/KsiiMySzkhh5STBaCfjOPj7FmeNc8UnWVYZAVkwAMYefmbdFKmmX45P+EzfzO8cZiExe/he7jx9IxHTzJmU6YaHWWifKEbplGReAC3fikA0k3L8gg5kAaMlXy1v131EGyKSmCWG/FKxaA5kpH3FYluUXADuNU3+bJCT7nJk8LmQ33PxkVVfi+oz4cCngzVvqB+k30vCqk8waS/uifXIWvTdhmqKcaXyvF0KzqtQD3qYn5R2yuOMjLsilJr06cs8rjnw1EaDzP6rNdQ6V72hwT6tyDXNfWIcuE172t4QI4pK7dTlUdQHf3NGsWgQpVHMNjk22PQrzgoj53eUlYhB557Wl/BdCiMC93nflYhH/56mi5GP/53hmEop++3xAMUQIdIcn2Tb1IjnCkAjMImRGVitF7uTA0CZGO7TRQe+IGHEcqG5We0fICG8odQBUd+DXMIZHnpD+UOMFuFEm8EptatpSVfmZ7xCpC89vz1dPtw8pdpLV5pjdAYS0ucxtdmgbF2oQ/9+A/92GaQD0na39SuUGd1RMzs6NrmB12fzdZYgPw77zlV/kKK+j+TYYGB8cu/f9TrJq6vIw59AYxqvPjZ+6VLbLOxcMs7TpaP33mu7NMaCs6eOKdpS8e1APvBGBYv/U/3lSMLOw1LGeOGxZvft1je/bGz5VHiaXynKJr7+VoXMlln0bfOJPzgJLBRTnz23eUtb/lwuX9VH+Kmt2nDlu3l8OO+vjzr6V9brt+/HobFZ6ph8Qio6nDcT91W3iHDYtt3/KvyghuWy6nPv7+8/V2fKSfLTDl43aFy9uOfKgef9tyysHZC57ftKYdveFy5/qBm5uhe+W9xWxoWN/3iq8obfusWfeHkpa8vnXzJ092BMr6hT48YDjhumDzDglcQSrWn+CgfGD/b9UQHDnzo8CWfaQ7A8xD2OgwUF8IVAdjcbpb1BzHNCaUrKo3vUQ197WTK0rS+6jM6QDqKImsmvJ2s
fMrIh33vw9+qlIUVjAsMCz6pCpZD8bimdcXaCIwWeM5F4+GjEOEwYKCDkbKmxZ1pWOyWcsJBehgmeXGmxk7FfRCX8P1FXHVn+hRy2aOD9PYyHWp2r6dFsZCbXaJyzUWut4ivzqEwonTltBFkMm5UZJ2RDTwn32ZeGhcGCVK1TFMtIKG+icklL+DD2+y3L2ezfNKgk3DjMK2MmtHDjeeN4/aw5BF3WbUSxFOxMKwiSnJfoaKEDaOADQa1ScYNXwtEFsSBYScy50FYaai7dvKMi8EM4eqcZjjuhbifwrBI+YeBbgMReQ+oQU/lYVSkgdHoce9wr+Hnl3UMapcVSriE4Zuwl2OQN/OmP8BoVAxc0WvTwKir7n3TVxjYbYwkqD+FnKOSwVOyHrSRShSv+8XwQZ/6AR/5PBu4r2KEkjDTu1CAuIhb9m4XpB99CcJRpnhKgZFZHbTTtbDZos3jyvz0MfgxmqgfUx89oifYmGrGdCjd4xgX5oV2rwUmPbU7tCM58hPG8kcY1WWo5wWYTM/6GbyrFwA9TCsreYBGRwfUhDdvlQfCmU4ZznNhw4/pCD7z0h8gFKr0RtJIrgmJkz7JlkXNx+vzoJe4CTKe36ePwCYvtEGGO/o9bE+zD/e8kd7iHT0xnCzYT3zDZ04Pn2njvmDoHz/xFWhY3H7PhfKkq+I9OV7tB4pfzrB4n86OeOO7T5UXfZNGAB4zXX7jLVoncWyt/N9aRL2VY83FBzT16Uc1OvGoK8Pa+fgd58qr//TEl8WweLumY936/tM61+KIDZCteJnkTSTwoCWwfld5+2++vnx41xPLs57z9PKEK7eVO2/7L+XW28+Vq5/8nPK8bzxQ7mHE4tPVsHgklGPE4h233la2P/9flRd+dZYmXXP5vvLRP/z98s6Zp5annHh3ec/ijrJrfblse8S3lRd8643lyF7p2wn+JfhbGhbsCvU6Ld62wioLxgqrHvUoSv7Cq5cqD1uUoJ0oprp4zrIgGYUeZSAMi3j48pqAlpUHT1nS450HqrJzDrbPt/DDOqZJsPA5jAvmPetFz8WDWjh4GBs2AjQaMMuOTYLHeYoT51RgXOhKBYA8HvpwhNLEKEMaBOcEx8gD9DEo8mK+dSzURnmvhoWUHMrytCTRw6A4py+Z+EyJgj4KI4ZOjFhwyJ/OwpBB4Z1uWKgquhKalZQ1KVDr+gJ6UfLBcOD07VmNVOzVotZ5GRjze+d9WvKMpkexABxclJ40ElhATBje+8tvZpipjrzEUbA61Zn2A9d/yJa2yXyCXWRIdsiwY2lESc8X7SbZlyRdjk5PI2HGaftFPlRopFy3tmTQ2l2t09Os3S3yBUdfcR9HkXZm4CbDpqN0WtnbwtLanYxDTeMeCToBU8sHT1fQ7WUa8lYz+B5hJyTfd7RJgrmY4Anl3nTNX9yP9OeL6s/mT2HuP6eJOdoPORgcZhvLybh82suVjAIxGKBlOtQWvq0w1zCw8Cc4+jEuyuA3YspwKpQJWVm3wq6+Sg40YUpX5udmDYxEkmc5qi52lKe08zLe4YV+wH0AcpDhw0B8HAiERIN6DWeAYiWndOajRrIGEPYHAD1nPH3R/MjQ0fONM0U8goHcByIuP2nBa8oiDUzznLJWJm0TDlm4KvUnMVt2DQBfAUHAiV5zmebkbNPIrZiWa8J3mE5K3h2pdC8HQ+ktb4wH52Va5WmEdmUg72miI/mblL1Zvsvp6pyySL6qhCBeSxz1RmiOZl0+VnkD1+UQVzhppQ+BrB9+n57EnWZ6Mix++Mcz+SH3ua973sYLTH59/6mPr2qqMNfBgwfHQbeM37VcyjVf4nETlzMsbnnnyXL7Z1fK//Kdh7zVa44c/OR3HS7XHrr8zku/8qaj+uB4sfxvLzzSeH7/Z5bLb7/r1JfFsGCh9q9pWha7TD1L60AmbiKBL4sETrynvObff7BMP/sflmc/5bHliFTHcvQD5fff+KFy/OqvK8/7zieUjT+/tbznY6fLwWe9uHz7V08J4Gy5+2PvKX/6nz9S5l740+Uf3xCcXNg4W4594q3ld9+1vTz3hfvLm//tx8ozf+aHy9eef0f51V+5tzzzJ7+7POHIfIHCl+q2NCxeqnMsXvv614fCasWTlwgvqvqy1HMaowIFiMPoeMLy8kSpx6Dg8lfU+kDnQZWKOcaFv9SKcx73aViglFvxUlqMWGgqkZRon5yNMUI5aF/OlxImOijr05w1obUQTKfgQc0Cag7A43Ttjc6wyAc7ZXAxXYodobw7FNNHRBceZ+qIxR4pN1bYqb/rGl9UMSpQfOCEr9Ys/l7RrjwYKTjXVbx4VxvxD+wUBoPozohfdpcKw0IjQDJIPLLC1AqVgSKzSzAzWm8xx3qLvXNlYX6fjIx9ZUZGxi6t1chRCy/GFb+0AWVywZSVlRCT+akpzh8MC+DJ1suR+oHvGgX/kZeYlUznpSxdptLTH0AgvvnLfIAZQpfiR16WQ2wrmPG8xOv97LuZlqXTRUlDJ6Uv5qhA5qdvmNp3TMuI5EJAXgpN6YxauZ9ZGRe04lxBP+SCgR7pA7rvJ/Uvb61c7xPqFioieKFsD0q+UjCCdN9ZUa7lpELQrwNwWapfuGyf4AXexY6oR5y+5bQqD6dX2vIMBySqFfw5TEaE/CsClW/dtkBiVNiwID06KDzJerAcXE/6s+8v9owPGJ4jyFJATvM0JNeDvhujJpTluiKLVsehpuTX/2ARTjs48xHg5sVxyuTe0sV9aeNCMJxzsb4mw4KduHTPU2t4g3DDI42cyAzPaWKj1guUqGH8mkYNVsQBz3QgEPUI2i0xKEPXfGQ02iVi9Vf5PVYrrmV3uaI3ng9YygpI59f6mEQtP6kkfuIYZuxnkEdiVYCu/M3wEzrXzjdmK95mOGNFt7qMp/dx+Bun1cruARUGLmGzXmMgLZpw0EJOP/EjX3mGxVl9E+iWN7S6P5gAhgWLpp+nhdo4wp+9Z6381edXyuOu21N+SKdv487qPImb3ni/1mnF1rFX6WwKtpL9vM6fOKX1FN+hKUi4N2gaFLtFfc/T9pXHXj1VPiVD4A80wgDsl2MqFGX8uz84Vu4+sV6+7b+bK4+9ko1atpV7FP+oRkaYktXfKsBP3EQCDyiB0x8qt/yHd5W1f/Cs8uynPb5cq2U65z73rvL7t366rD76KeXbv+2JZcfH3lre+t5PlpXHfIfWJD2qTG3cUz75vj8rb3v3Rnnyj/9Q+cZDKuWiZtWc+Hx555tuLeee/iPlOw+8u/z8K06V7/03LyiP2f6B8us/87HyNf/yBeVJV+wrw9YDD8jdJQBbGha/cNPLy2ted7MVa+6G/iHJQzAU7fiKz1QoXle51oCvIVagUmEAXjQ8WqGXNMq7H9ZKN54exuCmYcFTlnyUq91cTB+qSj6GBTcnhgeGRJ6KPTPNTRxfWVlA7ZO1GYVQGF78EK8PfZQU0hilwBhYQ7lHq4KmeLOxonMnKBs+rHir3J0cZKe0MBjiSylTmFijwSJwpkSFYhgjFRyiJ6aovWlhqOSoBVPMSEdWPt8DhQgFRjD
sTOOF3FpvsY8Ri7l9dcRir4wSGVAatcC4yPnoNn4QCg1jVwPNi/ajLsgtjYuQMyw2xKivMlzvJCc/X4KkZxifOM7wNeyEL+InaTwgCu1XgXqcno+eBumNV3qakKEwDp8Kfo7EhUGQCnjUzbREHIU9whBLbpRBMOuvdNPAr8ZD0rTCX/Ppg/R7w5gANGLqIX1/j6bh7NLXcvoUbRRqYjUsRMN80G/hSRcLuYMj8hRPXqsPv6T1slOJUQ01I7JJPrlXPWrhfHKAq2XiV1gFjV8hgIoEhVQZiwSx0EvcR7JvOdGFDiMtwg0lXsaFDpo0D5IHBeTICTRsaFAn3Xs5cueRC/MX6VEpUEXTpZudDFU2Q1pwCly6kKvoQI/6qEz42lFHC4mva8RiQ8bFhu5947psQZsOtAZ60I10QsgkOAo80gJayZ0L/FEqZCdQljWkOFdEGk5Xp45wy2+ULgPX95OB/47SKMORUWmZh8vQBRB6PX3Cm5bRFTeST9mb0O9pjsAnncrzA9W9yVB4wF6OVtJJ8vibwm6RnrhfiYYFs4u/1M1mfu2Pj5dP372a4tGzcFs5MLfDOzN90xNm9f4bpH/fyY3yRq29uENTorJbzOnksGfrrItvenycUcGi7ZvfFmdMQPTw/M7y/CfPeyrUj2tL2xtkbODYTYrF23nitxP1c9Nv3++D99KgYbenP9VuT//mf7hSHw15n8cC8zfrdPCPaGtbTuXGcWbGEx65x4u3nTD5mUjgi5KADOI3/Xb583vnyyMf99XlUQcvlqOf+ED5q2Pz5auf9qzyzCdeWXYe/US57V3vKe9T2td8w43l0PKd5ZMf/a/l2MFvLN/3gicVm+DnV8rS3R8qb/rLfeWF33Vj2bXxqfLGn/3jsvO7n1e+ev12TeM7UL7r+59VHr1/Rmc/feluS8Pi517+svLq196sF+rwIuQJyyvRyqle+F53geKshzUKCbshMVLBlKY2LUP8gQMdH2zHC1rKO47bji/+gQt+fVmKnkcsWJ/gKUQo0TFiIcrG9boCKV2xnSuHgWleGEoINKV0eNQEo6IaOZ46oTwe+i5PMDYsZBAwcsFkC15KKDPsUoNCt0t8onOnIu7F5HxN5eul4FA6MApi6hecXRRejFAwN/u8cDdUP5THHaI9JaOB0RAWcXtqV+WXqVaUu13pfBllOgj1jkXjOoFbxsS01lnMysiYndmrBeVa0K2RC0/TEg1k5Z2iqLwc8va/0jNO3Xqjgni0LfWuOOCqwkxUIb9/QRIm7XLO9Lr8HncrvM3o9fDj5fZ0e9xRHHIGxSvhaB96z0CDOhJHCR+UavLzAhfaiZeGQdBAaQ3ZOS444g1WdKFPf6P/ud+l0a20wbCIPpKjHPQ3+t+0+oOn06lvxBqfaEP4Df7CF+Ggj7JNWR57Ex+6D21cMHUKRnDOj6AZJehKMKYAj4Hjlqbete7ABm1BmYZga51MWniqaaUPcBCmiyM/9yn3q8hi5EIZANVnRawJgRlGBmxg1/sM2Kwz/FEHTweTLKERoyBRQhCUHGx4BKzTgHMgfpo8FHX7UidXBPIKg688ytUvUGWb7v3tGDy6P5ErW89iXHjUoso+YF152Ky40CGMg19F+B8SSR4csF28h+v7eUdU8CMIjdYYKadDr6fT02+IsNDRvBxMD09ZdpX+A+H0fGQbgN/oBLX47WWllB4+wbbit/FS65TS6nnYtFyId2WPy7MvM/kIlFFqrfweaCz8lWhYcPfEW3mssg9RFGWeQ+pQ9DmHYjPHidk8jzkED8eJ3f2CVd7XuPHTsknnKYLKg6NbMPW7x40cnvk6xEx8oD/Ma8gGf+ImEviSJXDu7vKBt721vO+v7ijHz5Wy59Bjy5O/6Rnl6x93TdH5i3IbMho+VT74rneU935aI307F8pVNz61PPdbnloedblZeXq3LX3kjeU3/vAT5dTGQnnid724PO9xB8qsRv7+W9yWhsXPvuyl5T/e/JuhfKoUHqA8k6XGNgU1lONQPkJZr1OgRl7qoQyxPoIpUxgVrAng1jWOjBFu2FSqqJC/Dgp2ylOSNC1IayhiTUMsDueGpup8xbWx4qkTjC5IuVAG+dD2yz8Ni4pDpvPkr3ikIQ7Hg6CVavHJSAWGDPxSBvzCd+4G5REX5YEzKGLUUwqhRxxkeOgzzbrksKopTlw8kKCLsoiBwQgM8sNnO1kMBRZ1sxsN9bCapLIvUr6MDM61WJhf8OgFi7ltWIgnZAVtlMJ00VZISCzSbvLhjbCNJPGBkTFuWCQsvpFE0nQraaeb6qU/xgWvuv5FuhVewvd+wrtYGlMu0wj3tImnSxjyA60yXgGIZX2CdtCKvgJOGLlJP+iEEuaw8K2oizg4UEO61Jr8Vj5wivsPX5e6gvtKnjwfRgUGRTUq5HtjBOgIn1EK9xO1lXcp0wjVbo2i0SfVlBRNobU+8nUPWRmuac5phkUqx4HnvIavNPqFLtJDFjJEsojqO6HmhyyoU8ATl3WAcOSHQk4Hoh7IpF2Kk+4M8ukv+vc0rsQ3jvo/RoXqjAEf07KoLmVSBnXV9ChdQRE60bcpFbqxmD3WmRjIdAVXnXmuYeAzjj9cqgv1oX44wbFjFLzhI+8wLnSuBVOiDJeSM5tCCtwkYTr+QWZDLGQCzpBo+VSQTHcNxEeQGGAd3yQdiKHWiaZU6hJR/yb9LslyzPhm+ZmXfuNGdehlmvnjPjSzjoYHQGmNjqLJ4wOWv0V9LsHdRE6t/HEmK0+ZvJk8sw4JEyh9LUbbtYfrw1+JhkVfv0l4IoGJBL7yJbClYfH/vPQXym/85mtC+ZQseMBbKZVWE1MzUHCUWh/SNg70oveUBV6O9eKhCx5f5BMPJQYFBmPC52J0hgXwwGFI8MV+ui7MZktZFG4rYoyIQB+eKn3KCEU5lCQ4dhl6+aP8+yWmNOpBGDociucpUxrZIMO8anjVRoWU/uTZxpDou/7Vp95+oQiP0QgbHlL0w7CIKVCs3fCOUxsxPLtLuEyH8g5RMjDYMYqF2hy+x2F4GBB8IUaWG4ykyCBZl8KFwbGwb6EcOXDExsXMHhZxS+nCsEg+LI34MV8KUltlV188WilCUYu6OM/1HmAUrUQyIF+iTpqkZjjbINMynTgKYMbTJ/2Ldfl6hq+eTl/2pTRRDkkFKykMIVQX8IHxNCQChGvN7l73AABAAElEQVQ/SYw0IqCU5TWYQHEJ5De5KdyMiooHrprUMoHm/8/em0f7llR1npFvuPO9byTJZFRAEXBAHOh2akVLq5UqFVZNqL20RECge1X1qlr9R7VWIaNW29Wrl7W6lzaQibRlVWtBOaAWIqIiYKmAA6NAQiJDTi/fe3d8U/b38917nxO/3/vdm5lglj7yF/eeX8SJ2FPsiHPO3ieG400KiCcO9Un161r87/qqnagzzjNrc9hQYEnbIHMe7RqMLRuyi55lN78wwsvZKPmqHsQDPGREMPQy6gFqUZfUHTjQVmXK4B7jcCygiTag50NnBNL6GdNOJV/RFFE7CzSKXxDQV9Ox8DqKxLXcVVccC3AV+I
UH1wTXInJ4BytGNRIGOMsBfJdHfoXIj3oGDckFPwG4LOtx+KimXkpGaF/c25WDIecCXg6BnyeAdKE7UUGdoZlKG2GGruBf7WiCCdMzGGgkxx6eukPDMClU6aMTcCIZ+pjImjxBhqTZ8wJoGrfOp3nW+SB7ygaNkpk0oWiQHvCA7/WV+MD28OAcFIpewQzyVEbFnXyVdX9j5Or5wes53//s+0vmM4bnOV38K+6JkYeM3K84PtPF2z3NeXqugbkGPvc1cKBj8aKXv0yOxc2YCLpppwFfToVuOrwp982c54p05beX+TDXHcllPGjKOPebR/B4AOiYNqqML0oY6Rj28W0KvcnXFCe+AUEeuBg6HFhq0Ld40CSJwSwZgTMf0Su6yISccbOUESdnAsOfaUyXcCyMH04NIxNxaEpUjizUKAs0IGQecqww0mN0QziSsbbLxMkZHQttTSn+0PDULRmJfCMDp2ldW8uuaGrTsraZvSK5L8pYYr3GBU2xwJxh+sW61lgcP3aindzQMJVGLnBGWGMxOgjUv+qMgGNATkZdDl3HvPVy7nAuAh7VlUNWOhuxM0V9SaJXRz5zYf/gDnxnDz993vTDtIB6GpVXMfountO0gOnzOJ+mVecVG0Z6hS55YSQrjbGceaQoK5xZafdBY6BHZAzDnCzazTiix0OZIHJO4zT6YS1jlWsARyIcjXB23V8FzBRBPia3hxMt2qtyJPlC9Tofa5RDyjC929DaEXEYcMAr+SNjGPrOpsBlwFk+rlfSWWxcnZMXR5RP5BsnRw0wuAUrJnmQhlr0D5x+wtBGM/qPAYwmGqINPetTzgGOhb8pgSPFH/giaecOWF0rtFvwDL705XK4yUfX6GBCD2Y6/rgO46llwJmwY5IyQcs6ExycPGVRbcK0KD4cyIsApkWVowW8hQXeuDAAs9NH0nRm/0MlMwy603nJ6VLBBDUXFLjjaZyRWoCZTs9jAruDUbJ4ToP0PCgbZClA6v8ZhMIqmeFTMlR8INmq17Ruyb8Xmfo6lRzgkC55ivdBstBXIwxUgkbJVkS6GMi5Y9EpZJ6ca2CugWtSA/fiWLy0veo1N+d3szBMw+gOgzSmAQ0PTlWfG21vyADPjRpje3hTzo1VB7AYThhVTP/w14O5gevuWqMVfrMv45uPxrGLEny5+cLDCUXMCKl7ODTLsYBGPSR4LNRDAFTz5c2wHAo7FUyV0jnBdWR6kmRmNAHnhq9v4yxgsLA4uwxBHh7+fofg+c4Gi22rrtSXt6572kaWj+axFS1yk+cdojwSw5ayMhS1reyKHIUFGSnAMEqBY8HOMxgtrKfAsVhb1ZazrK/QlCk+uOfF7OgWA0wyQHtQhmujylrdTCuRbIJDPowu6olTgo6iSaKtSA+h0iLTh8IZeKHUDGNKpdBWfrUDIG6jCSaRl+gzI2iWKME7zmbTgkQvRdB3TspJuvqEs/Rj21Q/LtN5lZNRafgVHacTLuoXdQ3ugjNN4jCCg45oiQeGbkz760Ys1P/C0Y5riD7GzmZb6m9b2mCABZBcD6fVV45pk4JVXRPemUx90n0dxlJL6Sn4hZGOLEOgPjgDyhjexpM2TMk7yl8OUMDH5grIiWHP1ehDaRGDiI6Bk2Xp28slM/rEgIH+cRRSXnA9YqF6e2G2KHIPoN8CE04F946p6U6C4VoADo2Eg5CORdIeeE4linfhTDoWoz4h43uNrlk2UuB6Qg62oGWXNzsxyatTyUTddGLuxXNClKlrBF30cG5n8oLChN6LTvUFYCpNmXFcgS43ZYm+TFMeIFsxgG4nZ2B0hUmjy3Fyui59eU+jpCv4kqmHv7f0BA6ydjJVWdWhYmgOcpQelFfyFM/Cr/OrYhEpnFr/1vOYhofn3LGY1sr8fK6BuQauNQ3cu2Nx802+GXNDZCSg1lRwy/TNsn+w6CbsP8WYWWW8ei2B3rrb2AaPP8FgXOFU8C0JDHvfqHV3hQdrEdgRh9EKf6BOhhQ3XmAwuqDPTZv3mITgjGMQ07TCsB9u635S9PjwHJ0KtsXtHYvYDWqBL2/jAMiYw8nQU7RtaerULtvYyllg/SXf7/DOVZK1dyw8RULwjD5gIHLAwXpU3RbkHOBMrOjtM9/fiN1/Dttgq7Um1NsLtlfW7Vzw/YrYCYo59+Knc+qJ88Hhed+dPqwYaQZZWOTtReU4FelQVAwcBhshHnxoipP8ydPwMAPGcMoHpIqBHh62KqAtCMAEXZ9O/Azwyu3TBTSN159XmhjcWfjOL2LuJXFS8MZRR65zOxjQ4o+TDMP0J58nLxWHs2hFub8HvyinTP9uU2KCR0V0Auky2j1qof7BtRBTA3VNqN/gkJ7TdskcF7S4cEFtd1KO6PF0LJaYKqh1PEyvq7akGX1lhEipk2JOW4VsyBn8Vaa0IZyHbHEt9iMrwFAfXzf0ZaWx2/WuXs69rkedD1t+RlWHX9qnjsqstqtzx6LhkYVyUlSTWGfBegamQQad4drGQZMT7rUNSlMn0x34SUCl6YC1FS8OQx+m5Rj6Ac6TnRxi0Tb9cKisL/GCtkctdC0yVZHANrgXtNc/X+eG1r4hy0rvM+GQXWGoVwJl05p/thwNO4QJeHSCAhwGaOOSW2h2hHQ+Sx+Bu88vuu7pdHUurmCWLqbp91QNk3XucXv8okPeBC3hDWXUudNd8SCvYIgrXddOwVV+nROTVzSn8/c7n7gekIf/lKvHIW2eKps7FtOamZ/PNTDXwLWmgQMdix97mUYs5FjUTZUbMG9HHfJG6bulMuKRlQ8ulfFg4K0h8INjIcOIABQ0Jx0L5aUhxxZyGOnsnIRj4RELGdEQ1SPejkUYaBCLR2OIowc9/ITLl8L9wDBHeAIaPMo4Iq6do2puNM4Q+F4sK57LGP4rbO+qj+6o7LyMhk0dO7sXPFpyVJbcooweFpnXaAUjG3qdacdL22NrtykZZeKPPBjbngNuGRkVgZfwbSTGtKtFTY9alDODU3H06JKcgtzPP59ULN5lxAK4BY1m4KTgZOBAjEaEGOaDbHAscqtc64U2QmmAJVydR6Z/B3rx4BsfroYtqyTIBELlmW7woHikzVm0fyCMv0bN9qzcEa9yRlrTZZZxBHPKbU4q6YZ44u8s/fLvI2QqZwCI6o+BjlHZ4WUdQi9onb8I5ilY+mrxLx4DPyViyhPGMSN32RedZrSCUa7L7aycirt3L3rkAhNxXaNn6zJiV+Xori8cbqv0G49WRd/neguHnvNJuVI6y2RjWUa2Y1cQPcS1FVPDKMOZCBhfedRJ5xdSRvRxVJ3nKA6GKo9zEQqNPqVft/vQTvQ5QVhP2fesD/34tNpBRrwbRfl2lnV9OYaHrx/VUX0dGJyK2JFpcqQjCIqfr0XFA+0RjmoPsiG55Q+H76oRCzsWtCcODAd1hT6yyLFXm3iHNvFj+1lvQ6tRx8Hbgr8x9KNAun6hQ4jfUY6ow3g+yFqAYKTMJtD/ZD44gMdp9F/AyK/6TqKN1zj59wbT06n6Rd06qvvJKJCqE3wq3WEO/KvsIHnM9wBes+iSZw0NOu2hJtOze
E9CzDgLoVQgBjN4TNdrvnh7hg7nWXMNzDVwTWngXhyLl7RXesQiHgCehqAHuwMPLFsDccojy/dQ8vhXHIa0DHXeqMsQYMoOgRs0f16grAe2jXu9kS1DDqeArTaZGoTBHo4F6wn0MBS+pyIJD8OHt72i6IdDrc2AH47BOLoiY0E8bSjBTwYbTgXGG2kfOsf4YGoTDgLrKhitYERhZUUjFpJDlWrndnfb+R3tBbyzZ2OKt8jMd0dW8Nj5Cmn4Z70EOzoxjQNZWOOA0XePh3rELPVEfRdFA17r2k52XR/EW9eXttlilvfB1PcSb2b9teEr4XRJtiUt9l72egs5F3IyMGwwvIZ2cRvQDlrsK9mu+uYFjaEAvP4Dj0SGLomKLS9FE/k+H3EMB5CC6VbcIdUDOuLgDTws/BuJkMd5kz9Vv4qrtOhOn/f52UvNyzYIfVH8gCGut/U+hxCyUKa/cDpIBTz5OA+qQcgKgSrL2M6FM/lJPMfRJz2tqHMswunVBxdlMJ/VyNidcmDvlGOxeRH3NL7kvqy+sqaRilNyuo/LMV1WX8dJp4HoZ4xgsE0y/YqWqfZVV3T/4yoKx1zXkIxkAujUz9ei+hv1d6tiRPsaC2Of+rC9ovuk4iNqV5xrnItDlkGE9B/NHXpxO2X7Wx4Ycm5Q/fTBggR/ALh2MNzZejbqERRY/0R9eSFwWQZ8P4UKckPfEB+Mf5/DL+vT94mCjT4g3UiGaccCB0xMsg+UfGYk2nqBwvXPyIViT4mSc8EH9KCjH0SaCAN/K9k/oe+EqvIes+ScILTfCType19OvfIcWsWjQFzW4Q3nBZAxeNOyDHQFU2mDQ++AMC1HT7eXr/L7PMjW+TSdA1i6qPA4GbQ0oayrKfQ4V5cekJM6nQUxXa+5YzFLS/O8uQbmGriWNHCgY/Gil73EIxY8KbgB1sHTipvxcK4a88hiNyOXJWw5FjgUOBacV+AmjSGDYcU0Ifae5hxedixkOOAcYLD7kGFfoyWDAQReGkEeacCgwqkoQ1/4YchDO94Mw++ijDSmQV1k2sKEY6HF1aJhx0I02LmJr1+v4lhIDvwBnIrz2zgWuzaqGGnAAUFG6iiGHp24wkNKaaYoMbrA1CX0dY/qidLZCQtjBUMNvCVNb+EL29efOtVOHDupfa83/F2NPX3Zd0s7zly8pOkVkldIMcKhhd5sObu6uq5RlRWPXngxd+k5HYyYvhb6dztgZHHQVuhbodqRLJ04L/KjjDZxyKIOJLK7jHr4QjPM2pF+UrExUHCVR7/pQz3sLWdfkOlZ+T3NWWnnJR+qlNVPI5s+gnrLaEwjLPskZ/Q7aNSBKLRfyToQTBkNB59BgclTfOBvp0L8iNmhhb5sh9cjFRfap3cutFu3duxYbKnfcP3grC6oX23Iabhx6Wg7LediRek94V+UDUv/XaI/6bCjq/pSZZzyBY0ELqicLarJQy7rJNM4+r4emAKkPD5q5VcB1kHtrBR6kjh2MGgHRitwLjwdypipE5VR3rfVkIbmrGBeKnO5Yvox9w9dX3X/wNHhA5LQQn47Fro2qi6QHfjEiWlEnuSXrmJq06SBXPjEPnyNxjTNmhZVbVkwUQXVEX3nlCgcDUZSWG9R37dwi7tOFm6QFS30stIuBNOPpH97GMoOCqYh3RAKzxjgFS7llRbcUB5I/DqYV8JBaxbvmflTsFWvotvHRbNkLXr75Rduldd5HxetPo904Tju1dgJOAu38Kbp7XdedZhZXnqXjsw22wIez/nHz5mJ8kBkzneFeiC0Oqc518BcAwc6FpNToeIh5Ztu3hBJx01YD5y4QwpI/xgZhpFxqbicihEexesBL0MGo8pvSXVT5X7LrRbjiDevZeA7xkkQXSB42+ovVYOrA7rg1PSpcETiOxGUYUTU17djdCRGKS5oUSxOTU2Jwu2xYyFngClK/jCfRgLYCpY1FtDaZLRie9uOBTs8MV2LL3/j0GCw41zhVJA+LJlZ+I1TwWiCP6Snt6sX9CaamzqBei7JcVnfONZOnjjZTm2c0IJujUCIHvC7cizYEveCtrL0g0A4LMLmGxY4FGtyLFb0de7aJQo+6Ps6jY5UO9QuVZFfbWb2/ol2Cd2TJjimHvrjP/Iijt/uqSycBBkAwC/HInBHiP0e0lBM9sHX5yPeQHyfBDyhPU2/zh2LXElOf+OM/qT/jDEdg8YEnoANY/qy4I3paPiZlhR8HI9gE3HNu4Y3TkytqSBdo2hMgfqEHIqPbu60D5xTX9M5hjzrjnaRUxxX5SQ8Zvlou1GOxbLyz6kv7wqIETfOmS4HbSTlGsTwX1N/3FC/XtfBOZfToJOUB97IgWuBc8KHn/gMpKzxOHjjr9bhuMJkQNHwNS4ezqViwOsMfVgnlOlwqJgTw0a2f+ucWIdPoc+oha6HmOonMSQD0wPtaGDE6xrBkK/RF2iZGwSSH85JXQ92HqVv4pQq9JCioBMfqjPXXL0E8OiD6xZ6s+6AhZ/4xMhKrH2CL6MobEMba0DCWUsWY9TrY8wdUsVj0J9K4FeB8r7M+VnvIT95FK3CdXnKbzpZMNCvuhXC/YzN737iAI5cM+ulslk0K6/qWzG0ZtEp+IqB60OPTz5ws+j0OKR7vKJdcV8mYkYtPY99sLXn/uDcsZjW6/x8roG5Bq4tDRzoWLz4FS/ziAUOQJgSMhBkjfhPN3/fLOuhRZx3SGAwKinnwCDA+Kg3jqEiGT0YdHq416gDN1rgvcsRhrmM7nBKlActLCGYCM9f63Ys2QQb6w40yiBDH8cCRyAMaT0YgMNA0JQTjA87FzKg7FgwciEDABlkeuRoSTgl0GFLWEYtSEsEORWMVmy3LY1YYPAwOsKWsciMfBhyTH/ydCyNsrBGgx2jqD87Q+3t8d0MTZGQTN7xSbjrx461E8dPaKRCToXWTOAgXSc5GVG5CI5jRjnibTp6XNQe+qsasVjT1Cm+f+GP5aUDEzqTztG7jcPQBXjRZtEC/EqrzgvN6lf/1U6GBYaKE4YojClnZV6kxxNweseiyutBy/l0GB60IoNchIH3NPC9nPd8Kk1sHilm5dMPpe7BgLAzkMZEwRhXeTGqNta/xBhrXjlJTxzh6UP4o2OhfJ3HAunoz/TLXbX1ltZVfPjcVvvg+a32PjkWOAj0fxyGTcm5p/Ml2fSPX15oj5BjgZNwp3C21L+ZBrVMXxT8jvoLBzIzynFSDsX1gr9BeCu56Dsk4zeuxx2uCx18LZ4pV4usd0JoyYahzLUn4j5ihDL6FE62aYlXGU7oxEfCoxm3J+cEwZZ+IyN/kwbXCAQYAXBfxnnXOThce54mpfraqZDeGFVwCLQJkh6lo//r4EVDORfIDLv4CdogDlOhRLdGlsqxsMwpey+/eXC98X0LHCEJ669y62UCH9GDzVWhdHFVQWQUr/466OkUf2s0lDNS0nnhGSdlRonT8EWn9FBEyC8alXdf44HmfUWYAdfz3o9e5RdsxZCbKHNDKy/5VNkMtnX3gYJpgDPobBYC5dmWPd0+XeXouGSY1vdzf/C5+1D/q8/2i6qUeZCt
Y0Me8nOtcPxVfcfiX/2Xn2u/dMsfdJzmybkGPvc08G0P/dL2wsd+2zVZseuvv17PMU3//wzDgY7FS37i5d5u9ooMDW6API58A9INB+OTB+dwgyStOy83X2D6w28cZdj4RlWCcnPlpiUDxg95ESqc3rFQZhAVbATf3o0LPrnQZ+tN3vKjDJwK5mDbkFa5+aRTQRonwuss5FTseStPOR2S44gMGOamM/pwRDTYFYovYq/KsYAu8m1u6Q2yRiy2tNYiHAs+WKYyDBaJZltIDhFrQ5aFx2gHIy6EHTkU8UG+C9blotZQrK9vtNOnT7djbCerkQe7JtzIJZudIW05e0kye2QHuXUgB1vOruNYaMeoxUU5I3IqWN+BTDgsNd+bRd3UxzotXTqmNVFttpUljDYoBwQ+s4Kz00Kehqlz6A79Q/JXPvRoA0Kfxzm5xbEv69OG2w+/yy8ePTx9rXoR+Q7KkEolE2f5ENWJ/4iTJvE4FQoHL9Ctwxl6Ap6/oBp185k6CFcNaYkj3kGXa4CvtLO24u7dvfa+s1vt/ec3219o5EL7kclZOKQpT4fbWY1KbIv2ghg/YWWxPVx9lClIt8mx2FQfXxTMomBFup3T+R2ityOngN2bHibYRy0ttMeuLbUT9E85F9epHakL8lC/PV3rHMiyJEdkSX1qARgcCw7BuD2i4kpjrPPiAMeCALFQqEGyf1XL+sWE9IVTYh2JT9GsdubcFxIx9ATL9eUF3IotcObhRHtak5wKT3ECRSEWkkfa54LXxWE6nAfvqI8dGPEyX6MoLbnCoGLEAhnzSHmNb/mCltGQSXpgmhZrLRgpxNnBqYj1FiFcimgUfqrepEcZOMsgPgVDeaUptaNLYiqfLAEa1vx6WTNfSMGvBOphjE4L3rcwLXcvY8+f/B62T/ecevxpnILbL3/fcuon/gT4Fu+eF7JW3+Eeho4I/EYq2msCJ/VmwBk/xYeikaJOkKHgk8bznv28ynnA478ux+KFv/t/t9d+8C0PeP3mDOYa+OvUwDMf9tXtR5/wjL9OET5j3jfeeKNt38+UwIGOxUt/4hX+8nbMSe5uxDyg9QBlapJv0/oZbpA8tCSNDQGlYxckHujxkLOgAvaNXVjxZjBpAy/aGOJMSYIGQY/4eAiMTJzPDzd4pv/gSOBY4BB4pEL8PMqRkBgPPIQ9YiHjIBwLto2NtQ6QxrGIaVdyUHAqZBzUiAWOCzCMWDBasa2Rh4LFicCp8LQW8WAKlBd+a0rTikYkPE1Kcvor2jIyiDH+2fVpY5XvU6yZl0SWQRPrKLB20SMPQkZAxm9+XLaOlrRb1KpGKhZ4O4qeABacR0rksECbxd2LGgHBwIm2ijZAZ7SLIuuPtgSflsMZKMcC9qbrRJSPeRjIUV4P2YojG16GNlAl+4esCQxtKvqJUHFhm16W9fjTcH1Z4VYebEjz54ZMANeCLPcNlUrv4UBEnyvxjFswNjKDAPqaDiPPwMbIr+Ay0SGIVfDNvsk0pHOa9vaprd324c3t9jE5Fbdr2py+xa4RihjJu1POwpbgF9VkT2DEQn2PNQ53yUHeVhl9nmvngljcqY9ffEr07lafYzvlh8iReLRwvnhtuT2MNT3q4xqQsE5KZhyKi6LPAm1GQrzjExAykGvLVvRu3VN196O8VlMXjLahY4qtH6oLjsHjXtA7FhjwA03BSCnWCzQIQVb9UveFuCdgnIqHCmLDAqGIBoaSGSfvSgeRIBR86O9UXPcD4cC/eEY7Z9snTd+jyrGg7ZAPTpVWXDKSQMa4J8ULB3TPV7mRz04Q+gTHgoVclayYsqx+wAaDEScBq90KbyJOHNNBVgW3WwK5FiOTzM165VnBF586L+CetuuUPCkfYJVn/JShcCsu2nU+HUNnoDVVOAu38nqcg/J6kgOcMqPHZlv3QJR1MoFTeD1Y8Z9VNp1X5z88dyx6Fc7Tcw1csxqYOxZ7WgO8OtF+emlzzz0v/fFXtJte+5owttL44QYYbynDCPVzxD/jg7BuumWk1jQh32j1JOJW7WeMH9Zh8POA4mGKUVtv2HlAe0RDcMR1q/cNXzyhDwwGfhzsHoPDE4ZyORbFF4PRb/71gI+F24wKpNMi7nYU9IYW/uxExfoIHAScC4w18HEsdrTdLCMdfBcCWJwZ6oRjwYgCTg5Tk/j4Hd+qYHoW07qogadwqaLse89Iw+ICoxTSiZwNPqzFXHZJb34x8kKd5NRIQR5pUTn1P6KdnhZyG1rVIPQjQ3CRxeZaBH5s/URb0fqLJfFgFCPaBENP+rExKCrorw7hDk6gBIIHwbpzwqeZb3PCGUIPmIoDLOBMp0OWnPwPQeXUq4JOB3kq76B4kC2B6uHc45BXLJzuGDo/FgfZMCiHwlOjsm8CU3QDnzfZ9N+g6r7WM+zhnR/8exkiGxo4F9H/6ZfnNepwm5zWj5zf9qLtO+S80uZLUgx9nfUVd2rEYkeIa2qvJ6wstEdrahM7RG3KidgVLAGXFOfjNq3mvlV99XY5F4yEHBPOoxaPtCfJsXicDqZFsfuaLnBhISF9OBwe+rKvX41W4FJ7tEL0MYzJd8i+ow4mAaOPkW/Hglh/9OUI0aO4Xt1ukqXaw4Z9QRVt8UcicyJBGsci+y4fjzR95dEG0GKhNHUYcYSYuEEAIsKCBnLotByLGJEQT9XPI1u0C2kcKveFqLtlNjH9IGMeIfaoF5wgrwPR/YEArUtaCxKjPtK38IZQdR4yot4FAQ+Cr9UOhqRLelpT5QNM5lvC5Fd0B5SOTpXVNVbnwFYe6Z4/MH2Z0z2vjj64fejp9/mki2bFffk03vQ5OLPyoDGdP50XPTb6Yc+TNHRLHujMojWNU+ezYCvvweBYfOjcJ3WfO1vqmMdzDXxOauD0wnp71Mrpa7JuzNSp+9tnUoGDRyxe8fL26te+xg9BjC49a30D5VnBTXe4uebzVBmDDJTxlhxjNeb8h0Hhm7AeRzxjfDPNmzIPKAwWDHg7FmlEYIhjdPGwL55h8AR/jAros6C5RipECAHj7a1oloIw4vgI2YUcNWCRKnIQoAkNOxVyFBiFsGOB8cU0I/0xB54pUBdkqF2SIcf6itpKF/lGxwJ8zWOXY7Eqx8K7RomeF3dSLzkKR5WPMcYc+62tzbardRss9MQQ8yJ0OSYrK1qUzRoNTWciWBeSwWtFFGOskPbiUun0sKaurGqb2hMnTrVTJx6i3azW5SAtqv6imnpAX+FchI6ga71LLhwQysdWjAcoMKVD0hFCcdXOzoOk+BAqdto57kaZygjQagBrWOfQ4C/pAFkPXdKVXzF5fehhya9zpCVtM0Enlj6qYHSX6Zx2DAcjDMsBf+inoiC9l9jW1wxZk1PQ7uUoRPIwXnVOH6Av3r69pwXb2+09Zzfbp7SdMSMY6xpSYLQCg/829bm76MPCPSVn4gkrR9tjNfJwg6Y3McKwR38QvT3BnhPCX+5dbh/VtKpPyrk4qz6/pLLrNQTxRZpC9cUbK+2RfGhPfSamfoSxi3j61+HG0a9SyqTO6nDWISUc/LgdtFGALj5OyDV8pAD
hLwKx4dXHfI1CtzuAmm7XnHGnEsmh/hmOiZx0GfyVx3VPg/CxvKFhqIfyHBQPdCXE4FjoumAkxg6E2h14OxLUU0c4FhHfI0cm2j1pBuUkP+aRglfxYDcrO0TSwkXtEuWF3GrrQU4odP3HBKd+hnoA2pf1eFXXvjzTE/jCGaUNgAmayurhi1yfZ10mkmmp780M6IEC5Ez5oMNR7UHc0y4603kFX+UVT8NV/qy450t5j1v0+7xZNCqv4Hs6PX1r5IA2KToVF98Hg2NRdZ7Hcw3MNfC5qYEDHYsXv1yLt3/2Zt2BVXkd4+NDDwZlcXOtN5AYUgR+eY7wYLURi+GPMYsxkQ8RvxHMB4wsq8ARRWDYapYpTTYWBM/6gnBq4k0pMCwUNQ/hIEO8aQ/8kIk0+TqQR39BI0YUvHZBhgNGWMgbsPH9i5gC5RELTRXxInDxxIBnsfe2DLWLijEyGJlATmSCDu99ZZJ45AOnZEVTodb40J2mQy3pYBcnvk3BaAWG4u4FfRNje7OdP39ejsWO32hSFxyRVTkVx7T+gh2foIXMe3JoduXY7GqtxkUWhDJ6IXimXjFqsryy1jY2jsuxOK3pVRserTjCh/MkH8G6CcVZJyjR+lQdGK0JvdKyYwCHUHGVRHboP+iiAQW3/RSNKIlC/VZfqewRGnopJ4Qy8NA1TjRW1KMKM+7lq4d0gRQ+6C5TQiSHMJQ7X5yyzzmfNIYmMnDwp1hRyhoOM/wjG8LA+Nexq0IGJRElTDgVrGnY3LvYPqzF2h/Q8V4t3L5bU6D4LsQpte2GDpyJj+5damfot+J1WtfIl6wutMevLspBkJMqWPozB9OgzunLjJ/QCMgHt3fbxzXycVZG94LKjqsrPEzOxZPWV9pjV5faQ7RlbazrQeCSkVqif/1mnsvS+I6WCVjrnR3IJKO8hWg1wfUBeK5B40UD27EIOrBNnYJU/ArehOClvqp7g50L9fnBsVCa0QHCMKrS1aPoDf1D/En7nOtCsL43MDJBOxOn0+4pTG77eLHh6Ve9rOYaP6GNSLu+1FP0Y9F57ArH6MhljVowclFyGQPYCiU755lPqfsceSqvulQMb8oHKuAVnYxBvSoU3ymYogt88a2YPJcbN/p58erxgNsvWNbiLaCeNjjT5+QV7dJFr++CLxjgp0PBTOffl/PCPYg+dArONLMNJvJUMItGDzN3LKy9+c9cA3MNXMMaONCxeNHL48vbUb80DHh88RBTxE1ywrHIhwU3/yqz4aoHbKyxYDpRGGmDscaL0oRnuhBv62PEInC8a474cfOFFgdGsHnot27UnBPMV3J4Ryk5F+Tz3GTUwwe06g2bCpE/jOsYKfGIBdObfGBwh9PAmgwWXvPFbUYuIAoPHAHLoNizapg6IiOLr2LjWKzLQVjTgYPA1rAs2GZqE9Opzm+eb3efO9M2Nze94wbOC/zZhWpNH77bWFvTWgm+oaE30nIkdvZ22rYWjuNYIAM64cN90FwWfUYrNtaPa0H4cTkVWnCu0QrmetsgsyZQkJVkmb3QW7q0U8FoBe2ZbSgohzqvmExA4nyENxr5BqifePwbNi0B0moBACaC8YQ9QVsQwNJ+7iXEAghY8sYAXdMO4LFAKfQUf5GmsGzfeKgHj4ALGIxNnyuuN9emMdDHkINntH/wDrbAITSgxim9ID/5AWaDlhG0bTmqjFb8+dnz7b06PqA1FkxdYgoUi7NPqY0x1T+o71rcIfhLqutpOZJftrbYnri63D5fDsIhEbXMgsNp3VR/v12OyIc18vFJOSmsv7hOb92Piv+KqD1OOI8R/iM1euEP26EQ4Xl0AjGRUbCjrpWD7OQRU67fqL90gGMhXbiglGsYaLrZrC8TjEZ2KXojlK7hQZjVyl5jgXMhR8Y7VAErfK7fanu+9VLXNzQHeuYZlMEBnnsSnIBzG3NN8RFKxTgV5HHP8OgF9y1aQbCmC6kuhNSR4RolD3TCeqrDNeqo6/iSXij0siGD9QI6MhPxAw3iDIVTOquYfOB7WFBmyZmkIhJ9h8Qn7ZwZfAdaVQbulC5KniAavwNeZt4bzDR80So85APGcJKhhy+Ywqm4h6m8+xMX/n70oVUwA92STbJWmInf6144z3sQ7ApV+pjHcw3MNfC5qYGDHQt9IM9f3lbdecRxU+fmyE2U2M5CPtStHvJSTy7XeTkDtXgb3DDUFHNT9Y1VeMCytaUMYU8xwnlgCggjGgnH6ACjHx4lEDyh5MGYwABAUkYrbCyzMlXn4HtvfhkMGAqEooMDwTcrwpFQWvwpg3eNhCAvH9RjtGJXRhpvHqlp1BEW4oOBKRySGBKMMizrTTLOwcb6uuJj+pr2mvgcleF3pZ3bPNfOnTvbzp69u217tEIGjWiynoO1GWwlu6yF18x1wzFghGJHU6V2djVlihETsUVmYFbkhOBUMEoRi7Y1KoIxw+40ir3GQrJZFxYXXeNQpAOltJ0K8Xco3SYsedS1gqtrg9olY5lAOjAVxkN1Ejf0RptMB2QgoNshKNmDdiVu16Jd8YDXJaKPqX8or/i6v6QILqePZXnAM/MnnGAv3q101cn0qUsYtZYZ4UTEZKFnmoqVP9RpyA9HAMfinPrUrZoC9a4z59ufaRrURzQqxQjVhvTxGC22vlF9AsW+Xw7Hp+Tg7onNcTkWT9Y6CdZLMPLAeqJBUUpe0PVwVh+C/JQc4TPqL2xRS99nbRF9mI/rPVKjHZ+vgzUU1+FsCyarMNSBapIXFcs6Ca4CercO1Ee5BtwBKC8YaAJjBP1GwnB9m1W7AFYgCDGkyaefMh1KsUcWzEP0ybMccqpyNLGnZ5pFCD1xQM843B8QV/rBKZHew7HIKVLS2bDOQgIVXeJKQ2fQiPKrXo5VFqMWXMeHde/Qui69oAiaMOY/sEtECxcCDslKwPMquCzsZSj4fePUQZUPdakMxeZD3ahr5lcep8ZJPZQurQvlVXnRdYZ+Sjd1XnD9+X7pwrU8yaOHrfI+j3TB9+Xk1TnS7qfTotXDO68UAv3UzjQM5xWK1ySuyqfaYe5YlMbm8VwDcw1cqxo40LHgA3mvfM1N3JmzfmUgxAOCm6XfhpdxlXdnoClzuYz0eqMIEW625VD4bWDelgsWh4DFzh6VEG4Yd3HzrnUUHilQWQXT9BtGGQXKxECtt/DQxZW4pAc6b/m9M4tkwHlgdABDfpF1FOaZRrjg/VBQfagxab4nseNvUMixkLFhTVBf0Q+e1FO5MrA8YiGHgO1mj29stOPHjys+rhEETWnSVIudXX0LQ6MVm1pbsbW15W9boBP0VNOmmA5lmeQcQA9WNgzl4FAHAqMRjGisaArUCjtL6WvcODW0CQvmWRsS2+9i1MQH/BihwTHzdrQ4FkoPTkXW18T7tDKqPSsGRlXnlx+Xk6S8gjUzAWPIxCuoiE3XHQeKI41JqPGMNul5UTJ9Tp7bkdhHGoM6IV//ke90nvvNdJWTN46wuS8Kw7JCXHLWiFXPG7pQNg+SCs5yQvkui0wWZ2/KafyUdoDCqWAa1C2avnRB7byu/vQ4jSgw1YkvZn
9Mju2n+V6FnAC2m/0iORVfoBGLz0vHAtVHb4wpUXu6JjblXGzLGWEq1UWd76rvbIvfMTnTpzQacr0cjEOqo6c8SfDSfOgtFYTcBPSkq6nqQjtVvelXdq4tRFwzVrDQoBlwFhBKoZBiFjnxC34o0DDGzXI7EOrbOBcURhtKdvEOB0/OmupHO+km01OdTJto9DLwCNSM6wrDn+21azpUfBxP5647+gi6fdtWNVxSsotm1dnySebDcgbRBk4RI5DhHAGHAEjQhaRT+h1Kev0MmZEofLed8Cfk6mCvoqmywgWs8DqUmcnQf2DOpJl1AG5WuXmpPpZ3JofZmfvBT/PgvJdxNrWu7ilvwfX0ruJZCktl+bSrZ/GGVo9rmoVLofBL32TPHQuUMg9zDcw1cC1r4GDHQlOhXn3zTcONkRtfb/TZIPWDvh7s3V0SWN3YhxELHogK3GTLsXDcPdKAxdBlm1cbvDr3DjzGxJDmLXtOhSp6pslbRdHFoBA9SWNYjzwIzo4FxpWMBi/YFhy84LOixdE4Fws6eINfDwSmJfGop77EKGpXjoW/mo0hRr6qBAQx6xPK4WG9A2sq1lZX9OG74+2YvqrNNCiMlF1NZzovh2JHoxSsl9hjzYZoY1247nIo+FJ37QiF88N0p5JNDC0PBhFvQJc0YsHBdChoICvtRN0XNP2CxduLOBxK43R4lEL1PnpE58L3zlmpS7duNFNonLQq53YUTI3iRGH0BtKUc1RaKuHfP5mbGQVr0OEnYEUDkgpFK86u/qUPBbugXhDTeNaE9RU6qwc8sZrQenIe54KjPxqSc6XsSNixiGkx9DHKq76O6W1d/UuWCV7G4gd8c+DHp4xesUj7jPrBR/Sl7Y/qOyl/qWlRfExxSXQfqRGLh6mP8nE8tpTl4EN47LxEPtvG3qDjCE5t1MgcqAvy40ywsBvRLyovnItL2q5WH9zTAvA1rl8rIwWyhJBIBdW5BUZsrqYK1e7Eajf1OZwLn6QOqWToSTjRaIGMKiabbywXjoMiQAps0rEIWTxKKZ7w4H5UIwHOTzJBbOoXecmyTMHhsqZBeR2ErscatcCZZwcq6mFyxClfnU9RHsqh7T9iHYd0XXPNksax8AiJ71lQ6ISt+idh4IdAeqp8KFMCKtWnyQezo0yWwwTNzCu4jluWzI7gY5yUp6dZOpqNOeYWzjT8LFnMbwavkRrNebX007R7eNLFa1qvPa0JGoP+xUv/vQ6gBx5H4VRcZSNDgHUMAsixePbn/gfy0MM8zDUw18DnrgYOdCxYvP3q19wsGyPe2qKGugdyP/QNVMYEb8gx1JXhvFIX5eRjwPpGC74f1PFwzsdSgMuA5Y0/sP4ehd7SgxtOiJl5Wg9rMPjKNjdkm/71cEvayIWB4WlMipEJmXEoqCxGPMYCMLzNZ9oRX8dmFyemDYGCAWkHRLShR2CNBYuncSy8TkSAV1RofSiNTDbU9WaSD+vhVDAF6pR2aFrVVCVGCDa3znux9qYWbDOd6RKH5IEfRjtGPzRwMBxLHuTCSbGzpXPrUzxiSgjb4sqoVP5h4TOq4jesktuOhRwKOx1aNM7XgPloF2svcDiQx+0n2fmripIXoVoHXWY7qo7AUud6WA40TAd1Bz5RpU08yZJPOxOgYXylDVudq84ByuAiwTtAuwQugMqDtMBKPsBIgxmOZyCELZ39kPKEKT4FDw7GahmsYNMGlps6p1GLPL1MQS/4Op2yk65Q+WxQwBevcS7u0JqIOzV9iTUWOA8nNLJwWk7vstqY6U187I6vadOH13SNrMuJXVe7MusvPnYX12qNtJif6F/R9WVnQ5qgv9EEHIeRJx0LYJUVwfkhK20WdCgKXQUQ9VdhHtZJ6sbwATToBW0MOjJacDNfaGQYcMXepJ0fvDwdStcHATjaBZrVDlFv6UD6KToDT3D0VyE4Igh8NO1Suh1HLWJkMNqdO43Bop+Ib9GGFmnrIU4ChrSCdZJ1Y/tZdoli9BCngg/ncVwVkv5AswMgb6yBCgQrJsEfPGfpl3xClkcy83RiudRWQyh4ZK20Cques2QZqCV8wQ40u4TlLrjMR//TdIsGfZXy6cD1WDg9zcqbhue8aE6X7YdT8H155U3T62UApnAqLp69rlyv7BOla6tGBfPF26WxeTzXwFwD16oGDnQsXlKOhe563DTjsaUbddY2HgwysmShxHC/cnjI6a+M0XIsQLGTIKMGw9w31Lq5qsxYOscgLsOaNReUQANjlLf37MTElChosP7ikgwLeHEjJxa4adUj03ILzh/+krHGFBM/nETTjoWmK+FYMAUJ2tCxAeY7vcn5/ILeMO7K8Ct86ollFm9pkVEHoyn6vsTaynI7oVGKUydPysFYM8yuFmuf2zqndRrbck72ZPhImzrqBbDrKIOpppZBG+diUU4A06KYVkW61kyUYwEM9fHXffkIl/SBbllfwXcylrRWYyFHNBi5WBIdRiuGEZDUV7VqtTHt5baU4UybWD7qPBFC7+iMoijNdMLGAzby3DbKL8eiSBVeyUC+MIqgzwo2+k2cGSYL4DP0S/fVKHBenrsHZ7sSVRvYURQ45cjiMsW9U0G6jAt0MdaLukk/4j/IIyJBQxSVqCMkoqwkjTQ7OTElakfTlnbUz7bkcF6QkSsBPbKwpDY+Kp70SybBeTczpY+IJ/lH0amVqGsLd1tlyAt+ORiMPFE/guuppDUmWAFNyMS59WDgNPCMyk/VR0nqnAfpiXNwhzBQG3OA3y8gkwP0OyDz4hrhOrXYEhWNhO65D1k+1Z2Rh17PA5UiPWQYXfQY7Qg8RhPYFrafBoVCTD111dMO1aSeoKsM54EwBPq9+omuS5x6iF2+zEJufdtiqG8CJ48BNRPR56ZzdY5eMhu+ptfRBK94VAyO75fAgyv4afrAGj7pD+UdPVAdOn7QIsSvk6ZdvCuOEsQv6SOn6jCZO8oJ1DROT2ua/vS5kAd9FV4fXwXfFc4qKzmRe1qugq98j6RBr68z+srzuWPRKXuenGtgroFrUgMHOxavqBEL3+r9oCBFAuOE4Fs0N2qMLT3YfeicEQMMUgxfHqgE7/Ckhzc3Ww7upYFPzL01Hr6sDwBnmFokGhj9CzgVOqDtj8V5mkc4FsDikAw3eRntdhAEw9tdFsnihAwjEaLPdKNVGd04Fct6K8xahpLVAucPeHsaqdjR1CVGGjBAQj7xE187F6oiBg/OCk7F6eMn2sljJ1z/PW0vySjF9s6WtqzV3GoZfNcxxUh/wrbOoGfe0mGZKNBl9IY1JzgWS3IMGG2IdRSsmdA2pJq+cUFOy64XdSdt6WeBUQ6+vK01GAtyMNjqdkVOBs5FOCdhHA/6sjFAq6pdaU+M5ZTJoz9KI2+EaDsJEG0GPH9Z7HjIi8zoF2Mb9/olXZQrf+A1FJBwxwsQnQ4w4IufeyT9KiDLvok6OS/6XZHBseDtvY3vwDZtVEH/dP9R2/PWuta1wKcOS+Dz1AOcYa5QshBXfyeuUOkqt3PhPsr0JabthVPAnmQYgGkzD1LSTtQfduVEw9VOkviMb+7Tw
C7WNd8Mp3ZKnpIpqxCiAtPBkXZ9LAm6iLpbEMlJJygHf1DGWOlIoTNSxQjZQIVVQAy/6NoBGcDTjlA1nQgdlKEGmGURMHV3eyGrz4OqKU0xCAhww/j2qIWuUY8m4Jxwv0rvv2Bc/04nA0nlpbQhsn8nfxitOCLH/pBGoqB9GV4avSjdB6Kk6uj3FAZ99Jmk0Y2iaJss7GgMslOU+aaF4qbzDBK1GuSAfvIwuJG6n44X9AedFAh8EmagWWUZW0bSid/r0tlZZrlJ348wwbOrxywSE7AdQJ9f6ZLFslLHLhRMlzXUjTzXV3XtaTzv2c+bAH8gT+jrA+8p2Xv57HCrr+7pOcNx6tSpB1KsOe25BuYauMY1cC+OhT6Qp8XbvkHWQ0E3IM45rRtnOBQyb7hh6/DbexmiOAg29vOmhWPhOcu6SRG4DfM4lG3i2G99ZUGVIYuzgPHPSAU7RR2V4e+veIsexlcYYJeVx25SsTajbtaskWDak2H0YTFujvCWqWDeODysXVjBqdCIAFOO7ASJp+XIugC8p+kKjDjsMGIhB4PKe6oRRj7OjEZWLssq8lQjOSrXnzotp0LbvmoK1K52ctra3rJjwVvUwnUdO+O9HDL4ISFGLfpk7QZ1Y6SBaUzjNrJH9RJaX1je0QJwnBat22DnKIwrFm0vaYvbBUYnmPokx2JVC7w57JiobaBtA9AMw4i2OaBGwXBjYTijL5aL1rE+AI7gdiYPh4N6GEZlNlwDnjo6QFNtBA4Bh8X0hDPDBLECBtmEEi1mVOuP3mI6SS9LJiLTTUT5cA7Vb2mD6L+Qi74xyCEcwwmjHqiDYyE6rjf6swyll9Bl1Qlm0Cs6wXeiFi4b4JSI/hntXnzBE2BRM05WBdRBL/WtiYGn6yeDWA6RDWPVMegYzbL5XKShV5KZX4AkYJZYhiww7ZAJ7GjTiK1n9xMUVfBjEtji5eKCATQL+ixTmGjj4BOORWwDTftRF8AGJycdi9J7X6/wq0qKvM5glHy8xkLX0SVd817I3esO+dHFoANLOPFDedWB63PgDZ6CnXVNiTqiFxnWB/cxRi1oq4QBrk9zXiH0XWcdnHgVX0rNraNHHri+rxhgNGg53S/0cph38iF/1KKwp3iZnmAJwBo3YXqaBtjnp+o6Ez5pD3yLf+YXbvHvY9J+GUSCULhOTtTKxfXTy1Fp13AGT3AKpvBn5ZWc0IHzfMSi19Y8PdfAXAPXogbuk2PBW11C3XLjbSjGaFVZDzUMRxlc5RzUaAVx3TyhY+Mex0K43I8p4xB6xIKvUQ4b/3IoeGs/bkErg0K4dhj8NlkfqhNOOR/coHl4sn7hguatM3WJSpLHjR6RebizwJpvTZRTgfFOHShDZmICOHsyNPiGBVOhMN7Jw2FikbUdJ6ZsySFZ1HqNdX3U7tSJk1oUvuwHPU7FnnaBwnhANwN9jHHz4g2s0qJXxjSyxjSMUDD5OD7eAWp5rS3KYRCydcC6DRwLPrCHEclohB0QfdeCD/Fdh/4WcucojVjUYm3LkrpHZxhBdhIkC29Ua72HckMPaM7ijDIxHYs6hHMBZJWl46A6iqwbOhyLsY2TKJxd7vPuJ7gmS+VXH0L3JVPpq0NLCUKOyEeqOBeq245TG1h5jiEPDLQTlAlF1mdNh7FTmPpynV03V86yuQ+nruBb9ETRwbQzTVTnxPxxLdmh0Ln7KjK5DNhMI5/CoJs8t9AuglLAh3MiR9bXmq43MQhsQxgFWiWH0/woUBfnJ/0grV/9Rxs7RzCSxDoxUsiVOipnrmStNjODWT+Q8l8nk2hNBzux6nf0OWvON6GQJ+4jwpHc48iNoKiHQSqGauqq0zN1sd6YBsX6B3aI0j1mOrhdUjfTEvb6hN5wLnjSIb/uV7r3MPJIb76o+0N8f0PtlHUe8MS8+v60HJz3cLPK72teT6f4kTedP5QV4dRDnU7Efft1cD3NgievaFfcl12VnqYN/cybxi9cx4Mc7nDuFn0blmwVT+DOOCm44llx5fco5PX5Bdvz/+Ef+uEe5QFNz0csHlD1zonPNfCg1cC9Oxb68nZMF+HJ7KdzPHx1k7RxZnMibtI87DFcbTDLQGV0gVEHbvjcUMuxiBtsPAcwDrnB6l94rLGohcsVx0gFi5cx+GGnwQhPawpDDMeCrVWZMhTbOZLPaAVOBfElzV0nz9IL304IhrrWVrAwmvOgHQ+3cCz44FcYe0yDYuH2HrtCyWFRZVxHb4vL9Ck5BR790Hcqjh3b0ILwFdfpgnB2dvjuRIxyeLtcG+JRT5wSDHg+mFfOjFja+WJrXBZjoyve/OMEsQicrWUZGfEXli/uaXrVtuTS9qTige5Zh+HvX2ik4pD0cZ3qdkTTn/jIHrzukfJY5AtNRoDKgbBjo3J/+8IjMbH17nhl1EMRU4i2C8cIOuFYWF2ut/Pcpulg4LBxrjjaWnES9oPWJ9EHih9Z0dsqZzKGAn0GrgMtg0xi1ZnUGKVKxEvoqA/504Y7oO6vmr/vN9j5Ntk80/mk/lFjdIEsquMgCbLzF3Rg7HqSyFDnxD6UT8w1ZceinAny1Xcjz8IWieBABQiOI811ZsO63oIL3+WCSal0TlbiBgXLSz2CnAEi7Qz9CH5wLCimvq67ylIHjoGvRnGyO6FsRii+QA5yFc0ub3QsgiZ1dYMKptrB5CUrO26ZVtWT6lYaTSg9oVvxI497BV/HvqI1EKSLHnGQ4HcMVbtB7rHI8CPPYI+cbD2NY+HpjFpzdVn3KUYtCJPUo14dyYnkNM/p89JrIfW0S27KpvEG+EFfnRySv69Twd7XeJpXnZesFVd+0Z0+p68NciAn5/uEojnAuyf32hgRr+IzFk2kjJ36GehPQIwnRbNiSoyTIiA51+bzn/P8EekBTs0diwdYwXPycw08SDVwoGPx0h9nKtRrwijXDZSbIn88bAcDKC0I7o84FBzeIUlxTC3qd3DRVKSkIzJ+DnBDrWkyTGnCsaitVnnwQsNbzCoNHAFbokY+MAox0Jn+g6GP8UcZRnkY55ofL2eAvLqpe6clORWrWmTNVrO85SdQt5CHEYSYasGUKhyEvb2Lji9qFAQ6PBQW2AZWuDgmq6urGq1Yb2s6MHQuyDBhS1li4O0wqW5+yy4mdm4kL3JTRxwwnAvoIjfODF/YxkCMBeH6CJ4Wgi/LuUDXF7VQG4eCBuStOusFWN+xpF2iWI/hdRiifQi9aQTCOpEsLBzHOORDgP6Gh+AXvGMU382QLthlSjEGj1rI8qATntnIQohzyqK9nScAYHAq3A9wOJyX51k37HEb5dIJodoE5Gxe04lSg+z7Q1sT/KufwolcCuINtIH40Wt0+MHajmbKUGna30FR9HP1Hem2yqu+1M9GQfIHB1n6ERQoBTX4xQHcUN9M17nhgVP+xNt24LLvIoclC8KW0Vw6+lQOmiHz6FBUPtdLpUXMIepiSpHR0yNHerSc8FfZoF9SKnOGYtOpOCj5d4QHvD/rgPqkeARi0uzKwPfGBYoBc580PDgBD4x1oLpabsucfJV2HtojrTpV+3rqkqigIxZV
D46F6ZMv+E4WJ7Oszx5gOvi+1tBhqiHrLbj24efREZyLJARM6arqQ1HlFT/gKlS6YvKvgi/grgzZepwC6fMsf9d2fVnBE1/FL+Xr83vcPt3DQKsv4/zeQsFDp9KFM02b/GmYgq2Y8h5vJrx4Tbdt4c+Kexqm3befbpH9wwAAQABJREFUWv8Fz3nBLLQHJG/uWDwgap0TnWvgQa+BAx2Ll/3EK9qrfzYcCz98hzehvBGNB1qYOtKjToepRDlaUY6GSnwT5wHqN6+Ac9NWzG0Zgwynob68jdGNwW7jmHycFIw5LM98uJm/aPDHSADwjGpAD9p8swKngJsnxyXWWaShwVexV2R8r+njYnzZ2o4FDwjIi08ZiBjjHvWQkc+IAA4GU6x4S4qReUSOBd+/WJKTsr627qlKGPcXZbyzoBp4ePIAwYAg9iEe7PSEI1Q7PeE8UU9GWLaFu6XpETuKCThCJ7QQfENf117SYmx/7E9OQrxVVQtAXzpgitQCayoE72lK8FQ+9dgVPaZMsfhObERT39mQk8LoBgu6mZoRW9LqK88CKCPaBpD0Xnp1Q1sqfrI+rtfoWNq5KAdDtMoQHA1ycOk/aj13I+ige3pEhehfdQYvgn97sMrL8hHIKfcFWAWfiOkfVx2Wh3KAI+b7BfQdG6/w6ZylasvgQlnoKOQLAc3DdQx+wJp+xpWeyM+yuE5CcKfV53TVpdzGSFrKV/ualyvJOfzSgQDUFUrewKZMArLcBskfcQTQ9Pp8SJDvwyc6d6NFH6g0OOiiD3U2nd/DzEr38JUmph3gV3UOPVpyXbtRBs/YKjbyB127birMuvieJJ24jS1orEO4rOuL6VCMWJnCDJ1U/izZyRt4IjPnBShavr78QiFemHB/ZV1HtB2QY9tQ56JVeoAUeZVf58SEHi5y9vkVbQf0MRV62lEDASR4DzrAuV4CSJpDPmjFR+k+v0/3NCtNeY9b+X1MOXDTtGbhFq0etk9Dt4epNPn7wkUhvw4F1+NW2ex4lH3uWMzW0Dx3roG5Bq4dDdyrY3HTz/6sDVMe0sObPRsuuhmqnnYs6plkJwAjV8akHpr19hpAQLjhGsvnemAo5uaLIc9zp3Z2qm824FjwIAPmqgMdU6a/GtHAQYAnfDAUkNfHZTkaGmlgdyeMtEUWbcsZYFtYfxgPvJS5piQxaoAzwRQopjJd0BePOfeuUKKBPEflVLDr0oroYKBjhKMRG/xyQPyRLcSUnNBHB96SVnBeNC5jHgOfKUnogHKciU05AZtMcRINdsFa1dqI0/oexppGRVjXwWiFP7BVU6VE76joLAvOUyzsYGFgYVzdE06F1mKwHoNREBaE4wzhqLCgm1EQvnfBSAVvUS2n5MUZYkQlHINsP9pQBwEdmIl+7RgmDvDwsE6VjjjOjQIyPSHI+Aw62IRBTwVV5jbObLd2pYEdA/1gCJksOeGDnU1wH+Q8M0aY6JtRrrT0Rh/yaJBhow8OhqsqEvUvPbgGIUVWcj/aoxxVyZDLAqaMNv9TQbSh/yQHco91iIr5HIdB8P5TWic+Rg4mnLhZXgy7uJe5y441+ZCANiF1XEa+z9EJia4pOtBBX8a/Dz+lX9NInZJXPKlbOVVKmSJlhef1EanD0Flqw7oBWdqSPod7WzAyHa5hpid57YPhAjcpuIqlKyPM+BnKrZeUMOUBvJx/7nPAetpd7RCl86oHcdHq05YfQiWUW5+MsU/G2f3/LX495tCs2RaU7QdXIlU5ck/Ivg9uz89pCA2MryqdyCheE5k6qXz4zwpVXmUFR37fn3u4ggGnz58+7+GK/mRMHzSWs+eOxaR25mdzDcw1cO1p4EDH4uU/8ePtpte+Rm/Iw0Dn7S030YlDdea+6FswDw87FzkNSobmcGMVEEaPgYHPBye2ZDkWGKCe1pRTjHBOHPQ8gAePBWBs/ItXOC7xxWqMYDsW+dbbIxqC9xtAGWa1gxQOw6I+YIdhv7oSb/dZswBujJTEyAIOxS4fKtN0I6YlxYhF7DIFbRwEFmsva/clDkljA+WSjH5ciMMSWJRCV8ihP4xF6sd6iTWtw1hhlyeNLng6hMpoDJyKLW1ruy2+1JmF2MdYEH78pGHZAegiRo9HZOJtKk7FoujhWLBGgo+IMe2H9tq7sNvOnbu7bW2eb9taSM7ULPiZrrbFXV/XV8E1fWtlZd06iOkgMXWL712EY2GLX3VRP8j2l2hj26phGDFx28gxqWlfOIo2ntRmlNEXoj9QswjjQzn6jitdhcSiTbvbJjB+X9ilq4MYFr1TFg9tpy23cpJ1ORaGclkUlPM84Ziq3gTqEE4WEkUY65RyWlqVSdaqm6+XrJjTya/Ki1bF1jGy84dTAbz+kKkM6aLjCqVT4XJg89xlVlxSVpl1UuU6c3s4nxL0E3pwnOmobdY5aZii20P58DCfhMkImC6pNHDGvOon+kVklwx93pA2T5zmIBR6GB2pcjqAH0YsJHPRJLYcsCJfLxs8KsW9TVngQdojnkynZHoSdeYAxT+Rdkb+lHzFpy8zin8m8ewEqU9x/zFTtW+MWlCfaQrj+cDDMqVCSZduUzdgIFfBQ3IEyXtT1mukPpkqXHKN29GehJxxJob0W0Lph3RPk/N7C7GTV0BBLyjed5rFr5fh3ni6vNevKj+LDnkc0J7Q9ZReZ/MOXHgBjmrnjsV9apk50FwDcw38DdbAfXAsGLHg4RvTivz2lJspf45VOx4g3BW5ucqwx/DHePU3HniDiALyRls3Z3DsUKgobrosKA5cHrQxDSoN2nyyRDlrLmKxc4yMhDOCY+AP3Ik/f8FUbPWMxlhk0XVNj1pa5LsQcizkGLDomrUFrJeoBeLUka1lt2Xgs07Ci7cZtdCBkyXrUl/Q1YfwtFibbWDZYYopRkydwPD3TlM4K4LDaGFdxkV2mFEZu1utyhHZ0OiDP8onQ5x86DINim9eEDN9yd/Z0GjChox/RkRUEU2x2jVN6ofjxSgDX99e0HQmztE/phNrWbY06nH23Nl27u4z2jVq26MclDNVylPB5Fgc2zjeNrQ17pK2pMVo5qENDOs6+JCXF5frHIUOhitQtLdz44GKo0WbQwOcI3xlmLx0OPzgRehAyzanW/BQTkpJc+gj0AdFPy7KNNAHh2RCvxR96sSP03lSPPoYyKF/q+PQb2rEAhkO8Q0FtSnT5QjmAr2Uz/3OJWYXKfEnxPWiJqRDco5cWeaM7sdlyAJWwjmWPINzRxocYudzBjy0yctz0019gGF6GK6UTwXKlDXIZWLOiDr24EWShqmjL890gXFa+gkuARDXPiRGyIE/euXPLOjXSSVHJbzzlOsaOqBOg2MhPKaylf6CW/JUZB7SEc4H16gXTxtf/NRv0SlGvg199JKHKShdoeSuuOCG8wIsnIoln/9VF+4lXDfqYP6uhadfiX/RKBIVW3adEI9aq1LFnS6hMcAnSOFUfofpZJ9vHklvljx9eU+np9Hn75cu+GkeyMrtvzRO36k0tHr4orGfTD3ve4Uxk+BU/IpX8enpVRqYKq+YssItuKgElOlbmav
Kzh2LQUPzxFwDcw1coxo40LF4mUYsbtYaC3ZX8cMX5yLfoHI39J9uiuFUxM3TO0NheGNkpqHrB5nvoXETBY9/PzSkuLjpYhTEtCg7FTZSMeBAVBAwjgSLjpnK5B2WBHP4iBwLOxW8KedNZoGHIYLRflmGPVvG1ttHFi0zFYrF20yFYg0DMdOMIADc1o5GDeRU4Fjsajco8DkwC1kQzXoGFmzDG+OELWUx8hYk37och9ht6rCdBEYJcE6oN7woX5dTwhoLVdqOByMpTNcqwxMDlulJK4JluhK6YfoTayV4YKFfFlp7jYh4Am9NCQ4ZdwV7ViMVd911Z9vc3AynQvk4UMtyqNhhakNOxbqcC+jjDJTu/A0Lnds58DSt0Gu8SUdkKdl6TsNPvMuxwJEALxalj46FWAceiQzQQSdhPGbDkUP34EdZMZolCBldZsvPfQleqA1g0HNKNOthPyvGVK8Ri5gKxfS/ME4xWGvEIhzikKPkR1bqERwdUYlMRC+Gpw/LNMoyACnRywVt/4GnvmX961ocZIR+lgWe4Ep35AsWqSakqHwA+5DnlWt6nJCfZW53cKhmtUM0CrkzQ2gkitAP9ZkORbfiqIsV6j5DPnTCjKYf6OBah1o6Flx74AUNRsfCiSMv6IXY5u0qKR+d6rpjulMs0g98OxbCY1SQKVHluFny1EVfh5KbvOJFmvwJnKtwo45exK37CPfLWsSNXD1d6FUoHhWjm4lA5TNYhuTbax6Iwi9Y4gl5swC4kqXiwgF+5Ba5s+hSUvlX0ej00pcV3YMcC+gWTtHv5aV8OhQc+YU7ATMoalZvnYC86qToFY+KK98IoeTAzbQjVXjuWFyl0nnGXANzDVxjGjjYsdCuUDexeFs3fkYs+No1MU8fboTcNP3WkCeAHmbxNjcWEmP4xsgFBkCEmNaReDxM8oHiB4jwk4yNBmhBoyZ3c2PmWxMY47zprwXeR/QFW4x7YIFBVtNFQAVkZrSCqU04RwTwmQa1tqovUyvt3ZHkWISToO9WyAnY2taUJDkV24pZl+DpUHI4VEkvdMapYAoShs0FGfsYIEdVtuqpS1rILccFB4n1GYxCwJ/Ark2MVqyKNyMvjEywlgLZEN3Ok/AYrVjS9KZFjSQwIsEuVywGx0lhChijOvDHIcDwuaytMR0kg0ySdk5Tn86cuUuOxR2WAd1QPzsV4r+h6VXHNk7oa9wr0uWCDWgLIAO+tqaNkQe2xBwdC+rvdhY9FUjnOlea6Vd2djTSQduwiN1tmCMWIdyMX8jwBz0F9yn1LmLyoF38APGIjNs2GzhJFn4Qqa4lwy772EA7z2fl946FRyvcJhjnNHs6Fq4v9R7ldccVDPUouiUdUMCST54dxyFNjg7+95NTVpVxy6HgLTv4anNTnEhDKuAFNEFTjPI8DHCdkBUh00SmqdySGTo9rOvthjAQgElk/2hfiOQ70BxIJIZoU+byLIvzTv92/FQn9CF6ASt84VmnOB5RsaCgIp+jH+HYqVA7095uG/jR3xWucixS3iA0+9e0s6iX21mJ32nefQZ+vKxg5BN8r7XQfYPQ03NG99OXwas/L7BpGQYYwQuhwIZ4KB9yIkF+0ap4CsSnA35H31w6XtP4fTllHOZn7UzqgP45kMpuAuPCmZap5wXNQb4OsIdx9qAWuN3/sJ8sA6WsQA7EJ0vxUr1fMN9udlDT34TEnecutQ9+cq895TH64OzRrsP9TRBuLsNcA39DNXCgY/GSV7xM283e7JsxBjDTRBix4G7LLdc3Xb9J5mGgh73SfnMtY9JOBed6aNblyP0Uo2icbgKhPHgQKUDTxiTz8zl40MBD+RisrElgtIFRC4xXG7HKBwbRarqT3zgrgwXbfCjPuznJyIAejoVHLLQrFIY2zoV3lFIZsjF1qZwKpkPtyphnd6gLooXRvSjngalJscA7doBiTcWyZNqQob6+rilSonlYb/tZ2+H1GekQkM+IBaMlGOLmJxj0gGwY/3Gw49SyF2PzJnOPnaZEA55ew4AhLz3jkODUsKDbC6/VDhfk/N2hkYpzZ+9uO1pXQVtQvyVGKuRUrGvRNh/yYzcoeEoI/UsGJXGGFrX9LPVkdOiQRh/QrRaOKNAWGNgxzS2mBuH80e4hjx0+4THq4TUWGOLC2TeIdDzYlZAO/AY6epb7FLKXY4F8vREQxkdSph4E+pMA+U0ypBymDYsyMiqfGOPSjrT04SkppleORTivOHbIAUfz0e/IjzqoAFkdCU6wZBnKPEglpvpb8RfIZDphKl+SqZ1GIzocdRz9oOf6iGzQw6A2ZsgJjDMMMFImj7LkDQbBMleZ4jKCrH90TZ8onQfKvr/ooYI5iZnpKHMoK1oVqyRgiMEOSPLqsNx2HMp5oBYEwQJOvagDNwZO89fVSj3S723Ic66pUzCjjwNL/kEjFqFPE01ZS/ezjfBZ8IEtdXL9cB9T3/KLAr1IAL6OQV+Sz/WgEgo9zaJVeQMOBaHECXjgCqZwBQDwcBqJ6B+V2eOULJVXvM0vZSy8oUwZBU9Z1GTkOpRVHYf2C8gawfAZTZ19YpoP+T3PShMXj6GmM/QzlEkfkxoITtP0p/nXefGq89Cx5O71g6wCmH/HYtCSE+/52G77+J3hZNMey4uH2mNvWGg3nmR2wQMf3vrerfbL7zjb/sl3PqTdcOK/Ds8HvlZzDteaBq5clB2qDYRsf08Jf0j2GjNhNHlHzyxtMKQZNhdlA96jqdtMlV9clB0Hju43Vy7tad0wM2/0clr25fLRA2yzKT735/RAx+LFL3tpe9XNN0mIcAbKKTADrnLdDMPoS+NPxoadAT2Y/aE8lXNTrRs0N86igUFNRbnJ+/CNNdJgQPeQNGXDMunwBW7WR6yyrSrTl3AuMMT1UBaI1xWwa5OnFGEUiAeGN0rGICNgSC8xHUjOybJGLVhrwPazGPM82rjX40TtsICa6VAajfDXuzE4hcsuTnyZGuOaaUn+loTeLjIFalU0j2mK0/raqp0VjAXWS1yQkcDIBYb2spwSRizY6tVvKMWVRiCUY+GP1GlaVq2bQIF8f4J1GshmZwD1qX4XtDib0QzyZZm0PZyi7e12l0YrkO86yb2szuUpVTgVTH1aXfc0KxauXhFv1oZ42ozqh7PDCAlTmWKdRExpc7umw0DdqZuncNjJUDodANZm4FDZsSAvjXBXsP/JThH9I04w6sOxCGOh31nKcJJP/zRT9J2kl6TSQMkzOptCGRN92kZClld+9UNkQJcxNSanQbk/Rh2Ro+pkMeCRnM2r6KYYRL4CLDjyqB/R5zsc+lzgKjevh5EqEnIWMMaXfG4vYNUHQmcjLnnKNK1AVhoK5GVcNMlzuoOnjpXnWPycI1CXlVORcAbWzyRethX5CeA6FnDGVRYNCzDaUq5pR2nRBYU0h68B1cWOlnSK8xB1AwjICOaJPhRcTgys6uRRCq6BbG/rVLj0bzAYzWD6YbWJaZW+TDDokuwDcCVz1GUsNQ2dVvl4HiO0jFwQ7PDoWnbbwtN6CTxzLTkyNlL3MylDFohG8SvQHs55pj
cqsFKpXYOU7JyUBgrOefvINF1WdKDR41c+fdPyCmBabmi5RbM/xPn4W/RKvln4BTNijfUhL8pnOxU9Tp8e+Yy1oj6V77qhn2kdpTDPn3/Holdn+zevv63dce5yO76il2jS2fkd7s33tCc/Zrn9va87LvtgVitOkPisTj5Tx2L3wpX2m+/ebF/2eUvtkQ/RFu7zMNfAZ6GBO99+U7vp19/dPnRbfILApPQijM8oPPy/+e729O/49vaU6/fabe97e/vtN/xae+st5+WF39ge99Rvbc98+le2hy9pJsvOne3Wt/9C+5nX/0k7d/h0+4Jv/6H2gm95hDYZkp2sWTp6iy27/bMQskM90LF40Utf0l5586t9U+TZXCMNfsjpZskWoxiiw1tlnccbbQxKP1I7VpHk5lB0fLOFLnlUToeNIt3SwbeBaoM1znEsGBXgA3A4AyyExiHwzUXw0MWpKOeC0QqMRDsxYs9NHbp8oXtRDgpbrnpaFd6e6MQDjQdJ82LsXS3IZsSC6VRC9JesF+RU3KP5XxeYLrW1JV76VoXKGRFY1yjEcY1WrOtDdniQ4NixkHGCc4EThCOD87Gh7170MNCQaGGQsxsTIwY60CJl7DbFWpdwzHRzhaanRmnROA8p6emy5Dqr9RR3yKnY1jcr8EUZ4VnXKMoGDg+LtY+dbAuSgboymnFJDssVHAuaR/xpP6+xkKzxpXA+WIiTI2dDIxw4PW5z2l2OxiGVeScq0eMPByNGK1hrgcM3vt2HxUSAX+LRF2zMqycMdHBgoIfzApwPKFja+I1kks2rgijz0V2FerhzThp6lebcoxWCL8cCeYDoZUBJk44FJleEnldmGd86UAbTBuFTBzCkK678yMt8l45y6moxjnl1Mo+4qi80QR94BU5kkh20TdpgWYcZ+YYxPXCiDZiygeqsv9QhcL0+gR3yBrojX8ugU6CCjsE5STrqvTNoA1uH66f2svHtWASTl6doQtJZY76zdB2VY8GLBxvxwqf9YRkfh5SmeTkhxxvYauVIpw6TF/L0OnV9xEiShgAw7erCaQ9POuoUUxy5BgnDdzTgk/imrfMe38DdTy9PyVLwfVmhVBnnRd9lZjsa1jp1AMZ08rx0Pp4WZOXMjq/iOw2W+gXOFBVbvh6u02vRc58CRmWVVxJFmxSBzK1CZQ/9xugd/hTvohv6CnrVR4p6H0/CF1/FCJuB3Bc+94V1+oDH9P3SZ8U9U/KQm3sNBxuUcJw6daoHe0DT//vrbvMz+59810PMB4P9P/7+2fYnt+y0p3/1sfZ1T1x9QPl/po7FHZpC9b/9x9vad3zVRvv6J609oDLOiT84NXDPx3+/ve6NH2mXHv2U9m1Pe0I7/KG3tjf+599rH1z5qvaMZ3xlW7nl7e0Nv/q2tvmEf9he8J2f33Y++b72lte+qR39R89p/92hd7ab/4/3tK96xXPaU667u73lp25u9zzjOe1rbtR3zf4KBjEOdCz+lRyL/+emV/nmovvLEHPDDmNr3OqVt/Hk15Qo39y5MfnJDi6Nj/EYD8W60UauHqKUCajKeQCEkYphySGHQEYtow3+uByjFXYsYioOdHTv81QhnAt2ecFAhA+suUly4Fjw3QimUrG2AueC0Q+M5zKCwWDqlNc1iI6nYvG2XsY2uzexY9Tm1rYNe4wPynF6mF51TGsncCxqahWLqGNE4bLz2GY2RjXWzRdcRjOgi6y8sbRTgREvw9wGkGQI5yuMXkZJ+AAfb1N5MIFzSLrZklx3ageoO+68Xc4C2+ri7Gh0xNOf1rVYO0YrMAwZAdnb2bKDguLQEb9+my450BWOASM0K0tsZau1GBpyi5EK1rVoRAOHTHnhWNACkiUdDsppMxvh+fSMloh+hOBi4TYGj7pfZiqKArwPM/Khdu+NevIDwJ0pO1XmJY8AiF+3vejOCn0ZcnGOA+oRC7VpjVggZMkQznL2ceUXZ+OjO/ft0ZFB3vorYKSZ4J3yVR6xICZE7s9cjqxlHCuNYe2Hv2KdmL4ARj7AUjYjFN/pouBDbtKTEK6v2tQNl3WrelV7DnQsNLCQiBoIxWHg6fzQo9vW5ToHUA6p+SRBk1HaOlU5bUGg3q5bXevJywZips0/ldjrz1Og1NbhWMToAAJj2MMH2nEfoUwEirb49ucWZOKHOpChH3BC1AkI8F1P5ZoWfIXEPcbXmNL+joacWzszRQ8qJQc0kqqrJxzOi24WJf06ixj4wiUnZIiy4dey91BRMk2/4IGE7kxaBTQVHwgr/q5XhzNIEwoedVEw1knooSoYNISZyJQG3yrJ55RoTPMbye5XUkhZ7jYIRoP06DHDmBrEMYGAvW7uWJSiMp52LMje2r3SXvzzn9KUqMX2Q387nJzbz+qbT3rdenztcPvQJ1gTeU/7wkfqZVi+gqUJbrv7Urv97MV2Yu1Iu+FkvpCc4re9d6XdevtFvXy8rj3i9NH2jg9sz5wKFXDakl58TojnIzQqwa2RcG77cvv4HRfba37rLjk+a+0pj9U3oiTH9cdjNBIY+sFd5y+1T991Sc/Y6zzNalVvlftwb3XqYefpB5sGdtoHfuPn2u/eppGHr/229g2P0Vqgt/xae9t7d9oN3/ys9re+QH1t89b23t97Y3vdOxfbf/8/fVd71B1/3H7ppz/Wvuol/7B94dYt7Y0/+XPt0A/9i/Y17/vX7Sdv/7vtf/muL2xri7Kn/wrCgY7Fv3zJi9OxiIeFb4r6wRguY4stXm0AyrGIB45u2yr3NaafwuHC5mbOgfGqhMUHB5MHo7YcCx7oIGI8lHPBR/CYbhRfq9ZXtnEGxBv+PF65MeOU2MDKqU/ml0qyUYihKprlWCwuybFgfYVo2RHAoEAWZExDhXMWSvMWk3fpm5pmdE6jAuc1WoFhQsABQD4cHqZDrWr9RdGLbWbDqPPaDn1QD2N/RTHOjJTmUQ0WxSMvvDDWWZhtWRh1kWNBwNhlBGRnLxaLY3CwToLdoVg8fVYLtu+SY3G3DvTn9Sis58h1FWsaJcFpYb0GIxoXtDid6VAY/zDDEfPuVJoCgp4YxVlRXZblWLAdbTlgsXCcr3WzXa94Szfg05Yx2sGoRty46+1+tb3bX3URqPky6kUIh4Y3xtHmLBz3lCvJ5rZTPmUcrpyxog+ZubXlzOEHXhVmpS0TilKoNrcxWY6FjPGS5zBbzQ4yhEwDbdEo/N6Ar7oHHAqKlPmmbCVX5dV50R7ixEVccQuHM/tpvXEvGirUf1xrpie4Ua5OJxCnLOOBl7MTDn7oQXm+qtG/2yxyqk7gumlIEOrVL2CiEWVBk6oP8ugk6I70cfCNkAQhMQTljf1AdHydhj6iLskDBJIwI5FpaotO4j4RTkWNWoRMSV/3AmhTxnVHukKQgiap/QJ0ADkIhmqWkQsd9X3up7qW6fvcX/yRTfhHcdRD6Zl00Q1UYNyFadheooKchjE6sk/RIn+afscq0Pi9l3oXzky+WThdVrL2MvUwpAMm9QAdZTh3QC7ixKUJCvM6yOKJSGAHlKqw+ArL+ipm9DUoFZ8+1am2oz8fs
ZjQfJvlWADxo6/9pI3x53/HaSP8X796h56rGl3Xo+idH9YOjQrf/80n2xc9cqmd2bzc/t1bzrSP3X5Bz8RDcgautFPrR9r3fdMJORjjuol3fmin/eLv361nYLTX6Y0j7bE3LrZ3vH9rYo3Fr/yXczLg9PxX4x7RvfCi4G84frQ9W07OmpyDn/+dM+1dKYMF0Q8Ozou/70af4nj8/O/c3T78qT13F/oIL+G++clr7Zu+dD37cGsH1anozuMHqQbu+sP2n/6/P27nHvkV7Zv+1le0Rxz9VHvXr/9q+/33LbUvesb3tKc9Snq550z76Lvf0t7wulvbo3/gue1rFzVi8bO/1Ra/54faN1z3rvaaf/NH7Uv+2Ve0d/7k+9vX/Mj3ti9Z04viunV9lmq9V8fiZ179Kj/ExhtrGH7lWPhbBX4QhnGKPPHgD8m4RJk6xB2WG78NSIqUHh5Quhnz2LZjISPGD1QZRtTRC5X1kIcPb9AZGVhQ7O9PKI1Bz9NDpoWnLHGRQrvohyw4PaOjwkgHU6EW5QQwHWnR045qKlQYmTwL4Ml0K9Y6IM2upj8xInD2/DmvY4ANevB0LMF61yrJhEFPHusSYEzsaVxyYlhjwVQujHSXU+90KpAVp4IFNxjv6AGjEeMGQxEnJbau3VP1ruimdkhrNeQsyLhnFOJubS977vxZOT9bqr/klwyMOCxrBGVVBw4C88a3tjbbpuAY9cCV8jQw0WIXHBaaY0ihd+tGMnsExaNFjPAse7eqJTlHi0ojL21qZ1POgLeZpe7CP5JOATpw2+P4YfSqzvWlbox1zjGWCRjj4ajFrlI2zpUf7Rh9z5VTXuBgRNBTOPYP8K8DqIk0lFRuYxNjEp3rQFbLQ/+TY+H+KnnJKycDWsDZIREOdAklr2P1f+emiAUDXKV7ecivENTiTKwdAjZ5luGLXuFNPXEqMLiJzTnzQ4qASQbFn1PSvRaHMvIRRAKUDkhP6LxHNDF+COBkSkQQUZzMq9Jkmi7czUO9UjqPvMDtf61TwSEf14EdAoj53AxMh3M1zkQ+OL4H0b52HDQamGnOCUFfjgWwKuO6GHSRggTv4DVdNkvu+wJj3vQ17qe6lpDfsullQrQjmpOOqNeskIpG3RzTUD4v3A4WUr18yN+fFx1oTofIo38nVEcXGj2dq3Cn+Azl0JjAHfvQAKNE0a540LtwS1bXBaSUq/ALJ4q6+oaSDBb3lcCIu1ZhT8XJzzw7PsqeCJwOelI65A2gkIcRixdM4DyQJ/+1pkLdcuaetrHU2snlapX7XqtZjsWtchD+rRyJpz5+tX33f3vMxDDCP3Hmohej/v2vP67n0HV2HlaXD7V/+8u3tx2NLPzAt5xsp44daR/8y732+rfd7VGEf/pd17tr3HX+sqYufbo9/NRCA593J7/1J5vtj/5i2/T7xds4Dl/6ecvt8Y/QaL0A/+yW3fba376rff0Xr7Xv+MoNOy6f0ILzn/6NO+UorLWv/gKtp5TDg+NDa/+U5IHf39cakS94+GLb1LoRnJU/++hOe9Y3njBtmO5XpzXVaR4ezBq43G5582vaf75lvX3hU7+lfcMT1V/bXe29v/WG9rb33NMe9bef1b7lcYw83CXH4k3tV37hlnbD9/3z9l2P0hqLP3x9+7nX/Wm74/Dx9shveHr7yo+/rn3wK/9Fe9aX6+UxKH9F4UDH4kdf/GPNjkUx01XBfZOHPgY1xqMdC9IynuOeGrfjmlYEqo023WVtgOmcGy63mH53Dy44w+EiYGinUWTHQrRrWgzGNA4Gow5euK3zenjgmMCdc0/DsYwh53V6Y0AedBhNwLBnnUbsLMWIg7QqPGRgWhLyUcb0JgxrRgoYqThz9pymQW3p2xZa2yAZzUu4jJxw4FzwPQyP5CiNXjDc4bksfssy9BklKcfDsluvGM04TJoCJVh07HUVfmuJESlDiLSmCyEjxi3TjZbkLKBr5Nvc0te1Nb2Jr3azLgTjhO9trOg7FRj83NV2VL4jx4KYKVi0BG1JhePt7z2qQ2wXaxwVwQ9cnKNVjXr4o3r6Eji0D2nKkoosi7e/FR/q4fU3kov60eBuWwxwQdswF0/agsDDlgdr6BJnMQ50B6z7iuhQbnrG0o+tXQhM5Vd5F0O/P5AJuZ2nFPKRLmPTb6mVH7LGqJy4uI8QIwfOBQHD1utf1EYi4aNkDYj4dUWMMf7AUxjGqdzIq7OMqXtACjb1qTa24Wu+MooHIzqvH8GZFrF5IJzKzHOkC3OkcOjKgHO+Ynhb924D+ou14HjEDyoTbZRkRz2IZvIwfaeDvmmqT0iz5sU1QCj44JlyZD4jblFO3TiMMvyYBw6Dyiiqc9q5nAZfWwMdUFU38QY2ytBnOB1F2HQse+RwXmFW/Qt+VtmQJxIxGsx9IxxZy5DXvVn4Z+xPs/haEsENdFOwq2CzT1GMPvpy8sAfajWDHjB9MH5Hk7KiSTwtT4/bpwunz5vGnQVT8KmdOBVf10FyEYpOOZEBRItHiKssT4RbBdKOM8maDnUbShbTxToP6qCmNAPMNM7n4ncsPnC3pgstXdceIufi/gYcC3T+g996yiMDGOy/9kfn9Iy70v7pd17fTqyHNYQRfuudF9rzv/0hnsJUfD7w8b32qt+8s33fN55sT9JC6gpvfOf59qZ3nxf86fao6xfaG/7wXPudP9s0zYeeGKcs/fvfPdMYyegdi6LRx//6F25rJzcOW07y91tj8b5bd9tNb7qr/Z2nHmtf+4RxfQijHi/9959up1WfF/6dWE+yX516vvP0g1AD2+9tv/r/vrnd8dCntK/75q9uj13lOXmlnXnPb7c3vvEd7S8f+g3tu5/+5HbsU+9qb/5Pr29/sPno9rQffGH71ofLTrmwLTtRGwFpXe3OB3+tveq9j2/f+6S/aP/hDX/ezlw+2Z7yzH/cnv4lp9uapud9NuFAx+JHXvyi9jOvepVuhiMTboQYohizxDFiEUYiN20fgvd0qDQKbazp7uAP7ekmwQ3W5qLTGBpxw7Vxx6033+Dz8IC++ZhnGHMYpIxUOMaIRSgd3LRDJox8jNOYVoCRj5EKPIedCTsXsRAZWlGn+GI104EIOANMBcLIYNHa2XPn1SibWtCtOZy82UdOwQVN3tZzhJMCPRvFoo0s9f0NHBUcC2CBQW7U6zf8GPQy1lmzQB7rIJgKgYKuiBdGjhIAi6acopyKRBlf5GYx9o52ieK7GZhBh3FiJD+OEWtGtjWNa1vTpfgK90XRZuqT20v0EAM9YtSzwHxBOMi3p0Xj0KOj4HCwTe2JE6d8LIg/BhAOzvj176jX8EaftpX8GN/WlvjAA4cRGI9g0AFUYd7UeiqVymk7t4lkKFplFCCrFWTt+8T1IHe/EDIgRxzAYSwOfQ4dcyjv8hXeYochObRjOknRv8VdQvgv62Aj1cbppDyDzDQoISPkiBDxcKpM53QZpgE/5VEGbsjKyAr9It6o2/gFJp0H8/B5V+8sN+9QJAR9Gr9mMJwXjUEGai0H3dOVsjLTdak6m0j+pFbyDHlIjnJZFOroI+4v
PR3zoExYle96qv7EDhAdKlFZoz4o8wsL+qLaKqZAsRYrRi+irtm24gUp4MrhHuuJykL24KJfzoeT+5eo+hRW9HvuWWE0uZ/auVA9rThpIfsROL1cUk6QMVyvq0npiidx1aViCLh8H1rBYPJ3kCFxSryraE6iDWcD/pAzmSh5yb03WGDg7xpPtUvRmUUjZI4+ZuxU2T2MtJkvlKdCtkepqkqr/nFeDtrsPtLjfi46Fnfp/dWyuvJn8qIdx+I2rZ/owyNPL7S/+9SNid2WMML3Lt7TapF3wf/2n262X5cj8rQv07elFsdWuV3rLf5A6yf+wdefaF+uNRA3/eZdWltxof3IP7qhUB3vt3ibx8MnVLHzmtYE39/443MekaipWfs5FjgzODX//BnXt1OaatWHV7/xrvaRT++1H/vemDK1X516nHn6waeBT7/j37X/8MeXNFrxtPaNX/7wVt36yt6Z9pF3vqW9+c1vbx88v9RO3HBKfWyx7f7lUnvq//g/tK/eSF3dow8tn/mz9ouvfEd72D/4qnbrT/1eO/0Dz2xP3Htb+8U3LbXv+P6ntcee0nfaPgvVHuxY/NiL2k+/6pW6qeYtVxE3Zg4bXH4zLWMd4xnj3GUYBQmTMUYab9A5GFWAHrB55++MOz3IKRdcLITFURhHGpBi4K2TevM9GJ6C9WiGnQamI+WoBsa+DjsWGMIYtjgeyqMe5bzwSBFnrzMQIxvSGNgYbyzYPi+jHAfD37SQoU2MTQN/HAdPm9LUKfjwaKMuYmD6OClsk7uqb0mwvoJzeBOYKYZRjQOwIEeANA8+dpzCsaDeoZMYtbhO8mPULy6vyvg4KqdBi8i0ZoLtZf2tCwnFbk1MVWJtBCMw58+dbXfq2xbbcowuMtqixd02YDSSE22pBejiz1QttsLF8eFhuskICHS1AxVyra8fa6dOntLuUsftTDC6siI5Fhf1ASE5MmP7ULcygP1Y1plaXnWOrWzlgEhvl7TvMi/awWPHLY9WpNNl+ZRf8gETIeLxPPCz8KqojAjiPk2/5Lycm3IsrmhUKIzV6Md2ou1YQDr6Lrz9pxj8ME7BC5hp2Upy+lUvQ1UJvMLp5TQfeAFo+dFqOEDhBOFc0A+jLsQphPkUrYrpsK5bMUbcEJrUEAxfZyGczkofKY/OgYtQcSFFHHJXnuqQiiAK3LFNKIv6xj1kABas6SSZURfogesjeJveIE8wQleuC/mMgvpAZ+FM+EvXpNELuiOgb65NoaCrcC46OQUy0Bz4GTMlybTKerkjt/vNNkiVjAXw18ELCfoedWCdVbWxSq0aYCyHBR3R+1TB9Hmkp+Ua6gO3bMppGK5D4PowfS7CLuZ3pBkYLsnyAY/zpDnkCbyHDeyQuad5X+o2DV90ya/6TfIFgidUyA9vakwb9FUvXMoDglhYCTRZPmT7Xm8ZAO+C1SJGL/gcnAolu9vbWMbTrqv0fUjiWLDm4e993Qk9HzS9SaMCrJOYDvsZ4b/8B2fbW9+z1T7/oTxbJjWPzr/9Kza8zoLpSSzE/mfPvH6C9CzH4k+1I9UvveOcHIorXrjN+olPy/m5Ud+5uDfH4nWagvWO92+3fykHhm9y9IEpVu/W2gzWYiDrfnXqcebpB5kGLn20vfm1v9I+vP7k9jXf+NT2hFO9c3qlXdzdaufPnm1bfI9559Ptw3/+rvb2ux7fnv0DX9dOubvJrtw+0z7w6ze1tx57ZnvWE9/bfvL/3Grf86Pf2T7/8LvaTT/yJ+2J//Mz25c/9Fgbx/fuv44PdCz+13IsuLPqIuRmWUa8jT49aGz02bEIw51yHi6GExI43LhxFC7pIY1zgQFHPpc5B8EwwOlgJACjCZyYUoNxGfDF3/jFx3IELwziYZoTO0fJeWAKTzkVyOu35TKS4w05hn/IHsKERDENB5NF27LykTwZ4/X1bm8hqwf9roxtRguQCcdiCWNeB/WnHhdVBx5I8GRdx1ou3PYXv2XEw1eghq+3/oclM8G70diYCMeijGDKcCqWZMwviB5b6+JQ7MoBuCR5zE90WQOxoJERRkLOnz9vp+LOO+/QFCgt/BZdAYazhcNlXWjxueC9A5ToIxhw/lYHIxZycHiDylqN9Y0N7TSlna8Ev6A1Fqv64B4ODI4R7YKugQ2jLNoShjbQ///2zjXWtiyr67vqvs85911F29BEiNghDdqgpkmIIiIhhhhiggngR/3Co2mQLyY8pPlkNJEYiZG2KRITAh8MBjERUBDRkIgYoiISg0EF6eruqrrv96Ou/9//P8Zcc+977q2iOl3dVbXmOXuvueYcrznm3HONseZLThEnfcf5YtqZZDEO08XiWESeGrFQntuarvpXSD07lgSi5uvIFKdNzUYDOiR0el+pK8uqK0Yk90qAFdxMu9sbKaMNAqCwOBYx7gPbDw1+PMjXxnL4O9Ffyac+qoDmXanhbRmCEdH4naDXHrHASI6hfJhjMXarAtmMtvUCX0oy68qQhqfEFazvfnFAWn7byW2Jkx4+ycl3qISTUnzbOH0lMfpuTHRJsE7Jy43rizrj4yBZTaW+gEv9IkmV105F9Ma0wl6YHd3JKQwly8bvhgCNrPlK3W7pCP2UjgZu0yjclh8pEpby+H5uwwXR6W73+i1BI+utJANtGFIhM2O8ofhcnhBAVwuplr+v5IAz3zutUVofTUS0UmcNsGhiFKN10DgNunUvKlXmXZn7PmSAG63Msja513M1C/g2byXQftR0ljIjeIeSqW/Hld973Vg+SLriSPRN6SVQ+X60+Y5v/c5B4tMdebPWWKDKruI/bJkOW2NxGI0nGeG/oulNP6dpTt+jtRTzrky7NH7837yixd33Nz/41/7IqDdgdh2Lqzcfbv7uT39i877PP+W1GDg7hH/4sy/ZGXgtx+KX/otGLPQ5TJ5/8vOvbF6Wg/K93/Qu03xSmZy5fr0jNXDtt35m81O/fGXz+R/46s2f+9NfsKmZgFu6ePXhvc2tV/7P5n/8+q9ufu13Hmze8zXfsvkGrfXxE+3h7c313//1zU/90q3NX/qmv7h5z7P/efMjf+u/bb7ih/765s88/HebH/lHL23+wnf85c37nj+z+VROX3ltx+KFFyI0nbV+QzGqcCi44aGXN/IZtZAxCBCw/Bmn3qqpd+HNPyMWNtxElXzML650vp4qJUOdN+x2LpSmLB+21+sjwr/0SKb4IAvpMeCZliMDX0avT+eWU8EoBmsz7CyMUZYY1D2S0cYrV87PAAdZcSbYCYo39sjEM4G3nTgVt/zmXzsoSQYcChaBx7HQtrd6EvlAO9FAvuFYaIemMyyklnx5I8kUIBwPDvzT1CLxxnG5q52fHsppwIB0Kat8OCCn5FQwDYptZm8y/UlOxV05DLI6REcOjhyOPS3qxrjnjeyVK5d0YJ52i9JJ3He1AN1vaZGrjF3gjh/XGSGiuScHgelIGOGMJtBAbNSo3EelV7ad3VMZ0C0nkJ/QGo+Dg7NeQN4Omh036RE+/UaYN8DIzhQurlQdRiEPN/iT3iMW0KEe+tPtiCuh7zvuxJ30TqN
dteExO2edrmK5rDHU4wTFYI3x1Lz6Ct3WG23cRod0BW3aBSGwtGyoJyz4KUPLlFzBAVrlW4yaqbxuBaEHbqZAiS9rbsx/cSz6zTa0KcvjjgXp27IFdkkbsYazbMiz/K7BIYTWwFAKcPM9UIQqD1dHd2FCO7D5Rm8E609IuRN1lVk9ySjHMNqKpGvGdY8kaQM9WuH2PByLZfG2ywE//eNYwJO+qh0LMTO/obvWjeTbLYnvld/yLxBLeapgvkC7Q+Pwm0m/lFGLlnvwL4SGb/zXc92lseBQxuWOGPRnHo07pwHXZe78JjRqrSpvhzwMQE+Ymc/xUfOpzwbn2vwGmV2jfgZ+arwkg+8QMhF/N4ORV8Qm8Qf5XVjRnCh5WqMbWuFyIf+Db0fHQuU6TEVDV0+JfKqOBQu1X5DT8PU6T+KrvuTJ50n0Gotec4FIjJT8uKYnsXtTr7H4bZ0E/k+1jew3awrVl2kKFYG1lj/8z1/SbIRnx4gFDsjf+Wef8BkWnGXRofG/7stPe3pWpwP/9376k5s/oXUg3/xV5528OhatnfVqDTx6cfNrP/mTm19/8OWbP/s1X7n5k9rxrF9ftoau/O/f2Pynf/+Lm//68snNuXd/4eaL3//lmy/9Y+/RGqeCkL3w4M4VbXRwbPOez9UOZK9e2/zOz/zE5md+V5sU3H+4Of3+v7L5xq/+os3zexotb6Jv4Pp0x+KHPuypUDwY+uHCNQ87scW5qPvZQO8HRfBUdBk3dP78ZSoUDwPQyyHQFTMBY4kzKPzRgx/jHH5s6Xasplvxpps0v2WmK9Y/D2Cf9K0rhjUjFie1lSxvwBmx4ATnTHfCmcjD2kaw8pwvQ9blEhz47IaEc8FUJ7aVvc70IRn58IIPjs8drVG4IWOe9GeVbsdCeExxovyU5Z4Ma0ZpcCy88FlToThT4qwci2U6VKY8sG0shjq4d7Re4roWYt9nBEKKQjamD7GLlbd+lWMha1zt4P7mmqY49UJspqPta6SC6Uo4Hxhf5OFYMI3rtkY22uGh4pHdb+KkN0YrcHZOyMBvQxVDmfK2bjhYD6eFKVbPat4xoxR7Wsx9av+MnQMaIuswOFiP+H0dH/9Ai4SgdwTHCUdEnzguvDHW9A7B2algi107HZmq1m3MvCUfgfj8cWJ9PSnd7Y62N31AQSYCbRLDm/tMj2E6U+BnGZo+OE7nN6E/YAeu6jpwhnKeG41uGx9HvAO4HdBXhyU1KeAS4GeJhUf7itOA4xCnAvltcDtvLrNgK810RGaLd9EnrwMyGAYZS+9IYCkOgW+5wCd7pk+aA3P+TGO5JAOcpW5n3KXsBQO+ZOpF29SfyYoQeHAgdN0YFhjl2cGSjuLwSmf0MTjO6E91B6wJSJY4FtvTkKCLHkUOioEnkVA6QV54Kdf5LT+3cxjp4MFXwfJPdICJc6F3TYqzo9uYElXEBh3dO960Kn/3Agx85tA0kh5dJT/aJL9hkHXGdp6U1qkue9Ovqxc3i0aHhvV9p8NDCUO2ptFIAR584DtgnTdJVb+xpjfDjXJMONusUr9kV+kLUvRbVlg1koCac7KDZaSiEAIFFaEgEMSmWfcf/LYPBfVN+H6zRiw+laJ8qo4FvP/xv9KOUVoP8XV/6vTmj+vsC57HL+r+t+QkfMufP+9qZU3ED+uU7/P7Rzdfr52dGIlgZOGyzpq4rnMz2rFgROHva3oWC76/QQf08Tz911oz8bsv3vWajx6xoJv4oZ98US/qnvVC7fsPHm3er9PCaQU/otGNj1+5v/naLzu9+SLJw/azv/Ab1zdX5FywRoStcC33E9aNOHP9egdq4Nbmld/72ObGUa2deO7coQus711/ZfPSJz62uXzvxObg3MXNc89f2BzoTJYnB80aufJ7m//1B9c29x8d35z7vC/YvPusXnovZsqTUZ+S8xqORdZYZJQgHT+/Qj9MMPboFPXxwmgZ554Cg7PhAHzH6UTTser35h6bPAw03v5ivOrRboP9fj3s7Vzw61Qeow1sMXtCb9UZSYCsHQto4nyQIDgewt5KljfrXoAMfC0mVp5HVWpa1FjMLWO8p+7gUJyU4XtK6yB4GOE8XJVjceOGRgRk5DPtCZMZA+62Fm9f9yLo+/L6HmW0Qrs+QQP9IN893oqWY8Hhfhju+0yHknOxr/MlcEI8WqF1EoywsNAaBwsH4IYcAXZtghYjG6dYLyFc7wKlexaPM1rBeRSsxWBhL7IzRelAU5MwjO6KzvXrV7S1rNaGsFhb8rPw1pXOiIumeDEIwxkYHj2hijCy0KngcBaRG53jiJ0Qf5yDYyonT0WmYzEN6pScGT+wVV9HJS+OFsYai89l9trpAJbyeVRGdYFxxzQo5Dx2jO13tXWfp0mpzlS+MSpAvT7h0wbDnE/zInRex7nvtI77qnJwpU4zBQ/Hwq1UfJc3xvwGaGYEZBs3wh2jHXYsgIjMTZ+nCTJ2ewcCmNnAAvawAN5jeehXvNqx4JppPUpTm2sngnTzcNmBT1l3aXJPmPlswVQ+vzP/2vreV+QO3RTh8XKEbnRi3bQizVVfpVd4WpaZRKkd3i2nBI1j4dK5hE3JZShyFAjRnNZlp1267cmh8HRDtcF2zAYRycEoGv0JYRjz7VSYbglpZiXkIGC25j9knvKQ6bB0StLp4yoZaDduL+KPc8Hv02WDJvyrPrh1QD8d37k2XfCBQa+mMeCUXsjoDPjGcVFh3dSLr9OF3+0HdOMOmsWHdKcVA8WhvdwVggWA6nbO9l3BclGGZSpBXCYDl6QVTzkGkBG7rJChOP7tD0amRJb75eWOlAFUsZa36Aui9RaFLvADl3osHUKRsDoW0UN//4N/8ZL6283mQ9+QnZI6fff6kZ/L4u3D4NhB6l/+2tXNb/5f7ZaoUQgCjsOX/NGTXrzdtNix6ad/9YodCbar/XI5Au//wlObj2rb2HnqEtOjfkELwu/JWWAtxAfeu6eXaI90+N6Dzbdql6kOHK73s//xqp4tjzandb7F931zFoZzwB9rLX7797OzJOV7jxakf+NXntvMO1I9rUzNY72uGvhs1MBrOhYf/fEX3EGOTlIdoeO+6oGnXwUPvh4JSD85dcGjP60IAPrYoQCPB6fu7Vjowck0oDFigdWrYMdCxqanNvmBXyMWGE4YwUUTuiygjoOQqVBeqD2NUmC0Mq0KHD7wxoBmpOKUDGccEt7eY2h6GpScB07ZvnNbW3R51IJ1AZoKpTfxjFhw7gPWuadB4ZhopIQREpl3HrGgU6GMJyTXSRnPrLM4c6BRBTkAOBMY7MgI/2PCx9hmTQdnTXiaELjCYyTilJyRYyzGlmwsqGZ7Wc6iwJCEB/KzkBrnA2fjppyTazrbgnUSLNYmAOfRFMnPVC46WrRM7bzK21uVTSqxXOjymJyeHgXiPAzOD2EKFiM6bGO7r9EKFsLzBvlVHCHh95tVFhVzHgcOxZ5wT57UFC6VBaPDb42FwzkYOBukowuPeIh2r0lwPWH+dH1JfuK7ofPn9GHoiN9unHubHSo4Iw5MKUIvnm5S1oYdAfNLGzVtmu
+wxOIcoP829CNb5B3OrzjZSWlHpQiV2eO7lm+Wnzj05jzLXfK1YwFv2goGcrZqxolYHAmbRaUD8Hdp7vJsvlz755syxwhMGSObIaDNX8m1S6/TXafW3VR/Fd2i2UxFyNmtc9qAhYKXyoe5VzwnFIlUd1z515WP61k6cttzP0N7z65QDePaFR+mJMaB1JQI/VYWZ80ETdflbAG5cdypQy4nz+21ZRNwUIOE/jq0Loxb7d1plIO2po/LDUrzbB5Ff6FWVEkvmEZpmIUfemopdN3F6bwmAOgOX5KsSyIKC+gSW5QXmHwrH5AhgG8GQLMeCR1Rxqy7TmaUZMFRm4c4JDsoPlgpjWJYp41l5EYIh6qxojBTb6JcGydpuUOYgre+6p4kAzSOzrH4trffGoto4o198/wk8Mx+Wng9cJC6osPyIHVmn9/34RSZlsRZEc2TrWD7BO/G4Ll5VSMN50QHOGjTfhpnF+6szrDYXTyun/Hm8s0Hm8PywH89ZWo+63XVwGeTBl7Dsfiwt5vlwdMPH//M6RzL4OLNdjsW/E4bLv1zOoWlwILgXwYWBm62jM18eq+v0EMfgXL6c6ZCQYcfK6du29CVEctoA0YbnTU/ZvOklwCOKUM4CZo2xIgAi7ljvPPD5s08RmsbhL2wGuMX412HhMi4x/nASGONBM7FzVu3NTKg7Vy1M3dJfocAAEAASURBVBQGHM7PHX04L+K+1ixgzMEHh+YUjoXewuvxnxELHAvJiyyMWJwWj/MHBzoLQgueZXD77Ty6LH0ySuLF2HIu7DCY7gk5I4KX08A6B7aU9daxmuaUXaPkHKlsJ7X+gTf/vG1llOLGDZ25oc89ZJSecKLQO7XClLRldyuNLsgRoWxoBh15lKIOhYMmTs3Baa3bkLNB/XG/L8fipHaD8htgyX2f8zPkxHgrW+mHetpXWZmaxSgK6zGOeKpVGXOSgzQcCw4FpB2Zr/TndTPWCY/zOBPUGzCjjQk/xgDNMW+XO6/TuXacdtj38zUGJ85BnAtBCS40wy+OBXo7LIQWxl6mQsVaiEOASUI+Riq0TG8Qye+BW2QgANtl4Mp9h/BZypP7ZQqXDU7V4TCC1fbCP7I1rV26TX++tgxJ8492yta9Qr6RLzK1pEk3iOUP39CAbmhTrwU5IYy0QT90hLToxfxUpuJbELpTaH1x9Sc6RYbUs+pYbXMetfCIhequS4R89Bn0bQT6pKHTojtP70GOEXbqbKTPEdHocsJrYE96ADx6csRx7gHB+U3dD8yiV5JQ9KDpO4F7cMyi6CzYDXXIlfJWcJmJQ0Q0CObTEd8HfsEicyqL4ujy8SB6/m+6uxQi/4yHNrZ0P2dOcpNszYW0oYKXBIqSNkrW43ydquRRH9Au+kzDa1aun9bLIFOwC9ATeCDLM5vvXB0L1L2GVQOrBt7CGniqY/EDtStUHmjphN1f0nnqoeu3sGUsYbS6T6UflULccRs42kl/qweBUDGiMXA5TK5HOujYeWPMQx/HAiOXHaQIGGU4Iawz4E05uOGiiwjbWEMODHgMcRmpOAo4GD1CgcEKP4xmyyd+9PXQY00Gb/sZUYAPBqJLIACMkTt3tZ2rPrf1hp/pSSyA5hTumzKiiTNFCvp2amT4QwPHh5EFytU89ve0oEZv7i+c0SFzOg8Cx4LpP5SBNx6Um12b7shxYH0FAQMHw/40hrlkZHSAcyjuyojHGWDExqM/lFuOBVOLGG3hZG22lsVJwRHiwUrZGf2AJ/UAPmdyMCUKYwW94rx5BEn3jMYwh5Q1FXYOTu+bP47LaS3YZl0IbePWbR24p8MDb2sE5a6cMBwLjDVGZNiW9sL5C5szOlCPkRTO6HhwX6MswiNuZ0n1RZx6RIY2wH1V3SN7P7Rd18IlkN6GRec7Y/oaDqjhVauqj90PdezRiloI3W2r27ev4tnNGfzd0CMEEQ35BN+8hIl8w7lQnD/qncuAK7pNH5wOuzDI6LUhlpnRirzF9pXfDR+JGbnKsShi0DJ/3bf+nFXsLJ14L6UkQ6mTPEXKFMTEsJHbTA3bMgMLLdNV/Y66UiM0S5PXlxkGEhySQxneup/4+zdqvkYCPKFkQaZ8uKTO0Q3t0n2MRiHsYNSZJdabKMCTb/qlTIXSaKpgySe4jKbt26Tp27JRhOI1y7pATrFZ7DAdmb5VWUMz7Yh4fyhXyiJHtuiAM0jO9IRnmYs6MHP2YHpIZMA1Ycqt4D686sIplR7CA2uLb+tjyIKeRGuBhnKVmehSGqdzD3zjwx62bqUVB2sE5QFPGDxK5jmnk0zX5QArbV/STMgTwS6vqYeasURslJMO3QG8jis6hFGmC9AJXJ/ZfGh1LKK29XvVwKqBt6wGXrdjkRJWJ0hvXJ2oDT2MQacJSp0l3ai/qz91l0l+ddQ8sHEC7FjU22k6dh6WOBT+EK+HOZ0106EwqjG0/dAv/uRBC2OXPNYEMHKQU7WzkFioxjcNwdCf98MxW9PKCZEjwhQmHik9HQiRebj4zb6mQd2Vc2GFyeDGybim3aJwOB480FQkOz9yAmSYM8JC0SkPpWYqEY7Owb6mQWl9xTm9xT+jt/8nNe3qiKYaYbLgVGR0ROs5RJM3qiCzgxILns+cwZA/4TQOwuM8DWAweplGhJPCSdiU7Z4Md07XvosDIgcFOByOLGpnlCS7MuE4sFD8tj69bS46Qm5Px2JxuuIY/6fFn7Un6MmOxv5ZKln4tzdXr17e3MSRkXNxj3UbcmQIjACdO39uc/HiczpQ7znLSHvB8XiWcllmTY9itEJ1m7YUA7x3lvJIjqmRro/w++FN3aS1SS3VHgJqk0C6IFd/VIa/nOt0yug2p3Tink4ko1OeWoBEe3EsiFOTCTZC+kbXvucasMBy749kIKXp2cFA3nojDswybWqht7AoOqaXVPi03J53D40qk98IY9iYbpwKyhU18E0zKhmdGpokJZ3yCr0gk0tacAp6uizyGUt8O7QOTNllXhyLlsGwTXvCJV+UK5s7C+j7vPUmd+GVDGQhpi+uumkZwPELC/0ecBb4LXO4oPWI7kwASTNadZhjYXrANTDxRaximcwn62sHv2iEZjl9fnETfTcr6LmfFXzLbIgGaDpcCaVTZ5deiYeqIZ76NcOhw+0yKxeAEA+dRtgBPYyJ6SmjURpmrmPSkp92YFZiGJa5a7xcO03wHW0a3b62CzH4Rx6Q+LRUfV2ItdzwW3KVX/TTZotM85p0vyAtNIMrakr60Levi7fR7RpWDawaeOtq4KmOxff/UBZv9wPSfae+0vmrIySuByCGEtd+yM+dL6opDIE3ThwBG/oyFPXvN+/gsYCWN+55Gxsjj4ep32SX82BHQkik8fDH0bDDwVt7GfF2LORckOYpNbCVHDZMkUF3OCJMXzopY/m4FiNj/GN0eI69DA7wvNOU4Hlrj+HPGgucHQxyjPErN3SStd74k04ZcShwlvrhz6ODxczIs6fRiv29U/4cyOA+I+fCjoUMbNaTmaaMcnag8qJn8VHhfFbEntZXMHWKERvWQbD+4q5GNXAGcCpYv8CIA
o4Dax3uydjnJO57clBYa4HMjNzgoHj6lWQkUF6Odr/H21uVgbKTxta6yHFNC9dZmcFBfAfa/Yn1FaflEJ0RL9Zy3JG8l9nK9tJLciyu6ZC+Wz7hmwckozb7Wkty7tx5ORYXNxcuPGcHgjwO9EOWU/s4V0zvYsQii2VjPFGfmnJF/bq+qg0pjm5J63QXRF99v5uXtQ/UhEJdiLZhxpVpeMB5dEP3oSVexb9pdrtuXiapNtvp3HfAlkh6HJimgYPXjgX0gcGk5dp0MmIWSkknjvANR3lDn3wcx/69GJ62o4+3V7VDERmaPtRorwmTUia9Ir+ZGKhho+e5/ItMLb8LXrQXGbv8OFfQ5TqoUhjCpANuZz7GLwxxMqx5Wy9AJ7iMyC5aTZ+0Ud9q36yZyMhopkQximG9LURU95kyiay9xgKaI5h4c5CssCSTMgygJbJbFuTpsFBRipBbV/DuQiBfp3PtMg1+zRSU1icMCrbhtvJKgOYPzScGylXZhp94LD1/sJvOTK1lB4L8hmnepGd6mWNAEVHxXSDd5Z5r4zaMAbe+dhyLSdYtHJPMLyE0SeATqaKr4ovMxYPYLLeTxYO01lGBLhiN7JStG6cEW1OhVsdiW3Xr3aqBVQNvOQ081bH4PrabfeGFFIr+na5THShGtw0DOlM+vufxkuBOuuGD5Qx31MbPiAWOQbYzVbaQbeTpgRtjkKS8vcNgx0nw/Htdn8GREC4OBiMFGa3AGGXEIIuNcRTi9NRjooUTKww64BhF2JORDywBA8JvMWV8UFY7F4L1tCYMbhwLXb3GQlOgWLzNAmjWMAjIZRlv2nnKKLBNLs7OnkYTsl2seHoBdzkWymNcg12e2A2Kw+zsWOgJxaLufY1wnNF0IraphQfToG5yrgbToASDwc9oBo4FunwkIxNn4aHymbaFrAQMdcrcxjJasWErHaN3wwqHkZCbXrB+Q+d33N48kk5ZB3FK/BkROSte586csxHA+o1Ll1/ZXNOIxR3pAt3xZBVlT7tixIKRGeRjW1r4+ckrQ47tcA8OzvkgvzgWcSSA4eMpYqLUwWkYo9XmOp1rt6vOW7BgF0PE1a94B9LjwGZkbIwYID9t13wiC3G36UaerqSb/05a83UrlnE/04tjEafYcLT0ki14i9EJ2aaFw7HsCMSvo9QpfVKHmfYErTgWwHaaInCJlJSvddsqcaFJJ8wapARJI3W3rA065DYfAU7lMbpJVH9RuhWxytJ1wMNjYDR5J45kC8RXPpEvONajk/U1aKIHtXP9dvmwQQK/Yxyyh5z8Tp+D/ootsH55gRMkCexYCL7pIRSyMJ0vsUIkraOiAZ3Wc8pUiMApz8HXhirkVgDkt0L0R79EgMb8MZUJp+uqeXHNKEwRbRnqtmlOt6O+TaMLVzwGq4kOJUYLhjdAoNL+EyfP+cUoqaVJ8QibRXet40f2PIRrvBamdYluyOBr+/faejBay5YbMzOeyTXtSNRyjfI0zriWDF1fna7kRTolLoQQ1gmgNIyzfbM6Fq3C9bpqYNXAW1cDT3csPvzhzUd+vBwLlZEOcBmhKCOvnIq586bvdF+bL2vHnae++gHjkQYMSGgKjgcNxh0jFnTv6X4zyuD1EXqjnQPWsqiyDVBfcTBkAONs8GYeeJyN7tDTcVc3Ll6sw/Biba1JYNQAGhgbD2SIMzKB8UEgnXUJEPKbfPK1u1JPjWI61G1N62HqEtMseLBzhgN41ocYH5N+MOgZteCNP+sO2Hnq9L54a7SEaUCP4C+6rJtoxwL+nFnBCAEnXeP8cLI26yZuYsRLXnTI+oWzp3E8Djxa4d2ZMJRUBpdJxj66td4lnx/q0i6OITQ9MiA5KftdTaFiFALH5ZZ43JGj8YxGSY5qdIEzLDxyorUeB1q0fV+jItd1hgbToG5pZANeBNeHdEAdUGbvtMWictHwA1pGMM7gvkYrDuQMnRQtdsNimptxXY/UMXt5qx3QmBSyu1fe9jtBMqeNxKDYcnQNoHzhNn4ljXsbkzLAvbYCo3IHFn2lLBiXh4emTb27qQvM0orWcFQYj3AZgCmaKn/aSGi7tQuR6yJHyg1n0hYnIUawEi0U38mHT7157/L4WvyLXIwzWrR/Faax/UNxaSqdi6XTNfLPWA3UkrickssYXEvGhkNJ4Luu1HaH0oYs4AxpjGZ+rVxwSTVDvvLxLekKLUOySgYRdX3wu1D7W5yKGrFQn5NRi5QWesOxEO+H+m0wOqgfmHn4qxVh2fomEjm/yt96TjnIUUz/QzfWUfDzbWzB1N12ojNpO60y2jFKa3rb4OFuFqmVpzoWTaMkePzSip6YsM11h8TUx1S9JJ120xAjMuQlZ0ktuIVk5W3T7DYM9CIzfJpS+rmitkU/snVOrsES0/wrMSl8j7KUTM0ht5UImeZdSQsfYTQScF3fpJEFfOGQ9MFv/y59vzmB30HrrK8zZ9LQbzvevHTiwwj0pxJ++WO/ufmfl//fp0JixV018Fmpgf2jJzZfcuY9my/az9bCn5VCvg6hDjSjxs+Z1wF7GMhTHYvv/fAPbj7ywgujX6SjyehEGUaTU8EjzP1nf9HRVmdbl/B3Mg8BOQPQU+rSgaUT606ZdJwFDrHDOGeqDwZmRkygoU85FZkeFXho02tnoVx6bjpw4FEWuzMxasAbdbY3xejoXZH6LTYdKrCci8G0CDpXpggx7QnH4j6OgAwO7+IkxwJDntJkJCXTKCgbZTyi00biFGnUQM4FJ1wf2LFgq1WtL5AM8GOK0PXrN/yGlLfaLPDGscCgZxSCcyluyoj3mgjRZhrU81q/wI5RpxTnpGwJqpwEysLDg0rOJ84GvHBK2FaXQwTRKVvDogMeHCwgvyscdo56VvI9w7oR7ebEAvIDOTsn5ATc1GgF6ypual0F062kXDtzLMJuJ4+2guOE4wAPjDfWA3AIH47FPgvYNXJBOY560XgcQ/SBfEzlYfSKesMJYipY6raexUrv9uPpeNzXhwc45eQzh06jzj39ycZmG/8898sJEx3qf9CbiMw0iQ/egjF9WQqzY0HbgY7MVdP0iIVozz9c2nzTBb7jsDVN8cFxcJ7zU8+SVunAtHFchiYwRQfaTwyWC9mgkd9I63SRwRmCiH4NbPhtuoGP7I5DcA7Uje6tU5VfEe7qOgE2mvCjtymPKPlNWz/sUb6Rjq504w+X3KMP17na4HAu1M7RqdsmIzz1Rxu2AyT5+O34fBDRkbYtgL8tf8oDK8KQF55JSvoUJzrnLWWRLjoD1VSwvsRri4d+W4SMTqV8AZeMTUMJ4ILnJKvaX66H1FdhNfHCIXXON1TDVLmHQM7sLzQ0CaDkBlesgXRtKLWJAZ5I3wLtFiO+Dd3XJrXISNuEfLA6vTlyJc34umm9gFHacQx+zd8ydpmVCH3oGKZpKb0XZFtWUzHmoDPaBHkLcRMbZS/67wTH4oP/4Uc3P/E7v1KaWi+rBt4+Gvjck+c3f+MLvnrzVz/vK97ShXr3u99t2/SNFuKpjsX3ybH40Rd+zJ2pu1P1rDa2yigaoxf15KDTpff1LcZ9
EiKb4r51R40xJlqGWR58MY4wTKADqSzaZkF01k7IAJVRC18eEuTboZDj0UZgr2+AgB8jYaq7clJkMOBQ9JkVXtCJAS5HwQ8dHhj6YBhCHwMD2qS1cc4hfjgWdirkbDBy8aoWcCM4sIyWxAGTfE5DTtHRxwf4ydjeZwRAIxaMnDAVCF44J0wpeihavPFnRyXOr6DM11gcrUXbTJMSsM6zyNqKs3rrzy5YlJvRCmRIYPQnaygYUWENh50iOQHZrUqHBDGCIlyuBBteMrI4WRweHJx3hMPwdH1Gxj67OnFQ4bOCuanzMTgnAycEXTHi4EXh1TaoPyShbBzcd1J8XmWqmeTINDTVAdvVaoqVt5vFIRHP0KmdssoQxFlh4Tq7Y1GvBJdS8GlHtLn+yBkgXzLx2Q1dt53fzkXfA9+0uk0dRqPTDCuOuS58eavaNDFaJR6U1T5oT/qo7Y8pLUUMeELj9ZXSJo4BXA6ErlFF9OE1FkrD+RCQrvr4Ktyi/9glQikZGqEDDGXxbcnje0MEpqS0TMBb7sGEyOG6B7YUUTziXLQOnU+9IfGgV/SVSX0QpG19AxMg66YQIkt+iwY2WJzIOBaKy7Hgt+ZdoYhLT/5MIxL0K8gFC+AyYtE8i28Z9+bTX8nSXemzLp192NUyH5LhcjaZqisXWWmjjuCDzupDr0eYDVanAKb06LCIzjozFnQrogtulEPRtjImAHgaYvkayNHdUEbkVa7RoQekr6lNI+7QdlrBjbgjhV/xzrNOfJN6G+mKIKJ1JM6LVIGY2EZOAQBjpy0g/m6dllacFpoFJEKtCtITtqCV3zXU+YJStKG+8zu+u/A+/ZfP1IjF6lh8+ut25fCZ0cDqWETvr9uxoNPmDZ4NZxvPipdjQMfsLpXe0XCOuLecus/qPJVCRwqO8Ksrdqffb5G7U4YXC7wzlSjGJUY2xiSwEGThdpyNcmR4WBVTLvABDhyMdWgxWsHbeg5v81aSTIWQUcFoQz+cMDTAt+GqdOKkZXE5B/lpa1g5FBmx0A5NcgYst/jEKIGW8C2ERJARYsdC8iPDKRnce17jod2cZHh78bJ4sOAaOrzhZ/0ExjpG49VrWseg6UcYRIxyMIpxoDf+x3XAHAWmHFzhSZHRDyMWrJ3AqXigkRavDZHMnM/BgmUb+DLWObsDXbuOhfsMU5kYIZCO2K0JpwIng3SmY93Rad8s1maBuB9OysvakiwKNx2cPwlD2VjHwogKU0rA8SgUfDVFak8jFzgMnr6m9oDjRZmtbww90fD5HDoYkNEPAnmU0m1R+dRR2meuANh88NXA+up2kSv5PSKCvvtjykUT+lZok6ir61nxbivRW1qy6Zj7QjMGClIjczkWoo3cKUsTLrl16/an8jevyLc4FjgrBHNVhatpCracCdpux13uLS7G2y4XLYZPlSnR3Pu7+FRc6lKgfJGv7wOlTOG33IWyXFRmQnSneNVdcAM2487xUceWVfzRHrJQf44QlSL0e3EG2aNu+Y3LqZBu+A35YDyPVsjBkC6jb/AS+K26IKIbxwKnPfSAeAbHscoSDH0v6JV0iCIH8HZkLmfnpEXpbiLTeYvekhJ8GoHu9TEK8jZC0xj6nvEaKP0Hd8OpyM3j9WnCgzoKaSIlLvepleTw22wQpRdqU3DWArDwa8BGHVQ7oSlQZlEJoUEfKJL4uC2UTKRP2pFsRgx6oN1eApdvQ4hdIJMWmgUlGsnr1tj35LecQHT+ktw0V8cCXa1h1cBbUwOrY5F6ex2OxUfVWZYRZ+Mz6xls1I03doKgU6V3dBqRdJUME3c36hQe9MrledEGIe8h6aBtDJNPpgL57Qyc1JtzT4VidEIGaDsW5LO7E/KAlznz3YmLBga9PjggwNmYtlORt+IYDY/YlklSmU69qYQOhifpY/pNycUUIdZEMCXpDs6FnAHuLZMwmMbTIxb9lgt14Dy0DJzEzYJuRk96ATl6wFBHkV5sLaMcpyLrHjRFSqMN6HlPBjlz4Pa1rgJ4z33VNCn0ix48oiM8Riao4DhExDUiIpmZyoVj4TUQGP7SRztV4GPocyYGi7V9ArEcCxH11K8rVy5vLl9+2SMr6IYRHRyBVCrqyrSfHqHxQXqSc0987JTcueW3v0fkuLBDlXeb0pa7LNA3bzkPOBAy/0ySbWlPaw3JccGiEz/IpQP0gDPjqWd1TzslnRbWbUgSRbSqO2WQNNJmgxIcoZtGO9FuswVvxEO+0vbhLrqi4U/JALs2wKFNGZ/V+hFfpVNkdeBSzMBvuWwoD7rlWGAYl2NhXMrcfGengngRb5lgYnmbGfkI5qBrR4FbUhXLHXTETPeUE8civ5MhvhVYiGAN2mDMATlyHxhuSocFZt2YFwklt5CAtxyWoeUBJrrXD9EyzjCtT14gZKRCOizHYmvUypzETW0rcpVjIX1nly34UC6+HK2CVem2CtkAE6yigKSk2+m6q3JBX7iDJO1yogUgQWktI7duK64fZdX8HJOYUR+jsyUwVCBVAcTWdZIGqVm2oYjGAw1IARWc5ZyQnbxkG3Euo9vrqPuFbqgW0SYOu1AoQPoA9BH3CLrO133+EKtpFIouwLQMjb/kNg9dJ9SmZzilb6t3kWqbX9ItRdGKhI827wTH4iO/9fObX3nxv8+qXeOrBt4WGrhw7GDzte/60s1XXnjvW7o8Fy5c8Av7N1qI13QsPsJUKPWWMYh4qz05Ft2Lqp/06IOMQ3fMSs/zTd3p9HBYulO9VKxHAcY0n+7qMdptBKjjB54390yF4q06U3Z4o81DHyMeXhi2LMbGCSEN3Cx0JV+0DZM34TgVJ0QjU3YiI4Y5OAhs49qGuSTSfTp+yhDJoQdPcsC5o61mPWrhE7nLiJMOYBwM4crIAR48Tz2Sg0RZkIPpWO1YwBuDh8XjlOmYRiIopw+x0xQodqZBJhypA73lx2Bn3cG1a9cyRUqLrlVYO13UFfyRCJ1gcDCVg+lerBNh0hcwTMXak8HO6AnBepCcpzQasifH5ZScC5dFsKjoihZrv6JdoFiwTbBRj9ySGWMOntRF61EFMS2cB6ZCcX7FbZ0WjgxHVT7KAA9P2dEoCgcUspieNRnIwhQoHIozZy/WdLEY4l3vR4aBLn1LDuqa4Jqj/ii7U4h2rBJ0gcfj6WlX1CA6GqFoj/spAl9kctuwvtF7aEeM1AHSRXa1V+kGPEulL8jPErZsMd7Br9+F365XmxW9yN/lFlyVaVwlB5SB40+c8qffMbJQacjokGynOa+gK9cCmgoI5qMy9lVJoYceQIxMdWMSTdOEkgJ7yyVtO6W/IlK+SQM3+EWXROuY8lWgjMjGv6+5jw7LKZO8/s3jXPPb7wXvgnf7Fyl+7vkNhZdHLHAsTD+8lrIsvFGkJfYVPSsgt68Fp4uyE0K+ylUydxb6K7iiOuCSEeRZDjsWLrzYVp/l+i2ayNLB+tFNJ7VIEY67hu0rKcSVN4Cb2nIdWQI1ZiWYTzEbMFArZfS1y9P3oRyMIatv+Uo6JJCtcblLziKDJTdgaxPshgqXBT/3oz7rti/DseiKdEF
5nqQPBG6m1XzMTYVo/MjUVLm+MxyLucRrfNXAqoG3nwZe07Fgu1kMID9oZWhhwGH40nH6g04q7kXVwOiePpeONm/xuQHOoIrS8bs39jfrEIgAQtSjBRjDuvGIRY00nDh+1MY003NsTogPRqxHGsxT/JjWJOMBQsjx7BGNemikoNcS4FxQFvOSMYxxkdGJuDoZbUgZhO4yGFgCcm9jU1d42KnQaIV3hRr8MJDykMHoMm2loBsO7WMK1ikOmdPVC8i1xgDjHnnvyvBGZzaoZWQzjYkRBk7PFoB3nGLUBsfC07hE/9KlS17rcE8jJzzA+k37Vl2Qg2NRH/JYLM0idox5nLUOGLwHLAaX0c8BdpSXF8B3NVry8isvy5G5Knlu2TmLYfzIcgIkrbk+kNUna0vXbIPLdrPUEdOgbmsaFdvzsvaCbWyZxsZIzKv6sKgdveBYUHfZ5la7R+mUb+rcFaB0j5LI4WQEA/l6vYLrjoYEf+lxfJSSNpk24bIqH8eL/8MCdW1nWREomvdhgEo73LGY+FuWGPhQQ8eUAblN3QxCvMWx7KrfdlDiaNC2eXPebXxH/OITZzL8Y3wTX5woeFpX4k/hhDaYU+7omaxJMCUjE8FX6FmOyONGonbVLxgMGCUqSjopMz16gNxDteNAESRxIro6L4KJTtNYyheERbbIt8hp3ZUuiXt9BU5FOWnWtXUXmnYsSjeUq6dKoShUsMgQEZvfIrWALI5k9X+Xv+GraFWUpmc6nUU5TcaEKhXelCu34G3jgtDw2xp16pbuoNG6DT3DDPykbX+LZpM39nbuuOtykdDwSkPWvnVW8ZrLTfp2mQyprxmTNELqI3FUbSa5dmLABBp8YQBYwXeOB7fTO7txpvRGn/VUeu1ycNvabwq5io4yW4eIYTItj27eCSMWkzbX6KqBVQNvQw081bH4/g/nHAsMUQy4Nrodp4OsDpUrnxmOTpZOM1fiGHX0q/oSvdG701EDSHoFP/hlAPBAx3HA+MQwbcfCRrWJMa1ERrKNy+BjQGM8wJwpQTZCjx0ZOyBBx0EsgcOxYJSAt5IEl4Xy2vATTfiUeF1G4BhZuaM375xsfV+GMsalp+bIiRF4aGuEgDJwjxGJY4Exf+pU3szvaWcopjSdPK4pT5KBhdvomPIxqnLj5nVPg2LBM1O5WE/BKAfrKzD2b+m8icuXXvHOTEyHojyWX7JzQJ1PKpc8BByuTPnSKBBTsuRMHBcP4j3CAF8M+4P9M762HPckG1vQvvTyJ3SydqZcMVpCuTgLgEXhFkgJvudtsAxPRj7OX7i4Oa9D8jCh72qNyB2NWODgePcoOTWM6HCg3wM5VazDYEte+B5HDjkUbEnL4m70gsFHfTJV6qh2lsp6G9YspB24XWI1uM5watUWaFsKbneOVVzp+dM9OBUCTVrgrE/Ra3zSgVmMCLWVygcGdtClLhom8bRJ2LRjQXsxfXHzC+ZibrmQTzR6mk7ToJ0MJ2EWFnjjUG5TgFVgkWWSxyWWzswbvQqegB5QncNUZtMtGPJyXzSRx86OaOh30zS5Qqyvpq77cMKgJRSzLkelIH1CX0MraYXTMMhl2VJ+YFreUWb02B8Wa9MuuK9rw5FmfH25HOIBaf+G1U6Hnqpcg8+km+CX3LqETq4mPhjUHXpSaNqVOi676YA3O9Mu/NCI/MSpy1z5Rh7dC5ZYaJZTQTYh4InTuSjQJpsXNGaKDd/5I2+i42hImXfjmHYjwoe4/8VDMvJZYCCwrXtnBmKSj4Tg0dcm6N60I4SoVHpf61Z4/v3p1hRCpnAbhmvRGejRIfJC2+VAfuiQVnDmO2gqb+Ar0elcg/+hb3/7L96eNbrGVw2sGnj7aeB1ORZ0ksOxwGBto0TpBC7DqVCnno5WGepA6WzdsRpSsG18VOdPJ5sHXQHo0mdJYPTDtxdoY3R621HesMsoM2Px9iF5EcWGKj06NFlEjfHJ23mfI4HBDZ5Cy8W6g+zypKlGDnGQ/EZZvF0u9f55K557CobhfFcOBQuibSgLltEV1lF4hx7JjnLttMhg4VkS50hToFhbIafi9GmdmC3Hgi1fKStTq45IVhb4Ejgn4q6mOGFM8nb+JFOntF6BtQlMkbqmBdTXrl71zkx2PpABh0GGOUYrhp4/0oWnhzFiUcYTdWSnS/KiIy8I17Qk1nswiuFzJcTzmIz6uw/ES+dnXNHailfl5BxT3ZGPsU94VWtUmN7E2Rc4IHeZtiXe58+f13a4z+tAvbPOx7FgOhROAc4BC8LZ6eoOu12pnOjPzo30c+7cOe2KdV5TxU7bUH7AlBXxOqFF3DhW6ODIsxl9StusOur2R5mFMRwL4VJmfxC6AjTTjNOA5rZIOmX0yEKAjLUNQ35oA0cefN2+FPc99a/2Ejym3MkZcluUDs2khOC3sIW7GMTgt3GMIayGDpLl4XvgYQD7T2nFX4gTf2AjL6MLI0iOoRvLFAOp85tWl8HlEV07OtBHpqYhnaG3oW/oIevghwQKvjSfSivZKzNwfIuGiuPgUvsrZXRGl7V1ztXOgORCTkYn/NIh08jidKJHfhupmy5j2IUBaZ0/BCgp0DMBmJiWJZRTl6+kdjkrvUFdroXOgrUdswqdJK4GDwHaPsEyKIlrQvEDrNOURW7LrRoyqL/HF3qeoAJiLNehMRIzJ3914gB23aN/B+hV1gxOHuVq8SKd5B5plLUxcvXvJVSnPOgsvNX5NLs8D0ZW02oCdQV6huk4vPU/brnJfyF2nSpRwXqVHMDz7dSp7KSONGMsX03hu77jby6Jn+YYz5zWW19nlqSh/+53vJZPL7A+1XMsZh5rfNXAqoG3nwZeh2PxY+oP6ewxqtVhl2PhuPRB5xNnQd35ZEywLoFOiX517p2BwVj3lAl3weq4Cs4ds+KsBaDTo0ODNoY2b9a9xqKci2eewbEwZcQzJTh1pw5jvxkux8JGK7T0F6wAY2izvSo7JSEH+MjoURDKirOgsuNM8enpPxgmLIIGF6P9GHD1wehiFyZ2Y/KOTBj0SgOfMjAVCofCp1LrnAqMesrK1Cfe5KO3+zLUmXaEY4EuMLj3tT0rO0GhC6ZIXZdjwXavOBUiMPSELOiXt3CUCdp2oNjuVbr1w1pKw9E5zvQsOzr7Xu/B1rDWleQ4hgOg+1v37myuXr+yuS4n5pHKfFz14ZEFnAvRuXfvgRyKm3J0tFuUHIsHKitTnZ5//vnNeW2Zuydn4KamQN2Xg0Id751kC13tyCW5LmvNxk1OG9cDC71zaOBprcn4nOee3+x7V6zjmnKmMuqcDdoBU6v47MnhYI1FO6redlhtwm8rqUQFeOWBTbugHdHuiBUADUchl8StG6cm/amOhVDcooou5gz8oDHehivO/exYUE6PbpX8CNAyWWIRCX6NeElP1GGM+dDSTUuZa/Mx/zKUSxbaBrjWSKGhhxQ85bYMleYUyxTSfCNXDHURgBfuU8ul9mnHAkB+I/rMjkXKp98d9EeATpe6ExHOnDphuYJbsi+JwCpU2VPOyOk+xeWObDgWvADI9KfSZePpmj5oYW
D9mHR+P24XgptDtxVSu2TbEAs0+YFpyEpQuQadHfoLduGCiswjQ/rb0ikZS+4AK7pcwA5KdD9JI3DdlZ5NZXm1PkjNkTnbPec2MYNGb4U1IzgJfpVXMnZxcmuBC6BpNHzK0pmLHiBY7SoFjkYe4y2wWVUtRxOcMht11JNhgpDvrhMIKsX/+qoypYwNCfLj5WoKb6ZjQZ/SYdFfp6gYqgzKDByf1bFYdLPGVg2sGniyBl7TsWDxNqEdCxtwGA/qdGxA6EoHFAeg4rrnZVUMEZDBT0fVeP0WmA6VhzpTizBO6MhwLHgTSjq0cRB8SB5OhQxWRi1CEAMM6YQrWPjItPR5COYjwxdcdjZiuhVde2//Cj4yEzDiPEpSb9jA9VQi4drI1pv7nm5FOWBJR/tAn15DMRwL6Qa5MfbvaESD6VLehUnlQV+MJtix0GjF2TOnN2e1ngFng3JCCz0ztYrTrK/rnAjWV7C1racoydA+LccCI5G3/7dlzCOHy4VgBIQTf9RhvapMTFXyqdoy3nEsCOiFnbYYoTiQLOfOnvU0Kw6ho05OaNrVntZAsCvTTU1fuqIF2zc1aqGK0VkWGj2Rw+FpZeLLSMU1OQd82CHrGembNRrPP/85Xj+C7q/qHI6HGnXAoTijqVbwZ03Jix9/cXNDC9BxpHCYzqh8F7Ujwbuef5edKRroNU0JY5EtDtjF8xoBOXdRTodoSFduB6rHXm8xHFapwyopvai2rf9ud2S63UoXXAlRnb9973zautu7eCkL3dDWgELx0KO9AGt5Kp/2ZDjVT+CNoTgOYGh6VEn4C67ZGp56ZQoOU8u8wLjpULHw9zXwiN/3cLEDUjBJB76NCCBaOcHve+uhdIFGfA8IN7BVW+K3ImJKqN8q5eQj+RzQBeVT/YK/0Ew5wSTggo0FxknSN3oFoqFglfiQZcAueaOMki99SGRDhzgSlJ3+JC8r8sLCcEWr8fm9EIbc3CjNv3HynG9lk2O6vlJOpxTIJH9nWJ8udQH2pfTtW9g3IUW77JBbwKjLyEniopdGJK/yTaO/Qg/9dvmMMUBLQtHsaUHCBHn6Do/gpT+l5INjsg3fX8ja8iYbCaqYFOoQHJIaBzrWbsupzM4LJSDQTxMqnbi+lNG6IlplAR6qvhfdBbXSArCI5gK2ANDpAK9Fnk7l2tI4LqDGgWfH22EBpmX5rg++eSMW/D5aPkemr9Ynus7vaHUsJvWs0VUDqwaeooGnOhYckPeRH2vHohyJYWjtGGoy7jIdKl0qNjqdEv16P8jiVCx4LRfdW7/htUElI8DTFoQMLiMBrAnINCjm4DMVKp31Q4wJGQ7wopP323TBM+XJC36BUwaGhZiIU2jauGOqFAaQUjHi4G15MVSbrwzh7DqFYcQagsAxAuEy1mPCoxXSDaMSEGF6FVOlvAZDceAxtpiSxW5MZzRicZZRCzkLTIWCNtvD3tfb/1s3b3m3Jwx68DFsDjSywSgHxv5djSCwDz8a4BRsdET5cUCYkgTO1qF4Ms5xLhiVYfQFPeKocDjf/p62fMWx0DoI7zSl8vp0bE2HwnmANw4O28xywjYOFIusMfKtN5XN2+5qZIWtdz26ItxTWpR+7twFy8jakUtyTJg2hgPALlHIzDa6f/DixzQV6hYV4NGKi5o+9ZxOEz+r6VPQuqWF4qwlYTocU6A+57l3aTH4OTlB+2k+ooNej0qHjGj0m3I7EKpH+BDaMfY0JNWRHRBlYbTgYLqtCo42YAzyoA2sPnYapGPgbIAW3eFUGCa8GMFa2lNwqCu3PrWDposskSNGdxhLiOKBUfwAx0J6iIHsH5XzKVMHygB1oWUaVv0mYIi8srZzFYLFlphObwIwroxFX7PuQAhOfmvQC00bHZJvHM4oOta16qSdC7Tsj/JEpoJ+h8uNcIApuZDZQdck526kV7Yu8DdR5cW5iSFkZ0J5eUHRjoV0SVp9WgewaI5N2e2IdNEFnqvhG9Cy5IZv6mAJDVQpZDlbX8qirszQugJmxg1Oy5Y7fQ+SRMaNUaM72jDQyUua4vwnQ9HkAeV8lyncmwaiABWJFrlIW9KVr5vOJb3xiHaAZvOOIJ1jjNxApOq+CtBAvprHBE5i79KHELtlCk+whGS8RhZk6SGEBdNZS8RZj30VnGUJZYOYnBI7vfF27zvdDluXlcRZnpL1Q2/iVCj/diRG6r6lzLXT0Fn/XtYRi20drXerBlYNHK6BpzoW3/uDcizGiEU6oBhZMrj8ppY3r3qktvFlQw5G2fq1++0YXxhnGGoxWMDpANzcgbXxjoGG0WfHgmk7erPPlCDm9/Mw4qHS29PSSQvUb9NxBI7JgGUOP7Qx6qHZb1Xp+L3+QvlsZYux3fwtswAwKOF7XGcsZJEwZWJKUaYTYXD7gQisZAEWnXAeA3JjEOJY3JNB78Wf4nlE8mPMHzDdR0byGa2XOC0DnFEYHCRGN25oOtF1vfnnw3QnRkXQ2VmNKHh9hUYZ7iod7VFOtq2l7Dgyt7VG49btO1q3cEtTpO7EuZAxj6wYw5SbUQbvTKX1Ggda57F/oIP25LCcliHPlCjKyhtoRi6I39aIyY0bmuKk0Qr0bZ4y4jGK0Qdv1Gcj+llNRzouOidPaS2IynlX+FeuXNm8dOll6wNjkylO6PvWrdubT770SZ9vgcPC1LDzrK3QqAUOyL372p5WzseDh/e9LuXsGUYznpNjpqlUkgF5cCZoD8fl7DCNDPrbzkPap9tr5dmgp81KH36AElGg7sfDlgTp3fQNm/aG3HxAxpgEv52Lfhij634YxwA3uDkgMwHYxbEInYZCErfHarceteDNu9pIpDSJfEUUU4W2/wS3NW2pZBbLYhEZKIbDkmG5uhy7V5cd2pKrP7R1nAocHyUWuRqxkN74HYUpCkOHHXYcC9eGS15FXCDbA7HeG11XywPPkgn+SVuchzhl3LMWqkctuA+siz7RJEq5u+zQoy4Hb4ulr7r6Ak7JH1KdWuUx0eREF4o3iNuev8KzdNjZxho3RMZNEzTJRd5OjkQqpVD4PgRvSkIPYMyjFUuZlF5yQd28ChfJm8yMGzi+Kx/8BlwiQx2BfPzbmjHqjB99ofK5XK43yrAwLYIF1WXoSl8AH2fcKYbpm+U69EFbWZK34lPyUmKA4d+yADR4PNqsjsWstTW+amDVwFtRA6/hWPztzY9qxKIfWhQQQ597G9MypLn3238ZeHYa6NjnfrNg8+Y3uKZH5zoHdbR01jaCZaRwxSHg7SejD3YWhmORaRb0zRhxwBEw6pmmwyfTdJLPSECcldGDW35gOKiOUQvKAy0gkB/HB77AkE+ABoume8E2DwivvfDIR7Y+5THTOyMxesDUIxv1ku0Yu0IxSoBRr4XSOBcHciyOavTgrhyQazLgX9H2sdc5m0JToZhOg95YB3HuHOsKsj0rp2wzNeuknCxkw7BntOCGDPUbGu24cQPH4rYNeWTmaYeB7Olkwsmaij0Z8Eyt0kF7cixOaf0GoxBeR6KyAo8urmltxW05Kqz/eEZb92IMewG8dGXjWbo/elS6klGPM
wKNEyf2JPMp1KMpVFc2n3z5k5tPyIG4e1c0hH9KzgwNDwfo2pWrfsiilwtyKg7kbEGD0QymSgHHtDScivPn5XRotAKHh0JRWzggOGaczs3ZHxy8N863wKjVh7KgJ2A5F4X7dmxphR6x0JUQx4KSqyaNG+MYN4T2KZN0GFl2VgRDnUOHAgNjOOkFx4u43+4agPaYtgoGvxcJKLT8hiDhIPaUjb+0W0YtMIYz2tFggQWqg+LF345FGcRKUpAcJYPf9BYPypmPS2xZUhiKUwh9hVDRj7Gt8omH42qDOBsE8HzAnPRsx8Jk9NV0zDG6MkLdAxZR/V1ZxKOLSogM3Fgcyiy+o6zROeWPw5sRRp+0XWssstai6sayQaPkRvb6RJdTm7BY/grvSCtE/k0oQi2Cim5ogzD0OWBNRtDBJh94woAF3yl8KeabJcWkSl5DjKyhzehnpFdEOBPhuWqc3OXhOlDFwKXcwlW5Ipkzt2CBI3DBkSNO+dzBBnImNegANwfrsCkXTYg6WnQKPvIqo/SYZPF2ErI2HiCFW6QtS7OZ+RMHtsuDPJ2vNMTgM/KJT/zNxwC7MIUHvAJwH1qnQkUZ6/eqgVUDb1kNvA7HQidvd4eqYrprVieJIR7Dqh0LGWA4GsCml3VHy70dDhtp6oSLVnfM3f9WsjvXdioYjcCx8KJnGYSMLmDos/PSTMeGu/lomg9GpA2aPKRxOnjrH+OHB4L+TFeGvoxNLwjH4JR8TJkiPLKzlFEIjFiZR35Q4DDck1EfxwKDPYY2dDCqma5DGlOOPCVJ6w1YL2HHR/R7ChJbzu7JkGa0gjUFR/W2/ZYM6ZcuX95ceuWVzW2cCjkllPEEIxwy/M+eZS0G04+kOcmPvHaeFMfZYYTi2o2bXjx961ZGK3qeP8YdhjVrGMDpg/nOyKmA9p7k8JkVkh8zmRO3gUdn1zSFiQXkXoNShiP1jLHfD2kWcp/Q9Cc+ma52wo4GBtylK5c2n/jkxzcvy2FiqpbP6FCZONPipuS9LWeI+j3F9DDJg+FPfXkRuJwipqRxiCBOBx+mkWHwU+d8qB/4npSTxonhR2o0xc4D5RE9RjS8EF11hAOUEQvqOm01zTV1T5lp5dSjp/SIV9o1aocnDkOMKfRqWtEEWLSuwNHmBOu2Rn5YBUb4QFK/+dTvBnnIypdxl9EgDHfyLLWv0N4OzVtX8c+ogmAAS/F84bZxm0Jlu9zQbNmIN4zTxTN6yDVbzYoXjoWdizBjkwc7F/QJOFDVN7jNQGgK5j0zmfLERXfRYydb9qnsi3ODHKV3GbKMori+SFNbom5Ji7M202zmXR+5ws9ltS4Ds0zfkmRDaZHMt3zNsllf6DBtJpBUB4BltNMOyGi8ujcpI0QLlgAYw3EnCOmWuloC/V7fJZI+Q2nKcIrAkWYLTdmmspsIwqAXHKQdSaZUdBt0S5zc8N1iJ6UpCJfohNM5Xa65vgMqYP2D1+Iu9VL6MFHRVWjpcm3uQjajQdGwkxiFCx8DDhmBqRSJsYsBmnIbwFSmrwZvknXf4G/2iEXreJLQ0U7v9s9vh6lQPNc4lXcNqwZWDawaeJIGXqdjAXo/TNIT0qFj9NEBtQHGlCKMPvLcMXU+xpk/gvVb2tCgM3Vnrwg43UfLJtDbRr1xlEFBx43haSfABqIMRYyWoo1kfkjiDIi/37iTp3Q6Q4xURiyY+uC3xXpI8DYTfIxnphLZWZEBahlsBImWePjtPMapLAg6WBYfYyxDD7qkoQOmXZ1g9EDyUc77mnbBNCichbt3tEZCRg1lZW0IhjfODHxP6+08W7EelUF8XW/vP/7SS94+9r6M7mckI/AY/my9yvoKFc+LwjlQkHIy/QpDioXivN3Pdq86hO72XS+iDl/e6veOVegxU8pYP7Gn9RX7ciq8tkLyeK2C8lkgjm4YGbnBzlSMHGCYyVBGs+RZZ65TyUG9uPwY7Yxe5IwJHMTLWpvBiMXVq9dsiDGFielsrLu4qWlfPKjAYe0JIynolgXg6A794kye02jFBe0uxagFenPbon5VH4xC4FCw/Sxb2CJLOxOOy5E4pvwTx1gXEoenHYtuoyI1gtsId6pXt1kpHSeaECM17cgjdnJgTSvZgal2RttrgwadmUbBUa5YHlyVK/p8nKz7DsDhHPT6Cu677ACDXUiFAjwGta5un7RRAyW/SSut0yNBofel5GleJlF5LMTnPnzkhCEf7Rt+qrsetbAjoXp1f8C1+4apfJBskVKYYjJdVBrdpVydDG+HuoY3euqP2mqNTlh/grOTUWkNFzpz6aC61EcxMd3BE4hmP4QPJN/0ki1eUnOP1hbwxGgXJoW+AZ4QF1glb90IQ3D8OdApOH8GShbfSQVHN0U/9Ap+B63rfFAwXt9FTrdVJ5UMLQssFtCO6Ro85zWAZekbQEeJFjzpZQSXOXctY9cJ1CdIsdP9xCxcQr8heTmwSNupra/BtdRWcm4xWWAai+wuRbeRGcrxAC3Jup+TPhsci9YvQqLj/r3kZdnd1bFYam+NrRpYNXCIBl7DsfgBTYX6qNDo+vikK6a/d4o7/hhNcRzydpc4nVPeVJaBprSMXCi98LY6fGgqnTTZRf60kQI93tBn7USMZNOHh/ksjkaklJxlZMSxYIcdGR0YxthCXIWHgewRC95i620+xl/LjPMCXztL9VzhoYF5gMnIlqoY7vghLBjn7Aev1xAdeLIj1A0Zz4wkMMJBgUxP+fBiMTrTfs5pag/Td65qGtTHP/EJL2R+pBEPjG0MbXZrunDxotcYMGLCmQ84M8gKvTboPJrCTlRyZG55rUUWUlNnwFGuLo+vKp8XcGta1qk91idgoGPcn/JCbvhz5sRtycUDhTfn1H6cCjlHktnrSgTH6IAUap0KxA4WzgMPJaZCvXL5kh0JOyAarcHYvKORipui7a1yhYuDgFxM6brJNC6NwnDPehA7FZq2hRNkHSudesZYRAYWv+M0ZMcy9CtHR2Vhy9ujOoeDaVknNT2Lso0RC+SVTmgHBKo49ZvKJp31Ij29DxgesN2OaBfwZnTMbQb8eggzQoXzipBQt1ENv7AynA0QGzdQnoMxnAA92mo+xCfHIhBKC9+QDozbPriSl3yH4l1oLqsLTXrBCCOglnXSzcQXCOSAsOXjd4VDYecijsWQs50J6cgLuV1vC10z4ytEfRsJ8p38TtF1FKZkMC7l7LLym5zXUsThgYHrDX1Y3jggLkcXPszqWzLSNlx7May6L1rASnGzqJUZuqn7hn8MrBpDeAiqy9bXWa6qj9BKeYkrJiZyLAgSB1RXs77crpspGY4DELk7C1TiwBuXhAqkzeIYAtr6g/cg6Vju2wkyf+gUo3AtwoXodlJJXFQ7y53lMbOkKas4pmxKbT0DkB4hoC6jBDfPUYASpeSPYxGac9l3HYKWyFcIEiaavnVi6zwY0GkdkDJktVBBcFU0rnW60RqLN++APPqzlrHE8GVOQ24+wLZjwflEa1g1sGpg1cCTNPAajkWtsTC2ukd6SHpDd7B5uPCQoSOKoYshJkOzjHLSnUcaRgUP
awxcPQzzBpf8pQOGbpwKdWR0ZvrAClgbcaKLYYvRG9qhC+3IlA58MTRkUHiEQW/1/cZd9ESTIDFMp89jYLSh5YWWDUrkFzhx3kxj/CIHD08ObMNh4LHOmo49OQG8dWe6Ebxu6627HQu9eb8nJ4O3unlrW2WRY8EuR5zZgHHAVq0vvfzS5r7gUTSG+2kZ04xWsL4CHTyQsc20JMqP48PVdSLjE+OOdRCcm3FHaxnYFYqy+8FgivrSw2EYWMiNI6GpRSfkXOAIsN6CERRkYk6611aInx9A0gGjFHy8TqHqOPUqnbi+8NpYIJ7RG/Rw5doVOxfUCaMkGPeM/LDz1S0tCGcNCjJVtWh0RNO65FzgWOAE7Em2s5KHQwV95gY6Fm/aBUYk9QE/HDXVltsB8rGAnOldx/U5ocXeJ/VhJylo4gihz4yuQSkhLUNxyChfrcvttfPRgz8qpKeDiY5HqQQLjtssMK4PHtok6w9e3DhebVQ00JkVpmuXv+Hg2fmuQ8MsjgVwcz7waQv8bspwliwjDPiRUqxHqV0GciPDAofkLoyvkSuii5fquKdBOW5HKDSh4zavOsqaC373/FZbJyI4VBCclAlGFYal1/pKuuGMS/riMODs27nQb5N0Oz0Slnobeqy4UqJDdGuy9DeutMitODk9xWuWLfBCEu6uEUkaAb05pvKGbOiRVyCGMU/DIw+5CiJK3Lrnlt86QYmzHOgyKI0YsG5HlgFCztZXEwwp33eZjWkBAmRcJ1JfRPRVV0lhmsiySAB9A+iSK2AVKzmhE9yiqDvdK9D3E4pFx5w2iOhuq/xATbwMDIGiNUWM17CUGToGE3zLiBANY1p8KRO4QZI062k7rWn4ugU8wYEneQ1juUsGkkX2zVxjsatHikXYLX/3ezxfWPvGVFpeeq1h1cCqgVUDh2ngNRwLtptlxILQPWW6z34zRXdICp0RRiYGuKcRYUzQgVb6MNRloDoN58LwXEVBcHDAFGIalI2j6rwxI4FhOlQb1KEdGonT14Onjlr4eVubN5UetZDBwRQRuCAx9OBvR8WjCHpjjrxDPhfKxWZROAZkDNg6c0JGC2spoMUC8H2MWBnnlN0dMNN5ZCD7HAs5FigauSgnZfDOTFo7gLHLuo1r165vLl16xWsrkIPdoy7ozRC7JJ3ReRcY+jgWnPeAQ2GDHiNNgUcTRhRGFQfzsa7jrh4COBZ5KOBgZW45285tFnl0AAASN0lEQVS2k4VTdVwjAqzj4GGBE3NB284y3YgD6zD8HwreulfZfHq59ADvlCPOFvnQhzb1ZgdMBjxpN3QGxU2tGcEhOYWONLLgaVtK49TtIY+NvY11kUMFc3YHC9b3tcUuDhsOASNMYietU+ZM84qzo1GTai/A4lTgxOA04VSc1IF6LFBnKhbOiNdgUN/6QI1/vnzra/KWtoXhUwaseB/RaIadGjkxgclIVvTd08aaJrTiDJPiGlNTUGstmWmv9fuKAIJBoKS1UQxm5M01hkH0QJ7LL5QhJ/rgM2hyG5qGN8r2fdPvfK5WjSPRk2mYtEqgOo5jgW74jWHAt0OT3xmHWWZKVPUJaj/mY3L6Klqw2JXPVjsZAprzIgOI+hfP1nvauUYt/HtLPbT+Gofr+BRdpZgHpU191m8L2HJEzKygkEhEfAF16CgpW/fuK7fqoIDM0xwD3/TIRi0T3bleXI4mUYxLkk711ThkOLMgDoFXibfwDrvp+qL37ND67Pspa7TTKuIAiZ51K3EmSgsYYo6MEZnS0EuVBdBJr4NIoxkssInqW/Bk840syVUCaQFyXlLy3c+6GbbrfisNGvXZIqy0kC7oltnXSqvLm3mOxazHlBT1tPI6JfrmN8QzjL6cXQKv6rDUlzR1t39bC/QaWzWwamDVwMY7mX7gAx/YUoVe7D96xDkW7AqVTjQ9X77T+RDvjogUDE4+GFwx3toAloFBh45FWDjgMY0EOAxB8qDHhBuvr8BYwEBRIrBM/+FNNZ/Maw+95g/d4VTowUOnyacXb/vQPR5I+njERPxaXo+wILN42OEQD8srcUEBDscCx4GdmLjnbScOC0Y+6ys424FdlygDowW8cb+Dka/OmA/GMjIAgDPDguSTMnqZ/39T5zRc1Sna19RZPxAu075Y//D8cxe9YJn1Ffc0LYlzJFj34OlZkpXF5l5wrqv1IGFfFQ+vuRA/HgQesVEazo7PsvDic3SbNRxHj2t9iJwLH0yHI6O1DLzlZbSCA/gwEnGocHRwhigvZRRLG+nHlUetQh+H4Z4MTRZoMzKAjthu9r7K5BEd4x/RwmwtMsexULnHdrzUt4xrdMVWvRjivcictRNmqi/aCzf8UY/oHMeC+mqDFv3yRs2OBVdGK/ZxLE5b370tLfXogmBKUCAFrvngzupP7aSy3B5ttEsBtEHa79FxenZEjIGLcU1LDu7iVISHGVECGlddE09Oy8Jdt+POn/PC0UBFizg4qV9kea3QdBuu77f4iGbrQEUSffhE/uFMiJf5+pp2nvKjA+lSus4IEddKa52LaPNtOXKNfhz3G3x4wtpfAyQ6lxOhtsdoha+0/cnRsS4sePCbH1f/1RWiqS/qv2DVLvnTXV2doa8KyjIst6Lj9qMo1wUjdR86BhvI4VQ0SkYh8w9Lh7k+WvZkDJAAPvYtOk2TvNJ5yOYbNoLawnQOeAXvbIEMjAC4LhoX0ORPpDoB8hVvHZDUuHNaY/dmGrkHmiDIuTxKsZ6ntFlX1EeLAF7nofOkL/kkJLWlGiLD1dx3v5w6eAtPYi6SWlwR2cHtOmhAE5Vs8mA+Wx0LdNdvIem72Q6dXf34Xe3Wx66O1vtVA6sG3nkaYH3we9/73q2CD8eCcyzccahv3OketxC4GYa6jAhPEdEVQ5DO3B+AuuNVGkYq01byNppdmTJikSklNfrAw8C0syZie8QidCFL340TwYtfP0CUwK5STLPx9B/erAIEXxs6wUVmZOPKyARXb58qo9HGqjv/yIoBe1JTcWzIaovVyJUysiiaBAx5RgsYzcDxgDcjBBjMD+9rOpT+jksn7ILEgmd4M7JxvTrrhxptwDDmjIl3aW3FOe0GxTSgGzLCOb8Cp4EHkPWp7V9Z+M0uTqYnvEfif4/REhn0TCtiGhbrFkizc6N8y6WrCijnQG/3ZfBfPH9B27me9/a3N3UgHqMJ97RWw6Mr4mGnQg4XOsRcRUYcLe/ipLLCA2eK2sKgZ0E4MrK7FSMt4KMj8K/rjRdOBTjoxfWidKZFRcaHhm/nazyqcTRF3w6ndH1MZ4zQvqg3t07REEu3J+9SBU85JXEstKWunAuvvZDTw+5RaaMZcZjbttux11ekbUBTbO2MISuw7RQzckE5ySfHo2WS006O0smDXgIGdEV94UY40KwMrq5bMxXPJzy4EalDO1QmXV9NM7f+bnBfm9+c+FjatrCjGsAhK2/yU1bHMcCr7KZltiq/2k12iEJP1XZ9jX63CEeVZgB6yuaYeSolvCVAeCCI2iROhH5ncS7iXD/UJgo5aFPtBjD+0GcVIMVzKpyczu8z+k9bbx4
NW6+1oRD+ifi+o1QddBKg23FDhH/lmhf5MAiTykFVC94cn6mkzha4sOLepSy6E60CdS70J56LzBEhkhcCoEOyJdL8gbK4FMPZCzSxUNG3/mnT1rNxkhMUQfo2cKEw5Qso/Io2sk86Mo2ZpvKBt+4EF9wB1RGT2Mor8rNuBrBKh+wNUq0pehQP1ydXITTNScVFJth8L6ULzmfSsdhtY0uZUxbqDeeCtRa8SGJ0kLQOXd6+X6+rBlYNvHM1wFT73R3kFsdCIxY8YB7vHB9XWHeqGYXIegRP26Hzr07XtKpb9ltfGYYYp5yRAAydLR2UHQRYFGNGEqBlerpmREGdsY1KjD5GO+JIxLjD2OCeaxl6ykfG4LbRGBqkYaBiSNvZkVy8EVeyZfJ0IKWdkEHKAmIM8rzZ5OFQsiODDCuvGRAWZYF3nAscDB12J4Jsicv0KWE5/546aw63Y+SCqUeMyrC24KKmJjFygQHMGRd3BPOqYDFUWVNwQudbsFUsniFGOA4PhhTTmHIatqYzaU4sb5jsXMjJ8AiGYJCJBSJHRAfH5eKFi15MjpPHKdvs2sToByMCLrOmIFkZ1InKwEJ66oJwV6MgTOfC4D+phdKsDWFhOsYea0KQByeE+qY+GbFopwInEg1TZxyGx6gKijutsy5wRig7jlDXWxvp6B4dAAsfDi7ECEW/1A8jFjhMjJyc0FQqzrlg2hmnsfPBwTiucy/YwaofqLwhJZ61NDgcqX/n9xtzCgwMoxVqtzgWDqrMjGakPAGLU0E9g5Om7DujuNxKHI5BACzDkMn5wEA3YaZACnn8OSxgS/qEG6DCMeyCMPMIraJa+CrCKIdh4TscCf3GVIeWhbZlHGjz+2JEspwI0aBs0S03SvcvwaBmO76KPnriY9ol7jBogBE/PnYs1BaYBkV7yEJuXjZEHkQyDeiZjnATCX19WzZ+GBS2+FOulAcAIyrSJJb7JouaHguVaL2RaWWGn28hg4AVuv77nutIm2jZrBPaFs/5xoU2drP0C5xtjIlvUVr0EtxtBqQlpDypwYXtNnbDtgDGgaUQwDH2VHbDKSNShWq+URGpJa8ujnVmMbKeBOc8XX1f+h76h3MAtlUBjU73tYg27emW7HwWQHiNekKGJWsHM7iL6NHhZ8qxGDJPUs7R1hvX2aHgvvNm+DW+amDVwDtbA/gBOBdzGI4FU6HceVSuO8K5k+6ekw5VMHRQ7VjwVptPOlt1pMrzaIRxNK1FBodhy7EosqMz5t0QIXQZEcEpyPQqOwc4GHYscEjYqSnTk7K2QCl2KnBSMCz8CA4NUXRHilFjmhiAvMCP7BjovJHGiE2Hy45OeruvNPIwaD3KIniKgn4oVz9DPLVKuDgpODcYQcjmczmUbtqSp6ctgceUIEYYMIqQA6eBw/PYMQpnhaHne1qULUY29jnPgSlSnG/BidU+PE+6xsC6j6EvWnZWNFx9lWFrOQqkebE0jgX6UJmOyfjmJGvWV1BX4F7WLk5MX2IKBQ7EqEfJjk7YdYk0Cs90E5wKnDq2x72oNRrnz56Xo7MvnpnyxII/4O2ICIf90HudB44D9Y5DgZzIh8F+1mda5LwK3pLRThh9iINSbUBp6ItpYuy+xRtq606NeV8OGVOpnsWx0DS1E3J4WN/hOlK5j2u3KEZ6Tsi5wPClkWHs+uRu8Tmi0ZDUv5ug8Lp2u41ntKYdHXShGnU7AAM5yDMN3S/YtKnQJDVOr6gLv0N+LwWkdP/+uApg4C7gSp/xB3Hhpc0jW8Kc13QLppnXtXmCS3zwVSTkRJN/8cinZFD7tKFfPMFDrxDIb05X6X8pY//GAJyEgC+3LkNot462ZAMGx6LensaZyBoL2oaNIMsIDeSFqilP38t95MJhppwgqE5FHx4OQ0YiweNi/QDeckPBiUHju0pkGBwtwoApUvzmttJ9t3wZXqwLrGiWUJ24gFMMya6PrtK6JTaHcY/Q5HVvq9siZzIRx2kta+gEL9nBJX0JXVqlQF9ZhlUkdRBIY4SIAAIIn1kGfDnDbeGmPpvfxE3tK6NNzptojnsiYmAc0X5UziI8+GyJU2m6jGD56k4tz3KDY3wKykd853I2smkXA+A7OC68z1bHouV0+57a2WFlbNj1umpg1cA7VwP0k8M+KjU80bGg96QTpG90/1KdzHjoQEwfG+cY4TISTVzPUUAzXSmPgZ5OkqkSMbjS2fY3nX8HjHsMtjIqZZywABeDU1HDYbxzCNt9GRnDuVAahk5CDCQegD3P2yMgllm0xcAjFpKbdCvGRlHm85NGvq+VHz1gXPE8iWOCU+CFzjK0MUp65OQZyeo3txIGY9iGPm/ohWfnQ/GHGmImZGvW47IJZIgrjV2UMpWKXZeOeM3CgYz3M2cO/HYfx+KkRhXQBzXEGQ+s9WDrVk890vauPs3aQ9jSB7KKzknh7WmKEMb3HU21uqZzKzj1m5ETnBjMn9ZJ73qF8b2EOF3HT2gHJ03feu78RRn1B3YaL+lwPUZLcCQYDUI/1Cx1g65xKhgt4eEEzCWtMcEQxOHkEEHrRE4Lb4nZUpbREBaaE6ectCKPzrB1rRwoDgqEFqd4s8MVjhceM04Fp3JL+XJg5FzJGWLR+ilGMlRuzr1AHjsVckSYXuZ1GNSx5E0brJaoBNrd0SOZSmWjWRA4r8iLTG4HgjFN04BKQozsxPsh3Yb5gFEEGtRRw3Qe9x1MFTilDTh0TJoD6Yrkq9KSB/wywiJWBhPwQj40aYHgm0aRqEtkCU7z95Xfm3EEKMJdlo5znzRaV+RpvViOpu/fLQKlfCQ3H8spHtYdeteHkSs7Ev79Z+TiVaZDkQcsTr7LIZpha9rQXUJkgytlMF2cG/BIHHhbUSUnw/IVMad0XUCr0qnXmQ7JTXa5dixI0B1ARJpu0+QKjK/1tQMz+Bs/oIFvx2BJW2CThqHfZZwlW+ASGzAAVaYvuncZIDfJRZ1Mt2Hmgi7IkBptSHHTqbI27VH2KlthF73lYlrcCgCYfBboaUBqIJl363akVsTlmhJ1jw6m2kbgwashkQPISOAbR7/7O7+nbj79l64Pl+8NsmsabxB9RVs1sGrgbayB3b7FjsUv/vK/3bz88iuj2Ol+8xBauuKR7YgfLOov7QDoiZEpEGSlE6V/7k4XpoHXN9m+L1jdk0RIn94PIGCXD05MkEVVtD0iYkMiiPBqfBPTV3iG/iLDct+jIMkzglEHXssJ7yV7yIXBzBt6DEuX1mVGJ5GVzhgDiLf9nmahdNIwkpjK5AAPfSiPYQUPDDJAJw6OHCs5Bx5R0NVOFgYyzpAC8DbOtQYCGh4l0IOcAG3KiQPD1B9SvQBbBj5vepEL/IbFmeq1MOAmp3QmOpEhC8FxDMjHUWG0ARmGplCB8uIUZtoZOkI21lfYOBcyMInj1Ale+vQUNcuLg1bOmOjDg4XxnoImmdENIxs4FR4pUfloh9Cz4Sk9e7QMndXia9d1Oa1ps9Eh5UcY1z1RlV1frluMYW6jprmdpe66vhvXhTLB5ct6LD0vqYfFWuPNr2DEf1RGo1neuim0XJ
pGkFruRtuloxKZtq8Gavxt2o3f7aWvnQ43B0ci3NBJ50XJVtHgsqOXlscoyjNcwYRnHIcxYgKM4bocwiz4ZruULSmRNd+dt1uexg1U3/3hrqOME9ofnt4uxmFUJwZb0U8FtwmJxhYZ+D9es0A/WbI5Z4tYISXfv7ti65SdeqysN3RpCXa4vyFaT0JqHk/K7/T3ffH7OrpeVw2sGlg18LbSgB2Lt1WJ1sKsGlg1sGpg1cCqgVUDqwZWDawaWDXwpmtgdSzedJWvDFcNrBpYNbBqYNXAqoFVA6sGVg28/TSwOhZvvzpdS7RqYNXAqoFVA6sGVg2sGlg1sGrgTdfA/weIxL95qPODhgAAAABJRU5ErkJggg==)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4io1vzkzF683" - }, - "source": [ - "We want to run the Google Cloud Vision API on a large set of images, and Apache Beam is the ideal tool to handle this workflow.\n", - "This example demonstates how to retrieve image labels with this API on a small set of images.\n", - "\n", - "The example follows these steps to implement this workflow:\n", - "* Read the images.\n", - "* Batch the images together to optimize the model call.\n", - "* Send the images to an external API to run inference.\n", - "* Postprocess the results of your API.\n", - "\n", - "**Caution:** Be aware of API quotas and the heavy load you might incur on your external API. Verify that your pipeline and API are configured correctly for your use case.\n", - "\n", - "To optimize the calls to the external API, limit the parallel calls to the external remote API by configuring [PipelineOptions](https://beam.apache.org/documentation/programming-guide/#configuring-pipeline-options).\n", - "In Apache Beam, different runners provide options to handle the parallelism, for example:\n", - "* With the [Direct Runner](https://beam.apache.org/documentation/runners/direct/), use the `direct_num_workers` pipeline option.\n", - "* With the [Google Cloud Dataflow Runner](https://beam.apache.org/documentation/runners/dataflow/), use the `max_num_workers` pipeline option.\n", - "\n", - "For information about other runners, see the [Beam capability matrix](https://beam.apache.org/documentation/runners/capability-matrix/)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FAawWOaiIYaS" - }, - "source": [ - "## Before you begin\n", - "\n", - "This section provides installation steps." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XhpKOxINrIqz" - }, - "source": [ - "First, download and install the dependencies." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "bA7MLR8OptJw", - "outputId": "7b06e838-5e81-4094-c345-b129e889ad03", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - } - }, - "outputs": [{ - "output_type": "stream", - "name": "stdout", - "text": [ - "...\n" - ] - } - ], - "source": [ - "!pip install --upgrade pip\n", - "!pip install protobuf==3.19.4\n", - "!pip install apache-beam[interactive,gcp]>=2.40.0\n", - "!pip install google-cloud-vision==3.1.1\n", - "!pip install requests\n", - "\n", - "# To use the newly installed version, restart the runtime.\n", - "exit()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "C-RVR2eprc0r" - }, - "source": [ - "To use the Cloud Vision API, authenticate with Google Cloud." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "qGDJCbxgTprh", - "outputId": "3d127ab7-abb9-41cd-e3e2-a85236408e9a", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [{ - "output_type": "stream", - "name": "stdout", - "text": [ - "...\n" - ] - }], - "source": [ - "# Follow the steps to configure your Google Cloup setup.\n", - "!gcloud init --console-only" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "74acX7AlT91N", - "outputId": "db6ec03e-745e-4b22-9d8c-97b7dbd73633", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [{ - "output_type": "stream", - "name": "stdout", - "text": [ - "...\n" - ] - }], - "source": [ - "\n", - "!gcloud auth application-default login" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mL4MaHm_XOVd" - }, - "source": [ - "## Run remote inference on Cloud Vision API\n", - "\n", - "This section demonstates the steps to run remote inference on the Cloud Vision API.\n", - "\n", - "Download and install Apache Beam and the required modules." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "gE0go8CpnTy3" - }, - "outputs": [{ - "output_type": "stream", - "name": "stdout", - "text": [ - "\n" - ] - }], - "source": [ - "from typing import List\n", - "import io\n", - "import os\n", - "import requests\n", - "\n", - "from google.cloud import vision\n", - "from google.cloud.vision_v1.types import Feature\n", - "import apache_beam as beam" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "09k08IYlLmON" - }, - "source": [ - "This example uses images from the [MSCoco dataset](https://cocodataset.org/#explore) as a list of image URLs.\n", - "This data is used as the pipeline input." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "_89eN_1QeYEd" - }, - "outputs": [{ - "output_type": "stream", - "name": "stdout", - "text": [ - "\n" - ] - }], - "source": [ - "image_urls = [\n", - " \"http://farm3.staticflickr.com/2824/10213933686_6936eb402b_z.jpg\",\n", - " \"http://farm8.staticflickr.com/7026/6388965173_92664a0d78_z.jpg\",\n", - " \"http://farm8.staticflickr.com/7003/6528937031_10e1ce0960_z.jpg\",\n", - " \"http://farm6.staticflickr.com/5207/5304302785_7b5f763190_z.jpg\",\n", - " \"http://farm6.staticflickr.com/5207/5304302785_7b5f763190_z.jpg\",\n", - " \"http://farm8.staticflickr.com/7026/6388965173_92664a0d78_z.jpg\",\n", - " \"http://farm8.staticflickr.com/7026/6388965173_92664a0d78_z.jpg\",\n", - "]\n", - "\n", - "def read_image(image_url):\n", - " \"\"\"Read image from url and return image_url, image bytes\"\"\"\n", - " response = requests.get(image_url)\n", - " image_bytes = io.BytesIO(response.content).read()\n", - " return image_url, image_bytes" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HLy7VKJhLrmT" - }, - "source": [ - "### Create a custom DoFn\n", - "\n", - "In order to implement remote inference, create a DoFn class. This class sends a batch of images to the Cloud vision API.\n", - "\n", - "The custom DoFn makes it possible to initialize the API. In case of a custom model, a model can also be loaded in the `setup` function.\n", - "\n", - "The `process` function is the most interesting part. In this function, we implement the model call and return its results.\n", - "\n", - "When running remote inference, prepare to encounter, identify, and handle failure as gracefully as possible. 
We recommend using the following techniques:\n", - "\n", - "* **Exponential backoff:** Retry failed remote calls with exponentially growing pauses between retries. Using exponential backoff ensures that failures don't lead to an overwhelming number of retries in quick succession.\n", - "\n", - "* **Dead-letter queues:** Route failed inferences to a separate `PCollection` without failing the whole transform. You can continue execution without failing the job (batch jobs' default behavior) or retrying indefinitely (streaming jobs' default behavior).\n", - "You can then run custom pipeline logic on the dead-letter queue (unprocessed messages queue) to log the failure, alert, and push the failed message to temporary storage so that it can eventually be reprocessed." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "LnaisJ_JiY_Q" - }, - "outputs": [{ - "output_type": "stream", - "name": "stdout", - "text": [ - "\n" - ] - }], - "source": [ - "class RemoteBatchInference(beam.DoFn):\n", - " \"\"\"DoFn that accepts a batch of images as bytearray\n", - " and sends that batch to the Cloud vision API for remote inference.\"\"\"\n", - " def setup(self):\n", - " \"\"\"Init the Google Vision API client.\"\"\"\n", - " self._client = vision.ImageAnnotatorClient()\n", - "\n", - " def process(self, images_batch):\n", - " feature = Feature()\n", - " feature.type_ = Feature.Type.LABEL_DETECTION\n", - "\n", - " # The list of image_urls\n", - " image_urls = [image_url for (image_url, image_bytes) in images_batch]\n", - "\n", - " # Create a batch request for all images in the batch.\n", - " images = [vision.Image(content=image_bytes) for (image_url, image_bytes) in images_batch]\n", - " image_requests = [vision.AnnotateImageRequest(image=image, features=[feature]) for image in images]\n", - " batch_image_request = vision.BatchAnnotateImagesRequest(requests=image_requests)\n", - "\n", - " # Send the batch request to the remote endpoint.\n", - " responses = self._client.batch_annotate_images(request=batch_image_request).responses\n", - "\n", - " return list(zip(image_urls, responses))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lHJuyHhvL0-a" - }, - "source": [ - "### Manage batching\n", - "\n", - "Before we can chain together the pipeline steps, we need to understand batching.\n", - "When running inference with your model, either in Apache Beam or in an external API, you can batch your input to increase the efficiency of the model execution.\n", - "When using a custom DoFn, as in this example, you need to manage the batching.\n", - "\n", - "To manage the batching in this pipeline, include a `BatchElements` transform to group elements together and form a batch of the desired size.\n", - "\n", - "* If you have a streaming pipeline, consider using [GroupIntoBatches](https://beam.apache.org/documentation/transforms/python/aggregation/groupintobatches/),\n", - "because `BatchElements` doesn't batch items across bundles. 
`GroupIntoBatches` requires choosing a key within which items are batched.\n", - "\n", - "* When batching, make sure that the input batch matches the maximum payload of the external API.\n", - "\n", - "* If you are designing your own API endpoint, make sure that it can handle batches.\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4sXHwZk9Url2" - }, - "source": [ - "### Create the pipeline\n", - "\n", - "This section demonstrates how to chain the steps together to do the following:\n", - "\n", - "* Read data.\n", - "\n", - "* Transform the data to fit the model input.\n", - "\n", - "* Run remote inference.\n", - "\n", - "* Process and display the results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "LLg0OTvNkqo4", - "outputId": "7250b11d-a805-436a-990b-0a864404a536" - }, - "outputs": [{ - "name": "stdout", - "output_type": "stream", - "text": [ - "('http://farm3.staticflickr.com/2824/10213933686_6936eb402b_z.jpg', label_annotations {\n", - " mid: \"/m/083wq\"\n", - " description: \"Wheel\"\n", - " score: 0.9790800213813782\n", - " topicality: 0.9790800213813782\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/0h9mv\"\n", - " description: \"Tire\"\n", - " score: 0.9781236052513123\n", - " topicality: 0.9781236052513123\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/043g5f\"\n", - " description: \"Fuel tank\"\n", - " score: 0.9584090113639832\n", - " topicality: 0.9584090113639832\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/05s2s\"\n", - " description: \"Plant\"\n", - " score: 0.956047534942627\n", - " topicality: 0.956047534942627\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/0h8lk_j\"\n", - " description: \"Automotive fuel system\"\n", - " score: 0.9403533339500427\n", - " topicality: 0.9403533339500427\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/07yv9\"\n", - " description: \"Vehicle\"\n", - " score: 0.9362041354179382\n", - " topicality: 0.9362041354179382\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/02qwkrn\"\n", - " description: \"Vehicle brake\"\n", - " score: 0.9050074815750122\n", - " topicality: 0.9050074815750122\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/0h8pb3l\"\n", - " description: \"Automotive tire\"\n", - " score: 0.8968825936317444\n", - " topicality: 0.8968825936317444\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/0768fx\"\n", - " description: \"Automotive lighting\"\n", - " score: 0.8944322466850281\n", - " topicality: 0.8944322466850281\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/04tkfx\"\n", - " description: \"Tread\"\n", - " score: 0.878828227519989\n", - " topicality: 0.878828227519989\n", - "}\n", - ")\n", - "('http://farm8.staticflickr.com/7026/6388965173_92664a0d78_z.jpg', label_annotations {\n", - " mid: \"/m/054_l\"\n", - " description: \"Mirror\"\n", - " score: 0.9682560563087463\n", - " topicality: 0.9682560563087463\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/02jz0l\"\n", - " description: \"Tap\"\n", - " score: 0.9611372947692871\n", - " topicality: 0.9611372947692871\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/0130jx\"\n", - " description: \"Sink\"\n", - " score: 0.9328749775886536\n", - " topicality: 0.9328749775886536\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/0h8lr5r\"\n", - " description: \"Bathroom sink\"\n", - " score: 0.9324912428855896\n", - " topicality: 0.9324912428855896\n", - "}\n", - "label_annotations 
{\n", - " mid: \"/m/02pkr5\"\n", - " description: \"Plumbing fixture\"\n", - " score: 0.9191171526908875\n", - " topicality: 0.9191171526908875\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/02dgv\"\n", - " description: \"Door\"\n", - " score: 0.8910166621208191\n", - " topicality: 0.8910166621208191\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/09ggk\"\n", - " description: \"Purple\"\n", - " score: 0.8799519538879395\n", - " topicality: 0.8799519538879395\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/01j2bj\"\n", - " description: \"Bathroom\"\n", - " score: 0.8725592494010925\n", - " topicality: 0.8725592494010925\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/04wnmd\"\n", - " description: \"Fixture\"\n", - " score: 0.8603869080543518\n", - " topicality: 0.8603869080543518\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/04y4h8h\"\n", - " description: \"Bathroom cabinet\"\n", - " score: 0.80011385679245\n", - " topicality: 0.80011385679245\n", - "}\n", - ")\n", - "('http://farm8.staticflickr.com/7003/6528937031_10e1ce0960_z.jpg', error {\n", - " code: 3\n", - " message: \"Bad image data.\"\n", - "}\n", - ")\n", - "('http://farm6.staticflickr.com/5207/5304302785_7b5f763190_z.jpg', error {\n", - " code: 3\n", - " message: \"Bad image data.\"\n", - "}\n", - ")\n", - "('http://farm6.staticflickr.com/5207/5304302785_7b5f763190_z.jpg', error {\n", - " code: 3\n", - " message: \"Bad image data.\"\n", - "}\n", - ")\n", - "('http://farm8.staticflickr.com/7026/6388965173_92664a0d78_z.jpg', label_annotations {\n", - " mid: \"/m/054_l\"\n", - " description: \"Mirror\"\n", - " score: 0.9682560563087463\n", - " topicality: 0.9682560563087463\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/02jz0l\"\n", - " description: \"Tap\"\n", - " score: 0.9611372947692871\n", - " topicality: 0.9611372947692871\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/0130jx\"\n", - " description: \"Sink\"\n", - " score: 0.9328749775886536\n", - " topicality: 0.9328749775886536\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/0h8lr5r\"\n", - " description: \"Bathroom sink\"\n", - " score: 0.9324912428855896\n", - " topicality: 0.9324912428855896\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/02pkr5\"\n", - " description: \"Plumbing fixture\"\n", - " score: 0.9191171526908875\n", - " topicality: 0.9191171526908875\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/02dgv\"\n", - " description: \"Door\"\n", - " score: 0.8910166621208191\n", - " topicality: 0.8910166621208191\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/09ggk\"\n", - " description: \"Purple\"\n", - " score: 0.8799519538879395\n", - " topicality: 0.8799519538879395\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/01j2bj\"\n", - " description: \"Bathroom\"\n", - " score: 0.8725592494010925\n", - " topicality: 0.8725592494010925\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/04wnmd\"\n", - " description: \"Fixture\"\n", - " score: 0.8603869080543518\n", - " topicality: 0.8603869080543518\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/04y4h8h\"\n", - " description: \"Bathroom cabinet\"\n", - " score: 0.80011385679245\n", - " topicality: 0.80011385679245\n", - "}\n", - ")\n", - "('http://farm8.staticflickr.com/7026/6388965173_92664a0d78_z.jpg', label_annotations {\n", - " mid: \"/m/054_l\"\n", - " description: \"Mirror\"\n", - " score: 0.9682560563087463\n", - " topicality: 0.9682560563087463\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/02jz0l\"\n", - " description: 
\"Tap\"\n", - " score: 0.9611372947692871\n", - " topicality: 0.9611372947692871\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/0130jx\"\n", - " description: \"Sink\"\n", - " score: 0.9328749775886536\n", - " topicality: 0.9328749775886536\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/0h8lr5r\"\n", - " description: \"Bathroom sink\"\n", - " score: 0.9324912428855896\n", - " topicality: 0.9324912428855896\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/02pkr5\"\n", - " description: \"Plumbing fixture\"\n", - " score: 0.9191171526908875\n", - " topicality: 0.9191171526908875\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/02dgv\"\n", - " description: \"Door\"\n", - " score: 0.8910166621208191\n", - " topicality: 0.8910166621208191\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/09ggk\"\n", - " description: \"Purple\"\n", - " score: 0.8799519538879395\n", - " topicality: 0.8799519538879395\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/01j2bj\"\n", - " description: \"Bathroom\"\n", - " score: 0.8725592494010925\n", - " topicality: 0.8725592494010925\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/04wnmd\"\n", - " description: \"Fixture\"\n", - " score: 0.8603869080543518\n", - " topicality: 0.8603869080543518\n", - "}\n", - "label_annotations {\n", - " mid: \"/m/04y4h8h\"\n", - " description: \"Bathroom cabinet\"\n", - " score: 0.80011385679245\n", - " topicality: 0.80011385679245\n", - "}\n", - ")\n" - ] - }], - "source": [ - "with beam.Pipeline() as pipeline:\n", - " _ = (pipeline | \"Create inputs\" >> beam.Create(image_urls)\n", - " | \"Read images\" >> beam.Map(read_image)\n", - " | \"Batch images\" >> beam.BatchElements(min_batch_size=2, max_batch_size=4)\n", - " | \"Inference\" >> beam.ParDo(RemoteBatchInference())\n", - " | \"Print image_url and annotation\" >> beam.Map(print)\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7gwn5bF1XaDm" - }, - "source": [ - "## Monitor the pipeline\n", - "\n", - "Because monitoring can provide insight into the status and health of the application, consider monitoring and measuring pipeline performance.\n", - "For information about the available tracking metrics, see [RunInference Metrics](https://beam.apache.org/documentation/ml/runinference-metrics/)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TcBFS0rluusJ" - }, - "source": [] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.10.7 (main, Dec 7 2022, 13:34:16) [Clang 14.0.0 (clang-1400.0.29.102)]" - }, - "vscode": { - "interpreter": { - "hash": "40c55305dca37c951f6b497e2e996ca59c449c4502b9f8a4515c118ec923845d" - } - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "paYiulysGrwR" + }, + "outputs": [], + "source": [ + "# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the \"License\")\n", + "\n", + "# Licensed to the Apache Software Foundation (ASF) under one\n", + "# or more contributor license agreements. See the NOTICE file\n", + "# distributed with this work for additional information\n", + "# regarding copyright ownership. The ASF licenses this file\n", + "# to you under the Apache License, Version 2.0 (the\n", + "# \"License\"); you may not use this file except in compliance\n", + "# with the License. 
You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing,\n", + "# software distributed under the License is distributed on an\n", + "# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n", + "# KIND, either express or implied. See the License for the\n", + "# specific language governing permissions and limitations\n", + "# under the License" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0UGzzndTBPWQ" + }, + "source": [ + "# Remote inference in Apache Beam\n", + "\n", + "\n", + " \n", + " \n", + "
\n", + " Run in Google Colab\n", + " \n", + " View source on GitHub\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GNbarEZsalS2" + }, + "source": [ + "This example demonstrates how to implement a custom inference call in Apache Beam using the Google Cloud Vision API.\n", + "\n", + "The prefered way to run inference in Apache Beam is by using the [RunInference API](https://beam.apache.org/documentation/sdks/python-machine-learning/).\n", + "The RunInference API enables you to run models as part of your pipeline in a way that is optimized for machine learning inference.\n", + "To reduce the number of steps that you need to take, RunInference supports features like batching. For more infomation about the RunInference API, review the [RunInference API](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.html#apache_beam.ml.inference.RunInference),\n", + "which demonstrates how to implement model inference in PyTorch, scikit-learn, and TensorFlow.\n", + "\n", + "There is [VertexAIModelHandlerJson](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/ml/inference/vertex_ai_inference.py) which is used to make remote inference calls to VertexAI. In this notebook, we will make custom `ModelHandler` to do remote inference calls using CloudVision API.\n", + "\n", + "**Note:** all images are licensed CC-BY, creators are listed in the [LICENSE.txt](https://storage.googleapis.com/apache-beam-samples/image_captioning/LICENSE.txt) file." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GNbarEZsalS1" + }, + "source": [ + "## Run the Cloud Vision API\n", + "\n", + "You can use the Cloud Vision API to retrieve labels that describe an image.\n", + "For example, the following image shows a cat with possible labels." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "q-jVQn3maZ81" + }, + "source": [ + 
"![cat-with-labels.png](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAxYAAAGaCAYAAACSU9UtAAAKrGlDQ1BJQ0MgUHJvZmlsZQAASImVlwdUU9kWhs+96SGhJYQiJdRQpLcAUkJoAQSkg42QBAglxEBQsSPiCI4FFRFUFB2qgmMBZCyIKBYGAXsdkEFEGQcLoqLyLrAIM/PWe2+9nbXX+bKzz3/2OeuerH0BICtyxeJUWBGANFGmJNTXkx4dE0vHDQIIEIAC8mFweRliVkhIIEBsZvy7fbyHZCN223xS699//6+mxBdk8ACAQhCO52fw0hA+jfgrnliSCQDqEBLXX54pnuQ2hKkSpECEH0xy4jQPT3L8FKPBVE54KBthKgB4EpcrSQSAREfi9CxeIqJD8kDYSsQXihAWI+yWlpbOR/gEwsZIDhIjTeoz4/+ik/g3zXiZJpebKOPpvUwZ3kuYIU7lrvw/j+N/W1qqdGYNI8RJSRK/UGRURs7sQUp6gIxF8UHBMyzkT+VPcZLUL2KGeRns2Bnmc70CZHNTgwJnOEHow5HpZHLCZ1iQ4R02w5L0UNlaCRI2a4a5ktl1pSkRsniSgCPTz04Kj5rhLGFk0AxnpIQFzOawZXGJNFRWv0Dk6zm7ro9s72kZf9mvkCObm5kU7ifbO3e2foGINauZES2rjS/w8p7NiZDlizM9ZWuJU0Nk+YJUX1k8IytMNjcTeSBn54bIzjCZ6x8yw4AN0kEq4hJAB4HINy8AMgUrMic3wk4Xr5QIE5My6SzkhgnoHBHPYi7dxsrGFoDJ+zr9OLynTd1DiHZjNpbzAQBX/sTExLnZWKABAKc3AUB8MRtjXABAXhWAawU8qSRrOjZ1lzCAiPwLUIE60Ab6wBiYAxvgAFyAB/AG/iAYhIMYsATwQBJIQypfDlaDDSAPFIAdYA8oAWXgCKgCx8FJ0AjOgUvgKrgJusBd8Bj0ggHwGoyAj2AcgiAcRIYokDqkAxlCZpANxITcIG8oEAqFYqA4KBESQVJoNbQRKoAKoRLoMFQN/QydhS5B16Fu6CHUBw1B76AvMAomwVRYCzaCLWEmzIID4HB4MZwIL4Oz4Vx4G1wMl8PH4Ab4EnwTvgv3wq/hURRAyaFoKF2UOYqJYqOCUbGoBJQEtRaVjypClaPqUM2odtRtVC9qGPUZjUVT0HS0OdoF7YeOQPPQy9Br0VvRJegqdAO6DX0b3YceQX/HkDGaGDOMM4aDicYkYpZj8jBFmArMGcwVzF3MAOYjFoulYRlYR6wfNgabjF2F3Yo9gK3HtmC7sf3YURwOp44zw7nignFcXCYuD7cPdwx3EdeDG8B9wsvhdfA2eB98LF6Ez8EX4WvwF/A9+EH8OEGRYEhwJgQT+ISVhO2Eo4Rmwi3CAGGcqERkEF2J4cRk4gZiMbGOeIX4hPheTk5OT85JboGcUG69XLHcCblrcn1yn0nKJFMSm7SIJCVtI1WSWkgPSe/JZLIR2YMcS84kbyNXky+Tn5E/yVPkLeQ58nz5dfKl8g3yPfJvFAgKhgoshSUK2QpFCqcUbikMKxIUjRTZilzFtYqlimcV7yuOKlGUrJWCldKUtirVKF1XeqmMUzZS9lbmK+cqH1G+rNxPQVH0KWwKj7KRcpRyhTJAxVIZVA41mVpAPU7tpI6oKKvYqUSqrFApVTmv0ktD0YxoHFoqbTvtJO0e7YuqlipLVaC6RbVOtUd1TG2OmoeaQC1frV7trtoXdbq6t3qK+k71RvWnGmgNU40FGss1Dmpc0RieQ53jMoc3J3/OyTmPNGFNU81QzVWaRzQ7NEe1tLV8tcRa+7Quaw1r07Q9tJO1d2tf0B7Soei46Qh1dutc1HlFV6Gz6Kn0YnobfURXU9dPV6p7WLdTd1yPoRehl6NXr/dUn6jP1E/Q363fqj9ioGMw32C1Qa3BI0OCIdMwyXCvYbvhmBHDKMpos1Gj0UuGGoPDyGbUMp4Yk43djZcZlxvfMcGaME1STA6YdJnCpvamSaalprfMYDMHM6HZAbPuuZi5TnNFc8vn3jcnmbPMs8xrzfssaBaBFjkWjRZvLA0sYy13WrZbfreyt0q1Omr12FrZ2t86x7rZ+p2NqQ3PptTmji3Z1sd2nW2T7Vs7MzuB3UG7B/YU+/n2m+1b7b85ODpIHOochhwNHOMc9zveZ1KZIcytzGtOGCdPp3VO55w+Ozs4ZzqfdP7TxdwlxaXG5eU8xjzBvKPz+l31XLmuh1173ehucW6H3Hrddd257uXuzz30PfgeFR6DLBNWMusY642nlafE84znGNuZvYbd4oXy8vXK9+r0VvaO8C7xfuaj55PoU+sz4mvvu8q3xQ/jF+C30+8+R4vD41RzRvwd/df4twWQAsICSgKeB5oGSgKb58Pz/efvmv8kyDBIFNQYDII5wbuCn4YwQpaF/LIAuyBkQemCF6HWoatD28MoYUvDasI+hnuGbw9/HGEcIY1ojVSIXBRZHTkW5RVVGNUbbRm9JvpmjEaMMKYpFhcbGVsRO7rQe+GehQOL7BflLbq3mLF4xeLrSzSWpC45v1RhKXfpqThMXFRcTdxXbjC3nDsaz4nfHz/CY/P28l7zPfi7+UMCV0GhYDDBNaEw4WWia+KuxKEk96SipGEhW1gifJvsl1yWPJYSnFKZMpEalVqfhk+LSzsrUhaliNrStdNXpHeLzcR54t5lzsv2LBuRBEgqMqCMxRlNmVSkMeqQGks3Sfuy3LJKsz4tj1x+aoXSCtGKjpWmK7esHMz2yf5pFXoVb1Xrat3VG1b3rWGtObwWWhu/tnWd/rrcdQPrfddXbSBuSNnwa45VTmHOh41RG5tztXLX5/Zv8t1UmyefJ8m7v9llc9kP6B+EP3Rusd2yb8v3fH7+jQKrgqKCr1t5W2/8aP1j8Y8T2xK2dW532H5wB3aHaMe9ne47qwqVCrML+3fN39Wwm747f/eHPUv3XC+yKyrbS9wr3dtbHFjctM9g3459X0uSSu6WepbW79fcv2X/2AH+gZ6DHgfryrTKCsq+HBIeenDY93BDuVF50RHskawjL45GHm3/iflTdYVGRUHFt0pRZW9VaFVbtWN1dY1mzfZauFZaO3Rs0bGu417Hm+rM6w7X0+oLToAT0hOvfo77+d7JgJOtp5in6k4bnt5/hnImvwFqWNkw0pjU2NsU09R91v9sa7NL85lfLH6pPKd7rvS8yvntF4gXci9MXMy+ONoibhm+lHipv3Vp6+PL0ZfvtC1o67wScOXaVZ+rl9tZ7RevuV47d935+tkbzBuNNx1uNnTYd5z51f7XM50OnQ23HG81dTl1NXfP677Q495z6bbX7at3OHdu3g26230v4t6D+4vu9z7gP3j5MPXh20dZj8Yfr3+CeZL/VPFp0TPNZ+W/mfxW3+vQe77Pq6/jedjzx/28/te/Z/z+dSD3BflF0aDOYPVLm5fnhnyGul4tfDXwWvx6fDjvD6U/9r8xfnP6T48/O0aiRwbeSt5OvNv6Xv195Qe7D62jIaPPPqZ9HB/L/
6T+qeoz83P7l6gvg+PLv+K+Fn8z+db8PeD7k4m0iQkxV8KdagVQiMMJCQC8qwSAHAMApQvpHxZO99NTBk2/A0wR+E883XNPmQMAdcgw2RaxWwA4gbjRekTbA4DJlijcA8C2tjKf6X2n+vRJwyJvLIe8JunhrsXrwT9suof/S93/HMGkqh345/gvQNIG0qk2u10AAACKZVhJZk1NACoAAAAIAAQBGgAFAAAAAQAAAD4BGwAFAAAAAQAAAEYBKAADAAAAAQACAACHaQAEAAAAAQAAAE4AAAAAAAAAkAAAAAEAAACQAAAAAQADkoYABwAAABIAAAB4oAIABAAAAAEAAAMWoAMABAAAAAEAAAGaAAAAAEFTQ0lJAAAAU2NyZWVuc2hvdFxP3LAAAAAJcEhZcwAAFiUAABYlAUlSJPAAAAHWaVRYdFhNTDpjb20uYWRvYmUueG1wAAAAAAA8eDp4bXBtZXRhIHhtbG5zOng9ImFkb2JlOm5zOm1ldGEvIiB4OnhtcHRrPSJYTVAgQ29yZSA2LjAuMCI+CiAgIDxyZGY6UkRGIHhtbG5zOnJkZj0iaHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIyI+CiAgICAgIDxyZGY6RGVzY3JpcHRpb24gcmRmOmFib3V0PSIiCiAgICAgICAgICAgIHhtbG5zOmV4aWY9Imh0dHA6Ly9ucy5hZG9iZS5jb20vZXhpZi8xLjAvIj4KICAgICAgICAgPGV4aWY6UGl4ZWxZRGltZW5zaW9uPjQxMDwvZXhpZjpQaXhlbFlEaW1lbnNpb24+CiAgICAgICAgIDxleGlmOlBpeGVsWERpbWVuc2lvbj43OTA8L2V4aWY6UGl4ZWxYRGltZW5zaW9uPgogICAgICAgICA8ZXhpZjpVc2VyQ29tbWVudD5TY3JlZW5zaG90PC9leGlmOlVzZXJDb21tZW50PgogICAgICA8L3JkZjpEZXNjcmlwdGlvbj4KICAgPC9yZGY6UkRGPgo8L3g6eG1wbWV0YT4KC94jVAAAABxpRE9UAAAAAgAAAAAAAADNAAAAKAAAAM0AAADNAALkCKE5GNoAAEAASURBVHgB7L15rGfJdd9Xvb1+ve+zcDZxNKQkUpJJyhQpkpJFiRxLNAOOQjmBYceAEwSIk8AL4MBA/kmC/JMEhgM4MBIhsQDZghE5cqzFlixRCymS4iJKXGfjzHBmODOctad7en3v9Vvy/XxPnbp17/u9190jDsMmf/e9e6vq1Nnq1P1V1blVde+ODR2XLi+VtfX1sqNc5diEsAmwBYONzfAKGnI2yoYTQwgRMEEcIVvqNl470Nj/itUwCzHkJXrgklIsDkUGmRYk+MA/0QiDJnmEPKDIzVzHAkA0jpoO+gROw0nuJDnFfjXpnuXsElauZPbIE2HTrBGvTEyRJjyc3AY3yRPFVSLgJvgsvsCSMAlm4V0LTqWbxSbJk3XiTOGZf9VwFmEyhdj5ukzvrxHjZNITjhCuPXE9rLbBnWqSqFdTJOmMn5dZZW8MMzKilJhMdxKnoK5N6bC2iYpB8kixm7Az46qIouz4wcekXK6FFoL5caNZYN/i4rdM5XX69lm/HWmQ8OxXwV1dXS1ra2tl//793zId54LmFphb4MazAO3FyspKWZy0ZztwLL76+JPl6Wef95glG5os4saGh+hugEZtkxLu9npg9oONmD7SboEiOAzZ2ZIkneFGWd9YLyhJg7ZR427k1tfk9OgUfFUnDtC6CBG1c8fOsnvnrrJ7V5y7du4UbEfZqXDXriFOmXYIN0I1psq32mKyLp7r6zSkq2VD5/rauuXTuSMjaIJn8tkpecjYuWu3Q3iTDr6iMn/so78ujlng4RDu+qckFWS484wC9eajWhP1dATOLLyBsvKZIPV1MeCiz1BHCU8pVa0Ej8Koz6rV7AIFfr0HwE9b9IwmajZtrEOXmfonPCR395gAs/inLOOjS+VZrZTZLRzBO/kgDDo0dEdClzEs+UTeZgxDqm2SsjcjWeAAS17gjesr+aLoRNlmSag25wa0u/asTFD5TXQ0xRS3shk0GGJkjXUO5CwTeYE90PjeQn8VfoAG3aAOdQ9K4oRSyS2w40obwdH0EKH5JPNJeQwWrIGjEoJZAiPVrsE7GQKuiJvwpa/QqkqmRxfQkjrvM2de1wUt0h494aBExGbhDPi9/K1/U9WGSUaZanyQlpnTjDEGKc4dY/BAnIyBCKe33YDUx3qCHh7xvnxxD01x+toceE3l0i/1R2AO+L2ct/3gm3vU1zRO35n1lmEvEBi60d9yLi8v+zxx4kSPNo/PLTC3wNwCIwvwEIL24sCBAyO4HYvPffmB8vBjT0QDrUZmaArBjbQbpC7DUNJ969rlQxntbDS2xN08K+JGDMA6DkWkVzWopwHkXLcjISdDIfAra1fKigpw5coVpYGvFw3jC47EHjkWCwt7yh4N8nEwdu0ELkdjN84GA36cgHQMYvBP2oVUsC7ea6tXyvrqSlkT/3A01q08ZaTc6TQEH8mQrF27ddqxEH/J3WFnIxwOHAucHpwK09qmwSvsRhwLpR5OAPCRJqWMw5Hx6OSyCwuouTbUxGwAIsk0gdg949GNt5R760yBZIapf2QEbeWgwDHqtOImeRYx06AYm7DxBZZaD0PBwCVPh22YmonYfCIr8jsdGhi7qNOsyCmB7IQNqNWGlXWoFhQ9Xaem72PTCyFxQi2uFZJ6NvtXQMInujT71PJmp48cdCZtzqLvWJA9Olpe6taMWfVKeFKZgAv24mgcLNOgemmoxkl9pN3AumJWQEecvMmxSk2PxE25kR4GY8BBrvKgh6+YJM9QJ/MFN0mkAyvibsugFUGYRYjGNdDx4GlxkVezkM9BELSZ0YdJbUykRGYN4OBopgFQro4nNGi7qfzB6RquTar5oCziqvqm30r/wT5CmyBVlcUHpfMIKKjo65yWL7lTwS4btGAmx7Goodzo3MtKmbNCFOjhlX9PL2VCnYFnL6unJm4sEehxkxO+g8wvBFUJVt7lnDBIKYE9ZN774+8ZEq9xbO5YvMYGnrOfW+C71ALbOhaf+rMvlq989TGbZtqI0zC2xlSJ1pCCrQbWDacuNJw1pVg0o1z7Rpu4TzkUzEoQx7HwzER1LHIalsYQp4JzRYP/ZQ367VisxqxFOBa75FCEY7GggT7OBc6GZzDsWMRMhmcXcpCvcIccC8opyXJkcCpwLuRYaEpnXfKYtZByxgEv6XEU7LzYsdgTjgUOBQ6GQxwLORmWAbw6GG22BAuF7GZnOimfMlY9bFPFAzuAiU+3mJ1U2LzmCzuOGJDURAtMA4oiA2bNVlmtg5KBYuysxkBSRuqObSpGZRDBAE0Jje0gU4R5T2SZMkwuDsHLSKMOjK2vvVZVB2wLpz6r8UWX1HHQOfmP9Upohipt8hRpUmfZbM/MT5JEIt3yIjJOKiW9QUcH8pKvi9PRNzpwgdfDNIpn8UPggNH4gN+YEAmcoS5DB9DyiHILV//NRtUY0Blm
noM80yppvtBVZqBlvPGvEeBZboOq0mmTrIDkCaOwmgmDy5R5lZbgEf+gaFfyWvmAojcPJeqxHW3iEPa2JI2OPW3KcLmaHSG0ESF51cfIvvDrKr7XoReQ+vSwjIdKnV3SHFXVKc8Aj7QQq4C2uqrMe9o+vp0+qRdh0oRK/G6qUsrrbQtuzzPpgE+PnDUJx6JqPPyoXJR2/02JR+nUJbT70PvfN8p9LRNzx+K1tO6c99wC370W2Nax+PTnv1zuf+SxaHybjYbOw81p69SHBpwusvbTpho6iqFJd6PtNlUw9UrTc51ZC89MhCNBI3hFsxMo7FkM5a1o4L98ZcWOBfkbokEWA34G+gt79vgcHIvqXCgvHAEG+Qz65VSoHO5UpDparq2HY+EZi4ljQaHATcfC8upMBTMWI4fCsxd1aRSzJE1eN2uBbP5ShxAQ+ijOE1/rBlxHdEGE/MVh+9UEZk34gCGg7DweAFVaOlr9Q9PLcR1JJw6urfZ6AYLKEsbhYhqFPZ+mTMOKSK9b1n/SZQhm8iRudUJdki2vx+/hxDMvilIt4wJREB2j8sAzwNtdk2fgBAF0aSOXbSQqc6q8yrzxqbgGw6cqUc1fsYcAuqYmkRzpNGDiBmP4hH4KFdeUZCJA7Lg5pkBn9zgdeo1GiZJ20Bl2rW47OeBHeaHpBtGpu6EwD55hg0GHZEVu2ocbYrBhx3OwzpAPK5gEewQpHcH1XpHf5Iq4jzfdJkx7HJZ4Its2GSkURD1uzwbeyR+cHi/hPT6sG5yiZ/0qo7uDmhkarpgQH/BHXDfBMasLVNGmdCO+iUPoygxAjzOlryRDWQSY4idND4cu01HizRWe+eDCI/mIkP84unsGPplB/YmgInUWMG0QN34Vq5eXhIkzdyzSIvNwboG5BW5UC2zrWHzmi18pDzzytVa21sa2dlQR/mmMheX8htTInJepbFTdYNP4klEbYeLkc+JUeMZCjkTMUuBYaHaCpU/MYmj/A3E7FuBoxmJDS6HQhOVG7KVojoUG+56tqA6H4xrg76qD/OZY1KVQaJWOhWcs5LysScb6ai6FijKnYwGfOFkKtSccDhyMdDK87AoHZuxYeIYEfTvHJiwpQ2QHJ8Nil+x4FPWBvTkyJJ6mNz4AHR6zZccnu7oTNBGMq/0rJeAmX3HqoZebdQcsKS2Dy0gTA4aLGMO3pxkorIxxk7/zUmclrgVuBttc+nIMaCqf+etiNQZdwOnlkr62I2tBvOLfZMnLuYminNl6bSdpRGxE25a69ZHhuCwqzVDLZBlNl0Sv1J3ZK2QaBF+4USbfL03UhBmkiUPUwkAOglb2dCwaObwhnh4CTuFm1RToCBJRGirbKS6zGXd0m6NZd87pRLnsFb2VRekev4+PcNAoVdwsMk2knBDYiR3xH/Hsytaz7mmbKCEYXjPRs6dpeF2kl5XgKF9KGHNI/N4G0CVWk18BU7yk72UlLHEzTHjiWk5nD9JTnKTNcJpvRbNoPeNpPAtkIZGZPA3qflTA+7xklbLnjkVaZB7OLTC3wI1qgW0di89+8f7ywKNfax3BqJC1waVzdaNIum9gG3IdfDg7ENyw9p1J7dTc6IKntB0LLT3y7IRmI3AcmKHAqeD0UiilV5ixmDgW6MN+inAsdjvcPXEq2oxFHdQzuHdPq3JsaKCD4+KlUHYqtBSqypByLmduBodu7FiwxwIHojoWOBc7OatjIXzvvZCO6Nk2eGcaJRTnSLtiKscNHS6BFWni1aSj/nCmYwGJZAwdXFC6LoOdr+T3cq1HzQ+KpAPYa1ORMlAWvHuagWKgG/QZl/da4Clqq7Avx4CztWPRyxzwrzPmcgcNtw01VIPGaLZeLXtmBC5htRBg24YAywiiwa6kXR6DgqZWBhmBXq/11hvBxongiw7wRHZVRmhjXpGsOM7tNRclwoLdQCsWYE1ZDTp0eY22RQY0Mwg4YloxW6RDnRVNlojraRJuGpe+UWdd9vh9PEiSgRhz1CAS9ZooDRiABDeeAqTMMNlgm55t0jV2XaZto4zGsyFtjjRZm/BTQsdYOODP4ptYUPU4U9xZ8mbBoOvhqfksfr1sx0U7k75HTIYKtwAPGeCqYCm71ythGSbbHmfuWKRV5uHcAnML3KgW2N6x+FI4Fq01nVFKN4rqQLJrmYEyAnnQAEQtdGtgadwB1Uae0HsscCh0omQshZIjUR0LnAsvhVpZLqvaZ+EZC9Ht4G1V0ocBf85YLOzRPgsN7Nm0nbMVu9kPIQeAWYOcMXAh6BTsWGjGRI4LMxbrOWPBcivLiMESdINjEfy8edszFZ1z4eVQ6VgQxtKrDL2ZmzdI1T0XzWD0+hiWsD+kQztqXo9BbqZHjoXgzf6iSxwbvyaM35hPIp0eWXeNxwSVZHaYxm20ne4gTZKAtjuSZ+KkHpnO/Ck88/uw6a5Ib9LEwVbNSg2ZXOVUvUdgsCvA2VuULXVLXVNehsATJ2HTsNetx5/SbSUj+Rm/Vfqg/7heBn16fkGbnAhrgbty9/YBnPrBx7zSYEnb4YCb9h/xAd7oBp5o0A7410SnDsgNZRQxclBwDSzqOU4AI32TOUzIq8zAyTI6a4Y8eHfqtzJWFg6o36qEg+RvwuQpYG+Hhp86W06jDL5JO8nrdTbiJD9hhGmTlD0tMziZl/iEs45ZtLPwev2ulfeUj/WuQC8JpSI4OpsEYPY10cFPHZr+rq6G0fKTU+pPmLSZ14dzx6K3xjw+t8DcAjeiBa7qWDyoGYuh29xcxGwkM8wGdDMmkBgOZV7gukVunRXLmWie/Yo7NcIsQcKpCMeC2Yp4E9SyQ8W1/4HN216qVJdCocsuDdKvxbFgIzX41p/eXv84Fusb6Vgsx1uhkIFjIRkcohqcCjktu7Qp3G+DklMRYXUscCq85IpZCxyRcCwiHumUz1Ao48gInYhhueFoQ4VudOJhFLon2qzOksxKTJQycPja8TIwL5VP45twEUFnuQqz3q1z4tTQeSP+yU0ckr/CWbSwgL7PI84JPOVORM5MJk1muty2mHg5oXCwYLVOlHGQ02GoGKFXhTkNdzEL5jUYDzZTfh9OdevzZsWtj8VUWZKNFlkfm2iqPgE3sqPBZ5SZqjs/yrdZ/8EeyVE8dYw4jRLkdXyc1yNU+lqvVEMvYxO6uYXAegtZdk9jhba6jBiGbK4GR9J3gm1a066jxm8bWzeczZHh7kGSGFfemzGlS/3NGKUWcoDNIJwB6vn2tkk+s/Izr8cHbyt4zyPjiZvpKa+EbxVuh9/zTrwMk1/iEGZehipIK8u0DhIn74M0aYZZYbP4W/aAWG+m0Cj5EiZt6kqY+ffd+/4e/JrG6c9Slwx7gcDQy32x+r3562Z768zjcwvMLbCVBbZ1LP7EMxaPu4FsA5bWi7vpNd9po5SN5EyhJotONVipqyXCf23EHNZ4OBWxh4LvVfAmqBUN8peZsVDIUihOCsJbm+iLvccCx0KD/FwOFTMWvC2qzlqkA8DsQd1b4YEPqmlVFI6F3wx1JRwLv3IWGTgW0tVLoeCFU+GlT+FYeI+F0t7ELWc
jNnLL2bBDgVMxOBY7ahz7+a92eGlPh7aXdAoThUkFM5jC1sP1U5NCxZiZNYSABhLLJLOBOn6NqPKZwc22to41k3pL3aE3OPUY8e4USf4T2iYfPuBgmwpEBidw5/XI28STpkcRFyXjGs7FkJvyoiDAXaIhacrA8oCxlUGwShxB6Dpw3hybpdtmrIC0MqeMagtyBytNqCtuliELAS/LnoEOPI7N+jcdKkYOmJMiy59sp3qFvSI3rQpt6mL+kZEsuAUmB/cB90eAyZ7qNSEYEB1LhhNBNUmAnjmhk/aI7LhOyzVT3gQYtqqyU/mKU0XbfCmPrCzXCEYlJsFExqzkLB5TPHBSRuInjuEWeXWhySNpp7wSvlWY+L0+PW7PH5zEbzgyL3UDXuY5VNqWz1AEo/zKwDideaPE43LDu+dv0h6lVnHqhBzOXvfIAx6xuWOR1pqHcwvMLXCjWuAqjsUD5cHHHvcgqTbHtMKtL5s2kJnOhjqNkmnnqzGOo3YGalCzwc0wXzXLq2dxLFZ5lSyhTs9Y4FhomRKvmk3HwvsslI+MXTp375ATgWNRnQteP8tSKELO+N5EDPJDrxjU0OtsyNHwjIVfORuvm11jOdQVPpYXMxaWAx85JrnHAifC37LQ0qvcuD3IATf2WnhfhXTByRg+0BedFHytj4yUYdqR0LbEEdLR50e82tZ1FD1VWtsEKlyrPeTUzsy8AsF1nZ1cT5t12NBEb7oaEp/idOyjMzfFWO+ehnhfpopuvlN4S3f3Y+JvFULTy2tKmUctbVdoR1shWsTsM9V+F4Ju0r+xHMu1HtQEbzHr7NfrnXqO8qdlTf4KleUjQDWjZ6h41v00N8tiJlmvMk7KTl0m7IJjJYZ32sIspkIgboLCVo1fVy5oO7S+iqRPoxhHeoI+pxFIuw6ngeE+yqjEHa5tVtNpD7C2tsns+7un7VXs7xnEJF90THtafOqpjDQDuInf80RWLy9x4OO3gdX8hEOb+D0M2wxim1SLqiZxfKtLUpDvsnGdElLQegyxwQ6ZNytMncnr9XadVQKslXk9PtmZJj9xKtkQVH17noBS1+TRCFr5FEmkITJDThCk/Pvuvbexeq0j8xmL19rCc/5zC3x3WmB7x0IfyHsIx8IHnZUireFUvOsUAmeL69A7BUIyEq9s1Pswp14J+ehdOBW5zyL2VizhXNTZiraBu85YsAxqt87rciw0WHc/oJAn12sshcKxkIz13MchmV4KJb3pUOxQyDngDVThXOBQ6DsWcmZwLNKpcCinglkLZiniA3kRpmPBDAg880xLZsdFmJ1PKCol9Z/VYd2F48PAzAEt4FwHaEoAGNCU1eXEEqHKL/n0+VvFrWsoZT1HTkzqOSFu5ZvAt0r2+s6i7fOnPMb4FLDaDsQaBaeDBosE2CaZiCyuY74CUKfAyeRQolFV4KhuQenquue3ZXkaQ0vohI3L1Q+OgmQW4RS2uUyzqq+XBIfp7A8GyGYABXt8pwH0h5hsut8GBoMRezoLhlsczV6C247KAKVVgNLgGlbhKbO3O1nTo89vcqZISvd4ZM/CneKYTa+oyt30tPFJ6UD/apNZfBNmanD1l+ULBle5ImaGzQ2eRTrRJeWDip5RioEQPqlPq4Mhu8VSfwDw6flmPO2QRD1NwqjoQc4Qm9KCD99ZcHQe2YR0d2R5Rk9taj7mGetvbspluVGUa+5YdMacR+cWmFvghrTANTsWNLT+G9pjFXiUUPO4xTHpcBILfDfedDo682vbdig0+5AOhr+0rVkL3gTFzIQ3bXvGYqUsa4+F3xbFXgzhoGfMWAyOxR59zyJmKmIZVMxexKyFy9UthxIDF8tvhZK8dS+FqrMWbSnUhpdbsU/CS5xYEoUjwYyFZityr4VnMASPjeIxW+FZCsvL2RI2bcuy9UR+2DqtFJ0cMA7bq8bT/L3dqSPj1doglTDiPW4gDpCUYXiHSycdXJNTYswOrSNZJtKF/0GMyzebMqCNfjsk2KcdFJ9F0+dPWQ34nWK1lMaVzuB0IsYsNo2ch+yBd4X1TMIcMypioEfv5JEhuTPLI34zDxdrGITxE/RgqyJfW00Gcq9DyuqLlLAMUcniU7dqYnTIghPNbGtWccDgaGVNJPKDgfNbxSQs8Tq00FEZ+s8y+D4Et8ozXIhJnnITP4SNr6O8jnaMFakRbkVIGYk/CycUqlqpjFF30tOFSuWjGKl78utD8E1tEn7H22F3lFWEJRDvyEi27I6kr58sY4ZZxrwHkz716dj3HB1PGoRS3TZBxUp75G2QxCkv0w4lZJAz/MZGONeQmMm70rk8FkIJh1KmftCmzkESOHPH4hoMP0eZW2BugRvCAts6Fp/TjMWDjz3hhpzGkM+gZUcRpYuOzk2oGszWjNKZZUsKzSxTdPjg5skMRToU6WD0sxZ2LNpSKDkWmlFgSdQqb3BiKZT+GKjv1smH8Zi1GByL+DCe3wxVv8AdX8OOckRBQ+GNtnlb+zfkvMRSKM1gsMdCh/dYaFbEzgXLrHAs5MAQeraibtpmz4UdDs1UxP6K6lCQLweDE5vyh3zbt4b5ETPnCzY9+rFt2n6KQzo3aTuvq5ceN+uVMOsOnrGVPurQOgo21STxe1p4J0/iqR+0gddDN+PCM+VM+UJ5rcdIh0nZhzs2B1wpsXJXsjd7llPQViDrWZF6PQfcsaa9PuOcIdXzGaAMqtKKATWvXkGBowSBB3pmA+npsy4NSyTou7gIQhC0WYMdzHpWDOQmbchJWmwVcQeh4KAYvMlwpkLJT93MuuLjEPT6t4JV+RGEHOLEUhS4SQss44EUWA0X4u5wGav+gBstiWorB4NocnyA2+OnfTKfsOcPbo/Tx/1BPfD5q3LNu1e86jDl06uWtLZQzQCWemY41c11IlmZP5PnNuW1jHoXJQ/L4NIzI61j0HOcDS15WeyeV1AOOma6DxudgEnblx/cXnbS9rCky7w+PeBRqCiYVO4ONEhAhugScj/0/vnm7c5Y8+jcAnML3IAW2N6x+MqD5eGvPeFitafqo0K6m3Mz2QagfadXcbMxJ5mNsJvUrsUFzjnbsWCPRXzTgjdAecYC58JnOBexx2LVbXY6Fgsa1DfHAgej7q9gr0UsY6oD+zq4d2+FsjrZ37Gur2/zutlVlkLhXOjEsXCpVc6csTAvzVTsljw7Fsxc4Dj4TVE4FjgT6UgoXvdX5EbuXA6F3CEeVourVdrU4WFDzJ3dU4bYuT+GbhhoDtgrZ+wuKB2iIQk2JnYIroCjthXZ5mj48NOZ6TFJFdI0D/k9zmy6HmN2HM6b7JBlGmWMElF2V0DHINUUqIsOqbSdIK4IQh3Xqjt4w0AkaLe6Tnk2Omzclbiva+Chd1zhMS61tdWllk68OGrg+EAgynovRIauE2ZNpz5jQpMkvjfMSDqB0zKCu8tR1apoY/komfmmHXj0DnctUTBtQrpk5QELRxsbbCcIQPOvNCRrmVp5JziZD0XGG25lA3wKM74u1sOiHTPFLD4TtaodN/M1bavb4DncG2I/ZWSJAQ9a6VTpSacu3ChwyzzI+v
yEA8t4wyGio4cPivR3MUhDDrqmVZoegQK7VpQ+D3iT08o667cA5nAkj6RNraaUiQdl4gaXEDbki4NsFummSI9aPnTv3LEIg8yvcwvMLXCjWmBbx+JP739IjsWTbsjDsaCYNOtqFN0uKk5DCUQDceI0nBWjdUB9R2AyOidYKcyDxpYzN2632QrNQrAUCocDZfn6tt8KhYORjoVgwMmHJ0/oc48FsxW8GcoORudYMGuRMwY5a2DFpSxaaWeHnAiWV4VjsbYcMxfxdW+KGk4Jr5LFccCh8KwFzoWdCjkS5DGT4dkJ5IVM77GoMDZ/N7vZfjGDASwPYk5V2yYcPQMeOgMfLJpYFScwG0bUUuC4o6u8EZs8CKMTDB6hhyUOzCexht/pP0FREh4phWSvTWAnn0hd2zU16zgHYc3objfBx1hNg55JxifiGy5wmE7Keq26gzceiEwE1eQmftILHXIAnSVBXXHsmJAzaJtLDYGiNr/CpA0iM3YULqlbcAzOhpMH1pi44Y8yxkaHynpbfyc2awE4ZRNvcpJXKOesId8Faqi9GQYL9MwG8oQmnq1SZaFHLzapRnWHPp0tNtVXEikMfh1yl0c0c/KWajp1+vR4rZzKB8X1MuXpjMhJu2bNG5pCezrDgidMG53lBIF16/KsF7J0pH3SFkmfeUbSJeGVCoiz4lqxlKjqdOWrulUUgqQBN/CDq/PSoA0UkZYEaXJMde/5j1CxSQVkeSKZOpIb1OQn36G2lV0ZzB2Lash5MLfA3AI3rAW2dSz+7P6Hy8OPP+kmMWcsKOnQMKqxrK0tsGxUM0yrJD4hZw5wMt9hbZzJZ0mTnQlCx2MTtx0LOxTxte02czFxLFg2oSG8lkCxDKq+cnY0YxFLonAI0DUdjFYWtfJs3mafxdqaZiqWhxmL6eZtz1pUxwInAqeCmYt4K1R1LATvZyx6x6I5NTKCdZE+01kLBpB04pia/lEm8lFN3xIVHOksTPZYFZqB66gSRBDXvu6AZN1Bx1I4H8ro8RJM2OP3OL3ePb4LJMCoLEaIeyVwo+zg9vx7Pr2sHk686/YH45GRGz9c9KpBp0hYBMR6VIBRqKeE14h120S02VZblaGxw1j1AHeE37IGx6LhOhID4RlqeLaN395aPe3E580Ud1ewkoxc9og4n8B08gX7XYLVF5M1E0I41MFgcSlfOZA/aGU3pSU7fPOBWxypnkolAJrAJy9Ohn3IFnJjKZyBVnUVpAYOeirZKMAJJGBp94AM14aTzEOF0XVUX6OcSMCjx2k8lZ3wVHcG+QASUqNFn1r4hIHoclQK8+zKCDhLFiHlHnQwWZYzDdgM3IiNNuuSZcm8kNHUNHikayIqtLgqayqy55tx+DT+iqTaQySYN3kwTYJOru1ViRvuBDWyq1YomsLSRuYXtmysU1ZfmIpH4KV+UuhD976vkbzWES8drjr3ZU25wLAHD/k459+xSMvMw7kF5hbYzgLbOxYPPFweefzrbn9pZGiDsg0NpgLUBjPbTcNrY5XIbqyrFtlwb3IuxJg2NxsyOxTNsYhXzQLLGQt/w0IOhfdY2LGIzdt6bZMbaS+H0iA/Zyz82lktTYrvWOAAaNCvQVI6FW5Y0ZuC6FzbwKnIpVB1nwVLoaQDioIfS5w00LJDEU5EzFzEvgo7F8rzjIVnKJAXb4bybEbOYFhudIzhwI1nLexYVJvazshHTcHCalapWphggGcdZLfrcppWF/HB5hzJJ3gGLDIDo9Fnlupr4DUMlKi/PDI/0sCtfWY32WNoZg98oDMO5e34JyZyrPeMPHBctp4dQDOMp/ChSNVCgVF1cVjBkATAMdNnVtpmlm5g93b48+CYtgrdsC1Qqda1lE2nYVVvmPEMX7UHJMzikX+FAYLu4ctaWkicMu6Sl5AvPHCZhZtlQ3+kgLNH56J+N3t3Ka4ZO2Dcr5wcBBGTxYORSDMCQubPrscOoclv9oJxsmp8kNqJSFlVn4E2yhO6BU1e06Z9iblfkhaWphPPKB/2RmYqUzkp3ddz8ifchNtnTuLJdaQrwBEgiHzPJ30nf/pb6OUPNGJKgQjEo8fp48meEHjaKcoaxL0ptpUNk1o3RKeH5bb8QRYKDjIwRj0UTV1Dn8wYh1viNFYt0vRzWZsu4lcVMKbjUSFuW6q4XoeUSZbNnCKg7flSAcEUzPlSqGrLeTC3wNwCN64FtnUsPv/AV8sjT6RjMS4kzX78RwMb1zEOqWxgW1hhvWPR8tToEs8nJP7qtgY/fL+CN0Ixi5Gbt6/wqlkcixWdWq4EnO9d0AGw6ZnBzh7NHOQeiwU7Fbv0XQscimHGAufCMwjCd8eggjCQz9fNrq4OsxXstVi3DPoGDag0yPKMxSzHQrJjn0XstfCH8uxIhPyktaORsmVQ8820jaqBTDVudlw83Yosrn3XhsUDuUFli4AGPOkkAtLo04wReNnpOZUg5yfnSLjO0LPmEWQ9Jij1dZ4kjbF72WPeQV+FO1EpJW8qg+xeTtCOr7NoUDyGiMK1qFoSBdVkg22ykL1KiDAPgkCYJWeq2ywcs3KFENtsx4CGXtwLuANYcw0HQjrxW1I0HAc5DDgOK4T6vVAItMNxYKZiWb+nS7qHOa/ghOtvn/b87NXJRyQpok9dMk5MPkRZ0G9lv34/i0osGJ8XJOi3pDQnMijvTuGHDSuTLIDN5ItYwr0/KlxBM4XJKw+yk2RADQYVxTwbMSIqAXoJ01nJo4pOHPTOOFn+/VS+Qaur5foywjWrKmta3+aVehjxapdUMORsh22dE0EyUvamskzlj1iHbUZln+JXGeAENraESWtlUotNYc839duEVAHGNV8AYYeUF6m0TRJEaL0aXc3rgyQblVsICR8i3c038CYWpYUmiYJZWsCp1EE4IyxlNjIiiQdjZwzY86VQGGV+zC0wt8CNbIFtHYsvpGOhhnHUFkYPa2A2/NHMbjYFjX7fudCEplOReaN84TMzMd1jEc5FOBbMWsTMBa+bjVkLnAxeNwsvBt5jx6Ju3N6tr3EzU+HBUIQ56zAsSaJT0ECNXRa5edtLocLBwLHID5vZOUh+npnIvRZyKnKmglDn9l/gHmYo6Hx5uuxOuBqV7ryZXPZLm0elZKdExlALadNu9Ykrp2GA607N4LiQ2fFovaPBM/AF76SP+PWDCHCaXMXzcPc7YhA5PW3imr7qlmWzrtMyiGBK3/CTmUMGYhVgHSLRq+N7qSFNylpJkdXYTHSZ6gHJbF3GOo9wouC2M+xjGVMpq0pcqbMOV+xgMEshx0Gwy7pHOXEwXAZR40Azk4FjcVFO+CWcdfFjadMR3Z8HcRh04qDIHTF/5EGvQGXc8JvWFnFC7FjkDMZu0+3fE78tOxgU1FRB3yxUy+Jso8A5j7Cizd0MCpfAGdlEJIliG1c2LmtfX1V3JGRdtGzRTHlO09DBuq/j/qbBPu0YJQIK3Yhn4qfyjbhGkofyQ99AxAb977ixgX+1DxYxTfLoeDcdXJguY0a04SqvqSm6lJlA8gyTvJYHTTNwZd7l+z6oTHs5YE7pzDUZN0Uqzz5In
B7Wx7t8VOuSXaKDdvqjY+rZ2xbspi/2TvJeTyOlIn09JSzC4O8WHqZl/laosX0yde7SWvnqN5bLM6fp+zfKbcf3lB9+/b5yYLEt0E3UeTi3wHemBa48W77wsY+Vz93/VHl5ZXc58cZ3lHe/4y3ljbfs99LlsrFcXn7y/vK5P/50+fIz58rqzgPl1OvfUt71k+8q339cDw1XzpcXH/zD8m9+76Fybuex8vr3frj8/FuOvya2uqpj8SgzFhpIcHRtrtpSYENLOsSMOrpk4wzQToXCbLQJcSI4elg6F56pYBBUZyuYmcCpYDZjRV/CvuzXzWrGQnsv1jSrwaCfhh7HguVPzFiwPGmPBk08XeVJqx0LnAstTyLuJVGCZ3no1OTaSK9cCrVUVuvmbeRKUePaKRE9b3nyEijvrwhHYteucC7yWxZtzwUfymNZVN38jW3DqYkGMvdX2LoYXP8xYxHDs75zHndofQ2g3xaHdScv+I2wzCJkGi42PdfEpZ6yY+0lgUseB/mZ1/MwvOIkbsPvaM1El5RlHvCsOJlPaJgQ+hKlfi2/J1Ac0+Yxg2WTmzjXG/bye9pZ+vf5xNNuaXwP9gXEgWAmYkUhcZwInAeWNOXMBc6FHQvBWe7E742aQB87HjgWylvihQiStaD78LBebnBIv5F9+o1gFmiYzVjVzxIHhjTO7m5lLuh+3aNwp0a6zJwwS8HyqCMLu8uxvQvlkPY04aDsUj68+jpRMtLKAJ62GNVF1RVcjsSpCQdQd1ZSnHRCq/UIBJbqcQRKxazUyrPsJGnI0FaCYFOZAK73tXA9+BXHxBzpCgU8kmfVpzHqIz1ewpNppkdhlznRo913oEimtRTOoGVltIU+fRmCxkxM1GZOxS3KHrxGNJ3dWtlDlUEHmwUFNh+pf/J0KPwGn5A0S8xm5/un4fS6dXyQYZwuP+V3aC3a51mvKrvJEWbqHURjewHjtvCdNCKa77EIe42vX3nicvnVP36lLK2sl6MH2BdZytmLa+rPd5T/7P0nyl03L4wJtkl95Yml8vzZK+Wn33JoG6x51twC324WOFce+si/Lr/3yI5y+xu+r9y5/8XywJeeLCu3vbP8+Ht+pLxB/sErj/9p+eQnP1ueWLuj/NAP3l4WLzxTHrn/iXL65DvLh//qW8uRlx4vf/Jrv12efeO7y/fveqp86o9Lufe/ua/cs+Ni+fKv/25Zf/cHypuO79XKhT9/2bd1LL7IjMWTcixqgzu0u7SGtYOtOvTt4yy1sjGmDSbOYIXWlXCWY5EzFls5FqtyKpilWNLJPosrSjPo9+ZqyWAg5NfL4lSwibs6Fl66Yafi6o6FN29rmdXaCo5FXRJVnRc6XRwKfyCPgRQ87cTEEijifNeCTdt2QJS2k2FnZuxYNGcCA+vE3v6zmYFhUSD16rQStUMj162tI1z6jAZskf7pZ9ZtdISVccc/o43Y3IeBSkrq8YCRnpWHvHYvcA+AS7l1JHyrdE9rgkpj/FQghSpvyidpCEPkWG6fjy5J38OvNb4VbZZxOz4UgUGcQ12YaViR57AkRyFOpe1YaFZCjgIzF14eJVyBhYNTEY4FJcxzRbiXdV7Q7+SSQmj2avnTURwLnfu5h5GrsrOM6qL4wgcHZqcUSseCGQv16Wjp3y97L5jxuGnf3nJq755yRA5GLK1iiRS/xLQ3NLqPpSP2SVvU6idTHMd2T5ya6SAuYqJ/H9Ylfh9twEuey1JxCIzXpSt9gkeypFTCO4pRNPH7uk5YvcEwZtBUfUYMMoEBEq/BMjIr7DQTncss/qmHw8ZSuZX3qERb6JO4FN740IKrYyvHwnkpY1SZVsq07dLJbbJaZkTSHC2/FddKTbClZ9VvmpH0QT6btuF0eidsyi/TmZ/2NpyiVgSrgz0SoEi7L8Fp+iZORVRyvnm7GrEGjzyzXP7ZR06Xm4/uLn/jvcfLqSO7nfOKHIuPfOF8uf3knvLO7zswJtom9X//0Zny4NeXyv/wN27dBmueNbfAt5kFXvl8+Ve/+Ily8QfeVX7yXT9U7tx7qTz6R79Wfu+xPeXud76/vO8tC+VrH/3t8okvXiin3vvz5d43HS67V75RHvzUR8sffHZnefd/9XPl7le+VH7rnz1Y3vgP/3r5C0uPlt/4x/+2HP97/6C84+lfKb/4xdvKh3/ux8qtBzVmbe3Wq7fB9o7Fg7HHAvbR7qqrUSQb1K7PcRs6wFvLuUkzGmWcCcI8p2njaOCTG7iZrfDGbQ3qmZng+xXxPQs5F9pgjYPh71jIuVgXLp00nWLMUOBYsNdCMxY6/XE8BlD9Pguld8qaoX90Amt6YtveCoVj4W9ZMINxpTovzDTgUGhDNvxwGOxYKM4rZyXTH8fj+xXKZx+H0zgY7ZsWotUTY8utdgWPdNoSw2a+68GWHsyanWrDVxb2E5GRskMjhU2yV4u4UczfNJEEMdESMgrhmfIQxWH+kpl8+juAvFkHuMmH/KSdwvp08pniMuuTsAwTd1bY89wKH3ji9TgJyzD5g5N45PX5PbzHz3iG2C2dCpwEjenlPLO8iSVM2hshwLIyPGOh3wjOAjMWHKbVhWVOOBc4H3ukh/c+SDecinNyKl7R7+SCfic4Fvt0bx7bs+DlUPt1b+51HWofhn5rL2mZ4cs6wWffEh+d3Cs7M7txQPf9Xj3ZWBEf7gfk3Cyn4mY5F7foZBZkf85e6LfFDKL+232V960V18V5zh7bfIoHPuWUobna3rZzvcnSzmZYcYzoOzRiLTrKh1nFTGUqeh9UlAB19H1dJz64qU/+Psgzj6RFVsbJTNmCNVm1bGRzpCzAnqElFH7CzcOZ3I9QVE7JW5BZdjUmBL28VIKwh4OsA5lZxgwjJ/IgSrHT/MTLcJo/TRsPZr0eTb+I9Pok32zrjDGiTeLUNSh6uUnbbCiUgSol1NAZmTsIck1WsKE1y3UzoNmO9937nfcdi2+c2ygnDqjt4HVy13Fgsn/y6y+WF8+tln/48zeXQ/u2f5QK/rNaKvXS+dWyV09Bbjmmhxya4eDA1s++fKX8zp+dK197fqX87Z89afixg7vKPhqy+TG3wLezBZ74d+V/++dPllv+ys+Un3rb3eWE2o2lr/5u+X9+/8lSvu895T/4qdfrBn+2nD6vFQi331VO7VdhVp4vj372j8pvfXy1vOvv/Fy558z95fd+6bPl1H/xt8o71r5S/tU//li56+9/oJz9Fx8tu+/7a+X9dx/xioRvhhmuybHIjiFadDqKaA0znCrSN8x9HnC3r4Q1TmjHQoOe3sFgxsKzFhq4eH+FBkSxDCpmKngrlPdaaAAUjgXftZBjoQEVrUjMWOhVsxrwx3cscCxYssTSp3AsvAwKx6AbzNORUDyGSzgW63rd7Gp1LK7wVijPirgU4ViIFsehzVhoMBVORnUuqmPh/RhyOBymYyE6ZONcIBR75mklZDybGnjXm/Z2j5poNTPq9NCyWrxRD/g4UEFnIbJZHi0mWMoinFWvgSu8KiHxwE3a5DsNpzg9/6vRTnlN8XteDbcWPnUF3uNN9Wl0r1Gk
zHyJOYGD1K53mSd6LOE3uerJ1W65IJH6/u8QHPzGxmQLNqjStwIbEogUXoDpegQnChxELPKxEkD6bFD3fKQ715Vq/5sAoiOmAaBRXxg9EKxl/mmGMZ5ZAcRHtsDoxySCywsQi1hAwqBBhp1VCG2TaqGhS61riRDANxmNWstiTISBILgEXUQUDAsaAiyrXsBGzk3lRbioi7+NrXWPNKJoz7MtidrAq7wt6lPhy/JPUoGXra7Cpp1s/mIACZjAaZclyqRdGW7OXqFLWo3Z2dYhPPPYKLrYMTXOdp45H0+aS5/RN9aWfRx/ZhtFlwYR9DN39uprUPox9laEzMn1hRdZ9/kTrlpQQArqu4Ib6FMS5O+4aLS0CYen+OKGnWCdPZi1RAxotOTF6XGBPmawC9ZOiLegj1HiDtCGmHypV+azYGI2q3njBWVpEIuFquncEiqjsyyjKtShtmADCjPK9kQ+9M66wQKxVQh32AMfQ1EgRVmSB8AA+D7wk4ZmBU3fbI/zkr/y9h+DdhPHUxO0X6IfK1ndpmyNA/pH4rqkXxrNKAx9TpMQbOz7d2wvC8i7rMY4/xJZ6chpAc7OAZyW2Ucr6hDjLEtmUDhn8HHclD6iuo2QeQbuNBaB01JcfMOGD0K+wfrLMAW4NqVZr0UGV717WZoH3PYOT32d/AiA7QRw8BNUZAV0r0bHu/eAfwOEKq0kIbWqB/KwBMpv9SwE0eASJom30lYO+hjd2MO8s4Zwyc0Ze67b12vPCuKFnSxSs5pUB8AKYVytME3nggm9TlEJrL+HVTV1XdtkrAoXvZYd6BMdJP0ofT0HYWuhwCLPb56ap3B+9nWxhub798UbTsbhYdMOediCb6oLvSim6eiSjbtLWdcoYZK7PYHug9a5AxoSrcGaDhjIUEpXDSMhhwxznzic4pZOqTFzeYfdrpu9gPvQeQfPQpCVCCBW2SXQTgA2lEjxId2s9LEeBA8BISCN7biMJt/tAr3hPydD5QWnqC9HIfYHGElChLZJV6+L67gODs6zsncIx30OegsSpmN9TVskeJ1D49hS0MwGIMAN+PGpSLFNkugyyqrZq3yyvkGvWynE9tlvup7cM79mzK7WPP5fLr+X0sXf3+jx2n/Bq1MK+c38fLIjfT8M9a5jRBB67YVOeTaitPIs/qXiqjniynN79cvrWSFn/5F3+Rb/+97/0+5jblfb3QXD+/of7+0Mjb9Tybx00KNCnwx0+BnwYWr98EFdLEmD8DdwkTk1JMqjH9xiScp+/7E2pM5jzuB7DaYvJmDi8nsGBIuRaggEk9T2wfAAtWDMPWAkYpPEO5+qfhM+AiG2+HjQF5ZWChznQYb8uo8stqUWFjIbPDRz2pJNFW2+TkD7AIr1Cs9uuxKeJYsCJ6xXGoQsGQmlamI/mSF0RkENOwsWioQgEqqEdIGmBEAszItPBTehGsNgyZBpnJy1XyACN9ZG4MBiag8KdExFVit1D94mORQUXdI1TQnTxDghHgAmYjQEeSXhwH00KkaVbyt/YwssUuYAeA4er+OaoUqlQEPSwod12Qh3YLLqLuZT3KvoMXiw9nBhTxkeaez6dxUBsd0pq23BiNG3uLswiehwvAdphxP7nk38qqdBdMWTtMMSKD8AYl/eGsokqyVgGgqLNMcxg9k95StI+YFJTQRxpJ69VIt7VGiX7I6j3sW7iIPSa/CRjmCa5ry2H0a6Nqv2Yl/wAg2Y1k6WuYfSNS99CHO1wTJGhMLkjRsMpAcs9I/4Kfge/Ull+GAdYTk0z5LuPW7QHXVlilHwFYqG71RGCBZOA5htv7HLcxro0l8eXUOD7YB0Lt6QDmuh0azsNU673I8SYoUmVL17famOyqooTqzB4SE/t7hHHyCIZylGfsIdXAlKw4Ri94/og0u7R7g7YeU99LGFED2U2yeq4qlvYtSjJ2WXU/4lnLa+Wa+voaJSsFUsogsHBx1vdA70wG3NOGxHgTEWAPSZiqUOfQC7FhMMygdTxKoZsMWFwAWHw5SoA7JBDWW5e2etYSLPmeGhdjiza9w/6kj/N+jP2HkR6NMRYEg3NIf5TYHPPeGINEm4Sd9XfF5otnxSX7NgB0N506YN8hrdG2w5glbeQvTSdwRzyH7cHC3FxEx/b9MOK1bl01hHb8xzjknTVKtWqIvnu+Z8a2uIaGjmuBr+qK3cRrCdUzwRW01T5D9jFAhYOSTSbV9zykqwB/qBdt17W24CFJJgQG5AEwMm7FPpHDdTcbedJey+QPdUnzapo/fS0SsNDN8S31dL4Yx7g+7Csw5B8FWPQp/exiXNon97bqfec6JURdPfgwZXow6nEvj/un5pO3nM+nnquXn59x/6n09TQfO27kV9YCmpmXZ4176clIUVU2HeSvX3riIyXYAWV+QadoYCrD1Lm9+UnLzG2JOnDjjzHydm5vc9+kQJMCfxoU+Alg8aR4CrBwvsybjEPenBRjQiZB7FkVc4sJs0xUezSu5Ik8T6hlsphgA0CUk32AinzMRzWkD3woK4mFTDcf1QAWfIRVi1JqcQ6z709g4cdZMGJZASxk9gUWqD+E5ALmpBMmQaZFSUMYI5dtsQ1hzAjQIAc+7OpJJ1Uo1aEEFyGxgFkIVSjSRywM8qmMwikrxbBIxtshGWGlNdlYpL3BvmIlVMaZsgJYRB2gIwyPLidtR1J/QK2Etsi4aL2v8XaACq5JL4jE7dRe08Wx4IFr6TyDNOmi5EJGB4ZJ+uoZgNXZQ1zRGul5CwbXwGYbezDUSAAEF35QyZZf/ImuCzrBnCapi6u4JXNDfcIVZ9SDZ6yfWzzLLp1Ve8eJcSrQFSqukVxEVG5sLS67+1jlxi4AOrUALNpkzvUeBVOXvAXRBsGF+brSa0Yy+zCuMsRKdQQIfdwTDPRzzVgP71lFdtNF6Vdc12bDYGy6Xx2knAkY1TEYT700CSx+AGxtIsUx+69GBosVGNph0sjsWrb2HIIUpSYGknvJ6v9T1Jm0tTAGxSN02xcoRwPsLcoQ3MwDeozArcRCo+3nMO4+8womfg8G/pp2DDKeFon1MEY6V/u3sYM5oR7djLtxVK/GuT6Chym9QWm8/RbphflrJC3jeXOBVylosTBMtGWAxQ1MvDElBEDGgWghDyUMqqPpOlf7GuNG3MCYh9SBunXRzlvoIJA+R4qge98bnumGOfe9kek9pexWrktHAbsgbobfIm3Uo5QSH1XDtsn/BHrcQK9435G0hISJfOYBUd9Mj4UUxwB3O7TjLbTQY5bjzNgWu+Sxurkdamm9AJ/hsdFw82t/6S3LKUi3vapL6V1tfwuVsFdPi6NXL0COBwEshqBFL2l1DuA7pjqW3rmmYbrnWMl3NV+Jo04UdJygupFjTQkFFQEcquZIjBVjrfge8p47vtPCh+6hE3iIhYPyfXfBwjQCBeOMxNxEuebn5l9/vltOyvbdmV7hKF8JrDZevrPW6YAgeQdE4D6EFrE40MjGrOJV4FLU2TgWSn9uXZCg/ycncDMbbQRYjE2EfUWnwKKsR2RQ++MiSzmtR3/FrSivwRTXkqdD2xWb9
CoP3eXjfDvf4ty5pb59qj45zf3vh9Wvio1EKb9E4lRwPU9qH7Q2ab3kyCPX0wqXmdafjWfuFhYlppxyblUmZFHOfZEqw5PIpXyusWsCiwYtmkdNCjQp8HlS4EeBxd98n4BFY6qMteOqpU7uTrhp0s0TqVN22rySJ+R8LU/U9z8MrrLVgUXc9xqTsh/1rNbk3vNLV/T4yKrbHB9imQBVoUqGSheRAg+BiMvRGlZnNShVoe4ACxghJRbRFhiaYAxyuzj3++Kq7LUSC/zgJ1ChSpSG0IlJl8GPCNnkY15JYgF4gDFr185CkEE5ASpQgapLLAQioRJFHQMokJcfSukXqku0TQPOoDdpzEsf+SAfufgANjIYqm74THqQ67Q9MfSJ+kFfwIRp0y/W+EnufY1EdXOLXjvG4gesmO8AMN6hArOF3cXeERG6YQpDLUqaUlZs1LUV4+MAZTAvSimiHBkx0uXV01QPqsZ9mQifjp9/2KLeHpPHNfYW16iRXBDf4lJw0UOkAkEUIKCVVfIWVaFov4w3AyDAVyIYuVCvWxixMKSH7q0wnNpG9EGrGcDAKIy4xtPaC7gSbnTsBzD9g4wJJSsbMMDGbNAGYlF3pGQpeHgOg2sANxnyFRjgBxg/CyxUdXKF3JXvORjpfvKhRsUq+T9GdUhXricw6nPUexYmW3e3em46YdwY50EDauN06Cb3LWWvAmJUvTLeg8C1HbuFYSQbfbabuh5Rh3NWq1tIo1RgHFAxDfBQvWoPRlxJzAF1MgZKqCYBlgygN4E0YIB6C96UPmiQfU35LeTRC131WGUQPOuq+kwH4111Jceskplu6CdjuweTLT0MxxZjmjSCCcdmN2NBumn4rlG4gEYVsX6AmuW9pW6r1F3pyAXPaKNwg0TiFne8PfT7DJKjr2Ynii9ok/2hmtd7AJY2E6pTGcdknzze4cnsHDooNejDnsH6DVGOKlGD0OqWugosVKm6xLbicH212HjyuLjc3UI164o4DgQIRBIjcLqhDoY1VNVqflaDZoJt4Y41zTnMLYwlJQeCilA5ZNy1ILVSBUxA0YXjgQD6vuPklxcGpEeW1IUUFLqEdMN5JOas5FzCdL6Lbmn+0WVx8vSkGpZgX1ur05CaMLKgg2DjEHUoPUSdMQ6cE+N52u0/3wXfU/sxAuQxZgXdAgvjmMzNzMRvGJVDDbfTAgVP8tydjXrUr8X76lvrwSc26563yK3Ms56P96t0Jq8Vm6/fT5/zzHvTVWnNgnKiZP44uzSaUss8Pxz7XM/UxlTtD9PmMnykkScn5eP1+41JLFL7hzqW9Ykz/5RlxERYztfVvabEokaK5mGTAk0KfKYU+PnAghnSKbE+4Tupep6uNSbl+nRZTx804pk7k3FJuPvAwhnZdAEsZCr4SMqw3gUWSVIRAIOPf0gtAlioOgRDILDwo8rkLqMpw69P3w+ABcxCSCz8MJPOD4gf6Ggb11yxsx7aWFyVwCLFsiiNKFkFNm2AF8oIOwv2YV9BebqaFFwkUFGCC1YyQzWKsgNwsCKf1YmiXBl2SSodBAIwD5neeWU+6CiwgC5KNkwedeeP9+I69+LTVp6H/UUAC54RcZXPRJooB7qdE9kXcCHzkiQXh0guUI2CAVStQoNQ+yKoRFmCCplQCxdYyFBV4IJyow3sk7QrlUrtvBx5+De2uEDdBQ0AiSvUogyed4mB6Y1eozAybWOVuJ1VYvvpCgb6Gsb0hn6W0YvGk2cLAFMG2pXlFujcSl31+GPcA1f3XYHfg2k/ZWzYZ1Mw7YNIsdp51lV/GVrtFxYACgMwxbJtqzBw72FmBVeT9Ok8AKUfgLPJeFNdTFAnEFHdSRUl1ZI0iNaVq7YC44yBKRi7MewAdl21pwzVs1S7MvKzXq20dTBYna5dVa25gclvp45dMP/GH3CcyJQKqG5gNGWGB6n7GPYAxoY4oC7b0OSMvFURawN8Ky0RPA0IYAAWqi4d87xenSKgHu33viv2dFqAdFW3NHQ3SrRSOF36GhwwvEOxSm40bWknoAwVQsaqEkGlQkqANApnRITalXSchomX7dsAVLyAhtsAK93L2o4rpC9XqN3p8WsSo+almcniCwCG0iKrtA3QEiRcWjfy2kMV6A2OBgS9RmsfGBmJOBhGEu+n/ATAsGOBnt2gwlbG8tXedvH+h98XF1vrRTeSuWHy74HuHY5Z6NRFOwag4yxqUCPE4NCzU17ISBJA+XLeZwCvYEbwYOUcg4KNJAUFwPqOUw8lnb5PvgO+C+Gu2XnB52i377ITr4DCdzRsJJhfyDTy9LUwTYB8XU7T5hO8WqmGGapRPBORuPUYxzsarmfpD+nl3KH/cuufXM2ytw6U34W3qohfQUTxOX56hNJwu512OXe4+fz9LV+J17W8GXPP/YS1e/mZqFR5/U7e0CHy80+VWPJQ14/U4X5RMb+RNm9QLh+Sr3l42rhW3awOfLbx/MfTpnm0eoSDeo7RgpxF/Ub5QK5e2dLyWRJGWh60rfXMOf5TkFj899//78X/vfbdvZY3T5sUaFLg51LgV4PzxT+f+BUu7kd/7iO/KN0wasHyG3/o9pPA4hmqUDF3lrPk/Um/cc4UWU6yodJS1uj+xJk/SPUPSP5IBLignHoaP75Z/emOjUV8OFWHUorgL7maDQMzVt5d4ffD6gfW/JIqlMwBsQ/4ZYlFCpRXU4WCWQhGwPrzdYr20YgAFqgj6OXlEreVeoZSXSFJLFz5b4CXDDDsmJBWsJIa4AImREZEaUMdaAgwAlx4j/KzClYcl5+eAAkwH8GwQ+gGaKBgGI38kcr0TgyRtIS5iZ8MDcxcqIdJE365w8oPObCAfAAxIbkg6jM66Ud4tBFcrG3voUJDQLQj4mu4qg0j45CQPjJYgi8/k0lCkfpQpir6koQaMKeNenAuG2X5if30mLzMMDbygRG6QkXjhhXVK+0tCOJ1i+1FG7ESOmGQVL26ggm/dGUfsGNOFbigjSkbmSzqB31cVe+BAe+Due4CEKiypAco+7gfxr6L+uty1IBxujmVvlOAgFGYZFVm9mAENynLoHp9PDcC82ncggPGmRICCItUAi9PMK0aRmug/BIG/BXPqA41DPM6yrhzZX1fYMHYVIqikXgn+23yMaJ12BRwbJA6jZs7IWwHTHA7ZckcaijeTn9GQDRaJ8PbA0N7wlgUMJyGyhpjkfYY20GpgfYUMu7anNgWqb9FH+7y082tNgb2jwCsg73PIrB+AABAAElEQVSqWfNIZIwaL/MtOOgWkFDGkwjcdxIes+CUY6xKW96gkFQIKpZRFQvgB50EUPMCI3htDaqfAbQET2e+1/Yd4OucPHtRbxvBIH0GqcEyAG0S2mu3YnRx+0SDZFWhdgA2r7aIqE1E9XMG0TBAYFKg4CRInrxIAI4UTLHbfiXSfNvxQbH59LvicnOt6Lk4AVhgkE5fODpsm7ErxgjWNzk+RjwTpGGU5YKEY1v6+s76vnYonQDw+v76bjqOHT+qC5km7C7I1/sxb8RwZgTSv+bjMxRZAo70Lvtu6kRBIOEcY5a+F+Yd
NlbMN6pinSK1OANkqZrlHKAU5QzHDvtG4kaK47yXkHsa89Zfaa5zIAF+qR+xPlAdmwY8zc/NBbAwMJ6G29adoZLqHC/Ox/9Em8pb1uEP2SxHmkVDObS9UXgcc8KtOkjgMmlM5GPxdHWer8e9nAknMa+Uab334ZbmpQ+TQLuyrPRMKi8dpzrEce0w1feDFKld6XLUpzxk5zfFMzKhrHoJXv1TABb/9b/674r/8cn/ZXObW5MCTQr8ART45xPfFv/Vyn9U/Gpg/g94+qcfmUGqrefLP3T7WcDCzPOEmyf3tHeGzVMj+zzhMnM2DtN9/5pHzifvI+/avZzG5z0OYMFHUiNsgYUfNIGG0ojsbjaABavU8bHlA+vKnkBDLz5+YM0nAwsZq04YeYGFq61pn45dnY3Vd+pv+6yz+yyx0IA7AQvVoJLxZAAL6mPqYLDII1ShzJsPdgALGDSBhR/3xIB43WN+1CMYF+tDvZLkJK1uJpuPJEGR0Q1QIQ1kQDgPGtI2Cd9gyjkviR/se0nzoKX0lIEJOgouUh6NfGyHfcRqKu5obaPqGPuHB2HU/Z4I3RvYGmwjuThDN15/3bGV/e1pBgxeD2DhRevIPo2ENAZ80p/jJ+3jJF20nl6HJkoulFZcDY8X12NTRcvkFFKLflaGGfTmDYOl/r8RnsFNcUmbCvtN5swAeNrAhIE5Kk8COF37chuGnLEA8yljfYYdyZmSAJ6JFV6YwAGAgiv9Srm0KzjQkBhGFwsPXMfKQLaG0fUpjHILUhxdnirJGENqofcnY1yswbzrrcm4EBPcl1lWlUiDZoPfaajcDgowmrXRrzX2djwbt8RI0AKDPhhfXZfKMA+RRw/p7ccd8jFv6RyxCmhH0vBP7mwFLZNIOgQznQCbDsprh8l1nFqernDfYtOhV6Nr8tJdbBdljGtUPjlW9ANofE5JU6hrASy+22D1n2eMFyF4UOpwC5ixb0dgXGfwJPUQY3PbfwUtBQXzMOuCpx2Y4u9RDxNonTH20OtCPWm/uMB7VS+BA4fHxopJgIW2KLPQcBwwZUwKpSsCMRn0Y97vDTxevVzfCiDVi/RqGkAyBJjrhC4G79NN8CF1iv5FythxelicvH5WtO1vFgPXZ9GvGtwrBZUmI4yLGdzLjpGPkUxDMkB/U/0ABLqITu8x7WDcJWChAhXjjHaqyqhNRrzbnAej7jvh+CSTABZlGse7L4nvXhRALoJ9F0C0u0gSxca7rSMGF0+0yxBU6EEq7Migg+dKMvQSdYxUI6QglGex1/RlqELZDurZDpjuw9XtDN7B5mfnMOCexSPUYIAlQbr999HNF6V8Q9OMmFI5Z8S8ce+h9F1opLl3+94pedy78uGFuwliPrZjful2/xmalevfmIEaRyn7RJUgARdy+kwP0yTqpHT5zOuN4qxro505pWnc7p83gUWiS/NvkwJNCnyaAp85sCDy9uu35eTJBOkcySzrZJgn2zQ1ltNjmaR2k8PG5yhPzO6r45J2+VzmOabiMk0ACT66ASy4l84/BBZJciHjl4y3k8cUn0sfaZkpDXqVVoTBNoxeMP6xTzYRd4CFrfQ/z1kfSuaDzcef1XwZ7iuBBR/3pJqQmGbTZjWokFbATFTuZgUXARxKMEG5MiQR1yKYjnQsKMkMSnzwS/plCUUGFEEn2pa/YCQr+6ciaNA4GJjqEkxHSUNVp2SiM7iIfAQawfWwN96FYA1VkhPAxQHgYsvVamNd7B7iIYjrMM1G9JVOUsCPafRdLq+kfWKwvGgCUwgA/Nha6bSlqxxHncpr0p6V3itUNSJw3uRMcTu7iMX1MEvoBCHjcX1GdfKMEbJl/gUFrrKbn8z3+TFMP/eNV9CCjcNVVFbGGya9VEPSW4/t2YOxlykLFRfGiga+eg6yT72ufQWZFjiajTJuKF+VJ9XDGGgYWgMUeKYfZl7PSTswt4cwr1J1FmAw1Y2LVPpdNSztMlQ/miSegu5cNxm3r3BRq43EKeBCMKTdwxD3B2mrDK0qWWGzQFts4Ts8P2krckQ/hIvRsm09Mssw5TMw6BlY9AAQwgMSdfZd0AbkJRGzn+Da9ghm/4rzFujVRTmqVi3OTYeKkcy5dB4HYBkI7ndrG8Uq6fWQ5ftywpgw6raDYAbPU3OT48UD3OMKeA5Jb4TzJeoxAIBSrvQd6bUj0RNVi8CCmB0X6xt4+8K4nCjQE3OzxTz2IDP0lZHBrxiTeszSIJ2OCduqI4yan71fL/boM0HPDGVOIGFRunRIvbbpx3UN7WX2kQZ0AixuVl8W3Ue7xUjLZTEITQWfvrvSeIxo4Ro0D+tilt6SSXdBIoJrSlPGoAsBelXqpi/aWRBwjks2XI40DNa9Tx9nN9CxQEAa5zVtK3yXlWRWsyGPpTnPchgj8V4mcBHvts+Rs3W89D0sgYXSixP63X1IZz1XVYp3NLxIUW8Bv/UPSS/HSpC1r+hHKjMLqJifmcXWYjpibhh7IyQplPXJzYqUm/XPc3Xe53vufwnj7wzwwfaRSx+k+aUXoGXK1r9VD3AtF5Zonc9Mk7a0h3yxpfY2UpWJyp2J/HmfnKtk1UGZ7sNdmX08+acQebspsfhwDDSvNCnwSyjwWQOLv/5eYPEmpuI8CYeePDNtngz9kKTJOn1w0rTKlXI2dp/T1j9EHsePpz92PT60pJEB9TipNLlPx37UQ2oBg5YlFkopdDdr1F4/umGXAbNk/n7UZdpDDcp9CSxCYiGTzy9JC6gvFU719iCaF8xhBhZKLa74oEfEXOqhBMDNZ2QgAlSUeab8yxVPVz6jHEGE5SUwURl0+6zX2HtN9Yka8UKlxy+WNLbMDDZy2VHnqDw0DUaFNK6MRhv8k2gt05pUL1If+CGEyJE2AQ2eAUSBLKBhadANM3eIWtQOzKjxLt4hvdjGHa3B34IBs1wLin0qJzFLZT9T79hi7zXTpEv+9VAGPDZ2jhr7Tcqq4nQ7gCrUxExRLD4srkbHilsYUT0xDcNk97HX8PgYkHPCT91/6XMBU3uOdMXV8i6YepACkg3qSL5G0B4FWPTCfKpGtAZTr9cnJSXaIXQBBPu4L8hIbSIOgPnQvm6yUNVIKYmuY3VxqmqYQEGVHHXptSE4Y9XY+vdSnyXAxhwMrYH3BBYy5m4yzxo377Mq/QSVs1f89qgzrcfjEHXEHW0AC8YORYTtwxBtpsnFayQHb3QLDBhh0FBNGFcYyTHqoK3HMkbdQwAMpSS639XTkuDCFfRXMPWPt3eLf7u6VhyiVnSFRypeKFzIEm9jZLSYWphLEaphmJWSzFMPmdzvABZvsG+w/gK4A87PKV+QvrIwi43EdDGHStFbjP81vN4hkvgSqlFTPK8E6PHuHkbcp8UOebXCIF8RP+V09X3RDsM+RL9Ozc9HwMJpwIi06aLeSitWeUaJ1BVMuDEdnvLMJjE/LpA0zc/iIhYJywJRuI/oy/cAjhfU0T5Qra/9jJgl714X/Sf7xUQ7QRGph6pZN8wdGnCPY6cxPzuDcXs/8wtSIDwuudrvqr+A0j4UWAgqDHznAoFD9TgYfCV
M0E2JAPY//YND7Bmb0N33mGES80L0TZzZ62w8n98131Vv+WYrjUzvDW8Dc5cLJCGpQDohmPCneqI2Fs5xIZ11z0+JhXOektqYMwUVlMNEUnQiTRpAbWyeGB1zSCsmxyYjKrsgyXnjl271Obv+fL6e9/V876TzhsVGBYMc8adek5w+JbEhd7d836sfK69KTQY+bZr6M2Wp3qkn5TjVIqWt38vHeZ8eS+lyzdO9T9anrItP3q1LyutPAVj8r2/+TfG77Vepwc2/TQo0KfCLKbDUN1H84+EltCAGf/GzP+cBv3N+v/7Q7UdVof5KYPHqTZl3Y+L0SMbPibExOab7TqsxqXrPdLWPVlxPX4nI0/NIH2fpubwi770AFe6DSU7SBxliP5xWXGDharPRtiPiNsxZRN8OqQWqUNw3vSvmrsrptSmrQakSldWglGRU6gykye3K9U/2A6raWB4fcO0skFhc+jGHOYlYFrRB24gKWJB/JbWAKQkPOgKLUv1JF5+Ch0qKUQKKClgEo5goLSGD3lArrkA0KWe7Mk1jLx2jT0paSl9+aYsn4jCDjlATg1GLNAEsTJsYm2CnlGBEm10xRRKEyowMjfEBVte3i7cbOwRh248Vexkwy47yrSXlplVf8sv1tS5RJ+sfh6T0iI1nBRaRxPMyQdy1T3BBW+DF5mZ+pbiemilahkfCc5B2C6oLme5UcAH3HUw9q/hngB69B6nK1EWMgzaYe/tWdZ8Jrhl125XrLZhkXcqqhpQlHjLig6xAO2bcQk+dY42Yx2G0lXio6rMNeNFO4chxCFNnPWRELxBnqCrVR90mkHrMU/YMQGWY+kacBRjGMyQaqhh1w+Vq7/ESScArJELvsWeRhEaIHiNmRjeryhpQm/kM7dAdrp6rXrv6D3O9CRPtyFDdq4XBasC/lfFhAuuNR/1lb7sYT4IobSVUvXoGKPk9ak2/fb1aHAMOrmCmBRYdrGr3ohY0SHRm3w/VqGZg8JcwepcRfox9wyqgxCjnhMcr9tc3MUy5CHe4v36wXCwiPTBK+Eviobygfq+o3zCAbgJ1o1na8ha1nXUYclWxOnhvr0l39G6NuqMmZlRoGN9FjLinUccah2a+mwI4Xc4KLM55n48Yiz8AiLZpNyKM4pvlueIB5c4P9eG1iqjqtO07wO+m4J/x2nlyWHS9f1MMnB4QV4O4Hv2ot9E3qtD164IVFax5jJl7BJ70y94eYAeph8BCcG+8Cm0rBA/d2IIEsGAchcSA9DozUFKhIXSACtqv9MKxk20rsn1FvHu0x3cu5jVUnSLejNfo8yRFFNQQQwM6HaL2pYG26k/G0wiJBRIK41gIOAJMxBzHQkCZr2UIQmP+dPDS1i5AzxAetFaWV4o53p/xUSKLIwm0Xh/M0T5TbvleYx7Jdxr7nCauWIfy1v1ncrrqOu1tbJ7kJ9O3wzkhrpFn3mqHVb2r/HKi2j5yKB+KXZRJ/smTRJUylcDfqqhIyP00JzVumMBfvm8WHpunx6lvq2ziUgPQWNdIlhJ7N8ZCps2fArCIRjf/NCnQpMAfLQV+HFh890PxBGBxdyIsmduYSOsTaqQKQlWTKpNnvlpN/rUvg9fyNB3Hnt/7xceXa3Ujbj+6ASpqwCLFsQBgoDagxOICxucKoBEr8wIL6hI2FTDwobYA0yS4EJUJABLISOoKESDPupeTv3vYYxhJAE12Nyu44EMvsHCV0IYkYAGAEUgEaCjBhcdxzevpXgIWHsOAcC2kFzAxIb1gH/YVJY1dNU91SuPQb2JFpwAEsORy71Hf9IGTrsFoBIVz+mDd07OkvxtIz0wFEvSBecrmcyybbDr956efK6gw4ax2v1nbLN6i624gPW0PZMrdcq8nWw5yMk//uY/8PYvqpfFRfuRlyC3Sn1sqn/Fm+zHkLvpHisvp+eJqarboGJ/AlgF7BlaGdU2qDQ0KRMUxdV/XrgHpwykr6Reo+LSRpgeD4F4Yx376YYjfJMy+nqAcn2v04zor3xoVK01opQK6aB2M1X4Yf/rPKp3RPqUW2gwY1M7Vf71IbTHeNLpWBUfbCQ1mISVeozpwL9sdNgOCigmYTaUHelfS3axgBJ4v+k13qm8wZH6DxOLt5q6DiTgRXcUwzHao8cF1KplZxlB5iHy1cXhJPIe3eEjaBiDERrvaYHCXkBgYVO/XM+PhmSm6hQboCreXNusFyXgZAov/99Xb4vDd+7B14IUpugAVAxOTxSCMuv1omRqxz+NRy177AYb9ACb3gjGhbcrmu/WihbZPc//XD5eLJYEFdNuG7kotngCSjLotcz6MFGIfWu/x2+XXyfM3qvFsboWkxWjVc4sLxQNsHabpL/tJUKEHKtWhfA+l0wmA5PHb98UuwKINWv5qZRGbEAK/IRVR/eodwOcpgOU9K/tGq27D5WwnAfL6zw6KkbbrABbdtEsbFiOzT03gghVgoWOHc97rfeJEnAI0lcRlSUVEuQdcqA7lOwpHzrvPe0F9HKcCj6wqZVuTZNL5hJ+LBKT3Hc2LJL4bSl6cP3RpG9djlDHu+ec7d0L9Nc7eA/SFiic0E1AcAzRUhzpnvPpu+YQAPUCCw4lfgApuhXc0+r0fda9x7JMerqwUU+NTxRCSFW28lF7l99UxlN45jz7cvJfnxDt36ZeY56Mu1iZtH+RVfQzuPF3OW3evxVRmrmX7vFtm30j4qfwaKVK94tlUq/T1sh21RN7Pp9VBvmC5+aL7fNy4n2Y8MjRP0qbZrZaSaxXdzKtWeM47328Cizpdm8dNCjQp8DlS4KeBxUuAhXNmTIYxc0Y743NUmyC9mCdH93em4HJizpNoJpTnkc79j/wyuBBQ5J9xLKx8qETxcb4AUESAvFJ6obtOP9iml0EIzy2uPvKR1yNMBheqvSR1JZh/7qk3LpNatSU1jHomYHGhjYVeoTKwcCUVRlLe2A+7gCADiwxa8rkAImwuYExaBRMBMriGXrsf+dDD9p758JOhjD31cp/rJNGy1CFc0ZbMTbpPz9gvmebUPz51MnEAowaduR60kQlOqhMQKu5L76ymkb7tnHuNdC3mgYqUUov3MLVv1taL91t7xW5Wi4pyHSdprOiJKuUXtSjzL/u7SuUokBmqqs2Z6d0lhqmVgGS3vQPF+chkcT0JE4iO+AzB0WTcZUAFArcw6UcwbE9RhTFy+CGM+jWMmIbePa6AlyvhU9hBjLLqL5PfRsetI9lYY7ysnWBAjdTAlWjBhS5Ip8rYE6rOKdlwxXyJa3Pk0QeDbjC8Ta6/g5F+re0BEpILGG09TBnUbg7d/0es1BvATQlJH/nYMvMRXOjpSPuQE8aykoC32K+8UmJBfxvZuo/nHV8y4RO08yvyGgakGG/jKXYGb1i530ctSIaphfStuImbn5oovpqeKP4JEgvVsxz/Z+QvnVS70lvSc4EF9hX/+g0M+uq74hxbByUW/ajJjKHONDk5Gu+XweOk0wTetPS1ZVRweNGw8XAsrL7mWdo+ifH1nz1cKlYANBP0iWkFWs+QWLyCJsauuKaPXADQFkAaq1bWoiQMAHIDDY
cRvy4sLRRfjg0H3Y2uLdBTIrTLz3dThllg9vztO4DFXtFGnb9cWizmUYUap18Et/vk/Q6bg3cw5AaTu93fLTq38AiFStRg6xVG5kkFDkU33A9TFrErZohE7WuTVI0O2GvTgV2WYwcVqG4Am+DCsRRujKUrfey7FmqW8S67kOB7nRYsfP9DYsHedL57LnQ4J7nooUtsVZg0yNZGTDsZkjnoI50xLPZpwx7tNAChEoozxvMhgEo7EIGFr5lzWzgncN7w2LLMxswoW1fFY4Cn2bn5YmVxGZe6uJlF8tKYU6LQqJ+PfWyz7ml+SW32vNosx/vlhXznbpoqtc27u/m8W87AwztpnC8iBX/KecE0kb58qLqf07mPt6K6EHmUGae6pWejb3KqRkH5Srm3gFTInXZx1W+hW+7jqGNJk7jhnzLfeln1fMzBe38KxtsVTZoHTQo0KfBHSYGfBhalxMJJr5w+q++C1/KWJ0bP65On506gMS3fn7S9zi+tupXpyjTxjPdgqGL1jX2ACpjb9GEuVaFgWLStqIy3OVYd6owPtmoNqk3JFPsRbccbkMBCJjGABYyWoMK4FkltKTH06eNcY+RtA//0CnWhtySkFrqbNZaFrk4DWHA/JBaWI2NR/rLkwtXQSmqB1CQBi8SIdIR6FMBCpoT6ZcYkgE4wC3DM0h8iB80TMRNtSlBgG3P/SHO3REPpl4BBNhL1enzo2AeAkMb8PA56Q7N8nlY0U34Q0lz5j8QIYHXAyukOTM87GNR3qEWtoRZ1AlN9pc4KWxoHlAEHFuoe7GXGct+mrqZcc426mHtiivybciEfaACXjSsm4jUMjhU36oePTxbTU5MwvL24bMWVKgCRDigOGA+/Z9VfFa1DmOBbbAekZx+qOEvo4q+Mj0QEbt22DsBgd+hJCOZ0AynHGxj7J2tbxS6M/QVMWz+M+goM9hJGyQPYLBjVeo900zBqs6ziD1KmHos2ubYKiHmG9GafFX0NoTuo08LEeLE0NVY8HAMMRB0BM0gbtAdRbcrYDKr62E49RT1jlf0pjPhzwIUxJLoBLz2UewzwaaUcDcC/hukepuz3rKg/fr9RvEaNSBuFW4A2YoGiFReiw9hWLE6MFX82Nxk2FkpWzmmjNgsadasG9IK2fk9Zv0WVaZPnz5BAtdLmsfl5Vu+nAQhj8e4ofREEaUQvONnmuQFoFjYbgJQnL16zqn4UdiC//mKl+Aqjb9WmLnlmF2bYKOLPAHp6vNoCUKg2qG0DLw92DjDepLtGXekWCZMqVHOLs8Uj6j8NkBmQvtQ7RRTHSBnQKPMtGNwG0B5Q53P6dxwJyzBqPgNIcwahWRv1U6KyurdbHOxsF5fbm0X77mbRdXZU9GO8PdzTEWphqodNjCHpQO1rmtV85wtduB7jack5RlDQCb26OktgAcBwMcAXUbBmvzmvxLuN5NG9CwcuVvissV2cE+LFJXWoBlIv8xZcqV54oboW80ioOnnN+4wLA3I6vyi1UB3qCOlbBBSEBkbkPqEflFqEpDLetfS++c75s16OIUHFEDYkc4CKxYUlIm9PIbkjSCT1jLmknMLTu0hGtS3e39qNSM/9mD/KdNKgzCL2+Z31duMNLhNH2pw6XYu8qK9byp8c6pnEHXIqr1V52sb0VJST6dDIJ25+5E/KKOdXFp2K5GK+nh/M99P5BxXLySoaVCnMq7rbOKi3Pu7XC6SwpsSiQavmUZMCTQp8nhT4+cCC9qWJP02NTor5Q2PT4yoTY957rZpYy8kzAwjvueUPlExsPs/X3PvLoCIYXj+4fJR1y2jFw50i+wwqIpYFK3+qDRh5+DyABQwXTIDGkxEZW2YBRjNJLNC9lxkAWPiLj7EfZNrhz/blNlKT+Nhf4BdfYBGgAtUE3UCGKhSt9RmlHfdBRXvk3wAWSieSqkSSWiRmpFzpFJCU9hZpxTMxJ6E/b338xz7oBH2SG16YM2iTPq61lUOZHxiUayQMYewd0oaqV+yAyCekEYIK6Q19Q0WKvWUILFKnQkauUUj8jIOhROgUxmgDd6GvZHJlymEiz2C0BXTSM+hnvmX9ZMgS0IkB4J8YJ/5NsMW2WWSqGxnECnFBsLwbDLhPBRZDo0UnNhbGHZiBATUA2yhMt16adMH6N6gSvd/E2xGMJ+GJQxI0iLTioavhrKjPorYjU6KnJA3A9aKzA1P3lrr/7u1a8R5JwDHSB707fbUwU6zAaPcjfXgN87sDsyywmIO5VwKxz3NG5H4Dk/6SVXTLNAZFL5KDLxZmiy/mpoql4X5sJLDvgZEd4llpcc64FVwY5K4NGhvQ7Skr0Y/J5wl2FreOV0CI6lvHMOWtlDPDWP1qAoNsAEJE9kaV6S0elc5hshn0cLkws7gQ7cXGQtepy0gtBlHZMkaH0qkHtHsSoOJ78AwpwWPK+7eUt4sq0jlgrAVgsYAR84P5meJXqFEJgIzbsY4RtoH3lB6cMt5HqPMgdeuDBs+fvyr2AHDdlP3NFw+LL20zQMp+DrUkQNYTmP9XMMZGFlcqwguEpILgfzLe1OtGYAEA6UEqME75y4CiaVW+qPsFwFn7ivfUY5uf7mvBDcXp9nZxwrg7AQT2Ag56GQ+99LGucjVYNzje6g5qehvrxdn6WtG+v1V0nR8VRBgpBrpx2QvTrb2JalAzU0i/ABcy+qcw68fEjHCe8T3UrkLbCZnxHmI+KFV0dMZc5vvOuVKNpOLIe825adJ7o+8uaM87o8OE7FbWd9b5KwEL1Jso7wgJ4AFSCPtaqY7esFr4qX6oZELHCZe8V6FeCP2MuH2C9MJ31y3mqjiIk5iDBKb9qJfNoOa1MI/R9sxcuJjtLG0rTO5WmxHShfJvNc9wHu9xeT3mnnzsvfL4Y7v0dqc7zl31rZ5PvOf1m9Vxqp1zQrVFNo3ccj4xX9Enja3+UO1qebme1GcbZeTnUl737THMKZeSU+Zr1bn5NYqMo/xMvlzdbxRc/Gf/8X+Sb/+97x3juV/zvl5o7v/87Q2bIr6rY9gkNbcmBZoUaFLgUxT4aWChKhRPOwk70aTJ5u7HKN2PBB+U4+TpfbeYvNNhdV5dLyfXNMEnptKpOQELjviAppgWAgtX/GqqUHx8k4pFigGgZ6gELPSUklbfrXcAC6QWGnDneBZ6/wkPUTBtYddQtrEOLmyAdSEnVhQpSzUoflcCC5gtveX4VbIM1ZdkSLLUIoGWElRQRlaVCEYkAIb3UIWiTj6jj/xKhUKQE79yBdLjcgVUiX7QB7pksOUHIH8d4z6MSVK9AFgALhITkvrQukLcoG8wPn4I/Zkf9PQ5rwcHTlmCAduZJA9lXAi4c89VSVEl6iX69us7h8U+jLYxITKwiDFjWdGH1imBGMuTwU/G8ZThsb/yj3ufbQl3s6jmjE4Wp8MTxLUYKNrxcjMK02TMgyWYyVnAxSXpjZHwWwzL17D/OMbQ+BZVEu1ZhrBV+PLhcvEVkZ1nXVGnPnqLkrmU6dctrLr5f41q0GvG/D5SGCUDXy4vFsuLs0UnajYCC5ncWRj7JRh0jai1zdBI+TVqYaus3
l8gdQBOFYOAkT97sFR8NT9dzGIvoKta2zPiijqkd1zqGlVXs538bPsLGHCZ/R+U/OjdinqFQTHjrB3aT8KwfoEkoZ/8VL16Srlv8eh0ubWlDg/qV6juoE6kB6B+wIUB5zqgXwd910l7v50YDo9L5qu04jHg6RWg4ZD9BYx/K7R7NDtZfIvU4Z+wd9V8C8nIU5j356i7rdOv2jyoiqX9SR/gYv316+IQMNVG+5YfPSweLc4XX6KWJG0FTtqe/EC7XsIY64nqFvoJKnhhaR99C5NMIIbilnI6ABaDqCTp4cko3MOAMMegXqF2ATRKaYzC7Zxwi4rTmcCCfuoemyi6kVj00sfj0Nfo5sbweLe1WWytvSuO368WbXtbRcfZMa6JAX1424rgeQCkmUnsK3A1OwuwMCCdwOLg6CAYf99jVaAGsUfoR9Wrl3gqIYmAfm4uVriIkN9b72lTEaCC+w7jNOYTWDfmTcSqoE16t1JCosTi6ABwt7tbbOMhaxfAd6IklHfFxY92wK/zzwn9pBTtCnDhexuLKDzv++qrDCVNFu+SF4ybM4gh/jhqXg9WHgSoGAeACYKMHJ5S84DP+A+axpyQLsXfeG8dtGz5nunq28eeq9//2PH9PHKaXIbnn0pzP+2n0jWu1+srhdw+3qb0TD1NUDQ9wjP5jvTNx+aUy7L+1XXHaPmku8adOIk7+bnyJNL/501gEeRo/mlSoEmBz5cCPw9YMFvmj5dNzVOsE2OeTN3XJ9I6SfJkm55Nd+oTclx3Iq7/yC2fByNRMqUy0dlwO+wrXPnjp8tZ/b0rrdB4231Exia9H3cndpn+dj6q2b7CfdhbZOYAhqFi5P1IxC99VP1M3MCtJwCje0fsLGDmrilHIlLZKENAkuJTJIajAhYCBxiZABYyDIKKYByM6NsV4CKvdsqgBANDXqk+1sv6JAmA9arTLKs4xSoq9ShvBji4oW6VEXt5L/dZlUfuOe/zC1AB4xNMEeeChwB15JVBiLyVbRUO6EN/E6bo3cZmsWoQPVa/93Ezan1MEfWNrBO4sA/NO8YPKWIskC4BjNQ2n/E+SKtoYaX4FvWny/GZ4mh4rLhu7w4pxgCMp5Guv8Cd6RfsXfV/zcr3v36/WawHsIDhRmIh4zc8OlJ8/QWML8DCSM0yqKoI6bXJVf0zynJl/m8xCn767FWxjWH0GIDlEcBidm6muKL/XsuEwxQ/xPvQI1b/NaJ+QXk/rG8BqtaKnVdvY/XdPh5k5f3r5QUY9aliCiBwACN9BoM8AEOu6lY3/XnE6l8GsHRwxHfQ9uEJ6lR7ABYjS3fDJCMWKnqo7xRAZpn69wCmVC3S3ew6QOYccNHOffN2NXGAevfAYDuutEeI+Abc/wo1Kr0tXVPWXyPVeYIdyi7XL1j5BonD8F+Ed6WvZ7DP4GcQuQPq+BKw87v32NOwP6IdPdafsdzJGDiFab+g71sAScMrD4tZvDpJY8GXHrSUMDw2XgbPPqGuNzDHtzLHjKUWxlWLZXNfFcVe3KGOzM+HLcwokiglD8422p9s8twa4EejdV37oh9UXAJoL6BVPzYhA+MTeJUaCfWpYeikQf8+qnr7O5vFPuDihqjbN9hatKAO1dVJNHLAhe6E51GnWyDK6Dx5aEhtsLmd3Z0KWPTqphUA29/Pjyjw8a4ynnw/7bN4P5VSeM54EtC7pbnLxRAlie6RKvouAvAFF0pTtefQy5pOBg4AwJa7BUg75Nz5zGjrqtQ5h8QCAc8YkPAKT1iXjEMlhpbj3JDswnyHqAP16katTFAxBz2Xl5fDC1Qf9c/zCqlSRa1rfv+rK+kgzzOexbt4736+fj9dPs/7eIx6lqS5k8vHruX0dxJyYn6fqkdO27ifcr5Th5zozj7nmWuS6ZLO7+fnox/k6Tx177rP5RzjZknvRPd8z+WEtJmFJTclFpkizX2TAk0KfK4U+GlgUbOxqDeynEvjY+r6XZ5s706m6Yk8VXuW77tvTNqN45iQnZT5hcpMPo6Ps6vzrPaxol4HFRlYaP/gSvk5EgvBhQyVDXSF3DpqZKm/fe0qsipU5XaWa973Iy2zYHtk+vgfx34KYBFg1GDEyD8MuAUWMF73gYWrljKz/jpgWpMOdgIVgolstB17VhATs5JUobKdRnhCsj6AiQwukr1DVEhSxpYZ/ZA0sBLqamZ5o8bQwMhzr05bTiR6ulb2StVPJdNPBgEOIhowdJQx0gDcXsy0MY8LVMM05t4mRsF7mNz3rHAbofv0jNVl8spRwVOR6ppn1S3qazUix7SPz6sVkfBu0LIVCcUNwfEupuaKo8ERgs+1w5QSM4K+esjK9jeoB32DUbPM+zMY2H+zul5s4EL1DHWZFvTl22GyRrF3+OLRg7CXGILp3oVR7YEhdGV7AmnHCXYh7wEJvwNYPHvxhmCABFPDY9DiwhyGzBPFOfVZQ1VKFaavRlGr4tfHqrAM8/cYUL/kudP3a9CfiM9INybn5lCFYvUd+4xu+vy9alSs3uuNaA6gMQFTq6tb6WiXnUOcVcDCS+pgLIsdJBLnjNs2ymiFAe2j3CmemUOVq4P8BRYCoR1WuA0y142EQzWgxYXZYgbQoxqXpi7H0GSfd2KP1X5jUYzA8N9Sh7/dwh0sZVnuuZIEGPxWgM8sdDLq9re4q1Ui4/u2wSr+7wBqL+jXXaQWMqdKXTpgkG8xim5hhd+o3X0rXxQTC/PYloyFREfaCk6eABx+QLryHdKFM5jmG+p+y+p9K6D8lrxbYKq7Yd5Hp6eKmZXlMJq3/kqTNHZXLWiH/nqN5EMJiuACww6AAipjSEKGUfMZASAMAUxaqa8G6/200fF7csy4xMbidP09Eb43iltcyXa1GJU7RVZfIJjfIsBioQQWBwDR7e2tYOxtpxIL7XMGABZ9/Lox4PZ9DgY95gvGZwksAvwHeyiA5t0KQF6CC+jseySocHEiomfTJg2ynacEGAnU7PEuYT/h3MIzqvdFlrRL97g30C0ASgALAFq8i0hOaG8GOXqlGkIVb0bAtLjIfrYYGBgCUOluOWbrks3l8cjBN/DDrZrTqcfP3XJKny3fYKqYruZ7Oa/75/l6Tl+d/4KDxjfF3NP8/eHjVc2qWz5XXc0HXIt/VUXLNLTtg802lm2O5PHs/VQZVuTrpkyZp8ebNhaZMs19kwJNCny+FPjFwCJNmqnBHgcDXu4rMtybePNk73ScptG0z9d97v6x53lVO+/DqBGm1EprVxHggg9t2FhwHgHyYFZcDTSuRQAL7rvaZz1VhRJYCCrCaJuV5ZAoCAJgDsIjlKAiGPoPgQXKQQEsLuvAgvJUccjGgzKKKSZGBhYwkCGpENCo2pCARdhhlCpQyT0l16mbH/5g2oNpScyCgMc6xcfaj5f09SfdAkgk0BB2FjIgMPN+VEsqS9wACNLUNNUqKnQxrR9P2Y2cpzKEKIs8BC4BBGB+QkUKZinl6wP2JvmRjy5oj5BcbAEu3qE242+HoHO6/RXYmaeb1VWSUUktZMA49y672NylNrKHJq0wdLfTc+Fq9nRwFHUcvAPBBHfC
zD5CWvEt6kHfYpS9A5D5nhXfv8JOYhvbg0tWgFtQWevG888I6i5KH8ZZCe+AljKoSg5UU5pDremAPnyNEfJ3AIRXSB92kQYMok40jYelYcCLaj1KYVqg17fTo8U819phfB9jD/H969XiNQbQ1zD4AsmRkaFiaWm+eMSz4zClukt9St4aeEv/ZRj8FdSxZgECjkm9Qmnj8RqmXanIOyQJ2jao8tfCuIS4RS90N0r3OIbNrTC7qgXt8jumnpaLaXsxQ51+9WilWNFYnDQabO8DLLZg5FdJp0eoHupnnk+ozxrXb2D8T6HZJfr9jD4A2EQxTRlLg4AjmXvun8AMf4c9xwuAwXtAj0yzNh+t9Hnb3mbRAY3bYbiHH31djM7NF1N4stK4fQL6aifxCimAwOJ3SIH2AQMG1OMlRVoBqKBc3HcVfayuTyHteIBUqRdA4vvoNo2NSUhOYKSf8uwbwMU6jPcVfduq/Qy0mVhcKsYBFmPQPQI20ld6nNIW5ZyFgHXS7W+uFxcbgKCt9aKdaPJ9rUiPMOJeBMwsAirm2SuxOEAtaRtAqvMHR2Ubc4UqUKpCDfDrQYJhFHTrF++qIIsx6lgQbMjcu4XUj/7zXXO8+9O9bATzpJyYm6Cr74HlGJhPWwrBxQHj5BBAfET/nAIwrIuSR+1kHH9JesH7SD5ulhnqk8wzOVDfZCmtmKM/hkfGIrifCx0JWKTZIR72Dy9cvIHlnFK9iFWCDw/i/fTR/NKWSdK760kJLMr75atdpooiq+N8Vs8qZq/yQi6r9sAHh3m280a9TrlJ91oc80s9nTSwHpaVZqPIKF0u/6YdCSPTBFru5pHvmVetHuadpmsuclAe10qK9P5pGm9XpGgeNCnQpMBnSoGfBBZVgLxyMk1zfczC0WQn4lgDi/vOuWnWdMI1VX1OzTSK6/WvSL5hep8rf16OD7Mf0/hAJyY32VgkYGED0sc6gYnw9y6w4FeXWFgtmbhsTxEuZlFHqiQWMArVqh/MfUgveCg1B8aAulyrCgUDcIlk5EpVKFYbVU3wA69xppsShiytiH1ILO5JK2BElFZkiUUce436pdVPAUViUgQZMg7S9T64SOtnUDNoSR1lZGCqVLtIgCF9w1xJdbOGmZ7B2MvUQFefj36jjMiT5KFeQR28EjYRMkHQOqldlflbjgbhPC/oUJJzCKO4gXrKqpILfgeu8mtYDEBxs6q5HvGcdeCit5WqxIAxYVSZNksXGLu2mfmidX6xuBmfLi6wk7mAYb6F7hokf41U4CESiw3Ueb6jzL8FWOxvbhY3SFFk+EZglicBFvPEOhDYnJBOO5AFmPtHY8ROQLVoi2svtDt4i7QDRl3QOsHK/Sh66o4V3ddewvgNwJz/ZmEaw+We4gTafQej/fzNu2IdIKNkYZBV8xkAxZcYTq+g86+EQPuN77GbMKbD5v5hMYPE5BESj1+hbtQDMyjweANIeIbLXoHFNnWznySBBs7XqPi1cj7ASvQwYERQGiv3SBn0DkSgg2IA2s1R9m+++qJ4CMjSPuEIcK1XJSUla+R5Th72r2NK962qGGlevI3q2CV56VFpEInFMDQ1Tof11DOT8Vt+AHw8A1i8hL4aEd/S1nYkFe1720U393tYIV/41a+LWVbIJwcGApAYRFB6bcMcCwh+D33fAzD0JnaNpKIVaUMAC4670f+fmJsrlh+sUA9VA1H/YVwI+vT4pYH9U6Qzz/m9Iq9zJEo3AgvoNoFHp0mAxTT9tY9k6Jy2OdBGUXkTHqwBePdQnbo53Cu6DwB/1hmVqJ6bC0CFwGIKSdAk7zPjl3TaOhxRpzPGtGNVw22jcg+x6m/8h176XhXKLE1sI7hfdrTgexTvGH2aAH4JKqD1kcbZ1EVphHY/BtFzUcFBrxRWt7OHgKd92ngAbTSUP2OOMTCl8Vh8/0JCST+6mHFNO2NeUFrB/NVOn2lbMzo+hmevOYy253ClO1X08f4IhsL+g9Kc1OL1ciJ2i5N06LsYc0E6jXezPLxzPV8zfd7i2XKuMY+4Vbsf6UheMe75wU/s63nX61Qlp4xUiLtGvW1dfjb2ZfvqedTTV/l5cL++1DYuf+xejW7pdqMO6aHy2cgzV8I71DCqXtLO+15gawKLIEPzT5MCTQp8xhT4UWDx198/KZ5gyOqnwMlQpiRt5YToVSbEWGEvj8sEMbGbKqe8/2Se+HN69/lafZ8Z4djDPMkQZ+NtJRU2wH3EsPBDXEosLthnYKE6h+WHxIIPenILmSQIATRgXAQBFbBwFTK3q6y4LBlsNUa/loeec2m8rSrUNeUmBr2xepilFO5DFUopCb8KRCi9ANi04aYyXRPY1H8l0AhQAXSjPtI6fny8Gx9Jjmmb6kaVHrc0ganU+4y9pl2JoMS+kI5BQ9IEoxLMZgME2scBKqhLgBvL5LnE+SeAF1IMn48VV3TlvQ1zafmqeLjav47k4jWuTDdgpg84F5yEdIVKZGNt+znAEHVIqiOkiQHjn7KtMkwwRu0ExutaWC46YZZu2jpRJ8HtJqDOVX/VoeZgIjXc/oHyvkcV6khpBUxZHwzhNKpM/iaVHsCMbbNivgPgWcRb06PS7uA9jPczmP9XqPwYHE2D6mU8Og2xYm6l3sBQ6wFMt7HfYpB9CV3WiXvxGGDxDsP1Q9rbx/iYw7ZiBePnR6yeT6Oy1AdjrZvUJ6R7DGP+HLe8w4y1FSQtf744U/TCDMrgPwe4fIf04xV77T+6SKNxtFIGPSNdkIeb49XeMvL1MT+9k8mc93Hf9n354AEB5jB8pp6qbR3zOyA/pTkanhvATy9mApZQxYL2a3j0kkmdB5CMo8Y0iFSHEHARe8PyWwHUr1Bte0kbnwMwDqDPKcy9NgttMOmdvJNdMNyLX3+TVG8AGar8OC4iEB3j9ZA6vAU0GTRvE2BxBCC6pt7hEQrA1o5Uahj7kFno1wco8l28oO7jMsu0WQnBOxhsvWGtwvSfk8cVwBFOHSBCoERA3AQM9RnlGKxR4Dk02Ie+VntxSBuPGZdXuJJtOcXGAteznQCMvpODYg41ukWA4ALj44Z3RmCxicRij/xP6G/VkVSH6kctbhiPW6PYcQwhuejlmu9HSCt8b8t3zLfFcZ5VFKVBSCugewYWSiCUhAgsOvE6ZX8GsKAvj6DrEaBC+mjErdc1x+MFba4H5HMxQ8cR5u1cINjs5l0YgoZjGKJPogY1hWvZiVGM/XGT63jSDsOtmjviXfNCXI4/vpPxxpfX8lycU1TPcuH+vUhT5VVlkB/lAf/nQhuX/y6Oot7QwS36gHa43S8v1z/XPZ+nxB+v2508qnzjifhjsTk/L1R55rSxJ1FJknRQK8v7Zd2bwKJB1+ZRkwJNCnyeFPhpYPHq7Z1J02ZWE2d57HwpuMiTo5Nsnmhr02c8l6+bTz7Oe6+55XP3WXXnDrCAMZJxSaBCJjqpQQku7gALGAzvyUhbPSUCYWPhR11GH+beOAKhFiVzwC88Q1XAIrXVNggsYJ1Rw1EVC4YORuWaj34y4M7qPokO+rFvg6EJiQUr7gFkLA9Gwmv
h+UmJRekVSmARkbgpN54JRkpmSiPRpAZlA6R7pv2dY+pnHwhuQocbBknDUFU71PO2zZEPaVz5VbogGJM5CbpyXS80ttFyBGDWMRgnjnWHKtDyH6XQP9q5JEDnXnbFhUrdMkrrI1a/t1lNfoHb1rcw6hswo8b7uAEMWESZOPrZOodExD2/6Hv6PcqiTGN7tPb1F10Aix6AxeD8PJG0e1hFx/iZlfhQZ4KBH0UysAnz/IwV32cwyieU305dxmA2FyZGWc0n/gM2CjLjWzCja0gHjNS8hNrOCODhNWpbej/aZEW9nXrovenrhZlwD2ubXgIK9DTWS79+iV6+3o5eIH3QNeweK9y3rEQbdO+rlaXiy0VW3lGxGnScQT8lBUbVfoYk5/cYQRuYbR5pyb+3BBMN43yEBOAHJS3cewkDj3wHJpaAfvwmyMeI3wEQoKFg4QImVa9oRqGGeEUL17p5J4ZYSVevfgEGU/sGIciJ/cE943vsKKnRdoP9RB/SCeJ4tPMeraLmZEC/bxZnA1z00/YL0qdI3TCseCbaAHitwvS+hL7r7HfYHwPeilA3u8AWpKuYXnpQzM/NhWRIj1cCA0eMLm47yMN6vGNsrAEotmCcT1H10cbgmva30M99BG0bIUJ0N6DI7YQ8ummjMUeUECCbKQ4436fOqghdokp1Q70ENgMw6MM8eyMggTYn5G0cEt31dtgeRvcpY/6E96J9H09SeInqPdgBmBIZHXC5iDSrlUWDQ6Qw63gT2zb6Nn0s495FfZRQDSKJmcD71DAgagAphu+Iqo3OHzF3wLirapTfk2gEfwQajiElDwbuNJinQD+eYa7wnfC9df7ShazRtc8AEko3QsIhkOLaRcxnAENoIMgVWPiOO/dq5D0AuBpCQjGEdK5fCRAqUJPUdQQQJLDQWUGeP6Juvot2ULnleTfVn4vcy9dymtj7jM+6810l32qrHZYZxK2P5lM99OHBnTy5/VPPx33bR1r/5vR3QEHc80/jPmd3Nhdiot5lG2le2qJd6STnnW7kBGW6evJ8iSQupqStzDif5n1ZUNN4OxOkuW9SoEmBz5UCPwos/uqOxMLJPTUzM7V58neqzNdMUp9483FOmwmVr7vPv3zPvdf8qAfjy4c0ztn7gbbSSi0ijgUf5HC9CDPtB1ujbdWg9AgVAfJIKyPtxyZiTMAMdMLMZalFSCwAFKrMhMQA5j50lv1I1SQDfqBQ2oGhlaFWzYoVUD72qkP5gVf/2S2t9pegwo+5P5jRUHtQN5vz5FIWZoTjBCpKtShUKipg4b2SuZcJSWoX1AnGxTIyzaPQkoZQLegTEhzqZfvVzc6qatEHdFBINqBbSAsACUoLDGgWBt5kGDYdrm5KAwrKZcsy5b5SGhIxNOgT6+KqtuXYX4K7fZjHVxhQvwZcvNvYwoj2NFbKKS7yjYzNnDoLLOxXf+ZPRbhN2QI9dPdb8QTVOTtfdM8uFEMEQ+sGWNhHZ3jH0auTevwDMLa7MJRvAQ2vAQGqCLWR1Tz2FytID+ZgtjUG1jPTLgzjeyQWenUaB5D0wpQ+RVrxA56SVFPpJz/jKHwBINEzkUBqVfUUGGBpP4d61CbteYkUQo9JJ3gnaofJHcXT0COAxaOFuWIR9aoBwSpNVNpgkLdVytTjUwfXlLT8OR6jBG075KUB+O/erIXE4gYmdhqJxgzqXbNIVbT/MNjcHiv7x/xO/QEONMJ2vHQBEAagVw/jqwfPPyO2izZok6GnLGNtbPD8FuUfIjU4p7wpPFsZi6KNd0OJkoz5b4icbbtlQgUGuuRtp36DALIz6rDPeNpg9VybjU36d8NVfQyvz2F8QabEzoCZHSfgHGBuj7TH0FlGegwaKvkAKRcH0PKAMWmdTgBqMu+X0JwXK2xfelQPQmJxS/+Gq2j6sZW82h0WgIsr6nJBOy8Yb8ZbuSUPOq3oYNx1U+8Wnr1iDF1Bn/CShVrXBJKYK9phmesw69eqcCGx6D/cLSYw5J4PcIFxM51lcLwN3Pfq9tV5xE1goYRiAKnFKOBlEKBrNO4WVPIqyYPSR/rbd1WgEUCePsnzmMOaoR5jKSR+jvX45+uAupt1pj2+u763l7RLkHGClCX2SDGMZXFE/xkV/BK6xsIA+Wgn1kPdhlD56xoBIBHTowMJ0BR1nUc9zujxvdQpAwvfuvtbvHf3Lnrt/rxtknztY8/kLGxbmqVss2c8l2+We/P5eB6m9Im01dPdSc/zseWk5Wns4hp/8j0TcqN+mh6u/813y4y8FZf8wy8/XyazLrkK9VxolERqXLpz7nXulzTJiXLqJrDIFGnumxRoUuBzpcDPBhbOjXlzos8fl/o+T445af0jkNPlPOr3ZEbd6tc8FljEirbHpEkMsW4bcbfIBzWBiwwsXM1NoCKMt2FuQxUKZuI+sEgSBFWhYPj5hcRCYKGkAmbNvYxBksKkGruab30CWBAk70oD7lJi8SGwMB9BQimBcOUaZihUoTj2XvxY7Qygwb18LYBF3Fd6kqQGWXIQdYp6+d1Kn+1Ms7Rv0EkbgbCJYB8fOuoe3zP+yNh4TQ9PrqZKW2kpYy+4iL4ifx/x+5jADLCBC/aHvwwqzMt+F4TxnzQEf4MxMtjXKoDiDbYHAos9VvdVTxHEkDrVPzL3EfNL4CLuxwAiM5i5FvTaWyeni7aJmaIbUDGO5KEfsKF61zHjwDgNvRyrunQoAw/TtSmzDwPcBa2+gHnXHa3uT91UB9qE6VQtx+jRwzDgndD5Mav2T1FT0jh2GtWqRbwiLcFkyxDb7xsw0/4OATODML87HK8hhVijXVdEeE7AAmPg5aVicW62mKI8A+hp/OzzO7R9DQb4JUx8L3WdB1j8GuBy6iq+nqVer4Ya1h7168A17hDAREPzSaQWXtMr0jbM5hmM+JmqSNRfJlZ3tCOknWFlXi9V5zC73dBkvLsD+5P+kGrY5nXaLBjaBXgd8nwPQKLLLoV5lXGfRo3qz798GJIMu3ELW4VN2qhheR+MtYDKSOFGGjeuxA7A1bgfu0guTlR1o22DGDYP6t0Jm5cj+kKJ0jFpBnhGD14y4RcAg0vocsp7lpwvwEiT7pb2cSEARJv9Tr8ofbgCvFhH0GpINeDoi1vKuYGG4TGJMWsMjFYWFtqoVwtlaJCu16le9iMw1WMYo9/QhkP6YYt6XwAu2k8Oi97Tw2L04qSYALXMAMK6QXzGlTAui5Gu9d7kpitcAU8f5Q7TPgGGdhbGgzAuhJ6iuny/BRSkVW1LevnyxPwFDWWzM9iA6tW7FvMcr6MudH2n9Ffre6sq5wHg5pj36ATQegYNjmnnMfU64ae7WaWRSlj1QtYLiOgBVNz2DRaXPdgkAXymAMDLjOUFxvEA9OxQmmK14l80rfpjDf3/c7b6XJ7nn/pzkVe+8Ik8fzwPR+CHD9bLuvt8WVi0rTyOx8s83Jkl24e5puvpb75bJq5okq6H1KFMYl2k5c/aSBtb/QGu5dK8l7NqAotEqubfJgWaFPh8KfDTwAJ3s8FQVnNjngKZDMuJ0isyOXmrfwDyNfc5fb6W07
nPx97L5+5lyhKoSKoyfnRDapGBhat8gAe9QKlKkDxDCTCINcE1VQV8poWvQgSvgxnQ5WuACRgcgYU/GXpXHFUDSpILP8K0jP9+Amx+BhahCoUBdwUsYMw0mg468Ewl/SjzVaUpgRk9RCmdyOUJLFD1Ka+FEaj1oI4NI+4ShASzIuhRbiAto2oV3aQVbIzki2vWtdL1hiFM6kZKBARxZWeakumzNQAAQABJREFUhvMbvCuFBELawqzk9nI7ClFiIbgwrZKZ8EpjOvJN+SVJTerfm/AEpT75Orrqq0R1foc3oC0Md12dvlIdKrJNthwCEuseKloyicGEkQDpzS2gomUc3XcMt1uQWnSh426k7UGYYmm0qwSBOhqMzbLPqM8xDJd66heUZXTl3yzNFV+y8q8hsmBUBvsNjNkrPDx1QdMBxkA7ZT5FDek1Add6ufYAw+uH06y896IDz7kr4Fvk+4oV/3UYaMfjMQDjkLz20fO/2Ua96RRVKJj76aXFYoogbzJyo/yMp+BK+CYrzG959jXlDtPfiwCLL0YGik3BBmpQTwnMJxPeisrQIMbEnTDCXTyrNOaAspVYbMB8n1HmGQDhAqbTgdAPgJljRf6riTEC+nUj1YDhpL6jMPBfA44EbIeAGkGRgMo81jk+5bqr3hpAj9Djy6jR/PlXj8INrs/revc5alkCNd8HpXwyzI4OGWRVrN6bJ4bjh6Qzcra0Mn5GP8zsBeNaKYcRzNuhkQH+BBa3MOY3tB1kwzBkLMBQ3/COXiL1uCa/FurfRpt0EdwJwLhhxf6Ke5fQBue7dDZqUjDNN7SPl9XBji0N0hvq2aLkQ6LEy4F6EGNUG5ZexswtYOCC8XDEfHENuNBbWBdukofOABdXZ8UoylLdZHfNgoHSLo2mlSA4Httos0E1jQyuOlQ/9e9TxYr2dHUSrwMJmvdi4QBG3/cz/lC3eEdKYAHh4j2yenGdfF2wuGQsGZvCydi5I2yVKH8HJwKCC20stGcSsJ/RRsGFaV346IZWfYCKXtSebvuHirMOVOewQWpj/ExhyK207oHqfkhyHO++L74rUUfrWW6+8/7/1OaYcItnbUBtS/ecS2oX82GZ1BLjdi1Rmi/sQp+9m3+eVyRkvldlWZbv8/fv5TSNtuRKsbcu7OJKvsylqEftPB/aO265ubksq5qulRlGqh/5c6/NOX+fyHmmOtw241j8CBmbt5oUaFLg86DAjwOL7zDeFljUtvtTaXw3mGXdVx+KWvpPHdY/CnlyNa3H+ed5AhUwnsEAp1V1gUKSWLgqb8A8AUX6JYCR4lgILrKNhR9UP8QypFkvOsWycLU7SS0CUPjxjXQwATKsftj8RwP9IFyXqlDGbriQ+eBDrzeZW+rhJtMlAxZ2FgIIQUUJJLLEIrmc9V4CFjJcWT0qSSksGxABMy/ASNIMV0E5DuaEWlGf+OWvpYX7AZN+HruZgC1JJRIgCPURmMoACdJUpscfx2HPAsOYwYXPCyjsK7OKfiGtfRJqSzBuPBxlpkIpmRVXwZx++TdhjFY3DZq3zTE6+TDR2gaQKGqd8k0r+gEgYTJjLNBOlmGLFoxPjV9xNT5Z3LAK28HKsJKAAZg8W7am+1f2qjhJJ42gj2FOr41xALDQfuHrh8vFl6xYz6Pnb71lrN/AmD/jWVV99HvVDs30ZGX8gElAywPUpxZg1vthXKXlMfV6x3NveU4gcAajLqml5RXGwNfra0U7KjRKUxZWVopx3JfaT4ICvSuNoXL0FubwKYbZj7dZKYeYqitNwaBuInnYQGKxCwM+jCrVFKo7swCbVtqUA/gpIdCz0gskA5u7hyElOBdY0F5VtRap729wbzuJ2ssB7ZaW2pN8AaDyPTkEgG0DKlZtuz+OBSvaONxQ7igM9Tyr23+2tIAkpScMrvcAHd+tbxevAYT71D2PZ22B+ugDBjUSACREPH9Eu65R0zFIXnheA1zcYDR/S5/Y3/0A/B46ysB6Z9D3Appc8TzFMlwcg9fRb1fUDTSC8XZ3MQIYGEPi0gqNtafYASwewHyfSXfeqRaeh8hpXMJwgwT4Idm4LMcjNFb6qO1BO/W5RVXuijpo98GLi10KAeh4l4euz4sRgMXwFZG/eadvyOMCkKjEwHHsgoGdHe0C1BgwrwdQpKSoi3NVonqxDUnxLagXbfZdinkL2rcwxtxixDMmkg0XLXcAOXjZ7C/nMY2zHXBOypbveNS2IlSjuKbU4gSQpSqUuWqz1Uef6065fXAYFbGe4rQVQOccR3unkFQsI63TuYEqf33OM9AFqqV3mjzifSM3q5O3PI/n8/p8fP9eThMV5yTnY7pallFelEUC/6UZABKYLj9UZuY16fexLZef9x97NpVjjVINSjJbu0aWTOi1s8Z1E5c30vNlHly/X1bjoQ/v5frV0zTqka5GzrnttFniNSUWdYr9cR0fnt4Uv31+Uvyj5Z5iqI9vXHNrUuCPlAL/n4GFdHHCdCLNk+lHJ+x7BDStE/X9yTpfy9eDiZWZ9WMt01v+VKOw8p57LLAQVGhn4V49/7Cz4Lrp4HcrYBFG2zAmCViwhxnIEgsZQn+hdlS2KT4wzvv8ZLq1sTgPYJGMtz8AFjCFYT9R5htAhvLcRzkBNBKoEFxooJzTt7BSH2VbB4FFMCMyzqUkI18D9IStRaZ7fKBoZPxXTE9l/dk73JN5C5sK6JUDdyU1jZKZjzRJHSOYIjOKx/3DFn3lzr6AIn742cevzNt8YZOCKVBatIf+/erGZniHWsMw+siVdpjcvGXmwlVbAY7lhqUGTFEr3mzaiF1RTM6yyo3kgmsCrG5o6pTsCu8WTDmsXMRmcIVW+wilFbpBZT08VFbmF+bx/jRYTMMI+pySAz0LvaQuruRfkI/A4gwmu536L6FW8gBPQVMwtbZNo+ltmPUXSCsEFdo5GB9CprgdVbIWArDdEiOhC3Cp4fTSg5VijBgClzyrytMI9TIgnpKSFwEs9uGBVWO5SfEZWOk/I98u+tNylydHiwnqa9/JBPZjP6FhvYy+kbZl9HXZuo2k4Io6a8exhErVr5cBFkgdDkhn1w0BLGQobcMpbbCtr0n/krKe81Ot6pg63VB+H/mPwXzPj40Ws3hS0huVdiHPkaSsYni/RXmOJw2oe6HPOMxqF/Q8Je/31GlXaQKr6DL3LTwnCOlCMtEFM6uaGXHTQxWqC0BxBN38nfCsrF38490+hiZXjg1+/TDM45QzhyvgIfJytV73usbd2BNcOFbI49aX2k3mX6kh/d+CRCxGrO8L75z2Ed20rQWQoUG6BvjJixsSFEofxNfb0M1lMXKDZyoAxS2/y5MjjKONT5OBRQqwmcAFqnO0oxMaOX+oAtVNfT32HW6hvlEB2uecFRKCspr2aVYr9D2SphlohBQWsJekF9paCC6wowAABYBlzClF0d5CI27L6RTQAGa1p7jtRiLTCnBjfvCXgcVSKbGYhJ6CUNsgsHBsxUY9/Of/tFnHfJz2eU6OM9tw93Z1FvmkRIkE1Z0YzuV0URX0yXzyY/kbkM9jTuMk771+P01OW
79+t76Wz5WqyY365GdTB6Yz26RTirxBLrbGeZzdOU0n9TrmZ+v1qB5JGZLEu01gkWn1sf2/+t0Rixw3xT/9CvXGQf3W/WHb+eVt8T/8HzvFt4vdxT/7Fs9x/0Dbk9Xz4l/+b9vFf/kfjha/Wk6quf9ARTeLaVLgH5QCPwEsfih+CHeztck8PkSNOjqBOiXGRFoeO2nWJ/ZG6lo+n0jjc/lZ97GKDuMTzC77+8AiXM/yEQ6DZZjZcDsrwEClQdWBsLPgvl81P6jq5t8HFiFREATItPLhDUmBH+BSxaZiYGjoNcxkKguJBWoauvt0pVWJRdDB5wM4kJd5CihgOsJ2QmABw1MdCyg0+iS9THOoP7mnjkn9SPCQwAU3qVv6yYwkdal0z3ItPEe4jlP64pZfbEFTmHboF2ACcCHIiHOYn6C5NCZdABDT8y+YN/dB/+hVzsqVPtJEOe5h/GSA/JGYy/YhKjhhZ7FZvNI7FFKLRkyL9KzPe5TKTeXcQo9bPO50ov7UObtQdGJf0cGqsNIembLwcsSqtsHDjgEWel3qVb8dHXnddJ4BZgbo/2GNWWG0+zEmHkdtRcmBzLtG1G9xE7sKsNjFjuBIZpg6aAuhh6lvMJaWGRtFwmEcA2NAvCPt422kCjDPpldiJlPaTkyT9iO8SCEhGeD5xQcPiyVsLIbxHKSLVwPxCSwekKd2DqtISZ7pnhaQostb23AOEOqGGZ9FUvGPludCBUtJi9IUGow9CeOFvGWK12jvC9SOXsDov0LCcsyzOMMN4PTNwiwr/INhdG2XqIqljYgendw04lb96znlCixU7VK6cQ24aOc9gfXGLWl3MQ6g6geU+K4oSdnGhmKf8uwpjdxHASt62eoFNBxCh1cAC5l+mXC49vBQ5Sd/kLoMYvuhjUMPdOhizNoO42fsUJ4qVkohhaICdmNP3FxwBnAQkE1A/wUkLkqaOnlWw/HX1H2T5zRm1zvWpeMO2sWY49qtoIp6KzXQ1kKJwgAqQAbKa6Us3RPvueLP+HAecST3txAfBMZxBCPuNqRstwDFq+ODcL97Q/+r+tcKQdt4XvlVO2nDVS91lEahXkmpzhW+vzL8vvfZxsKxJV9avi4xTzpX+n74nqe5QfWnpEJ4LsCiLdEuyvNB0+qONqSy0OESANwK0O7AiLwd9cCiC0fHHUpkcE7AI1ijhJeucfp/AYnGCn02g3RM8Gg/NDy8UTnLqO2CflQ6z8Hp5t2/6YnyOW/lecbj+mXP3SI/D5xVahsn0sfn3f9Ymfmp+wx79UzO+F7d6+lz2lSWOaaH7tYqahT9lMrMGbtP96IdZYe6S80v79lf/KuXm/L58G+uj3fiaTJqupv9kE5e2Tq4Kv7b/2Ujbv77X/UVf/lPhz6e8Gdc3T++Lv6b/3mj+McPeor/4p8N/4wn/m6SNIHF3w0dm7n8/58CPwosfvvdDxHHIn0Iy0m4nFBtmtfzLzc1T5D5/Mf2dyZW8srnsXLtBM0vJBV8dJM6T1pRzx6hrHzypMLqHh/ecDcb4EJgAciAkYh4DqQjs5IRSF5bYrURxiNsLQQCMAOCjrrEQlUoZ/z8kRBgBLCAobxUYoHahB5tBBe3MERu/469Nw+uK8vv+w52kiBBggBIACTAFWSzl+npWXq0WDPSWLI0ki3Llix5kWXHjisul+1KUpWkYjuJEjupuBz7j1TiyC4rKcmOFkeWbVnLjGRJM6ORRhppNEv3NLu5giBA7ACJfUc+n9+55+GBze6eHlkaa4RLPtz37j33bPfcc7/f89skAkX6sGe8LZmoIxcSiZBSSDQKsbBsCYagRO8yfOgTCQZ/QjUqr3YW4AKp8Dzl+SFxvJzK/Yg6x82gUnHPuH/2qSQBMFZPKvgBsBHYSzj4Hv+y+pmeeUI1qgK6luPmaIj7ZZ6Arx2kOLv0eRiURgoWsFn1ncBu4R5xHnQ9+xBQrGRAe4rIoKpz3OeoF2UCmDTYPjx4IR0/M0DAttPhsegI98p2znFPjTkhODdCsaomSicEjtuLjwIcDkAMzuJyU1ez6+ib61JYsOrqv25Th3EtO8K1rrSvQi6s8wlUb4ym/XwP6kzopmuvYGA4A8x5zXU8Ro1Mz4XKlIHy6CiMntdSEy5LTwBOezG4vvaOd2Bf0RuRse8jnVCVqatSSVpjDIbROHndp8wHgPwpiIG2Eu303yDE4t0XB4JYtHDvJwHrxrNQzauburjXLa32IQaJM5K1/dnKuDsLgL/adwoXtUcB7sRiAKTr0nYAUN7NR0CvJOMebb4PmRinrxdY2V9GirHD8Qbuk1KbI4DTk8chAoxNvYkZOX2B++Wzpfeq44D0s9RT9RoDSz6gjkYLnyEPgTrsIOxVjmDwfhxC0QWh0yNRN6v7kjrVkIwY/oA23LfvuUbJjg+ZDhAaaUsrP3URayDAU7TrHPX3vmk4rtRHUqIB+SLP3SNW7xepH0w/E3ufQerYAAFqlgQxZnogFT3tjBCuV1rxkGsW2CvFkdBKGNp4rtslDBurYSi+recopRW2ifsseT2MncxhHjPtNkD+kCDshcgngmTy3fwlSD4fLhxI/lVnjKfSNvrhXKg20ncuJJQYN3RNpf6kdHXP3sLjXBLPtVIKF1l2PUj/NiKt0A1zasVehfu2wzyyzVyxAc+yT5VwGo9E1brzuNN1TGvgr7vk8A7lM2rX5z+5nv6s28p8XA7V5kGuqZrDKepTbRzdn89eoqr9JqwaFGXvXRnf6o5ZVinffZTCsVIH09efLznt3+8BfKtSv5VrPWYf1//2WEle+j/Prp7xHGerDEu6fCb3hYT18S2OcM2+M3WVKu06UIV6vOfy71/47GL6+EtLeMprSTOPttLf/tPZo96TU7/1Ud9Bzmm/l9sBsfi97O2Dsr6cPfBFEYtSwccnXyfD8ilp3JdJsv7YG30veZZr/P04sSikIqtDsXrHy13dbUmFDXDlMqtCEceiIhahFgVYqRELJhKnEYmDhpiqdYQLWECAcR6CBNQRiwAFTDy+WPJbLb9sgNmUKwBA1Up97Ipc7ABqlBiEakPJT7JCOVmCIZCQXGSCIbEItSgAgVKMYkcRkgvqEeQi3mqAE/uZegusJRfxPfa5/61k9eqNtCHZANiExKXuVWb9XP00SngQC895zH/0T5ZMIMkQpHPcFWX7V9DoOQmOneHLMUsZJCmSCkAP4NQAY5bhiq797H2ZxM5CYnHr/lh4htL4WA84+QVbvYKtQ5RPHXQve7wztZ+7lLoHiOIMsegEJOrBSfKnvYPgdARiMY5+v65XtwX6rDQ3YeeAUkh66vxgujw4kLrD7Sn3i8JaqPtxANkMwPKeMSsA5XqPkpwweLBP6EgXTnWl9/QTTA9AdhRJg3d8iz7QzaxRuW9NYuANuXgEMJa47hJ9vYFozj2Qh/PEtngeYtGJasoG90bJhGX2oDJ0BaKzClDVzmGSuo6uZnWsccjFKlKWFso/Tbpn8GA1iG2HHq7GtKdAQtFI/wqOXXnfof0z5DNOHsMA8wXIi3XvYQwPQGyOAcYXqPMjyIPerXoA1icA
9brdXWSlfprjc/TzIuNwlf0GecdY4B7rXcs+7sULlZGujTsy8YiI1YDmXQE1Zeie9xR1OQ/g11PSGKB+lI+qYo7/FvLTWL2bejgGjCouuDf98Vby5LykYhhCdZt7N087NLbfFeTTN9ZTtaITrLLr6SrUsyjPPL0X1l8pyRrP/yPbg3RqHNfCu1yrChYPX2oASDcAul04MGq3bn37UUVzvlhljC6xCBDl0o/mo5pZA/VSDa4BKecO55XiSCyyah9EB+mPLoiVHrUR66IRQrGDutQaHqvWGXN6mfIZ0B5D8L/3nPJ8+i8eG/bOL+TTTBtznJvsRAKLgniOJBU+x5J8+yo/9zSdTWcEoVrFvWmCIEgqmpDsIapDJ6+FtjcHodAFsyR2lzmlHUJ6ErI5cPJEzTvUCcouUgsKY/OPtdy/lXm5HC3zs7+dL/K1/IjGlVR1e57p129VX3Bi/9m6/F5/EfNRmd04+Vh5uZ71uZWW5GP+ylfXp8mFRBVJkK+gVVWSsveExZXeMV0+Zw+8Pr+9lDn/+lZG6lrG1Znqd+lb0/zJb/lQufh3fR9Su6o/Sx3qC/VYeR87/68z7v10EYTx93r7Bz8xlfq7W9KVM23pX/3qw/R9HzwZqkz19dCGYWl1O8jHBupOw1MbzFPEDOK6FqOB1m2jM5tIMhuJR5NtHeqvVVXq7sR6XHuhD7Xc6tLxOeZwJCd9XdjNHXu9KtbKOvGKplXDRgJ6FM9/PSyG1RV7QCzqbsDB17ffA2uj6Td/4efTr31+GByDzeKpp9KLH3h/evGZs2lPM5DndHY4vfIbH02/+Nvjqen0s+mDf+qPp+cRzO2sE6PppQ+nH/2Zl9LDxq409M3fm/78i91vvx5fxBVfNLF4/EVj3k48ZUIq+3L8iyi7lqQ+b7+XT4BZUrn3mC9WJ8MsscikokgssgrU48Riz+UsF8cLRMCrXnTRj86G1RAAgJJgKVSMaFfWfWZWcOKtJgcn/hqxCHezAExWTjcEGgAX3zqC+ZBYAMRCCuKeT5ALys1qUB6TcGRSIbEoxtux2vkEYhFgJYiFfV5JKJz47R/+4NgpPp7TaFWPQln6krvZJsSHPrSedCpwJvd1vOY4FH0cfQ1ZII0gKYgFoKt2L6I8znEf4l4AqGLlFtC2DdkC2kXfqrrkNVNEax4en8AJwEgmFqp6AAKtc4Ao68JHYhH3HbWnphMn07Hzl1PvwLnUTxThTsCQfvjbBFX0zQxg975qRRCDaUDqMuo6W3hmOgwo7AFMvfPa1fQUhsinUYWaQu1Jl672lAbT2iqMAspvQCzuzxAQD1uFbSQIfRCLq3096b2DqBQBSJUQOKZb6EvbMcE1t/XexDUPNFimvWG8jz3Hacq8SPTmd1+9il3HkQDtt8nfeBXdEILLeOVZAagaP2OO9k9Qh3G+j0EwNIxW55+153QOUNyP+oqAcIy2TUBOjBGhbvxR6mSgt1XGgLEgVGVapg92AOeHGXsnMSrWQ9AGaTVilzgepf56vNpBMmHANZSV0hbkCkOJII5bnqOOjNxYldd4/Dz1P6GEgDoNP1wK0qARdushYiVw/BjlnwXYquY1SZvC7oF2bPLp5B71AuLPdx2POmxyn5EfpGtEApeQ+G6fhNgNI3F5BYnLA4jRQ/p1G6mHXp18Xg6xst7V05V68L50iracov+UWIAPQi1NVTjrPAugH8fj2N2RUVgTBMthLdDGTqQJNTifr8v05XnidQxAUpZ5Rlchgy48KLVQtUr7mRX6TpLoPdBOw73SE42pHY8+d9poHKP+J1zxRzrVjKQKY560/nAOcoG3M8ihhMSgdUYwdyzXNu5BqDY6dnn+GyUVtCsCYtLesHngflqWz0EDfbjLM0xzqrknP+MbELhQn2I8tnKPmlWDIiJ9A9IKgwuq/qXtjxIcVdy8Z22otik50n3yecitKlGn8bZl4EPvH1WrbaprlS3qUn5U+/r5nSfVhzdvZGI25Wd1NNpS+173JVJzQeTh8epCd3XVqbvCrPLcHQcp7/X142r7r75BVQ776/Z4KbkW9eWajdu+7GxjddwKW36pf1xbKzf3hdd7qL6e5fKoTynEhHWb6f/kh7617sjv7tffL8RiBILwj392JmwTLvW3pr/7Y5Pp2sCh9L3f0Lmvgz7y2wvpU6+tpO/5us70Ix+fB+DHU5S6jjanv/zNXZDsTCKUVvztHx5PX3OtPX37+7JKVblW1agf+dg80tV8x85AIv7yH+lK/+5Tj9JnboPm2LyHH3pPR3r/s87aefvp31xIn7zOAgP3UAnxJi/k3hMt6T/9FiPf875mOyAWua8O/n4pPYDWxM/88/SRu4fTleefB6s0pOHf+pX08uKp9K73fzB93dWTZIot5t3PpF//lV9Lt7YH03PPX03nsPfs6T1JvK+ttDR1K/3GT3wkzb33m9PzTbfTL/7scvqGv/Xd6anGpfTpH/uptPPB70zPd7eBBb6U+u2/5s2JxSvaWIzsv6Luly+b8sIp+7rTta9vdK5+4jVx+e2+fC+kwtVzV1f9nQ2oK2kF4EA7ipoqVBhW85uXfLicFSjE6iMggTIEpiGtAPhKLgTyEorsvckXveC1Wu2v9lE368c/1WO2MPbMcSwAaxILQJGqI1Q6+kOPPhqw1pOI/J3yqjJz1G3Jh9IK0ruXbFQko37VM4y4ARvRj/Y59WKJMlQvJBTq46/zxZgDvoKDWJCnuukCl2ZQWXa1S/u5PnwdUVeJmv3sZBgvSFvI9/jQzyGVEMhVfW6/+5HYZYLHPaBvDea1Bdhy1Vb7FHXb7VuvnwK4j0xALACA84BtPdpsU1dXZa171MF6W659jOvOFiQWHecvpd7Bc+nU6V5APhIm2nOYvhFkr1P+DPncQdog4J9jv8rq9SEkJL0AwPdcPJ8uA/S7VQ0inYbIvuj10qRqk0bYdyQKSCE0hnblv5t8L6My8h7sHHQbq9TA1eyjkJFD9KNqQ6OUo02BalSzjKtHtNt4GR1UvB9g/w4kDj2QA0HdHdJpgNtO/w+Qn4RTYL9un0AuxiU4rIzP81nldxNt6gUIngaYu+I/C1ieclWfsloA1vYnAyRtAf43qdsqDdrk2l2AZAMEo418lbztQkDCaBoCYeyIFSQ7qp9pf9BB3kcBmq2AS8mEdVFysUL/uP52iv4ZAogbA0OVuFuQm3H6aoN0Ed+FPtzlpp1EuuIYnOPeh50K+euFaxBScAFScRUjdL1OLXJeUqdBvBIZXfuqxjRG37+MitxdJEAzRmWHGDIQCIaIChOSl+6e7lDhOUN/RHA92q/0Z4k8V6nXBuN2HDe540jC7o+MpF3qyeW0/VgiGl5qxHC8hTZe5P5fxBj9osQCIrjCJ6QW7B+GKhVxSSAZRuletx9dIJAcMEZ2aN8u465RCQOE4DD3sJ37YlyTw8SxaUFKsYW0am1xAXewEgvuBe1tpH3u/Tgn7LpIwPPQDGFupD0N5MUDCTphjuEeGh29gXHbyFhz/HOzQwKxy77B+0kanyPJu5IVrDHCFXELUhk/Dbi71auc98Nna55
xM4+ka9dnkPKMvdHJM6HXrwtILi5Atk4w1nWCkCWaFprBUt7vB+4+l3HcB6jafFI9vgejOcH5vRQ5Ybk20tZdX/J5W/uqHrk2T7py70x+3+Tf1WV1F1h3q2ttSxraUurHoXy07pL4WtqX224fuEWbo+2ZAOY+yPmV9puu5BmlPlapSBd5pAOvUHbWY9u/+XVA/a2V9N/9md54l/3wL82lGxhC/53vOY2Ecw8FSQ4+9tISc1VD+rb3dqR34H3ptdH19OO/Mp9ewJ7ie96ficgbEQuv9V357V91PD0NcfnYy0uRn8Tg2OGm9Ce/5njsLX/y4Vb6b//U6XRU/Ui2H4PIWN7Vs7pCb0gvD6+lf/HRufR1kI9vg4S4HRCL6IaDP19KD+wOp4/8Hz+cXu3+xvRNH3wxXTtFTK7P/1z6yU/OpWPPvj992x+6kI6s3Euf+cSn0mfH29ML3/w1uNk/Rmwm5vmYlLDrHPls+tkfvJGu/a0/l55bu51+5h/+m3Tsb/zX6avu/VD6gVevpe/7jvegNswid57EvpRa1q55C2LxWs14u3ZF9aVMxLW9x8vk/Fjikuaxw/GzTL5l78Hy3b0v1bALAEyEHYCgFnBbvEK5z8SisrOQWASpyNG3s2codf/zZB8SCwCDQC2IBS/YLLXIBEPioRqHwD5evFWbfDH4MgFmA6wlFthWhBqUdhau1gMiqKttNRaButRBWigrSAUgIUgF5dYkFZCaMPCOfSEVgBmu9bh5FbWnsLXwd3XMvtb/TZAKFmYkFuu0UYDlOYGYuuGSizCcrUgGu1j9jbFj3/qPvVt+NZY+d0+fA+TC9kJSYd+HGlhWP9PIVFUzV+5Vg9oFILnqXIIBqk41TbCxEYLk3YJYPAQAqgqV7TlikZnS8/12stdgVVUoicUJiEUPEosTp07T53nyNuDdGUCq8SnUuTeuhEBfz0RLgFX14M8AIt830IfqBy8BAJRRogW3lqO3IdV/HgnsuWd3Ae16WtJl6hHKPw2wfaqvOx0HQDomJDDt3EtjW7hiacToGa7V9ew0Y8zYEnpWamXfCVC8oMQBg2M9Jul5agGAp5pWF50dsVIApc2swC9yf2Zp6wPyUm1pFdLSzO/T1Fd3un0QAFfV9XplUDvHUwBH0mxyAyUWu9TP+A+qASXSNfLdVejDqHF1Uo+jR3jB0f+zkDmjbWuH0M+qtcbZxwDJq9zbZewsjFa+RPn23SDHJRbahYi+DOYnsZAMOJ426cc5fguIvVdG516hfPX/uanp8qmudPV0V3qG/RzHtPfQaL2PdqkupnG8rnPHMeq/DqFQPW5majptQz4F4A1IHBo6ufc9GNwDhCWJJ2irge7aKNOI4M5665Q1TIyU++PjaRxJ2K5SJ56BHVbxd7uNPH0itUIuzjJWLjAeLtKfGj8rrVhmrNq3c0g8JG+6J16nDZLjiIdBW2PMI6XYBZjjViruveSiDZDeTjvaMfQ+rGtapFXrEAxdK0skKCRU18wHkQ/HmBOof4PqfdpEaD8kWeA+OR5VG2ziuWpEvUrVtdgkHnz0gsYk4G2gbTwXknbStjAQJb26XpasNFE/vco5Lxjj4hEkbY77tk1ftVLfI0hvDHB4mn4YpE9Vy1O17qjzEOMy5hPrkktnv0csyrzgKdOVbf+csXd8XxorzlbmxHLtl7wv+b1pBrnMXA/ntTdNXJ3cS1da8kaXlfPxJohEJSVn9k7Wf31iBfKl+dpaH1f9e6AKtb/LfC/8z0gongLof/fXZUPrzw+vph/56Hz6zq89kd47hI1RtUksfvnzS+kvoCZ1DY9PZfvHPzPDAsJ2+m++63QceiNi4bV/8Q+fjLJMuMU79ft/ZAI11Mb0N769B+cV+T30WSQXPwZZqU9byqrfq751sqMpJB4ePyAW9b1z8P3t9cBi+u0f/cfpF5eeTu//wx9I7x5M6e7HfjL97PWmdPlrvil9y7t608a9T6aP/SrEYvoI79xFbEOb0vH+q+ndX//B9O6+3fRw7Avpl37ok6nzr/yl9FU7L6Uf+0efSJf/y29MD/7ZJ1P39/2Z9P4zx+I9//bq9eTUb0osfvsViMXdkX0vlfqXR/13sy+/3ZcJ0/m2HC/TcElrmpKu7D1XvruPVXNerPGdvS/ZDHAriQUv9HDNyEs9Sy30DKXhtoAnG3FLPIp6gsRCGwuNT5VcZPsKVZOUHlQSC8BbllpkclHq7Qs1PNlovB02FgSqAqQotdAlpoDZt0p4bSIv1Z8sq0YsXLnkd7avqKQUpNNLVUgr0JNWFSK7ls1gMghOvbTClU6BCX3siv+aNrMSC75vSC7oM1fa6bAgFu3kfwT9dl3q5jaxIEodQ3JRexlCiOI+Wf18T0q/hw0FUhr1zf0ugdL7U5A5+jm8b9F+QW6DwAdQFISNvvW+TFXE4s59bCwAuYI47yX/Q+VDFQ7L8r6qHy6xaEYV6vi5i6mz/2xEE16iOWsAUlfVLwOQekNVCeNhDZnJc4TPPABRADWI2tH7eruIx9COwXAjRt7LQSRUy1FXnqwCmD4U4AJ6Bc7TALEdrpdcnNK+gjIkAOu0s4V6+Yo6Sj0F8+sA23n6QVUkA/StompFgI50CBB5Cs9UZ/CY1MGqsNGyJ2aJAQHQO0T/KHFRVemobmwBrSsAzAmuMcaErnOBsamf4356IQXLjN05xrBxN7hx9A+2E5S1RNkb3PsGAK5G8+EJiT5tpjwJkDr1p1mpOEr9VROcJO7FQ4iF+v9nAZWqfHW2H8JWBZJgHZf1nrZFBPLmUFk6zwr/SaQ0bvbvFP0jOTAI4TJE4g5ugx9xTEnNuvedfB09quNc7T+VrmFE/iz9PwVh0oOT6l992mZQ3y7av0ybxyGBNyAWd4bRFUXqsCOxQKVnV2NkiMUhdLhPAIL15nWIMaU05jAfjfKPQdos8VWuuzs2libu3SM4H+5hubE7AO1d3RTjlasFcnGKvtTO4oJqZFyjm+gl6jyDtEP7jBnGpu6PlboppWhgz4Mc43OboHO7EBPEA0gjsFnB45ISlSOUf4SB2I7UchdJxQakQqNw74XkoJFx1sK9a+DZUMKwwzxjDI0G6wZ5UlLhUyZpl4w1s1cKoecpn0GlJEo1lFhwiCSSex5yfrRKKsgi3GOTRlLRxJziPOMcq1rmEiT5ER9GDapSEAukSId4Lk7S9waXfBqJkHYnuiPWriaee2vEd7dCLCg6ynTO81xO6UEO+8/K5R9715qOT36+c5qSb07M3+qyKLh2sHwpJ60Pc1xVRtmbqtTS77Xj+aBHPFyrQ/ldroqTtT9elNPX8olze3XISWuZx8/8q7qupH/8knxh7e/jfVCSR7mlH6v+PyAWtW6LL9dH1tIPISH4S9/Yla4gDXBTzejv/ugE7qhb0n/2oT0d8UIs/nskG4UEmP5HUW26fn8t/U/f2+fPmEufpAolsXj82v/1/5tECo0qFepQZRue3Eg/8HMz4VHq3Zf3iA2PanqAHcbiyjYYZDdZH204/tq35ToeEIvSgwf7t98Du2npwafSz//0J9PwPFiL9aeVdR
xzvPBV6QNf/XQ628Ei6ks/nz7y4V9MN5qeTS++52rqTnPp/ms30njLM+lD3/MNaXAbW9eP/1T6hessNLKq1ND9fHpX22+l60e/J/2pbxrAtXv9LP/2a1h/xZdMLB6fLM3UY48fr5+WnVDrz5cJ3X35XipXjsWLtTofAJSXbCYWglxAQRCKTCoEsmvYPqzxYi/EQgBsGvPxRQJcz8SCF3j4o+elHMDfPcAsVKFqxML2cAWNKK8SvUJtscK4GTYWkooqlgVlKFnxzefqspIPX/iufEscwu1sjVio9lRJKzTe9jy/G41h4R7QGTYeFYkwP2959F30MTrYVAgbMVadibVAscxl4d9/CaC3CEhaAVTqiacLgNINiFCdp/S9q8/+bmMlXLGt//gaIjBLsrWZYCitkNRVH8kF7ZRcbEneUH9ao/2r6O/722BnhwCIbQAe27TKfXiA7cPwg/E08mASqQJ9JXmIPIUn2rIAq2gT1U/4WEobR46l7c7u1NZ3JqIJt6JHvsaLxFV9DXTPALr7AP4dgEb15MeQWowtrLAKDTEAxGmo+zyqOOrVHybvu9gJjAG0HgKG9Q7VAshtAkTv0LcP6UTdn85Sr01ANA0KW47DAK9mQK112gFANwEcfaUZw6GF/AX2j2iHakzrSBTU8W9hLOpadZCI3ScA7wLoB9gRqJrSxneJXAvlaj/RDsBTVWWc+i8KzLmHHdxjV9bPH8XlK3VUEqUqkSpfK/TXEnWbggBpAL7MfVflKUYliFppBdr2XNcWHpsE8Ir05wGauqidp+3GR+ik7E7adZL2LXPNHHV/QP+0MVbPQCheCEKGjYXEgmGgZ6pZ+nV5fQuy2BRemF4am0rjePh6hFH+riv1zayua9sAEL8CqVBicQ0D9FHK9b7M0P4h8j4HoerHm5T5jerVCne5I8P30jxqchKL7SAWqDJBLFqwkzjE/TsCIG+EhDTRjlb66dpgfwQulBh9YexBujFyPw3fHQ7Db+0StgHSiWBxjdhnNLNSfxTw3UN7+ztQB1Lqw/hdUhI0P5emUcWawfDbuSTmFfpUQ3kGe5B264EeEQC/OduA2CGQ3laeJWNbHSW4HkYuEUzPQIOqaYbzBka1RIgkXIGUh/u6Rf+qWuVDG04TuJ8+Azx6lYTPOYL6mYa0YWNBWucu6+d1Wb0Rj3bcB+cK/oSBdswrjGvnLRhIWsPeQ3KhUXwT5PIQkhujhB/n3utd7CrkbJC+tV+cB5Q+ueW/fKEfyuYcXJ2OQw3h9sqvPrvVX76UeSXyqC6on889RFZ7m9895hEvii/8ifIiF/LcIxYmqxJVyfPFpYyYSfJlOampKqOIOFwVFLsqRbm2+lnLv5QTdao1vmSec8i/6nPLubw+zyr3Wj65uR716ielPyAWVZ9Vu/8XycRLSCjedQmJn5y82lRx0lBbKcQJDKXd3ohY/MtPPEyfv7ua/t6ff/vE4u//xGTqxjK2nljcw+bj/8Lm47uwx3hPRSys40/9BhJMAnRquK3R+CTeq/o6Ww6IRXXPDna/kx4gBtUn/nX68OdW0snBizh5SWn0+qtpuvVCet/XfW1617nWdO/XP5x+/hOvpt1n/lj6zg9cSccb59PdT380/cLHxlLfd/7V9B1DuCN/OJruDs+CKVCPX7mTPn69Lb3/Xa3p5c8Np6Ud7Ew/8K3pqy/ilVMD0d/B9raIRXmB+LZ5UrGer6WpKlWfzsm0/nyZWB/fe6nH/BQQmlfQXdnOxMJVPCsf/t3Z68rS+BKqO+jmdA0w6Iu5fGKFkAoInA3a5aq6Absy8K/IBS/n7NEo6x9nOwfbtNeY8JTEaqUSi6IKFdG3AY9BLGwjM2DYTfDy308sirSiEAv3Sk04Hp8cYEuwHepQvFzNK+oRL8vSv6pAufLOKjagdIkPcYOCWBhATONm3XIe4dpeQOJpAIauc1dIrxqJxmVKMo4BUiQYrTTQY5IM1SOcpt2HW0pf0KICSIPEYRdSpRef6Hf6eZVYHsuog4QqCfemHXDb5gotffkIA9fRKQLkPZhIk4C4VeJP7EASVDURMHl/bZ+RlLfohzWCfK13dKZNiEUDUosWAGsLwFWQvQmxcDVYNZ0ugLErx9pLGGdillV3g+NB4vFC1JzOo44k+VAipV3EBBKNR4DodgDYccC7kaN3afcC+c7TX3NIAtYB3xrvujLfQn9pYEu3pm3AtaotwvgTSCKOAlJVZZFYzGGwvALw3QbISSy6kBacxwC8G/C2DrGbwPuUoJ7155AS+dRs83I8BgBuJf8F7oVqWZvczA7qM4RHpnPUz0jd2sbQ00Euop0QivsQFd3YzgIad0jjbdHdrq5iXZ0/S/lXMJQ26rbE0PgSeqHSq5VxMPSMFO55yds4ENqNPIKwnCJK9iVsI94JKTgpCQXkbpGn0hQBqml1BjAPKP/86ES6c1tJwwS6YQQHBMC3ICU42n8mPYXE4iJRzg0IeGvmURo2wB7xS57uxu7idHcaOnMa+xaMtyESX8C+Yuz+/fRoZpbYEXjmOgQpQGLRCEE5hBpbK+2RjK1xfls7E+px7RK2M2d703nye5Ux9SpqUDfuDGMwzcIB9xpREa6Kj2HLkFWOdNsbsTkYC12MC+KBh8re9PR0mpuaSvNTM9FOVQolt7HRN7sQEgJ2UCdUmDi4xT0W8DfRT230jxoRbUg/EqpQmwB5I2KLFPNzz/NEHs3Vs6t9iioVRkDfoQ2Cf34yhlSZZH5gDjIieQv9K3kwn/Auxdy1zr1z3jNfJayq00mOnZC8P34aPc712jX5LtDI2/gmSxJljh1iTByj3noVc1xcQk3uHMd6eYZUKbSeVMXZJW+UX9vIv3aiOl7SlWTxDFd9F+f442Vlezx9HOd8JCkn86+4MOfh33JyL7PHjzAdcWXd+ZKgXMvvOFTKK5Vib71fv3ms/njJsOz3zvl81W/x60l5lnFVl9jcLH9/DjnBVyKxuDO/y3yMu2ZFzm9j0/j67/34JKQYl9vHM3kol68hERDgf8u7O9LXP5eNqL9cxMK4GH//X01il3E41LVaVQlg+99/ajoWeA4kFuWuHey/5B5Yvp5++v/5cJoa/Or0/j/0ArGxUnr46kfTT390PLVe++r0R77+Ylr61M+lX/rseDr64nen73pBeyKC+975dPr4z/37NPOuv5b+yldXzg5YFNtYvJ1+6cd/Oa28+GJq+civpsV3vZDObNxMn31wMf2x73oxnTuBmu2XXFm1gok1BW5qBx/Ubyy+7e7Wq0IVQuC+TMo+PuW4L7va97qcypQch+om2ZJHSVp+1+/97id08mOfDbdjlZEXtSvn4WoWUBe2FhILwKHEQvUESYXqURFwChBHZvuJBWAkVrErAqDEIiQNvMiVGNgeV/RKu6w+8ACJheWhBgSo3ioSi4pYUASAgXzMi4/EIq8qsudFn9WgMpEpBtsRy6KeWET5XO9Ln+8Ciyw5yXUBSwbYXgX5zrFivcTi6SoMVAnGLG0eA5CohnKUNpwFUGg4u0bFJgE2cwBMcotzgq120ijZEMQ5H0ow/O1q/
2HOFbIR+uB4xHHV10jAfrSv0EB+yZVbXK9yQ7BjgFjQZl+aU3NzaRSVldHxqfQIkK2OftzPilzEvaYcV6s3246klY4Tae14V9pkr4cfpR7aDajGpmcj2EjYSRicrRlgvUx+y4DyVVbUtwF3rtxrK9CNOlQn4EnQNgNp0LhbkHaS9vSycn4CAL9Nex9BVGYlCIJ7CEoD+R0BVLsqrNqKXnY05m2kXe3ciy5Ihd6ZBKFBSCAWi4DOLYhFM2OxB+JxsbcHd4fH6WPiLlB2ViMiT/rUcTvDuDzEuDtE3YQ9cxCGVeqgLcElVvYHWCU/Dlg8hpG2Y1M7FT0XaSsyjI3KTdzdjtKXxoCQoPnwq+oyCJm5CBgeQiqgxEHgMwfZmqVti+S/wfifwEPXLJ6mVpEmaCOx7bjifl2AUDx1qis9130i7HEk4dpdBAD2meHTxljRNuEV7uUXXkW8OnI/7XB/EzFHjvb2pb5Ll9NzZ09HLJAj9PNrE7PpJlKqu6Pj6QrqYU8P9qUXLp8LsnMbNajfHhtPE6MP0iKqYmGUDCFo43MYEHwU8OuqvCpKM6RZo96NtOPipQu4Ej6brlHOHex2XqMOL9+6G21pwFi7CUlHC9czaOOZ95Fvon1HkBD1cE87uAeHeIIfTo6nufGJ9AjJi1Ip+0FXvn5UWyKQBm5r8RbDb9WjjE6uytER+vcQK/3eW10N76JOtQl53kRiZ9rw9MS9CwmoUkfGjIRiE/Km8fU291DboFJOI/fNCN6HeEZ9ZvRA1sA42ib9Jml1CqGr30bHC4Rd8iHhlHiEVJbneVfCRJ3aGTeq2+nNaw1SsQC5cAzrQeoY/WrQww7KU5I3yLPRz/44bQr7K5575zjnaj8F8Qb09UEuG9/jPL+L8MJnuDY/mo4E8Vznrx6JrWQT+/LDMyVDv5uX+9j4Vvvh8eoH9Sz5h+ppSc6eU9VW+xJ1I9tqq32p5VHO5L3n99LUVWB/svhVn646vVdQLvcJV+VLM6l4vO/M8SuRWNx4iBrj0bdPLH7z5kq4ln2SLYNkPUgHLmP/i+84Fd365SIWRV3rT+ON6p2XXIJiTuGd9Y9+choJdeOBxCJ65ODP76gH5n4t/dA/+Vxq/4ZvSh94z2VczJPb9KfTv/3Jz6SZ/nemb/rWd6e2l38+/ftP3Uy7174j/dk/dJb3FO/PW59OH/3ZX0yP3vfX01+uPKBtrxFg99d+Iv37hy+k7/ja+fSD/2Ayffv3f3e6kj6VfvB/uZle+M+/Iz1/igW+30GFvyRiUcor07f7WpRnvseLpkyyvrD2ZvzahF5eDubl+fLbffm+J63IL6swYgyJBS9fXtjx4SUbEWkB+kosJBK+dFXDyV6hOCY4CGLBS5rKtlUSC+0fskShIgC83PdsK/YkBV5jE/xss4JfVKEE06EKBeDKAbVAMrwAa8TCFUEBAx/3em/JJCPrRYdHqDiHxKJRo20kFgIUAEHYadQRixzXQoKRvcToAWoFYjEPsF4OUgGY4py6//dZJZ8FVJ4AiFxEbaQDAKFNwT2Ayjgg3NVrdc57ALftAHHbp7qNOtdHAdZdpNclpdKMUOHhvDCYU7HiqjH3Liph7iV3axjErtEXO6iHqed/iDw15J6cnkmjgMvxiWkMXAFXlOuwkLy1AHLCUw4EYh3PNksAukcE+1ohivA2q9dtgCXjJmjb4H1eRCqhsbDgyVVj4wFgbothNn6bkYTsshLeBNHR0LcbgNzBiqyruVOc0yZAsHaCc70AqpMQC8erfTINgJ6jT5TG6Eq1F49IAjwB4YyG0/Sz4Ksd5XYJgfFO0HBMs1yr5EAbiw3Lph+OA/4uo4Z1AalBHyvuSgP8uOrczn02JsZd1JlUf3Ilu5nhsqikBJBvZOTzEKJTEBcNlY/RxghmRp2BsaFaNUnaV1jpN56GEiDJkCowx1G9uoAa0gVUkAZpm15/rLPSKW0cjAJueWOSEsD02P3RsC2QVBzGnuHS2X6kDT3pGbxicasxKCdOBlIegbHxG3QV2w7RUX3qFqTgc6+8mu4jKVifnACAn0ydZ86kwcuX0/PU4QzEzbKHITC3qef1+w/CVuIKhOu9Vy9E3neRQvwmtlsTkIsl+5jxFgbbSDt0jXoCMOz2UOkGXulmkXps8v0ssUkuDZ7B1WQfKnZz6eboWHr59t0gnq1Hj6UOYokY8VvQqzH/Q+79Jv2jrcEJBu9x6tXRSF9OQmjGcCYAaVElbod5YJt7t4OB9Q4rLLuQE4PsSXi0n8HCO+xjOk51p3ZIj1JLXcxuQiy2IBa7PP/amPCgYx9RVJRQteSeh50F/Snx3SAvbaA2lYjQRvu/GcLj/KArZQl9K/OX9j6OKaORKx2TgBxmcSBswkjjfVf9MLx9cX8OQZzaIbXmoa2Ix43PopNln7NDtP9wEIs21MLaURf0QwBC7usxSQv94qPt3BIAnjq6lbmYw7HlXf5bYHXM1XFdnh/L3C1AL2SgpC25vI6w5BP8LcSCfRRaFVw7b4q8VVWMH7VU8aWqX5Ugv1tMxpUeqyXO+ZS/Obm5lxLKmXyZx/faU+pJmtI5ft27JPrOn4/nVtKUfqtdUtX3K9HdLGYH4We/WsivNfmtvvzTD88mY0f8nTcIhvcTxLP4LcjH3/xjPakfe4svF7EwYN8//NdTafBUa/r2F4+DD3bTz39mMd0ZX08DxLI4kFi81Z0+OP+WPbB6I/27f/bT6UH/+9IH3v9uFhDBfi//Qvq3H5tIh577uvShr7+Wjkx8Jn38lz+Rrm8MpW/80Nenofa59Nonfzn98mdW07N//i+kD/bzjtpBlXr85fTvPnwrXf02CMThX0//6Pvvpm/8/j+Xnk2/ln7gH4ykr/qbfzw9d6oj1L/fsl5vkOBNicVnXtnzCpVfOlUu1WRaJsn6ydUU8VKom9gfP19eWLW0fCkvpHKufi8wM7s9YlGRCsCr/uYlE0EuAJaSC423JRZKLvRYo+SiqEIJlFWF8mWeiUV+sQf4B7QqZVBKUPv4OqGh0WT2O4VYKLEIYoHLWV7k4b8egBhpJQYAGe0o6olFrGQGOPX4nlqU3xuRWDRALhp4yQu8a+VbFwkG9SovW19WeoEqxGIFJIVGdazCC3YfsAr+iLZ3ATrOA1QPUQ+9/9zjuK5O9ZKke8segLuvPmMDaGsgsdBwV+9AJwQslJlfqBCyeLGyB1S5Gu5LQnUp75P9vYa9xQZlGsW4jeONGLjP4rlnagqXoqjErKtXRDtsq3YGrYC4Jgxad3Avuw7Q0ph5jjYsybOxNVE9pJNPBGujftOs0s9gC+HLXT13jas3KGcVYqVkoYH6a+jbAbjqA9gfBzSpWvIAQqMtwSIEoI16Crq1z5DALdKHC9yzVap2HELRS77nUeNRHUr1nzHBGS8J1eNURVH1ZJ30umf1OoP1uXrcRpoj1EUvVN2UK0k4x0cyux7jExUxASdpjBpu1OkppAZLfNapn9KhS0gNJCUSC8e0nn8CbMZ9R7RIXtrPvDY1m15DDejOyAPIGv3BWGnHHuACoPo8koFBgONJ+sby
26ZRBsB9X6sDdg8sD3OQlThn4fOEDXSaDGOh9QfdIdeB1sc3YGbN/XswcumPi9l3F03wjBRFWBOuQ+TD9MX/UIJfMvsFC6kVVNJmKBhauw2lRE7ckJ1d9O3Jxh5rnWwzIcA1ycyPcDKAAVrcTiAKYgNhxM9hqiKlGhcKXc1IjsclwiX+hT05eWlZ4GDxULQQtjQNk0yB6A0XSPgCGYwhFAxaAAAyPUCXbinp5GdxRw4QpyTc9rUeFCPEwdPa23qkHvw6y/D7MtsygTch0m9Y4rnryXyVqBwVGdRXUiV7NnVbeBUf4hXJkuwizJsGwRxpV3PT+tw/CtIxHYhOk9hnG/g2Tj38GI+C+wz4JGypZlJ+EwhEb68DXCvYfR90MYWfc40Ph0Ge9KApFPsRv4G9zbxqohraK25a7gGlU/xXjZ/SG0a1FtaJy9Bn6QMv3w67eyk7bMq2uyqiWZp3yQ6kDuJ/EeeeoRS0mGne025XoLNSDVjrRBKYxX6QsCITds8xQ0CKRcyT8gnmpCpmFXlUmzDwlAjmn/cR7egnm/MzORDfZ8bz0EeKoYvUuZv7yO5AW6as8wAXP5OrR/C3r94I3FeKlaYJXaTc00uo99Bqv/jzgFRe8D3g4BONpl3MF70x0kFXO0m9+E4EX1F71qqY6nNOhreOx6AGjYgmbuHTGP/csCBte6Atb7lNKVPZhUvXM9BgyofqYBt1KY6O7L3BJ2FF186VmZUsHTJPWeoL/ETSrfnN/Bc9J6TlnXAXyn1O+E9yfEHdbAm/wmAMh+Y6bF/wJ+Kbff7Sk0HSQNGXGlBUovpgG046z6H5LOc/rYJnU/ciWe+tvGNsIxz2Mc7rdN4/vppN/TyEopZZKtT6QZMNuq4qkuJaDwNGykme13K8AJY01dBpAk6CK3N1YR/ow6CC4gRsYsK6L0JHYQjAfa4ByZpnlAp0gauOrKtgILaTvBmMLKBF6+SJ+8Ilmhjx3znety9gQVq0G8SpXdtwG+5OceGUOE3aNMWTChDx4CPjDGievgKcaHWYEFdYwhNuXSa5eez6bIX4mGeQvstMlapq8t0Tbz9NkAC+lkrSi/h+10fvis8zth+O0j69sG7MZpkzHAhbRC7zZ8vdT49Xf3mjTPE+u+spD5nWK0b2yvq47uc9Osv0sKV8Wwap26XR3kwtNa/5q2L7t5dQPXMH1g0aVK/75PgT4FXkUKvBRY/BHA4usdYHG5gg6S3YG4+9s3vYGViBne8+diHNOsg6oTg0F6v33J4W/PHqho7wMsYBar1OII40wn2QosYv/AhF1UobSvgMGBWVFioRFlBRYy2ZFQyKgwacs01Su3hC1lCGPYSkYCLEw70hBABcyQKlASVOPsTJgVTVArptPMe126SLyQpFSzR6/2J5dSDpkVmQCBTmgKszAouGCFchSGYxhQMYLNxbj7WsywqRkG3Xq5CgPVNlDqQIqqv8g4yIy5QvwYJv8+jO4KDJwVddO012QyiK/KhFKIR9TtoWFgNl3hX4KZ+xSMqWpRggulAY8Jcx+gsA5TvUXYHRjnHRjUeRjjN5E8/PnXbsAAI01B7UQ3sBpCPwWwfBMG/x4uZwUWZ6wK2xazMPifAFj88I2lSC1gywJeVOMxH6UOT5GiPON6CP01yB2GKVPy8EltLHCHqwGqq66qJLnKK920c3gfYOHO3eYrCJLxm6euSgquwYi7p4cqPaokTVN29czhSQFZqJPRrqojKQHZBgi4k/ZjGHUlNqqbyWCrgjTH6c7jyxipK3Fx52/beR+AsE3/1JD5A8uBOtcGdZLhFNS8AbiRpu7irXvdSE3oT+66/QggsQpAcPftNRjKZ6TlrtVzgDz1482boqeOStVUu5mwzqx+ryPd+ABD9U3o6/ehFGKKuNMAhXmY1knS0CZBycaK9hOE0+vSMHHdv0S1m6jyEHZEOwIY2Kzwk/4U/UXVJr0VaUQuY39KHOPvIbHYJ28QRwyeT6DxAH12gJX4SD9sF9pbhldgIxDXbiBqR+TtrtXD0Fkp0gz0GSa+EqP1XdKXqSetERhhJY0CgAAb+ocbOmqToWSCpmNPCECdgJy8lKroZU1QoQvdCix8Zxt5Gsczv0lXKUc2vqOtMxiYBumpAhWi85teUsYT7oapwxBtPIAqlP0i9hrWlfyUjoD6AzZsX2mnOp4eynSd7AaAZ/QvxzvBxym0Vh1qAKbf3bRVSdOIO+0CcNGrl+MdKTQH0AODFTzHTUYiZd9XqjQCfat6p5IMQYUSC/NX0qTdhe5mlawptSjAgnqTTz0o5qWj86ATDoL2wp2P4/XZeXqdQOW2E6/3zpvLabe/a9p1junFIUE3YR4AAEAASURBVB1zS06E7bVhJ/0at9eW9V0nrxqmm77PujWoterlfemmxq1pXXrd+9l93wcWPbL0b/oU6FPgFaXAtwUWSiy6g6n1dCB00KwDZx1gvZZhvazMGNYwNX4Zt2sI355PAL17wzuAt3nkefv7MrCoHqFiyM1kXKQJboTXgguYThl/V2IPYQ4KsGDShlnJYE66RSpRpBS5b+uVuqRiRdoQyQgMQIALjEMxCi/A4ggmKHYVTvAwMzVty16PSq9cpRL5eGTi61Gohi5Xw8hoe8oYlDjEs4wwDHqKGmeFcgxbCz1EjeB+dgxgMR5gAZNBGHXkKVASlFFQ4qF+tcxDPD/BFOv9x70uBAgCh3iHYsVzAebcVf4VGHDBxSNWbdVzH6cIH4cpfRu7iddhhE1HUKGXKV2L7suwc7+zsRGj4mUY609j+6AbWDe9U/VHm4kVJBDvEe799a3sD3EqsKCk0zDjHwdU/FALLARVGhortVBiIbBYAVjswLypAscf9NIPmuswRm+SxydvXI8+usDCVfxpgJCAyvqpBqXE4j3AjzuTCxR8F51/GG4NWRcIv0Tdb7hKzm/VwjSiV61Fbz2WRUN1vSwJygRJGhy7s/nrgAMlOuqsa9htfMuvRMg9NZ5Cw1XiqNqlhyhtcGbJS3ezt4krEJEhlOlz5/DHgIl3nqw27+Npa537XZjJA/rvMe0zh+rWDO2gTnzUtMgjRr+UTXe16vkLeHfp/5swrqeuokMvWGLAlrYT5AUoHQXI6KJ1H256C5oKCtxrwj0iBonnxoXo4cC0YqCO21Q9IGlYLMOsIbAseSQC0HOQso1aPsp5zGq6G+kNyDgTxr54jLvW7D5NeifUWxsH93ZwFV71nlOBIuAJRAQ4ElTAHFPOaexH/P73AGLPAa2n4xg1y2zTn1wU8BNy0cCyH3GeUP4z6KDS1iEb552Qn2DIthZEnPIdHPId+C3le+JZpA+U5ZRwR5aVuHTtUjfanI+7fEvEUQphn/DI904BHOlU29JWYlAaSoMKLKgH3hdE9uRTvuVIC7ifsA8DjA9RtVLdMPvemAfl0G5C9SoNzJU0aTMzAp00JrfPK6mNATflPCIPvVyNs9gwyzdnv/BbNx/VoZRUFlfUgonSx3zn/SIA7Rr9dklgQlgoasUDlkIjfkqq86P8cDR3kaZ3dAIVutQ3hqeQ7dF9V8fC+u5DV8evm
q73HDX+VXF9Vt8rhapHwvLbJ76v1bE/9A7fX47TvqzPz1OkWL2IH76pZavxaoj6+6r3fWBRqdS/9inQp8CrSoFvCyx0N1uG4otVdEDNwNgO+PltkHbU9V0dOOvwm1f86Q7MvVTbdOqgW5/7u3vKXFSAETUHJtkTGJciKYCxgunPHhKCC+4PUFXaY7LOxloyGkziMnMyQqYr0245lVKwpljuk3kppWH04OLmXDKxgpO4r2ViVxIi43ZMPupEa2BqmZSE1Bknrin92ebhJHZOF2lh/dracpN7iGkYTwFFBRc8yDOnxKQjQwaDNTk710wirRhmw7wRVKGGUYUaxTWtjEwMUklfRkFG2/QEFTIZqsw8Z+V7FSDwBEZUlSbB2i2Y8Vusjspcy0yr/74OA6ONg6o/+9wv8061nTc5KUwY7HeQAjwELKhuJvMzCPM8jpH7LIbhb+FJ6g2kHNdYTXXVXvsKd4R+DybxAV6n1liV1yuUrjDV/f8ELm11l3qL9LOaTfk1Ptddqntv6F1JkOdO5koQ3B/DHYyzJwR5qUKkCsgc9XA1Vg852VGbcG4wJwDaUoJAfCUOh5xKsjQUvkX+b86hkoQdiRIIPTXZFjGCp17xbkXbK3ERkFVpxRswZpE4cF2AsRNQ2FcPYLiV9qjGdM9N9yj3EWW2bwmGbqFmpMqYuvNZJSY/y/OM1f53Hz9p7j1dbZ4i0XEDxlNoT7IwmjDbqECNASxk1Hegjd6ilKy4mdsATP0wKlOF6Vc1j1V3ng3gZGCI7yVuXqHNBPHduG0QoOB+CIJC+7OuWQdIJ7tVw6SH2QUUnBDuBGcB7kA9xL3uUu2+fh+CDFWi3FfjkM3hDlGD2se1qwyqHdu+HfZbIAAzeyojDMPvBngap1OtMMtuvifjLKBbBDxNAXwmoecONiI70GSXNnRzuwHOIQED8eJ1DdqkTwBOjgUWlEmW9xhgcYqUpICYAjC0iRA8lEKpIkXhZPxh4jXE3qN8B4APqIB00D7Ad0hGSpdgtyOBqe6iaWRiWzlP3tK+epBSpcvnsfNQwtIelnfYRQHqr9emMVxWnwraAWGH9BWlaKp8nUknwgkoRjnz3ZoG7aMdl99ZNvQjPJAngEj7iklBLd+wIMoyj1B2nSCM820JdANOSNv3fm9KL5bo8wEWgDXDGa9ILCyt/Z+TKzX88GG9Oa58lzfnjP6HQiXdkkeCtn8+/OT8rd/NRz0uAItOpKTQppPxmAqmfjyr6ed5N84V9azlNGyNV6PU+Jef19/1veHrsz6wqNTrX/sU6FPgVaXAy4HFV/4k+1hk0uzWsB2EMxQzspYhuQToDrSZjWq8uCdsp5VLE4ODfI13ecpwwL189oAFE7oSAoFFVYfKpnhMvDIYgoB4h+J3ARZMyDDQkXDARGaVlboIKhzkBQSWgykiBZWFcGJyAo8NB2kKVgJYYF7UB3eCd8OrABXK45E06p+aNjN1VWXq1jW0Nbu2nkmAFKKKxeQfYGEaMAE5uDcH1Tl02TqGxGIKicU0oGIUdaih6dlmaGIatRF04dnbQj1wyW1sgYXMnwyKK5WwtkgkzrJ5mt54VPWx6gIBXXyqFmRcV31ZU0043ZW68i6zraHwTVY6R8nDTds0BNcWwzpNUc5F8sBJaiQcS3g3UtVCl6QyTtopKIHQXmINQOKGbW74ZlyNke+gPnULhnKZ9Mdh8GSA9NCltEAApNRDF7B6a9JL0hr5HtIXZIr0QKXtQSQzlEEJhKpN9kclMNswb3pXCmDi3t2sVbNyfwFY5uYt8v3BxZnmLVSrpmCcBWW2rHltk6/1F9hoDL5NOtJTT07XBTGculHVUN0+Zd/RNuVd7EjeBRx8oG1ECw5003qTsi4DttRzt1/bnwWpOgPYQkLxZHW12YA534XZ1lbB1X/7p3ZBY9BfvXuZZKBEPA2d2j1YLR8k/hDM6ggAK/uwkLaM/uke+yvg6WiSuFNIm6YBFWMwooOUWTerAppTyjzMx3Dcrv5HIsfzwyP7CM/pVzFSZmXcPug3osRuQoaWdN0BfB9p1e7zjWaHOlA0O3iAW+6hS9SCaCvthQQWQzDZfkPmpV2FbLg7RS+xUeKk4JKwm+yREpsC6CfTLUihOMTjSyWu5XYsEHx5Jk/SiX0H4QcAGKoQqoolsImNBOWSnhYx6k28EzztoVak8fQJ4VVBGiS+AD8fBOV3wUBAEeab/JVYCdSVaMWQ3G+JtjRhw0ZySDi/Q9NRpWmcvqUq1BDjlOpbSisEqX5rfnOnjhmU07BKbkYNC3BR3Uxg6nd0SptpP2T53cPCvTIEaUooCuOKxynCCCgmeSbw07hcegos7LtKyAQVShMXkIyoOmg5pYsHrZX0M1blSfljniVE+7dcOiHKLcVM/IQ2EofPXnRkLH7RS55XJvwlQXoZvCSbF0bvpl9oWIImLQreTdPq1DDdeFclXuK3tAjxSkrd9PrA4irK9Z/1KdCnwKtEgZcCiy8CLL7xrfcycfVG03ZiKIN/mXDKrMGLdrbIQMskeD7g8ipUYaq6NKOc/ywTvMH0OFKHXMMnjte8K7/DSDCJvwhYRGoRCQNMGiuCAQRO3jBA6mIXcMH0TTkziXK1/DWvcl+MwyuoMK5gRbe1ppe9KmQEZSDaikiepFluimRBBoFJ3OctxUpetZZUrNbTa2w8mPBV8ehJKySMZTQNT9+zmjmGeorqLDNzbMCF5GIINahBVpSHRifDQMk8yXxZHOs51qpCCSx86H4VSgKKvxoYF555Sl+lF1ZLBkRGxZVa7TKew9gIQlwF1Z2lK56qFK1FCoEKC+Wbh7G+CZPppm26GNUFZwzHYaYEWK7xqr7jLtcaRetGVMNijaQJgI0B+zDA0LnRnQbCMj/qgntoKyHAUAIhc6/0QzeySh5k8NyATy9HMlKwkDFK14WuhqsCNZlv87YullnPVLptVc1rGboqqXh7fprN2QBN0omyanStp6N1wgkqdBvrKr4M2hwM8AwrxDJvtHLaR1UmY8qorwAMvs7u5d9CnUlvVqcwv0pX3PhuCQNqmfyotFAvP8hT8jkGAByhHrOPOpJqRac4H9DWIcwq8V2NT98g/7g7ba9Rz+E9nRzpxCEL8DCZtJHlUnpwLOCgTWYBTeVkJ3faz3hxSQzQstGl9R75qm62SzqHAi/Ofep/yvc5CNM96sZvhJOxF4TMsFfDLEBpBiCwyR4hW0gYtil/PC+RZsoM7ck+h9+CGxLajylA+psSQOsogzyD2tUCkqsxwag0B6wcAegF1fZPv79DwGBW7m0P2lOpoQDFOmg/YV4y3pqzm4fqToIoAYaqhH6CFsdTAKAthhKOI4D5MZKLAYD7KH1RNaqMQNaDcKYrTctiwWBAhe5v7b8BdwSApGkjEycbcQjPrDPAnu+A3Pk2eGOb02ZKRv3G3JPCqwkICgXVcenLIxc/Mh5Bc1oqYUJHbpXS5Rsjfd0J5yAt29u+OcXpYkHZVdtvuniJ0u4pLpMBhm5EGVBC5DaFXM3DhZZ61HfWq9TO64fHd5+m8v6pkV4ULoFLMMe4ejgm
Xv5d3111Tfj2RSnfVaFe/Mz49biQb/vwwnue1TDd59173yfFy+ny+zynkngfWFTK9699CvQp8KpS4CMBCytXJlVuMt47zXjLv3b87w6QGWh90b5z9Axs6AYy0fYogzAh2oHXYN7mZ/nDfRmE884JmN/FmNpJ2ZVKmDIm59hXwKQWqQLMkNKKAIxW2sCKaICFDACMQClZmRxkSAJYSMv8Tl0d5JlgwjhKO5zUXTFULcr8fF/F7aEJf8I8yGRCA1dBZdpkAnv0SP2oD3HrBGQ1vTcNwwoqAixkEvjte19KWxkTjV81Zp3AI9QUe1nMAComkFyoBgXaAFTIDMFWAywOiayrU9MQDMioaLxdbAdkvmQ2NPIsZZbJ15BXhl9GXUZEQ251sWXOTG9PtS/StLzqx3sn8+0pUzoDs6KNgaBCmmaTOWjuiq7vXaF3tdQWcKU3NhQw1L4XCAl4ZJxk1edIRyPhSC0oa2mbskqs7YK2E4IE90KIShDMsUy9cW5qQA2o0CWsq75jnNJX9al16iigeB9mf5vf1v9jMMWvsXp7nXh6I5I2aEwhJcErFfYGK0gBVAez3QVn7l8hqFBq4w7mqnkpjfHQ3mCfOA9RZ3qA8fQqqkmWV9uBMThEvU+NwzS7Kn3Es0Hj0V91OToAGB46U2WJfsizMxjqE8p6JhPKKYNd+4vSC9Vl6ilTKc017M3KNO8FOhoG8yISoVl2DBcAKLWQQbZf637XOthvJ5A6bKOmtsW5iQqX0hztlA5RdfObsf+NwnTL+AoYdBV8DenC8vXlZmFpoVl5utKs44Vqm/pnRV1a0jcEEnT8eHByZV/GfAQQ6qHTg2weR7+bQCVn+fq1Zgr1Ob/tTUAZGSFogMlGiqXamoPXEeXaJe99QI+gJOqIpCttRgAP2m9YRr175dvmuTuRW36vdBmrUw7e2R9PKNMJG066P8ww7pytJwWNFNQ8/ZDsQ0kDycYgp31WiZk7avtR5D3pC16qtNEwvAgYM5AG5icAW/uJ+2OkX0N7pRsal/ud+426M7bfre3sgoKSCtMyOfGDfd3vDNIERPgsNljUO2MFcf12BBca9Jum37SSiVm+06IuOBGpm9+IfYYkcpoJwcs9z8o8YM7lqHeGMZR1uHzkVeehIa4KV4Mkb8oqnT0MmznF+/KgXLmvafu+ppnwhuNIEjVQeXQhvfZRLjV+fVbzrL+7NathU9ZS+aTbe94+M259VgrDg8677jPD/fRPfqZm912/+l3VOtZrN9NK03w3fFP/f+xj0S1P/75PgT4FXg0KvBxYfPmrzdeRWLzseNGAVEb8MqKXycBJoTvCt5OGiXubGYAbrt7KrHsmRp6dh3cAdrCrwKJIH2D8GSitkOAidhCACL039YBFJBaoQwEK6uZ1Gh6SHPmQJsyHaVXmJHmYpquJpCu46Klb8NwVfcuYklHQCrTCVMgIcGbVlEk8g3StflufqHCYeY5yBTYUZlDmJ8wFEyzpJJh5wPTILKk+MgpTqqGmwGI6wAJVKNSgBgEWgzJGrLy6mq36j5u6HcGNqE7hqmYx6iwMpyua3X0vBBZuLOdmcDL6FL+oU5CvwEK2WcBhddy3QINQmZV4maHcpp9VT8Pw3rILUtzUTUNnVTHc90H1JHiYMHdpS/KShqavx6ZnxFFdaYb09C6lt6WAIuhR87McllEDcncKf0Y8N2TT+88iedxFBWtJyYcgx4oQV0nDFn1D9SfBhcDENlMqchtGexGgoLRDuwFVrtw7YhVA4a7humHdJK6yk1kYYl29zrCibVsZ9pnvOY9dWVeVCBWmdVbvN9Whp9+p8jMOYBiTSab/uFOyNJVxN44qMXTOgI8RQMU45RZYHJPnEcCCThcgrdRMBtDd1t25WslBmGj7CtV0NX9bGwvqHFUa+oqesiahhTr441x17Wsc+7wODp6htnQA/eyJuis2vsBim/oc0R/8Bvxu7MvWQ9exYXah/xlte5P9Sm6w0/qNm9cDLJ6trwdYCNDDnJCPdLZDmI51SZ+B3n7Tlnkf4CIYn4GuN9hBnZTxMMbGfkh8lLwogZmwvpSRRCkTkiuA4T6gokoi863yXNoIWgQWe5TP9P3mVBka4Z20zzeY715IQdGgXaQFAMUB7JT8llQ3kybSXLVHgiRNpTa6fB7CzsnvTDWsI9uPdvUQRA2pekXf8EMIiCFcUcNC6kKijjlD0g+blqhvES9G25TvjLguLuRbNS3j0p5RlzMyNJN+vXHH7k0+pZsDTNo+EYCSdMrigDYWSkAEEe4n4rkAQHZzSMGG40Hom1pQdK/5Y+GkhGf9m9uX/rkQtRvyRS/I//xVyas7z9hXytPzYnWT9X3i98Lxq03Qd/Xopmm9zt8Y/DxODd+NezFsG6KTRk27F+fKCDXlcjXsnwVg8XNf+fXmtx9++WLl+7/6FPg+p8DS6HTz4zf/fPNjC5/4nq/p4iI2u8wR3+nhHOxCxJSbxnYOFhfPzv7wj78SYFEHyfo+Y6SDKKfv6vv623D1mfdlTC0DdYlbnhI9R66mZ0iuYdaZOL1mwG+vJbSPzoGFTFEkClwrwNC+QsPq7IQNs7YPQxebCxgDJ369x3hWcOLVNLPy2qZTGSFBRhiq+hxGoEg2ilTDKkTf2uq1tBBQ1DNG1hacd6W6hW4yNMkzD/nTxpd5kFlzMzBXVLPa2TJjpqH0wwaXMarAYhK7Cg24J6Y7wEI9eD3wEMfN6FyRFyiQYlY/BQNVAiAjMQIjVYAQmhmUzdOVUQFJBXiGg63Mqq6MvTv2atysrngYfRjkGI1ytVoy5j6XPmswfrpYvQ+D7TFHHeZhbNxJ2HTtwkoylGLIFuhxyV22n8Bwj8Ll3IChvAGDp/vM2E6Q97BLsxwUNVIUjcHdG0MQY7n0zX8LJnoG5tI4xzC42SyP92uUw70hVF9x9+hpzlnymCcPw9o2SinccfwhDPYjmNtNmO9tjIIPYbTdLVl7irt6ZopK0HBUpLSn2CJtVZk0oD7T2w+ARBBq200gCRkHVIwikTjDsYBqQNJqF9rssrrvHhAe2rBEfYyynwBYYucAc6wqDIVLX59CsqLUYQ6bEHe1jjSNdrbvHlC3TQzFPdS5n8WW4vrSfLMIwz6NTYfSAyV9MqnG26GMa6gu7QJeVN+yP27hDneHuhwCNqJySB0ipaJ9BwUWAKrYRxDWfSduIrG4CbC4CSB4trZG/thZUKfkQR1k/ksfI0L6fwGtw9aRdjgiHyUPk7i0XVhabK4BVDawS1lHWvFsdS27TytJESC5oSWppDxKLFxM8JuyLtbdvAIsKKfAQumLEhfHmGIM7feDRI84ghxtpQSdfmMBFqbOt1jAuSCgSEgoaPINoKLtBBaqHsbWRPow7pRN+gpQGRSgkD+Nz3fAaR480+A+9+QnsBA8nghGqVNc8pLuKYsDQ9RVAOi3KmhwIcRNIQNCKLtALWMEaYsoBJIZh3guaLRcUZt0zKC82q1M5sShAWPIgqACoObGeEoklcJVYGFZpFdvLKfPlMOnH/1IOr24/uJoL7nvvfN5eXExTkL1yuG
36eHfbjJ56HPfcyaNPGxDcalxfdyrlz9qmkQtRSBwG628Pq+zj0sJSrxOsN5z0+7m1XthYt0I/u4cfxZUof7r3/3Z5pfe+Z1Orfu3fQp8/1Pg7sRi81+++Vebv3H7R77nK3vr1q3Mn99pQV8KLP7gSwKLd69M24FVBjsrkFeEuDhoE8DB1ME2Yflb/mfwdUyXYXEgLhKE9p5AqjxUxr+OyIYL40/+ARbuXwFDYWWiqwyToErFLpN8dt+GgYshNxNyXXUMo2R4JmrzTh5O8DAHpinTVc+oO/mb96WMlMt760KdUtf2KvMoIzPERC8T1c5Sba2Jx11RgeKa+soPFIbA8FFdgCEQPAzCsPeYhqRAcoRxxVWmaQzmYxymdgJgoZvZYewrhseRWIywiqoaFGFdf9Y1qp6IBBgjFCAG3DBNMuayZ2Hs+W1bak8hgy+D4brlnquwMl+U9RBeSVUngcYkZZMJn5fJIx29M6luEkmFtKC8pqH0wlVgd+92/4h7rC6rMiJAWISh1POUIEDvOLpd1ce+zNEmzJ5h3bzPPO/A+LymcXDKVlZUbTte5b1SB8HLDqo88zCfC5RtlhXuUZk5/h3Srtu8V9rwHOZ5hbQFTgI4ffdry2He1l2mW3uLpzDbDwj3BOZ6g3tdmZ5yHQAsjMMIztC8CzDpkwILmDalPAIId55WKoE1elSbCMaRlofWtCH9EaQAQ6sEw9X6ptmGid/C49EewEWplDr4SpEEBao/qYanapO0sZ94Xr+21CzjAWsWz07PMMx+xu7am6RhP3V1fg+G2xXxMfKYx5bjtds3muvLiwEjMqcHpOf3YR/Vte0G+45sILXQrsLvaZMy1femF1sG2s7+bn2HoNuY7kltE+q9xCrHMoBgiXLtUJ4D9nFwJV4nB3u04yYATUlI2eUeWwbS8tuyf8QdtPSj/a7fvNnMU85p7EDuv3e/WUVaoSH7JJ6LtEeZBJyFsSdexgHqKBEtl4BCr1FbtJvfTxhvwvnM/PL9QNN4WRIcEYcPkm+eRqBPCsRNLmOA/Z6z2G4wNpBHNrkjSr5ZVa1QgxqGDsZ1rDqhXU9l/Nuxwi/BcplvkVwovaBPGp7T56pC2cb2h2yACWATrJwiMRmG8Y+dDO2uutQpYQ4Ba0q2bNuEF7xA1wGuggvpKSiKxCLPuKe9RvguprDJEljOIm1SXVFVvgAL6qCtlN9vxiGSMBnP3mFH5fB7cuyqR8Y/n7fP6u/6Ph28/WGs+t60axxf+7z722eXj8thUgryrWkmvuWwXS8dl9OucRKsU5/zml1M4MLzTviaU9K/lG/Ns5tXfWbqNW4tbx9YXKR5/1efAt8vFOgDC8Y7Br+zP/jSl5s/+ea3rmxXB0fPDJh1MG0HW5/VATPh+OETn3cH55qGk3cYhHYyLgyj6cN0MHk6oRuWYLnmnrBRXeJaVJW0e1BVCUaYyVcvUEoscrISKKCoUotIKzKRAxaSLmmZXq8cLaiwPMmnAyqkhvX0tN7WtZ4yDzJ+7epiMews5Kv1LvVoJ+Y8lFEscbR7UMWkrDQWRh/OgQQKNWWSZKhGYWbGYD4mUIMaRQ1qBEAxwHk6gi74KCuow6iKwFCokHHA6rgr7+4Y7a7Q7lOhJxqZ/qI2AZPI72GkG5GUkL4qQW64pl63m7optYgqlfRpq63Ew/0hDGvbHgBAdKuaFqa4MmxKAVS5kNnRW5SbyemiVQNXJRVuTLeMlMC8lHpEikKe0lPpiu5tDS/zdQ2gcMPV6jA+qpVg+M27Ldp5l5Xu7N8Aw6atwxIrsNVlplITd7beJNxzwqvmpR78LuWVnkorVJWSqaoqXDKgO/SdVdx/PkPqoI2Bm73pUQkOtRnBANpN0EahrR6sJtDBlwF0BXqSMmoAOzeBihN5eMqsuSJ+aH8EmKjas8/1hDpYN1fr92GEfScDL0Po7tTuMzFNGwv8lKZpTxCjX95DLoDFctyxDsPgPmFFf+XZOqpLSEoIbz/M90baukpVXWp5cT4AQ+9Ne4ClfdpD+kwADjwEJY9X8EIFEy/QUH3Ib1PJ2S4r/gKRMOe0clShYPQnBLa0oXubLC4sNPM4EZgBxOztbGUFfhh1LtPZkdmnbNPkpRqWYEfAoYcsT9/bt0Yxmr5151YziUqa3+m9b91r1tfWQ78ZnimxyE7biG7sd9Yzv4nrQoCupVXdUj2KrzfjhQkf0Q7pu9Db70spgG1n2/gtC4xjBA/tIq2jv0SSoTSDcjg2GM5v1X7jNxM7Buqh1CbjCPkXD3FF5SpE7Y4TMPs0SmkXy859fhPGsS7Ahf47RF8aZNHgjO9cd7qCIG0isiBBv9OVrlIs1TIdG1QdE9gMUK9IQSjfGd9zNtGkfOUdxuqk60KEGyMqZVNK4bcisF6kXabIR5BofyFFUj4/zu8pK48dx+qReoTW9YnVamN0wtUYV707j3lV2uWt8Wq+9dqN5/2Lnl8I15YtJbyqnN0yc1/Lm/J33l1I84of3bLUNCjg+Rxo2lKZ/0mbsvz0X/vJK1L67jwSPNdy1Ws3J59Zhzo3/9uysfj8468272w86mbVv+9T4PueAlPwZZ+cvd18bPL693xdJ/X4yDzynR4vlVj8/r/+4+ar7/ybkvYVA2oGzjowE8rfDtblUYbtTJYm4LM6eDmI+tYkIzFgUq+rl2WlrzL6LWPv4MZZAEg7CQcIlHAOkIkPo+KqnwBCo+0qsdhrgUV8v/M8dhIwC8YrA6fpl4k9QML8WEVMWWAoAnBaxiITgOUnTFYhrViYOFdHneTLtTLtLTHKxEEc8zMNLoVWxInhtCvUMDuqhchYlrRp2BCzMDPqW6vOMIZO9zgrjxNsVKZx6dn4VHM0PNHsDI6yFwE++2FOzN+V/nhQgqlyBfqUPSVGeRZvNJaCMP6zAw0PFf/+Skw0rNauQeNnPUbp8ckz06DVpfyCJ5kyJRw+P2bF1zaQjh4yQhqGRrWJONoybLcgRYmFth5TMKyqQ0XNI7EKbdLWlHMHoCKgGYdx1vi6ZydB2D3a+REr4EoensO4CuJkjnSbOUe+HofUW0CxxntPXcXKfJ4BCMzD8ltX3cOWfQqgBuVSrUZAuh01JlaHd9lfgM3LjgEaqja5AdpQjKth2kljnHxlzkdxUzqPatICqknzGEcPwfxKHQGL0o5tGOs13M56usmb7VOlYcV+B4aUZ7a/am6TrCzPz80FnElzmVYBwvgooJM6LMy7y/okbYvLXFb1VzGWVsIwCqMveJhw5Zt4ZZUdz0AtQ28/ksl3k0caP3lohyHYuHf/YfMUQ/N9JRT2efsG9Nle3wQsqNYEGIJGkZoJLKivqjUTAMXlhcVmjvIKePcFFqzeC4RUQ1KlynMRY+wZ6iWzHC9TfI8CjC1Uwah4M00a164vRbLzHNuU+x88QJqzHXA1A031FiUb5rerTG0UqZSG3vb3bAZI/ZW4bAvSoIuLD37TOahL/WZjn0Kd7QOQHHBDv+C9EgvbQHUqpUQOkLErIQFBhRIuJUpEiWQgEkboaXk8za93GIiQpufXXlQm7WP8SgIJcB
6c9FVdUgqiQbhSCEGdtM4gz7cl2HLjP10S5z5JkDZxbSvBjkBDVassMEgv68jvBtWqEUDFJOOG4GKRdlikjyix0OXxnBINxhclmi4MkGIZqCy9Ze6V1MflW/VRb1xPPUug+izETSD+UFaLm3edsCVG+Vto1X1ynr7xku+L4hqt8+4iddvyW5H26JXR39147X23LDXvGvejXmsavbxq2r082/J4oWx/FoDFR6VdP1yfAn0KvJoUeCmw+L0/+lLz5W+8k5rJXGfwbUdrB8oyRpeBUbYzq3pcyxjJJOBk2v6+TB7D1Eki+tVMmJE8OEH3QAMTdRj8lgGD0YlUoQUBZSUFhsDnxoM5OPcGVaQVWWUBWERK4YQMQ1pVOnrAgsG+AJrzybL+DqOcOvPHazsxpP5M5jIZARZMxpFQyBy0tOlNJm1dKw1MxncyC6oduOIqg6S7yCGBBb8LcaES6avuoPpI1J9gXsdgDMb0VsM5ODbV7A+PN1s4Vl3Fz+UmzaQhc1kBLUyV3neGYUTd/G2Sq/dDgAzrYvm1xZDp0M2mm4LBXmUfi1kYEvM/pMAHpKtqjm1uu2mvIGNsGyslcMV/nGsAWltRpSLmodcnjbCts0ytaWhcDZSBVtA+YbDnoA8oVdEVrHYOqmoITpSKCFTsM0kLhngdRv199oQQWOj6dR5G+nVcnd5BP38aptXQ+wCLNZhMVZp0ERsPV5Tb3acPkDzQaWJ8PiXzZZ3oQ0oFNMKVBjLLo6cYEyOh2IbJ3cFdq/soyNgNtHHV95+DuXY/iHEY3MLQw/yzGlyMZGF8aVNtJzSGlvl/vLrOhoBsHoekxef2X0Gl/UDaKEWQQTS9xQV2UofJlJbSfZn9PaZh6M9gyAUgRM1q/yOAxXPAj/r8czDvs4CbKeIrMVL9SlBSvZv5nWzD5MvQy4wqaZiEwRR4fwNnDY+erKYd8iXz3nz2Ye5VxfK7SDtiuDzGTuFusOfK/xhlu4kXJ1WhZpFY7Gw+BzggNYDe67TTFnVXHeomoEGbENV7wrzTdqa5rdE2fXBmaSlg5Ii4GysrUc9SsiIjPgV40z5CIh3Qrn5nUzDF1lWgoDqVqlBKLLYAQdqKCCrsC9I10gboKmCyz/nPNDL+EDbqQ7zX6NpxQ3usY9rIfhcDeNrUdnJ8cgdw+5MvpYfjT8aHjAmMe+ZBuDwkmHnZVnFVy1PT9Kn1iupSysj3QP6CCU/D5rfAgudGstwDpg3dbFOBTKlLSTRSFL5DPcK5X4eAwl3Gs5+NYIUFiVH66hTfybL2OXw37i2j1GKZ/qIEQ7XESDSTJAVsD+tZym3Jrz4qI13f1vD1d/dqWNOsx+W4Pu++74Yz/17MNo3ET5v4ppTQn/W4nFb9ndAETNBOBOlaj15e9UHnWtO5qvy9YG1ahq2p1vLW3777s2C83aNJ/6ZPgT4Fvi8p8FJg8YU//KPmS1//RhgaB8E6ENZBtjegQhrfFSY87IgzAk9rSONKvzp4l3tHWOM4UStlUCXClXUZq6wACiq4j91DO1E7WWeC9spZpQlOsDKGMmqCi3iDkjngXgYhK9UwjMVAu+R5IX478FtQi5r6ttc6SdUZITWjflE3gfEqaif1GjamVL9X/1LfUKOlyyDcnqBCRltPLDIsggvVNBIZhr4yFrrNzMZXMH9jTP4jU2yCh+rTIPYUR7iV3Tobbtbgix7BFLtZnAbKcDaoMxTpxAQZT8IoI9fILr+Dx7othaGVyR4gT6QVg0hBjmHsDimPtB/lHexrJvZjwniOwqhIC+0IZPCBdLxnMz3UV2YAPJMwLj7XpeoBYaSlbmazczn348SfctUcpikrojyzDZRgCCjc02JLiRPPlDrcgZG8yVmNqQ1XNqfDuxTAQg9MWzCSur/Vq407Zt/gmv0kqLOr6xpTG3abvqCO+gD9gaXtqJPYJ3ym1MW+mjbn2QiW00pRdAkLW0ZCqNWhHrSBcfMm1+rG1boswdQqoZhixVdD26xKUx77WYzCZdxQ4fFbcc+FdWwPHgACVpBauD+ENLIP6KlJ+wP7ukyxKv8a6CsBUKIlA8zL5jb7OixgezBA+q7KCxB2UBlbixoUe15IOxj7eYy0vU6ZLnXRpsKPXYmGnp7cOFLGSZU5pSIa9So1+dZ776NW9Sz2J36P0tA20lWtjKztL9Adhc6TlGMSYCEgGoSpv7681FxDPWsZd7PbbpCHpCe2G9BMqYV7YFzj3TSAxJV/mW+vk/TpfbZ4d5VdT1Pu3bELMNnEq5TxI22jHEpr/Bb8Pneptypi03j9mhPU0RZ+X6pCbSoZAswEVBDG7yzSBsopjfeghbT3NE4S5BLX0IR3kUS1NcckvwVBxThtYR72FI3Zd+lTjkMeNo3tGy9Q+X74NognPesYaSa2aR1XwB/tYeT8L2mkrIIbxgEXKWg77a0EGv728G/GxdikADBoH8dRR14lMgEmfquCCtoXS57Qlg6FJT92G4CLcWhm/1DVbpq2nOMUWLifxQKSC8FFdoJv8zuvhyV4+WEd61FKXH+9+GqcmkeNX38bK88kdH70/kj48oy/NV7vweWbTrnyirg1tiX+UPzL4T+UXnnQKUIenNf+cgR+916ezzM1lPXtA4tKjf61T4E+BV5VCrwUWPzzf/kHzb/66tcy0TroOlFn8O0MuFltY7DMe0fN9p0rcTkYuX3kMJow7X2eBCy0jCqMjiuMRe+ae0CAK6sVWAgcygRqOUyrBRa5hynxPZN5dx8LGYPiWlaPMTCIlr+tQ1b5UifTa+vVacVSH0vp0VaGi5NIJjxuIplgIhdY9PTR2lkm8KK9NwXXqYxXTif/c7sKGe2oQXFVnUqayiSoG61xr6BCtZZxDDpHYQoGARUngIrj4clmF9WnZ2z68Bgm6amr9zBEGrBrQKxUwA3qZshrduCkmT7FDevRHl5r9PQDuICJHRpgpdwVcdMEpByRZzzk0B6DtgF1PkWiccYK6BCMiXQ5wiDXtrAu6vfPUqZZyjaJXrhMvjYNnsekH1DB1T4wTR3mYGSmYGJlln0mSNFrl/s/bMB4l03uTvHohLE2zLpgwZ2DlZC418UaDJ1AYZP6amAtCLUvTFPueZihWaUMkE71H+mwDTO7Q3htbqSJG83puYeXOe1jAtIYFAvuYOTGkTCMosIjOFDaowqZ9gUaNz/H05JMrbYUCzBmd66xAg9wULVHz0w6DXAl3hX1MGysDC9h2+B+FdZ3G4nHA/a1eIpNxC5pygRqT6GkQ4Bjn9U7k66BbfsZbGjs75bT9nrtzs3mGukJGp9ie/CMMu1St23sI0x7B1Aioz6P9OYaq/9KU+xb8SREAXQru0q8sKHUVdW6Gco4ylWvTI8ePW6eAwRUCxK0xEsU5Yw+P/WDtAE8E7TNDNKTKdJXjeyYvqdq1jJG13qH2lxHwoPqmOpW2jtoPG3eGg4rrfB7kWGfpX7XoOHRWVG7O6BvbWDnIajYVvWMPkb1840GJBHHb1fDdPuf0psZ1N+m6FtK9HYBW+uU/+nac95rEwFjThzBm3Yd7mOiROM5dFIdzf5cp
Ias8hNWiWjsPugjSlDNK/YupG1c6VDcWTOmUC4PipH0YxtFPr6Xlg6u1tP30s1voYw79M8WJJhGGRNKQgIXy5N4XunTgq9hAIIgw8RMzzGrqHS2izD0VeUjATf0qWw6SJwTFgQO+cZPdD2tFIPvdUBwgY3FJBsQCgztL9NVaiE45xRcRFJIZoHdZnrVUSrwoTfS9QUxrgzrQ+lQD+Onoj7wvh6pfP3RCcOjxKmvrrp20/H95fwux3lR+PZ5qWNpD0veKeWF+wvJ9gIx77ShajbWvw8sLlCr/6NPgT4FXkEKvBRY/M7v/Yvmj778lYj8XRFz5bKs8svYl2HxfDDvjZisAnNf3zt8eu+j3JcJwGcyp54WwjOemwAXkTbAoMlkyUyZp4auEfvLZZgOz5Iuv+ONxTBMrjWtXGUMiG/YFMAGsiDt0YvfptWdCWuZk1cbxXmoMAEVIHCFQcizMBAEsK6mn0mrBR/tT8PJMMRrFFc3+4q0AkbAVV+Npy1DVr2NL4MAQ6Pu/iQGwuOoPqkCdTI23ewAKrYGRpt1qqb9wAqM0h6M2Air65MDGuwqsSAJkpnhBhjSTAEs8H0K08OGYq7IQi+NtsfwPjPOxnpnqkMpnWBVX9eXusDcY1VUYKGaimoWMkfSVqDk6q+7Ik9g4zEJuFDXXZULpRUaSKfduI90g7xcFZ+mHjNs6qd6j8ySDG6MqclLMJKdi8ljBtro2Sn2D4ST6XPTvOeU243csmJM2trURDUJBnYcDkhvVdo21L0f9mGO1eW3zNJdsKCNhcBB9TilWXuADyUOqtQIBqxbvIvxXrAbRhGGUremggptDWapww2Y6Ndxr2parqo/Z6W82POgYgQw0BPTMqpM11APGofxte0PMLx9iorPGnYMphfVFehY+par4eyFgXenY9DlCPRSYhFbBNrYdrmDZ6elxTk8CZ0WYAETbTtq+K1XqY21jQAU3cveoWwLqEWNw8gLFmU8dSv75OlavmmZ0BForC2HAOcEYLdHHQTjghpVmJ6jArUDA65OPx9SGGJtKGYALkvX9DA1Db0Hm11UnewLc0hKbgksVB2jPxb1RqRY1EuAIZ38pgQV0/SZZcDP3bt32PF6IEBmbZWN9ZS+AJg0cvdb8rsQXM/TPgI4Aat19nuK8TzPBG6G2STPNei3Ah389mXgs0M9YSYEcIQT0CkVek6ZBRLSQTU/D9tbNSfBZqShfM9KDFS1UurhP8tvuTKUlD8BAzaiz/eho+p0jjvx7mbj8kabD8cxo9gXPBxn/OjLuJLPP/e+z5lxwbGC8YFnjjc5iJd/Lqhgu+R36W/VMaWXp2px2Umc7/dE2yvOM77v01ZyMZK9b9j5PeCCfgB9VIu6CVC7RZ+4BtidoG9Huki2pe4l+95fymHZPbwWuviXI3Vrn7f35cX531L/8rumQ0KFLqnjeVjvklObX9Kv97yraXmtjy1zSmNaed7SL4mV+xrPR92jE7L7OPeJQ3qhynlmvTJcjlDz6NWRAOVZS6tSsz6wuEy4/u8+BfoUeOUoIL/1wn0sPvfPP9/8y3/9pUyGriyrixypgcy8A3WZRqInnD91JG4H8SoVqNeMw74jvkDFyVsmQUDgpKt6RiQOMHtxPwkzosTCvE0joIYJ1CP5O0m3wCJp8i7MbMopIIE5sIzEtWj1TAKm0UuHUClc+yZRCgPgkwtDP4k4OciwOIl7zWSRCb9MiE7xmXDIMWHb8DIGARVM+DIqgonYTsA4+1tGIhIg0+Iebi/AYkTGH0Z2bHKWlcbZZpeNVtYBFSsnA9kFegt7AXX/R/Z3Ah5mhmBoYCNJLVKaUUDGhLs4cw6yIizTLIOnwbUM+wgrmRqEm98JZXVn8ewuzlXVklNWPDUmHYPplp5huqGx5ZUhHYZREZyMCj5gGK0z/1nZhgmkjZQsuGI+wqqpkhdByCT5yYgaUOmUYFJm0TaznSZRC8u+FuThbwgSZlu7Ceu7DWN4CJOurcQRbk21m9CWQSZaMOqGcoIjOk900m0rV64j5aBewyIu2ty+dwCwyKotDJXSIxnK9Hf6k16a7JueAgGlEUqqri0uNDcADEotBBsbqDhtYONg+W3jMehlGEHFAuBglJViGiPAYpNwqlQpYRBIWSYlKtLNb8y6KR1SD/46dgsaWitFMP9FQIWelU6oX1zDIgWgQPHwtGu4Z8+T1wKM/2uv3UaaohE5e5nAcGuQrhvWdZj2InGCJrTRKCpGQ9Bb5KFqGCxpmOg10oqEhvIIyPwSVH0b164BAOFO2wsAJ/vwHuBDVTkBmmBGw2t3B1cS4herZygB0zOkMX7bMvLLy8vse3GzuQ2wsHzuffHk0SNUptxgkD4IXZQC2I8m6Huqiyl9kWG2PfzKAhzoI2H8+Z42AEPJh/0vHGOMr71L+h39zf5h33e/DHcF3wMEBFz4zdEfjmDSNeQ2j0hFvbfe/DOvKVbzdQEs4+63Yf8QRNhHHUJ0MBHbDvK2PVVvdCxwXFKaZft6+C6R2vsCJMiFNMq7sghhGH/XscN65n1eEJlMi7S2pJt0CF/GEvKFNsd815E6AjBOkQzlN201iLRodAZX1dhsjVMvJT4CNe0t7tBv7iKNWoJ2eljzc7GOHiX/cp/f57e5s11CDOvI4d86vtZrXnT+mGbe1Tjt7xeF70TtlSv5tnmVZGy187y7cep9jVN/9642Jketc33eC9++L4Euh6qhX3x9Ub36EosX06z/pk+BPgVeDQq8FFj8P7/9283vffGLYUJk+gQCToyRXmRgbQdfLqodlF9eBQH8ZXL1GlBAeH4yyhfm34k2wMI0YcaczKOCAIMZbywwf2EueB91KOPnJJFkVH6bPhk4e7TvzafkyZ0ZZnaI2hK/nGrqNJC3xEuxuPaONi3TNHrnzbmEgkRUXXECz0TrROg/Aseos80rAIQwbnynhELGoJzoMMOcyGCppuE5oJWtMyIMTKQWPJPBHJJpR1IxPDXXNBPzzSb7VKycDjYPULdQd/0InfRBXKHOHO02M2hUCywEE9I6DCGqPPrKF2yoQoMH0F5ZqRD5FnUoKpPn8dsP8yVj5+qtEMVVXzdkc9XW9neHZts18a2fIAjgINAQMKk2MoH+u152ZJzVz7eOuspVAjPOKSNjfQMG6Q/hbLnal5SGyOtZ0qgBpcyE5Ulcl7I6v7UNsw1DfwiDbt3c52B8YjS2DG4o594AGlDL1KsGMwqo0Y2sK9amLantU67mj0FnGUalbf6rfVY9+QosNCK2yjLON7AnmIfxN/xTbBKewkirhjMKuNFo2r0lrsM4L8F4T7HCb1UEa/sYJe8jMVLiImBw5XwTZl8mV2bW3nmAIfM0zN4iedy+exvVpWfNCnm4Gj9BHe13+8Qt7mABMjLLpk06W6hqTUBX1a9ev3srHphcOV93ozkAxQYSCIFLVuTJK8bBlFlgESacuBruDvENPcMTlFIQ1cD8lj0EFtJJl7ICi2WMsZUCHALOtEGI5IDybGHoTsOiyjWV8UO1KlW3Hj1eiVqU7XH39btspnerWQI8PX74qFlBRWz1
6dMw0X4jAhZVtcqGdthfpP8oYRuNBMTBK33QFvObp48LLKyjxuI2smpMgh33v7B81QGB9LIfbfL9qH5pOjL1Qgj7rN7ZpNuxwJoxyPZTamDba0TuvRIP2/EIwJCxKWUoTgCMEG9v0DV9jHFTUOr46e90PhPlcAyxvpFI8DsjlO/8Nghr+CwAUH9pYrjeuEOw1D1lJCZxIkXluS2WhQLiaWsRd7p8y/SYgAs+0GYIcDHsaZ2gqxIhVRVv0W5vICESXLjD/Di0TJFI0+MqcEGt/KCgYTlqGOOFPj627peOGq5HF977LHGuCH8pen4mj/ZF6EV8j97zTjq9tNvwvTK1cXq/2/jnwT5c9vrOtrp8pD48rPWr7+vz+rt77QOLLjX6930K9CnwKlLgpcDiV3/rt5rf/f3fD2NQ1A9cnVPKILhwkHUaLoN32Gqe+S+TIWEKg29Yme0CMpyso96k+N60OF091c+8K4muZsaDU8uMuhpnnBymb77lfyapTGP8LgxhLxg3PMxBGRn0y8kknhKXt4YoJW5DJn2TJY75+L7NLz+obNJxkiXNMsGX+6TFH8MnDhEMIwMQ1QSZJM8wB0VaIZNTTxly9amJFKlF0oOJg4NhJ17ABaBiYHK+OQVYrA2ONU8AYk9gak5YyR7BSHYcacXkyX4zAbCYGsSrElFlw2W2pCfcURinshkejFPLnMDNqmsS4KBajuSxHWTmPV2RtQ2klKokGgKXMLZbaWvBRakjuvPek4bqE6Mw+dn5l3oJOnShqj2HIEv1DfMRVIzAfMmsmocdUlWcAkiLhMwyeMjUSidBjVKC7OwMsDiBodcuwtVkGdEwzsSRUXSPhejWQ0NVfrRHcOWfRur1PVfoA+wos+BWgOUh4JCpLn0S+vF+Gt30N9+8G9sA01/FVkJgsYkbWtV7tG1QijEDuFiYZ1duAIJlVuXKvSt2AIGR0PBbI2MZ29gyCHYEFuRxeshmc67ms1Gcezo8fPiweYDtwwbAQqZRRnePzeuyUk5ZxwRw0O4IxnV3Yyer+zLAd25di2qS+e8AOp5qNE5Zn8N8p439MMlPBty8lV7IiNs3aNyoQEV6IE04an+2vaZgPJcBPoILgRONFkmKtHYssK5eZer11GRd9Yil96p9dkcfByx/7O23ABXLabMP3ruX3bUPkHJoMyEQiHSC8jnK+D1k00DSNw8N0E3X8cM+JFBQ9CqgiGSJ37oB1qbD3cmrdNBrgBn5qOa1QXjTkeG3DyvVUg1MOseTFn3FMUKVJiU17lw+hv2B33jdTFAJhf2pnt5GwgAdlV45HgrWDWc+jnem2Y5qCVuAFN+6dJbYHOZhWBcqpH1UD2kjFyTsr4JCn2dMhQ7mm4NnpmEuyplOLAPlP9POgu+dpYXYXpzQBrqqPhNg0IbupK5qnDYwy7TBbej2cSVTfMfTfMPKXur4ab71qONd/V2v5yHKk1o854V6mMxVaaVebaCavrEup1nT+UjXHoEKbb9dnJSrjdMtw8vivaguNU5Nx9/ed8P7rA8spEL/6FOgT4FXmQIvBRa//Gu/1nzu85/PxH2MXnoBAqrQFDDgxMXwGAZOIoRJZ7DMPybQTHj8dmJ1gnSlu0yC3PM+6ZCWnmeqfUSYTePKMHCtk7VZlUmF1M/npeRq3r6/8mgnrkxgptBOujV4QFAb2YHehdky+JcQGfxJ2Kk6c2mZCcN4KAWJKhTvDZ2zLVwJX1aEZQKGYcaKhOIcYAgqVL2RGVYiECYchiYGq6TtVZsFlhGbs6nZ5nR8rjkanW1WmeKfAMBWYWbHYKwn9lmlxnZiPMCCVXqkFW44p/xBYGEjS38LKGMQDzkwPVn1lImEYRfUVKAkYAgDZBsYjys3IW/0zGVuCVPrLEPmimpoS1qu1MoouWGXTJrejaawrRBYDCIZ0ATYlV6lLe7vIKOnrckpfax69bKRJbWHZXECLivAABCBCeXZZ5VfD0ICC/NT7cY6CToqGHE1vTBtRSVGnqb0xbMwoBon+9usZHi0BZDRNj+BleVzfwRBjd6HrmlX8cbd9MFNGPQH9x/FbkFpk96QblxbapYAFAEz5C1t7NO7lFXD6j3UtpTgqEbzHGlAVa8SrEd6QPl1J3rzBqDi1i3AxXWAxYPmgwcPm1U8LekZSyb1CEmSbaGEZAxAFcDB82OZdpjyaQzK5/DapOvXajj+GGnBExj7pytrMOHYMFEO+7+H7aer40iKuIcoaQslNvlmCWN/V6ohwx+VJ6Qx1ymfUgmlYDMwp4JPbU4EFtrxlI0qATzQ0X0sVrF9OMMt8vziUnMDe5ERmNgd6PAE4CSN3A1ed7Tm4fcfYJv2wQEBjK7SAiUP7gWixIak0n8Ef6pTpU8B2uzDo4TTKD6SNsIJKuwj9gelT3GDS5n08iQdlELYV/V4le+G50q+pLfAS0ARkMI361gmuIl6E+ULkLCPcnrY7uYnXR3bLJ+Sr4yhhI+Ujjy9GsN+W8CtY005Tae0TrlK+zGlb+RvWY3jR1LH1Ixb5l/+J66pH0PTSCyIp3TyDNVGPvgYdJ+ivniCV7cBVBMHAcODqptBtzn61A1A2ZsA5dvcL9FO7j3jooFjm0dlist4mUfl+fnthbteXdo+50tTqun4+3JaPvOoccuvj/DXPNq2SOiaZ3v9U6d3KctaTtMp1CgBunW5FOXKutV0DGvcPrC4TLX+7z4F+hR41SjwUmDxv332V5rf/Gf/LBNX8fBTmFSNrKvdQ0AEg/WFgdrfMAWZ8DKJCi7OgYUMQyZWrlkZ9+oJE+xzz6xqOQm0CTOF5ugO4r0HnczrwO6Anfs2gvf193mYMpEZtlePFlj0asS7qFGRTpLqpVPSq+8sQiku+XIvIypsvXj8AABAAElEQVRTIUMis6a0ws3iirG24OJcWuFKsd6VXFmObjeMc3bQJa8znrkD78nETHMwOsOeFVPNY8q4CqO2gSrQ9PZmM3Ow20wf7zdjp2xIxr4LI3iAGieea5PS0lV46SvzJFOQMsOUyph41tVV94rw3roY/qiVcmgoTEKRKklz46S+/uBwNbfSVJedMkB1528Bk4bLM3iMsp4gDQxJ8TjESnA2aYOxUzpkiurkh5FHHS6SFRjUGFyTvWWu4MHdmbPbMeXbRw3qEIBln7Ls0l5m1I4t+FG1Q/BX0gPwUFaNxmXEVZlRH193rVVtT4ab7DhIiT7rKriMoTr6N1ldv4m71+Vry1kZfwyT/uCDh6n7HAzYm2/cae7cvJFN4CyH/Uo1G3e0VkVHxldG2/ZwlTx7LcBwKoEgkbS/Ug8B1M3rN8jrRly4rjx9gsTiUfPo6WroJiCQ/jLYE6yi64Y1TKnFRhIkY+gp0+5GaPE4Rb0FFo8ePc1Vg3VX+a2b/VZ6BFzKDNt3/V79DqFrvg/bCPoKkJUuyeDPYvy7DE00FJeJnoYxVdIwSb4CKFWv3BFcl7EBt6SltGIUJvY6oEmJlkbnT1GD0pWvtjvWR+bd9tbzk4brlk9m+uaN5YA7wZRetQRngjGlLIK/ddKo0of6rcuAC1JMxDIaVkAmKNn
YQJWOeKp52t4BTQEWo6GLkj73sVCFSXpbd2npNy1tSt8Q9Bb1MNX/XDyg6O035Rr/QOhcpStlcYZxjvK4oOLvjHXEsc5Gtu942vfLwgXAgPCOJ6mDfTjAwlB+moxWnvyz3mm/vOEdv4+p7ynlciyhEqTD6TfAt6i9xQnups/4ps4A/3QmXP7iIpn+7oaTr81MNq/hHviGAI3vMYbcpF1yLmWVNvUoz+uvi9cazv5Uj5S3/uDafdd5HPp3f7/svqaRskjTNj+f1/JZgu59L73ui97Dq29M73LwOg5+KEYb9kPPedArL2XtA4urKNR/1qdAnwKvEgVeCiz+3v/1y82vfe5zTFpMgK58M/EbQRURJ0RXmB0UYT3aOtehmp9MmszOYfgCKpj4FOk7IUeFpmVaZKpMv6i+MFA7AOc0yTZdLqbsoB2m2Mkiv8ufTMw+IJ4Tcz3qIJ8rj73CA5BqCXOeV83Ta8nVdzV/2VXm5hLfxE2n/Z28+JHgbYzkwzMZkKhXMKELKMZlbLgW3fGyOizAkPmW4S0r/zAAhCMy/5n8YXRYKm2OWk9Qm4PjzQOYx1UY8u2tzWZud6uZP95r5gEVo5zuUYEfyriZla1R7Uz7lXhmopCl7m35YVRSLpl/yyEjY8UIJ/NaVTxkyq2/jJDG5dbPNjXMEc9kdoqUwrqwmsv7I9r7mLCuALtiPsK96Qg8eIgqBkakMkOS2T4Bw73H3gVKIexf2qNIp6h8mCfBzNc9FMZdGccAfIi8tlRncSUfBlXvRDKR1kkpRPooeURiRN3MTGZN2w4Z41XsDh7iIck9JexqMs5V/ctV6hSNeppmdaOqepON74q5K/8rMPuTpHULBvsH3/4Y12uxr5Bhl5lUBejxU6QE2EmoehM1P8ph+QQYlXE3s1FWxMdg4LQ/mSOfuG9dmIMm7MuAYfM9pCOuspuue2ZMIZWY8STOBGW0Dey39rsCUNljBGBhGwgUHz5+2jwBXDyj7LrD1TZAMJ82SNsIhAGGxBek2S6Wz29WxlYmV3sHDaAFi7bBPFIL1b2UVEwBHt0wcBRL320kSat4d7pPno4PfkPDqOHMzM03c+yuvQg4W8el7ApeoCKtYPXfsWBMCQzgxCM2CeRte7kvx+u3b0JbmF/GnTXUqgQsqtbFKB+aPAfIKJGw39iWu9iz6Awi0k+lDvQPgRtVje2QtBT4OR5lEYA4fofuXF6/f8clpRv2JTuJ8f1GCjNfHBgocVQNT4CnfYt91THpBOcIsR+jb5tGWTyRGo6DpQ8UhwX8hr45qa/5mIf1cEyQdlWqZr383iroIGjKWtqw9GG/k3wxXjnx8RAwH3ABmBhw3xoXMii3Rt1KM45x4HCqAwe+LdWidBCwoNSC612AxW2uemhzbwsXAVQDLPmYlflxUIjes/z0CyqHz89/lTL7xudt7JbmbYTuxbipYIlXw9d86++kT7huPvVdN7nL9ylbJ/36vqZXf9e69epiXsa7VAfzLGFL3QzTK1WnLjVdrwlJuJ/6zGe6j7+r945R3Tpdziz1pEz5/umXAmnPJTy59Y8+BfoU6FPgRRR4KbD4hf/j7zf/9z/9rXZCLNKKAAsYIvW7C8PBROng2j38KRPKYFRBgzYWUbFhQq2MSgnj4GwaJtAbfr3Nb4fcHA7e3FZgcT54O9m0MY3TBu8O9g6Q5awDfkk+Az6R69WJvaRlmilAydNkSSN5W5heHu0PC8ZRsicvJ18YAM+oRMAgVGCh+lMXWITpFVRwauMggxNbByb7ISb7IZjMQfeHGJ1qtgYnmmfsV30fJmmdFeHDna3m2sFOs3By0Mw3MJtILc4wDD7GW9IQ5XcvBxogzP8R9wF2beGtjypIrrJOwgxNtAyZz22f2s7SwXpHqiHDKsLgt2odrvpqCC3RYiAN4xEmT6Yapg8ChKmV2YNrzGpngAK/YfNiTKpnKGkis7LPiv6R6dG3pJEgUKIWaULpZ6p2jEETd6XWHesODHJcouJpyd2IBUfTMNplrwS9JZVN6DJxk1zqTdoaeiuteACwePT0WZhxy2nd3X1b1RWZvyEAjnr6t3DdKpNum+rVSTsOz11sFwQpN2GUPwmwWMYTlN6LNNSWNusYMQsIHpOPOvmq2KRvpK1xl0qeSgCUWA3YD/gdFR4Yd9N1jww+tuzp4P4XT1aexSbDFedZpCSL8xhRc9p+Mnt6ftJeSToJvDRYl5CqXMnoa8TtnhcHGC5n40i/UfsJ4aVNUQUqm8EN8A3bth6RktgXoK+r/tVeRtezkzKi1Flm1BXto33oAtBzJ3BBm2BEO4x5JBxL11CdYv8EaX3//gfYXDwhzHrsGZRc+Z0U5p+VduotOy/jrm3LdVTNZNxPkZi6X4vQL5IryilIsj7Z6wI62meUDgkQ/QZiK4LrZEcYv3Pp8Rw7lZ7htn2VeKo7RSpBPaSL/UGA7fdg2yntqsDL9375frO2oXRJW/LbfivjpvQsZ+I7Bpazpq0KXJ5Rh6ISSnvwz88siw7Q2+/PciQvwTM0CrDwOWWgeqUN+S34s3/Znn6nLgQIVWLEDfB0Pws6BsACWxpA2SB9hM7He8CFklEli3pto8/PcS7Rrrext7jDuYSEyc0n9RBl/Up5/HvxsB+96Ci9ibeU+09z1NCV5jXu5bx875G/7f3lMNKlBKqp1p8Xf/u0puf95XRqHkmNNL3W8PlVyUBA29TjwzkUOuYv5e0Di5Cp/6dPgT4FXmEKvBRY/J3//ZeaX/3NXy/AgonPidIzTKf3rnYy4dXBtB3Ny0TncxizAAtmvqzQMXAWaYUjbTl9f+FoB+g8M4yTQE4vhflxvM59J6IDdspRB3PD+tDo/uN34YlLgISnwFl5JB/jdoFFykfQTNGJayqmV+JniigPzp+ZJxN/ndwFFTIAGjzLDIzBlIRxYiKPygvvZR4MJ3PCTbvaLMhgBZqJfggVhSFWEg9HJhulFc/ORpBYnDTPYd5O97abG2x4t8D+FLMYbQ9iZ3HC8yNWat13oNDWusFcUFaZtELtQgtXPmVsBRZes0pNOFeOc9Ju0iXAwrCqkVAXaXcIkyZTr+GxEgbBgbr9Mp7uS7EHo6ckRgZSMKHajRKLqIrwm8Vc6DYYXX319fX0oxewAfqUkhb19G0P+5p2F67SywTqDlQQpjvXMRhZpR6qxMgs7wO2XJ3W+Nc9JFxB3osOvSo/ZXVZBs0e4aq7Xn2e4KlIcOFK9RhMeJg46KvhrhIFpRXu1XATewAZQI2DNdiu6kh2ARnq64uLzVsfey2eoqwLHSu2FzLz37x3v1FtSgAkgyGjHHUlVuf1HuWpC159D3naf1ThUgowA3iCfYzK0GMMr92DQimIkoUFXM+6k/XNpfmo2dkOrsJnk0TKpfTjDBrLGFtXVc92kAipbnawU4BF+X6liN14EEdBAAVWqAU1qsBJD8Fi9j2xT9EmtrMMvV6GBljxtr4zhNfW4gxp2fPV1QAzQY4MvHYeS7jd1Rh9cWk5ddNF8LsYbD968rR5RtvFvsHvhO8gjDH5ymIf2/94Zj8SnAiGI/WxFx
Leb+dQ8EhZ7V8CC/uxIMD28vC53q2iWpU+hM0LqmDb20g0aGPrpITHPqH0SRBDFNJgzKMv2A8z9hHWtKWH5fHDijSWzux3ETW79E2+aWlDGl1gEaBBelG9Im3TNQ/BRVGLKvnZR4ybRQeBAj/yuVAX7/3WrLf9JGqlKUeqmmeVfkmExyk/8bJHDePKGapQg4ILJYme9LVj2v6ItjxCmjjInjYTtKXtuQA9bwFu7/JNXVcqI/Ajrerc4vI4XEphX7JHXX1QXI78uTrAFU9r6LTVpfeX8zKMZz0uvO+WqxOmF95nl8LUlLrp9MK3mVhb39fn7WzBQ15YnjZcvbY/89p7wxu3DywqZfrXPgX6FHhVKfBSYPG//NL/2nz2N/5xJq+I8QUTLcBwNVH1KCdGJ66MnO3VFbQ8z7vCnLqCKDMZtaWMru0Q6zPjcTgw905/52l7w4Tukfe9Ybr8NqQpOHy3qSbueXyHbf85MZvIeXEtu/n3TtNpE3F+cSJPzv5o4xHk/OiU2QBO6p4yGjm5jzcbGIEeqIAxKJ5dVD8qoCKqK8SRYVRNwYl/GMPKnsQCYLGFN6gNgMUKbbCNPcIJnqAWABPTGG3rEWr4CG9HARcYL8PURc2MMsnkuPIuM1IqUSook6TfeoGF9zJiZYWX96k7F+sP0VSzEVhYN6ksQJTRV11FptP4YaiJl70iYKC0S3BvA3XaZTAtj4fqTAGn5Of+BLpWFVy4aj9OWTylmeFddZYpXEfdSaZZMOM6saBFCcIYzKYG4TLVbqrmXg7uHeDq9iKekTQ+1ihYVRTjKIkwb2khWNkgbaUAGs5PwFTqdlY7AdvOVXfrK4M+Q1ruWP3kyWqzgRqOdHF1O1IFgMcie1XcxLBbV7MajKv+tYn9x6OV1eYb/+ZebAI0LE56pD8G6HGFW4Nk1Xzm2bWazymSIEGYtHNXaY1o9yi/q/+r2BCsI2WRJsZ1jwyBxRISCz64GIc/AXzoIljJ0jRG0EqV9DyljUfdeFJ7BN3DZjdu4qWNqY80maWe09RnCk9Tw3SDMPQADTe3k4bu9mxbKVk5CKhDMkXcAIvZqUgT1rAHOYp0oLT1baQ9gorrnALCE2xoNtdWmvsPHzdPcNO7Tpup0hWXsDCuMt6q7nnaVjLPftiR6thP04t44IdKWHdHd/dv+3KRdBXwaycuDP8QHsQ2I2Fy9++yOzp9Fzo6npFK+rugRENvJVQxtCZv81dK4fjgfYAWfUVmPMC9HTt8nkUF+o3tP0k69g3j7ZOnbW9bCIL9bnzuGOl4WMZRf7fPeGeFs0BBuiXtMoL5LfUkGdCiCyysSfleGUeIZ7tYMZ9LsyNodAJ40HhbpxAD9PUR6qz7ZzJD5omrYzfQw9ZimG9rcoYNHqGFLmdfx9biFn3b3bhHKEOkoUm9lO985OVhPaRJvW+v1vtPe9Q4NeblNE3PetfD8DXOheeXwhEoUfL3ReXqPr+UR82vXs9LUJ+k25Z+2olb31rG0qrpfinzT/31v15ff9evAt1Kn3rtZuozy1jn8r4qVJc6/fs+BfoUeBEFnCsdL6ZYpOseLEidnf3cL/7d5lf+8a9mwou4Hs6nGhzmKrBw4GVCdIJ0FctrlU6UidNpjTAGa69m1BuEM6qXBw5kMTzkZR1wE9DATIo1ThmjS8QilRBYFHBh2vVIvvlhuiVP5kRuWiBimSyzdSCFcrkQqze5G9/1z+SaOKZjUmUSt+wBFEzQFVSEIYdZc18ApQNRgYJxcmUzdgk8k+nnTzHW5p0MrgzUsGpQXM+BBczdAMzq2XCzgfhhK0w9+yagCuU5zjmFStToiSpRGKTSqK6qq4ImwyioGZWZaKloLWVSJmD0plyp517DZY1pbTfrM4QLGCUrxs1qLGDH+gdUmjbhpJ+McPoB9BBglIO0SVcmSPq6Wh7PR+QzQjnKRohIWfgtQJiFiSkbdBUbA/ejMJ7MrLtUr8GAPkONZwsJRBgzMrHTKmVQNcoN7tyF+5B6az+gZyGZdm0EZOzMz/ZxBVk1PqUGgh0NqHdhxFXJkhHUvsEduGWo3fRuD7UybVRcnVf6sIZXIzcjNC0NjfUAJYhZwhbC+KrquKu4IEXQch/1pa+9824xLCYf+4ZlHoehExyoNlU8OM3HbmEIxk9piSAu71TZcuM47DQ0hFZCJK2l2Y1ri8lXNaQAK6QS7hWhhEK6zwE49mGelVboZlamVjoItgwfoEeDCm5llANmAAcCh3FA0zj9T6nJNExy3LvSFtJTV7N6/NoGpG1tI/mAPvZn1ctO6HsbSCz8XqWxO8a/dvdOc+PGDcDTAvEZcLAZ2dx41jzA/kI1qA3oqQrZDABGwK3RtwDykLw0qDYdQZzfmuOO4NDRQJUcT8cde52SCfunKn8y6wRNb5fJFlwKqIzvIocA2n4kLfOdko/AxrxsW0GAg6N9UFo6ziih9RAYK7Xwe7afqE7oYVj+51vRNW1U4ohjeQsA19Wx6oOAYwKGYUt5iOT40X6bxSatXbAxP99xZnzkt+p+tpmLBX7PphU1U65lDC1jkbWHPD1goVTCnbiVCZ4CIE74Dofoq0P0RyWk7myPOTy2FjxjQ84pJE3XAL2voXL3MfqFu3HrdjZSpTISJgeKdvXRvqivLWc9vLes3+64EKcN/O1imctVYWru3Xc1bN5Rpnq9qly1vKG5Idv6JI4RUqcSs31VfnTqaRrdOhkgc1hLz5/6TB9YFKL1//Yp0KfAq0qBlwKLn/27P9/8w3/02Uy+Wc2GkTwX2yuxcJJmWA2wUEzBwMyzqDvxvK7AFeI4/J4P6fCTmZR6hHPw5SwTo7dOtDVKeX4e1rjtcJ5kTbd9Yjoclitly0/jG8J0/eN7byxyWQ1tpxRelLi+SxkoqEbBxhFYyIgk3TZfw8hMy4yoppEJn4lfpj0MCxO5qlABEkzKMvmqfSRc4mFwS9gYbPtc3WdXEmFYvA7D8A6gDnUwPN5sI7HYAljA1jbbMEY7SC2O2cNiaHezGdvbbGZh6iYbmFJYh2NsLWT4Xf21HIIKPevoFtTD6gt4sroqY0EYmfgdAIArWdbXcipRkMlVd1z2TVUkGTKZdelgvPQN4sh4CRqsu4DJVXuvMnLulk33CKOoelH2sWilB6qf6FlIaYH1dgfvCZhZC6E6zQoSBQ20NzFWV5UnTCf1V0Lixnxel1rbBrKOhyGZwRjhWi7CCoYsb2UWzdNN6QQqpqE60wxAZRwdc1V3BFQBEhgYP4PR3drFixCr/tpVqM9Pk4YRvYl72ddQk1JyYF1tZ5lNddTXkDB8wKr81wEWeoaSAQ9NAUEynUotlKIIajSA1mBbt7yDECpG6NRrbBzVt/sPm/fvP0BtiD0gAAr2Ib1Q3UBC4g7bhpV51Z2re1RY1xFAwTzuZvcBW+7VoKTF1XrDCShkaiEIFNZmBSbYNgBQuWIvYBLkSRNBp+DCFfdILGCELbcG+lsw6gILPUy594h1O2Vn9x1sSQROUzCn80hy7t69iyteNtIjrT3acBdPZltcH2JILo22AbO68RXACA6eYBRvu
6gWpr3DJNIMDdUdTwQHu6gwjdIAAt5xpFMyZlHNoT56MhMIqo53jNTEOI4jR0hJbH/bSCZd4KFr4TwjLaUdqp/5W4mGwML+nB3b/U6JUw/7vOCi7gY+AegxH+meb4eAVV3KfEzTcUXpiFIybW8cdwQ3AjzflfGDuhC3Agu/Xb8dv4MACwtAAO+ltcDGccTDtFULdIArY2iFKSYP8OB5PEQJztx9G6p4njLexEEEiwaGcQ/xU78rpYjYwlzDOP8NVO7eAqTeEVjQ1wQWfNUpq3mb31WHzylRDkOEDp2rL3pxTUM6XTpqHOvl4e9ubjVG0u/E7YapcU3/Qvg2v245i/poJ6FaJvKvZfWbkaoelqemWctfq1Kf13iGP4/pr87R5vPTfWDRIUr/tk+BPgVeRQq8FFj8zM//XPMPPvsPwxAeM8G5+i2YKOpQAgiARc4CIphdGVsNx29G1YvAopDHgdWB12kvkyU/wuP70GGXqzsvl3B1kiSs0X146XDwLuyRr0t8nwUA8CaDO4+TH89lPH2a5/xxPM+96Zh+HpBijcON5Sx7S+jNs9Q1ExzPBRVRtwhT2QIIGJVsFEdmTvyeMpxFkgFj4z3vYldheeRSeTagpALmOrtTM7nH441qCmxktTcEqBiAweM8Iq89aL0Ns+Q+DgM7m83IzkYzjdvZyTM2BlPxQcYKZk+phXWxnAKJMGHWDQZDRkpmXsbU+sh8ueJfReTaMqgWojRAcKEqWxgjVpPdBdq0ZYLSByQiGVlPaUgWYbBlggv4gMGjngVYKQURgAyykrsfJlim03LURlZqo7nqIUBmCzUcwx1xunN1VWeR8Ts+0g6APSZgsK8tL4VB39WwXYkN9RFcWTTr56qxTLXtMYfKjy5vPTVYXgDYLLADsUbhMtr2nxUkBI+QFDx+utqsIC2RIVQaYH2tnyvct/EG9fE3Xmtu4Qo1/Y941lFgsYKU5X1AwTuoQrlBnd+LRs9KUpRSTEwCGgmr9y/bfG5mtpkF3EwKKnluJtoYaIvw/gcPsru3NA8QQmpwjd21Z1FREbie0nkD8KivNggyM3pX2qbMSk6UtMQTlOWnzUoble9LtR0NwZdgIKeIMymYgA6CPdtFZn8XQCWYVH3IvTSUJqgKtX9QNivc1jDdvoBEQonLHBIcN/lTUuGZvS4I//wZqmTQZQOjdgGE6mKOF6pS2Xdktm1vPYq5G7R7dNhPfXcCw39Ie7nD+NgoNks8V22uAFnBPMwyNLaNBAd6sSPpfL8Cj3wD9Dv7gqBCYKmEVQ9kbqRoWD1NKVXgK0884xjeqypRMvT5bml/8xeA2q8FbEqDBI8BbbzPN04jKg2K/RFpeFT1J4Ga6naOpwO0n+NMCVDAhd+hfTz9Le94z/8KLM6lJba9Y649ncOxhX+WO6fPiH/Kc4HDEXkdch4BME6UXCAli3coynfK2EInbQbpXxMCCwDzm5wfpz11PTtLXcaor6pQbWlNPf0tN+0f873AUJO/zzzq1fsapi35hTR9f+EgQ5NIfG9Is1uGy2nUtLtpdPPu3tewPus+r/e+r2Eu51nDpHBtZnlGnHqc39Un1OP8tr07a/7mZ/7Gh55+tx7Ucd70a926efnMekSyxnfVV4XqUud78/4r77FpLj3rz73pwtz37+G384U/2WnuLo00b1zHprF/fE9R4KXA4n/+2Z9p/s9f+QdMWKyIMdE6eRUVA8T0cJkFWBRGu4AKAjoQOThTzVyJk38+4HCsdTpwinWVLoOXo64Dd6YJB3B/+8jJtlyzLudzjjYp7hj48sRb0ywBkndyPQ+r1nXyJEgvjmnxo04M3ec1rGlajqz2EdM6Sw+PTPBMssXVagEPUVOgXtatMCP1eQEW2lJEfcrJ34pyKrGAa4lthcaUowAJ1Xu8HxhFT3tkvHkOoNgaHG120I+WEdXL0x6M8x7AYnBvq5ngnDzcbcZQhxpGcnGKxEKjZ6ULZpNN6GDOVDWRSYs6lnnaBvyLsSuMjFep+v+xdyZgfl9lvT+TZJLMmpnJLEkm26QrLVootEKhtaBwFbBSyy3qBVzQ595HXO/jch99XO4VvSqCyxW9+igKLXCLLJatUmQRWnYpu9AmbfZtMpPZZzKZ5X4+7/md//wznYQGXGqbX/Kf33bW95zfOe/3vMvJ6h7ZmNVVdJlt1UhcsVffX2Ah42YdCrCSoTY9V2FltCR0BhIyyVCUevq+qPkoaVAtx7LIpBlWMBoSBphjPUtNq57FT5K7z4ar1BoIz1J3N5zL+ahO5YZw2AfAEGmwLKAQjKi2lIFw9mJlPzW/Fpj7UVbhXRlXSrIZUOKvHWZKxlHanUAFS1BxBDuJw5wn8KQkw2oZpZFMbTcrurvYMG8L+1voGcsOJfOp1ONEDVjsDRsFgZlSnC4YeO0/WlD9UbnGNWlVoNazeqyNRidlcKVeQKYU4oE9D6a9+w/EztW2gXRS7UqmT6mEq+2mIcMqaJD+Mrgy7KpBaeytobI0kbbR33ln2/lrhRbu1N3T3RmegAQ3GqwHqIg2ycbjsckfjLBpzigNoB/OUacZyjiGSpODifS1O/eyB8eW/i1p2/btSFXYNR56TSj9AayNjI6E5GkESZCSBft/IwytAJFioaJDX4Fhb5DZt0+S7iJ9QfsOQYteoVytlw4y70qnAsBTF2kcHuhkyvlJQ9vXvmbfESRl4JFdwEoLv1nBS7bp0ZsX3w/1kJYkF2WKMAI+8opvhnQyyODbIZT7bQTQFiRUZbJPy/wLPgTpebzI5RagKX1TxUx7lAARxPWjgXwR1rIJKgKwSZj4n8fEkFiQtmDcNoyx1sJyeM+fKHg92KDgGViQA3g8zaJSOVcBjDne5U30oDnAokE1wwpYDGDLc/HGDWkH6lAFWKwmnuWMQsW5yre6PtepjLeGibJyrooe0XK6cUmA6lxO0kBarHRY53IsC1PLp3puGtbAuWn5UUvf5Ja9LunUx6mFL3laRiPWl4cIIVWLiCTM//qkozyEf/EFiUVQaGh8Lv3T7ul6Mj/sWrfWN35L68OePxYevP1jI4wnAM0bOs+rOn/2nhPM+YvpZ1/Yc17xvpnA32hZv5k85UV/5Q1H0nVPaEk3fRsu4C8cjyoKnBNY/NGf/km64+1vDabFCSp0l52wnezil0FFjKf+qX4OmPlSRsch1nOut2Ot47WDuoN0/JiI46n38Zy7eMaZZ14KLIwbaZuUNxyh0sSld8Yth5N9LSwPA1jwWsCQp25D5jClcFURI5UAFqYYCcs8sHpMeDt0CRfAgQk5mBqYnAAYMC8ywDIfMj6uJpf3wZBYGRLNtiT52s3w1FlfJRMC45ntBnD7iArUwpomNsVbn4ZSIxKLNWmaFUZVmmyPU67IT7P7NvYVbacnY/ftNRhvLyK5mGVl3xWmWZgwFSxkxNyrQmbYXaFlsqRttCsMjYBCRsqpVnWRUOUgrPsV+HNFXfWlCRh1bQ9UbTK8dSsMlMyOxFGqER54YMIlX2yYRx2V+phneN4JKQhp0p9cgY+VX8LItLsBY17NnQkmz1VkW07mryPs
HwAXgJoJwI2qMfaxUNlitV2JwFqAkDTWy5R7fcRu3qqgwADbb20Hd1cegeEeY/VbQLANA+OdMMI9qDbZV2WiTwIM3H9CYLHv8DHqPRl2CqZhvWRo2zEe39rPLtl9PaFKFeDHcsKou9eFO2bfv/uhACX2O2npRm+9MPGCIVe59UwkmFlkIlGCtBEPU12sEMsQj0Hrr96/Oz24bz/pjQawt/wy/QKBbhg+JSDSPdRhSMt2V7qiypieqLTLyLYltLPMa3Tg+BP5CiK6utoxPt+I6gugByCgQbnfoO0jE6yUSEZ4ApoNnUSdiY3jFimrEhe9TI0ifZDedm+Z+P7tW9mhfHsaGNiJuhIb2AFIThw7GntXuFGgtisCQw39BYKxsSJpaRTeIJOPfY9qTTOAmFklGDDuSjKDOSeT3HezOpE2DoJlgUmw5jSgfcJ2su8VI/tQBaRfCLAc+KRXfOekJ00FytIpjLupi+2iBMW84lumXAKZvJDA4gF5wl+HlESa+33YL/x2BJCqu9kvjSvNA3Bw7bfo92H7qAY2C3D2m5HOfo9C/disknw9fFYbb30f5WFcIR2/G8tXxjRj+N762MLGDabVhuHZgvUMMAGwANXGD6LNEWcBdagFv3MWNVYBLNYiQRO8DtDPLsXzWAALwF6oQpGGtIsKk4dliFvLwv25jvr3EauKG7GquGeklzts1W85rZB+CW++9enUlyPoVMVdKY0StryrD1/eea7Py/sSnoulMlqKql6GiXecKirlR/kmXxPXdC8AiyBHeuDQqfSWe9hfqDpOgYRnGR9b1rlolwknsPiFW3pLkEft+cNfnEgdzavTky565FKE17zjOGrMi+mXXtR3XvX69wAW32hZz6tiywJfABbLCPIouz0nsHjNn/xxetNb31KboAQWsfoboCKrPMU47aBIxVSziYP7eM6Ng27+5xsYggiQ/ziQxtgbZ98xpXrNwFHOjtTGKcDCa7PxaZwjgTxgO9EtvSdMXWZ56q3SrZ5HuUrZI6bxcyUsTS06efhUEaMTvNeWTwY8SyUyiHCVXyZiDUxR2FvIoMe97xkQYXz4ExO8E3+ACxkAGSuYBH3KNwYjjwoIuv5pbXOoQJ3EaPvIApvBQR9VF5pd0aXcp5FMNAAq2uenU5eeoXA9yxJ9Oo3dhTsfawvgimfWRYfZgWlTWqAdhMyHOtxlpdR2CrULwILAQ9UNw8r061pUJteKm6ZeoNT193AFV+bJenpIH9OUYZaplakXLKguEnQjXDBcpK1qie+luum4qitblRkp1Jxg7EwnvOm4isxAqySgDTUjXYPOnYaZ4z0ZBuiIBoOWjTBG2kqYpi5op2FgJzEY1n7EtLQTWYVu/jiMnQy4q8pb8Vi0c2t/2ratP+xQ9MQ0chKj8RHc0QIsHth7EFuLiVCTsY72DdteWnWjkiQD5k+7BIGTrlE1QtbzkcAiPDVZd0DA1i2bY0M9N7abROI0CXNJYrHqLS3aWtsJ0xduaCdhwr96/560G3UobU3U43ditT3Mt693I7t990S/k5Z6jVJCMUadx5FSqGoTDKuAmDa2LZRsRGPSB+yH7nQtkFGda2tfb+z23QmwMD2N+QWTqg1p7O5+EO7/MAkzPEv/ts/qKGAcYCHDLthyn4qBXQNp+/ZtacvmTXCtSFBIZwpJ14TtQJtM4RY5QA9ub6foUy3Yl+jitBH7DV0EW289OZ2iDnMw3RQc0CjQ5UfbK6mSGbcvChiUKDBwxKJH2DoICGwjaC4gjf7M2bamytF2YfvEdyddBBQBNsjH/jcGoLHNBBfSKH7mwfsamOa7cKxSimt8ohFO4KBqm7Yz2ibhSQ06ewQI4Ow3qVpe2KjRFtoRKZkSAGof4mgjeFlH2fM4QR60ewAfzjE2RdtlaWGUjbpnQMG3RDkDzJCXE7D1YbCJtGrqUICJWX4IS0J6oXrUAuVeABAt4PoYkV4Ai176xUB3R7oEYLEdUN9Ge681LdL251Hq55joIT3rz3Gzwh9Dl7Dxuopfgpb0yv0ZYZfFjfmiLr5hoxScSzqeSxp+w+Uo770v78u7cl4pjGGXZglyK3kaibwyNbjOBclJRRny5VIAghuGd7fedGGDvIo6Z5zuvm88ffDz4+nnb+5N3Ruys4QzAjyKb377jmOpv7sx/dB3dD3iUsYQTWjZhfM5/j2AxTda1vOp1/KwF4DFcoo8uu7PCSxe9cd/mG5/yx2Mi05O1STlxMtPkBFSAZ47KJZJpjZZOFByxADPgFk/iOaBNsbRPDA7CNf/mMjzPQkQNSYx3uc8cllM3hXwKhvH5MjCMDGk+8CjnKr3MvP5sFY5tkHyr6RXTRj1EwVhnIoqWBH5FWDhKma2m4ARYFIvUgrVO8okL7CQCWF251z9KEuAC9/J7Mv4uOLOvhWN61vSXCMb4qECdXQBA14YgAlUFygSm1TBWFKSNYvosgMoOhb4zU2ltUgrGpBUzMG4zbAirGoHKeOpKOvsywAEwCF/q+4qqwxLMJu2p6ofNLSqNU14ZVLXvlHmCEZdKYqH+vuCC5khV3YFJzJPSkKyjUQECyZNxlEPODKctqc00lhZYBESE4K6gZ35S1BXl8OWBSbHtpGuMu+WUSbvFKvkMtQCHvOTtgFXYOxCRcad4EmzgZXXFo1PYe7tm9plTMEcT/IbxRZAHf5ZV6dJ1/yaACG9qHts6e1N27f1k25mlkdR2TmKh6P9R46mPQcOp5PUR0PlebmxqnwyqqoRaUjdv6k39lEQuCkZGIGRPorqz979h0N3X1uODsINoDq1FVWhVurhbtHF+5Qr2AL3pnVZCtIJiHK/hz3sg/HQ/kPhGUpaCCy0nxDI9GHj0Q8gcIXc71KJghv/aRTtzuLj1Fkm1qO0c6hMcW+flykVCFmHTYAUd7feArjowhj+FKBLI2tVwgQ/AVAoX6hWTQM6+CCUWGjPMk49ZZj1htXT25d2XbIrXMwq0VlEsuaeGK7UT+kRijSHAWwj9g8BCq24HnBjWnOquCGlmeBdOBJQMkK/tJyCA3fmNh37ntIE1Z60HfLDUE1MZj3GJ+lk/fjGAlzAECtZKz/BcAsgxvRsKwGnaQqKTUdvYfk+M6B0ExOLOiodkd4CB/ubErY5QK/ftaDCn9IEyyyVMwOaQr3N/O3T5hMfMyH8llQxFAxOCnzpX6rNWE8BhukIDsKmiHgBFLj3iG+a97HAQXjjRP58G4bTJsw+FQMW5ReWKP9TagG+rn4+453AQokFRvfox6W1bGi4SWABqLgE9b1+JGz1wCLGTAhTzlGgZX/OyqhX4WJ+qItTz8BHmcu7qr7l1nN92rV4VXlq6RKvvPNc4tTTsLxfnmZ9Xl5bT4+gfK08S0+9yq0SwfKfWrjq3QqBSgpmcOsFG4s64i1dngtYSOLjI3NpcPQ0dnJr0qau7ECkxHbl//DQ6bStGxvFmfn00NHZ1NdJuM5G5oCzvyvxRybm0+FhXICvXZU2k/Z6zsuPCEMetv+OPhxfrGehCJWkobG59Fd3D6XejjXpe67NKjvm7dw2OKor64bU0bo67Tn
s+LOYLt2GZzqenSCe9eqpA1GmfYQ8TqAmtg5pjeXf0JIX9Ep5HimwmDq1kA4MsjBDnp3kv7XHBZCSigsOS3Tx6cETSPyJs41wLdSt/lhe1vq4zuP7j8MvsIpx0WbGzMacyejkfNpH/t1ta6ApUuq6vE3b8fTA4Ok0NsW+VE2rAGZqWiwFelwCi+kD6RN3vTd99HMPpkHWIpv6rkjXPftZ6Wnfuj1FN2EuPDU+nA7u+VL6whc/l46c6k5PeO5L0rN25NZaODWajn7u3en1d34+jazqTpc9/0fSjz79X0dl7pzA4nf+8A/SbXe8OSYoeT8ZEydtmWsH5jJmyrzZ5KXZ/QBqL72Ml3UrOPGePzyPOMFgc+2kENec6eVxbzACxWqc8TjM2+nefCKvqlf6YWRmAia2eubJPHLa3njHQUQ7fbzjtnqa04sAVfqlkrzRQLZKjMnePJzQOTOxW94AGkzoMh7xk8HgPjPzldEng4YMCFxUlMm4Mj6qQakGsgb7ijUAi9VNrWlq9fp0jH0r9s+tSocQBU/CSMFNsWKI4TBE37BqIW1exWAKsGgFWKxmFRguEvsKJAowKDJ65q9nHxmPmGyritqWrj7LFMl4+E66+lovQBomt2A4qzGzuzfLFEscwYpG4TJirrC64m/53QOglbABoKCxjFO4ciXMDOWh1pURq/sz5I3O7Etl47VgfiBL9DPyceCQCbMs0liVmRmZTFrYuuj1KAy+YaSoaNhSuKK7ALgwDTeca4NBMozARgmOwGKYPSjKfhgkFMzhBlzdategy9N+pAmqI8nAj4+NBqjYizrT3sNHYKgBbDDUbsgm47eKtnQVeiOSA1WhBrZuSc2AMD6SYIqHATG6fz2AGpUGz+6H0QP4uHhgRzDwGkqPEyaXSa9X7gTtpnwYhRNuAzrtcwCzQ0eOh2vWY9h5+MHqZnQ9DHy3wAI1lT5AkTSVoZSOpncMMHD4+GC46FXSEG1PmbMdBgSyRWhs+5800lC7r6czbe/fDEDqS90YXqtmpj3EYYDVFODMNjW85RzD3mQSPRq/w1n6wDj2E/aDdtSotmzLalAdSCFW2wcJIxPfxb02QSOAiuODbBiItGKa8s7Tlm7gqHH4OLYa0xibnwLIhBSAFi/gW2lFqDyRnvYMgkLzFIZattP0jxgbyNP62pfpOvGdGi8kF/Qpga0eqNphlO1j9kMBo31R8CSg0J4mSyGkFV2M9EzQs3QWmOg5y3ulJ9qJKN1TghUqgcQxXSUeC3LwHHq2sp1CPQo68j8O6x22S4DCkXGlORlclLHD/u5hXktqqLmOvvG9353lEtz7ky7SQpBkObiMcVibGNWewiNUHbiInbmLKhTfziJqUOtoy03YEA2gbncxAMMduANYWHby9SeNC2NezjyqHUG3cheklI7GyscZ73l0RholnIXnMGz9+/q4Ps+hbO8lAFEfJ8KQhuF8XuKXuaLk4flshyUv8XKYpbpYglKGeFeVmxhLz60TgQxnzFLqSJf7W7/35oj6b/HHb6bQs5zr8y30sv/4cyHK30Y8vP1bH2cDFidh+t/8jyfTfphUGf4ZGNiNMKsvfRYbh8KweqhW9VfvH0r/+Zkd6V2fwq6LMP0Y/f7U9/Sc8502Dnd+cjR95oGpWtoy9LeQTr1xtKpOd983lpoaXQTjG6Vxf+amnvTQsdmk/cHy4xXP7w4GXRDQhooUwvN034Nwihw/jFTj8m3r03KAcPTk6XT7B08GqFAFzLLJgzzvmvb0DOwMyrE8Xnlef373p8fSxzF8dnxQ5VIgsKmjMf3Yd20MQGTYQrMfenZXuuufxtJxQJCHef+XGzvTZVvzQqPPludZ4r6MuO/9DAtTgCSPZtrnv353d3rgyKl0F8+d4z2uoL4vIaxjtcfnocXffWI0wIj5FRW4lxJmJ6DN4/EHLKbSl+78m/S+/e3piddck67sT2nPxz+Q7ju5MT35hu9M3/6EjWlq32fSvfd8NH12BGDbwJ5fI23pihe/Ir3gIinGnHZ8d/rYHXeniRtuSk9ZdX9679uH0g2/+oPpilXj6dO3YUv93O9PV/ew6JunGyN9w8c5gcVv/8Fr0t+8+c0MpAwsdEJ/MiZxrgbNqi/kkbIqRhk04zZGzxwqzxMlxlIonzuIyajHIJ8fxLN4TkJ2uhjouM5lyVkGr094U/Wv4fxppJnTLNdQK9IlYFX2XILcuV3Z9/Au/5x48hPPcRnvSJcMTLuACj/wfJ/tKWRiVMOJ1cO4zipSchIBKjibngyTTIGr+Kob6VNeYNGAtGJhHcbFuJc9srA67WXB+QiMj8w83B6SisXUwWDUu3Yx7eTXsXgqNZ6eCmnFAgP/PMxRXv3GAxLpu2pcVkqDBuSuTYXAIut2u+KqjQW0EojABKpupIGwEhQKXa18qjqVVZ3CDoK8ZMYEMHqOcgdpGSupZceaYTVaJklmzQHMFWf3ntCYN5jgarKSqXPV1v0lZBhnYRBVtVIaIMCRUZI1UwFJdSlBWTBx0KwREGDbyZydDhe7bnoG20T7yKC7+R4Rwh5lFKAQEgsYuCmYZiUkrtb3sDldJ4yUjG8vEoCwJ6Fs46yc7z14KD148GA6wA7RenYSsJ2ivHTCYLIbWfHuxS5jGwz5xQPboXVLeMwRlLhZ3cEjg2nvgSNphjIJ0PoAILvwIrUVmw7ppeQkAA97VQwDQiZJew3hugArMqAzlFXbCu01hvnN4851lV6MiNuJaoq/jUg2BEe6rLU+YwIaJAj7UcM6jItagVQwBzSw368NLX0ECfZdJTYbABY9PR1pS083kove8ORkG57Ee9MB6h/tTDzbYgiJyEnAxTjta3vpbWqKPJto2y4AST/1szw6ClgAhKlmFdIV6KS72THAyhBAZJIJTanHKfrUScrrbwQ6KH2S4ZK2fi+2uxJS86a42NfAKNOqIQEDGChFyz/sdQhvHw97IcLmMQCbGvqSILUFcBIulgEXAmfBhmE0ds9qX0gOqNukwIJ0fZclPNLNTO1X2lBU3tQsG+VZQzoCDfu1oNPvKexnALVKVkxDaY7esQRxkbdhKWMAKtrZdjqB/YoSNXeuJ+l4H21F1tWA5BWXFob3nE2jLFAIdC2D37EvaotB5G+ckFbwPbuPxUJ8165Oshkh14sCPL6ZOWy7+HAw3kY9DvCqV6gBbHC2AoyK8bZzjz8PRj+LGof08ijl89q3tfsS0Bdf76jSInItZLRH3X3tBRcRKuhx7kwMZ3lKmUyzPkZ5Xp92/bU0z5lxImIpXSluhM2ZVNEysJMS0Vi1xGi7qi7VKb345u+rvf3XvviPDiwk8WvfNci3sph+5DvZ04dlW5nav/v4SEgCfu6FvUHuwug2wqQ+9eLm9LTLWoJplUk917v3wIB/4quT6SWAFBlpmXufPYjE4xe+rzckDa6o/++3HEs3fmtr+k9Xt0dX/drBmQAuTdiDjE8tpNe+ZxAJSWN60XUd0QWUUNjfZMgPk6aA5NbrO/huGwIUtbJCv5xZ/8q+mbQX8PT0yxn3iT+N9OD1H2B+QYLxy7f2saCVv8
bl8VbqQ//vIycDGF22VXfwDelLeJK6/cPD6fontqbnP7U9ohS6+P5plzenG65sDenB6/9hGN6mIf0ith+OxR7L86yP+4wrWtL1/PYjfTAPbWQYbtIt0GLX5rUBPARuL3tWV7piRwYrH0Dlzfo8eRfeKOFylez83/eeiHr/JGDQ43EHLBYfTHf9n9vTA73PTc959lPT5T24e//SXelt9w6lliuvTy+4flfsZTaOlsT04ng69MWPp499bDwN/EABFlNpaP996T1/uSdd+as/kJ44/WB676vfnlpe8UvpaQ/9ZfrT3U9OL3/hk9kQlTkpKPzN/TknsHjlq18NsHhTDMCZmadHMEGdASzKoOxXTolikK0fYXlWG7YVbZRiV5e+c2CPyTPSIoI91uu49zanEJMKr4LJjIkh51cfn0XkiJ4lCjCixI34TJxcxpEHcSaWfBd/A1jwwKc+N4/alJEDxnNXSUvZYkWxYs4ykwZIQHUhSyuySoITveFkgnL5LWAGFuF5CSZAg23tAnQvu0qDbWwrZtawD0JDYzqM380DM3PpOMzODCvmoAFUP1LaiFhwEwPQTvj+zlUwrahELcJcz6MC5UqzIETwYP1dRQ+mrGK64HL470ZjOZwAQDUodcfVX2+HqVDvPlSJiCOzqycgjXczAEEnHYbYNBzOrJ+r3qpO5UNCCl5cvZF4eZXXMqhW1YFrU/uBK71hEIwKSLj4FMCSpp3SzdyM78AmIBGoCXBsF9ei9YzVDGBqgZF3N2fLomRCcKF0QtsLc3ZfENvMlVsNhke0QeDjcyVZVZYN1LMfZrq7Uw9LG+CnWP0hvMyiKj6Hjh4NcKHxtipAM9AiVqgpmwBTQ10lFv3YRFx20a6QHjTBxIa049CR9BAqVPsOHg3vR9oI9JLX9q2b46ynrWAAqdUcgHEIYDGKHccp6tIE86mbUlfux0bwwjWGahAMpy50/YQ0SA4mFWZvAwyrEhftIjoBRxqen0BqcYAN6PRmpWqU6lAy36r9yDi5x4dMtXR1k0TVrjb1YjROGt0Yj/cALiSghtZHoIF2FuZt9x+mPMOUZ0SgBY3sb6cBEG3QsLO3J/UBslxXWmP/gKnehPRlE897uzfWXAZP0ld1V5vd4Y6nA3v3p0HKO0pZbXOBcEuLbn+ZRChvSLAoM9mjLgWQ4Fn0LApkn7Efut+McT18ZhrWT/VEv0/7n7vMC5wFrHre8r0MlvXQuF8pnHT2LCDwsAuXBRXBqKCimTSiT0Ze9FPSVkoQwMYIlE4664EqPHRBd8vjJoKtALAA7fQTxym/Q/N2E0n3HNGhgPQxbq4b6VPPPI7l+lkuj3JnuxRwEeUAfIZNly+klXXgT+xoTqyAboALVaVmAXi6WVgApC+ysDEvsNATHH3QvrADqdgAfXwbErQO6w4NWNdYyjtKku9jjDOzr3MYbqUjYpb4JUy5XyHCuUDA8jxqparSq49bwn69Z0HIWjlyio5HDz94x/8CHHxfy78ucI5ZpUO5XnzzLXVv/3Uv/6MDi/sPnkqv+4eh9NIbu9KVO8vck9L7sceQOf2J5zHW4oq0MLqujL9smZ3D2d4p1XglthHXXtp8htehB1lt/4v3DaXvAkQIJo6gIvVH7xxM3/u0DcH0r9RiZ7OxkCE/MDRLOXvSVmww6o/lzHr9u3J9H16z7rjnZK2ePn8k8Ur8+vOr3no8dbWvTi9/bpZGFbo8E0DxAqQi5fh7pBdKaP4HwEKA5LE8zxK3HqgY7s/vOpH2Hp9Nr3j+Un0FZr8NMFse1vD1x1vvHUmfQ5Lxmy/dHF/b4w5YsHvZ5+54bXr/6GXp+u+4MT11BxKLD70t3fW1xnTpdc8F1PbFmJxpdiLd//H3p/cDPHfUgAVu5w99JX3oDfem9h99eboufSG9+TUfS5f83I1p/198OvX/2A+mZ2zCxrGaP+tp/41cnxtY/P7vp9e96Y0xIhaXhw685WeGDsh5cGRVqgCHMiFEgFysCLP0J4+y3BvbNAqwiOmJykW6PjePSCc/8+YMYMGYXAMRxAtgwTkmWeKvOCETx6E81j2rScY8/Hnrc9/7869lLIcrVHJXpp+BRWYmogzBWGRg4T4N/gQWMo8yocYLxpiz1666azOwTnUjGO41rBw3ACpO4QlqAjWo42yGd5gB7iDM7ElWMmdh8uDAWQkGWIDotzStTgOIUjeugRGFVdC2YlaGTWYPZkVmyKzWsRIZZeAm6iWwgJlS1UPJgDtuW36ZJVfRZbBlWvVgJIhUF16VnmE2qVNNRGZHYCEQ0X7DlWCZLdWT7FD2D8lkmkoYfL4OplBGzNVUVVFk3LW/cOM4XW6GNCaIDBMZjF4ul9cCC1eDZZhinwCbgOtW9MD1XuT+D40ACPvoLAz6hKgdWuitSOZQ9Q91y637GKvCelo6heG75ZaR3gEo2NTTk92i0q6uVFsP2137goOoAgkQXFHODKd1tBDWT9eyraEKdenFu8KzVAe0E1jsfmgfRtf7UYU6HnSzL3SjtrQZRls1IYGlQK4ZkOBvJOwiWM0HXDTA5GnQr+vYqVDBoi7Q7DTtJU3cFd0dssPNLioqG5EQuAt4F6orenAaQtJwEFCh1MRyT7kCT1xV2WL1nDjrYXIFNy0Aiy7Ankbg7aQZqmFdG6kekiCkEe56rvqS4MJVeDcKFFgMY8StLc+cxvAw9Rt6ulM79Wu3buS1FiK1IHXQgLsX+nYBPPye7FPaQij10r7kKLuT737gwXQMYDEBUy3gaWaPD6Vm2hU4iRhHMCThtfHRPkZ6yBjbz/zWBEhKCbmN8Ep8pLkqawQLpl8pStCbPuheHY47fivWTeZeSYVSNtvZfkB2Qa/CcNoPTVeAIIhwpBAAK1kJSWb0mpyX31z0P8pqHTKwNr7fSwY10tg2Fvj5DRRPUbpBdo8VpTd2XHWxM3DKCyR+X3EQP675k8dM3pPv0qIHI6h9lcOyzkJ3hpTYZXuxAhanKRuLvrFZ3qLurRmHBBaN9gX601badAA1wR24Nu5mjGilTVm0zPkSr5TFs2UotMrjQAkYRYg/hjnbURWVwtauImhJsz5ueXa2tHx+Rngf1KUbOVT3JVx9muVZjkZowuY4JpzHuKUkl9Wz5MO5vra19ONhfpP/kjZhLwALKfrwYyVVKBlcGd1nX9XGWL5E5UHsLT51/1R68fWd6cl4YiqM7nIAYi5ne6f9wWth/K8aaML+YInp51OOPJ96SXN60TM64rv+Y4DF0Ph8ejoqSdchDelsywx3qcW5gMXZXMMuZ9ZLWjLiruBPIOo9gLTini9PpB99zsZ0aT8rjBxni1filzPDQNiNjJOeZXjfZ8dCLesnUNPyOBtdPgld34F610++YAkcLM/zbHHf8tGR9PmHptNvvWxzKUacf/n1DFx7pAAAI1BJREFUh2mn5lBVq39hPU9CV2077ntwKn0NadQrARaOhY8/YLGYpo59Nt1950fS/cfYbJguOTNPu1/7jPSs6y5Pm1uW+mhKKwEL5s3JY2nPPe9Md312OJ2G91y37dp0dbo3fbXnh9OLb9yU2
hjUl76i+pY4/+tzAovffNWr0l+/8Y0x4DmgOvCVH3cxyNYPvoziUYL8LIbgPBDXlSvelcmwem51chzi819GwYm6gI0MLnwneCFnf5RFZoFTxHWlMlw0cnbC97eG8CUNsypDfI5r+XMdfEeSzhVxxPvqnY/inW+sn/+rPGSsi3GlQCNWRmFsNN6V0dEIWkNg62O8WHmt0tW9pky2qkYaRq9lMm/EtmJhLWLa1dlo++D8qnSYDciOyuywkjsP8wP3gQ7cYlJ9dDNGVJe0rkVysTq1rUatZEIGMm9kJgNZiizTJ1ud60U5eCeocOVd5kmGXGNomWxdmHaiGqQEIrw0QWyZrGEY1cETJ2JVVXUgGcwACa78wihZX+tpe8j8K9mw2gGcSFsVnVghh2YeMocCC1dpZeRCCkBc7STKBmeGCTUTlFBNR3rLYNruSoBkDLUz6IEJbm/LjLr10fOQak96FXLFPVZ+oUAjK7EhJUHP/yQen9ayqruROu/ahl0BRsvWW0ZQV6Dh4Yhya0A8CLjYC7A4iuejk6j8yNBGv6OClkMm05XdS7CduAR1KPe2ELzcv/vB9MCDe9OR40OhDiYI2aAtBwy4Ll5pElwA03Yw+b143pGpPIG6075DRwE+MKPQyS7nDtLmt0rbD6QGcwIEfs1sWObGgF3sitwLGOwCXLgaPk79BwGBAothAGnsOwGtpWcAP2aVZvJ3L49mQFA7fbAbOuhytw0pkJIgf2QdYESm+wQqSieRgqhKNhK/vLquy2MdD7hLcwcSCe1IFugbSiw2UK++nq7US/tI22ZsXpTK+f0ITk/hwWxocDDt27c/7dl7ADqfDAY7PD9Fv1JiEfA/CGH/CkkSdbf/OrlAlgC1YeND3QUjSgbse+W7M0yMFc6m/AQfStdUyzMFpRUTAFzBl96ZCnOvmqDfit+N35DA2O/C8tmXzUMJkP3dfhCLGYwDNqx5218tr43oeBYAGaBj/u5b4iEdBDbWx77rz+9BdSjBhd+ZlYzxDMDgOCfAMO8y9nqOcc68qzEqxiMC+dz4jpmUBIkFEwz1cR+LedLxe5KOYcgdNhYsbjRhSE/7r0Ya2IQkq5f+vJ3v7GL6WV8LEkfqrsF8jMuka/rl8LYc0s2xPTpSeci5lNtHEcYyVkckRbxy+N6jPk55J+1Wel7eLz8vpZrfLOW6FLLk55Pl78u7nE5VLspXSzfqUcWKh7U39Rnka4JZtZJHGZ8vAIslUtVfrQQs3vWp0XTvVybTACpNMpv1h03xvKfgkIKJ8myMruHP9k7Vozd8aDhsD9qa7elnHk+BES7uY2V83/fZ8fTZPUr0F9NTULe6+enYl/mRcvxLAAuZ/ztQYfpn1Kw2sJioAbWG1+73cb7A4ot7p9M7P8niGoOAalUaix+Did+MutbXAxaf2T2V3oob4GInYv0eKbD4W+IpdVgOLNyP4irUnm7FdsVDQ/w3YTdzHGP8Lozx16OdMYKx98QMUqTHLbCYSQ9++G/T3395Lm266NI00NuQ9n/x8+lgw/Z07fXXp2sGlqRKKwMLxpsFFrDGj6WDB7FjlLkY+Uq6+6tt6TlXLaRPfWYPzoE2pCu+43vTMy9hrq4zlI9GOc8/5wQW/+v3fm8JWFQD6BmDq98Nz2PyMGPuvfZ7ckgNAOBzjjLElonAqCWe12VC9GyYMlEuSRyqMJwysHAo5trBmfD1wCK7emUSZuI0Pv9z2Cq8E5ITam1KMAzv4qgKWt56W95F2QmbmQYYDcCBBtveB2PhxA/ToD63tgb+YlWTRPReo1/+Ai5WsVooYy4TFMCCyXwtE/ni+rY0GcBiXdo/3xDA4piMLqvW8zAaDTBBa+dnUycSij4kFQN8eL3rWTUHbc6hJlWkFQKLUk+JZJtEPSi/jLEMi7YPMj4yegKLTphUV9Rlkl3Jtr7SSZsHV8FH0Y+X0daFaZEwuCKszrr1JHjUPxgVmC2prpSheHKSLoa3MUIHnfw1LDbtbDzLqi31y25EM7NlHTSGNR1tUlzBluauXrfBGGpw3YsxYffGbtSSlFrkXZVHYeyH0eM/DhhSjUjJzBr0x2XwZBb1+NRAWm2swO7a0Z92YHBsvS2Xq/Qye1ZIgKENxFHAgTYLR3H5KuNnPtJH+xh3J9f168C2/rSL/Rs2VlIDN7YTWBw8Ohh1sn/FjtYw8W0wbOthstez+qtqjipZruILAvYDLARzAiJGA2ha7W8Cs6ekaZEyzc+eglHHNoI2036hm/Q0Bpe2QwCqE/yGaC/dz44rWaDMfuyqQ6k2tg6gF6CC+NpobIKOupptQwqkxyP7tOpk0sH2kZYahQ/jaUoJyCj9QOmC9NJFcgdgSXDhVzlJGHeyVjVLI/Q+AEfnBsAqacvwmv880ooZ3CIPHj/OzuL7w9hcht7vLcYYL7izTwXDzrflWemAAMmzjPv6kDxktaagrZI2JFH2d78FQYNtT3NF/3elXWlBEzSX0bcvKC1wkz+Bbrh9RTVvlJ99XEbfw29VughgsgpZ1pO2f/o9hYE5mfodFIbCemR1KsYH+rD5mU5+Dx1473cq2JWOpuW347cZZYEeGvMLXix8WSQxfoyL5OUz+2E5RyV9RhjBUP4ug22tgAXjELQ47eZ4BLYfS0uzmAMgLmjj1dzKPhb8GI/WQM9OAGsvXsN2IbHQ5WwfQDrUocwnqMMfCcxd7d5Htp9P6h+W8I/gXOYag8bYWxfnjHfV8+gydWHqL+vD+zyX9sx0l4fJdapLxXoQsTZvRIbxIAey09VXFprkIPyto0G5DJIZIy5yX7/1hY89VajPHlpIV/c/nDnPRHtkf1cCFv/4pYkwAv7v2FLodelsx9nAg+HP9u7gidPpT959bhWn5fmpPvUPn5tI93xlIn0XoKZs3vcvASzegd2IGwa+HOnEwKZswPzP+2fS6z84fF7AQm9Mv/u2YxhMw8hj16FxtIdSF8HZowFYvPrtx+PT+3GMyYvXq3djdC9dH7fAYuLL6Z2vuzsN73pGuuGZT0o7wRHjD3wkveuDB9Lqy56WnvusJ6SNZWBZUWJR11vxJnpq7IF095v+MS3ecE2av/PedPqZT0/bT/9z+vgD/emm778u7ejAXX9dlPO9/LrA4q9uv52RlAGyGgUdKp0u4vBUDa5OIN4y18Qk52N/envxiGD5MiaJCMt9bSqqwhE5nsVkWU3SZUKNxE2DsJahyjqYRSdavxEnUoGFK2peyyCZtKt1zJ9Rj2C4mcSjJmVyLGfTrwrryVBRNN8zUceETZpZWpGZnZwP1zAPMsHq7of7VZgI6Sbz7qqmwMLfHM+c9LXFUKrhnhXrYJLXwng1NG1Ik9hXuHfFAYDFUVarT6CqNAeDNA+jMw/zs3Z6Im2Ym0m9OOrsb2pI3ahFtatTbZ2COVEigTTCH4yRzIt1iHKSp55lQmLBe6kjs+8qrLYVqkG5uiwYsN0NW+ogk6nqi0ausZpaMV3huUnakImMk7Q2npnaLgIB6SXoCjechA3aRjlgMKmXwEJm2v0SPAs4ZLjcKVpJkP3CLmiHlbau8su4ysQL
uLF6NrqHi4YjkB87FTJhpwsQdD7oPYV0wKLK6wG/f15TZNGx2DsZgZgIVl68BCZkrAprRC43OBQl+hVXVEd6Wu1us15wpMmh1II9VpPPTopWeS8sYjE6uqto8MYAjmX/sHZXfVXm9IWXWmrAUuZAxhUGHW48KT1WRX+i+T17nZC+2Jc7PtHIzuJfOFsReMySB62Dc0xO4eh7rtRUAFzKoMqypboxVpItjeV1GDESQsKe1B7UV1H13vymDqqtSdsmVmBRX2jKiswRQKqszbLm66R9ivxI3ZnvvA/e2+e+9rJ7DHmIR29kHDKjWQEbVfu1pcUgulFNoOFPD0Ph2DvwEW1suO1g9u/W0f8jCtTwksyKuDC9WxClhcRqVsISpBiwC265RpA1rrFjXG+ZTL9hE0XkbV6/yF2r9iDmN2d7Vew6jYPC1b2nRUDp5DC8e37edR4AHQSBjPnSz3O34KQNBO0FNpY0keudoXqL9X2+wQmx8KKuxXR44c5Zyhb02P0rBtPFQvWwXs2ZbdGFqbiqh/CYBpW/ts7EYAARlj0CY0tK/R1sZVOmC732DwWIOMU+rqP2muQwOBRXoBdZW25Q5Wt8PMFfRzXR7rncyNG5WWOW/YUjL5gooCsUhQ4mUKyYXAgjwDyKwMeZiv+TtH1pxZ82VmJccR9d5gLljXdSzAwh22s4jAO0tKC2Q+qlqQkB1UY27adp12Vl1KiYXSCu0rng+gEFjcjUcoba+Upqd3UffeF7nZcuQ95cwcnTc84TcPhijGHdLJXf0pYGHYsYeGq+CU3PjDD5+PBdzMyxdDAj3s8NunW8Lxe8tBgBRxPP8hu08PLEgpdbSKlrNSTlLJuH6P558+NhRg/Hni97r1smxWG73nbWAxkG37sk2BbQo8QylwW2AxbrztyqyrdQEMTrLDrBrmgeel+sTH1zB5X7NmPpzM6F5djXOS7fPoaF4lfAcWYe4ND0ET1sBmR5g9fMBlSPRoFNsKGORby9NnfT8WARaEqY98EkkzxUhbZmRgLLJySNl63n7UTF8GSHUb7wvEyCzUx18PUhpZ3pSpAjBZ1hh7E0dgcQimyBX/skUoYGGYAlYyjyUJ8ZpGgOFxtVggpPpO9oyAudqhxAIJwHWMiFfWd2FbAaiACVtCKfwSahOPwzCdY0V5Fgb8Goy+yEKmfS9xVcuahKmbZOMvpRW7lufbniuX2hSrmTOUMZvmyTjD3Nq+0sXdxC2nqkOqQ8UAnXslLjJv2esBVaDzSBDm8RgkuFAF6QigojYELDWjnTA0BSyKYbKeGge7t4SblrlS76q9zyLNgQmyj2jDIAMfhlH6cq+UQvsO7SB0fzqL+9N5GF6ZNHcmtlwyKDK99j/bezdSGRnXSJyGvus7+5L109ZHmxYZfvdKkMkLc2x3g77dEN1+Z3eWaUv7QSOvxTADdKHdQRjK46yo33/3mfbgcx5sZ87cjRrb4aIpZZOJdyVdBnwXoKoDCwFdVHloL/uq6dp/bYtxxsQy5+B1ja2qg/TKSTzjeloHmdao3KhWg8TkKvs2LGNjIbBYQiVIYHgNxltgEQ9GqrzRDwQWuv1113ElReehs8bs7uFgPhKv99st5SBPGfSUH2JJ84wdxo3zgupq5lPADrDLfRn+1yZt0jDjK2BDF8Ll1cs+qAvfA0ju3EHbwzIKwszLPiJoi7pXAAQSJ34LLLQVsU07U28/MY7l95BGMa6HRqWGBK2kZ59vCEMrZDzutT/Rt+TTlTaYlsDC1X3bMCpQ2ICU3QfjDsbdcWx+LhoUINcZgFIUAU4Biz7/2Lyh6zDPWD7jCvYFJpmXBAiqPTEfrO450G4CKm76jL4SycQQx76i8fsOykyinLiepX9lN27ir0P7w0jz7sKm4kXHDrcH2b/iLkCG+4tJGedNT8lgWvUnv+yaGQvOgcMTA+Sh5c3DAVz253WtMFQqQXrf9trj5U1PlHyTdf+dRIZnw/1TXZLWU70YnjmGUwF/m7Zl5jCvp4zr2E+I+mOYxBji5elYmFvT6PU0XOL2eMTpR7/rcV/3bd/eX/2lXx2zvYz9Op5pb58+z/h98jyKBHH72KbANgW2KfCpKHBbYPH9P/z69lNvfUs+sE4ymQ+98oWt335si9mQaZOxq9NP8gAWhg9jGGqBRf45mdeHuk/0ARaUMlcm4Ij1nciHb4H57JFJ4UOuKlSXJIw+qk75zvrM1CkTt13NQiZnJEEhHfPISiVMRX3cBQd+wGUaJZUgQbAEkOFDLIMaIBOmREkFH33CF7CAIeVextZVc4GP9gtTMOV6EVJioeGwh3n5AZF2xVCXGpBSAhmVGzBK0tFVclf+d8OA7+KcgJlY37m3XdOAG+PtRXi8ebxczFP++Rur7RKT/QLM3xLMlB8BmZ0Z84dh0yCTXfHaBDtxbyxdbDsW59r+m1fbFHQ6zMrqFPkoXbF+qZcMIv9k/A5puAxDl704SFP6yIwtsaI9e2G2zc7Ohdmv8Kx8U1ddawrGshotw5h+URIg1YDcE0OjdI2Ws7s4Hnx6e1h2P7S6gNUYOf/I0+cyiK48q/+/wE7IC+4zgFH3AkzyEsbJMm8awPaPZfqnnGCO6qc+87AN8rEEnHX7APunbW+eUbOh3DL7ieKf3ueTQq1SW8eoXkG/abwVncbW4qEHHojr2anpmQAn09fO4gZlt2y1q7rASgCG5E3JEnRS2jJ+jMqa8gz1SN8kFD87o1JA1XpQJ2gVYAEzW8ACux0AjTtNB1hgwGzbacSsatQa/c18u6tZx8hlANwcthVP0rYabS8izbkKI2z6EqOPdxl8j01asrpOWWWQHZsBtYDXSU69du2HmT0wWZvi2bdUldNVsRIKvaBZXwHDLlR7BJ72YecI87Tf7AdcOGYqX8Yf40WQezmG2UixUHu6phpUmP4CpgJ9KWfZpYf0r5P+BLAQJFwlfIAF48h+6Mq6ZRlJYC0Xaej9S/oGtBFX0JJ6Un6lK4fZvXqKU6mFZR/1WcalwEz7oBh7IxXTA5Rj3jDSz26a+pOXTWw/FMYJ9LPA4nPoooRiDbXIm8wFNwAMWFHV+3QIC0lZkVLGbbFzDvOl0rc12lhgMeF45PkM4/4MUooXYbR9DwDjGG1jHVPPlIAfHJZF+o0OC3brQZ6jQ/r1MHlsxSqdUZjhZtS/k+SQbuJWm1XDGbdKkeR62rcmxm/T63n3663BkksVKa9GyZk4P4ZSDHU2YHLNuzRKgvVQm6n3uvhkPO9P9TzhKrB/txzbwGILObZ/bFNgmwLPQArcFlj8wBt+qD3802+lWsMEawW5DXjwI+bKKl9FfzuJepWR7AyZE7dMt67YipGrD2mSSfgCI/52QpallX+JByl+a+htGs7vMglKLLoqlIxwPiYUqJgrPsEE9lnK4JUPqzYYBSyGvFzhM21AjpXv7h4tp0xgmIshHW00ZB6N74qqTJOfFcMILOJNxjic5iszYvlUgTqMSsS0wALJheVw4zJXL/18aWhauv7aGsAcuMrM6qTAQxqW2g6SBBl+mK+dnBMwFTewsVhG/2oetZQL7Do6h
9eiReIscV6GWVkAYKgb7irxCcpwHAZuGib/mhu6Ybx7Y3mhTVyabXuxuTiAcfc0DN+0khXyUY1LGtxkrwvrqO3FFKvFM9NHAgZkgmWstC3QIFYD4POz2D3MzUedSJpaT9WONKbOztgwj7u09SBty6R+vMCiPCEBLPitZENmWEZL+ktnGbOs4EJP49rP6BlRfRF86QpURnIRgKFB7kUY5qhmAaxcDbZ9PKvvVR8KUPF52osacp+K+scKD0fd0uHyv/q1bWs76TUp5eK37SeoEEypwqNbUXeAfuDee9u9d9/djrKnxV6emdCNbMiG2hjtrzF7eakCWBBPJnov7WCbJx9iSEvvPewvA6uV3/2Pz9PvqY9V6fWScXZjPIFYqW5dj/G2XqGWkTS5CaCet64Axtw7RGBRXqFgLslTyZDA4glc/c5htK1NggbKvUyWSxqkfNxnzISmYYHTfkomDgIiBNh66fLUY9cUp6v5UZcDLMdFLMyubVy1ZLw6NgASgjvHp5IewcQegTY050WeK5UQUOpQwDLH7kWGn76UzRghipIGx6SUDKDgfcY840UQqvqTwOKaoJ6z160vSEgPV++VbOm6VlChdEy6e1hP7SqmDh6MxG4KGyOlLR6q6i1DuyUMvJWsxf6J9neM97aSph4uSFR/r7nM+dKFFW0u3AwvtlwAAxcXBBU3J3axrw07txPXsKZjShP2Lei5S7DGlc5GXOLbb12M4dl+9n25A+nKaYDF848cbieQNh6iHtKon31AJM085aU5DPn4q3qndxyUMZ2Q216nCmC7+t4/Y0clPDzwZUIljVFfHyU5BCbvT0qGZ+btc9uu592vYznmdpTtkJDRcySv/sMnpFmJDkWDvmlzwowHq9ijMvgzr3vCQ9nyvD/rcQz7FM+2gcUYgbZvtymwTYFnJAVuCyx+8Ed+uL15ABaZMIcJuTNXMvA1AVfdC1S40i9DWOtf+UDC7Kij7ce9T6YCEu0JelphBmEe64NaH9UCFuYMSCCjDiw0OO3AIkx+mAk+wkzUhrMcWXEMkwpzgVpMBwalclKAR2ZeZt/Vy87Y5qPvhM9/P1ZluC5TKYiyTvXxMFzACGGtU8LyXrUPPUK5QZ52BzKc1rFLJayNjKiMqbriMqseqT9pSlDLL6MpsNgNA+ap3QSmwqhCsUketJwHVMwDLi5T9wVAywIMy6yruKShW97TMHUnABZHyeuy3o8wYr2Kd5wJJBZ72DjvAB6jjuzf06YJN4XhpvrqMu3u9mtdtLE4MlXuPffjmWon5bX93PBNL0NLuiNVLQmVmYswd65qW0cBisz2FOoVqlLp81/GUZsYGe14oIJptO5KLGSsA0RpY+kYYEFeunY1jIym5SEJDmkt4wgAot3Up1eCIcC4BLhwTwIlF7aLbSrzGJUn7t0zIG0NI6khMgklP9tjC7MwMBk+N1/7nu1nuXICeiyTdgnaoBwCPEZ9hETc2+OeU6cCLE6ePI1R9IEACb1PqQ4lIx8G2n7PCrRSi0ldvUIDJT3mEzZp6E8p2qe6JyXruU7by+hKO+nigC7mucCFbofdYfoqTPgKoOIKm8JJs2V+q0rUpS7ZxA0iq9p2EUmFwOICdjsabmt/0vt4tYWMF7Th7Plu0PY+UwVN5vUwO6Iftg/kZDUfoO0+I9oPBVjSp9ynRXU4x5XpuGau1MIVd9sk9imocgn+006kbx+7yjPb+goASQNt1TNk+jd9yQNE6G+eto1G9tIG7jD5lCcmwTwSOCUWnDeQWAQYJ559zjIRhnHVXdMaTmBgr7AvaLAtaFIyOYNKkS5x9Tom/ZeR9FxCjczzCipQ7t6d9ko9e9+ixtTJuaxLRq2n5bPvXrPf8lubKnfUXsdL2RrAgpKyyEC4Xjev9lOlYAAwpZwQlpP6M/5cyBDsH6Js2lacYDO8U56U1922NdruR9q1lzEPra1H9bG6T9NvHTfDi2rHql+3exsPmHb0z5Cs9R89465gQiWW57nlrv8YypbHSYM/9T9hej+tFIZ0xuOMvTBskiX+eJDq20MdhhcC6M1jrJS+pw5PfVShDd3pYrjx+1G8oSzf/drvGD36y75xPk9dyahfx/P0mWV17Hhuq0KNU2f7fpsC2xT4VBS4LbB4/Y+8oT38M2/NZF/Gd5nDwwD5QZBh7IdTdBgwPlIaDfvh9bcLPfmQw8yZWT7wROthBQDeZwJjEou0grTillZGIPO5zLbAgtXM4cMvePGQITf9koqMAQveFyNYwMKPd+U5SDqGeNcDLGo1Uua6JlEmU977WaAEmXRleEpfvz4qqj6NvnbQwkk4gIB8XKnVvuIwK5iu0vtOhkhmmB/50Gf/CJlm6p6T53p2kaaqgwRYuOIPsyBTn03sXN2H4bkGX7zEeRmAsQhjOQdtn8Q96uMAiyUILlN0ypVJmIkZmLQr5Lsco9altgOpxSQSiynUo6b2UFYYwUmwjbtPu8otPayXq8qqQR3DePYgaiiqQ0mTmzLJgxtP93KYZ88JbR4uw+QJLuwSGodqY6J9SYFMpTAyT9SLUzDlqrQM5gGYc1WBdli3fMCK7jLvMpihgwAj/aTo7Ip0VpBh1GUoBQwyf2EAudbK8rBHAO9lCPXGo0pK3KbSFmFoaEP7jeHzsad+NEHayDYzjNfc0/6+tNnXGAyCBj13HWFvBXuJtiHaLJy843i75/TpdveZe6K+IzgKIINmeogq5pe0qZ+SCzfIK2AB0Bz6KNlsYbDI4JMOqVR9vsCF5Zd+8URFfW5S5y6xEFxcB1yoEqUKlAy5Bt3S0HGVNrLepCHzsIj3pHMXLrZzc2yqF8YYGwveeVQ7uKCQQma8ZDWXgeo7wdVBVGumcV86Aha46p3i1FWsHqg0it+tBAJgGXfEJFXgSBLXXOBccU0AQVlsNyVR1lmpWUkWrbfjnjE7SAFsHN9btgBa0pX2jruuOmmbRloB4x5VKPqFYFpJR1fhc05yHgh4VfKAUwQlG+kn9BfnEeehAtA1zqeUTtLfBWiOI8udzQXxZKZzgJKk1cKB80jamrpKf+nm/KR0z5aXFgEXlAHzcCQUSCdQGVzdwQIA5xpjZZU+6CJM5izbnjT0/qQXuR0jYFG2LVPMBTO0yXFA0EkAxR1IVWaQVExR1r22v21/S4/LIx+PHRkjEtnD9u/H0Df6Kx+nf3BNaINyk5E9RCe7HIwus67j1nSG55uPK3I97pGGuD1zg4yXzWAkUGn0+BXX+tQT/o7FqWebddiMb9pDfj0QKdSj1KSXwkyHgJu3Rb/Nx+N3/d02sBinyvb9NgW2KfBMpMDtgcX/+ob25p/56WHOHaZPJuDOcMlEOn06Kcp4hHHnoxnmn5V7mXEjF1NdK8YBFkSSgc4KoQw7H0VTMo2uAuC1Vqedy4vhVm1Dd6xe/SA7o8tchaEKU6rKVH2swyzlY70psfADHsaIMB7G7cBC5jQfaRgMy1HAxiuMy5C/EpZIQkjHFKSDZU/5KUsYBH6r7nEQBspTw1RpEAYHZskyu4LYGWw/wBoLB7RYPxjvPTCcnupDh/GCpnHPCuBQ
3WGdjfJuwFisUP5lgMUs0ovHr6+2RzjnKb9mpSdgGo7BmE/DALnyeRVJwxWY2wlUoiavX2lTuJ09SPNM7sCewhoCLNS5lw6WUcmChttHD8+EgVZtRSnT6irgA5CiC1X18QUX5/AgdJF7GTCSK716GJlurCtdsiJLn/CqtEaj5b0wltl1mt8Ci95XXBWW/tG5pw6TMEwBJdxLa3pKMbQwjUW/spu5ARMoM6paiyor0tw21c5gBYZZPXfTllGwTJAvDJy6+YYjUVqVfmy/gebmlfQlCKfMnnY1V/AmtYuya1R89OhMwqxBt5sw8Cf4ffepk9hZ3Mv7qQBD+/wN6KWdhWPBwzbfySq09VfNR4N9VcAKXNjjzfNTH7JEI2BBRWTura8DWlUomWJX/G8ABJWU3NAGBe9g13TBCsDQJavet7RnqZV8GXUBCeAU9Z1Z1KEeP4+7WaQXeoXq6j9Klxy30sXFBpl8x7/JOO73ZYdsgcV+gKmSOzwl4TVryj0oOAVRUYFT55+0BBIe3XNc1OKsA31RyYRugMPY07YCSrINE+4YEwgYX6mEjel8IUiQFravv+0H0sKwgtUOLGT0swO3Ei7oJriPFzHaPmFGYBSpGP36OhINVZOcX6yntiN6rVIS4wKCXqHs185rprvAeLgwdwlwIf2QqpAepUw+OwEQSoqcTyxnzSM1r0oLq2Pb6sj2JhVW7ek6wEIVKEHFWuySClgEZBnH+uLkYQKwDpGj/qTq0wFAxQkBBZKK05TzBDYVqkAeoJ00SGcKGHo3N8OxpedZGInO1fJzU6F8Nhxhig3WH3C1f4yOis5PAg3Re2D7uf2ontf7LUGGuD33SvOpRsd42pV30knalWL6qQnwM/13rE5J0bwqg7TBcGsJ+Vdp9sso4OYbXtXCQ8UbUuoX6XibIzTk/TawuA2Rtl9tU2CbAs8ICnxaYPGWfyOwcMrkH3Nr7offTsMy4Tn5EEa9iQ/zHlbBZQRllGTQdOeoKopMvB9xmQA/ptFl5oPo1cSdXAMU+ICbpkefjw3fP8RRnRFYcLhCGb1qAhpWfXw/1ubbJRaqZ5hHQID58kGvuAKLQR2KsgVYDIxSygwjqc1CfVbMX6bCOhVDkFXGsfKbvoBHMCHjo3GqK5jSzHq7UmqpXT2VNjIzsrKxU5GZhWYHUBFRWqBxa/SrB5rXCr7epUp3msoQV9Wo1i4CLM5dX2t/fu1mexLGWfWoaZioadLUtaxM21UYtSuAixusVO+9cZX9LK63A7AuB1Ci2D/BhncymZwyKjLWAhyNyDXgnjp4OAa+0tCVedWh9Dbk6vcCalDuMTF/CVUkGCjbVnoLIKSHNJKJUj1GoCHz5iZigoZ1OIpVgJFtDhHiLUlaqX6i/rzSK+vtfiAayJbr2+ovaVvTJ23Lqv69jKerxZ6l/lJgIcwlzOqIqScv62h/8ZkqVQKDuLolT8srk2jaNpjh7GcyjJfZQO8CO3Rfp9w7aSP3WLCvy1IokbgLYHHm5F0x4N7vRnm0lwyuqlBKehxw9vUwxDCJSqT0EqUExDpkzMB49jFnP+1Mh/ceFCfPLFNW+qGf4EXQFGkNdVEaEZChVAdwIU1UfYqXKIDFGgAxYIpeZJtLr0iBqIOqUrqb/cST2NCwj4USA8eYh3TPZnTpf7adsLskhdoXuVHhAVXsABRHWB0/gkHzkRk2GMRd7EFU6/Q+ZdtnnBO3AxbbyDKuwMSXcXO1Vzx/MW7sIpbR8aUKknuwqHZU8xIlSP2RzAGc7D/Sj4ehi1Kc0JN296qkooMKVec8sn8Gfcx5w7QEoQKaJaQ7ARaUQbqrUlQSOexHBE0AC93jpm+StjTXhuUiamTZAwQHA7U3CvMIRXIBwTGhWqcLJFEHJV1paMvajpLa+W8NOt0cnew1A6BA2XCQVhCGGGl/60p7bNCPPN0Mb522cA46DPh5zvSBdi/tcMb2wPZlH+90oVt9bGCZ7eQ5mJuH0tjx0/cMO3rbw/mgP+XWsFufVBsYjKPe9nvjVTrVTnVfdgzDmyHPhBzSzhvrOuQbFcT8HtLtdeCZ+Y2X1BCJOfawjyuvKUcCEdf8elqjVIg9ykuamZoHCY6l2dPZjM/r8fSNMaRtWO/76e/PBWDx+g/+P+0dj/1BqLf9Z5sCz3YKPP/Qyfbdd7+yndn/zPGm5v5R4X+eZuN8emARiUX/CDm3jk2wTIpRX/EDzn2Yf5jmYsxU54Fx4rcMUNSh+ODLzOQDQjruZhummo+i076fgnwovRsm69EkTIislhJWhjWTPIECarhmciaNSEJ4H8mCH+6BiR+pQsCY1ITuqiDAQWDhCi8MgXYgltW0BBaRtHC1TMYpT1HDaqP1hOkNcwCzIEjoUhgZzX0wjDIbm8DCvJAGkH5WxKGNcaL+BBPuJmluTLefVcfYX1B2V0itp4yY0gtBhafSi5wyaEgvNOgWXHwCI1slF+ehs8zBftI/ZLlCf1Q/qKeelSYABnsAFuwl3Q7vYJ8LgMUejLlZro7UYie0t+0ESKoqqfJzCENuGSJ39BVYuAqua1KNZ91nQoDh/hLSLjSBPtzmkHbSRMAp6HAfEusF+dMGMoLeqx4m42oay9hxCPpMYx/PVS2TgdNgOntLkIb9IBKegdZmp7ceV+zd6M/+lM8/iZBd6F0MfTGYvgsjR9vH4NdCwK4VUCnjapKgjDC5tJ1eiC7BMD6BmtACm+mtYu9y6PDhlEcD3DUY+C6xuPceJRaHo8Zmn98EFkqE6IO03QSMoupASiy65MaVdQe0dXM8eaRvd2LmCc/4Z7+0/5e0ouwrXJ23/oIEGWrL7TONoCPFoY1v0n4CRIGUdJJh98LaJ4YAAEAASURBVHfaFmChJOrJ8xfao0+chzleRNrT1YBKimTfEGRmJ2j6lqV09VvbpwCLSfT5UbeZhpnV9kB1sUNIK/bThxwPAZXQOuOLNnbH+bhxNR+Y/oxB6qZsx74BwUITAZqLBgJUbTUsh4e0EjwLTJS2WF/7gnSzXaWD7azqlPOQdhVKtQxn2a1Lxhd9j4aBJusBKKowubu25ZPW5nNQKQV1m2Ll3zrKvFse5xINtgVGMdqO4TYetcjHxQlwqKseSSPqT7Rt5hPno5SCAJaXE1hEL+RkfllV7SlX1KC4riO1WKeMgvJShaJepLVO2dwEb51zg3aYAHxNUdY7kVI8d+YQwAJpBZKL/Yy/GIunOJWzNLA/hWj8qac+tNDDka44CjkK219XXKNUnHwnRi+H4Elj7KG39nHj8L9Sr/ejsevPel0v/NvTGeJsvqi7SEAIlDoNLz+pTr1uXHPb0xzKQ9JVLq95xx/eVbC8Tbl8fduDCJXVJ8ep75D51+nvzwVg8fd/98fbm/7st29Ltu2X2xR4tlDgZdP3tX/w3G9sDx488Yyp0l133ZXFtqdb4E8LLKIKRepOeplWnV/556TthFib3hUz7oe3r1DLSHZvSs7VhIBxcXWUeJ5JM3N10jZVJ2A/rHVw9X9+17Mw2OThql/eEVCGIROzkchIZqzKURKLXh6ZtUgthrjGi1QCpkP
mJT7jKV8YTeslsAjTJdiwIJW2aQhSZACVXmTlESZRsBCGgfeufk7ygVe9xXuqn/RUyYnaBWVMeYi/g3OXRs0wApOqBkE3GQ4pJLOUOqGnX56EBBaqQw35wWhg1R21KMHFOUDFE6hsPIG9hRvoCdf2SfyhPjFWh6liab3thJE8xOZ5hwAVAozd6xjooiqzqroU9BAcCCwOAiwOAyymUPvZQ94b7IER5pW6aAB8CUnF7NwFdMrZG8HVX/IKfaiX7UJT59o/ovxK/fKb8sksSn9X223ftC1lllbuL6GBqqu62mu4X4SMne5K7V+2s2BE6c4kjGbSJMOisQyn0qlqL+ltfaIOQ3rq7lSfTiH4E1JLbvotfZewu9yAjH/rgilUqZZx1epeGk8goblwaQm7FVRsBumSEhqU/duJY0cGiYXAYpp2G3YuR2KhdyilBmlTyjChxAIgqWrQbupkvnFBG+Z7E1hYtvRxaDE6HFPUtZ/WuaRixZyrsubgLkmEUrmSUPh7PaBD4FFAUhBiHX1nPQtYzLZHHz/fZpVYIKlRiiDz7PgKsGAM1DiU4S7HCt3F7P59AgvUoJRWHMbAnb4jQN2LBE8mXPAIEWh3VPQiBcJzEldBoW2qAwZpZG2VbPQ8C7g79mrRwueCAMen41V1Ixl7AZRtG9o4jlPfkubEKBpAkT5CvCqP443dtu1vzAHWU5B8hbSUmAi+LJd9Tm9ven/y1I1uABbpLKMupnRDo229llmW6r+UzxlAAEEavc+VwX8eZ34YIDAggXZV1Yn+oT3FOuqPq/TdNa66nN3gmS5k7QrV9rQ7KawJOuiz7llBJ8KAG2kaZb0bYPcCNsE7AxDSziL7VUjetABlyn3Nx0l0eM6lDjPyMAJ/6hd/h8ejm+F3zdeGtQmd04cXlVG9GP/rc8PwP1nwzhiWqP/Og+F5ovYXPek8rD95NLwfXqcceZuE+dPLNFxz6WmOp9XD+YxyWp86TGO45RLq8Xvz/fBuCN/fJ9JYvBF9eCaY9vc2sNik6/bdNgWeDRTYBhabrch3b2Oje4VywvOfx9i8yIfNjzCrzUyKfkBK/xoxP0xHJAUy2zLBflRNg3MDBj1pED4T7pDn6DlPzckwphkQkgme9JNeAQYf+b7nnSDELObDMJvAIuCCD67XviLeV0sjmRBcyJilPtSJxKyXQMhnKUtKNaQvsODswEKjctNWjUKjURlfGV1XVZU+mIBSkSt6BnI1lfSz0i6gIMw+1J808hZUyHD0VWTDUWDUaSg7QEVVGe8FMdpe1AnzASNyHcnFAozRHJ6iLgAuNOK+aT04r1EPbQM8J6jnThivvQAEFCfavg3u1/EOtLrS1rG/WF25TGHZT4O2sjy60pxhtfmI+vEAHxmsgAHCLMYzFHr4c3MxCNZgmMLT9jLx0ISrEimKAuiA6YuBNYwav0MvaChtwtDCuNoGvqOjhMtwRdyfxlcCpaqNaigyePYv21ia6G1KEGcfTZ8hlvSVaTSs7ZE2IYxG45ar90nrkz7Ds/RVmDbfl1oSTBpHGHZW+ZfwPHXpop6wZtk8bgGPSYAF+rOqTALDvVjB3jkz3U6yC/fp026SNx2JROIjCVBVSobfcaBNiWpSqlMJGndGUiHDTJkHYFFk6MxM9fcUaPhTjKV9VIAxSCzoZ0onovoUwKA0yJV61YlgcqGxkopVw6RMBXYEFq7uy4SrCjWLKtQjZ59s5zHgdmO3DizMWvpkfNsGFNKxEBfDlHsSD2OqvekBqp+Caz1gabczkjbRfwWTqs9diBode63QFtOoOOlZTZBu/Rwv60gSk28kXra9thUy1tSLfiM4UeLRjbALiMCY896+lTQYAwJYpWDa7xi/20rYZ7S5uITBtR6wsgu36ZG34MwVfuunhzc9QOn9STfSjnfrIABxI0GlJYIKy5Sxm3kPeO+VuhWwoGvTn6W15Us4fqevE4ZOgRqTgEIvUCWl2OD3Gn1lnT7jmcHhoDC+bQYIWaOfr0LnDWjjYsV+QPgpQN0DAIvnAyxO4KVLt7JCqPQoolOsHI6yjLv88n29qPmagB48Gu7qh7/yoOIOSSVo/8ObhDGYaVWyfgd6CF6bcdIZXg+v6tHwYriMYg3xU+bNgo8S6MG9mm/yNnLy4qn/ZeRLtFHZD2mO1zll65mOp5OkNnPp1Ul+A+1yP1TU+1FalKGHH+VFAN8b/HWfA16hPjT/8fbY0lyn7PZ1mwLPagpM79nfHjp4Vzu4S/fzz4xDfkre6Oket5VYuI+FG+TVRNinQ7Iabp0Mw6xwdeVQNZMw13zYOgiIzQMf4DBwmZxdtbO4m0wg0fu3haeVuB8lVyL7x9enpjliSPnd33sVgHiYz602FsbZZEZV55HRqhXMSCZgDnIVSPDBKWABE0adciZlyyhDWBKTrMqTrqu3AowADX5Hz5xGEVToptV8LZ8rxqr3dGAhg6VHIDeL0+2mdZMIbsQVdpo4yYu09UFfgEIwUWpQARswHrot5U9c0S6v4nYWlahLnBp2K7W4QXoCjovkfxlmaYI67uHZft7t5LoTYLFzDWBx40rbxSZ6OwAWO2F+d/DettTTzVFAxTF05A+rygIjbzurbnMZYDEPoz0/fyG7ObvbsfWTUZepV8oQiQ00c0V3AX1zPQ5Ja9tZ5tGPfbVjrTqHSYYQUREjnh9f36f9uY+rYZ/LrPE7kiIYWxlyglVa3Lh/ibT3tCwCC8tju2ivofRDg3TV1ZQSFFgr4KlXLpnXbNxG268jiYhNCRIad69eQGpxAW9YFxdx24rUQoCg7YCGy0dxMXzHsZl2112nMFo+krYNgwzoCrDoEgvacQfxNALXyFbPX1Gto7xKw2x769fHw9AF00eqp1tXgYIgWAa6JBYBaeRlf9MAOkBi6McFlkvtSdUs3dC6A7fucMvmAqab/iEA1KPRY4+fC7Bwk7xIPyQwR8Yh9LV/CD4nKX+3oZlEWnGAlfHDqN6oKmT/KdUpJTSexdpS3NRRJl5bhov0DRn/pKc9DumGcR/qJQ0d1xnf5FvgomyX3ChRA+krg71EhWE8wXTTeNBQKZo0rfzlqK2KTLn9VXuLsqeozQAth+PeQHbR2FVQD+ujpMJFAPuXzgKy8R3lX4ZmghuBRp+LbD/Bb/oW9zRoNSPp1rxJu9k2loUxRaIBFXp/uqn3J/rhmv2avrEBoNBoW5WqMKGUfQdxnXfXqJsqUKucEDu2Wkco573TB9tD01OACxYHaJf9zh1K6jKaRtP4qEy5GdrYsns4Dyc8P/NqqAKPh4MQo/v+bPM6/qpXf/Mtdz1yDzgKZM51JEt+5HfPv0ft8Q3a35FGZ+QzhnqahjURrkk9CfaIlV9PwqcePR3vQ5Mhrc34PZEes/9OjM0y9cfj5TVIP4b3r3vt9s7bnSTb120KbFPgmUmB2wKLf/HDr29vesubRzXrc6Nzaz42PHCCdU0/HzuedtWTqLUMzJ/PcvKR9Ro992HyN83xybvPxGE2YZz88JJ8vsky7zIIXbfaj7+rj17zMWfSDlNBHlnRJqygwo+7q8B1DxMugI
Bh6aCi30d6QVrjwMKVrdG3gDL3lcduhGl5wrzyQZfpneR3mFcZWICFG8bJFOl5yE3dZMg9ItEAVOzHdkDG1MNyiBFluMtGRFAhmFD9SWZ6OGFAZIa1t9jpuzAdO9oKXIcuaJcAFlfJE+FFwMUczNN5mKB5GM6dPIf9aPiNSX6q70wALHbhKWr3ymL2uNi9dj12GbazDN4Mqix3sFHecU7BknnLpC5jvL24eAl1qIvZ18Idna2j72xn1Ub2Y6gtjVQpcVV3gVPJiQyRHFXaDgLzi/IU7QMkpMHQdv62D0gbV5n9hNtnqo94zeu0U38GUdLuBSrLj79Mrntr6Br0CLYR09OHWX12Z3EZRWlKW8IkF20LXJS6jfr77GANM34VqY57eMzrapeV9ktILYBnxBe84NYT9Z/jR4+0EydOtKnpo7TzAUrLyraMvuCCdDJ+aDPbdSfMYDY1Q2rR7Svs3wLyDi5IIEzNaJxYX07HnfQL3eg7sSfg6kq7oMI87VMVRuABcAA03kTd6Wpcz7LHAntb+FtJhYw2yUaNx/0XzmJjocRikb0swmwzFhy7BeaQFtHG2hgI2HQvrA2N4GoSYDENY6unJGkiYJIps21W7ZcAXVWSlNqYn2BSqUg8J1GfvUg94iSBtg5o4pn1tf2dd2wTvYo5Bq2bdi+LSAuUHOQIgekl9kPu3Sgv0kPi6CCBjEMXPYgtAUq0iRD4dvUl0xQcqw5of8smeIIKwJKg1LnFDS91KZsN8Ihbm2VSSPJTIuFhMbLAQn62ZQ7e9Tmz99U1Aip11FhbUHFDYDGBBAK6rTrWOJkAaB9xknMt7YDEMcACwiilWAWgarRNIwQInVRaceRQe+4RdoR3LxFcS2uwTbVyZP4e7tMIEnh0jN8PD4k7CkK8qmOFGz0fxd+8GU+pZ1dveTP+skdJIKHgU7w0yngiCeLDIXJ/J+HzGFDEfT3272bYzbFk5LH8qEwARC+PsYYKbj4nfG9k35mfh0l55Nlwa48dsh69q1db/w5xt4HFVrJs/9qmwDYFnnkUuC2w+L4f+sH2xjc/TK2Y9eq/c3AmXljc+jc2p2YSD0NYjEQnR1RN+LB2xr7AxRCxUu9BMyn7wZXZ0MahJvVa3Y7LUpk/mAOfyzQJBmQEtgALGAKZRFcVo7LBb1VjvJcpMe3RKi+MeAELVqWH9GR+tqgqULrUzY+UXwmvMgvkIaNiuTQw3u8qvdeACiQW6N/LsJq+BqMrg7tRmQx1zvXprxpPGAzy9IMkU5mVa8sPY+GpTUV9UMm75+tqd06BBWofMHwCC0HFZc7rpMcl6lAXYeQvkP88K6qExu4CdRbycn8LO8AEajA7b6y0ncuXOC/iNeoawEI9dcpDfocp5zE8+pw8dhxVkMMwjri0JA33PHA/hOzmjMGvqlHaIKifrsGsK7yee1mB1rD1ssawMGHq1rtSHO8/qqZAHyVFYZqkNTxYBwQyd5LGo6vd2PaCQ+O5K3SMcUnTPsCrfN8Fh8ajmKSlfnx5yRntMRL9f8AFAEO3unoYEmAoxVBdKqpqXCNJIo0YQUOX69ihuGu1+3fMcc6zO/UKqmca02rDcZzdjO88djTA4rBAbP+hMOLSwzqv0g45bEcYa0Fl7GcENJ72KercAfKWfkfE1M+6WVf+damFY8AzfRmQEANu8wNMxIibZ90rlG5vBRTajCh1cp8LwzhOBcJRUYJRfyzA4lIkTbaVdLffj8YW4ZX4uEncNEz3Hui8O65Ud46kFQILJUDWw/Gs21a9w9lmN5GwCVSyUMB7y64Kl/lkTFBfx2nPV8sMx69zjzYwAgvfa2AtoL2GGpIEsm/0ecF2P8h4FOTuA/A4ZgVJbliX3dsX3TTQHduxMRLdDnmoylfqTzoOEFRoa+SmfsBy+lVUpwA0lwFdguZAMstGmdIBKYdXFwnSlnZq3lsXHmc8d/UnVZ/cq+IGBhbX1t2zgt/Q7Cb940bARjk0mKDNBRMldYB2pGSf175idfe+tsZ8sxt6H2S83YsK1ENHpwAWuETm9z48ZegsY4A3GReWxyP0zZ3jZWCyh3IOj3Ox3P3YTKfCp0795RDQ5CVDDm4qt+GnF58NZainRZe86g9uuVrWpzpM27L3YxyAjJ4Std+Pp2Nf67+Tzi1lSppj2ea2Awtf9vDj6feC9Pfj5e73/UqYpMmf7/727Q3yxkm3fb9NgW0KPPMo8GmBxb9+80/lA+DE5zzoZ7F/3DUy7iopYYBgOvzd3bmOJmsm3gIXpT6RFU+ejX9UKv2a4CNR4AMqI+JE7+qtoEAmwRV0GRFXxWUsXf0sqYIpyHcXgClD2AIWXXXDq3laLhmzYsRgxmA0Ythsepw3SdtnASyENY7pWm8/IpEcwEC4Sh0GlHLt45xEZaG84iitUA0EPWzKajoyU/rl915pjsy570g0z0xXEKGqUYGgYcXcfAkfQoTeAhreBSjJtMlgCSxgSgAWy6CJRWji/XUYrBvUc4k6XYZxvEKd9pKGm2LJpC0IDGTwKNvGTZhLNs9bX7rY9l1dbrvwEkWqMfY8xIr0UaQWdx093mZ0G8q9gEkayqy6p4WuZ5dgVgUWGv+qFmV77YeZc1XbOrjTsaovrrDLyMWDjlIOGMJsIEbfEnRKF6VToTfltJ0jxaGePg/jaPuQTpcudaAhUxzJE+8sn/3HjcdKXa2YRUGgYOeAABDQVGpRZROjBOkA4OIQtiV6w9I2wP6X1X7qKhMuUy6ocK+HWVzPLrLifR2As4twd8wcbnfdcaydxqvCDEDsEEBM42xVkmTe9b7k6rNtNsHKdMYCTGTak3LZp0ar3KTnuPHM+KKNR32Xfln3VU/7lafMuaAieXEvsFAaoaREd7dRgUL9yTpcyWZ5y7SfdhZ6UtqIxAaSZd+PJ85dyEZ589RRYCEDVnYtlpe2oA62rfQL080zmVfbRaCmW2Hdsdqf7cC2vapDURkijFIL203Ji2BKxrDKDwiDVr7rY1fJQRYkSCn15qq0UpsJ+5J2DbzIO/tzdoKn/aXdXmgqSKjxBthO3tqiEI5+mn1PyE/gmv5HWZS8qNIlaLIOXicZB85tyZf21mtU+i5jCKUrJ5/MfxAKhp9akY9lsg9at9wYzlvy2AFdJmhz96a4mrG7wRWjdvrFGuMaaJ8N8sBfob3qiZ7Osb1PpP8Tf5V5YwLpmNKjmQN724MAiudw3qfxvG1lnGRd86TFGB0+GhXP0lYY6ey/XnDrkN+WY4iQZwlnrM20TWV0bD6uR9BZumw57Nt5MJ7KEGIsrGVyrvQY5TGkl98mYvJDmAQc/oze8zvp5DkR/D+Wh4/Hv00J5p8Kmny3lJK8elnMflS3XgZf5sXYu14Prr4yf1PZBhYQY/vYpsA2BZ7RFPiLAQuq6MTXV3b8QMV2IkyvzI8f1LqGAefe8DVZO20yZTLJFtOvisoAMDLx1pRsWFcfZY5GKkncy1z6sZcxdEVYZlWQ4apjrXyzksq/pJKwqtCUupSMRKQUggA/5JbTPClSL19XIzE91RkEFl5lUC0LAUdxq57WhXRIs6vMZ
N8O8pqEUSg//pOskpbahEyVFJDZc5XWOvqJF0RYbhsAyobB1GBYQ2CZWVdkd8twymD3ciffUrsx791jEgsCUW6kEJR7GWAhwLgKY6Ph9lXyXEEyc53rJOEEFtL1IkBnWaNqyrWK+9nrVy63VaQW+1D32YcUY49uaCHXAWg+BRN+dIqdprG3KPUhpDHUQbrLuLppnhuvLWKDoORiGTURmUSySx2UMsVmQZUOaCozqL//BVaMs+qrlIPy2WZRg6G9/F2AofYvEFTYjraZYINk0p6dCTC87XgNxtJ9UwyjMbzMcKRlFEbphTS1jwSownC66i6DrHraPhizI9TzKHsvHD92jOfYv0BrjxgCw6Bbz/mLlwIszmGLMMcmctZhHeb2KCvFAgs3yTtx54k2M3OUHbinEl+GW8NpVddieE97R+WJOklLXZ5KI0GYfT7vwkQWwLCe0i6gmOX1fi89vJfe/bRfBXzLoKP+pIvg6wAKd9++BqBYvoIaFPT3dLM8VaeMqzoWRAUE4LaYDd5mh03eVK+S3mW7Au2g254w34Iz+ruSKfffIJAuY01H4HYEmxNdKVt2FwGu0t9sF4iQ8Rt3z+Rt/zCMAMx9IzT2dx4QnKjCpgcmmWkHk+WMShVljDtX+q9jldxrnCEd08tUQAvlcXHZdyRfY26QjhmnXO46ztEiom+4aKHxuOpYAosD1EsPV/YP3xEl6ca7FPX0GumIaVs+T4BFdgOnL1pWGiK0c7AUKKAPCiI5J6DNdZbXl65jC3VtFdsoJRbMDUosSEcQozRP2lv+gG37hvOZY0F601/W6UuTSEFnMCw/ycaEAot7D+9vJ1RRo15x7U0v7Ox7OiRpjg7S9pC5NTP/efRrvTEQz6Wp4YbDkJvh6uHo7XgePYJxe4BesaRR6diX+5hOFMN4mLV/clR79vvhZSVL2hVqLJ8eqyfV06xE8zavRs8pogQfOyyXh3XfLMfWcD3MKJppmIxRe9r9yrsePnUm4DawGFFu+2abAtsUeIZS4LbA4vtVhXrLw5lIo5rgB5PV/HxYmBRl8sZPGeDOBNeEyfTrJKpc2v9DHBlEmfGEHybXMJB8iGU6cvrRJ65x4voUxiM744bxkkmpVU3BhR9bz6TPh90yydDLCHivekIv1/jHwqJl1XtgVGWmrsNQawMgs6CagemGKU1d4ZLNh3q6B4HMqUyq5av9GdwgzJVadJr5yHtqpG0cV45N1zxWYfKjCiRDj+Gs6ekVKC5HcT+qetRerwINmFqZbcttOCrDb58BPIZ3kRDxTmbLjdtWoOMS9FmCkRNoFMCQScG+AlrsGxiNizD3lzVchRG7Sb2VPKwCKvZfu9IObNxsB9hGey/tto9svaqr7mZ1BzHi7ipRqg0pMYqaDcyraWhrsYSazSIuaGVcVW8RiMmMxuUrbWT99cAjQ64XIvXcZaJsJ+1LZKptA2lmf7AdYuwtHYbBFpr4nLMzpoKLa9TdVWj7iL9lA3pY0w/QkI7pK4INaQr9aQPLePzIdDuh1OHEnZHOuM+ENJc3kFHU3mJhYREDbnYdn73Avhbzub+Mvr42HHdiY3Hv6bvamVOnMOJGcnH0zqRvfBlnVaAidYJ2qrFVmYpRFFTEjsF6Dqfl9fS348mzO00IyKDNwyRTV8sXxjv9TNoVsFBicQ0wocG2kotl2ibthFqXoMLTdvKQWZXml2gbN3pTYlEqSoBE6OP47V6SVBESQMRtqw0zlEFg5O7Uupx1LFpHxzNDjXpAb55JDyegSLdI13qpXndh/hLG2KjMUYbjR6cBa1ORiFi2SMmI41hwzlD6sYLRtuVTahF7CaQi2XuD91l8UFWNcWA/tfNkOoKW6VP8jood9LVOSrP2aePBfTf6V+rmOLfnCVz5D40oi2nlnyXjXlBhGzHO0gb0ccuk9DM2RYQROAoq9giU6HemsYwx1MLV1XbxKp7jBmChu9ksOkAD6WLb0wk4WbThqkqX9lUIONoqYwWCAuKmcC97sN0PuL0PUHGccusJynFjdP4PthFVbhK22MPh2+HY8nwsjIl48KiHtmzDo1zzMpQa8hh7mTjGHegUWhG/5xBa+pszfX9I0XijfMbyTqWGMLkMZck46c+J3NP30ebsMfZ07Faa9LxC854O1/Hn4/c9iM9Gz6VQKpxMR4Xo742Tcg5l7oXcBhadmtvXbQpsU+CZSoFPAyxej1eoN48mwDAGw4fOCocJ9EPJhy4MPB97J0tXFvsEmqsTN899J4MkY+KqYK3G18zvaqbqQnHxSB4ywcZNHqQfVRY/kn5cSceCK1VwVdbVyKjK8DEO00j6kVb4AfakPKbjmQ9L5vz62MbeQmBBetc43UROZta6esggVd0Mz0edvP1iWJcuGVEf2/rsh2EQWMhoqbfvpnLuoi1TLdPc7SyWMQK+gl74NXXbYTxkimU09BKlr/99rJLv2+upa9XSu0/ZLX/qKO1ghGEsZFQEFjJa6vnrYvYG9bkGDVeg6bLMFnRSYqFEI0DJDzuHqlCXARXLMGfS8cZNXGUCKvYBEA5iHHoIYLGHvS52ATJ2wKC60Z5SmUOU0b0tNOQWAEkjGV3VfGRQVRe6is77YuwuLgMcsEPICrK7TrsybPu4Kj9IL3inJyKZRFo96enCVwawGOhiKAUJBRi0IaCN6CNpJ5oqkgjStf1hS9I3AuKofxhK6mt6ve1gxQYD+QIWBXZRlYIZVg1qBmb2GADhSFS/BIgYxtLmfQzIlF8CXJy/MNeemJ1r51jZvwSwsC3dGO4kwOS+M6faPWfOtJMnTyOJcp+N2shtB4x11OhI06t91D4WSZh5UC/7V8YSV4Gx9ZLOjgnPbpNiWJnW0IJr3ZdExz5doIF9KAQWSiuUWgAuVmgfVaEu0zZKmyK1gDmXrqW2h6oPzL1G0XrzEoDbx9Tht+2sp7QSaCpNkAnfxbK4XUvG2vYIABFos4quq1n7MK/ShwXF1s+FCse8wChjkD45jxRoifIJHNxgLy5ekRz43j7gmHXs2ZcsVza4pJ1NR4Nq7XguI/UIgCdO3L8Sj+yg4UBXrtIzXqisB0y4UokYm1M/0w6tycsFCtsg5fWF80DGXKWVtrJrWXkyceaIJHQsbx9mgcPxyiktbUPVwy6tYP/kCbC4ug6goH/EoJv4EMaUA0QnmGMEEAKL/4+99wDXLKvOM3fleyvH7qqmI90NAoEAIyQhopBBAUU0CggFNGM945FtPZqxZ+axNQ5j2ZKFQCjYM2ONwyOJJlvREiAskEBCCREUAHVD5+6q6gq3qm7lOO/7rbP//9xb1UXQwEzj/9x7/nPOPjusvXY469tr7b2No5afpl5wL4jbTZ29AQ3RTZvXt2vWw3eBMjTZ5s07v4nLH9sZGfSmjuFVHvlR8zQ5JsEmN6OYBj7peRxmErhu5KXpJ4YQw90QXfisN569zyEN5cRNXkxILQ+T4P1xGnbiQnxJg5/6rziSRLWxeDXN+r9iHizf5UfoNJ7xq057PNeL8Ln7MQ35MMTXr+M8z4DFck7PnmccmHHgscYBv9UOEm5ADh4ffFcu
XfoJVoV6/ZtZbpaPoB2vH0JH30o1r5CKUMZHTUFDcBHhl06zd5h+aNKR9l5d/4PAkV2YCRtBkLgzoXOYiFuAASEJioyzBPgapZMWO+tuqqQQo5rfUW41B6FFoTX3Ct0CigIWhpVm6bNv94OrQKYAo0BVwAKGIER7ZERTAWIQKqYfKb8Urt9f5jSOdLrkZk4EqQIXmlCw6hPCt6OL8szJ2wpyC5gK1aRZJ4yy9KvCGYJTlp4VWBBmHkF0jjMbtSFEZSQbOiJkQpOmM/IlJjTkNTwnb2SHtNROMJpLvk4N4CKaC8CF9toxn+D9ESdUO+Kr8I05VlYMAlSsQgidZ/L25mgsWJ6WVaNWcjrJe460BFCbmIdg3qRTASdmKsSiuZD2/QqymjodVXPBrtxHyLMbiJ1idFmhXs3FfIBTzUHRht8ysGydu1P7YAgM8c2zQqKgojYtY56DtvqAGN0V9hUYNW1Sw2O5KIAan2EUXq1Llpm1yjK3blo3ep2JORvhFSznEfjdp8DR+B2sHOUE72zAB9+Nw9FodzBfRIh1dah97MS9j9WTDjCRW/FJQXsX+1ncduP17dabbmw33ngzoJF6ABBMeyCeaJzIvxvoWYbWcw/fS1+vo16tyx1Y6CcgjrI13/q1DRhucpL3jJhDZ2kiBBZokwZTKHcQd9K92iRBn9fTADvLzEnN7gtxAm3ScXjcl2FVJBQAuRpTtBbwynKfg9eCaldE03xIMCFfbedePTcC1FwdLHu6QL91WQ2Q7cvyti27IaIdkacrh7kAgIBBEysFfuuLWi7zbBt29S75Qqbjz3IWmLrsbIAF14AQ4lZLBPnpI5yrEQ0V9NlXuDeFWib3poi5UzSppVFz3o8mWd18yfyvoKxWkOfaQ6by2sFKOpV0DdSzlA1lRDlpimU5powtZ/x0bYsmWwfZxV1gcRitBVtXMl8D0GFdkG7iMd5V1JUV8JsOAVBDvaXPyRLK8HwToGg7/L0OUHEd+djF/ZJ5FaRn+aWPsAJxyPfEzSUHfnLrj2lymJXJkYepy/RO74ltEm4SZnSjH2nwCB3cpj8lqUrNFxVX/AyOhuh0T9LRA0fFVvf++n5yDPcZCKpI4n/qZUqP+U3I6ctJNP2m09CfE2aUXLmPHapuT/wPN51G4+txdjfj/O7v+M7lQT5rz/YRnYZ+HSemm7R1bWhvnzt27Bh7m93PODDjwIwDSzhwVWDx6tf+ZHvDW9/EiJgjyw6eKXyWqZKjwHY8NXJeAn0fcZ10UoSxY8Jb/Eaw96PIR10hsEyVahT2NEKBE4kVajTDcFTS3r6EKj7g0IBEwcfIVZ0cofUsP6txM851fPhryU4FsdE5AAs/7B1Y5B534/Ajb5qn1VYgGDvir26jgAX0IciYrh9Czy5oKMSWuQQCFh98Rz8jZOG+HuF0I8Kkqz85Ki3Ni5iiHGGTtUNHDiN0s58Dgq/0bNZsignRG7Ht1+zG0e05ruvWltAuj0Ovgg15iMBm/jCpMZ+VFwWWMDoFDGfCQyeqnkb4PkW5lYkUWgx4fRQh7RBATo2G80ooYXgAuIpJzOm2hvv1aC22AC7WI+bMN+zmEdCdc5FNAMlvBHDodMdwzV0cObVe1Kg59QR+KoC7Q/dhlqR1wrOTu91EzDpSmqsCAgp5Cs/WIfMXPjLBWvM3BTqBh/b5Tvg+cgw+croakFoJy1AeCPBqHg5zIqh3AYsIqn0Ogn4sB8vaUzCauTuavgz1MSPv5ENTGOdebGPDtq2YmIi8LQfFmcwpIV5NcE4gFLrB237Md/aitXAysYBvB5Nmb7vp+nb7zTe3mznnWR1qFUvKagYkn9y3QlCRUWfyZ7zW6f4h73XMOli8sbyrrZjfnicFWMN4evR748qiBC4vS5kGWLhcrvMrRhO3rYdqX04KLsiPk6stnxPwOntDIOC7z0O0KvKF8lBroWCrSc8K0rH9WffdEM/5EJoTCSykHQk7q2058dnFDCz3SdsEYCjSWRauzKTWxEnkrtakeZTzcKzvBIqgXitX1QaULgFreZjtRfyehF6BQDeNEniGJ9CnJ8GEYDUghfYYEMrzpmgWnaxv+Ra4E4jKA0GVplW2/eSfPGZuBOWWFeGI0zpk3OnzzC/laNmVQEteDTuclpBx2+6dXL4IeDtC/Tl4nKWY0VYcO8s7eh561uTZupowpLGSPmElfHWzPEG8+6YI1q5hKdnHsav2DazKdQ3830pdXg9f+mRtMp/8y0NpDJ1DPQnzuI+XpOQDh4l6DHWqyKm81QtzWcfU63DXw3SPXE2z18+JMxFUCPM6xEXYHm8WXZLeZeEvi2eIcIk78UyeicO8G9FyOip2Ihj7D/2mKn1+u+p+HDYuk+ziq7wkzPRHx/I0oWX6Mnc97jwM6c40FsuYNHuccWDGgcccB64OLNgg741veVMEGztn+z4DaGfsR88+t4/4KiRmAywc/bMz9awRxhqtK40BH9t8jBEg+SD7kdSfwEKthWcXmuSmgrcCp0KhNsyhAeFV4cq47bsVbEpjAWjxo8q5CiFBAS8aDMJPBPAeH1fjNA7Tc8K2GouYz/DsZ6EEAoQNBQf84kCeEX41+Uge1FgIJhSAa7S2C8sBFpg0OdLtCK2jQwKLo4s1eq/QZNouLevov/MWNmxiAzoBhSeChOYjtVdFjdwbj+lqQhRBZ3jWzY+UNFom9RWNTJc0NCs7hxnUKQDGUQSaRxBo9nHuRXg6iv25wMIlaN2cT8HHiearARXudbGJbLs79xomd69GSF3FvhfoGCIwadKkQLkJejcCjlxhSYARGsibwm32fxjAxVF3rj7CBnNoMAQI7htggTqKLCAQRBg4dYdcKLAp4Ds6rZsmLS5bK+/qRGCG9ghgqY9lnqZwa3zyxZFwlyHNHgnE4afeU967ao/xrqQ8Nc0pjYcj8WouENAou1oRiKVUEeR6HZIu60nRcg4N1CITuRfaA/segS5W0+K9O0hff832duOea5lrcT17WjARnEnca9TwUK4CCucZZL4F9cr6nHxDjxT28rQOGp9txvx0P46EEwSf+peNFZ6byhtX61y0RwEX8qGARV9q1lW8jjHXQo2S4EI+aXgTsz14fZA5JM4bEVioVRFM1Cm48B4e4z9COzzTZGkzGoaNmvThV9qj4YQO6c/keEfeya97etjuLG/z56FmIRtJQkuWj0261Onku/oes2t/4LwH+SFwFgA4p8I5NRlwkAcyR+5Qn+VVTCVJx7K1nqktC8igDHTLPCVo4T/NJxoQ4hQAqLG0vVleWbVNYDWAi7RF2w4Bu1gckyfLjDMDGWaOtwKoMjMDUBwHIAMojrBU7VE2WTwh+KeNnkdKdYU3eyDrqOXhHheXGKhYYduinkqPWrGtgKFbtnpubDcCLDavo96aR8JHi5J0q24Mt6HTOpJawzWZzcu45M7+Wz/jI0KwvMHZutaP6V1epd9cHnbit4cLk4eUh3v99LpsGiblkXTrNr/jtEfO01vSkKbur8ploJK0koX+OITqfn3Mve/xGN+dkMFvv/TVZtUeVqUpWid50GMQRyX
W0wg9nQ94mUYfymYai87g2XXGgRkHHrMcuDqwQGPRgUVG4egj/TgKLPyA2xVGcEcwVKCOgKuowYsIbnSgXhENIhz0kTs7126eJOfsdBXUHGE2bp/5T/zltwRn+2nj05SnAwu7beOP4ICgIR0K69KVe4GGy2AihBdAUShXOC9Bza+Qgqm2zs6vCB0IIjViVsAmArtCAmfuDZs41NQM4AJBy1F2QU3ZbWsSxWg+AoF8UWA/jo37oiPEjA6X2U8Bs/WYP2UX6PXY8meORWkrYu4hSAqQKHrH4EL3AhPwNzwtPsm4fMBkLg/yUnChJuYwgOJhRknvY4TY8xAjvacQyJycvYYTmQ8es1wo1/U8bELoXo0Z1CVMaVYw6Xclk81XsRTtauZerCVf8/BZYLEFjctmTF6ci6CgaXn4wVW4VYh03oVL0roU7cKRhcy/OI7GQcFfr33VLzVVEdoBmKkD8DkgwazwrmvKYnpF/KVtKjt6tWke2Y8CUKBZk8JqjcSzzwFCovVHuhT4rMsK3ropBEbopAwFFQqeAgtNopwn40i9fgSrLqVqPRVYOMJ/jFHtw6xu9RBzLZyQbhIu+3kNK/PsYRfuG5jAvWPXnrZp205MogCPxC240BxIoOphnY+QUuSl/Kw3OQOCq4wjoOCnBOhBEIJ/5S5/6jRPZTLFKLgaC8rtDCP7zn05BcBdXGT1LoBeB3uadaldcmUqBwhcpnjf/kcCBDUHklZ54GIEMYfCn1qJ1H9AhIJu9ntQAwBQVmi3vQksTmiWBK8FboaX9hOYY/nO8M5Hsq3aDjWVE3QKSO2c1FTKA1u5wr7xyOD5eYFjTabWZKtrK+RlB6TWq2gIqN+dL17VGNm25tGI2VekrloGtBWFdtuVYESA5SkNhjHeLAoBqBBUCbZsfx5UKUNDZvUt1hXD6UaJJM+CikVoPQI/Dh5jDhLt8CiaClduY1GodtZypZNzkW1rcvV1XInzAlpBgYWT/lcSrytVXYOp3pPZq+I2wMV1aC7mpV0SQg83w1F1ox7KVYrGh65Tl+QDl7jCLw/7k/QpPCc+3HwziUkGcJTAzZtpdBP/8RBPPQUDVLh6V+EMOnY1XY+ebh6u8nMlf3Ejmp7yOI2xf80K+5F0x4T0F1zjjNe0WZ/8h84JP+IXx0Q3jXMURfKTeAa/hv2emSnUmEWz+xkHZhx4DHLgqsDCORZveJMai0GwJ4MRxvjY2gHbkU6EC4QGn9O50o/aWavIt0u1M7cDzfuBSd7jkCcFDT/gCg010lieEiY99rTTzggecXdwoSBqNAEtfICzQpMCAB/fTPh2DfdBKMgovx/8CA98hLmXMuMwXzEDGsCNH3XzEDOILA9aQp7rz+ckLQUeAYwCk5Mn+zyLrCyD4DiHMKBQbDp+Ydx9WyHXHX9Nz/jlgysrrXUlKEDF/JybtTHPgrAxuzC9pCV4QsiKkFnCj+lfBiyQporPxVvvLYMI7AgwCwiJDzFaei87Rt/F+RAjpocQkN3724meggUFto3wbTt820LZX2TE+zSTus8hkK48ewozqbOAjkvxswlhZyPC4kboV0uj0DnZiwPBy3xHSEOoOku+a9UoViQiLgGWE4g1fwlQhVYFRM1YFBY1K4pgZz7JRK3+xNK3bFTm6LgAIpPOEThdFcgzG7nBowADAME8IOEi2hfnDiwwB8KraQn+5Iv31jvv1XopEG/aMJclVBWCFSQFS5pC1cZ5pVnR3t/5Hk7gPo7Avki5HnEDQFZSOg24ULjbiTmUwOLG63a3a3dfz74WuwEWG1PWqwMs5HodY+Em5QcvLAcBcC9PWxPO1JsSOruWQx6nrvY6y3v96uZyqppunYWv8j/7WKC5WFxk4zvM0w4fOhgN0jHNociHiwgILpSvHt67LxOpT6GFUZB2tbJ5gFZ2U6deutKSo/9qqraSV+u9ZnEdkMk7I3LpYeu9tCiIqT1yJTDzpQmSGwp6fwYNxQIAjVymjdquxoMAfQ8MAch6gIXtSuAg8NF0STNK86yWy7kZTibXLC/1Alr1dwI63HMk5U3866kf9hMCnIABaBZEVBnAe9IwH4IEgYXAPv7w430/9B93gQr9i/2RXVf6ETxZj63TB9EA7aUeHqINLqKpOIOmwv0q1FR4pYlyJZxuPNjPXaA+X6IuUggFLsjfDsyerkdT8YxdW9vjmay9C36sodLZ6qu3HSgjfDLro/fDkcH0/rDkSuiRv/Er82gM47pqPUulxN33SWMIn9T4Sd0k/3mPvwKK5X8c1zitK93r1zh6POOw43vDdj/e+66/X+7u+/HR/ekWPspQ783b+DCPPW/dfeyF+yXl0P2MrmO6dPb5e1/+ipGPz+6t/V7PV7+OU9RNmuwjPWdzLMbcmd3PODDjwKNx4KrA4lU/yeTtN7IqVITV9KMRUHvnG8HaDy8fUgXs6pyqd02nSc+bj8vyLrYc8y4dl/4QQBLGDnt09M49ndwQn17ywRUQKAHZiXP6MV/Nx14TnQIWCgWlRcioI+718VdImAIL01VY6RoTR8wDdBCCnOwc8ycFbgUG8hkbeQQLQUxG2hE8HKGeF0xwLw0RRPAjQHAuhunWXhZlxnMWYd3O2rgdwY75E6DCjfPmEGTXCSwUXggXTUkABcJLrgr/ngU6LJ+YXOgGD1JeoZt7Pw7wUyFUjcWRAVjczYjpXyEE34swvB/NBVJMBOtoeuDZTkZzr0PIvAY79BWYQgmKzqJxWcHKUWucxM0eF/PwfB7hTfOLOfjiJoFuRtbNwwQY5tsyVMC7QDhHz884kRjh1knEJwAYCp1nB/MmNx1TA+DIv8vGlqkTOaCMFNYt19pbQD4pmFN25MtaBxtyWJaXkJzULqwnH27mpkBr3I56ewj4PAIs8F9lwU7SxFlaitIoCCY04dmEmVrMXyJMAhKh2ZW9jrjyFVqA2PkjsGo2pMmWpOxgMu0ehOYb9uxue667oW3beS2COPtiRFClnsCraltVTiHIn5Sd9U53y9nyrXL0ZcqT/NhmFFxzcPXWt7nmnTxnMr3gQl4KLNBKyPsTx5lMj1neMbRHC8x7cQUvV2KSHuNU+D7ABoDulq6Au4a6IMByY0G1EwrZmpLN8SzocqlTFy3QLFCTIcG8oLdoKS2DPLEeOVl8gfR8ltc72TPE+mx+nIzv/CLBgbt4u9KY4NF3i/C7wMXZpKsAb7nJ88wLAehJq3VObQQsy3v31Uj7hgddsyGfrOvuxJ55IYCmMqEsgFGrr1Vbj8kacVoOZCq0CjgC9NMOC+CXFlR/VSQpDQGDNNK+HgFU7GP53ocX0FxCk3OfslcFnLiAP0iH93WmT0LwcyDlAnXuEvVVcLFCQErenE9xy7aN7W/s2NJuQFuxDTOoVaDv1Bk5O1QLeZyH4bmehgcu3Vle5X5wqHAJPP0Z+enApPvr8ei5tBbeWRd5479XjqrTPVScrvqT/ouw4/BXCtDf+y5hxp6uEn4crgfpeVlOpfFODuPkIeETYAg1+FnidxJoyoeebr/GC9F/73fMgMWIXbPbGQdmHHgMcuCqwOLHX/Pq9r
o33kG2qkOddKQ8K4Bk4iIfZ4UMBXX9VR9bnS69bj3LGF5U11s33tupeqrZmHgsTz3JxGnw3qc7mum/I/CCiyxLOwT2w+/uxyXwF6gorQUfXeicgoqi2ecIbAMdCh9qEjRziUkUz7UAagl+CviOXroqjsJzn2Pg6LdClRO4Na0It2CHQCRaC4UtVgQ6z/wEVzKKZoSreVegynwKhKk5gEX2sABU1PyKKbAIkFCo6YLNINDoXhqNEkD9oOWEj5N73BQWnWchsHiY0dJ70FbcCbC4j+vDCPKOSst6gdNGBMfHIXDdiLnFbkZ0EcvhB2Y/CKXnEQwvYVazGrCxznkYCLjryKub7q0DZLhPxjqESpeLHe/DobZBxmjK5Mh1116cBFicCsjANAR7f1cEUphVANQ8zVNwIa/kq/XMMpD/Ai/dViDJ6J6RZ8rIchRIKZQrBCv8KjRqiqIgakaNgygHv5hEkYYE6l9hV22FwqVgQkFXMCOQSxiEXScXa+JzDI3LEQTemOMASAVQlEjq4A6E7d0Ai+vYKO+aPde3rdt3Ep/7F1S51hyLoUyhP4RBk3n16OVnve73vvMUKHWNRXwPYbzvgCOaGITy8MN6DXhzgnRpLdBQABRPsGrXEcCFGxVqpudKTGoOnLS8qPYFAVitXQRpeLga8KhmQoDnyLYaKpdX3sKcko0xP9MEiknV0iGdlINaDQG+gFtTJ82gLGvp0l2Nh23ReS4C6k1oowR+zjQQiNpJmReXkj2eid2AWwCvbTcaC0Co2iNXHTvDKmj2OsaXhQQIF+Bp/YOe4l0J4PpxQEBgIXiUFt0CGMir+ahBhAL3FIKFkjZrGcav/jlTPiknyjOF5w9cQAI/Qx0+Ck/vP3SkPbjAfJxjrMoFimAPy5g9ZWftAVRIX6GL6osy34K0qIQFLLiqVbqOCdu3obF4OqZQ17G07GbAs+2AkgotKQAosH2Ejlynt/Ko+7GsBvaUL2mIW4Uux2Slbg06vKqLv/K2+xylq9/xC7zIq0/36HE8Wtj+3nj10+lLNnv6XFNOvh/c+nU5PWZlQmXPFw49/R6uX5P5Ub66v3G83a/Xfj99TyKEnwGLKUdmdzMOzDjw2OTAVYHFj73mVe0XX39HfX+GzjUdLgKbo3MZ3R6Ahc/pMOHDpOMcdd7e5nHorbsfR+TGnWxGu4ZwdrS9d+8fBB38aCgod3DhKJ/+FDRqsyuFFIXCMlPqQmgJDdCNICDtXnu8nR6FEEc9BRZnEXzOKcARuf4MP8eIoYKSgs9EY5FR+hKcIuwQxsNJ3vrPyD3mVGcZsa+zNCLG6ai6S7Y6kl0jtZoTlQlOmWeUABPNhWBiGbiQpoAL4grAkGXeDzT3/EmPIOwYwGKfcywAFPdwfQBtxV7MoRYQ+Nyh2zh2IbzcyIozNzuaPIcAT9izzKs4imB6DH/HARhIqW0d+ZlHcF3PfIs5hKp5PAoynKvhuRZBUg2GgqhAKSYl8pyyyzKoZwAQzNtQc+Hk4UOY5ggs1CxYDgF60CS4sPL0vDiXQvBn+ZNVaLbcS5sRszfSDQhEOIwGIoJjpa9/yxqyI/SmDil8I4Aa/0bAlIKuwCJiWgTIMpUJTdQL521oFuCqUDGDQqgNsMDEx2VrBZmbiWPn1q3shbGj7dp1Tdu2g832Nm+bCqvGG+G1AOGk/UhbKrQlRp2mPHq+M2KOm/RLryBJgJGmOUhSvR4bh5oKBchyoy5rLgawOAvdLi3rvBf3tnDpY+e+uBSyk6YXXXWLSfaZ7yQN0KmZkW3C+lFaQNoXZWkbsj1o/rQFrU4mbpN/tX9ZQY5y2oTpmitsCRw0IRNcC/LOSAf3AsdVbPi4geWLt8MzzaPUVpSJXOXTPDgnR/rUWtjeBQ7OkRGsCDhcJczVrGJCZ/3gneHwSv1z3kxpW7wP0Em7rfsMGNAnVJkMbY5n61HmUtDuArCsa5Sd9WyikQMUDqUAUdX++mpQtkP3itmPtuKuRw61h46irTkJL/tEbd6fh0bBoCcEi3apnNZvQJJgFnC7Ak3mJeoxajr2DlnXbgBY3M7ciqcCLK7FDGo9NKw0rIeVfDhy193jVu+mPqo+DSEr/R52FE/4PbhP/HZ/g4O87kfir8bWnVIWPlif+zEO0928fip+HtU/L8bAIjzVM/RdKd7UkRFNPd7QNs1ShR1IvyLd4/zqbxR2HKdhl6eZoHiaAYvOqaXXYycvtDsfPtMeOuQeRpfa47avaV90C321u7d+jo8/uvNk+9AnTrbv+ort/5+k/znO7iy5GQc+bQ5cFVj86Ktf1X7+9a+bfGzslO0rHbnNKkwR3vkw8/FV6PGdHWYEHoSefCxxrI6Ub+XQ0/rcj+mdgRMBFwPxT+ecfpx07Xj9SNdNjTgpvmeuheHw0IGFQlpMoRiBdEK1wKKPRiqodTOogKPQXp2TZEl7AQtGy/mwa6ogHeW34iqb7BqpVIDwnVf5E3ohRzMN03Tydp+ke54RfsdTpVs6HPUUWDiK7ch4ToGIoEUtB4JP2XYzOqt/BWavgos8Gwduphv30bXTw9X3Hsgy7SQj0IcY4d3HijQPACweRkjbi9B4AIFsEWFMUXQX6d+IECOwcPlKC+MUI9lHEEwPMVK/wKlgug4NzAbeCSzWw8INmJzPgwwVyVerReBcA10uA+x+AfJCYUw+KfRmCVSBBSPlxxg5P+QqRIxka9oi8HPk2HJV8LT0zbdXBUuXQlWwz1wVXDuwqHCWP1oNBMj1CJMbyIeb3m1mVF2eCxCcIKxWxCV2rQ8dbEYTJf8JD5NJTsHeKwLgICwLjBzBVw48jYB8HH66VLJCvtqa9UO6W1k+eCsb7G3asp1zG/MTNiaumAoNGhfbTcqP+KudWJnrKCHIemXyXqscexsTXET4nAaB8wWYorWgLmuXUv6py5S95XYeWs+hgdL8zF24TwAoXBLYVaIW4b8g4yhL6Lqho2koUMtnQZhLL/c0IT0CuyDMUX/3Y3HPCvf9cE6G1AowFOhtM7aTAA7i0jzoIu8EFq7w5QaZ1vfNLALg8r5qEATNtj1pEIzUDu2YQ1H2mhUa3lWfYl5HHLX3hoBPc0NNDWvHdudqWP9KO0Fbg5Y+p8Kr5ZHr4J42ZbuiXqRdpd3xnDzY7mtgRRBiO9RPPywj62n1CfLqUjsEvQ8sHG13PnIYgHECcI5JoG1N/GB5wSkHStIvymPLDdrd+8YduC/RJ6xgQjxMaavh8w7qsztrPxGNxW2sBrU9q0FRP4Z+tVeHqi2dsn7Fn7f+dI+mrdsQ3tslxyiiHqTeD3EZ2n/C9zq6JPyjPOj/0w2zJKoxMctpnDwPGQ2J0DcOM0R2JRo6bT09c5q8EZ3vPqVjQgO+hyDRmA6Bl/NpuoY1AABAAElEQVTKUpgBi8s5+xf3nmpvfd9R+qOLbSsfGbvBIycuIHesaP/di3e0m66dzlW7PPT/+y6/8gdH25/cdbL9T9+0q+3Y7LDb7JhxYMaBMQeuCiz+5U/8+ARYVKea7jUfUwWx2lTMj3IJvH6YoklwtFDBIR/J+nhwS7fpn0ceI
qwt6Xt1zsemQ5AitTpgPgp+DelVfPbpIl+JxGtPU2/5oJeJVkYU+fALLPreBArpAR9dMCAP5iOj/sZJ+tIfYIEw45yECLVErwCREVo+9KUlkI5K1SviBXHXs8KioEKtRh+pVUg1VzXqieAhqBA4YCIVswqFlgjDHWCUcKvwsnqYPN5tusvufgQuiLsDDXkTYcdrhOKic+B6zDIWMXsSXOwHWOw/xWgqgthDCGJHtEEHfThp+waW87wBIWYTJicKOO53sYCgeQBh/gDAwtWc1lDGmyiDDchVm8AALk0bYAGAcknaAAver8XdTfUydwRgIZc0h3KlorNoP9ysTcFWO39t5RUks3keAnqAgIIWYRKOAnfk2n0snBPgRGCFXmuEZamWKmANujXFcm+LEng3tK1bt0WAVNjLXgkBJux8bj3Bn+VrDZW/7hGiwCmw8LDcpMs6rdDoqk7OlbCO1FKn51IvBVHONVGYFsxsYIlZ97CYZznetQArjw4ce33swK8LM1XfSZo0U9epk/3e8PqTlqJn2lbStoY6XKC+WlxG9q3LCOdqAZy34IRuQcbZ7JLuTtw138WlZ48xId3VotQEdP5G2wSfKJBMrlfjoAAsgE89B1wIMDLfAq2DV8Mo3DsIEeAN7ekXoF9aCxSgGYQOTa7ku+1lM5oPV0oTnFjyakpOQWfAJGXuylaadAks0CMgLDIYwL1xHEf75uR/QacA0v4gG1gSr2Zuxh9tpX1BQL2DDgUseh4FEGlP9BO5DnuPdGARMAhdgssqF8HFtL31eVC+O81yzvswM7ubDRQ/cfBIO7jonhvyv1Yjy+wgeGo+U/5yJsxRI4tGxnfwoQFKV6MFc/+O65isfRuT5Z+whdWgNrDYAHnJxG346mE8HqZ/+VFueVXeyv8Q5nL/U5fB+8QhMfWIeGm6V05zEmRy02n8dMJMAvebKxIk+3wx5L3Th0vSHMKMObOchjFtPSlaYqI0f77vfpbnt9LWa6UQ/6My9fvSj+VhdZ+tCtW5U9e7HjrT/v07D7Vrt66OhmDXlhLkjwIs3vmhxXb9zjXty564dHffpTF8dp7O8510cYjZMePAjAOXc+CTA4s7XsfHon+kStCJYMXHzJHAMilwdLlMoRS8YqYSwQehh+felXrtT/avdqw2Tc/eWdvxeqbjTrvNjx7SWSeMnTvnRV5F5S2BeVuaBQVET+nzdA6EtuEdQCjIKTD4PAEWCBEe0qUWRLvomEMhAFTaFXdGzXu6Q84MuQYBwLgiRIU3ZQIUYIFgoBBToKKnrfDiBFeEWWjxvYKsQqdmQ04aFnhkZBV/0iqPCzT0kVTDKQQXsIsQhB95NAUVllmyxo8TRGsi6QkEnsMIYAcAGO5r8RDg4tCZ8+04w6gbSetaBOM9ml2QF03CjiFIunrUQQS2g4wOn+J5FQKdS9JuoYPdhunKVq7r0F6sdq8LwIVmUWt5XgO4WAddawdAF/BGeIHFeYV7BVwmdGte5Iie+dC0JkvXIuT1+sCrmLc4cu2O0QqRJfzW0qTy3wndmuZkBJrn1AN4mSVo3awQN8vYdM4BbjRryryNlL+rUrGhHnVW7ZDLzOo/AgBMzDwDykl+W04CQVfwUWthnbc6uDGZE/hdJUsw4Y7btekhAi3uVthMTOa+x92Lp7eBXseTLvEvF2J8noAK7hNuKGLz5nNAve/842pDF0xkMvdwH2AB/wV3MUljrstRlwIGXCwC8gQbrhSlhmjHju1tIwK/QHdhmOsg6HCU3DpfK6E5H6XmKwgsUvdpdxAQEJL5J/DZfNmaNYvSrec/wjm8dUdpy09NQJaBxt8pgIQrellH1LR4kknSro0WyXbiO0mdkN6TmnwBhk276oDtqtpXtFrEL3AR7AboDe8ysZ4yt2yqb6Cdch9toVfrge2feL0xL7Y13/f2WW2UtoYX59/cz8aJd+0/kPkV7ltxhnZWwIIojBNa0p9ZhsZJvm2ranXOcX9+NfNy4Keat21M1L6dZYxdYtbVoDbTPp3fpIwTPsqI0GZkHBJ7hYMmWf68+N4wHqZfd0PdGR58Nb294p08WV5nl6Tf07hiaEmYhn8UL0v9jPKQdEYEWu/rGDmao55NXqYMuXaahwBx7+/G0Yz9Td73QEM85W4iVTcmr4eEO1W663ccp24zYCEX6pBXP/OrB9qBY+fb//rfXMsO8/WN7u+XX/W/F1Opg4vM/1u9ou3etqZtUY0+Oh44cLbtRMuwmvd/9cAZFlxZ0W69jhXXOPq7eT5cB46eb48cOd92b2ez001LtRLHT7O6ImncsGttvhUPHjyXOJebZZ3A30FovxF/vRn6bTlAvOZJ7Ys0roGW8WHagpatG1e1T2D+dYZ1qJ9wA5pWGzmHcyXN5+LJiyxpzkIrgK2lMYxjm93POPC558BVgcWPYQr1C8yx8INvxbUTrNFchXLNTfj48lFUMHMUL6CAhhOzCUYLXWa0d8Cp+L11EVc62HEvi4vCUMWRsX3Clr98JLjvcUiDLdWPsY7SVR98zZI8ESgV7qEtwgRCnCsD9Q++goBCQIQH/Hjt7ywCaXDTOEdlYw7i6CIf+RxDHkxbuiJkktZ6hVfSUVOiIKLwkvQRkjISikBaAkqBgeQB4jUtuQjNBSzQcGRjPCZvR7gSWBS4GAsuEBt6pdmzTDMqDx1QdH7IJ/nWP2DmTYFF4HQSwesY2osjCGlqLg5zfxyTFETpjIJuQSh0xS/X2T/CaPABwMc+zgP4Pw5PVhPXJt7vIq874O9mOr6VAIkVzMdYxUj1ehLWBNaVoxR+nNK7AncngisYagp1CeFUsw9qT+hMuVinPIdyUWhXQ5KVf5gTosAroLCeOSKuaY729FYGhVwFRSdse66LmVnxUf5bp6hm8Stv5JOdePiDewACby0P96vIBF7qk349LTdvi/e4k5428n4wyB4CNmZBTvZGCJxj1H0tSwevsSwDJKgDBO4j5OY1mQ49JWhImUe1NUvu8sM2ZX3spz6MwrzVWe/1lz9oC48AUkuBBSPnAXQKu7UM8sLCoXaYHdIXDh2C306ip5xoBzt37mxbMHGyXh4GeKgxcrlgl5w1HwI6gUS0g/DOjfTUZAg0qn+grpLZaGkoB/moOZD57KZJ8lR+qIUwTSfIO29C0yZN1iqPDlaUhsLM2t7LbNBaW8DQcCfRapwlDnkgfQ4sqMnq5kvRBFImtRkl2hbKOoA+Joj2Cb1tVRnTCUIY9SWppCrkLvXAOkEd8V5QYJoCopPwbh8mUPezeeLdB9h5Hg3hSdqZSyILaO1D3JdCMHOJ9Jyo7ZwKKl/O1Cvjw8+cc3aYV3E9560Aixs3zbdrmbTt3CblkpjnXV5V4tLbfhg48iOdk2N03/0Xv8tPd0sFmwSqmx5P9+Oz5+RZb+Stm2qNg/ew3a2H6c/jeMZu/b7yJI0yoVwrDtzqP+9IPkdyM+R1edrlY/id+Onhpvkx/iuFNYnEvyQiHwZCBhr664pjqeNsudnOnRL0/81vHGzPesL69i1fvnX64gp3+xbOtde9ayGgYi0NwnkYyiRf+6zN
7TlPKo2GffQP/8Le9pJnbGofvudU24+A7/HPX7GH9t7y7mu+eHO7H/Dxl/cxj2s4Xvz0Te0rOfvxjg8ca+/+s+Ptn7x8d6rcv3jT/vbE69e173nR9u4l19f/zkL76IOn409QcN8jZ9sbfpeFMtC2WB+tYhsAMd/63K3tC24obbYB/0/yvGm9g2KtffBu5jNyvPIrt8fP3XvPtDe+5wjzHB0QW0E/A+h43Lr2Hc/fhtkvfdTsmHHg/wccuCqw+Fev+QlWhXp9PhB2pv2MYM6HUIFBodyPsN2jwMAR8dhPD6AiI5KG5X0JDtOOtHfOXvtp41f47c+BGOmXq3Oujwbx8cHNx4q4Y5rgNffQxbsI9goOnI4Q++zovodxCAiiYQiwGAQJ3AVRpESDZQUnhGlHSd0NuJvBqCWJ2YJ+OV39yP0rXBHHq+lJWwRk0utCU5k+lcCscCVHjIfssk49NEHHWoTg9UxgnRvARQcVJQxNR0SNPwIN8Xs1L3WlHBByOqgoXlV+e77lq/xVK+OuxQKMEwhgrhYlyFhU6IEmd+/NnhbQKYg4BC/cqftBtBvOyVgkjOMkW8jLdfDXZWnd8wLkwDKkTPJlYjfr2DDvYmXbgJ3GPHxdhRYDaRLeIqwiyDpivgKQoSkLfWh4Frt20pbffZ6Advmu+qMg60ZuLvWqjb31LnUQHlSpRfZLnXQ/CoFBND+UUQd1FE54JKDrYM063HllmjxEAFWIttxwCICRd1YOXk8P0jZO41+1ikm0aCg2uEEc9SFpq3mCP72MLKeAJsrbcvJIvKOrbtLTafJ5fAgoUo7D1XvIKvmFm95+ur/4hcfZ0wK+Ci6iwaAMLwDszmqSRlkI3g4fPtgOHHik7du7NyZnumtKtWP7dkzJMHFCwE05oDVyZSeFZPuDeczm5Ln1vbNHAFWaJ837MGuTr/AzGgOuslp+dKBlOAFQJsejRVFr4rK0ghvEOtq19VxfvY+gBRFHQAPCt2mXiVcBV9uw815sa+mjSN9Fi31WS2C5rCM/LpoQUASokJY+0GBavZ+pQQzCJXTVAfOdE7r0px/7QLV5Lj+8wCIEe5mr8uDCsexdcZz25TK+8mwFZUdCgAo0X/Qbl2hHboR3Ub6gobzEO+uofYVmdVsBqq4EdQNaihu57mTC9hbaHLLFElDReW916If5zWH9hcbhgXIdbr2MHyZ+dJbzHN1t8Bf3wc378lJx9+c48pO3+u1x9Re+wy3vfTfE1689nv7sNf6H9Iym+/F+7M+0PLwk2iHuOMa9vjH9+YpXAvekpHIc/zjdykCl0/13lkPBMhqnKZXfgWeD8ytfPltutnPog5841d703oX2MkDFlwAurnZ8BCBwL4Dg2V/A0teM9J9ix8mf/+3D7UFG9v/Rt10bobsDCzUE1+9Y077+S7YgoF9styOY224FHQ5U3MKcjZcCSBTUXw8QEGj8/W++JloJaRgDC/2Yzl1oFv7xd+zm+1/lKbD552/c154MYHj5C7bRH1xsP/Gf9rftaD++/XlbMe1aw6DDOUDCQjt8/EL7B8SvhsJDYPEw7+b5bn4bfqVXrYk12jhu3bMucayi8X8I4PHLf3CkPev29e2bn3118JXIZz8zDnwOOHBVYPEqNsi7401viEBgp5om45WzwAVXPoaevguoQFh1JLl/ADKSFv/lN/Hw7BE/w1VhKCOwXnPW+7qvzrk680o/8Zg2cZWWou6LrhI4C1QwcZQJyAEW0on/ot9wjAogSCjodYAkIPC9o51OyO3AwrkWyR+tO4AAP6uxvTYNzV4CLBBEndNheA9/FWhMx/hNy9F0JwjrR8HK+QuOxTpCvpalNjdswCYfQccJ3ZmDAW0lAFccCt7heb8qfMiH8ALaB/q70NppCT0DXXZQ8rLzViCoOc8pzKNOobHwufzAB+6PIgw5uft+gMU9mHLcj5AvEFlLfNsQzpyLsQcebyfvK9k4LxuhARicg7GBzs/N9jYCOjSdUmNxAY3GJYBBnUyyRQAUjFj+CpYCOfc6OE16ajaspO5roVmOAMMNz6TfHbKzPCzzQdYhZKld0ZbfdxHepQe+Oprss8Kjgv9agJv7BuAt5eC7Xo/NuPcKq67OpVilNuQ0gm5WGaLQLEsDK7RaJywnR743bN7K5OUtlOHGQUA1/aHcFFKhzbh7nfBeWvtpGXnor5/lMv3VbwcM/Vrhux/rVfnp77NyVNymoCJAmTrtnItsoAfPNUcTWOwFVNx3333cL0Rr4AfMfSq2Mvl9M0vBWrfUKLi8q8sUW+fcPVsNhXXR9I3fewGH76z35Cq8622hj/L7bB4EPO4zohZK0Ol+G64Wpmajg2sYn/w7ABB/0EHQ8NlydMnVvqCCZRMecPVQsDAuuqgAi7XQtp6ycj6HdUM6evtPuyGfNuIOKlJ+lqE58epJ3fLkIQs9uDjCAcDQflY3249W5yCTtRfQsrkam6ZZ52hjFwRj5ENTupWYYq0CfF6C9gvw63yuggtoWbuqbQGsXcM8isexb8XjMIPaw7LH2633tCc3tDQeD/kXmrlP71POS+7HwrH3Q9CE72FTl+IyxDPcexmiNLEl6S1JW56NDtOZhhtejGj1neE9loaMU7kvj3PwP0536pt48N/j1H0afJqC77ufcTzhw0Dw2M+YuMvi7/6nOR0SrRc9naJx8By6oJOIdel+vu/l31XePge/ttFe7v06Trbn0zbk+bneIM85FL/N6QRthf9P9/jgxwEmv7fQfuBrd7YbrymzJcGDJkg/9I270C7XQKPxdtBxE2ZL3//VO6IN0L3P8VCr8MzbCtwsBxZ/xuRytRNjP93t+/7mdrQZc+033n+svfcvjrcf+oZdmFdhEjsc96PF+D9+82B73lM2tpeiLfEQWDxw6Cx078ocksFre9ufVhw//O27l6xG9W/fhn/MsX7ku/aM6noPNbt+3nDg5L3tPb/yy+1d77+r7T/JtLs9X9Re8NVf1Z7/N25p4FQOLFDu/XB77zt+o73zww+106s2t8c95fntG77l69vTdiK3nl5oD7//l9r/9ab3t8Orrmlf+LIfaH/n+dd+VthzVWDx6p9+LTtvv7H6VPpku2U/sl7ro+qHtYRZO0c7qr5MaDrK4QOgXzupXPU/7enToU4EIfw7cpAO3bQ4FYq6AJzRZBwNn/iIK4KaH1jvTYOrwoFAQrMMhR2vzgUJrfgJbVydZKm7KwDpX8GkRjZZzYW8nFGgRIitlWwQCBRMoFHbac0WBDGCBCcmb0AIzeo3PIcO3heN0qR2RLMsQAVChEKWTDRf7m2h9sD5FgKL9e7MjOC7huU3FTwUqBR4ki+v/d789ntpSf6nwMIyGvN5fM+r+gTK74HHTlgXPBWAEvCQT/KrRmOBuRX7WSLzXkw5PoGQdDfC/SEESsxT2zb4dzOj84/D7GiXfHYAn1idmCuQYCw2k7cdyEFcitZCEyo388r4DH4uwYNLpO1I9Xk0HY5QL7IEak5Gxd1XweVONYVyeVcrLVnOspuu9OPym3OujEP6aqcsFz1Yzu5rINCVP04KdgK3gn8
BC2sYAFEe814eqfFxlFjeO6JtHAIKl0bN8qfEZdoRKOW/dYjyci6FwEKNk3NDqr4VqChTQuOuMjGtcV1MfbRQOHo5ja/j9957TkDDIDRP/SjYA3goz5jb6DdhFA5snwrWw1V+kzeBhSsrqZ04cnShHXhkf3vg/gfaYZaclecK65o6bWTUfCvAwnlLaiKdIG37kIfOi1BjYF2Vh6ZpHnWbaC2sz4C/aAEIE7CAXzUiEzqob4qiqQvQlHkYxkW4izS8LuQwTJ9y9b1uMrc0kAOApx12Pl0gTv1Ynz1tN+58Pke99UybhM4c1gFvvNKv5AqN3lvWk3bm+/i1tjPPBj4ehVeH0FI8jPnTfjRr3p+g7TiB21WvpMN8aj4mD9RUrKburEBrQSVEW8Emmpzn5QnXbfD7elZ/ejyTtPewG7ybVm6mTmpauJq0bT9cJkevAzp1Z9hUbX3iq26s+b4ZcluOceFnCFxhl0bQw10xUoOOCepEJHYeKEcP6ez+4jK41/ueeLx+yj/jOJcEShmVS+dPf9+fHy1s3heBQxBo49/+3Xe+StjhbfzHS/lbnk49Fw+8lwc9ekvCY6axCBvy864/W2y/9YHF9t0s6/qFN01NhaY+Lr9zWVrnKBxHQ/AA2orf+8vj7b8FmGguZNsXWHw5plHf8KVblgR+tHcHievVv/xI+1qE/ucj/HssBxbOefgXb9jXbkbT8X2k5fGL7z7c7t13tgkCaOrt595+qBnXP/r2ywW5f/K6ve3xu9e1VwJCPAQWZ85daj/EqlPj4z8wif2BA+fai55edPR3H7v/TPvEvjPtH37rtZfNKel+ZtfHOgdOtA+9+efa2/fubl/6wue1Z9zU2l+969faex/e3J7xwq9uX/mUne3C3g+13/3t32t/fObx7ate+px244mPt/e/+3fan154Zvu+//45beO+O9t73/DOduHrXtG+fOVftLf8h/vas3/kle2LVh5r7/u5O3B/ZfvS3cxtneLtz5hpfucciHCu5fhA0XDp0k/+zE+1N7z5Tf07M/ng9A9SzBL8kHAqSCi01IffqKoDtbu0Aw2o4Op9Pipc48sO2rB8+Ceggue84ydCRt4rVODOfeIchLMIhMuARRcwFDRj541QVEJPje8lHmIuwadMKDJfBIGpm6xE6I+wjeClAJYTAYj0Q0ayXQDGnbU1g3KOhUCjRqSnQEfBVfCiaY1CjDbhHhdcfpZR2nPkfQXaD93nsMkXYKzmvmssNB+R1oyoEle/z3XgQwk9JfxMeJxUiv/D7RUv6idSbuYtAlgBi9JksBoUq9jsR1NxH8BCcHEvAtQRhCXGmdFErGo3INjvhsc74J+b5KG5zUiqJk6eSKHEK9BwXoZ7XtQStOvUYlg3iCefWN5pS6+AewJAcRQh9xirE2kv7xKjMYfi3rJQS7WRFXLUFKm1UGMhsHMSsUIoWcl8CYXfMwh31qONAIotm7ewStN6eG91RjgD9AWMyEcFRwQ6T4GFGiYrW5nm1IRhhXDrvRoly7IEVMxpAITzGzZFy2EdsA4GEEYotaUO9Xao/5ZdP6z/nmkXOPar7733XT+6X8up34/fS7D1OwAfXnY/AeiUQbXPQZDn/USgR0B3CVpX51o4fCimUM6jOI3JmnFQrPB2ddtMR6H5mTzp81ykLSZE5Mm67gnhod18usKXcy0EXJVvQYGanhLmbQOCxywhS74MK3AR0NkOeUx520fUYhDV1wj4u/ZKPkmXcztsRyk72NbzW3wgIvwp0Ft2bkbpBG7bl/3IlI/Srld+cJfmSfsa7nW3I7DtukP8YerlATYU3Htkse1FW7FAfV1ES+HqMdnEE38CCvkmTZlboZZEYAEfLkHDBerMeeJVc2F/9DhXftq+qT2ZORWCCjV/aiksC4V+D0nsx5R+3H2XOm7NW3pMnnu9Gkei1yHOAIvuB+dJOO+mD0sjH57CuxFtiXSISzrzvsepe+LrkRJwSdgrJrHEsee9xzt52fNGGuN0fd/DLHfvYft7PA5ORdcEWEj25N3gZ6C70zF9P4m134QHPbi9g0Fncywm7Gl91P/rMFl67pOXCihTX3WnIP4mzIqc07CF+QlOpD7NpOdDTLL+6wCLBcyUfvyt+9tXP3Nze+FTrwwspOAtv3ckZkk/DHBwQYkfwQzqizHf+sYBwLzmlx6hn1kRTcly2n+UORqaQf3ASxlW5ng0YPHTTGR3YvrNaF+WH5pLaXLVJ3gvfz97foxz4NLH22/89B3t7uu+tr3kRc9sTwC/nvzoO9pb33ugzT3pOe3rnn9LO/aBd7T/8v6728qnfHP79i9j3tClg+3uD/xOe+fbD7cv+Fvf1Z567oPt1//ve9rT/9nL25NO3tPe/po3t3V/+x+2Z9/1s+1nHnxh+3vf9JS2ZV4z/b/+cVVg8dqf/an2pre8eWkqSXXoRO1YeRtQQQ9ZH/HqInt/bgebE3/93gjT8fLOjtezh+3P+vFQILTzdQR9CjwIT+P1g18aBtLg3jh9ziowfKwFFlmtRmCBsKd/43fEVrp9FhREWzEIgmW2hNAgbabPzxRYADBwMA7fJz8DDcZTZ7chV0tRWhAFzVrpaRBkIlApAApYEI4QkIoewcUAPhRa0VoofDmymyvCRwRW3Yg7+Sd9r9PR1KJL3vXD9x7heXccXcN/84Vb7hWCeOhCk3MvDjLp9CFAxcOYJO1DYDqOH0ea18GHbdCzFUFoM0AhoEHBm9MYz5PHaH4YfW6YQAks1kPPBnjifg9zhHOJ1rVcBQuWuMKXy6C6K/eJ40djDrPIhOGjLNuZHZsFFqTlJnSOhjuXwvLNniXEY37Nh5XbpUedmOx9NFP40wTH+SVOrndVIEfiDSN/AiwoS4GFZRbhl7ic1+GkYJds1W3euTBoKdwtfQ5he47nNQJC82G5c1oXe7mYL4/UGdLp96lLPHh9tPKJ58HP2L/3tpvuVv4KWOjuCHn8wPMyhxLg1ZmN6QATBSyc1F335vM4+1kILk6QXxcwSJnAP0fJ5zE7yw7e8gNhOnmEdwG9XuFNQEbqZ9VF+aip0Tz8Mo8OQGhqaFuzDgkojKtrJ/Rjn+KzbVVuVX0vIOi9gMcREbVblp18L1Bv+TOW76g/cXgYX+iz/VD+gsFMpodW/fa+Q3+mtQIzOe8pPN4VWE854tbnXfjeif6LpO8+FQ+ipXgYUPEI54Jme5hEWcdQ8aboNQnV5C/8N2pM51ZxagpFpQuYEFC4CpR1cCMg7onbN7cv3LkFYLExAN55TytL2k++8iOdw1EtmAeyPQEevhv85H2xpEIM/Bk7mS9J9shk6yUvy73z1ae8HvxMKTHJ8dPl4fp745rEZzw97SuEr1iu/DuJY9nrno7O8n78vMzrZY/GOc5FZ4W1pKdnuxofU7INOYSQz6N89bAVrqdQ1xmwmHJz/8L59tpffaRpnvQ/DEL39O3SO+cZ/CmmT5pN3bK7BO+P3n+6/fy7Dn9OgMXHmWPx737rUOaDrAVAvJG5IX8Hml05ykONxSNHzrX/jXkY48Pv7D/+xb3tqTfPZQK27x
4NWPzHdzJn5ODZ9o+ZND47/mvjwOn252/92fb2g7e2537lV7QvvmVFu/Odb2lv//hc+8LnfVV7ydM3tHvf+7b2hx871XZ9xXe2F9+mPpvNjz/2B+3d//kDbf4bf7C9aP1H27tf97ttw/f8rfbc9uF2x2v/uD3p7z+7fexf/1l78g+9oj1rO/sk9e7or8neqwKLn/rZn25veuuVgYVdZibX0mk6kugIpB23bnbeXeAYmwWF1mnPm0fjKWHHD4xxDJ9HO2OOpJMr7n6ncc8bP/rE1QVChQPTcqRPYKGGojQWCpw1+hyBk9CO5nr43M03IjA4Yp04jbcEVOM1zcyFIH/RrECAk7g9pEE/axBaFFwVbhRYjVeTEQXMcq9R0i7QlFDlyC3zGrBtV0hRHRvBDMEnQAShw5FVN5WLUEz8CrUFMqCPuGNWI93mf+CJH77xB3TCI2gdf9S6n/B04Kv3CnUBi9CjKdQiQpLCkuZQjwAwNIM6hXAo2HP0dCM0ueyl2otUTOsDYd21/DxnAArCo3MnLiLAboTWLYC9bRmFxVSG8OuGciMLfrZjGnU+k4oRIJnf4CZ6dborN6ZQ+MlSxwrvxEfWquy4cWSoRhVbBGMny1rRs5oQZa+t+0m0IoRMHdnCfhMuURuBGF7HDApQp8maWiP5bB1112qFXeuGE34FE+uwz1+nlmkyIm+9UQCuutnNnqq21G/nu9dxeYz99Hvfd/+9nHyXtmY52eZ4HqqjFbLapXWVvFZ5Dm0z/tWSAWjJx3h+Rfa4gK/uzi0Qc+UtAa9aKxNw3gw/OU86/4UycaM98yqvpbGEd8E0p21uqKsdIFtvi+5q174PsHA+BQK6GggcwmPrSpaWhU7TN+6YJlny1CnzYN4s+zK1sp2wbwZaQMs7WippIg21Epaj82ZsR7pZxk7grnZDmalZwj+/k/zIS93i7lUu8y+/rdfuQv8IWp0HDxeoOHScuRS0EzUYZ1m2OWDOroYACukx9yP8RfizirqzgnqzkvatWeV5zgue9COCil3Mq3jKji3tCwAVNzG3Yo5wMX0iujqI1H/CSJOHdPlLUkkv4MJ3+smbXMq7YfXowdU8To7R7RDp5JU3Qw88dRuimTrU3SRO3xNnT2/ijnMF7XQsj6HCjP1f7uNyl56ObyZhSSLu47xdHnTi0uPo3gcK89448944L2NQ+SzWdmAyhB7xeBJ/3HoqaixmcywmhcCNE6M/+sDpJfMX+vuFxQtZHUnh/Wd+7QD9+qX2D77lmv66vf/jJ9tb0SR8LjQWlvePvnl/u4alX53A7VK1Y1r+858ci1nW3/6anTGZ6kR2rczLnsMEdSZgezwasOgmWH/365bOvehxza6fzxxAFlv4y/bOt769feieQ+00Y8XnVt/UnvnCF7YXfdntaLRPtU+85zfbH991rl37N1/RXnSTvOD79LE/au952x+1FV/7P7dvvOFgu/cPf6X90rvva8dXrW/bngpAOfZb7c7b/177jmezqhhy07Qn+uvx8lMDFqPUcsuPDUkhIaAC4VPbfJeXtau141UQ6KO2zmXQLR1yfNSPnbICrJ1sQArx5dnuOn1xftJ157keTSGjagoaPd4ScNRAKDwUsHAEW+EzWgnMCBQiCJC0ZJsmMwKCcq882c/77JyILE9JPhKGVMlhBC0BACJW0V1ZKf+kHe0EAo5mUQIaeWD6mmcIEDqwMG55ZQEcd1lPhCsFuRXShNZCM43pClEKTTUqHwFJwYh4PUswGoBFeMz9wGsSmPCn86l/0Po7rx4pA8shZTuAC8EBZXoS4c7dgt1U7xDaiiOsw+8StGpvzMN66FgPz+eg3frgZnquGnUq9UGBH0EMofQkQMG9D+ahzwmouxj93orQt4kymid8wAW8j1BOGAqKOoYgjImOo+QK9k7m9llBt2jWtKRAg3VIegwfQZR6p2CcXY7hrWYzmafBHJFFBGfDW0bb2SF740ZWckIDUlqHEkQtswih5E/+nSdfCrQCSc16nLDtHhUuJxvTGtxKQIUMDstGYCHI7odpGpeHV5+vdix/73M/bX+eRGQB5hq2EaEAvYR4r3U/MYeCd4KKAlyuVARfqX9qDqyPAjev1RaHdks68t2dujVPO61JGn40h5vkR5BBfj0Fxpr+qU3SzA/iOIsW6RVk2FagNOm7AlvSJB1Bi+UsuBBsmDXBnPmOtgM6FZqNw7kdm5j3oUbEtm4eChixTCNhXHXJSfirB3DRBxIsVyf2R0OlP84MKkBn2g95mOQLev2TfksrJoKks4+NBO9ngvu9BxYwuWDVMgD3aedSwG/bgfSmShJGeintLCt7iXyvEFhgBrUS+sghJ1oa6usaAO4ulpF1OdknYwZ1M6Bi1xz9CH5YFyFUEF3okDEBFjpwSFv/7QlLtQ2w3vlQR3LTiSOentf+fmlCE1fiISYjS8SD+/LIB+fE2d/hX354jNOKS4+Lh8nt4NcwY/9D1Fe99HSWpEXcce8JXC0G0ixKr+xJenoa/Tr1WSHLve6XJ7k87spfxfnK7/zuaVSf5bv0ZfYdHFficc9n72c+08nbHz1wsT1p17QP/HSy5XyJf8vk5uOs8vS0W9hxHm2E/buTnl0RyWVgNVFydSWfv+nZW9rt7EtxJxvr/SYTpl2d6XMBLMyT4OF9HzlOX7WifQU0jZeoXWTux6swh3J/ja9mvsa1ABBXrHo7NLrXxg994zWEK848GrBwX4yfII45gIvL6O5hDwzz5zK2R4j/azDXmh2frxw41e58xx3tbXetbbc+9Wntide1ds+fvK997NSe9swXvLA957bVBSw+PgCLG+UDwOJOgMVvAiy+5n9p3/IELAVOsiTzwePZTuHcQ3/Q3vqx69o3P/VQ+63f/at29MLW9tSve3l78ZO2Y1GyvNf69Pjqt/xR51hMNBajNOyHeofjiKhCmx9/hRQFEb86vlegXu3KJXzASwAu4Sw++OnCjh1wP3VzFHzyZ3wGMFbuhy6Q+NEi8OBpWp4RJmmZ0RSQZo2WlvDdhfBoURDyYupA4/dZOhMfCRX9fodZajVgwFFXTWfo9HGTrgiqCA3noNV7hTVpVtiMsETazreYR2BwB+bQo3DjiXtGTAUtxGt8joAfP7nIRFg2KUOQMh3nV8wzEr6JXZvXY2rj3hYKrgq6E2DhqKfCEOmOTwWgCEhDLyVvfO/h/aMd9SHUE2VjGXB6VUA6w+jrSeZUHAVUaBZ1lHtBg5oIPMb8Sdtvoz9GPXgYe/mHMEE6gZAl2NgMT1yAIwAFAZbCz3yM6xA6XTZTYKG2Yz3XmEVRJgEXhAnN8FhtgxPCNZNSk1G7Rjuhmk31EHh93wXp1AfLFf6kThkGQVAzHzfXO8L8DU8/rGqVXEp1+7ZtbTPzL5zjEj5TRu6KLtAzPgXD8Eh+Erej4O6+rd+cPFu+ARawUd7FfCb1Yvhi4D7h86MVxDL3sX/vJ+covzK++wu/fObPvBdg9774cwEQlmVnabMCJTUVpzE78xR4ZUUvwiXPDggodMMLosS/czAwScNM6vgiH23qLi0x7VL/n
f96drnd9QA1J2854OC7LsgYn7xbB7iEcDogd3KvidrOqzilNsS2wLskTAqmIn36i/kTtDl/yhWdBIXuWWJbPk9diD+ACSoJ5i+gocCfwEIwkQED8qTfgAvcMsfC3bWhu2su+qBFisPM55D/pQU7hpnYJ9j07p5HDrb72FH7BO3CdnKxkDn5LfPN0ijYBmnX1AVXfxJYgGKZtA3o4tl2JLBws7xtrP50E0vKPoEN8G5xWdm51VmyeSUJdyoGYkJLd+vln3c42h5gWtozifcgk2vCUc4eS8LynDqUN0t/uj+v3Y/X7j72nfdEbz280tHD+76DI2mKloXr8jiXP18pzu4WmoaHzh8fjePR4un0JJiFrH/P4d7n7ifxj9zLTzKrp4ScuBlOl5H/8lNuy+N85Xd+D74/N0dvj6bW6Rin3PPZ+47PFFh84tilduvmcUmMU/nk907E/rU/Oto0Nzp5lnrN4cpOLrH6AgR493Nw0vbr3l1Lw/p+F5vgOeFaU6jvf8mObIJnk/jhX3yYfS02tq/7kqVC+KO9c4fvH3vL/iw/+7wvrDkW7/zgYnvXhxfbP/3O3Wi8p21r7+Fz7Wd//UAK3OVpl2+s5/u3/v6R9jCAwrrlvAiXo/2mL2Pe32jGrKs8OWfkB1lBavmhedgvve8Im27aP9bbTdjFy4dPNg9leVyz58cQBxY/3H7p3727HX/y89sLvvyp7caNK9rJu3+v/fpv3dPO3/qs9pIX39wW34vG4s4BWNxs3k63/WgsfgdgsfqlAIsnDvm9xLfqyMfar//C77dNL31WW+A6940vabed+UB7x/u3t6//7ue3x29bz2DWZ35cFVi8lsnbb3wLk7f5KFb36G99KuwoFfI04ckSjtxHINUn3jOREmFRYXsySsiH08Pv2Xk+wnZsXRhJfLgvBRaDZ9MkvXRNRM5nuj5GtmmeFaQ7sBDIKCxO7bwVjurD3gGG6+F3UKHQbYwT2iMAlL126Ca+dLAhBWEamtUsaCJUKyiRD+hTKNDUYg5Bc4MCD4KTpljGUcKnZlKDEIqbAox5j305o8AZndXcBHoUWjWv2bB+EyOyggo3HpsCi6xalDwUsOjmNoKbgAoFQvli3uRXTllVnXvvkMjSkmPop1LC8jtlQmEJoM4iHLqh3knAwnGAxRmedTMPCj0eLk17kJHne1g56hPYnR9h0vQctLjRnvtdaBp1UsGRcwfP1yEM7mFVJ9fjd4dvl9B0h24FMM2Zsh+JZk0pIcsfmgAXjrZfAFC4Y/cFAINL1RatpVGwxxUsBhBattBVdZW0ETiddHwqE7pr0vEmhF9XippnroVltQohs0zhir9EoRTADzfSN5Sp2qcy+VEjVWcJpl1oKP6PNRbyyUN6r3ToPBRTXku7R/ffr7p735/7Vb+Ws7EHVOiHMhRkaHbnnIXSAgksSpg/dapMm8pUrdqxcawEWAlk5Yk0uVP6CVbrOsKStCeYh+HSsHA376yDAo3SDrE5IqBCfgowdOv0hk8K+2lXYkzaEeEmNFE3BDmaZNk/qFW0LClMypqyx5Qu/smXwFpN3sYNtBFG+n1OPJY35wroz07aA7AIKBcwWr84U8a02yx7S/y6pa2Qq8uABQx1EME27x4VB48dbx/f9wgf+MNtP/MrTp+FNnh8aQAWGYiQbts6adj3WHecpH1RfqJFKVOsVe0M8eIJ7dnadtPWje0WVoG6jdWgds6zlLWDM5bFpLpMbixqCnl8GfpInQd3y80CmoTqL3SeOE7rl1Xc/rAfeulPvY557X1J6ppxjuLqAQLGExF86REmviQydekJ6DL2OPjo6U4DFL0JRt68LgmWTJfvJVFDp3GN6e/hxv56Okk3HvypfjTv8DyhaZL48ph87m7eVroJP7zpPOxuXj8fgQVTC4blMMc5/czuHfkXpC/f5brH5r4U9nduMOfhik3jCc0upqCsMKoiPSh9y5XfGUc2UR18Wt1t6+N4eyTG4aH/RzvUMrivhfttXIkONZ4e0vloh+k4sVxg88l2JH+0OGbujyEOHHpf+/mf+7O24UUvbi/44lvbLqv3gT9tv/pLH2wH9zytvfilT2/t/W9rv/vhfW3Dl3xbe9nT3dPkaHvgz9/b3vmrd7frvucH21ffaH6RA04daR//7Tvae1a9pH3b0+9t//qnjrRv/acva7eu+NP2H//3j7Sn/I8va8+4lkFWvX+Gx1WBxU+y3GyAhbXfk/qufbOdqh9uR7P92HuN4DAQYVcfwZ2PooJ12TbzQVeYwI9C/HkE1Mwr4ENtfDZWYg7o4Mmkyt1GNqSZ6KEjphU2OiPjuQMLBVFNWzLPAaE+gjVu+rMT72Ciz4FQQEznbn6kgegMo1Yh8yN4r7Di4XuFsoAKBCE3wVJTk1WiSGAlwrOrC61HYNjIOS+wUPAkLtPx6vuYZSnM8CfPLABX41GIUhjpo+EuOSu40CxKwS5mVIkLYUlBBbq8KnjlCt0ZIQ9/iMdnTvPH/3CVYSnG4SePS37kQR3FEwtGsGdnlw31ABOCKkdnFbIcbXVE2o72NO6PMHJ7F7slf4wVch7G5Ejb8k3QuR1eSIdC1AmBBfy6EY3FDazstBkzNTUbDtrwn3kbrqyxVjfCriEfPqeftS4gIGdkHb5lQqzAInUQd8rID0vPt6JWr19eQ7PlBs0KlgEJAYDF1wASBEJ55+h2mcpAFWEFbgqhARRDmVq3EwdlY1033PgoOiyHsavRWeeL26mDvL7Sc3cb+zcm647H2L37zQt+BBNWautuaW4KWFiPBWTnAQZqfk5mh22EedzSjqUNvhUgVhPjGvK0WYT9kwCKxWPsb+EcDJ7ldzY4pKwU+HU7DWjL0sq0A7U4qdvEaT6NU35ZfwU61n+1SelXhjI0jsz1ELhQ6J2HcktgWeCo5lfYntwrw7ZlOwhLKT+BvmZQaiqiXeJeDVTaCnSkfXBNmep/KPNeFl77SLpxuoCDdce5OUcAzc6tuGf/wbbv8JG2QF0/FxMoy6O4bz2hQnCixSK/0uOGd2osLtqOBTvSSH1xg0z7ra3k49atm9pNAIvrN64L2HZJZ0JRGtNjfG/5etRl6k864q//cI2fTiBhLgMW+pUnQ5zTeCuuXr+8Tvhk+zLmURjDeRjdJExc6qeHzVPoG71cdtvDj527WwWt/m1J8uShH/3OMD2c105DD9f99XDJUng19RE//HSX+NUx/oaQ/WU89wf9+H5KQ7gTP0M43xLP973ie6cOn+U722XnQ7+Ok9RNmuwTPD9TjYVKhtHA/jiJ2f2MAzMOfCocOHNPe9u//0/t7q1f3J7/FV+CaeGlduADb2+/+r5DbcvTX9i+9gW3tzX3/EH7L7/z/nb3pi9t3/LSZ7SdJz6R5Wbft+9x7WXf/w3t9nkSusDiLA99oL2FhQae+y1f1W5d8yftp3/4I+25/+yV7RmX3tP+zU/ta8/7u1/fvvCaze3T3zlmmpFPCize8Gb2sUgHUx1fBxF2NF1g6B1170bp6vmAK5g5ulsj+X1E177UsGcZ9Y5gTucWk42hczYORabIRLgp
JPJDByfs4PDD50dbKVMhjudsiMXVDdIytwEhw70r9JcO0zCcKx0F52Ounxo5LWVP7zyN3w+9gk8m7uJXAcT3WRkKIch1+91MTrOlACPCKDCsZsRUO++N2VeBFXAQHBR63J9CYcZTwOPEcgGQX90aSTZvfOhwcwlKzaBKoGPXZ54FY5pqSJOnwvCqUZzRXkinaRgHESev3Hute7le96abg5FVXtftwPt6mv6G334McerCqQBCoVwtRoR07gUcnm6udxBzKYHFR9gY7C7W8le74b4VW6Qfmi5Rbmcpy93sO3ELvLqZ0eYNAFCLM6PN1AdBwhqkmw3kybkbmkgJMmIehbuiu8JsTKMoiwueCKcgDt6UcGXezLsjyOag88S6ViDSOlplI/81wbExWL8DHqC3zM8KHFoPdK/R5yqLVSNQEWF5KINKt7gnN3327IfpedoOlr/r/n3v0a/9vj+Pr95f/jyEJZpJ+cFb22zmUMAzhXc3xXO1qzMuICBA888w/CT/gApBbSZMU+fPAEJcsStzMowDDZAgz70uLD+1GG5kqIbIo4CZ+a/6GMEeXlpXNWsSRHRNiQkbj+VqPC4vnAnoxKNJWo00lrAjO+X5eNU3yzNllHZD+6M9egZYdFBBGYXnxpk2UX3IpBwsK94JKpxUbV2xnbtZpnuoHEXb5YZ3h6njHVScYO7RuWhgFcCsf/Q1tlWA1UX6AQil3gO84AENOu/sg6zTa6HHfGwHYF+3eUO7lSVmr2GOxRbahwshWHtr2eZp/THvHqbTy13e8ViHBT4+hgA6W479sK/wSBz68d+rfW53z11FbRI5uj8eJukPr6Rj5M3HyUGwRDRymrwbXk2ex37GaUjfkueeaSLvaSUr/nCM/V7pOZ4SlhT5T/574ualR4rHvOM6fIkqaNIZAnjpYanHo4f49WccXyc9L02Lm89HYGF1sn+fHTMOzDjwmXLgdDuAWdN7//BjbR9LDrOWLDLL6rbt8U9rz3rWF7Xbrt3QVp5icvZf/En7/T+9qx04zwA5fcqK+d3tCV/6gva8p15TGgiW/D934kC7+5G17bbHb28rLx5uf8Em2L+9gHx6nl33bnxJe9mLv4BNWZFdP1NSCfcpAQs7wy5YRkuB0KBQ5Ok7O1y1BtzQV/LHrYKYwEJBwkmVtb48wj6JKtxkF1pMa/pqUulVh86ngIUChsLKkHb/IOpnABamq0AusBBUKLRniVk+2gILhZoyo6iIQ5PmNoPQ7wfdw3zUCjoKlWospLk0DQIk6ZBRLpEpsDgD3Wou/MBkxBNBplYFUlNReymoFRHEZCQU+tQsRFMSYAA90h76NflBwCW9VWgnBBZZlQhBqJtqBDAYjvi6YOu9ZyaYQ2/eQftyUyjzl3R6emGF6fvGw49nyOHe0vPjpzsHYfrHseqAMocajBKgBBfWB8tQDcYp+HKIVaM+wcZgH8NU5COcB9j/wom58wj4AZtoJ9Zj+nQjZjK3ce5mjsVa6NbM5AzgQL7KazfXYynyzN+Yg5fueRFBzPKTl4RxQisSLDt4DxoM5g/4bAbCH+sJeajyLdCROmvWeBf+Du8Vjh0JL17AR8HcAPI6aIiQjHsBP0FgAdReFpaTvO5HiQrF1e4uH8dnd+/X3p68Lj96uO7en82TR3+e3ldZxh1edT4ERAkKAApOiFdjIbBQgyC/pIWaXVoy2wF5Ng7BggBOYJJTbWW0Rmg6cNO0yonXJ1jBy0nygk0YGcF+NeVewBcnCDQFd1c/wSTwE2gA5JpzstxN3cM6YBwud6s2QBBbi0MYZfmbR+O1Xm1F2qrlWSDcnazV+M1hjrWWCfUCi7QT2+NQRsXvoQ2Semo+RFhmWjOoiVQz57yg48wZWoQWTze8O865iNsxAIagwt3Hrbdn5QdlsYY8px1D33loy/4UpO2md2vh5XrctpDPLfRRmxmQ2MC5DTBxDeBiJwsauF+FdbxKAWZAS+RUGTPUr7RM3XUbHalzy6tOyrPyZlweQ45zP/HeI9MBf5a5R3ee3E0d6v0Q/5Iw0mkc8VE/k2BDHvqr7ifL2+Loc+rgkH6nY4n/TluPK+npY6B5eF5KQcUbX8vCp43glnqRKAwp34vqTmMxgzc6+MPrSb6mySesj5Nj8DQkS5hJjPHik15e+YrvnQT5bN98rjQWPW+f7fzM4p9x4POaA+ePt33339f2HjjGHFe+l/Pb2u7rH9d272D/rKAABniPH277HnqwPXz4BJpwBrp37Gk33XQd35pH48yldvbAne3P7z7Uzlyca7tufVK7eQffzKuY8j1aTGP3qwKL1/z0T7bXg2ZKuC/hu4BFgYou0ESYzQe7BFv7dIUIBUmFLoX0tQiTmhjZ8SuIKqR3Uyh31J0chPWpgIyC6wAuekds7+vIH4lk7gRX0wqwAMDMk5ajmJoWmLaChH6NUzrVoARY8F6BMQKTAjI0mR+8xE8BixLYdS9GFaiIpgU3gYoaBVdyWaMAoyA6AIoaXbe062Plc8ygSL9MIGqksoRTBB/CemrXntVqEE78yElzRntNC7cIsaRRo+3DyDn5FMRF8E0YhBLS0zTKw3imZz3jENriIdwpP53N5a6fOvKZhYl17WCvQIb2ngpVp9BOHEZjcR/Lbt6FechH2CjsIUDGYcAFKo4SCBGmdiBA3c4k1ds2rGd1KPJMrOcQ6t29+ATATTMzbelX8RV2UvgG8ruBsvWcR+jKlfy6V4bgokZ0gXkIt54xzUHILGDpvgkFVi6h0ehCikJo6gbZU2AuEyriIMua2kUzRH0tsyeFa0FdjTZbzpre9PIO2KN8LKvia9VhBQ//4Gz4Lye7ANOvEWRw99pp61f99/fed/fx1brp8/iM3+En7kNY/UZjAZ81P4rGAmBxijk+7guhm/QHjJOf0urZjqqdJN1BMkq8xBdAJtgISDkZEymBxQkBJUDTXSjKFAjeABadj4OIndH44/oDVHjahtdStk7ClgbL7CwCvKDU+TAnWepYEysBqCBhIztRO7ci8ytoyw5epC2yTLA7oa910QOAhat3ZeK2ZaewruaEepP2UCUz3FefU0sjUw+pg25wp4biGHnx/qQTtMmT9fMUdJ1xrpFXgI9azIAf8mc/50phqzjPQ9c50naJWbUU7py9gzq/A0C9DeBh/d9EnjcDNDznrdeWAWXmmSPdY37Cm3Kr54kfHFM+FWLpr3WLPw/9VJhKY4h1mpgOozJOIH56mClRwxteGFPiIVynIfXW5x7B1Ps0D4NbDzv2+mjtYRLfQOOkfZDHTvc4noGwchr4av0KHwwzHD5P3Ia4O/XJnwlPvA/5HPwZn68uz0enlrdD2ApS7j267ktSPh81FuZrdsw4MOPAfz0cuCqwePVPvaa97o2vj8AdoQRBwknX/b53xB1YdBt/nwtUlFlRNBZ+/BEQDeNIdy0vqVmGpi/Dx46eNpMe8aO/CEEKQz73ntmysSPPWWAhI+EDgOmb4qk5CLDAn2HtvLMyC/4UXgIcBoHJ/Gjq5NXO3vh8HyGee90FE5pnaMIluPJLYb40mcoeCIySKmD
2j5YUmwdP5JgSnPAb/46ocj/npGyXK0VQVSiq8KXliHAXquuDpRAsPeM5GwpYfTQ9o8G8F+z08vBzJ59086h7iPEf9+FTmPt6LidIHg79TA/z4uEvtaDKCDeBp+v2CyyOInztRRC7F0DxETYLuxdzqL2M7J6Gd0h0jEgzYRuB8InsHSGw2AzgNMYzCJKu/38UYHFYO3aulolAeyvC2E6EtB2Um2v8Z6I35SJKF3gIMLwi5eZcQfnUJGoFiBKmBQ4KpREBBp54b54s3wAL3/NO3oePCIjdHC1gEH5bToKNmLVx7SPhnedhbvhTZU8CPHV+Qx5pTfjIuwnf4wtO4Nbfj8MNr5dcim7ikvTlNAAAQABJREFUG+IUjI/DEvnkWfcpqCiNQ+ZEIDirZVBjoXlT0pdL1KXpSmSWUdWF4o88qmf56zyX84Q/fRJNBXMvTmX5ZM0FEbwRut0gUXMn01hJOVhW69eyyR1pa7dtu5K/gn7bnu0rfMKvNLrJ4TFM644uusgB2iz8bmHFpM2A040sx+rO6y5ra5tyoYO162pvkTlWXlJj0VeEKm3FCFhY1vmr3DngIbAVTBxEm3KQensQkCyosH7LG0F0BxcXcLOOmkcHStIvCJCo46sxwVpBOz9LfbnIqdncLsDEdYDq66n/Lim7lfq8ibo9z/u1aGusx1kIgSrTR+9T4DxTslX2oXmoBtAzPpaU/fCil1P3Nw2RSMt53MynHvJuHOckrvgfAvX7XvcGmvQ7DtvT9/po78b+J2nhP+5DfQsnBhrHZI/jXxKP4YeX+k+8Q1xjf1NfQ3o9wh7Y+j9JsLczXvp+SXwVYOKV1+Vl6jJJd1R+3c8MWHTGz64zDsw48FjlwFWBxat+8tXtdW+4Ix/5aCr44NdHv4Qxevx01AphdtgToZb7Ligo+GbkcxhRVAjyY6yAkNVi+Jjb0drtVpes0FrxKwhFc8H78tHZjG+EStO0T09aCiakoTbClZkEGJ0uQxm3I/5Oynayp6ZSCij27QoxMsJUFBBLwCmNgeGkU2ChqY7AyshqhFVNjCeCBPkUQRhfJscaJ6c0Sp90ZQlOgMQ85zrMnnRz2U3pNCM1Wj6MqiLcyhfTF2wF5ETbIZgYBF4E3dKQqMEobcakDHqZELb41K/GX99C3fO99Do+THRy9HclKIQeOSVtnArrfWK3Wovj7HFxAPOQBwEXdyMM3o0tuhqMBUab2XsbzdLqdi3mHrciXN2KKdQ2ywl397lw8vchBLv9mJns1USHZ6Ba2wmfrqfMXJ52MwKp+2asQ1W3FvsQl1vuAGM1zyuhxzBq8kL5QGfuh3wW7ZZjZfSyuS7ykvJ0zotXR+zlc5UB/O98H9wmPHc0nDQsNWO27pqG6fWj860/9+tyP929X4v05GISn5qWmrQ+pJNyIYQgaziSHu6Cvw4syozJpWYByvDZfR9yIlBbdwWAmubFjCigV46WCG7+pu3KdMgf9VzNxVkBSo8rk8IBBADLBSY6L6KdEMCsobDn+NnA6Pw5AEXSg3zjtB3WKmms+EV7CyfhnZoBtRrHqEf2HbanLSzJumXThraZcwN1wzbtnhbuQ5KFDwAX0SLaNgfgrvlbwIUC/FBW5sBysl9zwzt30d6vCR+gwk0hFwHKVXzknwZqvyVIVmtRq1S5Kh59GeGda7VCsAkdgoqL3J+njqwBDG8EbNzK3hQ3Qe+N0C6gmK6CJj2ecNnErC6VKDfeDz/4SZsdv4uH+ul1KH5G7v22v/e5amh/U6U7fRrd9bQNM61WBKj6oM/ywi906drjNr1PRsuS90ZRsYWA8buxe14mJZNM6uX0Gf5WHBUP2RqxPgQNFJmzfpTfoqlcJ1QM9BjPKKLwy9DlXCGX0D7w6vu+65V6+5wc9gedx/06Tlg3aewDGJ/p5O1xnLP7GQdmHPj858BVgcWPv+Yn2i+OgIUdTAcW6RTtDBEI/CjaCXkqIGQyNR//PlfBkX1HeRXMEgcCox/oAAfirA7YD1J9KApMqM1AYCEN0/KTNem87Z7toRFSAgQ0EVLoJh0nmmoG5eZ4XQCq0F3AX4Mggv21o4p89JMmNCiwGKVmEtJZnWqBBGnNalCafOCXHGc+R1agQkDQ9EEzC+OqFZTK1lqB2zTcoGv9ACic4D3PKOo6TTYUQghrmuQ8eS0+IqDAx66p8d54PEvQnQq8EXIRdEvohQ8pj2HE3TKJEGXZlNBb1/rOdZMQEk9+vVa5etcPafMYCwxSO5QLZSNPnNStaZvL0R5hhPcRgMUDCGd3o7lw+VlXizqNH03CXCHqekDCDQCM7dYN6DSvpykDgcVegMX92rDDd9PZA/9upsxuIMwmzGUc8V41gAjBhBO956kLmpHM8U7Q4cRXAYt1S56GL/CmPpR+LIkbulO3oCtX/Ko1WpmRc8p0ABYxVxv4nzLATy1ZPGiIwvMSVlOJSHZSb4kzPJUWjp5OPU1/u5/lV2lffvQ4LsFv21PyEW/lN/WJcPoT+AU4CQDhb1ZxQxAWXAgsziFMq6nwdIdz24ES7moEY0FvhHLyCxPDx2glyW8EdDKbOmSmScvlaPuEcCdlO4n7GKDi0MICAAO7UJa1BT5QNmXiVmCe5IyP+H0+6VwGwKhCjHHKb3nZJ3OrVdDsT43Fls0b21YmO2/E5Mm5Fi7JXHS7/CyawNBvG9PEkDbd82F6MNU0S9tWJniPsLeJoGLv0ePRvLkYgVaaaTMWAl2Vcz5KEwP/5Odw8ooRDtoyaWbSNvfnTY/6vRHNxC40FU9hKdlbWfHpOnfSJg/RUBCs92wTUGFc6ezy41OO1AVoli9XOnrduazO4N125DENOb3T8bIwPYGJN2Iw6cG9+xdsJW7jyNt67rT0aLp/n8fvxu4VeafU+EjQgjJMvcx9OdXvOK68/HR+4GPlp+dKPkhfj4T3o/uBouFlD0sAAsXb4DnlKHnTwBUxTuVcYZfT7rsZsBjYO7vMODDjwGOWA1cFFv8KYPELd7wuwkmEfYSYJcCCbNdIWwkdfiQUEiLs81F15L9s0RWKEYYBAIb3Y6zpQEZce2dMl+ut6VzAFr77s/P1o91X9wmn7f057YhNs2tHHGVVWFcr4TKlEwGLbl+aanI35geuse+KMYxmGlXSgK4en4KEdDgKqbYiE7aZpOocAAURgZMTsbPyU8AB2goC9zBObjYvCl9zgAg38drATrvhBXyQL4KRmDJB59LVbohoELQQ8YwE0KDWRyBUgEJbf4GE8wACOHwe7sN/wndBWvMvz/wFXCjEFd94DP/GH/flH7vwOz/hTj2SN/35MfW3AKdmItqaX2SU92xbABioebiXUeZ7TrBRS4AF2gQS3wB9Ljfr6WpRCljGdRp+LzAavA+h8kEmgS84Ekw6exztRbshsJgjLJIy9UOhGg5RV4SHG1jofysjw55OiHVkXKi3kjjllXVE3lh+0lsCeQnm3ltgvf4qgBbAoIwsX3hfWrcy5zOu8B8G9vpu2DEfk4Y84hwfSWtw0H9/H36O/Hb3cVjv42685kNgAc9j4gUPIWZankPcphfhF1
4lDP7dB8SG72Rrd7h2boVLybpClGWohizLodI+NAUzjxGuuQosknd51O+pe7Y1J3Z3LYgTxF316SSTs48fX2Tfi0WABTt+umcLAOYcmg2BqG0m4IQ0BT2nqDMLR44FlBiXgEHe6+cUGgQBkdqozQKLLZva9i3Drun6oy6tBESsVmux1l3UARo8235sixBshc9ZdZe9V6DzGGDmAKDiAZaO3Yf53mHqrNoH8+5AhXQ5gTwDDPDReRju2SKvMvih1pX4LwVUzGdJWUGFp4MdOwEVN0Lv03ZuZtO7+WwKKZiGEk7L0vqXwrWIc1R77U/T6/I609+M69647iQV6wse/e1aB/2bfo6lVXRSJ/NOfg1HD4uH9Cc65+0Q3vvKRlJaEo91qB9j+nTrlCQ8cedIxPnhseivF0Oace0ul18nIYe866O7dd/S0bPes+m1k6C/+IlD9Q+dz6OQRDzEXJmv6Icweejv680SvgxOkzAzYLGEI7OHGQdmHHgMcuCTA4vX30FHqOYAwZkP4BKBic6zAws713ysvPIBd0TXD30mbSMYdEHY8H6QTdhVXvKR6f3ykEZARQTH+kCZdvrsgcHe14fRUWI1DDWi34GFmhJXEMpHgPeXHN3ms9JXjSqNRdlkR/A2XuOB7vpwuMwk5jnQqTBxitFcl5t0iVnpX4XQsQ5hZZ6R9JgzITz40S1tRWlZjEeBZt2godBMQx4UANAkS80KNuEDb0qTICM4+a+cy0uF2BJuS1tRgEINyRhYdIARgZiPeAGLAVRAi4JVzD/gRIAGbknKi/fLjss+/vETwuLT72bKhBuFQ/mihkaTJkd6jwMkNH/ah+ZiLwLhIZ7Pwk8PJ6dugH6Xk52nrKAu4FGB7QR+FuD5Q4Rxs72TPKvhuBnB8QZOzUcsFyfXHkYo9upqRpsBFtfOrWnXMDrsnIx1lKVmUnOkMaeWQ6FTrkKj9Tkn6SmYK6xGuIO28AleCS6yOhdCqgKqWifLKnMrBmG7BO6qg/LQP/89upZA3nRe9mv5WPrru/H7fr/8mrjxawUtU6ghP5aP/ymniruXS7VZ8myYAAoBgJvjMY9FcyQmytse5YO8JcbsCF1axgFYUA8Dorh2rU0muFPHwwd4JtBLHJRJCePOq6CM2PvixOLRdvzYEYAMWhHakmDGjQqz43bSrSWoBRa2OWm2bWuyuA6w6GCBQKOv3ua8ig2Y023etJHN+DAtFARJi8DCvV/QWHi1PcgYN3Q8Q96sg9VOHcBw5ada9WkB7coR6upJtG3n2OxOTQVBoolzp++z1GkHGi6QT4HXBU7rf/VVaGvgCxUFbcVcu0Bdu6g/0t4A7dcx0fx2NCtP3r6p7WZuxUbM+VgtEKqMgcPy9MRlqD6592HynPd6rqPXi/7cy91YLD//J2H1FCfe9PqxJO6lfpfEnUi6Zx8q7qLXiEdU8zphh7Tqbf12+nzq8XtdluNpkKSb1KZuFXjp8/ip523sNuLbQD1vwyEJMTccUJH0hkTjVj9Fq748p9T2kEPAKaEV4eC/HrrTlXhQqRCz9Ql6ZnMsOkdm1xkHZhx4rHLgqsDix5lj8QuYQtnhOdJbVwWUAgR2mI5D9Q4zV4VzPqoTYIEwlhVb+Mj6XmHHEdQAiyHOzrwef32sFdBJh5e6+wnyl9u4CRSMzzMCOx9zRwczUVyNgB96vxOcnc4AC+hxmcrYY3NfI/oI4oAFl7vMyBr3piMd5xByTjC6ejKna/bXqHuABaBBkyq1CTKC7CCQFG+MRwEsk0kRemKa5SjoAILUZKxDaBVcxJ4f/+FfiDauynuWRQ2wgFYEpwIMNaIe8y/cStDj6nv4XAJgAYjKn24KwKZR6SQteAP7hnR7KUyv8t2j6Mrd8AG1HGRuPaZ88BtwwVUTEZfqdA8LzaLcgfsoAmx26ua9Av4a6NCkSbrkG5uRIrjV7t3HCf+I5lQIoIsIdPPwZg+AQc3FJgVMBFjjewhhcL9LgOJvA+BxD2v/7wFcbMbPesLMUQ+2wPvNuK0HXLjKFAVYIAL+ZkK3gibCpQVePKn6G4AG37XTX7tW0xrBhUCQ8qZcO58N41njscWT8IX4Up9Nx3Tl1XDt91O+1rvl7ztfJ+49HiOw7YzaoSUxERpNm/epQ9wH3PgcwZ+2B7+62ZLai77UbjQbQ72jYFIvFdYFw32lsklbw700Z9YzNRa2GeqswvfQvrOcLeZRJzGDOnZ0oS0cOlAb6yGo68cdts849wb+n6auOHfBlZ/UzKlR3LRxQ03MZj7GWsrSPiPaFa7uieKgRW2OV22wNHq0EwEFAOMSdAkQqi6iPSGvp9SShDeWT7XXrEbmO2i4gMZNTdAZrq705FKypwFHLp0b/kKbm9wJPH0mqgAUgcUl+wFAqO8F/tK8Ddpv3FjA4hbmVvisqZ7mMjGZqcpQDdF7jqpF07oUAFJVKO8n9SFPBiC+4Z5cpb+rnx6XtcPDd4NPLj2MLyf3+pIx/Zi8sJ77cohr5KenPqS8NPwQz/K6rrPp9LCTiHteSGtExRBLAo3uh9tOYwicvs7dEjqNs/58N3015G0JFyqeKS+Kmp7UhLaBJ5VWhZlSXr4mfnnd+TCNt8LortsMWHQezq4zDsw48FjlwFWBxate+5r2i29gVSg6PAUGBUc7v3SKudYHqfetaioUSLwqgDhR16VfS/BF9OK1H/USPBDwhrjsePt9hFPTUkjx/fB58RMUOvCMcz5IjmialkJgH81fgwApgPDMd4I09S9p7uDsvAjBQDQFCgjEobCueUvmgnA1Pt0VGrJSDKYbJxBiXXPfkdiJKRTCzzoFTvLoBl4KGqGYBEu41xSsBNGYz5BOaXHWIjCtz6pQfUUoBTO/3NMPjxoAJG3dFO4UVlgGVWAgP2Oi4/X/Ye9NoDzPqjrPl2tkRGRERq61Q0GBFiDajAgCAoJiK2q3DWMjM664TJ8Z7dPtzOk5M3Om2zO2ChTgcs6ctvuM2mCxlK2o0G2piMgqhUAhyA42FLXnGpmRERlLZs73873vvt/7/zMyqqAptfD/In7/t9173333vd/vd+/vbbqs/FUFsI1UUA/R5Eol2KNLlINg+NeFo8waJOZ/0pF/8kPczvLXT1WlSQMufbWa24k29k5RUgI53+KcDARGJKiTlarAsOK3JkFzXbiwrayL1jkpcSel/B6Xosk6C/oMayv2y2BgNILyMDg+p/b4vC6Mix1Smo9o/tMVgtkrA3Gv5ME5AYf1NZuzAThwTMtppQXqrIY6QsGXer6wX5CP0WVjQgoigomRIsnXIxYyLtTWtKUvy3yQb8pPtk1zBOnD9Fn6dLqUVcbTJ73PIwxejyuABDcsxoPTxO9wnygJPMkQQ4L7JmnTJh6dwLCoxgXToTydSjjedAA+VIr7HP1H/aj1LynVNixqf/QIQe1j9F/6kMsSLUYXGK3AeFiSYXHyxLFy3733lJMnT5QV7biUI0UuU3xZgVdbs1vcnHYMO3zoULnumqvLgYUFfwjYLuGuYoiwJkRGPvWgLGTvZ43bZo8WncugkEF4Xu1Iv2Na3qJGI
hY9uiX6sjR8Hwhvp9qcbap5OLDCyrs7ybhYO8dWs6s6EI8RDBlBkiPPte16dujG8wLtbeoD1JlD9GwUa/viixgd4oN7ckr9dU4jE9dohOIRGrF4tKZCHdKuUKwDYn3QNtquurz3hpR6C7YEBVpYSIkwBJrsIQlogou9ETd0oe7eNkJijICPRPJZgNwv5y6Xl7jgjcM4bprD848KJVyPG5WL8slveVS01pW0xDVMMtto5r0WCI2G4SoR80lCX9cIB30D158KYy/CeT8CAMUKUeGrR51rAxlG8b8PU6Hef/Qz2tDj/lFZTGITCXyZJIC+8HfBXTdzsDxx/rq/C6x80Tyw0RC645fqtjQsbvqlV2m7Wc6xCAWpfSHVA5AHdj60m0KFcqGXNC9cr6/QixZFHoUdBYUnLAoP6xY8YpF0xH3SQ/kJAwIFNRQy5wmZqQt0GnlWDnjAMwIQyk8o0HkwHQs8KY8uxmXDQjxM8dKXwsm8bRQSHuypcHv/efEayrqUeP3Bi40KbaG5JCUW3oHna+QeDAumyUiZ8B75NAQF6Z+vvFZ4JAOMiZyXz1dVjJoZ9toX7i59WTWulBxw8yXHgABy16tT9KjLwGdsA8ri2linQTl8NQ5DQnDiI6apRDptQRq0fZmmSbZ+o1Lk4jc9p1QcwrQDwkSesqL4jTSHIkwecLSdR3zU3j6Z24orym70GxoR2bKGgpEMDthbk9K3qvwV9YElKYWLUvLOoKAqf6dkM6M23Ud/Ul1WRe8uKX5/rV187tS1LCV2RizNCWaPDIt5yXy/2uURWk9zlRS7I1Lq9mpF9y4ZF9u1cPu8FOsNfU3nCzjnW+ys081oH77EeyoUCrTKY56+T2+mbLcfxtylMm1foJFBk0PIIvt3tm8VWfPc1pIFcMBkHB+XsldWONqB/iF42sv9JFrGMgbPU6WURthGBdOdVF+U/nXJa8OH22G48RWftkljXkXU/hj3luqtvur1CpIrdQ+jIgxw7vfWv1QePLGGylvMyghY0vqFYxqtuPOuu8rx48e83uIixpz6CLcLxrAIeJ0CleGhtrBvX7ni8OGyf+GA48jEdYBXGYPL3tpWO03pvoQf2g0eMSjOSSRLuk9PaTodh9mtqM7b1L4e/VP+8AzSc0iwMjXlY1iEYbOq0TKvq8LIUFn5PMNowbBQYR6V8AcM3Vs8C5DXBWirzzBta06G7H6NlDEN6ioZtmw1y/Q/bypAm3Gprtmc0aj8kl4dYAQNG2l9/wkKFbZ6yL5RqJ0FszNdCykQ2cGD+1ECPYD/YGDh87Jwla+sV8KZk04gmd6z0+ov/vt8p1fcvmzDUFcRYUpsuPSJOaf6yISRt8gPn/4cWJwKnzw6rSNDsJczz7/Ga6D7N2lnEjDmoBby98Gw+Ml3/mq5+dNvTxFM/IkEviIl8MKrn1L+9eNe8LCs21VXXWW99ktlfkvD4hW//Ivltbe8YVTp0AMwFH8pPHqa5rOahyPKQRoW/jIvRawZFigO+uOLP/O48aETtEIZIYzywMMXlRqfB7Z9BcKwiDgPZBR8DIu4YoEuCnwo8SjjIRZoMOWGr5MYHjmdBYUeIN5zfLGOXaWGefQoFJTNXHCMiyUpMRgWwKNUMFrB3G4MC6YloYxayYInKRmxLSnKaBgZKPsYLRyUx85Q8DG6ODaMAhegMjC805CS2ucXFUoUU6jYCtT4ivdGBS/GZmypPJRkH55H24jxvJAM9UgXwZrQvIBvcpQscOEJiPaJJMspadFqbkNlwr/ntOtF67pUwwIY2pvdpMKwYLtZTT/RhWHBVrNnMCz0BZt1FsCzyHtOsmNHHdTtY5qmwojFf9XOU8ds9GmKjOCoL4fqHZR8H6V5+I+e1xaf2omHQ8lmpAPukCGxIYWXL98o2EzhoR12aQpNjATJ6FTfJY2RojA6wjik/WIqFPKsMqUPiZ8qNosBsYzKIZR25L+Zs1KPYl8zLb9U9BFyzeixbVjUjPFpUBh1HolQ3binmPLEKAUjFBgW+P7qz+iNYOIeg2M5ilOZ0MSw8KiN+htrFtLojj5XDQv6VfYv0GlXGTFxdoW2bNWIxdFjR8sXdCLo8eM64VMLuGnL3aLt8xt0f2Doe02H20FT2WRczM3N6ZyKefd3DBfzZPYuVsPirA0L7m2MBp4/a+LdRoUMipPLumfls9sY2zt79zaNLGxotCsPuVvBoFVnwrhgS+l19TmmZCEvr+eQwN0XuMd1YVSoU2iqFdPsdvksigUZDkylRF6svWDaI+etLGjaEwYFo2yMoo0aFdlf+hYlTXKvbS2Wos41ob93a0vVngekHO1mfAXEhx1tkwSV1EgDEhABmzCZdjlfcI2GYGiToKPfSjCKdk26AoENoskaMfDTjd8bfV4PE3db4CZMwzUbWTayDPqw5lDlscaSbPVFWcwFCrgkdzRMO0BrliLA1zKgXelzPzeeAqXBDfBRnrMrjYlhUYU18SYSeJhLYGJYrGod5OxIK+pddPHiK3/ll2xY8CD0F035hAdFRM9RxXnQ8xD1hYKBEq8XMLsgeSqUFD2UMCuSfDXVi56Xtg2FRlN09TC2QtSn6cGOEomSHadwDwoaygZKJPOtueJLqpR4hf0FFYWDF4Ue9izMxuBgehYjBnzFRwHPNwFwHmWRMgrPOQohVjSdQ4u3pYSeZTcb8Y7jq7mnxQg+lCKUjfqlW35uA4scMEKCnzB6gPeibxRX5e2QskN+TMeJr7fIkLItJ5XJV2WUNxTdqTbaEYpv4OZIRdTdU6cwLDBmqjyCB7Pf2itiKQXFeEdKFnbyaNt0aqEMmjei8EifGHfuMwLIfL7gOU0JCU+b0icY0fDJ3VL4MCiYB4/Ch1HBdZY+I1pMQeMMCxZkY1SuCI9RCw7j+9xpTbeRAciaGAnKi/f3S86P1hfjJxzYVx53YL5cqznuezVFBeNjXW2KgsvUGmTrL/NuD7UXI0n1QpH2RTupv9C3aDPLUjzQb9Jp/fgQV3LIAOOwU9w7WSWu4cZgSMsrjfcsJ/wqR+SvhDqARKGBJ3oYwUzjQUnGiOJAOnaBYu1DGBXqy8rPJu75uag+B1223N09M+vF0MgkRyqi/t1IRScH+OMe50wKto09rYXb98uwuOvuu8ri4inxeKHs03SnvTog0c8HdXnuY48G6d6zoVBlSxpl2shXnXgW0L5eZF77lMtTns/AULlL6jOs62G3p7Mqn+1hL1zk+YNBus3npbDBwLL61VlGKTRShoGLvOHbu20pz0WprXcyLKyPCJxRwWF39JUpPT+uk7H6CBmtnE2BoYRTS/sDxpTqw8jZDPcrzx5lp4KfvhEQfu0T3GuIkajb3i0QUPlLG5GXLtvM6dxQwjFu3x4J36fB7kAmu0CStT9STpfToXWpEWxFEDA7AW2eKmIPQzqs2LWMiPblVwjfX1nnoBn1JY2/5mqdlTvUkxGHWgbpKRYAIr3DN6GgnTQzt+fL3CchATT61CvLUngEp8KTnzDJzMSwSGlP/IkEHt4SmBgWWxgWr/utW+JFVR+OPCBRBvOhHoYFHUAPSSvH
+KGk56hFKvAol1Z2eHlXJYFnrKj54W/apPOSxxe8jQrlpxIa5VJcjFiEsTAYFiic5sPK9DACAA8o2Fb09bIPIwTDAgdcGhYYQ0xvYkpMHKDn+eJSxpge4TUCKoNRCIyDWDAqZaNO94rpFlW5l1JPOT7zAH7gQWXbsJBBAm6MsIQhYgNDMH7ZiB/K4ssvF/Lw12PhsZDY289qegc0UfK8wNZlRJmh/IYC3PIsm1pjwrpc+/SJ1Jc/8k3nkH5oq2ioAIt4pic0cbeoYd2++ok+Q5tGAfxGHgaj6qmLkQsOyeNifjyLv5ekFGNobKgv4Fj0zVadtDNfp09IibxXCuQdS0s+4IyFuhgfC1qgz+nG12qNxSN0QvN12ubzoLb9RMljCo6/2vMF31OhRFt06bdSpa3Ixg5j0Ua0dU4nC8MiDOVRxYBRLzjUj33CYThlv46+G/WP3PjNdNZMGLbWNdOjTVBhOsJCjfsGHIVrlkcxlHARwx2jQn2HMyU4nG5Fozor8jkB2yM1TPMRLPcE/TKMBdFSW5yX3G3I6l7YM7uv7NK0P0Yscl0FHwoMrz4X/Sh4h5moA0o6/VcjTjq/YvH0oqdBLWvbWcqclVExq2lqHu2j39Iu7sf68i++sx7uhqYZtFgTEyMJ4HDfaYqT7gNGC+hbjFCcUj2Pa/rVkqY0Lasf0Z881U7znlj3w+5Q9JPYJUpGpvDy+cJzycaFeGdXpx3qR9vF5zZGK2TgqFDvMsYi7Bv3z5dHa7en67R+wgcywqf+eKrQF+inGMPkqVZubBuJgrvE1aSob+TSpshyK5f3sGEESvnuECJkzMSnf5CmRIKtKxknok43ofrjvApQ+TekAOmLw9QiMAUn1/qoklw0P/xHJMDAd0UjasT6k+kgmyJ4FTbhEqaXDeFWdgK6TEXMbyRWqk7LbHW2Doi6pDMHEWnAmTfKe2J1GAOgQuY1adT6gDMO/5Lv/+ERvIcyQj9PWabfl0cafPtdrfviy3VA3is/9Lvlj+68vS9qEp5I4CtOAt986PHlJdd/88OyXoe0xhGd4Et16B08LzYdsXjFL/9SeT2GBdT1gGl/PGx4SPI2luNlyUOoXXqzWolGqWbaQGhcNhj40uiFpXqYV2zT8I8S+gdZGBbdS988BDgvkVREdunLoNc0VCUHBTHz8IcpU/CjUQsJDMU7FEmUgICPEYs6FUoKCzRxVnyl0Hi0QjxAk1EPlCKmbzAFSsASQl68t5AHtCtvynfZ4GEcyOdrrI0LwszxR1Yg+WLhOIohW4HGQlUUKb4aY1h4mo7Kdf3Ejw0Y8JUWRgWKl8LUX37KwS9ZF1HbiwqqPL9YaBDVz2HSe9e9BWkj/VPJ2jcC0O3pn2hHUt1nlBY40YMinTTyIw9jEuOBkQsrffWLMgYG8XXl2eAUAuKhcMaOWOh9RnI6IYWS3aGYzoLxsE8L9A9IKWTh9hFdB7Sr1F4pg/r2rQJFi36IAuu6RF+kOxMPw5S+IngZjO4vjHBZluorqVT3coMj84VfA0qjftwrHrXQyxn6OJcjuCjfBRsu44aDtwA2Dn0qXaXiaOBQlv4oQ5eVAcmG0QkWUa+w+YBOv17SwumzUvTZ9pWzJmQ6e9oRU/MweKFi2QiPBdy7dAbL9N4F747FGgMbFn39JZOob9YRP0b14BE+mBJ1TqNDlL+mUZNtkr9H+9SXbdBQrSpLgvDFOgou+v55rQXxKAthrY/x6N02jTthUOzYXTa0qYHGnDwawY5hjFKc1rqbkzqY8bTWWTAqgXHBlrFrdd0EGwm4vX3PyihR+cQZFcV44ZR4CaRsV/23y6i6oH7A2RZMpzyovnWtdnp6/EHOpZi1AauWtMEURKl5tJAauLUaMJkM+RFXwUnLVm7GYu0zI/A1MtLXEg6/9kH3IxMVVf07y3H9JIsKukzQyOuce6DhKrDz4nkRKfyCFbERfoClQF14hqngwZ4lEulk20UvD3D/ivw4VwmbftAfoCo/4+jQgRfQ6jCg+YqfrIGJXlJkVCALTNEFbEt1DbvYEMz7uaWogMplSyLw98GwGKnwJDKRwEQCX3ES2NKwYPH2a2+5JRSH+tTmsZy70fCwrM9nw/ilIjj8VO5RlIkDi4Llr41SLOKFF0T5jRcSLxro89LnChxFPW2qvp38onEZohsKNYo1CnwoOX2evzAr3SMWVelGwff6CnhT2WF4xFdpFJ0YRQgFPebRxwsDnoHFAPDaCiv4UjhE/7wI8d1LIKHcUQ/VQcxaqWfUAoPC+/LbsKhbZEq5isPvVC4GgOhLGMalcVhsm4aF1wGwSxHlioccmUmjAt5tVKTBURVAGxYKWy4xX8dht02VfUhCkRqP0PAb7RVxwvWdbV4TyjA1w3LgR24ct8GTVykBSpuj8HFwWk6P4gAzp/H1G0VZ+TGFDiriWu2OMYUsOKNgHYVUX9tRkadkcO6V8Ten0adZXUxfCVMRvtR3aX9dqfjj0/egy4Ux5v4gBRO55jknaaQlPr6d8DOcPiUlXcuNMuoFTsom02yAKF8ZJpkwxIKrUMaAdxkkKmxDQrx7m1n5YViwZkCGhRRtRioWNV1sUdOSuDA06Ft8Td83N1/mNReSLV5FwAr9eRkDPmhuZm+ZnltQGCMgpiWloUr5wYOIdDykYeH2EQz3gUdP2MlJbYSMYTtcyCPpUC8MC3iDh3MygnzJMIE3L6DlQ4WMibUL27X+Zls5JWNhWaMR7OzE9rPcGxihdx8/VY7q5O+T2hWK0TDfT+ob+HQ7+g6L8hmdYi0ShkWMnGnUTG2/obyLGBbkKa5Ny7zd8SNlVNy4f2+5cWGuHNGomM9iEd8q1HXLtss2pZ7Ul7jD2V8c634ie0ioQjJe5tW0pJM0iScI5VuepPVhgKojP/p64DWyjYgAW6LCTs9MMrgyrmC6Hoe0WufqDSRTsTfIQAe++OOeScf9lvVMn7zRogb4kRwlR98KP/HzuQOdkEPg91RcRm0ry5EEuaThiPl1yD+J38OTMYJTwd0GXT3F6MSwqLKZeBMJTCTw8JXAlobFy1/1Sq+x8MPe75JQavyy0AORh6hHK6g/D+D6pMeoAMeKa33wAu1FvHrBM2KRD9pAAd5ETKNNCZGqzsuelwxfEtOZnzYqgQEQRgUvoGAjymfeNoaF11+gVNqgCAOC9GQ4ecXYIB06aYhgROR6DJTXmE7FaEUs9CUNVdQKLwqdlJ9WN+pUlXvosWjbxoUNDKZbMfrAFpkxWpFTpvQasmLor7X1y7q3P2WEg6/GMiBQ7ijbRoWNIRkVaViQV+Uf06BidMZyM09iq+bzEnd6Crf6pI27rFe2hH3A+pejPrPS1iQlvKKbOsORQ1/SRRMzOpRTo2KkCENCl+bB+2sysEKxsg4qSoj8dSWekRLNtqKLmhqFEsouUldoKtRhTbs5IAWQHcGYkoIiQXkhD1VA/2bRPzUsmtEPqrHm/hayCoOsyk1ySln1cmRUAwe31AvlOo0Gys7LMDVOOGEEYBjSHK7t5LiT6uJQlZ+0jEsfVJ9hcwR/eceo0KYDyxqpOKn1DSc
1Jek0p2DL0DivL/no6AcW9pX983OemiTkpvyztmJqZq7s0QLqGK3AuJJB2wzWHK0IrvwxgPJ1hRQlG4SLfKmPWio/SgCDAWIDQkYE4Zy+xRQNXxqdWFmWYeFrKe5JpqVp9GBDIxZnZUycXFkv9+tk91OrjGxdaGfUsHaD07KXNZLFdrPHF0+XM5oidY7tY8WJ7x0Zm9yD3H+MziALP2/E57ribB27fc+010hx/7JWCkPikZr69Jh9s+VK7zQmmfheUZ2ptwhQV1z4ND7/4TsDeP2TtKXr6PR9bAStg0laCZtx80E/zYTqNz7FSMsV8YE1cx3QpDcCCpiJ4CR+K9EGE/GIRh9thAjkFykFowpBJfgAPqBhhvvNUSUmzybR1anjtCKmJ4oD406EdoN3QVkY2TUnGai40GhlZ14W0fsV/nKwrSTR6PkCnvhLfuBHemoPafhvayrUQ1qpCfGJBCYS+FuXwJaGxcte+QpvN2tFSm9dPwjrg5Mnvx+Gwysp3kjKt2GhqjXDQi8HHtcxzQAlEcWOq9YfXzD61088wKGN8oiKYiVSMPkyQGnziIi/IodyjUEAf7CXil8aFTYw0rCQ76ktFR4eKMsswIMIWKGUj1HB1pGst8hD/sD1NCjmmyvMKxnFl20pvaBUygVKCzSsXFa5wW/gMg1KC8RlUHgRtnfb0ZQq1YV99cFz3VG6RJMwjq+mLA63z8gG9MFBBhgc5BPWZfm4HtSlxpMPhJz1FIz/7LuYkR+3d5eSvARHQcfhymOAijjyVEa2l4UbrdtAIhAQlr8VsmjrGK2qYckz5EF/kDIqulykYajmrlKsw1iUYcGUqEVdrKNgJ54rZVgckWGxXwrhHrUl8/sxLFBss794VEryofOQBu8IKef9R1tGXsOp9Yn4kBeY0Y+QLTWM/quQCFv5h4fuAqePE4aJ5lei4ihkav70I17D1TKQoeplo0LKOtOHzmlkYlkK9ZJOvz5+8mQ5IePitNajsIibUQTulcP7F8oBbe86p7UoKMfIiH4zNTuna2+Zmt7bGbMxquY+XvsrPJh/uBNuGBZIEIdcHVCuXM1nmhP8bWia1rrWfORuVYywsFUtbS0mREv3FXCaQrVdwyv0cXVy7fxUbFQcXVop9y9xToUOspNhwXSuaRkEs5oGhyFPoYx6nTh1upxSvTEutA+YplDpftHI307BpkHOlsZcjOJcEC67QE1p4TY7jE2r78xop6crtBnANdpp7Gr1p1n1L06Rl9BVLfGrutE5+fM/8YhFnuuvn9ZuBkvxhJxASWd8UAcl1P0T4naUTVkusaYoTf+9cz9SAri9y3Sw6at2IkcoKbqk+jOCn7z1vEBgrAzT5GeERxNsfCYfAQsneQ9WENMER1zVcoH1vQesEStNh7ufyk/WL9ChA0zijPqjeQITDcudjMjsChgL1vIan5vAR9FZ5ij+j/7gS0YTHsLYxLB4CIU7IT2RwN9jCWxpWLz0lTcVDshDUW1Klh6c+WDk4ZkvG15GftCjwHLxp0TSUF54iGNYMJ3F6yys3NUXheiMP2ahnYYFeXFFeaYpxQfDwVNUUKYpU+U4T2VTphdOGkZw/tKJEo5hgSIeinzwpLUMTNGglMovuHyhxIjYo/n5KCzgMG2JaVBcKDnwSJ04PThHYgwrZYQpGdA07/pCx7Qar6/QV1KMiil9DcVP2mxZaUlJSUkF1PyoHO84pTr3X4v5ehyjKflVHYMLWWDUVFk4jNGV7RO9PWRIeQHXvzDJG3ektZclmRWmb7mRfFWaett1YVMeIw+e2xvdTAg9ncxzvuVSDVT1JUaJOHiPBbos8j4jn12lZI7pdOPt2ppWh+rJKGS9BesrpqQY0nttWKjNcNSLr9HIrBkYbgX0WtKQj8Ipp4pj5BpOeQVsVM59vgoAJdn1EHy2a1+vPmy6VR6kC7EtzBa3IRvTHe492gBYGzCM7DB9TMo4O0HxxX+Jhe2aBnX0xHEbF6e1sBmjgmoyAmfDQqMW+zQdinEW+jUnju+RYbF7mh2hmF7EqFf0P+qZ8sp+1urgFjRpV8U8w59ihsHwwQhXe7ErF2s9cutfFphjBHEQJa3DgvG9czJsdO9td/10nwmfHcPuk0Fx3+nlco+u0zrQbkUnZa9ZVtz37Aq2o8zKcJhV22MUrLHDmOgeOyNZiMaq8i/oXvb0QZ4J6gOc1D6ji75D+rRw52RYsFZnn54BXrej9Tr71JdmBbdD7cp6EX8kEc2soxJcdzwbHFUSVTTRpjXNvYUO1lzgNlilIzd/pCAcpVRo4WVZSuFetuvJKQH8dC1LZQ7pQ/5IuSC1LPpblq903X/pIhTPEQzOdKbf+BvgI388rlQnBYetLkmsMhL1j0Tfd5UMXt6HwWXSh1LWmgpEun+bXIgl/OahqHwHU3Eps8lR4caD8hs0sAgPHimp4qbvjPozMSx6aUzCEwlMJPBwlMCWhsXP3/RyGRav9UsN5SsUMB6NPKDjAcnzkhR+PYrAw1VKvlPqgzbnpnuBrhR4FHFGLZjqktNSeMjyvOVxPOorLYoLmtWAyHneNiz0MrOiozwbFIqjXBCOuL5ESrlAseTU7SldHrUQzBoLO6V0rPgLbigHgYOygWGBMSGjQooGhoWnMgl/j9c5sPRV87JRZqmLGOfFArxHDsQH7192T3G6Rz44+yKmQOV5FExv8siD5TXIk5eRjTTqp7LDaJDhIFlT3zD4FEcx5sIw6QyLhKHsdum9T+s4jiwdyzItYudFaPgFHjfyMoykSHdm/CATC4Z2c4SEwRmtxwVGgNHuFYd4RSGEQt6mSNW+I/3Scl+v24WidLLAnkXgfHXnyzPbfKIwMloxLfmjSNM946t8yjHkY3n3siLMn/2QGSylLDKc8d53uArCfVvA0cepZ9Rns3hUWoj8WwYhBcK49AmnCMmi5/peUodDcffaCinuizrp+oQMiqPHjpcTGrE4palQZzn5WjLifuZ+OHL4UDm0/0BZmNeUJ8mH9RS7tbZgqm4z640C3K/og/QxXZKJ+1uVTatLMC4+4TD5rUaPynQ7YvhopIK1E2urOhtG9x7T/lZlCGFYLLP+Q/XYLQV+fmG/tqXVdCwZN2ySwAnt7Px0x7ET5QvHTso/qVOydQ+rT2iPK91rNK54U9l8HKC9WWvDKAb3MGslFnXPnxb8imD26N6b1nNhr/rHwrTKw2hQmBFKcOZ0n8/rfiaNtRT0pSl1LPqVKuMrRl+jv7p9uOmri/aK+LZcja28gB76kiFSaCO4Ecm+hXjBvsRR7871/aRLbn0389OHJn2d/6SO7xlLrUy4rrkdr8YzbuAnrz1t81DpBGrQ4Tfhowbxa0Yq45SZqQN3CkmeDUeByhkZlXEIVMzMTL/Lis5a69a1kYvvpmw5Xn+ybvCeYZOs7bBZGv0SbsjzVWlVDp02MSyqUCbeRAITCTxsJbClYfFzL39Zec3rbrYSa8VXL/adeqnmi8DP71p1p/nBqXwrr/gBy/SCxMkRi3VphSiK/oJrPx62MW2ElwRx+flewIe+LivRUhBScUYZIhzKfH
yx9JaktVzyd8lA4AvkrBZ3zuirIyMR4CxrS8qzmpN/WodpedRChRpeow3UOddX2IBRfIoRi2ZYMCKhdQHMCZfSxEsMmmmEsL7B7zH4kBLHQtHYDYcdeFi0HQu324iF4MJYiPr4C7Ho2WCrX4uh71ELXlHIouUP8gija6CFzPRf5QeXBJUGXwqTEr4jzovQ5vH+pdkQaSuD11+8SpR2xDmqH5dGROm0s116pDmd/Mjl133FRkMaGaGsosNxgYNxR1vYuGAqmeLwimHLF+wZyX8KQ1HhGM1CfuLGwgk2QlaRBouOw3GFyfyArvld3ki6ImIrmKvhVCpy5AI/L8tVCBaN607FQEcCDkCt0ctg5tuXMDwigOKu06lPHr23HLv/3nLffffpDIlFjV6c9foK6HFy+6zOkzh85Ipy4MDBMq9F3Kyl8M5jGBUaUcPI8E5Q6mdpyFoW9J9oSbMUvIgqDSHacR8Hz+Ze6VFPGX4areAckXMrS/JjETlrLNgCmDUQGBjgcx/tnd9XZjwda9bTl1hHcVow95/U2Ria3nTfohZna/enxbr7E3chi7iTBzoGbT2n6XBze2dlpMz6dPdzkvuK7tkZjV7O634+oPUSXPM6LZvD7TjvhoPuphm5UH/h3BqMCWgxIQ36YcjRxzDraJZaf5hwSqQRS4fMcMgEhyxDZo6O/PTp2f8CLXBHgEWndz1un550Gq8tE5r08+Q8/KZXUzcXLjgXP/CQdTIy9ETEdBpIDYgGrueNlOQpapD1SJ/itjIsKpwKNH8dn0nXxaogIMIlTo02HMXzhUNWYyHxKjxgtS6UkWGjUPEuv09r8qn54LUiatrEsEBiEzeRwEQCD2cJbGlY/NuX/UJ59WsHwwIlmz39UWb5qh8P1OHRmI9Jnq08cD19Sr5HO1CAFUZ5Z9cftn4cNyxS2eKZzbsZ+u05H0SlDKtsvs6jhFc+MARQ4m38KJ3RChSAUMiDV9ZKYFDMa879nOZIszsTtM5oYeeiDlg7eeasdxRSoTZQMCCyrow45CiGp0HpKyajDhgb8AffVmIpUxenezPdCuUkRhP0pVdpzOfGsPAJzzvZujNGQhg9QbZ8Xd2p6U02kMBV2jACEfLzdJ18HSFn4TRZVzmkcRLpiRdt0lCFa6PEvVeyRL6bOGiMu/5Fetk80CpuwicleLZTO2eeQiAg/noRj/xYW1EVU/pESx9wkD99i52hVjUC5TUvKO2Cp24YExgWjFrQ9iiKGBxwAkXzIXLAus72Q2ZKcVrKAn+zMPz26cTHHeVwpTHR+5mHnJKfTKtCQUQWUKY3XzwB412hJIONtZWydnaxnLpPp13fe2e5/567y1lNieIUebbkFaOeasSIwMKRq8rc/kNlWiMDO3bJ6FXfTqPCaxDUF11nyyH6U/AxWrvgxWMn5j83YQCWPNqRaVpeTyHDgqlQG+ta8cC0LDmmEvJAYl2FwN0/d2kXtO1qN83HkkGwrSxrBygOvqO9vWZLhtRdJ07qOlXulrHBIn6eHezg5M0ilM+UuT0ykha0QP2wpnzNcg/q3lJjl2n1gwMaqbhSh93laAUjHNyLPFe87on2hqG8ose4PtSL9NYOKs9pVMhZ+sHRbtURrKlOSfi+X5lehaL/pSO9xaDfIgkx6idtUgHlnsdRPn3PjjqQy7+u5K2mKoE6AZk5gNZwBWrZJhO0DNVQCATvLckkiSW8QxH3nQkAzm+DCIrBVidVPu+3KCv7HuunKjhmYDCvBMqvGWTXYORXrhBoRU4agAIz0CHlUjfwEvCXQiilIwq9jhvTnxgWm0ptkjiRwEQCDyMJbGlY/OxLf6H8x5tf45cRyi7KWUwLQrGXgpGPRZ6OfvDy6ObZyQM//FDIwxAgnRELbycq46IZFuDqJceDlulSKINWDuo7x/KEphQBGxZ+6Y8aFh5d8HSNMCr8ZREewREuigQK5V4dkrZXxgVTQFD6l2RYnOaLpwyLDSk4AaspUzIsoEkVvYuMmMB48TazKKbKJw8WbVSIZ3BRRjxdSsqLd5wRDU9jks/uM94BioWhXueBQaF1FzZCwmjbKYMjt/X0TjUYUdpaE9r6l+Onhp0WcqBct4nSesMCvEgXFgTi37RslAS1yIP8mDPOWNpm0c1euuNKj9kXMjT9GueHtq8EHVIk8yKbFzpg+EBkPIwGdRf3lzQs2KYWxZnRi1ScMGwZsWCuPkafDV0Zn+oavqw0wgq8wYv4Q1QRDHlmOOWBn1dARr02CycufquH6tIbFYy0ZB5FE4ahTFNA/6EIOk99FXzvsgQ8lcFX/RmxOK+RgPXlxbJ8/K5y5v47y6mj92iKkXZF0iJnFrxvZ7RMOz7NLBzSdbjs0ZayU1pP4REK5XkXKPqrjArufcuEfqcy4hf24AmpwSdXKHYRr3lkC6lCRZ3VPjFdSwvItRbEO0L5/lebiX/HwVAbcXbEqoJn9SHi1IpO09YI4xI7O6kO3Cd7OGNCha+whsIjj5r+hS8DakVT5Fh3saG2Yve1Od37BzVycVD3/wF9XDjARwaNYM1rhAKjgilOnKAdoxK0e1zUJw0L6uE2cd2jD1J5/sRI5CGM6lI+7vNd+ng+8exbhE2PQLqBZExPUrphakfty0mU5vfluptEX4HvdA45OdMqDPXiv4ON9k848ioVoSTWCN2M4Bs2EYJurUJAOZJU5AvUsRw6UX6UN9AAEZ6yXXRTNEacHpRNJ+U69GEVob6Xrm8DpyUrKi5l0EquFb8EJ4nJT5yglaW6WiPkE2WyK1RKYuJPJDCRwMNVAg/KsODBiXKKoo1hgZLur+oo7qp5vhjaCwZpKIPHaOCGjzqJUpPGRSpWfiEIGZ80n1MgEjzA2zcmvfCtMOvlDy8xShEGi8NSvkO5riMWhg9Y2LFyWddYYFhgZFCHZSkpZ5dXvbATvuAX5dP5MiRQjlFcTEPl+mA7jX6wkwwOVYp86kr5jFJwgF1MeWJUQl97pdjaIBIOuzqBi+9dpxSObWjZilay1dfiNC6Y0mTlX8qVX171JadaOZ5GU9QbRZlyQk7A55UGh9uJdlFAYEFTdYB303eNRn8ulz4KFbGRlyhJld9xWMrDWapVttHWIeea1N6+xJO2+0qVuUcy1J+Qv6dBKewv2SinKBfCg39GJtIw9ogEMhIPHrFADuLlEladaMlEPsLzPz8hL2ilo5zLyWqz9Ozr+J5KJ8PChoLiUHU9XfEaRvlBcVeaDQzF2VmpKUUqH6XJh9uhuK/rq/7K6bJ66p6ycuKesnTyPm07e07riTSaodpvn9JUp9mFMi2jYmruQNktI2OXNhKwYSGF3TuQuf/FGh7qGdKIGpv/KuNU1pB3C3d1aPUXj2EMxeGEvv+pt+oRV5eOXESD8yfOyBg6rgPv7j+tE7w15WlFRsaG8rm32ABhhvsZQ12C45TxszIqTmpq4ymtxVgW/prK5UPCnBZh79d1WEbFEU2LukKH283rOeC1E0zxVH2QIkZEq6twJfCsdMjftaTfhWGHLFxzfP4TvsoAZGTQp5OGy7QmI9IgMu76JIczIfpgj2Na4
mEcApI29vUTWEMh8REgMTIdKKXx3+rU4wa8syrBS+gmKfwR8pVuTUTMzWUkmApeM99yTMgkWH17qjcMNX4iQBahgIz2jXCtW61f3w5ZSvqGF1z6mX45nEFmCVn7wRB1KKtGZGJYjAlnEp1IYCKBh50EtjQsmAr1G7/5GlcqRh5Yc8A0nzp1h6k6etBLn/XXvXzA8kAN9YiHuf7qk5N0DItU/vqvtMZRvvOEzTckaPgVIHyXz9f7NCq6MHk2OsRI8BmLNgknT9ukODDdiK/W0yj+GAcyLGLx9pqmiGjbS42iwDGGU67BUNS8UA+mWHnKkqZKgMsLxpdgmpHAFCqVkdtY5ghLjLZ0oxdS2sDhbAWmVU0xvUpfVXdzAJ6NC03Vgn8pgQgwv0gjSmSQoxIRFlRNM06td4bJSyNEWW6PoC1icpRCPB1tkbiZ9qB9v3UFDbkkqTRoNlfTKdevafIj1EAsWMeAkas0oGODQj4KNkaFR7nUr4Aj7q/4KoMq0WYOmFa0F8Xb2LARRt+J+pNuPgngSPdfRFt9SHMeIBEOcMIOGSHrjPw3c+S7PjISYgqQ6qNwjt4FEfHU0jAsdGdotILpQxfrgXPABR3SpaSzQ9mGvuqvanrfmaNl9fSxsqxrRQr6qs5+WLsoY3dKOz7NHrBhsXvvvtj9Sf3OC7XVJ23UUpl6IQccPpKGD+7fdL0ckHLfZuTZyXebYRTpyhEz6seaC6ZInVed2LGK6UvratNljUSwhuIerae459QZLdTW1rMijmGhs8E1mrFNO0fNlYPaPeoKbZd7QGuodqnNz8m4ukdnV5wSXY2JeIeoee3stD/XUzBqwVayTIvC8KcfYahRJ4XTwXu2Y7YX9ePiWeX2Ar7DSXho9JaAupEAAEAASURBVOGk6fQaqZJpWSPwmWnywVO0Q4TNRspWFBoueJUfWqLJX+GKqfoCZMDos7WsoFEjzq+siR64maMCasYQGmQVedFXEiwLIC5aKS/8RrTntSUGgRbNAH7SyfIC1HXv5JIFDLWvcA1c+JWfQVYkwVuUF6U1hIF/JV2CM0p+JDbIaCS5RSZToZooJoGJBCYSeJhKYGvD4uUv9VSofIdYgZeSZMPCX97HFX29SuqDmIcyz+Xh2RwvaL4sMzIQXytDUQIWQwKlMA0L4yoNeijR3mO+GylhehFf6v2FXr550xsKFY4wiiO4frnprQCcDQamT+grJQYS6zQ8LUu7xKzqYr97lFTSOfsilavkgfI8jUZ5jN4wEmGFXmHoMTXDdMnD6II/8R7rQeralMq36yR81mJMy6DYo114uKamZmTUxN76bfvTqhybD2qETOCFenO5DMqqdZaPI6+lIQ/+Oj3bCnWlZ4SxH3Av58bzmqKwCQIvdMoeddE/Mi1e+hUGjw5QHdjuDxgPCkR/GcKZBobRjA9yKCqk2eCQD99uD8HQV0I+aawpEWDwcdS/i0Z6ZA4gNV5lFfQiDb5STukH4eE3eUdJ9z2BslrDMc1J9VSaFidI6eVaU2U4wZrweqQbh6/+UqFZs6A8K+gKr64slnM6w2JpSestljQyt6Ytes/vLLP7tFj7wBVl/+GrfF6F11TIII6F//QrTX9ShemD4/UWR9EGMgDSZb2znYHBcT+1bmR5xr3Pfd7jsCsUO0WtcSl8Tvfjii6mNR3XGigMi6OnlzQNSkaTGNqpkYeLuudWVcZF39PaDlbToq6Z31sOyWhga9h8lkhSMjC1BbHuAbaJnWUjB+Hu0RXPsrg3MSpyxMfMj/3QVvQPfI9WYNgie9Jx6bueVQKZFhCWJTIyxlie6SdclV8VfkulZwUf1VdZ2RcRdMq90VIZsJMu8xm6cHLmpW/6QHcJFTn7MDmNDuGRelRZQAF+HO1oKW5cMvQPbwYxPCEluICKYy9go8yaHkCUrv+koCgu463iPa0EGcOJ5AGXeMU3thkNnNH6JmL4m+W1FhKhzfLBJB15TQyLUXlOYhMJTCTw8JPAlobFz9300vLqm2+OB399WIfSHlOhQskmHIo2fiizoVDk1+N8IfFYZoEthoVHKzpFkZeGF1uqHPx87IdRgQLOGoQYKfGC6FSmpSy4TL1orCiqDCX5tRMvBr3cFAcmDItQJuAVKPOjkQovKGcKSVWWgG+Kew1zMjZl+8A9GQhsSYkhEQdsMeUpjAkYYISBrS/hCloYImkM2UCBH6V5BGXPTJnWLjx7dBDZHp9rgWERyo6QZQwoDC3TC3+Ev5SF8nGGS/hxHz2xyqenZ8TuJ9usSxoJbpZ/2ZemWrO9XDsqPXyvNMDfaB4JkUY3JI+/UOgUcmchg/9auexD8tOoAAy+o38ILqAtkJQv7Fk+ZFeuHYyo0wjiWjphkMb8rEOfl2FgM9/1gU+UVF2sP4i1BjHVRsMP5eKGDApNb7q4oVPFdV3UiARpFzxqoX7LKIWUco9U2LBgkbSUc23nuqTtZU9IKT92+pwMCxW8a6YcvvKacviKq8tBLdyekkHLFCj6Z8ohDIpqmI7UFM6RHHKXwWOZh/LqO7DKwRDkReMYhyyPWFBXPQeyPSiLdRU+20KH5a1oCtNZjTSc1rSnY5oCdUyGxVGdP3FCPou32fVpRlOZdmqU76LuR0YuuKdmZBhdrZGLa3QyNqMXnGHBtEbuFVlnuhN1gJ448e5O8ukH5GFEMQKJw7hggbkYNe/ZXlkP+Mfgu3ARQzBkEP2L2nDhas+oNCKt9hFlQdNi8U/FAjYBFaoUasoQY6nBwMuAYT6zWGElDAW1JiHdFCEClxUBbygiYAzYJQKS4PLZ6rVRq/Uw6VqCqRuO1IpoANVTcotUpbesSIs4iD1atEWg1wzwjFLxevhOrtl+LqiW1WRjgqM/5BksK6vsFlfeVrhQCpYqT5W05fwAZbtclTkxLKrQJt5EAhMJPGwlsKVh8fM3vcy7QlE7Hnz50I2XcUwHSuOCaT2hcIcyEtNT4mseD/d8mYZhoZc3isWYYcH7xsaFHs/5aGYqy24p2V54yxQjfW1ECUia6atUv/jq8zv4zWZR+eB4jYh8eDY/yvcZFBg7Ni4wLKQ46C8MKIyLKI8pUNCwUQAtxfd4gXaMLqClYEjAN6MvfMvlQmly2RggjPJIkdnBIlHwGa2QQjcjg2JaC2e5WPDNVCpw2BUqviBT30HJ8ygNdK0UpTEXMlaRrpuVqRpOGeHztZ43JXJSFKkZHrx0wF3ehZz7/HzZWqb5UucFXem4NbNBhdjSKyy0MjtLHnrAkAdQKjSgeh1FQwgKSRIfGmF8gIfCMPCePPdlE8ZF2yMrjM9BVo7kTxTnWNaHSIbx+zKI59VIJLM1AXgMi41qWMQXdPVHjIf1lXJxdUmLCJbKxbWzulY8OsEIxXmNVJzXVq0XZGiwGJqbmi13uTjz4cSZlXLXsaVy/+I5feHfVRYOHS7XX39Dufra68r+A4c19a6OkNX6Bjvwq75V+Q4pJOfpS8Li2X+uS20M4SiDBrALj98Kj3KufN9rwFGG/qj7muqwpPURizrP4jhToLSugvUVp1iQrbpwwvqaUPaxdaymMnF4
pdZoW7Ys0D+kLWOPaJH2lTIs9it/r0Y2eHbwgcLcySCwrPiAoPJI9Igf95vudfj24nGeTwpne1KRbB+modkIVL4NDfu1bq50lYPS0zVaVZ7E7cCtQIQqplKUmhldatQhURtA8JnFkl2zBuqBM2RUAOEge9rADq/yliyS7uwszqAV3uCZAWR1CdMTqVmMwIVc6WMCNIw4BaeRqvTtKS9heEkkr4YNhL6dkoX0k+QI+czEh3al2dqpz8/wJnUJ9FqCaFSuoxoVPuQbRHr6rQ9U+sR/7Id+NEt7yH3ewSm39PtCSYMn93W12So7uek6ePBgDzYJTyQwkcBEAiMS2NKw+AUdkPfq12nEQg8XPwTl87DxhVKrsJVtFO16oTSjkPmBpOet8WqRQtcaizAqUOR5gfurX81HCeS9wcvQB13pMY0RwFfHaSkH7MjEKEEaBsnLSI0qr+ZZGcmrlXvxiDLOg54yeGD6zINqVGD0ePqEaFAHRigwanKkxKMTpENH6SgsNgLEHy9Ftrik3Bh5CcOCRdrAcNgX89cdVnxauz/tkb9HvtdWaDqU/VyjgSGiC3wUHit5MgrSwMDggkfql37KI33kkuHmm0a8R3kJ+qVX4YAfd+Dhxn3S+rYljtssLXKGvKRl+My078ZvKYrV1AwpWoMSs9sQvzlFHHUeqZHJr/nKfOcIFjjS5MfUFuQRC/1pe/oKCZZAiCHCgmkFOb9mKjnrht/LgnheoOPG84MXFG4ptdKW84WO8XABQ0KGxcWVU+XiudO+zivtvL7wM42Ir+w+cZspRZ7WJwVcZz6cPLvmE6rvuP+0dknSgXja/ekR1z+qPOIR15cjV1yhMyLm6kLt6EvBmH8rv6SHgRWp+YuskZ2laxk6WLMznWjKxPWrOISpn+83+TwHCDNVkg0VTuuQvBMyMO7Vbm1H5Z/Q6AUlQYv+Pl+NhlkZDjimSu7Rfcmp2OzwNKfRCqY6cf9y37KpAhfPDhXm6WKs6YAHnA0o7qd6T2HUxYePMIKASZ496gIN1YEwjEX/UcBphubHPPODrFr7qw7AhYOGQq0L1XTjDMl5nyZWEIZCptC/kmL4kTvQ61MJG7yVG4FtGhIxP5SfPCbMUFQlNZqR9TCvZCW8oSOSNLM+ZroWCERLryUgt8FV3iqc85LHAUhEkq8uMeHG8zJdoFlS9tcOezS4CQ4A4GX90t+Ek0YrYRIXfzJi0cQzCUwkMJHAw1QCWxsWr3h5eY3OsWhD/TxQ9fDMFyQveK72VV8v7VibEAoZMolncLwQePly4nZOP0KBiq+W8VD3g10/emW3clDiUeCthDfDIkcsmMoQj27KMZ9SUNpLXryiMMT0pRhR4esksKHEMFKhr7udYeHdhVw86yxUtpQSn3sgn9GSNKAYsWCBt40NwZ0XG/CNURHGkhJUNl+Dp6T8sC2mDx6T4bAbY0JGxm4pr1Z2MDgEt1vKkBdyV+MCQyQNC+oZX5BRfqhX/Zpc01ubdO2D/DMdXgiruWqafADkeJmTt5lr+MCNwfQvxsTdLC3z8Mnv6UjUAyM0jBNIDBfRIdGhGoVWxJP3QfEA27zULMMCX3EyjiEbBm3k0V98hoH6cvvKXXlBYJBDXlr96tQQiVMSqtWPMvq69rJswDVAXuMplVb6ssKe5qRpUBe0GPv88qlyYflEOX9Wlw6Y2+Dkas7tUD/OU+Q9WiGjgoXax5bOaTelczpI7lyZ0iLtQ5r69KhHP7pcccVVZd/CgnaCmhKPnVERtRNX1C/6xaWGBXIPeQEH38C63zvs1EpDOapbSIh7E0yyMCTYIU7rm8Q/i7RZU7Emg2pdzwXqsiKDCoPiFIaGpkBNYczr2qP7cEb3COeSzGq9FG3AgZhTajPyOB2baYbcMxjgGIjcp3yg4CMB28l6OhMGmUeHGJ3gK7rkYLwYufDIhJ5Xo6MSYQRFWhoXIYOY4qMwnQonvkIc5CsqfpwMw9VZGiG+lhICClxD6id7WKUcsI5kioEoMtENY/qV8kiG0jAiBtdh0kYjtA2clTGZvl9DI8D1W+uW7R30g3b0kyGFWoWMej5IAp7iSK95ncwikxzlBaNOaj9jsFmuqfZ5He4oB2ZLP8FHo0tgC5yEy/IyvgmVWreEoKiAmhgWg0wmoYkEJhJ4eEpgS8PipTIsXv3a1/oh2JR11ZOHYH4lb8aFXsjtUCm92K1M8KwULA9aXrYYEmFUVOOiGyEYHuyC11/gM2VIij1f+HV5xALFQApE8rBdIyQ4FBaMljBUeDEGDWAxBqxQSMEALxSa4IWvuz73AONC/LAbDTvOAMdXa07fZbH3Lk7jFQ/Q8XoPaDJ6ojhfOTfEBlMyMCx45aGUotxMz8xoqpMu+VPa8QmjYocOwbPB41eoEFWHHaLFzlAzLORmdyjOEsCwEP2QsRQl5K4rFMFoh/zCmvJqBofgcJmugMMYFpEuo6y+wBtMZLXfPp3w+AszAcnDuZ1d/8jJNiAW+ISGliYGX6RZauojQSnhAzb7A+CGowz+a1kBVTEjAmjLd0Q/9GErtsJDYTe/Th9gY3pZlbO4yTIUcB2oR7SByov/WgeiXeGim3LJ8vFJy/RePuRlWY031xFFVpeUYA6T2zh7qqwvHS8bi/fLsFBYp2uva5rQsnZ8OqutVpdW4pwKrRLQ1/9Sji7q8MfljXLu4o5y9XWPLNde94hy9TXXllmNVHBoHCN58MOIm/lRLZBk8Bj1oVUi1VwKDhM67rHkmXwbaGpDp3VKa9YXuuSZqsrkXsWoWNL2sEfPLNuIWFzd0H0lo1337bTuC3Z3wthfE122hZ3VfTKrEQmfqSPed9OhRRhjwRdGt41tSnOW60JefhRgwwQ+hsAXRhtrOxi9gA73E8a864vcGbmg3yhs3sfCkVZlIR4FZDiXLfrIhDRoq0CSq2yVrDD4Ndl5hq3pIA1o9cYFKsgYF/x0Tc6UK35xTgPebIzDZjx94Ab+B9rio5Y5BKAersOOStX0wXAJ5Eav8gxv4LZ0hUnLovr0LNc4FZ/8BqOw61rLTs+8deUlTI/b00w8fKd3CckXScavMAnSeBkSHMoyiVwCU2ETZmJYVIFU76OfP6fDL7VJhRx9cG56e7lqYVd55BU6R+rvgLvtU8vlQ59dLt//nANldk93j/4d4G3CwkQCf1sSeEDD4jUYFuKOebFpXOQXTn8550Vcr9jCs04VkmLg0QQ9DfiSyYsuRyrsVyMA5QJlfHg58bKJlwT4PhcCo0KXz3mQMuH1B8qLL5FxM/NOP6+XPsYFfOoJbr68QFqGAWdFAE85fA1l55k1fQWNqVBSXoRnvoTHlri83YBnKgVrKRi1YESBbWr58snFV1B2fxKgFDPxIXhkgzwwPuB5RguzWZDNiAW7RpGPogZ/8XJSTHUhz4aFlL1pwTMtaqe+ysaISBoXYRjx6vWLSGTw86WEPxgekS4Q85NwsInjIU0etHDwjAMuX349XWfqZzwv03s/YTKtp0lan59lbJbewwWb0TdaumRovFqTGrHn9JofMCFv+mFvfAYwwghJwA+ywQGbX+F
JC6ND7SshJt8VNIRJywZLQQ8kAMxH0k3fRTQ6EYPHkE8aQCRQ3ziVWgrwubNlTQffrS2d1Dayx8vK6ZPl7KlTWpx9ppyScr6or/soxhiqnOy+rFGLC9t14vjcfhkU12nB9hVl3759Ut45AFL913VRmdRX5UR96EekwUxyFj78xHMgFNfWFsLseQ4hVuSKioc8ud+ZEsl9xm5sGEV3LJ4pd4n/uzVCwbawh6enynU6DXs39zltIkY4Ld33lbZ7Rv42omFUDojoHY7UNJoBfIxoXcZRHPkw1c31VxXpE17wjhEhGQDvGyXuBRsX4pf+MBgZaWhE+4BnWdh38f5p8hFNcyriApG7VDaRqvSWH7WCH+OYgMlGWQRJMwptpz+AcUm+wxkSA+TSuFIoKBhMoOpDezMMpVUeDJjlNgZGyaQ8kq0Eb3wn+CY8ANvDQSvp9TwnTMsT41len9fyO7p9WrJyid8aJPjJOlyCW+uQZUKHPpSuTyeN+MSwSOmE/6rfvV+bNpwv+2d36PlSNEWSjVYulhuv3VN+8FsO+Dk1ivE3G/u9P18sf/Hp5fLT33O4HJyPs63+ZjmYlDaRwN89CTywYfG61/qhbOOgfq3jDcNDMA0KwlwoXjYupPzzdZDRBF7mNh6kuDPlKIwK1lmEgmejghcEshEszrspKQhNz41Gke9GC6wUqCymXVEOaODbsLAyGC99RlAwBPZoQTSGAfzyYD+nL7ur7Omvr6WMVti4wLBQHvzwAIMjGwiiP60vpNMyKjj/widviyZfPb24uhoWnJJthUU+xoBHWjAWXDYLsnXSt5Qg6oYCZoWVUhS27FxHTe3Q6MY0xogMDA7Z28UIh/LCmBoMi3xTWmKi6VdnlRn0UvlFNqGAVVxpWKThwqttWRPBzRck4XR9unG7vITp/aSRsH28D/dlZPpmaaYtdtxPsr8oseOwL77pGQJ1W4IIffpIGAz9C77KhjqZdijPKPOMboFH29mwkMJJn0aIWXbyG3FzGPnAJFC0kNs6ZNKzm3CUFXxaSa8sUj4KLYstGbVY16naa8tnysqZU2V58UQ5ffJ4OX7ilAwLtmNdk0GqL/oYwUxz2qEzW7Q5wL79B8tBLdqe37fgfjX0DxgUff8OPGEgSQr8OC/84MMjFjBaXfAnOPhUmnqQf8kmJfIjj5PRmba1LINiVWGPEMq/Vwu171paKXfo2i/D4hotvn7swlyZ00ghJ2LLfLfceb7oNq5OvFu+USLlWG61TNqFC9w0LgB3WjUs2oiN0r3VbDUgEte1kdzJ89qXzsBo9XL7IKha18odYohaUyj/ZhYwOf8kZPOhGQ4fDOpg0pGshhnBdweLNNOnzkAGeuBsWlaWU0FAybLTb1nBNzJp9RGMKbj8BlirdSnthGhlKAGqjc1L6IzSyBg8pDOt5KPj2XyOxRMn8cEd4aXS7dMS5xIf2Eo/ZCII6I0DAiPY5HizMhPHMIL9sR98yTiVhyz+cFi8jWGxUzfvP/9Hhy0Hnsdvum2x3PbJ5fK937RQvv4xMw+ZfB4s4Q1NVYDHiZtIYCKBkMDWhsUrbyq/mYaFHpIoZBgJOB6ovWFBGsoXaTHlQEYGoxaCy9EARgqsHEFHygT08kHuxdqC9VvUfpQRtFjfIGW9GixOk2KQhgXl4sKwiAXhPOiB8/oMGwQyBKQUohxwGN457TDDFfPRq8EjnDB04ustdKExK0VnViMOe3X4Fif45oJtGxbKZ7oShgPGhs+y0Jdi8FBabGzI+GCxN3Xz1z3xytQw3ju8jpAl06NQCGc4SZipU9ohihEMzrSAphdyS7bIM1+O1Hl4MdV0izDCUpcNm+0UPgD8gxlyc1si8+qyTUjPsvAzPeE28xOevMvB9+k9fE8v03tYwn59J6+qQgajOn0dglriWNaSeY660fey/sgl6EQd2VTA0/YEIxQqUo2KKo8qdHCUYhnBL3+tRcgjrbvgiDYxGJHORX1dO8st+wfA1CFHCbwtK9ODNHXn3NmlcpYzKk6dLKdPL3pb2TX1afqajVL1JbYwnta0p71z8z4nhb4bZUXhhJuMFYZ/ywxZIzTHLYKaHl/1SU9c8+f+HIIxGWOCF88NG9MKs5aCnaoWZQCd5UOD4Bj9Y6rTCRn7d2pL2X2616/S6dhftV+Ghe43DAsRir6f/OEL1zwgI/1RBooHsiJMHheGRW4xSw1xpHMvxf0pw133GO2HAcFieLaTRfZMjaJsGxaSexofHrlQOTBB2bmuImXnQvQTXLpAUXOCZUJw3IW8x1ODV8rBBb14BjtBdbBTPqNPOMvFIdIUh087IkmjEsz0oQCljOdZEnVkgjxxIZng2nO7xis3lm+DqW1hBHAqrOOJB6O6+ryk1XOTNJrf0epxW75o0tbpMgRsD58wfVribOlD+3I81HKhbbqblJnluXzBTQyLUWmPGxbkHl3cKK+UwfGMx8+W737KvhGEdSn59xxfL2eWL5Qj+3eWQ/uY1jjq6E/AHDvDuq1t5cr9u8o+jYj0jjIwFvbv3VHOrFwoX7h/rSwofNUBnp8D5NI5bZAhOtcdHp2aRZe4/9SGeF0XjZ3lygOMknaIA4lJaCKBByeB9RPlsx/5SPn0HUfLGc0O3D1/dbnhxhvLDdfqoFt3X300P3lX+euPf6x88u7FsrF9T1m48obyhK97Qrl6Vo+pDe22+IW/LO+5/QtledtcueJrn1GeecPcgyv7i4Ta0rB4+ateWW5+w+v84EyjwtNI9GLhJcxLl6+4PBTzsnGhuLdnVR4vb4yImHKklzZKW2dU5IPVnyG5Y33vQV3Ohkoo5SjWKOueWy26rH8gvrOegk355pGHN0qjfHiDj9hNCsOC0ZPzHqlY07zqVU2FwvfCVysUYTihnOAwjBh5wKCY0xaW+CzERjnznv+ijVEBbzFvm4cHLZz1wNgKQ4spHDs0d9x4FYatU3kAAc8Cc2hMa7ebaaZD6WI6lHeK0siFjQvkqcsvIWEFl+CHs9QoWvXk8kgFfsWjbZh2ZdkKJekIhFhLD2p9fmIg1qHUAX/IT1zguBJmM7yETT9hM977Sa+Kq/I68Gz6URG3fcel+Wj48GXFyGAqgr6s34pLH4q+LgPVdY0yDANc1kth+LVs6ygGkNC2EwI4wMR0I/yglfkBGHD8ug5JQWSyDvCTowQxNQvDXCNtjLxpfcK5lbO6NIqhrSBRinHwhbK8W9PqdtOX6lQ80nsHT61O+eJTHS0HKaQWgRDgJeC4t0JRHdJUa/MoHwRkpD9uI+IesZTPaOUZGQ8sxj6qKU9LilP+nEYT58UjvC3L2NilkZI5GRdsG8s0RO55s+b7WjyZl/BVgv6QOx8D4oOAjRhSqrypsfZti+lQCuNIa23D/VE/AritlMdzgot6pYwuyLDw7lucbq68aAtzQ5Xl9MN/REiIcO0LkdDli78GU2Ejib6ScD1y9JWgX2XtOgIT/awShFoNhh/xKI9w0Oj6Y8IHln4DdoAgPtBqdQyGI0f1Rlabuk4m5JvSWNplcRMenzIggKtlZZy8xldANJgWrYGsiaNjfJ
DmNidQ87IPRNJQz74ePQxwbhMHRut7CY+A1vtyYlhUgVVvM8Pic/etlV+99Vj59q+fL9/8xL0N4a/vWS1veMcpjdqet1GwpilTX3XNVPm+Z+3X2sV47t17cr3c/KcnbVTsllHBtCrei8//hvnyjMdJ+6ru3/2XY3rna13awV3lLR/ULnw1HXo/+NwDpk/SHynvbR9eKv/6xVe2Mk5q6tbr336y3HF0Tc+v7ZohcaEcnNtZfuA5+2VgcJLOxE0k8MVKYK3c/f4/Ln/ywbvLhemFosH8cureo2Xj0OPLNz79KeVrZDlcPHNn+dgHbit//smTZffBQ2XvxpmyuKg++FXfXL7zOdeX3SfvLB/5wzeXD5arypEdZ8rdpx5VXvDjzy5XbTtXPvfe28vFxz+5XDenj+L5QP1iWezgtzQsXvFLryqvveUNfrjykg2FC2UjbjNewigDPFC5/DVdN6m/BMp3ul4DORXKU430oraiVBUReIEaD1beE1bwlOB3BjSlhOdUoFyIvVPlptHgUQsp9pQZtPBFACVEDv6YmuT1EIJBIWi7QEmJWWWtBdMydOHnqeAwAG2mYM3pJF8Mi1n5rLPwugfKZEQGhQT6nq+dO07lV1MdyiVec73FLilQObKB7Kg388xRiOAzDYsZGxeaEiXf6zNYb8GoBQpWlSu44y7krVTLTbw1v7ZTbRvezCEtQGmnpBSBjJKXjnC2e59GuIcjnnD4mZdpGQdu3G2Vl/j2xdag8ASVSK/8jigKKJ0oHMEXYRzwBsN3fsBZoa59s5dxwMeXcMKUD7+0GzsPpah6HLjxPaJMK6yVPXs13NfEvMGneWwAMiJilCB4UK7K91d5G8MyMOriY0Yx8nA9Kgd/XqfDqBcjFb7HgteQdZSRdF0HfqDPfWrDIuUVfIUBEc8AchI3cYg7rDqg4HvTBvk8A9jt6YwWZ5/SaMUxRg2Vxn3M9rBHZnUuhe4tDHN2Y9qpKT8s1o5NF+j3ul+Uzn0d5eZzKKQVaXrWwBNly8dRQ1SKHeKhrm5yGum0IfwiC2TjtSm6l7nPlGzDghEKnA07lc1ibxsXOlskDQvKptT0LIOImT5BehfO3FZQc6eymwyV6xaBH8MaZfjJdJiDEr7TBI2vfmg+nK/QmC+Q5iIvEuI35AhAhoILuEyIyCWWtCnXTmXBVfSrSBr5rbxkWsPPBPmXxVWeawyNSmcE1jIIni6h2/HXisq0TBjjLZNdt648aF9CvwEH//C5GY89Xh82OvxXniaGRSdQBTEsEM2PfttBi/X46Y3ypvedtrL+U999uC2YZlThpt+5r9xw1VR50TMXdA9vKx/665Xyu39+qnzDY2fKP3naggl/TIvBPyeF/2k3zno0YmX1Qnn1W0+UOzWC8X/+U22/XQ0QDAvSWCz+AuFee3hX+ZMPnSnv+fjZ8sJnLJgmBMcNC9r//33z0bKiw3Z+5FsPlIMaMfn0Xavl98QHIyD/8nuOuD5mZvIzkcCDlsC95c/+w6+V9+98SvmW5z2jfO3VGnV775vKmz60Wq558nPLdzz12rLyqXeUt77zw+XeI88q//3zvqbsX/tC+ci7/qT82Ufnyrf+s39crlv8SPnj37i9XPPPf6g8ef3j5Xd+8W3luv/9X5SnHH9becOfni1P+yf/sNywX7pCfaQ/aNY2AdzSsHjVr/xSed1vvcEPUx6GVmZQhPWC9ZNeBK04VWXXhoWY4iEJb/ky8PQEFALTqA/n+sA2jOnwcBWOf0gIOrHIUsq7vyiGgpGGSzM0ah6wKPsmUfnkdQc/PvwKnqzI51dpbW0pY+KcFLMVTdFg7QVGx0WNqKDAY0BMsSe+DApGK2Z0YRzskJJmJR/DQgoJYeSAIoVhsobCJyUIZQTFiakeU1LuWMAd6z10yrF4Bd47UiFP4aMEsnvUrK69Ool7Vlcu/vZaC5XjXbBUH7/AVJ/ehfzjJQVfzbBwGLnUFxiyrXIBH3nFj3wRBg+XLztHNvlp9GreJS/MDoe8pJc+2T1On96hGibzxuGjG0WfSpge12VQDpeBQ3KE2yXx+yt39m1BA2VlVnD0d3+9BqfSoSz+gMEhsXHenC64HDka+AseQhEc5Jz46YNPmPLx2wV/Xfw8U4r4mi5l14fr1Sk7qpT7mUfJ6KP0Axvq0V9pZpExLf1SXHP0XRsGXTmhXAc/NjoqHxAJOpVPUbE8Ky5hDAs+LKxoq6qzGilc1D2Hyr5L99lejU4s6N6a1/3BrmguW/fPBY3KcP+4L9vgqMYEdOG30q8xpcR9gVFBbdyt5dNC2wTrPq+M6N0AwXetN+2pe5BpZIz0MEUKQLc9zwSMLP1xuF4YF5yBEfgmkav2zVal2cLE83Kx7V6gEGQZDh7TKVQj0HfQPKoWLjDKjj4FbL23Wx50gAmf36H/wYMzSA7WalmOdnn08XBIL3DoN5Vwzdvc68sDPkvsywYm4z18wiZlw0BDVw9HvXFmaQDO0OX9imeArr7JC+njvPV5PeHkFfiUVs/nON5IvOMfmpMD8nrJhmFxv6YljbtnPmFv+bYnzenDQ0j81g+cLu/8q6Xyf73oymZsgPPvNbLxhWPr5We//6rsKuOkyu2fWSm3vOtk+Z+ff6g84khMacKwYDrUT2ltB9OhcIyA/Mzr7ilPevSM13eQNm5YfOrO1fLrf3K8/MA3HyhPuH4PIHZvuf1MeetfnhkpI/Mm/kQCDyyB+8o7f+3/K+/f9bTy3G95enniVaXc9xdvLm/64Eo58qTnlOd/45Fy73tvLe/88LEy//TvK//oaxjJO13u/Kt3lz958yfL4Rf/T+VpOz5S/ujXP1yu/+kfKE9a+2T5vVf9YTnyUy8ue/7zG8tnvu5F5UVPOqKpgaHPPDA/W0M8gGHxi+X1/+kWP2BT2eTt4GkCns6kCA9TXdIPpADgc6PXaUlWyFJBUXL3cpGa6xudh3IYE/F4jm0vg2mm7bB4m9EAK/JSMvzwpjzK0pXGRWwDy8iElO/65Q4jga+9KDb5yOeh3l8sJl2VsnBW875XNJVkfV1KhPiOcyowLFhQLYWHsyi05oHddjzNy0p+fK2GD0pAcVpVeWuUyxxtlQXf3kVKi7gxKnxaN1OpUJT0x5oOT70Sz0yrmmYthwyKeebFY1hovQWjFsyZ90gJCqJgvVbDskbaIfc0JqxAuk1CKUuZIdUIO2T5VxJSOuMBDa0IBSyyMk4g8zviyEs38sLMxDE/eUnY9BOsp5dpvd/D97Dj6RmPXhUUkHdLb/1AeQJCgURRHJS8qBfwVi5R/sAxqdqfBELcNE0vyokWkRyVnzy63mQP4nI/dXqXFiCRAN3LXVb6a31YZI4SHjsbxXQdvrRjILt8MUKfoX/QNxhhwydPRchZMgQGl2XXMlIOF+vuSTZ2gDar/FReHQplMWWTdUA1Zwtc1lOsSd6UyvOCLWP36L7aI8MaA5x7mjvKa11UB2hzv7gU8ZWjCMpwmmofcjIkqYMDxyyKmbhLleA64wfPDTrl5JHImO7IfRF9g/6BsaN7W88Ln3Iu3jA4cNTxEr+VE3U1A
D9mKGMhKzNVaahhnJlgJEeSUgiYLj9kAMpP5tV0ecjXsIapoIDLGUqEfbfX/MxJNiKeeD0QdKEQbghF3H0uM8f87AsJE31wnMIYkqKGUJngJ66hqqxGMJI3y+pS2qSM1CbhKaeG+zLGy0wYyjT/BKrr6WZaD59p9ivvPc7EsBiRkEcsaJJ/qlEI/GWNMHxSIwDv/cTZcr22nP2J7zjktvz1txwvXzi6Xp77D4apUVD6xB2r5bP3rpb/43u1G163juL08nkbDkusn9DIxLs+ulRe8ryDnjoFHobF6vrF8i+021Pvfu6W+8o1mh71wxqNwI0bFn/2kaXyhzJynvt1c9rsZWjZo1pv8T5tTfuiZ+4vT7phuic5CU8k8CAkoA1Obv/98ge3HSu7D1xVrjy4rdz32TvK2fmvKk975lPLE68p5bPv+AP1sfVyxbf+j+W510NS51d94rby9j+4rez4zv+1PP/w58uHbr21fHrvY8vVO4+WT3x6qjzpG5bLhz/6mPLdP/zUcvVu6c4PgpMHA7KlYfHKX/7F8gYZFijOrGew0aB7ha/y/krql348XKUb+OtivCiA0VfUevFA4Mp3AHSaEqywX7sodeKYhzBKE45y07BAqbZiJHjoUA4XX/DJi92f4sA6GxaiwdoJGxdS9HH5gO996sKIxbJGK5ZZzK0RDN5ibDM7pR1pWKy9R0aFd4PSaEUuym7GReUD44B1JOy7v4bypfJ5rDCdijUZLMLGMIltc7VDlA0EKRUCko4VL04pf7tVBjtD9YYFU6LiVO5hfQeKIYLgz/KsiiMySzkhh5STBaCfjOPj7FmeNc8UnWVYZAVkwAMYefmbdFKmmX45P+EzfzO8cZiExe/he7jx9IxHTzJmU6YaHWWifKEbplGReAC3fikA0k3L8gg5kAaMlXy1v131EGyKSmCWG/FKxaA5kpH3FYluUXADuNU3+bJCT7nJk8LmQ33PxkVVfi+oz4cCngzVvqB+k30vCqk8waS/uifXIWvTdhmqKcaXyvF0KzqtQD3qYn5R2yuOMjLsilJr06cs8rjnw1EaDzP6rNdQ6V72hwT6tyDXNfWIcuE172t4QI4pK7dTlUdQHf3NGsWgQpVHMNjk22PQrzgoj53eUlYhB557Wl/BdCiMC93nflYhH/56mi5GP/53hmEop++3xAMUQIdIcn2Tb1IjnCkAjMImRGVitF7uTA0CZGO7TRQe+IGHEcqG5We0fICG8odQBUd+DXMIZHnpD+UOMFuFEm8EptatpSVfmZ7xCpC89vz1dPtw8pdpLV5pjdAYS0ucxtdmgbF2oQ/9+A/92GaQD0na39SuUGd1RMzs6NrmB12fzdZYgPw77zlV/kKK+j+TYYGB8cu/f9TrJq6vIw59AYxqvPjZ+6VLbLOxcMs7TpaP33mu7NMaCs6eOKdpS8e1APvBGBYv/U/3lSMLOw1LGeOGxZvft1je/bGz5VHiaXynKJr7+VoXMlln0bfOJPzgJLBRTnz23eUtb/lwuX9VH+Kmt2nDlu3l8OO+vjzr6V9brt+/HobFZ6ph8Qio6nDcT91W3iHDYtt3/KvyghuWy6nPv7+8/V2fKSfLTDl43aFy9uOfKgef9tyysHZC57ftKYdveFy5/qBm5uhe+W9xWxoWN/3iq8obfusWfeHkpa8vnXzJ092BMr6hT48YDjhumDzDglcQSrWn+CgfGD/b9UQHDnzo8CWfaQ7A8xD2OgwUF8IVAdjcbpb1BzHNCaUrKo3vUQ197WTK0rS+6jM6QDqKImsmvJ2sfMrIh33vw9+qlIUVjAsMCz6pCpZD8bimdcXaCIwWeM5F4+GjEOEwYKCDkbKmxZ1pWOyWcsJBehgmeXGmxk7FfRCX8P1FXHVn+hRy2aOD9PYyHWp2r6dFsZCbXaJyzUWut4ivzqEwonTltBFkMm5UZJ2RDTwn32ZeGhcGCVK1TFMtIKG+icklL+DD2+y3L2ezfNKgk3DjMK2MmtHDjeeN4/aw5BF3WbUSxFOxMKwiSnJfoaKEDaOADQa1ScYNXwtEFsSBYScy50FYaai7dvKMi8EM4eqcZjjuhbifwrBI+YeBbgMReQ+oQU/lYVSkgdHoce9wr+Hnl3UMapcVSriE4Zuwl2OQN/OmP8BoVAxc0WvTwKir7n3TVxjYbYwkqD+FnKOSwVOyHrSRShSv+8XwQZ/6AR/5PBu4r2KEkjDTu1CAuIhb9m4XpB99CcJRpnhKgZFZHbTTtbDZos3jyvz0MfgxmqgfUx89oifYmGrGdCjd4xgX5oV2rwUmPbU7tCM58hPG8kcY1WWo5wWYTM/6GbyrFwA9TCsreYBGRwfUhDdvlQfCmU4ZznNhw4/pCD7z0h8gFKr0RtJIrgmJkz7JlkXNx+vzoJe4CTKe36ePwCYvtEGGO/o9bE+zD/e8kd7iHT0xnCzYT3zDZ04Pn2njvmDoHz/xFWhY3H7PhfKkq+I9OV7tB4pfzrB4n86OeOO7T5UXfZNGAB4zXX7jLVoncWyt/N9aRL2VY83FBzT16Uc1OvGoK8Pa+fgd58qr//TEl8WweLumY936/tM61+KIDZCteJnkTSTwoCWwfld5+2++vnx41xPLs57z9PKEK7eVO2/7L+XW28+Vq5/8nPK8bzxQ7mHE4tPVsHgklGPE4h233la2P/9flRd+dZYmXXP5vvLRP/z98s6Zp5annHh3ec/ijrJrfblse8S3lRd8643lyF7p2wn+JfhbGhbsCvU6Ld62wioLxgqrHvUoSv7Cq5cqD1uUoJ0oprp4zrIgGYUeZSAMi3j48pqAlpUHT1nS450HqrJzDrbPt/DDOqZJsPA5jAvmPetFz8WDWjh4GBs2AjQaMMuOTYLHeYoT51RgXOhKBYA8HvpwhNLEKEMaBOcEx8gD9DEo8mK+dSzURnmvhoWUHMrytCTRw6A4py+Z+EyJgj4KI4ZOjFhwyJ/OwpBB4Z1uWKgquhKalZQ1KVDr+gJ6UfLBcOD07VmNVOzVotZ5GRjze+d9WvKMpkexABxclJ40ElhATBje+8tvZpipjrzEUbA61Zn2A9d/yJa2yXyCXWRIdsiwY2lESc8X7SbZlyRdjk5PI2HGaftFPlRopFy3tmTQ2l2t09Os3S3yBUdfcR9HkXZm4CbDpqN0WtnbwtLanYxDTeMeCToBU8sHT1fQ7WUa8lYz+B5hJyTfd7RJgrmY4Anl3nTNX9yP9OeL6s/mT2HuP6eJOdoPORgcZhvLybh82suVjAIxGKBlOtQWvq0w1zCw8Cc4+jEuyuA3YspwKpQJWVm3wq6+Sg40YUpX5udmDYxEkmc5qi52lKe08zLe4YV+wH0AcpDhw0B8HAiERIN6DWeAYiWndOajRrIGEPYHAD1nPH3R/MjQ0fONM0U8goHcByIuP2nBa8oiDUzznLJWJm0TDlm4KvUnMVt2DQBfAUHAiV5zmebkbNPIrZiWa8J3mE5K3h2pdC8HQ+ktb4wH52Va5WmEdmUg72miI/mblL1Zvsvp6pyySL6qhCBeSxz1RmiOZl0+VnkD1+UQVzhppQ+BrB9+
n57EnWZ6Mix++Mcz+SH3ua973sYLTH59/6mPr2qqMNfBgwfHQbeM37VcyjVf4nETlzMsbnnnyXL7Z1fK//Kdh7zVa44c/OR3HS7XHrr8zku/8qaj+uB4sfxvLzzSeH7/Z5bLb7/r1JfFsGCh9q9pWha7TD1L60AmbiKBL4sETrynvObff7BMP/sflmc/5bHliFTHcvQD5fff+KFy/OqvK8/7zieUjT+/tbznY6fLwWe9uHz7V08J4Gy5+2PvKX/6nz9S5l740+Uf3xCcXNg4W4594q3ld9+1vTz3hfvLm//tx8ozf+aHy9eef0f51V+5tzzzJ7+7POHIfIHCl+q2NCxeqnMsXvv614fCasWTlwgvqvqy1HMaowIFiMPoeMLy8kSpx6Dg8lfU+kDnQZWKOcaFv9SKcx73aViglFvxUlqMWGgqkZRon5yNMUI5aF/OlxImOijr05w1obUQTKfgQc0Cag7A43Ttjc6wyAc7ZXAxXYodobw7FNNHRBceZ+qIxR4pN1bYqb/rGl9UMSpQfOCEr9Ys/l7RrjwYKTjXVbx4VxvxD+wUBoPozohfdpcKw0IjQDJIPLLC1AqVgSKzSzAzWm8xx3qLvXNlYX6fjIx9ZUZGxi6t1chRCy/GFb+0AWVywZSVlRCT+akpzh8MC+DJ1suR+oHvGgX/kZeYlUznpSxdptLTH0AgvvnLfIAZQpfiR16WQ2wrmPG8xOv97LuZlqXTRUlDJ6Uv5qhA5qdvmNp3TMuI5EJAXgpN6YxauZ9ZGRe04lxBP+SCgR7pA7rvJ/Uvb61c7xPqFioieKFsD0q+UjCCdN9ZUa7lpELQrwNwWapfuGyf4AXexY6oR5y+5bQqD6dX2vIMBySqFfw5TEaE/CsClW/dtkBiVNiwID06KDzJerAcXE/6s+8v9owPGJ4jyFJATvM0JNeDvhujJpTluiKLVsehpuTX/2ARTjs48xHg5sVxyuTe0sV9aeNCMJxzsb4mw4KduHTPU2t4g3DDI42cyAzPaWKj1guUqGH8mkYNVsQBz3QgEPUI2i0xKEPXfGQ02iVi9Vf5PVYrrmV3uaI3ng9YygpI59f6mEQtP6kkfuIYZuxnkEdiVYCu/M3wEzrXzjdmK95mOGNFt7qMp/dx+Bun1cruARUGLmGzXmMgLZpw0EJOP/EjX3mGxVl9E+iWN7S6P5gAhgWLpp+nhdo4wp+9Z6381edXyuOu21N+SKdv487qPImb3ni/1mnF1rFX6WwKtpL9vM6fOKX1FN+hKUi4N2gaFLtFfc/T9pXHXj1VPiVD4A80wgDsl2MqFGX8uz84Vu4+sV6+7b+bK4+9ko1atpV7FP+oRkaYktXfKsBP3EQCDyiB0x8qt/yHd5W1f/Cs8uynPb5cq2U65z73rvL7t366rD76KeXbv+2JZcfH3lre+t5PlpXHfIfWJD2qTG3cUz75vj8rb3v3Rnnyj/9Q+cZDKuWiZtWc+Hx555tuLeee/iPlOw+8u/z8K06V7/03LyiP2f6B8us/87HyNf/yBeVJV+wrw9YDD8jdJQBbGha/cNPLy2ted7MVa+6G/iHJQzAU7fiKz1QoXle51oCvIVagUmEAXjQ8WqGXNMq7H9ZKN54exuCmYcFTlnyUq91cTB+qSj6GBTcnhgeGRJ6KPTPNTRxfWVlA7ZO1GYVQGF78EK8PfZQU0hilwBhYQ7lHq4KmeLOxonMnKBs+rHir3J0cZKe0MBjiSylTmFijwSJwpkSFYhgjFRyiJ6aovWlhqOSoBVPMSEdWPt8DhQgFRjDsTOOF3FpvsY8Ri7l9dcRir4wSGVAatcC4yPnoNn4QCg1jVwPNi/ajLsgtjYuQMyw2xKivMlzvJCc/X4KkZxifOM7wNeyEL+InaTwgCu1XgXqcno+eBumNV3qakKEwDp8Kfo7EhUGQCnjUzbREHIU9whBLbpRBMOuvdNPAr8ZD0rTCX/Ppg/R7w5gANGLqIX1/j6bh7NLXcvoUbRRqYjUsRMN80G/hSRcLuYMj8hRPXqsPv6T1slOJUQ01I7JJPrlXPWrhfHKAq2XiV1gFjV8hgIoEhVQZiwSx0EvcR7JvOdGFDiMtwg0lXsaFDpo0D5IHBeTICTRsaFAn3Xs5cueRC/MX6VEpUEXTpZudDFU2Q1pwCly6kKvoQI/6qEz42lFHC4mva8RiQ8bFhu5947psQZsOtAZ60I10QsgkOAo80gJayZ0L/FEqZCdQljWkOFdEGk5Xp45wy2+ULgPX95OB/47SKMORUWmZh8vQBRB6PX3Cm5bRFTeST9mb0O9pjsAnncrzA9W9yVB4wF6OVtJJ8vibwm6RnrhfiYYFs4u/1M1mfu2Pj5dP372a4tGzcFs5MLfDOzN90xNm9f4bpH/fyY3yRq29uENTorJbzOnksGfrrItvenycUcGi7ZvfFmdMQPTw/M7y/CfPeyrUj2tL2xtkbODYTYrF23nitxP1c9Nv3++D99KgYbenP9VuT//mf7hSHw15n8cC8zfrdPCPaGtbTuXGcWbGEx65x4u3nTD5mUjgi5KADOI3/Xb583vnyyMf99XlUQcvlqOf+ED5q2Pz5auf9qzyzCdeWXYe/US57V3vKe9T2td8w43l0PKd5ZMf/a/l2MFvLN/3gicVm+DnV8rS3R8qb/rLfeWF33Vj2bXxqfLGn/3jsvO7n1e+ev12TeM7UL7r+59VHr1/Rmc/feluS8Pi517+svLq196sF+rwIuQJyyvRyqle+F53geKshzUKCbshMVLBlKY2LUP8gQMdH2zHC1rKO47bji/+gQt+fVmKnkcsWJ/gKUQo0TFiIcrG9boCKV2xnSuHgWleGEoINKV0eNQEo6IaOZ46oTwe+i5PMDYsZBAwcsFkC15KKDPsUoNCt0t8onOnIu7F5HxN5eul4FA6MApi6hecXRRejFAwN/u8cDdUP5THHaI9JaOB0RAWcXtqV+WXqVaUu13pfBllOgj1jkXjOoFbxsS01lnMysiYndmrBeVa0K2RC0/TEg1k5Z2iqLwc8va/0jNO3Xqjgni0LfWuOOCqwkxUIb9/QRIm7XLO9Lr8HncrvM3o9fDj5fZ0e9xRHHIGxSvhaB96z0CDOhJHCR+UavLzAhfaiZeGQdBAaQ3ZOS444g1WdKFPf6P/ud+l0a20wbCIPpKjHPQ3+t+0+oOn06lvxBqfaEP4Df7CF+Ggj7JNWR57Ex+6D21cMHUKRnDOj6AZJehKMKYAj4Hjlqbete7ABm1BmYZga51MWniqaaUPcBCmiyM/9yn3q8hi5EIZANVnRawJgRlGBmxg1/sM2Kwz/FEHTweTLKERoyBRQhCUHGx4BKzTgHMgfpo8FHX7UidXBPIKg688ytUvUGWb7v3tGDy6P5ErW89iXHjUoso+YF152Ky40CGMg19F+B8SSR4csF28h+v7eUdU8CMIjdYYKadDr6fT02+IsNDRvBxMD09ZdpX+A+H0fGQbgN/oBLX47WWllB4+wbbit/FS65TS6nnYtFyId2WPy7MvM/kIlFFqrfweaCz8lWhYcPfEW3mssg9RFGWeQ+p
Q9DmHYjPHidk8jzkED8eJ3f2CVd7XuPHTsknnKYLKg6NbMPW7x40cnvk6xEx8oD/Ma8gGf+ImEviSJXDu7vKBt721vO+v7ijHz5Wy59Bjy5O/6Rnl6x93TdH5i3IbMho+VT74rneU935aI307F8pVNz61PPdbnloedblZeXq3LX3kjeU3/vAT5dTGQnnid724PO9xB8qsRv7+W9yWhsXPvuyl5T/e/JuhfKoUHqA8k6XGNgU1lONQPkJZr1OgRl7qoQyxPoIpUxgVrAng1jWOjBFu2FSqqJC/Dgp2ylOSNC1IayhiTUMsDueGpup8xbWx4qkTjC5IuVAG+dD2yz8Ni4pDpvPkr3ikIQ7Hg6CVavHJSAWGDPxSBvzCd+4G5REX5YEzKGLUUwqhRxxkeOgzzbrksKopTlw8kKCLsoiBwQgM8sNnO1kMBRZ1sxsN9bCapLIvUr6MDM61WJhf8OgFi7ltWIgnZAVtlMJ00VZISCzSbvLhjbCNJPGBkTFuWCQsvpFE0nQraaeb6qU/xgWvuv5FuhVewvd+wrtYGlMu0wj3tImnSxjyA60yXgGIZX2CdtCKvgJOGLlJP+iEEuaw8K2oizg4UEO61Jr8Vj5wivsPX5e6gvtKnjwfRgUGRTUq5HtjBOgIn1EK9xO1lXcp0wjVbo2i0SfVlBRNobU+8nUPWRmuac5phkUqx4HnvIavNPqFLtJDFjJEsojqO6HmhyyoU8ATl3WAcOSHQk4Hoh7IpF2Kk+4M8ukv+vc0rsQ3jvo/RoXqjAEf07KoLmVSBnXV9ChdQRE60bcpFbqxmD3WmRjIdAVXnXmuYeAzjj9cqgv1oX44wbFjFLzhI+8wLnSuBVOiDJeSM5tCCtwkYTr+QWZDLGQCzpBo+VSQTHcNxEeQGGAd3yQdiKHWiaZU6hJR/yb9LslyzPhm+ZmXfuNGdehlmvnjPjSzjoYHQGmNjqLJ4wOWv0V9LsHdRE6t/HEmK0+ZvJk8sw4JEyh9LUbbtYfrw1+JhkVfv0l4IoGJBL7yJbClYfH/vPQXym/85mtC+ZQseMBbKZVWE1MzUHCUWh/SNg70oveUBV6O9eKhCx5f5BMPJQYFBmPC52J0hgXwwGFI8MV+ui7MZktZFG4rYoyIQB+eKn3KCEU5lCQ4dhl6+aP8+yWmNOpBGDociucpUxrZIMO8anjVRoWU/uTZxpDou/7Vp95+oQiP0QgbHlL0w7CIKVCs3fCOUxsxPLtLuEyH8g5RMjDYMYqF2hy+x2F4GBB8IUaWG4ykyCBZl8KFwbGwb6EcOXDExsXMHhZxS+nCsEg+LI34MV8KUltlV188WilCUYu6OM/1HmAUrUQyIF+iTpqkZjjbINMynTgKYMbTJ/2Ldfl6hq+eTl/2pTRRDkkFKykMIVQX8IHxNCQChGvN7l73AABAAElEQVQ/SYw0IqCU5TWYQHEJ5De5KdyMiooHrprUMoHm/8/em0f7llR1npFvuPO9byTJZFRAEXBAHOh2akVLq5UqFVZNqL20RECge1X1qlr9R7VWIaNW29Wrl7W6lzaQibRlVWtBOaAWIqIiYKmAA6NAQiJDTi/fe3d8U/b38917nxO/3/vdm5lglj7yF/eeX8SJ2FPsiHPO3ieG400KiCcO9Un161r87/qqnagzzjNrc9hQYEnbIHMe7RqMLRuyi55lN78wwsvZKPmqHsQDPGREMPQy6gFqUZfUHTjQVmXK4B7jcCygiTag50NnBNL6GdNOJV/RFFE7CzSKXxDQV9Ox8DqKxLXcVVccC3AV+IUH1wTXInJ4BytGNRIGOMsBfJdHfoXIj3oGDckFPwG4LOtx+KimXkpGaF/c25WDIecCXg6BnyeAdKE7UUGdoZlKG2GGruBf7WiCCdMzGGgkxx6eukPDMClU6aMTcCIZ+pjImjxBhqTZ8wJoGrfOp3nW+SB7ygaNkpk0oWiQHvCA7/WV+MD28OAcFIpewQzyVEbFnXyVdX9j5Or5wes53//s+0vmM4bnOV38K+6JkYeM3K84PtPF2z3NeXqugbkGPvc1cKBj8aKXv0yOxc2YCLpppwFfToVuOrwp982c54p05beX+TDXHcllPGjKOPebR/B4AOiYNqqML0oY6Rj28W0KvcnXFCe+AUEeuBg6HFhq0Ld40CSJwSwZgTMf0Su6yISccbOUESdnAsOfaUyXcCyMH04NIxNxaEpUjizUKAs0IGQecqww0mN0QziSsbbLxMkZHQttTSn+0PDULRmJfCMDp2ldW8uuaGrTsraZvSK5L8pYYr3GBU2xwJxh+sW61lgcP3aindzQMJVGLnBGWGMxOgjUv+qMgGNATkZdDl3HvPVy7nAuAh7VlUNWOhuxM0V9SaJXRz5zYf/gDnxnDz993vTDtIB6GpVXMfountO0gOnzOJ+mVecVG0Z6hS55YSQrjbGceaQoK5xZafdBY6BHZAzDnCzazTiix0OZIHJO4zT6YS1jlWsARyIcjXB23V8FzBRBPia3hxMt2qtyJPlC9Tofa5RDyjC929DaEXEYcMAr+SNjGPrOpsBlwFk+rlfSWWxcnZMXR5RP5BsnRw0wuAUrJnmQhlr0D5x+wtBGM/qPAYwmGqINPetTzgGOhb8pgSPFH/giaecOWF0rtFvwDL705XK4yUfX6GBCD2Y6/rgO46llwJmwY5IyQcs6ExycPGVRbcK0KD4cyIsApkWVowW8hQXeuDAAs9NH0nRm/0MlMwy603nJ6VLBBDUXFLjjaZyRWoCZTs9jAruDUbJ4ToP0PCgbZClA6v8ZhMIqmeFTMlR8INmq17Ruyb8Xmfo6lRzgkC55ivdBstBXIwxUgkbJVkS6GMi5Y9EpZJ6ca2CugWtSA/fiWLy0veo1N+d3szBMw+gOgzSmAQ0PTlWfG21vyADPjRpje3hTzo1VB7AYThhVTP/w14O5gevuWqMVfrMv45uPxrGLEny5+cLDCUXMCKl7ODTLsYBGPSR4LNRDAFTz5c2wHAo7FUyV0jnBdWR6kmRmNAHnhq9v4yxgsLA4uwxBHh7+fofg+c4Gi22rrtSXt6572kaWj+axFS1yk+cdojwSw5ayMhS1reyKHIUFGSnAMEqBY8HOMxgtrKfAsVhb1ZazrK/QlCk+uOfF7OgWA0wyQHtQhmujylrdTCuRbIJDPowu6olTgo6iSaKtSA+h0iLTh8IZeKHUDGNKpdBWfrUDIG6jCSaRl+gzI2iWKME7zmbTgkQvRdB3TspJuvqEs/Rj21Q/LtN5lZNRafgVHacTLuoXdQ3ugjNN4jCCg45oiQeGbkz760Ys1P/C0Y5riD7GzmZb6m9b2mCABZBcD6fVV45pk4JVXRPemUx90n0dxlJL6Sn4hZGOLEOgPjgDyhjexpM2TMk7yl8OUMDH5grIiWHP1ehDaRGDiI6Bk2Xp28slM/rEgIH+cRRSXnA9YqF6e2G2KHIPoN8CE04F946p6U6C4VoADo2Eg5CORdIeeE4linfhTDoWoz4h43uNrlk2UuB6Qg62oGWXNzsxyatTyUTddGLuxXNClKlrBF30cG5n8oLChN
6LTvUFYCpNmXFcgS43ZYm+TFMeIFsxgG4nZ2B0hUmjy3Fyui59eU+jpCv4kqmHv7f0BA6ydjJVWdWhYmgOcpQelFfyFM/Cr/OrYhEpnFr/1vOYhofn3LGY1sr8fK6BuQauNQ3cu2Nx802+GXNDZCSg1lRwy/TNsn+w6CbsP8WYWWW8ei2B3rrb2AaPP8FgXOFU8C0JDHvfqHV3hQdrEdgRh9EKf6BOhhQ3XmAwuqDPTZv3mITgjGMQ07TCsB9u635S9PjwHJ0KtsXtHYvYDWqBL2/jAMiYw8nQU7RtaerULtvYyllg/SXf7/DOVZK1dyw8RULwjD5gIHLAwXpU3RbkHOBMrOjtM9/fiN1/Dttgq7Um1NsLtlfW7Vzw/YrYCYo59+Knc+qJ88Hhed+dPqwYaQZZWOTtReU4FelQVAwcBhshHnxoipP8ydPwMAPGcMoHpIqBHh62KqAtCMAEXZ9O/Azwyu3TBTSN159XmhjcWfjOL2LuJXFS8MZRR65zOxjQ4o+TDMP0J58nLxWHs2hFub8HvyinTP9uU2KCR0V0Auky2j1qof7BtRBTA3VNqN/gkJ7TdskcF7S4cEFtd1KO6PF0LJaYKqh1PEyvq7akGX1lhEipk2JOW4VsyBn8Vaa0IZyHbHEt9iMrwFAfXzf0ZaWx2/WuXs69rkedD1t+RlWHX9qnjsqstqtzx6LhkYVyUlSTWGfBegamQQad4drGQZMT7rUNSlMn0x34SUCl6YC1FS8OQx+m5Rj6Ac6TnRxi0Tb9cKisL/GCtkctdC0yVZHANrgXtNc/X+eG1r4hy0rvM+GQXWGoVwJl05p/thwNO4QJeHSCAhwGaOOSW2h2hHQ+Sx+Bu88vuu7pdHUurmCWLqbp91QNk3XucXv8okPeBC3hDWXUudNd8SCvYIgrXddOwVV+nROTVzSn8/c7n7gekIf/lKvHIW2eKps7FtOamZ/PNTDXwLWmgQMdix97mUYs5FjUTZUbMG9HHfJG6bulMuKRlQ8ulfFg4K0h8INjIcOIABQ0Jx0L5aUhxxZyGOnsnIRj4RELGdEQ1SPejkUYaBCLR2OIowc9/ITLl8L9wDBHeAIaPMo4Iq6do2puNM4Q+F4sK57LGP4rbO+qj+6o7LyMhk0dO7sXPFpyVJbcooweFpnXaAUjG3qdacdL22NrtykZZeKPPBjbngNuGRkVgZfwbSTGtKtFTY9alDODU3H06JKcgtzPP59ULN5lxAK4BY1m4KTgZOBAjEaEGOaDbHAscqtc64U2QmmAJVydR6Z/B3rx4BsfroYtqyTIBELlmW7woHikzVm0fyCMv0bN9qzcEa9yRlrTZZZxBHPKbU4q6YZ44u8s/fLvI2QqZwCI6o+BjlHZ4WUdQi9onb8I5ilY+mrxLx4DPyViyhPGMSN32RedZrSCUa7L7aycirt3L3rkAhNxXaNn6zJiV+Xori8cbqv0G49WRd/neguHnvNJuVI6y2RjWUa2Y1cQPcS1FVPDKMOZCBhfedRJ5xdSRvRxVJ3nKA6GKo9zEQqNPqVft/vQTvQ5QVhP2fesD/34tNpBRrwbRfl2lnV9OYaHrx/VUX0dGJyK2JFpcqQjCIqfr0XFA+0RjmoPsiG55Q+H76oRCzsWtCcODAd1hT6yyLFXm3iHNvFj+1lvQ6tRx8Hbgr8x9KNAun6hQ4jfUY6ow3g+yFqAYKTMJtD/ZD44gMdp9F/AyK/6TqKN1zj59wbT06n6Rd06qvvJKJCqE3wq3WEO/KvsIHnM9wBes+iSZw0NOu2hJtOzeE9CzDgLoVQgBjN4TNdrvnh7hg7nWXMNzDVwTWngXhyLl7RXesQiHgCehqAHuwMPLFsDccojy/dQ8vhXHIa0DHXeqMsQYMoOgRs0f16grAe2jXu9kS1DDqeArTaZGoTBHo4F6wn0MBS+pyIJD8OHt72i6IdDrc2AH47BOLoiY0E8bSjBTwYbTgXGG2kfOsf4YGoTDgLrKhitYERhZUUjFpJDlWrndnfb+R3tBbyzZ2OKt8jMd0dW8Nj5Cmn4Z70EOzoxjQNZWOOA0XePh3rELPVEfRdFA17r2k52XR/EW9eXttlilvfB1PcSb2b9teEr4XRJtiUt9l72egs5F3IyMGwwvIZ2cRvQDlrsK9mu+uYFjaEAvP4Dj0SGLomKLS9FE/k+H3EMB5CC6VbcIdUDOuLgDTws/BuJkMd5kz9Vv4qrtOhOn/f52UvNyzYIfVH8gCGut/U+hxCyUKa/cDpIBTz5OA+qQcgKgSrL2M6FM/lJPMfRJz2tqHMswunVBxdlMJ/VyNidcmDvlGOxeRH3NL7kvqy+sqaRilNyuo/LMV1WX8dJp4HoZ4xgsE0y/YqWqfZVV3T/4yoKx1zXkIxkAujUz9ei+hv1d6tiRPsaC2Of+rC9ovuk4iNqV5xrnItDlkGE9B/NHXpxO2X7Wx4Ycm5Q/fTBggR/ALh2MNzZejbqERRY/0R9eSFwWQZ8P4UKckPfEB+Mf5/DL+vT94mCjT4g3UiGaccCB0xMsg+UfGYk2nqBwvXPyIViT4mSc8EH9KCjH0SaCAN/K9k/oe+EqvIes+ScILTfCType19OvfIcWsWjQFzW4Q3nBZAxeNOyDHQFU2mDQ++AMC1HT7eXr/L7PMjW+TSdA1i6qPA4GbQ0oayrKfQ4V5cekJM6nQUxXa+5YzFLS/O8uQbmGriWNHCgY/Gil73EIxY8KbgB1sHTipvxcK4a88hiNyOXJWw5FjgUOBacV+AmjSGDYcU0Ifae5hxedixkOOAcYLD7kGFfoyWDAQReGkEeacCgwqkoQ1/4YchDO94Mw++ijDSmQV1k2sKEY6HF1aJhx0I02LmJr1+v4lhIDvwBnIrz2zgWuzaqGGnAAUFG6iiGHp24wkNKaaYoMbrA1CX0dY/qidLZCQtjBUMNvCVNb+EL29efOtVOHDupfa83/F2NPX3Zd0s7zly8pOkVkldIMcKhhd5sObu6uq5RlRWPXngxd+k5HYyYvhb6dztgZHHQVuhbodqRLJ04L/KjjDZxyKIOJLK7jHr4QjPM2pF+UrExUHCVR7/pQz3sLWdfkOlZ+T3NWWnnJR+qlNVPI5s+gnrLaEwjLPskZ/Q7aNSBKLRfyToQTBkNB59BgclTfOBvp0L8iNmhhb5sh9cjFRfap3cutFu3duxYbKnfcP3grC6oX23Iabhx6Wg7LediRek94V+UDUv/XaI/6bCjq/pSZZzyBY0ELqicLarJQy7rJNM4+r4emAKkPD5q5VcB1kHtrBR6kjh2MGgHRitwLjwdypipE5VR3rfVkIbmrGBeKnO5Yvox9w9dX3X/wNHhA5LQQn47Fro2qi6QHfjEiWlEnuSXrmJq06SBXPjEPnyNxjTNmhZVbVkwUQXVEX3nlCgcDUZSWG9R37dwi7tOFm6QFS30stIuBNOPpH97GMoOCqYh3RAKzxjgFS7llRbcUB5I/DqYV8JBaxbvmflTsFWvotvHRbNkLXr75Rduldd5HxetPo904Tju1dgJOAu38Kbp7XdedZhZXnqXjsw22wIez/nHz
5mJ8kBkzneFeiC0Oqc518BcAwc6FpNToeIh5Ztu3hBJx01YD5y4QwpI/xgZhpFxqbicihEexesBL0MGo8pvSXVT5X7LrRbjiDevZeA7xkkQXSB42+ovVYOrA7rg1PSpcETiOxGUYUTU17djdCRGKS5oUSxOTU2Jwu2xYyFngClK/jCfRgLYCpY1FtDaZLRie9uOBTs8MV2LL3/j0GCw41zhVJA+LJlZ+I1TwWiCP6Snt6sX9CaamzqBei7JcVnfONZOnjjZTm2c0IJujUCIHvC7cizYEveCtrL0g0A4LMLmGxY4FGtyLFb0de7aJQo+6Ps6jY5UO9QuVZFfbWb2/ol2Cd2TJjimHvrjP/Iijt/uqSycBBkAwC/HInBHiP0e0lBM9sHX5yPeQHyfBDyhPU2/zh2LXElOf+OM/qT/jDEdg8YEnoANY/qy4I3paPiZlhR8HI9gE3HNu4Y3TkytqSBdo2hMgfqEHIqPbu60D5xTX9M5hjzrjnaRUxxX5SQ8Zvlou1GOxbLyz6kv7wqIETfOmS4HbSTlGsTwX1N/3FC/XtfBOZfToJOUB97IgWuBc8KHn/gMpKzxOHjjr9bhuMJkQNHwNS4ezqViwOsMfVgnlOlwqJgTw0a2f+ucWIdPoc+oha6HmOonMSQD0wPtaGDE6xrBkK/RF2iZGwSSH85JXQ92HqVv4pQq9JCioBMfqjPXXL0E8OiD6xZ6s+6AhZ/4xMhKrH2CL6MobEMba0DCWUsWY9TrY8wdUsVj0J9K4FeB8r7M+VnvIT95FK3CdXnKbzpZMNCvuhXC/YzN737iAI5cM+ulslk0K6/qWzG0ZtEp+IqB60OPTz5ws+j0OKR7vKJdcV8mYkYtPY99sLXn/uDcsZjW6/x8roG5Bq4tDRzoWLz4FS/ziAUOQJgSMhBkjfhPN3/fLOuhRZx3SGAwKinnwCDA+Kg3jqEiGT0YdHq416gDN1rgvcsRhrmM7nBKlActLCGYCM9f63Ys2QQb6w40yiBDH8cCRyAMaT0YgMNA0JQTjA87FzKg7FgwciEDABlkeuRoSTgl0GFLWEYtSEsEORWMVmy3LY1YYPAwOsKWsciMfBhyTH/ydCyNsrBGgx2jqD87Q+3t8d0MTZGQTN7xSbjrx461E8dPaKRCToXWTOAgXSc5GVG5CI5jRjnibTp6XNQe+qsasVjT1Cm+f+GP5aUDEzqTztG7jcPQBXjRZtEC/EqrzgvN6lf/1U6GBYaKE4YojClnZV6kxxNweseiyutBy/l0GB60IoNchIH3NPC9nPd8Kk1sHilm5dMPpe7BgLAzkMZEwRhXeTGqNta/xBhrXjlJTxzh6UP4o2OhfJ3HAunoz/TLXbX1ltZVfPjcVvvg+a32PjkWOAj0fxyGTcm5p/Ml2fSPX15oj5BjgZNwp3C21L+ZBrVMXxT8jvoLBzIzynFSDsX1gr9BeCu56Dsk4zeuxx2uCx18LZ4pV4usd0JoyYahzLUn4j5ihDL6FE62aYlXGU7oxEfCoxm3J+cEwZZ+IyN/kwbXCAQYAXBfxnnXOThce54mpfraqZDeGFVwCLQJkh6lo//r4EVDORfIDLv4CdogDlOhRLdGlsqxsMwpey+/eXC98X0LHCEJ669y62UCH9GDzVWhdHFVQWQUr/466OkUf2s0lDNS0nnhGSdlRonT8EWn9FBEyC8alXdf44HmfUWYAdfz3o9e5RdsxZCbKHNDKy/5VNkMtnX3gYJpgDPobBYC5dmWPd0+XeXouGSY1vdzf/C5+1D/q8/2i6qUeZCtY0Me8nOtcPxVfcfiX/2Xn2u/dMsfdJzmybkGPvc08G0P/dL2wsd+2zVZseuvv17PMU3//wzDgY7FS37i5d5u9ooMDW6API58A9INB+OTB+dwgyStOy83X2D6w28cZdj4RlWCcnPlpiUDxg95ESqc3rFQZhAVbATf3o0LPrnQZ+tN3vKjDJwK5mDbkFa5+aRTQRonwuss5FTseStPOR2S44gMGOamM/pwRDTYFYovYq/KsYAu8m1u6Q2yRiy2tNYiHAs+WKYyDBaJZltIDhFrQ5aFx2gHIy6EHTkU8UG+C9blotZQrK9vtNOnT7djbCerkQe7JtzIJZudIW05e0kye2QHuXUgB1vOruNYaMeoxUU5I3IqWN+BTDgsNd+bRd3UxzotXTqmNVFttpUljDYoBwQ+s4Kz00Kehqlz6A79Q/JXPvRoA0Kfxzm5xbEv69OG2w+/yy8ePTx9rXoR+Q7KkEolE2f5ENWJ/4iTJvE4FQoHL9Ctwxl6Ap6/oBp185k6CFcNaYkj3kGXa4CvtLO24u7dvfa+s1vt/ec3219o5EL7kclZOKQpT4fbWY1KbIv2ghg/YWWxPVx9lClIt8mx2FQfXxTMomBFup3T+R2ityOngN2bHibYRy0ttMeuLbUT9E85F9epHakL8lC/PV3rHMiyJEdkSX1qARgcCw7BuD2i4kpjrPPiAMeCALFQqEGyf1XL+sWE9IVTYh2JT9GsdubcFxIx9ATL9eUF3IotcObhRHtak5wKT3ECRSEWkkfa54LXxWE6nAfvqI8dGPEyX6MoLbnCoGLEAhnzSHmNb/mCltGQSXpgmhZrLRgpxNnBqYj1FiFcimgUfqrepEcZOMsgPgVDeaUptaNLYiqfLAEa1vx6WTNfSMGvBOphjE4L3rcwLXcvY8+f/B62T/ecevxpnILbL3/fcuon/gT4Fu+eF7JW3+Eeho4I/EYq2msCJ/VmwBk/xYeikaJOkKHgk8bznv28ynnA478ux+KFv/t/t9d+8C0PeP3mDOYa+OvUwDMf9tXtR5/wjL9OET5j3jfeeKNt38+UwIGOxUt/4hX+8nbMSe5uxDyg9QBlapJv0/oZbpA8tCSNDQGlYxckHujxkLOgAvaNXVjxZjBpAy/aGOJMSYIGQY/4eAiMTJzPDzd4pv/gSOBY4BB4pEL8PMqRkBgPPIQ9YiHjIBwLto2NtQ6QxrGIaVdyUHAqZBzUiAWOCzCMWDBasa2Rh4LFicCp8LQW8WAKlBd+a0rTikYkPE1Kcvor2jIyiDH+2fVpY5XvU6yZl0SWQRPrKLB20SMPQkZAxm9+XLaOlrRb1KpGKhZ4O4qeABacR0rksECbxd2LGgHBwIm2ijZAZ7SLIuuPtgSflsMZKMcC9qbrRJSPeRjIUV4P2YojG16GNlAl+4esCQxtKvqJUHFhm16W9fjTcH1Z4VYebEjz54ZMANeCLPcNlUrv4UBEnyvxjFswNjKDAPqaDiPPwMbIr+Ay0SGIVfDNvsk0pHOa9vaprd324c3t9jE5Fbdr2py+xa4RihjJu1POwpbgF9VkT2DEQn2PNQ53yUHeVhl9nmvngljcqY9ffEr07lafYzvlh8iReLRwvnhtuT2MNT3q4xqQsE5KZhyKi6LPAm1GQrzjExAykGvLVvRu3VN196O8VlMXjLahY4qtH6oLjsHjXtA7FhjwA03BSCnWCzQIQVb9UveFuCdgnIqHCmLDAqGIBoaSGSfvSgeRIBR86O9UXPcD4cC/eEY7Z9snTd+jyrGg
[...base64-encoded PNG image data elided; only the tail of the embedded image attachment is reconstructed below...])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "4io1vzkzF683"
+   },
+   "source": [
+    "We want to run the Google Cloud Vision API on a large set of images, and Apache Beam is the ideal tool to handle this workflow.\n",
+    "This example demonstrates how to retrieve image labels with this API on a small set of images.\n",
+    "\n",
+    "The example follows these steps to implement this workflow:\n",
+    "* Read the images.\n",
+    "* Send the images to an external API to run inference using the `RunInference` PTransform.\n",
+    "* Postprocess the results of the API call.\n",
+    "\n",
+    "**Caution:** Be aware of API quotas and the heavy load you might incur on your external API. Verify that your pipeline and API are configured correctly for your use case.\n",
+    "\n",
+    "To optimize the calls to the external API, limit the number of parallel calls to it by configuring [PipelineOptions](https://beam.apache.org/documentation/programming-guide/#configuring-pipeline-options).\n",
+    "In Apache Beam, different runners provide options to handle parallelism, for example:\n",
+    "* With the [Direct Runner](https://beam.apache.org/documentation/runners/direct/), use the `direct_num_workers` pipeline option.\n",
+    "* With the [Google Cloud Dataflow Runner](https://beam.apache.org/documentation/runners/dataflow/), use the `max_num_workers` pipeline option.\n",
+    "\n",
+    "For information about other runners, see the [Beam capability matrix](https://beam.apache.org/documentation/runners/capability-matrix/)."
+   ]
+  },
Verify that your pipeline and API are configured correctly for your use case.\n", + "\n", + "To optimize the calls to the external API, limit the parallel calls to the external remote API by configuring [PipelineOptions](https://beam.apache.org/documentation/programming-guide/#configuring-pipeline-options).\n", + "In Apache Beam, different runners provide options to handle the parallelism, for example:\n", + "* With the [Direct Runner](https://beam.apache.org/documentation/runners/direct/), use the `direct_num_workers` pipeline option.\n", + "* With the [Google Cloud Dataflow Runner](https://beam.apache.org/documentation/runners/dataflow/), use the `max_num_workers` pipeline option.\n", + "\n", + "A short configuration sketch appears at the end of this section.\n", + "\n", + "For information about other runners, see the [Beam capability matrix](https://beam.apache.org/documentation/runners/capability-matrix/)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FAawWOaiIYaS" + }, + "source": [ + "## Before you begin\n", + "\n", + "This section provides installation steps." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XhpKOxINrIqz" + }, + "source": [ + "First, download and install the dependencies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bA7MLR8OptJw" + }, + "outputs": [], + "source": [ + "!pip install --upgrade pip\n", + "!pip install protobuf==3.19.4\n", + "!pip install 'apache-beam[interactive,gcp]>=2.40.0'\n", + "!pip install google-cloud-vision==3.1.1\n", + "!pip install requests\n", + "\n", + "# To use the newly installed version, restart the runtime.\n", + "exit()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "C-RVR2eprc0r" + }, + "source": [ + "To use the Cloud Vision API, authenticate with Google Cloud." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qGDJCbxgTprh" + }, + "outputs": [], + "source": [ + "# Follow the steps to configure your Google Cloud setup.\n", + "!gcloud init" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "74acX7AlT91N" + }, + "outputs": [], + "source": [ + "!gcloud auth application-default login" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mL4MaHm_XOVd" + }, + "source": [ + "## Run remote inference on Cloud Vision API\n", + "\n", + "This section demonstrates the steps to run remote inference on the Cloud Vision API.\n", + "\n", + "Import Apache Beam and the required modules." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gE0go8CpnTy3" + }, + "outputs": [], + "source": [ + "from typing import List\n", + "import io\n", + "import os\n", + "import requests\n", + "\n", + "from google.cloud import vision\n", + "from google.cloud.vision_v1.types import Feature\n", + "import apache_beam as beam\n", + "from apache_beam.ml.inference.base import ModelHandler\n", + "from apache_beam.ml.inference.base import RunInference\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "09k08IYlLmON" + }, + "source": [ + "This example uses images from the [MSCoco dataset](https://cocodataset.org/#explore) as a list of image URLs.\n", + "This data is used as the pipeline input."
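+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "parallelism-sketch-md" + }, + "source": [ + "The next cell is the configuration sketch referenced above, assuming the Direct Runner. The values are illustrative placeholders rather than recommendations; choose worker counts that match your API quota." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "parallelism-sketch-code" + }, + "outputs": [], + "source": [ + "from apache_beam.options.pipeline_options import PipelineOptions\n", + "\n", + "# Illustrative values: cap the Direct Runner at two worker processes so\n", + "# that at most two batches call the external API concurrently.\n", + "pipeline_options = PipelineOptions(\n", + "    direct_num_workers=2,\n", + "    direct_running_mode='multi_processing')\n", + "\n", + "# Pass the options when you construct the pipeline:\n", + "# with beam.Pipeline(options=pipeline_options) as pipeline: ..."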
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_89eN_1QeYEd" + }, + "outputs": [], + "source": [ + "image_urls = [\n", + " \"http://farm3.staticflickr.com/2824/10213933686_6936eb402b_z.jpg\",\n", + " \"http://farm8.staticflickr.com/7026/6388965173_92664a0d78_z.jpg\",\n", + " \"http://farm8.staticflickr.com/7003/6528937031_10e1ce0960_z.jpg\",\n", + " \"http://farm6.staticflickr.com/5207/5304302785_7b5f763190_z.jpg\",\n", + " \"http://farm6.staticflickr.com/5207/5304302785_7b5f763190_z.jpg\",\n", + " \"http://farm8.staticflickr.com/7026/6388965173_92664a0d78_z.jpg\",\n", + " \"http://farm8.staticflickr.com/7026/6388965173_92664a0d78_z.jpg\",\n", + "]\n", + "\n", + "def read_image(image_url):\n", + " \"\"\"Read an image from a URL and return an (image_url, image_bytes) tuple.\"\"\"\n", + " response = requests.get(image_url)\n", + " image_bytes = io.BytesIO(response.content).read()\n", + " return image_url, image_bytes" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HLy7VKJhLrmT" + }, + "source": [ + "### Create a custom model handler\n", + "\n", + "To implement remote inference, create a custom model handler. The `run_inference` method is the most interesting part. In this function, we implement the model call and return its results.\n", + "\n", + "When running remote inference, prepare to encounter, identify, and handle failure as gracefully as possible. We recommend using the following techniques:\n", + "\n", + "* **Exponential backoff:** Retry failed remote calls with exponentially growing pauses between retries. Using exponential backoff ensures that failures don't lead to an overwhelming number of retries in quick succession. A minimal sketch follows this list.\n", + "\n", + "* **Dead-letter queues:** Route failed inferences to a separate `PCollection` without failing the whole transform. This lets you continue execution instead of failing the job (the default behavior for batch jobs) or retrying indefinitely (the default behavior for streaming jobs).\n", + "You can then run custom pipeline logic on the dead-letter queue (unprocessed messages queue) to log the failure, alert, and push the failed message to temporary storage so that it can eventually be reprocessed."
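+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "backoff-sketch-md" + }, + "source": [ + "The helper below is a minimal sketch of the exponential backoff technique, not part of the official example: `call_with_backoff`, `max_attempts`, and `base_delay` are illustrative names and values. For dead-letter queues, Beam's `ParDo` and `Map` transforms offer `with_exception_handling()` to route failing elements to a separate output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "backoff-sketch-code" + }, + "outputs": [], + "source": [ + "import random\n", + "import time\n", + "\n", + "def call_with_backoff(fn, max_attempts=5, base_delay=1.0):\n", + "    \"\"\"Retry a remote call with exponentially growing, jittered pauses.\"\"\"\n", + "    for attempt in range(max_attempts):\n", + "        try:\n", + "            return fn()\n", + "        except Exception:\n", + "            if attempt == max_attempts - 1:\n", + "                raise  # Give up after the last attempt.\n", + "            # Sleep 1s, 2s, 4s, ... plus jitter to avoid synchronized retries.\n", + "            time.sleep(base_delay * (2 ** attempt) + random.uniform(0, 1))"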
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LnaisJ_JiY_Q" + }, + "outputs": [], + "source": [ + "class CloudVisionModelHandler(ModelHandler):\n", + " \"\"\"ModelHandler that accepts a batch of (image_url, image_bytes) tuples\n", + " and sends that batch to the Cloud Vision API for remote inference.\"\"\"\n", + " def load_model(self):\n", + " \"\"\"Init the Google Vision API client.\"\"\"\n", + " client = vision.ImageAnnotatorClient()\n", + " return client\n", + "\n", + " def run_inference(self, batch, model, inference_args=None):\n", + " feature = Feature()\n", + " feature.type_ = Feature.Type.LABEL_DETECTION\n", + "\n", + " # Keep the image URLs so they can be zipped back with the responses.\n", + " image_urls = [image_url for (image_url, image_bytes) in batch]\n", + "\n", + " # Create a batch request for all images in the batch.\n", + " images = [vision.Image(content=image_bytes) for (image_url, image_bytes) in batch]\n", + " image_requests = [vision.AnnotateImageRequest(image=image, features=[feature]) for image in images]\n", + " batch_image_request = vision.BatchAnnotateImagesRequest(requests=image_requests)\n", + "\n", + " # Send the batch request to the remote endpoint.\n", + " responses = model.batch_annotate_images(request=batch_image_request).responses\n", + "\n", + " return list(zip(image_urls, responses))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lHJuyHhvL0-a" + }, + "source": [ + "### Manage batching\n", + "\n", + "Before we can chain together the pipeline steps, we need to understand batching.\n", + "When running inference with your model, either in Apache Beam or in an external API, you can batch your input to increase the efficiency of the model execution.\n", + "The `RunInference` PTransform manages batching in this pipeline with the `BatchElements` transform, which groups elements together to form batches of the desired size.\n", + "\n", + "* If you are designing your own API endpoint, make sure that it can handle batches.\n", + "\n", + "A batch-size tuning sketch appears just before the pipeline code below.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4sXHwZk9Url2" + }, + "source": [ + "### Create the pipeline\n", + "\n", + "This section demonstrates how to chain the steps together to do the following:\n", + "\n", + "* Read data.\n", + "\n", + "* Transform the data to fit the model input.\n", + "\n", + "* Run inference with the custom Cloud Vision model handler.\n", + "\n", + "* Process and display the results."
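+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "batching-sketch-md" + }, + "source": [ + "This is the batch-size tuning sketch referenced in the batching discussion: overriding `batch_elements_kwargs` on the model handler forwards keyword arguments to the underlying `BatchElements` transform. The bounds shown are illustrative, not recommendations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "batching-sketch-code" + }, + "outputs": [], + "source": [ + "class BoundedBatchCloudVisionModelHandler(CloudVisionModelHandler):\n", + "    \"\"\"Sketch: bound the batch sizes that RunInference hands to run_inference.\"\"\"\n", + "\n", + "    def batch_elements_kwargs(self):\n", + "        # Keyword arguments forwarded to the BatchElements transform.\n", + "        return {'min_batch_size': 2, 'max_batch_size': 16}"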
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LLg0OTvNkqo4", + "outputId": "3eee9ae4-f4cb-49e5-e03b-9af6aaef8805" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "('http://farm3.staticflickr.com/2824/10213933686_6936eb402b_z.jpg', label_annotations {\n", + " mid: \"/m/083wq\"\n", + " description: \"Wheel\"\n", + " score: 0.977976143\n", + " topicality: 0.977976143\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0h9mv\"\n", + " description: \"Tire\"\n", + " score: 0.977934957\n", + " topicality: 0.977934957\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/043g5f\"\n", + " description: \"Fuel tank\"\n", + " score: 0.958490431\n", + " topicality: 0.958490431\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/05s2s\"\n", + " description: \"Plant\"\n", + " score: 0.95674181\n", + " topicality: 0.95674181\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0h8lk_j\"\n", + " description: \"Automotive fuel system\"\n", + " score: 0.941456497\n", + " topicality: 0.941456497\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/07yv9\"\n", + " description: \"Vehicle\"\n", + " score: 0.936428607\n", + " topicality: 0.936428607\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02qwkrn\"\n", + " description: \"Vehicle brake\"\n", + " score: 0.905624092\n", + " topicality: 0.905624092\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0h8pb3l\"\n", + " description: \"Automotive tire\"\n", + " score: 0.897686064\n", + " topicality: 0.897686064\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0768fx\"\n", + " description: \"Automotive lighting\"\n", + " score: 0.897505879\n", + " topicality: 0.897505879\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0h8p7_l\"\n", + " description: \"Automotive exhaust\"\n", + " score: 0.877965152\n", + " topicality: 0.877965152\n", + "}\n", + ")\n", + "('http://farm8.staticflickr.com/7026/6388965173_92664a0d78_z.jpg', label_annotations {\n", + " mid: \"/m/054_l\"\n", + " description: \"Mirror\"\n", + " score: 0.969698846\n", + " topicality: 0.969698846\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02jz0l\"\n", + " description: \"Tap\"\n", + " score: 0.962297797\n", + " topicality: 0.962297797\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0h8lr5r\"\n", + " description: \"Bathroom sink\"\n", + " score: 0.933002412\n", + " topicality: 0.933002412\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0130jx\"\n", + " description: \"Sink\"\n", + " score: 0.930314779\n", + " topicality: 0.930314779\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02pkr5\"\n", + " description: \"Plumbing fixture\"\n", + " score: 0.920037031\n", + " topicality: 0.920037031\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02dgv\"\n", + " description: \"Door\"\n", + " score: 0.890176594\n", + " topicality: 0.890176594\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/09ggk\"\n", + " description: \"Purple\"\n", + " score: 0.878831089\n", + " topicality: 0.878831089\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/01j2bj\"\n", + " description: \"Bathroom\"\n", + " score: 0.866840482\n", + " topicality: 0.866840482\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/04wnmd\"\n", + " description: \"Fixture\"\n", + " score: 0.862223864\n", + " topicality: 0.862223864\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/09qqq\"\n", + " description: \"Wall\"\n", + " score: 0.809348285\n", + " 
topicality: 0.809348285\n", + "}\n", + ")\n", + "('http://farm8.staticflickr.com/7003/6528937031_10e1ce0960_z.jpg', error {\n", + " code: 3\n", + " message: \"Bad image data.\"\n", + "}\n", + ")\n", + "('http://farm6.staticflickr.com/5207/5304302785_7b5f763190_z.jpg', error {\n", + " code: 3\n", + " message: \"Bad image data.\"\n", + "}\n", + ")\n", + "('http://farm6.staticflickr.com/5207/5304302785_7b5f763190_z.jpg', error {\n", + " code: 3\n", + " message: \"Bad image data.\"\n", + "}\n", + ")\n", + "('http://farm8.staticflickr.com/7026/6388965173_92664a0d78_z.jpg', label_annotations {\n", + " mid: \"/m/054_l\"\n", + " description: \"Mirror\"\n", + " score: 0.969698846\n", + " topicality: 0.969698846\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02jz0l\"\n", + " description: \"Tap\"\n", + " score: 0.962297797\n", + " topicality: 0.962297797\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0h8lr5r\"\n", + " description: \"Bathroom sink\"\n", + " score: 0.933002412\n", + " topicality: 0.933002412\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0130jx\"\n", + " description: \"Sink\"\n", + " score: 0.930314779\n", + " topicality: 0.930314779\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02pkr5\"\n", + " description: \"Plumbing fixture\"\n", + " score: 0.920037031\n", + " topicality: 0.920037031\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02dgv\"\n", + " description: \"Door\"\n", + " score: 0.890176594\n", + " topicality: 0.890176594\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/09ggk\"\n", + " description: \"Purple\"\n", + " score: 0.878831089\n", + " topicality: 0.878831089\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/01j2bj\"\n", + " description: \"Bathroom\"\n", + " score: 0.866840482\n", + " topicality: 0.866840482\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/04wnmd\"\n", + " description: \"Fixture\"\n", + " score: 0.862223864\n", + " topicality: 0.862223864\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/09qqq\"\n", + " description: \"Wall\"\n", + " score: 0.809348285\n", + " topicality: 0.809348285\n", + "}\n", + ")\n", + "('http://farm8.staticflickr.com/7026/6388965173_92664a0d78_z.jpg', label_annotations {\n", + " mid: \"/m/054_l\"\n", + " description: \"Mirror\"\n", + " score: 0.969698846\n", + " topicality: 0.969698846\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02jz0l\"\n", + " description: \"Tap\"\n", + " score: 0.962297797\n", + " topicality: 0.962297797\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0h8lr5r\"\n", + " description: \"Bathroom sink\"\n", + " score: 0.933002412\n", + " topicality: 0.933002412\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0130jx\"\n", + " description: \"Sink\"\n", + " score: 0.930314779\n", + " topicality: 0.930314779\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02pkr5\"\n", + " description: \"Plumbing fixture\"\n", + " score: 0.920037031\n", + " topicality: 0.920037031\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02dgv\"\n", + " description: \"Door\"\n", + " score: 0.890176594\n", + " topicality: 0.890176594\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/09ggk\"\n", + " description: \"Purple\"\n", + " score: 0.878831089\n", + " topicality: 0.878831089\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/01j2bj\"\n", + " description: \"Bathroom\"\n", + " score: 0.866840482\n", + " topicality: 0.866840482\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/04wnmd\"\n", + " description: \"Fixture\"\n", + " score: 0.862223864\n", + 
" topicality: 0.862223864\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/09qqq\"\n", + " description: \"Wall\"\n", + " score: 0.809348285\n", + " topicality: 0.809348285\n", + "}\n", + ")\n" + ] + } + ], + "source": [ + "with beam.Pipeline() as pipeline:\n", + " _ = (pipeline | \"Create inputs\" >> beam.Create(image_urls)\n", + " | \"Read images\" >> beam.Map(read_image)\n", + " | \"Inference\" >> RunInference(model_handler=CloudVisionModelHandler())\n", + " | \"Print image_url and annotation\" >> beam.Map(print)\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7gwn5bF1XaDm" + }, + "source": [ + "## Monitor the pipeline\n", + "\n", + "Because monitoring can provide insight into the status and health of the application, consider monitoring and measuring pipeline performance.\n", + "For information about the available tracking metrics, see [RunInference Metrics](https://beam.apache.org/documentation/ml/runinference-metrics/)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TcBFS0rluusJ" + }, + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.7 (main, Dec 7 2022, 13:34:16) [Clang 14.0.0 (clang-1400.0.29.102)]" + }, + "vscode": { + "interpreter": { + "hash": "40c55305dca37c951f6b497e2e996ca59c449c4502b9f8a4515c118ec923845d" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/examples/notebooks/beam-ml/mltransform_basic.ipynb b/examples/notebooks/beam-ml/mltransform_basic.ipynb index 820bc3400b580..e44be91fe1cd5 100644 --- a/examples/notebooks/beam-ml/mltransform_basic.ipynb +++ b/examples/notebooks/beam-ml/mltransform_basic.ipynb @@ -1,15 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, { "cell_type": "code", "source": [ @@ -65,7 +55,7 @@ "id": "d3b81cf2-8603-42bd-995e-9e14631effd0" }, "source": [ - "This notebook demonstrates how to use `MLTransform` to preprocess your data for machine learning models. `MLTransform` is a `PTransform` that wraps multiple Apache Beam data processing transforms. As a result, `MLTransform` gives you the ability to preprocess different types of data in multiple ways with one transform.\n", + "This notebook demonstrates how to use `MLTransform` to preprocess your data for machine learning models. `MLTransform` is a `PTransform` that wraps multiple Apache Beam data processing transforms. With `MLTransform`, you can preprocess different types of data in multiple ways with one transform.\n", "\n", "This notebook uses data processing transforms defined in the [apache_beam/ml/transforms/tft](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.transforms.tft.html) module." 
] @@ -77,7 +67,7 @@ "id": "f0097dbd-2657-4cbe-a334-e0401816db01" }, "source": [ - "## Import the requried modules\n", + "## Import the required modules\n", "\n", "To use `MLTransfrom`, install `tensorflow_transform` and the Apache Beam SDK version 2.50.0 or later.\n" ] @@ -423,8 +413,6 @@ "source": [ "### Scale the data by using the z-score\n", "\n", - "Scale to the data using the z-score\n", - "\n", "Similar to `ScaleTo01`, use [ScaleToZScore](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.transforms.tft.html#apache_beam.ml.transforms.tft.ScaleToZScore) to scale the values by using the [z-score]([z-score](https://www.tensorflow.org/tfx/transform/api_docs/python/tft/scale_to_z_score#:~:text=Scaling%20to%20z%2Dscore%20subtracts%20out%20the%20mean%20and%20divides%20by%20standard%20deviation.%20Note%20that%20the%20standard%20deviation%20computed%20here%20is%20based%20on%20the%20biased%20variance%20(0%20delta%20degrees%20of%20freedom)%2C%20as%20computed%20by%20analyzers.var.).\n" ], "metadata": { @@ -607,7 +595,7 @@ "\n", "The previous examples show how to preprocess data for model training. This example uses the same preprocessing steps on the inference data. By using the same steps on the inference data, you can maintain consistent results.\n", "\n", - "Preprocess the data going into the inference by using the same preprocessing steps used on the data prior to training. To do this with `MLTransform`, pass the artifact location from the previous transforms to the parameter `read_artifact_location`. `MLTransform` uses the values and artifacts produced in the previous steps. You don't need to provide the transforms, because they are saved with the artifacts in the artifact location.\n" + "Preprocess the data used by the inference by using the same preprocessing steps that you used on the data prior to training. When using `MLTransform`, pass the artifact location from the previous transforms to the parameter `read_artifact_location`. `MLTransform` uses the values and artifacts produced in the previous steps. You don't need to provide the transforms, because they are saved with the artifacts in the artifact location.\n" ], "metadata": { "id": "kcnQSwkA-eSA" diff --git a/examples/notebooks/beam-ml/per_key_models.ipynb b/examples/notebooks/beam-ml/per_key_models.ipynb new file mode 100644 index 0000000000000..53845c0b3e191 --- /dev/null +++ b/examples/notebooks/beam-ml/per_key_models.ipynb @@ -0,0 +1,603 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "source": [ + "# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the \"License\")\n", + "\n", + "# Licensed to the Apache Software Foundation (ASF) under one\n", + "# or more contributor license agreements. See the NOTICE file\n", + "# distributed with this work for additional information\n", + "# regarding copyright ownership. The ASF licenses this file\n", + "# to you under the Apache License, Version 2.0 (the\n", + "# \"License\"); you may not use this file except in compliance\n", + "# with the License. 
You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing,\n", + "# software distributed under the License is distributed on an\n", + "# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n", + "# KIND, either express or implied. See the License for the\n", + "# specific language governing permissions and limitations\n", + "# under the License" + ], + "metadata": { + "id": "OsFaZscKSPvo" + }, + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Run ML inference with multiple differently-trained models\n", + "\n", + "\n", + " \n", + " \n", + "
\n", + " Run in Google Colab\n", + " \n", + " View source on GitHub\n", + "
\n" + ], + "metadata": { + "id": "ZUSiAR62SgO8" + } + }, + { + "cell_type": "markdown", + "source": [ + "Running inference with multiple differently-trained models performing the same task is useful in many scenarios, including the following examples:\n", + "\n", + "* You want to compare the performance of multiple different models.\n", + "* You have models trained on different datasets that you want to use conditionally based on additional metadata.\n", + "\n", + "In Apache Beam, the recommended way to run inference is to use the `RunInference` transform. By using a `KeyedModelHandler`, you can efficiently run inference with O(100s) of models without having to manage memory yourself.\n", + "\n", + "This notebook demonstrates how to use a `KeyedModelHandler` to run inference in an Apache Beam pipeline with multiple different models on a per-key basis. This notebook uses pretrained pipelines from Hugging Face. Before continuing with this notebook, it is recommended that you walk through the [Use RunInference in Apache Beam](https://colab.sandbox.google.com/github/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_pytorch_tensorflow_sklearn.ipynb) notebook." + ], + "metadata": { + "id": "ZAVOrrW2An1n" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Install dependencies\n", + "\n", + "Install both Apache Beam and the dependencies needed by Hugging Face." + ], + "metadata": { + "id": "_fNyheQoDgGt" + } + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "B-ENznuJqArA", + "outputId": "f72963fc-82db-4d0d-9225-07f6b501e256" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "" + ] + } + ], + "source": [ + "!pip install apache_beam[gcp]>=2.51.0 --quiet\n", + "!pip install torch --quiet\n", + "!pip install transformers --quiet\n", + "\n", + "# To use the newly installed versions, restart the runtime.\n", + "exit()" + ] + }, + { + "cell_type": "code", + "source": [ + "from typing import Dict\n", + "from typing import Iterable\n", + "from typing import Tuple\n", + "\n", + "from transformers import pipeline\n", + "\n", + "import apache_beam as beam\n", + "from apache_beam.ml.inference.base import KeyedModelHandler\n", + "from apache_beam.ml.inference.base import KeyModelMapping\n", + "from apache_beam.ml.inference.base import PredictionResult\n", + "from apache_beam.ml.inference.huggingface_inference import HuggingFacePipelineModelHandler\n", + "from apache_beam.ml.inference.base import RunInference" + ], + "metadata": { + "id": "wUmBEglvsOYW" + }, + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Define the model configurations\n", + "\n", + "A model handler is the Apache Beam method used to define the configuration needed to load and invoke models. Because this example uses two models, we define two model handlers, one for each model. Because both models are incapsulated within Hugging Face pipelines, we use the model handler `HuggingFacePipelineModelHandler`.\n", + "\n", + "For this example, load the models using Hugging Face, and then run them against an example. The models produce different outputs." 
+ ], + "metadata": { + "id": "uEqljVgCD7hx" + } + }, + { + "cell_type": "code", + "source": [ + "distilbert_mh = HuggingFacePipelineModelHandler('text-classification', model=\"distilbert-base-uncased-finetuned-sst-2-english\")\n", + "roberta_mh = HuggingFacePipelineModelHandler('text-classification', model=\"roberta-large-mnli\")\n", + "\n", + "distilbert_pipe = pipeline('text-classification', model=\"distilbert-base-uncased-finetuned-sst-2-english\")\n", + "roberta_large_pipe = pipeline(model=\"roberta-large-mnli\")" + ], + "metadata": { + "id": "v2NJT5ZcxgH5", + "outputId": "3924d72e-5c49-477d-c50f-6d9098f5a4b2" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading (…)lve/main/config.json: 0%| | 0.00/629 [00:00-`." + ], + "metadata": { + "id": "r6GXL5PLFBY7" + } + }, + { + "cell_type": "code", + "source": [ + "class FormatExamples(beam.DoFn):\n", + " \"\"\"\n", + " Map each example to a tuple of ('-', 'example').\n", + " Use these keys to map our elements to the correct models.\n", + " \"\"\"\n", + " def process(self, element: Tuple[str, str]) -> Iterable[Tuple[str, str]]:\n", + " yield (f'distilbert-{element[1]}', element[0])\n", + " yield (f'roberta-{element[1]}', element[0])" + ], + "metadata": { + "id": "p2uVwws8zRpg" + }, + "execution_count": 6, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Use the formatted keys to define a `KeyedModelHandler` that maps keys to the `ModelHandler` used for those keys. The `KeyedModelHandler` method lets you define an optional `max_models_per_worker_hint`, which limits the number of models that can be held in a single worker process at one time. If your worker might run out of memory, use this option. For more information about managing memory, see [Use a keyed ModelHandler](https://beam.apache.org/documentation/sdks/python-machine-learning/index.html#use-a-keyed-modelhandler)." + ], + "metadata": { + "id": "IP65_5nNGIb8" + } + }, + { + "cell_type": "code", + "source": [ + "per_key_mhs = [\n", + " KeyModelMapping(['distilbert-positive', 'distilbert-neutral', 'distilbert-negative'], distilbert_mh),\n", + " KeyModelMapping(['roberta-positive', 'roberta-neutral', 'roberta-negative'], roberta_mh)\n", + "]\n", + "mh = KeyedModelHandler(per_key_mhs, max_models_per_worker_hint=2)" + ], + "metadata": { + "id": "DZpfjeGL2hMG" + }, + "execution_count": 7, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Postprocess the results\n", + "\n", + "The `RunInference` transform returns a tuple that contains the following objects:\n", + "* the original key\n", + "* a `PredictionResult` object containing the original example and the inference\n", + "Use those outputs to extract the relevant data. Then, to compare each model's prediction, group this data by the original example." 
+ ], + "metadata": { + "id": "_a4ZmnD5FSeG" + } + }, + { + "cell_type": "code", + "source": [ + "class ExtractResults(beam.DoFn):\n", + " \"\"\"\n", + " Extract the relevant data from the PredictionResult object.\n", + " \"\"\"\n", + " def process(self, element: Tuple[str, PredictionResult]) -> Iterable[Tuple[str, Dict[str, str]]]:\n", + " actual_sentiment = element[0].split('-')[1]\n", + " model = element[0].split('-')[0]\n", + " result = element[1]\n", + " example = result.example\n", + " predicted_sentiment = result.inference[0]['label']\n", + "\n", + " yield (example, {'model': model, 'actual_sentiment': actual_sentiment, 'predicted_sentiment': predicted_sentiment})" + ], + "metadata": { + "id": "FOwFNQA053TG" + }, + "execution_count": 8, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Finally, print the results produced by each model." + ], + "metadata": { + "id": "JVnv4gGbFohk" + } + }, + { + "cell_type": "code", + "source": [ + "class PrintResults(beam.DoFn):\n", + " \"\"\"\n", + " Print the results produced by each model along with the actual sentiment.\n", + " \"\"\"\n", + " def process(self, element: Tuple[str, Iterable[Dict[str, str]]]):\n", + " example = element[0]\n", + " actual_sentiment = element[1][0]['actual_sentiment']\n", + " predicted_sentiment_1 = element[1][0]['predicted_sentiment']\n", + " model_1 = element[1][0]['model']\n", + " predicted_sentiment_2 = element[1][1]['predicted_sentiment']\n", + " model_2 = element[1][1]['model']\n", + "\n", + " if model_1 == 'distilbert':\n", + " distilbert_prediction = predicted_sentiment_1\n", + " roberta_prediction = predicted_sentiment_2\n", + " else:\n", + " roberta_prediction = predicted_sentiment_1\n", + " distilbert_prediction = predicted_sentiment_2\n", + "\n", + " print(f'Example: {example}\\nActual Sentiment: {actual_sentiment}\\n'\n", + " f'Distilbert Prediction: {distilbert_prediction}\\n'\n", + " f'Roberta Prediction: {roberta_prediction}\\n------------')" + ], + "metadata": { + "id": "kUQJNYOa9Q5-" + }, + "execution_count": 9, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Run the pipeline\n", + "\n", + "To run a single Apache Beam pipeline, combine the previous steps." 
+ ], + "metadata": { + "id": "-LrpmM2PGAkf" + } + }, + { + "cell_type": "code", + "source": [ + "with beam.Pipeline() as beam_pipeline:\n", + "\n", + " formatted_examples = (\n", + " beam_pipeline\n", + " | \"ReadExamples\" >> beam.Create(examples)\n", + " | \"FormatExamples\" >> beam.ParDo(FormatExamples()))\n", + " inferences = (\n", + " formatted_examples\n", + " | \"Run Inference\" >> RunInference(mh)\n", + " | \"ExtractResults\" >> beam.ParDo(ExtractResults())\n", + " | \"GroupByExample\" >> beam.GroupByKey()\n", + " )\n", + "\n", + " inferences | beam.ParDo(PrintResults())\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 463 + }, + "id": "B9Wti3XH0Iqe", + "outputId": "528ad732-ecf8-4877-ab6a-badad7944fed" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " });\n", + " }" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Example: This restaurant is awesome\n", + "Actual Sentiment: positive\n", + "Distilbert Prediction: POSITIVE\n", + "Roberta Prediction: NEUTRAL\n", + "------------\n", + "Example: This restaurant is bad\n", + "Actual Sentiment: negative\n", + "Distilbert Prediction: NEGATIVE\n", + "Roberta Prediction: NEUTRAL\n", + "------------\n", + "Example: I love chocolate\n", + "Actual Sentiment: positive\n", + "Distilbert Prediction: POSITIVE\n", + "Roberta Prediction: NEUTRAL\n", + "------------\n", + "Example: I feel fine\n", + "Actual Sentiment: neutral\n", + "Distilbert Prediction: POSITIVE\n", + "Roberta Prediction: ENTAILMENT\n", + "------------\n" + ] + } + ] + } + ] +} diff --git a/examples/notebooks/beam-ml/run_custom_inference.ipynb b/examples/notebooks/beam-ml/run_custom_inference.ipynb index df81ae5af56f7..a66c5847de0ee 100644 --- a/examples/notebooks/beam-ml/run_custom_inference.ipynb +++ b/examples/notebooks/beam-ml/run_custom_inference.ipynb @@ -356,6 +356,7 @@ " model_name: The spaCy model name. 
Default is en_core_web_sm.\n", " \"\"\"\n", " self._model_name = model_name\n", + " self._env_vars = {}\n", "\n", " def load_model(self) -> Language:\n", " \"\"\"Loads and initializes a model for processing.\"\"\"\n", diff --git a/examples/notebooks/beam-ml/run_inference_windowing.ipynb b/examples/notebooks/beam-ml/run_inference_windowing.ipynb index 02b1d42f42210..27c56acdbd11d 100644 --- a/examples/notebooks/beam-ml/run_inference_windowing.ipynb +++ b/examples/notebooks/beam-ml/run_inference_windowing.ipynb @@ -93,9 +93,9 @@ { "cell_type": "code", "source": [ - "!pip install apache-beam==2.47.0\n", - "!pip install xgboost", - "# You may need to install a different version of Datatable directly depending on environment", + "!pip install apache-beam>=2.47.0\n", + "!pip install xgboost\n", + "# You may need to install a different version of Datatable directly depending on environment\n", "!pip install datatable" ], "metadata": { diff --git a/examples/notebooks/healthcare/beam_nlp.ipynb b/examples/notebooks/healthcare/beam_nlp.ipynb index 5106aaa607d9b..c2061bc4d75f6 100644 --- a/examples/notebooks/healthcare/beam_nlp.ipynb +++ b/examples/notebooks/healthcare/beam_nlp.ipynb @@ -22,7 +22,7 @@ "colab_type": "text" }, "source": [ - "\"Open" + "\"Open" ] }, { @@ -146,7 +146,7 @@ { "cell_type": "markdown", "source": [ - "Then, download [this raw CSV file](https://https://github.com/socd06/medical-nlp/blob/master/data/test.csv), and then upload it into Colab. You should be able to view this file (*test.csv*) in the \"Files\" tab in Colab after uploading." + "Then, download [this raw CSV file](https://github.com/socd06/medical-nlp/blob/master/data/test.csv), and then upload it into Colab. You should be able to view this file (*test.csv*) in the \"Files\" tab in Colab after uploading." ], "metadata": { "id": "1IArtEm8QuCR" diff --git a/examples/notebooks/healthcare/beam_post_hl7_messages_to_hcapi.ipynb b/examples/notebooks/healthcare/beam_post_hl7_messages_to_hcapi.ipynb new file mode 100644 index 0000000000000..ab6b2d9233cb0 --- /dev/null +++ b/examples/notebooks/healthcare/beam_post_hl7_messages_to_hcapi.ipynb @@ -0,0 +1,528 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "private_outputs": true, + "toc_visible": true, + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zQ_JXPR3RoFV" + }, + "outputs": [], + "source": [ + "# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the \"License\")\n", + "\n", + "# Licensed to the Apache Software Foundation (ASF) under one\n", + "# or more contributor license agreements. See the NOTICE file\n", + "# distributed with this work for additional information\n", + "# regarding copyright ownership. The ASF licenses this file\n", + "# to you under the Apache License, Version 2.0 (the\n", + "# \"License\"); you may not use this file except in compliance\n", + "# with the License. 
You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing,\n", + "# software distributed under the License is distributed on an\n", + "# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n", + "# KIND, either express or implied. See the License for the\n", + "# specific language governing permissions and limitations\n", + "# under the License\n", + "\n", + "##################################\n", + "# Author: Devansh Modi #\n", + "##################################\n" + ] + }, + { + "cell_type": "markdown", + "source": [ + "**Highlevel Architecture**\n", + "\n", + "![Screenshot 2023-10-18 at 3.53.31 PM.png](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAn4AAACYCAYAAAB6UuTWAAABX2lDQ1BJQ0MgUHJvZmlsZQAAKJFtkD1Lw1AUht9oSyEWrCJODgUHFaq0SRfHWkWFgqFW/NjSNKZK0l7SiLj5H/zYHBVXB+mgg4u7IKjoooP4A4QstsRzGzWtei6H8/By3nvPPUBXVGXMDAGwKo6dn52Kr6yuxSNvENGPXgwjrGo1llGUHLXgu3aGeweB19txftd1KnkgCyiIjYeF47Onvb/9HSGW9JpGtUEpacx2ACFJrGw7jPMu8YBNQxHvczZ8PuVc9Pmy1VPIZ4lviGNaWS0RPxMnim260caWuaV9zcCnj+qVpUWqg5RDmMYMcnTiUCAhjRQmMUc7+t+TbnmyqIJhBzY2YKAMh9wZUhhM6MTzqEDDBBLEEpKUMt/17x0GWukVkC16ajTQNmPAhQv0nQfayAt95wi4Uphqqz+bFdxQbV2WfO6pA+FDz3tfBiJjQPPe8z7qntc8Abofyet+Arl9Y2Q/CPdhAAAAVmVYSWZNTQAqAAAACAABh2kABAAAAAEAAAAaAAAAAAADkoYABwAAABIAAABEoAIABAAAAAEAAAJ+oAMABAAAAAEAAACYAAAAAEFTQ0lJAAAAU2NyZWVuc2hvdLBKykEAAAHWaVRYdFhNTDpjb20uYWRvYmUueG1wAAAAAAA8eDp4bXBtZXRhIHhtbG5zOng9ImFkb2JlOm5zOm1ldGEvIiB4OnhtcHRrPSJYTVAgQ29yZSA2LjAuMCI+CiAgIDxyZGY6UkRGIHhtbG5zOnJkZj0iaHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIyI+CiAgICAgIDxyZGY6RGVzY3JpcHRpb24gcmRmOmFib3V0PSIiCiAgICAgICAgICAgIHhtbG5zOmV4aWY9Imh0dHA6Ly9ucy5hZG9iZS5jb20vZXhpZi8xLjAvIj4KICAgICAgICAgPGV4aWY6UGl4ZWxZRGltZW5zaW9uPjE1MjwvZXhpZjpQaXhlbFlEaW1lbnNpb24+CiAgICAgICAgIDxleGlmOlBpeGVsWERpbWVuc2lvbj42Mzg8L2V4aWY6UGl4ZWxYRGltZW5zaW9uPgogICAgICAgICA8ZXhpZjpVc2VyQ29tbWVudD5TY3JlZW5zaG90PC9leGlmOlVzZXJDb21tZW50PgogICAgICA8L3JkZjpEZXNjcmlwdGlvbj4KICAgPC9yZGY6UkRGPgo8L3g6eG1wbWV0YT4K2MaQWgAAQABJREFUeAHtfQeYHNWZ7d89PVGjNNIo54wSIBEkECAhECByBoPBYIwz67Bre9drf7v7/N6u37e7ttf22s/GeE1wAJOTyEiAQKCMcs5Zo6yJ3f3OuTM1qml1z3TP9EjT0+f/pqerK9y6dapu3XP/+4dAWfmOqEmEgBAQAkJACAgBISAE2j0CwXZ/hbpAISAEhIAQEAJCQAgIAYeAiJ8eBCEgBISAEBACQkAIZAkCIn5ZcqN1mUJACAgBISAEhIAQEPHTMyAEhIAQEAJCQAgIgSxBQMQvS260LlMICAEhIASEgBAQAiJ+egaEgBAQAkJACAgBIZAlCIj4ZcmN1mUKASEgBISAEBACQkDET8+AEBACQkAICAEhIASyBAERvyy50bpMISAEhIAQEAJCQAiI+OkZEAJCQAgIASEgBIRAliAg4pclN1qXKQSEgBAQAkJACAgBET89A0JACAgBISAEhIAQyBIERPyy5EbrMoWAEBACQkAICAEhIOKnZ0AICAEhIASEgBAQAlmCgIhfltxoXaYQEAJCQAgIASEgBET89AwIASEgBISAEBACQiBLEBDxy5IbrcsUAkJACAgBISAEhICIn54BISAEhIAQEAJCQAhkCQIifllyo3WZQkAICAEhIASEgBAQ8dMzIASEgBAQAkJACAiBLEFAxC9LbrQuUwgIASEgBISAEBACIn56BoSAEBACQkAICAEhkCUIiPhlyY3WZQoBISAEhIAQEAJCQMRPz4AQEAJCQAgIASEgBLIEARG/LLnRukwhIASEgBAQAkJACIj46RkQAkJACAgBISAEhECWICDilyU3WpcpBISAEBACQkAICAERPz0DQkAICAEhIASEgBDIEgRE/LLkRusyhYAQEAJCQAgIASEg4qdnQAgIASEgBISAEBACWYKAiF+W3GhdphAQAkJACAgBISAEQoJACAgBIdDWEHh/zty2VqVWqc+Uiy9olXJVqBAQAkIgEQIifomQ0XohIAROCwIkfdddcctpOfepPul3v/9t++4/fvtUn1bnEwJCIIsREPHL4puvSxcCbRmBsvIdCasXjUYtf+9ay9+1DPsEzKLc1fvGj9q/uvXY5LZ737UbsbehmNpNUfzKCVokVGDh/GILdyyxnPwii+bmYx0+eYVu2QI8Kj2SLeQ2PWipFCEgBNKFgIhfupBUOUJACJwyBAKga0Vr37PAiz9tnXNGUGxhV7OSvmalg62q90ir7j0cv/tYdedeFs0vaJ3zqlQhIASEQCsjIOLXygCreCEgBNKPADV+0Q5RC/SAf9rhsFklzkGtXbqEbm+VB8x28rPM8pa+aHn5nc36jrXwkPOsaug5VtlvFLSAeek6o8oRAkJACJwSBET8TgnMOokQEALpRqCmd8jyphaZ7Tlqtq/CbC/mbQ/lgrDhteZN7abzpJWHzDZ8YDkbP7bClaOtcMRUO3je9RbtWprOs6gsISAEhECrIiDi16rwqnAhIARaC
4FoHtRyQzDl2h+k71iN2YEqs4P47M6Blg5nPZBvVgUimG6JVpttX4LzrLfibUutfPpXrGbgSJgYplHjmO46qzwhIASEQB0CIn56FISAEMhMBAJQ6+WAbAVBAHPx3Qm/+4EADsHnsFnkcI0Fd2DbNlzeAUzJVqXZLq/mqIXWzrai8sN2/Jq/B/kbIfKXmU+Sai0EsgoBWrJIhIAQEAKZjQDfZFD0GU3uYIpn8MkIDoft3yRo566utug1x83Ogb1eKT450AqmTSKWs32xFb72M7P9e9JWqgoSAkLgZATohS9pOQLS+LUcQ5UgBIRAW0OARJCzvPzADDBQgh5jABxCzsPvg0cssBrfm/DZ3wX/yBhbINGwhbbMt65z/mhl13zVAnly+GgBmjpUCDRAYMMus4UborZjf9QqMGYrhAXHgNKATRwWsH7dGuyqH0kiIOKXJFDaTQgIgQxFwJvXwHegjghaT1zL+ZgOPnjQgmuwvB2vwkNgiNUggTWYNk7VQzhcbuGt8y1380qrHjreAkHZ+2Xo06JqtxEEyqui9vxHUVuyMQLz2RPt6eAxmPCWRe2jVVGbPCpoMyYErICmHpKkERDxSxoq7SgEhEBGI8C+wesfqOTDJ1iI7x7geVWwC9x/2AKb8HsPNHaHoFY4hp2rwRYjyWkEc3atsQ6r59rB/iPMCliwRAgIgeYgcBCWGX/9AE70OxGyyUf6/GVx/Sdro3a8KmBXn2vWMc0mvP5ztbfldkn8dh/y3u4Nb1dRfgDv/fjbGu7ZNn6VV0O1jU9saIrSTuhXcmXs0DbukmqRsQjwVcAPNYF8E5Kr9UZzq66ywF7MKW3B74NQER7EBmoDoYGwMAhiIolWWPW+9RY4tN8sH0aGCTqsRIdrvRAQAmhmaGLvfRqxLXua7uPCCLS+dnvEFnQO2pQxAQslN0bLepjbJfF7b03IXlnCOZ2GcuXYXBAmvOUzROasqbYP18Mw3ff8cwbp8S+GbVBpIx1QhlyfqikE2gQCdeTP1SVUbIFCGA51QtaOIV0tWpNvgf1Bi+7qYtH1ey249lMyw4TVzt+3xarKdlp1aZ+EmoqEB2uDEBACth22fOug6auBb1YyUo4x2podERvZL8d6lyRzhPZpl8SvPd/WDvlB65CfZItoz0Do2oRAKgiQ3FEC0N4FEXCZnwA/cAGmZs5t578O2AaHjwDU6tgWCGDat6SjRUecYRaabbZqHfYB8Us0fjyww3IO7raqCOySGGZGIgSEQEoIbNmLaEzlKR1iexG+adfBKIifa8ipHZyFe4v4ZdhN78TpKIkQEAI+BDi/Q9ddfuND0haExi4AzV0AKoAgyFv9NtjuBYrrPiB5RsMgdBYBkjR2Gvz2VOxcxisyfxJCBcIbJG9FreNHDbYXYN94vK7qiNUcPwi7QA7O9HoFCBIhkDQCbHkHj9HECa3QP9XVRAnHKgJ2BGSRx4v6NQEWNuvN1DRGbWYPTvPSTlEiBIQAESCh6wWux9cYPjTUo4bOkTmMkJx2D8TOufJyPUkh9637jsZjbtjsFxLCILw/vO6EPYub6cVCIdpibHMMRjATjMTBnKfiKSVCQAgkjQC53kcbK+3P85GNJ7ZtNVYKvPD79S60i0ZzYNfYjtpGBPgWlGQIAoV5gbhKhgypvqopBNKMAEkZXvQM1kdyR9IX4G8yrjqi1+JegFrEOASR5I+9VIeYXgY/o4jrl4q2AiVJhIAQAAJUbuSjnyvHuKmKmvUkhQoRHid/quQAE/FLDqcW7dWtOGAjeuW47FLxCqqEn8YnG5t21uigBzsefFqXtQhAKxDehquHax+JXhBTuPy4aV/aRGD6N8p2VefoRWJILZ+b1k22U+Hx0OAZCWWMcBOmpawIvZXH/1gVTiurB4oBSz+FQHIIjOufY907BmzHgWTbKAI6dwva0B5xBmjJnTLr9hLxOwW3vLQYD2VpyHIaeS7nb6pxCoRE1eGx+Sc7KifaXeuFQJYgANVA9BCuFanYakAEKV5/4X3ngAAGR+CDWC105ghy+pfEkPZ9JIL4DmIfkjs3LUwWV9dYob2zqqXgjmdgHVldjJD8leNEns0fysop6Gg1OdQUSoSAEEgVgSkjQja2X8h2H6oyhmtpSvJCATtvaMjGD1Cbaworb3vWE78Q3u8tjf1Tjb4hmQfUA70537lIRs+PRAgIgToEnHYNy001izCiwYYXY0d+fMLj6XlLG76c/rXfATqJcJrYmzpG4eHNZhUbwC03wb7PGfj5CsGiZ/NH8te1L0LB9ITSr444NtxTv4SAEGgCgZ6dA/b5qQW2cW/Y1u5C59qIsJmdNTBkd0zKs5JYs4tGjsv2TVlP/Pp0CdqQHjnOtqA5DwMJ35rdYdt5kL1I60ke7lRLCWrr1U4lC4FTj0Dg6DGQNXjQFkHjluOp91KsB8KuWGQXtIX4xAoJJaeFD4PMLYATyUJoFA/hnAU4F7QMDYSavwp4I3YbZJXd+4FPSvvQAB/9EAIpIDBtdMj+4foi+/nrFbZwY5zBFspiC5wyMs++dnmBnTsk66lMCujKucM6wjOvf1cQv2YO0GvQb2xvZdJHcyHmIqThq0QICIFaBILHQcIq9iBXE6Zoi8G8aArRzHYcF1MqG1aA5H2ETBzLQTCr6joXcszCGPLHdTmY4u0x0qJdusPET401LqZaKQSSQIAzcdeclWuDugft1cXV9scPjtm2MnS2HIhFIzakNGj3TSu2S8fk2sjeQfWNSWDq3yXraTKjLhxHWrTmkipq/MKNa6P9eDdrmXUrkH1fs7DTQe0cgUpq7BDAC9H7GXvZpV3jW62lvAuc0p7GtO8mdDS7YxoftXuYPXaaRp/mr6rncKsYiZh/hZwubmkFUL5ECGQxApzGPRN2e907RG3FqqO2bR1tedmuojb57BK77+KSJsObbUEWkLeW19gNE0PWVVPB9U9T1hO/jfvCtuMgmFtzX9QI6cBgk60pITC/Qmj8JEJACMRBANzP6NdBQkYCCN5V74SLziMl4SDuE3zeB4vcAsIXTtDuuB/JYTFUfbC9DXfqZccnXmvRfkMtGMr61yqAkQiB9CBQgGgWBcVoU0G0tRpq9nOssBjpV7G+Kdl/JGI/ev64PTc/aN+/ocgmDkJ0jVTfCU2dJAO3Z/0bigmh+TnhCtj27mIHxChqLi9te1ejGgmBVkCAU60kgNTEkfwxqgsjsPANl+yL/gj2fQHevvNxYDU6FZbZmPB8PKak0KrGXGkVZ0+1vKIOmuZtDDNty2oEOEPG/jYChQnDYCYjlTUwxyrtZueO61K/e5+uITsGm9qm+sXjVVEXD/C91dV268+O2P1T8+3BaflW2jGY1TbzWU/86p+kVlw4iHAPZUcjCUca5ZhqbqwRFCPahEQICIEkEaA2joSM4fcYtYUEkL4WiRQE1NgvxcansOOR1BpbNDffKsbfYgdm3mv5XbrCm1dOHUBTIgQaIECyt3lfxFbtDNvSLWHbc7gGyjsyv6bZXzWOXbkTFhc+W/rnl9TY2n1s4I1L2VEQTfSvlCMVEfvZrHKbtaTavnVV
oV04Isd6dYZ9YLIDw8ZPlVFbRfxOwe3adShir3xKNUTqksNp3iRU2qmXrCOEQDtHgISOJJAafZI/vu38BJD9wXasmIvPHEztNt0H4YA6AeGLdO1hZdd9zaonTLb8Tp01xetho28h4EPg4PGovbAADhofVtoixKutDqfS0HwF+Rb3wcxiyVbfihQWV++ssa//4ajNPCvPbkcYmHMRAzDbQsGI+KXwwJyOXd007+k4sc4pBNoDAuxj4PvhYu1RmccQfXzrleFDG75HQfgqE6kCsU88CeVZzagLbO8dX7dgzz6Wz+ndbFQbxMNG64SAD4FDaHs/ebXCfj+nwo5iaratSBXI50uLK42ktFeXHBG/tnJjVI9aBDqlNvMk2ISAEIiHALV+JICcHdoM9jcfZG8xvECa0xcFQ1aTl2vFy+fCfvBCqxk0PN4ZtU4IZDUCNF/6/exKe2R2hR2rbE5Daz348uGNf91EavzybWD3FAd+rVetU1ayNH6nDOrUT5SLaV4+oBIhIATSgAA9f+n8sQFtag+M/5rbF1Udt4IFr7tPePRci0642A5PvxlZ4OhOLBECQoAILNsWsafmVdpxH+mjR+2wnrXp1YqofU9CKmCj9/HKI7Zx1wmbvlH9i+y8M6Bpb+L4fUei9vqn1Q2ml8f0zbW/vbrQzh+WYz06ZWd83HZJ/OADi7h8Jz8SQRAp/GWMdEJw6X1HUeEW1Pnh2SGbMChiEwbSBVEiBE4PAo/MybFD5QG7bEzExvU7xc8iT0dtH8Ov0O5vIDqQTmB927G8EO6/zsgcy82QnBXQ+m1cZkUH91gFHDyiJd2bUYoOEQJtA4Ev/k+uzUAbnY5PF2bEaYG8AcK1rSxcP77KB9u46dx8+/vriqwEzS7Zbq3sSI39sGyfbVyLfNx1Mql/jv2vW0ub7M8Xw5Hk3VW1doXsTx+8tNB59TKmXzaHdWmXxO/8oTU2oNvJnUvvLtUZdbNH9I7CE+nkxtepiBbriYVkj8LO9v6LwyJ9iaHSllOIwPMLc+ypj3OsX0nUZp4ZdiSwX9eTn++0VolED0k3nIMHmw17G3rxdYOzVSd8d4Hb3zpo6jZiQ3OrUn7YCt78C85RbeU3PmjWuQQFS4RA5iGwYnvQVu0I2n/MMrtoRMSuGBe2S0ef3Jc2dWURtKUV22saTPEOgabvm1cVof0nS/lqz1KYH7TczkUWKj1Rj7xORc7psSlFDjNeMQbupKF59r3rCu0sBITOZsLn3bd2Sfz6lUTwcJ14SLyLzbTv3ifCFjVZdT/Z83Ym6XvgEho3SYRA20FgW1nA/vBeyH7zjtn4/hGQwIhdNjqMKPxpriMD/XNq1yN8/uLZ93CqqR82dkPsl1JM/S7BijgDLf9hCZcrjlnBO89YsGN3O3rtZy2Ql+6LSXhmbRACaUWguk6v8P7aoPHzv180uxIEkNr6s5OcOWKMvaPIqkMCSOEEXA+EThnWMzXSx2NDcJwa3K2zndMfjlh1MrBrblIaw+6I1/fDm4rs2rND1hkaP0ktAu2S+GXTzSXho2YvVm47L2xXQ6uy82AaH3YUlcbSGlQ5pXKT3DnJ3RrUI5kf9eXWLyRzVNP7tKi4BAcnWN10ZZrYI2G5CTaUVzXcUFk3Hlm6NWgroGH48cshmw7NwuVjwy1/xthx7cYnHuGLva4gBogd4N07BgyxHyr1CQgb6tQs7V/lMQvNe8NCoyZYzeizmwzkzDzfxOEwp6HrpSFO9auxkLJCMskDktzNX5UmlxuU2eBHk4c2ukOLikpwcILVjdYjmY0Jy024oelSW3DoyYU3UZgXW7YcyvGXFufYswsY947aekwFY6A2uDRxASFks+GHT7O3VwWCKXMGq2NB4mf85Eq2bE1/aBfvvgDtW9IAARG/BnBkzo9EhM+7gicxpcaPRAi0FQRyE7xtmC+b8s7KoL25PGg5e2pf1Mu2BW1sKvaALId2fCRSXm+DxaYFRC8H5+wKJjYV9n8rQf7WYN1hHJlSOZhF3rLccpZ+YDUDhyN7SMdGT03N5+OIITjr36UdbBQobTxlCMQxjXfn9rSAuw4F7Ak8s1Q2nDmAmnp8oAnsHGMPyNzyfbsELQ/OiZWwoSWJ3Lw3bC8tqrabzsmz/NomfsquSydqiECCV3HDnfSr7SHAKVw6bfwOGr+j6KvW7mo4iroVGr/b8aGk2HelfLFxy4+7MuWi3QFpLCouGGktP84lJlV+UjvFKRyrWnBo/AJj1noj/5jVKf18Zn6OvQitQTyh0Tc1X5z2vWp8xIrKqu2Why150kerDsZHJ+GjTV/DpoAVSQp7PaRHtHEobCC+P0LvVAYCyOnipCVqhcs/tupJMyzaYUTtHFeCY/t0idr4CTDH+AIrH18SXkrCDfHL8a9twaH+YmqXfYX5Fk/er5lrki4z6R1PqnYza3byYa4KKdbj5FJOXtOsIps4KNHmu36dZ/HiK4fQDKihpsOHM80YE7ZRsEFPJGxKk4bn2guLqmz3odr99iJv7n+9Vm4bQQBH98kx2t8lkgDOd/GoUMIc9Vv2h23W0urapo5imOSgf0kQJl4km4lK1XoPgXYJEUcnNXGSq4fQ7yR+1DxI2s43GxpzG8ZKYV7UeTPRU3fCPbWdxsLNQUcCF22uvUIa0XdEDEDZ+MWip9+nA4GuHRp2Enw5M43TgG5RZ5JArUFvECHK+3NSqCGJHj8M1cLD+fi3tJFjispoXzsdbWsDyN86/N6Dl0e8HhG7xUpw43IL7Nhskf5DLRBK/IolBt3g3TiiV0NsYsvTbyFwuhDwHCFogsE2OnlYnA4pQeUuHR2CVjBkby6rcrZ+7MvW7Arbr9+qcPZ2XtnxDg8h9eGL3y62wgR27h+uq7FXV5wI75KDNluMQdvoviH77JR8m4RQLU05fsQ7b7asS/xWymAEXlqch9HAybrkK8eGoGJuaa9w6oCZs7rGPlxf43LPe2dlmJrHvgCv5e7s7U5IPBLo2f6J/J3ASUunH4FO8KOYOb62IxndN/mOJLbmARBHl5PXKyKdTZtl5ULtMBKjyL6Yil2HD0JD2F60u6Z4WrjSAnu2gZCCODZC/GKvR7+FwOlGgI+9NyibiBklauCnQ7tHrXyq0rkoYD+4sciWbq2xXXV5djl7wAweTWXxyMUUcdh5hsRv1Dx+ywGv4Z+o2codNSCalfa1K4rsb2bIhOIEMg2XmnE7GxbQFn/xgYmXD5CaQOa+zRShxq8G1+KfaivEqIYav8YklgRSG6g4fo0hpm2nAoGpo2gPFLYpCBORFiHx44dNutnNGgfmwlswFx69IQwWSdQ4T+WV6aYJoO0b3hOOH/h+Yxk0f4mnZr3ryjm4z6pB/BTU2UNE35mAwMDuURfChdo9OnK0VMb0Ddozf9PJPvPfR2zLvnC9l29Ly010PO2F9x+N2j8/fcxqYFv4rasKXHNOtH+2rm+XxK8930xqS1IREb5U0NK+rYXAnZPDsNfBW/l0CB03cjFnlIt4fSHYP+TgtcfcuiR3QZC5ILZzepcfDgwD7PDwoccv98sbCs/fydD4rQPRXFk
75dtERp2cwweh8WuolT8dl65zCoFUEHjsi00PalIpj/uO6hO017/byX75ZiWyaFTZwWMRmGI1rjinRzCbXiKhfSBDtXhCBxJmCPGbRv389XKbODhkU88QzfFw8r6FiIdEBnyzT+pA43OJEMgwBApPtrxI3xWwSbAPCFFr1xkfaPBysBzkb36D3JH8wW7ILVOjx/29fiMAghdAT0TCx2W/BPCK7Dgdx6Lc4BZzdhfl2KcIB5MoJpBAdaVFIzFlJdhXq4VAe0ege8eA/fCGAvvKZfkIEB2GVg4x/hppHmyiXTBVnEgmDwvZpNH5buxG09udmPZ9Z2W1zd9wImg08wM/MqfSzh8KJxG8BiQnEBDxO4FFm1+i51IGzVS3eTxVwXaAALics8XLAdkj6QtCq5cD2x6n1cPb3pE9vOacJg9kzZvGdZdOzV4TEmBZKNcv7LCgtbBiEkn/hhPLUU4bN3/++URBWhIC7QQBNr1SEMDSkS2nHUN65LiwMCzTkxnj8uwHTx+32SuqnAeycyaBzd/63REb2z9BQ/UOzrLvlt+BLAOsOZfLvIDD8KAmUhAwlMWiLTRWalw6gPj5H/TG99ZWIZAlCDj+hn9RMjJOJ+M7CU6XFDpRtksW5utheCBXHcW5EpC/SCHcdUk6JUJACJwSBDilPGNsni3YWOOmk3lSav02I/SLiF/DWyDi1xCPVvnVE7YII3uFGs0RuBieT34njtiKUGGR15rTZbEn1G8hkAkIkOeR8IWRczeAD8kWNX+5CJ4cgfYvikYTpMYPH8aPiGC70wJSAxBD5uJeL05Qswdl9jh5q0f+OqCsmFFddTESAdMxRCIEhEBKCBwujyL+X9iuOzvHOqWYZq20U6BBHD9OA1fI1PYk/LOe+LEvcO/sZPqAk+DDCjxY9L71chLG2yUd6xgBPS+mc0lHuSpDCGQ8AiRgVPaxDUdB1CJlCBKID8XxOzZyeEUFaOeHqdtcOHjQ9i8XvwP4pho9iEI4unKfupeCKw+9xqFXodm7CA0dDhuxozOe+zhOXohj6h0+AhbpWorTybDI3QP9EwJJIMA+dOGmsP10Vrl9siFq087okDLxo/0g8wR7koexV+cUHSK9Y9vzd9YTvz5IKzO4e/ODPXJEsXZ32HYdYs/TesLYSlIgtB6+KrmdIsA+gBrBKHO5QThzy7ivJHWUAEggyR9dCD0nENrnOc0h1jGFwBEQvj07zD7FwRUngsbWFoD/bPoV+FdQR/6Ku1m0ex80WJQjEQJCoEkEqOX79VtV9v/eLrcDsJ/NRWcXO8ZqqpB3VtTY8wuq3PSut29HDMhG9M56muPBUf+d9YhQlTywG4gf3tnNEWr7dpL0HWrO0ckdQ4UE3depjJAIASGQRgQidUSuXmXPFwEaGriebYG6gCnb9uGzez9y96Kds8FTkRc7i8spZ2oawCMrho6zSPfe4JGxO2EfiRAQAg4BEjsSvvkbw/aj58ttyeYTc7JhuPz++8sVVlwQsAqEaqEmcHddEGge/OKyiG2DjS27RDbdbWVh+3BttYvh5wrHv1w0v8lIGzegmzpODxPvO+uJH+MJlVfXpkDzQEnlm/1AmC/9VhQSvsKsv1OtCLCKFgKuFwIMG0DydpDsYXk/Gh46JkSDb4gPQ50xiHqs6QXV/zV5Vj14rEW79cAMsjqchsDplxCoRYCEbyWmZR97v8qenFeJuH5oOz6JgM09+j7zMMaXnevC9tG6E0Qx3l4jeuUgZamyd8TDJuvpBD1+9iB5dLMFz+uxqoYPbbPLSnAgs43kw6NXIgSEQBoRiJLY4RW4AqqBTfjei7KPgvRRCdiYkz15YCWOLUC7j1HDVw4bZ5WjJlqgCI4lIn5pvFkqqj0hsGYXsms8U24fr69uFft45uz9/g1FmOZFe5achEDWEz96/FRA49eWpUjx+9ry7VHdMgkBNnUqEpZjTnY1Xn/bQeCq8eHYL5XxH/etiCF/HbtY5YSLLDx4OGyUZN8HhCRCIC4CfbsG7O4p+VaAvm3Oyqq0kb8ShFeahkwdd1+Yb5coY0dc7Lky64lfQmTSuOEoYgkxnhA9iONJeVWkUUPWjvJKigeb1gmBphFwhA4ErRy7rsXrbgWmfrajIaYjMxXLLkfZhWCT8OCtPnuKHb9kJlL/Fmuat+k7oz2yGIHeXQJ2+/l5dg5Sqr25LNd+926lbdx7Qs0ehLb88rHIuJEftKrqiC3dcNy27T3hWDWkd4GNH4Kg7Wh++HMEckC3oI0fELLx/XOsD8qXJEZAxC8xNmnbsg3pZLYdOPHQplIwyWIRHDskQkAIJIkASV0lGg6b3Ca0ndXQ7u3CdC7JX7qFGsRIgVWdOdnKbrnfcrqXyqkj3RirvHaJAPu2Eb2CcL7ItykjQvYwyN+z8yudkiQIb8vvXVcEAhe0A0dr7EdPlNm2DfS4qpVJ53WzH9zeHZYWtX0jnTMLoGRXdisPoca/Rfwax+e0by1Wto7Tfg9UgbaKgG9ARJJ3FL+PgeDtxPImuN7uxOstsX14Wi4q0qWrVU68wA595gHL6TcIKYKhUZRtX1qwVSHZgQAJ27gBOfbjOwrtqjNz7d9eLLfVO6PWrRjp3RCQmSSwoBMGb51hN1snRZ3zsS0Ya2LrbdZ3EwiI+DUB0One3BHu7BIhIATiIBDGMH8X2sdB9Bx0zNgB0rcXrzROv7a22W5hkVUMGWmVF06z8ktnWqg7tA8gffLkjXOftEoINIEAezlq60j8zhkSst/OrjbatlNyQfyG9uhik4YgDWKdDO6W66Z4vd/6Tg0BEb/U8Dqle+fSm1fTvKcUc50scxAI78+3nLehCSgD8eM0bmuTPWjyop2h4Rs20qpGn2UVEydbdPgoywUJdDH7pOnLnIdHNW2zCJR2DNg/XMNgmZLWQkDEr7WQTUO5RfmBhA4haSheRQiBjEWAmrXoEXQO20D6WlOKO1q4tKdV9elv1QOGwmN3mNUMHGzRPgMsWNTBgrkhaPmgeZQIASEgBDIEgXZJ/M4bGrZ+JScP//t2qa43Bs2E+zO8d9SOIhtA7JV0Kkwl7kQmXKnqKARSRABkq3L8+Vb9pYcsHDnhDXiisZyY7q2dMPKVzwblzwfla2BRaNmj8NCNFnawcKcuyNHbCTH5ii3SsZMZfgfgsevIHtK7aVrXh6kWhYAQyBgE2iXx618SMX5i5aQOIHaHNva7bxurj6ojBE4lAtddcUvjpyN5Yx5ev/hInH91KsuM61wfJ8JvSYT1Af/vVAqNs+/7c+bahRdNjrNFq4SAEBACrYdAuyR+mUbwWu/2qmQhkHkITLn4Avvu97+deRVPscYkfd/9x/Z/nSnCot2FQAMEaDrLvLt+CbVL5uK/wtZdDpSV70jDGLl1K6nShYAQEAJCQAgIgexDAGl7beW2iP1qVpVt2BW20Qj98g04f/RDwGZJ8xAQ8WsebjpKCAgBISAEhIAQOEUIrAD5m7cmYpedmWP9u2leryWwi/i1BD0dKwSEgBAQAkJACAiBDEJAutIMulmqqhAQAkJACAgBISAEWoKAiF9L0NOxQk
(base64-encoded header image omitted)" + ], + "metadata": { + "id": "RL1LDp645ogr" + } + }, + { + "cell_type": "markdown", + "source": [ + "# **Post HL7v2 messages to Google Cloud Healthcare API HL7v2 store pipeline**\n", + "\n", + "This example demonstrates how to set up an Apache Beam pipeline that reads an HL7 file from [Google Cloud Storage](https://cloud.google.com/storage) and calls the [Google Cloud Healthcare API HL7v2 store](https://cloud.google.com/healthcare-api/docs/how-tos/hl7v2-messages) to store the HL7 messages it contains. You can use this pattern to read raw HL7 messages, parse or modify them as needed to match your HL7v2 store configuration, and write the results to the store.\n", + "\n", + "An Apache Beam pipeline reads input data, transforms that data, and writes output data. It consists of PTransforms and PCollections. A PCollection represents a distributed data set that your Beam pipeline operates on. A PTransform represents a data processing operation, or a step, in your pipeline: it takes one or more PCollections as input, performs a processing function that you provide on the elements of that PCollection, and produces zero or more output PCollections.\n", + "\n", + "For details about Apache Beam pipelines, including PTransforms and PCollections, visit the [Beam Programming Guide](https://beam.apache.org/documentation/programming-guide/).\n", + "\n", + "You'll be able to use this notebook to explore the data in each PCollection." + ], + "metadata": { + "id": "wC9KRrlORwKu" + } + }, + { + "cell_type": "markdown", + "source": [ + "**What is an HL7v2 message?**\n", + "\n", + "HL7 messages are used to transfer electronic data between disparate healthcare systems, each message carrying information about a particular event such as a patient admission.\n", + "\n", + "An HL7 message consists of one or more segments. Each segment is displayed on a different line of text.
A carriage return character (\\r, which is 0D in hexadecimal) separates one segment from another.\n", + "\n", + "Each segment consists of one or more composites, also known as fields. A pipe (|) character separates one composite from the next. If a composite contains sub-composites (or sub-fields), these are normally separated by caret (^) characters.\n", + "\n" + ], + "metadata": { + "id": "AOVYgtyaqSxa" + } + }, + { + "cell_type": "markdown", + "source": [ + "***Sample HL7v2 Message***\n", + "\n", + "The reference message below shows a sample HL7v2 message whose segments are separated by \\r.\n", + "\n", + "**MSH|^~\\\\&|FROM_APP|FROM_FACILITY|TO_APP|TO_FACILITY|20150503223000||ADT^A01|20150503223000|P|2.5|\\r\nEVN|A01|20110613083617|\\r\nPID|1||21004053^^^^MRN||SULLY^BRIAN||19611209|M|||123 MAIN ST^^MOUNTAIN SPRINGS^CO^80439|\\r\nPV1||I|H73 RM1^1^^HIGHWAY 73 CLINIC||||5148^MARY QUINN|||||||||Y||||||||||||||||||||||||||||20150503223000|**\n", + "\n", + "The input file contains many such messages; the objective of this notebook is to split the file into individual messages and POST each one to the Google Cloud Healthcare API HL7v2 store." + ], + "metadata": { + "id": "-lpbvwHmX1L5" + } + },
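+ { + "cell_type": "markdown", + "source": [ + "To make that structure concrete, the next cell splits a shortened sample message into segments and fields with plain Python. This cell is an illustration only and is not part of the pipeline; the pipeline below performs the equivalent split at scale." + ], + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "# Illustration: split an HL7v2 message into segments (\\r) and fields (|).\n", + "sample_message = 'MSH|^~\\\\&|FROM_APP|FROM_FACILITY|TO_APP|TO_FACILITY|20150503223000||ADT^A01|20150503223000|P|2.5|\\rEVN|A01|20110613083617|'\n", + "for segment in sample_message.split('\\r'):\n", + "    fields = segment.split('|')\n", + "    print(fields[0], fields[1:4])" + ], + "metadata": {}, + "execution_count": null, + "outputs": [] + },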
+ ], + "metadata": { + "id": "tpePe_yOsdSJ" + } + }, + { + "cell_type": "markdown", + "source": [ + "Before running please set the following variables as arguments as mentioned below\n" + ], + "metadata": { + "id": "_1Q3mw1usnoE" + } + }, + { + "cell_type": "code", + "source": [ + "args = {'gcp_project':'xxx', #GCP project ID\n", + " 'gcp_region':'xxx', # GCP project region\n", + " 'temp_location':'gs:///tmp', #input location where your HL7 messages are stored in GCS bucket\n", + " 'input_file':'gs:///my_message.hl7', #input location where your HL7 messages are stored in GCS bucket\n", + " 'hcapi_project_id':'xxxxxx', #healthcare API project ID\n", + " 'hcapi_dataset':'xxxx', #healthcare dataset\n", + " 'hcapi_version':'v1', #healthcare API version by defualt v1\n", + " 'hcapi_location':'xxxx', #healthcare API configured location\n", + " 'hcapi_hl7_store':'xxx', #healthcare api hl7 store\n", + " 'hcapi_fhir_store':''}" + ], + "metadata": { + "id": "a722GbqdvgOX" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "**Google Cloud Healthcare (HCAPI) API Utils class**\n", + "\n", + "Below is the code snippet which describes the class having healthcare API connections and configurations. Basic functionality includes constructing the hcapi_url as per the input parameters, cleaning the HL7 message in a proper format and posting hl7v2 message to hl7v2 store. You can add more transformations as per your requirements." + ], + "metadata": { + "id": "NHzk8JIqxQoa" + } + }, + { + "cell_type": "code", + "source": [ + "import google.auth\n", + "import google.auth.transport.requests\n", + "import base64\n", + "import json\n", + "import hashlib\n", + "import requests\n", + "import logging\n", + "import apache_beam as beam\n", + "from apache_beam.options.pipeline_options import PipelineOptions\n", + "from apache_beam.options.pipeline_options import SetupOptions\n", + "from apache_beam.testing.test_pipeline import TestPipeline\n", + "import apache_beam.runners.interactive.interactive_beam as ib\n", + "from apache_beam import io\n", + "\n", + "logging.basicConfig(level=logging.INFO, format='%(asctime)s :: %(levelname)s :: %(message)s')\n", + "\n", + "class hcapi_cls:\n", + "\n", + " def __init__(self, args):\n", + " self.hcapi_hl7_store = str(args['hcapi_hl7_store'])\n", + " self.hcapi_project_id = str(args['hcapi_project_id'])\n", + " self.hcapi_version = str(args['hcapi_version'])\n", + " self.hcapi_location = str(args['hcapi_location'])\n", + " self.hcapi_dataset = str(args['hcapi_dataset'])\n", + " self.hcapi_fhir_store = str(args['hcapi_fhir_store'])\n", + " self.token = None\n", + "\n", + " def google_api_headers(self):\n", + " \"\"\" Function gets the token for the request \"\"\"\n", + " logging.info(\"fetching token and refreshing credentials\")\n", + " creds, project = google.auth.default()\n", + " auth_req = google.auth.transport.requests.Request()\n", + " creds.refresh(auth_req)\n", + " return {\n", + " \"Authorization\": f\"Bearer {creds.token}\",\n", + " \"Prefer\": \"handling=strict\"\n", + " }\n", + "\n", + " def hcapi_dataset_url(self, version=None, project=None, location=None, dataset=None):\n", + " \"\"\" This function creates base hcapi dataset url and returns it \"\"\"\n", + " base = 'https://healthcare.googleapis.com'\n", + " version = self.hcapi_version\n", + " project = self.hcapi_project_id\n", + " location = self.hcapi_location\n", + " dataset = self.hcapi_dataset\n", + " return 
f'{base}/{version}/projects/{project}/locations/{location}/datasets/{dataset}'\n", + "\n", + "    def hcapi_get(self, url):\n", + "        \"\"\" Sends a GET request to HCAPI \"\"\"\n", + "        response = requests.get(url, headers=self.google_api_headers())\n", + "        if not response.ok:\n", + "            raise Exception(f'Error with HC API get:\\n{response.text}')\n", + "        return response.json()\n", + "\n", + "    def hcapi_post(self, url, data):\n", + "        \"\"\" Sends a POST request to HCAPI \"\"\"\n", + "        response = requests.post(url, headers=self.google_api_headers(), json=data)\n", + "        if not response.ok:\n", + "            raise Exception(f'Error with HC API post:\\n{response.text}')\n", + "        return response.json()\n", + "\n", + "    def hcapi_delete(self, url):\n", + "        \"\"\" Sends a DELETE request to HCAPI \"\"\"\n", + "        response = requests.delete(url, headers=self.google_api_headers())\n", + "        if not response.ok:\n", + "            raise Exception(f'Error with HC API delete:\\n{response.text}')\n", + "        return response.json()\n", + "\n", + "    def hcapi_hl7_url(self, version=None, project=None, location=None, dataset=None, store=None):\n", + "        \"\"\" Builds the HCAPI hl7V2Stores URL \"\"\"\n", + "        base_url = self.hcapi_dataset_url(version=version, project=project,\n", + "                                          location=location, dataset=dataset)\n", + "        hl7_store = store or self.hcapi_hl7_store\n", + "        return f'{base_url}/hl7V2Stores/{hl7_store}'\n", + "\n", + "    def get_hl7_message(self, message_id):\n", + "        \"\"\" Fetches a message from the HL7v2 store by message ID \"\"\"\n", + "        url = f'{self.hcapi_hl7_url()}/messages/{message_id}'\n", + "        return self.hcapi_get(url)\n", + "\n", + "    def post_hl7_message(self, payload):\n", + "        \"\"\" Posts a message payload to the HL7v2 store \"\"\"\n", + "        url = f'{self.hcapi_hl7_url()}/messages'\n", + "        return self.hcapi_post(url, payload)\n", + "\n", + "    def message_to_hl7_store(self, message):\n", + "        \"\"\" Normalizes an HL7 message to \\\\r separators, base64-encodes it, and posts it \"\"\"\n", + "        message = str(message)\n", + "        message = message.replace('\\n', '\\r')\n", + "        message = message.replace('\\\\r', '\\r')\n", + "        message = message.replace('\\r\\r', '\\r')\n", + "        encoded = base64.b64encode(message.encode())\n", + "        payload = {\n", + "            \"message\": {\n", + "                \"data\": encoded.decode()\n", + "            }\n", + "        }\n", + "        return self.post_hl7_message(payload)\n", + "\n", + "    def hcapi_fhir_url(self, version=None, project=None, location=None, dataset=None, store=None):\n", + "        \"\"\" Builds the HCAPI FHIR store URL \"\"\"\n", + "        base_url = self.hcapi_dataset_url(version=version, project=project,\n", + "                                          location=location, dataset=dataset)\n", + "        if store is None:\n", + "            raise Exception('No FHIR store specified')\n", + "        return f'{base_url}/fhirStores/{store}/fhir'\n", + "\n", + "    def hcapi_fhir_request(self, store_key, query, data=None, method='GET'):\n", + "        \"\"\" Sends a request to the HCAPI FHIR store \"\"\"\n", + "        data = data or {}\n", + "        store = self.hcapi_fhir_store\n", + "        if not store:\n", + "            raise Exception(f\"Couldn't find FHIR store named {store_key} in config\")\n", + "        url = self.hcapi_fhir_url(store=store)\n", + "        url = f'{url}/{query}' if query else url\n", + "        get = lambda q, d: self.hcapi_get(url)\n", + "        post = lambda q, d: self.hcapi_post(url, data)\n", + "        delete = lambda q, d: self.hcapi_delete(url)\n", + "        return {'GET': get, 'POST': post, 'DELETE': delete}[method](query, data)\n", + "\n" + ], + "metadata": { + "id": "H7g4_-rGS9P_" + }, + "execution_count": null, + "outputs": [] + },
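+ { + "cell_type": "markdown", + "source": [ + "As a quick, side-effect-free check (an illustration, not part of the original flow), the next cell instantiates the class with the `args` above and prints the URLs it will call. No request is sent." + ], + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "# Illustration: inspect the HCAPI URLs built from the args above.\n", + "hcapi_demo = hcapi_cls(args)\n", + "print(hcapi_demo.hcapi_dataset_url())\n", + "print(hcapi_demo.hcapi_hl7_url())" + ], + "metadata": {}, + "execution_count": null, + "outputs": [] + },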
"execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "**Pipeline Setup**\n", + "\n", + "We will use InteractiveRunner in this notebook.\n", + "Following are the DoFn classes which carry out their respective operations" + ], + "metadata": { + "id": "lXnzAtbHyUd2" + } + }, + { + "cell_type": "markdown", + "source": [ + "The following class **BuildFileName** takes the file name from the element and converts its into string. You can enhance this class to construct GCS bucket URL, if your GCS bucket prefix remains constant." + ], + "metadata": { + "id": "TKnL8kxh3Kms" + } + }, + { + "cell_type": "code", + "source": [ + "class BuildFileName(beam.DoFn):\n", + " \"\"\" Class to get file name from variable and returns the filename \"\"\"\n", + " def process(self, element):\n", + " logging.info(\"processing the following file: {}\".format(element))\n", + " file_path = str(element)\n", + " yield file_path" + ], + "metadata": { + "id": "N01E3dQd3Jr3" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The following class **BuildMessages** takes the GCS URL from the above class reads it, separates out each message, appends them into a list and return the list for the next class." + ], + "metadata": { + "id": "Jej68R8w3i2Z" + } + }, + { + "cell_type": "code", + "source": [ + "class BuildMessages(beam.DoFn):\n", + " \"\"\" Class to read file, clean and seperate messgaes based on MSH\"\"\"\n", + " def process(self, file_name):\n", + " try:\n", + " logging.info(\"starting to read file: {}\".format(file_name))\n", + " file = io.gcsio.GcsIO().open(filename=file_name, mode='r')\n", + " read_file = file.read()\n", + " new_file = str(read_file, encoding='utf-8').replace('\\n', '\\r')\n", + " logging.info(\"starting to seperate HL7 messages into list\")\n", + " messages=[]\n", + " for line in new_file.split('\\r'):\n", + " if line[:3] =='MSH':\n", + " messages.append(line)\n", + " else:\n", + " messages[-1]+= line\n", + "\n", + "\n", + " logging.info(\"total number of messages parsed are {}\".format(len(messages)))\n", + " return messages\n", + " except Exception as error:\n", + " logging.error(\"got the following error while processing : {}\".format('\\n'+str(error)))\n", + " raise Exception\n", + "\n", + "\n" + ], + "metadata": { + "id": "MC6tr_sGyNKG" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The following class **PostToHL7V2Store** takes the messages return in the earlier class and POST each messages to Hl7v2 store ." 
+ ], + "metadata": { + "id": "1hpuoUGA33jo" + } + }, + { + "cell_type": "code", + "source": [ + "class PostToHL7V2Store(beam.DoFn):\n", + " \"\"\" Class to read file, clean and seperate messgaes based on MSH\"\"\"\n", + " def process(self, element):\n", + " try:\n", + " logging.info(\"starting to prepare and post message\")\n", + " hl7v2_store_response = hcapi.message_to_hl7_store(element)\n", + " message_id = hl7v2_store_response['name'].split(\"/\")[-1]\n", + " logging.info(\"successfully posted message to Hl7v2 store with message id :- {}\".format(message_id))\n", + "\n", + " yield message_id\n", + " except Exception as error:\n", + " logging.error(\"got the following error while processing : {}\".format(error))\n", + " raise Exception" + ], + "metadata": { + "id": "lVjqYfb2330k" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The following function sets up a beam pipeline with various other Pipeline options that will extracts messages from Hl7 text and post each hl7 message to hl7v2 store using Google Cloud Healthcare API (HCAPI) api methods.\n", + "\n", + "**\"|\"** is an overloaded operator that applies a PTransform to a PCollection to produce a new PCollection. Together with |, >> allows you to optionally name a PTransform.\n", + "\n", + "Usage:[PCollection] | [PTransform], **or** [PCollection] | [name] >> [PTransform]" + ], + "metadata": { + "id": "g5oJgXCk4O1a" + } + }, + { + "cell_type": "code", + "source": [ + "\n", + "import apache_beam.runners.interactive.interactive_beam as ib\n", + "def run(beam_args,argv=None,save_main_session=True):\n", + " runnertype = \"InteractiveRunner\"\n", + " project=beam_args['gcp_project']\n", + " region=beam_args['gcp_region']\n", + " temp_location=beam_args['temp_location']\n", + "\n", + " options = PipelineOptions(\n", + " flags=argv,\n", + " runner=runnertype,\n", + " project=project,\n", + " job_name=\"my-beam-hl7to-hcapi\",\n", + " temp_location=temp_location,\n", + " region=region)\n", + " beam_pipeline_options = PipelineOptions(beam_args)\n", + " beam_pipeline_options.view_as(SetupOptions).save_main_session = save_main_session\n", + " with beam.Pipeline(options=beam_pipeline_options) as pipeline:\n", + " file = (\n", + " pipeline\n", + " | 'reading filename' >> beam.Create([args_dict['input_file']])\n", + " | 'preparing file path' >> beam.ParDo(BuildFileName())\n", + " )\n", + " hl7_messages=(\n", + " file\n", + " | 'parsing hl7 messages' >> beam.ParDo(BuildMessages())\n", + " )\n", + " post_hl7_messages = (\n", + " hl7_messages\n", + " | \"posting to hl7v2 Store\" >> beam.ParDo(PostToHL7V2Store())\n", + " )\n", + "\n", + "\n", + " ib.show_graph(pipeline)\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " logging.getLogger().setLevel(logging.INFO)\n", + " args_dict = dict(args)\n", + " hcapi= hcapi_cls(args_dict)\n", + " run(beam_args=args_dict)" + ], + "metadata": { + "id": "Dynn2PDuyRBT" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "![Screenshot 2023-09-29 at 4.42.51 
+ { + "cell_type": "markdown", + "source": [ + "![Screenshot 2023-09-29 at 4.42.51 PM.png](pipeline graph rendered by ib.show_graph; base64 image data omitted)"
Ez32LUru3bs3MjMzW4T5bjRNAxcVXbFBgEQTG34walFQUNCmNryXycrKQlJSUps4CogOARJNdLj7LdVfb8J7mfz8fL/pKTA6BEg00eHut9Ru3bohJycHTqezOZ5PDvgTU3MC+mA5ARKN5ciDF8if2SiKYiTiz2by8vLQuXPn4EYUaykBEo2luEMXxnuVTp06GQm9Xi/1MqGRWZ6CRGM58uAF8uEYnxBwu93o0qULsrOzgxtQrOUESDSWIw9c4OkzOir3aujVN9tYVjNydAG279JQe1xH4+eB7SjGWgK0jMZa3i1K27Jdw85KDfsO6DhSq7OHm/rV+C/6AJ3WAM6rz24G9pcw6nYZ48dKmDJRxleTpKvp6ZNlBEg0lqFuKugo6zVWv6JibYWGi//SkTpMxtg7Jdw2XMbgQRL6Ma3c0F3C3LmPoqysDP9uBM6d13G6Tge33V+tYVdlk8CmTpIx8z4Hpk+lAYOlbtTpsoTAkWOann+/V0fXL/Vb0zz64qWKXveRFrDsPXv2BIy7fFnXyytUfXKux8gvNd2jr9uoBkxPEeYSgLnZUW7+CDxZ3CSWtHEeveINc7/c7x/V9Bk/aMo/K8ejf3AysBD91Y3CwidA/XoH9usHD7Ph1xgvylZqWP6cE9WVLtw73VzkKcMkvLrSicotLjRcBIaO8mDFav8LPzuwqbbK2lwP2gpd8MauXa9h5HgPknoAxw+48PCswMv+g+fUvthxY2S8964LC+Y5MXuugjnzmx6Qts+aUoVDgCYCwqHVzrTLX1bx6I8VPPYjB55ffHVJTDvNI05WziYZih70ojDXgfLfWF9+xA2I8QxINCY7aMUqFbPnKfjZU048s6Bje5dgVd/FprK/k6/gnilsMw4STjBUYceRaMJGFthg01sasou8KH7SiUVPR08wvhryZ0ATpnkx5yEHli2hHsfHJdK/JJpICf7P/ky9jhFj2Vqxe2SsWh47X1D+POg+NlRbsdSJhx6IvpBNwh3VbEg0JuGfON2LzxqAmiqXSTmal81PnlWx5HnVmJAYMphWEURKlkQTKUFm/+JKFY88oWD/LjdGp8Xml3JUhhc3spm8typiT9QmuMDSLEg0EeL+4j/A12/14PszZCwpiZ1hWetm7a7ScNdUL9avYe/oZNOThtZ8wrkn0YRDy0/aZ3+hYtlLKs6edKNrVz8JYiio6EEFH57WY3IIGUOYQlaF/uWERBQ4gc4WJfOh2eOzHTEvGN6KuQ87cOiIhs1btMCNopiQBEg0IREFTvDKWg2ffqZj1sz4wDgiRcLETBlrWL3pEicQH94Wb1+HWq7fpCJ3mgM39YzNH//+Gl+U58Cbb6v456fXvLvjLyGFBSRAogmIJnhEI3vP5Z2dmvHEPXjK2Ir1vXuzdTuJRtQzJBpBcvy1ZH59+6746WV4fa+/HsgcJ2P3H2iIxnmIXCQaEWrMpvqQjm/eIrNVzPElGt7cO0bKqK4h0Qi6HiQaQXLHT2hIHhp/guHNHcbqfYpt4nHpkmDjbW5GohH8AtR/BHxjQHyK5uaBTfWu/wv9rhFxP4lGhBqzOf93Hb16ChpH2azXTU2iufAPEo2IK0g0ItSYzedsH7LExPjsaXxn3/I20BU+ARJN+MwMC4XtUeaM05X2vnoHOENKkIh9zEg0gr5OSJDwZctTzAVzst7MV+8Et/Vl/z+USKIR9OIN3YGGhvj8TeCrd3fWBrrCJ0CiCZ+ZYdGnl4SP/xafojn7SVOje7M20BU+ARJN+MwMi0E3S/jzh/Epmj+d0thxHhL6f41EI+J+Eo0INWYzPFnC4aPxKRpe71RWf7rECJBoxLgZS1EusGc1x07En3D27tNx52gSjaDraRmNKDi+F0DSjRK27YivNVyn2Jubx09qyBxP/y9FfU/kRMkxu+lsI76Nm+NLNBs2aca5NhMyyPWiridyouSYXeG9Mg69r2Hv/vgRzqvrVBTlkdsjcDsNzyKB9610GXekyXhpVXyIhm/Kzg+HeuC7cbqUIRJnmWhL/3IihDmHbarxu40q3jsY+xMCz72gYkaBA7RhYGROpy2cIuNnWGdM9sLF9uDb/mbsbsS3dLmKeU8pOHHQTaKJ0OfU00QIkJuXFDuwY7eGMnbERixe/CHs/GdULJzvJMGY4CDqaUyAyLMoLlVRukRhp525kXZbbD0DyZjiNd7SPLA7dntCk9xgSTYkGhMxT8r24q9ngap3nGzf5NgQDj8VbfVvNVQzwfBVDHRFToCGZ5EzbM6Bn30J9r3MnaHg0uXm4Kh9WFiiGudvvvZrJwnGRC+QaEyEyXuXja85WW+jg/c60dyQb8FPVfz8lwpeXuY0/XBcE5HFZVYkGpPdNmQQW1rzhosJBhg70Wts9WRyEUGzU9hcxPd+qGDxUsU4XGrWTHomExSYQCSJRgBaKJNbmHCqtjmNpfejMjx44VfWzKpV/VHDCHYE+za28+fbG1z0EDOUowTjSTSC4EKZ8aHa1tddKGbTvI+x48n52Zf7DnTMygF+AtvjCxSMy/KiX1/gEDuNbfLd5NpQPhKNJ7Ki5Nppt2ihA/t2uqEyvaRP8CL/fgW8RzDjOvuxjqcXqeg3xIOK1zXj98vv17uYcGiWzAy+gfKgKedAZDognK8wXrZCNXqc1GEycqbJyJoghzWzde68jh3v6sbO/5u3asaJBY/McuCJOQ64aaOMDvBa2yxJNG2ZdHgIXxW9boPGvviasekgH8qlDpcwmL1C3bePhK90l5CQAPAf9Y2NOs5dAM6whZa1H2jg78PwfmTKJBn52Q4U5dNgocMd1qoAEk0rIFbfHjysY3+1hqPHdONov0/O6Wi4CFzxwNhXLZEdSdiTnX8zsL+EoUMkjLxdwvgxMq67zuqaUnk+Av8F13uFass06m8AAAAASUVORK5CYII=)" + ], + "metadata": { + "id": "tweQCiuX5RVK" + } + } + ] +} \ No newline at end of file diff --git a/gradle.properties b/gradle.properties index 6bad220e641ba..ef84ea9a5cc98 100644 --- a/gradle.properties +++ b/gradle.properties @@ -30,8 +30,8 @@ signing.gnupg.useLegacyGpg=true # buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy. # To build a custom Beam version make sure you change it in both places, see # https://github.com/apache/beam/issues/21302. 
-version=2.52.0-SNAPSHOT -sdk_version=2.52.0.dev +version=2.53.0-SNAPSHOT +sdk_version=2.53.0.dev javaVersion=1.8 diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar index afba109285af7..7f93135c49b76 100644 Binary files a/gradle/wrapper/gradle-wrapper.jar and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index 4e86b9270786f..3fa8f862f7533 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,7 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-7.6.2-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-8.4-bin.zip networkTimeout=10000 +validateDistributionUrl=true zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists diff --git a/gradlew b/gradlew index 65dcd68d65c82..1aa94a4269074 100755 --- a/gradlew +++ b/gradlew @@ -83,10 +83,8 @@ done # This is normally unused # shellcheck disable=SC2034 APP_BASE_NAME=${0##*/} -APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit - -# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' +# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036) +APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit # Use the maximum available, or set MAX_FD != -1 to use that value. MAX_FD=maximum @@ -133,10 +131,13 @@ location of your Java installation." fi else JAVACMD=java - which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + if ! command -v java >/dev/null 2>&1 + then + die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. Please set the JAVA_HOME variable in your environment to match the location of your Java installation." + fi fi # Increase the maximum file descriptors if we can. @@ -144,7 +145,7 @@ if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then case $MAX_FD in #( max*) # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. - # shellcheck disable=SC3045 + # shellcheck disable=SC2039,SC3045 MAX_FD=$( ulimit -H -n ) || warn "Could not query maximum file descriptor limit" esac @@ -152,7 +153,7 @@ if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then '' | soft) :;; #( *) # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. - # shellcheck disable=SC3045 + # shellcheck disable=SC2039,SC3045 ulimit -n "$MAX_FD" || warn "Could not set maximum file descriptor limit to $MAX_FD" esac @@ -197,11 +198,15 @@ if "$cygwin" || "$msys" ; then done fi -# Collect all arguments for the java command; -# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of -# shell script including quotes and variable substitutions, so put them in -# double quotes to make sure that they get re-expanded; and -# * put everything else in single quotes, so that it's not re-expanded. + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + +# Collect all arguments for the java command: +# * DEFAULT_JVM_OPTS, JAVA_OPTS, JAVA_OPTS, and optsEnvironmentVar are not allowed to contain shell fragments, +# and any embedded shellness will be escaped. 
+#   * For example: A user cannot expect ${Hostname} to be expanded, as it is an environment variable and will be
+#     treated as '${Hostname}' itself on the command line.
 
 set -- \
         "-Dorg.gradle.appname=$APP_BASE_NAME" \
diff --git a/it/build.gradle b/it/build.gradle
index 35ccbba4c3606..42a9ad9f4ee88 100644
--- a/it/build.gradle
+++ b/it/build.gradle
@@ -30,4 +30,8 @@ tasks.register('GCSPerformanceTest') {
 
 tasks.register('BigTablePerformanceTest') {
     dependsOn(":it:google-cloud-platform:BigTablePerformanceTest")
+}
+
+tasks.register('BigQueryStorageApiStreamingPerformanceTest') {
+    dependsOn(":it:google-cloud-platform:BigQueryStorageApiStreamingPerformanceTest")
 }
\ No newline at end of file
diff --git a/it/cassandra/src/main/java/org/apache/beam/it/cassandra/CassandraResourceManagerUtils.java b/it/cassandra/src/main/java/org/apache/beam/it/cassandra/CassandraResourceManagerUtils.java
index ef617de518b13..f01800763787b 100644
--- a/it/cassandra/src/main/java/org/apache/beam/it/cassandra/CassandraResourceManagerUtils.java
+++ b/it/cassandra/src/main/java/org/apache/beam/it/cassandra/CassandraResourceManagerUtils.java
@@ -30,7 +30,7 @@ final class CassandraResourceManagerUtils {
       Pattern.compile("[/\\\\. \"\0$]"); // i.e. [/\. "$]
   private static final String REPLACE_DATABASE_NAME_CHAR = "-";
   private static final DateTimeFormatter TIME_FORMAT =
-      DateTimeFormatter.ofPattern("yyyyMMdd-HHmmss");
+      DateTimeFormatter.ofPattern("yyyyMMdd-HHmmss-SSSSSS");
 
   private CassandraResourceManagerUtils() {}
 
diff --git a/it/cassandra/src/main/java/org/apache/beam/it/cassandra/matchers/CassandraAsserts.java b/it/cassandra/src/main/java/org/apache/beam/it/cassandra/matchers/CassandraAsserts.java
index 6aecc6609cfb4..61f730bf3579d 100644
--- a/it/cassandra/src/main/java/org/apache/beam/it/cassandra/matchers/CassandraAsserts.java
+++ b/it/cassandra/src/main/java/org/apache/beam/it/cassandra/matchers/CassandraAsserts.java
@@ -31,7 +31,7 @@ public class CassandraAsserts {
 
   /**
-   * Convert Cassandra {@link com.datastax.oss.driver.api.core.cql.Row} list to a list of maps.
+   * Convert Cassandra {@link Row} list to a list of maps.
    *
    * @param rows Rows to parse.
    * @return List of maps to use in {@link RecordsSubject}.
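The `TIME_FORMAT` change above is easy to miss but substantive: the Cassandra resource manager derives keyspace/database names from a wall-clock timestamp, so two managers created within the same second could collide on the generated name. Appending the fractional-second field (`-SSSSSS`) makes such collisions far less likely, which is also why the keyspace-name regex in the test below gains a third `_\d{6}` group. A minimal, self-contained sketch of the new suffix behavior (the class and variable names here are illustrative, not Beam APIs):

    import java.time.LocalDateTime;
    import java.time.format.DateTimeFormatter;

    class TimestampSuffixDemo {
      // Same pattern as the updated TIME_FORMAT: six 'S' characters append a
      // microsecond-resolution fraction of the second to the suffix.
      private static final DateTimeFormatter TIME_FORMAT =
          DateTimeFormatter.ofPattern("yyyyMMdd-HHmmss-SSSSSS");

      public static void main(String[] args) {
        // Two names generated within the same wall-clock second now differ in the
        // fractional part (assuming the JVM clock offers sub-second resolution;
        // on a coarse clock the trailing digits may come out as zeros).
        String first = "keyspace-" + TIME_FORMAT.format(LocalDateTime.now());
        String second = "keyspace-" + TIME_FORMAT.format(LocalDateTime.now());
        System.out.println(first); // e.g. keyspace-20231107-142501-123456
        System.out.println(second); // e.g. keyspace-20231107-142501-129801
      }
    }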
diff --git a/it/cassandra/src/test/java/org/apache/beam/it/cassandra/CassandraResourceManagerTest.java b/it/cassandra/src/test/java/org/apache/beam/it/cassandra/CassandraResourceManagerTest.java index fe00457159fa5..318ef6d76c68c 100644 --- a/it/cassandra/src/test/java/org/apache/beam/it/cassandra/CassandraResourceManagerTest.java +++ b/it/cassandra/src/test/java/org/apache/beam/it/cassandra/CassandraResourceManagerTest.java @@ -72,7 +72,8 @@ public void testGetUriShouldReturnCorrectValue() { @Test public void testGetKeyspaceNameShouldReturnCorrectValue() { - assertThat(testManager.getKeyspaceName()).matches(TEST_ID.replace('-', '_') + "_\\d{8}_\\d{6}"); + assertThat(testManager.getKeyspaceName()) + .matches(TEST_ID.replace('-', '_') + "_\\d{8}_\\d{6}_\\d{6}"); } @Test diff --git a/it/common/src/main/java/org/apache/beam/it/common/PipelineLauncher.java b/it/common/src/main/java/org/apache/beam/it/common/PipelineLauncher.java index 8777bbec6c409..6d1aeae21dd45 100644 --- a/it/common/src/main/java/org/apache/beam/it/common/PipelineLauncher.java +++ b/it/common/src/main/java/org/apache/beam/it/common/PipelineLauncher.java @@ -121,6 +121,7 @@ class LaunchConfig { private final @Nullable String specPath; private final @Nullable Sdk sdk; private final @Nullable String executable; + private final @Nullable String requirementsFile; private final @Nullable Pipeline pipeline; private LaunchConfig(Builder builder) { @@ -130,6 +131,7 @@ private LaunchConfig(Builder builder) { this.specPath = builder.specPath; this.sdk = builder.sdk; this.executable = builder.executable; + this.requirementsFile = builder.requirementsFile; this.pipeline = builder.pipeline; } @@ -161,6 +163,10 @@ public ImmutableMap environment() { return executable; } + public @Nullable String requirementsFile() { + return requirementsFile; + } + public @Nullable Pipeline pipeline() { return pipeline; } @@ -185,6 +191,7 @@ public static final class Builder { private Map parameters; private Sdk sdk; private String executable; + private String requirementsFile; private Pipeline pipeline; private Builder(String jobName, String specPath) { @@ -243,6 +250,15 @@ public Builder setExecutable(String executable) { return this; } + public @Nullable String getRequirementsFile() { + return requirementsFile; + } + + public Builder setRequirementsFile(String requirementsFile) { + this.requirementsFile = requirementsFile; + return this; + } + public @Nullable Pipeline getPipeline() { return pipeline; } diff --git a/it/conditions/src/main/java/org/apache/beam/it/conditions/ConditionCheck.java b/it/conditions/src/main/java/org/apache/beam/it/conditions/ConditionCheck.java index b562b4a068c8a..de1b2b0cf2b70 100644 --- a/it/conditions/src/main/java/org/apache/beam/it/conditions/ConditionCheck.java +++ b/it/conditions/src/main/java/org/apache/beam/it/conditions/ConditionCheck.java @@ -77,6 +77,14 @@ public CheckResult(boolean success, String message) { this.message = message; } + public boolean isSuccess() { + return success; + } + + public String getMessage() { + return message; + } + @Override public String toString() { return "CheckResult{" + "success=" + success + ", message='" + message + '\'' + '}'; diff --git a/it/google-cloud-platform/build.gradle b/it/google-cloud-platform/build.gradle index 0917ddd3e21aa..4c5327b44c9ac 100644 --- a/it/google-cloud-platform/build.gradle +++ b/it/google-cloud-platform/build.gradle @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +import groovy.json.JsonOutput import org.apache.beam.gradle.IoPerformanceTestUtilities plugins { id 'org.apache.beam.module' } @@ -73,9 +74,11 @@ dependencies { testImplementation project(path: ":sdks:java:io:google-cloud-platform") testImplementation project(path: ":sdks:java:io:synthetic") testImplementation library.java.mockito_inline + testImplementation project(path: ":sdks:java:extensions:google-cloud-platform-core", configuration: "testRuntimeMigration") testRuntimeOnly project(path: ":runners:direct-java", configuration: "shadowTest") testRuntimeOnly library.java.slf4j_simple } tasks.register("GCSPerformanceTest", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'FileBasedIOLT', ['configuration':'large','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp']) -tasks.register("BigTablePerformanceTest", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'BigTableIOLT', ['configuration':'large','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp']) \ No newline at end of file +tasks.register("BigTablePerformanceTest", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'BigTableIOLT', ['configuration':'large','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp']) +tasks.register("BigQueryStorageApiStreamingPerformanceTest", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'BigQueryStreamingLT', ['configuration':'large', 'project':'apache-beam-testing', 'artifactBucket':'io-performance-temp']) diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java index 14bb05394de26..44a439b0ce91e 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java @@ -516,20 +516,20 @@ public abstract static class MetricsConfiguration { public abstract @Nullable String outputPCollectionV2(); - public static MetricsConfiguration.Builder builder() { + public static Builder builder() { return new AutoValue_LoadTestBase_MetricsConfiguration.Builder(); } @AutoValue.Builder public abstract static class Builder { - public abstract MetricsConfiguration.Builder setInputPCollection(@Nullable String value); + public abstract Builder setInputPCollection(@Nullable String value); - public abstract MetricsConfiguration.Builder setInputPCollectionV2(@Nullable String value); + public abstract Builder setInputPCollectionV2(@Nullable String value); - public abstract MetricsConfiguration.Builder setOutputPCollection(@Nullable String value); + public abstract Builder setOutputPCollection(@Nullable String value); - public abstract MetricsConfiguration.Builder setOutputPCollectionV2(@Nullable String value); + public abstract Builder setOutputPCollectionV2(@Nullable String value); public abstract MetricsConfiguration build(); } diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/artifacts/utils/JsonTestUtil.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/artifacts/utils/JsonTestUtil.java index 1ef12d33fa111..9a83558f7bfc7 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/artifacts/utils/JsonTestUtil.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/artifacts/utils/JsonTestUtil.java @@ -18,13 +18,22 @@ package 
org.apache.beam.it.gcp.artifacts.utils;
 
 import com.fasterxml.jackson.core.type.TypeReference;
+import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.MappingIterator;
+import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.json.JsonMapper;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
+import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
 
 /**
  * The {@link JsonTestUtil} class provides common utilities used for executing tests that involve
@@ -56,6 +65,67 @@ public static List<Map<String, Object>> readRecords(byte[] contents) throws IOEx
     return records;
   }
 
+  /**
+   * Reads NDJSON (Newline Delimited JSON) data from a byte array and returns a list of parsed JSON
+   * objects. Each JSON object is represented as a Map of String keys to Object values.
+   *
+   * @param jsonBytes A byte array containing NDJSON data.
+   * @return A list of parsed JSON objects as {@code Map<String, Object>}.
+   * @throws IOException if there's an issue reading or parsing the data.
+   */
+  public static List<Map<String, Object>> readNDJSON(byte[] jsonBytes) throws IOException {
+    try (ByteArrayInputStream inputStream = new ByteArrayInputStream(jsonBytes)) {
+      InputStreamReader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8);
+      JsonMapper mapper = new JsonMapper();
+
+      return new BufferedReader(reader)
+          .lines()
+          .map(
+              line -> {
+                try {
+                  // Deserialize each line as a Map
+                  return mapper.readValue(line, mapTypeRef);
+                } catch (IOException e) {
+                  throw new RuntimeException(e);
+                }
+              })
+          .collect(Collectors.toList());
+    }
+  }
+
+  /**
+   * Recursively sorts the keys of a nested JSON represented as a Map.
+   *
+   * @param jsonMap A {@code Map<String, Object>} representing the nested JSON.
+   * @return A sorted {@code Map<String, Object>} where the keys are sorted in natural order.
+   */
+  public static Map<String, Object> sortJsonMap(Map<String, Object> jsonMap) {
+    return jsonMap.entrySet().stream()
+        .collect(
+            Collectors.toMap(
+                Map.Entry::getKey,
+                entry -> {
+                  Object value = entry.getValue();
+                  if (value instanceof Map) {
+                    return sortJsonMap((Map<String, Object>) value);
+                  } else if (value instanceof List) {
+                    return ((List<?>) value)
+                        .stream()
+                        .map(
+                            item ->
+                                item instanceof Map
+                                    ? sortJsonMap((Map<String, Object>) item)
+                                    : item)
+                        .collect(Collectors.toList());
+                  } else {
+                    return value;
+                  }
+                },
+                (a, b) -> a, // Merge function (not needed for a TreeMap)
+                TreeMap::new // Resulting map is a TreeMap
+                ));
+  }
+
   /**
    * Read JSON records to a list of Maps.
    *
@@ -86,4 +156,83 @@ public static Map<String, Object> readRecord(String contents) throws IOException {
     return readRecord(contents.getBytes(StandardCharsets.UTF_8));
   }
+
+  /**
+   * Parses a JSON string and returns either a List of Maps or a Map, depending on whether the JSON
+   * represents an array or an object.
+   *
+   * @param jsonString The JSON string to parse.
+   * @return A List of Maps if the JSON is an array, or a Map if it's an object.
+   * @throws IOException If there's an error while parsing the JSON string.
+   */
+  public static Object parseJsonString(String jsonString) throws IOException {
+    ObjectMapper objectMapper = new ObjectMapper();
+    JsonNode jsonNode = objectMapper.readTree(jsonString);
+    if (jsonNode.isArray()) {
+      return parseJsonArray((ArrayNode) jsonNode);
+    } else if (jsonNode.isObject()) {
+      return parseJsonObject(jsonNode);
+    } else {
+      throw new IllegalArgumentException("Input is not a valid JSON object or array.");
+    }
+  }
+
+  /**
+   * Parses a JSON array represented by an ArrayNode and returns a List of Maps.
+   *
+   * @param arrayNode The JSON array to parse.
+   * @return A List of Maps containing the parsed data.
+   */
+  private static List<Object> parseJsonArray(ArrayNode arrayNode) {
+    List<Object> result = new ArrayList<>();
+    for (JsonNode element : arrayNode) {
+      if (element.isObject()) {
+        result.add(parseJsonObject(element));
+      } else {
+        result.add(parseSimpleNode(element));
+      }
+    }
+    return result;
+  }
+
+  /**
+   * Parses a JSON object represented by a JsonNode and returns a Map.
+   *
+   * @param objectNode The JSON object to parse.
+   * @return A Map containing the parsed data.
+   */
+  private static Map<String, Object> parseJsonObject(JsonNode objectNode) {
+    Map<String, Object> result = new HashMap<>();
+    objectNode
+        .fields()
+        .forEachRemaining(
+            entry -> {
+              String key = entry.getKey();
+              JsonNode value = entry.getValue();
+              if (value.isObject()) {
+                result.put(key, parseJsonObject(value));
+              } else if (value.isArray()) {
+                result.put(key, parseJsonArray((ArrayNode) value));
+              } else {
+                result.put(key, parseSimpleNode(value));
+              }
+            });
+    return result;
+  }
+
+  /** Parses a simple value from a JSON node: text, number, boolean, or null. */
+  @SuppressWarnings("nullness")
+  private static Object parseSimpleNode(JsonNode element) {
+    if (element.isTextual()) {
+      return element.asText();
+    } else if (element.isNumber()) {
+      return element.numberValue();
+    } else if (element.isBoolean()) {
+      return element.asBoolean();
+    } else if (element.isNull()) {
+      return null;
+    } else {
+      throw new IllegalArgumentException("Element is not a valid JSON object or array.");
+    }
+  }
 }
diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/DefaultPipelineLauncher.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/DefaultPipelineLauncher.java
index ad2dcafc007bd..620d24d4e1174 100644
--- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/DefaultPipelineLauncher.java
+++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/DefaultPipelineLauncher.java
@@ -99,7 +99,7 @@ public class DefaultPipelineLauncher extends AbstractPipelineLauncher {
           .put(PipelineResult.State.UNRECOGNIZED, JobState.UNKNOWN)
           .build();
 
-  private DefaultPipelineLauncher(DefaultPipelineLauncher.Builder builder) {
+  private DefaultPipelineLauncher(Builder builder) {
     super(
         new Dataflow(
             Utils.getDefaultTransport(),
@@ -109,8 +109,8 @@ private DefaultPipelineLauncher(DefaultPipelineLauncher.Builder builder) {
             : new HttpCredentialsAdapter(builder.getCredentials())));
   }
 
-  public static DefaultPipelineLauncher.Builder builder(Credentials credentials) {
-    return new DefaultPipelineLauncher.Builder(credentials);
+  public static Builder builder(Credentials credentials) {
+    return new Builder(credentials);
   }
 
   @Override
@@ -360,11 +360,22 @@ public LaunchInfo launch(String project, String region, LaunchConfig options) th
         options.executable() != null,
         "Cannot launch a dataflow job "
             + "without executable specified.
Please specify executable and try again!"); + if (options.requirementsFile() != null) { + // install requirements + cmd.add( + "virtualenv . && source ./bin/activate && pip3 install -r " + + options.requirementsFile()); + cmd.add("&&"); + } LOG.info("Using the executable at {}", options.executable()); cmd.add("python3"); cmd.add(options.executable()); cmd.addAll(extractOptions(project, region, options)); - jobId = executeCommandAndParseResponse(cmd); + if (options.requirementsFile() != null) { + cmd.add("&&"); + cmd.add("deactivate"); + } + jobId = executeCommandAndParseResponse(String.join(" ", cmd)); break; case GO: checkState( @@ -376,7 +387,7 @@ public LaunchInfo launch(String project, String region, LaunchConfig options) th cmd.add("run"); cmd.add(options.executable()); cmd.addAll(extractOptions(project, region, options)); - jobId = executeCommandAndParseResponse(cmd); + jobId = executeCommandAndParseResponse(String.join(" ", cmd)); break; default: throw new RuntimeException( @@ -441,10 +452,13 @@ private List extractOptions(String project, String region, LaunchConfig } /** Executes the specified command and parses the response to get the Job ID. */ - private String executeCommandAndParseResponse(List cmd) throws IOException { - Process process = new ProcessBuilder().command(cmd).redirectErrorStream(true).start(); + private String executeCommandAndParseResponse(String cmd) throws IOException { + LOG.info("Running command: {}", cmd); + Process process = + new ProcessBuilder().command("/bin/bash", "-c", cmd).redirectErrorStream(true).start(); String output = new String(ByteStreams.toByteArray(process.getInputStream()), StandardCharsets.UTF_8); + LOG.info(output); Matcher m = JOB_ID_PATTERN.matcher(output); if (!m.find()) { throw new RuntimeException( diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/DirectRunnerClient.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/DirectRunnerClient.java index 57f8ad40c1b6e..8017009ff3787 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/DirectRunnerClient.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/DirectRunnerClient.java @@ -53,8 +53,8 @@ public class DirectRunnerClient implements PipelineLauncher { this.mainClass = builder.getMainClass(); } - public static DirectRunnerClient.Builder builder(Class mainClass) { - return new DirectRunnerClient.Builder(mainClass); + public static Builder builder(Class mainClass) { + return new Builder(mainClass); } @Override @@ -172,7 +172,7 @@ public Class getMainClass() { return mainClass; } - public DirectRunnerClient.Builder setCredentials(Credentials value) { + public Builder setCredentials(Credentials value) { credentials = value; return this; } diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/datagenerator/DataGenerator.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/datagenerator/DataGenerator.java index 832a75defd95b..99016b5dd3a46 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/datagenerator/DataGenerator.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/datagenerator/DataGenerator.java @@ -61,16 +61,14 @@ private DataGenerator(Builder builder) { .build(); } - public static DataGenerator.Builder builderWithSchemaLocation( - String testName, String schemaLocation) { - return new DataGenerator.Builder(testName + "-data-generator") + public static Builder 
builderWithSchemaLocation(String testName, String schemaLocation) { + return new Builder(testName + "-data-generator") .setSchemaLocation(schemaLocation) .setAutoscalingAlgorithm(AutoscalingAlgorithmType.THROUGHPUT_BASED); } - public static DataGenerator.Builder builderWithSchemaTemplate( - String testName, String schemaTemplate) { - return new DataGenerator.Builder(testName + "-data-generator") + public static Builder builderWithSchemaTemplate(String testName, String schemaTemplate) { + return new Builder(testName + "-data-generator") .setSchemaTemplate(schemaTemplate) .setAutoscalingAlgorithm(AutoscalingAlgorithmType.THROUGHPUT_BASED); } @@ -131,27 +129,27 @@ public Map getParameters() { return parameters; } - public DataGenerator.Builder setSchemaTemplate(String value) { + public Builder setSchemaTemplate(String value) { parameters.put("schemaTemplate", value); return this; } - public DataGenerator.Builder setSchemaLocation(String value) { + public Builder setSchemaLocation(String value) { parameters.put("schemaLocation", value); return this; } - public DataGenerator.Builder setMessagesLimit(String value) { + public Builder setMessagesLimit(String value) { parameters.put(MESSAGES_LIMIT, value); return this; } - public DataGenerator.Builder setQPS(String value) { + public Builder setQPS(String value) { parameters.put("qps", value); return this; } - public DataGenerator.Builder setSinkType(String value) { + public Builder setSinkType(String value) { parameters.put("sinkType", value); return this; } @@ -166,87 +164,87 @@ public Builder setNumWorkers(String value) { return this; } - public DataGenerator.Builder setMaxNumWorkers(String value) { + public Builder setMaxNumWorkers(String value) { parameters.put("maxNumWorkers", value); return this; } - public DataGenerator.Builder setAutoscalingAlgorithm(AutoscalingAlgorithmType value) { + public Builder setAutoscalingAlgorithm(AutoscalingAlgorithmType value) { parameters.put("autoscalingAlgorithm", value.toString()); return this; } - public DataGenerator.Builder setOutputDirectory(String value) { + public Builder setOutputDirectory(String value) { parameters.put("outputDirectory", value); return this; } - public DataGenerator.Builder setOutputType(String value) { + public Builder setOutputType(String value) { parameters.put("outputType", value); return this; } - public DataGenerator.Builder setNumShards(String value) { + public Builder setNumShards(String value) { parameters.put("numShards", value); return this; } - public DataGenerator.Builder setAvroSchemaLocation(String value) { + public Builder setAvroSchemaLocation(String value) { parameters.put("avroSchemaLocation", value); return this; } - public DataGenerator.Builder setTopic(String value) { + public Builder setTopic(String value) { parameters.put("topic", value); return this; } - public DataGenerator.Builder setProjectId(String value) { + public Builder setProjectId(String value) { parameters.put("projectId", value); return this; } - public DataGenerator.Builder setSpannerInstanceName(String value) { + public Builder setSpannerInstanceName(String value) { parameters.put("spannerInstanceName", value); return this; } - public DataGenerator.Builder setSpannerDatabaseName(String value) { + public Builder setSpannerDatabaseName(String value) { parameters.put("spannerDatabaseName", value); return this; } - public DataGenerator.Builder setSpannerTableName(String value) { + public Builder setSpannerTableName(String value) { parameters.put("spannerTableName", value); return this; } - public 
DataGenerator.Builder setDriverClassName(String value) { + public Builder setDriverClassName(String value) { parameters.put("driverClassName", value); return this; } - public DataGenerator.Builder setConnectionUrl(String value) { + public Builder setConnectionUrl(String value) { parameters.put("connectionUrl", value); return this; } - public DataGenerator.Builder setUsername(String value) { + public Builder setUsername(String value) { parameters.put("username", value); return this; } - public DataGenerator.Builder setPassword(String value) { + public Builder setPassword(String value) { parameters.put("password", value); return this; } - public DataGenerator.Builder setConnectionProperties(String value) { + public Builder setConnectionProperties(String value) { parameters.put("connectionProperties", value); return this; } - public DataGenerator.Builder setStatement(String value) { + public Builder setStatement(String value) { parameters.put("statement", value); return this; } diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/datastore/matchers/DatastoreAsserts.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/datastore/matchers/DatastoreAsserts.java index 78fa7543150fd..ef67a5a5c4fb0 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/datastore/matchers/DatastoreAsserts.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/datastore/matchers/DatastoreAsserts.java @@ -61,8 +61,7 @@ public static List> datastoreResultsToRecords(Collection results) { diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dlp/DlpResourceManager.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dlp/DlpResourceManager.java index de818a1bbff18..f59794af3e1ff 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dlp/DlpResourceManager.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dlp/DlpResourceManager.java @@ -113,9 +113,8 @@ public void cleanupAll() { * @param project the GCP project ID * @return a new instance of Builder */ - public static DlpResourceManager.Builder builder( - String project, CredentialsProvider credentialsProvider) { - return new DlpResourceManager.Builder(project, credentialsProvider); + public static Builder builder(String project, CredentialsProvider credentialsProvider) { + return new Builder(project, credentialsProvider); } /** A builder class for creating instances of {@link DlpResourceManager}. 
*/ diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/kms/KMSResourceManager.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/kms/KMSResourceManager.java index 2cad6d0b9faba..7e1a403c73525 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/kms/KMSResourceManager.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/kms/KMSResourceManager.java @@ -72,9 +72,8 @@ private KMSResourceManager(Builder builder) { this.keyRing = null; } - public static KMSResourceManager.Builder builder( - String projectId, CredentialsProvider credentialsProvider) { - return new KMSResourceManager.Builder(projectId, credentialsProvider); + public static Builder builder(String projectId, CredentialsProvider credentialsProvider) { + return new Builder(projectId, credentialsProvider); } /** diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/monitoring/MonitoringClient.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/monitoring/MonitoringClient.java index 06591ea4fe0ae..0fc5614a36300 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/monitoring/MonitoringClient.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/monitoring/MonitoringClient.java @@ -150,8 +150,8 @@ public List listTimeSeriesAsLong(ListTimeSeriesRequest request) { Aggregation aggregation = Aggregation.newBuilder() .setAlignmentPeriod(Duration.newBuilder().setSeconds(60).build()) - .setPerSeriesAligner(Aggregation.Aligner.ALIGN_MEAN) - .setCrossSeriesReducer(Aggregation.Reducer.REDUCE_MEAN) + .setPerSeriesAligner(Aligner.ALIGN_MEAN) + .setCrossSeriesReducer(Reducer.REDUCE_MEAN) .addGroupByFields("resource.instance_id") .build(); ListTimeSeriesRequest request = @@ -188,7 +188,7 @@ public List listTimeSeriesAsLong(ListTimeSeriesRequest request) { Aggregation aggregation = Aggregation.newBuilder() .setAlignmentPeriod(Duration.newBuilder().setSeconds(60).build()) - .setPerSeriesAligner(Aggregation.Aligner.ALIGN_MEAN) + .setPerSeriesAligner(Aligner.ALIGN_MEAN) .setCrossSeriesReducer(Reducer.REDUCE_MAX) .build(); ListTimeSeriesRequest request = @@ -225,7 +225,7 @@ public List listTimeSeriesAsLong(ListTimeSeriesRequest request) { Aggregation aggregation = Aggregation.newBuilder() .setAlignmentPeriod(Duration.newBuilder().setSeconds(60).build()) - .setPerSeriesAligner(Aggregation.Aligner.ALIGN_MEAN) + .setPerSeriesAligner(Aligner.ALIGN_MEAN) .setCrossSeriesReducer(Reducer.REDUCE_MAX) .build(); ListTimeSeriesRequest request = @@ -269,7 +269,7 @@ public List listTimeSeriesAsLong(ListTimeSeriesRequest request) { Aggregation aggregation = Aggregation.newBuilder() .setAlignmentPeriod(Duration.newBuilder().setSeconds(60).build()) - .setPerSeriesAligner(Aggregation.Aligner.ALIGN_RATE) + .setPerSeriesAligner(Aligner.ALIGN_RATE) .build(); ListTimeSeriesRequest request = ListTimeSeriesRequest.newBuilder() @@ -312,7 +312,7 @@ public List listTimeSeriesAsLong(ListTimeSeriesRequest request) { Aggregation aggregation = Aggregation.newBuilder() .setAlignmentPeriod(Duration.newBuilder().setSeconds(60).build()) - .setPerSeriesAligner(Aggregation.Aligner.ALIGN_RATE) + .setPerSeriesAligner(Aligner.ALIGN_RATE) .build(); ListTimeSeriesRequest request = ListTimeSeriesRequest.newBuilder() diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/spanner/matchers/SpannerAsserts.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/spanner/matchers/SpannerAsserts.java index 
c9964d16f3b1e..5a101e08d3757 100644
--- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/spanner/matchers/SpannerAsserts.java
+++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/spanner/matchers/SpannerAsserts.java
@@ -17,13 +17,17 @@
  */
 package org.apache.beam.it.gcp.spanner.matchers;
 
+import static org.apache.beam.it.gcp.artifacts.utils.JsonTestUtil.parseJsonString;
 import static org.apache.beam.it.truthmatchers.PipelineAsserts.assertThatRecords;
 
 import com.google.cloud.spanner.Mutation;
 import com.google.cloud.spanner.Struct;
 import com.google.cloud.spanner.Type;
+import com.google.cloud.spanner.Type.Code;
 import com.google.cloud.spanner.Value;
+import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -81,6 +85,47 @@ public static List<Map<String, Object>> mutationsToRecords(List<Mutation> mutati
     }
   }
 
+  /**
+   * Convert a list of Spanner {@link Mutation} objects into a list of maps, extracting specified
+   * columns.
+   *
+   * @param mutations The list of mutations to process.
+   * @param columns The columns to extract.
+   * @return List of maps to use in {@link RecordsSubject}.
+   */
+  public static List<Map<String, Object>> mutationsToRecords(
+      List<Mutation> mutations, List<String> columns) {
+    try {
+      List<Map<String, Object>> records = new ArrayList<>();
+      mutations.forEach(
+          entry -> {
+            records.add(
+                entry.asMap().entrySet().stream()
+                    .filter((e) -> columns.contains(e.getKey()))
+                    .collect(
+                        Collectors.toMap(
+                            Map.Entry::getKey,
+                            (e) -> {
+                              if (e.getValue().getType().getCode() == Code.ARRAY) {
+                                return e.getValue().getAsStringList();
+                              }
+                              if (Arrays.asList(Code.JSON, Code.PG_JSONB)
+                                  .contains(e.getValue().getType().getCode())) {
+                                try {
+                                  return parseJsonString(e.getValue().getJson());
+                                } catch (IOException ex) {
+                                  throw new RuntimeException(ex);
+                                }
+                              }
+                              return e.getValue().getAsString();
+                            })));
+          });
+      return records;
+    } catch (Exception e) {
+      throw new RuntimeException("Error converting mutations to records", e);
+    }
+  }
+
   /**
    * Creates a {@link RecordsSubject} to assert information within a list of records.
    *
diff --git a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigquery/BigQueryStreamingLT.java b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigquery/BigQueryStreamingLT.java
new file mode 100644
index 0000000000000..4589f79f1aaa6
--- /dev/null
+++ b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigquery/BigQueryStreamingLT.java
@@ -0,0 +1,643 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.beam.it.gcp.bigquery; + +import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryUtils.toTableReference; +import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryUtils.toTableSpec; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; + +import com.google.api.core.ApiFuture; +import com.google.api.services.bigquery.model.TableFieldSchema; +import com.google.api.services.bigquery.model.TableRow; +import com.google.api.services.bigquery.model.TableSchema; +import com.google.auto.value.AutoValue; +import com.google.cloud.bigquery.storage.v1.FlushRowsResponse; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.time.Instant; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.ThreadLocalRandom; +import java.util.stream.Collectors; +import org.apache.beam.it.common.PipelineLauncher; +import org.apache.beam.it.common.PipelineOperator; +import org.apache.beam.it.common.TestProperties; +import org.apache.beam.it.gcp.IOLoadTestBase; +import org.apache.beam.runners.dataflow.DataflowRunner; +import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; +import org.apache.beam.sdk.io.GenerateSequence; +import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; +import org.apache.beam.sdk.io.gcp.bigquery.BigQueryOptions; +import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServicesImpl; +import org.apache.beam.sdk.io.gcp.testing.BigqueryClient; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.testing.TestPipelineOptions; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.PeriodicImpulse; +import org.apache.beam.sdk.transforms.Reshuffle; +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.TypeDescriptors; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Duration; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Rule; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Load test for the Storage Write API sink + * + *

<p>This test is set up to first write rows using batch FILE_LOADS mode to a "source of truth"
+ * table. Afterwards, it will write the same rows in streaming mode with Storage API to a second
+ * table. Then it will run a query comparing the two tables to check that they are identical. There
+ * is also the option of providing an existing table with the expected data, in which case the test
+ * will skip the first step.
+ *
+ *

<p>The throughput, length of test (in minutes), and data shape can be changed via the test
+ * configuration. See the cases in `TEST_CONFIGS` for examples.
+ *
+ *

<p>This also includes the option of testing the sink's retry resilience by setting the
+ * `crashIntervalSeconds` System property. This intentionally fails the worker or work item
+ * periodically and expects the sink to recover appropriately. Note: Metrics are not published when
+ * this is used.
+ */
+public class BigQueryStreamingLT extends IOLoadTestBase {
+  private static final Logger LOG = LoggerFactory.getLogger(BigQueryStreamingLT.class);
+
+  private static final BigqueryClient BQ_CLIENT = new BigqueryClient("BigQueryStreamingLT");
+  private static final String BIG_QUERY_DATASET_ID =
+      "storage_api_sink_load_test_" + System.nanoTime();
+
+  private TestConfiguration config;
+  private Integer crashIntervalSeconds;
+
+  @Rule public final transient TestPipeline fileLoadsPipeline = TestPipeline.create();
+  @Rule public final transient TestPipeline storageApiPipeline = TestPipeline.create();
+
+  @BeforeClass
+  public static void setUpTestClass() throws IOException, InterruptedException {
+    PipelineOptionsFactory.register(TestPipelineOptions.class);
+    BQ_CLIENT.createNewDataset(project, BIG_QUERY_DATASET_ID);
+  }
+
+  @Before
+  public void setUpTest() {
+    String testConfig =
+        TestProperties.getProperty("configuration", "small", TestProperties.Type.PROPERTY);
+    config = TEST_CONFIGS.get(testConfig);
+    if (config == null) {
+      throw new IllegalArgumentException(
+          String.format(
+              "Unknown test configuration: [%s]. Known configs: %s",
+              testConfig, TEST_CONFIGS.keySet()));
+    }
+    // tempLocation needs to be set for file loads
+    if (!Strings.isNullOrEmpty(tempBucketName)) {
+      String tempLocation = String.format("gs://%s/temp/", tempBucketName);
+      fileLoadsPipeline.getOptions().as(TestPipelineOptions.class).setTempRoot(tempLocation);
+      fileLoadsPipeline.getOptions().setTempLocation(tempLocation);
+    }
+
+    // Set the expected table if the property is provided.
+    @Nullable
+    String expectedTable =
+        TestProperties.getProperty("expectedTable", "", TestProperties.Type.PROPERTY);
+    if (!Strings.isNullOrEmpty(expectedTable)) {
+      config = config.toBuilder().setExpectedTable(expectedTable).build();
+    }
+
+    crashIntervalSeconds =
+        Integer.parseInt(
+            TestProperties.getProperty("crashIntervalSeconds", "-1", TestProperties.Type.PROPERTY));
+  }
+
+  @AfterClass
+  public static void cleanup() {
+    BQ_CLIENT.deleteDataset(project, BIG_QUERY_DATASET_ID);
+  }
+
+  private static final Map<String, TestConfiguration> TEST_CONFIGS =
+      ImmutableMap.of(
+          "local", // 300K rows, >3 MB, 1K rows/s, >10KB/s
+          TestConfiguration.of(5, 5, 2, 1_000, "DirectRunner", null),
+          "small", // 600K rows, >30 MB, 1K rows/s, >50KB/s
+          TestConfiguration.of(10, 10, 5, 1_000, "DataflowRunner", null),
+          "medium", // 6M rows, >1.2 GB, 5K rows/s, >1MB/s
+          TestConfiguration.of(20, 20, 10, 5_000, "DataflowRunner", null),
+          "large", // 18M rows, >18 GB, 10K rows/s, >10MB/s
+          TestConfiguration.of(30, 50, 20, 10_000, "DataflowRunner", null));
+
+  /** Options for BigQuery IO streaming load test. */
+  @AutoValue
+  abstract static class TestConfiguration {
+    /** Rows will be generated for this many minutes. */
+    abstract Integer getMinutes();
+
+    /** Data shape: The byte-size for each field. */
+    abstract Integer getByteSizePerField();
+
+    /** Data shape: The number of fields per row. */
+    abstract Integer getNumFields();
+
+    /**
+     * Rate of generated elements sent to the sink. Will run with a minimum of 1k rows per second.
+     */
+    abstract Integer getRowsPerSecond();
+
+    abstract String getRunner();
+
+    /**
+     * The expected table to check against for correctness.
If unset, the test will run a batch + * FILE_LOADS job and use the resulting table as a source of truth. + */ + @Nullable + abstract String getExpectedTable(); + + static TestConfiguration of( + int numMin, + int byteSizePerField, + int numFields, + int rowsPerSecond, + String runner, + @Nullable String expectedTable) { + return new AutoValue_BigQueryStreamingLT_TestConfiguration.Builder() + .setMinutes(numMin) + .setByteSizePerField(byteSizePerField) + .setNumFields(numFields) + .setRowsPerSecond(rowsPerSecond) + .setRunner(runner) + .setExpectedTable(expectedTable) + .build(); + } + + @AutoValue.Builder + abstract static class Builder { + abstract Builder setMinutes(int numMin); + + abstract Builder setByteSizePerField(int byteSizePerField); + + abstract Builder setNumFields(int numFields); + + abstract Builder setRowsPerSecond(int rowsPerSecond); + + abstract Builder setRunner(String runner); + + abstract Builder setExpectedTable(@Nullable String expectedTable); + + abstract TestConfiguration build(); + } + + abstract Builder toBuilder(); + } + + @Test + public void testExactlyOnceStreaming() throws IOException, InterruptedException { + runTest(BigQueryIO.Write.Method.STORAGE_WRITE_API); + } + + @Test + @Ignore + public void testAtLeastOnceStreaming() throws IOException, InterruptedException { + runTest(BigQueryIO.Write.Method.STORAGE_API_AT_LEAST_ONCE); + } + + public void runTest(BigQueryIO.Write.Method writeMethod) + throws IOException, InterruptedException { + long millis = Duration.standardMinutes(config.getMinutes()).getMillis(); + int rowsPerSecond = Math.max(config.getRowsPerSecond(), 1000); + + // The PeriodicImpulse source will generate an element every this many millis: + int fireInterval = 1; + // Each element from PeriodicImpulse will fan out to this many elements + // (applicable when a high row-per-second rate is set) + long multiplier = rowsPerSecond / 1000; + long totalRows = multiplier * millis / fireInterval; + // If we run with DataflowRunner and have not specified a positive crash duration for the sink, + // this signifies a performance test, and so we publish metrics to a BigQuery dataset + boolean publishMetrics = + config.getRunner().equalsIgnoreCase(DataflowRunner.class.getSimpleName()) + && crashIntervalSeconds <= 0; + + String expectedTable = config.getExpectedTable(); + GenerateTableRow genRow = + new GenerateTableRow(config.getNumFields(), config.getByteSizePerField()); + TableSchema schema = generateTableSchema(config.getNumFields()); + if (Strings.isNullOrEmpty(expectedTable)) { + String fileLoadsDescription = + String.format("fileloads-%s-records", withScaleSymbol(totalRows)); + expectedTable = + String.format("%s.%s.%s", project, BIG_QUERY_DATASET_ID, fileLoadsDescription); + LOG.info( + "No expected table was set. Will run a batch job to load {} rows to {}." 
+ + " This will be used as the source of truth.", + totalRows, + expectedTable); + + fileLoadsPipeline + .apply(GenerateSequence.from(0).to(totalRows)) + .apply( + "Write to source of truth", + BigQueryIO.write() + .to(expectedTable) + .withFormatFunction(genRow) + .withMethod(BigQueryIO.Write.Method.FILE_LOADS) + .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE) + .withSchema(schema)); + + // If running on Dataflow, launch pipeline via launcher utils + if (publishMetrics) { + PipelineLauncher.LaunchConfig options = + PipelineLauncher.LaunchConfig.builder("test-" + fileLoadsDescription) + .setSdk(PipelineLauncher.Sdk.JAVA) + .setPipeline(fileLoadsPipeline) + .addParameter("runner", config.getRunner()) + .build(); + + // Don't use PipelineOperator because we don't want to wait on this batch job + // The streaming job will run in parallel and it will take longer anyways; this job will + // finish by then. + pipelineLauncher.launch(project, region, options); + } else { + fileLoadsPipeline.run(); + } + } + + String atLeastOnce = + writeMethod == BigQueryIO.Write.Method.STORAGE_API_AT_LEAST_ONCE ? "-atleastonce" : ""; + String storageApiDescription = + String.format( + "storageapi%s-load-%sqps-%smin-%stotal", + atLeastOnce, + withScaleSymbol(rowsPerSecond), + config.getMinutes(), + withScaleSymbol(totalRows)); + String destTable = + String.format("%s.%s.%s", project, BIG_QUERY_DATASET_ID, storageApiDescription); + LOG.info( + "Preparing a source generating at a rate of {} rows per second for a period of {} minutes." + + " This results in a total of {} rows written to {}.", + rowsPerSecond, + config.getMinutes(), + totalRows, + destTable); + + PCollection source = + storageApiPipeline + .apply( + PeriodicImpulse.create() + .stopAfter(Duration.millis(millis - 1)) + .withInterval(Duration.millis(fireInterval))) + .apply( + "Extract row IDs", + MapElements.into(TypeDescriptors.longs()) + .via(instant -> instant.getMillis() % totalRows)); + if (multiplier > 1) { + source = + source + .apply( + String.format("One input to %s outputs", multiplier), + ParDo.of(new MultiplierDoFn(multiplier))) + .apply("Reshuffle fanout", Reshuffle.viaRandomKey()); + } + + BigQueryIO.Write storageWriteTransform = + BigQueryIO.write() + .to(destTable) + .withFormatFunction(genRow) + .withMethod(writeMethod) + .withTriggeringFrequency(Duration.standardSeconds(1)) + .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND) + .withSchema(schema); + + // If a crash interval is specified, use our crashing service implementation + if (crashIntervalSeconds > 0) { + LOG.info( + "A crash interval of {} seconds has been set. 
The Storage API sink will periodically crash.",
+          crashIntervalSeconds);
+      storageWriteTransform =
+          storageWriteTransform.withTestServices(
+              new CrashingBigQueryServices(crashIntervalSeconds));
+    }
+    source.apply(storageWriteTransform);
+
+    // If we're publishing metrics, launch pipeline via Dataflow launcher utils and export metrics
+    if (publishMetrics) {
+      // Set up dataflow job
+      PipelineLauncher.LaunchConfig storageApiOptions =
+          PipelineLauncher.LaunchConfig.builder("test-" + storageApiDescription)
+              .setSdk(PipelineLauncher.Sdk.JAVA)
+              .setPipeline(storageApiPipeline)
+              .addParameter("runner", config.getRunner())
+              .addParameter("streaming", "true")
+              .addParameter("experiments", GcpOptions.STREAMING_ENGINE_EXPERIMENT)
+              .build();
+      // Launch job
+      PipelineLauncher.LaunchInfo storageApiInfo =
+          pipelineLauncher.launch(project, region, storageApiOptions);
+      // Wait until the streaming pipeline is finished and drained, get the result.
+      PipelineOperator.Result storageApiResult =
+          pipelineOperator.waitUntilDoneAndFinish(
+              PipelineOperator.Config.builder()
+                  .setJobId(storageApiInfo.jobId())
+                  .setProject(project)
+                  .setRegion(region)
+                  .setTimeoutAfter(java.time.Duration.ofMinutes(config.getMinutes() * 2L))
+                  .setCheckAfter(java.time.Duration.ofSeconds(config.getMinutes() * 60 / 20))
+                  .build());
+      // Check the initial launch didn't fail
+      assertNotEquals(PipelineOperator.Result.LAUNCH_FAILED, storageApiResult);
+      // Check that the pipeline succeeded
+      assertEquals(
+          PipelineLauncher.JobState.DONE,
+          pipelineLauncher.getJobStatus(project, region, storageApiInfo.jobId()));
+
+      // Export metrics. The sink's input is the Reshuffle output when the fanout
+      // multiplier is active, and the row-ID extraction output otherwise.
+      MetricsConfiguration metricsConfig =
+          MetricsConfiguration.builder()
+              .setInputPCollection(
+                  (multiplier > 1) ? "Reshuffle fanout.out0" : "Extract row IDs.out0")
+              .build();
+      try {
+        exportMetricsToBigQuery(storageApiInfo, getMetrics(storageApiInfo, metricsConfig));
+      } catch (Exception e) {
+        // Just log the error. Don't re-throw because we have accuracy checks that are more
+        // important below
+        LOG.error("Encountered an error while exporting metrics to BigQuery:\n{}", e);
+      }
+    }
+    // If we're not publishing metrics, just run the pipeline normally
+    else {
+      storageApiPipeline.run().waitUntilFinish();
+    }
+
+    LOG.info(
+        "Write pipeline finished writing to {}.
+ LOG.info( + "Write pipeline finished writing to {}. Will now perform accuracy checks against the rows in {}.", + destTable, + expectedTable); + // Filter out structs and arrays because they are not supported when querying with `EXCEPT + // DISTINCT` + String columnNames = + schema.getFields().stream() + .map(TableFieldSchema::getName) + .filter(fieldName -> fieldName.startsWith(FIELD_PREFIX)) + .collect(Collectors.joining(", ")); + checkCorrectness(columnNames, destTable, expectedTable); + // check non-duplication for STORAGE_WRITE_API + if (writeMethod == BigQueryIO.Write.Method.STORAGE_WRITE_API) { + checkNonDuplication(destTable, expectedTable, totalRows); + } + } + + // A BigQueryServices class that is almost identical to BigQueryServicesImpl, except that + // it returns a dataset service implementation that periodically crashes on flush() + private static class CrashingBigQueryServices extends BigQueryServicesImpl { + public final Integer crashIntervalSeconds; + + public CrashingBigQueryServices(Integer crashIntervalSeconds) { + this.crashIntervalSeconds = crashIntervalSeconds; + } + + @Override + public DatasetService getDatasetService(BigQueryOptions options) { + return new CrashingDatasetService(options); + } + + private class CrashingDatasetService extends BigQueryServicesImpl.DatasetServiceImpl { + private Instant lastCrash; + + public CrashingDatasetService(BigQueryOptions bqOptions) { + super(bqOptions); + } + + // We choose flush() to host the crash logic because it's called frequently during + // the span of a Storage Write API pipeline + @Override + public ApiFuture<FlushRowsResponse> flush(String streamName, long flushOffset) + throws IOException, InterruptedException { + maybeCrash(); + return super.flush(streamName, flushOffset); + } + + // When specified, crash when the interval is met by: + // throwing an exception (failed work item) or + // performing a System exit (worker failure) + private void maybeCrash() { + if (crashIntervalSeconds != -1) { + Instant last = lastCrash; + if (last == null) { + lastCrash = Instant.now(); + } else if (Instant.now().isAfter(last.plusSeconds(crashIntervalSeconds))) { + lastCrash = Instant.now(); + + // Only crash 30% of the time (this is arbitrary) + if (ThreadLocalRandom.current().nextInt(100) < 30) { + // Half the time throw an exception (which fails this specific work item) + // Other half crash the entire worker, which fails all work items on this worker + if (ThreadLocalRandom.current().nextBoolean()) { + throw new RuntimeException( + "Throwing a random exception! This is for testing retry resilience."); + } else { + LOG.error("Crashing this worker! This is for testing retry resilience."); + System.exit(0); + } + } + } + } + } + } + } + + public void checkCorrectness(String columnNames, String destTable, String expectedTable) + throws IOException, InterruptedException { + // Need table spec to be in the format `myproject.mydataset.mytable` to include in BQ queries.
+ destTable = toTableSpec(toTableReference(destTable)); + expectedTable = toTableSpec(toTableReference(expectedTable)); + + String checkCorrectnessQuery = + String.format( + "WITH \n" + + "storage_api_table AS (SELECT %s FROM `%s`), \n" + + "expected_table AS (SELECT %s FROM `%s`), \n" + + "rows_mismatched AS (SELECT * FROM expected_table EXCEPT DISTINCT SELECT * FROM storage_api_table) \n" + + "SELECT COUNT(*) FROM rows_mismatched", + columnNames, destTable, columnNames, expectedTable); + + LOG.info("Executing query to check correctness:\n{}", checkCorrectnessQuery); + + TableRow queryResponse = + Iterables.getOnlyElement( + BQ_CLIENT.queryUnflattened(checkCorrectnessQuery, "google.com:clouddfe", true, true)); + long result = Long.parseLong((String) queryResponse.get("f0_")); + + LOG.info("Number of mismatched rows: {}", result); + assertEquals( + String.format("Saw %s rows that are missing from %s.", result, destTable), 0, result); + } + + public void checkNonDuplication(String destTable, String expectedTable, long totalRows) + throws IOException, InterruptedException { + String checkDuplicationQuery = + String.format( + "SELECT \n" + + "(SELECT COUNT(*) FROM `%s`) AS actualCount,\n" + + "(SELECT COUNT(*) FROM `%s`) AS expectedCount", + destTable, expectedTable); + + LOG.info("Executing query to check non-duplication:\n{}", checkDuplicationQuery); + + TableRow queryResponse = + Iterables.getOnlyElement( + BQ_CLIENT.queryUnflattened(checkDuplicationQuery, "google.com:clouddfe", true, true)); + long actualCount = Long.parseLong((String) queryResponse.get("actualCount")); + long expectedCount = Long.parseLong((String) queryResponse.get("expectedCount")); + assertEquals( + "Comparing actual table count and expected table count.", expectedCount, actualCount); + assertEquals( + "Comparing actual table count and calculated expected count.", totalRows, actualCount); + } + + // From a value, get the appropriate shortened name that includes the scale + // For example, from 12,345,678 return 12M + public String withScaleSymbol(long value) { + List<String> scales = Arrays.asList("", "K", "M", "B", "T", "Q"); + int scaleIndex = 0; + while (value / 1000 > 0) { + scaleIndex++; + value /= 1000; + } + + return String.format("%s%s", value, scales.get(scaleIndex)); + } + + public static class MultiplierDoFn extends DoFn<Long, Long> { + private long multiplier; + + MultiplierDoFn(long multiplier) { + this.multiplier = multiplier; + } + + @ProcessElement + public void processElement(@Element Long element, OutputReceiver<Long> outputReceiver) { + for (int i = 0; i < multiplier; i++) { + outputReceiver.output(element); + } + } + } + + static final String FIELD_PREFIX = "byte_field_"; + static final String RECORD_FIELD_PREFIX = "record_" + FIELD_PREFIX; + static final String NESTED_FIELD_PREFIX = "nested_" + FIELD_PREFIX; + static final String REPEATED_FIELD_PREFIX = "repeated_" + FIELD_PREFIX; + + public static TableSchema generateTableSchema(int numFields) { + List<TableFieldSchema> fields = new ArrayList<>(numFields); + fields.add(new TableFieldSchema().setType("INTEGER").setName("id")); + int j = 1; + for (int i = 1; i <= numFields; i++) { + TableFieldSchema fieldSchema = new TableFieldSchema(); + // Every 4th field will be a struct, every 5th field will be an array + if (j == 4) { + fieldSchema + .setType("RECORD") + .setName(RECORD_FIELD_PREFIX + i) + .setFields( + Arrays.asList( + new TableFieldSchema().setType("BYTES").setName(NESTED_FIELD_PREFIX + 1), + new TableFieldSchema().setType("BYTES").setName(NESTED_FIELD_PREFIX + 2))); + } else if (j ==
5) { + fieldSchema.setType("BYTES").setMode("REPEATED").setName(REPEATED_FIELD_PREFIX + i); + j = 0; + } else { + fieldSchema.setType("BYTES").setName(FIELD_PREFIX + i); + } + j++; + fields.add(fieldSchema); + } + return new TableSchema().setFields(fields); + } + + static class GenerateTableRow implements SerializableFunction { + private final int numFields; + private final int sizePerField; + + public GenerateTableRow(int numFields, int sizePerField) { + assert numFields >= 0; + this.numFields = numFields; + this.sizePerField = sizePerField; + } + + @Override + public TableRow apply(Long rowId) { + TableRow row = new TableRow(); + row.set("id", rowId); + byte[] payload = getPayload(sizePerField, rowId).array(); + int j = 1; + for (int i = 1; i <= numFields; i++) { + // TODO: we can also make the struct and array sizes variable + if (j == 4) { + row.set( + RECORD_FIELD_PREFIX + i, + new TableRow() + .set(NESTED_FIELD_PREFIX + 1, Arrays.copyOfRange(payload, 0, sizePerField / 2)) + .set( + NESTED_FIELD_PREFIX + 2, + Arrays.copyOfRange(payload, sizePerField / 2, sizePerField))); + } else if (j == 5) { + row.set( + REPEATED_FIELD_PREFIX + i, + Arrays.asList( + Arrays.copyOfRange(payload, 0, sizePerField / 3), + Arrays.copyOfRange(payload, sizePerField / 3, sizePerField * 2 / 3), + Arrays.copyOfRange(payload, sizePerField * 2 / 3, sizePerField))); + j = 0; + } else { + row.set(FIELD_PREFIX + i, payload); + } + j++; + } + return row; + } + + private @Nullable ByteBuffer getPayload(int payloadSize, long rowId) { + if (payloadSize <= 0) { + return null; + } + byte[] payload = new byte[payloadSize]; + Random localRandom = ThreadLocal.withInitial(() -> new Random(rowId)).get(); + localRandom.setSeed(rowId); + localRandom.nextBytes(payload); + + return ByteBuffer.wrap(payload); + } + } +} diff --git a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigTableIOLT.java b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigTableIOLT.java index e232ed31cb5a3..a6516863b8d79 100644 --- a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigTableIOLT.java +++ b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigTableIOLT.java @@ -59,7 +59,7 @@ * -DfailIfNoTests=false". * *

Example trigger command for specific test: "mvn test -pl it/google-cloud-platform -am \ - * -Dtest="BigTableIOLT#testWriteAndRead" -Dconfiguration=local -Dproject=[gcpProject] \ + * -Dtest="BigTableIOLT#testBigtableWriteAndRead" -Dconfiguration=local -Dproject=[gcpProject] \ * -DartifactBucket=[temp bucket] -DfailIfNoTests=false". */ public class BigTableIOLT extends IOLoadTestBase { @@ -67,7 +67,7 @@ public class BigTableIOLT extends IOLoadTestBase { private static final String COLUMN_FAMILY_NAME = "cf"; private static final long TABLE_MAX_AGE_MINUTES = 100L; - private static BigtableResourceManager resourceManager; + private BigtableResourceManager resourceManager; private static final String READ_ELEMENT_METRIC_NAME = "read_count"; private Configuration configuration; private String tableId; @@ -114,7 +114,7 @@ public void teardown() { /** Run integration test with configurations specified by TestProperties. */ @Test - public void testWriteAndRead() throws IOException { + public void testBigtableWriteAndRead() throws IOException { tableId = generateTableId(testName); resourceManager.createTable( @@ -205,7 +205,7 @@ private PipelineLauncher.LaunchInfo testRead() throws IOException { return pipelineLauncher.launch(project, region, options); } - /** Options for Bigquery IO load test. */ + /** Options for BigtableIO load test. */ @AutoValue abstract static class Configuration { abstract Long getNumRows(); @@ -227,18 +227,18 @@ static Configuration of(long numRows, int pipelineTimeout, String runner, int va @AutoValue.Builder abstract static class Builder { - abstract Configuration.Builder setNumRows(long numRows); + abstract Builder setNumRows(long numRows); - abstract Configuration.Builder setPipelineTimeout(int timeOutMinutes); + abstract Builder setPipelineTimeout(int timeOutMinutes); - abstract Configuration.Builder setRunner(String runner); + abstract Builder setRunner(String runner); - abstract Configuration.Builder setValueSizeBytes(int valueSizeBytes); + abstract Builder setValueSizeBytes(int valueSizeBytes); abstract Configuration build(); } - abstract Configuration.Builder toBuilder(); + abstract Builder toBuilder(); } /** Maps long number to the BigTable format record. */ diff --git a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/spanner/SpannerIOLT.java b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/spanner/SpannerIOLT.java new file mode 100644 index 0000000000000..949b863be3df4 --- /dev/null +++ b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/spanner/SpannerIOLT.java @@ -0,0 +1,285 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.it.gcp.spanner; + +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; + +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.cloud.ByteArray; +import com.google.cloud.spanner.Mutation; +import java.io.IOException; +import java.io.Serializable; +import java.text.ParseException; +import java.time.Duration; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; +import java.util.Map; +import java.util.Objects; +import java.util.Random; +import java.util.UUID; +import org.apache.beam.it.common.PipelineLauncher; +import org.apache.beam.it.common.PipelineOperator; +import org.apache.beam.it.common.TestProperties; +import org.apache.beam.it.common.utils.ResourceManagerUtils; +import org.apache.beam.it.gcp.IOLoadTestBase; +import org.apache.beam.sdk.io.GenerateSequence; +import org.apache.beam.sdk.io.gcp.spanner.SpannerIO; +import org.apache.beam.sdk.io.synthetic.SyntheticSourceOptions; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.junit.After; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; + +/** + * SpannerIO performance tests. + * + *

<p>Example trigger command for all tests: "mvn test -pl it/google-cloud-platform -am + * -Dtest=SpannerIOLT \ -Dproject=[gcpProject] -DartifactBucket=[temp bucket] + * -DfailIfNoTests=false". + * + *

<p>Example trigger command for specific test: "mvn test -pl it/google-cloud-platform -am \ + * -Dtest="SpannerIOLT#testSpannerWriteAndRead" -Dconfiguration=local -Dproject=[gcpProject] \ + * -DartifactBucket=[temp bucket] -DfailIfNoTests=false". + */ +public class SpannerIOLT extends IOLoadTestBase { + @Rule public TestPipeline writePipeline = TestPipeline.create(); + @Rule public TestPipeline readPipeline = TestPipeline.create(); + private String tableName; + private SpannerResourceManager resourceManager; + private Configuration configuration; + private static final String READ_ELEMENT_METRIC_NAME = "read_count"; + + @Before + public void setup() throws IOException { + // generate a random table name + tableName = + "io_spanner_" + + DateTimeFormatter.ofPattern("MMddHHmmssSSS") + .withZone(ZoneId.of("UTC")) + .format(java.time.Instant.now()) + + UUID.randomUUID().toString().replace("-", "").substring(0, 10); + + resourceManager = SpannerResourceManager.builder(testName, project, region).build(); + + // parse configuration + String testConfig = + TestProperties.getProperty("configuration", "local", TestProperties.Type.PROPERTY); + configuration = TEST_CONFIGS_PRESET.get(testConfig); + if (configuration == null) { + try { + configuration = Configuration.fromJsonString(testConfig, Configuration.class); + } catch (IOException e) { + throw new IllegalArgumentException( + String.format( + "Unknown test configuration: [%s]. Pass a valid configuration JSON, or use" + + " one of the config presets: %s", + testConfig, TEST_CONFIGS_PRESET.keySet())); + } + } + // prepare schema + String createTable = + createTableStatement( + tableName, configuration.numColumns, (int) configuration.valueSizeBytes); + // Create table + resourceManager.executeDdlStatement(createTable); + } + + @After + public void teardown() { + ResourceManagerUtils.cleanResources(resourceManager); + } + + private static final Map<String, Configuration> TEST_CONFIGS_PRESET; + + static { + try { + TEST_CONFIGS_PRESET = + ImmutableMap.of( + "local", + Configuration.fromJsonString( + "{\"numRecords\":1000,\"valueSizeBytes\":1000,\"pipelineTimeout\":2,\"runner\":\"DirectRunner\"}", + Configuration.class), // 1 MB + "medium", + Configuration.fromJsonString( + "{\"numRecords\":10000000,\"valueSizeBytes\":1000,\"pipelineTimeout\":20,\"runner\":\"DataflowRunner\"}", + Configuration.class), // 10 GB + "large", + Configuration.fromJsonString( + "{\"numRecords\":100000000,\"valueSizeBytes\":1000,\"pipelineTimeout\":80,\"runner\":\"DataflowRunner\"}", + Configuration.class) // 100 GB + ); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Test + public void testSpannerWriteAndRead() throws IOException { + PipelineLauncher.LaunchInfo writeInfo = testWrite(); + PipelineOperator.Result writeResult = + pipelineOperator.waitUntilDone( + createConfig(writeInfo, Duration.ofMinutes(configuration.pipelineTimeout))); + assertNotEquals(PipelineOperator.Result.LAUNCH_FAILED, writeResult); + + PipelineLauncher.LaunchInfo readInfo = testRead(); + PipelineOperator.Result result = + pipelineOperator.waitUntilDone( + createConfig(readInfo, Duration.ofMinutes(configuration.pipelineTimeout))); + assertNotEquals(PipelineOperator.Result.LAUNCH_FAILED, result); + assertEquals( + PipelineLauncher.JobState.DONE, + pipelineLauncher.getJobStatus(project, region, readInfo.jobId())); + double numRecords = + pipelineLauncher.getMetric( + project, + region, + readInfo.jobId(), + getBeamMetricsName(PipelineMetricsType.COUNTER, READ_ELEMENT_METRIC_NAME)); +
assertEquals(configuration.numRecords, numRecords, 0.5); + + // export metrics + MetricsConfiguration metricsConfig = + MetricsConfiguration.builder() + .setInputPCollection("Map records.out0") + .setInputPCollectionV2("Map records/ParMultiDo(GenerateMutations).out0") + .setOutputPCollection("Counting element.out0") + .setOutputPCollectionV2("Counting element/ParMultiDo(Counting).out0") + .build(); + try { + exportMetricsToBigQuery(writeInfo, getMetrics(writeInfo, metricsConfig)); + exportMetricsToBigQuery(readInfo, getMetrics(readInfo, metricsConfig)); + } catch (ParseException | InterruptedException e) { + throw new RuntimeException(e); + } + } + + private PipelineLauncher.LaunchInfo testWrite() throws IOException { + SpannerIO.Write writeTransform = + SpannerIO.write() + .withProjectId(project) + .withInstanceId(resourceManager.getInstanceId()) + .withDatabaseId(resourceManager.getDatabaseId()); + + writePipeline + .apply(GenerateSequence.from(0).to(configuration.numRecords)) + .apply( + "Map records", + ParDo.of( + new GenerateMutations( + tableName, configuration.numColumns, (int) configuration.valueSizeBytes))) + .apply("Write to Spanner", writeTransform); + + PipelineLauncher.LaunchConfig options = + PipelineLauncher.LaunchConfig.builder("write-spanner") + .setSdk(PipelineLauncher.Sdk.JAVA) + .setPipeline(writePipeline) + .addParameter("runner", configuration.runner) + .build(); + + return pipelineLauncher.launch(project, region, options); + } + + private PipelineLauncher.LaunchInfo testRead() throws IOException { + SpannerIO.Read readTransform = + SpannerIO.read() + .withProjectId(project) + .withInstanceId(resourceManager.getInstanceId()) + .withDatabaseId(resourceManager.getDatabaseId()) + .withQuery(String.format("SELECT * FROM %s", tableName)); + + readPipeline + .apply("Read from Spanner", readTransform) + .apply("Counting element", ParDo.of(new CountingFn<>(READ_ELEMENT_METRIC_NAME))); + + PipelineLauncher.LaunchConfig options = + PipelineLauncher.LaunchConfig.builder("read-spanner") + .setSdk(PipelineLauncher.Sdk.JAVA) + .setPipeline(readPipeline) + .addParameter("runner", configuration.runner) + .build(); + + return pipelineLauncher.launch(project, region, options); + } + + /** Options for SpannerIO load test. */ + static class Configuration extends SyntheticSourceOptions { + + /** + * Number of columns (besides the primary key) of each record. The column size is equally + * distributed as valueSizeBytes/numColumns. + */ + @JsonProperty public int numColumns = 1; + + /** Pipeline timeout in minutes. Must be a positive value. */ + @JsonProperty public int pipelineTimeout = 20; + + /** Runner specified to run the pipeline. */ + @JsonProperty public String runner = "DirectRunner"; + } + + /** + * Generate a CREATE TABLE SQL statement with 1 integer column (Id) and additional numBytesCol + * columns. For example (illustrative), createTableStatement("tbl", 2, 1000) produces, modulo + * line breaks: CREATE TABLE tbl (Id INT64, COL1 BYTES(500), COL2 BYTES(500)) PRIMARY KEY(Id). + */ + static String createTableStatement(String tableId, int numBytesCol, int valueSizeBytes) { + int sizePerCol = valueSizeBytes / numBytesCol; + StringBuilder statement = new StringBuilder(); + statement.append(String.format("CREATE TABLE %s (Id INT64", tableId)); + for (int col = 0; col < numBytesCol; ++col) { + statement.append(String.format(",\n COL%d BYTES(%d)", col + 1, sizePerCol)); + } + statement.append(") PRIMARY KEY(Id)"); + return statement.toString(); + } + + /** Maps a long number to the Spanner format record.
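* For example (illustrative): element 7L becomes an insert-or-update Mutation with Id=7 and + * numBytesCol BYTES columns, each filled with sizePerCol random bytes seeded by the key.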
*/ + private static class GenerateMutations extends DoFn implements Serializable { + private final String table; + private final int numBytesCol; + private final int sizePerCol; + + public GenerateMutations(String tableId, int numBytesCol, int valueSizeBytes) { + checkArgument(valueSizeBytes >= numBytesCol); + this.table = tableId; + this.numBytesCol = numBytesCol; + this.sizePerCol = valueSizeBytes / numBytesCol; + } + + @ProcessElement + public void processElement(ProcessContext c) { + Mutation.WriteBuilder builder = Mutation.newInsertOrUpdateBuilder(table); + Long key = Objects.requireNonNull(c.element()); + builder.set("Id").to(key); + Random random = new Random(key); + byte[] value = new byte[sizePerCol]; + for (int col = 0; col < numBytesCol; ++col) { + String name = String.format("COL%d", col + 1); + random.nextBytes(value); + builder.set(name).to(ByteArray.copyFrom(value)); + } + Mutation mutation = builder.build(); + c.output(mutation); + } + } +} diff --git a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/storage/FileBasedIOLT.java b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/storage/FileBasedIOLT.java index 704f8337c66ff..a36f3b340e836 100644 --- a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/storage/FileBasedIOLT.java +++ b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/storage/FileBasedIOLT.java @@ -90,7 +90,7 @@ public class FileBasedIOLT extends IOLoadTestBase { @Rule public TestPipeline readPipeline = TestPipeline.create(); - private static final Map TEST_CONFIGS_PRESET; + private static final Map TEST_CONFIGS_PRESET; static { try { diff --git a/it/jdbc/src/main/java/org/apache/beam/it/jdbc/AbstractJDBCResourceManager.java b/it/jdbc/src/main/java/org/apache/beam/it/jdbc/AbstractJDBCResourceManager.java index b57185b70ebce..6d50dddb0ccd3 100644 --- a/it/jdbc/src/main/java/org/apache/beam/it/jdbc/AbstractJDBCResourceManager.java +++ b/it/jdbc/src/main/java/org/apache/beam/it/jdbc/AbstractJDBCResourceManager.java @@ -196,7 +196,8 @@ public boolean write(String tableName, List> rows) valueList.add(null); } else if (NumberUtils.isCreatable(value.toString()) || "true".equalsIgnoreCase(value.toString()) - || "false".equalsIgnoreCase(value.toString())) { + || "false".equalsIgnoreCase(value.toString()) + || value.toString().startsWith("ARRAY[")) { valueList.add(String.valueOf(value)); } else { valueList.add("'" + value + "'"); @@ -226,34 +227,9 @@ public boolean write(String tableName, List> rows) @SuppressWarnings("nullness") public List> readTable(String tableName) { LOG.info("Reading all rows from {}.{}", databaseName, tableName); - - List> resultSet = new ArrayList<>(); - - StringBuilder sql = new StringBuilder(); - try (Connection con = driver.getConnection(getUri(), username, password)) { - Statement stmt = con.createStatement(); - - sql.append("SELECT * FROM ").append(tableName); - ResultSet result = stmt.executeQuery(sql.toString()); - - while (result.next()) { - Map row = new HashMap<>(); - ResultSetMetaData metadata = result.getMetaData(); - // Columns list in table metadata is 1-indexed - for (int i = 1; i <= metadata.getColumnCount(); i++) { - row.put(metadata.getColumnName(i), result.getObject(i)); - } - resultSet.add(row); - } - result.close(); - stmt.close(); - } catch (Exception e) { - throw new JDBCResourceManagerException( - "Failed to fetch rows from table. 
SQL statement: " + sql, e); - } - + List> result = runSQLQuery(String.format("SELECT * FROM %s", tableName)); LOG.info("Successfully loaded rows from {}.{}", databaseName, tableName); - return resultSet; + return result; } @Override @@ -290,9 +266,21 @@ protected String getFirstRow(String tableName) { } @Override - public synchronized ResultSet runSQLQuery(String sql) { + @SuppressWarnings("nullness") + public synchronized List> runSQLQuery(String sql) { try (Statement stmt = driver.getConnection(getUri(), username, password).createStatement()) { - return stmt.executeQuery(sql); + List> result = new ArrayList<>(); + ResultSet resultSet = stmt.executeQuery(sql); + while (resultSet.next()) { + Map row = new HashMap<>(); + ResultSetMetaData metadata = resultSet.getMetaData(); + // Columns list in table metadata is 1-indexed + for (int i = 1; i <= metadata.getColumnCount(); i++) { + row.put(metadata.getColumnName(i), resultSet.getObject(i)); + } + result.add(row); + } + return result; } catch (Exception e) { throw new JDBCResourceManagerException("Failed to execute SQL statement: " + sql, e); } @@ -307,6 +295,21 @@ public synchronized void runSQLUpdate(String sql) { } } + @Override + public synchronized long getRowCount(String tableName) { + try (Connection con = driver.getConnection(getUri(), username, password)) { + Statement stmt = con.createStatement(); + ResultSet resultSet = stmt.executeQuery(String.format("SELECT count(*) FROM %s", tableName)); + resultSet.next(); + long rows = resultSet.getLong(1); + resultSet.close(); + stmt.close(); + return rows; + } catch (Exception e) { + throw new JDBCResourceManagerException("Failed to get row count from " + tableName, e); + } + } + /** * Builder for {@link AbstractJDBCResourceManager}. * diff --git a/it/jdbc/src/main/java/org/apache/beam/it/jdbc/JDBCResourceManager.java b/it/jdbc/src/main/java/org/apache/beam/it/jdbc/JDBCResourceManager.java index 9292d4cb42ece..deb29ff3a5ec1 100644 --- a/it/jdbc/src/main/java/org/apache/beam/it/jdbc/JDBCResourceManager.java +++ b/it/jdbc/src/main/java/org/apache/beam/it/jdbc/JDBCResourceManager.java @@ -19,7 +19,6 @@ import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; -import java.sql.ResultSet; import java.util.List; import java.util.Map; import org.apache.beam.it.common.ResourceManager; @@ -102,7 +101,7 @@ boolean write(String tableName, List> rows) * @param sql The SQL query to run. * @return A ResultSet containing the result of the execution. */ - ResultSet runSQLQuery(String sql); + List> runSQLQuery(String sql); /** * Run the given SQL DML statement (INSERT, UPDATE and DELETE). @@ -111,6 +110,14 @@ boolean write(String tableName, List> rows) */ void runSQLUpdate(String sql); + /** + * Gets the number of rows in table. + * + * @param tableName The name of the table. + * @return a count of number of rows in the table. + */ + long getRowCount(String tableName); + /** Object for managing JDBC table schemas in {@link JDBCResourceManager} instances. 
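* For example (illustrative), a schema might map {"id": "INTEGER", "name": "VARCHAR(200)"} and designate "id" as the id/primary-key column.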
*/ class JDBCSchema { diff --git a/it/jdbc/src/main/java/org/apache/beam/it/jdbc/MSSQLResourceManager.java b/it/jdbc/src/main/java/org/apache/beam/it/jdbc/MSSQLResourceManager.java index c515b2c4844f7..0bcb16c610952 100644 --- a/it/jdbc/src/main/java/org/apache/beam/it/jdbc/MSSQLResourceManager.java +++ b/it/jdbc/src/main/java/org/apache/beam/it/jdbc/MSSQLResourceManager.java @@ -61,14 +61,13 @@ private MSSQLResourceManager(Builder builder) { } @VisibleForTesting - > MSSQLResourceManager( - T container, Builder builder) { + > MSSQLResourceManager(T container, Builder builder) { super(container, builder); initialized = true; } - public static MSSQLResourceManager.Builder builder(String testId) { - return new MSSQLResourceManager.Builder(testId); + public static Builder builder(String testId) { + return new Builder(testId); } private synchronized void createDatabase(String databaseName) { diff --git a/it/jdbc/src/main/java/org/apache/beam/it/jdbc/MySQLResourceManager.java b/it/jdbc/src/main/java/org/apache/beam/it/jdbc/MySQLResourceManager.java index 688c26dfb56da..e1bf3640b53d8 100644 --- a/it/jdbc/src/main/java/org/apache/beam/it/jdbc/MySQLResourceManager.java +++ b/it/jdbc/src/main/java/org/apache/beam/it/jdbc/MySQLResourceManager.java @@ -49,8 +49,8 @@ private MySQLResourceManager(Builder builder) { super(container, builder); } - public static MySQLResourceManager.Builder builder(String testId) { - return new MySQLResourceManager.Builder(testId); + public static Builder builder(String testId) { + return new Builder(testId); } @Override diff --git a/it/jdbc/src/main/java/org/apache/beam/it/jdbc/OracleResourceManager.java b/it/jdbc/src/main/java/org/apache/beam/it/jdbc/OracleResourceManager.java index 8054d26c33f70..f44e939936d28 100644 --- a/it/jdbc/src/main/java/org/apache/beam/it/jdbc/OracleResourceManager.java +++ b/it/jdbc/src/main/java/org/apache/beam/it/jdbc/OracleResourceManager.java @@ -45,7 +45,7 @@ public class OracleResourceManager extends AbstractJDBCResourceManager( - DockerImageName.parse(builder.containerImageName).withTag(builder.containerImageTag)), - builder); - } - @VisibleForTesting - PostgresResourceManager( - PostgreSQLContainer container, PostgresResourceManager.Builder builder) { + PostgresResourceManager(PostgreSQLContainer container, Builder builder) { super(container, builder); } - public static PostgresResourceManager.Builder builder(String testId) { - return new PostgresResourceManager.Builder(testId); + public static Builder builder(String testId) { + return new Builder(testId); } @Override @@ -80,7 +72,11 @@ public Builder(String testId) { @Override public PostgresResourceManager build() { - return new PostgresResourceManager(this); + PostgreSQLContainer container = + new PostgreSQLContainer<>( + DockerImageName.parse(containerImageName).withTag(containerImageTag)); + container.setCommand("postgres", "-c", "fsync=off", "-c", "max_connections=1000"); + return new PostgresResourceManager(container, this); } } } diff --git a/it/kafka/src/main/java/org/apache/beam/it/kafka/KafkaResourceManager.java b/it/kafka/src/main/java/org/apache/beam/it/kafka/KafkaResourceManager.java index 7f7fb5b695698..d9a647dbeebdd 100644 --- a/it/kafka/src/main/java/org/apache/beam/it/kafka/KafkaResourceManager.java +++ b/it/kafka/src/main/java/org/apache/beam/it/kafka/KafkaResourceManager.java @@ -71,16 +71,13 @@ public class KafkaResourceManager extends TestContainerResourceManager 0; @@ -105,8 +102,8 @@ private KafkaResourceManager(KafkaResourceManager.Builder builder) { : 
AdminClient.create(ImmutableMap.of("bootstrap.servers", this.connectionString)); } - public static KafkaResourceManager.Builder builder(String testId) { - return new KafkaResourceManager.Builder(testId); + public static Builder builder(String testId) { + return new Builder(testId); } /** Returns the kafka bootstrap server connection string. */ diff --git a/it/mongodb/src/main/java/org/apache/beam/it/mongodb/MongoDBResourceManager.java b/it/mongodb/src/main/java/org/apache/beam/it/mongodb/MongoDBResourceManager.java index 80216b14ac0e6..ed0e556bf0df4 100644 --- a/it/mongodb/src/main/java/org/apache/beam/it/mongodb/MongoDBResourceManager.java +++ b/it/mongodb/src/main/java/org/apache/beam/it/mongodb/MongoDBResourceManager.java @@ -69,7 +69,7 @@ public class MongoDBResourceManager extends TestContainerResourceManager( @@ -79,10 +79,7 @@ private Neo4jResourceManager(Neo4jResourceManager.Builder builder) { @VisibleForTesting @SuppressWarnings("nullness") - Neo4jResourceManager( - @Nullable Driver neo4jDriver, - Neo4jContainer container, - Neo4jResourceManager.Builder builder) { + Neo4jResourceManager(@Nullable Driver neo4jDriver, Neo4jContainer container, Builder builder) { super(container, builder); this.adminPassword = builder.adminPassword; @@ -101,8 +98,8 @@ private Neo4jResourceManager(Neo4jResourceManager.Builder builder) { } } - public static Neo4jResourceManager.Builder builder(String testId) { - return new Neo4jResourceManager.Builder(testId); + public static Builder builder(String testId) { + return new Builder(testId); } /** Returns the URI connection string to the Neo4j Database. */ diff --git a/it/neo4j/src/main/java/org/apache/beam/it/neo4j/conditions/Neo4jQueryCheck.java b/it/neo4j/src/main/java/org/apache/beam/it/neo4j/conditions/Neo4jQueryCheck.java index 32e283edb72c1..16f2513384256 100644 --- a/it/neo4j/src/main/java/org/apache/beam/it/neo4j/conditions/Neo4jQueryCheck.java +++ b/it/neo4j/src/main/java/org/apache/beam/it/neo4j/conditions/Neo4jQueryCheck.java @@ -20,6 +20,9 @@ import com.google.auto.value.AutoValue; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.stream.Collectors; import javax.annotation.Nullable; import org.apache.beam.it.conditions.ConditionCheck; import org.apache.beam.it.neo4j.Neo4jResourceManager; @@ -55,9 +58,17 @@ protected CheckResult check() { if (actualResult == null) { return new CheckResult(expectedResult == null); } + + Set> sortedActualResult = sort(actualResult); + Set> sortedExpectedResult = sort(expectedResult); + return new CheckResult( - actualResult.equals(expectedResult), - String.format("Expected %s to equal %s", actualResult, expectedResult)); + sortedActualResult.equals(sortedExpectedResult), + String.format("Expected %s to equal %s", sortedActualResult, sortedExpectedResult)); + } + + private static Set> sort(List> list) { + return list.stream().map(TreeMap::new).collect(Collectors.toSet()); } public static Builder builder(Neo4jResourceManager resourceManager) { diff --git a/it/splunk/src/main/java/org/apache/beam/it/splunk/SplunkResourceManager.java b/it/splunk/src/main/java/org/apache/beam/it/splunk/SplunkResourceManager.java index 1ef4726df43aa..0115a791eefe6 100644 --- a/it/splunk/src/main/java/org/apache/beam/it/splunk/SplunkResourceManager.java +++ b/it/splunk/src/main/java/org/apache/beam/it/splunk/SplunkResourceManager.java @@ -85,7 +85,7 @@ public class SplunkResourceManager extends TestContainerResourceManagerOptionally, a static resource can be specified by 
calling the useStaticContainer() method in - * the {@link TestContainerResourceManager.Builder} class. A static resource is a pre-configured - * database or other resource that is ready to be connected to by the resource manager. This could - * be a pre-existing TestContainer that has not been closed, a local database instance, a remote VM, - * or any other source that can be connected to. If a static container is used, the host and port - * must also be configured using the Builder's setHost() and setPort() methods, respectively. + * the {@link Builder} class. A static resource is a pre-configured database or other resource that + * is ready to be connected to by the resource manager. This could be a pre-existing TestContainer + * that has not been closed, a local database instance, a remote VM, or any other source that can be + * connected to. If a static container is used, the host and port must also be configured using the + * Builder's setHost() and setPort() methods, respectively. */ public abstract class TestContainerResourceManager> implements ResourceManager { @@ -48,12 +48,11 @@ public abstract class TestContainerResourceManager private final String host; protected int port; - protected > TestContainerResourceManager( - T container, B builder) { + protected > TestContainerResourceManager(T container, B builder) { this(container, builder, null); } - protected > TestContainerResourceManager( + protected > TestContainerResourceManager( T container, B builder, @Nullable Callable setup) { this.container = container; this.usingStaticContainer = builder.useStaticContainer; diff --git a/it/truthmatchers/src/main/java/org/apache/beam/it/truthmatchers/RecordsSubject.java b/it/truthmatchers/src/main/java/org/apache/beam/it/truthmatchers/RecordsSubject.java index 39a0c0cebedcd..75d5ce3a67cd5 100644 --- a/it/truthmatchers/src/main/java/org/apache/beam/it/truthmatchers/RecordsSubject.java +++ b/it/truthmatchers/src/main/java/org/apache/beam/it/truthmatchers/RecordsSubject.java @@ -81,7 +81,7 @@ public void hasRecordSubset(Map subset) { Map expected = convertMapToTreeMap(subset); for (Map candidate : actual) { boolean match = true; - for (Map.Entry entry : subset.entrySet()) { + for (Entry entry : subset.entrySet()) { if (!candidate.containsKey(entry.getKey()) || !candidate.get(entry.getKey()).equals(entry.getValue())) { match = false; diff --git a/learning/katas/go/go.mod b/learning/katas/go/go.mod index eb161611f8f1f..3e6d7a207c615 100644 --- a/learning/katas/go/go.mod +++ b/learning/katas/go/go.mod @@ -20,6 +20,6 @@ go 1.14 require ( github.com/apache/beam/sdks/v2 v2.40.0 github.com/google/go-cmp v0.5.8 - golang.org/x/net v0.7.0 // indirect + golang.org/x/net v0.17.0 // indirect google.golang.org/genproto v0.0.0-20220815135757-37a418bb8959 // indirect ) diff --git a/learning/katas/go/go.sum b/learning/katas/go/go.sum index 65e59bbb40aa7..a78aca00007fe 100644 --- a/learning/katas/go/go.sum +++ b/learning/katas/go/go.sum @@ -781,6 +781,7 @@ golang.org/x/crypto v0.0.0-20201002170205-7f63de1d35b0/go.mod h1:LzIPMQfyMNhhGPh golang.org/x/crypto v0.0.0-20210322153248-0c34fe9e7dc2/go.mod h1:T9bdIzuCu7OtxOm1hfPfRQxPLYneinmdGuTeoZ9dtd4= golang.org/x/crypto v0.0.0-20210513164829-c07d793c2f9a/go.mod h1:P+XmwS30IXTQdn5tA2iutPOUgjI07+tq3H3K9MVA1s8= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod 
h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -817,6 +818,7 @@ golang.org/x/mod v0.4.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.1/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -874,8 +876,10 @@ golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4/go.mod h1:CfG3xpIq0wQ8r1q4Su golang.org/x/net v0.0.0-20220520000938-2e3eb7b945c2/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220607020251-c690dde0001d/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g= -golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= +golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -911,6 +915,7 @@ golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20220513210516-0976fa681c29/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220601150217-0de741cfad7f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -1011,11 +1016,15 @@ golang.org/x/sys v0.0.0-20220502124256-b6088ccd6cba/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220503163025-988cb79eb6c6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.5.0 
h1:MUK/U/4lj1t1oPg0HfuXDN/Z1wv31ZJ/YcPiGccS4DU= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -1025,8 +1034,10 @@ golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.7.0 h1:4BRB4x83lYWy72KwLD/qYDuTu7q9PjSagHvijDw7cLo= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -1092,6 +1103,7 @@ golang.org/x/tools v0.1.3/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/learning/katas/python/Core Transforms/Flatten/Flatten/task.py b/learning/katas/python/Core Transforms/Flatten/Flatten/task.py index ae0f5c81d9589..e8639782c60c6 100644 --- a/learning/katas/python/Core Transforms/Flatten/Flatten/task.py +++ b/learning/katas/python/Core Transforms/Flatten/Flatten/task.py @@ -18,7 +18,7 @@ # name: Flatten # description: Task from katas that merges two PCollections of words into a single PCollection. 
# multifile: false -# context_line: 31 +# context_line: 33 # categories: # - Flatten # complexity: BASIC @@ -26,16 +26,22 @@ # - merge # - strings -import apache_beam as beam - -with beam.Pipeline() as p: - - wordsStartingWithA = \ - p | 'Words starting with A' >> beam.Create(['apple', 'ant', 'arrow']) - - wordsStartingWithB = \ - p | 'Words starting with B' >> beam.Create(['ball', 'book', 'bow']) - - ((wordsStartingWithA, wordsStartingWithB) +def flatten(): + # [START flatten] + import apache_beam as beam + + with beam.Pipeline() as p: + + wordsStartingWithA = \ + p | 'Words starting with A' >> beam.Create(['apple', 'ant', 'arrow']) + + wordsStartingWithB = \ + p | 'Words starting with B' >> beam.Create(['ball', 'book', 'bow']) + + ((wordsStartingWithA, wordsStartingWithB) | beam.Flatten() | beam.LogElements()) + # [END flatten] + +if __name__ == '__main__': + flatten() diff --git a/model/job-management/src/main/proto/org/apache/beam/model/job_management/v1/beam_expansion_api.proto b/model/job-management/src/main/proto/org/apache/beam/model/job_management/v1/beam_expansion_api.proto index 568f9c8774103..7a26ff6a2af3f 100644 --- a/model/job-management/src/main/proto/org/apache/beam/model/job_management/v1/beam_expansion_api.proto +++ b/model/job-management/src/main/proto/org/apache/beam/model/job_management/v1/beam_expansion_api.proto @@ -53,6 +53,10 @@ message ExpansionRequest { // coders for the output PCollections. Note that the request // may not be fulfilled. map output_coder_requests = 4; + + // A set of requirements that must be used by the expansion service to + // interpret the components provided with this request. + repeated string requirements = 5; } message ExpansionResponse { diff --git a/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/beam_runner_api.proto b/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/beam_runner_api.proto index 2483103b5794d..db958f183c453 100644 --- a/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/beam_runner_api.proto +++ b/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/beam_runner_api.proto @@ -1982,5 +1982,9 @@ message StandardResourceHints { // SDKs should convert the size to bytes, but can allow users to specify human-friendly units (e.g. GiB). // Payload: ASCII encoded string of the base 10 representation of an integer number of bytes. MIN_RAM_BYTES = 1 [(beam_urn) = "beam:resources:min_ram_bytes:v1"]; + // Describes desired number of CPUs available in transform's execution environment. + // SDKs should accept and validate a positive integer count. + // Payload: ASCII encoded string of the base 10 representation of an integer number of CPUs. 
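+ // For example (illustrative), a hint requesting 4 CPUs is encoded as the ASCII string "4".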
+ CPU_COUNT = 2 [(beam_urn) = "beam:resources:cpu_count:v1"]; } } diff --git a/playground/backend/go.mod b/playground/backend/go.mod index 9f5fb433ab7e6..bcb3a78fbd4ed 100644 --- a/playground/backend/go.mod +++ b/playground/backend/go.mod @@ -18,7 +18,7 @@ module beam.apache.org/playground/backend go 1.20 require ( - cloud.google.com/go/datastore v1.10.0 + cloud.google.com/go/datastore v1.11.0 cloud.google.com/go/logging v1.7.0 github.com/GoogleCloudPlatform/functions-framework-go v1.6.1 github.com/confluentinc/confluent-kafka-go/v2 v2.1.1 @@ -33,16 +33,16 @@ require ( github.com/spf13/viper v1.14.0 github.com/stretchr/testify v1.8.2 go.uber.org/goleak v1.2.0 - google.golang.org/grpc v1.54.0 + google.golang.org/grpc v1.56.3 google.golang.org/protobuf v1.30.0 gopkg.in/yaml.v3 v3.0.1 ) require ( cloud.google.com/go v0.110.0 // indirect - cloud.google.com/go/compute v1.19.0 // indirect + cloud.google.com/go/compute v1.19.1 // indirect cloud.google.com/go/compute/metadata v0.2.3 // indirect - cloud.google.com/go/functions v1.12.0 // indirect + cloud.google.com/go/functions v1.13.0 // indirect cloud.google.com/go/longrunning v0.4.1 // indirect github.com/cenkalti/backoff/v4 v4.1.3 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect @@ -75,15 +75,15 @@ require ( go.uber.org/atomic v1.9.0 // indirect go.uber.org/multierr v1.8.0 // indirect go.uber.org/zap v1.21.0 // indirect - golang.org/x/net v0.8.0 // indirect - golang.org/x/oauth2 v0.6.0 // indirect + golang.org/x/net v0.9.0 // indirect + golang.org/x/oauth2 v0.7.0 // indirect golang.org/x/sync v0.1.0 // indirect - golang.org/x/sys v0.6.0 // indirect - golang.org/x/text v0.8.0 // indirect + golang.org/x/sys v0.7.0 // indirect + golang.org/x/text v0.9.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/api v0.114.0 // indirect google.golang.org/appengine v1.6.7 // indirect - google.golang.org/genproto v0.0.0-20230331144136-dcfb400f0633 // indirect + google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect nhooyr.io/websocket v1.8.6 // indirect diff --git a/playground/backend/go.sum b/playground/backend/go.sum index 779dc5a38c1f9..516388aeb2762 100644 --- a/playground/backend/go.sum +++ b/playground/backend/go.sum @@ -168,8 +168,9 @@ cloud.google.com/go/compute v1.13.0/go.mod h1:5aPTS0cUNMIc1CE546K+Th6weJUNQErARy cloud.google.com/go/compute v1.14.0/go.mod h1:YfLtxrj9sU4Yxv+sXzZkyPjEyPBZfXHUvjxega5vAdo= cloud.google.com/go/compute v1.15.1/go.mod h1:bjjoF/NtFUrkD/urWfdHaKuOPDR5nWIs63rR+SXhcpA= cloud.google.com/go/compute v1.18.0/go.mod h1:1X7yHxec2Ga+Ss6jPyjxRxpu2uu7PLgsOVXvgU0yacs= -cloud.google.com/go/compute v1.19.0 h1:+9zda3WGgW1ZSTlVppLCYFIr48Pa35q1uG2N1itbCEQ= cloud.google.com/go/compute v1.19.0/go.mod h1:rikpw2y+UMidAe9tISo04EHNOIf42RLYF/q8Bs93scU= +cloud.google.com/go/compute v1.19.1 h1:am86mquDUgjGNWxiGn+5PGLbmgiWXlE/yNWpIpNvuXY= +cloud.google.com/go/compute v1.19.1/go.mod h1:6ylj3a05WF8leseCdIf77NK0g1ey+nj5IKd5/kvShxE= cloud.google.com/go/compute/metadata v0.1.0/go.mod h1:Z1VN+bulIf6bt4P/C37K4DyZYZEXYonfTBHHFPO/4UU= cloud.google.com/go/compute/metadata v0.2.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k= cloud.google.com/go/compute/metadata v0.2.1/go.mod h1:jgHgmJd2RKBGzXqF5LR2EZMGxBkeanZ9wwa75XHJgOM= @@ -220,8 +221,9 @@ cloud.google.com/go/dataqna v0.6.0/go.mod h1:1lqNpM7rqNLVgWBJyk5NF6Uen2PHym0jtVJ cloud.google.com/go/dataqna v0.7.0/go.mod 
h1:Lx9OcIIeqCrw1a6KdO3/5KMP1wAmTc0slZWwP12Qq3c= cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= -cloud.google.com/go/datastore v1.10.0 h1:4siQRf4zTiAVt/oeH4GureGkApgb2vtPQAtOmhpqQwE= cloud.google.com/go/datastore v1.10.0/go.mod h1:PC5UzAmDEkAmkfaknstTYbNpgE49HAgW2J1gcgUfmdM= +cloud.google.com/go/datastore v1.11.0 h1:iF6I/HaLs3Ado8uRKMvZRvF/ZLkWaWE9i8AiHzbC774= +cloud.google.com/go/datastore v1.11.0/go.mod h1:TvGxBIHCS50u8jzG+AW/ppf87v1of8nwzFNgEZU1D3c= cloud.google.com/go/datastream v1.2.0/go.mod h1:i/uTP8/fZwgATHS/XFu0TcNUhuA0twZxxQ3EyCUQMwo= cloud.google.com/go/datastream v1.3.0/go.mod h1:cqlOX8xlyYF/uxhiKn6Hbv6WjwPPuI9W2M9SAXwaLLQ= cloud.google.com/go/datastream v1.4.0/go.mod h1:h9dpzScPhDTs5noEMQVWP8Wx8AFBRyS0s8KWPx/9r0g= @@ -276,8 +278,9 @@ cloud.google.com/go/functions v1.7.0/go.mod h1:+d+QBcWM+RsrgZfV9xo6KfA1GlzJfxcfZ cloud.google.com/go/functions v1.8.0/go.mod h1:RTZ4/HsQjIqIYP9a9YPbU+QFoQsAlYgrwOXJWHn1POY= cloud.google.com/go/functions v1.9.0/go.mod h1:Y+Dz8yGguzO3PpIjhLTbnqV1CWmgQ5UwtlpzoyquQ08= cloud.google.com/go/functions v1.10.0/go.mod h1:0D3hEOe3DbEvCXtYOZHQZmD+SzYsi1YbI7dGvHfldXw= -cloud.google.com/go/functions v1.12.0 h1:TtRl25/oNsZyH3e4WfMRSMmFvmHC3YyQZuWaOpKI9+0= cloud.google.com/go/functions v1.12.0/go.mod h1:AXWGrF3e2C/5ehvwYo/GH6O5s09tOPksiKhz+hH8WkA= +cloud.google.com/go/functions v1.13.0 h1:pPDqtsXG2g9HeOQLoquLbmvmb82Y4Ezdo1GXuotFoWg= +cloud.google.com/go/functions v1.13.0/go.mod h1:EU4O007sQm6Ef/PwRsI8N2umygGqPBS/IZQKBQBcJ3c= cloud.google.com/go/gaming v1.5.0/go.mod h1:ol7rGcxP/qHTRQE/RO4bxkXq+Fix0j6D4LFPzYTIrDM= cloud.google.com/go/gaming v1.6.0/go.mod h1:YMU1GEvA39Qt3zWGyAVA9bpYz/yAhTvaQ1t2sK4KPUA= cloud.google.com/go/gaming v1.7.0/go.mod h1:LrB8U7MHdGgFG851iHAfqUdLcKBdQ55hzXy9xBJz0+w= @@ -1941,8 +1944,9 @@ golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= golang.org/x/net v0.5.0/go.mod h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= +golang.org/x/net v0.9.0 h1:aWJ/m6xSmxWBx+V0XRHTlrYrPG56jKsLdTFmsSsCzOM= +golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -1970,8 +1974,9 @@ golang.org/x/oauth2 v0.0.0-20221006150949-b44042a4b9c1/go.mod h1:h4gKUeWbJ4rQPri golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.4.0/go.mod h1:RznEsdpjGAINPTOF0UH/t+xJ75L18YO3Ho6Pyn+uRec= golang.org/x/oauth2 v0.5.0/go.mod h1:9/XBHVqLaWO3/BRHs5jbpYCnOZVjj5V0ndyaAM7KB4I= -golang.org/x/oauth2 v0.6.0 h1:Lh8GPgSKBfWSwFvtuWOfeI3aAAnbXTSutYxJiOJFgIw= golang.org/x/oauth2 v0.6.0/go.mod h1:ycmewcwgD4Rpr3eZJLSB4Kyyljb3qDh40vJ8STE5HKw= +golang.org/x/oauth2 v0.7.0 h1:qe6s0zUXlPX80/dITx3440hWZ7GwMwgDDyrSGTPJG/g= +golang.org/x/oauth2 v0.7.0/go.mod h1:hPLQkd9LyjfXTiRohC/41GhcFqxisoUQ99sCUOHO9x4= 
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -2124,8 +2129,9 @@ golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.7.0 h1:3jlCCIQZPdOYu1h8BkNvLz8Kgwtae2cagcG/VamtZRU= +golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= @@ -2150,8 +2156,9 @@ golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.8.0 h1:57P1ETyNKtuIjB4SRd15iJxuhj8Gc416Y78H3qgMh68= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -2468,8 +2475,9 @@ google.golang.org/genproto v0.0.0-20230223222841-637eb2293923/go.mod h1:3Dl5ZL0q google.golang.org/genproto v0.0.0-20230303212802-e74f57abe488/go.mod h1:TvhZT5f700eVlTNwND1xoEZQeWTB2RY/65kplwl/bFA= google.golang.org/genproto v0.0.0-20230306155012-7f2fa6fef1f4/go.mod h1:NWraEVixdDnqcqQ30jipen1STv2r/n24Wb7twVTGR4s= google.golang.org/genproto v0.0.0-20230320184635-7606e756e683/go.mod h1:NWraEVixdDnqcqQ30jipen1STv2r/n24Wb7twVTGR4s= -google.golang.org/genproto v0.0.0-20230331144136-dcfb400f0633 h1:0BOZf6qNozI3pkN3fJLwNubheHJYHhMh91GRFOWWK08= google.golang.org/genproto v0.0.0-20230331144136-dcfb400f0633/go.mod h1:UUQDJDOlWu4KYeJZffbWgBkS1YFobzKbLVfK69pe0Ak= +google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 h1:KpwkzHKEF7B9Zxg18WzOa7djJ+Ha5DzthMyZYQfEn2A= +google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1/go.mod h1:nKE/iIaLqn2bQwXBg8f1g2Ylh6r5MN5CmZvuzZCgsCU= google.golang.org/grpc v0.0.0-20160317175043-d3ddb4469d5a/go.mod h1:yo6s7OP7yaDglbqo1J04qKzAhqBH6lvTonzMVmEdcZw= google.golang.org/grpc v1.17.0/go.mod h1:6QZJwpn2B+Zp71q/5VxRsJ6NXXVCE5NRUHRo+f3cWCs= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= @@ -2517,8 +2525,9 @@ google.golang.org/grpc v1.50.0/go.mod 
h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCD google.golang.org/grpc v1.50.1/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.51.0/go.mod h1:wgNDFcnuBGmxLKI/qn4T+m5BtEBYXJPvibbUPsAIPww= google.golang.org/grpc v1.53.0/go.mod h1:OnIrk0ipVdj4N5d9IUoFUx72/VlD7+jUsHwZgwSMQpw= -google.golang.org/grpc v1.54.0 h1:EhTqbhiYeixwWQtAEZAxmV9MGqcjEU2mFx52xCzNyag= google.golang.org/grpc v1.54.0/go.mod h1:PUSEXI6iWghWaB6lXM4knEgpJNu2qUcKfDtNci3EC2g= +google.golang.org/grpc v1.56.3 h1:8I4C0Yq1EjstUzUJzpcRVbuYA2mODtEmpWiQoN/b2nc= +google.golang.org/grpc v1.56.3/go.mod h1:I9bI3vqKfayGqPUAwGdOSu7kt6oIJLixfffKrpXqQ9s= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= diff --git a/playground/kafka-emulator/build.gradle b/playground/kafka-emulator/build.gradle index 486a232f9b99e..2d3f70aa9883f 100644 --- a/playground/kafka-emulator/build.gradle +++ b/playground/kafka-emulator/build.gradle @@ -24,11 +24,11 @@ plugins { applyJavaNature(exportJavadoc: false, publish: false) distZip { - archiveName "${baseName}.zip" + archiveFileName = "${archiveBaseName}.zip" } distTar { - archiveName "${baseName}.tar" + archiveFileName = "${archiveBaseName}.tar" } dependencies { diff --git a/release/build.gradle.kts b/release/build.gradle.kts index abea131423a67..abb34d8605ad0 100644 --- a/release/build.gradle.kts +++ b/release/build.gradle.kts @@ -29,8 +29,8 @@ val library = project.extensions.extraProperties["library"] as Map 0] - - for i in range(len(comments)): - gha_trigger_commands.append(comments[i]) - - return gha_trigger_commands - ################################################################################ if __name__ == '__main__': ''' @@ -156,9 +142,6 @@ def getGithubActionsTriggerCommands(dirname): parts = comments[i].split(',') comments[i] = (parts[0], parts[1]) - gha_comments = getGithubActionsTriggerCommands(dirname) - comments.extend(gha_comments) - if not probeGitHubIsUp(): print("GitHub is unavailable, skipping fetching data.") exit() @@ -169,7 +152,8 @@ def getGithubActionsTriggerCommands(dirname): pr = input("Enter the Beam PR number to test (e.g. 11403): ") subjectId = getSubjectId(accessToken, pr) - + + # TODO(yathu): also auto rerun failed GitHub Action workflow remainingComments = getRemainingComments(accessToken, pr, comments) if len(remainingComments) == 0: print('Jobs have been started for all comments. If you would like to retry all jobs, create a new commit before running this script.') diff --git a/release/src/main/scripts/run_rc_validation.sh b/release/src/main/scripts/run_rc_validation.sh index 7f32c2979660b..0f2bfe4aaec29 100755 --- a/release/src/main/scripts/run_rc_validation.sh +++ b/release/src/main/scripts/run_rc_validation.sh @@ -300,14 +300,14 @@ if [[ ("$python_leaderboard_direct" = true \ cd ${LOCAL_BEAM_DIR} echo "---------------------Downloading Python Staging RC----------------------------" - wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.zip - wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.zip.sha512 - if [[ ! 
-f apache-beam-${RELEASE_VER}.zip ]]; then + wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.tar.gz + wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.tar.gz.sha512 + if [[ ! -f apache-beam-${RELEASE_VER}.tar.gz ]]; then { echo "Fail to download Python Staging RC files." ;exit 1; } fi echo "--------------------------Verifying Hashes------------------------------------" - sha512sum -c apache-beam-${RELEASE_VER}.zip.sha512 + sha512sum -c apache-beam-${RELEASE_VER}.tar.gz.sha512 echo "--------------------------Updating ~/.m2/settings.xml-------------------------" cd ~ @@ -378,7 +378,7 @@ if [[ ("$python_leaderboard_direct" = true \ pip install --upgrade pip setuptools wheel echo "--------------------------Installing Python SDK-------------------------------" - pip install apache-beam-${RELEASE_VER}.zip[gcp] + pip install apache-beam-${RELEASE_VER}.tar.gz[gcp] echo "----------------Starting Leaderboard with DirectRunner-----------------------" if [[ "$python_leaderboard_direct" = true ]]; then @@ -434,7 +434,7 @@ if [[ ("$python_leaderboard_direct" = true \ --dataset ${LEADERBOARD_DF_DATASET} \ --runner DataflowRunner \ --temp_location=${USER_GCS_BUCKET}/temp/ \ - --sdk_location apache-beam-${RELEASE_VER}.zip; \ + --sdk_location apache-beam-${RELEASE_VER}.tar.gz; \ exec bash" echo "***************************************************************" @@ -509,7 +509,7 @@ if [[ ("$python_leaderboard_direct" = true \ --dataset ${GAMESTATS_DF_DATASET} \ --runner DataflowRunner \ --temp_location=${USER_GCS_BUCKET}/temp/ \ - --sdk_location apache-beam-${RELEASE_VER}.zip \ + --sdk_location apache-beam-${RELEASE_VER}.tar.gz \ --fixed_window_duration ${FIXED_WINDOW_DURATION}; exec bash" echo "***************************************************************" @@ -566,14 +566,14 @@ if [[ ("$python_xlang_quickstart" = true) \ cd ${LOCAL_BEAM_DIR} echo "---------------------Downloading Python Staging RC----------------------------" - wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.zip - wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.zip.sha512 - if [[ ! -f apache-beam-${RELEASE_VER}.zip ]]; then + wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.tar.gz + wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.tar.gz.sha512 + if [[ ! -f apache-beam-${RELEASE_VER}.tar.gz ]]; then { echo "Failed to download Python Staging RC files." 
;exit 1; } fi echo "--------------------------Verifying Hashes------------------------------------" - sha512sum -c apache-beam-${RELEASE_VER}.zip.sha512 + sha512sum -c apache-beam-${RELEASE_VER}.tar.gz.sha512 `which pip` install --upgrade pip `which pip` install --upgrade setuptools @@ -593,7 +593,7 @@ if [[ ("$python_xlang_quickstart" = true) \ ln -s ${LOCAL_BEAM_DIR}/sdks beam_env_${py_version}/lib/sdks echo "--------------------------Installing Python SDK-------------------------------" - pip install apache-beam-${RELEASE_VER}.zip + pip install apache-beam-${RELEASE_VER}.tar.gz echo '************************************************************'; echo '* Running Python Multi-language Quickstart with DirectRunner'; @@ -672,14 +672,14 @@ if [[ ("$java_xlang_quickstart" = true) \ cd ${LOCAL_BEAM_DIR} echo "---------------------Downloading Python Staging RC----------------------------" - wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.zip - wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.zip.sha512 - if [[ ! -f apache-beam-${RELEASE_VER}.zip ]]; then + wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.tar.gz + wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.tar.gz.sha512 + if [[ ! -f apache-beam-${RELEASE_VER}.tar.gz ]]; then { echo "Failed to download Python Staging RC files." ;exit 1; } fi echo "--------------------------Verifying Hashes------------------------------------" - sha512sum -c apache-beam-${RELEASE_VER}.zip.sha512 + sha512sum -c apache-beam-${RELEASE_VER}.tar.gz.sha512 `which pip` install --upgrade pip `which pip` install --upgrade setuptools @@ -699,7 +699,7 @@ if [[ ("$java_xlang_quickstart" = true) \ ln -s ${LOCAL_BEAM_DIR}/sdks beam_env_${py_version}/lib/sdks echo "--------------------------Installing Python SDK-------------------------------" - pip install apache-beam-${RELEASE_VER}.zip[dataframe] + pip install apache-beam-${RELEASE_VER}.tar.gz[dataframe] # Deactivating in the main shell. We will reactivate the virtual environment in new shells # for the expansion service and the job server. @@ -768,14 +768,14 @@ if [[ ("$python_xlang_kafka_taxi_dataflow" = true cd ${LOCAL_BEAM_DIR} echo "---------------------Downloading Python Staging RC----------------------------" - wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.zip - wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.zip.sha512 - if [[ ! -f apache-beam-${RELEASE_VER}.zip ]]; then + wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.tar.gz + wget ${PYTHON_RC_DOWNLOAD_URL}/${RELEASE_VER}/python/apache-beam-${RELEASE_VER}.tar.gz.sha512 + if [[ ! -f apache-beam-${RELEASE_VER}.tar.gz ]]; then { echo "Failed to download Python Staging RC files."
;exit 1; } fi echo "--------------------------Verifying Hashes------------------------------------" - sha512sum -c apache-beam-${RELEASE_VER}.zip.sha512 + sha512sum -c apache-beam-${RELEASE_VER}.tar.gz.sha512 `which pip` install --upgrade pip `which pip` install --upgrade setuptools @@ -807,7 +807,7 @@ if [[ ("$python_xlang_kafka_taxi_dataflow" = true ln -s ${LOCAL_BEAM_DIR}/sdks beam_env_${py_version}/lib/sdks echo "--------------------------Installing Python SDK-------------------------------" - pip install apache-beam-${RELEASE_VER}.zip[gcp] + pip install apache-beam-${RELEASE_VER}.tar.gz[gcp] echo "----------------Starting XLang Kafka Taxi with DataflowRunner---------------------" if [[ "$python_xlang_kafka_taxi_dataflow" = true ]]; then @@ -837,7 +837,7 @@ if [[ ("$python_xlang_kafka_taxi_dataflow" = true --temp_location=${USER_GCS_BUCKET}/temp/ \ --with_metadata \ --beam_services=\"{\\\"sdks:java:io:expansion-service:shadowJar\\\": \\\"${KAFKA_EXPANSION_SERVICE_JAR}\\\"}\" \ - --sdk_location apache-beam-${RELEASE_VER}.zip; \ + --sdk_location apache-beam-${RELEASE_VER}.tar.gz; \ exec bash" echo "***************************************************************" @@ -882,7 +882,7 @@ if [[ ("$python_xlang_kafka_taxi_dataflow" = true --temp_location=${USER_GCS_BUCKET}/temp/ \ --output_topic projects/${USER_GCP_PROJECT}/topics/${SQL_TAXI_TOPIC} \ --beam_services=\"{\\\":sdks:java:extensions:sql:expansion-service:shadowJar\\\": \\\"${SQL_EXPANSION_SERVICE_JAR}\\\"}\" \ - --sdk_location apache-beam-${RELEASE_VER}.zip; \ + --sdk_location apache-beam-${RELEASE_VER}.tar.gz; \ exec bash" echo "***************************************************************" diff --git a/release/src/main/scripts/verify_release_build.sh b/release/src/main/scripts/verify_release_build.sh index 214c65cc9ef64..51008d40831f4 100755 --- a/release/src/main/scripts/verify_release_build.sh +++ b/release/src/main/scripts/verify_release_build.sh @@ -136,9 +136,9 @@ if [[ ! -z `which hub` ]]; then # Without changing to dev version, the dataflow pipeline will fail because of non-existed worker containers. # Note that dataflow worker containers should be built after RC has been built. bash "$SCRIPT_DIR"/set_version.sh "$RELEASE_VER" --git-add - # In case the version string was not changed, append a newline to CHANGES.md - echo "" >> CHANGES.md - git add CHANGES.md + # add a file that will trigger all relevant GHA workflows. Need to be .json extension to be excluded from RAT check + echo "{}" > release/trigger_all_tests.json + git add release/trigger_all_tests.json git commit -m "Changed version.py and gradle.properties to python dev version to create a test PR" --quiet git push -f ${GITHUB_USERNAME} ${WORKING_BRANCH} --quiet @@ -147,6 +147,6 @@ if [[ ! -z `which hub` ]]; then You can run many tests automatically using release/src/main/scripts/mass_comment.py." echo "" - echo "[NOTE]: Please make sure all test targets have been invoked." + echo "[NOTE]: Please make sure all test targets (GHA and Jenkins) have been invoked." echo "Please check the test results. If there is any failure, follow the policy in release guide." 
fi diff --git a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/Environments.java b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/Environments.java index 31a555989afdc..f531b5be344df 100644 --- a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/Environments.java +++ b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/Environments.java @@ -94,7 +94,8 @@ public class Environments { public enum JavaVersion { java8("java", "1.8", 8), java11("java11", "11", 11), - java17("java17", "17", 17); + java17("java17", "17", 17), + java21("java21", "21", 21); // Legacy name, as used in container image private final String legacyName; @@ -119,6 +120,7 @@ public String specification() { return this.specification; } + /** Returns the LTS Java version for a given Java specification version. */ public static JavaVersion forSpecification(String specification) { for (JavaVersion ver : JavaVersion.values()) { if (ver.specification.equals(specification)) { @@ -137,7 +139,7 @@ public static JavaVersion forSpecification(String specification) { } } LOG.warn( - "unsupported Java version: {}, falling back to: {}", + "Unsupported Java version: {}, falling back to: {}", specification, fallback.specification); return fallback; diff --git a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/External.java b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/External.java index 534a2b5fe0e60..93a1ade474a5c 100644 --- a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/External.java +++ b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/External.java @@ -19,6 +19,7 @@ import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; +import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.nio.file.Files; @@ -391,7 +392,8 @@ private static List resolveArtifacts( .build()) .getReplacementsList()) { Path path = Files.createTempFile("beam-artifact", ""); - try (FileOutputStream fout = new FileOutputStream(path.toFile())) { + File artifactFile = path.toFile(); + try (FileOutputStream fout = new FileOutputStream(artifactFile)) { for (Iterator it = retrievalStub.getArtifact( ArtifactApi.GetArtifactRequest.newBuilder().setArtifact(artifact).build()); @@ -409,6 +411,8 @@ private static List resolveArtifacts( .build() .toByteString()) .build()); + // Delete the beam-artifact temp file on program exit + artifactFile.deleteOnExit(); } return resolved; } diff --git a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/TransformUpgrader.java b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/TransformUpgrader.java index d657bb31b184f..db5dfcf6825dd 100644 --- a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/TransformUpgrader.java +++ b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/TransformUpgrader.java @@ -108,7 +108,7 @@ public RunnerApi.Pipeline upgradeTransformsViaTransformService( } else if (options.getTransformServiceBeamVersion() != null) { String projectName = UUID.randomUUID().toString(); int port = findAvailablePort(); - service = TransformServiceLauncher.forProject(projectName, port); + service =
TransformServiceLauncher.forProject(projectName, port, null); service.setBeamVersion(options.getTransformServiceBeamVersion()); // Starting the transform service. @@ -187,6 +187,7 @@ RunnerApi.Pipeline updateTransformViaTransformService( .setComponents(runnerAPIpipeline.getComponents()) .setTransform(ptransformBuilder.build()) .setNamespace(UPGRADE_NAMESPACE) + .addAllRequirements(runnerAPIpipeline.getRequirementsList()) .build(); ExpansionApi.ExpansionResponse response = diff --git a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/UnboundedReadFromBoundedSource.java b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/UnboundedReadFromBoundedSource.java index 67697636a363c..53fad782da968 100644 --- a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/UnboundedReadFromBoundedSource.java +++ b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/UnboundedReadFromBoundedSource.java @@ -17,8 +17,8 @@ */ package org.apache.beam.runners.core.construction; -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull; +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; import java.io.IOException; import java.io.InputStream; @@ -288,6 +288,15 @@ private void init( residualElementsList == null ? new ResidualElements(Collections.emptyList()) : new ResidualElements(residualElementsList); + + if (this.residualSource != null) { + // Close the current residualSource so the reader it holds is not leaked; otherwise + // reader.close() in ResidualSource would never be called. + try { + this.residualSource.close(); + } catch (IOException e) { + LOG.warn("Ignoring error while closing ResidualSource", e); + } + } this.residualSource = residualSource == null ? null : new ResidualSource(residualSource, options); } @@ -465,7 +474,7 @@ public ResidualSource(BoundedSource residualSource, PipelineOptions options) } private boolean advance() throws IOException { - checkArgument(!closed, "advance() call on closed %s", getClass().getName()); + checkState(!closed, "advance() call on closed %s", getClass().getName()); if (readerDone) { return false; } @@ -505,6 +514,7 @@ BoundedSource getSource() { } Checkpoint getCheckpointMark() { + checkState(!closed, "getCheckpointMark() call on closed %s", getClass().getName()); if (reader == null) { // Reader hasn't started, checkpoint the residualSource.
return new Checkpoint<>(null /* residualElements */, residualSource); diff --git a/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/EnvironmentsTest.java b/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/EnvironmentsTest.java index ae429fb1fe6d7..b71a654f1031b 100644 --- a/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/EnvironmentsTest.java +++ b/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/EnvironmentsTest.java @@ -291,6 +291,8 @@ public void testLtsJavaVersion() { assertEquals("java11", JavaVersion.java11.legacyName()); assertEquals(JavaVersion.java17, JavaVersion.forSpecification("17")); assertEquals("java17", JavaVersion.java17.legacyName()); + assertEquals(JavaVersion.java21, JavaVersion.forSpecification("21")); + assertEquals("java21", JavaVersion.java21.legacyName()); } @Test @@ -303,7 +305,9 @@ public void testNonLtsJavaVersion() { assertEquals(JavaVersion.java17, JavaVersion.forSpecification("15")); assertEquals(JavaVersion.java17, JavaVersion.forSpecification("16")); assertEquals(JavaVersion.java17, JavaVersion.forSpecification("18")); - assertEquals(JavaVersion.java17, JavaVersion.forSpecification("19")); + assertEquals(JavaVersion.java21, JavaVersion.forSpecification("19")); + assertEquals(JavaVersion.java21, JavaVersion.forSpecification("20")); + assertEquals(JavaVersion.java21, JavaVersion.forSpecification("21")); } @Test(expected = UnsupportedOperationException.class) diff --git a/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/UnboundedReadFromBoundedSourceTest.java b/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/UnboundedReadFromBoundedSourceTest.java index cd4b49262fcb8..31f6842a42bc3 100644 --- a/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/UnboundedReadFromBoundedSourceTest.java +++ b/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/UnboundedReadFromBoundedSourceTest.java @@ -26,9 +26,15 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; +import java.lang.ref.Reference; +import java.lang.ref.ReferenceQueue; +import java.lang.ref.WeakReference; import java.nio.ByteBuffer; import java.nio.channels.ReadableByteChannel; +import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.NoSuchElementException; import java.util.Random; import org.apache.beam.runners.core.construction.UnboundedReadFromBoundedSource.BoundedToUnboundedSourceAdapter; @@ -69,10 +75,14 @@ import org.junit.rules.TemporaryFolder; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** Unit tests for {@link UnboundedReadFromBoundedSource}. 
*/ @RunWith(JUnit4.class) public class UnboundedReadFromBoundedSourceTest { + private static final Logger LOG = + LoggerFactory.getLogger(UnboundedReadFromBoundedSourceTest.class); @Rule public TemporaryFolder tmpFolder = new TemporaryFolder(); @@ -280,6 +290,38 @@ public void testReadFromCheckpointBeforeStart() throws Exception { unboundedSource.createReader(options, checkpoint).getCurrent(); } + @Test + public void testReadersClosedProperly() throws IOException { + ManagedReaderBoundedSource boundedSource = new ManagedReaderBoundedSource(0, 10); + BoundedToUnboundedSourceAdapter<Integer> unboundedSource = + new BoundedToUnboundedSourceAdapter<>(boundedSource); + PipelineOptions options = PipelineOptionsFactory.create(); + + BoundedToUnboundedSourceAdapter<Integer>.Reader reader = + unboundedSource.createReader(options, new Checkpoint<>(null, boundedSource)); + + for (int i = 0; i < 3; ++i) { + if (i == 0) { + assertTrue(reader.start()); + } else { + assertTrue(reader.advance()); + } + assertEquals(i, (int) reader.getCurrent()); + } + Checkpoint<Integer> checkpoint = reader.getCheckpointMark(); + List<TimestampedValue<Integer>> residualElements = checkpoint.getResidualElements(); + for (int i = 0; i < 7; ++i) { + TimestampedValue<Integer> element = residualElements.get(i); + assertEquals(i + 3, (int) element.getValue()); + } + for (int i = 0; i < 100; ++i) { + // A WeakReference to an object that nothing else references is not immediately added to + // the ReferenceQueue, so we run System.gc() multiple times. + // If a reader is GCed without closing, `cleanQueue` throws a RuntimeException. + boundedSource.cleanQueue(); + } + } + /** Generates a byte array of the given size. */ private static byte[] generateInput(int size) { // Arbitrary but fixed seed @@ -298,6 +340,7 @@ private static void writeFile(File file, byte[] input) throws IOException { /** Unsplittable source for use in tests. */ private static class UnsplittableSource extends FileBasedSource<Byte> { + public UnsplittableSource(String fileOrPatternSpec, long minBundleSize) { super(StaticValueProvider.of(fileOrPatternSpec), minBundleSize); } @@ -323,6 +366,7 @@ public Coder getOutputCoder() { } private static class UnsplittableReader extends FileBasedReader<Byte> { + ByteBuffer buff = ByteBuffer.allocate(1); Byte current; long offset; @@ -370,4 +414,140 @@ protected long getCurrentOffset() { } } } + + /** + * An integer-generating bounded source that checks whether its readers are closed properly. It + * keeps weak references to the readers it hands out and, in `createReader` and `cleanQueue`, + * verifies that readers were closed before being GCed. `cleanQueue` does not change the state of + * `ManagedReaderBoundedSource`, but throws an exception if it finds a reader that was GCed + * without being closed.
+ */ + private static class ManagedReaderBoundedSource extends BoundedSource<Integer> { + + private final int from; + private final int to; // exclusive + + private transient ReferenceQueue<ManagedReader> refQueue; + private transient Map<Reference<? extends ManagedReader>, CloseStatus> closeStatusMap; + + public ManagedReaderBoundedSource(int from, int to) { + if (from > to) { + throw new RuntimeException( + String.format("Expected `from` <= `to`, but got from: %d, to: %d", from, to)); + } + this.from = from; + this.to = to; + } + + @Override + public List<? extends BoundedSource<Integer>> split( + long desiredBundleSizeBytes, PipelineOptions options) { + return Collections.singletonList(this); + } + + @Override + public long getEstimatedSizeBytes(PipelineOptions options) { + return (to - from) * 4L; + } + + @Override + public BoundedReader<Integer> createReader(PipelineOptions options) { + // Add a weak reference to the queue to monitor GCed readers. If the `CloseStatus` associated + // with a reader is not closed, the reader was GCed without being closed properly. The + // CloseStatus check for GCed readers is done in cleanQueue(). + if (refQueue == null) { + refQueue = new ReferenceQueue<>(); + closeStatusMap = new HashMap<>(); + } + cleanQueue(); + + CloseStatus status = new CloseStatus(); + ManagedReader reader = new ManagedReader(status); + WeakReference<ManagedReader> reference = new WeakReference<>(reader, refQueue); + closeStatusMap.put(reference, status); + LOG.info("Add reference {} for reader {}", reference, reader); + return reader; + } + + public void cleanQueue() { + System.gc(); + + Reference<? extends ManagedReader> reference; + while ((reference = refQueue.poll()) != null) { + CloseStatus closeStatus = closeStatusMap.get(reference); + LOG.info("Poll reference: {}, closed: {}", reference, closeStatus.closed); + closeStatus.throwIfNotClosed(); + } + } + + class CloseStatus { + + private final RuntimeException allocationStacktrace; + + private boolean closed; + + public CloseStatus() {
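+ // The exception is created eagerly so that its stack trace records the allocation site of
+ // the reader; it is only thrown later, from throwIfNotClosed(), if the reader turns out to
+ // have been GCed while still open.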
Reader allocation was"); + closed = false; + } + + void close() { + cleanQueue(); + closed = true; + } + + void throwIfNotClosed() { + if (!closed) { + throw allocationStacktrace; + } + } + } + + class ManagedReader extends BoundedReader { + + private final CloseStatus status; + + int current; + + public ManagedReader(CloseStatus status) { + this.status = status; + } + + @Override + public boolean start() { + if (from < to) { + current = from; + return true; + } else { + return false; + } + } + + @Override + public boolean advance() { + if (current + 1 < to) { + ++current; + return true; + } else { + return false; + } + } + + @Override + public Integer getCurrent() { + return current; + } + + @Override + public void close() { + status.close(); + } + + @Override + public BoundedSource getCurrentSource() { + return ManagedReaderBoundedSource.this; + } + } + } } diff --git a/runners/flink/1.12/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/AbstractStreamOperatorCompat.java b/runners/flink/1.12/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/AbstractStreamOperatorCompat.java index bb794e04398d6..5072e6b2459f8 100644 --- a/runners/flink/1.12/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/AbstractStreamOperatorCompat.java +++ b/runners/flink/1.12/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/AbstractStreamOperatorCompat.java @@ -20,6 +20,7 @@ import org.apache.flink.streaming.api.operators.AbstractStreamOperator; import org.apache.flink.streaming.api.operators.InternalTimeServiceManager; import org.apache.flink.streaming.api.operators.InternalTimeServiceManagerImpl; +import org.apache.flink.streaming.api.operators.sorted.state.BatchExecutionInternalTimeServiceManager; /** Compatibility layer for {@link AbstractStreamOperator} breaking changes. */ public abstract class AbstractStreamOperatorCompat @@ -44,9 +45,18 @@ protected int numProcessingTimeTimers() { return getTimeServiceManager() .map( manager -> { - final InternalTimeServiceManagerImpl cast = - (InternalTimeServiceManagerImpl) getTimeServiceManagerCompat(); - return cast.numProcessingTimeTimers(); + InternalTimeServiceManager tsm = getTimeServiceManagerCompat(); + if (tsm instanceof InternalTimeServiceManagerImpl) { + final InternalTimeServiceManagerImpl cast = + (InternalTimeServiceManagerImpl) getTimeServiceManagerCompat(); + return cast.numProcessingTimeTimers(); + } else if (tsm instanceof BatchExecutionInternalTimeServiceManager) { + return 0; + } else { + throw new IllegalStateException( + String.format( + "Unknown implementation of InternalTimerServiceManager. 
%s", tsm)); + } }) .orElse(0); } diff --git a/runners/flink/1.12/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/SourceTestCompat.java b/runners/flink/1.12/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/SourceTestCompat.java index 1ddc2a957b7da..0b9ca07f99a94 100644 --- a/runners/flink/1.12/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/SourceTestCompat.java +++ b/runners/flink/1.12/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/SourceTestCompat.java @@ -33,6 +33,7 @@ public class SourceTestCompat { public static class TestMetricGroup extends UnregisteredMetricGroups.UnregisteredOperatorMetricGroup { public final Map> registeredGauge = new HashMap<>(); + public final Map registeredCounter = new HashMap<>(); public final Counter numRecordsInCounter = new SimpleCounter(); @Override @@ -41,6 +42,18 @@ public > GaugeT gauge(String name, GaugeT gauge) { return gauge; } + @Override + public Counter counter(String name) { + // The OperatorIOMetricsGroup will register some IO metrics in the constructor. + // At that time, the construction of this class has not finihsed yet, so we + // need to delegate the call to the parent class. + if (registeredCounter != null) { + return registeredCounter.computeIfAbsent(name, ignored -> super.counter(name)); + } else { + return super.counter(name); + } + } + @Override public OperatorIOMetricGroup getIOMetricGroup() { return new OperatorIOMetricGroup(this) { diff --git a/runners/flink/1.14/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/AbstractStreamOperatorCompat.java b/runners/flink/1.14/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/AbstractStreamOperatorCompat.java index 3b64612d6d199..d8740964fda9b 100644 --- a/runners/flink/1.14/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/AbstractStreamOperatorCompat.java +++ b/runners/flink/1.14/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/AbstractStreamOperatorCompat.java @@ -20,6 +20,7 @@ import org.apache.flink.streaming.api.operators.AbstractStreamOperator; import org.apache.flink.streaming.api.operators.InternalTimeServiceManager; import org.apache.flink.streaming.api.operators.InternalTimeServiceManagerImpl; +import org.apache.flink.streaming.api.operators.sorted.state.BatchExecutionInternalTimeServiceManager; /** Compatibility layer for {@link AbstractStreamOperator} breaking changes. */ public abstract class AbstractStreamOperatorCompat @@ -44,9 +45,18 @@ protected int numProcessingTimeTimers() { return getTimeServiceManager() .map( manager -> { - final InternalTimeServiceManagerImpl cast = - (InternalTimeServiceManagerImpl) getTimeServiceManagerCompat(); - return cast.numProcessingTimeTimers(); + InternalTimeServiceManager tsm = getTimeServiceManagerCompat(); + if (tsm instanceof InternalTimeServiceManagerImpl) { + final InternalTimeServiceManagerImpl cast = + (InternalTimeServiceManagerImpl) getTimeServiceManagerCompat(); + return cast.numProcessingTimeTimers(); + } else if (tsm instanceof BatchExecutionInternalTimeServiceManager) { + return 0; + } else { + throw new IllegalStateException( + String.format( + "Unknown implementation of InternalTimerServiceManager. 
%s", tsm)); + } }) .orElse(0); } diff --git a/runners/flink/1.14/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/SourceTestCompat.java b/runners/flink/1.14/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/SourceTestCompat.java index 62b16eedca0bd..8cda1341fd223 100644 --- a/runners/flink/1.14/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/SourceTestCompat.java +++ b/runners/flink/1.14/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/SourceTestCompat.java @@ -34,6 +34,7 @@ public class SourceTestCompat { public static class TestMetricGroup extends UnregisteredMetricsGroup implements SourceReaderMetricGroup { public final Map> registeredGauge = new HashMap<>(); + public final Map registeredCounter = new HashMap<>(); public final Counter numRecordsInCounter = new SimpleCounter(); @Override @@ -52,6 +53,18 @@ public > GaugeT gauge(String name, GaugeT gauge) { return gauge; } + @Override + public Counter counter(String name) { + // The OperatorIOMetricsGroup will register some IO metrics in the constructor. + // At that time, the construction of this class has not finihsed yet, so we + // need to delegate the call to the parent class. + if (registeredCounter != null) { + return registeredCounter.computeIfAbsent(name, ignored -> super.counter(name)); + } else { + return super.counter(name); + } + } + @Override public Counter getNumRecordsInErrorsCounter() { return new SimpleCounter(); diff --git a/runners/flink/flink_runner.gradle b/runners/flink/flink_runner.gradle index 30fb922e9c7c7..c510b346d5d02 100644 --- a/runners/flink/flink_runner.gradle +++ b/runners/flink/flink_runner.gradle @@ -46,36 +46,51 @@ evaluationDependsOn(":examples:java") /* * Copy & merge source overrides into build directory. */ -def sourceOverridesBase = "${project.buildDir}/source-overrides/src" +def sourceOverridesBase = project.layout.buildDirectory.dir('source-overrides/src').get() def copySourceOverrides = tasks.register('copySourceOverrides', Copy) { it.from main_source_overrides it.into "${sourceOverridesBase}/main/java" it.duplicatesStrategy DuplicatesStrategy.INCLUDE } -compileJava.dependsOn copySourceOverrides def copyResourcesOverrides = tasks.register('copyResourcesOverrides', Copy) { it.from main_resources_overrides it.into "${sourceOverridesBase}/main/resources" it.duplicatesStrategy DuplicatesStrategy.INCLUDE } -processResources.dependsOn copyResourcesOverrides def copyTestSourceOverrides = tasks.register('copyTestSourceOverrides', Copy) { it.from test_source_overrides it.into "${sourceOverridesBase}/test/java" it.duplicatesStrategy DuplicatesStrategy.INCLUDE } -compileTestJava.dependsOn copyTestSourceOverrides def copyTestResourcesOverrides = tasks.register('copyTestResourcesOverrides', Copy) { it.from test_resources_overrides it.into "${sourceOverridesBase}/test/resources" it.duplicatesStrategy DuplicatesStrategy.INCLUDE } + +// add dependency to gradle Java plugin defined tasks +compileJava.dependsOn copySourceOverrides +processResources.dependsOn copyResourcesOverrides +compileTestJava.dependsOn copyTestSourceOverrides processTestResources.dependsOn copyTestResourcesOverrides +// add dependency BeamModulePlugin defined custom tasks +// they are defined only when certain flags are provided (e.g. 
-Prelease; -Ppublishing, etc) +def sourcesJar = project.tasks.findByName('sourcesJar') +if (sourcesJar != null) { + sourcesJar.dependsOn copySourceOverrides + sourcesJar.dependsOn copyResourcesOverrides +} +def testSourcesJar = project.tasks.findByName('testSourcesJar') +if (testSourcesJar != null) { + testSourcesJar.dependsOn copyTestSourceOverrides + testSourcesJar.dependsOn copyTestResourcesOverrides +} + /* * We have to explicitly set all directories here to make sure each * version of Flink has the correct overrides set. @@ -222,6 +237,7 @@ class ValidatesRunnerConfig { String name boolean streaming boolean checkpointing + boolean useDataStreamForBatch ArrayList sickbayTests } @@ -240,6 +256,7 @@ def createValidatesRunnerTask(Map m) { description = "Validates the ${runnerType} runner" def pipelineOptionsArray = ["--runner=TestFlinkRunner", "--streaming=${config.streaming}", + "--useDataStreamForBatch=${config.useDataStreamForBatch}", "--parallelism=2", ] if (config.checkpointing) { @@ -299,12 +316,17 @@ def createValidatesRunnerTask(Map m) { excludeTestsMatching 'org.apache.beam.sdk.testing.TestStreamTest.testFirstElementLate' // https://github.com/apache/beam/issues/20844 excludeTestsMatching 'org.apache.beam.sdk.testing.TestStreamTest.testLateDataAccumulating' + if (!config.streaming) { + // FlinkBatchExecutionInternalTimeService does not support timer registration on timer firing. + excludeTestsMatching 'org.apache.beam.sdk.transforms.ParDoTest$TimestampTests.testOnTimerTimestampSkew' + } } } } } createValidatesRunnerTask(name: "validatesRunnerBatch", streaming: false, sickbayTests: sickbayTests) +createValidatesRunnerTask(name: "validatesRunnerBatchWithDataStream", streaming: false, useDataStreamForBatch: true, sickbayTests: sickbayTests) createValidatesRunnerTask(name: "validatesRunnerStreaming", streaming: true, sickbayTests: sickbayTests) // We specifically have a variant which runs with checkpointing enabled for the // tests that require it since running a checkpoint variant is significantly @@ -317,6 +339,7 @@ tasks.register('validatesRunner') { group = 'Verification' description "Validates Flink runner" dependsOn validatesRunnerBatch + dependsOn validatesRunnerBatchWithDataStream dependsOn validatesRunnerStreaming dependsOn validatesRunnerStreamingCheckpointing } diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironment.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironment.java index 7961bea6069d9..12ed3603264a2 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironment.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironment.java @@ -27,6 +27,7 @@ import org.apache.beam.sdk.metrics.MetricsOptions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.flink.api.common.JobExecutionResult; +import org.apache.flink.api.common.RuntimeExecutionMode; import org.apache.flink.api.java.ExecutionEnvironment; import org.apache.flink.core.execution.JobClient; import org.apache.flink.runtime.jobgraph.JobGraph; @@ -101,13 +102,17 @@ public void translate(Pipeline pipeline) { prepareFilesToStageForRemoteClusterExecution(options); FlinkPipelineTranslator translator; - if (options.isStreaming()) { + if (options.isStreaming() || options.getUseDataStreamForBatch()) { this.flinkStreamEnv = 
FlinkExecutionEnvironments.createStreamExecutionEnvironment(options); if (hasUnboundedOutput && !flinkStreamEnv.getCheckpointConfig().isCheckpointingEnabled()) { LOG.warn( "UnboundedSources present which rely on checkpointing, but checkpointing is disabled."); } - translator = new FlinkStreamingPipelineTranslator(flinkStreamEnv, options); + translator = + new FlinkStreamingPipelineTranslator(flinkStreamEnv, options, options.isStreaming()); + if (!options.isStreaming()) { + flinkStreamEnv.setRuntimeMode(RuntimeExecutionMode.BATCH); + } } else { this.flinkBatchEnv = FlinkExecutionEnvironments.createBatchExecutionEnvironment(options); translator = new FlinkBatchPipelineTranslator(flinkBatchEnv, options); diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java index 1e01514fe8b65..f0514c69891bf 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkPipelineOptions.java @@ -32,7 +32,11 @@ * requiring flink on the classpath (e.g. to use with the direct runner). */ public interface FlinkPipelineOptions - extends PipelineOptions, ApplicationNameOptions, StreamingOptions, FileStagingOptions { + extends PipelineOptions, + ApplicationNameOptions, + StreamingOptions, + FileStagingOptions, + VersionDependentFlinkPipelineOptions { String AUTO = "[auto]"; String PIPELINED = "PIPELINED"; @@ -320,6 +324,14 @@ public interface FlinkPipelineOptions void setFileInputSplitMaxSizeMB(Long fileInputSplitMaxSizeMB); + @Description( + "Allow drain operation for Flink pipelines that contain the RequiresStableInput operator. Note that at the time of draining, " + + "the RequiresStableInput contract might be violated if there are any processing-related failures in the DoFn operator.") + @Default.Boolean(false) + Boolean getEnableStableInputDrain(); + + void setEnableStableInputDrain(Boolean enableStableInputDrain); + static FlinkPipelineOptions defaults() { return PipelineOptionsFactory.as(FlinkPipelineOptions.class); } diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkRunnerResult.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkRunnerResult.java index 885571a7ee77d..d892049bce4b5 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkRunnerResult.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkRunnerResult.java @@ -18,11 +18,11 @@ package org.apache.beam.runners.flink; import static org.apache.beam.runners.core.metrics.MetricsContainerStepMap.asAttemptedOnlyMetricResults; +import static org.apache.beam.runners.flink.metrics.FlinkMetricContainer.ACCUMULATOR_NAME; import java.util.Collections; import java.util.Map; import org.apache.beam.runners.core.metrics.MetricsContainerStepMap; -import org.apache.beam.runners.flink.metrics.FlinkMetricContainer; import org.apache.beam.sdk.PipelineResult; import org.apache.beam.sdk.metrics.MetricResults; import org.joda.time.Duration; @@ -80,6 +80,6 @@ public MetricResults metrics() { } MetricsContainerStepMap getMetricsContainerStepMap() { - return (MetricsContainerStepMap) accumulators.get(FlinkMetricContainer.ACCUMULATOR_NAME); + return (MetricsContainerStepMap) accumulators.get(ACCUMULATOR_NAME); } } diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingPipelineTranslator.java
b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingPipelineTranslator.java index e9f3f7fe9176c..ffc7da97cd022 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingPipelineTranslator.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingPipelineTranslator.java @@ -81,8 +81,9 @@ class FlinkStreamingPipelineTranslator extends FlinkPipelineTranslator { private int depth = 0; - public FlinkStreamingPipelineTranslator(StreamExecutionEnvironment env, PipelineOptions options) { - this.streamingContext = new FlinkStreamingTranslationContext(env, options); + public FlinkStreamingPipelineTranslator( + StreamExecutionEnvironment env, PipelineOptions options, boolean isStreaming) { + this.streamingContext = new FlinkStreamingTranslationContext(env, options, isStreaming); } @Override diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java index 6d42d0c3b485b..f3901fde03ba2 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslators.java @@ -38,9 +38,7 @@ import org.apache.beam.runners.core.construction.SerializablePipelineOptions; import org.apache.beam.runners.core.construction.SplittableParDo; import org.apache.beam.runners.core.construction.TransformPayloadTranslatorRegistrar; -import org.apache.beam.runners.core.construction.UnboundedReadFromBoundedSource.BoundedToUnboundedSourceAdapter; import org.apache.beam.runners.flink.translation.functions.FlinkAssignWindows; -import org.apache.beam.runners.flink.translation.functions.ImpulseSourceFunction; import org.apache.beam.runners.flink.translation.types.CoderTypeInformation; import org.apache.beam.runners.flink.translation.wrappers.streaming.DoFnOperator; import org.apache.beam.runners.flink.translation.wrappers.streaming.KvToByteBufferKeySelector; @@ -54,6 +52,9 @@ import org.apache.beam.runners.flink.translation.wrappers.streaming.io.DedupingOperator; import org.apache.beam.runners.flink.translation.wrappers.streaming.io.TestStreamSource; import org.apache.beam.runners.flink.translation.wrappers.streaming.io.UnboundedSourceWrapper; +import org.apache.beam.runners.flink.translation.wrappers.streaming.io.source.FlinkSource; +import org.apache.beam.runners.flink.translation.wrappers.streaming.io.source.bounded.FlinkBoundedSource; +import org.apache.beam.runners.flink.translation.wrappers.streaming.io.source.unbounded.FlinkUnboundedSource; import org.apache.beam.sdk.coders.ByteArrayCoder; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.CoderException; @@ -96,6 +97,7 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; +import org.apache.flink.api.common.eventtime.WatermarkStrategy; import org.apache.flink.api.common.functions.FlatMapFunction; import org.apache.flink.api.common.functions.RichFlatMapFunction; import org.apache.flink.api.common.functions.RichMapFunction; @@ -220,16 +222,19 @@ public void translateNode( context.getExecutionEnvironment().getMaxParallelism() > 0 ? 
context.getExecutionEnvironment().getMaxParallelism() : context.getExecutionEnvironment().getParallelism(); - UnboundedSourceWrapper sourceWrapper = - new UnboundedSourceWrapper<>( - fullName, context.getPipelineOptions(), rawSource, parallelism); + + FlinkUnboundedSource unboundedSource = + FlinkSource.unbounded( + transform.getName(), + rawSource, + new SerializablePipelineOptions(context.getPipelineOptions()), + parallelism); nonDedupSource = context .getExecutionEnvironment() - .addSource(sourceWrapper) - .name(fullName) - .uid(fullName) - .returns(withIdTypeInfo); + .fromSource( + unboundedSource, WatermarkStrategy.noWatermarks(), fullName, withIdTypeInfo) + .uid(fullName); if (rawSource.requiresDeduping()) { source = @@ -303,15 +308,24 @@ void translateNode(Impulse transform, FlinkStreamingTranslationContext context) WindowedValue.getFullCoder(ByteArrayCoder.of(), GlobalWindow.Coder.INSTANCE), context.getPipelineOptions()); - long shutdownAfterIdleSourcesMs = - context - .getPipelineOptions() - .as(FlinkPipelineOptions.class) - .getShutdownSourcesAfterIdleMs(); + FlinkBoundedSource impulseSource; + WatermarkStrategy> watermarkStrategy; + if (context.isStreaming()) { + long shutdownAfterIdleSourcesMs = + context + .getPipelineOptions() + .as(FlinkPipelineOptions.class) + .getShutdownSourcesAfterIdleMs(); + impulseSource = FlinkSource.unboundedImpulse(shutdownAfterIdleSourcesMs); + watermarkStrategy = WatermarkStrategy.forMonotonousTimestamps(); + } else { + impulseSource = FlinkSource.boundedImpulse(); + watermarkStrategy = WatermarkStrategy.noWatermarks(); + } SingleOutputStreamOperator> source = context .getExecutionEnvironment() - .addSource(new ImpulseSourceFunction(shutdownAfterIdleSourcesMs), "Impulse") + .fromSource(impulseSource, watermarkStrategy, "Impulse") .returns(typeInfo); context.setOutputDataStream(context.getOutput(transform), source); @@ -330,7 +344,8 @@ private static class ReadSourceTranslator @Override void translateNode( PTransform> transform, FlinkStreamingTranslationContext context) { - if (context.getOutput(transform).isBounded().equals(PCollection.IsBounded.BOUNDED)) { + if (ReadTranslation.sourceIsBounded(context.getCurrentTransform()) + == PCollection.IsBounded.BOUNDED) { boundedTranslator.translateNode(transform, context); } else { unboundedTranslator.translateNode(transform, context); @@ -361,24 +376,26 @@ public void translateNode( } String fullName = getCurrentTransformName(context); - UnboundedSource adaptedRawSource = new BoundedToUnboundedSourceAdapter<>(rawSource); + int parallelism = + context.getExecutionEnvironment().getMaxParallelism() > 0 + ? context.getExecutionEnvironment().getMaxParallelism() + : context.getExecutionEnvironment().getParallelism(); + + FlinkBoundedSource flinkBoundedSource = + FlinkSource.bounded( + transform.getName(), + rawSource, + new SerializablePipelineOptions(context.getPipelineOptions()), + parallelism); + DataStream> source; try { - int parallelism = - context.getExecutionEnvironment().getMaxParallelism() > 0 - ? 
context.getExecutionEnvironment().getMaxParallelism() - : context.getExecutionEnvironment().getParallelism(); - UnboundedSourceWrapperNoValueWithRecordId sourceWrapper = - new UnboundedSourceWrapperNoValueWithRecordId<>( - new UnboundedSourceWrapper<>( - fullName, context.getPipelineOptions(), adaptedRawSource, parallelism)); source = context .getExecutionEnvironment() - .addSource(sourceWrapper) - .name(fullName) - .uid(fullName) - .returns(outputTypeInfo); + .fromSource( + flinkBoundedSource, WatermarkStrategy.noWatermarks(), fullName, outputTypeInfo) + .uid(fullName); } catch (Exception e) { throw new RuntimeException("Error while translating BoundedSource: " + rawSource, e); } @@ -545,7 +562,9 @@ static void translateParDo( KeySelector, ?> keySelector = null; boolean stateful = false; DoFnSignature signature = DoFnSignatures.getSignature(doFn.getClass()); - if (signature.stateDeclarations().size() > 0 || signature.timerDeclarations().size() > 0) { + if (!signature.stateDeclarations().isEmpty() + || !signature.timerDeclarations().isEmpty() + || !signature.timerFamilyDeclarations().isEmpty()) { // Based on the fact that the signature is stateful, DoFnSignatures ensures // that it is also keyed keyCoder = ((KvCoder) input.getCoder()).getKeyCoder(); diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTranslationContext.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTranslationContext.java index 9791eaeb4ac1d..0a89bd18172b8 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTranslationContext.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkStreamingTranslationContext.java @@ -51,6 +51,7 @@ class FlinkStreamingTranslationContext { private final StreamExecutionEnvironment env; private final PipelineOptions options; + private final boolean isStreaming; /** * Keeps a mapping between the output value of the PTransform and the Flink Operator that produced @@ -62,9 +63,11 @@ class FlinkStreamingTranslationContext { private AppliedPTransform currentTransform; - public FlinkStreamingTranslationContext(StreamExecutionEnvironment env, PipelineOptions options) { + public FlinkStreamingTranslationContext( + StreamExecutionEnvironment env, PipelineOptions options, boolean isStreaming) { this.env = checkNotNull(env); this.options = checkNotNull(options); + this.isStreaming = isStreaming; } public StreamExecutionEnvironment getExecutionEnvironment() { @@ -75,6 +78,10 @@ public PipelineOptions getPipelineOptions() { return options; } + public boolean isStreaming() { + return isStreaming; + } + @SuppressWarnings("unchecked") public DataStream getInputDataStream(PValue value) { return (DataStream) dataStreams.get(value); diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkTransformOverrides.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkTransformOverrides.java index b53864d968c75..69ad58253b8e4 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkTransformOverrides.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkTransformOverrides.java @@ -36,7 +36,7 @@ class FlinkTransformOverrides { static List getDefaultOverrides(FlinkPipelineOptions options) { ImmutableList.Builder builder = ImmutableList.builder(); - if (options.isStreaming()) { + if (options.isStreaming() || options.getUseDataStreamForBatch()) { builder .add( PTransformOverride.of( diff --git 
a/runners/flink/src/main/java/org/apache/beam/runners/flink/VersionDependentFlinkPipelineOptions.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/VersionDependentFlinkPipelineOptions.java new file mode 100644 index 0000000000000..48ee155011561 --- /dev/null +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/VersionDependentFlinkPipelineOptions.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.flink; + +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.PipelineOptions; + +public interface VersionDependentFlinkPipelineOptions extends PipelineOptions { + + @Description( + "When set to true, the batch job execution will use DataStream API. " + + "Otherwise, the batch job execution will use the legacy DataSet API.") + @Default.Boolean(false) + Boolean getUseDataStreamForBatch(); + + void setUseDataStreamForBatch(Boolean useDataStreamForBatch); +} diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainer.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainer.java index c05db9e0b156b..eaa72445c3fbb 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainer.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainer.java @@ -17,29 +17,10 @@ */ package org.apache.beam.runners.flink.metrics; -import static org.apache.beam.runners.core.metrics.MetricsContainerStepMap.asAttemptedOnlyMetricResults; - -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import org.apache.beam.model.pipeline.v1.MetricsApi.MonitoringInfo; import org.apache.beam.runners.core.metrics.MetricsContainerImpl; import org.apache.beam.runners.core.metrics.MetricsContainerStepMap; -import org.apache.beam.sdk.metrics.DistributionResult; -import org.apache.beam.sdk.metrics.GaugeResult; -import org.apache.beam.sdk.metrics.MetricKey; -import org.apache.beam.sdk.metrics.MetricName; -import org.apache.beam.sdk.metrics.MetricQueryResults; -import org.apache.beam.sdk.metrics.MetricResult; -import org.apache.beam.sdk.metrics.MetricResults; -import org.apache.beam.sdk.metrics.MetricsFilter; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.flink.api.common.accumulators.Accumulator; import org.apache.flink.api.common.functions.RuntimeContext; -import org.apache.flink.configuration.GlobalConfiguration; -import org.apache.flink.configuration.MetricOptions; -import org.apache.flink.metrics.Counter; -import org.apache.flink.metrics.Gauge; import org.slf4j.Logger; import 
org.slf4j.LoggerFactory; @@ -52,31 +33,15 @@ * which have a defined end. They are not essential during execution because metrics will also be * reported using the configured metrics reporter. */ -public class FlinkMetricContainer { - +public class FlinkMetricContainer extends FlinkMetricContainerBase { public static final String ACCUMULATOR_NAME = "__metricscontainers"; - private static final Logger LOG = LoggerFactory.getLogger(FlinkMetricContainer.class); - private static final String METRIC_KEY_SEPARATOR = - GlobalConfiguration.loadConfiguration().getString(MetricOptions.SCOPE_DELIMITER); - - private final MetricsContainerStepMap metricsContainers; private final RuntimeContext runtimeContext; - private final Map flinkCounterCache; - private final Map flinkDistributionGaugeCache; - private final Map flinkGaugeCache; public FlinkMetricContainer(RuntimeContext runtimeContext) { + super(runtimeContext.getMetricGroup()); this.runtimeContext = runtimeContext; - this.flinkCounterCache = new HashMap<>(); - this.flinkDistributionGaugeCache = new HashMap<>(); - this.flinkGaugeCache = new HashMap<>(); - this.metricsContainers = new MetricsContainerStepMap(); - } - - public MetricsContainerImpl getMetricsContainer(String stepName) { - return metricsContainers.getContainer(stepName); } /** @@ -99,125 +64,4 @@ public void registerMetricsForPipelineResult() { } metricsAccumulator.add(metricsContainers); } - - /** - * Update this container with metrics from the passed {@link MonitoringInfo}s, and send updates - * along to Flink's internal metrics framework. - */ - public void updateMetrics(String stepName, List monitoringInfos) { - getMetricsContainer(stepName).update(monitoringInfos); - updateMetrics(stepName); - } - - /** - * Update Flink's internal metrics ({@link this#flinkCounterCache}) with the latest metrics for a - * given step. 
- */ - void updateMetrics(String stepName) { - MetricResults metricResults = asAttemptedOnlyMetricResults(metricsContainers); - MetricQueryResults metricQueryResults = - metricResults.queryMetrics(MetricsFilter.builder().addStep(stepName).build()); - updateCounters(metricQueryResults.getCounters()); - updateDistributions(metricQueryResults.getDistributions()); - updateGauge(metricQueryResults.getGauges()); - } - - private void updateCounters(Iterable> counters) { - for (MetricResult metricResult : counters) { - String flinkMetricName = getFlinkMetricNameString(metricResult.getKey()); - - Long update = metricResult.getAttempted(); - - // update flink metric - Counter counter = - flinkCounterCache.computeIfAbsent( - flinkMetricName, n -> runtimeContext.getMetricGroup().counter(n)); - // Beam counters are already pre-aggregated, just update with the current value here - counter.inc(update - counter.getCount()); - } - } - - private void updateDistributions(Iterable> distributions) { - for (MetricResult metricResult : distributions) { - String flinkMetricName = getFlinkMetricNameString(metricResult.getKey()); - - DistributionResult update = metricResult.getAttempted(); - - // update flink metric - FlinkDistributionGauge gauge = flinkDistributionGaugeCache.get(flinkMetricName); - if (gauge == null) { - gauge = - runtimeContext - .getMetricGroup() - .gauge(flinkMetricName, new FlinkDistributionGauge(update)); - flinkDistributionGaugeCache.put(flinkMetricName, gauge); - } else { - gauge.update(update); - } - } - } - - private void updateGauge(Iterable> gauges) { - for (MetricResult metricResult : gauges) { - String flinkMetricName = getFlinkMetricNameString(metricResult.getKey()); - - GaugeResult update = metricResult.getAttempted(); - - // update flink metric - FlinkGauge gauge = flinkGaugeCache.get(flinkMetricName); - if (gauge == null) { - gauge = runtimeContext.getMetricGroup().gauge(flinkMetricName, new FlinkGauge(update)); - flinkGaugeCache.put(flinkMetricName, gauge); - } else { - gauge.update(update); - } - } - } - - @VisibleForTesting - static String getFlinkMetricNameString(MetricKey metricKey) { - MetricName metricName = metricKey.metricName(); - // We use only the MetricName here, the step name is already contained - // in the operator name which is passed to Flink's MetricGroup to which - // the metric with the following name will be added. - return metricName.getNamespace() + METRIC_KEY_SEPARATOR + metricName.getName(); - } - - /** Flink {@link Gauge} for {@link DistributionResult}. */ - public static class FlinkDistributionGauge implements Gauge { - - DistributionResult data; - - FlinkDistributionGauge(DistributionResult data) { - this.data = data; - } - - void update(DistributionResult data) { - this.data = data; - } - - @Override - public DistributionResult getValue() { - return data; - } - } - - /** Flink {@link Gauge} for {@link GaugeResult}. 
*/ - public static class FlinkGauge implements Gauge { - - GaugeResult data; - - FlinkGauge(GaugeResult data) { - this.data = data; - } - - void update(GaugeResult update) { - this.data = update; - } - - @Override - public Long getValue() { - return data.getValue(); - } - } } diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainerBase.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainerBase.java new file mode 100644 index 0000000000000..a9a6db47c8147 --- /dev/null +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainerBase.java @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.flink.metrics; + +import static org.apache.beam.runners.core.metrics.MetricsContainerStepMap.asAttemptedOnlyMetricResults; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.beam.model.pipeline.v1.MetricsApi; +import org.apache.beam.runners.core.metrics.MetricsContainerImpl; +import org.apache.beam.runners.core.metrics.MetricsContainerStepMap; +import org.apache.beam.sdk.metrics.DistributionResult; +import org.apache.beam.sdk.metrics.GaugeResult; +import org.apache.beam.sdk.metrics.MetricKey; +import org.apache.beam.sdk.metrics.MetricName; +import org.apache.beam.sdk.metrics.MetricQueryResults; +import org.apache.beam.sdk.metrics.MetricResult; +import org.apache.beam.sdk.metrics.MetricResults; +import org.apache.beam.sdk.metrics.MetricsFilter; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; +import org.apache.flink.configuration.GlobalConfiguration; +import org.apache.flink.configuration.MetricOptions; +import org.apache.flink.metrics.Counter; +import org.apache.flink.metrics.Gauge; +import org.apache.flink.metrics.MetricGroup; + +/** + * The base helper class for holding a {@link MetricsContainerImpl} and forwarding Beam metrics to + * Flink accumulators and metrics. The two subclasses of this base class are {@link + * FlinkMetricContainer} and {@link FlinkMetricContainerWithoutAccumulator}. The former is used when + * {@link org.apache.flink.api.common.functions.RuntimeContext Flink RuntimeContext} is available. + * The latter is used otherwise. 
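For orientation, a minimal self-contained sketch of the two forwarding rules this base class centralizes; it is illustrative only, and the "." separator is an assumption standing in for Flink's configured SCOPE_DELIMITER:

import org.apache.flink.metrics.Counter;

final class MetricForwardingSketch {
  // Assumed delimiter; the real class reads the scope delimiter from Flink's configuration.
  private static final String SEPARATOR = ".";

  // The step name is deliberately omitted: it is already part of the operator's MetricGroup scope.
  static String flinkMetricName(String namespace, String name) {
    return namespace + SEPARATOR + name;
  }

  // Beam counters arrive pre-aggregated, so only the delta since the last report is applied.
  static void forwardCounter(Counter flinkCounter, long beamAttemptedValue) {
    flinkCounter.inc(beamAttemptedValue - flinkCounter.getCount());
  }
}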
+ */ +abstract class FlinkMetricContainerBase { + + private static final String METRIC_KEY_SEPARATOR = + GlobalConfiguration.loadConfiguration().getString(MetricOptions.SCOPE_DELIMITER); + + protected final MetricsContainerStepMap metricsContainers; + private final Map flinkCounterCache; + private final Map flinkDistributionGaugeCache; + private final Map flinkGaugeCache; + private final MetricGroup metricGroup; + + public FlinkMetricContainerBase(MetricGroup metricGroup) { + this.flinkCounterCache = new HashMap<>(); + this.flinkDistributionGaugeCache = new HashMap<>(); + this.flinkGaugeCache = new HashMap<>(); + this.metricsContainers = new MetricsContainerStepMap(); + this.metricGroup = metricGroup; + } + + public MetricGroup getMetricGroup() { + return metricGroup; + } + + public MetricsContainerImpl getMetricsContainer(String stepName) { + return metricsContainers.getContainer(stepName); + } + + /** + * Update this container with metrics from the passed {@link MetricsApi.MonitoringInfo}s, and send + * updates along to Flink's internal metrics framework. + */ + public void updateMetrics(String stepName, List monitoringInfos) { + getMetricsContainer(stepName).update(monitoringInfos); + updateMetrics(stepName); + } + + /** + * Update Flink's internal metrics ({@link this#flinkCounterCache}) with the latest metrics for a + * given step. + */ + void updateMetrics(String stepName) { + MetricResults metricResults = asAttemptedOnlyMetricResults(metricsContainers); + MetricQueryResults metricQueryResults = + metricResults.queryMetrics(MetricsFilter.builder().addStep(stepName).build()); + updateCounters(metricQueryResults.getCounters()); + updateDistributions(metricQueryResults.getDistributions()); + updateGauge(metricQueryResults.getGauges()); + } + + private void updateCounters(Iterable> counters) { + for (MetricResult metricResult : counters) { + String flinkMetricName = getFlinkMetricNameString(metricResult.getKey()); + + Long update = metricResult.getAttempted(); + + // update flink metric + Counter counter = + flinkCounterCache.computeIfAbsent(flinkMetricName, n -> getMetricGroup().counter(n)); + // Beam counters are already pre-aggregated, just update with the current value here + counter.inc(update - counter.getCount()); + } + } + + private void updateDistributions(Iterable> distributions) { + for (MetricResult metricResult : distributions) { + String flinkMetricName = getFlinkMetricNameString(metricResult.getKey()); + + DistributionResult update = metricResult.getAttempted(); + + // update flink metric + FlinkDistributionGauge gauge = flinkDistributionGaugeCache.get(flinkMetricName); + if (gauge == null) { + gauge = getMetricGroup().gauge(flinkMetricName, new FlinkDistributionGauge(update)); + flinkDistributionGaugeCache.put(flinkMetricName, gauge); + } else { + gauge.update(update); + } + } + } + + private void updateGauge(Iterable> gauges) { + for (MetricResult metricResult : gauges) { + String flinkMetricName = getFlinkMetricNameString(metricResult.getKey()); + + GaugeResult update = metricResult.getAttempted(); + + // update flink metric + FlinkGauge gauge = flinkGaugeCache.get(flinkMetricName); + if (gauge == null) { + gauge = getMetricGroup().gauge(flinkMetricName, new FlinkGauge(update)); + flinkGaugeCache.put(flinkMetricName, gauge); + } else { + gauge.update(update); + } + } + } + + @VisibleForTesting + static String getFlinkMetricNameString(MetricKey metricKey) { + MetricName metricName = metricKey.metricName(); + // We use only the MetricName here, the step name is already 
contained + // in the operator name which is passed to Flink's MetricGroup to which + // the metric with the following name will be added. + return metricName.getNamespace() + METRIC_KEY_SEPARATOR + metricName.getName(); + } + + /** Flink {@link Gauge} for {@link DistributionResult}. */ + public static class FlinkDistributionGauge implements Gauge { + + DistributionResult data; + + FlinkDistributionGauge(DistributionResult data) { + this.data = data; + } + + void update(DistributionResult data) { + this.data = data; + } + + @Override + public DistributionResult getValue() { + return data; + } + } + + /** Flink {@link Gauge} for {@link GaugeResult}. */ + public static class FlinkGauge implements Gauge { + + GaugeResult data; + + FlinkGauge(GaugeResult data) { + this.data = data; + } + + void update(GaugeResult update) { + this.data = update; + } + + @Override + public Long getValue() { + return data.getValue(); + } + } +} diff --git a/.test-infra/jenkins/job_PreCommit_Java_Examples_Dataflow.groovy b/runners/flink/src/main/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainerWithoutAccumulator.java similarity index 55% rename from .test-infra/jenkins/job_PreCommit_Java_Examples_Dataflow.groovy rename to runners/flink/src/main/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainerWithoutAccumulator.java index 109456b3bc4fc..88d52273108ac 100644 --- a/.test-infra/jenkins/job_PreCommit_Java_Examples_Dataflow.groovy +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainerWithoutAccumulator.java @@ -15,29 +15,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +package org.apache.beam.runners.flink.metrics; -import PrecommitJobBuilder +import org.apache.flink.metrics.MetricGroup; -PrecommitJobBuilder builder = new PrecommitJobBuilder( - scope: this, - nameBase: 'Java_Examples_Dataflow', - gradleTask: ':javaExamplesDataflowPreCommit', - gradleSwitches: [ - '-PdisableSpotlessCheck=true', - '-PdisableCheckStyle=true' - ], // spotless checked in separate pre-commit - triggerPathPatterns: [ - '^model/.*$', - '^sdks/java/.*$', - '^runners/google-cloud-dataflow-java/.*$', - '^examples/java/.*$', - '^examples/kotlin/.*$', - '^release/.*$', - ], - timeoutMins: 60, - ) -builder.build { - publishers { - archiveJunit('**/build/test-results/**/*.xml') +/** + * A helper class for holding a {@link + * org.apache.beam.runners.core.metrics.MetricsContainerImpl MetricsContainerImpl} and forwarding + * Beam metrics to Flink metrics. This class is used when a {@link + * org.apache.flink.api.common.functions.RuntimeContext Flink RuntimeContext} is not available. 
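A minimal usage sketch, assuming only a Flink MetricGroup is in scope (this mirrors the wiring FlinkSourceReaderBase gains later in this patch):

import org.apache.beam.runners.core.metrics.MetricsContainerImpl;
import org.apache.flink.metrics.MetricGroup;

final class SourceMetricsSketch {
  // Obtain a per-step Beam metrics container where no RuntimeContext exists, e.g. in a SourceReader.
  static MetricsContainerImpl metricsFor(MetricGroup metricGroup, String stepName) {
    return new FlinkMetricContainerWithoutAccumulator(metricGroup).getMetricsContainer(stepName);
  }
}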
+ * + * @see FlinkMetricContainer + */ +public class FlinkMetricContainerWithoutAccumulator extends FlinkMetricContainerBase { + public FlinkMetricContainerWithoutAccumulator(MetricGroup metricGroup) { + super(metricGroup); } } diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/metrics/ReaderInvocationUtil.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/metrics/ReaderInvocationUtil.java index 736a2dd9da59a..60b84e63263f7 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/metrics/ReaderInvocationUtil.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/metrics/ReaderInvocationUtil.java @@ -33,11 +33,11 @@ public class ReaderInvocationUtil> { private final String stepName; - private final FlinkMetricContainer container; + private final FlinkMetricContainerBase container; private final Boolean enableMetrics; public ReaderInvocationUtil( - String stepName, PipelineOptions options, FlinkMetricContainer container) { + String stepName, PipelineOptions options, FlinkMetricContainerBase container) { FlinkPipelineOptions flinkPipelineOptions = options.as(FlinkPipelineOptions.class); this.stepName = stepName; this.enableMetrics = !flinkPipelineOptions.getDisableMetrics(); diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java index 726ffb2291880..63f5ede002420 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperator.java @@ -110,10 +110,12 @@ import org.apache.flink.streaming.api.operators.InternalTimeServiceManager; import org.apache.flink.streaming.api.operators.InternalTimer; import org.apache.flink.streaming.api.operators.InternalTimerService; +import org.apache.flink.streaming.api.operators.InternalTimerServiceImpl; import org.apache.flink.streaming.api.operators.OneInputStreamOperator; import org.apache.flink.streaming.api.operators.Output; import org.apache.flink.streaming.api.operators.Triggerable; import org.apache.flink.streaming.api.operators.TwoInputStreamOperator; +import org.apache.flink.streaming.api.operators.sorted.state.BatchExecutionInternalTimeService; import org.apache.flink.streaming.api.watermark.Watermark; import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; import org.apache.flink.streaming.runtime.tasks.ProcessingTimeService; @@ -199,6 +201,12 @@ public class DoFnOperator /** If true, we must process elements only after a checkpoint is finished. */ final boolean requiresStableInput; + /** + * If both requiresStableInput and this parameter are true, we must flush the buffer during the + * drain operation. 
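The getEnableStableInputDrain() accessor referenced below implies the usual Beam getter/setter pair on FlinkPipelineOptions; a hypothetical configuration enabling the new behavior (setter name inferred from the getter) would be:

FlinkPipelineOptions options = FlinkPipelineOptions.defaults();
// Keep @RequiresStableInput buffering, but allow a best-effort flush when the job is drained.
options.setEnableStableInputDrain(true);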
+ */ + final boolean enableStableInputDrain; + final int numConcurrentCheckpoints; private final boolean usesOnWindowExpiration; @@ -323,6 +331,8 @@ public DoFnOperator( + Math.max(0, flinkOptions.getMinPauseBetweenCheckpoints())); } + this.enableStableInputDrain = flinkOptions.getEnableStableInputDrain(); + this.numConcurrentCheckpoints = flinkOptions.getNumConcurrentCheckpoints(); this.finishBundleBeforeCheckpointing = flinkOptions.getFinishBundleBeforeCheckpointing(); @@ -626,6 +636,12 @@ void flushData() throws Exception { while (bundleStarted) { invokeFinishBundle(); } + if (requiresStableInput && enableStableInputDrain) { + // Flush any buffered events here before draining the pipeline. Note that this is best-effort + // and the requiresStableInput contract might be violated in cases where buffer processing fails. + bufferingDoFnRunner.checkpointCompleted(Long.MAX_VALUE); + updateOutputWatermark(); + } if (currentOutputWatermark < Long.MAX_VALUE) { throw new RuntimeException( String.format( @@ -672,6 +688,7 @@ protected final void setBundleFinishedCallback(Runnable callback) { @Override public final void processElement(StreamRecord> streamRecord) { checkInvokeStartBundle(); + LOG.trace("Processing element {} in {}", streamRecord.getValue().getValue(), doFn.getClass()); long oldHold = keyCoder != null ? keyedStateInternals.minWatermarkHoldMs() : -1L; doFnRunner.processElement(streamRecord.getValue()); checkInvokeFinishBundleByCount(); @@ -754,6 +771,7 @@ public final void processElement2(StreamRecord streamRecord) thro @Override public final void processWatermark(Watermark mark) throws Exception { + LOG.trace("Processing watermark {} in {}", mark.getTimestamp(), doFn.getClass()); processWatermark1(mark); } @@ -1442,8 +1460,10 @@ private void populateOutputTimestampQueue(InternalTimerService timerS BiConsumerWithException consumer = (timerData, stamp) -> keyedStateInternals.addWatermarkHoldUsage(timerData.getOutputTimestamp()); - timerService.forEachEventTimeTimer(consumer); - timerService.forEachProcessingTimeTimer(consumer); + if (timerService instanceof InternalTimerServiceImpl) { + timerService.forEachEventTimeTimer(consumer); + timerService.forEachProcessingTimeTimer(consumer); + } } private String constructTimerId(String timerFamilyId, String timerId) { @@ -1494,6 +1514,7 @@ public void setTimer(TimerData timer) { } private void registerTimer(TimerData timer, String contextTimerId) throws Exception { + LOG.debug("Registering timer {}", timer); pendingTimersById.put(contextTimerId, timer); long time = timer.getTimestamp().getMillis(); switch (timer.getDomain()) { @@ -1604,7 +1625,31 @@ public Instant currentProcessingTime() { @Override public Instant currentInputWatermarkTime() { - return new Instant(getEffectiveInputWatermark()); + if (timerService instanceof BatchExecutionInternalTimeService) { + // In batch mode, this method will only return either BoundedWindow.TIMESTAMP_MIN_VALUE + // or BoundedWindow.TIMESTAMP_MAX_VALUE. + // + // For batch execution mode, the currentInputWatermark variable will never be updated + // until all the records are processed. However, every time a record with a new + // key arrives, the Flink timer service watermark will be set to + // MAX_WATERMARK (Long.MAX_VALUE) so that all the timers associated with the current + // key can fire. After that the Flink timer service watermark will be reset to + // Long.MIN_VALUE, so the next key will start from a fresh environment, as if the previous + // records of a different key never existed. 
The watermark is therefore either Long.MIN_VALUE + // or Long.MAX_VALUE, so we should just use the Flink timer service watermark in batch mode. + // + // In Flink the watermark ranges from + // [Long.MIN_VALUE (-9223372036854775808), Long.MAX_VALUE (9223372036854775807)] while the + // Beam watermark range is [BoundedWindow.TIMESTAMP_MIN_VALUE (-9223372036854775), + // BoundedWindow.TIMESTAMP_MAX_VALUE (9223372036854775)]. To ensure the timestamps visible to + // users follow the Beam convention, we just use the Beam range instead. + return timerService.currentWatermark() == Long.MAX_VALUE + ? new Instant(Long.MAX_VALUE) + : BoundedWindow.TIMESTAMP_MIN_VALUE; + } else { + return new Instant(getEffectiveInputWatermark()); + } } @Override diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItem.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItem.java index c4d82cb5c8ad0..6f2f473feddce 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItem.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/SingletonKeyedWorkItem.java @@ -51,4 +51,9 @@ public Iterable timersIterable() { public Iterable> elementsIterable() { return Collections.singletonList(value); } + + @Override + public String toString() { + return String.format("{%s, [%s]}", key, value); + } } diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/FlinkSource.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/FlinkSource.java index c001b263340cb..0b9fdd9dcd7c7 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/FlinkSource.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/FlinkSource.java @@ -44,6 +44,8 @@ */ public abstract class FlinkSource implements Source, Map>>> { + + protected final String stepName; protected final org.apache.beam.sdk.io.Source beamSource; protected final Boundedness boundedness; protected final SerializablePipelineOptions serializablePipelineOptions; @@ -53,18 +55,20 @@ public abstract class FlinkSource // ----------------- public static methods to construct sources -------------------- public static FlinkBoundedSource bounded( + String stepName, BoundedSource boundedSource, SerializablePipelineOptions serializablePipelineOptions, int numSplits) { return new FlinkBoundedSource<>( - boundedSource, serializablePipelineOptions, Boundedness.BOUNDED, numSplits); + stepName, boundedSource, serializablePipelineOptions, Boundedness.BOUNDED, numSplits); } public static FlinkUnboundedSource unbounded( + String stepName, UnboundedSource source, SerializablePipelineOptions serializablePipelineOptions, int numSplits) { - return new FlinkUnboundedSource<>(source, serializablePipelineOptions, numSplits); + return new FlinkUnboundedSource<>(stepName, source, serializablePipelineOptions, numSplits); } public static FlinkBoundedSource unboundedImpulse(long shutdownSourceAfterIdleMs) { @@ -77,6 +81,7 @@ public static FlinkBoundedSource unboundedImpulse(long shutdownSourceAft // BeamImpulseSource will be discarded after the impulse emission. So the streaming // job won't see another impulse after failover. 
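A short construction sketch with placeholder variables: each factory method now takes the transform's step name as its first argument, so metrics recorded while reading can be attributed to that step:

FlinkBoundedSource<byte[]> bounded =
    FlinkSource.bounded("Read-MyBoundedSource", myBoundedSource, serializedOptions, 4);
FlinkUnboundedSource<byte[]> unbounded =
    FlinkSource.unbounded("Read-MyUnboundedSource", myUnboundedSource, serializedOptions, 4);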
return new FlinkBoundedSource<>( + "Impulse", new BeamImpulseSource(), new SerializablePipelineOptions(flinkPipelineOptions), Boundedness.CONTINUOUS_UNBOUNDED, @@ -86,6 +91,7 @@ record -> Watermark.MAX_WATERMARK.getTimestamp()); public static FlinkBoundedSource boundedImpulse() { return new FlinkBoundedSource<>( + "Impulse", new BeamImpulseSource(), new SerializablePipelineOptions(FlinkPipelineOptions.defaults()), Boundedness.BOUNDED, @@ -96,10 +102,12 @@ record -> Watermark.MAX_WATERMARK.getTimestamp()); // ------ Common implementations for both bounded and unbounded source --------- protected FlinkSource( + String stepName, org.apache.beam.sdk.io.Source beamSource, SerializablePipelineOptions serializablePipelineOptions, Boundedness boundedness, int numSplits) { + this.stepName = stepName; this.beamSource = beamSource; this.serializablePipelineOptions = serializablePipelineOptions; this.boundedness = boundedness; diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/FlinkSourceReaderBase.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/FlinkSourceReaderBase.java index 27b84910ac270..f0b93e0dde0f7 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/FlinkSourceReaderBase.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/FlinkSourceReaderBase.java @@ -39,6 +39,8 @@ import javax.annotation.Nonnull; import javax.annotation.Nullable; import org.apache.beam.runners.flink.FlinkPipelineOptions; +import org.apache.beam.runners.flink.metrics.FlinkMetricContainerWithoutAccumulator; +import org.apache.beam.runners.flink.metrics.ReaderInvocationUtil; import org.apache.beam.runners.flink.translation.wrappers.streaming.io.source.compat.FlinkSourceCompat; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.io.BoundedSource; @@ -87,6 +89,7 @@ public abstract class FlinkSourceReaderBase protected final SourceReaderContext context; private final ScheduledExecutorService executor; + protected final ReaderInvocationUtil> invocationUtil; protected final Counter numRecordsInCounter; protected final long idleTimeoutMs; private final CompletableFuture idleTimeoutFuture; @@ -96,10 +99,12 @@ public abstract class FlinkSourceReaderBase private boolean noMoreSplits; protected FlinkSourceReaderBase( + String stepName, SourceReaderContext context, PipelineOptions pipelineOptions, @Nullable Function timestampExtractor) { this( + stepName, Executors.newSingleThreadScheduledExecutor( r -> new Thread(r, "FlinkSource-Executor-Thread-" + context.getIndexOfSubtask())), context, @@ -108,6 +113,7 @@ protected FlinkSourceReaderBase( } protected FlinkSourceReaderBase( + String stepName, ScheduledExecutorService executor, SourceReaderContext context, PipelineOptions pipelineOptions, @@ -126,6 +132,9 @@ protected FlinkSourceReaderBase( // TODO: Remove the casting and use SourceReaderMetricGroup after minimum FLink version is // upgraded to 1.14 and above. 
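The invocationUtil created here routes reader metrics through the accumulator-free container; an illustrative read loop (process() is a hypothetical element consumer) looks like:

boolean hasNext = invocationUtil.invokeStart(reader);
while (hasNext) {
  process(reader.getCurrent()); // hypothetical consumer of the current element
  hasNext = invocationUtil.invokeAdvance(reader);
}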
this.numRecordsInCounter = FlinkSourceCompat.getNumRecordsInCounter(context); + FlinkMetricContainerWithoutAccumulator metricsContainer = + new FlinkMetricContainerWithoutAccumulator(context.metricGroup()); + this.invocationUtil = new ReaderInvocationUtil<>(stepName, pipelineOptions, metricsContainer); } @Override @@ -368,10 +377,10 @@ public SourceOutput getAndMaybeCreateSplitOutput(ReaderOutput public boolean startOrAdvance() throws IOException { if (started) { - return reader.advance(); + return invocationUtil.invokeAdvance(reader); } else { started = true; - return reader.start(); + return invocationUtil.invokeStart(reader); } } diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/bounded/FlinkBoundedSource.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/bounded/FlinkBoundedSource.java index c2bd904dcc603..ab9a6cc03cd59 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/bounded/FlinkBoundedSource.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/bounded/FlinkBoundedSource.java @@ -41,20 +41,22 @@ public class FlinkBoundedSource extends FlinkSource> { protected final @Nullable TimestampExtractor> timestampExtractor; public FlinkBoundedSource( + String stepName, BoundedSource beamSource, SerializablePipelineOptions serializablePipelineOptions, Boundedness boundedness, int numSplits) { - this(beamSource, serializablePipelineOptions, boundedness, numSplits, null); + this(stepName, beamSource, serializablePipelineOptions, boundedness, numSplits, null); } public FlinkBoundedSource( + String stepName, BoundedSource beamSource, SerializablePipelineOptions serializablePipelineOptions, Boundedness boundedness, int numSplits, @Nullable TimestampExtractor> timestampExtractor) { - super(beamSource, serializablePipelineOptions, boundedness, numSplits); + super(stepName, beamSource, serializablePipelineOptions, boundedness, numSplits); this.timestampExtractor = timestampExtractor; } @@ -62,6 +64,6 @@ public FlinkBoundedSource( public SourceReader, FlinkSourceSplit> createReader( SourceReaderContext readerContext) throws Exception { return new FlinkBoundedSourceReader<>( - readerContext, serializablePipelineOptions.get(), timestampExtractor); + stepName, readerContext, serializablePipelineOptions.get(), timestampExtractor); } } diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/bounded/FlinkBoundedSourceReader.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/bounded/FlinkBoundedSourceReader.java index 7fb5fcc714c91..b015b527aa45a 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/bounded/FlinkBoundedSourceReader.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/bounded/FlinkBoundedSourceReader.java @@ -54,20 +54,22 @@ public class FlinkBoundedSourceReader extends FlinkSourceReaderBase, Long> timestampExtractor) { - super(context, pipelineOptions, timestampExtractor); + super(stepName, context, pipelineOptions, timestampExtractor); currentSplitId = -1; } @VisibleForTesting protected FlinkBoundedSourceReader( + String stepName, SourceReaderContext context, PipelineOptions pipelineOptions, ScheduledExecutorService executor, @Nullable 
Function, Long> timestampExtractor) { - super(executor, context, pipelineOptions, timestampExtractor); + super(stepName, executor, context, pipelineOptions, timestampExtractor); currentSplitId = -1; } @@ -105,7 +107,7 @@ public InputStatus pollNext(ReaderOutput> output) throws Except // If the advance() invocation throws exception here, the job will just fail over and read // everything again from // the beginning. So the failover granularity is the entire Flink job. - if (!tempCurrentReader.advance()) { + if (!invocationUtil.invokeAdvance(tempCurrentReader)) { finishSplit(currentSplitId); currentReader = null; currentSplitId = -1; @@ -133,7 +135,7 @@ private boolean moveToNextNonEmptyReader() throws IOException { Optional readerAndOutput; while ((readerAndOutput = createAndTrackNextReader()).isPresent()) { ReaderAndOutput rao = readerAndOutput.get(); - if (rao.reader.start()) { + if (invocationUtil.invokeStart(rao.reader)) { currentSplitId = Integer.parseInt(rao.splitId); currentReader = rao.reader; return true; diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/unbounded/FlinkUnboundedSource.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/unbounded/FlinkUnboundedSource.java index b404922017005..8ef2edfa606e9 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/unbounded/FlinkUnboundedSource.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/unbounded/FlinkUnboundedSource.java @@ -40,18 +40,25 @@ public class FlinkUnboundedSource extends FlinkSource beamSource, SerializablePipelineOptions serializablePipelineOptions, int numSplits) { - this(beamSource, serializablePipelineOptions, numSplits, null); + this(stepName, beamSource, serializablePipelineOptions, numSplits, null); } public FlinkUnboundedSource( + String stepName, UnboundedSource beamSource, SerializablePipelineOptions serializablePipelineOptions, int numSplits, @Nullable TimestampExtractor>> timestampExtractor) { - super(beamSource, serializablePipelineOptions, Boundedness.CONTINUOUS_UNBOUNDED, numSplits); + super( + stepName, + beamSource, + serializablePipelineOptions, + Boundedness.CONTINUOUS_UNBOUNDED, + numSplits); this.timestampExtractor = timestampExtractor; } @@ -59,6 +66,6 @@ public FlinkUnboundedSource( public SourceReader>, FlinkSourceSplit> createReader( SourceReaderContext readerContext) throws Exception { return new FlinkUnboundedSourceReader<>( - readerContext, serializablePipelineOptions.get(), timestampExtractor); + stepName, readerContext, serializablePipelineOptions.get(), timestampExtractor); } } diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/unbounded/FlinkUnboundedSourceReader.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/unbounded/FlinkUnboundedSourceReader.java index 8f3595b9729d1..04726990295c6 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/unbounded/FlinkUnboundedSourceReader.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/unbounded/FlinkUnboundedSourceReader.java @@ -69,10 +69,11 @@ public class FlinkUnboundedSourceReader private volatile boolean shouldEmitWatermark; public FlinkUnboundedSourceReader( + String stepName, 
SourceReaderContext context, PipelineOptions pipelineOptions, @Nullable Function>, Long> timestampExtractor) { - super(context, pipelineOptions, timestampExtractor); + super(stepName, context, pipelineOptions, timestampExtractor); this.readers = new ArrayList<>(); this.dataAvailableFutureRef = new AtomicReference<>(DUMMY_FUTURE); this.currentReaderIndex = 0; @@ -80,11 +81,12 @@ public FlinkUnboundedSourceReader( @VisibleForTesting protected FlinkUnboundedSourceReader( + String stepName, SourceReaderContext context, PipelineOptions pipelineOptions, ScheduledExecutorService executor, @Nullable Function>, Long> timestampExtractor) { - super(executor, context, pipelineOptions, timestampExtractor); + super(stepName, executor, context, pipelineOptions, timestampExtractor); this.readers = new ArrayList<>(); this.dataAvailableFutureRef = new AtomicReference<>(DUMMY_FUTURE); this.currentReaderIndex = 0; diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkExecutionEnvironmentsTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkExecutionEnvironmentsTest.java index 49d317d46ced8..ec44d279586d9 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkExecutionEnvironmentsTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkExecutionEnvironmentsTest.java @@ -29,6 +29,8 @@ import java.io.InputStream; import java.net.InetSocketAddress; import java.nio.file.Files; +import java.util.Arrays; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.Map; @@ -49,17 +51,33 @@ import org.junit.Test; import org.junit.rules.ExpectedException; import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; import org.powermock.reflect.Whitebox; /** Tests for {@link FlinkExecutionEnvironments}. 
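The parameterization below runs every test under both execution paths; the equivalent end-user switch is the new pipeline option, e.g. (flag name derived from the getter, shown as an assumption):

FlinkPipelineOptions options =
    PipelineOptionsFactory.fromArgs("--runner=FlinkRunner", "--useDataStreamForBatch=true")
        .as(FlinkPipelineOptions.class);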
*/ +@RunWith(Parameterized.class) public class FlinkExecutionEnvironmentsTest { @Rule public TemporaryFolder temporaryFolder = new TemporaryFolder(); @Rule public ExpectedException expectedException = ExpectedException.none(); + @Parameterized.Parameter public boolean useDataStreamForBatch; + + @Parameterized.Parameters(name = "UseDataStreamForBatch = {0}") + public static Collection useDataStreamForBatchJobValues() { + return Arrays.asList(new Object[][] {{false}, {true}}); + } + + private FlinkPipelineOptions getDefaultPipelineOptions() { + FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + options.setUseDataStreamForBatch(useDataStreamForBatch); + return options; + } + @Test public void shouldSetParallelismBatch() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(TestFlinkRunner.class); options.setParallelism(42); @@ -71,7 +89,7 @@ public void shouldSetParallelismBatch() { @Test public void shouldSetParallelismStreaming() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(TestFlinkRunner.class); options.setParallelism(42); @@ -84,7 +102,7 @@ public void shouldSetParallelismStreaming() { @Test public void shouldSetMaxParallelismStreaming() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(TestFlinkRunner.class); options.setMaxParallelism(42); @@ -99,7 +117,7 @@ public void shouldSetMaxParallelismStreaming() { public void shouldInferParallelismFromEnvironmentBatch() throws IOException { String flinkConfDir = extractFlinkConfig(); - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(TestFlinkRunner.class); options.setFlinkMaster("host:80"); @@ -115,7 +133,7 @@ public void shouldInferParallelismFromEnvironmentBatch() throws IOException { public void shouldInferParallelismFromEnvironmentStreaming() throws IOException { String confDir = extractFlinkConfig(); - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(TestFlinkRunner.class); options.setFlinkMaster("host:80"); @@ -129,7 +147,7 @@ public void shouldInferParallelismFromEnvironmentStreaming() throws IOException @Test public void shouldFallbackToDefaultParallelismBatch() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(TestFlinkRunner.class); options.setFlinkMaster("host:80"); @@ -141,7 +159,7 @@ public void shouldFallbackToDefaultParallelismBatch() { @Test public void shouldFallbackToDefaultParallelismStreaming() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(TestFlinkRunner.class); options.setFlinkMaster("host:80"); @@ -154,7 +172,7 @@ public void shouldFallbackToDefaultParallelismStreaming() { @Test public void useDefaultParallelismFromContextBatch() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(TestFlinkRunner.class); ExecutionEnvironment bev = FlinkExecutionEnvironments.createBatchExecutionEnvironment(options); @@ -166,7 +184,7 @@ public void 
useDefaultParallelismFromContextBatch() { @Test public void useDefaultParallelismFromContextStreaming() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(TestFlinkRunner.class); StreamExecutionEnvironment sev = @@ -179,7 +197,7 @@ public void useDefaultParallelismFromContextStreaming() { @Test public void shouldParsePortForRemoteEnvironmentBatch() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(FlinkRunner.class); options.setFlinkMaster("host:1234"); @@ -191,7 +209,7 @@ public void shouldParsePortForRemoteEnvironmentBatch() { @Test public void shouldParsePortForRemoteEnvironmentStreaming() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(FlinkRunner.class); options.setFlinkMaster("host:1234"); @@ -204,7 +222,7 @@ public void shouldParsePortForRemoteEnvironmentStreaming() { @Test public void shouldAllowPortOmissionForRemoteEnvironmentBatch() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(FlinkRunner.class); options.setFlinkMaster("host"); @@ -216,7 +234,7 @@ public void shouldAllowPortOmissionForRemoteEnvironmentBatch() { @Test public void shouldAllowPortOmissionForRemoteEnvironmentStreaming() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(FlinkRunner.class); options.setFlinkMaster("host"); @@ -229,7 +247,7 @@ public void shouldAllowPortOmissionForRemoteEnvironmentStreaming() { @Test public void shouldTreatAutoAndEmptyHostTheSameBatch() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(FlinkRunner.class); ExecutionEnvironment sev = FlinkExecutionEnvironments.createBatchExecutionEnvironment(options); @@ -243,7 +261,7 @@ public void shouldTreatAutoAndEmptyHostTheSameBatch() { @Test public void shouldTreatAutoAndEmptyHostTheSameStreaming() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(FlinkRunner.class); StreamExecutionEnvironment sev = @@ -259,7 +277,7 @@ public void shouldTreatAutoAndEmptyHostTheSameStreaming() { @Test public void shouldDetectMalformedPortBatch() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(FlinkRunner.class); options.setFlinkMaster("host:p0rt"); @@ -271,7 +289,7 @@ public void shouldDetectMalformedPortBatch() { @Test public void shouldDetectMalformedPortStreaming() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(FlinkRunner.class); options.setFlinkMaster("host:p0rt"); @@ -283,7 +301,7 @@ public void shouldDetectMalformedPortStreaming() { @Test public void shouldSupportIPv4Batch() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(FlinkRunner.class); options.setFlinkMaster("192.168.1.1:1234"); @@ -297,7 +315,7 @@ public void shouldSupportIPv4Batch() { @Test public void 
shouldSupportIPv4Streaming() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(FlinkRunner.class); options.setFlinkMaster("192.168.1.1:1234"); @@ -311,7 +329,7 @@ public void shouldSupportIPv4Streaming() { @Test public void shouldSupportIPv6Batch() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(FlinkRunner.class); options.setFlinkMaster("[FE80:CD00:0000:0CDE:1257:0000:211E:729C]:1234"); @@ -326,7 +344,7 @@ public void shouldSupportIPv6Batch() { @Test public void shouldSupportIPv6Streaming() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(FlinkRunner.class); options.setFlinkMaster("[FE80:CD00:0000:0CDE:1257:0000:211E:729C]:1234"); @@ -342,7 +360,7 @@ public void shouldSupportIPv6Streaming() { @Test public void shouldRemoveHttpProtocolFromHostBatch() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(FlinkRunner.class); for (String flinkMaster : @@ -358,7 +376,7 @@ public void shouldRemoveHttpProtocolFromHostBatch() { @Test public void shouldRemoveHttpProtocolFromHostStreaming() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(FlinkRunner.class); for (String flinkMaster : @@ -382,7 +400,7 @@ private String extractFlinkConfig() throws IOException { @Test public void shouldAutoSetIdleSourcesFlagWithoutCheckpointing() { // Checkpointing disabled, shut down sources immediately - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); FlinkExecutionEnvironments.createStreamExecutionEnvironment(options); assertThat(options.getShutdownSourcesAfterIdleMs(), is(0L)); } @@ -390,7 +408,7 @@ public void shouldAutoSetIdleSourcesFlagWithoutCheckpointing() { @Test public void shouldAutoSetIdleSourcesFlagWithCheckpointing() { // Checkpointing is enabled, never shut down sources - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setCheckpointingInterval(1000L); FlinkExecutionEnvironments.createStreamExecutionEnvironment(options); assertThat(options.getShutdownSourcesAfterIdleMs(), is(Long.MAX_VALUE)); @@ -399,7 +417,7 @@ public void shouldAutoSetIdleSourcesFlagWithCheckpointing() { @Test public void shouldAcceptExplicitlySetIdleSourcesFlagWithoutCheckpointing() { // Checkpointing disabled, accept flag - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setShutdownSourcesAfterIdleMs(42L); FlinkExecutionEnvironments.createStreamExecutionEnvironment(options); assertThat(options.getShutdownSourcesAfterIdleMs(), is(42L)); @@ -408,7 +426,7 @@ public void shouldAcceptExplicitlySetIdleSourcesFlagWithoutCheckpointing() { @Test public void shouldAcceptExplicitlySetIdleSourcesFlagWithCheckpointing() { // Checkpointing enable, still accept flag - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setCheckpointingInterval(1000L); options.setShutdownSourcesAfterIdleMs(42L); 
FlinkExecutionEnvironments.createStreamExecutionEnvironment(options); @@ -418,7 +436,7 @@ public void shouldAcceptExplicitlySetIdleSourcesFlagWithCheckpointing() { @Test public void shouldSetSavepointRestoreForRemoteStreaming() { String path = "fakePath"; - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(TestFlinkRunner.class); options.setFlinkMaster("host:80"); options.setSavepointPath(path); @@ -432,7 +450,7 @@ public void shouldSetSavepointRestoreForRemoteStreaming() { @Test public void shouldFailOnUnknownStateBackend() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setStreaming(true); options.setStateBackend("unknown"); options.setStateBackendStoragePath("/path"); @@ -445,7 +463,7 @@ public void shouldFailOnUnknownStateBackend() { @Test public void shouldFailOnNoStoragePathProvided() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setStreaming(true); options.setStateBackend("unknown"); @@ -457,7 +475,7 @@ public void shouldFailOnNoStoragePathProvided() { @Test public void shouldCreateFileSystemStateBackend() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setStreaming(true); options.setStateBackend("fileSystem"); options.setStateBackendStoragePath(temporaryFolder.getRoot().toURI().toString()); @@ -470,7 +488,7 @@ public void shouldCreateFileSystemStateBackend() { @Test public void shouldCreateRocksDbStateBackend() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setStreaming(true); options.setStateBackend("rocksDB"); options.setStateBackendStoragePath(temporaryFolder.getRoot().toURI().toString()); diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironmentTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironmentTest.java index d8c4c6f6c8ecb..676e35d4bc0fe 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironmentTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironmentTest.java @@ -28,6 +28,7 @@ import static org.hamcrest.core.Every.everyItem; import static org.junit.Assert.assertThrows; import static org.junit.Assert.fail; +import static org.junit.Assume.assumeFalse; import java.io.ByteArrayOutputStream; import java.io.File; @@ -38,6 +39,8 @@ import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; import java.util.List; import java.util.stream.Collectors; import org.apache.beam.runners.core.construction.PTransformMatchers; @@ -68,13 +71,13 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; +import org.junit.runners.Parameterized; import org.mockito.ArgumentCaptor; import org.mockito.Mockito; import org.powermock.reflect.Whitebox; /** Tests for {@link FlinkPipelineExecutionEnvironment}. 
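For context on what this parameterization exercises, the runner's environment choice is roughly (a sketch, not the actual translator code):

boolean useStreamEnvironment = options.isStreaming() || options.getUseDataStreamForBatch();
// true  -> StreamExecutionEnvironment (DataStream API; also used for batch when the flag is set)
// false -> ExecutionEnvironment (legacy DataSet API)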
*/ -@RunWith(JUnit4.class) +@RunWith(Parameterized.class) @SuppressWarnings({ "rawtypes", // TODO(https://github.com/apache/beam/issues/20447) }) @@ -82,9 +85,22 @@ public class FlinkPipelineExecutionEnvironmentTest implements Serializable { @Rule public transient TemporaryFolder tmpFolder = new TemporaryFolder(); + @Parameterized.Parameter public boolean useDataStreamForBatch; + + @Parameterized.Parameters(name = "UseDataStreamForBatch = {0}") + public static Collection useDataStreamForBatchJobValues() { + return Arrays.asList(new Object[][] {{false}, {true}}); + } + + private FlinkPipelineOptions getDefaultPipelineOptions() { + FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + options.setUseDataStreamForBatch(useDataStreamForBatch); + return options; + } + @Test public void shouldRecognizeAndTranslateStreamingPipeline() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(TestFlinkRunner.class); options.setFlinkMaster("[auto]"); @@ -136,6 +152,8 @@ public void shouldNotPrepareFilesToStageWhenFlinkMasterIsSetToAuto() throws IOEx @Test public void shouldNotPrepareFilesToStagewhenFlinkMasterIsSetToCollection() throws IOException { + // StreamingExecutionEnv does not support "collection" mode. + assumeFalse(useDataStreamForBatch); FlinkPipelineOptions options = testPreparingResourcesToStage("[collection]"); assertThat(options.getFilesToStage().size(), is(2)); @@ -152,7 +170,7 @@ public void shouldNotPrepareFilesToStageWhenFlinkMasterIsSetToLocal() throws IOE @Test public void shouldUseDefaultTempLocationIfNoneSet() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(TestFlinkRunner.class); options.setFlinkMaster("clusterAddress"); @@ -168,42 +186,33 @@ public void shouldUseDefaultTempLocationIfNoneSet() { @Test public void shouldUsePreparedFilesOnRemoteEnvironment() throws Exception { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); - options.setRunner(TestFlinkRunner.class); - options.setFlinkMaster("clusterAddress"); - - FlinkPipelineExecutionEnvironment flinkEnv = new FlinkPipelineExecutionEnvironment(options); - - Pipeline pipeline = Pipeline.create(options); - flinkEnv.translate(pipeline); - - ExecutionEnvironment executionEnvironment = flinkEnv.getBatchExecutionEnvironment(); - assertThat(executionEnvironment, instanceOf(RemoteEnvironment.class)); - - List jarFiles = getJars(executionEnvironment); - - List urlConvertedStagedFiles = convertFilesToURLs(options.getFilesToStage()); - - assertThat(jarFiles, is(urlConvertedStagedFiles)); + shouldUsePreparedFilesOnRemoteStreamEnvironment(true); + shouldUsePreparedFilesOnRemoteStreamEnvironment(false); } - @Test - public void shouldUsePreparedFilesOnRemoteStreamEnvironment() throws Exception { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + public void shouldUsePreparedFilesOnRemoteStreamEnvironment(boolean streamingMode) + throws Exception { + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(TestFlinkRunner.class); options.setFlinkMaster("clusterAddress"); - options.setStreaming(true); + options.setStreaming(streamingMode); FlinkPipelineExecutionEnvironment flinkEnv = new FlinkPipelineExecutionEnvironment(options); Pipeline pipeline = Pipeline.create(options); flinkEnv.translate(pipeline); - StreamExecutionEnvironment streamExecutionEnvironment = - 
flinkEnv.getStreamExecutionEnvironment(); - assertThat(streamExecutionEnvironment, instanceOf(RemoteStreamEnvironment.class)); - - List jarFiles = getJars(streamExecutionEnvironment); + List jarFiles; + if (streamingMode || options.getUseDataStreamForBatch()) { + StreamExecutionEnvironment streamExecutionEnvironment = + flinkEnv.getStreamExecutionEnvironment(); + assertThat(streamExecutionEnvironment, instanceOf(RemoteStreamEnvironment.class)); + jarFiles = getJars(streamExecutionEnvironment); + } else { + ExecutionEnvironment executionEnvironment = flinkEnv.getBatchExecutionEnvironment(); + assertThat(executionEnvironment, instanceOf(RemoteEnvironment.class)); + jarFiles = getJars(executionEnvironment); + } List urlConvertedStagedFiles = convertFilesToURLs(options.getFilesToStage()); @@ -214,7 +223,7 @@ public void shouldUsePreparedFilesOnRemoteStreamEnvironment() throws Exception { public void shouldUseTransformOverrides() { boolean[] testParameters = {true, false}; for (boolean streaming : testParameters) { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setStreaming(streaming); options.setRunner(FlinkRunner.class); FlinkPipelineExecutionEnvironment flinkEnv = new FlinkPipelineExecutionEnvironment(options); @@ -234,7 +243,7 @@ public void shouldUseTransformOverrides() { @Test public void shouldProvideParallelismToTransformOverrides() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setStreaming(true); options.setRunner(FlinkRunner.class); FlinkPipelineExecutionEnvironment flinkEnv = new FlinkPipelineExecutionEnvironment(options); @@ -278,7 +287,7 @@ public boolean matches(Object actual) { @Test public void shouldUseStreamingTransformOverridesWithUnboundedSources() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); // no explicit streaming mode set options.setRunner(FlinkRunner.class); FlinkPipelineExecutionEnvironment flinkEnv = new FlinkPipelineExecutionEnvironment(options); @@ -303,7 +312,7 @@ public void shouldUseStreamingTransformOverridesWithUnboundedSources() { @Test public void testTranslationModeOverrideWithUnboundedSources() { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(FlinkRunner.class); options.setStreaming(false); @@ -319,7 +328,7 @@ public void testTranslationModeOverrideWithUnboundedSources() { public void testTranslationModeNoOverrideWithoutUnboundedSources() { boolean[] testArgs = new boolean[] {true, false}; for (boolean streaming : testArgs) { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(FlinkRunner.class); options.setStreaming(streaming); @@ -408,7 +417,7 @@ private FlinkPipelineOptions testPreparingResourcesToStage( private FlinkPipelineOptions setPipelineOptions( String flinkMaster, String tempLocation, List filesToStage) { - FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + FlinkPipelineOptions options = getDefaultPipelineOptions(); options.setRunner(TestFlinkRunner.class); options.setFlinkMaster(flinkMaster); options.setTempLocation(tempLocation); diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineOptionsTest.java 
b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineOptionsTest.java index c2d9163aacc97..da8c560690a62 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineOptionsTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineOptionsTest.java @@ -94,6 +94,7 @@ public void testDefaults() { assertThat(options.getMaxBundleSize(), is(1000L)); assertThat(options.getMaxBundleTimeMills(), is(1000L)); assertThat(options.getExecutionModeForBatch(), is(ExecutionMode.PIPELINED.name())); + assertThat(options.getUseDataStreamForBatch(), is(false)); assertThat(options.getSavepointPath(), is(nullValue())); assertThat(options.getAllowNonRestoredState(), is(false)); assertThat(options.getDisableMetrics(), is(false)); diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkStreamingPipelineTranslatorTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkStreamingPipelineTranslatorTest.java index 5d56e6ddbf675..84f1dc3c64575 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkStreamingPipelineTranslatorTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkStreamingPipelineTranslatorTest.java @@ -156,7 +156,7 @@ public void testStatefulParDoAfterCombineChaining() { private JobGraph getStatefulParDoAfterCombineChainingJobGraph(boolean stablePartitioning) { final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); final FlinkStreamingPipelineTranslator translator = - new FlinkStreamingPipelineTranslator(env, PipelineOptionsFactory.create()); + new FlinkStreamingPipelineTranslator(env, PipelineOptionsFactory.create(), true); final PipelineOptions pipelineOptions = PipelineOptionsFactory.create(); pipelineOptions.setRunner(FlinkRunner.class); final Pipeline pipeline = Pipeline.create(pipelineOptions); @@ -188,7 +188,7 @@ public void testStatefulParDoAfterGroupByKeyChaining() { private JobGraph getStatefulParDoAfterGroupByKeyChainingJobGraph(boolean stablePartitioning) { final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); final FlinkStreamingPipelineTranslator translator = - new FlinkStreamingPipelineTranslator(env, PipelineOptionsFactory.create()); + new FlinkStreamingPipelineTranslator(env, PipelineOptionsFactory.create(), true); final PipelineOptions pipelineOptions = PipelineOptionsFactory.create(); pipelineOptions.setRunner(FlinkRunner.class); final Pipeline pipeline = Pipeline.create(pipelineOptions); diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslatorsTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslatorsTest.java index 451070c1c1643..d5d34b59214b4 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslatorsTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkStreamingTransformTranslatorsTest.java @@ -29,8 +29,8 @@ import java.util.Map; import org.apache.beam.runners.core.construction.PTransformTranslation; import org.apache.beam.runners.core.construction.SplittableParDo; -import org.apache.beam.runners.flink.FlinkStreamingTransformTranslators.UnboundedSourceWrapperNoValueWithRecordId; -import org.apache.beam.runners.flink.translation.wrappers.streaming.io.UnboundedSourceWrapper; +import org.apache.beam.runners.flink.translation.wrappers.streaming.io.source.FlinkSource; +import 
org.apache.beam.runners.flink.translation.wrappers.streaming.io.source.bounded.FlinkBoundedSource; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.StringUtf8Coder; @@ -49,8 +49,8 @@ import org.apache.beam.sdk.values.WindowingStrategy; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.transformations.LegacySourceTransformation; import org.apache.flink.streaming.api.transformations.OneInputTransformation; +import org.apache.flink.streaming.api.transformations.SourceTransformation; import org.checkerframework.checker.nullness.qual.Nullable; import org.junit.Test; @@ -76,11 +76,10 @@ public void readSourceTranslatorBoundedWithMaxParallelism() { Object sourceTransform = applyReadSourceTransform(transform, PCollection.IsBounded.BOUNDED, env); - UnboundedSourceWrapperNoValueWithRecordId source = - (UnboundedSourceWrapperNoValueWithRecordId) - ((LegacySourceTransformation) sourceTransform).getOperator().getUserFunction(); + FlinkBoundedSource source = + (FlinkBoundedSource) ((SourceTransformation) sourceTransform).getSource(); - assertEquals(maxParallelism, source.getUnderlyingSource().getSplitSources().size()); + assertEquals(maxParallelism, source.getNumSplits()); } @Test @@ -96,11 +95,10 @@ public void readSourceTranslatorBoundedWithoutMaxParallelism() { Object sourceTransform = applyReadSourceTransform(transform, PCollection.IsBounded.BOUNDED, env); - UnboundedSourceWrapperNoValueWithRecordId source = - (UnboundedSourceWrapperNoValueWithRecordId) - ((LegacySourceTransformation) sourceTransform).getOperator().getUserFunction(); + FlinkBoundedSource source = + (FlinkBoundedSource) ((SourceTransformation) sourceTransform).getSource(); - assertEquals(parallelism, source.getUnderlyingSource().getSplitSources().size()); + assertEquals(parallelism, source.getNumSplits()); } @Test @@ -119,13 +117,12 @@ public void readSourceTranslatorUnboundedWithMaxParallelism() { (OneInputTransformation) applyReadSourceTransform(transform, PCollection.IsBounded.UNBOUNDED, env); - UnboundedSourceWrapper source = - (UnboundedSourceWrapper) - ((LegacySourceTransformation) Iterables.getOnlyElement(sourceTransform.getInputs())) - .getOperator() - .getUserFunction(); + FlinkSource source = + (FlinkSource) + ((SourceTransformation) Iterables.getOnlyElement(sourceTransform.getInputs())) + .getSource(); - assertEquals(maxParallelism, source.getSplitSources().size()); + assertEquals(maxParallelism, source.getNumSplits()); } @Test @@ -142,13 +139,12 @@ public void readSourceTranslatorUnboundedWithoutMaxParallelism() { (OneInputTransformation) applyReadSourceTransform(transform, PCollection.IsBounded.UNBOUNDED, env); - UnboundedSourceWrapper source = - (UnboundedSourceWrapper) - ((LegacySourceTransformation) Iterables.getOnlyElement(sourceTransform.getInputs())) - .getOperator() - .getUserFunction(); + FlinkSource source = + (FlinkSource) + ((SourceTransformation) Iterables.getOnlyElement(sourceTransform.getInputs())) + .getSource(); - assertEquals(parallelism, source.getSplitSources().size()); + assertEquals(parallelism, source.getNumSplits()); } private Object applyReadSourceTransform( @@ -157,7 +153,7 @@ private Object applyReadSourceTransform( FlinkStreamingPipelineTranslator.StreamTransformTranslator> translator = getReadSourceTranslator(); FlinkStreamingTranslationContext ctx = - new 
FlinkStreamingTranslationContext(env, PipelineOptionsFactory.create()); + new FlinkStreamingTranslationContext(env, PipelineOptionsFactory.create(), true); Pipeline pipeline = Pipeline.create(); PCollection pc = diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkSubmissionTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkSubmissionTest.java index 601dbc66b1a22..b502e1129ee26 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkSubmissionTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkSubmissionTest.java @@ -72,6 +72,8 @@ public class FlinkSubmissionTest { /** Counter which keeps track of the number of jobs submitted. */ private static int expectedNumberOfJobs; + public static boolean useDataStreamForBatch; + @BeforeClass public static void beforeClass() throws Exception { Configuration config = new Configuration(); @@ -104,6 +106,12 @@ public void testSubmissionBatch() throws Exception { runSubmission(false, false); } + @Test + public void testSubmissionBatchUseDataStream() throws Exception { + FlinkSubmissionTest.useDataStreamForBatch = true; + runSubmission(false, false); + } + @Test public void testSubmissionStreaming() throws Exception { runSubmission(false, true); @@ -114,6 +122,12 @@ public void testDetachedSubmissionBatch() throws Exception { runSubmission(true, false); } + @Test + public void testDetachedSubmissionBatchUseDataStream() throws Exception { + FlinkSubmissionTest.useDataStreamForBatch = true; + runSubmission(true, false); + } + @Test public void testDetachedSubmissionStreaming() throws Exception { runSubmission(true, true); @@ -164,6 +178,7 @@ private void waitUntilJobIsCompleted() throws Exception { /** The Flink program which is executed by the CliFrontend. */ public static void main(String[] args) { FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + options.setUseDataStreamForBatch(useDataStreamForBatch); options.setRunner(FlinkRunner.class); options.setStreaming(streaming); options.setParallelism(1); diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/ReadSourceStreamingTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/ReadSourceStreamingTest.java index 8da44d4b3a83e..b8dc52f6cd4b4 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/ReadSourceStreamingTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/ReadSourceStreamingTest.java @@ -56,13 +56,19 @@ public void postSubmit() throws Exception { } @Test - public void testProgram() throws Exception { - runProgram(resultPath); + public void testStreaming() { + runProgram(resultPath, true); } - private static void runProgram(String resultPath) { + @Test + public void testBatch() { + runProgram(resultPath, false); + } + + private static void runProgram(String resultPath, boolean streaming) { - Pipeline p = FlinkTestPipeline.createForStreaming(); + Pipeline p = + streaming ? 
FlinkTestPipeline.createForStreaming() : FlinkTestPipeline.createForBatch(); p.apply(GenerateSequence.from(0).to(10)) .apply( diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainerTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainerTest.java index 0947ddda8d0b4..a93a7663c4516 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainerTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/metrics/FlinkMetricContainerTest.java @@ -37,7 +37,7 @@ import org.apache.beam.runners.core.metrics.MonitoringInfoConstants; import org.apache.beam.runners.core.metrics.MonitoringInfoMetricName; import org.apache.beam.runners.core.metrics.SimpleMonitoringInfoBuilder; -import org.apache.beam.runners.flink.metrics.FlinkMetricContainer.FlinkDistributionGauge; +import org.apache.beam.runners.flink.metrics.FlinkMetricContainerBase.FlinkDistributionGauge; import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Distribution; import org.apache.beam.sdk.metrics.DistributionResult; diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperatorTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperatorTest.java index 722d32b309c41..17cc16cc76e07 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperatorTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/DoFnOperatorTest.java @@ -2015,6 +2015,98 @@ public void finishBundle(FinishBundleContext context) { WindowedValue.valueInGlobalWindow("finishBundle"))); } + @Test + public void testExactlyOnceBufferingFlushDuringDrain() throws Exception { + FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); + options.setMaxBundleSize(2L); + options.setCheckpointingInterval(1L); + options.setEnableStableInputDrain(true); + + TupleTag outputTag = new TupleTag<>("main-output"); + WindowedValue.ValueOnlyWindowedValueCoder windowedValueCoder = + WindowedValue.getValueOnlyCoder(StringUtf8Coder.of()); + + numStartBundleCalled = 0; + DoFn doFn = + new DoFn() { + @StartBundle + public void startBundle(StartBundleContext context) { + numStartBundleCalled += 1; + } + + @ProcessElement + // Use RequiresStableInput to force buffering elements + @RequiresStableInput + public void processElement(ProcessContext context) { + context.output(context.element()); + } + + @FinishBundle + public void finishBundle(FinishBundleContext context) { + context.output( + "finishBundle", BoundedWindow.TIMESTAMP_MIN_VALUE, GlobalWindow.INSTANCE); + } + }; + + DoFnOperator.MultiOutputOutputManagerFactory outputManagerFactory = + new DoFnOperator.MultiOutputOutputManagerFactory<>( + outputTag, + WindowedValue.getFullCoder(StringUtf8Coder.of(), GlobalWindow.Coder.INSTANCE), + new SerializablePipelineOptions(options)); + + Supplier> doFnOperatorSupplier = + () -> + new DoFnOperator<>( + doFn, + "stepName", + windowedValueCoder, + Collections.emptyMap(), + outputTag, + Collections.emptyList(), + outputManagerFactory, + WindowingStrategy.globalDefault(), + new HashMap<>(), /* side-input mapping */ + Collections.emptyList(), /* side inputs */ + options, + null, + null, + DoFnSchemaInformation.create(), + Collections.emptyMap()); + + DoFnOperator doFnOperator = doFnOperatorSupplier.get(); + OneInputStreamOperatorTestHarness, 
WindowedValue> testHarness = + new OneInputStreamOperatorTestHarness<>(doFnOperator); + + testHarness.open(); + + testHarness.processElement(new StreamRecord<>(WindowedValue.valueInGlobalWindow("a"))); + testHarness.processElement(new StreamRecord<>(WindowedValue.valueInGlobalWindow("b"))); + + assertThat(Iterables.size(testHarness.getOutput()), is(0)); + assertThat(numStartBundleCalled, is(0)); + + // Simulate pipeline drain scenario + OperatorSubtaskState backup = testHarness.snapshot(0, 0); + doFnOperator.flushData(); + + assertThat(numStartBundleCalled, is(1)); + assertThat( + stripStreamRecordFromWindowedValue(testHarness.getOutput()), + contains( + WindowedValue.valueInGlobalWindow("a"), + WindowedValue.valueInGlobalWindow("b"), + WindowedValue.valueInGlobalWindow("finishBundle"))); + + doFnOperator = doFnOperatorSupplier.get(); + testHarness = new OneInputStreamOperatorTestHarness<>(doFnOperator); + testHarness.open(); + + doFnOperator.notifyCheckpointComplete(0L); + + assertThat(numStartBundleCalled, is(1)); + assertThat(stripStreamRecordFromWindowedValue(testHarness.getOutput()), emptyIterable()); + } + @Test public void testExactlyOnceBufferingKeyed() throws Exception { FlinkPipelineOptions options = FlinkPipelineOptions.defaults(); diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/TestCountingSource.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/TestCountingSource.java index 5c54ce4c44e1d..3af9062ba9b41 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/TestCountingSource.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/TestCountingSource.java @@ -29,6 +29,8 @@ import org.apache.beam.sdk.coders.KvCoder; import org.apache.beam.sdk.coders.VarIntCoder; import org.apache.beam.sdk.io.UnboundedSource; +import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.transforms.windowing.BoundedWindow; import org.apache.beam.sdk.values.KV; @@ -198,6 +200,10 @@ public boolean requiresDeduping() { */ public class CountingSourceReader extends UnboundedReader> implements TestReader { + public static final String ADVANCE_COUNTER_NAMESPACE = "testNameSpace"; + public static final String ADVANCE_COUNTER_NAME = "advanceCounter"; + private final Counter advanceCounter = + Metrics.counter(ADVANCE_COUNTER_NAMESPACE, ADVANCE_COUNTER_NAME); private int current; private boolean closed; @@ -213,6 +219,7 @@ public boolean start() { @Override public boolean advance() { + advanceCounter.inc(); if (current >= numMessagesPerShard - 1 || haltEmission) { return false; } diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/FlinkSourceReaderTestBase.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/FlinkSourceReaderTestBase.java index dcab3aff0f5ba..462a1ba0153d9 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/FlinkSourceReaderTestBase.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/FlinkSourceReaderTestBase.java @@ -19,6 +19,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static 
org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import static org.mockito.Mockito.when; @@ -32,6 +33,7 @@ import java.util.concurrent.ScheduledExecutorService; import java.util.function.Function; import javax.annotation.Nullable; +import org.apache.beam.runners.flink.translation.wrappers.streaming.io.TestCountingSource; import org.apache.beam.sdk.io.Source; import org.apache.beam.sdk.values.KV; import org.apache.flink.api.common.eventtime.Watermark; @@ -40,6 +42,7 @@ import org.apache.flink.api.connector.source.SourceReader; import org.apache.flink.api.connector.source.SourceReaderContext; import org.apache.flink.core.testutils.ManuallyTriggeredScheduledExecutorService; +import org.apache.flink.metrics.Counter; import org.junit.Test; import org.mockito.Mockito; @@ -194,6 +197,31 @@ public void testNumBytesInMetrics() throws Exception { assertEquals(numRecordsPerSplit * numSplits, testMetricGroup.numRecordsInCounter.getCount()); } + @Test + public void testMetricsContainer() throws Exception { + ManuallyTriggeredScheduledExecutorService executor = + new ManuallyTriggeredScheduledExecutorService(); + SourceTestCompat.TestMetricGroup testMetricGroup = new SourceTestCompat.TestMetricGroup(); + try (SourceReader>> reader = + createReader(executor, 0L, null, testMetricGroup)) { + reader.start(); + + List>> splits = createSplits(2, 10, 0); + reader.addSplits(splits); + RecordsValidatingOutput validatingOutput = new RecordsValidatingOutput(splits); + + // Need to poll once to create all the readers. + reader.pollNext(validatingOutput); + Counter advanceCounter = + testMetricGroup.registeredCounter.get( + TestCountingSource.CountingSourceReader.ADVANCE_COUNTER_NAMESPACE + + "." + + TestCountingSource.CountingSourceReader.ADVANCE_COUNTER_NAME); + assertNotNull(advanceCounter); + assertTrue("The reader should have advanced.", advanceCounter.getCount() > 0); + } + } + // --------------- abstract methods --------------- protected abstract KV getKVPairs(OutputT record); diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/bounded/FlinkBoundedSourceReaderTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/bounded/FlinkBoundedSourceReaderTest.java index 6303a729652a6..84cb2a72ddaff 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/bounded/FlinkBoundedSourceReaderTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/bounded/FlinkBoundedSourceReaderTest.java @@ -138,9 +138,10 @@ protected FlinkBoundedSourceReader> createReader( SourceReaderContext mockContext = createSourceReaderContext(testMetricGroup); if (executor != null) { return new FlinkBoundedSourceReader<>( - mockContext, pipelineOptions, executor, timestampExtractor); + "FlinkBoundedSource", mockContext, pipelineOptions, executor, timestampExtractor); } else { - return new FlinkBoundedSourceReader<>(mockContext, pipelineOptions, timestampExtractor); + return new FlinkBoundedSourceReader<>( + "FlinkBoundedSource", mockContext, pipelineOptions, timestampExtractor); } } } diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/unbounded/FlinkUnboundedSourceReaderTest.java 
b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/unbounded/FlinkUnboundedSourceReaderTest.java index f420bd8900ff3..b7cba373cf75f 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/unbounded/FlinkUnboundedSourceReaderTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/source/unbounded/FlinkUnboundedSourceReaderTest.java @@ -303,9 +303,10 @@ protected FlinkUnboundedSourceReader> createReader( SourceReaderContext mockContext = createSourceReaderContext(metricGroup); if (executor != null) { return new FlinkUnboundedSourceReader<>( - mockContext, pipelineOptions, executor, timestampExtractor); + "FlinkUnboundedReader", mockContext, pipelineOptions, executor, timestampExtractor); } else { - return new FlinkUnboundedSourceReader<>(mockContext, pipelineOptions, timestampExtractor); + return new FlinkUnboundedSourceReader<>( + "FlinkUnboundedReader", mockContext, pipelineOptions, timestampExtractor); } } diff --git a/runners/google-cloud-dataflow-java/arm/build.gradle b/runners/google-cloud-dataflow-java/arm/build.gradle index e79eeedcd8284..a3d191671d1b6 100644 --- a/runners/google-cloud-dataflow-java/arm/build.gradle +++ b/runners/google-cloud-dataflow-java/arm/build.gradle @@ -71,15 +71,18 @@ configurations { examplesJavaIntegrationTest } dependencies { examplesJavaIntegrationTest project(project.path) - examplesJavaIntegrationTest project(path: ":runners:google-cloud-dataflow-java", configuration: "testRuntimeMigration") + // TODO(yathu) Include full test classpath once the gradle shadow plugin supports Java21 + if (project.findProperty('testJavaVersion') == '21' || JavaVersion.current().equals(JavaVersion.VERSION_21)) { + examplesJavaIntegrationTest project(path: ":runners:google-cloud-dataflow-java") + } else { + examplesJavaIntegrationTest project(path: ":runners:google-cloud-dataflow-java", configuration: "testRuntimeMigration") + } examplesJavaIntegrationTest project(path: ":examples:java", configuration: "testRuntimeMigration") } def javaVer = "java8" -if(project.hasProperty('compileAndRunTestsWithJava17')) { - javaVer = "java17" -} else if(project.hasProperty('compileAndRunTestsWithJava11')) { - javaVer = "java11" +if (project.hasProperty('testJavaVersion')) { + javaVer = "java${project.getProperty('testJavaVersion')}" } def dataflowProject = project.findProperty('dataflowProject') ?: 'apache-beam-testing' def dataflowRegion = project.findProperty('dataflowRegion') ?: 'us-central1' @@ -88,14 +91,14 @@ def firestoreDb = project.findProperty('firestoreDb') ?: 'firestoredb' def dockerImageRoot = project.findProperty('docker-repository-root') ?: "us.gcr.io/${dataflowProject}/java-postcommit-it" def DockerJavaMultiarchImageContainer = "${dockerImageRoot}/${project.docker_image_default_repo_prefix}${javaVer}_sdk" def dockerTag = project.findProperty('docker-tag') ?: new Date().format('yyyyMMddHHmmss') -ext.DockerJavaMultiarchImageName = "${DockerJavaMultiarchImageContainer}:${dockerTag}" +ext.DockerJavaMultiarchImageName = "${DockerJavaMultiarchImageContainer}:${dockerTag}" as String def runnerV2PipelineOptionsARM = [ "--runner=TestDataflowRunner", "--project=${dataflowProject}", "--region=${dataflowRegion}", "--tempRoot=${dataflowValidatesTempRoot}", - "--sdkContainerImage=${DockerJavaMultiarchImageContainer}:${dockerTag}", + "--sdkContainerImage=${project.ext.DockerJavaMultiarchImageName}",
"--experiments=use_unified_worker,use_runner_v2", "--firestoreDb=${firestoreDb}", "--workerMachineType=t2a-standard-1", @@ -137,3 +140,29 @@ task examplesJavaRunnerV2IntegrationTestARM(type: Test) { testClassesDirs = files(project(":examples:java").sourceSets.test.output.classesDirs) useJUnit { } } + +// Clean up built Java images +def cleanUpDockerJavaImages = tasks.register("cleanUpDockerJavaImages") { + doLast { + exec { + commandLine "docker", "rmi", "--force", "${DockerJavaMultiarchImageName}" + } + exec { + ignoreExitValue true + commandLine "gcloud", "--quiet", "container", "images", "untag", "${DockerJavaMultiarchImageName}" + } + exec { + ignoreExitValue true + commandLine "./../scripts/cleanup_untagged_gcr_images.sh", "${DockerJavaMultiarchImageContainer}" + } + } +} + +afterEvaluate { + // Ensure all tasks which use published docker images run before they are cleaned up + tasks.each { t -> + if (t.dependsOn.contains(buildAndPushDockerJavaMultiarchContainer) && !t.name.equalsIgnoreCase('printrunnerV2PipelineOptionsARM')) { + t.finalizedBy cleanUpDockerJavaImages + } + } +} diff --git a/runners/google-cloud-dataflow-java/build.gradle b/runners/google-cloud-dataflow-java/build.gradle index adc1f2e09bc4e..5d967d90f2b88 100644 --- a/runners/google-cloud-dataflow-java/build.gradle +++ b/runners/google-cloud-dataflow-java/build.gradle @@ -51,8 +51,8 @@ evaluationDependsOn(":sdks:java:container:java11") ext.dataflowLegacyEnvironmentMajorVersion = '8' ext.dataflowFnapiEnvironmentMajorVersion = '8' -ext.dataflowLegacyContainerVersion = 'beam-master-20230809' -ext.dataflowFnapiContainerVersion = 'beam-master-20230809' +ext.dataflowLegacyContainerVersion = 'beam-master-20231026' +ext.dataflowFnapiContainerVersion = 'beam-master-20231026' ext.dataflowContainerBaseRepository = 'gcr.io/cloud-dataflow/v1beta3' processResources { @@ -279,10 +279,8 @@ def createRunnerV2ValidatesRunnerTest = { Map args -> // task ordering such that the registry doesn't get cleaned up prior to task completion. 
def buildAndPushDockerJavaContainer = tasks.register("buildAndPushDockerJavaContainer") { def javaVer = "java8" - if(project.hasProperty('compileAndRunTestsWithJava17')) { - javaVer = "java17" - } else if(project.hasProperty('compileAndRunTestsWithJava11')) { - javaVer = "java11" + if(project.hasProperty('testJavaVersion')) { + javaVer = "java${project.getProperty('testJavaVersion')}" } dependsOn ":sdks:java:container:${javaVer}:docker" def defaultDockerImageName = containerImageName( @@ -306,9 +304,11 @@ def cleanUpDockerJavaImages = tasks.register("cleanUpDockerJavaImages") { commandLine "docker", "rmi", "--force", "${dockerJavaImageName}" } exec { + ignoreExitValue true commandLine "gcloud", "--quiet", "container", "images", "untag", "${dockerJavaImageName}" } exec { + ignoreExitValue true commandLine "./scripts/cleanup_untagged_gcr_images.sh", "${dockerJavaImageContainer}" } } @@ -347,9 +347,11 @@ def cleanUpDockerPythonImages = tasks.register("cleanUpDockerPythonImages") { commandLine "docker", "rmi", "--force", "${dockerPythonImageName}" } exec { + ignoreExitValue true commandLine "gcloud", "--quiet", "container", "images", "untag", "${dockerPythonImageName}" } exec { + ignoreExitValue true commandLine "./scripts/cleanup_untagged_gcr_images.sh", "${dockerPythonImageContainer}" } } diff --git a/runners/google-cloud-dataflow-java/examples/build.gradle b/runners/google-cloud-dataflow-java/examples/build.gradle index 20bc50dea5a70..34addac695b18 100644 --- a/runners/google-cloud-dataflow-java/examples/build.gradle +++ b/runners/google-cloud-dataflow-java/examples/build.gradle @@ -36,7 +36,7 @@ dependencies { def gcpProject = project.findProperty('gcpProject') ?: 'apache-beam-testing' def gcpRegion = project.findProperty('gcpRegion') ?: 'us-central1' -def gcsTempRoot = project.findProperty('gcsTempRoot') ?: 'gs://temp-storage-for-end-to-end-tests/' +def gcsTempRoot = project.findProperty('gcsTempRoot') ?: 'gs://temp-storage-for-end-to-end-tests' def dockerJavaImageName = project(':runners:google-cloud-dataflow-java').ext.dockerJavaImageName // If -PuseExecutableStage is set, the use_executable_stage_bundle_execution wil be enabled. def fnapiExperiments = project.hasProperty('useExecutableStage') ? 
'beam_fn_api_use_deprecated_read,use_executable_stage_bundle_execution' : "beam_fn_api,beam_fn_api_use_deprecated_read" @@ -84,7 +84,7 @@ def commonConfig = { Map args -> include "**/WordCountIT.class" include "**/WindowedWordCountIT.class" } else { - include "**/IT.class" + include "**/*IT.class" if (runWordCount == 'exclude') { exclude "**/WordCountIT.class" exclude "**/WindowedWordCountIT.class" @@ -98,9 +98,11 @@ def commonConfig = { Map args -> "--region=${gcpRegion}", "--tempRoot=${actualGcsTempRoot}", "--runner=TestDataflowRunner", - "--dataflowWorkerJar=${actualDataflowWorkerJar}", - "--workerHarnessContainerImage=${actualWorkerHarnessContainerImage}" - ] + additionalOptions + "--dataflowWorkerJar=${actualDataflowWorkerJar}"] + if (actualWorkerHarnessContainerImage) { + preCommitBeamTestPipelineOptions += "--workerHarnessContainerImage=${actualWorkerHarnessContainerImage}" + } + preCommitBeamTestPipelineOptions += additionalOptions systemProperty "beamTestPipelineOptions", JsonOutput.toJson(preCommitBeamTestPipelineOptions) } } @@ -169,6 +171,17 @@ task java17PostCommit() { dependsOn postCommitLegacyWorkerJava17 } +task postCommitLegacyWorkerJava21(type: Test) { + dependsOn ":runners:google-cloud-dataflow-java:worker:shadowJar" + def dataflowWorkerJar = project.findProperty('dataflowWorkerJar') ?: project(":runners:google-cloud-dataflow-java:worker").shadowJar.archivePath + systemProperty "java.specification.version", "21" + with commonConfig(dataflowWorkerJar: dataflowWorkerJar, runWordCount: 'exclude') +} + +task java21PostCommit() { + dependsOn postCommitLegacyWorkerJava21 +} + task preCommit() { dependsOn preCommitLegacyWorker dependsOn preCommitLegacyWorkerImpersonate diff --git a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java index 26548038a1dfc..6449053194fee 100644 --- a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java +++ b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java @@ -168,7 +168,6 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Joiner; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Utf8; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; @@ -398,18 +397,20 @@ && isServiceEndpoint(dataflowOptions.getDataflowEndpoint())) { // Adding the Java version to the SDK name for user's and support convenience. 
String agentJavaVer = "(JRE 8 environment)"; - if (Environments.getJavaVersion() == Environments.JavaVersion.java17) { - agentJavaVer = "(JRE 17 environment)"; - } else if (Environments.getJavaVersion() == Environments.JavaVersion.java11) { - agentJavaVer = "(JRE 11 environment)"; + if (Environments.getJavaVersion() != Environments.JavaVersion.java8) { + agentJavaVer = + String.format("(JRE %s environment)", Environments.getJavaVersion().specification()); } DataflowRunnerInfo dataflowRunnerInfo = DataflowRunnerInfo.getDataflowRunnerInfo(); + String userAgentName = dataflowRunnerInfo.getName(); + Preconditions.checkArgument( + !userAgentName.equals(""), "Dataflow runner's `name` property cannot be empty."); + String userAgentVersion = dataflowRunnerInfo.getVersion(); + Preconditions.checkArgument( + !userAgentVersion.equals(""), "Dataflow runner's `version` property cannot be empty."); String userAgent = - String.format( - "%s/%s%s", - dataflowRunnerInfo.getName(), dataflowRunnerInfo.getVersion(), agentJavaVer) - .replace(" ", "_"); + String.format("%s/%s%s", userAgentName, userAgentVersion, agentJavaVer).replace(" ", "_"); dataflowOptions.setUserAgent(userAgent); return new DataflowRunner(dataflowOptions); @@ -1331,15 +1332,26 @@ public DataflowPipelineJob run(Pipeline pipeline) { hooks.modifyEnvironmentBeforeSubmission(newJob.getEnvironment()); } + // enable upload_graph when the graph is too large + byte[] jobGraphBytes = DataflowPipelineTranslator.jobToString(newJob).getBytes(UTF_8); + int jobGraphByteSize = jobGraphBytes.length; + if (jobGraphByteSize >= CREATE_JOB_REQUEST_LIMIT_BYTES + && !hasExperiment(options, "upload_graph")) { + List experiments = firstNonNull(options.getExperiments(), new ArrayList<>()); + experiments.add("upload_graph"); + options.setExperiments(ImmutableList.copyOf(experiments)); + LOG.info( + "The job graph size ({} in bytes) is larger than {}. Automatically add " + + "the upload_graph option to experiments.", + jobGraphByteSize, + CREATE_JOB_REQUEST_LIMIT_BYTES); + } + // Upload the job to GCS and remove the graph object from the API call. The graph // will be downloaded from GCS by the service. if (hasExperiment(options, "upload_graph")) { DataflowPackage stagedGraph = - options - .getStager() - .stageToFile( - DataflowPipelineTranslator.jobToString(newJob).getBytes(UTF_8), - DATAFLOW_GRAPH_FILE_NAME); + options.getStager().stageToFile(jobGraphBytes, DATAFLOW_GRAPH_FILE_NAME); newJob.getSteps().clear(); newJob.setStepsLocation(stagedGraph.getLocation()); } @@ -1399,7 +1411,7 @@ public DataflowPipelineJob run(Pipeline pipeline) { } catch (GoogleJsonResponseException e) { String errorMessages = "Unexpected errors"; if (e.getDetails() != null) { - if (Utf8.encodedLength(newJob.toString()) >= CREATE_JOB_REQUEST_LIMIT_BYTES) { + if (jobGraphByteSize >= CREATE_JOB_REQUEST_LIMIT_BYTES) { errorMessages = "The size of the serialized JSON representation of the pipeline " + "exceeds the allowable limit. 
" diff --git a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/options/DataflowPipelineOptions.java b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/options/DataflowPipelineOptions.java index 985e1736dcb03..8cc812cfa17be 100644 --- a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/options/DataflowPipelineOptions.java +++ b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/options/DataflowPipelineOptions.java @@ -112,7 +112,10 @@ public interface DataflowPipelineOptions */ @Description( "Service options are set by the user and configure the service. This " - + "decouples service side feature availability from the Apache Beam release cycle.") + + "decouples service side feature availability from the Apache Beam release cycle. " + + "For a list of service options, see " + + "https://cloud.google.com/dataflow/docs/reference/service-options " + + "in the Dataflow documentation.") List getDataflowServiceOptions(); void setDataflowServiceOptions(List options); diff --git a/runners/google-cloud-dataflow-java/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerTest.java b/runners/google-cloud-dataflow-java/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerTest.java index 078f25e0e38e8..bcdea03dba2c3 100644 --- a/runners/google-cloud-dataflow-java/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerTest.java +++ b/runners/google-cloud-dataflow-java/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerTest.java @@ -242,7 +242,7 @@ public void setUp() throws IOException { mockJobs = mock(Dataflow.Projects.Locations.Jobs.class); } - private Pipeline buildDataflowPipeline(DataflowPipelineOptions options) { + private static Pipeline buildDataflowPipeline(DataflowPipelineOptions options) { options.setStableUniqueNames(CheckEnabled.ERROR); options.setRunner(DataflowRunner.class); Pipeline p = Pipeline.create(options); @@ -256,6 +256,22 @@ private Pipeline buildDataflowPipeline(DataflowPipelineOptions options) { return p; } + private static Pipeline buildDataflowPipelineWithLargeGraph(DataflowPipelineOptions options) { + options.setStableUniqueNames(CheckEnabled.ERROR); + options.setRunner(DataflowRunner.class); + Pipeline p = Pipeline.create(options); + + for (int i = 0; i < 100; i++) { + p.apply("ReadMyFile_" + i, TextIO.read().from("gs://bucket/object")) + .apply("WriteMyFile_" + i, TextIO.write().to("gs://bucket/object")); + } + + // Enable the FileSystems API to know about gs:// URIs in this test. + FileSystems.setDefaultPipelineOptions(options); + + return p; + } + private static Dataflow buildMockDataflow(Dataflow.Projects.Locations.Jobs mockJobs) throws IOException { Dataflow mockDataflowClient = mock(Dataflow.class); @@ -824,6 +840,24 @@ public void testUploadGraph() throws IOException { .startsWith("gs://valid-bucket/temp/staging/dataflow_graph")); } + /** Test for automatically using upload_graph when the job graph is too large (>10MB). 
*/ + @Test + public void testUploadGraphWithAutoUpload() throws IOException { + DataflowPipelineOptions options = buildPipelineOptions(); + Pipeline p = buildDataflowPipelineWithLargeGraph(options); + p.run(); + + ArgumentCaptor jobCaptor = ArgumentCaptor.forClass(Job.class); + Mockito.verify(mockJobs).create(eq(PROJECT_ID), eq(REGION_ID), jobCaptor.capture()); + assertValidJob(jobCaptor.getValue()); + assertTrue(jobCaptor.getValue().getSteps().isEmpty()); + assertTrue( + jobCaptor + .getValue() + .getStepsLocation() + .startsWith("gs://valid-bucket/temp/staging/dataflow_graph")); + } + @Test public void testUpdateNonExistentPipeline() throws IOException { thrown.expect(IllegalArgumentException.class); diff --git a/runners/google-cloud-dataflow-java/worker/build.gradle b/runners/google-cloud-dataflow-java/worker/build.gradle index ce06063c9b52d..1ca9eba2b482c 100644 --- a/runners/google-cloud-dataflow-java/worker/build.gradle +++ b/runners/google-cloud-dataflow-java/worker/build.gradle @@ -89,6 +89,9 @@ applyJavaNature( // Allow slf4j implementation worker for logging during pipeline execution "org/slf4j/impl/**" ], + generatedClassPatterns: [ + /^org\.apache\.beam\.runners\.dataflow\.worker\.windmill.*/ + ], shadowClosure: { // Each included dependency must also include all of its necessary transitive dependencies // or have them provided by the users pipeline during job submission. Typically a users diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/DataflowMetricsContainer.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/DataflowMetricsContainer.java index 81517129c8e91..c3e4fb1388b06 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/DataflowMetricsContainer.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/DataflowMetricsContainer.java @@ -22,9 +22,11 @@ import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Distribution; import org.apache.beam.sdk.metrics.Gauge; +import org.apache.beam.sdk.metrics.Histogram; import org.apache.beam.sdk.metrics.MetricName; import org.apache.beam.sdk.metrics.MetricsContainer; import org.apache.beam.sdk.metrics.MetricsEnvironment; +import org.apache.beam.sdk.util.HistogramData; /** * An implementation of {@link MetricsContainer} that reads the current execution state (tracked in @@ -56,6 +58,11 @@ public Counter getCounter(MetricName metricName) { return getCurrentContainer().getCounter(metricName); } + @Override + public Counter getPerWorkerCounter(MetricName metricName) { + return getCurrentContainer().getPerWorkerCounter(metricName); + } + @Override public Distribution getDistribution(MetricName metricName) { return getCurrentContainer().getDistribution(metricName); @@ -65,4 +72,10 @@ public Distribution getDistribution(MetricName metricName) { public Gauge getGauge(MetricName metricName) { return getCurrentContainer().getGauge(metricName); } + + @Override + public Histogram getPerWorkerHistogram( + MetricName metricName, HistogramData.BucketType bucketType) { + return getCurrentContainer().getPerWorkerHistogram(metricName, bucketType); + } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/DataflowOperationContext.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/DataflowOperationContext.java 
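// ============================================================================
// [Editor's aside: illustrative sketch, not part of this patch.] The
// DataflowRunner change above serializes the job graph once and, when its size
// reaches CREATE_JOB_REQUEST_LIMIT_BYTES (10 MB, per the new test's comment),
// automatically adds the "upload_graph" experiment so the graph is staged to
// GCS rather than inlined in the CreateJob request. Users can still opt in
// explicitly; a minimal, hypothetical sketch:
import java.util.Arrays;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

class UploadGraphSketch {
  static DataflowPipelineOptions optionsWithUploadGraph() {
    DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
    // With this patch the runner appends the experiment itself for large
    // graphs; setting it up front forces graph staging regardless of size.
    options.setExperiments(Arrays.asList("upload_graph"));
    return options;
  }
}
// ============================================================================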
index df520ebd3923c..b2ab928bc9961 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/DataflowOperationContext.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/DataflowOperationContext.java @@ -264,7 +264,12 @@ protected String getLullMessage(Thread trackedThread, Duration lullDuration) { .append(" for at least ") .append(formatDuration(lullDuration)) .append(" without outputting or completing in state ") - .append(getStateName()); + .append(getStateName()) + .append(" in thread ") + .append(trackedThread.getName()) + .append(" with id ") + .append(trackedThread.getId()); + message.append("\n"); message.append(getStackTraceForLullMessage(trackedThread.getStackTrace())); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/DataflowSystemMetrics.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/DataflowSystemMetrics.java index ee2a04af9982b..640febc616baf 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/DataflowSystemMetrics.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/DataflowSystemMetrics.java @@ -42,6 +42,10 @@ public enum StreamingSystemCounterNames { TIME_AT_MAX_ACTIVE_THREADS("dataflow_time_at_max_active_threads"), ACTIVE_THREADS("dataflow_active_threads"), TOTAL_ALLOCATED_THREADS("dataflow_total_allocated_threads"), + OUTSTANDING_BYTES("dataflow_outstanding_bytes"), + MAX_OUTSTANDING_BYTES("dataflow_max_outstanding_bytes"), + OUTSTANDING_BUNDLES("dataflow_outstanding_bundles"), + MAX_OUTSTANDING_BUNDLES("dataflow_max_outstanding_bundles"), WINDMILL_QUOTA_THROTTLING("dataflow_streaming_engine_throttled_msecs"), MEMORY_THRASHING("dataflow_streaming_engine_user_worker_thrashing"); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/MetricTrackingWindmillServerStub.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/MetricTrackingWindmillServerStub.java index 33b55647213aa..0e929249b3a19 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/MetricTrackingWindmillServerStub.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/MetricTrackingWindmillServerStub.java @@ -29,8 +29,8 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataRequest; import org.apache.beam.runners.dataflow.worker.windmill.WindmillServerStub; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetDataStream; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStreamPool; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.SettableFuture; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Duration; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StateFetcher.java 
b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StateFetcher.java deleted file mode 100644 index 0cbcd2e830120..0000000000000 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StateFetcher.java +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.dataflow.worker; - -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; - -import java.io.Closeable; -import java.util.Collections; -import java.util.Objects; -import java.util.Set; -import java.util.concurrent.Callable; -import java.util.concurrent.TimeUnit; -import org.apache.beam.runners.core.InMemoryMultimapSideInputView; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.sdk.coders.Coder; -import org.apache.beam.sdk.coders.IterableCoder; -import org.apache.beam.sdk.coders.KvCoder; -import org.apache.beam.sdk.transforms.Materializations; -import org.apache.beam.sdk.transforms.Materializations.IterableView; -import org.apache.beam.sdk.transforms.Materializations.MultimapView; -import org.apache.beam.sdk.transforms.ViewFn; -import org.apache.beam.sdk.transforms.windowing.BoundedWindow; -import org.apache.beam.sdk.util.ByteStringOutputStream; -import org.apache.beam.sdk.values.PCollectionView; -import org.apache.beam.sdk.values.TupleTag; -import org.apache.beam.sdk.values.WindowingStrategy; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Optional; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.Cache; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.CacheBuilder; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.Weigher; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; -import org.checkerframework.checker.nullness.qual.Nullable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** Class responsible for fetching state from the windmill server. 
*/ -@SuppressWarnings({ - "rawtypes", // TODO(https://github.com/apache/beam/issues/20447) - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) -class StateFetcher { - private static final Set SUPPORTED_MATERIALIZATIONS = - ImmutableSet.of( - Materializations.ITERABLE_MATERIALIZATION_URN, - Materializations.MULTIMAP_MATERIALIZATION_URN); - - private static final Logger LOG = LoggerFactory.getLogger(StateFetcher.class); - - private Cache sideInputCache; - private MetricTrackingWindmillServerStub server; - private long bytesRead = 0L; - - public StateFetcher(MetricTrackingWindmillServerStub server) { - this( - server, - CacheBuilder.newBuilder() - .maximumWeight(100000000 /* 100 MB */) - .expireAfterWrite(1, TimeUnit.MINUTES) - .weigher((Weigher) (id, entry) -> entry.size()) - .build()); - } - - public StateFetcher( - MetricTrackingWindmillServerStub server, - Cache sideInputCache) { - this.server = server; - this.sideInputCache = sideInputCache; - } - - /** Returns a view of the underlying cache that keeps track of bytes read separately. */ - public StateFetcher byteTrackingView() { - return new StateFetcher(server, sideInputCache); - } - - public long getBytesRead() { - return bytesRead; - } - - /** Indicates the caller's knowledge of whether a particular side input has been computed. */ - public enum SideInputState { - CACHED_IN_WORKITEM, - KNOWN_READY, - UNKNOWN; - } - - /** - * Fetch the given side input, storing it in a process-level cache. - * - *

<p>If state is KNOWN_READY, attempt to fetch the data regardless of whether a not-ready entry - * was cached. - * - *
<p>
Returns {@literal null} if the side input was not ready, {@literal Optional.absent()} if the - * side input was null, and {@literal Optional.present(...)} if the side input was non-null. - */ - public @Nullable Optional fetchSideInput( - final PCollectionView view, - final SideWindowT sideWindow, - final String stateFamily, - SideInputState state, - final Supplier scopedReadStateSupplier) { - final SideInputId id = new SideInputId(view.getTagInternal(), sideWindow); - - Callable fetchCallable = - () -> { - @SuppressWarnings("unchecked") - WindowingStrategy sideWindowStrategy = - (WindowingStrategy) view.getWindowingStrategyInternal(); - - Coder windowCoder = sideWindowStrategy.getWindowFn().windowCoder(); - - ByteStringOutputStream windowStream = new ByteStringOutputStream(); - windowCoder.encode(sideWindow, windowStream, Coder.Context.OUTER); - - @SuppressWarnings("unchecked") - Windmill.GlobalDataRequest request = - Windmill.GlobalDataRequest.newBuilder() - .setDataId( - Windmill.GlobalDataId.newBuilder() - .setTag(view.getTagInternal().getId()) - .setVersion(windowStream.toByteString()) - .build()) - .setStateFamily(stateFamily) - .setExistenceWatermarkDeadline( - WindmillTimeUtils.harnessToWindmillTimestamp( - sideWindowStrategy - .getTrigger() - .getWatermarkThatGuaranteesFiring(sideWindow))) - .build(); - - Windmill.GlobalData data; - try (Closeable scope = scopedReadStateSupplier.get()) { - data = server.getSideInputData(request); - } - - bytesRead += data.getSerializedSize(); - - checkState( - SUPPORTED_MATERIALIZATIONS.contains(view.getViewFn().getMaterialization().getUrn()), - "Only materializations of type %s supported, received %s", - SUPPORTED_MATERIALIZATIONS, - view.getViewFn().getMaterialization().getUrn()); - - Iterable rawData; - if (data.getIsReady()) { - if (data.getData().size() > 0) { - rawData = - IterableCoder.of(view.getCoderInternal()) - .decode(data.getData().newInput(), Coder.Context.OUTER); - } else { - rawData = Collections.emptyList(); - } - - switch (view.getViewFn().getMaterialization().getUrn()) { - case Materializations.ITERABLE_MATERIALIZATION_URN: - { - ViewFn viewFn = (ViewFn) view.getViewFn(); - return SideInputCacheEntry.ready( - viewFn.apply(() -> rawData), data.getData().size()); - } - case Materializations.MULTIMAP_MATERIALIZATION_URN: - { - ViewFn viewFn = (ViewFn) view.getViewFn(); - Coder keyCoder = ((KvCoder) view.getCoderInternal()).getKeyCoder(); - return SideInputCacheEntry.ready( - viewFn.apply( - InMemoryMultimapSideInputView.fromIterable(keyCoder, (Iterable) rawData)), - data.getData().size()); - } - default: - throw new IllegalStateException( - String.format( - "Unknown side input materialization format requested '%s'", - view.getViewFn().getMaterialization().getUrn())); - } - } else { - return SideInputCacheEntry.notReady(); - } - }; - - try { - if (state == SideInputState.KNOWN_READY) { - SideInputCacheEntry entry = sideInputCache.getIfPresent(id); - if (entry == null) { - return sideInputCache.get(id, fetchCallable).getValue(); - } else if (!entry.isReady()) { - // Invalidate the existing not-ready entry. This must be done atomically - // so that another thread doesn't replace the entry with a ready entry, which - // would then be deleted here. 
- synchronized (entry) { - SideInputCacheEntry newEntry = sideInputCache.getIfPresent(id); - if (newEntry != null && !newEntry.isReady()) { - sideInputCache.invalidate(id); - } - } - - return sideInputCache.get(id, fetchCallable).getValue(); - } else { - return entry.getValue(); - } - } else { - return sideInputCache.get(id, fetchCallable).getValue(); - } - } catch (Exception e) { - LOG.error("Fetch failed: ", e); - throw new RuntimeException("Exception while fetching side input: ", e); - } - } - - /** Struct representing a side input for a particular window. */ - static class SideInputId { - private final TupleTag tag; - private final BoundedWindow window; - - public SideInputId(TupleTag tag, BoundedWindow window) { - this.tag = tag; - this.window = window; - } - - @Override - public boolean equals(@Nullable Object other) { - if (other instanceof SideInputId) { - SideInputId otherId = (SideInputId) other; - return tag.equals(otherId.tag) && window.equals(otherId.window); - } - return false; - } - - @Override - public int hashCode() { - return Objects.hash(tag, window); - } - } - - /** - * Entry in the side input cache that stores the value (null if not ready), and the encoded size - * of the value. - */ - static class SideInputCacheEntry { - private final boolean ready; - private final Object value; - private final int encodedSize; - - private SideInputCacheEntry(boolean ready, Object value, int encodedSize) { - this.ready = ready; - this.value = value; - this.encodedSize = encodedSize; - } - - public static SideInputCacheEntry ready(Object value, int encodedSize) { - return new SideInputCacheEntry(true, value, encodedSize); - } - - public static SideInputCacheEntry notReady() { - return new SideInputCacheEntry(false, null, 0); - } - - public boolean isReady() { - return ready; - } - - /** - * Returns {@literal null} if the side input was not ready, {@literal Optional.absent()} if the - * side input was null, and {@literal Optional.present(...)} if the side input was non-null. - */ - public @Nullable Optional getValue() { - @SuppressWarnings("unchecked") - T typed = (T) value; - return ready ? 
Optional.fromNullable(typed) : null; - } - - public int size() { - return encodedSize; - } - } -} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java index 5d4c0288c8385..92f7520676ade 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java @@ -94,6 +94,7 @@ import org.apache.beam.runners.dataflow.worker.streaming.WeightedBoundedQueue; import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.streaming.Work.State; +import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputStateFetcher; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.util.MemoryMonitor; import org.apache.beam.runners.dataflow.worker.util.common.worker.ElementCounter; @@ -103,9 +104,11 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill.LatencyAttribution; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItemCommitRequest; import org.apache.beam.runners.dataflow.worker.windmill.WindmillServerStub; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.CommitWorkStream; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetWorkStream; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStreamPool; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; +import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; +import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateReader; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.KvCoder; import org.apache.beam.sdk.extensions.gcp.util.Transport; @@ -226,7 +229,7 @@ public class StreamingDataflowWorker { private final Thread commitThread; private final AtomicLong activeCommitBytes = new AtomicLong(); private final AtomicBoolean running = new AtomicBoolean(); - private final StateFetcher stateFetcher; + private final SideInputStateFetcher sideInputStateFetcher; private final StreamingDataflowWorkerOptions options; private final boolean windmillServiceEnabled; private final long clientId; @@ -248,6 +251,10 @@ public class StreamingDataflowWorker { private final Counter timeAtMaxActiveThreads; private final Counter activeThreads; private final Counter totalAllocatedThreads; + private final Counter outstandingBytes; + private final Counter maxOutstandingBytes; + private final Counter outstandingBundles; + private final Counter maxOutstandingBundles; private final Counter windmillMaxObservedWorkItemCommitBytes; private final Counter memoryThrashing; private final boolean publishCounters; @@ -334,6 +341,18 @@ public class StreamingDataflowWorker { StreamingSystemCounterNames.TIME_AT_MAX_ACTIVE_THREADS.counterName()); this.activeThreads = pendingCumulativeCounters.intSum(StreamingSystemCounterNames.ACTIVE_THREADS.counterName()); + this.outstandingBytes = + 
pendingCumulativeCounters.longSum( + StreamingSystemCounterNames.OUTSTANDING_BYTES.counterName()); + this.maxOutstandingBytes = + pendingCumulativeCounters.longSum( + StreamingSystemCounterNames.MAX_OUTSTANDING_BYTES.counterName()); + this.outstandingBundles = + pendingCumulativeCounters.longSum( + StreamingSystemCounterNames.OUTSTANDING_BUNDLES.counterName()); + this.maxOutstandingBundles = + pendingCumulativeCounters.longSum( + StreamingSystemCounterNames.MAX_OUTSTANDING_BUNDLES.counterName()); this.totalAllocatedThreads = pendingCumulativeCounters.intSum( StreamingSystemCounterNames.TOTAL_ALLOCATED_THREADS.counterName()); @@ -404,7 +423,7 @@ public void run() { this.metricTrackingWindmillServer = new MetricTrackingWindmillServerStub(windmillServer, memoryMonitor, windmillServiceEnabled); this.metricTrackingWindmillServer.start(); - this.stateFetcher = new StateFetcher(metricTrackingWindmillServer); + this.sideInputStateFetcher = new SideInputStateFetcher(metricTrackingWindmillServer); this.clientId = clientIdGenerator.nextLong(); for (MapTask mapTask : mapTasks) { @@ -463,6 +482,11 @@ public static void main(String[] args) throws Exception { // metrics. MetricsEnvironment.setProcessWideContainer(new MetricsLogger(null)); + // When enabled, the Pipeline will record Per-Worker metrics that will be piped to WMW. + StreamingStepMetricsContainer.setEnablePerWorkerMetrics( + options.isEnableStreamingEngine() + && DataflowRunner.hasExperiment(options, "enable_per_worker_metrics")); + JvmInitializers.runBeforeProcessing(options); worker.startStatusPages(); worker.start(); @@ -607,7 +631,8 @@ public void run() { + options.getWorkerId() + "_" + page.pageName() - + timestamp) + + timestamp + + ".html") .replaceAll("/", "_")); writer = new PrintWriter(outputFile, UTF_8.name()); page.captureData(writer); @@ -1071,7 +1096,7 @@ public void close() { } }; }); - StateFetcher localStateFetcher = stateFetcher.byteTrackingView(); + SideInputStateFetcher localSideInputStateFetcher = sideInputStateFetcher.byteTrackingView(); // If the read output KVs, then we can decode Windmill's byte key into a userland // key object and provide it to the execution context for use with per-key state. @@ -1107,7 +1132,7 @@ public void close() { outputDataWatermark, synchronizedProcessingTime, stateReader, - localStateFetcher, + localSideInputStateFetcher, outputBuilder); // Blocks while executing work. 
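// ============================================================================
// [Editor's aside: illustrative sketch, not part of this patch.] Per the
// StreamingDataflowWorker change above, per-worker metrics are recorded only
// when the job runs on Streaming Engine AND carries the
// "enable_per_worker_metrics" experiment. A hypothetical launch-side sketch
// that satisfies both conditions:
import java.util.Arrays;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

class PerWorkerMetricsSketch {
  static DataflowPipelineOptions optionsWithPerWorkerMetrics() {
    DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
    options.setEnableStreamingEngine(true); // the worker checks isEnableStreamingEngine()
    options.setExperiments(Arrays.asList("enable_per_worker_metrics"));
    return options;
  }
}
// ============================================================================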
@@ -1177,7 +1202,7 @@ public void close() { shuffleBytesRead += message.getSerializedSize(); } } - long stateBytesRead = stateReader.getBytesRead() + localStateFetcher.getBytesRead(); + long stateBytesRead = stateReader.getBytesRead() + localSideInputStateFetcher.getBytesRead(); windmillShuffleBytesRead.addValue(shuffleBytesRead); windmillStateBytesRead.addValue(stateBytesRead); windmillStateBytesWritten.addValue(stateBytesWritten); @@ -1713,6 +1738,14 @@ private void updateThreadMetrics() { activeThreads.addValue(workUnitExecutor.activeCount()); totalAllocatedThreads.getAndReset(); totalAllocatedThreads.addValue(chooseMaximumNumberOfThreads()); + outstandingBytes.getAndReset(); + outstandingBytes.addValue(workUnitExecutor.bytesOutstanding()); + maxOutstandingBytes.getAndReset(); + maxOutstandingBytes.addValue(workUnitExecutor.maximumBytesOutstanding()); + outstandingBundles.getAndReset(); + outstandingBundles.addValue(workUnitExecutor.elementsOutstanding()); + maxOutstandingBundles.getAndReset(); + maxOutstandingBundles.addValue(workUnitExecutor.maximumElementsOutstanding()); } @VisibleForTesting diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContext.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContext.java index 3f4cb08937550..d630601c28a37 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContext.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContext.java @@ -30,6 +30,7 @@ import java.util.List; import java.util.Map; import java.util.NavigableSet; +import java.util.Optional; import java.util.Set; import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.atomic.AtomicLong; @@ -45,10 +46,16 @@ import org.apache.beam.runners.dataflow.worker.counters.CounterFactory; import org.apache.beam.runners.dataflow.worker.counters.NameContext; import org.apache.beam.runners.dataflow.worker.profiler.ScopedProfiler.ProfileScope; +import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInput; +import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputState; +import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputStateFetcher; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalDataId; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalDataRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.Timer; +import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; +import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateInternals; +import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateReader; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.io.UnboundedSource; import org.apache.beam.sdk.metrics.MetricsContainer; @@ -59,7 +66,7 @@ import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.vendor.grpc.v1p54p0.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Optional; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import 
org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.FluentIterable; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.HashBasedTable; @@ -82,7 +89,12 @@ public class StreamingModeExecutionContext extends DataflowExecutionContext { private static final Logger LOG = LoggerFactory.getLogger(StreamingModeExecutionContext.class); - + private final String computationId; + private final Map, Map>> sideInputCache; + // Per-key cache of active Reader objects in use by this process. + private final ImmutableMap stateNameMap; + private final WindmillStateCache.ForComputation stateCache; + private final ReaderCache readerCache; /** * The current user-facing key for this execution context. * @@ -94,20 +106,12 @@ public class StreamingModeExecutionContext extends DataflowExecutionContext, Map> sideInputCache; - - // Per-key cache of active Reader objects in use by this process. - private final ImmutableMap stateNameMap; - private final WindmillStateCache.ForComputation stateCache; - private Windmill.WorkItem work; private WindmillComputationKey computationKey; - private StateFetcher stateFetcher; + private SideInputStateFetcher sideInputStateFetcher; private Windmill.WorkItemCommitRequest.Builder outputBuilder; private UnboundedSource.UnboundedReader activeReader; private volatile long backlogBytes; - private final ReaderCache readerCache; public StreamingModeExecutionContext( CounterFactory counterFactory, @@ -133,86 +137,6 @@ public StreamingModeExecutionContext( this.backlogBytes = UnboundedSource.UnboundedReader.BACKLOG_UNKNOWN; } - /** - * Execution states in Streaming are shared between multiple map-task executors. Thus this class - * needs to be thread safe for multiple writers. A single stage could have have multiple executors - * running concurrently. - */ - public static class StreamingModeExecutionState - extends DataflowOperationContext.DataflowExecutionState { - - // AtomicLong is used because this value is written in two places: - // 1. The sampling thread calls takeSample to increment the time spent in this state - // 2. The reporting thread calls extractUpdate which reads the current sum *AND* sets it to 0. - private final AtomicLong totalMillisInState = new AtomicLong(); - - // The worker that created this state. Used to report lulls back to the worker. - @SuppressWarnings("unused") // Affects a public api - private final StreamingDataflowWorker worker; - - public StreamingModeExecutionState( - NameContext nameContext, - String stateName, - MetricsContainer metricsContainer, - ProfileScope profileScope, - StreamingDataflowWorker worker) { - // TODO: Take in the requesting step name and side input index for streaming. - super(nameContext, stateName, null, null, metricsContainer, profileScope); - this.worker = worker; - } - - /** - * Take sample is only called by the ExecutionStateSampler thread. It is the only place that - * increments totalMillisInState, however the reporting thread periodically calls extractUpdate - * which will read the sum and reset it to 0, so totalMillisInState does have multiple writers. - */ - @Override - public void takeSample(long millisSinceLastSample) { - totalMillisInState.addAndGet(millisSinceLastSample); - } - - /** - * Extract updates in the form of a {@link CounterUpdate}. - * - *
<p>
Non-final updates are extracted periodically and report the physical value as a delta. - * This requires setting the totalMillisInState back to 0. - * - *
<p>
Final updates should never be requested from a Streaming job since the work unit never - * completes. - */ - @Override - public @Nullable CounterUpdate extractUpdate(boolean isFinalUpdate) { - // Streaming reports deltas, so isFinalUpdate doesn't matter, and should never be true. - long sum = totalMillisInState.getAndSet(0); - return sum == 0 ? null : createUpdate(false, sum); - } - } - - /** - * Implementation of DataflowExecutionStateRegistry that creates Streaming versions of - * ExecutionState. - */ - public static class StreamingModeExecutionStateRegistry extends DataflowExecutionStateRegistry { - - private final StreamingDataflowWorker worker; - - public StreamingModeExecutionStateRegistry(StreamingDataflowWorker worker) { - this.worker = worker; - } - - @Override - protected DataflowOperationContext.DataflowExecutionState createState( - NameContext nameContext, - String stateName, - String requestingStepName, - Integer inputIndex, - MetricsContainer container, - ProfileScope profileScope) { - return new StreamingModeExecutionState( - nameContext, stateName, container, profileScope, worker); - } - } - @VisibleForTesting public long getBacklogBytes() { return backlogBytes; @@ -225,20 +149,20 @@ public void start( @Nullable Instant outputDataWatermark, @Nullable Instant synchronizedProcessingTime, WindmillStateReader stateReader, - StateFetcher stateFetcher, + SideInputStateFetcher sideInputStateFetcher, Windmill.WorkItemCommitRequest.Builder outputBuilder) { this.key = key; this.work = work; this.computationKey = WindmillComputationKey.create(computationId, work.getKey(), work.getShardingKey()); - this.stateFetcher = stateFetcher; + this.sideInputStateFetcher = sideInputStateFetcher; this.outputBuilder = outputBuilder; this.sideInputCache.clear(); clearSinkFullHint(); Instant processingTime = Instant.now(); // Ensure that the processing time is greater than any fired processing time - // timers. Otherwise a trigger could ignore the timer and orphan the window. + // timers. Otherwise, a trigger could ignore the timer and orphan the window. for (Windmill.Timer timer : work.getTimers().getTimersList()) { if (timer.getType() == Windmill.Timer.Type.REALTIME) { Instant inferredFiringTime = @@ -288,45 +212,67 @@ protected SideInputReader getSideInputReaderForViews( return StreamingModeSideInputReader.of(views, this); } + @SuppressWarnings("deprecation") + private TupleTag getInternalTag(PCollectionView view) { + return view.getTagInternal(); + } + /** * Fetches the requested sideInput, and maintains a view of the cache that doesn't remove items * until the active work item is finished. * - *
<p>
If the side input was not ready, throws {@code IllegalStateException} if the state is - * {@literal CACHED_IN_WORKITEM} or returns null otherwise. - * - *
<p>
If the side input was ready and null, returns {@literal Optional.absent()}. If the side - * input was ready and non-null returns {@literal Optional.present(...)}. + *
<p>
If the side input was not cached, throws {@code IllegalStateException} if the state is + * {@literal CACHED_IN_WORK_ITEM} or returns {@link SideInput} which contains {@link + * Optional}. */ - private @Nullable Optional fetchSideInput( + private SideInput fetchSideInput( PCollectionView view, BoundedWindow sideInputWindow, - String stateFamily, - StateFetcher.SideInputState state, - Supplier scopedReadStateSupplier) { - Map tagCache = sideInputCache.get(view.getTagInternal()); - if (tagCache == null) { - tagCache = new HashMap<>(); - sideInputCache.put(view.getTagInternal(), tagCache); + @Nullable String stateFamily, + SideInputState state, + @Nullable Supplier scopedReadStateSupplier) { + TupleTag viewInternalTag = getInternalTag(view); + Map> tagCache = + sideInputCache.computeIfAbsent(viewInternalTag, k -> new HashMap<>()); + + @SuppressWarnings("unchecked") + Optional> cachedSideInput = + Optional.ofNullable((SideInput) tagCache.get(sideInputWindow)); + + if (cachedSideInput.isPresent()) { + return cachedSideInput.get(); } - if (tagCache.containsKey(sideInputWindow)) { - @SuppressWarnings("unchecked") - T typed = (T) tagCache.get(sideInputWindow); - return Optional.fromNullable(typed); - } else { - if (state == StateFetcher.SideInputState.CACHED_IN_WORKITEM) { - throw new IllegalStateException( - "Expected side input to be cached. Tag: " + view.getTagInternal().getId()); - } - Optional fetched = - stateFetcher.fetchSideInput( - view, sideInputWindow, stateFamily, state, scopedReadStateSupplier); - if (fetched != null) { - tagCache.put(sideInputWindow, fetched.orNull()); - } - return fetched; + if (state == SideInputState.CACHED_IN_WORK_ITEM) { + throw new IllegalStateException( + "Expected side input to be cached. Tag: " + viewInternalTag.getId()); } + + return fetchSideInputFromWindmill( + view, + sideInputWindow, + Preconditions.checkNotNull(stateFamily), + state, + Preconditions.checkNotNull(scopedReadStateSupplier), + tagCache); + } + + private SideInput fetchSideInputFromWindmill( + PCollectionView view, + BoundedWindow sideInputWindow, + String stateFamily, + SideInputState state, + Supplier scopedReadStateSupplier, + Map> tagCache) { + SideInput fetched = + sideInputStateFetcher.fetchSideInput( + view, sideInputWindow, stateFamily, state, scopedReadStateSupplier); + + if (fetched.isReady()) { + tagCache.put(sideInputWindow, fetched); + } + + return fetched; } public Iterable getSideInputNotifications() { @@ -455,10 +401,13 @@ public Map flushState() { return callbacks; } + String getStateFamily(NameContext nameContext) { + return nameContext.userName() == null ? null : stateNameMap.get(nameContext.userName()); + } + interface StreamingModeStepContext { - boolean issueSideInputFetch( - PCollectionView view, BoundedWindow w, StateFetcher.SideInputState s); + boolean issueSideInputFetch(PCollectionView view, BoundedWindow w, SideInputState s); void addBlockingSideInput(Windmill.GlobalDataRequest blocked); @@ -478,8 +427,80 @@ void writePCollectionViewData( throws IOException; } - String getStateFamily(NameContext nameContext) { - return nameContext.userName() == null ? null : stateNameMap.get(nameContext.userName()); + /** + * Execution states in Streaming are shared between multiple map-task executors. Thus this class + * needs to be thread safe for multiple writers. A single stage could have multiple executors + * running concurrently.
+ */ + public static class StreamingModeExecutionState + extends DataflowOperationContext.DataflowExecutionState { + + // AtomicLong is used because this value is written in two places: + // 1. The sampling thread calls takeSample to increment the time spent in this state + // 2. The reporting thread calls extractUpdate which reads the current sum *AND* sets it to 0. + private final AtomicLong totalMillisInState = new AtomicLong(); + + @SuppressWarnings("unused") + public StreamingModeExecutionState( + NameContext nameContext, + String stateName, + MetricsContainer metricsContainer, + ProfileScope profileScope, + StreamingDataflowWorker worker) { + // TODO: Take in the requesting step name and side input index for streaming. + super(nameContext, stateName, null, null, metricsContainer, profileScope); + } + + /** + * Take sample is only called by the ExecutionStateSampler thread. It is the only place that + * increments totalMillisInState, however the reporting thread periodically calls extractUpdate + * which will read the sum and reset it to 0, so totalMillisInState does have multiple writers. + */ + @Override + public void takeSample(long millisSinceLastSample) { + totalMillisInState.addAndGet(millisSinceLastSample); + } + + /** + * Extract updates in the form of a {@link CounterUpdate}. + * + *
<p>
Non-final updates are extracted periodically and report the physical value as a delta. + * This requires setting the totalMillisInState back to 0. + * + *
<p>
Final updates should never be requested from a Streaming job since the work unit never + * completes. + */ + @Override + public @Nullable CounterUpdate extractUpdate(boolean isFinalUpdate) { + // Streaming reports deltas, so isFinalUpdate doesn't matter, and should never be true. + long sum = totalMillisInState.getAndSet(0); + return sum == 0 ? null : createUpdate(false, sum); + } + } + + /** + * Implementation of DataflowExecutionStateRegistry that creates Streaming versions of + * ExecutionState. + */ + public static class StreamingModeExecutionStateRegistry extends DataflowExecutionStateRegistry { + + private final StreamingDataflowWorker worker; + + public StreamingModeExecutionStateRegistry(StreamingDataflowWorker worker) { + this.worker = worker; + } + + @Override + protected DataflowOperationContext.DataflowExecutionState createState( + NameContext nameContext, + String stateName, + String requestingStepName, + Integer inputIndex, + MetricsContainer container, + ProfileScope profileScope) { + return new StreamingModeExecutionState( + nameContext, stateName, container, profileScope, worker); + } } private static class ScopedReadStateSupplier implements Supplier { @@ -501,15 +522,156 @@ public Closeable get() { } } + /** + * A specialized {@link StepContext} that uses provided {@link StateInternals} and {@link + * TimerInternals} for user state and timers. + */ + private static class UserStepContext extends DataflowStepContext + implements StreamingModeStepContext { + + private final StreamingModeExecutionContext.StepContext wrapped; + + public UserStepContext(StreamingModeExecutionContext.StepContext wrapped) { + super(wrapped.getNameContext()); + this.wrapped = wrapped; + } + + @Override + public boolean issueSideInputFetch(PCollectionView view, BoundedWindow w, SideInputState s) { + return wrapped.issueSideInputFetch(view, w, s); + } + + @Override + public void addBlockingSideInput(GlobalDataRequest blocked) { + wrapped.addBlockingSideInput(blocked); + } + + @Override + public void addBlockingSideInputs(Iterable blocked) { + wrapped.addBlockingSideInputs(blocked); + } + + @Override + public StateInternals stateInternals() { + return wrapped.stateInternals(); + } + + @Override + public Iterable getSideInputNotifications() { + return wrapped.getSideInputNotifications(); + } + + @Override + public void writePCollectionViewData( + TupleTag tag, + Iterable data, + Coder> dataCoder, + W window, + Coder windowCoder) + throws IOException { + throw new IllegalStateException("User DoFns cannot write PCollectionView data"); + } + + @Override + public TimerInternals timerInternals() { + return wrapped.userTimerInternals(); + } + + @Override + public TimerData getNextFiredTimer(Coder windowCoder) { + return wrapped.getNextFiredUserTimer(windowCoder); + } + + @Override + public void setStateCleanupTimer( + String timerId, + W window, + Coder windowCoder, + Instant cleanupTime, + Instant cleanupOutputTimestamp) { + throw new UnsupportedOperationException( + String.format( + "setStateCleanupTimer should not be called on %s, only on a system %s", + getClass().getSimpleName(), + StreamingModeExecutionContext.StepContext.class.getSimpleName())); + } + + @Override + public DataflowStepContext namespacedToUser() { + return this; + } + } + + /** A {@link SideInputReader} that fetches side inputs from the streaming worker's cache. 
*/ + public static class StreamingModeSideInputReader implements SideInputReader { + + private final StreamingModeExecutionContext context; + private final Set> viewSet; + + private StreamingModeSideInputReader( + Iterable> views, StreamingModeExecutionContext context) { + this.context = context; + this.viewSet = ImmutableSet.copyOf(views); + } + + public static StreamingModeSideInputReader of( + Iterable> views, StreamingModeExecutionContext context) { + return new StreamingModeSideInputReader(views, context); + } + + @Override + public T get(PCollectionView view, BoundedWindow window) { + if (!contains(view)) { + throw new RuntimeException("get() called with unknown view"); + } + + // We are only fetching the cached value here, so we don't need stateFamily or + // readStateSupplier. + return context + .fetchSideInput( + view, + window, + null /* unused stateFamily */, + SideInputState.CACHED_IN_WORK_ITEM, + null /* unused readStateSupplier */) + .value() + .orElse(null); + } + + @Override + public boolean contains(PCollectionView view) { + return viewSet.contains(view); + } + + @Override + public boolean isEmpty() { + return viewSet.isEmpty(); + } + } + class StepContext extends DataflowExecutionContext.DataflowStepContext implements StreamingModeStepContext { + private final String stateFamily; + private final Supplier scopedReadStateSupplier; private WindmillStateInternals stateInternals; - private WindmillTimerInternals systemTimerInternals; private WindmillTimerInternals userTimerInternals; - private final String stateFamily; - private final Supplier scopedReadStateSupplier; + // Lazily initialized + private Iterator cachedFiredSystemTimers = null; + // Lazily initialized + private PeekingIterator cachedFiredUserTimers = null; + // An ordered list of any timers that were set or modified by user processing earlier in this + // bundle. + // We use a NavigableSet instead of a priority queue to prevent duplicate elements from ending + // up in the queue. + private NavigableSet modifiedUserEventTimersOrdered = null; + private NavigableSet modifiedUserProcessingTimersOrdered = null; + private NavigableSet modifiedUserSynchronizedProcessingTimersOrdered = null; + // A list of timer keys that were modified by user processing earlier in this bundle. This + // serves as a tombstone, so + // that we know not to fire any bundle timers that were modified. + private Table modifiedUserTimerKeys = null; public StepContext(DataflowOperationContext operationContext) { super(operationContext.nameContext()); @@ -570,14 +732,11 @@ public void flushState() { userTimerInternals.persistTo(outputBuilder); } - // Lazily initialized - private Iterator cachedFiredSystemTimers = null; - @Override public TimerData getNextFiredTimer(Coder windowCoder) { if (cachedFiredSystemTimers == null) { cachedFiredSystemTimers = - FluentIterable.from(StreamingModeExecutionContext.this.getFiredTimers()) + FluentIterable.from(StreamingModeExecutionContext.this.getFiredTimers()) .filter( timer -> WindmillTimerInternals.isSystemTimer(timer) @@ -601,16 +760,6 @@ public TimerData getNextFiredTimer(Coder windowCode return nextTimer; } - // Lazily initialized - private PeekingIterator cachedFiredUserTimers = null; - // An ordered list of any timers that were set or modified by user processing earlier in this - // bundle. - // We use a NavigableSet instead of a priority queue to prevent duplicate elements from ending - // up in the queue.
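The reader above finishes its cached fetch with .value().orElse(null): this change swaps Guava's Optional (orNull/absent) for java.util.Optional wrapped in the new SideInput type. A toy model of the assumed contract, for illustration only (SideInputSketch is not a Beam class; the real SideInput lives in streaming.sideinput):

import java.util.Optional;

final class SideInputSketch<T> {
  private final boolean ready;
  private final T valueOrNull;

  private SideInputSketch(boolean ready, T valueOrNull) {
    this.ready = ready;
    this.valueOrNull = valueOrNull;
  }

  // Fetch issued but Windmill has not returned the data yet.
  static <T> SideInputSketch<T> notReady() {
    return new SideInputSketch<>(false, null);
  }

  // Fetch completed; the side input itself may legitimately be null.
  static <T> SideInputSketch<T> ready(T valueOrNull) {
    return new SideInputSketch<>(true, valueOrNull);
  }

  boolean isReady() {
    return ready;
  }

  // A ready-but-null side input surfaces as Optional.empty(), which is why
  // callers use value().orElse(null) where the old code used orNull().
  Optional<T> value() {
    return Optional.ofNullable(valueOrNull);
  }
}

Under this contract, the isReady() check in issueSideInputFetch above reads naturally: a not-ready result means "block on the global data request and retry", while a ready-but-empty result is a valid, present fetch.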
- private NavigableSet modifiedUserEventTimersOrdered = null; - private NavigableSet modifiedUserProcessingTimersOrdered = null; - private NavigableSet modifiedUserSynchronizedProcessingTimersOrdered = null; - private NavigableSet getModifiedUserTimersOrdered(TimeDomain timeDomain) { switch (timeDomain) { case EVENT_TIME: @@ -624,11 +773,6 @@ private NavigableSet getModifiedUserTimersOrdered(TimeDomain timeDoma } } - // A list of timer keys that were modified by user processing earlier in this bundle. This - // serves a tombstone, so - // that we know not to fire any bundle tiemrs that were moddified. - private Table modifiedUserTimerKeys = null; - private void onUserTimerModified(TimerData timerData) { if (!timerData.getDeleted()) { getModifiedUserTimersOrdered(timerData.getDomain()).add(timerData); @@ -763,10 +907,10 @@ public void writePCollectionViewData( /** Fetch the given side input asynchronously and return true if it is present. */ @Override public boolean issueSideInputFetch( - PCollectionView view, BoundedWindow mainInputWindow, StateFetcher.SideInputState state) { + PCollectionView view, BoundedWindow mainInputWindow, SideInputState state) { BoundedWindow sideInputWindow = view.getWindowMappingFn().getSideInputWindow(mainInputWindow); return fetchSideInput(view, sideInputWindow, stateFamily, state, scopedReadStateSupplier) - != null; + .isReady(); } /** Note that there is data on the current key that is blocked on the given side input. */ @@ -804,131 +948,4 @@ public TimerInternals userTimerInternals() { return checkNotNull(userTimerInternals); } } - - /** - * A specialized {@link StepContext} that uses provided {@link StateInternals} and {@link - * TimerInternals} for user state and timers. - */ - private static class UserStepContext extends DataflowStepContext - implements StreamingModeStepContext { - - private final StreamingModeExecutionContext.StepContext wrapped; - - public UserStepContext(StreamingModeExecutionContext.StepContext wrapped) { - super(wrapped.getNameContext()); - this.wrapped = wrapped; - } - - @Override - public boolean issueSideInputFetch( - PCollectionView view, BoundedWindow w, StateFetcher.SideInputState s) { - return wrapped.issueSideInputFetch(view, w, s); - } - - @Override - public void addBlockingSideInput(GlobalDataRequest blocked) { - wrapped.addBlockingSideInput(blocked); - } - - @Override - public void addBlockingSideInputs(Iterable blocked) { - wrapped.addBlockingSideInputs(blocked); - } - - @Override - public StateInternals stateInternals() { - return wrapped.stateInternals(); - } - - @Override - public Iterable getSideInputNotifications() { - return wrapped.getSideInputNotifications(); - } - - @Override - public void writePCollectionViewData( - TupleTag tag, - Iterable data, - Coder> dataCoder, - W window, - Coder windowCoder) - throws IOException { - throw new IllegalStateException("User DoFns cannot write PCollectionView data"); - } - - @Override - public TimerInternals timerInternals() { - return wrapped.userTimerInternals(); - } - - @Override - public TimerData getNextFiredTimer(Coder windowCoder) { - return wrapped.getNextFiredUserTimer(windowCoder); - } - - @Override - public void setStateCleanupTimer( - String timerId, - W window, - Coder windowCoder, - Instant cleanupTime, - Instant cleanupOutputTimestamp) { - throw new UnsupportedOperationException( - String.format( - "setStateCleanupTimer should not be called on %s, only on a system %s", - getClass().getSimpleName(), - 
StreamingModeExecutionContext.StepContext.class.getSimpleName())); - } - - @Override - public DataflowStepContext namespacedToUser() { - return this; - } - } - - /** A {@link SideInputReader} that fetches side inputs from the streaming worker's cache. */ - public static class StreamingModeSideInputReader implements SideInputReader { - - private StreamingModeExecutionContext context; - private Set> viewSet; - - private StreamingModeSideInputReader( - Iterable> views, StreamingModeExecutionContext context) { - this.context = context; - this.viewSet = ImmutableSet.copyOf(views); - } - - public static StreamingModeSideInputReader of( - Iterable> views, StreamingModeExecutionContext context) { - return new StreamingModeSideInputReader(views, context); - } - - @Override - public T get(PCollectionView view, BoundedWindow window) { - if (!contains(view)) { - throw new RuntimeException("get() called with unknown view"); - } - - // We are only fetching the cached value here, so we don't need stateFamily or - // readStateSupplier. - return context - .fetchSideInput( - view, - window, - null /* unused stateFamily */, - StateFetcher.SideInputState.CACHED_IN_WORKITEM, - null /* unused readStateSupplier */) - .orNull(); - } - - @Override - public boolean contains(PCollectionView view) { - return viewSet.contains(view); - } - - @Override - public boolean isEmpty() { - return viewSet.isEmpty(); - } - } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingSideInputFetcher.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingSideInputFetcher.java index 2b551acd2d8c7..4f585e1c01b60 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingSideInputFetcher.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingSideInputFetcher.java @@ -33,6 +33,7 @@ import org.apache.beam.runners.core.TimerInternals.TimerData; import org.apache.beam.runners.core.TimerInternals.TimerDataCoder; import org.apache.beam.runners.core.TimerInternals.TimerDataCoderV2; +import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputState; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalDataRequest; import org.apache.beam.sdk.coders.AtomicCoder; @@ -135,8 +136,7 @@ public Set getReadyWindows() { W window = entry.getKey(); boolean allSideInputsCached = true; for (PCollectionView view : sideInputViews.values()) { - if (!stepContext.issueSideInputFetch( - view, window, StateFetcher.SideInputState.KNOWN_READY)) { + if (!stepContext.issueSideInputFetch(view, window, SideInputState.KNOWN_READY)) { Windmill.GlobalDataRequest request = buildGlobalDataRequest(view, window); stepContext.addBlockingSideInput(request); windowBlockedSet.add(request); @@ -192,7 +192,7 @@ public boolean storeIfBlocked(WindowedValue elem) { Set blocked = blockedMap().get(window); if (blocked == null) { for (PCollectionView view : sideInputViews.values()) { - if (!stepContext.issueSideInputFetch(view, window, StateFetcher.SideInputState.UNKNOWN)) { + if (!stepContext.issueSideInputFetch(view, window, SideInputState.UNKNOWN)) { if (blocked == null) { blocked = new HashSet<>(); blockedMap().put(window, blocked); @@ -222,7 +222,7 @@ public boolean storeIfBlocked(TimerData timer) { boolean blocked = false; for (PCollectionView view 
: sideInputViews.values()) { - if (!stepContext.issueSideInputFetch(view, window, StateFetcher.SideInputState.UNKNOWN)) { + if (!stepContext.issueSideInputFetch(view, window, SideInputState.UNKNOWN)) { blocked = true; } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainer.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainer.java index 8c5b9c2f2b662..875a2d649ece2 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainer.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainer.java @@ -24,13 +24,17 @@ import javax.annotation.Nonnull; import org.apache.beam.runners.core.metrics.DistributionData; import org.apache.beam.runners.core.metrics.GaugeCell; +import org.apache.beam.runners.core.metrics.HistogramCell; import org.apache.beam.runners.core.metrics.MetricsMap; import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Distribution; import org.apache.beam.sdk.metrics.Gauge; +import org.apache.beam.sdk.metrics.Histogram; import org.apache.beam.sdk.metrics.MetricKey; import org.apache.beam.sdk.metrics.MetricName; import org.apache.beam.sdk.metrics.MetricsContainer; +import org.apache.beam.sdk.util.HistogramData; +import org.apache.beam.sdk.values.KV; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Function; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Predicates; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.FluentIterable; @@ -47,14 +51,22 @@ public class StreamingStepMetricsContainer implements MetricsContainer { private final String stepName; + private static Boolean enablePerWorkerMetrics; + private MetricsMap counters = new MetricsMap<>(DeltaCounterCell::new); + private MetricsMap perWorkerCounters = + new MetricsMap<>(DeltaCounterCell::new); + private MetricsMap gauges = new MetricsMap<>(GaugeCell::new); private MetricsMap distributions = new MetricsMap<>(DeltaDistributionCell::new); + private MetricsMap, HistogramCell> perWorkerHistograms = + new MetricsMap<>(HistogramCell::new); + private StreamingStepMetricsContainer(String stepName) { this.stepName = stepName; } @@ -73,6 +85,15 @@ public Counter getCounter(MetricName metricName) { return counters.get(metricName); } + @Override + public Counter getPerWorkerCounter(MetricName metricName) { + if (enablePerWorkerMetrics) { + return perWorkerCounters.get(metricName); + } else { + return MetricsContainer.super.getPerWorkerCounter(metricName); + } + } + @Override public Distribution getDistribution(MetricName metricName) { return distributions.get(metricName); @@ -83,6 +104,16 @@ public Gauge getGauge(MetricName metricName) { return gauges.get(metricName); } + @Override + public Histogram getPerWorkerHistogram( + MetricName metricName, HistogramData.BucketType bucketType) { + if (enablePerWorkerMetrics) { + return perWorkerHistograms.get(KV.of(metricName, bucketType)); + } else { + return MetricsContainer.super.getPerWorkerHistogram(metricName, bucketType); + } + } + public Iterable extractUpdates() { return counterUpdates().append(distributionUpdates()); } @@ -142,4 +173,8 @@ public static Iterable extractMetricUpdates( .getContainers() .transformAndConcat(StreamingStepMetricsContainer::extractUpdates); } + + public 
static void setEnablePerWorkerMetrics(Boolean enablePerWorkerMetrics) { + StreamingStepMetricsContainer.enablePerWorkerMetrics = enablePerWorkerMetrics; + } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/Weighers.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/Weighers.java index d2231b8b47bbb..eb4e0f4885a7c 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/Weighers.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/Weighers.java @@ -28,7 +28,7 @@ *
<p>
Package-private here so that the dependency on Guava does not leak into the public API * surface. */ -class Weighers { +public class Weighers { public static Weigher fixedWeightKeys(final int keyWeight) { return (key, value) -> (int) Math.min(keyWeight + value.getWeight(), Integer.MAX_VALUE); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WindmillStateInternals.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WindmillStateInternals.java deleted file mode 100644 index d4edc0afc0b10..0000000000000 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WindmillStateInternals.java +++ /dev/null @@ -1,2830 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.dataflow.worker; - -import com.google.auto.value.AutoValue; -import java.io.Closeable; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.io.OutputStreamWriter; -import java.nio.charset.StandardCharsets; -import java.util.AbstractMap; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Objects; -import java.util.Random; -import java.util.Set; -import java.util.SortedSet; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.Future; -import java.util.function.BiConsumer; -import java.util.function.Function; -import java.util.stream.Collectors; -import javax.annotation.concurrent.NotThreadSafe; -import org.apache.beam.repackaged.core.org.apache.commons.lang3.tuple.Triple; -import org.apache.beam.runners.core.StateInternals; -import org.apache.beam.runners.core.StateNamespace; -import org.apache.beam.runners.core.StateTable; -import org.apache.beam.runners.core.StateTag; -import org.apache.beam.runners.core.StateTag.StateBinder; -import org.apache.beam.runners.core.StateTags; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.SortedListEntry; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.SortedListRange; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.TagSortedListDeleteRequest; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.TagSortedListInsertRequest; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.TagSortedListUpdateRequest; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItemCommitRequest; -import org.apache.beam.sdk.coders.BooleanCoder; -import 
org.apache.beam.sdk.coders.Coder; -import org.apache.beam.sdk.coders.Coder.Context; -import org.apache.beam.sdk.coders.CoderException; -import org.apache.beam.sdk.coders.CustomCoder; -import org.apache.beam.sdk.coders.InstantCoder; -import org.apache.beam.sdk.coders.MapCoder; -import org.apache.beam.sdk.coders.NullableCoder; -import org.apache.beam.sdk.coders.SetCoder; -import org.apache.beam.sdk.coders.StructuredCoder; -import org.apache.beam.sdk.coders.VarLongCoder; -import org.apache.beam.sdk.state.BagState; -import org.apache.beam.sdk.state.CombiningState; -import org.apache.beam.sdk.state.MapState; -import org.apache.beam.sdk.state.MultimapState; -import org.apache.beam.sdk.state.OrderedListState; -import org.apache.beam.sdk.state.ReadableState; -import org.apache.beam.sdk.state.ReadableStates; -import org.apache.beam.sdk.state.SetState; -import org.apache.beam.sdk.state.State; -import org.apache.beam.sdk.state.StateContext; -import org.apache.beam.sdk.state.StateContexts; -import org.apache.beam.sdk.state.ValueState; -import org.apache.beam.sdk.state.WatermarkHoldState; -import org.apache.beam.sdk.transforms.Combine.CombineFn; -import org.apache.beam.sdk.transforms.CombineWithContext.CombineFnWithContext; -import org.apache.beam.sdk.transforms.windowing.TimestampCombiner; -import org.apache.beam.sdk.util.ByteStringOutputStream; -import org.apache.beam.sdk.util.CombineFnUtil; -import org.apache.beam.sdk.util.Weighted; -import org.apache.beam.sdk.values.TimestampedValue; -import org.apache.beam.vendor.grpc.v1p54p0.com.google.protobuf.ByteString; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Optional; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.BoundType; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterators; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Range; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.RangeSet; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Sets; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.TreeRangeSet; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.Futures; -import org.checkerframework.checker.initialization.qual.Initialized; -import org.checkerframework.checker.nullness.qual.NonNull; -import org.checkerframework.checker.nullness.qual.Nullable; -import org.checkerframework.checker.nullness.qual.UnknownKeyFor; -import org.joda.time.Duration; -import org.joda.time.Instant; - -/** Implementation of {@link StateInternals} using Windmill to manage the underlying data. 
*/ -@SuppressWarnings({ - "rawtypes", // TODO(https://github.com/apache/beam/issues/20447) - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) -class WindmillStateInternals implements StateInternals { - - /** - * The key will be null when not in a keyed context, from the users perspective. There is still a - * "key" for the Windmill computation, but it cannot be meaningfully deserialized. - */ - private final @Nullable K key; - - @Override - public @Nullable K getKey() { - return key; - } - - private static class CachingStateTable extends StateTable { - private final String stateFamily; - private final WindmillStateReader reader; - private final WindmillStateCache.ForKeyAndFamily cache; - private final boolean isSystemTable; - boolean isNewKey; - private final Supplier scopedReadStateSupplier; - private final StateTable derivedStateTable; - - public CachingStateTable( - @Nullable K key, - String stateFamily, - WindmillStateReader reader, - WindmillStateCache.ForKeyAndFamily cache, - boolean isSystemTable, - boolean isNewKey, - Supplier scopedReadStateSupplier, - StateTable derivedStateTable) { - this.stateFamily = stateFamily; - this.reader = reader; - this.cache = cache; - this.isSystemTable = isSystemTable; - this.isNewKey = isNewKey; - this.scopedReadStateSupplier = scopedReadStateSupplier; - this.derivedStateTable = derivedStateTable != null ? derivedStateTable : this; - } - - @Override - protected StateBinder binderForNamespace( - final StateNamespace namespace, final StateContext c) { - // Look up state objects in the cache or create new ones if not found. The state will - // be added to the cache in persist(). - return new StateBinder() { - @Override - public BagState bindBag(StateTag> address, Coder elemCoder) { - if (isSystemTable) { - address = StateTags.makeSystemTagInternal(address); - } - WindmillBag result = (WindmillBag) cache.get(namespace, address); - if (result == null) { - result = new WindmillBag<>(namespace, address, stateFamily, elemCoder, isNewKey); - } - result.initializeForWorkItem(reader, scopedReadStateSupplier); - return result; - } - - @Override - public SetState bindSet(StateTag> spec, Coder elemCoder) { - WindmillSet result = - new WindmillSet(namespace, spec, stateFamily, elemCoder, cache, isNewKey); - result.initializeForWorkItem(reader, scopedReadStateSupplier); - return result; - } - - @Override - public MapState bindMap( - StateTag> spec, Coder keyCoder, Coder valueCoder) { - WindmillMap result = (WindmillMap) cache.get(namespace, spec); - if (result == null) { - result = - new WindmillMap( - namespace, spec, stateFamily, keyCoder, valueCoder, isNewKey); - } - result.initializeForWorkItem(reader, scopedReadStateSupplier); - return result; - } - - @Override - public MultimapState bindMultimap( - StateTag> spec, - Coder keyCoder, - Coder valueCoder) { - WindmillMultimap result = - (WindmillMultimap) cache.get(namespace, spec); - if (result == null) { - result = - new WindmillMultimap<>( - namespace, spec, stateFamily, keyCoder, valueCoder, isNewKey); - } - result.initializeForWorkItem(reader, scopedReadStateSupplier); - return result; - } - - @Override - public OrderedListState bindOrderedList( - StateTag> spec, Coder elemCoder) { - if (isSystemTable) { - spec = StateTags.makeSystemTagInternal(spec); - } - WindmillOrderedList result = (WindmillOrderedList) cache.get(namespace, spec); - if (result == null) { - result = - new WindmillOrderedList<>( - derivedStateTable, namespace, spec, stateFamily, elemCoder, isNewKey); - } - 
result.initializeForWorkItem(reader, scopedReadStateSupplier); - return result; - } - - @Override - public WatermarkHoldState bindWatermark( - StateTag address, TimestampCombiner timestampCombiner) { - if (isSystemTable) { - address = StateTags.makeSystemTagInternal(address); - } - WindmillWatermarkHold result = (WindmillWatermarkHold) cache.get(namespace, address); - if (result == null) { - result = - new WindmillWatermarkHold( - namespace, address, stateFamily, timestampCombiner, isNewKey); - } - result.initializeForWorkItem(reader, scopedReadStateSupplier); - return result; - } - - @Override - public CombiningState bindCombiningValue( - StateTag> address, - Coder accumCoder, - CombineFn combineFn) { - if (isSystemTable) { - address = StateTags.makeSystemTagInternal(address); - } - WindmillCombiningState result = - new WindmillCombiningState<>( - namespace, address, stateFamily, accumCoder, combineFn, cache, isNewKey); - result.initializeForWorkItem(reader, scopedReadStateSupplier); - return result; - } - - @Override - public - CombiningState bindCombiningValueWithContext( - StateTag> address, - Coder accumCoder, - CombineFnWithContext combineFn) { - if (isSystemTable) { - address = StateTags.makeSystemTagInternal(address); - } - return bindCombiningValue(address, accumCoder, CombineFnUtil.bindContext(combineFn, c)); - } - - @Override - public ValueState bindValue(StateTag> address, Coder coder) { - if (isSystemTable) { - address = StateTags.makeSystemTagInternal(address); - } - WindmillValue result = (WindmillValue) cache.get(namespace, address); - if (result == null) { - result = new WindmillValue<>(namespace, address, stateFamily, coder, isNewKey); - } - result.initializeForWorkItem(reader, scopedReadStateSupplier); - return result; - } - }; - } - } - - private WindmillStateCache.ForKeyAndFamily cache; - Supplier scopedReadStateSupplier; - private StateTable workItemState; - private StateTable workItemDerivedState; - - public WindmillStateInternals( - @Nullable K key, - String stateFamily, - WindmillStateReader reader, - boolean isNewKey, - WindmillStateCache.ForKeyAndFamily cache, - Supplier scopedReadStateSupplier) { - this.key = key; - this.cache = cache; - this.scopedReadStateSupplier = scopedReadStateSupplier; - this.workItemDerivedState = - new CachingStateTable<>( - key, stateFamily, reader, cache, true, isNewKey, scopedReadStateSupplier, null); - this.workItemState = - new CachingStateTable<>( - key, - stateFamily, - reader, - cache, - false, - isNewKey, - scopedReadStateSupplier, - workItemDerivedState); - } - - private void persist(List> commitsToMerge, StateTable stateTable) { - for (State location : stateTable.values()) { - if (!(location instanceof WindmillState)) { - throw new IllegalStateException( - String.format( - "%s wasn't created by %s -- unable to persist it", - location.getClass().getSimpleName(), getClass().getSimpleName())); - } - - try { - commitsToMerge.add(((WindmillState) location).persist(cache)); - } catch (IOException e) { - throw new RuntimeException("Unable to persist state", e); - } - } - - // All cached State objects now have known values. - // Clear any references to the underlying reader to prevent space leaks. - // The next work unit to use these cached State objects will reset the - // reader to a current reader in case those values are modified. - for (State location : stateTable.values()) { - ((WindmillState) location).cleanupAfterWorkItem(); - } - - // Clear out the map of already retrieved state instances. 
- stateTable.clear(); - } - - public void persist(final Windmill.WorkItemCommitRequest.Builder commitBuilder) { - List> commitsToMerge = new ArrayList<>(); - - // Call persist on each first, which may schedule some futures for reading. - persist(commitsToMerge, workItemState); - persist(commitsToMerge, workItemDerivedState); - - try (Closeable scope = scopedReadStateSupplier.get()) { - for (Future commitFuture : commitsToMerge) { - commitBuilder.mergeFrom(commitFuture.get()); - } - } catch (ExecutionException | InterruptedException | IOException exc) { - if (exc instanceof InterruptedException) { - Thread.currentThread().interrupt(); - } - throw new RuntimeException("Failed to retrieve Windmill state during persist()", exc); - } - - cache.persist(); - } - - /** Encodes the given namespace and address as {@code <namespace>+<address>}. */ - @VisibleForTesting - static ByteString encodeKey(StateNamespace namespace, StateTag address) { - try { - // Use ByteStringOutputStream rather than concatenation and String.format. We build these keys - // a lot, and this leads to better performance results. See associated benchmarks. - ByteStringOutputStream stream = new ByteStringOutputStream(); - OutputStreamWriter writer = new OutputStreamWriter(stream, StandardCharsets.UTF_8); - - // stringKey starts and ends with a slash. We separate it from the - // StateTag ID by a '+' (which is guaranteed not to be in the stringKey) because the - // ID comes from the user. - namespace.appendTo(writer); - writer.write('+'); - address.appendTo(writer); - writer.flush(); - return stream.toByteString(); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - /** - * Abstract base class for all Windmill state. - * - *
<p>
Note that these are not thread safe; each state object is associated with a key and thus - * only accessed by a single thread at once. - */ - @NotThreadSafe - private abstract static class WindmillState { - protected Supplier scopedReadStateSupplier; - protected WindmillStateReader reader; - - /** - * Return an asynchronously computed {@link WorkItemCommitRequest}. The request should be of a - * form that can be merged with others (only add to repeated fields). - */ - abstract Future persist(WindmillStateCache.ForKeyAndFamily cache) - throws IOException; - - /** - * Prepare this (possibly reused from cache) state for reading from {@code reader} if needed. - */ - void initializeForWorkItem( - WindmillStateReader reader, Supplier scopedReadStateSupplier) { - this.reader = reader; - this.scopedReadStateSupplier = scopedReadStateSupplier; - } - - /** - * This (now cached) state should never need to interact with the reader until the next work - * item. Clear it to prevent space leaks. The reader will be reset by {@link - * #initializeForWorkItem} upon the next work item. - */ - void cleanupAfterWorkItem() { - this.reader = null; - this.scopedReadStateSupplier = null; - } - - Closeable scopedReadState() { - return scopedReadStateSupplier.get(); - } - } - - /** - * Base class for implementations of {@link WindmillState} where the {@link #persist} call does - * not require any asynchronous reading. - */ - private abstract static class SimpleWindmillState extends WindmillState { - @Override - public final Future persist(WindmillStateCache.ForKeyAndFamily cache) - throws IOException { - return Futures.immediateFuture(persistDirectly(cache)); - } - - /** - * Returns a {@link WorkItemCommitRequest} that can be used to persist this state to Windmill. - */ - protected abstract WorkItemCommitRequest persistDirectly( - WindmillStateCache.ForKeyAndFamily cache) throws IOException; - } - - @Override - public T state(StateNamespace namespace, StateTag address) { - return workItemState.get(namespace, address, StateContexts.nullContext()); - } - - @Override - public T state( - StateNamespace namespace, StateTag address, StateContext c) { - return workItemState.get(namespace, address, c); - } - - private static class WindmillValue extends SimpleWindmillState implements ValueState { - private final StateNamespace namespace; - private final StateTag> address; - private final ByteString stateKey; - private final String stateFamily; - private final Coder coder; - - /** Whether we've modified the value since creation of this state. */ - private boolean modified = false; - /** Whether the in memory value is the true value. 
*/ - private boolean valueIsKnown = false; - /** The size of the encoded value */ - private long cachedSize = -1; - - private T value; - - private WindmillValue( - StateNamespace namespace, - StateTag> address, - String stateFamily, - Coder coder, - boolean isNewKey) { - this.namespace = namespace; - this.address = address; - this.stateKey = encodeKey(namespace, address); - this.stateFamily = stateFamily; - this.coder = coder; - if (isNewKey) { - this.valueIsKnown = true; - this.value = null; - } - } - - @Override - public void clear() { - modified = true; - valueIsKnown = true; - value = null; - } - - @Override - @SuppressWarnings("FutureReturnValueIgnored") - public WindmillValue readLater() { - getFuture(); - return this; - } - - @Override - public T read() { - try (Closeable scope = scopedReadState()) { - if (!valueIsKnown) { - cachedSize = -1; - } - value = getFuture().get(); - valueIsKnown = true; - return value; - } catch (InterruptedException | ExecutionException | IOException e) { - if (e instanceof InterruptedException) { - Thread.currentThread().interrupt(); - } - throw new RuntimeException("Unable to read value from state", e); - } - } - - @Override - public void write(T value) { - modified = true; - valueIsKnown = true; - cachedSize = -1; - this.value = value; - } - - @Override - protected WorkItemCommitRequest persistDirectly(WindmillStateCache.ForKeyAndFamily cache) - throws IOException { - if (!valueIsKnown) { - // The value was never read, written or cleared. - // Thus nothing to update in Windmill. - // And no need to add to global cache. - return WorkItemCommitRequest.newBuilder().buildPartial(); - } - - ByteString encoded = null; - if (cachedSize == -1 || modified) { - ByteStringOutputStream stream = new ByteStringOutputStream(); - if (value != null) { - coder.encode(value, stream, Coder.Context.OUTER); - } - encoded = stream.toByteString(); - cachedSize = encoded.size(); - } - - // Place in cache to avoid a future read. - cache.put(namespace, address, this, cachedSize); - - if (!modified) { - // The value was read, but never written or cleared. - // But nothing to update in Windmill. - return WorkItemCommitRequest.newBuilder().buildPartial(); - } - - // The value was written or cleared. Commit that change to Windmill. - modified = false; - WorkItemCommitRequest.Builder commitBuilder = WorkItemCommitRequest.newBuilder(); - commitBuilder - .addValueUpdatesBuilder() - .setTag(stateKey) - .setStateFamily(stateFamily) - .getValueBuilder() - .setData(encoded) - .setTimestamp(Long.MAX_VALUE); - return commitBuilder.buildPartial(); - } - - private Future getFuture() { - // WindmillStateReader guarantees that we can ask for a future for a particular tag multiple - // times and it will efficiently be reused. - return valueIsKnown - ? Futures.immediateFuture(value) - : reader.valueFuture(stateKey, stateFamily, coder); - } - } - - // Coder for closed-open ranges. 
- private static class RangeCoder extends StructuredCoder> { - private Coder boundCoder; - - RangeCoder(Coder boundCoder) { - this.boundCoder = NullableCoder.of(boundCoder); - } - - @Override - public List> getCoderArguments() { - return Lists.newArrayList(boundCoder); - } - - @Override - public void verifyDeterministic() throws NonDeterministicException { - boundCoder.verifyDeterministic(); - ; - } - - @Override - public void encode(Range value, OutputStream outStream) throws CoderException, IOException { - Preconditions.checkState( - value.lowerBoundType().equals(BoundType.CLOSED), "unexpected range " + value); - Preconditions.checkState( - value.upperBoundType().equals(BoundType.OPEN), "unexpected range " + value); - boundCoder.encode(value.hasLowerBound() ? value.lowerEndpoint() : null, outStream); - boundCoder.encode(value.hasUpperBound() ? value.upperEndpoint() : null, outStream); - } - - @Override - public Range decode(InputStream inStream) throws CoderException, IOException { - @Nullable T lower = boundCoder.decode(inStream); - @Nullable T upper = boundCoder.decode(inStream); - if (lower == null) { - return upper != null ? Range.lessThan(upper) : Range.all(); - } else if (upper == null) { - return Range.atLeast(lower); - } else { - return Range.closedOpen(lower, upper); - } - } - } - - private static class RangeSetCoder extends CustomCoder> { - private SetCoder> rangesCoder; - - RangeSetCoder(Coder boundCoder) { - this.rangesCoder = SetCoder.of(new RangeCoder<>(boundCoder)); - } - - @Override - public void encode(RangeSet value, OutputStream outStream) throws IOException { - rangesCoder.encode(value.asRanges(), outStream); - } - - @Override - public RangeSet decode(InputStream inStream) throws CoderException, IOException { - return TreeRangeSet.create(rangesCoder.decode(inStream)); - } - } - - /** - * Tracker for the ids used in an ordered list. - * - *
<p>
Windmill accepts an int64 id for each timestamped-element in the list. Unique elements are - * identified by the pair of timestamp and id. This means that tow unique elements e1, e2 must - * have different (ts1, id1), (ts2, id2) pairs. To accomplish this we bucket time into five-minute - * buckets, and store a free list of ids available for each bucket. - * - *
<p>
When a timestamp range is deleted, we remove id tracking for elements in that range. In - * order to handle the case where a range is deleted piecemeal, we track sub-range deletions for - * each range. For example: - * - *
<p>
12:00 - 12:05 ids 12:05 - 12:10 ids - * - *
<p>
delete 12:00-12:06 - * - *
<p>
12:00 - 12:05 *removed* 12:05 - 12:10 ids subranges deleted 12:05-12:06 - * - *
<p>
delete 12:06 - 12:07 - * - *
<p>
12:05 - 12:10 ids subranges deleted 12:05-12:07 - * - *
<p>
delete 12:07 - 12:10 - * - *
<p>
12:05 - 12:10 *removed* - */ - static final class IdTracker { - static final String IDS_AVAILABLE_STR = "IdsAvailable"; - static final String DELETIONS_STR = "Deletions"; - - // Note that this previously was Long.MIN_VALUE but ids are unsigned when - // sending to windmill for Streaming Engine. For updated appliance - // pipelines with existing state, there may be negative ids. - static final long NEW_RANGE_MIN_ID = 0; - static final long NEW_RANGE_MAX_ID = Long.MAX_VALUE; - - // We track ids on five-minute boundaries. - private static final Duration RESOLUTION = Duration.standardMinutes(5); - static final MapCoder, RangeSet> IDS_AVAILABLE_CODER = - MapCoder.of(new RangeCoder<>(InstantCoder.of()), new RangeSetCoder<>(VarLongCoder.of())); - static final MapCoder, RangeSet> SUBRANGE_DELETIONS_CODER = - MapCoder.of(new RangeCoder<>(InstantCoder.of()), new RangeSetCoder<>(InstantCoder.of())); - private final StateTag, RangeSet>>> idsAvailableTag; - // A map from five-minute ranges to the set of ids available in that interval. - final ValueState, RangeSet>> idsAvailableValue; - private final StateTag, RangeSet>>> subRangeDeletionsTag; - // If a timestamp-range in the map has been partially cleared, the cleared intervals are stored - // here. - final ValueState, RangeSet>> subRangeDeletionsValue; - - IdTracker( - StateTable stateTable, - StateNamespace namespace, - StateTag spec, - String stateFamily, - boolean complete) { - this.idsAvailableTag = - StateTags.makeSystemTagInternal( - StateTags.value(spec.getId() + IDS_AVAILABLE_STR, IDS_AVAILABLE_CODER)); - this.idsAvailableValue = - stateTable.get(namespace, idsAvailableTag, StateContexts.nullContext()); - this.subRangeDeletionsTag = - StateTags.makeSystemTagInternal( - StateTags.value(spec.getId() + DELETIONS_STR, SUBRANGE_DELETIONS_CODER)); - this.subRangeDeletionsValue = - stateTable.get(namespace, subRangeDeletionsTag, StateContexts.nullContext()); - } - - static > - Map, RangeSet> newSortedRangeMap(Class valueClass) { - return Maps.newTreeMap( - Comparator., Instant>comparing(Range::lowerEndpoint) - .thenComparing(Range::upperEndpoint)); - } - - private Range getTrackedRange(Instant ts) { - Instant snapped = - new Instant(ts.getMillis() - ts.plus(RESOLUTION).getMillis() % RESOLUTION.getMillis()); - return Range.closedOpen(snapped, snapped.plus(RESOLUTION)); - } - - @SuppressWarnings("FutureReturnValueIgnored") - void readLater() { - idsAvailableValue.readLater(); - subRangeDeletionsValue.readLater(); - } - - Map, RangeSet> readIdsAvailable() { - Map, RangeSet> idsAvailable = idsAvailableValue.read(); - return idsAvailable != null ? idsAvailable : newSortedRangeMap(Long.class); - } - - Map, RangeSet> readSubRangeDeletions() { - Map, RangeSet> subRangeDeletions = subRangeDeletionsValue.read(); - return subRangeDeletions != null ? 
subRangeDeletions : newSortedRangeMap(Instant.class); - } - - void clear() throws ExecutionException, InterruptedException { - idsAvailableValue.clear(); - subRangeDeletionsValue.clear(); - } - - void add( - SortedSet> elements, BiConsumer, Long> output) - throws ExecutionException, InterruptedException { - Range currentIdRange = null; - long currentId = 0; - - Range currentTsRange = null; - RangeSet currentTsRangeDeletions = null; - - Map, RangeSet> idsAvailable = readIdsAvailable(); - Map, RangeSet> subRangeDeletions = readSubRangeDeletions(); - - RangeSet availableIdsForTsRange = null; - Iterator> idRangeIter = null; - RangeSet idsUsed = TreeRangeSet.create(); - for (TimestampedValueWithId pendingAdd : elements) { - // Since elements are in increasing ts order, often we'll be able to reuse the previous - // iteration's range. - if (currentTsRange == null - || !currentTsRange.contains(pendingAdd.getValue().getTimestamp())) { - if (availableIdsForTsRange != null) { - // We're moving onto a new ts range. Remove all used ids - availableIdsForTsRange.removeAll(idsUsed); - idsUsed = TreeRangeSet.create(); - } - - // Lookup the range for the current timestamp. - currentTsRange = getTrackedRange(pendingAdd.getValue().getTimestamp()); - // Lookup available ids for this timestamp range. If nothing there, we default to all ids - // available. - availableIdsForTsRange = - idsAvailable.computeIfAbsent( - currentTsRange, - r -> - TreeRangeSet.create( - ImmutableList.of(Range.closedOpen(NEW_RANGE_MIN_ID, NEW_RANGE_MAX_ID)))); - idRangeIter = availableIdsForTsRange.asRanges().iterator(); - currentIdRange = null; - currentTsRangeDeletions = subRangeDeletions.get(currentTsRange); - } - - if (currentIdRange == null || currentId >= currentIdRange.upperEndpoint()) { - // Move to the next range of free ids, and start assigning ranges from there. - currentIdRange = idRangeIter.next(); - currentId = currentIdRange.lowerEndpoint(); - } - - if (currentTsRangeDeletions != null) { - currentTsRangeDeletions.remove( - Range.closedOpen( - pendingAdd.getValue().getTimestamp(), - pendingAdd.getValue().getTimestamp().plus(Duration.millis(1)))); - } - idsUsed.add(Range.closedOpen(currentId, currentId + 1)); - output.accept(pendingAdd.getValue(), currentId++); - } - if (availableIdsForTsRange != null) { - availableIdsForTsRange.removeAll(idsUsed); - } - writeValues(idsAvailable, subRangeDeletions); - } - - // Remove a timestamp range. Returns ids freed up. - void remove(Range tsRange) throws ExecutionException, InterruptedException { - Map, RangeSet> idsAvailable = readIdsAvailable(); - Map, RangeSet> subRangeDeletions = readSubRangeDeletions(); - - for (Range current = getTrackedRange(tsRange.lowerEndpoint()); - current.lowerEndpoint().isBefore(tsRange.upperEndpoint()); - current = getTrackedRange(current.lowerEndpoint().plus(RESOLUTION))) { - // TODO(reuvenlax): shouldn't need to iterate over all ranges. - boolean rangeCleared; - if (!tsRange.encloses(current)) { - // This can happen if the beginning or the end of tsRange doesn't fall on a RESOLUTION - // boundary. Since we - // are deleting a portion of a tracked range, track what we are deleting. - RangeSet rangeDeletions = - subRangeDeletions.computeIfAbsent(current, r -> TreeRangeSet.create()); - rangeDeletions.add(tsRange.intersection(current)); - // If we ended up deleting the whole range, than we can simply remove it from the tracking - // map. 
-          rangeCleared = rangeDeletions.encloses(current);
-        } else {
-          rangeCleared = true;
-        }
-        if (rangeCleared) {
-          // Remove the range from both maps.
-          idsAvailable.remove(current);
-          subRangeDeletions.remove(current);
-        }
-      }
-      writeValues(idsAvailable, subRangeDeletions);
-    }
-
-    private void writeValues(
-        Map<Range<Instant>, RangeSet<Long>> idsAvailable,
-        Map<Range<Instant>, RangeSet<Instant>> subRangeDeletions) {
-      if (idsAvailable.isEmpty()) {
-        idsAvailableValue.clear();
-      } else {
-        idsAvailableValue.write(idsAvailable);
-      }
-      if (subRangeDeletions.isEmpty()) {
-        subRangeDeletionsValue.clear();
-      } else {
-        subRangeDeletionsValue.write(subRangeDeletions);
-      }
-    }
-  }
-
-  @AutoValue
-  abstract static class TimestampedValueWithId<T> {
-    private static final Comparator<TimestampedValueWithId<?>> COMPARATOR =
-        Comparator.<TimestampedValueWithId<?>, Instant>comparing(v -> v.getValue().getTimestamp())
-            .thenComparingLong(TimestampedValueWithId::getId);
-
-    abstract TimestampedValue<T> getValue();
-
-    abstract long getId();
-
-    static <T> TimestampedValueWithId<T> of(TimestampedValue<T> value, long id) {
-      return new AutoValue_WindmillStateInternals_TimestampedValueWithId<>(value, id);
-    }
-
-    static <T> TimestampedValueWithId<T> bound(Instant ts) {
-      return of(TimestampedValue.of(null, ts), Long.MIN_VALUE);
-    }
-  }
-
-  static class WindmillOrderedList<T> extends SimpleWindmillState implements OrderedListState<T> {
-    private final ByteString stateKey;
-    private final String stateFamily;
-    private final Coder<T> elemCoder;
-    private boolean complete;
-    private boolean cleared = false;
-    // We need to sort based on timestamp, but we need objects with the same timestamp to be
-    // treated as unique. We can't use a MultiSet as we can't construct a comparator that
-    // uniquely identifies objects, so we construct unique in-memory long ids for each element.
-    private SortedSet<TimestampedValueWithId<T>> pendingAdds =
-        Sets.newTreeSet(TimestampedValueWithId.COMPARATOR);
-
-    private RangeSet<Instant> pendingDeletes = TreeRangeSet.create();
-    private IdTracker idTracker;
-
-    // The default proto values for SortedListRange correspond to the minimum and maximum
-    // timestamps.
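-    // (An unbounded readRange(null, null) therefore maps to the sort-key interval
-    // [MIN_TS_MICROS, MAX_TS_MICROS) in getFuture() below.)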
- static final long MIN_TS_MICROS = SortedListRange.getDefaultInstance().getStart(); - static final long MAX_TS_MICROS = SortedListRange.getDefaultInstance().getLimit(); - - private WindmillOrderedList( - StateTable derivedStateTable, - StateNamespace namespace, - StateTag> spec, - String stateFamily, - Coder elemCoder, - boolean isNewKey) { - - this.stateKey = encodeKey(namespace, spec); - this.stateFamily = stateFamily; - this.elemCoder = elemCoder; - this.complete = isNewKey; - this.idTracker = new IdTracker(derivedStateTable, namespace, spec, stateFamily, complete); - } - - @Override - public Iterable> read() { - return readRange(null, null); - } - - private SortedSet> getPendingAddRange( - @Nullable Instant minTimestamp, @Nullable Instant limitTimestamp) { - SortedSet> pendingInRange = pendingAdds; - if (minTimestamp != null && limitTimestamp != null) { - pendingInRange = - pendingInRange.subSet( - TimestampedValueWithId.bound(minTimestamp), - TimestampedValueWithId.bound(limitTimestamp)); - } else if (minTimestamp == null && limitTimestamp != null) { - pendingInRange = pendingInRange.headSet(TimestampedValueWithId.bound(limitTimestamp)); - } else if (limitTimestamp == null && minTimestamp != null) { - pendingInRange = pendingInRange.tailSet(TimestampedValueWithId.bound(minTimestamp)); - } - return pendingInRange; - } - - @Override - public Iterable> readRange( - @Nullable Instant minTimestamp, @Nullable Instant limitTimestamp) { - idTracker.readLater(); - - final Future>> future = getFuture(minTimestamp, limitTimestamp); - try (Closeable scope = scopedReadState()) { - SortedSet> pendingInRange = - getPendingAddRange(minTimestamp, limitTimestamp); - - // Transform the return iterator so it has the same type as pendingAdds. We need to ensure - // that the ids don't overlap with any in pendingAdds, so begin with pendingAdds.size(). - Iterable> data = - new Iterable>() { - // Anything returned from windmill that has been deleted should be ignored. - private Iterable> iterable = - Iterables.filter(future.get(), tv -> !pendingDeletes.contains(tv.getTimestamp())); - - @Override - public Iterator> iterator() { - return new Iterator>() { - private Iterator> iter = iterable.iterator(); - private long currentId = pendingAdds.size(); - - @Override - public boolean hasNext() { - return iter.hasNext(); - } - - @Override - public TimestampedValueWithId next() { - return TimestampedValueWithId.of(iter.next(), currentId++); - } - }; - } - }; - - Iterable> includingAdds = - Iterables.mergeSorted( - ImmutableList.of(data, pendingInRange), TimestampedValueWithId.COMPARATOR); - Iterable> fullIterable = - Iterables.transform(includingAdds, TimestampedValueWithId::getValue); - - // TODO(reuvenlax): If we have a known bounded amount of data, cache known ranges. 
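-      // Illustrative example (hypothetical values): with pendingAdds = [("a", 12:01, id=0)] and
-      // Windmill returning [("b", 12:00)], the merge yields "b"@12:00 (assigned id=1, i.e.
-      // pendingAdds.size() + 0) followed by "a"@12:01; the ids only break timestamp ties.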
- return fullIterable; - } catch (InterruptedException | ExecutionException | IOException e) { - if (e instanceof InterruptedException) { - Thread.currentThread().interrupt(); - } - throw new RuntimeException("Unable to read state", e); - } - } - - @Override - public void clear() { - cleared = true; - complete = true; - pendingAdds.clear(); - pendingDeletes.clear(); - try { - idTracker.clear(); - } catch (ExecutionException | InterruptedException e) { - throw new RuntimeException(e); - } - } - - @Override - public void clearRange(Instant minTimestamp, Instant limitTimestamp) { - getPendingAddRange(minTimestamp, limitTimestamp).clear(); - pendingDeletes.add(Range.closedOpen(minTimestamp, limitTimestamp)); - } - - @Override - public void add(TimestampedValue value) { - // We use the current size of the container as the in-memory id. This works because - // pendingAdds is completely - // cleared when it is processed (otherwise we could end up with duplicate elements in the same - // container). These - // are not the ids that will be sent to windmill. - pendingAdds.add(TimestampedValueWithId.of(value, pendingAdds.size())); - // Leave pendingDeletes alone. Since we can have multiple values with the same timestamp, we - // may still need - // overlapping deletes to remove previous entries at this timestamp. - } - - @Override - public ReadableState isEmpty() { - return new ReadableState() { - @Override - public ReadableState readLater() { - WindmillOrderedList.this.readLater(); - return this; - } - - @Override - public Boolean read() { - return Iterables.isEmpty(WindmillOrderedList.this.read()); - } - }; - } - - @Override - public OrderedListState readLater() { - return readRangeLater(null, null); - } - - @Override - @SuppressWarnings("FutureReturnValueIgnored") - public OrderedListState readRangeLater( - @Nullable Instant minTimestamp, @Nullable Instant limitTimestamp) { - idTracker.readLater(); - getFuture(minTimestamp, limitTimestamp); - return this; - } - - @Override - public WorkItemCommitRequest persistDirectly(WindmillStateCache.ForKeyAndFamily cache) - throws IOException { - WorkItemCommitRequest.Builder commitBuilder = WorkItemCommitRequest.newBuilder(); - TagSortedListUpdateRequest.Builder updatesBuilder = - commitBuilder - .addSortedListUpdatesBuilder() - .setStateFamily(cache.getStateFamily()) - .setTag(stateKey); - try { - if (cleared) { - // Default range. - updatesBuilder.addDeletesBuilder().build(); - cleared = false; - } - - if (!pendingAdds.isEmpty()) { - // TODO(reuvenlax): Once we start caching data, we should remove this line. We have it - // here now - // because once we persist - // added data we forget about it from the cache, so the object is no longer complete. 
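-        // (With complete now false, a later read() goes back to Windmill via getFuture(), which
-        // returns the just-persisted elements.)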
- complete = false; - - TagSortedListInsertRequest.Builder insertBuilder = updatesBuilder.addInsertsBuilder(); - idTracker.add( - pendingAdds, - (elem, id) -> { - try { - ByteStringOutputStream elementStream = new ByteStringOutputStream(); - elemCoder.encode(elem.getValue(), elementStream, Context.OUTER); - insertBuilder.addEntries( - SortedListEntry.newBuilder() - .setValue(elementStream.toByteString()) - .setSortKey( - WindmillTimeUtils.harnessToWindmillTimestamp(elem.getTimestamp())) - .setId(id)); - } catch (IOException e) { - throw new RuntimeException(e); - } - }); - pendingAdds.clear(); - insertBuilder.build(); - } - - if (!pendingDeletes.isEmpty()) { - for (Range range : pendingDeletes.asRanges()) { - TagSortedListDeleteRequest.Builder deletesBuilder = updatesBuilder.addDeletesBuilder(); - deletesBuilder.setRange( - SortedListRange.newBuilder() - .setStart(WindmillTimeUtils.harnessToWindmillTimestamp(range.lowerEndpoint())) - .setLimit(WindmillTimeUtils.harnessToWindmillTimestamp(range.upperEndpoint()))); - deletesBuilder.build(); - idTracker.remove(range); - } - pendingDeletes.clear(); - } - } catch (ExecutionException | InterruptedException e) { - throw new RuntimeException(e); - } - return commitBuilder.buildPartial(); - } - - private Future>> getFuture( - @Nullable Instant minTimestamp, @Nullable Instant limitTimestamp) { - long startSortKey = - minTimestamp != null - ? WindmillTimeUtils.harnessToWindmillTimestamp(minTimestamp) - : MIN_TS_MICROS; - long limitSortKey = - limitTimestamp != null - ? WindmillTimeUtils.harnessToWindmillTimestamp(limitTimestamp) - : MAX_TS_MICROS; - - if (complete) { - // Right now we don't cache any data, so complete means an empty list. - // TODO(reuvenlax): change this once we start caching data. - return Futures.immediateFuture(Collections.emptyList()); - } - return reader.orderedListFuture( - Range.closedOpen(startSortKey, limitSortKey), stateKey, stateFamily, elemCoder); - } - } - - static class WindmillSet extends SimpleWindmillState implements SetState { - WindmillMap windmillMap; - - WindmillSet( - StateNamespace namespace, - StateTag> address, - String stateFamily, - Coder keyCoder, - WindmillStateCache.ForKeyAndFamily cache, - boolean isNewKey) { - StateTag> internalMapAddress = - StateTags.convertToMapTagInternal(address); - WindmillMap cachedMap = - (WindmillMap) cache.get(namespace, internalMapAddress); - this.windmillMap = - (cachedMap != null) - ? cachedMap - : new WindmillMap<>( - namespace, - internalMapAddress, - stateFamily, - keyCoder, - BooleanCoder.of(), - isNewKey); - } - - @Override - protected WorkItemCommitRequest persistDirectly(WindmillStateCache.ForKeyAndFamily cache) - throws IOException { - return windmillMap.persistDirectly(cache); - } - - @Override - public @UnknownKeyFor @NonNull @Initialized ReadableState< - @UnknownKeyFor @NonNull @Initialized Boolean> - contains(K k) { - return windmillMap.getOrDefault(k, false); - } - - @Override - public @UnknownKeyFor @NonNull @Initialized ReadableState< - @UnknownKeyFor @NonNull @Initialized Boolean> - addIfAbsent(K k) { - return new ReadableState() { - ReadableState putState = windmillMap.putIfAbsent(k, true); - - @Override - public @Nullable Boolean read() { - Boolean result = putState.read(); - return (result != null) ? 
result : false; - } - - @Override - public @UnknownKeyFor @NonNull @Initialized ReadableState readLater() { - putState = putState.readLater(); - return this; - } - }; - } - - @Override - public void remove(K k) { - windmillMap.remove(k); - } - - @Override - public void add(K value) { - windmillMap.put(value, true); - } - - @Override - public @UnknownKeyFor @NonNull @Initialized ReadableState< - @UnknownKeyFor @NonNull @Initialized Boolean> - isEmpty() { - return windmillMap.isEmpty(); - } - - @Override - public @Nullable Iterable read() { - return windmillMap.keys().read(); - } - - @Override - public @UnknownKeyFor @NonNull @Initialized SetState readLater() { - windmillMap.keys().readLater(); - return this; - } - - @Override - public void clear() { - windmillMap.clear(); - } - - @Override - void initializeForWorkItem( - WindmillStateReader reader, Supplier scopedReadStateSupplier) { - windmillMap.initializeForWorkItem(reader, scopedReadStateSupplier); - } - - @Override - void cleanupAfterWorkItem() { - windmillMap.cleanupAfterWorkItem(); - } - } - - static class WindmillMap extends SimpleWindmillState implements MapState { - private final StateNamespace namespace; - private final StateTag> address; - private final ByteString stateKeyPrefix; - private final String stateFamily; - private final Coder keyCoder; - private final Coder valueCoder; - private boolean complete; - - // TODO(reuvenlax): Should we evict items from the cache? We would have to make sure - // that anything in the cache that is not committed is not evicted. negativeCache could be - // evicted whenever we want. - private Map cachedValues = Maps.newHashMap(); - private Set negativeCache = Sets.newHashSet(); - private boolean cleared = false; - - private Set localAdditions = Sets.newHashSet(); - private Set localRemovals = Sets.newHashSet(); - - WindmillMap( - StateNamespace namespace, - StateTag> address, - String stateFamily, - Coder keyCoder, - Coder valueCoder, - boolean isNewKey) { - this.namespace = namespace; - this.address = address; - this.stateKeyPrefix = encodeKey(namespace, address); - this.stateFamily = stateFamily; - this.keyCoder = keyCoder; - this.valueCoder = valueCoder; - this.complete = isNewKey; - } - - private K userKeyFromProtoKey(ByteString tag) throws IOException { - Preconditions.checkState(tag.startsWith(stateKeyPrefix)); - ByteString keyBytes = tag.substring(stateKeyPrefix.size()); - return keyCoder.decode(keyBytes.newInput(), Context.OUTER); - } - - private ByteString protoKeyFromUserKey(K key) throws IOException { - ByteStringOutputStream keyStream = new ByteStringOutputStream(); - stateKeyPrefix.writeTo(keyStream); - keyCoder.encode(key, keyStream, Context.OUTER); - return keyStream.toByteString(); - } - - @Override - protected WorkItemCommitRequest persistDirectly(WindmillStateCache.ForKeyAndFamily cache) - throws IOException { - if (!cleared && localAdditions.isEmpty() && localRemovals.isEmpty()) { - // No changes, so return directly. 
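-        // (buildPartial() avoids required-field checks; the caller merges these partial requests
-        // into the work item's full commit, so an empty partial is harmless.)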
-        return WorkItemCommitRequest.newBuilder().buildPartial();
-      }
-
-      WorkItemCommitRequest.Builder commitBuilder = WorkItemCommitRequest.newBuilder();
-
-      if (cleared) {
-        commitBuilder
-            .addTagValuePrefixDeletesBuilder()
-            .setStateFamily(stateFamily)
-            .setTagPrefix(stateKeyPrefix);
-      }
-      cleared = false;
-
-      for (K key : localAdditions) {
-        ByteString keyBytes = protoKeyFromUserKey(key);
-        ByteStringOutputStream valueStream = new ByteStringOutputStream();
-        valueCoder.encode(cachedValues.get(key), valueStream, Context.OUTER);
-        ByteString valueBytes = valueStream.toByteString();
-
-        commitBuilder
-            .addValueUpdatesBuilder()
-            .setTag(keyBytes)
-            .setStateFamily(stateFamily)
-            .getValueBuilder()
-            .setData(valueBytes)
-            .setTimestamp(Long.MAX_VALUE);
-      }
-      localAdditions.clear();
-
-      for (K key : localRemovals) {
-        ByteStringOutputStream keyStream = new ByteStringOutputStream();
-        stateKeyPrefix.writeTo(keyStream);
-        keyCoder.encode(key, keyStream, Context.OUTER);
-        ByteString keyBytes = keyStream.toByteString();
-        // Leaving data blank means that we delete the tag.
-        commitBuilder
-            .addValueUpdatesBuilder()
-            .setTag(keyBytes)
-            .setStateFamily(stateFamily)
-            .getValueBuilder()
-            .setTimestamp(Long.MAX_VALUE);
-
-        V cachedValue = cachedValues.remove(key);
-        if (cachedValue != null) {
-          ByteStringOutputStream valueStream = new ByteStringOutputStream();
-          valueCoder.encode(cachedValues.get(key), valueStream, Context.OUTER);
-        }
-      }
-      negativeCache.addAll(localRemovals);
-      localRemovals.clear();
-
-      // TODO(reuvenlax): We should store this in the cache parameter, as that would enable
-      // caching the map between work items, reducing fetches to Windmill. To do so, we need to
-      // keep track of the encoded size of the map, and to do so efficiently (i.e. without
-      // iterating over the entire map on every persist) we need to track the sizes of each map
-      // entry.
-      cache.put(namespace, address, this, 1);
-      return commitBuilder.buildPartial();
-    }
-
-    @Override
-    public @UnknownKeyFor @NonNull @Initialized ReadableState<V> get(K key) {
-      return getOrDefault(key, null);
-    }
-
-    @Override
-    public @UnknownKeyFor @NonNull @Initialized ReadableState<V> getOrDefault(
-        K key, @Nullable V defaultValue) {
-      return new ReadableState<V>() {
-        @Override
-        public @Nullable V read() {
-          Future<V> persistedData = getFutureForKey(key);
-          try (Closeable scope = scopedReadState()) {
-            if (localRemovals.contains(key) || negativeCache.contains(key)) {
-              return null;
-            }
-            @Nullable V cachedValue = cachedValues.get(key);
-            if (cachedValue != null || complete) {
-              return cachedValue;
-            }
-
-            V persistedValue = persistedData.get();
-            if (persistedValue == null) {
-              negativeCache.add(key);
-              return defaultValue;
-            }
-            // TODO: Don't do this if it was already in cache.
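-            // Read-through caching: once stored here, later get() calls for this key are served
-            // from cachedValues without another Windmill fetch.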
-            cachedValues.put(key, persistedValue);
-            return persistedValue;
-          } catch (InterruptedException | ExecutionException | IOException e) {
-            if (e instanceof InterruptedException) {
-              Thread.currentThread().interrupt();
-            }
-            throw new RuntimeException("Unable to read state", e);
-          }
-        }
-
-        @Override
-        @SuppressWarnings("FutureReturnValueIgnored")
-        public @UnknownKeyFor @NonNull @Initialized ReadableState<V> readLater() {
-          WindmillMap.this.getFutureForKey(key);
-          return this;
-        }
-      };
-    }
-
-    @Override
-    public @UnknownKeyFor @NonNull @Initialized ReadableState<
-            @UnknownKeyFor @NonNull @Initialized Iterable<K>>
-        keys() {
-      ReadableState<Iterable<Map.Entry<K, V>>> entries = entries();
-      return new ReadableState<Iterable<K>>() {
-        @Override
-        public @Nullable Iterable<K> read() {
-          return Iterables.transform(entries.read(), e -> e.getKey());
-        }
-
-        @Override
-        public @UnknownKeyFor @NonNull @Initialized ReadableState<Iterable<K>> readLater() {
-          entries.readLater();
-          return this;
-        }
-      };
-    }
-
-    @Override
-    public @UnknownKeyFor @NonNull @Initialized ReadableState<
-            @UnknownKeyFor @NonNull @Initialized Iterable<V>>
-        values() {
-      ReadableState<Iterable<Map.Entry<K, V>>> entries = entries();
-      return new ReadableState<Iterable<V>>() {
-        @Override
-        public @Nullable Iterable<V> read() {
-          return Iterables.transform(entries.read(), e -> e.getValue());
-        }
-
-        @Override
-        public @UnknownKeyFor @NonNull @Initialized ReadableState<Iterable<V>> readLater() {
-          entries.readLater();
-          return this;
-        }
-      };
-    }
-
-    @Override
-    public @UnknownKeyFor @NonNull @Initialized ReadableState<
-            @UnknownKeyFor @NonNull @Initialized Iterable<
-                @UnknownKeyFor @NonNull @Initialized Entry<K, V>>>
-        entries() {
-      return new ReadableState<Iterable<Map.Entry<K, V>>>() {
-        @Override
-        public Iterable<Map.Entry<K, V>> read() {
-          if (complete) {
-            return Iterables.unmodifiableIterable(cachedValues.entrySet());
-          }
-          Future<Iterable<Map.Entry<ByteString, V>>> persistedData = getFuture();
-          try (Closeable scope = scopedReadState()) {
-            Iterable<Map.Entry<ByteString, V>> data = persistedData.get();
-            Iterable<Map.Entry<K, V>> transformedData =
-                Iterables.<Map.Entry<ByteString, V>, Map.Entry<K, V>>transform(
-                    data,
-                    entry -> {
-                      try {
-                        return new AbstractMap.SimpleEntry<>(
-                            userKeyFromProtoKey(entry.getKey()), entry.getValue());
-                      } catch (IOException e) {
-                        throw new RuntimeException(e);
-                      }
-                    });
-
-            if (data instanceof Weighted) {
-              // This is a known amount of data. Cache it all.
-              transformedData.forEach(
-                  e -> {
-                    // The cached data overrides what is read from state, so call putIfAbsent.
-                    cachedValues.putIfAbsent(e.getKey(), e.getValue());
-                  });
-              complete = true;
-              return Iterables.unmodifiableIterable(cachedValues.entrySet());
-            } else {
-              // This means that the result might be too large to cache, so don't add it to the
-              // local cache. Instead merge the iterables, giving priority to any local additions
-              // (represented in cachedValues and localRemovals) that may not have been committed
-              // yet.
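-              // Illustrative example (hypothetical values): cachedValues = {k1: v1'},
-              // localRemovals = {k2}, Windmill = [(k1, v1), (k2, v2), (k3, v3)]. The
-              // concatenation below yields k1 -> v1' (from the cache) and k3 -> v3; Windmill's
-              // k1 and k2 are filtered out.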
- return Iterables.unmodifiableIterable( - Iterables.concat( - cachedValues.entrySet(), - Iterables.filter( - transformedData, - e -> - !cachedValues.containsKey(e.getKey()) - && !localRemovals.contains(e.getKey())))); - } - - } catch (InterruptedException | ExecutionException | IOException e) { - if (e instanceof InterruptedException) { - Thread.currentThread().interrupt(); - } - throw new RuntimeException("Unable to read state", e); - } - } - - @Override - @SuppressWarnings("FutureReturnValueIgnored") - public @UnknownKeyFor @NonNull @Initialized ReadableState>> - readLater() { - WindmillMap.this.getFuture(); - return this; - } - }; - } - - @Override - public ReadableState isEmpty() { - return new ReadableState() { - // TODO(reuvenlax): Can we find a more efficient way of implementing isEmpty than reading - // the entire map? - ReadableState> keys = WindmillMap.this.keys(); - - @Override - public @Nullable Boolean read() { - return Iterables.isEmpty(keys.read()); - } - - @Override - public @UnknownKeyFor @NonNull @Initialized ReadableState readLater() { - keys.readLater(); - return this; - } - }; - } - - @Override - public void put(K key, V value) { - V oldValue = cachedValues.put(key, value); - if (valueCoder.consistentWithEquals() && value.equals(oldValue)) { - return; - } - localAdditions.add(key); - localRemovals.remove(key); - negativeCache.remove(key); - } - - @Override - public @UnknownKeyFor @NonNull @Initialized ReadableState computeIfAbsent( - K key, Function mappingFunction) { - Future persistedData = getFutureForKey(key); - try (Closeable scope = scopedReadState()) { - if (localRemovals.contains(key) || negativeCache.contains(key)) { - return ReadableStates.immediate(null); - } - @Nullable V cachedValue = cachedValues.get(key); - if (cachedValue != null || complete) { - return ReadableStates.immediate(cachedValue); - } - - V persistedValue = persistedData.get(); - if (persistedValue == null) { - // This is a new value. Add it to the map and return null. - put(key, mappingFunction.apply(key)); - return ReadableStates.immediate(null); - } - // TODO: Don't do this if it was already in cache. - cachedValues.put(key, persistedValue); - return ReadableStates.immediate(persistedValue); - } catch (InterruptedException | ExecutionException | IOException e) { - if (e instanceof InterruptedException) { - Thread.currentThread().interrupt(); - } - throw new RuntimeException("Unable to read state", e); - } - } - - @Override - public void remove(K key) { - if (localRemovals.add(key)) { - cachedValues.remove(key); - localAdditions.remove(key); - } - } - - @Override - public void clear() { - cachedValues.clear(); - localAdditions.clear(); - localRemovals.clear(); - negativeCache.clear(); - cleared = true; - complete = true; - } - - private Future getFutureForKey(K key) { - try { - ByteStringOutputStream keyStream = new ByteStringOutputStream(); - stateKeyPrefix.writeTo(keyStream); - keyCoder.encode(key, keyStream, Context.OUTER); - return reader.valueFuture(keyStream.toByteString(), stateFamily, valueCoder); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - private Future>> getFuture() { - if (complete) { - // The caller will merge in local cached values. 
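-        // (complete == true means cachedValues already holds every entry, so there is nothing
-        // left to fetch from Windmill.)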
-        return Futures.immediateFuture(Collections.emptyList());
-      } else {
-        return reader.valuePrefixFuture(stateKeyPrefix, stateFamily, valueCoder);
-      }
-    }
-  }
-
-  private static class WindmillMultimap<K, V> extends SimpleWindmillState
-      implements MultimapState<K, V> {
-
-    private final StateNamespace namespace;
-    private final StateTag<MultimapState<K, V>> address;
-    private final ByteString stateKey;
-    private final String stateFamily;
-    private final Coder<K> keyCoder;
-    private final Coder<V> valueCoder;
-
-    private enum KeyExistence {
-      // this key is known to exist, it has at least 1 value in either localAdditions or windmill
-      KNOWN_EXIST,
-      // this key is known to be nonexistent, it has 0 values in both localAdditions and windmill
-      KNOWN_NONEXISTENT,
-      // we don't know if this key is in this multimap; it has exactly 0 values in localAdditions,
-      // but may have no values or any number of values in windmill. This entry exists just to
-      // provide a mapping between the original key and the structural key.
-      UNKNOWN_EXISTENCE
-    }
-
-    private class KeyState {
-      final K originalKey;
-      KeyExistence existence;
-      // valuesCached can be true only if existence == KNOWN_EXIST and all values of this key are
-      // cached (both values and localAdditions).
-      boolean valuesCached;
-      // Represents the values in windmill. When new values are added during user processing, they
-      // are added to localAdditions but not values. Those new values will be added to values only
-      // after they are persisted into windmill and removed from localAdditions.
-      ConcatIterables<V> values;
-      int valuesSize;
-
-      // When new values are added during user processing, they are added to localAdditions, so
-      // that we can later try to persist them in windmill. When a key is removed during user
-      // processing, we mark removedLocally to be true so that we can later try to delete it from
-      // windmill. If localAdditions is not empty and removedLocally is true, values in
-      // localAdditions will be added to windmill after old values in windmill are removed.
-      List<V> localAdditions;
-      boolean removedLocally;
-
-      KeyState(K originalKey) {
-        this.originalKey = originalKey;
-        existence = KeyExistence.UNKNOWN_EXISTENCE;
-        valuesCached = complete;
-        values = new ConcatIterables<>();
-        valuesSize = 0;
-        localAdditions = Lists.newArrayList();
-        removedLocally = false;
-      }
-    }
-
-    // Set to true when user clears the entire multimap, so that we can later send a delete
-    // request to the windmill backend.
-    private boolean cleared = false;
-    // We use the structural value of the keys as the key in keyStateMap, so that different Java
-    // objects with the same content will be treated as the same Multimap key.
-    private Map<Object, KeyState> keyStateMap = Maps.newHashMap();
-    // If true, all keys are cached in keyStateMap with existence == KNOWN_EXIST.
-    private boolean allKeysKnown = false;
-
-    // True if all contents of this multimap are cached in this object.
-    private boolean complete = false;
-    // hasLocalAdditions and hasLocalRemovals track whether there are local changes that need to
-    // be propagated to windmill.
- private boolean hasLocalAdditions = false; - private boolean hasLocalRemovals = false; - - private WindmillMultimap( - StateNamespace namespace, - StateTag> address, - String stateFamily, - Coder keyCoder, - Coder valueCoder, - boolean isNewShardingKey) { - this.namespace = namespace; - this.address = address; - this.stateKey = encodeKey(namespace, address); - this.stateFamily = stateFamily; - this.keyCoder = keyCoder; - this.valueCoder = valueCoder; - this.complete = isNewShardingKey; - this.allKeysKnown = isNewShardingKey; - } - - @Override - public void put(K key, V value) { - final Object structuralKey = keyCoder.structuralValue(key); - hasLocalAdditions = true; - keyStateMap.compute( - structuralKey, - (k, v) -> { - if (v == null) v = new KeyState(key); - v.existence = KeyExistence.KNOWN_EXIST; - v.localAdditions.add(value); - return v; - }); - } - - // Initiates a backend state read to fetch all entries if necessary. - private Future>>> necessaryEntriesFromStorageFuture( - boolean omitValues) { - if (complete) { - // Since we're complete, even if there are entries in storage we don't need to read them. - return Futures.immediateFuture(Collections.emptyList()); - } else { - return reader.multimapFetchAllFuture(omitValues, stateKey, stateFamily, valueCoder); - } - } - - // Initiates a backend state read to fetch a single entry if necessary. - private Future> necessaryKeyEntriesFromStorageFuture(K key) { - try { - ByteStringOutputStream keyStream = new ByteStringOutputStream(); - keyCoder.encode(key, keyStream, Context.OUTER); - return reader.multimapFetchSingleEntryFuture( - keyStream.toByteString(), stateKey, stateFamily, valueCoder); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - @Override - public ReadableState> get(K key) { - return new ReadableState>() { - final Object structuralKey = keyCoder.structuralValue(key); - - @Override - public Iterable read() { - KeyState keyState = null; - if (allKeysKnown) { - keyState = keyStateMap.get(structuralKey); - if (keyState == null || keyState.existence == KeyExistence.UNKNOWN_EXISTENCE) { - if (keyState != null) keyStateMap.remove(structuralKey); - return Collections.emptyList(); - } - } else { - keyState = keyStateMap.computeIfAbsent(structuralKey, k -> new KeyState(key)); - } - if (keyState.existence == KeyExistence.KNOWN_NONEXISTENT) { - return Collections.emptyList(); - } - Iterable localNewValues = - Iterables.limit(keyState.localAdditions, keyState.localAdditions.size()); - if (keyState.removedLocally) { - // this key has been removed locally but the removal hasn't been sent to windmill, - // thus values in windmill(if any) are obsolete, and we only care about local values. - return Iterables.unmodifiableIterable(localNewValues); - } - if (keyState.valuesCached || complete) { - return Iterables.unmodifiableIterable( - Iterables.concat( - Iterables.limit(keyState.values, keyState.valuesSize), localNewValues)); - } - Future> persistedData = necessaryKeyEntriesFromStorageFuture(key); - try (Closeable scope = scopedReadState()) { - final Iterable persistedValues = persistedData.get(); - // Iterables.isEmpty() is O(1). - if (Iterables.isEmpty(persistedValues)) { - if (keyState.localAdditions.isEmpty()) { - // empty in both cache and windmill, mark key as KNOWN_NONEXISTENT. 
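-                // Later get()/containsKey() calls for this key can then answer from the cache
-                // without another Windmill fetch.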
- keyState.existence = KeyExistence.KNOWN_NONEXISTENT; - return Collections.emptyList(); - } - return Iterables.unmodifiableIterable(localNewValues); - } - keyState.existence = KeyExistence.KNOWN_EXIST; - if (persistedValues instanceof Weighted) { - keyState.valuesCached = true; - ConcatIterables it = new ConcatIterables<>(); - it.extendWith(persistedValues); - keyState.values = it; - keyState.valuesSize = Iterables.size(persistedValues); - } - return Iterables.unmodifiableIterable( - Iterables.concat(persistedValues, localNewValues)); - } catch (InterruptedException | ExecutionException | IOException e) { - if (e instanceof InterruptedException) { - Thread.currentThread().interrupt(); - } - throw new RuntimeException("Unable to read Multimap state", e); - } - } - - @Override - @SuppressWarnings("FutureReturnValueIgnored") - public ReadableState> readLater() { - WindmillMultimap.this.necessaryKeyEntriesFromStorageFuture(key); - return this; - } - }; - } - - @Override - protected WorkItemCommitRequest persistDirectly(WindmillStateCache.ForKeyAndFamily cache) - throws IOException { - if (!cleared && !hasLocalAdditions && !hasLocalRemovals) { - cache.put(namespace, address, this, 1); - return WorkItemCommitRequest.newBuilder().buildPartial(); - } - WorkItemCommitRequest.Builder commitBuilder = WorkItemCommitRequest.newBuilder(); - Windmill.TagMultimapUpdateRequest.Builder builder = commitBuilder.addMultimapUpdatesBuilder(); - builder.setTag(stateKey).setStateFamily(stateFamily); - - if (cleared) { - builder.setDeleteAll(true); - } - if (hasLocalRemovals || hasLocalAdditions) { - ByteStringOutputStream keyStream = new ByteStringOutputStream(); - ByteStringOutputStream valueStream = new ByteStringOutputStream(); - Iterator> iterator = keyStateMap.entrySet().iterator(); - while (iterator.hasNext()) { - KeyState keyState = iterator.next().getValue(); - if (!keyState.removedLocally && keyState.localAdditions.isEmpty()) { - if (keyState.existence == KeyExistence.KNOWN_NONEXISTENT) iterator.remove(); - continue; - } - keyCoder.encode(keyState.originalKey, keyStream, Context.OUTER); - ByteString encodedKey = keyStream.toByteStringAndReset(); - Windmill.TagMultimapEntry.Builder entryBuilder = builder.addUpdatesBuilder(); - entryBuilder.setEntryName(encodedKey); - if (keyState.removedLocally) entryBuilder.setDeleteAll(true); - keyState.removedLocally = false; - if (!keyState.localAdditions.isEmpty()) { - for (V value : keyState.localAdditions) { - valueCoder.encode(value, valueStream, Context.OUTER); - ByteString encodedValue = valueStream.toByteStringAndReset(); - entryBuilder.addValues(encodedValue); - } - // Move newly added values from localAdditions to keyState.values as those new values - // now - // are also persisted in Windmill. If a key now has no more values and is not - // KNOWN_EXIST, - // remove it from cache. - if (keyState.valuesCached) { - keyState.values.extendWith(keyState.localAdditions); - keyState.valuesSize += keyState.localAdditions.size(); - } - // Create a new localAdditions so that the cached values are unaffected. 
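-            // (keyState.values may still reference the old list via extendWith() above, so the
-            // old list must not be mutated or reused.)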
- keyState.localAdditions = Lists.newArrayList(); - } - if (!keyState.valuesCached && keyState.existence != KeyExistence.KNOWN_EXIST) { - iterator.remove(); - } - } - } - - hasLocalAdditions = false; - hasLocalRemovals = false; - cleared = false; - - cache.put(namespace, address, this, 1); - return commitBuilder.buildPartial(); - } - - @Override - public void remove(K key) { - final Object structuralKey = keyCoder.structuralValue(key); - // does not insert key if allKeysKnown. - KeyState keyState = - keyStateMap.computeIfAbsent(structuralKey, k -> allKeysKnown ? null : new KeyState(key)); - if (keyState == null || keyState.existence == KeyExistence.KNOWN_NONEXISTENT) { - return; - } - if (keyState.valuesCached && keyState.valuesSize == 0) { - // no data in windmill, deleting from local cache is sufficient. - keyStateMap.remove(structuralKey); - } else { - // there may be data in windmill that need to be removed. - hasLocalRemovals = true; - keyState.removedLocally = true; - keyState.values = new ConcatIterables<>(); - keyState.valuesSize = 0; - keyState.existence = KeyExistence.KNOWN_NONEXISTENT; - } - if (!keyState.localAdditions.isEmpty()) { - keyState.localAdditions = Lists.newArrayList(); - } - keyState.valuesCached = true; - } - - @Override - public void clear() { - keyStateMap = Maps.newHashMap(); - cleared = true; - complete = true; - allKeysKnown = true; - hasLocalAdditions = false; - hasLocalRemovals = false; - } - - @Override - public ReadableState> keys() { - return new ReadableState>() { - - private Map cachedExistKeys() { - return keyStateMap.entrySet().stream() - .filter(entry -> entry.getValue().existence == KeyExistence.KNOWN_EXIST) - .collect(Collectors.toMap(Entry::getKey, e -> e.getValue().originalKey)); - } - - @Override - public Iterable read() { - if (allKeysKnown) { - return Iterables.unmodifiableIterable(cachedExistKeys().values()); - } - Future>>> persistedData = - necessaryEntriesFromStorageFuture(true); - try (Closeable scope = scopedReadState()) { - Iterable>> entries = persistedData.get(); - if (entries instanceof Weighted) { - // This is a known amount of data, cache them all. - entries.forEach( - entry -> { - try { - K originalKey = keyCoder.decode(entry.getKey().newInput(), Context.OUTER); - KeyState keyState = - keyStateMap.computeIfAbsent( - keyCoder.structuralValue(originalKey), - stk -> new KeyState(originalKey)); - if (keyState.existence == KeyExistence.UNKNOWN_EXISTENCE) { - keyState.existence = KeyExistence.KNOWN_EXIST; - } - } catch (IOException e) { - throw new RuntimeException(e); - } - }); - allKeysKnown = true; - keyStateMap - .values() - .removeIf( - keyState -> - keyState.existence != KeyExistence.KNOWN_EXIST - && !keyState.removedLocally); - return Iterables.unmodifiableIterable(cachedExistKeys().values()); - } else { - Map cachedExistKeys = Maps.newHashMap(); - Set cachedNonExistKeys = Sets.newHashSet(); - keyStateMap.forEach( - (structuralKey, keyState) -> { - switch (keyState.existence) { - case KNOWN_EXIST: - cachedExistKeys.put(structuralKey, keyState.originalKey); - break; - case KNOWN_NONEXISTENT: - cachedNonExistKeys.add(structuralKey); - break; - default: - break; - } - }); - // keysOnlyInWindmill is lazily loaded. 
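-            // Illustrative example (hypothetical values): cachedExistKeys = {k1},
-            // cachedNonExistKeys = {k2}, Windmill pages = [k1, k2, k3]. keysOnlyInWindmill
-            // evaluates to [k3], so keys() yields [k1, k3] without duplicating k1.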
- Iterable keysOnlyInWindmill = - Iterables.filter( - Iterables.transform( - entries, - entry -> { - try { - K originalKey = - keyCoder.decode(entry.getKey().newInput(), Context.OUTER); - Object structuralKey = keyCoder.structuralValue(originalKey); - if (cachedExistKeys.containsKey(structuralKey) - || cachedNonExistKeys.contains(structuralKey)) return null; - return originalKey; - } catch (IOException e) { - throw new RuntimeException(e); - } - }), - Objects::nonNull); - return Iterables.unmodifiableIterable( - Iterables.concat(cachedExistKeys.values(), keysOnlyInWindmill)); - } - } catch (InterruptedException | ExecutionException | IOException e) { - if (e instanceof InterruptedException) { - Thread.currentThread().interrupt(); - } - throw new RuntimeException("Unable to read state", e); - } - } - - @Override - @SuppressWarnings("FutureReturnValueIgnored") - public ReadableState> readLater() { - WindmillMultimap.this.necessaryEntriesFromStorageFuture(true); - return this; - } - }; - } - - @Override - public ReadableState>> entries() { - return new ReadableState>>() { - @Override - public Iterable> read() { - if (complete) { - return Iterables.unmodifiableIterable( - unnestCachedEntries(mergedCachedEntries(null).entrySet())); - } - Future>>> persistedData = - necessaryEntriesFromStorageFuture(false); - try (Closeable scope = scopedReadState()) { - Iterable>> entries = persistedData.get(); - if (Iterables.isEmpty(entries)) { - complete = true; - allKeysKnown = true; - return Iterables.unmodifiableIterable( - unnestCachedEntries(mergedCachedEntries(null).entrySet())); - } - if (!(entries instanceof Weighted)) { - return nonWeightedEntries(entries); - } - // This is a known amount of data, cache them all. - entries.forEach( - entry -> { - try { - final K originalKey = keyCoder.decode(entry.getKey().newInput(), Context.OUTER); - final Object structuralKey = keyCoder.structuralValue(originalKey); - KeyState keyState = - keyStateMap.computeIfAbsent(structuralKey, k -> new KeyState(originalKey)); - // Ignore any key from windmill that has been marked pending deletion or is - // fully cached. - if (keyState.existence == KeyExistence.KNOWN_NONEXISTENT - || (keyState.existence == KeyExistence.KNOWN_EXIST - && keyState.valuesCached)) return; - // Or else cache contents from windmill. - keyState.existence = KeyExistence.KNOWN_EXIST; - keyState.values.extendWith(entry.getValue()); - keyState.valuesSize += Iterables.size(entry.getValue()); - keyState.valuesCached = true; - } catch (IOException e) { - throw new RuntimeException(e); - } - }); - allKeysKnown = true; - complete = true; - return Iterables.unmodifiableIterable( - unnestCachedEntries(mergedCachedEntries(null).entrySet())); - } catch (InterruptedException | ExecutionException | IOException e) { - if (e instanceof InterruptedException) { - Thread.currentThread().interrupt(); - } - throw new RuntimeException("Unable to read state", e); - } - } - - @Override - @SuppressWarnings("FutureReturnValueIgnored") - public ReadableState>> readLater() { - WindmillMultimap.this.necessaryEntriesFromStorageFuture(false); - return this; - } - - // Collect all cached entries into a map and all KNOWN_NONEXISTENT keys to - // knownNonexistentKeys(if not null). 
Note that this method is not side-effect-free: it - // unloads any key that is not KNOWN_EXIST and not pending deletion from cache; also if - // complete it marks the valuesCached of any key that is KNOWN_EXIST to true, entries() - // depends on this behavior when the fetched result is weighted to iterate the whole - // keyStateMap one less time. For each cached key, returns its structural key and a tuple of - // . - private Map>> mergedCachedEntries( - Set knownNonexistentKeys) { - Map>> cachedEntries = Maps.newHashMap(); - keyStateMap - .entrySet() - .removeIf( - (entry -> { - Object structuralKey = entry.getKey(); - KeyState keyState = entry.getValue(); - if (complete && keyState.existence == KeyExistence.KNOWN_EXIST) { - keyState.valuesCached = true; - } - ConcatIterables it = null; - if (!keyState.localAdditions.isEmpty()) { - it = new ConcatIterables<>(); - it.extendWith( - Iterables.limit(keyState.localAdditions, keyState.localAdditions.size())); - } - if (keyState.valuesCached) { - if (it == null) it = new ConcatIterables<>(); - it.extendWith(Iterables.limit(keyState.values, keyState.valuesSize)); - } - if (it != null) { - cachedEntries.put( - structuralKey, - Triple.of(keyState.originalKey, keyState.valuesCached, it)); - } - if (knownNonexistentKeys != null - && keyState.existence == KeyExistence.KNOWN_NONEXISTENT) - knownNonexistentKeys.add(structuralKey); - return (keyState.existence == KeyExistence.KNOWN_NONEXISTENT - && !keyState.removedLocally) - || keyState.existence == KeyExistence.UNKNOWN_EXISTENCE; - })); - return cachedEntries; - } - - private Iterable> unnestCachedEntries( - Iterable>>> cachedEntries) { - return Iterables.concat( - Iterables.transform( - cachedEntries, - entry -> - Iterables.transform( - entry.getValue().getRight(), - v -> new AbstractMap.SimpleEntry<>(entry.getValue().getLeft(), v)))); - } - - private Iterable> nonWeightedEntries( - Iterable>> lazyWindmillEntries) { - class ResultIterable implements Iterable> { - private final Iterable>> lazyWindmillEntries; - private final Map>> cachedEntries; - private final Set knownNonexistentKeys; - - ResultIterable( - Map>> cachedEntries, - Iterable>> lazyWindmillEntries, - Set knownNonexistentKeys) { - this.cachedEntries = cachedEntries; - this.lazyWindmillEntries = lazyWindmillEntries; - this.knownNonexistentKeys = knownNonexistentKeys; - } - - @Override - public Iterator> iterator() { - // Each time when the Iterable returned by entries() is iterated, a new Iterator is - // created. Every iterator must keep its own copy of seenCachedKeys so that if a key - // is paginated into multiple iterables from windmill, the cached values of this key - // will only be returned once. - Set seenCachedKeys = Sets.newHashSet(); - // notFullyCachedEntries returns all entries from windmill that are not fully cached - // and combines them with localAdditions. If a key is fully cached, contents of this - // key from windmill are ignored. - Iterable>> notFullyCachedEntries = - Iterables.filter( - Iterables.transform( - lazyWindmillEntries, - entry -> { - try { - final K key = - keyCoder.decode(entry.getKey().newInput(), Context.OUTER); - final Object structuralKey = keyCoder.structuralValue(key); - // key is deleted in cache thus fully cached. - if (knownNonexistentKeys.contains(structuralKey)) return null; - Triple> triple = - cachedEntries.get(structuralKey); - // no record of key in cache, return content in windmill. 
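-                          // (triple == null: neither cached values nor local additions exist for
-                          // this key, so the Windmill page is passed through unchanged.)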
- if (triple == null) { - return Triple.of(structuralKey, key, entry.getValue()); - } - // key is fully cached in cache. - if (triple.getMiddle()) return null; - - // key is not fully cached, combine the content in windmill with local - // additions with only the first observed page for the key to ensure - // it is not repeated. - if (!seenCachedKeys.add(structuralKey)) { - return Triple.of(structuralKey, key, entry.getValue()); - } else { - ConcatIterables it = new ConcatIterables<>(); - it.extendWith(triple.getRight()); - it.extendWith(entry.getValue()); - return Triple.of(structuralKey, key, it); - } - } catch (IOException e) { - throw new RuntimeException(e); - } - }), - Objects::nonNull); - Iterator> unnestWindmill = - Iterators.concat( - Iterables.transform( - notFullyCachedEntries, - entry -> - Iterables.transform( - entry.getRight(), - v -> new AbstractMap.SimpleEntry<>(entry.getMiddle(), v)) - .iterator()) - .iterator()); - Iterator> fullyCached = - unnestCachedEntries( - Iterables.filter( - cachedEntries.entrySet(), - entry -> !seenCachedKeys.contains(entry.getKey()))) - .iterator(); - return Iterators.concat(unnestWindmill, fullyCached); - } - } - - Set knownNonexistentKeys = Sets.newHashSet(); - Map>> cachedEntries = - mergedCachedEntries(knownNonexistentKeys); - return Iterables.unmodifiableIterable( - new ResultIterable(cachedEntries, lazyWindmillEntries, knownNonexistentKeys)); - } - }; - } - - @Override - public ReadableState containsKey(K key) { - return new ReadableState() { - ReadableState> values = null; - final Object structuralKey = keyCoder.structuralValue(key); - - @Override - public Boolean read() { - KeyState keyState = keyStateMap.getOrDefault(structuralKey, null); - if (keyState != null && keyState.existence != KeyExistence.UNKNOWN_EXISTENCE) { - return keyState.existence == KeyExistence.KNOWN_EXIST; - } - if (values == null) { - values = WindmillMultimap.this.get(key); - } - return !Iterables.isEmpty(values.read()); - } - - @Override - public ReadableState readLater() { - if (values == null) { - values = WindmillMultimap.this.get(key); - } - values.readLater(); - return this; - } - }; - } - - // Currently, isEmpty is implemented by reading all keys and could potentially be optimized. - // But note that if isEmpty is often followed by iterating over keys then maybe not too bad; if - // isEmpty is followed by iterating over both keys and values then it won't help much. - @Override - public ReadableState isEmpty() { - return new ReadableState() { - ReadableState> keys = null; - - @Override - public Boolean read() { - for (KeyState keyState : keyStateMap.values()) { - if (keyState.existence == KeyExistence.KNOWN_EXIST) return false; - } - if (keys == null) { - keys = WindmillMultimap.this.keys(); - } - return Iterables.isEmpty(keys.read()); - } - - @Override - public ReadableState readLater() { - if (keys == null) { - keys = WindmillMultimap.this.keys(); - } - keys.readLater(); - return this; - } - }; - } - } - - private static class WindmillBag extends SimpleWindmillState implements BagState { - - private final StateNamespace namespace; - private final StateTag> address; - private final ByteString stateKey; - private final String stateFamily; - private final Coder elemCoder; - - private boolean cleared = false; - /** - * If non-{@literal null}, this contains the complete contents of the bag, except for any local - * additions. If {@literal null} then we don't know if Windmill contains additional values which - * should be part of the bag. 
We'll need to read them if the work item actually wants the bag - * contents. - */ - private ConcatIterables cachedValues = null; - - private List localAdditions = new ArrayList<>(); - private long encodedSize = 0; - - private WindmillBag( - StateNamespace namespace, - StateTag> address, - String stateFamily, - Coder elemCoder, - boolean isNewKey) { - this.namespace = namespace; - this.address = address; - this.stateKey = encodeKey(namespace, address); - this.stateFamily = stateFamily; - this.elemCoder = elemCoder; - if (isNewKey) { - this.cachedValues = new ConcatIterables<>(); - } - } - - @Override - public void clear() { - cleared = true; - cachedValues = new ConcatIterables<>(); - localAdditions = new ArrayList<>(); - encodedSize = 0; - } - - /** - * Return iterable over all bag values in Windmill which should contribute to overall bag - * contents. - */ - private Iterable fetchData(Future> persistedData) { - try (Closeable scope = scopedReadState()) { - if (cachedValues != null) { - return cachedValues.snapshot(); - } - Iterable data = persistedData.get(); - if (data instanceof Weighted) { - // We have a known bounded amount of data; cache it. - cachedValues = new ConcatIterables<>(); - cachedValues.extendWith(data); - encodedSize = ((Weighted) data).getWeight(); - return cachedValues.snapshot(); - } else { - // This is an iterable that may not fit in memory at once; don't cache it. - return data; - } - } catch (InterruptedException | ExecutionException | IOException e) { - if (e instanceof InterruptedException) { - Thread.currentThread().interrupt(); - } - throw new RuntimeException("Unable to read state", e); - } - } - - public boolean valuesAreCached() { - return cachedValues != null; - } - - @Override - @SuppressWarnings("FutureReturnValueIgnored") - public WindmillBag readLater() { - getFuture(); - return this; - } - - @Override - public Iterable read() { - return Iterables.concat( - fetchData(getFuture()), Iterables.limit(localAdditions, localAdditions.size())); - } - - @Override - public ReadableState isEmpty() { - return new ReadableState() { - @Override - public ReadableState readLater() { - WindmillBag.this.readLater(); - return this; - } - - @Override - public Boolean read() { - return Iterables.isEmpty(fetchData(getFuture())) && localAdditions.isEmpty(); - } - }; - } - - @Override - public void add(T input) { - localAdditions.add(input); - } - - @Override - public WorkItemCommitRequest persistDirectly(WindmillStateCache.ForKeyAndFamily cache) - throws IOException { - WorkItemCommitRequest.Builder commitBuilder = WorkItemCommitRequest.newBuilder(); - - Windmill.TagBag.Builder bagUpdatesBuilder = null; - - if (cleared) { - bagUpdatesBuilder = commitBuilder.addBagUpdatesBuilder(); - bagUpdatesBuilder.setDeleteAll(true); - cleared = false; - } - - if (!localAdditions.isEmpty()) { - // Tell Windmill to capture the local additions. - if (bagUpdatesBuilder == null) { - bagUpdatesBuilder = commitBuilder.addBagUpdatesBuilder(); - } - for (T value : localAdditions) { - ByteStringOutputStream stream = new ByteStringOutputStream(); - // Encode the value - elemCoder.encode(value, stream, Coder.Context.OUTER); - ByteString encoded = stream.toByteString(); - if (cachedValues != null) { - // We'll capture this value in the cache below. - // Capture the value's size now since we have it. 
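-            // (encodedSize accumulates the byte size of every cached value; it becomes the
-            // weight passed to cache.put() below, keeping cache accounting in encoded bytes.)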
-            encodedSize += encoded.size();
-          }
-          bagUpdatesBuilder.addValues(encoded);
-        }
-      }
-
-      if (bagUpdatesBuilder != null) {
-        bagUpdatesBuilder.setTag(stateKey).setStateFamily(stateFamily);
-      }
-
-      if (cachedValues != null) {
-        if (!localAdditions.isEmpty()) {
-          // Capture the local additions in the cached value since we and
-          // Windmill are now in agreement.
-          cachedValues.extendWith(localAdditions);
-        }
-        // We now know the complete bag contents, and any read on it will yield a
-        // cached value, so cache it for future reads.
-        cache.put(namespace, address, this, encodedSize);
-      }
-
-      // Don't reuse the localAdditions object; we don't want future changes to it to
-      // modify the value of cachedValues.
-      localAdditions = new ArrayList<>();
-
-      return commitBuilder.buildPartial();
-    }
-
-    private Future<Iterable<T>> getFuture() {
-      return cachedValues != null ? null : reader.bagFuture(stateKey, stateFamily, elemCoder);
-    }
-  }
-
-  private static class ConcatIterables<T> implements Iterable<T> {
-    // List of component iterables. Should only be appended to in order to support snapshot().
-    List<Iterable<T>> iterables;
-
-    public ConcatIterables() {
-      this.iterables = new ArrayList<>();
-    }
-
-    public void extendWith(Iterable<T> iterable) {
-      iterables.add(iterable);
-    }
-
-    @Override
-    public Iterator<T> iterator() {
-      return Iterators.concat(Iterables.transform(iterables, Iterable::iterator).iterator());
-    }
-
-    /**
-     * Returns a view of the current state of this iterable. Remembers the current length of
-     * iterables so that the returned value will not change due to future extendWith() calls.
-     */
-    public Iterable<T> snapshot() {
-      final int limit = iterables.size();
-      final List<Iterable<T>> iterablesList = iterables;
-      return () ->
-          Iterators.concat(
-              Iterators.transform(
-                  Iterators.limit(iterablesList.iterator(), limit), Iterable::iterator));
-    }
-  }
-
-  private static class WindmillWatermarkHold extends WindmillState implements WatermarkHoldState {
-    // The encoded size of an Instant.
-    private static final int ENCODED_SIZE = 8;
-
-    private final TimestampCombiner timestampCombiner;
-    private final StateNamespace namespace;
-    private final StateTag<WatermarkHoldState> address;
-    private final ByteString stateKey;
-    private final String stateFamily;
-
-    private boolean cleared = false;
-    /**
-     * If non-{@literal null}, the known current hold value, or absent if we know there are no
-     * output watermark holds. If {@literal null}, the current hold value could depend on holds in
-     * Windmill we do not yet know.
- */ - private Optional cachedValue = null; - - private Instant localAdditions = null; - - private WindmillWatermarkHold( - StateNamespace namespace, - StateTag address, - String stateFamily, - TimestampCombiner timestampCombiner, - boolean isNewKey) { - this.namespace = namespace; - this.address = address; - this.stateKey = encodeKey(namespace, address); - this.stateFamily = stateFamily; - this.timestampCombiner = timestampCombiner; - if (isNewKey) { - cachedValue = Optional.absent(); - } - } - - @Override - public void clear() { - cleared = true; - cachedValue = Optional.absent(); - localAdditions = null; - } - - @Override - @SuppressWarnings("FutureReturnValueIgnored") - public WindmillWatermarkHold readLater() { - getFuture(); - return this; - } - - @Override - public Instant read() { - try (Closeable scope = scopedReadState()) { - Instant persistedHold = getFuture().get(); - if (persistedHold == null) { - cachedValue = Optional.absent(); - } else { - cachedValue = Optional.of(persistedHold); - } - } catch (InterruptedException | ExecutionException | IOException e) { - if (e instanceof InterruptedException) { - Thread.currentThread().interrupt(); - } - throw new RuntimeException("Unable to read state", e); - } - - if (localAdditions == null) { - return cachedValue.orNull(); - } else if (!cachedValue.isPresent()) { - return localAdditions; - } else { - return timestampCombiner.combine(localAdditions, cachedValue.get()); - } - } - - @Override - public ReadableState isEmpty() { - throw new UnsupportedOperationException(); - } - - @Override - public void add(Instant outputTime) { - localAdditions = - (localAdditions == null) - ? outputTime - : timestampCombiner.combine(outputTime, localAdditions); - } - - @Override - public TimestampCombiner getTimestampCombiner() { - return timestampCombiner; - } - - @Override - public Future persist(final WindmillStateCache.ForKeyAndFamily cache) { - - Future result; - - if (!cleared && localAdditions == null) { - // No changes, so no need to update Windmill and no need to cache any value. 
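-        // (Returning here also skips the lazyTransform below, which would otherwise re-register
-        // this hold in the state cache.)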
-        return Futures.immediateFuture(WorkItemCommitRequest.newBuilder().buildPartial());
-      }
-
-      if (cleared && localAdditions == null) {
-        // Just clearing the persisted state; blind delete
-        WorkItemCommitRequest.Builder commitBuilder = WorkItemCommitRequest.newBuilder();
-        commitBuilder
-            .addWatermarkHoldsBuilder()
-            .setTag(stateKey)
-            .setStateFamily(stateFamily)
-            .setReset(true);
-
-        result = Futures.immediateFuture(commitBuilder.buildPartial());
-      } else if (cleared && localAdditions != null) {
-        // Since we cleared before adding, we can do a blind overwrite of persisted state
-        WorkItemCommitRequest.Builder commitBuilder = WorkItemCommitRequest.newBuilder();
-        commitBuilder
-            .addWatermarkHoldsBuilder()
-            .setTag(stateKey)
-            .setStateFamily(stateFamily)
-            .setReset(true)
-            .addTimestamps(WindmillTimeUtils.harnessToWindmillTimestamp(localAdditions));
-
-        cachedValue = Optional.of(localAdditions);
-
-        result = Futures.immediateFuture(commitBuilder.buildPartial());
-      } else if (!cleared && localAdditions != null) {
-        // Otherwise, we need to combine the local additions with the already persisted data
-        result = combineWithPersisted();
-      } else {
-        throw new IllegalStateException("Unreachable condition");
-      }
-
-      return Futures.lazyTransform(
-          result,
-          result1 -> {
-            cleared = false;
-            localAdditions = null;
-            if (cachedValue != null) {
-              cache.put(namespace, address, WindmillWatermarkHold.this, ENCODED_SIZE);
-            }
-            return result1;
-          });
-    }
-
-    private Future<Instant> getFuture() {
-      return cachedValue != null
-          ? Futures.immediateFuture(cachedValue.orNull())
-          : reader.watermarkFuture(stateKey, stateFamily);
-    }
-
-    /**
-     * Combines local additions with persisted data and mutates the {@code commitBuilder} to write
-     * the result.
-     */
-    private Future<WorkItemCommitRequest> combineWithPersisted() {
-      boolean windmillCanCombine = false;
-
-      // If the combined output time depends only on the window, then we are just blindly adding
-      // the same value that may or may not already be present. This depends on the state only
-      // being used for one window.
-      windmillCanCombine |= timestampCombiner.dependsOnlyOnWindow();
-
-      // If the combined output time depends only on the earliest input timestamp, then because
-      // assignOutputTime is monotonic, the hold only depends on the earliest output timestamp
-      // (which is the value submitted as a watermark hold). The only way holds for later inputs
-      // can be redundant is if they are later than (or equal to) the earliest. So taking the MIN
-      // implicitly, as Windmill does, has the desired behavior.
-      windmillCanCombine |= timestampCombiner.dependsOnlyOnEarliestTimestamp();
-
-      if (windmillCanCombine) {
-        // We do a blind write and let Windmill take the MIN
-        WorkItemCommitRequest.Builder commitBuilder = WorkItemCommitRequest.newBuilder();
-        commitBuilder
-            .addWatermarkHoldsBuilder()
-            .setTag(stateKey)
-            .setStateFamily(stateFamily)
-            .addTimestamps(WindmillTimeUtils.harnessToWindmillTimestamp(localAdditions));
-
-        if (cachedValue != null) {
-          cachedValue =
-              Optional.of(
-                  cachedValue.isPresent()
-                      ? timestampCombiner.combine(cachedValue.get(), localAdditions)
-                      : localAdditions);
-        }
-
-        return Futures.immediateFuture(commitBuilder.buildPartial());
-      } else {
-        // The non-fast path does a read-modify-write
-        return Futures.lazyTransform(
-            (cachedValue != null)
-                ? Futures.immediateFuture(cachedValue.orNull())
-                : reader.watermarkFuture(stateKey, stateFamily),
-            priorHold -> {
-              cachedValue =
-                  Optional.of(
-                      (priorHold != null)
-                          ?
timestampCombiner.combine(priorHold, localAdditions) - : localAdditions); - WorkItemCommitRequest.Builder commitBuilder = WorkItemCommitRequest.newBuilder(); - commitBuilder - .addWatermarkHoldsBuilder() - .setTag(stateKey) - .setStateFamily(stateFamily) - .setReset(true) - .addTimestamps(WindmillTimeUtils.harnessToWindmillTimestamp(cachedValue.get())); - - return commitBuilder.buildPartial(); - }); - } - } - } - - private static class WindmillCombiningState<InputT, AccumT, OutputT> extends WindmillState - implements CombiningState<InputT, AccumT, OutputT> { - - private final WindmillBag<AccumT> bag; - private final CombineFn<InputT, AccumT, OutputT> combineFn; - - /* We use a separate, in-memory AccumT rather than relying on the WindmillBag's - * localAdditions, because we want to combine multiple InputT's to a single AccumT - * before adding it. - */ - private AccumT localAdditionsAccum; - private boolean hasLocalAdditions = false; - - private WindmillCombiningState( - StateNamespace namespace, - StateTag<CombiningState<InputT, AccumT, OutputT>> address, - String stateFamily, - Coder<AccumT> accumCoder, - CombineFn<InputT, AccumT, OutputT> combineFn, - WindmillStateCache.ForKeyAndFamily cache, - boolean isNewKey) { - StateTag<BagState<AccumT>> internalBagAddress = StateTags.convertToBagTagInternal(address); - WindmillBag<AccumT> cachedBag = - (WindmillBag<AccumT>) cache.get(namespace, internalBagAddress); - this.bag = - (cachedBag != null) - ? cachedBag - : new WindmillBag<>(namespace, internalBagAddress, stateFamily, accumCoder, isNewKey); - this.combineFn = combineFn; - this.localAdditionsAccum = combineFn.createAccumulator(); - } - - @Override - void initializeForWorkItem( - WindmillStateReader reader, Supplier<Closeable> scopedReadStateSupplier) { - super.initializeForWorkItem(reader, scopedReadStateSupplier); - this.bag.initializeForWorkItem(reader, scopedReadStateSupplier); - } - - @Override - void cleanupAfterWorkItem() { - super.cleanupAfterWorkItem(); - bag.cleanupAfterWorkItem(); - } - - @Override - public WindmillCombiningState<InputT, AccumT, OutputT> readLater() { - bag.readLater(); - return this; - } - - @Override - public OutputT read() { - return combineFn.extractOutput(getAccum()); - } - - @Override - public void add(InputT input) { - hasLocalAdditions = true; - localAdditionsAccum = combineFn.addInput(localAdditionsAccum, input); - } - - @Override - public void clear() { - bag.clear(); - localAdditionsAccum = combineFn.createAccumulator(); - hasLocalAdditions = false; - } - - @Override - public Future<WorkItemCommitRequest> persist(WindmillStateCache.ForKeyAndFamily cache) - throws IOException { - if (hasLocalAdditions) { - if (COMPACT_NOW.get().get() || bag.valuesAreCached()) { - // Implicitly clears the bag and combines local and persisted accumulators.
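- // getAccum() merges the persisted bag with localAdditionsAccum, clears the bag, and leaves - // the merged accumulator cached locally, so the bag.add() below writes one compacted value.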
- localAdditionsAccum = getAccum(); - } - bag.add(combineFn.compact(localAdditionsAccum)); - localAdditionsAccum = combineFn.createAccumulator(); - hasLocalAdditions = false; - } - - return bag.persist(cache); - } - - @Override - public AccumT getAccum() { - Iterable<AccumT> accums = - Iterables.concat(bag.read(), Collections.singleton(localAdditionsAccum)); - - // Compact things - AccumT merged = combineFn.mergeAccumulators(accums); - bag.clear(); - localAdditionsAccum = merged; - hasLocalAdditions = true; - return merged; - } - - @Override - public ReadableState<Boolean> isEmpty() { - final ReadableState<Boolean> bagIsEmpty = bag.isEmpty(); - return new ReadableState<Boolean>() { - @Override - public ReadableState<Boolean> readLater() { - bagIsEmpty.readLater(); - return this; - } - - @Override - public Boolean read() { - return !hasLocalAdditions && bagIsEmpty.read(); - } - }; - } - - @Override - public void addAccum(AccumT accum) { - hasLocalAdditions = true; - localAdditionsAccum = combineFn.mergeAccumulators(Arrays.asList(localAdditionsAccum, accum)); - } - - @Override - public AccumT mergeAccumulators(Iterable<AccumT> accumulators) { - return combineFn.mergeAccumulators(accumulators); - } - } - - @VisibleForTesting - static final ThreadLocal<Supplier<Boolean>> COMPACT_NOW = - ThreadLocal.withInitial( - () -> - new Supplier<Boolean>() { - /* The rate at which, on average, this will return true. */ - static final double RATE = 0.002; - Random random = new Random(); - long counter = nextSample(); - - private long nextSample() { - // Use geometric distribution to find next true value. - // This lets us avoid invoking random.nextDouble() on every call. - return (long) Math.floor(Math.log(random.nextDouble()) / Math.log(1 - RATE)); - } - - @Override - public Boolean get() { - counter--; - if (counter < 0) { - counter = nextSample(); - return true; - } else { - return false; - } - } - }); -} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSources.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSources.java index 872dc1e89a79f..a9050236efc80 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSources.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WorkerCustomSources.java @@ -776,6 +776,9 @@ public double getRemainingParallelism() { private static class UnboundedReaderIterator<T> extends NativeReader.NativeReaderIterator<WindowedValue<ValueWithRecordId<T>>> { + // Do not close reader. The reader is cached in StreamingModeExecutionContext.readerCache, and + // will be reused until the cache is evicted, expired or invalidated. + // See UnboundedReader#iterator(). private final UnboundedSource.UnboundedReader<T> reader; private final StreamingModeExecutionContext context; private final boolean started; @@ -862,7 +865,9 @@ public WindowedValue<ValueWithRecordId<T>> getCurrent() throws NoSuchElementExce } @Override - public void close() {} + public void close() { + // Don't close reader.
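+ // Closing the reader here would break reuse by later work items; the ReaderCache closes + // readers itself when entries are evicted, expired or invalidated.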
+ } @Override public NativeReader.Progress getProgress() { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/options/StreamingDataflowWorkerOptions.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/options/StreamingDataflowWorkerOptions.java index cc5b3302b01bb..bacfa1eef63bb 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/options/StreamingDataflowWorkerOptions.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/options/StreamingDataflowWorkerOptions.java @@ -21,7 +21,7 @@ import org.apache.beam.runners.dataflow.options.DataflowWorkerHarnessOptions; import org.apache.beam.runners.dataflow.worker.windmill.WindmillServerStub; import org.apache.beam.runners.dataflow.worker.windmill.appliance.JniWindmillApplianceServer; -import org.apache.beam.runners.dataflow.worker.windmill.grpcclient.GrpcWindmillServer; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillServer; import org.apache.beam.sdk.options.Default; import org.apache.beam.sdk.options.DefaultValueFactory; import org.apache.beam.sdk.options.Description; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java index 529bb0a419070..9858666c40a23 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java @@ -32,9 +32,9 @@ import javax.annotation.Nullable; import javax.annotation.concurrent.GuardedBy; import javax.annotation.concurrent.ThreadSafe; -import org.apache.beam.runners.dataflow.worker.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataRequest; +import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationState.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationState.java index a902d2b13a776..9d7a9131f5849 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationState.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ComputationState.java @@ -23,9 +23,9 @@ import java.util.Map; import java.util.concurrent.ConcurrentLinkedQueue; import javax.annotation.Nullable; -import org.apache.beam.runners.dataflow.worker.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import 
org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.joda.time.Instant; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInput.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInput.java new file mode 100644 index 0000000000000..04eecadc1e5c1 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInput.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.streaming.sideinput; + +import com.google.auto.value.AutoValue; +import java.util.Optional; +import javax.annotation.Nullable; + +/** + * Entry in the side input cache that stores the value and the encoded size of the value. + * + *
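<p>The encoded size is used as the entry's weight when the cache evicts by total weight. + * + *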

<p>Can be in 1 of 3 states: + * + * <ul> + *   <li>Ready with a value. + *   <li>Ready with no value, represented as {@link Optional}. + *   <li>Not ready. + * </ul>
+ */ +@AutoValue +public abstract class SideInput<T> { + static <T> SideInput<T> ready(@Nullable T value, int encodedSize) { + return new AutoValue_SideInput<>(true, Optional.ofNullable(value), encodedSize); + } + + static <T> SideInput<T> notReady() { + return new AutoValue_SideInput<>(false, Optional.empty(), 0); + } + + public abstract boolean isReady(); + + public abstract Optional<T> value(); + + public abstract int size(); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputCache.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputCache.java new file mode 100644 index 0000000000000..721c477435ef9 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputCache.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.streaming.sideinput; + +import com.google.auto.value.AutoValue; +import com.google.errorprone.annotations.CheckReturnValue; +import java.util.Optional; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import org.apache.beam.sdk.transforms.windowing.BoundedWindow; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.sdk.values.TypeDescriptor; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.Cache; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.CacheBuilder; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.Weigher; + +/** + * Wrapper around {@code Cache<Key<?>, SideInput<?>>} that mostly delegates to the underlying + * cache, but adds thread-safe functionality to invalidate and load entries that are not ready. + * + * @implNote Returned values are explicitly cast, because the {@link #sideInputCache} holds wildcard + * types of all objects.
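+ * + * <p>Entries are weighed by their encoded size (up to {@code MAXIMUM_CACHE_WEIGHT}) and expire + * {@code CACHE_ENTRY_EXPIRY_MINUTES} after write; see {@link #create()}.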
+ */ +@CheckReturnValue +final class SideInputCache { + + private static final long MAXIMUM_CACHE_WEIGHT = 100000000; /* 100 MB */ + private static final long CACHE_ENTRY_EXPIRY_MINUTES = 1L; + + private final Cache<Key<?>, SideInput<?>> sideInputCache; + + SideInputCache(Cache<Key<?>, SideInput<?>> sideInputCache) { + this.sideInputCache = sideInputCache; + } + + static SideInputCache create() { + return new SideInputCache( + CacheBuilder.newBuilder() + .maximumWeight(MAXIMUM_CACHE_WEIGHT) + .expireAfterWrite(CACHE_ENTRY_EXPIRY_MINUTES, TimeUnit.MINUTES) + .weigher((Weigher<Key<?>, SideInput<?>>) (id, entry) -> entry.size()) + .build()); + } + + synchronized <T> SideInput<T> invalidateThenLoadNewEntry( + Key<T> key, Callable<SideInput<T>> cacheLoaderFn) throws ExecutionException { + // Invalidate the existing not-ready entry. This must be done atomically + // so that another thread doesn't replace the entry with a ready entry, which + // would then be deleted here. + Optional<SideInput<T>> newEntry = getIfPresentUnchecked(key); + if (newEntry.isPresent() && !newEntry.get().isReady()) { + sideInputCache.invalidate(key); + } + + return getUnchecked(key, cacheLoaderFn); + } + + <T> Optional<SideInput<T>> get(Key<T> key) { + return getIfPresentUnchecked(key); + } + + <T> SideInput<T> getOrLoad(Key<T> key, Callable<SideInput<T>> cacheLoaderFn) + throws ExecutionException { + return getUnchecked(key, cacheLoaderFn); + } + + @SuppressWarnings({ + "unchecked" // cacheLoaderFn loads SideInput<T>, and key is of type T, so value for Key<T> is + // always SideInput<T>. + }) + private <T> SideInput<T> getUnchecked(Key<T> key, Callable<SideInput<T>> cacheLoaderFn) + throws ExecutionException { + return (SideInput<T>) sideInputCache.get(key, cacheLoaderFn); + } + + @SuppressWarnings({ + "unchecked" // cacheLoaderFn loads SideInput<T>, and key is of type T, so value for Key<T> is + // always SideInput<T>. + }) + private <T> Optional<SideInput<T>> getIfPresentUnchecked(Key<T> key) { + return Optional.ofNullable((SideInput<T>) sideInputCache.getIfPresent(key)); + } + + @AutoValue + abstract static class Key<T> { + static <T> Key<T> create( + TupleTag<?> tag, BoundedWindow window, TypeDescriptor<T> typeDescriptor) { + return new AutoValue_SideInputCache_Key<>(tag, window, typeDescriptor); + } + + abstract TupleTag<?> tag(); + + abstract BoundedWindow window(); + + abstract TypeDescriptor<T> typeDescriptor(); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputState.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputState.java new file mode 100644 index 0000000000000..d7af10d29e1f5 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputState.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.streaming.sideinput; + +/** Indicates the caller's knowledge of whether a particular side input has been computed. */ +public enum SideInputState { + CACHED_IN_WORK_ITEM, + KNOWN_READY, + UNKNOWN +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcher.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcher.java new file mode 100644 index 0000000000000..aa61c4219353a --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcher.java @@ -0,0 +1,245 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.streaming.sideinput; + +import static org.apache.beam.sdk.transforms.Materializations.ITERABLE_MATERIALIZATION_URN; +import static org.apache.beam.sdk.transforms.Materializations.MULTIMAP_MATERIALIZATION_URN; +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Collections; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.Callable; +import javax.annotation.concurrent.NotThreadSafe; +import org.apache.beam.runners.core.InMemoryMultimapSideInputView; +import org.apache.beam.runners.dataflow.worker.MetricTrackingWindmillServerStub; +import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalData; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.IterableCoder; +import org.apache.beam.sdk.coders.KvCoder; +import org.apache.beam.sdk.transforms.Materializations.IterableView; +import org.apache.beam.sdk.transforms.Materializations.MultimapView; +import org.apache.beam.sdk.transforms.ViewFn; +import org.apache.beam.sdk.transforms.windowing.BoundedWindow; +import org.apache.beam.sdk.util.ByteStringOutputStream; +import org.apache.beam.sdk.values.PCollectionView; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.sdk.values.WindowingStrategy; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Class responsible for fetching state from the 
Windmill server. */ +@NotThreadSafe +public class SideInputStateFetcher { + private static final Logger LOG = LoggerFactory.getLogger(SideInputStateFetcher.class); + + private static final Set<String> SUPPORTED_MATERIALIZATIONS = + ImmutableSet.of(ITERABLE_MATERIALIZATION_URN, MULTIMAP_MATERIALIZATION_URN); + + private final SideInputCache sideInputCache; + private final MetricTrackingWindmillServerStub server; + private long bytesRead = 0L; + + public SideInputStateFetcher(MetricTrackingWindmillServerStub server) { + this(server, SideInputCache.create()); + } + + SideInputStateFetcher(MetricTrackingWindmillServerStub server, SideInputCache sideInputCache) { + this.server = server; + this.sideInputCache = sideInputCache; + } + + private static <T> Iterable<?> decodeRawData(PCollectionView<T> view, GlobalData data) + throws IOException { + return !data.getData().isEmpty() + ? IterableCoder.of(getCoder(view)).decode(data.getData().newInput()) + : Collections.emptyList(); + } + + @SuppressWarnings({ + "deprecation" // Required as part of the SideInputCacheKey, and not exposed. + }) + private static <T> TupleTag<?> getInternalTag(PCollectionView<T> view) { + return view.getTagInternal(); + } + + @SuppressWarnings("deprecation") + private static <T> ViewFn<?, T> getViewFn(PCollectionView<T> view) { + return view.getViewFn(); + } + + @SuppressWarnings({ + "deprecation" // The view's internal coder is required to decode the raw data. + }) + private static <T> Coder<?> getCoder(PCollectionView<T> view) { + return view.getCoderInternal(); + } + + /** Returns a view of the underlying cache that keeps track of bytes read separately. */ + public SideInputStateFetcher byteTrackingView() { + return new SideInputStateFetcher(server, sideInputCache); + } + + public long getBytesRead() { + return bytesRead; + } + + /** + * Fetch the given side input, storing it in a process-level cache. + * + *
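<p>Entries are keyed by the view's tag, the side input window, and the view's type descriptor; a + * not-ready entry may be cached when Windmill has not yet computed the data. + * + *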

<p>If state is KNOWN_READY, attempt to fetch the data regardless of whether a not-ready entry + * was cached. + */ + public <T> SideInput<T> fetchSideInput( + PCollectionView<T> view, + BoundedWindow sideWindow, + String stateFamily, + SideInputState state, + Supplier<Closeable> scopedReadStateSupplier) { + Callable<SideInput<T>> loadSideInputFromWindmill = + () -> loadSideInputFromWindmill(view, sideWindow, stateFamily, scopedReadStateSupplier); + SideInputCache.Key<T> sideInputCacheKey = + SideInputCache.Key.create( + getInternalTag(view), sideWindow, getViewFn(view).getTypeDescriptor()); + + try { + if (state == SideInputState.KNOWN_READY) { + Optional<SideInput<T>> existingCacheEntry = sideInputCache.get(sideInputCacheKey); + if (!existingCacheEntry.isPresent()) { + return sideInputCache.getOrLoad(sideInputCacheKey, loadSideInputFromWindmill); + } + + if (!existingCacheEntry.get().isReady()) { + return sideInputCache.invalidateThenLoadNewEntry( + sideInputCacheKey, loadSideInputFromWindmill); + } + + return existingCacheEntry.get(); + } + + return sideInputCache.getOrLoad(sideInputCacheKey, loadSideInputFromWindmill); + } catch (Exception e) { + LOG.error("Fetch failed: ", e); + throw new RuntimeException("Exception while fetching side input: ", e); + } + } + + private <SideWindowT extends BoundedWindow> GlobalData fetchGlobalDataFromWindmill( + PCollectionView<?> view, + SideWindowT sideWindow, + String stateFamily, + Supplier<Closeable> scopedReadStateSupplier) + throws IOException { + @SuppressWarnings({ + "deprecation", // Internal windowStrategy is required to fetch side input data from Windmill. + "unchecked" // Internal windowing strategy matches WindowingStrategy<?, SideWindowT>. + }) + WindowingStrategy<?, SideWindowT> sideWindowStrategy = + (WindowingStrategy<?, SideWindowT>) view.getWindowingStrategyInternal(); + + Coder<SideWindowT> windowCoder = sideWindowStrategy.getWindowFn().windowCoder(); + + ByteStringOutputStream windowStream = new ByteStringOutputStream(); + windowCoder.encode(sideWindow, windowStream); + + Windmill.GlobalDataRequest request = + Windmill.GlobalDataRequest.newBuilder() + .setDataId( + Windmill.GlobalDataId.newBuilder() + .setTag(getInternalTag(view).getId()) + .setVersion(windowStream.toByteString()) + .build()) + .setStateFamily(stateFamily) + .setExistenceWatermarkDeadline( + WindmillTimeUtils.harnessToWindmillTimestamp( + sideWindowStrategy.getTrigger().getWatermarkThatGuaranteesFiring(sideWindow))) + .build(); + + try (Closeable ignored = scopedReadStateSupplier.get()) { + return server.getSideInputData(request); + } + } + + private <T> SideInput<T> loadSideInputFromWindmill( + PCollectionView<T> view, + BoundedWindow sideWindow, + String stateFamily, + Supplier<Closeable> scopedReadStateSupplier) + throws IOException { + validateViewMaterialization(view); + GlobalData data = + fetchGlobalDataFromWindmill(view, sideWindow, stateFamily, scopedReadStateSupplier); + bytesRead += data.getSerializedSize(); + return data.getIsReady() ? 
createSideInputCacheEntry(view, data) : SideInput.notReady(); + } + + private void validateViewMaterialization(PCollectionView<?> view) { + String materializationUrn = getViewFn(view).getMaterialization().getUrn(); + checkState( + SUPPORTED_MATERIALIZATIONS.contains(materializationUrn), + "Only materializations of type %s supported, received %s", + SUPPORTED_MATERIALIZATIONS, + materializationUrn); + } + + private <T> SideInput<T> createSideInputCacheEntry(PCollectionView<T> view, GlobalData data) + throws IOException { + Iterable<?> rawData = decodeRawData(view, data); + switch (getViewFn(view).getMaterialization().getUrn()) { + case ITERABLE_MATERIALIZATION_URN: + { + @SuppressWarnings({ + "unchecked", // ITERABLE_MATERIALIZATION_URN has ViewFn<IterableView, T>. + "rawtypes" // TODO(https://github.com/apache/beam/issues/20447) + }) + ViewFn<IterableView, T> viewFn = (ViewFn<IterableView, T>) getViewFn(view); + return SideInput.ready(viewFn.apply(() -> rawData), data.getData().size()); + } + case MULTIMAP_MATERIALIZATION_URN: + { + @SuppressWarnings({ + "unchecked", // MULTIMAP_MATERIALIZATION_URN has ViewFn<MultimapView, T>. + "rawtypes" // TODO(https://github.com/apache/beam/issues/20447) + }) + ViewFn<MultimapView, T> viewFn = (ViewFn<MultimapView, T>) getViewFn(view); + Coder<?> keyCoder = ((KvCoder<?, ?>) getCoder(view)).getKeyCoder(); + + @SuppressWarnings({ + "unchecked", // Safe since multimap rawData is of type Iterable<KV<K, V>> + "rawtypes", // TODO(https://github.com/apache/beam/issues/20447) + }) + T multimapSideInputValue = + viewFn.apply( + InMemoryMultimapSideInputView.fromIterable(keyCoder, (Iterable) rawData)); + return SideInput.ready(multimapSideInputValue, data.getData().size()); + } + default: + { + throw new IllegalStateException( + "Unknown side input materialization format requested: " + + getViewFn(view).getMaterialization().getUrn()); + } + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutor.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutor.java index a160b0e6ad036..dcff1f73f10fd 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutor.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutor.java @@ -123,6 +123,22 @@ public int activeCount() { return activeCount.intValue(); } + public long bytesOutstanding() { + return bytesOutstanding; + } + + public long elementsOutstanding() { + return elementsOutstanding; + } + + public long maximumBytesOutstanding() { + return maximumBytesOutstanding; + } + + public long maximumElementsOutstanding() { + return maximumElementsOutstanding; + } + public String summaryHtml() { monitor.enter(); try { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerBase.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerBase.java index fe81eece13830..8caa79cd3f76f 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerBase.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerBase.java @@ -19,10 +19,10 @@ import java.io.IOException; import java.util.Set; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.CommitWorkStream; -import 
org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetDataStream; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetWorkStream; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetWorkStream.WorkItemReceiver; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemReceiver; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; /** diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerStub.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerStub.java index 1bb5359e06f48..c327e68d7e913 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerStub.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillServerStub.java @@ -21,10 +21,10 @@ import java.io.PrintWriter; import java.util.Set; import org.apache.beam.runners.dataflow.worker.status.StatusDataProvider; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.CommitWorkStream; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetDataStream; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetWorkStream; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetWorkStream.WorkItemReceiver; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemReceiver; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; /** Stub for communicating with a Windmill server. */ diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/AbstractWindmillStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java similarity index 98% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/AbstractWindmillStream.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java index ea7efff7a06d9..4e47676989a6e 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/AbstractWindmillStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/AbstractWindmillStream.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.beam.runners.dataflow.worker.windmill; +package org.apache.beam.runners.dataflow.worker.windmill.client; import java.io.IOException; import java.io.PrintWriter; @@ -30,6 +30,7 @@ import java.util.concurrent.atomic.AtomicReference; import java.util.function.Function; import java.util.function.Supplier; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers.StreamObserverFactory; import org.apache.beam.sdk.util.BackOff; import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.Status; import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.StatusRuntimeException; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java similarity index 84% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillStream.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java index 4dd4164fc4efd..fa1f797a19114 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStream.java @@ -15,15 +15,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.runners.dataflow.worker.windmill; +package org.apache.beam.runners.dataflow.worker.windmill.client; -import java.util.Collection; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; import java.util.function.Consumer; import javax.annotation.concurrent.ThreadSafe; -import org.checkerframework.checker.nullness.qual.Nullable; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; import org.joda.time.Instant; /** Superclass for streams returned by streaming Windmill methods. */ @@ -41,16 +41,11 @@ public interface WindmillStream { /** Handle representing a stream of GetWork responses. */ @ThreadSafe interface GetWorkStream extends WindmillStream { - /** Functional interface for receiving WorkItems. */ - @FunctionalInterface - interface WorkItemReceiver { - void receiveWork( - String computation, - @Nullable Instant inputDataWatermark, - @Nullable Instant synchronizedProcessingTime, - Windmill.WorkItem workItem, - Collection<Windmill.LatencyAttribution> getWorkStreamLatencies); - } + /** Adjusts the {@link GetWorkBudget} for the stream. */ + void adjustBudget(long itemsDelta, long bytesDelta); + + /** Returns the remaining in-flight {@link GetWorkBudget}. */ + GetWorkBudget remainingBudget(); } /** Interface for streaming GetDataRequests to Windmill.
*/ diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillStreamPool.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPool.java similarity index 99% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillStreamPool.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPool.java index 9cd4ab0ea4a5b..9f1b67edc1e0a 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillStreamPool.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPool.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.runners.dataflow.worker.windmill; +package org.apache.beam.runners.dataflow.worker.windmill.client; import java.util.ArrayList; import java.util.HashMap; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/AppendableInputStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/AppendableInputStream.java similarity index 98% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/AppendableInputStream.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/AppendableInputStream.java index dbd3613ee4c29..6a0d0a63d5a95 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/AppendableInputStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/AppendableInputStream.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.beam.runners.dataflow.worker.windmill.grpcclient; +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; import java.io.IOException; import java.io.InputStream; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GetWorkTimingInfosTracker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GetWorkTimingInfosTracker.java similarity index 99% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GetWorkTimingInfosTracker.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GetWorkTimingInfosTracker.java index e6710993af9b4..221b18be164c6 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GetWorkTimingInfosTracker.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GetWorkTimingInfosTracker.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.runners.dataflow.worker.windmill.grpcclient; +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; import java.util.ArrayList; import java.util.Collection; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcCommitWorkStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcCommitWorkStream.java similarity index 96% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcCommitWorkStream.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcCommitWorkStream.java index 1bba40805dec4..5d0a5085fe1b7 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcCommitWorkStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcCommitWorkStream.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.beam.runners.dataflow.worker.windmill.grpcclient; +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull; @@ -27,15 +27,16 @@ import java.util.concurrent.atomic.AtomicLong; import java.util.function.Consumer; import java.util.function.Function; -import org.apache.beam.runners.dataflow.worker.windmill.AbstractWindmillStream; -import org.apache.beam.runners.dataflow.worker.windmill.StreamObserverFactory; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.CommitStatus; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingCommitRequestChunk; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingCommitResponse; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingCommitWorkRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItemCommitRequest; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.CommitWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.AbstractWindmillStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers.StreamObserverFactory; +import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import org.apache.beam.sdk.util.BackOff; import org.apache.beam.vendor.grpc.v1p54p0.com.google.protobuf.ByteString; import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.stub.StreamObserver; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcDeadlineClientInterceptor.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDeadlineClientInterceptor.java similarity index 97% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcDeadlineClientInterceptor.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDeadlineClientInterceptor.java index 6b0e19cbb4802..629006e23596e 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcDeadlineClientInterceptor.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDeadlineClientInterceptor.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.beam.runners.dataflow.worker.windmill.grpcclient; +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; import java.util.concurrent.TimeUnit; import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.CallOptions; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDispatcherClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDispatcherClient.java new file mode 100644 index 0000000000000..ef9156f9c0503 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDispatcherClient.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; + +import static org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillChannelFactory.LOCALHOST; +import static org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillChannelFactory.localhostChannel; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Random; +import java.util.Set; +import javax.annotation.concurrent.GuardedBy; +import javax.annotation.concurrent.ThreadSafe; +import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc; +import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub; +import org.apache.beam.runners.dataflow.worker.windmill.WindmillServiceAddress; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillStubFactory; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Manages endpoints and stubs for connecting to the Windmill Dispatcher. 
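+ * Stubs are only recreated when the set of dispatcher endpoints actually changes; see + * {@code consumeWindmillDispatcherEndpoints}.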
*/ +@ThreadSafe +class GrpcDispatcherClient { + private static final Logger LOG = LoggerFactory.getLogger(GrpcDispatcherClient.class); + private final WindmillStubFactory windmillStubFactory; + + @GuardedBy("this") + private final List<CloudWindmillServiceV1Alpha1Stub> dispatcherStubs; + + @GuardedBy("this") + private final Set<HostAndPort> dispatcherEndpoints; + + @GuardedBy("this") + private final Random rand; + + private GrpcDispatcherClient( + WindmillStubFactory windmillStubFactory, + List<CloudWindmillServiceV1Alpha1Stub> dispatcherStubs, + Set<HostAndPort> dispatcherEndpoints, + Random rand) { + this.windmillStubFactory = windmillStubFactory; + this.dispatcherStubs = dispatcherStubs; + this.dispatcherEndpoints = dispatcherEndpoints; + this.rand = rand; + } + + static GrpcDispatcherClient create(WindmillStubFactory windmillStubFactory) { + return new GrpcDispatcherClient( + windmillStubFactory, new ArrayList<>(), new HashSet<>(), new Random()); + } + + @VisibleForTesting + static GrpcDispatcherClient forTesting( + WindmillStubFactory windmillGrpcStubFactory, + List<CloudWindmillServiceV1Alpha1Stub> dispatcherStubs, + Set<HostAndPort> dispatcherEndpoints) { + Preconditions.checkArgument(dispatcherEndpoints.size() == dispatcherStubs.size()); + return new GrpcDispatcherClient( + windmillGrpcStubFactory, dispatcherStubs, dispatcherEndpoints, new Random()); + } + + synchronized CloudWindmillServiceV1Alpha1Stub getDispatcherStub() { + Preconditions.checkState( + !dispatcherStubs.isEmpty(), "windmillServiceEndpoint has not been set"); + + return (dispatcherStubs.size() == 1 + ? dispatcherStubs.get(0) + : dispatcherStubs.get(rand.nextInt(dispatcherStubs.size()))); + } + + synchronized boolean isReady() { + return !dispatcherStubs.isEmpty(); + } + + synchronized void consumeWindmillDispatcherEndpoints( + ImmutableSet<HostAndPort> dispatcherEndpoints) { + Preconditions.checkArgument( + dispatcherEndpoints != null && !dispatcherEndpoints.isEmpty(), + "Cannot set dispatcher endpoints to nothing."); + if (this.dispatcherEndpoints.equals(dispatcherEndpoints)) { + // The endpoints are equal; don't recreate the stubs. + return; + } + + LOG.info("Creating a new windmill stub, endpoints: {}", dispatcherEndpoints); + if (!this.dispatcherEndpoints.isEmpty()) { + LOG.info("Previous windmill stub endpoints: {}", this.dispatcherEndpoints); + } + + resetDispatcherEndpoints(dispatcherEndpoints); + } + + private synchronized void resetDispatcherEndpoints( + ImmutableSet<HostAndPort> newDispatcherEndpoints) { + LOG.info("Initializing Streaming Engine GRPC client for endpoints: {}", newDispatcherEndpoints); + this.dispatcherStubs.clear(); + this.dispatcherEndpoints.clear(); + this.dispatcherEndpoints.addAll(newDispatcherEndpoints); + + dispatcherEndpoints.stream() + .map(this::createDispatcherStubForWindmillService) + .forEach(dispatcherStubs::add); + } + + private CloudWindmillServiceV1Alpha1Stub createDispatcherStubForWindmillService( + HostAndPort endpoint) { + if (LOCALHOST.equals(endpoint.getHost())) { + return CloudWindmillServiceV1Alpha1Grpc.newStub(localhostChannel(endpoint.getPort())); + } + + // Use an in-process stub if testing. + return windmillStubFactory.getKind() == WindmillStubFactory.Kind.IN_PROCESS + ? 
windmillStubFactory.inProcess().get() + : windmillStubFactory.remote().apply(WindmillServiceAddress.create(endpoint)); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetDataStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java similarity index 95% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetDataStream.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java index 238cc771dce8b..ea9cd7f0fa321 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetDataStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStream.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.runners.dataflow.worker.windmill.grpcclient; +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Verify.verify; @@ -33,8 +33,6 @@ import java.util.concurrent.CountDownLatch; import java.util.concurrent.atomic.AtomicLong; import java.util.function.Function; -import org.apache.beam.runners.dataflow.worker.windmill.AbstractWindmillStream; -import org.apache.beam.runners.dataflow.worker.windmill.StreamObserverFactory; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.ComputationGetDataRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalData; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalDataRequest; @@ -43,9 +41,12 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataResponse; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingGetDataRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingGetDataResponse; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetDataStream; -import org.apache.beam.runners.dataflow.worker.windmill.grpcclient.GrpcGetDataStreamRequests.QueuedBatch; -import org.apache.beam.runners.dataflow.worker.windmill.grpcclient.GrpcGetDataStreamRequests.QueuedRequest; +import org.apache.beam.runners.dataflow.worker.windmill.client.AbstractWindmillStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcGetDataStreamRequests.QueuedBatch; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcGetDataStreamRequests.QueuedRequest; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers.StreamObserverFactory; +import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import org.apache.beam.sdk.util.BackOff; import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.stub.StreamObserver; import org.joda.time.Instant; diff --git 
a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetDataStreamRequests.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStreamRequests.java similarity index 98% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetDataStreamRequests.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStreamRequests.java index 7da7b13958b9b..cda9537127d92 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetDataStreamRequests.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetDataStreamRequests.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.runners.dataflow.worker.windmill.grpcclient; +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; import com.google.auto.value.AutoOneOf; import java.util.ArrayList; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetWorkStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java similarity index 89% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetWorkStream.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java index 4660fe25b13b3..d7d9bfddffb02 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetWorkStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.beam.runners.dataflow.worker.windmill.grpcclient; +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; import java.io.IOException; import java.io.PrintWriter; @@ -27,16 +27,18 @@ import java.util.function.Function; import javax.annotation.Nullable; import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; -import org.apache.beam.runners.dataflow.worker.windmill.AbstractWindmillStream; -import org.apache.beam.runners.dataflow.worker.windmill.StreamObserverFactory; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.LatencyAttribution; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingGetWorkRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingGetWorkRequestExtension; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingGetWorkResponseChunk; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetWorkStream; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetWorkStream.WorkItemReceiver; +import org.apache.beam.runners.dataflow.worker.windmill.client.AbstractWindmillStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers.StreamObserverFactory; +import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; +import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemReceiver; +import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; import org.apache.beam.sdk.util.BackOff; import org.apache.beam.vendor.grpc.v1p54p0.com.google.protobuf.ByteString; import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.stub.StreamObserver; @@ -44,7 +46,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -final class GrpcGetWorkStream +public final class GrpcGetWorkStream extends AbstractWindmillStream<StreamingGetWorkRequest, StreamingGetWorkResponseChunk> implements GetWorkStream { @@ -79,7 +81,7 @@ private GrpcGetWorkStream( this.inflightBytes = new AtomicLong(); } - static GrpcGetWorkStream create( + public static GrpcGetWorkStream create( Function< StreamObserver<StreamingGetWorkResponseChunk>, StreamObserver<StreamingGetWorkRequest>> @@ -190,6 +192,19 @@ protected void startThrottleTimer() { getWorkThrottleTimer.start(); } + @Override + public void adjustBudget(long itemsDelta, long bytesDelta) { + // no-op + } + + @Override + public GetWorkBudget remainingBudget() { + return GetWorkBudget.builder() + .setBytes(request.getMaxBytes() - inflightBytes.get()) + .setItems(request.getMaxItems() - inflightMessages.get()) + .build(); + } + private class WorkItemBuffer { private final GetWorkTimingInfosTracker workTimingInfosTracker; private String computation; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetWorkerMetadataStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStream.java similarity index 93% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetWorkerMetadataStream.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStream.java index 
427fd412ec7f4..a403feddb4503 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetWorkerMetadataStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStream.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.runners.dataflow.worker.windmill.grpcclient; +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; import com.google.errorprone.annotations.concurrent.GuardedBy; import java.io.PrintWriter; @@ -23,13 +23,14 @@ import java.util.Set; import java.util.function.Consumer; import java.util.function.Function; -import org.apache.beam.runners.dataflow.worker.windmill.AbstractWindmillStream; -import org.apache.beam.runners.dataflow.worker.windmill.StreamObserverFactory; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkerMetadataRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkerMetadataResponse; import org.apache.beam.runners.dataflow.worker.windmill.WindmillEndpoints; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetWorkerMetadataStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.AbstractWindmillStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkerMetadataStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers.StreamObserverFactory; +import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import org.apache.beam.sdk.util.BackOff; import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.stub.StreamObserver; import org.slf4j.Logger; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServer.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServer.java new file mode 100644 index 0000000000000..3a881df714624 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServer.java @@ -0,0 +1,355 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; + +import static org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillChannelFactory.LOCALHOST; +import static org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillChannelFactory.inProcessChannel; +import static org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillChannelFactory.localhostChannel; + +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.function.Supplier; +import javax.annotation.Nullable; +import org.apache.beam.runners.dataflow.worker.options.StreamingDataflowWorkerOptions; +import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc; +import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.CommitWorkRequest; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.CommitWorkResponse; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetConfigRequest; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetConfigResponse; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetDataRequest; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetDataResponse; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkResponse; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.ReportStatsRequest; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.ReportStatsResponse; +import org.apache.beam.runners.dataflow.worker.windmill.WindmillApplianceGrpc; +import org.apache.beam.runners.dataflow.worker.windmill.WindmillServerStub; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillStubFactory; +import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.StreamingEngineThrottleTimers; +import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemReceiver; +import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.util.BackOff; +import org.apache.beam.sdk.util.BackOffUtils; +import org.apache.beam.sdk.util.FluentBackoff; +import org.apache.beam.sdk.util.Sleeper; +import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.Channel; +import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.ManagedChannel; +import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.StatusRuntimeException; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Splitter; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; +import 
org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Sets; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; +import org.joda.time.Duration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** gRPC client for communicating with Streaming Engine. */ +@SuppressFBWarnings({ + // Very likely real potentials for bugs. + "JLM_JSR166_UTILCONCURRENT_MONITORENTER", // https://github.com/apache/beam/issues/19273 + "IS2_INCONSISTENT_SYNC" // https://github.com/apache/beam/issues/19271 +}) +@SuppressWarnings("nullness") // TODO(https://github.com/apache/beam/issues/20497 +public final class GrpcWindmillServer extends WindmillServerStub { + private static final Logger LOG = LoggerFactory.getLogger(GrpcWindmillServer.class); + private static final int DEFAULT_LOG_EVERY_N_FAILURES = 20; + private static final Duration MIN_BACKOFF = Duration.millis(1); + private static final Duration MAX_BACKOFF = Duration.standardSeconds(30); + private static final int NO_HEALTH_CHECK = -1; + private static final String GRPC_LOCALHOST = "grpc:localhost"; + + private final GrpcWindmillStreamFactory windmillStreamFactory; + private final GrpcDispatcherClient dispatcherClient; + private final StreamingDataflowWorkerOptions options; + private final StreamingEngineThrottleTimers throttleTimers; + private Duration maxBackoff; + private @Nullable WindmillApplianceGrpc.WindmillApplianceBlockingStub syncApplianceStub; + + private GrpcWindmillServer( + StreamingDataflowWorkerOptions options, GrpcDispatcherClient grpcDispatcherClient) { + this.options = options; + this.throttleTimers = StreamingEngineThrottleTimers.create(); + this.maxBackoff = MAX_BACKOFF; + this.windmillStreamFactory = + GrpcWindmillStreamFactory.of( + JobHeader.newBuilder() + .setJobId(options.getJobId()) + .setProjectId(options.getProject()) + .setWorkerId(options.getWorkerId()) + .build()) + .setWindmillMessagesBetweenIsReadyChecks( + options.getWindmillMessagesBetweenIsReadyChecks()) + .setMaxBackOffSupplier(() -> maxBackoff) + .setLogEveryNStreamFailures( + options.getWindmillServiceStreamingLogEveryNStreamFailures()) + .setStreamingRpcBatchLimit(options.getWindmillServiceStreamingRpcBatchLimit()) + .build(); + windmillStreamFactory.scheduleHealthChecks( + options.getWindmillServiceStreamingRpcHealthCheckPeriodMs()); + + this.dispatcherClient = grpcDispatcherClient; + this.syncApplianceStub = null; + } + + private static StreamingDataflowWorkerOptions testOptions(boolean enableStreamingEngine) { + StreamingDataflowWorkerOptions options = + PipelineOptionsFactory.create().as(StreamingDataflowWorkerOptions.class); + options.setProject("project"); + options.setJobId("job"); + options.setWorkerId("worker"); + List experiments = + options.getExperiments() == null ? new ArrayList<>() : options.getExperiments(); + if (enableStreamingEngine) { + experiments.add(GcpOptions.STREAMING_ENGINE_EXPERIMENT); + } + options.setExperiments(experiments); + + options.setWindmillServiceStreamingRpcBatchLimit(Integer.MAX_VALUE); + options.setWindmillServiceStreamingRpcHealthCheckPeriodMs(NO_HEALTH_CHECK); + options.setWindmillServiceStreamingLogEveryNStreamFailures(DEFAULT_LOG_EVERY_N_FAILURES); + + return options; + } + + /** Create new instance of {@link GrpcWindmillServer}. 
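+ *
+ * <p>Illustrative usage, assuming already-populated worker options (the stream
+ * getters further down are the intended entry points):
+ * <pre>{@code
+ * GrpcWindmillServer server = GrpcWindmillServer.create(workerOptions);
+ * GetDataStream dataStream = server.getDataStream();
+ * }</pre>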
*/ + public static GrpcWindmillServer create(StreamingDataflowWorkerOptions workerOptions) + throws IOException { + + GrpcWindmillServer grpcWindmillServer = + new GrpcWindmillServer( + workerOptions, + GrpcDispatcherClient.create( + WindmillStubFactory.remoteStubFactory( + workerOptions.getWindmillServiceRpcChannelAliveTimeoutSec(), + workerOptions.getGcpCredential()))); + if (workerOptions.getWindmillServiceEndpoint() != null) { + grpcWindmillServer.configureWindmillServiceEndpoints(); + } else if (!workerOptions.isEnableStreamingEngine() + && workerOptions.getLocalWindmillHostport() != null) { + grpcWindmillServer.configureLocalHost(); + } + + return grpcWindmillServer; + } + + @VisibleForTesting + static GrpcWindmillServer newTestInstance(String name) { + ManagedChannel inProcessChannel = inProcessChannel(name); + CloudWindmillServiceV1Alpha1Stub stub = + CloudWindmillServiceV1Alpha1Grpc.newStub(inProcessChannel); + List dispatcherStubs = Lists.newArrayList(stub); + Set dispatcherEndpoints = Sets.newHashSet(HostAndPort.fromHost(name)); + GrpcDispatcherClient dispatcherClient = + GrpcDispatcherClient.forTesting( + WindmillStubFactory.inProcessStubFactory(name, unused -> inProcessChannel), + dispatcherStubs, + dispatcherEndpoints); + return new GrpcWindmillServer(testOptions(/* enableStreamingEngine= */ true), dispatcherClient); + } + + @VisibleForTesting + static GrpcWindmillServer newApplianceTestInstance(Channel channel) { + GrpcWindmillServer testServer = + new GrpcWindmillServer( + testOptions(/* enableStreamingEngine= */ false), + // No-op, Appliance does not use Dispatcher to call Streaming Engine. + GrpcDispatcherClient.create(WindmillStubFactory.inProcessStubFactory("test"))); + testServer.syncApplianceStub = createWindmillApplianceStubWithDeadlineInterceptor(channel); + return testServer; + } + + private static WindmillApplianceGrpc.WindmillApplianceBlockingStub + createWindmillApplianceStubWithDeadlineInterceptor(Channel channel) { + return WindmillApplianceGrpc.newBlockingStub(channel) + .withInterceptors(GrpcDeadlineClientInterceptor.withDefaultUnaryRpcDeadline()); + } + + private static UnsupportedOperationException unsupportedUnaryRequestInStreamingEngineException( + String rpcName) { + return new UnsupportedOperationException( + String.format("Unary %s calls are not supported in Streaming Engine.", rpcName)); + } + + private void configureWindmillServiceEndpoints() { + Set endpoints = new HashSet<>(); + for (String endpoint : Splitter.on(',').split(options.getWindmillServiceEndpoint())) { + endpoints.add( + HostAndPort.fromString(endpoint).withDefaultPort(options.getWindmillServicePort())); + } + + dispatcherClient.consumeWindmillDispatcherEndpoints(ImmutableSet.copyOf(endpoints)); + } + + private void configureLocalHost() { + int portStart = options.getLocalWindmillHostport().lastIndexOf(':'); + String endpoint = options.getLocalWindmillHostport().substring(0, portStart); + Preconditions.checkState(GRPC_LOCALHOST.equals(endpoint)); + int port = Integer.parseInt(options.getLocalWindmillHostport().substring(portStart + 1)); + dispatcherClient.consumeWindmillDispatcherEndpoints( + ImmutableSet.of(HostAndPort.fromParts(LOCALHOST, port))); + initializeLocalHost(port); + } + + @Override + public void setWindmillServiceEndpoints(Set endpoints) { + dispatcherClient.consumeWindmillDispatcherEndpoints(ImmutableSet.copyOf(endpoints)); + } + + @Override + public boolean isReady() { + return dispatcherClient.isReady(); + } + + private synchronized void initializeLocalHost(int port) { 
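+ // Local runs use a short (500 ms) maximum backoff. Streaming Engine registers
+ // localhost as a dispatcher endpoint; Appliance instead gets a plaintext
+ // localhost channel wrapped with the unary deadline interceptor.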
+ this.maxBackoff = Duration.millis(500); + if (options.isEnableStreamingEngine()) { + dispatcherClient.consumeWindmillDispatcherEndpoints( + ImmutableSet.of(HostAndPort.fromParts(LOCALHOST, port))); + } else { + this.syncApplianceStub = + createWindmillApplianceStubWithDeadlineInterceptor(localhostChannel(port)); + } + } + + @Override + public void appendSummaryHtml(PrintWriter writer) { + windmillStreamFactory.appendSummaryHtml(writer); + } + + private ResponseT callWithBackoff(Supplier function) { + // Configure backoff to retry calls forever, with a maximum sane retry interval. + BackOff backoff = + FluentBackoff.DEFAULT.withInitialBackoff(MIN_BACKOFF).withMaxBackoff(maxBackoff).backoff(); + + int rpcErrors = 0; + while (true) { + try { + return function.get(); + } catch (StatusRuntimeException e) { + try { + if (++rpcErrors % 20 == 0) { + LOG.warn( + "Many exceptions calling gRPC. Last exception: {} with status {}", + e, + e.getStatus()); + } + if (!BackOffUtils.next(Sleeper.DEFAULT, backoff)) { + throw new RpcException(e); + } + } catch (IOException | InterruptedException i) { + if (i instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + RpcException rpcException = new RpcException(e); + rpcException.addSuppressed(i); + throw rpcException; + } + } + } + } + + @Override + public GetWorkResponse getWork(GetWorkRequest request) { + if (syncApplianceStub != null) { + return callWithBackoff(() -> syncApplianceStub.getWork(request)); + } + + throw new RpcException(unsupportedUnaryRequestInStreamingEngineException("GetWork")); + } + + @Override + public GetDataResponse getData(GetDataRequest request) { + if (syncApplianceStub != null) { + return callWithBackoff(() -> syncApplianceStub.getData(request)); + } + + throw new RpcException(unsupportedUnaryRequestInStreamingEngineException("GetData")); + } + + @Override + public CommitWorkResponse commitWork(CommitWorkRequest request) { + if (syncApplianceStub != null) { + return callWithBackoff(() -> syncApplianceStub.commitWork(request)); + } + throw new RpcException(unsupportedUnaryRequestInStreamingEngineException("CommitWork")); + } + + @Override + public GetWorkStream getWorkStream(GetWorkRequest request, WorkItemReceiver receiver) { + return windmillStreamFactory.createGetWorkStream( + dispatcherClient.getDispatcherStub(), + GetWorkRequest.newBuilder(request) + .setJobId(options.getJobId()) + .setProjectId(options.getProject()) + .setWorkerId(options.getWorkerId()) + .build(), + throttleTimers.getWorkThrottleTimer(), + receiver); + } + + @Override + public GetDataStream getDataStream() { + return windmillStreamFactory.createGetDataStream( + dispatcherClient.getDispatcherStub(), throttleTimers.getDataThrottleTimer()); + } + + @Override + public CommitWorkStream commitWorkStream() { + return windmillStreamFactory.createCommitWorkStream( + dispatcherClient.getDispatcherStub(), throttleTimers.commitWorkThrottleTimer()); + } + + @Override + public GetConfigResponse getConfig(GetConfigRequest request) { + if (syncApplianceStub != null) { + return callWithBackoff(() -> syncApplianceStub.getConfig(request)); + } + + throw new RpcException( + new UnsupportedOperationException("GetConfig not supported in Streaming Engine.")); + } + + @Override + public ReportStatsResponse reportStats(ReportStatsRequest request) { + if (syncApplianceStub != null) { + return callWithBackoff(() -> syncApplianceStub.reportStats(request)); + } + + throw new RpcException( + new UnsupportedOperationException("ReportStats not supported in Streaming 
Engine.")); + } + + @Override + public long getAndResetThrottleTime() { + return throttleTimers.getAndResetThrottleTime(); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java new file mode 100644 index 0000000000000..e474ebf18b297 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillStreamFactory.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; + +import static org.apache.beam.runners.dataflow.worker.windmill.client.AbstractWindmillStream.DEFAULT_STREAM_RPC_DEADLINE_SECONDS; + +import com.google.auto.value.AutoBuilder; +import java.io.PrintWriter; +import java.util.Set; +import java.util.Timer; +import java.util.TimerTask; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.Consumer; +import java.util.function.Supplier; +import javax.annotation.concurrent.ThreadSafe; +import org.apache.beam.runners.dataflow.worker.status.StatusDataProvider; +import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; +import org.apache.beam.runners.dataflow.worker.windmill.WindmillEndpoints; +import org.apache.beam.runners.dataflow.worker.windmill.client.AbstractWindmillStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkerMetadataStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers.StreamObserverFactory; +import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; +import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemReceiver; +import org.apache.beam.sdk.util.BackOff; +import org.apache.beam.sdk.util.FluentBackoff; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Suppliers; +import org.joda.time.Duration; +import org.joda.time.Instant; + +/** + * Creates gRPC 
streaming connections to Windmill Service. Maintains a set of all currently opened + * RPC streams for health check/heartbeat requests to keep the streams alive. + */ +@ThreadSafe +public final class GrpcWindmillStreamFactory implements StatusDataProvider { + private static final Duration MIN_BACKOFF = Duration.millis(1); + private static final Duration DEFAULT_MAX_BACKOFF = Duration.standardSeconds(30); + private static final int DEFAULT_LOG_EVERY_N_STREAM_FAILURES = 1; + private static final int DEFAULT_STREAMING_RPC_BATCH_LIMIT = Integer.MAX_VALUE; + private static final int DEFAULT_WINDMILL_MESSAGES_BETWEEN_IS_READY_CHECKS = 1; + + private final JobHeader jobHeader; + private final int logEveryNStreamFailures; + private final int streamingRpcBatchLimit; + private final int windmillMessagesBetweenIsReadyChecks; + private final Supplier grpcBackOff; + private final Set> streamRegistry; + private final AtomicLong streamIdGenerator; + + GrpcWindmillStreamFactory( + JobHeader jobHeader, + int logEveryNStreamFailures, + int streamingRpcBatchLimit, + int windmillMessagesBetweenIsReadyChecks, + Supplier maxBackOffSupplier) { + this.jobHeader = jobHeader; + this.logEveryNStreamFailures = logEveryNStreamFailures; + this.streamingRpcBatchLimit = streamingRpcBatchLimit; + this.windmillMessagesBetweenIsReadyChecks = windmillMessagesBetweenIsReadyChecks; + // Configure backoff to retry calls forever, with a maximum sane retry interval. + this.grpcBackOff = + Suppliers.memoize( + () -> + FluentBackoff.DEFAULT + .withInitialBackoff(MIN_BACKOFF) + .withMaxBackoff(maxBackOffSupplier.get()) + .backoff()); + this.streamRegistry = ConcurrentHashMap.newKeySet(); + this.streamIdGenerator = new AtomicLong(); + } + + /** + * Returns a new {@link Builder} for {@link GrpcWindmillStreamFactory} with default values set for + * the given {@link JobHeader}. + */ + public static GrpcWindmillStreamFactory.Builder of(JobHeader jobHeader) { + return new AutoBuilder_GrpcWindmillStreamFactory_Builder() + .setJobHeader(jobHeader) + .setWindmillMessagesBetweenIsReadyChecks(DEFAULT_WINDMILL_MESSAGES_BETWEEN_IS_READY_CHECKS) + .setMaxBackOffSupplier(() -> DEFAULT_MAX_BACKOFF) + .setLogEveryNStreamFailures(DEFAULT_LOG_EVERY_N_STREAM_FAILURES) + .setStreamingRpcBatchLimit(DEFAULT_STREAMING_RPC_BATCH_LIMIT); + } + + private static CloudWindmillServiceV1Alpha1Stub withDeadline( + CloudWindmillServiceV1Alpha1Stub stub) { + // Deadlines are absolute points in time, so generate a new one everytime this function is + // called. 
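+ // (A Deadline is a fixed wall-clock expiry: caching one on the stub would
+ // leave RPCs issued later with an already-consumed time budget, so a fresh
+ // deadline is derived on every invocation below.)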
+ return stub.withDeadlineAfter( + AbstractWindmillStream.DEFAULT_STREAM_RPC_DEADLINE_SECONDS, TimeUnit.SECONDS); + } + + public GetWorkStream createGetWorkStream( + CloudWindmillServiceV1Alpha1Stub stub, + GetWorkRequest request, + ThrottleTimer getWorkThrottleTimer, + WorkItemReceiver processWorkItem) { + return GrpcGetWorkStream.create( + responseObserver -> withDeadline(stub).getWorkStream(responseObserver), + request, + grpcBackOff.get(), + newStreamObserverFactory(), + streamRegistry, + logEveryNStreamFailures, + getWorkThrottleTimer, + processWorkItem); + } + + public GetDataStream createGetDataStream( + CloudWindmillServiceV1Alpha1Stub stub, ThrottleTimer getDataThrottleTimer) { + return GrpcGetDataStream.create( + responseObserver -> withDeadline(stub).getDataStream(responseObserver), + grpcBackOff.get(), + newStreamObserverFactory(), + streamRegistry, + logEveryNStreamFailures, + getDataThrottleTimer, + jobHeader, + streamIdGenerator, + streamingRpcBatchLimit); + } + + public CommitWorkStream createCommitWorkStream( + CloudWindmillServiceV1Alpha1Stub stub, ThrottleTimer commitWorkThrottleTimer) { + return GrpcCommitWorkStream.create( + responseObserver -> withDeadline(stub).commitWorkStream(responseObserver), + grpcBackOff.get(), + newStreamObserverFactory(), + streamRegistry, + logEveryNStreamFailures, + commitWorkThrottleTimer, + jobHeader, + streamIdGenerator, + streamingRpcBatchLimit); + } + + public GetWorkerMetadataStream createGetWorkerMetadataStream( + CloudWindmillServiceV1Alpha1Stub stub, + ThrottleTimer getWorkerMetadataThrottleTimer, + Consumer onNewWindmillEndpoints) { + return GrpcGetWorkerMetadataStream.create( + responseObserver -> withDeadline(stub).getWorkerMetadataStream(responseObserver), + grpcBackOff.get(), + newStreamObserverFactory(), + streamRegistry, + logEveryNStreamFailures, + jobHeader, + 0, + getWorkerMetadataThrottleTimer, + onNewWindmillEndpoints); + } + + private StreamObserverFactory newStreamObserverFactory() { + return StreamObserverFactory.direct( + DEFAULT_STREAM_RPC_DEADLINE_SECONDS * 2, windmillMessagesBetweenIsReadyChecks); + } + + /** + * Schedules streaming RPC health checks to run on a background daemon thread, which will be + * cleaned up when the JVM shutdown. + */ + public void scheduleHealthChecks(int healthCheckInterval) { + if (healthCheckInterval < 0) { + return; + } + + new Timer("WindmillHealthCheckTimer") + .schedule( + new TimerTask() { + @Override + public void run() { + Instant reportThreshold = Instant.now().minus(Duration.millis(healthCheckInterval)); + for (AbstractWindmillStream stream : streamRegistry) { + stream.maybeSendHealthCheck(reportThreshold); + } + } + }, + 0, + healthCheckInterval); + } + + @Override + public void appendSummaryHtml(PrintWriter writer) { + writer.write("Active Streams:
"); + for (AbstractWindmillStream stream : streamRegistry) { + stream.appendSummaryHtml(writer); + writer.write("
"); + } + } + + @AutoBuilder(ofClass = GrpcWindmillStreamFactory.class) + interface Builder { + Builder setJobHeader(JobHeader jobHeader); + + Builder setLogEveryNStreamFailures(int logEveryNStreamFailures); + + Builder setStreamingRpcBatchLimit(int streamingRpcBatchLimit); + + Builder setWindmillMessagesBetweenIsReadyChecks(int windmillMessagesBetweenIsReadyChecks); + + Builder setMaxBackOffSupplier(Supplier maxBackOff); + + GrpcWindmillStreamFactory build(); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/auth/VendoredCredentialsAdapter.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/auth/VendoredCredentialsAdapter.java new file mode 100644 index 0000000000000..23f6fb801a4f3 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/auth/VendoredCredentialsAdapter.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc.auth; + +import java.io.IOException; +import java.net.URI; +import java.util.List; +import java.util.Map; +import java.util.concurrent.Executor; + +/** + * Create a wrapper around credentials that delegates to the underlying {@link + * com.google.auth.Credentials}. Note that this class should override every method that is not final + * and not static and call the delegate directly. + * + *
<p>
TODO: Replace this with an auto generated proxy which calls the underlying implementation + * delegate to reduce maintenance burden. + */ +public class VendoredCredentialsAdapter + extends org.apache.beam.vendor.grpc.v1p54p0.com.google.auth.Credentials { + + private final com.google.auth.Credentials credentials; + + public VendoredCredentialsAdapter(com.google.auth.Credentials credentials) { + this.credentials = credentials; + } + + @Override + public String getAuthenticationType() { + return credentials.getAuthenticationType(); + } + + @Override + public Map> getRequestMetadata() throws IOException { + return credentials.getRequestMetadata(); + } + + @Override + public void getRequestMetadata( + final URI uri, + Executor executor, + final org.apache.beam.vendor.grpc.v1p54p0.com.google.auth.RequestMetadataCallback callback) { + credentials.getRequestMetadata( + uri, executor, new VendoredRequestMetadataCallbackAdapter(callback)); + } + + @Override + public Map> getRequestMetadata(URI uri) throws IOException { + return credentials.getRequestMetadata(uri); + } + + @Override + public boolean hasRequestMetadata() { + return credentials.hasRequestMetadata(); + } + + @Override + public boolean hasRequestMetadataOnly() { + return credentials.hasRequestMetadataOnly(); + } + + @Override + public void refresh() throws IOException { + credentials.refresh(); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/auth/VendoredRequestMetadataCallbackAdapter.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/auth/VendoredRequestMetadataCallbackAdapter.java new file mode 100644 index 0000000000000..8b1b695287e70 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/auth/VendoredRequestMetadataCallbackAdapter.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc.auth; + +import java.util.List; +import java.util.Map; + +/** + * Create a wrapper around credentials callback that delegates to the underlying vendored {@link + * com.google.auth.RequestMetadataCallback}. Note that this class should override every method that + * is not final and not static and call the delegate directly. + * + *
<p>
TODO: Replace this with an auto generated proxy which calls the underlying implementation + * delegate to reduce maintenance burden. + */ +public class VendoredRequestMetadataCallbackAdapter + implements com.google.auth.RequestMetadataCallback { + + private final org.apache.beam.vendor.grpc.v1p54p0.com.google.auth.RequestMetadataCallback + callback; + + VendoredRequestMetadataCallbackAdapter( + org.apache.beam.vendor.grpc.v1p54p0.com.google.auth.RequestMetadataCallback callback) { + this.callback = callback; + } + + @Override + public void onSuccess(Map> metadata) { + callback.onSuccess(metadata); + } + + @Override + public void onFailure(Throwable exception) { + callback.onFailure(exception); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/DirectStreamObserver.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/observers/DirectStreamObserver.java similarity index 98% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/DirectStreamObserver.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/observers/DirectStreamObserver.java index 3c7798126e59e..5fb22476ab3a4 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/DirectStreamObserver.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/observers/DirectStreamObserver.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.runners.dataflow.worker.windmill; +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers; import java.util.concurrent.Phaser; import java.util.concurrent.TimeUnit; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/ForwardingClientResponseObserver.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/observers/ForwardingClientResponseObserver.java similarity index 96% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/ForwardingClientResponseObserver.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/observers/ForwardingClientResponseObserver.java index a1f80598d89a8..007717d03b58f 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/ForwardingClientResponseObserver.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/observers/ForwardingClientResponseObserver.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
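The two adapter classes above exist only to bridge the worker's unvendored com.google.auth.Credentials into Beam's relocated (vendored) copy of the gRPC auth API. A hedged sketch of the intended wiring, mirroring remoteStubFactory later in this diff (stub stands in for any vendored Windmill stub):

import com.google.auth.Credentials;
import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.auth.MoreCallCredentials;

// Wrap once; the vendored gRPC runtime then calls through the adapter into the
// real credentials, and metadata callbacks are translated back by the
// companion VendoredRequestMetadataCallbackAdapter.
CloudWindmillServiceV1Alpha1Stub authenticated =
    stub.withCallCredentials(
        MoreCallCredentials.from(new VendoredCredentialsAdapter(credentials)));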
*/ -package org.apache.beam.runners.dataflow.worker.windmill; +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers; import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.stub.ClientCallStreamObserver; import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.stub.ClientResponseObserver; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/StreamObserverFactory.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/observers/StreamObserverFactory.java similarity index 97% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/StreamObserverFactory.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/observers/StreamObserverFactory.java index e0878b7b0b91b..e3f12687638d9 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/StreamObserverFactory.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/observers/StreamObserverFactory.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.runners.dataflow.worker.windmill; +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers; import java.util.function.Function; import org.apache.beam.sdk.fn.stream.AdvancingPhaser; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/stubs/WindmillChannelFactory.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/stubs/WindmillChannelFactory.java new file mode 100644 index 0000000000000..48cf8ff3f7612 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/stubs/WindmillChannelFactory.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs; + +import java.net.Inet6Address; +import java.net.InetSocketAddress; +import java.util.concurrent.TimeUnit; +import javax.net.ssl.SSLException; +import org.apache.beam.runners.dataflow.worker.windmill.WindmillServiceAddress; +import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.Channel; +import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.ManagedChannel; +import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.inprocess.InProcessChannelBuilder; +import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.netty.GrpcSslContexts; +import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.netty.NegotiationType; +import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.netty.NettyChannelBuilder; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; + +/** Utility class used to create different RPC Channels. */ +public final class WindmillChannelFactory { + public static final String LOCALHOST = "localhost"; + private static final int DEFAULT_GRPC_PORT = 443; + + private WindmillChannelFactory() {} + + public static ManagedChannel inProcessChannel(String channelName) { + return InProcessChannelBuilder.forName(channelName).directExecutor().build(); + } + + public static Channel localhostChannel(int port) { + return NettyChannelBuilder.forAddress(LOCALHOST, port) + .maxInboundMessageSize(Integer.MAX_VALUE) + .negotiationType(NegotiationType.PLAINTEXT) + .build(); + } + + static Channel remoteChannel( + WindmillServiceAddress windmillServiceAddress, int windmillServiceRpcChannelTimeoutSec) { + switch (windmillServiceAddress.getKind()) { + case IPV6: + return remoteChannel(windmillServiceAddress.ipv6(), windmillServiceRpcChannelTimeoutSec); + case GCP_SERVICE_ADDRESS: + return remoteChannel( + windmillServiceAddress.gcpServiceAddress(), windmillServiceRpcChannelTimeoutSec); + // switch is exhaustive will never happen. 
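+ // (WindmillServiceAddress.Kind currently has only IPV6 and
+ // GCP_SERVICE_ADDRESS, so the default branch exists solely to satisfy the
+ // compiler's requirement that every path returns.)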
+ default: + throw new UnsupportedOperationException( + "Only IPV6 and GCP_SERVICE_ADDRESS are supported WindmillServiceAddresses."); + } + } + + public static Channel remoteChannel( + HostAndPort endpoint, int windmillServiceRpcChannelTimeoutSec) { + try { + return createRemoteChannel( + NettyChannelBuilder.forAddress(endpoint.getHost(), endpoint.getPort()), + windmillServiceRpcChannelTimeoutSec); + } catch (SSLException sslException) { + throw new WindmillChannelCreationException(endpoint, sslException); + } + } + + public static Channel remoteChannel( + Inet6Address directEndpoint, int port, int windmillServiceRpcChannelTimeoutSec) { + try { + return createRemoteChannel( + NettyChannelBuilder.forAddress(new InetSocketAddress(directEndpoint, port)), + windmillServiceRpcChannelTimeoutSec); + } catch (SSLException sslException) { + throw new WindmillChannelCreationException(directEndpoint.toString(), sslException); + } + } + + public static Channel remoteChannel( + Inet6Address directEndpoint, int windmillServiceRpcChannelTimeoutSec) { + try { + return createRemoteChannel( + NettyChannelBuilder.forAddress(new InetSocketAddress(directEndpoint, DEFAULT_GRPC_PORT)), + windmillServiceRpcChannelTimeoutSec); + } catch (SSLException sslException) { + throw new WindmillChannelCreationException(directEndpoint.toString(), sslException); + } + } + + @SuppressWarnings("nullness") + private static Channel createRemoteChannel( + NettyChannelBuilder channelBuilder, int windmillServiceRpcChannelTimeoutSec) + throws SSLException { + if (windmillServiceRpcChannelTimeoutSec > 0) { + channelBuilder + .keepAliveTime(windmillServiceRpcChannelTimeoutSec, TimeUnit.SECONDS) + .keepAliveTimeout(windmillServiceRpcChannelTimeoutSec, TimeUnit.SECONDS) + .keepAliveWithoutCalls(true); + } + + return channelBuilder + .flowControlWindow(10 * 1024 * 1024) + .maxInboundMessageSize(Integer.MAX_VALUE) + .maxInboundMetadataSize(1024 * 1024) + .negotiationType(NegotiationType.TLS) + // Set ciphers(null) to not use GCM, which is disabled for Dataflow + // due to it being horribly slow. + .sslContext(GrpcSslContexts.forClient().ciphers(null).build()) + .build(); + } + + public static class WindmillChannelCreationException extends IllegalStateException { + private WindmillChannelCreationException(HostAndPort endpoint, SSLException sourceException) { + super( + String.format( + "Exception thrown when trying to create channel to endpoint={host:%s; port:%d}", + endpoint.getHost(), endpoint.getPort()), + sourceException); + } + + WindmillChannelCreationException(String directEndpoint, Throwable sourceException) { + super( + String.format( + "Exception thrown when trying to create channel to endpoint={%s}", directEndpoint), + sourceException); + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/stubs/WindmillStubFactory.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/stubs/WindmillStubFactory.java new file mode 100644 index 0000000000000..0c7719b0bc130 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/stubs/WindmillStubFactory.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
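createRemoteChannel above is where the transport tuning is concentrated: keep-alives tied to the channel-alive timeout when it is positive, a 10 MiB flow-control window, unbounded inbound message size, and TLS with GCM ciphers disabled. An illustrative call of the public entry point; the endpoint and the 60-second timeout are made-up values (the worker passes getWindmillServiceRpcChannelAliveTimeoutSec(), and values <= 0 skip the keep-alive settings):

Channel channel =
    WindmillChannelFactory.remoteChannel(
        HostAndPort.fromParts("windmill.example.com", 443), 60);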
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs; + +import static org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillChannelFactory.remoteChannel; + +import com.google.auth.Credentials; +import com.google.auto.value.AutoOneOf; +import java.util.function.Function; +import java.util.function.Supplier; +import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc; +import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub; +import org.apache.beam.runners.dataflow.worker.windmill.WindmillServiceAddress; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.auth.VendoredCredentialsAdapter; +import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.ManagedChannel; +import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.auth.MoreCallCredentials; + +/** + * Used to create stubs to talk to Streaming Engine. Stubs are either in-process for testing, or + * remote. + */ +@AutoOneOf(WindmillStubFactory.Kind.class) +public abstract class WindmillStubFactory { + + public static WindmillStubFactory inProcessStubFactory( + String testName, Function channelFactory) { + return AutoOneOf_WindmillStubFactory.inProcess( + () -> CloudWindmillServiceV1Alpha1Grpc.newStub(channelFactory.apply(testName))); + } + + public static WindmillStubFactory inProcessStubFactory(String testName) { + return AutoOneOf_WindmillStubFactory.inProcess( + () -> + CloudWindmillServiceV1Alpha1Grpc.newStub( + WindmillChannelFactory.inProcessChannel(testName))); + } + + public static WindmillStubFactory remoteStubFactory( + int rpcChannelTimeoutSec, Credentials gcpCredentials) { + return AutoOneOf_WindmillStubFactory.remote( + directEndpoint -> + CloudWindmillServiceV1Alpha1Grpc.newStub( + remoteChannel(directEndpoint, rpcChannelTimeoutSec)) + .withCallCredentials( + MoreCallCredentials.from(new VendoredCredentialsAdapter(gcpCredentials)))); + } + + public abstract Kind getKind(); + + public abstract Supplier inProcess(); + + public abstract Function remote(); + + public enum Kind { + IN_PROCESS, + REMOTE + } +} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/AvroGenericCoder.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/throttling/StreamingEngineThrottleTimers.java similarity index 51% rename from sdks/java/core/src/main/java/org/apache/beam/sdk/coders/AvroGenericCoder.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/throttling/StreamingEngineThrottleTimers.java index 7d90206ce4c5a..6b8dd2720374a 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/AvroGenericCoder.java +++ 
b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/throttling/StreamingEngineThrottleTimers.java @@ -15,26 +15,27 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.sdk.coders; +package org.apache.beam.runners.dataflow.worker.windmill.client.throttling; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; +import com.google.auto.value.AutoValue; -/** - * AvroCoder specialisation for GenericRecord. - * - * @deprecated Avro related classes are deprecated in module beam-sdks-java-core and - * will be eventually removed. Please, migrate to a new module - * beam-sdks-java-extensions-avro by importing - * org.apache.beam.sdk.extensions.avro.coders.AvroGenericCoder instead of this one. - */ -@Deprecated -public class AvroGenericCoder extends AvroCoder { - AvroGenericCoder(Schema schema) { - super(GenericRecord.class, schema); +@AutoValue +public abstract class StreamingEngineThrottleTimers { + + public static StreamingEngineThrottleTimers create() { + return new AutoValue_StreamingEngineThrottleTimers( + new ThrottleTimer(), new ThrottleTimer(), new ThrottleTimer()); } - public static AvroGenericCoder of(Schema schema) { - return new AvroGenericCoder(schema); + public long getAndResetThrottleTime() { + return getWorkThrottleTimer().getAndResetThrottleTime() + + getDataThrottleTimer().getAndResetThrottleTime() + + commitWorkThrottleTimer().getAndResetThrottleTime(); } + + public abstract ThrottleTimer getWorkThrottleTimer(); + + public abstract ThrottleTimer getDataThrottleTimer(); + + public abstract ThrottleTimer commitWorkThrottleTimer(); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/ThrottleTimer.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/throttling/ThrottleTimer.java similarity index 94% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/ThrottleTimer.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/throttling/ThrottleTimer.java index 237339aff3993..f660112721ba2 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/ThrottleTimer.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/throttling/ThrottleTimer.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.runners.dataflow.worker.windmill.grpcclient; +package org.apache.beam.runners.dataflow.worker.windmill.client.throttling; import org.joda.time.Instant; @@ -25,7 +25,7 @@ * CommitWork are both blocked for x, totalTime will be 2x. However, if 2 GetWork streams are both * blocked for x totalTime will be x. All methods are thread safe. */ -class ThrottleTimer { +public final class ThrottleTimer { // This is -1 if not currently being throttled or the time in // milliseconds when throttling for this type started. private long startTime = -1; @@ -36,7 +36,7 @@ class ThrottleTimer { /** * Starts the timer if it has not been started and does nothing if it has already been started. 
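 *
 * <p>Illustrative accounting, assuming the matching stop() on this class: two
 * distinct timers throttled over the same 100ms window report ~200ms combined,
 * while two streams sharing one timer over that window report ~100ms.
 * <pre>{@code
 * getWorkTimer.start();
 * commitWorkTimer.start();
 * // ~100ms of Windmill pushback on both RPCs ...
 * getWorkTimer.stop();
 * commitWorkTimer.stop();
 * // Summing getAndResetThrottleTime() across the two timers gives ~200ms.
 * }</pre>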
*/ - synchronized void start() { + public synchronized void start() { if (!throttled()) { // This timer is not started yet so start it now. startTime = Instant.now().getMillis(); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcWindmillServer.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcWindmillServer.java deleted file mode 100644 index 19cb90297df5b..0000000000000 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcWindmillServer.java +++ /dev/null @@ -1,607 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.dataflow.worker.windmill.grpcclient; - -import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; -import java.io.IOException; -import java.io.PrintWriter; -import java.net.URI; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Random; -import java.util.Set; -import java.util.Timer; -import java.util.TimerTask; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.Executor; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicLong; -import java.util.function.Supplier; -import org.apache.beam.runners.dataflow.worker.options.StreamingDataflowWorkerOptions; -import org.apache.beam.runners.dataflow.worker.windmill.AbstractWindmillStream; -import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc; -import org.apache.beam.runners.dataflow.worker.windmill.StreamObserverFactory; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.CommitWorkRequest; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.CommitWorkResponse; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetConfigRequest; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetConfigResponse; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetDataRequest; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetDataResponse; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkResponse; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.ReportStatsRequest; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.ReportStatsResponse; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillApplianceGrpc; -import 
org.apache.beam.runners.dataflow.worker.windmill.WindmillServerStub; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.CommitWorkStream; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetDataStream; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetWorkStream; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetWorkStream.WorkItemReceiver; -import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.util.BackOff; -import org.apache.beam.sdk.util.BackOffUtils; -import org.apache.beam.sdk.util.FluentBackoff; -import org.apache.beam.sdk.util.Sleeper; -import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.Channel; -import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.StatusRuntimeException; -import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.auth.MoreCallCredentials; -import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.inprocess.InProcessChannelBuilder; -import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.netty.GrpcSslContexts; -import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.netty.NegotiationType; -import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.netty.NettyChannelBuilder; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Splitter; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; -import org.joda.time.Duration; -import org.joda.time.Instant; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** gRPC client for communicating with Streaming Engine. */ -// Very likely real potential for bugs - https://github.com/apache/beam/issues/19273 -// Very likely real potential for bugs - https://github.com/apache/beam/issues/19271 -@SuppressFBWarnings({"JLM_JSR166_UTILCONCURRENT_MONITORENTER", "IS2_INCONSISTENT_SYNC"}) -@SuppressWarnings({ - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) -public final class GrpcWindmillServer extends WindmillServerStub { - private static final Logger LOG = LoggerFactory.getLogger(GrpcWindmillServer.class); - - // If a connection cannot be established, gRPC will fail fast so this deadline can be relatively - // high. 
- private static final long DEFAULT_STREAM_RPC_DEADLINE_SECONDS = 300; - private static final int DEFAULT_LOG_EVERY_N_FAILURES = 20; - private static final String LOCALHOST = "localhost"; - private static final Duration MIN_BACKOFF = Duration.millis(1); - private static final Duration MAX_BACKOFF = Duration.standardSeconds(30); - private static final AtomicLong nextId = new AtomicLong(0); - private static final int NO_HEALTH_CHECK = -1; - - private final StreamingDataflowWorkerOptions options; - private final int streamingRpcBatchLimit; - private final List stubList; - private final ThrottleTimer getWorkThrottleTimer; - private final ThrottleTimer getDataThrottleTimer; - private final ThrottleTimer commitWorkThrottleTimer; - private final Random rand; - private final Set> streamRegistry; - private ImmutableSet endpoints; - private int logEveryNStreamFailures; - private Duration maxBackoff = MAX_BACKOFF; - private WindmillApplianceGrpc.WindmillApplianceBlockingStub syncApplianceStub = null; - - private GrpcWindmillServer(StreamingDataflowWorkerOptions options) { - this.options = options; - this.streamingRpcBatchLimit = options.getWindmillServiceStreamingRpcBatchLimit(); - this.stubList = new ArrayList<>(); - this.logEveryNStreamFailures = options.getWindmillServiceStreamingLogEveryNStreamFailures(); - this.endpoints = ImmutableSet.of(); - this.getWorkThrottleTimer = new ThrottleTimer(); - this.getDataThrottleTimer = new ThrottleTimer(); - this.commitWorkThrottleTimer = new ThrottleTimer(); - this.rand = new Random(); - this.streamRegistry = Collections.newSetFromMap(new ConcurrentHashMap<>()); - } - - private static StreamingDataflowWorkerOptions testOptions(boolean enableStreamingEngine) { - StreamingDataflowWorkerOptions options = - PipelineOptionsFactory.create().as(StreamingDataflowWorkerOptions.class); - options.setProject("project"); - options.setJobId("job"); - options.setWorkerId("worker"); - List experiments = - options.getExperiments() == null ? new ArrayList<>() : options.getExperiments(); - if (enableStreamingEngine) { - experiments.add(GcpOptions.STREAMING_ENGINE_EXPERIMENT); - } - options.setExperiments(experiments); - - options.setWindmillServiceStreamingRpcBatchLimit(Integer.MAX_VALUE); - options.setWindmillServiceStreamingRpcHealthCheckPeriodMs(NO_HEALTH_CHECK); - options.setWindmillServiceStreamingLogEveryNStreamFailures(DEFAULT_LOG_EVERY_N_FAILURES); - - return options; - } - - /** Create new instance of {@link GrpcWindmillServer}. 
*/ - public static GrpcWindmillServer create(StreamingDataflowWorkerOptions workerOptions) - throws IOException { - GrpcWindmillServer grpcWindmillServer = new GrpcWindmillServer(workerOptions); - if (workerOptions.getWindmillServiceEndpoint() != null) { - grpcWindmillServer.configureWindmillServiceEndpoints(); - } else if (!workerOptions.isEnableStreamingEngine() - && workerOptions.getLocalWindmillHostport() != null) { - grpcWindmillServer.configureLocalHost(); - } - - if (workerOptions.getWindmillServiceStreamingRpcHealthCheckPeriodMs() > 0) { - grpcWindmillServer.scheduleHealthCheckTimer( - workerOptions, () -> grpcWindmillServer.streamRegistry); - } - - return grpcWindmillServer; - } - - @VisibleForTesting - static GrpcWindmillServer newTestInstance(String name) { - GrpcWindmillServer testServer = - new GrpcWindmillServer(testOptions(/* enableStreamingEngine= */ true)); - testServer.stubList.add(CloudWindmillServiceV1Alpha1Grpc.newStub(inProcessChannel(name))); - return testServer; - } - - @VisibleForTesting - static GrpcWindmillServer newApplianceTestInstance(Channel channel) { - GrpcWindmillServer testServer = - new GrpcWindmillServer(testOptions(/* enableStreamingEngine= */ false)); - testServer.syncApplianceStub = createWindmillApplianceStubWithDeadlineInterceptor(channel); - return testServer; - } - - private static WindmillApplianceGrpc.WindmillApplianceBlockingStub - createWindmillApplianceStubWithDeadlineInterceptor(Channel channel) { - return WindmillApplianceGrpc.newBlockingStub(channel) - .withInterceptors(GrpcDeadlineClientInterceptor.withDefaultUnaryRpcDeadline()); - } - - private static Channel inProcessChannel(String name) { - return InProcessChannelBuilder.forName(name).directExecutor().build(); - } - - private static Channel localhostChannel(int port) { - return NettyChannelBuilder.forAddress(LOCALHOST, port) - .maxInboundMessageSize(Integer.MAX_VALUE) - .negotiationType(NegotiationType.PLAINTEXT) - .build(); - } - - private static UnsupportedOperationException unsupportedUnaryRequestInStreamingEngineException( - String rpcName) { - return new UnsupportedOperationException( - String.format("Unary %s calls are not supported in Streaming Engine.", rpcName)); - } - - private void scheduleHealthCheckTimer( - StreamingDataflowWorkerOptions options, Supplier>> streams) { - new Timer("WindmillHealthCheckTimer") - .schedule( - new HealthCheckTimerTask(options, streams), - 0, - options.getWindmillServiceStreamingRpcHealthCheckPeriodMs()); - } - - private void configureWindmillServiceEndpoints() throws IOException { - Set endpoints = new HashSet<>(); - for (String endpoint : Splitter.on(',').split(options.getWindmillServiceEndpoint())) { - endpoints.add( - HostAndPort.fromString(endpoint).withDefaultPort(options.getWindmillServicePort())); - } - initializeWindmillService(endpoints); - } - - private void configureLocalHost() { - int portStart = options.getLocalWindmillHostport().lastIndexOf(':'); - String endpoint = options.getLocalWindmillHostport().substring(0, portStart); - assert ("grpc:localhost".equals(endpoint)); - int port = Integer.parseInt(options.getLocalWindmillHostport().substring(portStart + 1)); - this.endpoints = ImmutableSet.of(HostAndPort.fromParts(LOCALHOST, port)); - initializeLocalHost(port); - } - - @Override - public synchronized void setWindmillServiceEndpoints(Set endpoints) - throws IOException { - Preconditions.checkNotNull(endpoints); - if (endpoints.equals(this.endpoints)) { - // The endpoints are equal don't recreate the stubs. 
- return; - } - LOG.info("Creating a new windmill stub, endpoints: {}", endpoints); - if (this.endpoints != null) { - LOG.info("Previous windmill stub endpoints: {}", this.endpoints); - } - initializeWindmillService(endpoints); - } - - @Override - public synchronized boolean isReady() { - return !stubList.isEmpty(); - } - - private synchronized void initializeLocalHost(int port) { - this.logEveryNStreamFailures = 1; - this.maxBackoff = Duration.millis(500); - Channel channel = localhostChannel(port); - if (options.isEnableStreamingEngine()) { - this.stubList.add(CloudWindmillServiceV1Alpha1Grpc.newStub(channel)); - } else { - this.syncApplianceStub = createWindmillApplianceStubWithDeadlineInterceptor(channel); - } - } - - private synchronized void initializeWindmillService(Set endpoints) - throws IOException { - LOG.info("Initializing Streaming Engine GRPC client for endpoints: {}", endpoints); - this.stubList.clear(); - this.endpoints = ImmutableSet.copyOf(endpoints); - for (HostAndPort endpoint : this.endpoints) { - if (LOCALHOST.equals(endpoint.getHost())) { - initializeLocalHost(endpoint.getPort()); - } else { - this.stubList.add( - CloudWindmillServiceV1Alpha1Grpc.newStub(remoteChannel(endpoint)) - .withCallCredentials( - MoreCallCredentials.from( - new VendoredCredentialsAdapter(options.getGcpCredential())))); - } - } - } - - private Channel remoteChannel(HostAndPort endpoint) throws IOException { - NettyChannelBuilder builder = - NettyChannelBuilder.forAddress(endpoint.getHost(), endpoint.getPort()); - int timeoutSec = options.getWindmillServiceRpcChannelAliveTimeoutSec(); - if (timeoutSec > 0) { - builder - .keepAliveTime(timeoutSec, TimeUnit.SECONDS) - .keepAliveTimeout(timeoutSec, TimeUnit.SECONDS) - .keepAliveWithoutCalls(true); - } - return builder - .flowControlWindow(10 * 1024 * 1024) - .maxInboundMessageSize(Integer.MAX_VALUE) - .maxInboundMetadataSize(1024 * 1024) - .negotiationType(NegotiationType.TLS) - // Set ciphers(null) to not use GCM, which is disabled for Dataflow - // due to it being horribly slow. - .sslContext(GrpcSslContexts.forClient().ciphers(null).build()) - .build(); - } - - /** - * Stubs returned from this method do not (and should not) have {@link - * org.apache.beam.vendor.grpc.v1p54p0.io.grpc.Deadline}(s) set since they represent an absolute - * point in time. {@link org.apache.beam.vendor.grpc.v1p54p0.io.grpc.Deadline}(s) should not be - * treated as a timeout which represents a relative point in time. - * - * @see Official gRPC deadline documentation for more - * details. - */ - private synchronized CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub stub() { - if (stubList.isEmpty()) { - throw new RuntimeException("windmillServiceEndpoint has not been set"); - } - - return stubList.size() == 1 ? stubList.get(0) : stubList.get(rand.nextInt(stubList.size())); - } - - @Override - public void appendSummaryHtml(PrintWriter writer) { - writer.write("Active Streams:
"); - for (AbstractWindmillStream stream : streamRegistry) { - stream.appendSummaryHtml(writer); - writer.write("
"); - } - } - - // Configure backoff to retry calls forever, with a maximum sane retry interval. - private BackOff grpcBackoff() { - return FluentBackoff.DEFAULT - .withInitialBackoff(MIN_BACKOFF) - .withMaxBackoff(maxBackoff) - .backoff(); - } - - private ResponseT callWithBackoff(Supplier function) { - BackOff backoff = grpcBackoff(); - int rpcErrors = 0; - while (true) { - try { - return function.get(); - } catch (StatusRuntimeException e) { - try { - if (++rpcErrors % 20 == 0) { - LOG.warn( - "Many exceptions calling gRPC. Last exception: {} with status {}", - e, - e.getStatus()); - } - if (!BackOffUtils.next(Sleeper.DEFAULT, backoff)) { - throw new RpcException(e); - } - } catch (IOException | InterruptedException i) { - if (i instanceof InterruptedException) { - Thread.currentThread().interrupt(); - } - RpcException rpcException = new RpcException(e); - rpcException.addSuppressed(i); - throw rpcException; - } - } - } - } - - @Override - public GetWorkResponse getWork(GetWorkRequest request) { - if (syncApplianceStub != null) { - return callWithBackoff(() -> syncApplianceStub.getWork(request)); - } - - throw new RpcException(unsupportedUnaryRequestInStreamingEngineException("GetWork")); - } - - @Override - public GetDataResponse getData(GetDataRequest request) { - if (syncApplianceStub != null) { - return callWithBackoff(() -> syncApplianceStub.getData(request)); - } - - throw new RpcException(unsupportedUnaryRequestInStreamingEngineException("GetData")); - } - - @Override - public CommitWorkResponse commitWork(CommitWorkRequest request) { - if (syncApplianceStub != null) { - return callWithBackoff(() -> syncApplianceStub.commitWork(request)); - } - throw new RpcException(unsupportedUnaryRequestInStreamingEngineException("CommitWork")); - } - - private StreamObserverFactory newStreamObserverFactory() { - return StreamObserverFactory.direct( - DEFAULT_STREAM_RPC_DEADLINE_SECONDS * 2, options.getWindmillMessagesBetweenIsReadyChecks()); - } - - @Override - public GetWorkStream getWorkStream(GetWorkRequest request, WorkItemReceiver receiver) { - GetWorkRequest getWorkRequest = - GetWorkRequest.newBuilder(request) - .setJobId(options.getJobId()) - .setProjectId(options.getProject()) - .setWorkerId(options.getWorkerId()) - .build(); - - return GrpcGetWorkStream.create( - responseObserver -> - stub() - // Deadlines are absolute points in time, so generate a new one everytime this - // function is called. - .withDeadlineAfter( - AbstractWindmillStream.DEFAULT_STREAM_RPC_DEADLINE_SECONDS, TimeUnit.SECONDS) - .getWorkStream(responseObserver), - getWorkRequest, - grpcBackoff(), - newStreamObserverFactory(), - streamRegistry, - logEveryNStreamFailures, - getWorkThrottleTimer, - receiver); - } - - @Override - public GetDataStream getDataStream() { - return GrpcGetDataStream.create( - responseObserver -> - stub() - // Deadlines are absolute points in time, so generate a new one everytime this - // function is called. - .withDeadlineAfter( - AbstractWindmillStream.DEFAULT_STREAM_RPC_DEADLINE_SECONDS, TimeUnit.SECONDS) - .getDataStream(responseObserver), - grpcBackoff(), - newStreamObserverFactory(), - streamRegistry, - logEveryNStreamFailures, - getDataThrottleTimer, - makeHeader(), - nextId, - streamingRpcBatchLimit); - } - - @Override - public CommitWorkStream commitWorkStream() { - return GrpcCommitWorkStream.create( - responseObserver -> - stub() - // Deadlines are absolute points in time, so generate a new one everytime this - // function is called. 
- .withDeadlineAfter( - AbstractWindmillStream.DEFAULT_STREAM_RPC_DEADLINE_SECONDS, TimeUnit.SECONDS) - .commitWorkStream(responseObserver), - grpcBackoff(), - newStreamObserverFactory(), - streamRegistry, - logEveryNStreamFailures, - commitWorkThrottleTimer, - makeHeader(), - nextId, - streamingRpcBatchLimit); - } - - @Override - public GetConfigResponse getConfig(GetConfigRequest request) { - if (syncApplianceStub != null) { - return callWithBackoff(() -> syncApplianceStub.getConfig(request)); - } - - throw new RpcException( - new UnsupportedOperationException("GetConfig not supported in Streaming Engine.")); - } - - @Override - public ReportStatsResponse reportStats(ReportStatsRequest request) { - if (syncApplianceStub != null) { - return callWithBackoff(() -> syncApplianceStub.reportStats(request)); - } - - throw new RpcException( - new UnsupportedOperationException("ReportStats not supported in Streaming Engine.")); - } - - @Override - public long getAndResetThrottleTime() { - return getWorkThrottleTimer.getAndResetThrottleTime() - + getDataThrottleTimer.getAndResetThrottleTime() - + commitWorkThrottleTimer.getAndResetThrottleTime(); - } - - private JobHeader makeHeader() { - return JobHeader.newBuilder() - .setJobId(options.getJobId()) - .setProjectId(options.getProject()) - .setWorkerId(options.getWorkerId()) - .build(); - } - - /** - * Create a wrapper around credentials callback that delegates to the underlying vendored {@link - * com.google.auth.RequestMetadataCallback}. Note that this class should override every method - * that is not final and not static and call the delegate directly. - * - *

TODO: Replace this with an auto generated proxy which calls the underlying implementation - * delegate to reduce maintenance burden. - */ - private static class VendoredRequestMetadataCallbackAdapter - implements com.google.auth.RequestMetadataCallback { - - private final org.apache.beam.vendor.grpc.v1p54p0.com.google.auth.RequestMetadataCallback - callback; - - private VendoredRequestMetadataCallbackAdapter( - org.apache.beam.vendor.grpc.v1p54p0.com.google.auth.RequestMetadataCallback callback) { - this.callback = callback; - } - - @Override - public void onSuccess(Map> metadata) { - callback.onSuccess(metadata); - } - - @Override - public void onFailure(Throwable exception) { - callback.onFailure(exception); - } - } - - /** - * Create a wrapper around credentials that delegates to the underlying {@link - * com.google.auth.Credentials}. Note that this class should override every method that is not - * final and not static and call the delegate directly. - * - *

TODO: Replace this with an auto generated proxy which calls the underlying implementation - * delegate to reduce maintenance burden. - */ - private static class VendoredCredentialsAdapter - extends org.apache.beam.vendor.grpc.v1p54p0.com.google.auth.Credentials { - - private final com.google.auth.Credentials credentials; - - private VendoredCredentialsAdapter(com.google.auth.Credentials credentials) { - this.credentials = credentials; - } - - @Override - public String getAuthenticationType() { - return credentials.getAuthenticationType(); - } - - @Override - public Map> getRequestMetadata() throws IOException { - return credentials.getRequestMetadata(); - } - - @Override - public void getRequestMetadata( - final URI uri, - Executor executor, - final org.apache.beam.vendor.grpc.v1p54p0.com.google.auth.RequestMetadataCallback - callback) { - credentials.getRequestMetadata( - uri, executor, new VendoredRequestMetadataCallbackAdapter(callback)); - } - - @Override - public Map> getRequestMetadata(URI uri) throws IOException { - return credentials.getRequestMetadata(uri); - } - - @Override - public boolean hasRequestMetadata() { - return credentials.hasRequestMetadata(); - } - - @Override - public boolean hasRequestMetadataOnly() { - return credentials.hasRequestMetadataOnly(); - } - - @Override - public void refresh() throws IOException { - credentials.refresh(); - } - } - - private static class HealthCheckTimerTask extends TimerTask { - private final StreamingDataflowWorkerOptions options; - private final Supplier>> streams; - - public HealthCheckTimerTask( - StreamingDataflowWorkerOptions options, - Supplier>> streams) { - this.options = options; - this.streams = streams; - } - - @Override - public void run() { - Instant reportThreshold = - Instant.now() - .minus(Duration.millis(options.getWindmillServiceStreamingRpcHealthCheckPeriodMs())); - for (AbstractWindmillStream stream : streams.get()) { - stream.maybeSendHealthCheck(reportThreshold); - } - } - } -} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/CachingStateTable.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/CachingStateTable.java new file mode 100644 index 0000000000000..bcaf8bf21a2da --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/CachingStateTable.java @@ -0,0 +1,275 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import java.io.Closeable; +import java.util.Optional; +import javax.annotation.Nullable; +import org.apache.beam.runners.core.StateNamespace; +import org.apache.beam.runners.core.StateTable; +import org.apache.beam.runners.core.StateTag; +import org.apache.beam.runners.core.StateTags; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.state.BagState; +import org.apache.beam.sdk.state.CombiningState; +import org.apache.beam.sdk.state.MapState; +import org.apache.beam.sdk.state.MultimapState; +import org.apache.beam.sdk.state.OrderedListState; +import org.apache.beam.sdk.state.SetState; +import org.apache.beam.sdk.state.State; +import org.apache.beam.sdk.state.StateContext; +import org.apache.beam.sdk.state.ValueState; +import org.apache.beam.sdk.state.WatermarkHoldState; +import org.apache.beam.sdk.transforms.Combine; +import org.apache.beam.sdk.transforms.CombineWithContext; +import org.apache.beam.sdk.transforms.windowing.TimestampCombiner; +import org.apache.beam.sdk.util.CombineFnUtil; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier; + +final class CachingStateTable extends StateTable { + private final String stateFamily; + private final WindmillStateReader reader; + private final WindmillStateCache.ForKeyAndFamily cache; + private final boolean isSystemTable; + private final Supplier scopedReadStateSupplier; + private final @Nullable StateTable derivedStateTable; + private final boolean isNewKey; + + private CachingStateTable(Builder builder) { + this.stateFamily = builder.stateFamily; + this.reader = builder.reader; + this.cache = builder.cache; + this.isSystemTable = builder.isSystemTable; + this.isNewKey = builder.isNewKey; + this.scopedReadStateSupplier = builder.scopedReadStateSupplier; + this.derivedStateTable = builder.derivedStateTable; + + if (this.isSystemTable) { + Preconditions.checkState(derivedStateTable == null); + } else { + Preconditions.checkNotNull(this.derivedStateTable); + } + } + + static CachingStateTable.Builder builder( + String stateFamily, + WindmillStateReader reader, + WindmillStateCache.ForKeyAndFamily cache, + boolean isNewKey, + Supplier scopedReadStateSupplier) { + return new CachingStateTable.Builder( + stateFamily, reader, cache, scopedReadStateSupplier, isNewKey); + } + + @Override + @SuppressWarnings("deprecation") + protected StateTag.StateBinder binderForNamespace(StateNamespace namespace, StateContext c) { + // Look up state objects in the cache or create new ones if not found. The state will + // be added to the cache in persist(). + return new StateTag.StateBinder() { + @Override + public BagState bindBag(StateTag> address, Coder elemCoder) { + StateTag> resolvedAddress = + isSystemTable ? 
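+ // System tables wrap the tag with StateTags.makeSystemTagInternal so internal bookkeeping
+ // state is namespaced apart from user-declared state; user tables use the address as-is.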
StateTags.makeSystemTagInternal(address) : address; + + WindmillBag result = + cache + .get(namespace, resolvedAddress) + .map(bagState -> (WindmillBag) bagState) + .orElseGet( + () -> + new WindmillBag<>( + namespace, resolvedAddress, stateFamily, elemCoder, isNewKey)); + + result.initializeForWorkItem(reader, scopedReadStateSupplier); + return result; + } + + @Override + public SetState bindSet(StateTag> spec, Coder elemCoder) { + WindmillSet result = + new WindmillSet<>(namespace, spec, stateFamily, elemCoder, cache, isNewKey); + result.initializeForWorkItem(reader, scopedReadStateSupplier); + return result; + } + + @Override + public MapState bindMap( + StateTag> spec, Coder keyCoder, Coder valueCoder) { + WindmillMap result = + cache + .get(namespace, spec) + .map(mapState -> (WindmillMap) mapState) + .orElseGet( + () -> + new WindmillMap<>( + namespace, spec, stateFamily, keyCoder, valueCoder, isNewKey)); + + result.initializeForWorkItem(reader, scopedReadStateSupplier); + return result; + } + + @Override + public MultimapState bindMultimap( + StateTag> spec, + Coder keyCoder, + Coder valueCoder) { + WindmillMultimap result = + cache + .get(namespace, spec) + .map(multimapState -> (WindmillMultimap) multimapState) + .orElseGet( + () -> + new WindmillMultimap<>( + namespace, spec, stateFamily, keyCoder, valueCoder, isNewKey)); + result.initializeForWorkItem(reader, scopedReadStateSupplier); + return result; + } + + @Override + public OrderedListState bindOrderedList( + StateTag> spec, Coder elemCoder) { + StateTag> specOrInternalTag = addressOrInternalTag(spec); + + WindmillOrderedList result = + cache + .get(namespace, specOrInternalTag) + .map(orderedList -> (WindmillOrderedList) orderedList) + .orElseGet( + () -> + new WindmillOrderedList<>( + Optional.ofNullable(derivedStateTable).orElse(CachingStateTable.this), + namespace, + specOrInternalTag, + stateFamily, + elemCoder, + isNewKey)); + + result.initializeForWorkItem(reader, scopedReadStateSupplier); + return result; + } + + @Override + public WatermarkHoldState bindWatermark( + StateTag address, TimestampCombiner timestampCombiner) { + StateTag addressOrInternalTag = addressOrInternalTag(address); + + WindmillWatermarkHold result = + cache + .get(namespace, addressOrInternalTag) + .map(watermarkHold -> (WindmillWatermarkHold) watermarkHold) + .orElseGet( + () -> + new WindmillWatermarkHold( + namespace, address, stateFamily, timestampCombiner, isNewKey)); + + result.initializeForWorkItem(reader, scopedReadStateSupplier); + return result; + } + + @Override + public CombiningState bindCombiningValue( + StateTag> address, + Coder accumCoder, + Combine.CombineFn combineFn) { + StateTag> addressOrInternalTag = + addressOrInternalTag(address); + + WindmillCombiningState result = + new WindmillCombiningState<>( + namespace, + addressOrInternalTag, + stateFamily, + accumCoder, + combineFn, + cache, + isNewKey); + + result.initializeForWorkItem(reader, scopedReadStateSupplier); + return result; + } + + @Override + public + CombiningState bindCombiningValueWithContext( + StateTag> address, + Coder accumCoder, + CombineWithContext.CombineFnWithContext combineFn) { + return bindCombiningValue( + addressOrInternalTag(address), accumCoder, CombineFnUtil.bindContext(combineFn, c)); + } + + @Override + public ValueState bindValue(StateTag> address, Coder coder) { + StateTag> addressOrInternalTag = addressOrInternalTag(address); + + WindmillValue result = + cache + .get(namespace, addressOrInternalTag) + .map(value -> (WindmillValue) value) + 
.orElseGet( + () -> + new WindmillValue<>( + namespace, addressOrInternalTag, stateFamily, coder, isNewKey)); + + result.initializeForWorkItem(reader, scopedReadStateSupplier); + return result; + } + + private StateTag addressOrInternalTag(StateTag address) { + return isSystemTable ? StateTags.makeSystemTagInternal(address) : address; + } + }; + } + + static class Builder { + private final String stateFamily; + private final WindmillStateReader reader; + private final WindmillStateCache.ForKeyAndFamily cache; + private final Supplier scopedReadStateSupplier; + private final boolean isNewKey; + private boolean isSystemTable; + private @Nullable StateTable derivedStateTable; + + private Builder( + String stateFamily, + WindmillStateReader reader, + WindmillStateCache.ForKeyAndFamily cache, + Supplier scopedReadStateSupplier, + boolean isNewKey) { + this.stateFamily = stateFamily; + this.reader = reader; + this.cache = cache; + this.scopedReadStateSupplier = scopedReadStateSupplier; + this.isNewKey = isNewKey; + this.isSystemTable = true; + this.derivedStateTable = null; + } + + Builder withDerivedState(StateTable derivedStateTable) { + this.isSystemTable = false; + this.derivedStateTable = derivedStateTable; + return this; + } + + CachingStateTable build() { + return new CachingStateTable(this); + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/ConcatIterables.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/ConcatIterables.java new file mode 100644 index 0000000000000..4bb806bd70fd5 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/ConcatIterables.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterators; + +class ConcatIterables implements Iterable { + // List of component iterables. Should only be appended to in order to support snapshot(). + private final List> iterables; + + public ConcatIterables() { + this.iterables = new ArrayList<>(); + } + + public void extendWith(Iterable iterable) { + iterables.add(iterable); + } + + @Override + public Iterator iterator() { + return Iterators.concat(Iterables.transform(iterables, Iterable::iterator).iterator()); + } + + /** + * Returns a view of the current state of this iterable. 
Remembers the current length of iterables + * so that the returned value will not change due to future extendWith() calls. + */ + public Iterable snapshot() { + final int limit = iterables.size(); + final List> iterablesList = iterables; + return () -> + Iterators.concat( + Iterators.transform( + Iterators.limit(iterablesList.iterator(), limit), Iterable::iterator)); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/IdTracker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/IdTracker.java new file mode 100644 index 0000000000000..5090626ae8eeb --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/IdTracker.java @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import java.util.Comparator; +import java.util.Iterator; +import java.util.Map; +import java.util.SortedSet; +import java.util.concurrent.ExecutionException; +import java.util.function.BiConsumer; +import org.apache.beam.runners.core.StateNamespace; +import org.apache.beam.runners.core.StateTable; +import org.apache.beam.runners.core.StateTag; +import org.apache.beam.runners.core.StateTags; +import org.apache.beam.sdk.coders.InstantCoder; +import org.apache.beam.sdk.coders.MapCoder; +import org.apache.beam.sdk.coders.VarLongCoder; +import org.apache.beam.sdk.state.StateContexts; +import org.apache.beam.sdk.state.ValueState; +import org.apache.beam.sdk.values.TimestampedValue; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Range; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.RangeSet; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.TreeRangeSet; +import org.joda.time.Duration; +import org.joda.time.Instant; + +/** + * Tracker for the ids used in an ordered list. + * + *

Windmill accepts an int64 id for each timestamped-element in the list. Unique elements are + * identified by the pair of timestamp and id. This means that two unique elements e1, e2 must have + * different (ts1, id1), (ts2, id2) pairs. To accomplish this, we bucket time into five-minute + * buckets, and store a free list of ids available for each bucket. + * + *
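+ * <p>A minimal sketch of the bucketing arithmetic (illustrative only; the class itself does the
+ * same snap on joda-time Instants in getTrackedRange, with RESOLUTION = five minutes):
+ *
+ * <pre>{@code
+ * static final long RESOLUTION_MS = 5 * 60 * 1000L;
+ *
+ * // Snap an element timestamp (epoch millis, non-negative) down to its bucket start.
+ * static long bucketStart(long tsMillis) {
+ *   return tsMillis - (tsMillis % RESOLUTION_MS);
+ * }
+ * // Free ids for that element are drawn from the RangeSet tracked for
+ * // Range.closedOpen(bucketStart, bucketStart + RESOLUTION_MS).
+ * }</pre>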

When a timestamp range is deleted, we remove id tracking for elements in that range. In order + * to handle the case where a range is deleted piecemeal, we track sub-range deletions for each + * range. For example: + * + *

12:00 - 12:05: ids      12:05 - 12:10: ids + * + *

delete 12:00 - 12:06 + * + *

12:00 - 12:05: *removed*      12:05 - 12:10: ids (subranges deleted: 12:05 - 12:06) + * + *

delete 12:06 - 12:07 + * + *

12:05 - 12:10: ids (subranges deleted: 12:05 - 12:07) + * + *

delete 12:07 - 12:10 + * + *

12:05 - 12:10 *removed* + */ +@SuppressWarnings("nullness" // TODO(https://github.com/apache/beam/issues/20497) +) +final class IdTracker { + @VisibleForTesting static final String IDS_AVAILABLE_STR = "IdsAvailable"; + @VisibleForTesting static final String DELETIONS_STR = "Deletions"; + // Note that this previously was Long.MIN_VALUE but ids are unsigned when + // sending to windmill for Streaming Engine. For updated appliance + // pipelines with existing state, there may be negative ids. + @VisibleForTesting static final long NEW_RANGE_MIN_ID = 0; + + @VisibleForTesting + static final MapCoder, RangeSet> IDS_AVAILABLE_CODER = + MapCoder.of(new RangeCoder<>(InstantCoder.of()), new RangeSetCoder<>(VarLongCoder.of())); + + @VisibleForTesting + static final MapCoder, RangeSet> SUBRANGE_DELETIONS_CODER = + MapCoder.of(new RangeCoder<>(InstantCoder.of()), new RangeSetCoder<>(InstantCoder.of())); + + private static final long NEW_RANGE_MAX_ID = Long.MAX_VALUE; + // We track ids on five-minute boundaries. + private static final Duration RESOLUTION = Duration.standardMinutes(5); + // A map from five-minute ranges to the set of ids available in that interval. + private final ValueState, RangeSet>> idsAvailableValue; + // If a timestamp-range in the map has been partially cleared, the cleared intervals are stored + // here. + private final ValueState, RangeSet>> subRangeDeletionsValue; + + IdTracker(StateTable stateTable, StateNamespace namespace, StateTag spec) { + StateTag, RangeSet>>> idsAvailableTag = + StateTags.makeSystemTagInternal( + StateTags.value(spec.getId() + IDS_AVAILABLE_STR, IDS_AVAILABLE_CODER)); + StateTag, RangeSet>>> subRangeDeletionsTag = + StateTags.makeSystemTagInternal( + StateTags.value(spec.getId() + DELETIONS_STR, SUBRANGE_DELETIONS_CODER)); + + this.idsAvailableValue = + stateTable.get(namespace, idsAvailableTag, StateContexts.nullContext()); + this.subRangeDeletionsValue = + stateTable.get(namespace, subRangeDeletionsTag, StateContexts.nullContext()); + } + + static > + Map, RangeSet> newSortedRangeMap() { + return Maps.newTreeMap( + Comparator., Instant>comparing(Range::lowerEndpoint) + .thenComparing(Range::upperEndpoint)); + } + + private Range getTrackedRange(Instant ts) { + Instant snapped = + new Instant(ts.getMillis() - ts.plus(RESOLUTION).getMillis() % RESOLUTION.getMillis()); + return Range.closedOpen(snapped, snapped.plus(RESOLUTION)); + } + + @SuppressWarnings("FutureReturnValueIgnored") + void readLater() { + idsAvailableValue.readLater(); + subRangeDeletionsValue.readLater(); + } + + Map, RangeSet> readIdsAvailable() { + Map, RangeSet> idsAvailable = idsAvailableValue.read(); + return idsAvailable != null ? idsAvailable : newSortedRangeMap(); + } + + Map, RangeSet> readSubRangeDeletions() { + Map, RangeSet> subRangeDeletions = subRangeDeletionsValue.read(); + return subRangeDeletions != null ? 
subRangeDeletions : newSortedRangeMap(); + } + + void clear() throws ExecutionException, InterruptedException { + idsAvailableValue.clear(); + subRangeDeletionsValue.clear(); + } + + void add( + SortedSet> elements, BiConsumer, Long> output) + throws ExecutionException, InterruptedException { + Range currentIdRange = null; + long currentId = 0; + + Range currentTsRange = null; + RangeSet currentTsRangeDeletions = null; + + Map, RangeSet> idsAvailable = readIdsAvailable(); + Map, RangeSet> subRangeDeletions = readSubRangeDeletions(); + + RangeSet availableIdsForTsRange = null; + Iterator> idRangeIter = null; + RangeSet idsUsed = TreeRangeSet.create(); + for (TimestampedValueWithId pendingAdd : elements) { + // Since elements are in increasing ts order, often we'll be able to reuse the previous + // iteration's range. + if (currentTsRange == null + || !currentTsRange.contains(pendingAdd.getValue().getTimestamp())) { + if (availableIdsForTsRange != null) { + // We're moving onto a new ts range. Remove all used ids + availableIdsForTsRange.removeAll(idsUsed); + idsUsed = TreeRangeSet.create(); + } + + // Lookup the range for the current timestamp. + currentTsRange = getTrackedRange(pendingAdd.getValue().getTimestamp()); + // Lookup available ids for this timestamp range. If nothing there, we default to all ids + // available. + availableIdsForTsRange = + idsAvailable.computeIfAbsent( + currentTsRange, + r -> + TreeRangeSet.create( + ImmutableList.of(Range.closedOpen(NEW_RANGE_MIN_ID, NEW_RANGE_MAX_ID)))); + idRangeIter = availableIdsForTsRange.asRanges().iterator(); + currentIdRange = null; + currentTsRangeDeletions = subRangeDeletions.get(currentTsRange); + } + + if (currentIdRange == null || currentId >= currentIdRange.upperEndpoint()) { + // Move to the next range of free ids, and start assigning ranges from there. + currentIdRange = idRangeIter.next(); + currentId = currentIdRange.lowerEndpoint(); + } + + if (currentTsRangeDeletions != null) { + currentTsRangeDeletions.remove( + Range.closedOpen( + pendingAdd.getValue().getTimestamp(), + pendingAdd.getValue().getTimestamp().plus(Duration.millis(1)))); + } + idsUsed.add(Range.closedOpen(currentId, currentId + 1)); + output.accept(pendingAdd.getValue(), currentId++); + } + if (availableIdsForTsRange != null) { + availableIdsForTsRange.removeAll(idsUsed); + } + writeValues(idsAvailable, subRangeDeletions); + } + + // Remove a timestamp range. Returns ids freed up. + void remove(Range tsRange) throws ExecutionException, InterruptedException { + Map, RangeSet> idsAvailable = readIdsAvailable(); + Map, RangeSet> subRangeDeletions = readSubRangeDeletions(); + + for (Range current = getTrackedRange(tsRange.lowerEndpoint()); + current.lowerEndpoint().isBefore(tsRange.upperEndpoint()); + current = getTrackedRange(current.lowerEndpoint().plus(RESOLUTION))) { + // TODO(reuvenlax): shouldn't need to iterate over all ranges. + boolean rangeCleared; + if (!tsRange.encloses(current)) { + // This can happen if the beginning or the end of tsRange doesn't fall on a RESOLUTION + // boundary. Since we are deleting a portion of a tracked range, track what we are deleting. + RangeSet rangeDeletions = + subRangeDeletions.computeIfAbsent(current, r -> TreeRangeSet.create()); + rangeDeletions.add(tsRange.intersection(current)); + // If we ended up deleting the whole range, then we can simply remove it from the tracking + // map. 
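+ // Illustrative instance of the javadoc example above: current = [12:05, 12:10) and
+ // tsRange = [12:06, 12:07). tsRange does not enclose current, so [12:06, 12:07) is recorded
+ // in rangeDeletions; rangeCleared only becomes true once the recorded subranges cover all
+ // of current.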
+ rangeCleared = rangeDeletions.encloses(current); + } else { + rangeCleared = true; + } + if (rangeCleared) { + // Remove the range from both maps. + idsAvailable.remove(current); + subRangeDeletions.remove(current); + } + } + writeValues(idsAvailable, subRangeDeletions); + } + + private void writeValues( + Map, RangeSet> idsAvailable, + Map, RangeSet> subRangeDeletions) { + if (idsAvailable.isEmpty()) { + idsAvailable.clear(); + } else { + idsAvailableValue.write(idsAvailable); + } + if (subRangeDeletions.isEmpty()) { + subRangeDeletionsValue.clear(); + } else { + subRangeDeletionsValue.write(subRangeDeletions); + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/PagingIterable.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/PagingIterable.java new file mode 100644 index 0000000000000..73f076d920134 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/PagingIterable.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.AbstractIterator; + +/** + * An iterable over elements backed by paginated GetData requests to Windmill. The iterable may be + * iterated over an arbitrary number of times and multiple iterators may be active simultaneously. + * + *

There are two patterns we wish to support with low memory and latency: + * + *

    + *
  1. Re-iterate over the initial elements multiple times (e.g., Iterables.first). We'll cache the + * initial 'page' of values returned by Windmill from our first request for the lifetime of + * the iterable.
  2. Iterate through all elements of a very large collection. We'll send the GetData request for + * the next page when the current page is begun. We'll discard intermediate pages and only + * retain the first. Thus the maximum memory pressure is one page plus one page per call to + * iterator. + *
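+ *
+ * <p>A usage sketch of the second pattern ({@code process} is a hypothetical consumer, and the
+ * constructor arguments are assumed to be in scope; illustrative, not part of the API):
+ *
+ * <pre>{@code
+ * Iterable<ResultT> results = new PagingIterable<>(reader, firstPage, secondPagePos, coder);
+ * for (ResultT value : results) {
+ *   // While this page is being consumed, the request for the next page is already in flight.
+ *   process(value);
+ * }
+ * }</pre>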
+ */ +@SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) +}) +class PagingIterable implements Iterable { + /** + * The reader we will use for scheduling continuation pages. + * + *

NOTE We've made this explicit to remind us to be careful not to cache the iterable. + */ + private final WindmillStateReader reader; + + /** Initial values returned for the first page. Never reclaimed. */ + private final List firstPage; + + /** State tag with continuation position set for second page. */ + private final StateTag secondPagePos; + + /** Coder for elements. */ + private final Coder coder; + + PagingIterable( + WindmillStateReader reader, + List firstPage, + StateTag secondPagePos, + Coder coder) { + this.reader = reader; + this.firstPage = firstPage; + this.secondPagePos = secondPagePos; + this.coder = coder; + } + + @Override + public Iterator iterator() { + return new PagingIterableIterator(); + } + + private class PagingIterableIterator extends AbstractIterator { + private Iterator currentPage = firstPage.iterator(); + private StateTag nextPagePos = secondPagePos; + private Future> pendingNextPage = + // NOTE: The results of continuation page reads are never cached. + reader.continuationFuture(nextPagePos, coder); + + @Override + protected ResultT computeNext() { + while (true) { + if (currentPage.hasNext()) { + return currentPage.next(); + } + if (pendingNextPage == null) { + return endOfData(); + } + + ValuesAndContPosition valuesAndContPosition; + try { + valuesAndContPosition = pendingNextPage.get(); + } catch (InterruptedException | ExecutionException e) { + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new RuntimeException("Unable to read value from state", e); + } + currentPage = valuesAndContPosition.getValues().iterator(); + StateTag.Builder nextPageBuilder = + StateTag.of( + nextPagePos.getKind(), + nextPagePos.getTag(), + nextPagePos.getStateFamily(), + valuesAndContPosition.getContinuationPosition()) + .toBuilder(); + if (secondPagePos.getSortedListRange() != null) { + nextPageBuilder.setSortedListRange(secondPagePos.getSortedListRange()); + } + if (secondPagePos.getOmitValues() != null) { + nextPageBuilder.setOmitValues(secondPagePos.getOmitValues()); + } + if (secondPagePos.getMultimapKey() != null) { + nextPageBuilder.setMultimapKey(secondPagePos.getMultimapKey()); + } + nextPagePos = nextPageBuilder.build(); + pendingNextPage = + // NOTE: The results of continuation page reads are never cached. + reader.continuationFuture(nextPagePos, coder); + } + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/RangeCoder.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/RangeCoder.java new file mode 100644 index 0000000000000..0e11531226f75 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/RangeCoder.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.List; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.NullableCoder; +import org.apache.beam.sdk.coders.StructuredCoder; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.BoundType; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Range; +import org.checkerframework.checker.nullness.qual.Nullable; + +@SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) +}) +/** Coder for closed-open ranges. */ +class RangeCoder> extends StructuredCoder> { + private final Coder boundCoder; + + RangeCoder(Coder boundCoder) { + this.boundCoder = NullableCoder.of(boundCoder); + } + + @Override + public List> getCoderArguments() { + return Lists.newArrayList(boundCoder); + } + + @Override + public void verifyDeterministic() throws NonDeterministicException { + boundCoder.verifyDeterministic(); + } + + @Override + public void encode(Range value, OutputStream outStream) throws IOException { + Preconditions.checkState( + value.lowerBoundType().equals(BoundType.CLOSED), "unexpected range " + value); + Preconditions.checkState( + value.upperBoundType().equals(BoundType.OPEN), "unexpected range " + value); + boundCoder.encode(value.hasLowerBound() ? value.lowerEndpoint() : null, outStream); + boundCoder.encode(value.hasUpperBound() ? value.upperEndpoint() : null, outStream); + } + + @Override + public Range decode(InputStream inStream) throws IOException { + @Nullable T lower = boundCoder.decode(inStream); + @Nullable T upper = boundCoder.decode(inStream); + if (lower == null) { + return upper != null ? Range.lessThan(upper) : Range.all(); + } else if (upper == null) { + return Range.atLeast(lower); + } else { + return Range.closedOpen(lower, upper); + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/RangeSetCoder.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/RangeSetCoder.java new file mode 100644 index 0000000000000..291a83e81ee60 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/RangeSetCoder.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.CustomCoder; +import org.apache.beam.sdk.coders.SetCoder; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Range; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.RangeSet; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.TreeRangeSet; + +class RangeSetCoder> extends CustomCoder> { + private final SetCoder> rangesCoder; + + RangeSetCoder(Coder boundCoder) { + this.rangesCoder = SetCoder.of(new RangeCoder<>(boundCoder)); + } + + @Override + public void encode(RangeSet value, OutputStream outStream) throws IOException { + rangesCoder.encode(value.asRanges(), outStream); + } + + @Override + public RangeSet decode(InputStream inStream) throws IOException { + return TreeRangeSet.create(rangesCoder.decode(inStream)); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/SimpleWindmillState.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/SimpleWindmillState.java new file mode 100644 index 0000000000000..bd7f8041c6800 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/SimpleWindmillState.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import java.io.IOException; +import java.util.concurrent.Future; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.Futures; + +/** + * Base class for implementations of {@link WindmillState} where the {@link #persist} call does not + * require any asynchronous reading. 
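+ *
+ * <p>A hypothetical subclass sketch ({@code NoopState} is invented for illustration): implement
+ * only the synchronous {@link #persistDirectly}; the final {@link #persist} wraps the result
+ * with {@code Futures.immediateFuture}:
+ *
+ * <pre>{@code
+ * class NoopState extends SimpleWindmillState {
+ *   @Override
+ *   protected Windmill.WorkItemCommitRequest persistDirectly(
+ *       WindmillStateCache.ForKeyAndFamily cache) {
+ *     return Windmill.WorkItemCommitRequest.newBuilder().build(); // nothing dirty to flush
+ *   }
+ * }
+ * }</pre>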
+ */ +abstract class SimpleWindmillState extends WindmillState { + @Override + public final Future persist( + WindmillStateCache.ForKeyAndFamily cache) throws IOException { + return Futures.immediateFuture(persistDirectly(cache)); + } + + /** + * Returns a {@link Windmill.WorkItemCommitRequest} that can be used to persist this state to + * Windmill. + */ + protected abstract Windmill.WorkItemCommitRequest persistDirectly( + WindmillStateCache.ForKeyAndFamily cache) throws IOException; +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/StateTag.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/StateTag.java new file mode 100644 index 0000000000000..13c2a9e66baa3 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/StateTag.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import com.google.auto.value.AutoValue; +import javax.annotation.Nullable; +import org.apache.beam.vendor.grpc.v1p54p0.com.google.protobuf.ByteString; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Range; + +/** + * When combined with a key and computationId, represents the unique address for state managed by + * Windmill. + */ +@AutoValue +public abstract class StateTag { + static StateTag of( + Kind kind, ByteString tag, String stateFamily, @Nullable RequestPositionT requestPosition) { + return new AutoValue_StateTag.Builder() + .setKind(kind) + .setTag(tag) + .setStateFamily(stateFamily) + .setRequestPosition(requestPosition) + .build(); + } + + public static StateTag of( + Kind kind, ByteString tag, String stateFamily) { + return of(kind, tag, stateFamily, null); + } + + abstract Kind getKind(); + + abstract ByteString getTag(); + + abstract String getStateFamily(); + + /** + * For {@link Kind#BAG, Kind#ORDERED_LIST, Kind#VALUE_PREFIX, KIND#MULTIMAP_SINGLE_ENTRY, + * KIND#MULTIMAP_ALL} kinds: A previous 'continuation_position' returned by Windmill to signal the + * resulting state was incomplete. Sending that position will request the next page of values. + * Null for first request. + * + *

Null for other kinds. + */ + @Nullable + public abstract RequestPositionT getRequestPosition(); + + /** For {@link Kind#ORDERED_LIST} kinds: the range to fetch or delete. */ + @Nullable + abstract Range getSortedListRange(); + + /** For {@link Kind#MULTIMAP_SINGLE_ENTRY} kinds: the key in the multimap to fetch or delete. */ + @Nullable + abstract ByteString getMultimapKey(); + + /** + * For {@link Kind#MULTIMAP_ALL} kinds: will only return the keys of the multimap and not the + * values if true. + */ + @Nullable + abstract Boolean getOmitValues(); + + public abstract Builder toBuilder(); + + public enum Kind { + VALUE, + BAG, + WATERMARK, + ORDERED_LIST, + VALUE_PREFIX, + MULTIMAP_SINGLE_ENTRY, + MULTIMAP_ALL + } + + @AutoValue.Builder + abstract static class Builder { + abstract Builder setKind(Kind kind); + + abstract Builder setTag(ByteString tag); + + abstract Builder setStateFamily(String stateFamily); + + abstract Builder setRequestPosition( + @Nullable RequestPositionT requestPosition); + + abstract Builder setSortedListRange(@Nullable Range sortedListRange); + + abstract Builder setMultimapKey(@Nullable ByteString encodedMultimapKey); + + abstract Builder setOmitValues(Boolean omitValues); + + abstract StateTag build(); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/TimestampedValueWithId.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/TimestampedValueWithId.java new file mode 100644 index 0000000000000..e180efafb65bc --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/TimestampedValueWithId.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import com.google.auto.value.AutoValue; +import java.util.Comparator; +import org.apache.beam.sdk.values.TimestampedValue; +import org.joda.time.Instant; + +@AutoValue +abstract class TimestampedValueWithId { + public static final Comparator> COMPARATOR = + Comparator., Instant>comparing(v -> v.getValue().getTimestamp()) + .thenComparingLong(TimestampedValueWithId::getId); + + static TimestampedValueWithId of(TimestampedValue value, long id) { + return new AutoValue_TimestampedValueWithId<>(value, id); + } + + static TimestampedValueWithId bound(Instant ts) { + return of(TimestampedValue.of(null, ts), Long.MIN_VALUE); + } + + abstract TimestampedValue getValue(); + + abstract long getId(); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/ToIterableFunction.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/ToIterableFunction.java new file mode 100644 index 0000000000000..3db058c79a03b --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/ToIterableFunction.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Function; + +/** Function to extract an {@link Iterable} from the continuation-supporting page read future. */ +@SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) +}) +public class ToIterableFunction + implements Function, Iterable> { + private final StateTag stateTag; + private final Coder coder; + /** + * Reader to request continuation pages from, or {@literal null} if no continuation pages + * required. + */ + private @Nullable WindmillStateReader reader; + + public ToIterableFunction( + WindmillStateReader reader, StateTag stateTag, Coder coder) { + this.reader = reader; + this.stateTag = stateTag; + this.coder = coder; + } + + @SuppressFBWarnings( + value = "NP_METHOD_PARAMETER_TIGHTENS_ANNOTATION", + justification = "https://github.com/google/guava/issues/920") + @Override + public Iterable apply( + @Nonnull ValuesAndContPosition valuesAndContPosition) { + if (valuesAndContPosition.getContinuationPosition() == null) { + // Number of values is small enough Windmill sent us the entire bag in one response. 
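+ // Dropping the reader reference means this function no longer pins the per-work-item reader
+ // once there is nothing left to fetch (assumed rationale; see the field's contract above:
+ // null when no continuation pages are required).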
+      reader = null;
+      return valuesAndContPosition.getValues();
+    } else {
+      // Return an iterable which knows how to come back for more.
+      StateTag.Builder<ContinuationT> continuationTBuilder =
+          StateTag.of(
+                  stateTag.getKind(),
+                  stateTag.getTag(),
+                  stateTag.getStateFamily(),
+                  valuesAndContPosition.getContinuationPosition())
+              .toBuilder();
+      if (stateTag.getSortedListRange() != null) {
+        continuationTBuilder.setSortedListRange(stateTag.getSortedListRange()).build();
+      }
+      if (stateTag.getMultimapKey() != null) {
+        continuationTBuilder.setMultimapKey(stateTag.getMultimapKey()).build();
+      }
+      if (stateTag.getOmitValues() != null) {
+        continuationTBuilder.setOmitValues(stateTag.getOmitValues()).build();
+      }
+      return new PagingIterable<>(
+          reader, valuesAndContPosition.getValues(), continuationTBuilder.build(), coder);
+    }
+  }
+}
diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/ValuesAndContPosition.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/ValuesAndContPosition.java
new file mode 100644
index 0000000000000..a1002fee43806
--- /dev/null
+++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/ValuesAndContPosition.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.dataflow.worker.windmill.state;
+
+import java.util.List;
+import javax.annotation.Nullable;
+
+/**
+ * An in-memory collection of deserialized values and an optional continuation position to pass to
+ * Windmill when fetching the next page of values.
+ */
+public class ValuesAndContPosition<T, ContinuationT> {
+  private final List<T> values;
+
+  /** Position to pass to next request for next page of values. Null if done.
+   */
+  private final @Nullable ContinuationT continuationPosition;
+
+  public ValuesAndContPosition(List<T> values, @Nullable ContinuationT continuationPosition) {
+    this.values = values;
+    this.continuationPosition = continuationPosition;
+  }
+
+  public List<T> getValues() {
+    return values;
+  }
+
+  @Nullable
+  public ContinuationT getContinuationPosition() {
+    return continuationPosition;
+  }
+}
diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/AvroCoderTestPojo.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WeightedList.java
similarity index 54%
rename from sdks/java/core/src/test/java/org/apache/beam/sdk/coders/AvroCoderTestPojo.java
rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WeightedList.java
index d170e6447b89a..1b39d07c6fe7b 100644
--- a/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/AvroCoderTestPojo.java
+++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WeightedList.java
@@ -15,37 +15,40 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.beam.sdk.coders;
+package org.apache.beam.runners.dataflow.worker.windmill.state;
 
-import java.util.Objects;
-import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects;
-import org.checkerframework.checker.nullness.qual.Nullable;
+import java.util.List;
+import org.apache.beam.sdk.util.Weighted;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ForwardingList;
 
-/** A Pojo at the top level for use in tests. */
-class AvroCoderTestPojo {
+@VisibleForTesting
+public class WeightedList<T> extends ForwardingList<T> implements Weighted {
+  private final List<T> delegate;
+  long weight;
 
-  public String text;
-
-  // Empty constructor required for Avro decoding.
-  @SuppressWarnings("unused")
-  public AvroCoderTestPojo() {}
-
-  public AvroCoderTestPojo(String text) {
-    this.text = text;
+  WeightedList(List<T> delegate) {
+    this.delegate = delegate;
+    this.weight = 0;
   }
 
   @Override
-  public boolean equals(@Nullable Object other) {
-    return (other instanceof AvroCoderTestPojo) && ((AvroCoderTestPojo) other).text.equals(text);
+  protected List<T> delegate() {
+    return delegate;
   }
 
   @Override
-  public int hashCode() {
-    return Objects.hash(AvroCoderTestPojo.class, text);
+  public boolean add(T elem) {
+    throw new UnsupportedOperationException("Must use addWeighted()");
   }
 
   @Override
-  public String toString() {
-    return MoreObjects.toStringHelper(this).add("text", text).toString();
+  public long getWeight() {
+    return weight;
+  }
+
+  public void addWeighted(T elem, long weight) {
+    delegate.add(elem);
+    this.weight += weight;
   }
 }
diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillBag.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillBag.java
new file mode 100644
index 0000000000000..7cdb3776dfa18
--- /dev/null
+++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillBag.java
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import org.apache.beam.runners.core.StateNamespace; +import org.apache.beam.runners.core.StateTag; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.state.BagState; +import org.apache.beam.sdk.state.ReadableState; +import org.apache.beam.sdk.util.ByteStringOutputStream; +import org.apache.beam.sdk.util.Weighted; +import org.apache.beam.vendor.grpc.v1p54p0.com.google.protobuf.ByteString; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; + +@SuppressWarnings({ + "rawtypes", // TODO(https://github.com/apache/beam/issues/20447) + "nullness" // TODO(https://github.com/apache/beam/issues/20497) +}) +public class WindmillBag extends SimpleWindmillState implements BagState { + + private final StateNamespace namespace; + private final StateTag> address; + private final ByteString stateKey; + private final String stateFamily; + private final Coder elemCoder; + + private boolean cleared = false; + /** + * If non-{@literal null}, this contains the complete contents of the bag, except for any local + * additions. If {@literal null} then we don't know if Windmill contains additional values which + * should be part of the bag. We'll need to read them if the work item actually wants the bag + * contents. + */ + private ConcatIterables cachedValues = null; + + private List localAdditions = new ArrayList<>(); + private long encodedSize = 0; + + WindmillBag( + StateNamespace namespace, + StateTag> address, + String stateFamily, + Coder elemCoder, + boolean isNewKey) { + this.namespace = namespace; + this.address = address; + this.stateKey = WindmillStateUtil.encodeKey(namespace, address); + this.stateFamily = stateFamily; + this.elemCoder = elemCoder; + if (isNewKey) { + this.cachedValues = new ConcatIterables<>(); + } + } + + @Override + public void clear() { + cleared = true; + cachedValues = new ConcatIterables<>(); + localAdditions = new ArrayList<>(); + encodedSize = 0; + } + + /** + * Return iterable over all bag values in Windmill which should contribute to overall bag + * contents. + */ + private Iterable fetchData(Future> persistedData) { + try (Closeable scope = scopedReadState()) { + if (cachedValues != null) { + return cachedValues.snapshot(); + } + Iterable data = persistedData.get(); + if (data instanceof Weighted) { + // We have a known bounded amount of data; cache it. 
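+        // (A page that fits in memory implements Weighted, e.g. WeightedList, whose
+        // getWeight() returns the encoded byte size accumulated through
+        // addWeighted(value, encodedBytes), so no re-encoding is needed here.)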
+ cachedValues = new ConcatIterables<>(); + cachedValues.extendWith(data); + encodedSize = ((Weighted) data).getWeight(); + return cachedValues.snapshot(); + } else { + // This is an iterable that may not fit in memory at once; don't cache it. + return data; + } + } catch (InterruptedException | ExecutionException | IOException e) { + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new RuntimeException("Unable to read state", e); + } + } + + public boolean valuesAreCached() { + return cachedValues != null; + } + + @Override + @SuppressWarnings("FutureReturnValueIgnored") + public WindmillBag readLater() { + getFuture(); + return this; + } + + @Override + public Iterable read() { + return Iterables.concat( + fetchData(getFuture()), Iterables.limit(localAdditions, localAdditions.size())); + } + + @Override + public ReadableState isEmpty() { + return new ReadableState() { + @Override + public ReadableState readLater() { + WindmillBag.this.readLater(); + return this; + } + + @Override + public Boolean read() { + return Iterables.isEmpty(fetchData(getFuture())) && localAdditions.isEmpty(); + } + }; + } + + @Override + public void add(T input) { + localAdditions.add(input); + } + + @Override + public Windmill.WorkItemCommitRequest persistDirectly(WindmillStateCache.ForKeyAndFamily cache) + throws IOException { + Windmill.WorkItemCommitRequest.Builder commitBuilder = + Windmill.WorkItemCommitRequest.newBuilder(); + + Windmill.TagBag.Builder bagUpdatesBuilder = null; + + if (cleared) { + bagUpdatesBuilder = commitBuilder.addBagUpdatesBuilder(); + bagUpdatesBuilder.setDeleteAll(true); + cleared = false; + } + + if (!localAdditions.isEmpty()) { + // Tell Windmill to capture the local additions. + if (bagUpdatesBuilder == null) { + bagUpdatesBuilder = commitBuilder.addBagUpdatesBuilder(); + } + for (T value : localAdditions) { + ByteStringOutputStream stream = new ByteStringOutputStream(); + // Encode the value + elemCoder.encode(value, stream, Coder.Context.OUTER); + ByteString encoded = stream.toByteString(); + if (cachedValues != null) { + // We'll capture this value in the cache below. + // Capture the value's size now since we have it. + encodedSize += encoded.size(); + } + bagUpdatesBuilder.addValues(encoded); + } + } + + if (bagUpdatesBuilder != null) { + bagUpdatesBuilder.setTag(stateKey).setStateFamily(stateFamily); + } + + if (cachedValues != null) { + if (!localAdditions.isEmpty()) { + // Capture the local additions in the cached value since we and + // Windmill are now in agreement. + cachedValues.extendWith(localAdditions); + } + // We now know the complete bag contents, and any read on it will yield a + // cached value, so cache it for future reads. + cache.put(namespace, address, this, encodedSize); + } + + // Don't reuse the localAdditions object; we don't want future changes to it to + // modify the value of cachedValues. + localAdditions = new ArrayList<>(); + + return commitBuilder.buildPartial(); + } + + private Future> getFuture() { + return cachedValues != null ? 
null : reader.bagFuture(stateKey, stateFamily, elemCoder);
+  }
+}
diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillCombiningState.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillCombiningState.java
new file mode 100644
index 0000000000000..98359913c7033
--- /dev/null
+++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillCombiningState.java
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.dataflow.worker.windmill.state;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.concurrent.Future;
+import javax.annotation.concurrent.NotThreadSafe;
+import org.apache.beam.runners.core.StateNamespace;
+import org.apache.beam.runners.core.StateTag;
+import org.apache.beam.runners.core.StateTags;
+import org.apache.beam.runners.dataflow.worker.windmill.Windmill;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.state.BagState;
+import org.apache.beam.sdk.state.CombiningState;
+import org.apache.beam.sdk.state.ReadableState;
+import org.apache.beam.sdk.transforms.Combine;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables;
+
+@NotThreadSafe
+class WindmillCombiningState<InputT, AccumT, OutputT> extends WindmillState
+    implements CombiningState<InputT, AccumT, OutputT> {
+
+  private final WindmillBag<AccumT> bag;
+  private final Combine.CombineFn<InputT, AccumT, OutputT> combineFn;
+
+  /* We use a separate, in-memory AccumT rather than relying on the WindmillBag's
+   * localAdditions, because we want to combine multiple InputT's to a single AccumT
+   * before adding it.
+ */ + private AccumT localAdditionsAccumulator; + private boolean hasLocalAdditions; + + WindmillCombiningState( + StateNamespace namespace, + StateTag> address, + String stateFamily, + Coder accumCoder, + Combine.CombineFn combineFn, + WindmillStateCache.ForKeyAndFamily cache, + boolean isNewKey) { + StateTag> internalBagAddress = StateTags.convertToBagTagInternal(address); + this.bag = + cache + .get(namespace, internalBagAddress) + .map(state -> (WindmillBag) state) + .orElseGet( + () -> + new WindmillBag<>( + namespace, internalBagAddress, stateFamily, accumCoder, isNewKey)); + + this.combineFn = combineFn; + this.localAdditionsAccumulator = combineFn.createAccumulator(); + this.hasLocalAdditions = false; + } + + @Override + void initializeForWorkItem( + WindmillStateReader reader, Supplier scopedReadStateSupplier) { + super.initializeForWorkItem(reader, scopedReadStateSupplier); + this.bag.initializeForWorkItem(reader, scopedReadStateSupplier); + } + + @Override + void cleanupAfterWorkItem() { + super.cleanupAfterWorkItem(); + bag.cleanupAfterWorkItem(); + } + + @Override + public WindmillCombiningState readLater() { + bag.readLater(); + return this; + } + + @Override + @SuppressWarnings("nullness") + public OutputT read() { + return combineFn.extractOutput(getAccum()); + } + + @Override + public void add(InputT input) { + hasLocalAdditions = true; + localAdditionsAccumulator = combineFn.addInput(localAdditionsAccumulator, input); + } + + @Override + public void clear() { + bag.clear(); + localAdditionsAccumulator = combineFn.createAccumulator(); + hasLocalAdditions = false; + } + + @Override + public Future persist(WindmillStateCache.ForKeyAndFamily cache) + throws IOException { + if (hasLocalAdditions) { + if (WindmillStateInternals.COMPACT_NOW.get().get() || bag.valuesAreCached()) { + // Implicitly clears the bag and combines local and persisted accumulators. 
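+        // getAccum() merges the persisted bag contents with the local accumulator,
+        // clears the bag, and keeps the merged result as the sole local accumulator,
+        // so a single compacted value is re-added to the bag below.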
+ localAdditionsAccumulator = getAccum(); + } + bag.add(combineFn.compact(localAdditionsAccumulator)); + localAdditionsAccumulator = combineFn.createAccumulator(); + hasLocalAdditions = false; + } + + return bag.persist(cache); + } + + @Override + public AccumT getAccum() { + Iterable accumulators = + Iterables.concat(bag.read(), Collections.singleton(localAdditionsAccumulator)); + + // Compact things + AccumT merged = combineFn.mergeAccumulators(accumulators); + bag.clear(); + localAdditionsAccumulator = merged; + hasLocalAdditions = true; + return merged; + } + + @Override + public ReadableState isEmpty() { + final ReadableState bagIsEmpty = bag.isEmpty(); + return new ReadableState() { + @Override + public ReadableState readLater() { + bagIsEmpty.readLater(); + return this; + } + + @Override + public Boolean read() { + return !hasLocalAdditions && bagIsEmpty.read(); + } + }; + } + + @Override + public void addAccum(AccumT accumulator) { + hasLocalAdditions = true; + localAdditionsAccumulator = + combineFn.mergeAccumulators(Arrays.asList(localAdditionsAccumulator, accumulator)); + } + + @Override + public AccumT mergeAccumulators(Iterable accumulators) { + return combineFn.mergeAccumulators(accumulators); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillMap.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillMap.java new file mode 100644 index 0000000000000..43490a725ac4f --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillMap.java @@ -0,0 +1,449 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import static org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateUtil.encodeKey; + +import java.io.Closeable; +import java.io.IOException; +import java.util.AbstractMap; +import java.util.Collections; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.function.Function; +import javax.annotation.Nullable; +import org.apache.beam.runners.core.StateNamespace; +import org.apache.beam.runners.core.StateTag; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.state.MapState; +import org.apache.beam.sdk.state.ReadableState; +import org.apache.beam.sdk.state.ReadableStates; +import org.apache.beam.sdk.util.ByteStringOutputStream; +import org.apache.beam.sdk.util.Weighted; +import org.apache.beam.vendor.grpc.v1p54p0.com.google.protobuf.ByteString; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Sets; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.Futures; +import org.checkerframework.checker.initialization.qual.Initialized; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.checkerframework.checker.nullness.qual.UnknownKeyFor; + +@SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) +}) +public class WindmillMap extends SimpleWindmillState implements MapState { + private final StateNamespace namespace; + private final StateTag> address; + private final ByteString stateKeyPrefix; + private final String stateFamily; + private final Coder keyCoder; + private final Coder valueCoder; + // TODO(reuvenlax): Should we evict items from the cache? We would have to make sure + // that anything in the cache that is not committed is not evicted. negativeCache could be + // evicted whenever we want. 
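+  // Reads consult these structures in order: localRemovals / negativeCache mean
+  // "known absent", cachedValues means "known present", and only on a miss is a
+  // Windmill fetch issued (its result then being added to cachedValues).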
+  private final Map<K, V> cachedValues = Maps.newHashMap();
+  private final Set<K> negativeCache = Sets.newHashSet();
+  private final Set<K> localAdditions = Sets.newHashSet();
+  private final Set<K> localRemovals = Sets.newHashSet();
+  private boolean complete;
+  private boolean cleared = false;
+
+  WindmillMap(
+      StateNamespace namespace,
+      StateTag<MapState<K, V>> address,
+      String stateFamily,
+      Coder<K> keyCoder,
+      Coder<V> valueCoder,
+      boolean isNewKey) {
+    this.namespace = namespace;
+    this.address = address;
+    this.stateKeyPrefix = encodeKey(namespace, address);
+    this.stateFamily = stateFamily;
+    this.keyCoder = keyCoder;
+    this.valueCoder = valueCoder;
+    this.complete = isNewKey;
+  }
+
+  private K userKeyFromProtoKey(ByteString tag) throws IOException {
+    Preconditions.checkState(tag.startsWith(stateKeyPrefix));
+    ByteString keyBytes = tag.substring(stateKeyPrefix.size());
+    return keyCoder.decode(keyBytes.newInput(), Coder.Context.OUTER);
+  }
+
+  private ByteString protoKeyFromUserKey(K key) throws IOException {
+    ByteStringOutputStream keyStream = new ByteStringOutputStream();
+    stateKeyPrefix.writeTo(keyStream);
+    keyCoder.encode(key, keyStream, Coder.Context.OUTER);
+    return keyStream.toByteString();
+  }
+
+  @Override
+  protected Windmill.WorkItemCommitRequest persistDirectly(WindmillStateCache.ForKeyAndFamily cache)
+      throws IOException {
+    if (!cleared && localAdditions.isEmpty() && localRemovals.isEmpty()) {
+      // No changes, so return directly.
+      return Windmill.WorkItemCommitRequest.newBuilder().buildPartial();
+    }
+
+    Windmill.WorkItemCommitRequest.Builder commitBuilder =
+        Windmill.WorkItemCommitRequest.newBuilder();
+
+    if (cleared) {
+      commitBuilder
+          .addTagValuePrefixDeletesBuilder()
+          .setStateFamily(stateFamily)
+          .setTagPrefix(stateKeyPrefix);
+    }
+    cleared = false;
+
+    for (K key : localAdditions) {
+      ByteString keyBytes = protoKeyFromUserKey(key);
+      ByteStringOutputStream valueStream = new ByteStringOutputStream();
+      valueCoder.encode(cachedValues.get(key), valueStream, Coder.Context.OUTER);
+      ByteString valueBytes = valueStream.toByteString();
+
+      commitBuilder
+          .addValueUpdatesBuilder()
+          .setTag(keyBytes)
+          .setStateFamily(stateFamily)
+          .getValueBuilder()
+          .setData(valueBytes)
+          .setTimestamp(Long.MAX_VALUE);
+    }
+    localAdditions.clear();
+
+    for (K key : localRemovals) {
+      ByteStringOutputStream keyStream = new ByteStringOutputStream();
+      stateKeyPrefix.writeTo(keyStream);
+      keyCoder.encode(key, keyStream, Coder.Context.OUTER);
+      ByteString keyBytes = keyStream.toByteString();
+      // Leaving data blank means that we delete the tag.
+      commitBuilder
+          .addValueUpdatesBuilder()
+          .setTag(keyBytes)
+          .setStateFamily(stateFamily)
+          .getValueBuilder()
+          .setTimestamp(Long.MAX_VALUE);
+
+      V cachedValue = cachedValues.remove(key);
+      if (cachedValue != null) {
+        ByteStringOutputStream valueStream = new ByteStringOutputStream();
+        // Encode the removed value itself; the key is no longer present in cachedValues.
+        valueCoder.encode(cachedValue, valueStream, Coder.Context.OUTER);
+      }
+    }
+    negativeCache.addAll(localRemovals);
+    localRemovals.clear();
+
+    // TODO(reuvenlax): We should store this in the cache parameter, as that would enable caching
+    // the map between work items, reducing fetches to Windmill. To do so, we need to keep track
+    // of the encoded size of the map, and to do so efficiently (i.e. without iterating over the
+    // entire map on every persist) we need to track the sizes of each map entry.
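+    // The true encoded size of the map is unknown (see TODO above), so the entry is
+    // cached under a nominal weight of 1.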
+ cache.put(namespace, address, this, 1); + return commitBuilder.buildPartial(); + } + + @Override + public @UnknownKeyFor @NonNull @Initialized ReadableState get(K key) { + return getOrDefault(key, null); + } + + @Override + public @UnknownKeyFor @NonNull @Initialized ReadableState getOrDefault( + K key, @Nullable V defaultValue) { + return new WindmillMapReadResultReadableState(key, defaultValue); + } + + @Override + public @UnknownKeyFor @NonNull @Initialized ReadableState< + @UnknownKeyFor @NonNull @Initialized Iterable> + keys() { + ReadableState>> entries = entries(); + return new WindmillMapKeysReadableState(entries); + } + + @Override + public @UnknownKeyFor @NonNull @Initialized ReadableState< + @UnknownKeyFor @NonNull @Initialized Iterable> + values() { + ReadableState>> entries = entries(); + return new WindmillMapValuesReadableState(entries); + } + + @Override + public @UnknownKeyFor @NonNull @Initialized ReadableState< + @UnknownKeyFor @NonNull @Initialized Iterable< + Map.@UnknownKeyFor @NonNull @Initialized Entry>> + entries() { + return new WindmillMapEntriesReadableState(); + } + + @Override + public ReadableState isEmpty() { + return new WindmillMapIsEmptyReadableState(); + } + + @Override + public void put(K key, V value) { + V oldValue = cachedValues.put(key, value); + if (valueCoder.consistentWithEquals() && value.equals(oldValue)) { + return; + } + localAdditions.add(key); + localRemovals.remove(key); + negativeCache.remove(key); + } + + @Override + public @UnknownKeyFor @NonNull @Initialized ReadableState computeIfAbsent( + K key, Function mappingFunction) { + Future persistedData = getFutureForKey(key); + try (Closeable scope = scopedReadState()) { + if (localRemovals.contains(key) || negativeCache.contains(key)) { + return ReadableStates.immediate(null); + } + @Nullable V cachedValue = cachedValues.get(key); + if (cachedValue != null || complete) { + return ReadableStates.immediate(cachedValue); + } + + V persistedValue = persistedData.get(); + if (persistedValue == null) { + // This is a new value. Add it to the map and return null. + put(key, mappingFunction.apply(key)); + return ReadableStates.immediate(null); + } + // TODO: Don't do this if it was already in cache. + cachedValues.put(key, persistedValue); + return ReadableStates.immediate(persistedValue); + } catch (InterruptedException | ExecutionException | IOException e) { + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new RuntimeException("Unable to read state", e); + } + } + + @Override + public void remove(K key) { + if (localRemovals.add(key)) { + cachedValues.remove(key); + localAdditions.remove(key); + } + } + + @Override + public void clear() { + cachedValues.clear(); + localAdditions.clear(); + localRemovals.clear(); + negativeCache.clear(); + cleared = true; + complete = true; + } + + private Future getFutureForKey(K key) { + try { + ByteStringOutputStream keyStream = new ByteStringOutputStream(); + stateKeyPrefix.writeTo(keyStream); + keyCoder.encode(key, keyStream, Coder.Context.OUTER); + return reader.valueFuture(keyStream.toByteString(), stateFamily, valueCoder); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private Future>> getFuture() { + if (complete) { + // The caller will merge in local cached values. 
+ return Futures.immediateFuture(Collections.emptyList()); + } else { + return reader.valuePrefixFuture(stateKeyPrefix, stateFamily, valueCoder); + } + } + + private class WindmillMapKeysReadableState implements ReadableState> { + private final ReadableState>> entries; + + public WindmillMapKeysReadableState(ReadableState>> entries) { + this.entries = entries; + } + + @Override + public Iterable read() { + return Iterables.transform(entries.read(), Map.Entry::getKey); + } + + @Override + public @UnknownKeyFor @NonNull @Initialized ReadableState> readLater() { + entries.readLater(); + return this; + } + } + + private class WindmillMapValuesReadableState implements ReadableState> { + private final ReadableState>> entries; + + public WindmillMapValuesReadableState(ReadableState>> entries) { + this.entries = entries; + } + + @Override + public @Nullable Iterable read() { + return Iterables.transform(entries.read(), Map.Entry::getValue); + } + + @Override + public @UnknownKeyFor @NonNull @Initialized ReadableState> readLater() { + entries.readLater(); + return this; + } + } + + private class WindmillMapEntriesReadableState + implements ReadableState>> { + @Override + public Iterable> read() { + if (complete) { + return Iterables.unmodifiableIterable(cachedValues.entrySet()); + } + Future>> persistedData = getFuture(); + try (Closeable scope = scopedReadState()) { + Iterable> data = persistedData.get(); + Iterable> transformedData = + Iterables.transform( + data, + entry -> { + try { + return new AbstractMap.SimpleEntry<>( + userKeyFromProtoKey(entry.getKey()), entry.getValue()); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + + if (data instanceof Weighted) { + // This is a known amount of data. Cache it all. + transformedData.forEach( + e -> { + // The cached data overrides what is read from state, so call putIfAbsent. + cachedValues.putIfAbsent(e.getKey(), e.getValue()); + }); + complete = true; + return Iterables.unmodifiableIterable(cachedValues.entrySet()); + } else { + // This means that the result might be too large to cache, so don't add it to the + // local cache. Instead merge the iterables, giving priority to any local additions + // (represented in cachedValued and localRemovals) that may not have been committed + // yet. + return Iterables.unmodifiableIterable( + Iterables.concat( + cachedValues.entrySet(), + Iterables.filter( + transformedData, + e -> + !cachedValues.containsKey(e.getKey()) + && !localRemovals.contains(e.getKey())))); + } + + } catch (InterruptedException | ExecutionException | IOException e) { + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new RuntimeException("Unable to read state", e); + } + } + + @Override + @SuppressWarnings("FutureReturnValueIgnored") + public @UnknownKeyFor @NonNull @Initialized ReadableState>> + readLater() { + WindmillMap.this.getFuture(); + return this; + } + } + + private class WindmillMapIsEmptyReadableState implements ReadableState { + // TODO(reuvenlax): Can we find a more efficient way of implementing isEmpty than reading + // the entire map? 
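+    // Delegates to keys() so that a complete (fully cached) map answers isEmpty()
+    // without a fetch; otherwise this still pays for a full key scan from Windmill.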
+ final ReadableState> keys = WindmillMap.this.keys(); + + @Override + public @Nullable Boolean read() { + return Iterables.isEmpty(keys.read()); + } + + @Override + public @UnknownKeyFor @NonNull @Initialized ReadableState readLater() { + keys.readLater(); + return this; + } + } + + private class WindmillMapReadResultReadableState implements ReadableState { + private final K key; + private final @Nullable V defaultValue; + + public WindmillMapReadResultReadableState(K key, @Nullable V defaultValue) { + this.key = key; + this.defaultValue = defaultValue; + } + + @Override + public @Nullable V read() { + Future persistedData = getFutureForKey(key); + try (Closeable scope = scopedReadState()) { + if (localRemovals.contains(key) || negativeCache.contains(key)) { + return null; + } + @Nullable V cachedValue = cachedValues.get(key); + if (cachedValue != null || complete) { + return cachedValue; + } + + V persistedValue = persistedData.get(); + if (persistedValue == null) { + negativeCache.add(key); + return defaultValue; + } + // TODO: Don't do this if it was already in cache. + cachedValues.put(key, persistedValue); + return persistedValue; + } catch (InterruptedException | ExecutionException | IOException e) { + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new RuntimeException("Unable to read state", e); + } + } + + @Override + @SuppressWarnings("FutureReturnValueIgnored") + public @UnknownKeyFor @NonNull @Initialized ReadableState readLater() { + WindmillMap.this.getFutureForKey(key); + return this; + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillMultimap.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillMultimap.java new file mode 100644 index 0000000000000..1c0b3df44c21f --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillMultimap.java @@ -0,0 +1,732 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.beam.runners.dataflow.worker.windmill.state;
+
+import static org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateUtil.encodeKey;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.AbstractMap;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Future;
+import java.util.stream.Collectors;
+import org.apache.beam.repackaged.core.org.apache.commons.lang3.tuple.Triple;
+import org.apache.beam.runners.core.StateNamespace;
+import org.apache.beam.runners.core.StateTag;
+import org.apache.beam.runners.dataflow.worker.windmill.Windmill;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.state.MultimapState;
+import org.apache.beam.sdk.state.ReadableState;
+import org.apache.beam.sdk.util.ByteStringOutputStream;
+import org.apache.beam.sdk.util.Weighted;
+import org.apache.beam.vendor.grpc.v1p54p0.com.google.protobuf.ByteString;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterators;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Sets;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.Futures;
+
+@SuppressWarnings({
+  "nullness" // TODO(https://github.com/apache/beam/issues/20497)
+})
+public class WindmillMultimap<K, V> extends SimpleWindmillState implements MultimapState<K, V> {
+
+  private final StateNamespace namespace;
+  private final StateTag<MultimapState<K, V>> address;
+  private final ByteString stateKey;
+  private final String stateFamily;
+  private final Coder<K> keyCoder;
+  private final Coder<V> valueCoder;
+  // Set to true when the user clears the entire multimap, so that we can later send a delete
+  // request to the windmill backend.
+  private boolean cleared = false;
+  // We use the structural value of the keys as the key in keyStateMap, so that different Java
+  // objects with the same content will be treated as the same Multimap key.
+  private Map<Object, KeyState> keyStateMap = Maps.newHashMap();
+  // If true, all keys are cached in keyStateMap with existence == KNOWN_EXIST.
+  private boolean allKeysKnown;
+  // True if all contents of this multimap are cached in this object.
+  private boolean complete;
+  // hasLocalAdditions and hasLocalRemovals track whether there are local changes that need to be
+  // propagated to windmill.
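+  // put() raises hasLocalAdditions, remove() raises hasLocalRemovals, and
+  // persistDirectly() folds both into a TagMultimapUpdateRequest before resetting
+  // them. Illustrative sequence (hypothetical key/values):
+  //   multimap.put(k, v1);     // hasLocalAdditions = true
+  //   multimap.remove(k);      // hasLocalRemovals = true; v1 is discarded locally
+  //   multimap.put(k, v2);     // k re-added under the same structural key
+  //   persistDirectly(cache);  // emits delete-then-add for k, resets both flags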
+ private boolean hasLocalAdditions = false; + private boolean hasLocalRemovals = false; + + WindmillMultimap( + StateNamespace namespace, + StateTag> address, + String stateFamily, + Coder keyCoder, + Coder valueCoder, + boolean isNewShardingKey) { + this.namespace = namespace; + this.address = address; + this.stateKey = encodeKey(namespace, address); + this.stateFamily = stateFamily; + this.keyCoder = keyCoder; + this.valueCoder = valueCoder; + this.complete = isNewShardingKey; + this.allKeysKnown = isNewShardingKey; + } + + private static Iterable> unnestCachedEntries( + Iterable>>> cachedEntries) { + return Iterables.concat( + Iterables.transform( + cachedEntries, + entry -> + Iterables.transform( + entry.getValue().getRight(), + v -> new AbstractMap.SimpleEntry<>(entry.getValue().getLeft(), v)))); + } + + @Override + public void put(K key, V value) { + final Object structuralKey = keyCoder.structuralValue(key); + hasLocalAdditions = true; + keyStateMap.compute( + structuralKey, + (k, v) -> { + if (v == null) v = new KeyState(key); + v.existence = KeyExistence.KNOWN_EXIST; + v.localAdditions.add(value); + return v; + }); + } + + // Initiates a backend state read to fetch all entries if necessary. + private Future>>> necessaryEntriesFromStorageFuture( + boolean omitValues) { + if (complete) { + // Since we're complete, even if there are entries in storage we don't need to read them. + return Futures.immediateFuture(Collections.emptyList()); + } else { + return reader.multimapFetchAllFuture(omitValues, stateKey, stateFamily, valueCoder); + } + } + + // Initiates a backend state read to fetch a single entry if necessary. + private Future> necessaryKeyEntriesFromStorageFuture(K key) { + try { + ByteStringOutputStream keyStream = new ByteStringOutputStream(); + keyCoder.encode(key, keyStream, Coder.Context.OUTER); + return reader.multimapFetchSingleEntryFuture( + keyStream.toByteString(), stateKey, stateFamily, valueCoder); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public ReadableState> get(K key) { + return new ReadResultReadableState(key); + } + + @Override + protected Windmill.WorkItemCommitRequest persistDirectly(WindmillStateCache.ForKeyAndFamily cache) + throws IOException { + if (!cleared && !hasLocalAdditions && !hasLocalRemovals) { + cache.put(namespace, address, this, 1); + return Windmill.WorkItemCommitRequest.newBuilder().buildPartial(); + } + Windmill.WorkItemCommitRequest.Builder commitBuilder = + Windmill.WorkItemCommitRequest.newBuilder(); + Windmill.TagMultimapUpdateRequest.Builder builder = commitBuilder.addMultimapUpdatesBuilder(); + builder.setTag(stateKey).setStateFamily(stateFamily); + + if (cleared) { + builder.setDeleteAll(true); + } + if (hasLocalRemovals || hasLocalAdditions) { + ByteStringOutputStream keyStream = new ByteStringOutputStream(); + ByteStringOutputStream valueStream = new ByteStringOutputStream(); + Iterator> iterator = keyStateMap.entrySet().iterator(); + while (iterator.hasNext()) { + KeyState keyState = iterator.next().getValue(); + if (!keyState.removedLocally && keyState.localAdditions.isEmpty()) { + if (keyState.existence == KeyExistence.KNOWN_NONEXISTENT) iterator.remove(); + continue; + } + keyCoder.encode(keyState.originalKey, keyStream, Coder.Context.OUTER); + ByteString encodedKey = keyStream.toByteStringAndReset(); + Windmill.TagMultimapEntry.Builder entryBuilder = builder.addUpdatesBuilder(); + entryBuilder.setEntryName(encodedKey); + if (keyState.removedLocally) entryBuilder.setDeleteAll(true); 
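+        // The removal is now captured in the commit request; old windmill values are
+        // deleted before any values added below, so the local flag can be reset.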
+ keyState.removedLocally = false; + if (!keyState.localAdditions.isEmpty()) { + for (V value : keyState.localAdditions) { + valueCoder.encode(value, valueStream, Coder.Context.OUTER); + ByteString encodedValue = valueStream.toByteStringAndReset(); + entryBuilder.addValues(encodedValue); + } + // Move newly added values from localAdditions to keyState.values as those new values + // now + // are also persisted in Windmill. If a key now has no more values and is not + // KNOWN_EXIST, + // remove it from cache. + if (keyState.valuesCached) { + keyState.values.extendWith(keyState.localAdditions); + keyState.valuesSize += keyState.localAdditions.size(); + } + // Create a new localAdditions so that the cached values are unaffected. + keyState.localAdditions = Lists.newArrayList(); + } + if (!keyState.valuesCached && keyState.existence != KeyExistence.KNOWN_EXIST) { + iterator.remove(); + } + } + } + + hasLocalAdditions = false; + hasLocalRemovals = false; + cleared = false; + + cache.put(namespace, address, this, 1); + return commitBuilder.buildPartial(); + } + + @Override + public void remove(K key) { + final Object structuralKey = keyCoder.structuralValue(key); + // does not insert key if allKeysKnown. + KeyState keyState = + keyStateMap.computeIfAbsent(structuralKey, k -> allKeysKnown ? null : new KeyState(key)); + if (keyState == null || keyState.existence == KeyExistence.KNOWN_NONEXISTENT) { + return; + } + if (keyState.valuesCached && keyState.valuesSize == 0) { + // no data in windmill, deleting from local cache is sufficient. + keyStateMap.remove(structuralKey); + } else { + // there may be data in windmill that need to be removed. + hasLocalRemovals = true; + keyState.removedLocally = true; + keyState.values = new ConcatIterables<>(); + keyState.valuesSize = 0; + keyState.existence = KeyExistence.KNOWN_NONEXISTENT; + } + if (!keyState.localAdditions.isEmpty()) { + keyState.localAdditions = Lists.newArrayList(); + } + keyState.valuesCached = true; + } + + @Override + public void clear() { + keyStateMap = Maps.newHashMap(); + cleared = true; + complete = true; + allKeysKnown = true; + hasLocalAdditions = false; + hasLocalRemovals = false; + } + + @Override + public ReadableState> keys() { + return new KeysReadableState(); + } + + @Override + public ReadableState>> entries() { + return new EntriesReadableState(); + } + + @Override + public ReadableState containsKey(K key) { + return new ContainsKeyReadableState(key); + } + + // Currently, isEmpty is implemented by reading all keys and could potentially be optimized. + // But note that if isEmpty is often followed by iterating over keys then maybe not too bad; if + // isEmpty is followed by iterating over both keys and values then it won't help much. + @Override + public ReadableState isEmpty() { + return new IsEmptyReadableState(); + } + + private enum KeyExistence { + // this key is known to exist, it has at least 1 value in either localAdditions or windmill + KNOWN_EXIST, + // this key is known to be nonexistent, it has 0 value in both localAdditions and windmill + KNOWN_NONEXISTENT, + // we don't know if this key is in this multimap, it has exact 0 value in localAddition, but + // may have no or any number of values in windmill. This is just to provide a mapping between + // the original key and the structural key. 
+ UNKNOWN_EXISTENCE + } + + private class KeyState { + final K originalKey; + KeyExistence existence; + // valuesCached can be true if only existence == KNOWN_EXIST and all values of this key are + // cached (both values and localAdditions). + boolean valuesCached; + // Represents the values in windmill. When new values are added during user processing, they + // are added to localAdditions but not values. Those new values will be added to values only + // after they are persisted into windmill and removed from localAdditions + ConcatIterables values; + int valuesSize; + + // When new values are added during user processing, they are added to localAdditions, so that + // we can later try to persist them in windmill. When a key is removed during user processing, + // we mark removedLocally to be true so that we can later try to delete it from windmill. If + // localAdditions is not empty and removedLocally is true, values in localAdditions will be + // added to windmill after old values in windmill are removed. + List localAdditions; + boolean removedLocally; + + KeyState(K originalKey) { + this.originalKey = originalKey; + existence = KeyExistence.UNKNOWN_EXISTENCE; + valuesCached = complete; + values = new ConcatIterables<>(); + valuesSize = 0; + localAdditions = Lists.newArrayList(); + removedLocally = false; + } + } + + private class ReadResultReadableState implements ReadableState> { + final Object structuralKey; + private final K key; + + public ReadResultReadableState(K key) { + this.key = key; + structuralKey = keyCoder.structuralValue(key); + } + + @Override + public Iterable read() { + KeyState keyState = null; + if (allKeysKnown) { + keyState = keyStateMap.get(structuralKey); + if (keyState == null || keyState.existence == KeyExistence.UNKNOWN_EXISTENCE) { + if (keyState != null) keyStateMap.remove(structuralKey); + return Collections.emptyList(); + } + } else { + keyState = keyStateMap.computeIfAbsent(structuralKey, k -> new KeyState(key)); + } + if (keyState.existence == KeyExistence.KNOWN_NONEXISTENT) { + return Collections.emptyList(); + } + Iterable localNewValues = + Iterables.limit(keyState.localAdditions, keyState.localAdditions.size()); + if (keyState.removedLocally) { + // this key has been removed locally but the removal hasn't been sent to windmill, + // thus values in windmill(if any) are obsolete, and we only care about local values. + return Iterables.unmodifiableIterable(localNewValues); + } + if (keyState.valuesCached || complete) { + return Iterables.unmodifiableIterable( + Iterables.concat( + Iterables.limit(keyState.values, keyState.valuesSize), localNewValues)); + } + Future> persistedData = necessaryKeyEntriesFromStorageFuture(key); + try (Closeable scope = scopedReadState()) { + final Iterable persistedValues = persistedData.get(); + // Iterables.isEmpty() is O(1). + if (Iterables.isEmpty(persistedValues)) { + if (keyState.localAdditions.isEmpty()) { + // empty in both cache and windmill, mark key as KNOWN_NONEXISTENT. 
+ keyState.existence = KeyExistence.KNOWN_NONEXISTENT; + return Collections.emptyList(); + } + return Iterables.unmodifiableIterable(localNewValues); + } + keyState.existence = KeyExistence.KNOWN_EXIST; + if (persistedValues instanceof Weighted) { + keyState.valuesCached = true; + ConcatIterables it = new ConcatIterables<>(); + it.extendWith(persistedValues); + keyState.values = it; + keyState.valuesSize = Iterables.size(persistedValues); + } + return Iterables.unmodifiableIterable(Iterables.concat(persistedValues, localNewValues)); + } catch (InterruptedException | ExecutionException | IOException e) { + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new RuntimeException("Unable to read Multimap state", e); + } + } + + @Override + @SuppressWarnings("FutureReturnValueIgnored") + public ReadableState> readLater() { + WindmillMultimap.this.necessaryKeyEntriesFromStorageFuture(key); + return this; + } + } + + private class KeysReadableState implements ReadableState> { + + private Map cachedExistKeys() { + return keyStateMap.entrySet().stream() + .filter(entry -> entry.getValue().existence == KeyExistence.KNOWN_EXIST) + .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().originalKey)); + } + + @Override + public Iterable read() { + if (allKeysKnown) { + return Iterables.unmodifiableIterable(cachedExistKeys().values()); + } + Future>>> persistedData = + necessaryEntriesFromStorageFuture(true); + try (Closeable scope = scopedReadState()) { + Iterable>> entries = persistedData.get(); + if (entries instanceof Weighted) { + // This is a known amount of data, cache them all. + entries.forEach( + entry -> { + try { + K originalKey = keyCoder.decode(entry.getKey().newInput(), Coder.Context.OUTER); + KeyState keyState = + keyStateMap.computeIfAbsent( + keyCoder.structuralValue(originalKey), stk -> new KeyState(originalKey)); + if (keyState.existence == KeyExistence.UNKNOWN_EXISTENCE) { + keyState.existence = KeyExistence.KNOWN_EXIST; + } + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + allKeysKnown = true; + keyStateMap + .values() + .removeIf( + keyState -> + keyState.existence != KeyExistence.KNOWN_EXIST && !keyState.removedLocally); + return Iterables.unmodifiableIterable(cachedExistKeys().values()); + } else { + Map cachedExistKeys = Maps.newHashMap(); + Set cachedNonExistKeys = Sets.newHashSet(); + keyStateMap.forEach( + (structuralKey, keyState) -> { + switch (keyState.existence) { + case KNOWN_EXIST: + cachedExistKeys.put(structuralKey, keyState.originalKey); + break; + case KNOWN_NONEXISTENT: + cachedNonExistKeys.add(structuralKey); + break; + default: + break; + } + }); + // keysOnlyInWindmill is lazily loaded. 
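+          // (Iterables.transform/filter build lazy views: nothing is fetched or
+          // decoded until iteration, and keys already resolved locally are skipped
+          // as windmill pages stream in.)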
+ Iterable keysOnlyInWindmill = + Iterables.filter( + Iterables.transform( + entries, + entry -> { + try { + K originalKey = + keyCoder.decode(entry.getKey().newInput(), Coder.Context.OUTER); + Object structuralKey = keyCoder.structuralValue(originalKey); + if (cachedExistKeys.containsKey(structuralKey) + || cachedNonExistKeys.contains(structuralKey)) return null; + return originalKey; + } catch (IOException e) { + throw new RuntimeException(e); + } + }), + Objects::nonNull); + return Iterables.unmodifiableIterable( + Iterables.concat(cachedExistKeys.values(), keysOnlyInWindmill)); + } + } catch (InterruptedException | ExecutionException | IOException e) { + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new RuntimeException("Unable to read state", e); + } + } + + @Override + @SuppressWarnings("FutureReturnValueIgnored") + public ReadableState> readLater() { + WindmillMultimap.this.necessaryEntriesFromStorageFuture(true); + return this; + } + } + + private class EntriesReadableState implements ReadableState>> { + @Override + public Iterable> read() { + if (complete) { + return Iterables.unmodifiableIterable( + unnestCachedEntries(mergedCachedEntries(null).entrySet())); + } + Future>>> persistedData = + necessaryEntriesFromStorageFuture(false); + try (Closeable scope = scopedReadState()) { + Iterable>> entries = persistedData.get(); + if (Iterables.isEmpty(entries)) { + complete = true; + allKeysKnown = true; + return Iterables.unmodifiableIterable( + unnestCachedEntries(mergedCachedEntries(null).entrySet())); + } + if (!(entries instanceof Weighted)) { + return nonWeightedEntries(entries); + } + // This is a known amount of data, cache them all. + entries.forEach( + entry -> { + try { + final K originalKey = + keyCoder.decode(entry.getKey().newInput(), Coder.Context.OUTER); + final Object structuralKey = keyCoder.structuralValue(originalKey); + KeyState keyState = + keyStateMap.computeIfAbsent(structuralKey, k -> new KeyState(originalKey)); + // Ignore any key from windmill that has been marked pending deletion or is + // fully cached. + if (keyState.existence == KeyExistence.KNOWN_NONEXISTENT + || (keyState.existence == KeyExistence.KNOWN_EXIST && keyState.valuesCached)) + return; + // Or else cache contents from windmill. + keyState.existence = KeyExistence.KNOWN_EXIST; + keyState.values.extendWith(entry.getValue()); + keyState.valuesSize += Iterables.size(entry.getValue()); + keyState.valuesCached = true; + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + allKeysKnown = true; + complete = true; + return Iterables.unmodifiableIterable( + unnestCachedEntries(mergedCachedEntries(null).entrySet())); + } catch (InterruptedException | ExecutionException | IOException e) { + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new RuntimeException("Unable to read state", e); + } + } + + @Override + @SuppressWarnings("FutureReturnValueIgnored") + public ReadableState>> readLater() { + WindmillMultimap.this.necessaryEntriesFromStorageFuture(false); + return this; + } + + /** + * Collect all cached entries into a map and all KNOWN_NONEXISTENT keys to + * knownNonexistentKeys(if not null). 
Note that this method is not side-effect-free: it unloads + * any key that is not KNOWN_EXIST and not pending deletion from cache; also if complete it + * marks the valuesCached of any key that is KNOWN_EXIST to true, entries() depends on this + * behavior when the fetched result is weighted to iterate the whole keyStateMap one less time. + * For each cached key, returns its structural key and a tuple of . + */ + private Map>> mergedCachedEntries( + Set knownNonexistentKeys) { + Map>> cachedEntries = Maps.newHashMap(); + keyStateMap + .entrySet() + .removeIf( + (entry -> { + Object structuralKey = entry.getKey(); + KeyState keyState = entry.getValue(); + if (complete && keyState.existence == KeyExistence.KNOWN_EXIST) { + keyState.valuesCached = true; + } + ConcatIterables it = null; + if (!keyState.localAdditions.isEmpty()) { + it = new ConcatIterables<>(); + it.extendWith( + Iterables.limit(keyState.localAdditions, keyState.localAdditions.size())); + } + if (keyState.valuesCached) { + if (it == null) it = new ConcatIterables<>(); + it.extendWith(Iterables.limit(keyState.values, keyState.valuesSize)); + } + if (it != null) { + cachedEntries.put( + structuralKey, Triple.of(keyState.originalKey, keyState.valuesCached, it)); + } + if (knownNonexistentKeys != null + && keyState.existence == KeyExistence.KNOWN_NONEXISTENT) + knownNonexistentKeys.add(structuralKey); + return (keyState.existence == KeyExistence.KNOWN_NONEXISTENT + && !keyState.removedLocally) + || keyState.existence == KeyExistence.UNKNOWN_EXISTENCE; + })); + return cachedEntries; + } + + private Iterable> nonWeightedEntries( + Iterable>> lazyWindmillEntries) { + class ResultIterable implements Iterable> { + private final Iterable>> lazyWindmillEntries; + private final Map>> cachedEntries; + private final Set knownNonexistentKeys; + + ResultIterable( + Map>> cachedEntries, + Iterable>> lazyWindmillEntries, + Set knownNonexistentKeys) { + this.cachedEntries = cachedEntries; + this.lazyWindmillEntries = lazyWindmillEntries; + this.knownNonexistentKeys = knownNonexistentKeys; + } + + @Override + public Iterator> iterator() { + // Each time when the Iterable returned by entries() is iterated, a new Iterator is + // created. Every iterator must keep its own copy of seenCachedKeys so that if a key + // is paginated into multiple iterables from windmill, the cached values of this key + // will only be returned once. + Set seenCachedKeys = Sets.newHashSet(); + // notFullyCachedEntries returns all entries from windmill that are not fully cached + // and combines them with localAdditions. If a key is fully cached, contents of this + // key from windmill are ignored. + Iterable>> notFullyCachedEntries = + Iterables.filter( + Iterables.transform( + lazyWindmillEntries, + entry -> { + try { + final K key = + keyCoder.decode(entry.getKey().newInput(), Coder.Context.OUTER); + final Object structuralKey = keyCoder.structuralValue(key); + // key is deleted in cache thus fully cached. + if (knownNonexistentKeys.contains(structuralKey)) return null; + Triple> triple = + cachedEntries.get(structuralKey); + // no record of key in cache, return content in windmill. + if (triple == null) { + return Triple.of(structuralKey, key, entry.getValue()); + } + // key is fully cached in cache. + if (triple.getMiddle()) return null; + + // key is not fully cached, combine the content in windmill with local + // additions with only the first observed page for the key to ensure + // it is not repeated. 
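+                              // (seenCachedKeys.add() returns false for a key already
+                              // observed on an earlier page, in which case only the
+                              // windmill values are emitted, without the cached copy.)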
+ if (!seenCachedKeys.add(structuralKey)) { + return Triple.of(structuralKey, key, entry.getValue()); + } else { + ConcatIterables it = new ConcatIterables<>(); + it.extendWith(triple.getRight()); + it.extendWith(entry.getValue()); + return Triple.of(structuralKey, key, it); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + }), + Objects::nonNull); + Iterator> unnestWindmill = + Iterators.concat( + Iterables.transform( + notFullyCachedEntries, + entry -> + Iterables.transform( + entry.getRight(), + v -> new AbstractMap.SimpleEntry<>(entry.getMiddle(), v)) + .iterator()) + .iterator()); + Iterator> fullyCached = + unnestCachedEntries( + Iterables.filter( + cachedEntries.entrySet(), + entry -> !seenCachedKeys.contains(entry.getKey()))) + .iterator(); + return Iterators.concat(unnestWindmill, fullyCached); + } + } + + Set knownNonexistentKeys = Sets.newHashSet(); + Map>> cachedEntries = + mergedCachedEntries(knownNonexistentKeys); + return Iterables.unmodifiableIterable( + new ResultIterable(cachedEntries, lazyWindmillEntries, knownNonexistentKeys)); + } + } + + private class ContainsKeyReadableState implements ReadableState { + final Object structuralKey; + private final K key; + ReadableState> values; + + public ContainsKeyReadableState(K key) { + this.key = key; + structuralKey = keyCoder.structuralValue(key); + values = null; + } + + @Override + public Boolean read() { + KeyState keyState = keyStateMap.getOrDefault(structuralKey, null); + if (keyState != null && keyState.existence != KeyExistence.UNKNOWN_EXISTENCE) { + return keyState.existence == KeyExistence.KNOWN_EXIST; + } + if (values == null) { + values = WindmillMultimap.this.get(key); + } + return !Iterables.isEmpty(values.read()); + } + + @Override + public ReadableState readLater() { + if (values == null) { + values = WindmillMultimap.this.get(key); + } + values.readLater(); + return this; + } + } + + private class IsEmptyReadableState implements ReadableState { + ReadableState> keys = null; + + @Override + public Boolean read() { + for (KeyState keyState : keyStateMap.values()) { + if (keyState.existence == KeyExistence.KNOWN_EXIST) { + return false; + } + } + if (keys == null) { + keys = WindmillMultimap.this.keys(); + } + return Iterables.isEmpty(keys.read()); + } + + @Override + public ReadableState readLater() { + if (keys == null) { + keys = WindmillMultimap.this.keys(); + } + keys.readLater(); + return this; + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillOrderedList.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillOrderedList.java new file mode 100644 index 0000000000000..c92e2e93ddfec --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillOrderedList.java @@ -0,0 +1,305 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import static org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateUtil.encodeKey; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Collections; +import java.util.Iterator; +import java.util.SortedSet; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import org.apache.beam.runners.core.StateNamespace; +import org.apache.beam.runners.core.StateTable; +import org.apache.beam.runners.core.StateTag; +import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.state.OrderedListState; +import org.apache.beam.sdk.state.ReadableState; +import org.apache.beam.sdk.util.ByteStringOutputStream; +import org.apache.beam.sdk.values.TimestampedValue; +import org.apache.beam.vendor.grpc.v1p54p0.com.google.protobuf.ByteString; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Range; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.RangeSet; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Sets; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.TreeRangeSet; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.Futures; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Instant; + +@SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) +}) +public class WindmillOrderedList extends SimpleWindmillState implements OrderedListState { + // The default proto values for SortedListRange correspond to the minimum and maximum + // timestamps. + static final long MIN_TS_MICROS = Windmill.SortedListRange.getDefaultInstance().getStart(); + static final long MAX_TS_MICROS = Windmill.SortedListRange.getDefaultInstance().getLimit(); + private final ByteString stateKey; + private final String stateFamily; + private final Coder elemCoder; + // We need to sort based on timestamp, but we need objects with the same timestamp to be treated + // as unique. We can't use a MultiSet as we can't construct a comparator that uniquely + // identifies objects, + // so we construct a unique in-memory long ids for each element. 
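A compact illustration of the trick described in that comment: order by timestamp, then break ties with an in-memory id so a TreeSet can hold several values at the same timestamp. The names below are hypothetical stand-ins for TimestampedValueWithId and its comparator:

```java
import java.util.Comparator;
import java.util.TreeSet;

final class TsWithId<T> {
  final T value;
  final long timestampMicros;
  final long id; // in-memory tie-breaker only; never sent to Windmill

  TsWithId(T value, long timestampMicros, long id) {
    this.value = value;
    this.timestampMicros = timestampMicros;
    this.id = id;
  }

  // Order by timestamp first, then by id, so equal timestamps remain distinct.
  static final Comparator<TsWithId<?>> COMPARATOR =
      Comparator.comparingLong((TsWithId<?> v) -> v.timestampMicros)
          .thenComparingLong(v -> v.id);
}

class SortedAddsDemo {
  public static void main(String[] args) {
    TreeSet<TsWithId<String>> pendingAdds = new TreeSet<>(TsWithId.COMPARATOR);
    // Both survive despite the identical timestamp, because their ids differ.
    pendingAdds.add(new TsWithId<>("a", 1_000L, 0));
    pendingAdds.add(new TsWithId<>("b", 1_000L, 1));
    System.out.println(pendingAdds.size()); // 2
  }
}
```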
+ private final SortedSet> pendingAdds = + Sets.newTreeSet(TimestampedValueWithId.COMPARATOR); + private final RangeSet pendingDeletes = TreeRangeSet.create(); + private final IdTracker idTracker; + private boolean complete; + private boolean cleared = false; + + WindmillOrderedList( + StateTable derivedStateTable, + StateNamespace namespace, + StateTag> spec, + String stateFamily, + Coder elemCoder, + boolean isNewKey) { + + this.stateKey = encodeKey(namespace, spec); + this.stateFamily = stateFamily; + this.elemCoder = elemCoder; + this.complete = isNewKey; + this.idTracker = new IdTracker(derivedStateTable, namespace, spec); + } + + @Override + public Iterable> read() { + return readRange(null, null); + } + + private SortedSet> getPendingAddRange( + @Nullable Instant minTimestamp, @Nullable Instant limitTimestamp) { + SortedSet> pendingInRange = pendingAdds; + if (minTimestamp != null && limitTimestamp != null) { + pendingInRange = + pendingInRange.subSet( + TimestampedValueWithId.bound(minTimestamp), + TimestampedValueWithId.bound(limitTimestamp)); + } else if (minTimestamp == null && limitTimestamp != null) { + pendingInRange = pendingInRange.headSet(TimestampedValueWithId.bound(limitTimestamp)); + } else if (limitTimestamp == null && minTimestamp != null) { + pendingInRange = pendingInRange.tailSet(TimestampedValueWithId.bound(minTimestamp)); + } + return pendingInRange; + } + + @Override + public Iterable> readRange( + @Nullable Instant minTimestamp, @Nullable Instant limitTimestamp) { + idTracker.readLater(); + + final Future>> future = getFuture(minTimestamp, limitTimestamp); + try (Closeable ignored = scopedReadState()) { + SortedSet> pendingInRange = + getPendingAddRange(minTimestamp, limitTimestamp); + + // Transform the return iterator, so it has the same type as pendingAdds. We need to ensure + // that the ids don't overlap with any in pendingAdds, so begin with pendingAdds.size(). + Iterable> data = + new Iterable>() { + // Anything returned from windmill that has been deleted should be ignored. + private final Iterable> iterable = + Iterables.filter(future.get(), tv -> !pendingDeletes.contains(tv.getTimestamp())); + + @Override + public Iterator> iterator() { + return new Iterator>() { + private final Iterator> iter = iterable.iterator(); + private long currentId = pendingAdds.size(); + + @Override + public boolean hasNext() { + return iter.hasNext(); + } + + @Override + public TimestampedValueWithId next() { + return TimestampedValueWithId.of(iter.next(), currentId++); + } + }; + } + }; + + Iterable> includingAdds = + Iterables.mergeSorted( + ImmutableList.of(data, pendingInRange), TimestampedValueWithId.COMPARATOR); + + // TODO(reuvenlax): If we have a known bounded amount of data, cache known ranges. 
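readRange above combines backend pages with pendingInRange via Iterables.mergeSorted, which lazily interleaves inputs that are each already sorted. A self-contained sketch of that behavior (plain Guava here; the worker uses Beam's vendored copy):

```java
import java.util.Comparator;
import java.util.List;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;

class MergeSortedDemo {
  public static void main(String[] args) {
    List<Integer> fromBackend = ImmutableList.of(1, 3, 5);
    List<Integer> pendingAdds = ImmutableList.of(2, 3, 6);
    // Each input must already be sorted; the merge keeps duplicates from both sides.
    Iterable<Integer> merged =
        Iterables.mergeSorted(
            ImmutableList.of(fromBackend, pendingAdds), Comparator.<Integer>naturalOrder());
    System.out.println(ImmutableList.copyOf(merged)); // [1, 2, 3, 3, 5, 6]
  }
}
```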
+ return Iterables.transform(includingAdds, TimestampedValueWithId::getValue); + } catch (InterruptedException | ExecutionException | IOException e) { + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new RuntimeException("Unable to read state", e); + } + } + + @Override + public void clear() { + cleared = true; + complete = true; + pendingAdds.clear(); + pendingDeletes.clear(); + try { + idTracker.clear(); + } catch (ExecutionException | InterruptedException e) { + throw new RuntimeException(e); + } + } + + @Override + public void clearRange(Instant minTimestamp, Instant limitTimestamp) { + getPendingAddRange(minTimestamp, limitTimestamp).clear(); + pendingDeletes.add(Range.closedOpen(minTimestamp, limitTimestamp)); + } + + @Override + public void add(TimestampedValue value) { + // We use the current size of the container as the in-memory id. This works because + // pendingAdds is completely + // cleared when it is processed (otherwise we could end up with duplicate elements in the same + // container). These + // are not the ids that will be sent to windmill. + pendingAdds.add(TimestampedValueWithId.of(value, pendingAdds.size())); + // Leave pendingDeletes alone. Since we can have multiple values with the same timestamp, we + // may still need + // overlapping deletes to remove previous entries at this timestamp. + } + + @Override + public ReadableState isEmpty() { + return new ReadableState() { + @Override + public ReadableState readLater() { + WindmillOrderedList.this.readLater(); + return this; + } + + @Override + public Boolean read() { + return Iterables.isEmpty(WindmillOrderedList.this.read()); + } + }; + } + + @Override + public OrderedListState readLater() { + return readRangeLater(null, null); + } + + @Override + @SuppressWarnings("FutureReturnValueIgnored") + public OrderedListState readRangeLater( + @Nullable Instant minTimestamp, @Nullable Instant limitTimestamp) { + idTracker.readLater(); + getFuture(minTimestamp, limitTimestamp); + return this; + } + + @Override + public Windmill.WorkItemCommitRequest persistDirectly(WindmillStateCache.ForKeyAndFamily cache) + throws IOException { + Windmill.WorkItemCommitRequest.Builder commitBuilder = + Windmill.WorkItemCommitRequest.newBuilder(); + Windmill.TagSortedListUpdateRequest.Builder updatesBuilder = + commitBuilder + .addSortedListUpdatesBuilder() + .setStateFamily(cache.getStateFamily()) + .setTag(stateKey); + try { + if (cleared) { + // Default range. + updatesBuilder.addDeletesBuilder().build(); + cleared = false; + } + + if (!pendingAdds.isEmpty()) { + // TODO(reuvenlax): Once we start caching data, we should remove this line. We have it + // here now + // because once we persist + // added data we forget about it from the cache, so the object is no longer complete. 
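clearRange above records deletions in a Guava TreeRangeSet: connected ranges coalesce, and contains() is what lets reads skip deleted timestamps. A small standalone demo with timestamps simplified to longs:

```java
import com.google.common.collect.Range;
import com.google.common.collect.RangeSet;
import com.google.common.collect.TreeRangeSet;

class PendingDeletesDemo {
  public static void main(String[] args) {
    RangeSet<Long> pendingDeletes = TreeRangeSet.create();
    pendingDeletes.add(Range.closedOpen(0L, 10L));
    pendingDeletes.add(Range.closedOpen(10L, 20L)); // coalesces with [0, 10)
    System.out.println(pendingDeletes.asRanges()); // one coalesced range [0, 20)
    System.out.println(pendingDeletes.contains(5L)); // true: filtered from reads
    System.out.println(pendingDeletes.contains(20L)); // false: the limit is exclusive
  }
}
```

The same range set drives persistDirectly below, which emits one TagSortedListDeleteRequest per coalesced range.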
+ complete = false; + + Windmill.TagSortedListInsertRequest.Builder insertBuilder = + updatesBuilder.addInsertsBuilder(); + idTracker.add( + pendingAdds, + (elem, id) -> { + try { + ByteStringOutputStream elementStream = new ByteStringOutputStream(); + elemCoder.encode(elem.getValue(), elementStream, Coder.Context.OUTER); + insertBuilder.addEntries( + Windmill.SortedListEntry.newBuilder() + .setValue(elementStream.toByteString()) + .setSortKey( + WindmillTimeUtils.harnessToWindmillTimestamp(elem.getTimestamp())) + .setId(id)); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + pendingAdds.clear(); + insertBuilder.build(); + } + + if (!pendingDeletes.isEmpty()) { + for (Range range : pendingDeletes.asRanges()) { + Windmill.TagSortedListDeleteRequest.Builder deletesBuilder = + updatesBuilder.addDeletesBuilder(); + deletesBuilder.setRange( + Windmill.SortedListRange.newBuilder() + .setStart(WindmillTimeUtils.harnessToWindmillTimestamp(range.lowerEndpoint())) + .setLimit(WindmillTimeUtils.harnessToWindmillTimestamp(range.upperEndpoint()))); + deletesBuilder.build(); + idTracker.remove(range); + } + pendingDeletes.clear(); + } + } catch (ExecutionException | InterruptedException e) { + throw new RuntimeException(e); + } + return commitBuilder.buildPartial(); + } + + private Future>> getFuture( + @Nullable Instant minTimestamp, @Nullable Instant limitTimestamp) { + long startSortKey = + minTimestamp != null + ? WindmillTimeUtils.harnessToWindmillTimestamp(minTimestamp) + : MIN_TS_MICROS; + long limitSortKey = + limitTimestamp != null + ? WindmillTimeUtils.harnessToWindmillTimestamp(limitTimestamp) + : MAX_TS_MICROS; + + if (complete) { + // Right now we don't cache any data, so complete means an empty list. + // TODO(reuvenlax): change this once we start caching data. + return Futures.immediateFuture(Collections.emptyList()); + } + return reader.orderedListFuture( + Range.closedOpen(startSortKey, limitSortKey), stateKey, stateFamily, elemCoder); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillSet.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillSet.java new file mode 100644 index 0000000000000..4afb879e722e9 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillSet.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Optional; +import org.apache.beam.runners.core.StateNamespace; +import org.apache.beam.runners.core.StateTag; +import org.apache.beam.runners.core.StateTags; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.sdk.coders.BooleanCoder; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.state.MapState; +import org.apache.beam.sdk.state.ReadableState; +import org.apache.beam.sdk.state.SetState; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier; +import org.checkerframework.checker.initialization.qual.Initialized; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.checkerframework.checker.nullness.qual.UnknownKeyFor; + +public class WindmillSet extends SimpleWindmillState implements SetState { + private final WindmillMap windmillMap; + + WindmillSet( + StateNamespace namespace, + StateTag> address, + String stateFamily, + Coder keyCoder, + WindmillStateCache.ForKeyAndFamily cache, + boolean isNewKey) { + StateTag> internalMapAddress = StateTags.convertToMapTagInternal(address); + + this.windmillMap = + cache + .get(namespace, internalMapAddress) + .map(map -> (WindmillMap) map) + .orElseGet( + () -> + new WindmillMap<>( + namespace, + internalMapAddress, + stateFamily, + keyCoder, + BooleanCoder.of(), + isNewKey)); + } + + @Override + protected Windmill.WorkItemCommitRequest persistDirectly(WindmillStateCache.ForKeyAndFamily cache) + throws IOException { + return windmillMap.persistDirectly(cache); + } + + @Override + public @UnknownKeyFor @NonNull @Initialized ReadableState< + @UnknownKeyFor @NonNull @Initialized Boolean> + contains(K k) { + return windmillMap.getOrDefault(k, false); + } + + @Override + public @UnknownKeyFor @NonNull @Initialized ReadableState< + @UnknownKeyFor @NonNull @Initialized Boolean> + addIfAbsent(K k) { + return new WindmillSetAddIfAbsentReadableState(k); + } + + @Override + public void remove(K k) { + windmillMap.remove(k); + } + + @Override + public void add(K value) { + windmillMap.put(value, true); + } + + @Override + public @UnknownKeyFor @NonNull @Initialized ReadableState< + @UnknownKeyFor @NonNull @Initialized Boolean> + isEmpty() { + return windmillMap.isEmpty(); + } + + @Override + public Iterable read() { + return windmillMap.keys().read(); + } + + @Override + public @UnknownKeyFor @NonNull @Initialized SetState readLater() { + windmillMap.keys().readLater(); + return this; + } + + @Override + public void clear() { + windmillMap.clear(); + } + + @Override + void initializeForWorkItem( + WindmillStateReader reader, Supplier scopedReadStateSupplier) { + windmillMap.initializeForWorkItem(reader, scopedReadStateSupplier); + } + + @Override + void cleanupAfterWorkItem() { + windmillMap.cleanupAfterWorkItem(); + } + + private class WindmillSetAddIfAbsentReadableState implements ReadableState { + ReadableState putState; + + public WindmillSetAddIfAbsentReadableState(K k) { + putState = windmillMap.putIfAbsent(k, true); + } + + @Override + public Boolean read() { + return Optional.ofNullable(putState.read()).orElse(false); + } + + @Override + public @UnknownKeyFor @NonNull @Initialized ReadableState readLater() { + putState = putState.readLater(); + return this; + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillState.java 
b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillState.java new file mode 100644 index 0000000000000..59fd3f8a1b379 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillState.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import java.io.Closeable; +import java.io.IOException; +import java.util.concurrent.Future; +import javax.annotation.concurrent.NotThreadSafe; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier; + +/** + * Abstract base class for all Windmill state. + * + *
<p>
Note that these are not thread safe; each state object is associated with a key and thus only + * accessed by a single thread at once. + */ +@SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) +}) +@NotThreadSafe +abstract class WindmillState { + protected Supplier scopedReadStateSupplier; + protected WindmillStateReader reader; + + /** + * Return an asynchronously computed {@link Windmill.WorkItemCommitRequest}. The request should be + * of a form that can be merged with others (only add to repeated fields). + */ + abstract Future persist(WindmillStateCache.ForKeyAndFamily cache) + throws IOException; + + /** Prepare this (possibly reused from cache) state for reading from {@code reader} if needed. */ + void initializeForWorkItem( + WindmillStateReader reader, Supplier scopedReadStateSupplier) { + this.reader = reader; + this.scopedReadStateSupplier = scopedReadStateSupplier; + } + + /** + * This (now cached) state should never need to interact with the reader until the next work item. + * Clear it to prevent space leaks. The reader will be reset by {@link #initializeForWorkItem} + * upon the next work item. + */ + void cleanupAfterWorkItem() { + this.reader = null; + this.scopedReadStateSupplier = null; + } + + Closeable scopedReadState() { + return scopedReadStateSupplier.get(); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WindmillStateCache.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateCache.java similarity index 92% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WindmillStateCache.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateCache.java index 700c7bbe01c22..6c1239d6ebd2f 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WindmillStateCache.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateCache.java @@ -15,12 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.beam.runners.dataflow.worker; +package org.apache.beam.runners.dataflow.worker.windmill.state; import java.io.IOException; import java.io.PrintWriter; import java.util.HashMap; import java.util.Objects; +import java.util.Optional; import java.util.concurrent.ConcurrentMap; import java.util.function.BiConsumer; import javax.servlet.http.HttpServletRequest; @@ -28,6 +29,9 @@ import org.apache.beam.runners.core.StateNamespace; import org.apache.beam.runners.core.StateTag; import org.apache.beam.runners.core.StateTags; +import org.apache.beam.runners.dataflow.worker.StreamingDataflowWorker; +import org.apache.beam.runners.dataflow.worker.Weighers; +import org.apache.beam.runners.dataflow.worker.WindmillComputationKey; import org.apache.beam.runners.dataflow.worker.status.BaseStatusServlet; import org.apache.beam.runners.dataflow.worker.status.StatusDataProvider; import org.apache.beam.sdk.state.State; @@ -84,14 +88,6 @@ public WindmillStateCache(long workerCacheMb) { .build(); } - private static class EntryStats { - long entries; - long idWeight; - long entryWeight; - long entryValues; - long maxEntryValues; - } - private EntryStats calculateEntryStats() { EntryStats stats = new EntryStats(); BiConsumer consumer = @@ -119,130 +115,52 @@ public CacheStats getCacheStats() { return stateCache.stats(); } - /** Per-computation view of the state cache. */ - public class ForComputation { - - private final String computation; - - private ForComputation(String computation) { - this.computation = computation; - } - - /** Invalidate all cache entries for this computation and {@code processingKey}. */ - public void invalidate(ByteString processingKey, long shardingKey) { - WindmillComputationKey key = - WindmillComputationKey.create(computation, processingKey, shardingKey); - // By removing the ForKey object, all state for the key is orphaned in the cache and will - // be removed by normal cache cleanup. - keyIndex.remove(key); - } - - /** - * Returns a per-computation, per-key view of the state cache. Access to the cached data for - * this key is not thread-safe. Callers should ensure that there is only a single ForKey object - * in use at a time and that access to it is synchronized or single-threaded. - */ - public ForKey forKey(WindmillComputationKey computationKey, long cacheToken, long workToken) { - ForKey forKey = keyIndex.get(computationKey); - if (forKey == null || !forKey.updateTokens(cacheToken, workToken)) { - forKey = new ForKey(computationKey, cacheToken, workToken); - // We prefer this implementation to using compute because that is implemented similarly for - // ConcurrentHashMap with the downside of it performing inserts for unchanged existing - // values as well. - keyIndex.put(computationKey, forKey); - } - return forKey; - } + /** Returns a per-computation view of the state cache. */ + public ForComputation forComputation(String computation) { + return new ForComputation(computation); } - /** Per-computation, per-key view of the state cache. */ - // Note that we utilize the default equality and hashCode for this class based upon the instance - // (instead of the fields) to optimize cache invalidation. - public class ForKey { - private final WindmillComputationKey computationKey; - // Cache token must be consistent for the key for the cache to be valid. - private final long cacheToken; - - // The work token for processing must be greater than the last work token. 
As work items are - // increasing for a key, a less-than or equal to work token indicates that the current token is - // for stale processing. - private long workToken; - - /** - * Returns a per-computation, per-key, per-family view of the state cache. Access to the cached - * data for this key is not thread-safe. Callers should ensure that there is only a single - * ForKeyAndFamily object in use at a time for a given computation, key, family tuple and that - * access to it is synchronized or single-threaded. - */ - public ForKeyAndFamily forFamily(String stateFamily) { - return new ForKeyAndFamily(this, stateFamily); - } - - private ForKey(WindmillComputationKey computationKey, long cacheToken, long workToken) { - this.computationKey = computationKey; - this.cacheToken = cacheToken; - this.workToken = workToken; - } - - private boolean updateTokens(long cacheToken, long workToken) { - if (this.cacheToken != cacheToken || workToken <= this.workToken) { - return false; - } - this.workToken = workToken; - return true; - } + /** Print summary statistics of the cache to the given {@link PrintWriter}. */ + @Override + public void appendSummaryHtml(PrintWriter response) { + response.println("Cache Stats:
"); + response.println( + "" + + "" + + "" + + ""); + CacheStats cacheStats = stateCache.stats(); + EntryStats entryStats = calculateEntryStats(); + response.println(""); + response.println(""); + response.println(""); + response.println(""); + response.println(""); + response.println(""); + response.println(""); + response.println(""); + response.println(""); + response.println("
Hit RatioEvictionsEntriesEntry ValuesMax Entry ValuesId WeightEntry WeightMax WeightKeys
" + cacheStats.hitRate() + "" + cacheStats.evictionCount() + "" + entryStats.entries + "(" + stateCache.size() + " inc. weak) " + entryStats.entryValues + "" + entryStats.maxEntryValues + "" + entryStats.idWeight / MEGABYTES + "MB" + entryStats.entryWeight / MEGABYTES + "MB" + getMaxWeight() / MEGABYTES + "MB" + keyIndex.size() + "

"); } - /** - * Per-computation, per-key, per-family view of the state cache. Modifications are cached locally - * and must be flushed to the cache by calling persist. This class is not thread-safe. - */ - public class ForKeyAndFamily { - final ForKey forKey; - final String stateFamily; - private final HashMap localCache; - - private ForKeyAndFamily(ForKey forKey, String stateFamily) { - this.forKey = forKey; - this.stateFamily = stateFamily; - localCache = new HashMap<>(); - } - - public String getStateFamily() { - return stateFamily; - } - - public @Nullable T get(StateNamespace namespace, StateTag address) { - StateId id = new StateId(forKey, stateFamily, namespace); - @SuppressWarnings("nullness") // Unsure how to annotate lambda return allowing null. - @Nullable - StateCacheEntry entry = localCache.computeIfAbsent(id, key -> stateCache.getIfPresent(key)); - return entry == null ? null : entry.get(namespace, address); - } - - public void put( - StateNamespace namespace, StateTag address, T value, long weight) { - StateId id = new StateId(forKey, stateFamily, namespace); - @Nullable StateCacheEntry entry = localCache.get(id); - if (entry == null) { - entry = stateCache.getIfPresent(id); - if (entry == null) { - entry = new StateCacheEntry(); - } - boolean hadValue = localCache.putIfAbsent(id, entry) != null; - Preconditions.checkState(!hadValue); + public BaseStatusServlet statusServlet() { + return new BaseStatusServlet("/cachez") { + @Override + protected void doGet(HttpServletRequest request, HttpServletResponse response) + throws IOException { + PrintWriter writer = response.getWriter(); + writer.println("
<h1>Cache Information</h1>
"); + appendSummaryHtml(writer); } - entry.put(namespace, address, value, weight); - } - - public void persist() { - localCache.forEach((id, entry) -> stateCache.put(id, entry)); - } + }; } - /** Returns a per-computation view of the state cache. */ - public ForComputation forComputation(String computation) { - return new ForComputation(computation); + private static class EntryStats { + long entries; + long idWeight; + long entryWeight; + long entryValues; + long maxEntryValues; } /** @@ -297,12 +215,10 @@ public StateCacheEntry() { this.weight = 0; } - public @Nullable T get(StateNamespace namespace, StateTag tag) { - @SuppressWarnings("unchecked") - @Nullable - WeightedValue weightedValue = - (WeightedValue) values.get(new NamespacedTag<>(namespace, tag)); - return weightedValue == null ? null : weightedValue.value; + @SuppressWarnings("unchecked") + public Optional get(StateNamespace namespace, StateTag tag) { + return Optional.ofNullable((WeightedValue) values.get(new NamespacedTag<>(namespace, tag))) + .flatMap(WeightedValue::value); } public void put( @@ -362,43 +278,137 @@ public int hashCode() { } private static class WeightedValue { - public long weight; - public @Nullable T value; + private long weight; + private @Nullable T value; + + private Optional value() { + return Optional.ofNullable(this.value); + } } } - /** Print summary statistics of the cache to the given {@link PrintWriter}. */ - @Override - public void appendSummaryHtml(PrintWriter response) { - response.println("Cache Stats:
"); - response.println( - "" - + "" - + "" - + ""); - CacheStats cacheStats = stateCache.stats(); - EntryStats entryStats = calculateEntryStats(); - response.println(""); - response.println(""); - response.println(""); - response.println(""); - response.println(""); - response.println(""); - response.println(""); - response.println(""); - response.println(""); - response.println("
Hit RatioEvictionsEntriesEntry ValuesMax Entry ValuesId WeightEntry WeightMax WeightKeys
" + cacheStats.hitRate() + "" + cacheStats.evictionCount() + "" + entryStats.entries + "(" + stateCache.size() + " inc. weak) " + entryStats.entryValues + "" + entryStats.maxEntryValues + "" + entryStats.idWeight / MEGABYTES + "MB" + entryStats.entryWeight / MEGABYTES + "MB" + getMaxWeight() / MEGABYTES + "MB" + keyIndex.size() + "

"); + /** Per-computation view of the state cache. */ + public class ForComputation { + + private final String computation; + + private ForComputation(String computation) { + this.computation = computation; + } + + /** Invalidate all cache entries for this computation and {@code processingKey}. */ + public void invalidate(ByteString processingKey, long shardingKey) { + WindmillComputationKey key = + WindmillComputationKey.create(computation, processingKey, shardingKey); + // By removing the ForKey object, all state for the key is orphaned in the cache and will + // be removed by normal cache cleanup. + keyIndex.remove(key); + } + + /** + * Returns a per-computation, per-key view of the state cache. Access to the cached data for + * this key is not thread-safe. Callers should ensure that there is only a single ForKey object + * in use at a time and that access to it is synchronized or single-threaded. + */ + public ForKey forKey(WindmillComputationKey computationKey, long cacheToken, long workToken) { + ForKey forKey = keyIndex.get(computationKey); + if (forKey == null || !forKey.updateTokens(cacheToken, workToken)) { + forKey = new ForKey(computationKey, cacheToken, workToken); + // We prefer this implementation to using compute because that is implemented similarly for + // ConcurrentHashMap with the downside of it performing inserts for unchanged existing + // values as well. + keyIndex.put(computationKey, forKey); + } + return forKey; + } } - public BaseStatusServlet statusServlet() { - return new BaseStatusServlet("/cachez") { - @Override - protected void doGet(HttpServletRequest request, HttpServletResponse response) - throws IOException { - PrintWriter writer = response.getWriter(); - writer.println("
<h1>Cache Information</h1>
"); - appendSummaryHtml(writer); + /** Per-computation, per-key view of the state cache. */ + // Note that we utilize the default equality and hashCode for this class based upon the instance + // (instead of the fields) to optimize cache invalidation. + public class ForKey { + private final WindmillComputationKey computationKey; + // Cache token must be consistent for the key for the cache to be valid. + private final long cacheToken; + + // The work token for processing must be greater than the last work token. As work items are + // increasing for a key, a less-than or equal to work token indicates that the current token is + // for stale processing. + private long workToken; + + private ForKey(WindmillComputationKey computationKey, long cacheToken, long workToken) { + this.computationKey = computationKey; + this.cacheToken = cacheToken; + this.workToken = workToken; + } + + /** + * Returns a per-computation, per-key, per-family view of the state cache. Access to the cached + * data for this key is not thread-safe. Callers should ensure that there is only a single + * ForKeyAndFamily object in use at a time for a given computation, key, family tuple and that + * access to it is synchronized or single-threaded. + */ + public ForKeyAndFamily forFamily(String stateFamily) { + return new ForKeyAndFamily(this, stateFamily); + } + + private boolean updateTokens(long cacheToken, long workToken) { + if (this.cacheToken != cacheToken || workToken <= this.workToken) { + return false; } - }; + this.workToken = workToken; + return true; + } + } + + /** + * Per-computation, per-key, per-family view of the state cache. Modifications are cached locally + * and must be flushed to the cache by calling persist. This class is not thread-safe. + */ + public class ForKeyAndFamily { + final ForKey forKey; + final String stateFamily; + private final HashMap localCache; + + private ForKeyAndFamily(ForKey forKey, String stateFamily) { + this.forKey = forKey; + this.stateFamily = stateFamily; + localCache = new HashMap<>(); + } + + public String getStateFamily() { + return stateFamily; + } + + public Optional get(StateNamespace namespace, StateTag address) { + @SuppressWarnings("nullness") + // the mapping function for localCache.computeIfAbsent (i.e stateCache.getIfPresent) is + // nullable. 
+ Optional stateCacheEntry = + Optional.ofNullable( + localCache.computeIfAbsent( + new StateId(forKey, stateFamily, namespace), stateCache::getIfPresent)); + + return stateCacheEntry.flatMap(entry -> entry.get(namespace, address)); + } + + public void put( + StateNamespace namespace, StateTag address, T value, long weight) { + StateId id = new StateId(forKey, stateFamily, namespace); + @Nullable StateCacheEntry entry = localCache.get(id); + if (entry == null) { + entry = stateCache.getIfPresent(id); + if (entry == null) { + entry = new StateCacheEntry(); + } + boolean hadValue = localCache.putIfAbsent(id, entry) != null; + Preconditions.checkState(!hadValue); + } + entry.put(namespace, address, value, weight); + } + + public void persist() { + localCache.forEach(stateCache::put); + } } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateInternals.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateInternals.java new file mode 100644 index 0000000000000..c900228e86b02 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateInternals.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import org.apache.beam.runners.core.StateInternals; +import org.apache.beam.runners.core.StateNamespace; +import org.apache.beam.runners.core.StateTable; +import org.apache.beam.runners.core.StateTag; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItemCommitRequest; +import org.apache.beam.sdk.state.State; +import org.apache.beam.sdk.state.StateContext; +import org.apache.beam.sdk.state.StateContexts; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier; +import org.checkerframework.checker.nullness.qual.Nullable; + +/** Implementation of {@link StateInternals} using Windmill to manage the underlying data. 
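The ForKeyAndFamily.get/persist pair above is a two-level lookup: a per-work-item HashMap fronts the shared Guava cache via computeIfAbsent, and a null from getIfPresent simply leaves no local mapping. A reduced sketch of the pattern, with illustrative types:

```java
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

class TwoLevelCacheSketch {
  private final Cache<String, String> shared =
      CacheBuilder.newBuilder().maximumSize(100).build();
  private final Map<String, String> local = new HashMap<>();

  Optional<String> get(String id) {
    // computeIfAbsent treats a null from the mapping function as "no mapping",
    // so a shared-cache miss adds nothing to the local map.
    return Optional.ofNullable(local.computeIfAbsent(id, shared::getIfPresent));
  }

  void put(String id, String value) {
    local.put(id, value);
  }

  void persist() {
    // Flush everything observed or modified during this work item back to the shared cache.
    local.forEach(shared::put);
    local.clear();
  }
}
```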
*/ +@SuppressWarnings("nullness" // TODO(https://github.com/apache/beam/issues/20497) +) +public class WindmillStateInternals implements StateInternals { + + @VisibleForTesting + static final ThreadLocal> COMPACT_NOW = + ThreadLocal.withInitial(ShouldCompactNowFn::new); + /** + * The key will be null when not in a keyed context, from the users perspective. There is still a + * "key" for the Windmill computation, but it cannot be meaningfully deserialized. + */ + private final @Nullable K key; + + private final WindmillStateCache.ForKeyAndFamily cache; + private final StateTable workItemState; + private final StateTable workItemDerivedState; + private final Supplier scopedReadStateSupplier; + + public WindmillStateInternals( + @Nullable K key, + String stateFamily, + WindmillStateReader reader, + boolean isNewKey, + WindmillStateCache.ForKeyAndFamily cache, + Supplier scopedReadStateSupplier) { + this.key = key; + this.cache = cache; + this.scopedReadStateSupplier = scopedReadStateSupplier; + this.workItemDerivedState = + CachingStateTable.builder(stateFamily, reader, cache, isNewKey, scopedReadStateSupplier) + .build(); + this.workItemState = + CachingStateTable.builder(stateFamily, reader, cache, isNewKey, scopedReadStateSupplier) + .withDerivedState(workItemDerivedState) + .build(); + } + + @Override + public @Nullable K getKey() { + return key; + } + + private void persist(List> commitsToMerge, StateTable stateTable) { + for (State location : stateTable.values()) { + if (!(location instanceof WindmillState)) { + throw new IllegalStateException( + String.format( + "%s wasn't created by %s -- unable to persist it", + location.getClass().getSimpleName(), getClass().getSimpleName())); + } + + try { + commitsToMerge.add(((WindmillState) location).persist(cache)); + } catch (IOException e) { + throw new RuntimeException("Unable to persist state", e); + } + } + + // All cached State objects now have known values. + // Clear any references to the underlying reader to prevent space leaks. + // The next work unit to use these cached State objects will reset the + // reader to a current reader in case those values are modified. + for (State location : stateTable.values()) { + ((WindmillState) location).cleanupAfterWorkItem(); + } + + // Clear out the map of already retrieved state instances. + stateTable.clear(); + } + + public void persist(final Windmill.WorkItemCommitRequest.Builder commitBuilder) { + List> commitsToMerge = new ArrayList<>(); + + // Call persist on each first, which may schedule some futures for reading. + persist(commitsToMerge, workItemState); + persist(commitsToMerge, workItemDerivedState); + + try (Closeable ignored = scopedReadStateSupplier.get()) { + for (Future commitFuture : commitsToMerge) { + commitBuilder.mergeFrom(commitFuture.get()); + } + } catch (ExecutionException | InterruptedException | IOException exc) { + if (exc instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new RuntimeException("Failed to retrieve Windmill state during persist()", exc); + } + + cache.persist(); + } + + @Override + public T state(StateNamespace namespace, StateTag address) { + return workItemState.get(namespace, address, StateContexts.nullContext()); + } + + @Override + public T state( + StateNamespace namespace, StateTag address, StateContext c) { + return workItemState.get(namespace, address, c); + } + + private static class ShouldCompactNowFn implements Supplier { + /* The rate at which, on average, this will return true. 
*/ + private static final double RATE = 0.002; + private final Random random; + private long counter; + + private ShouldCompactNowFn() { + this.random = new Random(); + this.counter = nextSample(random); + } + + private static long nextSample(Random random) { + // Use geometric distribution to find next true value. + // This lets us avoid invoking random.nextDouble() on every call. + return (long) Math.floor(Math.log(random.nextDouble()) / Math.log(1 - RATE)); + } + + @Override + public Boolean get() { + counter--; + if (counter < 0) { + counter = nextSample(random); + return true; + } else { + return false; + } + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WindmillStateReader.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateReader.java similarity index 68% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WindmillStateReader.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateReader.java index 192a40299e0a9..c28939c59ee23 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/WindmillStateReader.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateReader.java @@ -15,11 +15,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.runners.dataflow.worker; +package org.apache.beam.runners.dataflow.worker.windmill.state; import com.google.api.client.util.Lists; -import com.google.auto.value.AutoValue; -import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import java.io.IOException; import java.io.InputStream; import java.util.AbstractMap; @@ -27,43 +25,41 @@ import java.util.Collections; import java.util.Comparator; import java.util.HashSet; -import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; import java.util.function.Supplier; import java.util.stream.Collectors; -import javax.annotation.Nonnull; import javax.annotation.Nullable; -import org.apache.beam.runners.dataflow.worker.WindmillStateReader.StateTag.Kind; +import org.apache.beam.runners.dataflow.worker.KeyTokenInvalidException; +import org.apache.beam.runners.dataflow.worker.MetricTrackingWindmillServerStub; +import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataRequest; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataResponse; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.SortedListEntry; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.SortedListRange; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.TagBag; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.TagSortedListFetchRequest; import 
org.apache.beam.runners.dataflow.worker.windmill.Windmill.TagValue; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.TagValuePrefixRequest; +import org.apache.beam.runners.dataflow.worker.windmill.state.StateTag.Kind; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.Coder.Context; import org.apache.beam.sdk.transforms.windowing.BoundedWindow; -import org.apache.beam.sdk.util.Weighted; import org.apache.beam.sdk.values.TimestampedValue; import org.apache.beam.vendor.grpc.v1p54p0.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Function; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.AbstractIterator; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ForwardingList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Range; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Sets; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ForwardingFuture; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.Futures; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.SettableFuture; import org.joda.time.Instant; @@ -80,7 +76,7 @@ "rawtypes", // TODO(https://github.com/apache/beam/issues/20447) "nullness" // TODO(https://github.com/apache/beam/issues/20497) }) -class WindmillStateReader { +public class WindmillStateReader { /** * Ideal maximum bytes in a TagBag response. However, Windmill will always return at least one * value if possible irrespective of this limit. @@ -116,190 +112,44 @@ class WindmillStateReader { public static final long MAX_KEY_BYTES = 16L << 20; // 16MB public static final long MAX_CONTINUATION_KEY_BYTES = 72L << 20; // 72MB - - /** - * When combined with a key and computationId, represents the unique address for state managed by - * Windmill. - */ - @AutoValue - abstract static class StateTag { - enum Kind { - VALUE, - BAG, - WATERMARK, - ORDERED_LIST, - VALUE_PREFIX, - MULTIMAP_SINGLE_ENTRY, - MULTIMAP_ALL - } - - abstract Kind getKind(); - - abstract ByteString getTag(); - - abstract String getStateFamily(); - - /** - * For {@link Kind#BAG, Kind#ORDERED_LIST, Kind#VALUE_PREFIX, KIND#MULTIMAP_SINGLE_ENTRY, - * KIND#MULTIMAP_ALL} kinds: A previous 'continuation_position' returned by Windmill to signal - * the resulting state was incomplete. Sending that position will request the next page of - * values. Null for first request. - * - *
<p>
Null for other kinds. - */ - @Nullable - abstract RequestPositionT getRequestPosition(); - - /** For {@link Kind#ORDERED_LIST} kinds: the range to fetch or delete. */ - @Nullable - abstract Range getSortedListRange(); - - /** For {@link Kind#MULTIMAP_SINGLE_ENTRY} kinds: the key in the multimap to fetch or delete. */ - @Nullable - abstract ByteString getMultimapKey(); - - /** - * For {@link Kind#MULTIMAP_ALL} kinds: will only return the keys of the multimap and not the - * values if true. - */ - @Nullable - abstract Boolean getOmitValues(); - - static StateTag of( - Kind kind, ByteString tag, String stateFamily, @Nullable RequestPositionT requestPosition) { - return new AutoValue_WindmillStateReader_StateTag.Builder() - .setKind(kind) - .setTag(tag) - .setStateFamily(stateFamily) - .setRequestPosition(requestPosition) - .build(); - } - - static StateTag of( - Kind kind, ByteString tag, String stateFamily) { - return of(kind, tag, stateFamily, null); - } - - abstract Builder toBuilder(); - - @AutoValue.Builder - abstract static class Builder { - abstract Builder setKind(Kind kind); - - abstract Builder setTag(ByteString tag); - - abstract Builder setStateFamily(String stateFamily); - - abstract Builder setRequestPosition( - @Nullable RequestPositionT requestPosition); - - abstract Builder setSortedListRange(@Nullable Range sortedListRange); - - abstract Builder setMultimapKey(@Nullable ByteString encodedMultimapKey); - - abstract Builder setOmitValues(Boolean omitValues); - - abstract StateTag build(); - } - } - - /** - * An in-memory collection of deserialized values and an optional continuation position to pass to - * Windmill when fetching the next page of values. - */ - private static class ValuesAndContPosition { - private final List values; - - /** Position to pass to next request for next page of values. Null if done. */ - private final @Nullable ContinuationT continuationPosition; - - public ValuesAndContPosition(List values, @Nullable ContinuationT continuationPosition) { - this.values = values; - this.continuationPosition = continuationPosition; - } - } - + @VisibleForTesting final ConcurrentLinkedQueue> pendingLookups; private final String computation; private final ByteString key; private final long shardingKey; private final long workToken; - // WindmillStateReader should only perform blocking i/o in a try-with-resources block that // declares an AutoCloseable vended by readWrapperSupplier. 
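The comment above captures the reader's discipline for blocking I/O: every blocking call runs inside try-with-resources on an AutoCloseable vended by readWrapperSupplier (a supplier returning null is tolerated, since try-with-resources skips closing a null resource). A minimal sketch of that shape, with the supplier and backend call as stand-ins:

```java
import java.util.function.Supplier;

class ScopedReadSketch {
  private final Supplier<AutoCloseable> readWrapperSupplier;

  ScopedReadSketch(Supplier<AutoCloseable> readWrapperSupplier) {
    this.readWrapperSupplier = readWrapperSupplier;
  }

  String blockingRead() throws Exception {
    // The wrapper opens before the blocking call and is guaranteed to close after,
    // even if the call throws.
    try (AutoCloseable ignored = readWrapperSupplier.get()) {
      return fetchFromBackend();
    }
  }

  private String fetchFromBackend() {
    return "response"; // placeholder for the real blocking GetData call
  }
}
```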
private final Supplier readWrapperSupplier; - - private final MetricTrackingWindmillServerStub server; - + private final MetricTrackingWindmillServerStub metricTrackingWindmillServerStub; + private final ConcurrentHashMap, CoderAndFuture> waiting; private long bytesRead = 0L; public WindmillStateReader( - MetricTrackingWindmillServerStub server, + MetricTrackingWindmillServerStub metricTrackingWindmillServerStub, String computation, ByteString key, long shardingKey, long workToken, Supplier readWrapperSupplier) { - this.server = server; + this.metricTrackingWindmillServerStub = metricTrackingWindmillServerStub; this.computation = computation; this.key = key; this.shardingKey = shardingKey; this.workToken = workToken; this.readWrapperSupplier = readWrapperSupplier; + this.waiting = new ConcurrentHashMap<>(); + this.pendingLookups = new ConcurrentLinkedQueue<>(); } public WindmillStateReader( - MetricTrackingWindmillServerStub server, + MetricTrackingWindmillServerStub metricTrackingWindmillServerStub, String computation, ByteString key, long shardingKey, long workToken) { - this(server, computation, key, shardingKey, workToken, () -> null); + this(metricTrackingWindmillServerStub, computation, key, shardingKey, workToken, () -> null); } - private static final class CoderAndFuture { - private Coder coder = null; - private final SettableFuture future; - - private CoderAndFuture(Coder coder, SettableFuture future) { - this.coder = coder; - this.future = future; - } - - private SettableFuture getFuture() { - return future; - } - - private SettableFuture getNonDoneFuture(StateTag stateTag) { - if (future.isDone()) { - throw new IllegalStateException("Future for " + stateTag + " is already done"); - } - return future; - } - - private Coder getAndClearCoder() { - if (coder == null) { - throw new IllegalStateException("Coder has already been cleared from cache"); - } - Coder result = (Coder) coder; - if (result == null) { - throw new IllegalStateException("Coder has already been cleared from cache"); - } - coder = null; - return result; - } - - private void checkNoCoder() { - if (coder != null) { - throw new IllegalStateException("Unexpected coder"); - } - } - } - - @VisibleForTesting - ConcurrentLinkedQueue> pendingLookups = new ConcurrentLinkedQueue<>(); - - private ConcurrentHashMap, CoderAndFuture> waiting = new ConcurrentHashMap<>(); - private Future stateFuture(StateTag stateTag, @Nullable Coder coder) { CoderAndFuture coderAndFuture = new CoderAndFuture<>(coder, SettableFuture.create()); CoderAndFuture existingCoderAndFutureWildcard = @@ -393,9 +243,9 @@ public Future>> valuePrefixFuture( * Internal request to fetch the next 'page' of values. Return null if no continuation position is * in {@code contStateTag}, which signals there are no more pages. */ - private @Nullable - Future> continuationFuture( - StateTag contStateTag, Coder coder) { + @Nullable + Future> continuationFuture( + StateTag contStateTag, Coder coder) { if (contStateTag.getRequestPosition() == null) { // We're done. return null; @@ -403,45 +253,6 @@ Future> continuationFuture( return stateFuture(contStateTag, coder); } - /** - * A future which will trigger a GetData request to Windmill for all outstanding futures on the - * first {@link #get}. - */ - private static class WrappedFuture extends ForwardingFuture.SimpleForwardingFuture { - /** - * The reader we'll use to service the eventual read. Null if read has been fulfilled. - * - *
<p>
NOTE: We must clear this after the read is fulfilled to prevent space leaks. - */ - private @Nullable WindmillStateReader reader; - - public WrappedFuture(WindmillStateReader reader, Future delegate) { - super(delegate); - this.reader = reader; - } - - @Override - public T get() throws InterruptedException, ExecutionException { - if (!delegate().isDone() && reader != null) { - // Only one thread per reader, so no race here. - reader.startBatchAndBlock(); - } - reader = null; - return super.get(); - } - - @Override - public T get(long timeout, TimeUnit unit) - throws InterruptedException, ExecutionException, TimeoutException { - if (!delegate().isDone() && reader != null) { - // Only one thread per reader, so no race here. - reader.startBatchAndBlock(); - } - reader = null; - return super.get(timeout, unit); - } - } - private Future wrappedFuture(final Future future) { if (future.isDone()) { // If the underlying lookup is already complete, we don't need to create the wrapper. @@ -452,59 +263,6 @@ private Future wrappedFuture(final Future future) { } } - /** Function to extract an {@link Iterable} from the continuation-supporting page read future. */ - private static class ToIterableFunction - implements Function, Iterable> { - /** - * Reader to request continuation pages from, or {@literal null} if no continuation pages - * required. - */ - private @Nullable WindmillStateReader reader; - - private final StateTag stateTag; - private final Coder coder; - - public ToIterableFunction( - WindmillStateReader reader, StateTag stateTag, Coder coder) { - this.reader = reader; - this.stateTag = stateTag; - this.coder = coder; - } - - @SuppressFBWarnings( - value = "NP_METHOD_PARAMETER_TIGHTENS_ANNOTATION", - justification = "https://github.com/google/guava/issues/920") - @Override - public Iterable apply( - @Nonnull ValuesAndContPosition valuesAndContPosition) { - if (valuesAndContPosition.continuationPosition == null) { - // Number of values is small enough Windmill sent us the entire bag in one response. - reader = null; - return valuesAndContPosition.values; - } else { - // Return an iterable which knows how to come back for more. - StateTag.Builder continuationTBuilder = - StateTag.of( - stateTag.getKind(), - stateTag.getTag(), - stateTag.getStateFamily(), - valuesAndContPosition.continuationPosition) - .toBuilder(); - if (stateTag.getSortedListRange() != null) { - continuationTBuilder.setSortedListRange(stateTag.getSortedListRange()).build(); - } - if (stateTag.getMultimapKey() != null) { - continuationTBuilder.setMultimapKey(stateTag.getMultimapKey()).build(); - } - if (stateTag.getOmitValues() != null) { - continuationTBuilder.setOmitValues(stateTag.getOmitValues()).build(); - } - return new PagingIterable( - reader, valuesAndContPosition.values, continuationTBuilder.build(), coder); - } - } - } - /** * Return future which transforms a {@code ValuesAndContPosition} result into the initial * Iterable result expected from the external caller. @@ -559,53 +317,101 @@ private void delayUnbatchableMultimapFetches( } } - public void startBatchAndBlock() { - // First, drain work out of the pending lookups into a set. These will be the items we fetch. + private void delayUnbatchableOrderedListFetches( + List> orderedListTags, HashSet> toFetch) { + // Each KeyedGetDataRequest can have at most 1 TagOrderedListRequest per + // pair, thus we need to delay unbatchable ordered list requests of the same stateFamily and tag + // into later batches. 
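The batching rule implemented just below (at most one TagOrderedListRequest per stateFamily/tag pair in a KeyedGetDataRequest) boils down to a two-level groupingBy: keep one representative per group for this batch and re-queue the rest. A simplified standalone version with types reduced to strings:

```java
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.stream.Collectors;

class BatchingDemo {
  static final class Tag {
    final String stateFamily;
    final String tag;

    Tag(String stateFamily, String tag) {
      this.stateFamily = stateFamily;
      this.tag = tag;
    }
  }

  public static void main(String[] args) {
    List<Tag> pending =
        Arrays.asList(
            new Tag("f1", "a"), new Tag("f1", "a"), new Tag("f1", "b"), new Tag("f2", "a"));
    List<Tag> thisBatch = new ArrayList<>();
    Queue<Tag> deferred = new ArrayDeque<>(); // goes back onto the pending queue

    Map<String, Map<String, List<Tag>>> grouped =
        pending.stream()
            .collect(
                Collectors.groupingBy(t -> t.stateFamily, Collectors.groupingBy(t -> t.tag)));

    for (Map<String, List<Tag>> byTag : grouped.values()) {
      for (List<Tag> tags : byTag.values()) {
        thisBatch.add(tags.get(0)); // one request per (family, tag) in this batch
        deferred.addAll(tags.subList(1, tags.size()));
      }
    }
    System.out.println(thisBatch.size() + " fetched now, " + deferred.size() + " deferred");
    // Prints: 3 fetched now, 1 deferred
  }
}
```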
+ + Map>>> groupedTags = + orderedListTags.stream() + .collect( + Collectors.groupingBy( + StateTag::getStateFamily, Collectors.groupingBy(StateTag::getTag))); + + for (Map>> familyTags : groupedTags.values()) { + for (List> tags : familyTags.values()) { + StateTag first = tags.remove(0); + toFetch.add(first); + // Add the rest of the reads for the state family and tags back to pending. + pendingLookups.addAll(tags); + } + } + } + + private HashSet> buildFetchSet() { HashSet> toFetch = Sets.newHashSet(); - try { - List> multimapTags = Lists.newArrayList(); - while (!pendingLookups.isEmpty()) { - StateTag stateTag = pendingLookups.poll(); - if (stateTag == null) { - break; - } - if (stateTag.getKind() == Kind.MULTIMAP_ALL - || stateTag.getKind() == Kind.MULTIMAP_SINGLE_ENTRY) { - multimapTags.add(stateTag); - continue; - } - if (!toFetch.add(stateTag)) { - throw new IllegalStateException("Duplicate tags being fetched."); - } + List> multimapTags = Lists.newArrayList(); + List> orderedListTags = Lists.newArrayList(); + while (!pendingLookups.isEmpty()) { + StateTag stateTag = pendingLookups.poll(); + if (stateTag == null) { + break; + } + if (stateTag.getKind() == Kind.MULTIMAP_ALL + || stateTag.getKind() == Kind.MULTIMAP_SINGLE_ENTRY) { + multimapTags.add(stateTag); + continue; } - if (!multimapTags.isEmpty()) { - delayUnbatchableMultimapFetches(multimapTags, toFetch); + if (stateTag.getKind() == Kind.ORDERED_LIST) { + orderedListTags.add(stateTag); + continue; } - // If we failed to drain anything, some other thread pulled it off the queue. We have no work - // to do. - if (toFetch.isEmpty()) { - return; + if (!toFetch.add(stateTag)) { + throw new IllegalStateException("Duplicate tags being fetched."); } + } + if (!multimapTags.isEmpty()) { + delayUnbatchableMultimapFetches(multimapTags, toFetch); + } + if (!orderedListTags.isEmpty()) { + delayUnbatchableOrderedListFetches(orderedListTags, toFetch); + } + return toFetch; + } - Windmill.KeyedGetDataRequest request = createRequest(toFetch); - Windmill.KeyedGetDataResponse response; - try (AutoCloseable readWrapper = readWrapperSupplier.get()) { - response = server.getStateData(computation, request); + public void performReads() { + while (true) { + HashSet> toFetch = buildFetchSet(); + if (toFetch.isEmpty()) { + return; } - if (response == null) { - throw new RuntimeException("Windmill unexpectedly returned null for request " + request); + try { + KeyedGetDataResponse response = tryGetDataFromWindmill(toFetch); + // Removes tags from toFetch as they are processed. + consumeResponse(response, toFetch); + if (!toFetch.isEmpty()) { + throw new IllegalStateException( + "Didn't receive responses for all pending fetches. Missing: " + toFetch); + } + } catch (Exception e) { + // Set up all the remaining futures for this key to throw an exception. This ensures that if + // the exception is caught that all futures have been completed and do not block. + for (StateTag stateTag : toFetch) { + waiting.get(stateTag).future.setException(e); + } + // Also setup futures that may have been added back if they were not batched. + while (true) { + @Nullable StateTag stateTag = pendingLookups.poll(); + if (stateTag == null) break; + waiting.get(stateTag).future.setException(e); + } + throw new RuntimeException(e); } + } + } - // Removes tags from toFetch as they are processed. - consumeResponse(response, toFetch); - } catch (Exception e) { - // Set up all the remaining futures for this key to throw an exception. 
This ensures that if - // the exception is caught that all futures have been completed and do not block. - for (StateTag stateTag : toFetch) { - waiting.get(stateTag).future.setException(e); - } - throw new RuntimeException(e); + private KeyedGetDataResponse tryGetDataFromWindmill(HashSet> stateTags) + throws Exception { + KeyedGetDataRequest keyedGetDataRequest = createRequest(stateTags); + try (AutoCloseable ignored = readWrapperSupplier.get()) { + return Optional.ofNullable( + metricTrackingWindmillServerStub.getStateData(computation, keyedGetDataRequest)) + .orElseThrow( + () -> + new RuntimeException( + "Windmill unexpectedly returned null for request " + keyedGetDataRequest)); } } @@ -613,9 +419,9 @@ public long getBytesRead() { return bytesRead; } - private Windmill.KeyedGetDataRequest createRequest(Iterable> toFetch) { - Windmill.KeyedGetDataRequest.Builder keyedDataBuilder = - Windmill.KeyedGetDataRequest.newBuilder() + private KeyedGetDataRequest createRequest(Iterable> toFetch) { + KeyedGetDataRequest.Builder keyedDataBuilder = + KeyedGetDataRequest.newBuilder() .setKey(key) .setShardingKey(shardingKey) .setWorkToken(workToken); @@ -760,7 +566,7 @@ private Windmill.KeyedGetDataRequest createRequest(Iterable> toFetch return keyedDataBuilder.build(); } - private void consumeResponse(Windmill.KeyedGetDataResponse response, Set> toFetch) { + private void consumeResponse(KeyedGetDataResponse response, Set> toFetch) { bytesRead += response.getSerializedSize(); if (response.getFailed()) { throw new KeyTokenInvalidException(key.toStringUtf8()); @@ -878,48 +684,12 @@ private void consumeResponse(Windmill.KeyedGetDataResponse response, Set extends ForwardingList implements Weighted { - private List delegate; - long weight; - - WeightedList(List delegate) { - this.delegate = delegate; - this.weight = 0; - } - - @Override - protected List delegate() { - return delegate; - } - - @Override - public boolean add(T elem) { - throw new UnsupportedOperationException("Must use AddWeighted()"); - } - - @Override - public long getWeight() { - return weight; - } - - public void addWeighted(T elem, long weight) { - delegate.add(elem); - this.weight += weight; - } } /** The deserialized values in {@code bag} as a read-only array list. */ private List bagPageValues(TagBag bag, Coder elemCoder) { if (bag.getValuesCount() == 0) { - return new WeightedList(Collections.emptyList()); + return new WeightedList(Collections.emptyList()); } WeightedList valueList = new WeightedList<>(new ArrayList(bag.getValuesCount())); @@ -1013,23 +783,19 @@ private List>> multimapPageValues( private void consumeBag(TagBag bag, StateTag stateTag) { boolean shouldRemove; - if (stateTag.getRequestPosition() == null) { - // This is the response for the first page. - // Leave the future in the cache so subsequent requests for the first page - // can return immediately. - shouldRemove = false; - } else { - // This is a response for a subsequent page. - // Don't cache the future since we may need to make multiple requests with different - // continuation positions. - shouldRemove = true; - } + // This is the response for the first page. + // Leave the future in the cache so subsequent requests for the first page + // can return immediately. + // This is a response for a subsequent page. + // Don't cache the future since we may need to make multiple requests with different + // continuation positions. 
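+    // (getRequestPosition() is null only for the first page; a continuation page carries a
+    // non-null position, so its future must not stay cached.)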
+ shouldRemove = stateTag.getRequestPosition() != null; CoderAndFuture> coderAndFuture = getWaiting(stateTag, shouldRemove); SettableFuture> future = coderAndFuture.getNonDoneFuture(stateTag); try { - Coder coder = coderAndFuture.getAndClearCoder(); + Coder coder = coderAndFuture.getAndClearCoder(); List values = this.bagPageValues(bag, coder); future.set( new ValuesAndContPosition<>( @@ -1082,18 +848,14 @@ private void consumeTagValue(TagValue tagValue, StateTag stateTag) { private void consumeTagPrefixResponse( Windmill.TagValuePrefixResponse tagValuePrefixResponse, StateTag stateTag) { boolean shouldRemove; - if (stateTag.getRequestPosition() == null) { - // This is the response for the first page. - // Leave the future in the cache so subsequent - // requests for the first page - // can return immediately. - shouldRemove = false; - } else { - // This is a response for a subsequent page. - // Don't cache the future since we may need to make multiple requests with different - // continuation positions. - shouldRemove = true; - } + // This is the response for the first page. + // Leave the future in the cache so subsequent + // requests for the first page + // can return immediately. + // This is a response for a subsequent page. + // Don't cache the future since we may need to make multiple requests with different + // continuation positions. + shouldRemove = stateTag.getRequestPosition() != null; CoderAndFuture, ByteString>> coderAndFuture = getWaiting(stateTag, shouldRemove); @@ -1117,17 +879,13 @@ private void consumeTagPrefixResponse( private void consumeSortedList( Windmill.TagSortedListFetchResponse sortedListFetchResponse, StateTag stateTag) { boolean shouldRemove; - if (stateTag.getRequestPosition() == null) { - // This is the response for the first page.// Leave the future in the cache so subsequent - // requests for the first page - // can return immediately. - shouldRemove = false; - } else { - // This is a response for a subsequent page. - // Don't cache the future since we may need to make multiple requests with different - // continuation positions. - shouldRemove = true; - } + // This is the response for the first page.// Leave the future in the cache so subsequent + // requests for the first page + // can return immediately. + // This is a response for a subsequent page. + // Don't cache the future since we may need to make multiple requests with different + // continuation positions. + shouldRemove = stateTag.getRequestPosition() != null; CoderAndFuture, ByteString>> coderAndFuture = getWaiting(stateTag, shouldRemove); @@ -1187,102 +945,42 @@ private void consumeMultimapSingleEntry( } } - /** - * An iterable over elements backed by paginated GetData requests to Windmill. The iterable may be - * iterated over an arbitrary number of times and multiple iterators may be active simultaneously. - * - *

There are two patterns we wish to support with low memory and latency: - *
- *   1. Re-iterate over the initial elements multiple times (e.g. Iterables.first). We'll cache the - * initial 'page' of values returned by Windmill from our first request for the lifetime of - * the iterable. - *
- *   2. Iterate through all elements of a very large collection. We'll send the GetData request - * for the next page when the current page is begun. We'll discard intermediate pages and - * only retain the first. Thus the maximum memory pressure is one page plus one page per - * call to iterator. - *
- */ - private static class PagingIterable implements Iterable { - /** - * The reader we will use for scheduling continuation pages. - * - *

NOTE We've made this explicit to remind us to be careful not to cache the iterable. - */ - private final WindmillStateReader reader; - - /** Initial values returned for the first page. Never reclaimed. */ - private final List firstPage; - - /** State tag with continuation position set for second page. */ - private final StateTag secondPagePos; - - /** Coder for elements. */ - private final Coder coder; - - private PagingIterable( - WindmillStateReader reader, - List firstPage, - StateTag secondPagePos, - Coder coder) { - this.reader = reader; - this.firstPage = firstPage; - this.secondPagePos = secondPagePos; + private static final class CoderAndFuture { + private final SettableFuture future; + private Coder coder = null; + + private CoderAndFuture(Coder coder, SettableFuture future) { this.coder = coder; + this.future = future; } - @Override - public Iterator iterator() { - return new AbstractIterator() { - private Iterator currentPage = firstPage.iterator(); - private StateTag nextPagePos = secondPagePos; - private Future> pendingNextPage = - // NOTE: The results of continuation page reads are never cached. - reader.continuationFuture(nextPagePos, coder); - - @Override - protected ResultT computeNext() { - while (true) { - if (currentPage.hasNext()) { - return currentPage.next(); - } - if (pendingNextPage == null) { - return endOfData(); - } + private SettableFuture getFuture() { + return future; + } - ValuesAndContPosition valuesAndContPosition; - try { - valuesAndContPosition = pendingNextPage.get(); - } catch (InterruptedException | ExecutionException e) { - if (e instanceof InterruptedException) { - Thread.currentThread().interrupt(); - } - throw new RuntimeException("Unable to read value from state", e); - } - currentPage = valuesAndContPosition.values.iterator(); - StateTag.Builder nextPageBuilder = - StateTag.of( - nextPagePos.getKind(), - nextPagePos.getTag(), - nextPagePos.getStateFamily(), - valuesAndContPosition.continuationPosition) - .toBuilder(); - if (secondPagePos.getSortedListRange() != null) { - nextPageBuilder.setSortedListRange(secondPagePos.getSortedListRange()); - } - if (secondPagePos.getOmitValues() != null) { - nextPageBuilder.setOmitValues(secondPagePos.getOmitValues()); - } - if (secondPagePos.getMultimapKey() != null) { - nextPageBuilder.setMultimapKey(secondPagePos.getMultimapKey()); - } - nextPagePos = nextPageBuilder.build(); - pendingNextPage = - // NOTE: The results of continuation page reads are never cached. 
- reader.continuationFuture(nextPagePos, coder); - } - } - }; + private SettableFuture getNonDoneFuture(StateTag stateTag) { + if (future.isDone()) { + throw new IllegalStateException("Future for " + stateTag + " is already done"); + } + return future; + } + + private Coder getAndClearCoder() { + if (coder == null) { + throw new IllegalStateException("Coder has already been cleared from cache"); + } + Coder result = (Coder) coder; + if (result == null) { + throw new IllegalStateException("Coder has already been cleared from cache"); + } + coder = null; + return result; + } + + private void checkNoCoder() { + if (coder != null) { + throw new IllegalStateException("Unexpected coder"); + } } } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateUtil.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateUtil.java new file mode 100644 index 0000000000000..3cac5c3c5724b --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateUtil.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.nio.charset.StandardCharsets; +import org.apache.beam.runners.core.StateNamespace; +import org.apache.beam.runners.core.StateTag; +import org.apache.beam.sdk.util.ByteStringOutputStream; +import org.apache.beam.vendor.grpc.v1p54p0.com.google.protobuf.ByteString; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; + +class WindmillStateUtil { + /** Encodes the given namespace and address as {@code <namespace>+<address>}. */ + @VisibleForTesting + static ByteString encodeKey(StateNamespace namespace, StateTag address) { + try { + // Use ByteStringOutputStream rather than concatenation and String.format. We build these keys + // a lot, and this leads to better performance results. See associated benchmarks. + ByteStringOutputStream stream = new ByteStringOutputStream(); + OutputStreamWriter writer = new OutputStreamWriter(stream, StandardCharsets.UTF_8); + + // stringKey starts and ends with a slash. We separate it from the + // StateTag ID by a '+' (which is guaranteed not to be in the stringKey) because the + // ID comes from the user. 
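+      // Illustration (hypothetical values): a namespace whose stringKey renders as "/w0/"
+      // and a user-supplied StateTag ID "sum" encode to "/w0/+sum"; since '+' never occurs
+      // in the stringKey, the two parts can be split unambiguously.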
+ namespace.appendTo(writer); + writer.write('+'); + address.appendTo(writer); + writer.flush(); + return stream.toByteString(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillValue.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillValue.java new file mode 100644 index 0000000000000..1ea6e56435d26 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillValue.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import static org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateUtil.encodeKey; + +import java.io.Closeable; +import java.io.IOException; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import org.apache.beam.runners.core.StateNamespace; +import org.apache.beam.runners.core.StateTag; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.state.ValueState; +import org.apache.beam.sdk.util.ByteStringOutputStream; +import org.apache.beam.vendor.grpc.v1p54p0.com.google.protobuf.ByteString; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.Futures; + +@SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) +}) +public class WindmillValue extends SimpleWindmillState implements ValueState { + private final StateNamespace namespace; + private final StateTag> address; + private final ByteString stateKey; + private final String stateFamily; + private final Coder coder; + + /** Whether we've modified the value since creation of this state. */ + private boolean modified = false; + /** Whether the in memory value is the true value. 
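+   * True once the value has been read, written or cleared, or when the key is known to be new.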
*/ + private boolean valueIsKnown = false; + /** The size of the encoded value */ + private long cachedSize = -1; + + private T value; + + WindmillValue( + StateNamespace namespace, + StateTag> address, + String stateFamily, + Coder coder, + boolean isNewKey) { + this.namespace = namespace; + this.address = address; + this.stateKey = encodeKey(namespace, address); + this.stateFamily = stateFamily; + this.coder = coder; + if (isNewKey) { + this.valueIsKnown = true; + this.value = null; + } + } + + @Override + public void clear() { + modified = true; + valueIsKnown = true; + value = null; + } + + @Override + @SuppressWarnings("FutureReturnValueIgnored") + public WindmillValue readLater() { + getFuture(); + return this; + } + + @Override + public T read() { + try (Closeable scope = scopedReadState()) { + if (!valueIsKnown) { + cachedSize = -1; + } + value = getFuture().get(); + valueIsKnown = true; + return value; + } catch (InterruptedException | ExecutionException | IOException e) { + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new RuntimeException("Unable to read value from state", e); + } + } + + @Override + public void write(T value) { + modified = true; + valueIsKnown = true; + cachedSize = -1; + this.value = value; + } + + @Override + protected Windmill.WorkItemCommitRequest persistDirectly(WindmillStateCache.ForKeyAndFamily cache) + throws IOException { + if (!valueIsKnown) { + // The value was never read, written or cleared. + // Thus nothing to update in Windmill. + // And no need to add to global cache. + return Windmill.WorkItemCommitRequest.newBuilder().buildPartial(); + } + + ByteString encoded = null; + if (cachedSize == -1 || modified) { + ByteStringOutputStream stream = new ByteStringOutputStream(); + if (value != null) { + coder.encode(value, stream, Coder.Context.OUTER); + } + encoded = stream.toByteString(); + cachedSize = encoded.size(); + } + + // Place in cache to avoid a future read. + cache.put(namespace, address, this, cachedSize); + + if (!modified) { + // The value was read, but never written or cleared. + // But nothing to update in Windmill. + return Windmill.WorkItemCommitRequest.newBuilder().buildPartial(); + } + + // The value was written or cleared. Commit that change to Windmill. + modified = false; + Windmill.WorkItemCommitRequest.Builder commitBuilder = + Windmill.WorkItemCommitRequest.newBuilder(); + commitBuilder + .addValueUpdatesBuilder() + .setTag(stateKey) + .setStateFamily(stateFamily) + .getValueBuilder() + .setData(encoded) + .setTimestamp(Long.MAX_VALUE); + return commitBuilder.buildPartial(); + } + + private Future getFuture() { + // WindmillStateReader guarantees that we can ask for a future for a particular tag multiple + // times and it will efficiently be reused. + return valueIsKnown + ? 
Futures.immediateFuture(value) + : reader.valueFuture(stateKey, stateFamily, coder); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillWatermarkHold.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillWatermarkHold.java new file mode 100644 index 0000000000000..a800c2eb6dadb --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillWatermarkHold.java @@ -0,0 +1,259 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import static org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateUtil.encodeKey; + +import java.io.Closeable; +import java.io.IOException; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import org.apache.beam.runners.core.StateNamespace; +import org.apache.beam.runners.core.StateTag; +import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.sdk.state.ReadableState; +import org.apache.beam.sdk.state.WatermarkHoldState; +import org.apache.beam.sdk.transforms.windowing.TimestampCombiner; +import org.apache.beam.vendor.grpc.v1p54p0.com.google.protobuf.ByteString; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Optional; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.Futures; +import org.joda.time.Instant; + +@SuppressWarnings({ + "rawtypes", // TODO(https://github.com/apache/beam/issues/20447) + "nullness" // TODO(https://github.com/apache/beam/issues/20497) +}) +public class WindmillWatermarkHold extends WindmillState implements WatermarkHoldState { + // The encoded size of an Instant. + private static final int ENCODED_SIZE = 8; + + private final TimestampCombiner timestampCombiner; + private final StateNamespace namespace; + private final StateTag address; + private final ByteString stateKey; + private final String stateFamily; + + private boolean cleared = false; + /** + * If non-{@literal null}, the known current hold value, or absent if we know there are no output + * watermark holds. If {@literal null}, the current hold value could depend on holds in Windmill + * we do not yet know. 
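+   * A read, or a persist that writes the hold through to Windmill, resolves it to a concrete Optional.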
+ */ + private Optional cachedValue = null; + + private Instant localAdditions = null; + + WindmillWatermarkHold( + StateNamespace namespace, + StateTag address, + String stateFamily, + TimestampCombiner timestampCombiner, + boolean isNewKey) { + this.namespace = namespace; + this.address = address; + this.stateKey = encodeKey(namespace, address); + this.stateFamily = stateFamily; + this.timestampCombiner = timestampCombiner; + if (isNewKey) { + cachedValue = Optional.absent(); + } + } + + @Override + public void clear() { + cleared = true; + cachedValue = Optional.absent(); + localAdditions = null; + } + + @Override + @SuppressWarnings("FutureReturnValueIgnored") + public WindmillWatermarkHold readLater() { + getFuture(); + return this; + } + + @Override + public Instant read() { + try (Closeable scope = scopedReadState()) { + Instant persistedHold = getFuture().get(); + if (persistedHold == null) { + cachedValue = Optional.absent(); + } else { + cachedValue = Optional.of(persistedHold); + } + } catch (InterruptedException | ExecutionException | IOException e) { + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new RuntimeException("Unable to read state", e); + } + + if (localAdditions == null) { + return cachedValue.orNull(); + } else if (!cachedValue.isPresent()) { + return localAdditions; + } else { + return timestampCombiner.combine(localAdditions, cachedValue.get()); + } + } + + @Override + public ReadableState isEmpty() { + throw new UnsupportedOperationException(); + } + + @Override + public void add(Instant outputTime) { + localAdditions = + (localAdditions == null) + ? outputTime + : timestampCombiner.combine(outputTime, localAdditions); + } + + @Override + public TimestampCombiner getTimestampCombiner() { + return timestampCombiner; + } + + @Override + public Future persist( + final WindmillStateCache.ForKeyAndFamily cache) { + + Future result; + + if (!cleared && localAdditions == null) { + // No changes, so no need to update Windmill and no need to cache any value. 
+ return Futures.immediateFuture(Windmill.WorkItemCommitRequest.newBuilder().buildPartial()); + } + + if (cleared && localAdditions == null) { + // Just clearing the persisted state; blind delete + Windmill.WorkItemCommitRequest.Builder commitBuilder = + Windmill.WorkItemCommitRequest.newBuilder(); + commitBuilder + .addWatermarkHoldsBuilder() + .setTag(stateKey) + .setStateFamily(stateFamily) + .setReset(true); + + result = Futures.immediateFuture(commitBuilder.buildPartial()); + } else if (cleared && localAdditions != null) { + // Since we cleared before adding, we can do a blind overwrite of persisted state + Windmill.WorkItemCommitRequest.Builder commitBuilder = + Windmill.WorkItemCommitRequest.newBuilder(); + commitBuilder + .addWatermarkHoldsBuilder() + .setTag(stateKey) + .setStateFamily(stateFamily) + .setReset(true) + .addTimestamps(WindmillTimeUtils.harnessToWindmillTimestamp(localAdditions)); + + cachedValue = Optional.of(localAdditions); + + result = Futures.immediateFuture(commitBuilder.buildPartial()); + } else if (!cleared && localAdditions != null) { + // Otherwise, we need to combine the local additions with the already persisted data + result = combineWithPersisted(); + } else { + throw new IllegalStateException("Unreachable condition"); + } + + return Futures.lazyTransform( + result, + result1 -> { + cleared = false; + localAdditions = null; + if (cachedValue != null) { + cache.put(namespace, address, WindmillWatermarkHold.this, ENCODED_SIZE); + } + return result1; + }); + } + + private Future getFuture() { + return cachedValue != null + ? Futures.immediateFuture(cachedValue.orNull()) + : reader.watermarkFuture(stateKey, stateFamily); + } + + /** + * Combines local additions with persisted data and mutates the {@code commitBuilder} to write the + * result. + */ + private Future combineWithPersisted() { + boolean windmillCanCombine = false; + + // If the combined output time depends only on the window, then we are just blindly adding + // the same value that may or may not already be present. This depends on the state only being + // used for one window. + windmillCanCombine |= timestampCombiner.dependsOnlyOnWindow(); + + // If the combined output time depends only on the earliest input timestamp, then because + // assignOutputTime is monotonic, the hold only depends on the earliest output timestamp + // (which is the value submitted as a watermark hold). The only way holds for later inputs + // can be redundant is if the are later (or equal) to the earliest. So taking the MIN + // implicitly, as Windmill does, has the desired behavior. + windmillCanCombine |= timestampCombiner.dependsOnlyOnEarliestTimestamp(); + + if (windmillCanCombine) { + // We do a blind write and let Windmill take the MIN + Windmill.WorkItemCommitRequest.Builder commitBuilder = + Windmill.WorkItemCommitRequest.newBuilder(); + commitBuilder + .addWatermarkHoldsBuilder() + .setTag(stateKey) + .setStateFamily(stateFamily) + .addTimestamps(WindmillTimeUtils.harnessToWindmillTimestamp(localAdditions)); + + if (cachedValue != null) { + cachedValue = + Optional.of( + cachedValue.isPresent() + ? timestampCombiner.combine(cachedValue.get(), localAdditions) + : localAdditions); + } + + return Futures.immediateFuture(commitBuilder.buildPartial()); + } else { + // The non-fast path does a read-modify-write + return Futures.lazyTransform( + (cachedValue != null) + ? 
Futures.immediateFuture(cachedValue.orNull()) + : reader.watermarkFuture(stateKey, stateFamily), + priorHold -> { + cachedValue = + Optional.of( + (priorHold != null) + ? timestampCombiner.combine(priorHold, localAdditions) + : localAdditions); + Windmill.WorkItemCommitRequest.Builder commitBuilder = + Windmill.WorkItemCommitRequest.newBuilder(); + commitBuilder + .addWatermarkHoldsBuilder() + .setTag(stateKey) + .setStateFamily(stateFamily) + .setReset(true) + .addTimestamps(WindmillTimeUtils.harnessToWindmillTimestamp(cachedValue.get())); + + return commitBuilder.buildPartial(); + }); + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WrappedFuture.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WrappedFuture.java new file mode 100644 index 0000000000000..7e894524bef30 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/state/WrappedFuture.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.state; + +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import javax.annotation.Nullable; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ForwardingFuture; + +/** + * A future which will trigger a GetData request to Windmill for all outstanding futures on the + * first {@link #get}. + */ +public class WrappedFuture extends ForwardingFuture.SimpleForwardingFuture { + /** + * The reader we'll use to service the eventual read. Null if read has been fulfilled. + * + *

NOTE: We must clear this after the read is fulfilled to prevent space leaks. + */ + private @Nullable WindmillStateReader reader; + + public WrappedFuture(WindmillStateReader reader, Future delegate) { + super(delegate); + this.reader = reader; + } + + @Override + public T get() throws InterruptedException, ExecutionException { + if (!delegate().isDone() && reader != null) { + // Only one thread per reader, so no race here. + reader.performReads(); + } + reader = null; + return super.get(); + } + + @Override + public T get(long timeout, TimeUnit unit) + throws InterruptedException, ExecutionException, TimeoutException { + if (!delegate().isDone() && reader != null) { + // Only one thread per reader, so no race here. + reader.performReads(); + } + reader = null; + return super.get(timeout, unit); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/WorkItemReceiver.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/WorkItemReceiver.java new file mode 100644 index 0000000000000..307dfdfa17b37 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/WorkItemReceiver.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.work; + +import java.util.Collection; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Instant; + +/** Functional interface for receiving WorkItems. */ +@FunctionalInterface +public interface WorkItemReceiver { + void receiveWork( + String computation, + @Nullable Instant inputDataWatermark, + @Nullable Instant synchronizedProcessingTime, + Windmill.WorkItem workItem, + Collection getWorkStreamLatencies); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudget.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudget.java new file mode 100644 index 0000000000000..0038e3e9cc60f --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudget.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.work.budget; + +import com.google.auto.value.AutoValue; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; + +/** + * Budget of items and bytes for fetching {@link + * org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItem}(s) via {@link + * WindmillStream.GetWorkStream}. Used to control how "much" work is returned from Windmill. + */ +@AutoValue +public abstract class GetWorkBudget { + public static GetWorkBudget.Builder builder() { + return new AutoValue_GetWorkBudget.Builder(); + } + + /** {@link GetWorkBudget} of 0. */ + public static GetWorkBudget noBudget() { + return builder().setItems(0).setBytes(0).build(); + } + + public static GetWorkBudget from(GetWorkRequest getWorkRequest) { + return builder() + .setItems(getWorkRequest.getMaxItems()) + .setBytes(getWorkRequest.getMaxBytes()) + .build(); + } + + /** + * Adds the given bytes and items or the current budget, returning a new {@link GetWorkBudget}. + * Does not drop below 0. + */ + public GetWorkBudget add(long items, long bytes) { + Preconditions.checkArgument(items >= 0 && bytes >= 0); + return GetWorkBudget.builder().setBytes(bytes() + bytes).setItems(items() + items).build(); + } + + public GetWorkBudget add(GetWorkBudget other) { + return add(other.items(), other.bytes()); + } + + /** + * Subtracts the given bytes and items or the current budget, returning a new {@link + * GetWorkBudget}. Does not drop below 0. + */ + public GetWorkBudget subtract(long items, long bytes) { + Preconditions.checkArgument(items >= 0 && bytes >= 0); + return GetWorkBudget.builder().setBytes(bytes() - bytes).setItems(items() - items).build(); + } + + public GetWorkBudget subtract(GetWorkBudget other) { + return subtract(other.items(), other.bytes()); + } + + /** Budget of bytes for GetWork. Does not drop below 0. */ + public abstract long bytes(); + + /** Budget of items for GetWork. Does not drop below 0. 
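+   * For example, a budget of (items=1, bytes=10) after subtract(5, 100) clamps to (0, 0),
+   * since the builder clamps both fields to zero on build().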
*/ + public abstract long items(); + + @AutoValue.Builder + public abstract static class Builder { + public abstract Builder setBytes(long bytes); + + public abstract Builder setItems(long budget); + + abstract long items(); + + abstract long bytes(); + + abstract GetWorkBudget autoBuild(); + + public final GetWorkBudget build() { + setItems(Math.max(0, items())); + setBytes(Math.max(0, bytes())); + return autoBuild(); + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java index 4700217dc8a4e..092f5e59a13cb 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/FakeWindmillServer.java @@ -53,9 +53,11 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill.LatencyAttribution.State; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItemCommitRequest; import org.apache.beam.runners.dataflow.worker.windmill.WindmillServerStub; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.CommitWorkStream; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetDataStream; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemReceiver; +import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.Uninterruptibles; import org.joda.time.Duration; @@ -198,8 +200,7 @@ public long getAndResetThrottleTime() { } @Override - public GetWorkStream getWorkStream( - Windmill.GetWorkRequest request, GetWorkStream.WorkItemReceiver receiver) { + public GetWorkStream getWorkStream(Windmill.GetWorkRequest request, WorkItemReceiver receiver) { LOG.debug("getWorkStream: {}", request.toString()); Instant startTime = Instant.now(); final CountDownLatch done = new CountDownLatch(1); @@ -209,6 +210,19 @@ public void close() { done.countDown(); } + @Override + public void adjustBudget(long itemsDelta, long bytesDelta) { + // no-op. 
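+        // The fake stream ignores budget adjustments; remainingBudget() below simply
+        // echoes the request's max items/bytes for tests.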
+ } + + @Override + public GetWorkBudget remainingBudget() { + return GetWorkBudget.builder() + .setItems(request.getMaxItems()) + .setBytes(request.getMaxBytes()) + .build(); + } + @Override public boolean awaitTermination(int time, TimeUnit unit) throws InterruptedException { while (done.getCount() > 0) { diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java index 24e6e2795c683..6826607513d98 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java @@ -80,6 +80,7 @@ import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.function.Consumer; @@ -178,6 +179,7 @@ import org.junit.Test; import org.junit.rules.ErrorCollector; import org.junit.rules.TestRule; +import org.junit.rules.Timeout; import org.junit.runner.Description; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; @@ -225,6 +227,7 @@ public class StreamingDataflowWorkerTest { private static final ByteString DEFAULT_KEY_BYTES = ByteString.copyFromUtf8(DEFAULT_KEY_STRING); private static final String DEFAULT_DATA_STRING = "data"; private static final String DEFAULT_DESTINATION_STREAM_ID = "out"; + private static final long MAXIMUM_BYTES_OUTSTANDING = 10000000; private static final Function EMPTY_DATA_RESPONDER = (GetDataRequest request) -> { GetDataResponse.Builder builder = GetDataResponse.newBuilder(); @@ -254,6 +257,7 @@ public Long get() { return idGenerator.getAndIncrement(); } }; + @Rule public transient Timeout globalTimeout = Timeout.seconds(600); @Rule public BlockingFn blockingFn = new BlockingFn(); @Rule public TestRule restoreMDC = new RestoreDataflowLoggingMDC(); @Rule public ErrorCollector errorCollector = new ErrorCollector(); @@ -279,11 +283,7 @@ private static CounterUpdate getCounter(Iterable counters, String } static Work createMockWork(long workToken) { - return Work.create( - Windmill.WorkItem.newBuilder().setKey(ByteString.EMPTY).setWorkToken(workToken).build(), - Instant::now, - Collections.emptyList(), - work -> {}); + return createMockWork(workToken, work -> {}); } static Work createMockWork(long workToken, Consumer processWorkFn) { @@ -2749,7 +2749,7 @@ public void testMaxThreadMetric() throws Exception { threadExpiration, TimeUnit.SECONDS, maxThreads, - 10000000, + MAXIMUM_BYTES_OUTSTANDING, new ThreadFactoryBuilder() .setNameFormat("DataflowWorkUnits-%d") .setDaemon(true) @@ -2793,12 +2793,14 @@ public void testMaxThreadMetric() throws Exception { executor.shutdown(); } - volatile boolean stop = false; - @Test public void testActiveThreadMetric() throws Exception { int maxThreads = 5; int threadExpirationSec = 60; + CountDownLatch processStart1 = new CountDownLatch(2); + CountDownLatch processStart2 = new CountDownLatch(3); + CountDownLatch processStart3 = new CountDownLatch(4); + AtomicBoolean stop = new AtomicBoolean(false); // setting up actual implementation of executor instead of mocking to keep track of // active thread 
count. BoundedQueueExecutor executor = @@ -2807,7 +2809,7 @@ public void testActiveThreadMetric() throws Exception { threadExpirationSec, TimeUnit.SECONDS, maxThreads, - 10000000, + MAXIMUM_BYTES_OUTSTANDING, new ThreadFactoryBuilder() .setNameFormat("DataflowWorkUnits-%d") .setDaemon(true) @@ -2825,11 +2827,11 @@ public void testActiveThreadMetric() throws Exception { Consumer sleepProcessWorkFn = unused -> { - synchronized (this) { - this.notify(); - } + processStart1.countDown(); + processStart2.countDown(); + processStart3.countDown(); int count = 0; - while (!stop) { + while (!stop.get()) { count += 1; } }; @@ -2842,27 +2844,163 @@ public void testActiveThreadMetric() throws Exception { assertEquals(0, executor.activeCount()); assertTrue(computationState.activateWork(key1Shard1, m2)); - synchronized (this) { - executor.execute(m2, m2.getWorkItem().getSerializedSize()); - this.wait(); - // Seems current executor executes the initial work item twice - this.wait(); - } + // activate work starts executing work if no other work is queued for that shard + executor.execute(m2, m2.getWorkItem().getSerializedSize()); + processStart1.await(); assertEquals(2, executor.activeCount()); assertTrue(computationState.activateWork(key1Shard1, m3)); assertTrue(computationState.activateWork(key1Shard1, m4)); - synchronized (this) { - executor.execute(m3, m3.getWorkItem().getSerializedSize()); - this.wait(); - } + executor.execute(m3, m3.getWorkItem().getSerializedSize()); + processStart2.await(); + assertEquals(3, executor.activeCount()); - synchronized (this) { - executor.execute(m4, m4.getWorkItem().getSerializedSize()); - this.wait(); - } + executor.execute(m4, m4.getWorkItem().getSerializedSize()); + processStart3.await(); assertEquals(4, executor.activeCount()); - stop = true; + stop.set(true); + executor.shutdown(); + } + + @Test + public void testOutstandingBytesMetric() throws Exception { + int maxThreads = 5; + int threadExpirationSec = 60; + CountDownLatch processStart1 = new CountDownLatch(2); + CountDownLatch processStart2 = new CountDownLatch(3); + CountDownLatch processStart3 = new CountDownLatch(4); + AtomicBoolean stop = new AtomicBoolean(false); + // setting up actual implementation of executor instead of mocking to keep track of + // active thread count. 
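+    // Latch choreography (shared by these executor tests): every work item counts down all
+    // three latches when it starts spinning, so awaiting processStartN guarantees N items
+    // (including the one kicked off by activateWork) are running before metrics are asserted.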
+ BoundedQueueExecutor executor = + new BoundedQueueExecutor( + maxThreads, + threadExpirationSec, + TimeUnit.SECONDS, + maxThreads, + MAXIMUM_BYTES_OUTSTANDING, + new ThreadFactoryBuilder() + .setNameFormat("DataflowWorkUnits-%d") + .setDaemon(true) + .build()); + + ComputationState computationState = + new ComputationState( + "computation", + defaultMapTask(Arrays.asList(makeSourceInstruction(StringUtf8Coder.of()))), + executor, + ImmutableMap.of(), + null); + + ShardedKey key1Shard1 = ShardedKey.create(ByteString.copyFromUtf8("key1"), 1); + Consumer sleepProcessWorkFn = + unused -> { + processStart1.countDown(); + processStart2.countDown(); + processStart3.countDown(); + int count = 0; + while (!stop.get()) { + count += 1; + } + }; + + Work m2 = createMockWork(2, sleepProcessWorkFn); + + Work m3 = createMockWork(3, sleepProcessWorkFn); + + Work m4 = createMockWork(4, sleepProcessWorkFn); + assertEquals(0, executor.bytesOutstanding()); + + long bytes = m2.getWorkItem().getSerializedSize(); + assertTrue(computationState.activateWork(key1Shard1, m2)); + // activate work starts executing work if no other work is queued for that shard + bytes += m2.getWorkItem().getSerializedSize(); + executor.execute(m2, m2.getWorkItem().getSerializedSize()); + processStart1.await(); + assertEquals(bytes, executor.bytesOutstanding()); + + assertTrue(computationState.activateWork(key1Shard1, m3)); + assertTrue(computationState.activateWork(key1Shard1, m4)); + + bytes += m3.getWorkItem().getSerializedSize(); + executor.execute(m3, m3.getWorkItem().getSerializedSize()); + processStart2.await(); + assertEquals(bytes, executor.bytesOutstanding()); + + bytes += m4.getWorkItem().getSerializedSize(); + executor.execute(m4, m4.getWorkItem().getSerializedSize()); + processStart3.await(); + assertEquals(bytes, executor.bytesOutstanding()); + stop.set(true); + executor.shutdown(); + } + + @Test + public void testOutstandingBundlesMetric() throws Exception { + int maxThreads = 5; + int threadExpirationSec = 60; + CountDownLatch processStart1 = new CountDownLatch(2); + CountDownLatch processStart2 = new CountDownLatch(3); + CountDownLatch processStart3 = new CountDownLatch(4); + AtomicBoolean stop = new AtomicBoolean(false); + // setting up actual implementation of executor instead of mocking to keep track of + // active thread count. 
+ BoundedQueueExecutor executor = + new BoundedQueueExecutor( + maxThreads, + threadExpirationSec, + TimeUnit.SECONDS, + maxThreads, + MAXIMUM_BYTES_OUTSTANDING, + new ThreadFactoryBuilder() + .setNameFormat("DataflowWorkUnits-%d") + .setDaemon(true) + .build()); + + ComputationState computationState = + new ComputationState( + "computation", + defaultMapTask(Arrays.asList(makeSourceInstruction(StringUtf8Coder.of()))), + executor, + ImmutableMap.of(), + null); + + ShardedKey key1Shard1 = ShardedKey.create(ByteString.copyFromUtf8("key1"), 1); + Consumer sleepProcessWorkFn = + unused -> { + processStart1.countDown(); + processStart2.countDown(); + processStart3.countDown(); + int count = 0; + while (!stop.get()) { + count += 1; + } + }; + + Work m2 = createMockWork(2, sleepProcessWorkFn); + + Work m3 = createMockWork(3, sleepProcessWorkFn); + + Work m4 = createMockWork(4, sleepProcessWorkFn); + assertEquals(0, executor.elementsOutstanding()); + + assertTrue(computationState.activateWork(key1Shard1, m2)); + // activate work starts executing work if no other work is queued for that shard + executor.execute(m2, m2.getWorkItem().getSerializedSize()); + processStart1.await(); + assertEquals(2, executor.elementsOutstanding()); + + assertTrue(computationState.activateWork(key1Shard1, m3)); + assertTrue(computationState.activateWork(key1Shard1, m4)); + + executor.execute(m3, m3.getWorkItem().getSerializedSize()); + processStart2.await(); + assertEquals(3, executor.elementsOutstanding()); + + executor.execute(m4, m4.getWorkItem().getSerializedSize()); + processStart3.await(); + assertEquals(4, executor.elementsOutstanding()); + stop.set(true); executor.shutdown(); } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java index 57e29bb21dee9..9991520d593b5 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingModeExecutionContextTest.java @@ -55,7 +55,10 @@ import org.apache.beam.runners.dataflow.worker.counters.NameContext; import org.apache.beam.runners.dataflow.worker.profiler.ScopedProfiler.NoopProfileScope; import org.apache.beam.runners.dataflow.worker.profiler.ScopedProfiler.ProfileScope; +import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputStateFetcher; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; +import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateReader; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.metrics.MetricsContainer; import org.apache.beam.sdk.options.PipelineOptionsFactory; @@ -81,10 +84,10 @@ @RunWith(JUnit4.class) public class StreamingModeExecutionContextTest { - @Mock private StateFetcher stateFetcher; + @Mock private SideInputStateFetcher sideInputStateFetcher; @Mock private WindmillStateReader stateReader; - private StreamingModeExecutionStateRegistry executionStateRegistry = + private final StreamingModeExecutionStateRegistry executionStateRegistry = new StreamingModeExecutionStateRegistry(null); private StreamingModeExecutionContext executionContext; DataflowWorkerHarnessOptions 
options; @@ -131,7 +134,7 @@ public void testTimerInternalsSetTimer() { null, // output watermark null, // synchronized processing time stateReader, - stateFetcher, + sideInputStateFetcher, outputBuilder); TimerInternals timerInternals = stepContext.timerInternals(); @@ -181,7 +184,7 @@ public void testTimerInternalsProcessingTimeSkew() { null, // output watermark null, // synchronized processing time stateReader, - stateFetcher, + sideInputStateFetcher, outputBuilder); TimerInternals timerInternals = stepContext.timerInternals(); assertTrue(timerTimestamp.isBefore(timerInternals.currentProcessingTime())); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingSideInputDoFnRunnerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingSideInputDoFnRunnerTest.java index 05e0ff4176155..3c121ab27f76e 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingSideInputDoFnRunnerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingSideInputDoFnRunnerTest.java @@ -39,7 +39,7 @@ import org.apache.beam.runners.core.SideInputReader; import org.apache.beam.runners.core.StateInternals; import org.apache.beam.runners.core.StateNamespaces; -import org.apache.beam.runners.dataflow.worker.StateFetcher.SideInputState; +import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputState; import org.apache.beam.runners.dataflow.worker.util.ListOutputManager; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalDataRequest; diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingSideInputFetcherTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingSideInputFetcherTest.java index 9ce462be32115..a7196613fbb16 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingSideInputFetcherTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingSideInputFetcherTest.java @@ -31,7 +31,7 @@ import org.apache.beam.runners.core.StateInternals; import org.apache.beam.runners.core.StateNamespaces; import org.apache.beam.runners.core.TimerInternals.TimerData; -import org.apache.beam.runners.dataflow.worker.StateFetcher.SideInputState; +import org.apache.beam.runners.dataflow.worker.streaming.sideinput.SideInputState; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.state.BagState; diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainerTest.java index 1a4c43905d20a..9e6d45a2351be 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingStepMetricsContainerTest.java @@ -20,6 +20,7 @@ import static 
org.apache.beam.runners.dataflow.worker.counters.DataflowCounterUpdateExtractor.longToSplitInt; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.containsInAnyOrder; +import static org.hamcrest.Matchers.instanceOf; import static org.hamcrest.Matchers.not; import static org.hamcrest.Matchers.sameInstance; @@ -33,6 +34,9 @@ import org.apache.beam.sdk.metrics.Distribution; import org.apache.beam.sdk.metrics.MetricName; import org.apache.beam.sdk.metrics.MetricsContainer; +import org.apache.beam.sdk.metrics.NoOpCounter; +import org.apache.beam.sdk.metrics.NoOpHistogram; +import org.apache.beam.sdk.util.HistogramData; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -178,4 +182,22 @@ public void testDistributionUpdateExtraction() { .setMin(longToSplitInt(3)) .setSum(longToSplitInt(3))))); } + + @Test + public void testPerWorkerMetrics() { + StreamingStepMetricsContainer.setEnablePerWorkerMetrics(false); + MetricsContainer metricsContainer = registry.getContainer("test_step"); + assertThat( + metricsContainer.getPerWorkerCounter(name1), sameInstance(NoOpCounter.getInstance())); + HistogramData.BucketType testBucket = HistogramData.LinearBuckets.of(1, 1, 1); + assertThat( + metricsContainer.getPerWorkerHistogram(name1, testBucket), + sameInstance(NoOpHistogram.getInstance())); + + StreamingStepMetricsContainer.setEnablePerWorkerMetrics(true); + assertThat(metricsContainer.getPerWorkerCounter(name1), not(instanceOf(NoOpCounter.class))); + assertThat( + metricsContainer.getPerWorkerHistogram(name1, testBucket), + not(instanceOf(NoOpHistogram.class))); + } } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java index 1f3dee4b76ba4..12ae816de8292 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkStateTest.java @@ -33,10 +33,10 @@ import java.util.Map; import java.util.Optional; import javax.annotation.Nullable; -import org.apache.beam.runners.dataflow.worker.WindmillStateCache; import org.apache.beam.runners.dataflow.worker.streaming.ActiveWorkState.ActivateWorkResult; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataRequest; +import org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache; import org.apache.beam.vendor.grpc.v1p54p0.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.joda.time.Instant; diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StateFetcherTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherTest.java similarity index 67% rename from runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StateFetcherTest.java rename to runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherTest.java index 
13d8a9bd3ffbd..daf8146187911 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StateFetcherTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/sideinput/SideInputStateFetcherTest.java @@ -15,11 +15,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.runners.dataflow.worker; +package org.apache.beam.runners.dataflow.worker.streaming.sideinput; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.contains; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; import static org.mockito.Matchers.any; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; @@ -27,10 +29,10 @@ import static org.mockito.Mockito.when; import java.io.Closeable; -import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.concurrent.TimeUnit; -import org.apache.beam.runners.dataflow.worker.StateFetcher.SideInputState; +import org.apache.beam.runners.dataflow.worker.MetricTrackingWindmillServerStub; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.ListCoder; @@ -56,14 +58,16 @@ import org.mockito.Mock; import org.mockito.MockitoAnnotations; -/** Unit tests for {@link StateFetcher}. */ +/** Unit tests for {@link SideInputStateFetcher}. */ +// TODO: Add tests with different encoded windows to verify version is correctly plumbed. +@SuppressWarnings("deprecation") @RunWith(JUnit4.class) -public class StateFetcherTest { +public class SideInputStateFetcherTest { private static final String STATE_FAMILY = "state"; - @Mock MetricTrackingWindmillServerStub server; + @Mock private MetricTrackingWindmillServerStub server; - @Mock Supplier<Closeable> readStateSupplier; + @Mock private Supplier<Closeable> readStateSupplier; @Before public void setUp() { @@ -72,10 +76,11 @@ public void setUp() { @Test public void testFetchGlobalDataBasic() throws Exception { - StateFetcher fetcher = new StateFetcher(server); + SideInputStateFetcher fetcher = new SideInputStateFetcher(server); ByteStringOutputStream stream = new ByteStringOutputStream(); - ListCoder.of(StringUtf8Coder.of()).encode(Arrays.asList("data"), stream, Coder.Context.OUTER); + ListCoder.of(StringUtf8Coder.of()) + .encode(Collections.singletonList("data"), stream, Coder.Context.OUTER); ByteString encodedIterable = stream.toByteString(); PCollectionView<String> view = @@ -87,17 +92,29 @@ public void testFetchGlobalDataBasic() throws Exception { // then the data is already cached.
when(server.getSideInputData(any(Windmill.GlobalDataRequest.class))) .thenReturn( - buildGlobalDataResponse(tag, ByteString.EMPTY, false, null), - buildGlobalDataResponse(tag, ByteString.EMPTY, true, encodedIterable)); + buildGlobalDataResponse(tag, false, null), + buildGlobalDataResponse(tag, true, encodedIterable)); + + assertFalse( + fetcher + .fetchSideInput( + view, + GlobalWindow.INSTANCE, + STATE_FAMILY, + SideInputState.UNKNOWN, + readStateSupplier) + .isReady()); + + assertFalse( + fetcher + .fetchSideInput( + view, + GlobalWindow.INSTANCE, + STATE_FAMILY, + SideInputState.UNKNOWN, + readStateSupplier) + .isReady()); - assertEquals( - null, - fetcher.fetchSideInput( - view, GlobalWindow.INSTANCE, STATE_FAMILY, SideInputState.UNKNOWN, readStateSupplier)); - assertEquals( - null, - fetcher.fetchSideInput( - view, GlobalWindow.INSTANCE, STATE_FAMILY, SideInputState.UNKNOWN, readStateSupplier)); assertEquals( "data", fetcher @@ -107,7 +124,8 @@ public void testFetchGlobalDataBasic() throws Exception { STATE_FAMILY, SideInputState.KNOWN_READY, readStateSupplier) - .orNull()); + .value() + .orElse(null)); assertEquals( "data", fetcher @@ -117,18 +135,20 @@ public void testFetchGlobalDataBasic() throws Exception { STATE_FAMILY, SideInputState.KNOWN_READY, readStateSupplier) - .orNull()); + .value() + .orElse(null)); - verify(server, times(2)).getSideInputData(buildGlobalDataRequest(tag, ByteString.EMPTY)); + verify(server, times(2)).getSideInputData(buildGlobalDataRequest(tag)); verifyNoMoreInteractions(server); } @Test public void testFetchGlobalDataNull() throws Exception { - StateFetcher fetcher = new StateFetcher(server); + SideInputStateFetcher fetcher = new SideInputStateFetcher(server); ByteStringOutputStream stream = new ByteStringOutputStream(); - ListCoder.of(VoidCoder.of()).encode(Arrays.asList((Void) null), stream, Coder.Context.OUTER); + ListCoder.of(VoidCoder.of()) + .encode(Collections.singletonList(null), stream, Coder.Context.OUTER); ByteString encodedIterable = stream.toByteString(); PCollectionView<Void> view = @@ -140,19 +160,28 @@ public void testFetchGlobalDataNull() throws Exception { // then the data is already cached.
when(server.getSideInputData(any(Windmill.GlobalDataRequest.class))) .thenReturn( - buildGlobalDataResponse(tag, ByteString.EMPTY, false, null), - buildGlobalDataResponse(tag, ByteString.EMPTY, true, encodedIterable)); + buildGlobalDataResponse(tag, false, null), + buildGlobalDataResponse(tag, true, encodedIterable)); - assertEquals( - null, - fetcher.fetchSideInput( - view, GlobalWindow.INSTANCE, STATE_FAMILY, SideInputState.UNKNOWN, readStateSupplier)); - assertEquals( - null, - fetcher.fetchSideInput( - view, GlobalWindow.INSTANCE, STATE_FAMILY, SideInputState.UNKNOWN, readStateSupplier)); - assertEquals( - null, + assertFalse( + fetcher + .fetchSideInput( + view, + GlobalWindow.INSTANCE, + STATE_FAMILY, + SideInputState.UNKNOWN, + readStateSupplier) + .isReady()); + assertFalse( + fetcher + .fetchSideInput( + view, + GlobalWindow.INSTANCE, + STATE_FAMILY, + SideInputState.UNKNOWN, + readStateSupplier) + .isReady()); + assertNull( fetcher .fetchSideInput( view, @@ -160,9 +189,9 @@ public void testFetchGlobalDataNull() throws Exception { STATE_FAMILY, SideInputState.KNOWN_READY, readStateSupplier) - .orNull()); - assertEquals( - null, + .value() + .orElse(null)); + assertNull( fetcher .fetchSideInput( view, @@ -170,9 +199,10 @@ public void testFetchGlobalDataNull() throws Exception { STATE_FAMILY, SideInputState.KNOWN_READY, readStateSupplier) - .orNull()); + .value() + .orElse(null)); - verify(server, times(2)).getSideInputData(buildGlobalDataRequest(tag, ByteString.EMPTY)); + verify(server, times(2)).getSideInputData(buildGlobalDataRequest(tag)); verifyNoMoreInteractions(server); } @@ -181,15 +211,14 @@ public void testFetchGlobalDataCacheOverflow() throws Exception { Coder<List<String>> coder = ListCoder.of(StringUtf8Coder.of()); ByteStringOutputStream stream = new ByteStringOutputStream(); - coder.encode(Arrays.asList("data1"), stream, Coder.Context.OUTER); + coder.encode(Collections.singletonList("data1"), stream, Coder.Context.OUTER); ByteString encodedIterable1 = stream.toByteStringAndReset(); - coder.encode(Arrays.asList("data2"), stream, Coder.Context.OUTER); + coder.encode(Collections.singletonList("data2"), stream, Coder.Context.OUTER); ByteString encodedIterable2 = stream.toByteString(); - Cache<StateFetcher.SideInputId, StateFetcher.SideInputCacheEntry> cache = - CacheBuilder.newBuilder().build(); + Cache<SideInputCache.Key<?>, SideInput<?>> cache = CacheBuilder.newBuilder().build(); - StateFetcher fetcher = new StateFetcher(server, cache); + SideInputStateFetcher fetcher = new SideInputStateFetcher(server, new SideInputCache(cache)); PCollectionView<String> view1 = TestPipeline.create().apply(Create.empty(StringUtf8Coder.of())).apply(View.asSingleton()); @@ -204,9 +233,9 @@ public void testFetchGlobalDataCacheOverflow() throws Exception { // then view 1 again twice.
when(server.getSideInputData(any(Windmill.GlobalDataRequest.class))) .thenReturn( - buildGlobalDataResponse(tag1, ByteString.EMPTY, true, encodedIterable1), - buildGlobalDataResponse(tag2, ByteString.EMPTY, true, encodedIterable2), - buildGlobalDataResponse(tag1, ByteString.EMPTY, true, encodedIterable1)); + buildGlobalDataResponse(tag1, true, encodedIterable1), + buildGlobalDataResponse(tag2, true, encodedIterable2), + buildGlobalDataResponse(tag1, true, encodedIterable1)); assertEquals( "data1", @@ -217,7 +246,8 @@ public void testFetchGlobalDataCacheOverflow() throws Exception { STATE_FAMILY, SideInputState.UNKNOWN, readStateSupplier) - .orNull()); + .value() + .orElse(null)); assertEquals( "data2", fetcher @@ -227,7 +257,8 @@ public void testFetchGlobalDataCacheOverflow() throws Exception { STATE_FAMILY, SideInputState.UNKNOWN, readStateSupplier) - .orNull()); + .value() + .orElse(null)); cache.invalidateAll(); assertEquals( "data1", @@ -238,7 +269,8 @@ public void testFetchGlobalDataCacheOverflow() throws Exception { STATE_FAMILY, SideInputState.UNKNOWN, readStateSupplier) - .orNull()); + .value() + .orElse(null)); assertEquals( "data1", fetcher @@ -248,7 +280,8 @@ public void testFetchGlobalDataCacheOverflow() throws Exception { STATE_FAMILY, SideInputState.UNKNOWN, readStateSupplier) - .orNull()); + .value() + .orElse(null)); ArgumentCaptor<Windmill.GlobalDataRequest> captor = ArgumentCaptor.forClass(Windmill.GlobalDataRequest.class); @@ -259,14 +292,14 @@ public void testFetchGlobalDataCacheOverflow() throws Exception { assertThat( captor.getAllValues(), contains( - buildGlobalDataRequest(tag1, ByteString.EMPTY), - buildGlobalDataRequest(tag2, ByteString.EMPTY), - buildGlobalDataRequest(tag1, ByteString.EMPTY))); + buildGlobalDataRequest(tag1), + buildGlobalDataRequest(tag2), + buildGlobalDataRequest(tag1))); } @Test public void testEmptyFetchGlobalData() throws Exception { - StateFetcher fetcher = new StateFetcher(server); + SideInputStateFetcher fetcher = new SideInputStateFetcher(server); ByteString encodedIterable = ByteString.EMPTY; @@ -280,7 +313,7 @@ public void testEmptyFetchGlobalData() throws Exception { // Test three calls in a row. First, data is not ready, then data is ready, // then the data is already cached.
when(server.getSideInputData(any(Windmill.GlobalDataRequest.class))) - .thenReturn(buildGlobalDataResponse(tag, ByteString.EMPTY, true, encodedIterable)); + .thenReturn(buildGlobalDataResponse(tag, true, encodedIterable)); assertEquals( 0L, @@ -292,17 +325,22 @@ public void testEmptyFetchGlobalData() throws Exception { STATE_FAMILY, SideInputState.UNKNOWN, readStateSupplier) - .orNull()); + .value() + .orElse(null)); - verify(server).getSideInputData(buildGlobalDataRequest(tag, ByteString.EMPTY)); + verify(server).getSideInputData(buildGlobalDataRequest(tag)); verifyNoMoreInteractions(server); } - private Windmill.GlobalData buildGlobalDataResponse( - String tag, ByteString version, boolean isReady, ByteString data) { + private static Windmill.GlobalData buildGlobalDataResponse( + String tag, boolean isReady, ByteString data) { Windmill.GlobalData.Builder builder = Windmill.GlobalData.newBuilder() - .setDataId(Windmill.GlobalDataId.newBuilder().setTag(tag).setVersion(version).build()); + .setDataId( + Windmill.GlobalDataId.newBuilder() + .setTag(tag) + .setVersion(ByteString.EMPTY) + .build()); if (isReady) { builder.setIsReady(true).setData(data); @@ -312,7 +350,7 @@ private Windmill.GlobalData buildGlobalDataResponse( return builder.build(); } - private Windmill.GlobalDataRequest buildGlobalDataRequest(String tag, ByteString version) { + private static Windmill.GlobalDataRequest buildGlobalDataRequest(String tag, ByteString version) { Windmill.GlobalDataId id = Windmill.GlobalDataId.newBuilder().setTag(tag).setVersion(version).build(); @@ -323,4 +361,8 @@ private Windmill.GlobalDataRequest buildGlobalDataRequest(String tag, ByteString TimeUnit.MILLISECONDS.toMicros(GlobalWindow.INSTANCE.maxTimestamp().getMillis())) .build(); } + + private static Windmill.GlobalDataRequest buildGlobalDataRequest(String tag) { + return buildGlobalDataRequest(tag, ByteString.EMPTY); + } } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillStreamPoolTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java similarity index 99% rename from runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillStreamPoolTest.java rename to runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java index 9924bb7d2b2bc..264540531bf8b 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillStreamPoolTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/WindmillStreamPoolTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.beam.runners.dataflow.worker.windmill; +package org.apache.beam.runners.dataflow.worker.windmill.client; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetWorkerMetadataStreamTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStreamTest.java similarity index 96% rename from runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetWorkerMetadataStreamTest.java rename to runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStreamTest.java index 45ed3381a8bfe..e3b07bf7aa4d4 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetWorkerMetadataStreamTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkerMetadataStreamTest.java @@ -15,10 +15,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.runners.dataflow.worker.windmill.grpcclient; +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; import static com.google.common.truth.Truth.assertThat; -import static org.apache.beam.runners.dataflow.worker.windmill.AbstractWindmillStream.DEFAULT_STREAM_RPC_DEADLINE_SECONDS; +import static org.apache.beam.runners.dataflow.worker.windmill.client.AbstractWindmillStream.DEFAULT_STREAM_RPC_DEADLINE_SECONDS; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.mockito.Mockito.verify; @@ -33,13 +33,14 @@ import java.util.function.Consumer; import java.util.stream.Collectors; import javax.annotation.Nullable; -import org.apache.beam.runners.dataflow.worker.windmill.AbstractWindmillStream; import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc; -import org.apache.beam.runners.dataflow.worker.windmill.StreamObserverFactory; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkerMetadataRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkerMetadataResponse; import org.apache.beam.runners.dataflow.worker.windmill.WindmillEndpoints; +import org.apache.beam.runners.dataflow.worker.windmill.client.AbstractWindmillStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers.StreamObserverFactory; +import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import org.apache.beam.sdk.util.FluentBackoff; import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.ManagedChannel; import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.Server; diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcWindmillServerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java similarity index 98% rename from 
runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcWindmillServerTest.java rename to runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java index 53afc6990e433..d9f4b72716cbe 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcWindmillServerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.runners.dataflow.worker.windmill.grpcclient; +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotEquals; @@ -68,9 +68,9 @@ import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItem; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItemCommitRequest; import org.apache.beam.runners.dataflow.worker.windmill.WindmillApplianceGrpc; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.CommitWorkStream; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetDataStream; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.CommitWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; import org.apache.beam.vendor.grpc.v1p54p0.com.google.protobuf.ByteString; import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.CallOptions; import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.Channel; @@ -99,10 +99,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** - * Unit tests for {@link - * org.apache.beam.runners.dataflow.worker.windmill.grpcclient.GrpcWindmillServer}. - */ +/** Unit tests for {@link GrpcWindmillServer}. 
*/ @RunWith(JUnit4.class) @SuppressWarnings({ "rawtypes", // TODO(https://github.com/apache/beam/issues/20447) @@ -114,7 +111,7 @@ public class GrpcWindmillServerTest { private final MutableHandlerRegistry serviceRegistry = new MutableHandlerRegistry(); @Rule public ErrorCollector errorCollector = new ErrorCollector(); private Server server; - private org.apache.beam.runners.dataflow.worker.windmill.grpcclient.GrpcWindmillServer client; + private GrpcWindmillServer client; private int remainingErrors = 20; @Before diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WindmillStateCacheTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateCacheTest.java similarity index 75% rename from runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WindmillStateCacheTest.java rename to runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateCacheTest.java index eca431af11a79..cc6633f1b704e 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WindmillStateCacheTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateCacheTest.java @@ -15,17 +15,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.runners.dataflow.worker; +package org.apache.beam.runners.dataflow.worker.windmill.state; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNull; import java.io.IOException; import java.util.Objects; +import java.util.Optional; import org.apache.beam.runners.core.StateNamespace; import org.apache.beam.runners.core.StateNamespaces; import org.apache.beam.runners.core.StateTag; import org.apache.beam.runners.dataflow.options.DataflowWorkerHarnessOptions; +import org.apache.beam.runners.dataflow.worker.WindmillComputationKey; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.state.State; import org.apache.beam.sdk.state.StateSpec; @@ -38,7 +39,7 @@ import org.junit.runner.RunWith; import org.junit.runners.JUnit4; -/** Tests for {@link WindmillStateCache}. */ +/** Tests for {@link org.apache.beam.runners.dataflow.worker.windmill.state.WindmillStateCache}. 
*/ @RunWith(JUnit4.class) public class WindmillStateCacheTest { @@ -153,10 +154,11 @@ public void setUp() { public void testBasic() throws Exception { WindmillStateCache.ForKeyAndFamily keyCache = cache.forComputation(COMPUTATION).forKey(COMPUTATION_KEY, 0L, 1L).forFamily(STATE_FAMILY); - assertNull(keyCache.get(StateNamespaces.global(), new TestStateTag("tag1"))); - assertNull(keyCache.get(windowNamespace(0), new TestStateTag("tag2"))); - assertNull(keyCache.get(triggerNamespace(0, 0), new TestStateTag("tag3"))); - assertNull(keyCache.get(triggerNamespace(0, 0), new TestStateTag("tag2"))); + assertEquals( + Optional.empty(), keyCache.get(StateNamespaces.global(), new TestStateTag("tag1"))); + assertEquals(Optional.empty(), keyCache.get(windowNamespace(0), new TestStateTag("tag2"))); + assertEquals(Optional.empty(), keyCache.get(triggerNamespace(0, 0), new TestStateTag("tag3"))); + assertEquals(Optional.empty(), keyCache.get(triggerNamespace(0, 0), new TestStateTag("tag2"))); assertEquals(0, cache.getWeight()); keyCache.put(StateNamespaces.global(), new TestStateTag("tag1"), new TestState("g1"), 2); @@ -177,12 +179,17 @@ public void testBasic() throws Exception { keyCache = cache.forComputation(COMPUTATION).forKey(COMPUTATION_KEY, 0L, 2L).forFamily(STATE_FAMILY); assertEquals( - new TestState("g1"), keyCache.get(StateNamespaces.global(), new TestStateTag("tag1"))); - assertEquals(new TestState("w2"), keyCache.get(windowNamespace(0), new TestStateTag("tag2"))); + Optional.of(new TestState("g1")), + keyCache.get(StateNamespaces.global(), new TestStateTag("tag1"))); + assertEquals( + Optional.of(new TestState("w2")), + keyCache.get(windowNamespace(0), new TestStateTag("tag2"))); assertEquals( - new TestState("t3"), keyCache.get(triggerNamespace(0, 0), new TestStateTag("tag3"))); + Optional.of(new TestState("t3")), + keyCache.get(triggerNamespace(0, 0), new TestStateTag("tag3"))); assertEquals( - new TestState("t2"), keyCache.get(triggerNamespace(0, 0), new TestStateTag("tag2"))); + Optional.of(new TestState("t2")), + keyCache.get(triggerNamespace(0, 0), new TestStateTag("tag2"))); } /** Verifies that max weight is set */ @@ -196,7 +203,8 @@ public void testMaxWeight() throws Exception { public void testInvalidation() throws Exception { WindmillStateCache.ForKeyAndFamily keyCache = cache.forComputation(COMPUTATION).forKey(COMPUTATION_KEY, 0L, 1L).forFamily(STATE_FAMILY); - assertNull(keyCache.get(StateNamespaces.global(), new TestStateTag("tag1"))); + assertEquals( + Optional.empty(), keyCache.get(StateNamespaces.global(), new TestStateTag("tag1"))); keyCache.put(StateNamespaces.global(), new TestStateTag("tag1"), new TestState("g1"), 2); keyCache.persist(); @@ -204,11 +212,13 @@ public void testInvalidation() throws Exception { cache.forComputation(COMPUTATION).forKey(COMPUTATION_KEY, 0L, 2L).forFamily(STATE_FAMILY); assertEquals(127, cache.getWeight()); assertEquals( - new TestState("g1"), keyCache.get(StateNamespaces.global(), new TestStateTag("tag1"))); + Optional.of(new TestState("g1")), + keyCache.get(StateNamespaces.global(), new TestStateTag("tag1"))); keyCache = cache.forComputation(COMPUTATION).forKey(COMPUTATION_KEY, 1L, 3L).forFamily(STATE_FAMILY); - assertNull(keyCache.get(StateNamespaces.global(), new TestStateTag("tag1"))); + assertEquals( + Optional.empty(), keyCache.get(StateNamespaces.global(), new TestStateTag("tag1"))); assertEquals(127, cache.getWeight()); } @@ -225,8 +235,8 @@ public void testEviction() throws Exception { // Eviction is atomic across the whole window. 
keyCache = cache.forComputation(COMPUTATION).forKey(COMPUTATION_KEY, 0L, 2L).forFamily(STATE_FAMILY); - assertNull(keyCache.get(windowNamespace(0), new TestStateTag("tag2"))); - assertNull(keyCache.get(triggerNamespace(0, 0), new TestStateTag("tag3"))); + assertEquals(Optional.empty(), keyCache.get(windowNamespace(0), new TestStateTag("tag2"))); + assertEquals(Optional.empty(), keyCache.get(triggerNamespace(0, 0), new TestStateTag("tag3"))); } /** Verifies that the cache does not vend for stale work tokens. */ @@ -239,35 +249,35 @@ public void testStaleWorkItem() throws Exception { keyCache.put(windowNamespace(0), tag, new TestState("w2"), 2); // Same cache. - assertEquals(new TestState("w2"), keyCache.get(windowNamespace(0), tag)); + assertEquals(Optional.of(new TestState("w2")), keyCache.get(windowNamespace(0), tag)); assertEquals(0, cache.getWeight()); keyCache.persist(); assertEquals(127, cache.getWeight()); - assertEquals(new TestState("w2"), keyCache.get(windowNamespace(0), tag)); + assertEquals(Optional.of(new TestState("w2")), keyCache.get(windowNamespace(0), tag)); // Previous work token. keyCache = cache.forComputation(COMPUTATION).forKey(COMPUTATION_KEY, 0L, 1L).forFamily(STATE_FAMILY); - assertNull(keyCache.get(windowNamespace(0), tag)); + assertEquals(Optional.empty(), keyCache.get(windowNamespace(0), tag)); // Retry of work token that inserted. keyCache = cache.forComputation(COMPUTATION).forKey(COMPUTATION_KEY, 0L, 2L).forFamily(STATE_FAMILY); - assertNull(keyCache.get(windowNamespace(0), tag)); + assertEquals(Optional.empty(), keyCache.get(windowNamespace(0), tag)); keyCache = cache.forComputation(COMPUTATION).forKey(COMPUTATION_KEY, 0L, 10L).forFamily(STATE_FAMILY); - assertNull(keyCache.get(windowNamespace(0), tag)); + assertEquals(Optional.empty(), keyCache.get(windowNamespace(0), tag)); keyCache.put(windowNamespace(0), tag, new TestState("w3"), 2); // Ensure that second put updated work token. keyCache = cache.forComputation(COMPUTATION).forKey(COMPUTATION_KEY, 0L, 5L).forFamily(STATE_FAMILY); - assertNull(keyCache.get(windowNamespace(0), tag)); + assertEquals(Optional.empty(), keyCache.get(windowNamespace(0), tag)); keyCache = cache.forComputation(COMPUTATION).forKey(COMPUTATION_KEY, 0L, 15L).forFamily(STATE_FAMILY); - assertNull(keyCache.get(windowNamespace(0), tag)); + assertEquals(Optional.empty(), keyCache.get(windowNamespace(0), tag)); } /** Verifies that caches are kept independently per-key. 
*/ @@ -293,7 +303,7 @@ public void testMultipleKeys() throws Exception { TestState state1 = new TestState("g1"); keyCache1.put(StateNamespaces.global(), tag, state1, 2); - assertEquals(state1, keyCache1.get(StateNamespaces.global(), tag)); + assertEquals(Optional.of(state1), keyCache1.get(StateNamespaces.global(), tag)); keyCache1.persist(); keyCache1 = @@ -301,22 +311,22 @@ public void testMultipleKeys() throws Exception { .forComputation("comp1") .forKey(computationKey("comp1", "key1", SHARDING_KEY), 0L, 1L) .forFamily(STATE_FAMILY); - assertEquals(state1, keyCache1.get(StateNamespaces.global(), tag)); - assertNull(keyCache2.get(StateNamespaces.global(), tag)); - assertNull(keyCache3.get(StateNamespaces.global(), tag)); + assertEquals(Optional.of(state1), keyCache1.get(StateNamespaces.global(), tag)); + assertEquals(Optional.empty(), keyCache2.get(StateNamespaces.global(), tag)); + assertEquals(Optional.empty(), keyCache3.get(StateNamespaces.global(), tag)); TestState state2 = new TestState("g2"); keyCache2.put(StateNamespaces.global(), tag, state2, 2); keyCache2.persist(); - assertEquals(state2, keyCache2.get(StateNamespaces.global(), tag)); + assertEquals(Optional.of(state2), keyCache2.get(StateNamespaces.global(), tag)); keyCache2 = cache .forComputation("comp1") .forKey(computationKey("comp1", "key2", SHARDING_KEY), 0L, 20L) .forFamily(STATE_FAMILY); - assertEquals(state2, keyCache2.get(StateNamespaces.global(), tag)); - assertEquals(state1, keyCache1.get(StateNamespaces.global(), tag)); - assertNull(keyCache3.get(StateNamespaces.global(), tag)); + assertEquals(Optional.of(state2), keyCache2.get(StateNamespaces.global(), tag)); + assertEquals(Optional.of(state1), keyCache1.get(StateNamespaces.global(), tag)); + assertEquals(Optional.empty(), keyCache3.get(StateNamespaces.global(), tag)); } /** Verifies that caches are kept independently per shard of key. 
*/ @@ -343,28 +353,28 @@ public void testMultipleShardsOfKey() throws Exception { TestState state1 = new TestState("g1"); key1CacheShard1.put(StateNamespaces.global(), tag, state1, 2); key1CacheShard1.persist(); - assertEquals(state1, key1CacheShard1.get(StateNamespaces.global(), tag)); + assertEquals(Optional.of(state1), key1CacheShard1.get(StateNamespaces.global(), tag)); key1CacheShard1 = cache .forComputation(COMPUTATION) .forKey(computationKey(COMPUTATION, "key1", 1), 0L, 1L) .forFamily(STATE_FAMILY); - assertEquals(state1, key1CacheShard1.get(StateNamespaces.global(), tag)); - assertNull(key1CacheShard2.get(StateNamespaces.global(), tag)); - assertNull(key2CacheShard1.get(StateNamespaces.global(), tag)); + assertEquals(Optional.of(state1), key1CacheShard1.get(StateNamespaces.global(), tag)); + assertEquals(Optional.empty(), key1CacheShard2.get(StateNamespaces.global(), tag)); + assertEquals(Optional.empty(), key2CacheShard1.get(StateNamespaces.global(), tag)); TestState state2 = new TestState("g2"); key1CacheShard2.put(StateNamespaces.global(), tag, state2, 2); - assertEquals(state2, key1CacheShard2.get(StateNamespaces.global(), tag)); + assertEquals(Optional.of(state2), key1CacheShard2.get(StateNamespaces.global(), tag)); key1CacheShard2.persist(); key1CacheShard2 = cache .forComputation(COMPUTATION) .forKey(computationKey(COMPUTATION, "key1", 2), 0L, 20L) .forFamily(STATE_FAMILY); - assertEquals(state2, key1CacheShard2.get(StateNamespaces.global(), tag)); - assertEquals(state1, key1CacheShard1.get(StateNamespaces.global(), tag)); - assertNull(key2CacheShard1.get(StateNamespaces.global(), tag)); + assertEquals(Optional.of(state2), key1CacheShard2.get(StateNamespaces.global(), tag)); + assertEquals(Optional.of(state1), key1CacheShard1.get(StateNamespaces.global(), tag)); + assertEquals(Optional.empty(), key2CacheShard1.get(StateNamespaces.global(), tag)); } /** Verifies that caches are kept independently per-family. */ @@ -379,22 +389,22 @@ public void testMultipleFamilies() throws Exception { TestState state1 = new TestState("g1"); family1.put(StateNamespaces.global(), tag, state1, 2); - assertEquals(state1, family1.get(StateNamespaces.global(), tag)); + assertEquals(Optional.of(state1), family1.get(StateNamespaces.global(), tag)); family1.persist(); TestState state2 = new TestState("g2"); family2.put(StateNamespaces.global(), tag, state2, 2); family2.persist(); - assertEquals(state2, family2.get(StateNamespaces.global(), tag)); + assertEquals(Optional.of(state2), family2.get(StateNamespaces.global(), tag)); keyCache = cache.forComputation("comp1").forKey(computationKey("comp1", "key1", SHARDING_KEY), 0L, 1L); family1 = keyCache.forFamily("family1"); family2 = keyCache.forFamily("family2"); WindmillStateCache.ForKeyAndFamily family3 = keyCache.forFamily("family3"); - assertEquals(state1, family1.get(StateNamespaces.global(), tag)); - assertEquals(state2, family2.get(StateNamespaces.global(), tag)); - assertNull(family3.get(StateNamespaces.global(), tag)); + assertEquals(Optional.of(state1), family1.get(StateNamespaces.global(), tag)); + assertEquals(Optional.of(state2), family2.get(StateNamespaces.global(), tag)); + assertEquals(Optional.empty(), family3.get(StateNamespaces.global(), tag)); } /** Verifies explicit invalidation does indeed invalidate the correct entries. 
*/ @@ -450,13 +460,17 @@ public void testExplicitInvalidation() throws Exception { .forKey(computationKey("comp1", "key1", 2), 0L, 1L) .forFamily(STATE_FAMILY); assertEquals( - new TestState("g1"), keyCache1.get(StateNamespaces.global(), new TestStateTag("tag1"))); + Optional.of(new TestState("g1")), + keyCache1.get(StateNamespaces.global(), new TestStateTag("tag1"))); assertEquals( - new TestState("g2"), keyCache2.get(StateNamespaces.global(), new TestStateTag("tag2"))); + Optional.of(new TestState("g2")), + keyCache2.get(StateNamespaces.global(), new TestStateTag("tag2"))); assertEquals( - new TestState("g3"), keyCache3.get(StateNamespaces.global(), new TestStateTag("tag3"))); + Optional.of(new TestState("g3")), + keyCache3.get(StateNamespaces.global(), new TestStateTag("tag3"))); assertEquals( - new TestState("g4"), keyCache4.get(StateNamespaces.global(), new TestStateTag("tag4"))); + Optional.of(new TestState("g4")), + keyCache4.get(StateNamespaces.global(), new TestStateTag("tag4"))); // Invalidation of key 1 shard 1 does not affect another shard of key 1 or other keys. cache.forComputation("comp1").invalidate(ByteString.copyFromUtf8("key1"), 1); @@ -466,23 +480,30 @@ public void testExplicitInvalidation() throws Exception { .forKey(computationKey("comp1", "key1", 1), 0L, 2L) .forFamily(STATE_FAMILY); - assertNull(keyCache1.get(StateNamespaces.global(), new TestStateTag("tag1"))); assertEquals( - new TestState("g2"), keyCache2.get(StateNamespaces.global(), new TestStateTag("tag2"))); + Optional.empty(), keyCache1.get(StateNamespaces.global(), new TestStateTag("tag1"))); + assertEquals( + Optional.of(new TestState("g2")), + keyCache2.get(StateNamespaces.global(), new TestStateTag("tag2"))); assertEquals( - new TestState("g3"), keyCache3.get(StateNamespaces.global(), new TestStateTag("tag3"))); + Optional.of(new TestState("g3")), + keyCache3.get(StateNamespaces.global(), new TestStateTag("tag3"))); assertEquals( - new TestState("g4"), keyCache4.get(StateNamespaces.global(), new TestStateTag("tag4"))); + Optional.of(new TestState("g4")), + keyCache4.get(StateNamespaces.global(), new TestStateTag("tag4"))); // Invalidation of an non-existing key affects nothing. 
cache.forComputation("comp1").invalidate(ByteString.copyFromUtf8("key1"), 3); assertEquals( - new TestState("g2"), keyCache2.get(StateNamespaces.global(), new TestStateTag("tag2"))); + Optional.of(new TestState("g2")), + keyCache2.get(StateNamespaces.global(), new TestStateTag("tag2"))); assertEquals( - new TestState("g3"), keyCache3.get(StateNamespaces.global(), new TestStateTag("tag3"))); + Optional.of(new TestState("g3")), + keyCache3.get(StateNamespaces.global(), new TestStateTag("tag3"))); assertEquals( - new TestState("g4"), keyCache4.get(StateNamespaces.global(), new TestStateTag("tag4"))); + Optional.of(new TestState("g4")), + keyCache4.get(StateNamespaces.global(), new TestStateTag("tag4"))); } private static class TestStateTagWithBadEquality extends TestStateTag { @@ -517,9 +538,9 @@ public void testBadCoderEquality() throws Exception { keyCache1 = cache.forComputation(COMPUTATION).forKey(COMPUTATION_KEY, 0L, 1L).forFamily(STATE_FAMILY); - assertEquals(new TestState("g1"), keyCache1.get(StateNamespaces.global(), tag)); + assertEquals(Optional.of(new TestState("g1")), keyCache1.get(StateNamespaces.global(), tag)); assertEquals( - new TestState("g1"), + Optional.of(new TestState("g1")), keyCache1.get(StateNamespaces.global(), new TestStateTagWithBadEquality("tag1"))); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WindmillStateInternalsTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateInternalsTest.java similarity index 97% rename from runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WindmillStateInternalsTest.java rename to runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateInternalsTest.java index 9f2d5eee8f873..8971c39ccaa1f 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WindmillStateInternalsTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateInternalsTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.beam.runners.dataflow.worker; +package org.apache.beam.runners.dataflow.worker.windmill.state; import static org.apache.beam.runners.dataflow.worker.DataflowMatchers.ByteStringMatcher.byteStringEq; import static org.apache.beam.sdk.testing.SystemNanoTimeSleeper.sleepMillis; @@ -56,8 +56,8 @@ import org.apache.beam.runners.core.StateTag; import org.apache.beam.runners.core.StateTags; import org.apache.beam.runners.dataflow.options.DataflowWorkerHarnessOptions; -import org.apache.beam.runners.dataflow.worker.WindmillStateInternals.IdTracker; -import org.apache.beam.runners.dataflow.worker.WindmillStateInternals.WindmillOrderedList; +import org.apache.beam.runners.dataflow.worker.WindmillComputationKey; +import org.apache.beam.runners.dataflow.worker.WindmillStateTestUtils; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.TagBag; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.TagSortedListUpdateRequest; @@ -113,24 +113,21 @@ }) public class WindmillStateInternalsTest { + public static final Range<Long> FULL_ORDERED_LIST_RANGE = + Range.closedOpen(WindmillOrderedList.MIN_TS_MICROS, WindmillOrderedList.MAX_TS_MICROS); private static final StateNamespace NAMESPACE = new StateNamespaceForTest("ns"); private static final String STATE_FAMILY = "family"; - private static final StateTag<CombiningState<Integer, int[], Integer>> COMBINING_ADDR = StateTags.combiningValueFromInputInternal("combining", VarIntCoder.of(), Sum.ofIntegers()); private static final ByteString COMBINING_KEY = key(NAMESPACE, "combining"); private final Coder<int[]> accumCoder = Sum.ofIntegers().getAccumulatorCoder(null, VarIntCoder.of()); - private long workToken = 0; - DataflowWorkerHarnessOptions options; - + private long workToken = 0; @Mock private WindmillStateReader mockReader; - private WindmillStateInternals<String> underTest; private WindmillStateInternals<String> underTestNewKey; private WindmillStateCache cache; - @Mock private Supplier<Closeable> readStateSupplier; private static ByteString key(StateNamespace namespace, String addrId) { @@ -141,6 +138,67 @@ private static ByteString systemKey(StateNamespace namespace, String addrId) { return ByteString.copyFromUtf8(namespace.stringKey() + "+s" + addrId); } + private static <T> ByteString encodeWithCoder(T key, Coder<T> coder) { + ByteStringOutputStream out = new ByteStringOutputStream(); + try { + coder.encode(key, out, Context.OUTER); + } catch (IOException e) { + throw new RuntimeException(e); + } + return out.toByteString(); + } + + // We use the structural value of the Multimap keys to differentiate between different keys. So we + // mix using the original key object and a duplicate but same key object so make sure the + // correctness. + private static byte[] dup(byte[] key) { + byte[] res = new byte[key.length]; + System.arraycopy(key, 0, res, 0, key.length); + return res; + } + + private static Map.Entry<ByteString, Iterable<Integer>> multimapEntry( + byte[] key, Integer... values) { + return new AbstractMap.SimpleEntry<>( + encodeWithCoder(key, ByteArrayCoder.of()), Arrays.asList(values)); + } + + @SafeVarargs + private static <T> List<T> weightedList(T... 
entries) { + WeightedList<T> list = new WeightedList<>(new ArrayList<>()); + for (T entry : entries) { + list.addWeighted(entry, 1); + } + return list; + } + + private static CombinableMatcher<Object> multimapEntryMatcher(byte[] key, Integer value) { + return Matchers.both(Matchers.hasProperty("key", Matchers.equalTo(key))) + .and(Matchers.hasProperty("value", Matchers.equalTo(value))); + } + + private static MultimapEntryUpdate decodeTagMultimapEntry(Windmill.TagMultimapEntry entryProto) { + try { + String key = StringUtf8Coder.of().decode(entryProto.getEntryName().newInput(), Context.OUTER); + List<Integer> values = new ArrayList<>(); + for (ByteString value : entryProto.getValuesList()) { + values.add(VarIntCoder.of().decode(value.newInput(), Context.OUTER)); + } + return new MultimapEntryUpdate(key, values, entryProto.getDeleteAll()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private static void assertTagMultimapUpdates( + Windmill.TagMultimapUpdateRequest.Builder updates, MultimapEntryUpdate... expected) { + assertThat( + updates.getUpdatesList().stream() + .map(WindmillStateInternalsTest::decodeTagMultimapEntry) + .collect(Collectors.toList()), + Matchers.containsInAnyOrder(expected)); + } + @Before public void setUp() { MockitoAnnotations.initMocks(this); @@ -203,9 +261,8 @@ private <T> void waitAndSet(final SettableFuture<T> future, final T value, final .run(); } - private WindmillStateReader.WeightedList<String> weightedList(String... elems) { - WindmillStateReader.WeightedList<String> result = - new WindmillStateReader.WeightedList<>(new ArrayList(elems.length)); + private WeightedList<String> weightedList(String... elems) { + WeightedList<String> result = new WeightedList<>(new ArrayList(elems.length)); for (String elem : elems) { result.addWeighted(elem, elem.length()); } @@ -660,25 +717,6 @@ public void testMapComplexPersist() throws Exception { assertEquals(0, commitBuilder.getValueUpdatesCount()); } - private static <T> ByteString encodeWithCoder(T key, Coder<T> coder) { - ByteStringOutputStream out = new ByteStringOutputStream(); - try { - coder.encode(key, out, Context.OUTER); - } catch (IOException e) { - throw new RuntimeException(e); - } - return out.toByteString(); - } - - // We use the structural value of the Multimap keys to differentiate between different keys. So we - // mix using the original key object and a duplicate but same key object so make sure the - // correctness. - private static byte[] dup(byte[] key) { - byte[] res = new byte[key.length]; - System.arraycopy(key, 0, res, 0, key.length); - return res; - } - @Test public void testMultimapGet() { final String tag = "multimap"; @@ -796,7 +834,7 @@ public void testMultimapRemovePersistPut() { underTest.persist(commitBuilder); assertTagMultimapUpdates( Iterables.getOnlyElement(commitBuilder.getMultimapUpdatesBuilderList()), - new MultimapEntryUpdate(key, Arrays.asList(4), true)); + new MultimapEntryUpdate(key, Collections.singletonList(4), true)); multimapState.put(key, 5); assertThat(multimapState.get(key).read(), Matchers.containsInAnyOrder(4, 5)); @@ -887,22 +925,6 @@ public void testMultimapLocalClearOverrideStorage() { assertTrue(multimapState.isEmpty().read()); } - private static Map.Entry<ByteString, Iterable<Integer>> multimapEntry( - byte[] key, Integer... values) { - return new AbstractMap.SimpleEntry<>( - encodeWithCoder(key, ByteArrayCoder.of()), Arrays.asList(values)); - } - - @SafeVarargs - private static <T> List<T> weightedList(T... 
entries) { - WindmillStateReader.WeightedList<T> list = - new WindmillStateReader.WeightedList<>(new ArrayList<>()); - for (T entry : entries) { - list.addWeighted(entry, 1); - } - return list; - } - @Test public void testMultimapBasicEntriesAndKeys() { final String tag = "multimap"; @@ -950,11 +972,6 @@ true, key(NAMESPACE, tag), STATE_FAMILY, VarIntCoder.of())) assertThat(keys, Matchers.containsInAnyOrder(key1, key2)); } - private static CombinableMatcher<Object> multimapEntryMatcher(byte[] key, Integer value) { - return Matchers.both(Matchers.hasProperty("key", Matchers.equalTo(key))) - .and(Matchers.hasProperty("value", Matchers.equalTo(value))); - } - @Test public void testMultimapEntriesAndKeysMergeLocalAdd() { final String tag = "multimap"; @@ -1389,10 +1406,10 @@ false, key(NAMESPACE, tag), STATE_FAMILY, VarIntCoder.of())) entriesFuture, () -> new Iterator<Map.Entry<ByteString, Iterable<Integer>>>() { - int returnedEntries = 0; - byte[] entryKey = new byte[10_000]; // each key is 10KB final int targetEntries = 1_000_000; // return 1 million entries, which is 10 GBs - Random rand = new Random(); + final byte[] entryKey = new byte[10_000]; // each key is 10KB + final Random rand = new Random(); + int returnedEntries = 0; @Override public boolean hasNext() { @@ -1429,10 +1446,10 @@ true, key(NAMESPACE, tag), STATE_FAMILY, VarIntCoder.of())) keysFuture, () -> new Iterator<Map.Entry<ByteString, Iterable<Integer>>>() { - int returnedEntries = 0; - byte[] entryKey = new byte[10_000]; // each key is 10KB final int targetEntries = 1_000_000; // return 1 million entries, which is 10 GBs - Random rand = new Random(); + final byte[] entryKey = new byte[10_000]; // each key is 10KB + final Random rand = new Random(); + int returnedEntries = 0; @Override public boolean hasNext() { @@ -1477,10 +1494,10 @@ false, key(NAMESPACE, tag), STATE_FAMILY, ByteArrayCoder.of())) Iterable<byte[]> values = () -> new Iterator<byte[]>() { - int returnedValues = 0; - byte[] value = new byte[10_000]; // each value is 10KB final int targetValues = 1_000_000; // return 1 million values, which is 10 GBs - Random rand = new Random(); + final byte[] value = new byte[10_000]; // each value is 10KB + final Random rand = new Random(); + int returnedValues = 0; @Override public boolean hasNext() { @@ -1497,8 +1514,8 @@ public byte[] next() { waitAndSet( entriesFuture, - Arrays.asList( - new AbstractMap.SimpleEntry<>(encodeWithCoder(key, VarIntCoder.of()), values)), + Collections.singletonList( + new SimpleEntry<>(encodeWithCoder(key, VarIntCoder.of()), values)), 200); waitAndSet(getKeyFuture, values, 200); @@ -1509,55 +1526,6 @@ public byte[] next() { assertEquals(1_000_000, Iterables.size(valueResult)); } - private static class MultimapEntryUpdate { - String key; - Iterable<Integer> values; - boolean deleteAll; - - public MultimapEntryUpdate(String key, Iterable<Integer> values, boolean deleteAll) { - this.key = key; - this.values = values; - this.deleteAll = deleteAll; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (!(o instanceof MultimapEntryUpdate)) return false; - MultimapEntryUpdate that = (MultimapEntryUpdate) o; - return deleteAll == that.deleteAll - && Objects.equals(key, that.key) - && Objects.equals(values, that.values); - } - - @Override - public int hashCode() { - return Objects.hash(key, values, deleteAll); - } - } - - private static MultimapEntryUpdate decodeTagMultimapEntry(Windmill.TagMultimapEntry entryProto) { - try { - String key = StringUtf8Coder.of().decode(entryProto.getEntryName().newInput(), Context.OUTER); - List<Integer> values = new ArrayList<>(); - for (ByteString value : 
entryProto.getValuesList()) { - values.add(VarIntCoder.of().decode(value.newInput(), Context.OUTER)); - } - return new MultimapEntryUpdate(key, values, entryProto.getDeleteAll()); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - private static void assertTagMultimapUpdates( - Windmill.TagMultimapUpdateRequest.Builder updates, MultimapEntryUpdate... expected) { - assertThat( - updates.getUpdatesList().stream() - .map(WindmillStateInternalsTest::decodeTagMultimapEntry) - .collect(Collectors.toList()), - Matchers.containsInAnyOrder(expected)); - } - @Test public void testMultimapPutAndPersist() { final String tag = "multimap"; @@ -1582,7 +1550,7 @@ public void testMultimapPutAndPersist() { assertTagMultimapUpdates( builder, new MultimapEntryUpdate(key1, Arrays.asList(1, 2), false), - new MultimapEntryUpdate(key2, Arrays.asList(2), false)); + new MultimapEntryUpdate(key2, Collections.singletonList(2), false)); } @Test @@ -1615,7 +1583,7 @@ public void testMultimapRemovePutAndPersist() { assertTagMultimapUpdates( builder, new MultimapEntryUpdate(key1, Arrays.asList(1, 2), true), - new MultimapEntryUpdate(key2, Arrays.asList(4), true)); + new MultimapEntryUpdate(key2, Collections.singletonList(4), true)); } @Test @@ -1709,7 +1677,8 @@ false, key(NAMESPACE, tag), STATE_FAMILY, VarIntCoder.of())) assertEquals(1, commitBuilder.getMultimapUpdatesCount()); Windmill.TagMultimapUpdateRequest.Builder builder = Iterables.getOnlyElement(commitBuilder.getMultimapUpdatesBuilderList()); - assertTagMultimapUpdates(builder, new MultimapEntryUpdate(key1, Arrays.asList(4), false)); + assertTagMultimapUpdates( + builder, new MultimapEntryUpdate(key1, Collections.singletonList(4), false)); } @Test @@ -1731,8 +1700,7 @@ true, key(NAMESPACE, tag), STATE_FAMILY, VarIntCoder.of())) ReadableState<Iterable<byte[]>> keysResult = multimapState.keys().readLater(); waitAndSet( keysFuture, - new WindmillStateReader.WeightedList<>( - Arrays.asList(multimapEntry(key1), multimapEntry(key2))), + new WeightedList<>(Arrays.asList(multimapEntry(key1), multimapEntry(key2))), 30); multimapState.remove(key1); @@ -1753,7 +1721,7 @@ true, key(NAMESPACE, tag), STATE_FAMILY, VarIntCoder.of())) Windmill.TagMultimapEntry entryUpdate = Iterables.getOnlyElement(builder.getUpdatesList()); byte[] decodedKey = ByteArrayCoder.of().decode(entryUpdate.getEntryName().newInput(), Context.OUTER); - assertTrue(Arrays.equals(key1, decodedKey)); + assertArrayEquals(key1, decodedKey); assertTrue(entryUpdate.getDeleteAll()); } @@ -1870,9 +1838,6 @@ true, key(NAMESPACE, tag), STATE_FAMILY, VarIntCoder.of())) underTest.persist(commitBuilder); } - public static final Range<Long> FULL_ORDERED_LIST_RANGE = - Range.closedOpen(WindmillOrderedList.MIN_TS_MICROS, WindmillOrderedList.MAX_TS_MICROS); - @Test public void testOrderedListAddBeforeRead() throws Exception { StateTag<OrderedListState<String>> addr = @@ -1897,7 +1862,7 @@ public void testOrderedListAddBeforeRead() throws Exception { TimestampedValue.of("goodbye", Instant.ofEpochMilli(50)); orderedList.add(helloValue); - waitAndSet(future, Arrays.asList(worldValue), 200); + waitAndSet(future, Collections.singletonList(worldValue), 200); assertThat(orderedList.read(), Matchers.contains(worldValue, helloValue)); orderedList.add(goodbyeValue); @@ -1940,7 +1905,7 @@ public void testOrderedListIsEmptyFalse() throws Exception { STATE_FAMILY, StringUtf8Coder.of()); - waitAndSet(future, Arrays.asList(TimestampedValue.of("world", Instant.EPOCH)), 200); + waitAndSet(future, Collections.singletonList(TimestampedValue.of("world", 
Instant.EPOCH)), 200); assertThat(result.read(), Matchers.is(false)); } @@ -2266,10 +2231,6 @@ public void testNewOrderedListNoFetch() throws Exception { Mockito.verifyZeroInteractions(mockReader); } - // test ordered list cleared before read - // test fetch + add + read - // test ids - @Test public void testBagAddBeforeRead() throws Exception { StateTag<BagState<String>> addr = StateTags.bag("bag", StringUtf8Coder.of()); @@ -2282,13 +2243,17 @@ public void testBagAddBeforeRead() throws Exception { bag.readLater(); bag.add("hello"); - waitAndSet(future, Arrays.asList("world"), 200); + waitAndSet(future, Collections.singletonList("world"), 200); assertThat(bag.read(), Matchers.containsInAnyOrder("hello", "world")); bag.add("goodbye"); assertThat(bag.read(), Matchers.containsInAnyOrder("hello", "world", "goodbye")); } + // test ordered list cleared before read + // test fetch + add + read + // test ids + @Test public void testBagClearBeforeRead() throws Exception { StateTag<BagState<String>> addr = StateTags.bag("bag", StringUtf8Coder.of()); @@ -2313,7 +2278,7 @@ public void testBagIsEmptyFalse() throws Exception { ReadableState<Boolean> result = bag.isEmpty().readLater(); Mockito.verify(mockReader).bagFuture(key(NAMESPACE, "bag"), STATE_FAMILY, StringUtf8Coder.of()); - waitAndSet(future, Arrays.asList("world"), 200); + waitAndSet(future, Collections.singletonList("world"), 200); assertThat(result.read(), Matchers.is(false)); } @@ -2328,7 +2293,7 @@ public void testBagIsEmptyTrue() throws Exception { ReadableState<Boolean> result = bag.isEmpty().readLater(); Mockito.verify(mockReader).bagFuture(key(NAMESPACE, "bag"), STATE_FAMILY, StringUtf8Coder.of()); - waitAndSet(future, Arrays.asList(), 200); + waitAndSet(future, Collections.emptyList(), 200); assertThat(result.read(), Matchers.is(true)); } @@ -2436,7 +2401,7 @@ public void testCombiningAddBeforeRead() throws Exception { assertThat(value.read(), Matchers.equalTo(29)); // That get "compressed" the combiner. 
So, the underlying future should change: - future.set(Arrays.asList(new int[] {29})); + future.set(Collections.singletonList(new int[] {29})); value.add(2); assertThat(value.read(), Matchers.equalTo(31)); @@ -2480,7 +2445,7 @@ public void testCombiningIsEmpty() throws Exception { .bagFuture(byteString.capture(), eq(STATE_FAMILY), Mockito.<Coder<int[]>>any()); assertThat(byteString.getValue(), byteStringEq(COMBINING_KEY)); - waitAndSet(future, Arrays.asList(new int[] {29}), 200); + waitAndSet(future, Collections.singletonList(new int[] {29}), 200); assertThat(result.read(), Matchers.is(false)); } @@ -2527,12 +2492,10 @@ public void testCombiningAddPersistWithCompact() throws Exception { Mockito.when( mockReader.bagFuture( - org.mockito.Matchers.<ByteString>any(), - org.mockito.Matchers.<String>any(), + org.mockito.Matchers.<ByteString>any(), + org.mockito.Matchers.<String>any(), org.mockito.Matchers.<Coder<int[]>>any())) - .thenReturn( - Futures.<List<int[]>>immediateFuture( - ImmutableList.of(new int[] {40}, new int[] {60}))); + .thenReturn(Futures.immediateFuture(ImmutableList.of(new int[] {40}, new int[] {60}))); GroupingState<Integer, Integer> value = underTest.state(NAMESPACE, COMBINING_ADDR); @@ -2717,7 +2680,7 @@ public void testWatermarkPersistLatestEmpty() throws Exception { hold.add(new Instant(2000)); when(mockReader.watermarkFuture(key(NAMESPACE, "watermark"), STATE_FAMILY)) - .thenReturn(Futures.<Instant>immediateFuture(null)); + .thenReturn(Futures.immediateFuture(null)); Windmill.WorkItemCommitRequest.Builder commitBuilder = Windmill.WorkItemCommitRequest.newBuilder(); @@ -2743,7 +2706,7 @@ public void testWatermarkPersistLatestWindmillWins() throws Exception { hold.add(new Instant(2000)); when(mockReader.watermarkFuture(key(NAMESPACE, "watermark"), STATE_FAMILY)) - .thenReturn(Futures.<Instant>immediateFuture(new Instant(4000))); + .thenReturn(Futures.immediateFuture(new Instant(4000))); Windmill.WorkItemCommitRequest.Builder commitBuilder = Windmill.WorkItemCommitRequest.newBuilder(); @@ -2769,7 +2732,7 @@ public void testWatermarkPersistLatestLocalAdditionsWin() throws Exception { hold.add(new Instant(2000)); when(mockReader.watermarkFuture(key(NAMESPACE, "watermark"), STATE_FAMILY)) - .thenReturn(Futures.<Instant>immediateFuture(new Instant(500))); + .thenReturn(Futures.immediateFuture(new Instant(500))); Windmill.WorkItemCommitRequest.Builder commitBuilder = Windmill.WorkItemCommitRequest.newBuilder(); @@ -2880,7 +2843,7 @@ public void testValueClearBeforeRead() throws Exception { value.clear(); - assertEquals(null, value.read()); + assertNull(value.read()); Mockito.verifyNoMoreInteractions(mockReader); } @@ -2956,7 +2919,7 @@ public void testNewValueNoFetch() throws Exception { StateTag<ValueState<String>> addr = StateTags.value("value", StringUtf8Coder.of()); ValueState<String> value = underTestNewKey.state(NAMESPACE, addr); - assertEquals(null, value.read()); + assertNull(value.read()); // Shouldn't need to read from windmill for this. 
Mockito.verifyZeroInteractions(mockReader); @@ -2984,7 +2947,7 @@ public void testCachedValue() throws Exception { resetUnderTest(); value = underTest.state(NAMESPACE, addr); - assertEquals(null, value.read()); + assertNull(value.read()); underTest.persist(Windmill.WorkItemCommitRequest.newBuilder()); Mockito.verifyNoMoreInteractions(mockReader); @@ -3087,7 +3050,7 @@ public void testCachedWatermarkHold() throws Exception { resetUnderTest(); hold = underTest.state(NAMESPACE, addr); - assertEquals(null, hold.read()); + assertNull(hold.read()); underTest.persist(Windmill.WorkItemCommitRequest.newBuilder()); Mockito.verify(mockReader, times(2)).watermarkFuture(key(NAMESPACE, "watermark"), STATE_FAMILY); @@ -3109,7 +3072,7 @@ public void testCachedCombining() throws Exception { value.readLater(); value.add(1); - waitAndSet(future, Arrays.asList(new int[] {2}), 200); + waitAndSet(future, Collections.singletonList(new int[] {2}), 200); assertThat(value.read(), Matchers.equalTo(3)); underTest.persist(Windmill.WorkItemCommitRequest.newBuilder()); @@ -3149,4 +3112,31 @@ private void disableCompactOnWrite() { private void forceCompactOnWrite() { WindmillStateInternals.COMPACT_NOW.set(() -> true); } + + private static class MultimapEntryUpdate { + String key; + Iterable<Integer> values; + boolean deleteAll; + + public MultimapEntryUpdate(String key, Iterable<Integer> values, boolean deleteAll) { + this.key = key; + this.values = values; + this.deleteAll = deleteAll; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof MultimapEntryUpdate)) return false; + MultimapEntryUpdate that = (MultimapEntryUpdate) o; + return deleteAll == that.deleteAll + && Objects.equals(key, that.key) + && Objects.equals(values, that.values); + } + + @Override + public int hashCode() { + return Objects.hash(key, values, deleteAll); + } + } } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WindmillStateReaderTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateReaderTest.java similarity index 96% rename from runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WindmillStateReaderTest.java rename to runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateReaderTest.java index 1981a63c0ed56..430e31ee04ff1 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/WindmillStateReaderTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateReaderTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.beam.runners.dataflow.worker; +package org.apache.beam.runners.dataflow.worker.windmill.state; import static org.hamcrest.MatcherAssert.assertThat; import static org.junit.Assert.assertEquals; @@ -33,6 +33,10 @@ import java.util.List; import java.util.Map; import java.util.concurrent.Future; +import org.apache.beam.runners.dataflow.worker.KeyTokenInvalidException; +import org.apache.beam.runners.dataflow.worker.MetricTrackingWindmillServerStub; +import org.apache.beam.runners.dataflow.worker.WindmillStateTestUtils; +import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.KeyedGetDataRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.SortedListEntry; @@ -83,6 +87,8 @@ public class WindmillStateReaderTest { private static final ByteString STATE_KEY_2 = ByteString.copyFromUtf8("key2"); private static final String STATE_FAMILY = "family"; + private static final String STATE_FAMILY2 = "family2"; + private static void assertNoReader(Object obj) throws Exception { WindmillStateTestUtils.assertNoReference(obj, WindmillStateReader.class); } @@ -989,15 +995,19 @@ public void testReadSortedList() throws Exception { public void testReadSortedListRanges() throws Exception { Future<Iterable<TimestampedValue<Integer>>> future1 = underTest.orderedListFuture(Range.closedOpen(0L, 5L), STATE_KEY_1, STATE_FAMILY, INT_CODER); + // Should be put into a subsequent batch as it has the same key and state family. Future<Iterable<TimestampedValue<Integer>>> future2 = underTest.orderedListFuture(Range.closedOpen(5L, 6L), STATE_KEY_1, STATE_FAMILY, INT_CODER); Future<Iterable<TimestampedValue<Integer>>> future3 = underTest.orderedListFuture( - Range.closedOpen(6L, 10L), STATE_KEY_1, STATE_FAMILY, INT_CODER); + Range.closedOpen(6L, 10L), STATE_KEY_2, STATE_FAMILY, INT_CODER); + Future<Iterable<TimestampedValue<Integer>>> future4 = + underTest.orderedListFuture( + Range.closedOpen(11L, 12L), STATE_KEY_2, STATE_FAMILY2, INT_CODER); Mockito.verifyNoMoreInteractions(mockWindmill); // Fetch the entire list.
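// A sketch of the batching rule the expectations below encode (ranges taken from the
// fetches above): two ordered-list fetches that share both tag and state family cannot
// ride in the same KeyedGetDataRequest, so the reader splits them across batches:
//   batch 1: STATE_KEY_1/STATE_FAMILY [0, 5), STATE_KEY_2/STATE_FAMILY [6, 10), STATE_KEY_2/STATE_FAMILY2 [11, 12)
//   batch 2: STATE_KEY_1/STATE_FAMILY [5, 6)   <- same tag and family as the first fetch
// expectedRequest1 below covers batch 1; expectedRequest2 covers batch 2.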
- Windmill.KeyedGetDataRequest.Builder expectedRequest = + Windmill.KeyedGetDataRequest.Builder expectedRequest1 = Windmill.KeyedGetDataRequest.newBuilder() .setKey(DATA_KEY) .setShardingKey(SHARDING_KEY) @@ -1011,18 +1021,31 @@ public void testReadSortedListRanges() throws Exception { .setFetchMaxBytes(WindmillStateReader.MAX_ORDERED_LIST_BYTES)) .addSortedListsToFetch( Windmill.TagSortedListFetchRequest.newBuilder() - .setTag(STATE_KEY_1) + .setTag(STATE_KEY_2) .setStateFamily(STATE_FAMILY) - .addFetchRanges(SortedListRange.newBuilder().setStart(5).setLimit(6)) + .addFetchRanges(SortedListRange.newBuilder().setStart(6).setLimit(10)) .setFetchMaxBytes(WindmillStateReader.MAX_ORDERED_LIST_BYTES)) + .addSortedListsToFetch( + Windmill.TagSortedListFetchRequest.newBuilder() + .setTag(STATE_KEY_2) + .setStateFamily(STATE_FAMILY2) + .addFetchRanges(SortedListRange.newBuilder().setStart(11).setLimit(12)) + .setFetchMaxBytes(WindmillStateReader.MAX_ORDERED_LIST_BYTES)); + + Windmill.KeyedGetDataRequest.Builder expectedRequest2 = + Windmill.KeyedGetDataRequest.newBuilder() + .setKey(DATA_KEY) + .setShardingKey(SHARDING_KEY) + .setWorkToken(WORK_TOKEN) + .setMaxBytes(WindmillStateReader.MAX_KEY_BYTES) .addSortedListsToFetch( Windmill.TagSortedListFetchRequest.newBuilder() .setTag(STATE_KEY_1) .setStateFamily(STATE_FAMILY) - .addFetchRanges(SortedListRange.newBuilder().setStart(6).setLimit(10)) + .addFetchRanges(SortedListRange.newBuilder().setStart(5).setLimit(6)) .setFetchMaxBytes(WindmillStateReader.MAX_ORDERED_LIST_BYTES)); - Windmill.KeyedGetDataResponse.Builder response = + Windmill.KeyedGetDataResponse.Builder response1 = Windmill.KeyedGetDataResponse.newBuilder() .setKey(DATA_KEY) .addTagSortedLists( @@ -1034,41 +1057,41 @@ public void testReadSortedListRanges() throws Exception { .addFetchRanges(SortedListRange.newBuilder().setStart(0).setLimit(5))) .addTagSortedLists( Windmill.TagSortedListFetchResponse.newBuilder() - .setTag(STATE_KEY_1) + .setTag(STATE_KEY_2) .setStateFamily(STATE_FAMILY) .addEntries( - SortedListEntry.newBuilder().setValue(intData(6)).setSortKey(6000).setId(5)) - .addEntries( - SortedListEntry.newBuilder().setValue(intData(7)).setSortKey(7000).setId(7)) - .addFetchRanges(SortedListRange.newBuilder().setStart(5).setLimit(6))) + SortedListEntry.newBuilder().setValue(intData(8)).setSortKey(8000).setId(8)) + .addFetchRanges(SortedListRange.newBuilder().setStart(6).setLimit(10))) + .addTagSortedLists( + Windmill.TagSortedListFetchResponse.newBuilder() + .setTag(STATE_KEY_2) + .setStateFamily(STATE_FAMILY2) + .addFetchRanges(SortedListRange.newBuilder().setStart(11).setLimit(12))); + + Windmill.KeyedGetDataResponse.Builder response2 = + Windmill.KeyedGetDataResponse.newBuilder() + .setKey(DATA_KEY) .addTagSortedLists( Windmill.TagSortedListFetchResponse.newBuilder() .setTag(STATE_KEY_1) .setStateFamily(STATE_FAMILY) .addEntries( - SortedListEntry.newBuilder().setValue(intData(8)).setSortKey(8000).setId(8)) - .addFetchRanges(SortedListRange.newBuilder().setStart(6).setLimit(10))); - - Mockito.when(mockWindmill.getStateData(COMPUTATION, expectedRequest.build())) - .thenReturn(response.build()); - - { - Iterable<TimestampedValue<Integer>> results = future1.get(); - Mockito.verify(mockWindmill).getStateData(COMPUTATION, expectedRequest.build()); - for
(TimestampedValue<Integer> unused : results) { - // Iterate over the results to force loading all the pages. - } - Mockito.verifyNoMoreInteractions(mockWindmill); - assertThat(results, Matchers.contains(TimestampedValue.of(5, Instant.ofEpochMilli(5)))); - assertNoReader(future1); - } + Mockito.when(mockWindmill.getStateData(COMPUTATION, expectedRequest1.build())) + .thenReturn(response1.build()); + Mockito.when(mockWindmill.getStateData(COMPUTATION, expectedRequest2.build())) + .thenReturn(response2.build()); + // Trigger the batched reads. By fetching future2, which is not part of the first batch, we + // ensure that all batches are fetched. { Iterable<TimestampedValue<Integer>> results = future2.get(); - Mockito.verify(mockWindmill).getStateData(COMPUTATION, expectedRequest.build()); - for (TimestampedValue<Integer> unused : results) { - // Iterate over the results to force loading all the pages. - } + Mockito.verify(mockWindmill).getStateData(COMPUTATION, expectedRequest1.build()); + Mockito.verify(mockWindmill).getStateData(COMPUTATION, expectedRequest2.build()); Mockito.verifyNoMoreInteractions(mockWindmill); assertThat( results, @@ -1078,16 +1101,23 @@ public void testReadSortedListRanges() throws Exception { assertNoReader(future2); } + { + Iterable<TimestampedValue<Integer>> results = future1.get(); + assertThat(results, Matchers.contains(TimestampedValue.of(5, Instant.ofEpochMilli(5)))); + assertNoReader(future1); + } + { Iterable<TimestampedValue<Integer>> results = future3.get(); - Mockito.verify(mockWindmill).getStateData(COMPUTATION, expectedRequest.build()); - for (TimestampedValue<Integer> unused : results) { - // Iterate over the results to force loading all the pages. - } - Mockito.verifyNoMoreInteractions(mockWindmill); assertThat(results, Matchers.contains(TimestampedValue.of(8, Instant.ofEpochMilli(8)))); assertNoReader(future3); } + + { + Iterable<TimestampedValue<Integer>> results = future4.get(); + assertThat(results, Matchers.emptyIterable()); + assertNoReader(future4); + } } @Test diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudgetTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudgetTest.java new file mode 100644 index 0000000000000..76d5083978508 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudgetTest.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.work.budget; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class GetWorkBudgetTest { + + @Test + public void testCreateWithNoBudget() { + GetWorkBudget getWorkBudget = GetWorkBudget.noBudget(); + assertEquals(0, getWorkBudget.items()); + assertEquals(0, getWorkBudget.bytes()); + } + + @Test + public void testBuild_itemsAndBytesNeverBelowZero() { + GetWorkBudget getWorkBudget = GetWorkBudget.builder().setItems(-10).setBytes(-10).build(); + assertEquals(0, getWorkBudget.items()); + assertEquals(0, getWorkBudget.bytes()); + } + + @Test + public void testAdd_doesNotAllowNegativeParameters() { + GetWorkBudget getWorkBudget = GetWorkBudget.builder().setItems(1).setBytes(1).build(); + assertThrows(IllegalArgumentException.class, () -> getWorkBudget.add(-1, -1)); + } + + @Test + public void testSubtract_itemsAndBytesNeverBelowZero() { + GetWorkBudget getWorkBudget = GetWorkBudget.builder().setItems(1).setBytes(1).build(); + GetWorkBudget subtracted = getWorkBudget.subtract(10, 10); + assertEquals(0, subtracted.items()); + assertEquals(0, subtracted.bytes()); + } + + @Test + public void testSubtractGetWorkBudget_itemsAndBytesNeverBelowZero() { + GetWorkBudget getWorkBudget = GetWorkBudget.builder().setItems(1).setBytes(1).build(); + GetWorkBudget subtracted = + getWorkBudget.subtract(GetWorkBudget.builder().setItems(10).setBytes(10).build()); + assertEquals(0, subtracted.items()); + assertEquals(0, subtracted.bytes()); + } + + @Test + public void testSubtract_doesNotAllowNegativeParameters() { + GetWorkBudget getWorkBudget = GetWorkBudget.builder().setItems(1).setBytes(1).build(); + assertThrows(IllegalArgumentException.class, () -> getWorkBudget.subtract(-1, -1)); + } +} diff --git a/runners/spark/3/build.gradle b/runners/spark/3/build.gradle index 5380146044d5a..5103805db347a 100644 --- a/runners/spark/3/build.gradle +++ b/runners/spark/3/build.gradle @@ -34,6 +34,7 @@ createJavaExamplesArchetypeValidationTask(type: 'Quickstart', runner: 'Spark') // Additional supported Spark versions (used in compatibility tests) def sparkVersions = [ + "350": "3.5.0", "341": "3.4.1", "340": "3.4.0", "332": "3.3.2", diff --git a/runners/spark/spark_runner.gradle b/runners/spark/spark_runner.gradle index d0dbe453ddfb0..74013de6107d6 100644 --- a/runners/spark/spark_runner.gradle +++ b/runners/spark/spark_runner.gradle @@ -63,8 +63,9 @@ def sparkTestProperties(overrides = [:]) { def sparkTestJvmArgs() { - // run tests with Java 17 using -PcompileAndRunTestsWithJava17 -Pjava17Home=??? - if (project.hasProperty("compileAndRunTestsWithJava17")) { + // run tests with Java 17 using -PtestJavaVersion=17 -Pjava17Home=??? 
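+ // e.g. (hypothetical task path and JDK location, for illustration only):
+ //   ./gradlew :runners:spark:3:test -PtestJavaVersion=17 -Pjava17Home=/usr/lib/jvm/java-17
+ // the same switch accepts 21, presumably with the matching -Pjava21Home property set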
+ if (project.hasProperty('testJavaVersion') && + project.getProperty('testJavaVersion') in ['17', '21']) { return [ "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED", // add-opens below required for Kryo FieldSerializer / SparkRunnerKryoRegistratorTest diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SparkCombineFn.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SparkCombineFn.java index ddf4b12bae130..1075ae0d2a7d9 100644 --- a/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SparkCombineFn.java +++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SparkCombineFn.java @@ -41,7 +41,6 @@ import org.apache.beam.runners.spark.util.SideInputBroadcast; import org.apache.beam.runners.spark.util.SparkSideInputReader; import org.apache.beam.sdk.coders.Coder; -import org.apache.beam.sdk.coders.CoderException; import org.apache.beam.sdk.coders.IterableCoder; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.transforms.CombineWithContext; @@ -101,7 +100,7 @@ void add(WindowedValue<ValueT> value, SparkCombineFn<?, ValueT, AccumT, ?> context) throws Exception; /** - * Merge other acccumulator into this one. + * Merge other accumulator into this one. * * @param other the other accumulator to merge */ @@ -173,7 +172,7 @@ static SingleWindowWindowedAccumulator<InputT, ValueT, AccumT> create( return new SingleWindowWindowedAccumulator<>(toValue); } - static <InputT, ValueT, AccumT> WindowedAccumulator<InputT, ValueT, AccumT, ?> create( + static <InputT, ValueT, AccumT> SingleWindowWindowedAccumulator<InputT, ValueT, AccumT> create( Function<InputT, ValueT> toValue, WindowedValue<AccumT> accumulator) { return new SingleWindowWindowedAccumulator<>(toValue, accumulator); } @@ -191,10 +190,7 @@ static SingleWindowWindowedAccumulator<InputT, ValueT, AccumT> create( SingleWindowWindowedAccumulator(Function<InputT, ValueT> toValue, WindowedValue<AccumT> accumulator) { this.toValue = toValue; this.windowAccumulator = accumulator.getValue(); - this.accTimestamp = - accumulator.getTimestamp().equals(BoundedWindow.TIMESTAMP_MIN_VALUE) - ? null - : accumulator.getTimestamp(); + this.accTimestamp = accumulator.getTimestamp(); this.accWindow = getWindow(accumulator); } @@ -247,7 +243,7 @@ public void merge( @Override public Collection<WindowedValue<AccumT>> extractOutput() { if (windowAccumulator != null) { - return Arrays.asList( + return Collections.singletonList( WindowedValue.of( windowAccumulator, accTimestamp, accWindow, PaneInfo.ON_TIME_AND_ONLY_FIRING)); } @@ -516,7 +512,8 @@ static class WindowedAccumulatorCoder<InputT, ValueT, AccumT, W extends BoundedWindow> @Override public void encode(WindowedAccumulator<InputT, ValueT, AccumT, W> value, OutputStream outStream) - throws CoderException, IOException { + throws IOException { + if (type.isMapBased()) { wrap.encode(((MapBasedWindowedAccumulator<InputT, ValueT, AccumT, W>) value).map.values(), outStream); } else { @@ -536,7 +533,8 @@ public void encode(WindowedAccumulator<InputT, ValueT, AccumT, W> value, OutputS @Override public WindowedAccumulator<InputT, ValueT, AccumT, W> decode(InputStream inStream) - throws CoderException, IOException { + throws IOException { + if (type.isMapBased()) { return WindowedAccumulator.create(toValue, type, wrap.decode(inStream), windowComparator); } diff --git a/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/SparkCombineFnTest.java b/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/SparkCombineFnTest.java index 295b7ef2b948d..9cb4b44c897c8 100644 --- a/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/SparkCombineFnTest.java +++ b/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/SparkCombineFnTest.java @@ -36,6 +36,7 @@ import org.apache.beam.sdk.transforms.windowing.PaneInfo; import org.apache.beam.sdk.transforms.windowing.Sessions; import org.apache.beam.sdk.transforms.windowing.SlidingWindows; +import org.apache.beam.sdk.transforms.windowing.TimestampCombiner; import org.apache.beam.sdk.transforms.windowing.WindowFn; import org.apache.beam.sdk.util.CombineFnUtil; import org.apache.beam.sdk.util.WindowedValue; @@ -219,6 +220,34 @@ public void testSlidingCombineFnExplode() throws Exception { result); } + @Test + public void testGlobalWindowMergeAccumulatorsWithEarliestCombiner() throws Exception { + SparkCombineFn<KV<String, Integer>, Integer, Long, Long> sparkCombineFn = + SparkCombineFn.keyed( + combineFn, + opts, + Collections.emptyMap(), + WindowingStrategy.globalDefault().withTimestampCombiner(TimestampCombiner.EARLIEST)); + + Instant ts = BoundedWindow.TIMESTAMP_MIN_VALUE; + WindowedValue<KV<String, Integer>> first = input("key", 1, ts); + WindowedValue<KV<String, Integer>> second = input("key", 2, ts); + WindowedValue<KV<String, Integer>> third = input("key", 3, ts); + WindowedValue<Long> accumulator = WindowedValue.valueInGlobalWindow(0L); + SparkCombineFn.SingleWindowWindowedAccumulator<KV<String, Integer>, Integer, Long> acc1 = + SparkCombineFn.SingleWindowWindowedAccumulator.create(KV::getValue, accumulator); + SparkCombineFn.SingleWindowWindowedAccumulator<KV<String, Integer>, Integer, Long> acc2 = + SparkCombineFn.SingleWindowWindowedAccumulator.create(KV::getValue, accumulator); + SparkCombineFn.SingleWindowWindowedAccumulator<KV<String, Integer>, Integer, Long> acc3 = + SparkCombineFn.SingleWindowWindowedAccumulator.create(KV::getValue, accumulator); + acc1.add(first, sparkCombineFn); + acc2.add(second, sparkCombineFn); + acc3.merge(acc1, sparkCombineFn); + acc3.merge(acc2, sparkCombineFn); + acc3.add(third, sparkCombineFn); + assertEquals(6, (long) Iterables.getOnlyElement(sparkCombineFn.extractOutput(acc3)).getValue()); + } + private static Combine.CombineFn<Integer, Long, Long> getSumFn() { return new Combine.CombineFn<Integer, Long, Long>() { diff --git a/sdks/go.mod b/sdks/go.mod index e17427227eba2..e0f079eb1875f 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -23,43 +23,45 @@ module
github.com/apache/beam/sdks/v2 go 1.20 require ( - cloud.google.com/go/bigquery v1.55.0 + cloud.google.com/go/bigquery v1.57.1 cloud.google.com/go/bigtable v1.20.0 - cloud.google.com/go/datastore v1.14.0 - cloud.google.com/go/profiler v0.3.1 + cloud.google.com/go/datastore v1.15.0 + cloud.google.com/go/profiler v0.4.0 cloud.google.com/go/pubsub v1.33.0 - cloud.google.com/go/spanner v1.49.0 - cloud.google.com/go/storage v1.33.0 - github.com/aws/aws-sdk-go-v2 v1.21.0 - github.com/aws/aws-sdk-go-v2/config v1.18.43 - github.com/aws/aws-sdk-go-v2/credentials v1.13.41 - github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.87 - github.com/aws/aws-sdk-go-v2/service/s3 v1.40.0 - github.com/aws/smithy-go v1.14.2 + cloud.google.com/go/spanner v1.51.0 + cloud.google.com/go/storage v1.34.1 + github.com/aws/aws-sdk-go-v2 v1.22.1 + github.com/aws/aws-sdk-go-v2/config v1.22.0 + github.com/aws/aws-sdk-go-v2/credentials v1.15.1 + github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.13.1 + github.com/aws/aws-sdk-go-v2/service/s3 v1.42.0 + github.com/aws/smithy-go v1.16.0 github.com/docker/go-connections v0.4.0 github.com/dustin/go-humanize v1.0.1 github.com/go-sql-driver/mysql v1.7.1 github.com/golang/protobuf v1.5.3 // TODO(danoliveira): Fully replace this with google.golang.org/protobuf - github.com/google/go-cmp v0.5.9 - github.com/google/uuid v1.3.1 + github.com/google/go-cmp v0.6.0 + github.com/google/uuid v1.4.0 github.com/johannesboyne/gofakes3 v0.0.0-20221110173912-32fb85c5aed6 github.com/lib/pq v1.10.9 github.com/linkedin/goavro/v2 v2.12.0 - github.com/proullon/ramsql v0.1.2 - github.com/spf13/cobra v1.7.0 - github.com/testcontainers/testcontainers-go v0.24.1 + github.com/nats-io/nats-server/v2 v2.10.4 + github.com/nats-io/nats.go v1.31.0 + github.com/proullon/ramsql v0.1.3 + github.com/spf13/cobra v1.8.0 + github.com/testcontainers/testcontainers-go v0.25.0 github.com/tetratelabs/wazero v1.5.0 github.com/xitongsys/parquet-go v1.6.2 github.com/xitongsys/parquet-go-source v0.0.0-20220315005136-aec0fe3e777c go.mongodb.org/mongo-driver v1.12.1 - golang.org/x/net v0.15.0 - golang.org/x/oauth2 v0.12.0 - golang.org/x/sync v0.3.0 - golang.org/x/sys v0.12.0 + golang.org/x/net v0.17.0 + golang.org/x/oauth2 v0.13.0 + golang.org/x/sync v0.5.0 + golang.org/x/sys v0.14.0 golang.org/x/text v0.13.0 - google.golang.org/api v0.143.0 - google.golang.org/genproto v0.0.0-20230913181813-007df8e322eb - google.golang.org/grpc v1.58.2 + google.golang.org/api v0.150.0 + google.golang.org/genproto v0.0.0-20231016165738-49dd2c1f3d0b + google.golang.org/grpc v1.59.0 google.golang.org/protobuf v1.31.0 gopkg.in/retry.v1 v1.0.3 gopkg.in/yaml.v2 v2.4.0 @@ -67,8 +69,8 @@ require ( ) require ( - github.com/fsouza/fake-gcs-server v1.47.4 - golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 + github.com/fsouza/fake-gcs-server v1.47.6 + golang.org/x/exp v0.0.0-20230807204917-050eac23e9de ) require ( @@ -76,20 +78,25 @@ require ( github.com/Microsoft/hcsshim v0.11.0 // indirect github.com/go-ole/go-ole v1.2.6 // indirect github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect + github.com/minio/highwayhash v1.0.2 // indirect + github.com/nats-io/jwt/v2 v2.5.2 // indirect + github.com/nats-io/nkeys v0.4.6 // indirect + github.com/nats-io/nuid v1.0.1 // indirect github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect - github.com/shirou/gopsutil/v3 v3.23.7 // indirect + github.com/shirou/gopsutil/v3 v3.23.8 // indirect github.com/shoenig/go-m1cpu v0.1.6 // indirect - 
github.com/tklauser/go-sysconf v0.3.11 // indirect - github.com/tklauser/numcpus v0.6.0 // indirect + github.com/tklauser/go-sysconf v0.3.12 // indirect + github.com/tklauser/numcpus v0.6.1 // indirect github.com/yusufpapurcu/wmi v1.2.3 // indirect + golang.org/x/time v0.3.0 // indirect ) require ( - cloud.google.com/go v0.110.7 // indirect - cloud.google.com/go/compute v1.23.0 // indirect + cloud.google.com/go v0.110.8 // indirect + cloud.google.com/go/compute v1.23.1 // indirect cloud.google.com/go/compute/metadata v0.2.3 // indirect - cloud.google.com/go/iam v1.1.1 // indirect - cloud.google.com/go/longrunning v0.5.1 // indirect + cloud.google.com/go/iam v1.1.3 // indirect + cloud.google.com/go/longrunning v0.5.2 // indirect github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect github.com/Microsoft/go-winio v0.6.1 // indirect github.com/andybalholm/brotli v1.0.4 // indirect @@ -97,19 +104,19 @@ require ( github.com/apache/arrow/go/v12 v12.0.0 // indirect github.com/apache/thrift v0.16.0 // indirect github.com/aws/aws-sdk-go v1.34.0 // indirect - github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.4.13 // indirect - github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.13.11 // indirect - github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.41 // indirect - github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.35 // indirect - github.com/aws/aws-sdk-go-v2/internal/ini v1.3.43 // indirect - github.com/aws/aws-sdk-go-v2/internal/v4a v1.1.4 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.9.14 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.1.36 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.35 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.15.4 // indirect - github.com/aws/aws-sdk-go-v2/service/sso v1.15.0 // indirect - github.com/aws/aws-sdk-go-v2/service/ssooidc v1.17.1 // indirect - github.com/aws/aws-sdk-go-v2/service/sts v1.23.0 // indirect + github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.5.0 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.14.2 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.2.1 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.5.1 // indirect + github.com/aws/aws-sdk-go-v2/internal/ini v1.5.0 // indirect + github.com/aws/aws-sdk-go-v2/internal/v4a v1.2.1 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.10.0 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.2.1 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.10.1 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.16.1 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.17.0 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.19.0 // indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.25.0 // indirect github.com/cenkalti/backoff/v4 v4.2.1 // indirect github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect @@ -118,7 +125,7 @@ require ( github.com/containerd/containerd v1.7.6 // indirect github.com/cpuguy83/dockercfg v0.3.1 // indirect github.com/docker/distribution v2.8.2+incompatible // indirect - github.com/docker/docker v24.0.6+incompatible // but required to resolve issue docker has with go1.20 + github.com/docker/docker v24.0.7+incompatible // but required to resolve issue docker has with go1.20 github.com/docker/go-units v0.5.0 // 
indirect github.com/envoyproxy/go-control-plane v0.11.1 // indirect github.com/envoyproxy/protoc-gen-validate v1.0.2 // indirect @@ -128,17 +135,17 @@ require ( github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/snappy v0.0.4 // indirect github.com/google/flatbuffers v2.0.8+incompatible // indirect - github.com/google/pprof v0.0.0-20221103000818-d260c55eee4c // indirect + github.com/google/pprof v0.0.0-20230602150820-91b7bce49751 // indirect github.com/google/renameio/v2 v2.0.0 // indirect github.com/google/s2a-go v0.1.7 // indirect - github.com/googleapis/enterprise-certificate-proxy v0.3.1 // indirect + github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect github.com/googleapis/gax-go/v2 v2.12.0 // indirect github.com/gorilla/handlers v1.5.1 // indirect github.com/gorilla/mux v1.8.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/jmespath/go-jmespath v0.4.0 // indirect github.com/klauspost/asmfmt v1.3.2 // indirect - github.com/klauspost/compress v1.16.7 // indirect + github.com/klauspost/compress v1.17.2 // indirect github.com/klauspost/cpuid/v2 v2.2.5 // indirect github.com/kr/text v0.2.0 // indirect github.com/magiconair/properties v1.8.7 // indirect @@ -165,11 +172,11 @@ require ( github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect github.com/zeebo/xxh3 v1.0.2 // indirect go.opencensus.io v0.24.0 // indirect - golang.org/x/crypto v0.13.0 // indirect + golang.org/x/crypto v0.14.0 // indirect golang.org/x/mod v0.11.0 // indirect golang.org/x/tools v0.10.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/appengine v1.6.7 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20230913181813-007df8e322eb // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20230920204549-e6e6cdab5c13 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20231016165738-49dd2c1f3d0b // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20231030173426-d783a09b4405 // indirect ) diff --git a/sdks/go.sum b/sdks/go.sum index 71c1c4545c898..94fe1436ab858 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -8,43 +8,43 @@ cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To= cloud.google.com/go v0.52.0/go.mod h1:pXajvRH/6o3+F9jDHZWQ5PbGhn+o8w9qiu/CffaVdO4= cloud.google.com/go v0.53.0/go.mod h1:fp/UouUEsRkN6ryDKNW/Upv/JBKnv6WDthjR6+vze6M= -cloud.google.com/go v0.110.7 h1:rJyC7nWRg2jWGZ4wSJ5nY65GTdYJkg0cd/uXb+ACI6o= -cloud.google.com/go v0.110.7/go.mod h1:+EYjdK8e5RME/VY/qLCAtuyALQ9q67dvuum8i+H5xsI= +cloud.google.com/go v0.110.8 h1:tyNdfIxjzaWctIiLYOTalaLKZ17SI44SKFW26QbOhME= +cloud.google.com/go v0.110.8/go.mod h1:Iz8AkXJf1qmxC3Oxoep8R1T36w8B92yU29PcBhHO5fk= cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= -cloud.google.com/go/bigquery v1.55.0 h1:hs44Xxov3XLWQiCx2J8lK5U/ihLqnpm4RVVl5fdtLLI= -cloud.google.com/go/bigquery v1.55.0/go.mod h1:9Y5I3PN9kQWuid6183JFhOGOW3GcirA5LpsKCUn+2ec= +cloud.google.com/go/bigquery v1.57.1 h1:FiULdbbzUxWD0Y4ZGPSVCDLvqRSyCIO6zKV7E2nf5uA= +cloud.google.com/go/bigquery v1.57.1/go.mod h1:iYzC0tGVWt1jqSzBHqCr3lrRn0u13E8e+AqowBsDgug= cloud.google.com/go/bigtable v1.20.0 
h1:NqZC/WcesSn4O8L0I2JmuNsUigSyBQifVLYgM9LMQeQ= cloud.google.com/go/bigtable v1.20.0/go.mod h1:upJDn8frsjzpRMfybiWkD1PG6WCCL7CRl26MgVeoXY4= -cloud.google.com/go/compute v1.23.0 h1:tP41Zoavr8ptEqaW6j+LQOnyBBhO7OkOMAGrgLopTwY= -cloud.google.com/go/compute v1.23.0/go.mod h1:4tCnrn48xsqlwSAiLf1HXMQk8CONslYbdiEZc9FEIbM= +cloud.google.com/go/compute v1.23.1 h1:V97tBoDaZHb6leicZ1G6DLK2BAaZLJ/7+9BB/En3hR0= +cloud.google.com/go/compute v1.23.1/go.mod h1:CqB3xpmPKKt3OJpW2ndFIXnA9A4xAy/F3Xp1ixncW78= cloud.google.com/go/compute/metadata v0.2.3 h1:mg4jlk7mCAj6xXp9UJ4fjI9VUI5rubuGBW5aJ7UnBMY= cloud.google.com/go/compute/metadata v0.2.3/go.mod h1:VAV5nSsACxMJvgaAuX6Pk2AawlZn8kiOGuCv6gTkwuA= -cloud.google.com/go/datacatalog v1.16.0 h1:qVeQcw1Cz93/cGu2E7TYUPh8Lz5dn5Ws2siIuQ17Vng= +cloud.google.com/go/datacatalog v1.18.1 h1:xJp9mZrc2HPaoxIz3sP9pCmf/impifweQ/yGG9VBfio= cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= -cloud.google.com/go/datastore v1.14.0 h1:Mq0ApTRdLW3/dyiw+DkjTk0+iGIUvkbzaC8sfPwWTH4= -cloud.google.com/go/datastore v1.14.0/go.mod h1:GAeStMBIt9bPS7jMJA85kgkpsMkvseWWXiaHya9Jes8= -cloud.google.com/go/iam v1.1.1 h1:lW7fzj15aVIXYHREOqjRBV9PsH0Z6u8Y46a1YGvQP4Y= -cloud.google.com/go/iam v1.1.1/go.mod h1:A5avdyVL2tCppe4unb0951eI9jreack+RJ0/d+KUZOU= -cloud.google.com/go/kms v1.15.0 h1:xYl5WEaSekKYN5gGRyhjvZKM22GVBBCzegGNVPy+aIs= -cloud.google.com/go/longrunning v0.5.1 h1:Fr7TXftcqTudoyRJa113hyaqlGdiBQkp0Gq7tErFDWI= -cloud.google.com/go/longrunning v0.5.1/go.mod h1:spvimkwdz6SPWKEt/XBij79E9fiTkHSQl/fRUUQJYJc= -cloud.google.com/go/profiler v0.3.1 h1:b5got9Be9Ia0HVvyt7PavWxXEht15B9lWnigdvHtxOc= -cloud.google.com/go/profiler v0.3.1/go.mod h1:GsG14VnmcMFQ9b+kq71wh3EKMZr3WRMgLzNiFRpW7tE= +cloud.google.com/go/datastore v1.15.0 h1:0P9WcsQeTWjuD1H14JIY7XQscIPQ4Laje8ti96IC5vg= +cloud.google.com/go/datastore v1.15.0/go.mod h1:GAeStMBIt9bPS7jMJA85kgkpsMkvseWWXiaHya9Jes8= +cloud.google.com/go/iam v1.1.3 h1:18tKG7DzydKWUnLjonWcJO6wjSCAtzh4GcRKlH/Hrzc= +cloud.google.com/go/iam v1.1.3/go.mod h1:3khUlaBXfPKKe7huYgEpDn6FtgRyMEqbkvBxrQyY5SE= +cloud.google.com/go/kms v1.15.3 h1:RYsbxTRmk91ydKCzekI2YjryO4c5Y2M80Zwcs9/D/cI= +cloud.google.com/go/longrunning v0.5.2 h1:u+oFqfEwwU7F9dIELigxbe0XVnBAo9wqMuQLA50CZ5k= +cloud.google.com/go/longrunning v0.5.2/go.mod h1:nqo6DQbNV2pXhGDbDMoN2bWz68MjZUzqv2YttZiveCs= +cloud.google.com/go/profiler v0.4.0 h1:ZeRDZbsOBDyRG0OiK0Op1/XWZ3xeLwJc9zjkzczUxyY= +cloud.google.com/go/profiler v0.4.0/go.mod h1:RvPlm4dilIr3oJtAOeFQU9Lrt5RoySHSDj4pTd6TWeU= cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= cloud.google.com/go/pubsub v1.33.0 h1:6SPCPvWav64tj0sVX/+npCBKhUi/UjJehy9op/V3p2g= cloud.google.com/go/pubsub v1.33.0/go.mod h1:f+w71I33OMyxf9VpMVcZbnG5KSUkCOUHYpFd5U1GdRc= -cloud.google.com/go/spanner v1.49.0 h1:+HY8C4uztU7XyLz3xMi/LCXdetLEOExhvRFJu2NiVXM= -cloud.google.com/go/spanner v1.49.0/go.mod h1:eGj9mQGK8+hkgSVbHNQ06pQ4oS+cyc4tXXd6Dif1KoM= +cloud.google.com/go/spanner v1.51.0 h1:l3exhhsVMKsx1E7Xd1QajYSvHmI1KZoWPW5tRxIIdvQ= +cloud.google.com/go/spanner v1.51.0/go.mod h1:c5KNo5LQ1X5tJwma9rSQZsXNBDNvj4/n8BVc3LNahq0= cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= cloud.google.com/go/storage v1.5.0/go.mod 
h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= -cloud.google.com/go/storage v1.33.0 h1:PVrDOkIC8qQVa1P3SXGpQvfuJhN2LHOoyZvWs8D2X5M= -cloud.google.com/go/storage v1.33.0/go.mod h1:Hhh/dogNRGca7IWv1RC2YqEn0c0G77ctA/OxflYkiD8= +cloud.google.com/go/storage v1.34.1 h1:H2Af2dU5J0PF7A5B+ECFIce+RqxVnrVilO+cu0TS3MI= +cloud.google.com/go/storage v1.34.1/go.mod h1:VN1ElqqvR9adg1k9xlkUJ55cMOP1/QjnNNuT5xQL6dY= dario.cat/mergo v1.0.0 h1:AGCNq9Evsj31mOgNPcLyXc+4PNABt905YmuqPYYpBWk= dario.cat/mergo v1.0.0/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= @@ -81,60 +81,56 @@ github.com/aws/aws-sdk-go v1.30.19/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZve github.com/aws/aws-sdk-go v1.34.0 h1:brux2dRrlwCF5JhTL7MUT3WUwo9zfDHZZp3+g3Mvlmo= github.com/aws/aws-sdk-go v1.34.0/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0= github.com/aws/aws-sdk-go-v2 v1.7.1/go.mod h1:L5LuPC1ZgDr2xQS7AmIec/Jlc7O/Y1u2KxJyNVab250= -github.com/aws/aws-sdk-go-v2 v1.21.0 h1:gMT0IW+03wtYJhRqTVYn0wLzwdnK9sRMcxmtfGzRdJc= -github.com/aws/aws-sdk-go-v2 v1.21.0/go.mod h1:/RfNgGmRxI+iFOB1OeJUyxiU+9s88k3pfHvDagGEp0M= -github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.4.13 h1:OPLEkmhXf6xFPiz0bLeDArZIDx1NNS4oJyG4nv3Gct0= -github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.4.13/go.mod h1:gpAbvyDGQFozTEmlTFO8XcQKHzubdq0LzRyJpG6MiXM= +github.com/aws/aws-sdk-go-v2 v1.22.1 h1:sjnni/AuoTXxHitsIdT0FwmqUuNUuHtufcVDErVFT9U= +github.com/aws/aws-sdk-go-v2 v1.22.1/go.mod h1:Kd0OJtkW3Q0M0lUWGszapWjEvrXDzRW+D21JNsroB+c= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.5.0 h1:hHgLiIrTRtddC0AKcJr5s7i/hLgcpTt+q/FKxf1Zayk= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.5.0/go.mod h1:w4I/v3NOWgD+qvs1NPEwhd++1h3XPHFaVxasfY6HlYQ= github.com/aws/aws-sdk-go-v2/config v1.5.0/go.mod h1:RWlPOAW3E3tbtNAqTwvSW54Of/yP3oiZXMI0xfUdjyA= -github.com/aws/aws-sdk-go-v2/config v1.18.42/go.mod h1:4AZM3nMMxwlG+eZlxvBKqwVbkDLlnN2a4UGTL6HjaZI= -github.com/aws/aws-sdk-go-v2/config v1.18.43 h1:IgdUtTRvUDC6eiJBqU6vh7bHFNAEBjQ8S+qJ7zVhDOs= -github.com/aws/aws-sdk-go-v2/config v1.18.43/go.mod h1:NiFev8qlgg8MPzw3fO/EwzMZeZwlJEKGwfpjRPA9Nvw= +github.com/aws/aws-sdk-go-v2/config v1.22.0 h1:9Mm99OalzZRz0ab5fpodMoHBApHS6pqRNp3M9NmzvDg= +github.com/aws/aws-sdk-go-v2/config v1.22.0/go.mod h1:2eWgw5lps8fKI7LZVTrRTYP6HE6k/uEFUuTSHfXwqP0= github.com/aws/aws-sdk-go-v2/credentials v1.3.1/go.mod h1:r0n73xwsIVagq8RsxmZbGSRQFj9As3je72C2WzUIToc= -github.com/aws/aws-sdk-go-v2/credentials v1.13.40/go.mod h1:VtEHVAAqDWASwdOqj/1huyT6uHbs5s8FUHfDQdky/Rs= -github.com/aws/aws-sdk-go-v2/credentials v1.13.41 h1:dgbKq1tamtboYAKSXWbqL0lKO9rmEzEhbZFh9JQW/Bg= -github.com/aws/aws-sdk-go-v2/credentials v1.13.41/go.mod h1:cc3Fn7DkKbJalPtQnudHGZZ8ml9+hwtbc1CJONsYYqk= +github.com/aws/aws-sdk-go-v2/credentials v1.15.1 h1:hmf6lAm9hk7uLCfapZn/jL05lm6Uwdbn1B0fgjyuf4M= +github.com/aws/aws-sdk-go-v2/credentials v1.15.1/go.mod h1:QTcHga3ZbQOneJuxmGBOCxiClxmp+TlvmjFexAnJ790= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.3.0/go.mod h1:2LAuqPx1I6jNfaGDucWfA2zqQCYCOMCDHiCOciALyNw= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.13.11 h1:uDZJF1hu0EVT/4bogChk8DyjSF6fof6uL/0Y26Ma7Fg= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.13.11/go.mod h1:TEPP4tENqBGO99KwVpV9MlOX4NSrSLP8u3KRy2CDwA8= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.14.2 
h1:gIeH4+o1MN/caGBWjoGQTUTIu94xD6fI5B2+TcwBf70= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.14.2/go.mod h1:wLyMIo/zPOhQhPXTddpfdkSleyigtFi8iMnC+2m/SK4= github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.3.2/go.mod h1:qaqQiHSrOUVOfKe6fhgQ6UzhxjwqVW8aHNegd6Ws4w4= -github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.87 h1:e20ZrsgDPUXqg8+rZVuPwNSp6yniUN2Yr2tzFZ+Yvl0= -github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.87/go.mod h1:0i0TAT6W+5i48QTlDU2KmY6U2hBZeY/LCP0wktya2oc= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.41 h1:22dGT7PneFMx4+b3pz7lMTRyN8ZKH7M2cW4GP9yUS2g= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.41/go.mod h1:CrObHAuPneJBlfEJ5T3szXOUkLEThaGfvnhTf33buas= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.35 h1:SijA0mgjV8E+8G45ltVHs0fvKpTj8xmZJ3VwhGKtUSI= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.35/go.mod h1:SJC1nEVVva1g3pHAIdCp7QsRIkMmLAgoDquQ9Rr8kYw= +github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.13.1 h1:ULswbgGNVrW8zEhkCNwrwXrs1mUvy2JTqWaCRsD2ZZw= +github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.13.1/go.mod h1:pAXgsDPk1rRwwfkz8/9ISO75vXEHqTGIgbLhGqqQ1GY= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.2.1 h1:fi1ga6WysOyYb5PAf3Exd6B5GiSNpnZim4h1rhlBqx0= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.2.1/go.mod h1:V5CY8wNurvPUibTi9mwqUqpiFZ5LnioKWIFUDtIzdI8= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.5.1 h1:ZpaV/j48RlPc4AmOZuPv22pJliXjXq8/reL63YzyFnw= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.5.1/go.mod h1:R8aXraabD2e3qv1csxM14/X9WF4wFMIY0kH4YEtYD5M= github.com/aws/aws-sdk-go-v2/internal/ini v1.1.1/go.mod h1:Zy8smImhTdOETZqfyn01iNOe0CNggVbPjCajyaz6Gvg= -github.com/aws/aws-sdk-go-v2/internal/ini v1.3.43 h1:g+qlObJH4Kn4n21g69DjspU0hKTjWtq7naZ9OLCv0ew= -github.com/aws/aws-sdk-go-v2/internal/ini v1.3.43/go.mod h1:rzfdUlfA+jdgLDmPKjd3Chq9V7LVLYo1Nz++Wb91aRo= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.1.4 h1:6lJvvkQ9HmbHZ4h/IEwclwv2mrTW8Uq1SOB/kXy0mfw= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.1.4/go.mod h1:1PrKYwxTM+zjpw9Y41KFtoJCQrJ34Z47Y4VgVbfndjo= +github.com/aws/aws-sdk-go-v2/internal/ini v1.5.0 h1:DqOQvIfmGkXZUVJnl9VRk0AnxyS59tCtX9k1Pyss4Ak= +github.com/aws/aws-sdk-go-v2/internal/ini v1.5.0/go.mod h1:VV/Kbw9Mg1GWJOT9WK+oTL3cWZiXtapnNvDSRqTZLsg= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.2.1 h1:vzYLDkwTw4CY0vUk84MeSufRf8XIsC/GsoIFXD60sTg= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.2.1/go.mod h1:ToBFBnjeGR2ruMx8IWp/y7vSK3Irj5/oPwifruiqoOM= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.2.1/go.mod h1:v33JQ57i2nekYTA70Mb+O18KeH4KqhdqxTJZNK1zdRE= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.9.14 h1:m0QTSI6pZYJTk5WSKx3fm5cNW/DCicVzULBgU/6IyD0= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.9.14/go.mod h1:dDilntgHy9WnHXsh7dDtUPgHKEfTJIBUTHM8OWm0f/0= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.1.36 h1:eev2yZX7esGRjqRbnVk1UxMLw4CyVZDpZXRCcy75oQk= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.1.36/go.mod h1:lGnOkH9NJATw0XEPcAknFBj3zzNTEGRHtSw+CwC1YTg= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.10.0 h1:CJxo7ZBbaIzmXfV3hjcx36n9V87gJsIUPJflwqEHl3Q= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.10.0/go.mod h1:yjVfjuY4nD1EW9i387Kau+I6V5cBA5YnC/mWNopjZrI= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.2.1 h1:15FUCJzAP9Y25nioTqTrGlZmhOtthaXBWlt4pS+d3Xo= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.2.1/go.mod 
h1:5655NW53Un6l7JzkI6AA3rZvf0m532cSnLThA1fVXcA= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.2.1/go.mod h1:zceowr5Z1Nh2WVP8bf/3ikB41IZW59E4yIYbg+pC6mw= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.35 h1:CdzPW9kKitgIiLV1+MHobfR5Xg25iYnyzWZhyQuSlDI= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.35/go.mod h1:QGF2Rs33W5MaN9gYdEQOBBFPLwTZkEhRwI33f7KIG0o= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.10.1 h1:2OXw3ppu1XsB6rqKEMV4tnecTjIY3PRV2U6IP6KPJQo= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.10.1/go.mod h1:FZB4AdakIqW/yERVdGJA6Z9jraax1beXfhBBnK2wwR8= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.5.1/go.mod h1:6EQZIwNNvHpq/2/QSJnp4+ECvqIy55w95Ofs0ze+nGQ= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.15.4 h1:v0jkRigbSD6uOdwcaUQmgEwG1BkPfAPDqaeNt/29ghg= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.15.4/go.mod h1:LhTyt8J04LL+9cIt7pYJ5lbS/U98ZmXovLOR/4LUsk8= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.16.1 h1:dnl0klXYX9EKpzZbWlH5LJL+YTcEZcJEMPFFr/rAHUQ= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.16.1/go.mod h1:Mfk/9Joso4tCQYzM4q4HRUIqwln8lnIIMB/OE8Zebdc= github.com/aws/aws-sdk-go-v2/service/s3 v1.11.1/go.mod h1:XLAGFrEjbvMCLvAtWLLP32yTv8GpBquCApZEycDLunI= -github.com/aws/aws-sdk-go-v2/service/s3 v1.40.0 h1:wl5dxN1NONhTDQD9uaEvNsDRX29cBmGED/nl0jkWlt4= -github.com/aws/aws-sdk-go-v2/service/s3 v1.40.0/go.mod h1:rDGMZA7f4pbmTtPOk5v5UM2lmX6UAbRnMDJeDvnH7AM= +github.com/aws/aws-sdk-go-v2/service/s3 v1.42.0 h1:u0YoSrxjr3Lm+IqIlRAV+4YTFwkXjyB9db9CfUFge2w= +github.com/aws/aws-sdk-go-v2/service/s3 v1.42.0/go.mod h1:98EIdRu+BNsdqITsXfy+57TZfwlUQC9aDn9a9qoo90U= github.com/aws/aws-sdk-go-v2/service/sso v1.3.1/go.mod h1:J3A3RGUvuCZjvSuZEcOpHDnzZP/sKbhDWV2T1EOzFIM= -github.com/aws/aws-sdk-go-v2/service/sso v1.14.1/go.mod h1:fIAwKQKBFu90pBxx07BFOMJLpRUGu8VOzLJakeY+0K4= -github.com/aws/aws-sdk-go-v2/service/sso v1.15.0 h1:vuGK1vHNP9zx0PfOrtPumbwR2af0ATQ1Z2H6p75AgRQ= -github.com/aws/aws-sdk-go-v2/service/sso v1.15.0/go.mod h1:fIAwKQKBFu90pBxx07BFOMJLpRUGu8VOzLJakeY+0K4= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.17.1 h1:8lKOidPkmSmfUtiTgtdXWgaKItCZ/g75/jEk6Ql6GsA= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.17.1/go.mod h1:yygr8ACQRY2PrEcy3xsUI357stq2AxnFM6DIsR9lij4= +github.com/aws/aws-sdk-go-v2/service/sso v1.17.0 h1:I/Oh3IxGPfHXiGnwM54TD6hNr/8TlUrBXAtTyGhR+zw= +github.com/aws/aws-sdk-go-v2/service/sso v1.17.0/go.mod h1:H6NCMvDBqA+CvIaXzaSqM6LWtzv9BzZrqBOqz+PzRF8= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.19.0 h1:irbXQkfVYIRaewYSXcu4yVk0m2T+JzZd0dkop7FjmO0= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.19.0/go.mod h1:4wPNCkM22+oRe71oydP66K50ojDUC33XutSMi2pEF/M= github.com/aws/aws-sdk-go-v2/service/sts v1.6.0/go.mod h1:q7o0j7d7HrJk/vr9uUt3BVRASvcU7gYZB9PUgPiByXg= -github.com/aws/aws-sdk-go-v2/service/sts v1.22.0/go.mod h1:VC7JDqsqiwXukYEDjoHh9U0fOJtNWh04FPQz4ct4GGU= -github.com/aws/aws-sdk-go-v2/service/sts v1.23.0 h1:pyvfUqkNLMipdKNAtu7OVbRxUrR2BMaKccIPpk/Hkak= -github.com/aws/aws-sdk-go-v2/service/sts v1.23.0/go.mod h1:VC7JDqsqiwXukYEDjoHh9U0fOJtNWh04FPQz4ct4GGU= +github.com/aws/aws-sdk-go-v2/service/sts v1.25.0 h1:sYIFy8tm1xQwRvVQ4CRuBGXKIg9sHNuG6+3UAQuoujk= +github.com/aws/aws-sdk-go-v2/service/sts v1.25.0/go.mod h1:S/LOQUeYDfJeJpFCIJDMjy7dwL4aA33HUdVi+i7uH8k= github.com/aws/smithy-go v1.6.0/go.mod h1:SObp3lf9smib00L/v3U2eAKG8FyQ7iLrJnQiAmR5n+E= -github.com/aws/smithy-go v1.14.2 h1:MJU9hqBGbvWZdApzpvoF2WAIJDbtjK2NDJSiJP7HblQ= 
-github.com/aws/smithy-go v1.14.2/go.mod h1:Tg+OJXh4MB2R/uN61Ko2f6hTZwB/ZYGOtib8J3gBHzA= +github.com/aws/smithy-go v1.16.0 h1:gJZEH/Fqh+RsvlJ1Zt4tVAtV6bKkp3cC+R6FCZMNzik= +github.com/aws/smithy-go v1.16.0/go.mod h1:NukqUGpCZIILqqiV0NIjeFh24kd/FAa4beRb6nbIUPE= github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM= github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= @@ -162,7 +158,7 @@ github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSV github.com/cpuguy83/dockercfg v0.3.1 h1:/FpZ+JaygUR/lZP2NlFI2DVfrOEMAIKP5wWEJdoYe9E= github.com/cpuguy83/dockercfg v0.3.1/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc= github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= -github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/cpuguy83/go-md2man/v2 v2.0.3/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= @@ -171,8 +167,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/docker/distribution v2.8.2+incompatible h1:T3de5rq0dB1j30rp0sA2rER+m322EBzniBPB6ZIzuh8= github.com/docker/distribution v2.8.2+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w= -github.com/docker/docker v24.0.6+incompatible h1:hceabKCtUgDqPu+qm0NgsaXf28Ljf4/pWFL7xjWWDgE= -github.com/docker/docker v24.0.6+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/docker v24.0.7+incompatible h1:Wo6l37AuwP3JaMnZa226lzVXGA3F9Ig1seQen0cKYlM= +github.com/docker/docker v24.0.7+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKohAFqRJQ= github.com/docker/go-connections v0.4.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec= github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= @@ -195,8 +191,8 @@ github.com/form3tech-oss/jwt-go v3.2.2+incompatible/go.mod h1:pbq4aXjuKjdthFRnoD github.com/frankban/quicktest v1.2.2/go.mod h1:Qh/WofXFeiAFII1aEBu529AtJo6Zg2VHscnEsbBnJ20= github.com/frankban/quicktest v1.11.3 h1:8sXhOn0uLys67V8EsXLc6eszDs8VXWxL3iRvebPhedY= github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k= -github.com/fsouza/fake-gcs-server v1.47.4 h1:gfBhBxEra20/Om02cvcyL8EnekV8KDb01Yffjat6AKQ= -github.com/fsouza/fake-gcs-server v1.47.4/go.mod h1:vqUZbI12uy9IkRQ54Q4p5AniQsSiUq8alO9Nv2egMmA= +github.com/fsouza/fake-gcs-server v1.47.6 h1:/d/879q/Os9Zc5gyV3QVLfZoajN1KcWucf2zYCFeFxs= +github.com/fsouza/fake-gcs-server v1.47.6/go.mod h1:ApSXKexpG1BUXJ4f2tNCxvhTKwCPFqFLBDW2UNQDODE= github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod 
h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= @@ -262,9 +258,9 @@ github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/martian v2.1.0+incompatible h1:/CP5g8u/VJHijgedC/Legn3BAbAaWPgecwXBIDzw5no= github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= github.com/google/martian/v3 v3.3.2 h1:IqNFLAmvJOgVlpdEBiQbDc2EwKW77amAycfTuWKdfvw= @@ -272,8 +268,8 @@ github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OI github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= -github.com/google/pprof v0.0.0-20221103000818-d260c55eee4c h1:lvddKcYTQ545ADhBujtIJmqQrZBDsGo7XIMbAQe/sNY= -github.com/google/pprof v0.0.0-20221103000818-d260c55eee4c/go.mod h1:dDKJzRmX4S37WGHujM7tX//fmj1uioxKzKxz3lo4HJo= +github.com/google/pprof v0.0.0-20230602150820-91b7bce49751 h1:hR7/MlvK23p6+lIw9SN1TigNLn9ZnF3W4SYRKq2gAHs= +github.com/google/pprof v0.0.0-20230602150820-91b7bce49751/go.mod h1:Jh3hGz2jkYak8qXPD19ryItVnUgpgeqzdkY/D0EaeuA= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/renameio/v2 v2.0.0 h1:UifI23ZTGY8Tt29JbYFiuyIU3eX+RNFtUwefq9qAhxg= github.com/google/renameio/v2 v2.0.0/go.mod h1:BtmJXm5YlszgC+TD4HOEEUFgkJP3nLxehU6hfe7jRt4= @@ -281,10 +277,10 @@ github.com/google/s2a-go v0.1.7 h1:60BLSyTrOV4/haCDW4zb1guZItoSq8foHCXrAnjBo/o= github.com/google/s2a-go v0.1.7/go.mod h1:50CgR4k1jNlWBu4UfS4AcfhVe1r6pdZPygJ3R8F0Qdw= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.2.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/google/uuid v1.3.1 h1:KjJaJ9iWZ3jOFZIf1Lqf4laDRCasjl0BCmnEGxkdLb4= -github.com/google/uuid v1.3.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/googleapis/enterprise-certificate-proxy v0.3.1 h1:SBWmZhjUDRorQxrN0nwzf+AHBxnbFjViHQS4P0yVpmQ= -github.com/googleapis/enterprise-certificate-proxy v0.3.1/go.mod h1:VLSiSSBs/ksPL8kq3OBOQ6WRI2QnaFynd1DCjZ62+V0= +github.com/google/uuid v1.4.0 h1:MtMxsa51/r9yyhkyLsVeVt0B+BGQZzpQiTQ4eHZ8bc4= +github.com/google/uuid v1.4.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/googleapis/enterprise-certificate-proxy v0.3.2 h1:Vie5ybvEvT75RniqhfFxPRy3Bf7vr3h0cechB90XaQs= +github.com/googleapis/enterprise-certificate-proxy v0.3.2/go.mod h1:VLSiSSBs/ksPL8kq3OBOQ6WRI2QnaFynd1DCjZ62+V0= github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/googleapis/gax-go/v2 v2.0.5/go.mod 
h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= github.com/googleapis/gax-go/v2 v2.12.0 h1:A+gCJKdRfqXkr+BIRGtZLibNXf0m1f9E4HG56etFpas= @@ -323,8 +319,8 @@ github.com/klauspost/asmfmt v1.3.2/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j github.com/klauspost/compress v1.9.7/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= github.com/klauspost/compress v1.13.1/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg= github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= -github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= -github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/klauspost/compress v1.17.2 h1:RlWWUY/Dr4fL8qk9YG7DTZ7PDgME2V4csBXA8L/ixi4= +github.com/klauspost/compress v1.17.2/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= github.com/klauspost/cpuid/v2 v2.2.5 h1:0E5MSMDEoAulmXNFquVs//DdoomxaoTY1kUhbc/qbZg= github.com/klauspost/cpuid/v2 v2.2.5/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= @@ -347,8 +343,10 @@ github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpsp github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY= github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI= github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8IeTMnF8JTXieKnO4Z6JCsikNEzj0DwauVzE= +github.com/minio/highwayhash v1.0.2 h1:Aak5U0nElisjDCfPSG79Tgzkn2gl66NxOMspRrKnA/g= +github.com/minio/highwayhash v1.0.2/go.mod h1:BQskDq+xkJ12lmlUUi7U0M5Swg3EWR+dLTk+kldvVxY= github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34= -github.com/minio/minio-go/v7 v7.0.61 h1:87c+x8J3jxQ5VUGimV9oHdpjsAvy3fhneEBKuoKEVUI= +github.com/minio/minio-go/v7 v7.0.63 h1:GbZ2oCvaUdgT5640WJOpyDhhDxvknAJU2/T3yurwcbQ= github.com/minio/sha256-simd v1.0.1 h1:6kaan5IFmwTNynnKKpDHe6FWHohJOHhCPchzK49dzMM= github.com/moby/patternmatcher v0.5.0 h1:YCZgJOeULcxLw1Q+sVR636pmS7sPEn1Qo2iAN6M7DBo= github.com/moby/patternmatcher v0.5.0/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc= @@ -364,6 +362,16 @@ github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJ github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= github.com/mrunalp/fileutils v0.5.0/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ= +github.com/nats-io/jwt/v2 v2.5.2 h1:DhGH+nKt+wIkDxM6qnVSKjokq5t59AZV5HRcFW0zJwU= +github.com/nats-io/jwt/v2 v2.5.2/go.mod h1:24BeQtRwxRV8ruvC4CojXlx/WQ/VjuwlYiH+vu/+ibI= +github.com/nats-io/nats-server/v2 v2.10.4 h1:uB9xcwon3tPXWAdmTJqqqC6cie3yuPWHJjjTBgaPNus= +github.com/nats-io/nats-server/v2 v2.10.4/go.mod h1:eWm2JmHP9Lqm2oemB6/XGi0/GwsZwtWf8HIPUsh+9ns= +github.com/nats-io/nats.go v1.31.0 h1:/WFBHEc/dOKBF6qf1TZhrdEfTmOZ5JzdJ+Y3m6Y/p7E= +github.com/nats-io/nats.go v1.31.0/go.mod h1:di3Bm5MLsoB4Bx61CBTsxuarI36WbhAwOm8QrW39+i8= +github.com/nats-io/nkeys v0.4.6 h1:IzVe95ru2CT6ta874rt9saQRkWfe2nFj1NtvYSLqMzY= +github.com/nats-io/nkeys v0.4.6/go.mod h1:4DxZNzenSVd1cYQoAa8948QY3QDjrHfcfVADymtkpts= +github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= +github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= github.com/ncw/swift v1.0.52/go.mod 
h1:23YIA4yWVnGwv2dQlN4bB7egfYX6YLn0Yo/S6zZO/ZM= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= @@ -386,8 +394,8 @@ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw= github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/proullon/ramsql v0.1.2 h1:PTtsy2iml/CW3Lsopyr86dlIs7JyYEmfLrfYvQVXD2U= -github.com/proullon/ramsql v0.1.2/go.mod h1:CFGqeQHQpdRfWqYmWD3yXqPTEaHkF4zgXy1C6qDWc9E= +github.com/proullon/ramsql v0.1.3 h1:/LRcXJf4lEmhdb4tYcci473I2VynjcZSzh2hsjJ8rSk= +github.com/proullon/ramsql v0.1.3/go.mod h1:CFGqeQHQpdRfWqYmWD3yXqPTEaHkF4zgXy1C6qDWc9E= github.com/rogpeppe/clock v0.0.0-20190514195947-2896927a307a h1:3QH7VyOaaiUHNrA9Se4YQIRkDTCw1EJls9xTUCaCeRM= github.com/rogpeppe/clock v0.0.0-20190514195947-2896927a307a/go.mod h1:4r5QyqhjIWCcK8DO4KMclc5Iknq5qVBAlbYYzAbUScQ= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= @@ -400,8 +408,8 @@ github.com/ryszard/goskiplist v0.0.0-20150312221310-2dfbae5fcf46/go.mod h1:uAQ5P github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= github.com/shabbyrobe/gocovmerge v0.0.0-20180507124511-f6ea450bfb63 h1:J6qvD6rbmOil46orKqJaRPG+zTpoGlBTUdyv8ki63L0= github.com/shabbyrobe/gocovmerge v0.0.0-20180507124511-f6ea450bfb63/go.mod h1:n+VKSARF5y/tS9XFSP7vWDfS+GUC5vs/YT7M5XDTUEM= -github.com/shirou/gopsutil/v3 v3.23.7 h1:C+fHO8hfIppoJ1WdsVm1RoI0RwXoNdfTK7yWXV0wVj4= -github.com/shirou/gopsutil/v3 v3.23.7/go.mod h1:c4gnmoRC0hQuaLqvxnx1//VXQ0Ms/X9UnJF8pddY5z4= +github.com/shirou/gopsutil/v3 v3.23.8 h1:xnATPiybo6GgdRoC4YoGnxXZFRc3dqQTGi73oLvvBrE= +github.com/shirou/gopsutil/v3 v3.23.8/go.mod h1:7hmCaBn+2ZwaZOr6jmPBZDfawwMGuo1id3C6aM8EDqQ= github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM= github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ= github.com/shoenig/test v0.6.4 h1:kVTaSd7WLz5WZ2IaoM0RSzRsUD+m8wRR+5qvntpn4LU= @@ -412,8 +420,8 @@ github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/spf13/afero v1.2.1/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk= github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk= -github.com/spf13/cobra v1.7.0 h1:hyqWnYt1ZQShIddO5kBpj3vu05/++x6tJ6dg8EC572I= -github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0= +github.com/spf13/cobra v1.8.0 h1:7aJaZx1B85qltLMc546zn58BxxfZdR/W22ej9CFoEf0= +github.com/spf13/cobra v1.8.0/go.mod h1:WXLWApfZ71AjXPya3WOlMsY9yMs7YeiHhFVlvLyhcho= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= @@ -432,14 +440,14 @@ github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o github.com/stretchr/testify v1.8.4 
h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww= -github.com/testcontainers/testcontainers-go v0.24.1 h1:gJdZuQIVWnMJTo+CmQMEP7/CAagNk/0jbcUPn3OWvD8= -github.com/testcontainers/testcontainers-go v0.24.1/go.mod h1:MGBiAkCm86yXQoCiipmQCqZLVdk1uFqtMqaU1Or0MRk= +github.com/testcontainers/testcontainers-go v0.25.0 h1:erH6cQjsaJrH+rJDU9qIf89KFdhK0Bft0aEZHlYC3Vs= +github.com/testcontainers/testcontainers-go v0.25.0/go.mod h1:4sC9SiJyzD1XFi59q8umTQYWxnkweEc5OjVtTUlJzqQ= github.com/tetratelabs/wazero v1.5.0 h1:Yz3fZHivfDiZFUXnWMPUoiW7s8tC1sjdBtlJn08qYa0= github.com/tetratelabs/wazero v1.5.0/go.mod h1:0U0G41+ochRKoPKCJlh0jMg1CHkyfK8kDqiirMmKY8A= -github.com/tklauser/go-sysconf v0.3.11 h1:89WgdJhk5SNwJfu+GKyYveZ4IaJ7xAkecBo+KdJV0CM= -github.com/tklauser/go-sysconf v0.3.11/go.mod h1:GqXfhXY3kiPa0nAXPDIQIWzJbMCB7AmcWpGR8lSZfqI= -github.com/tklauser/numcpus v0.6.0 h1:kebhY2Qt+3U6RNK7UqpYNA+tJ23IBEGKkB7JQBfDYms= -github.com/tklauser/numcpus v0.6.0/go.mod h1:FEZLMke0lhOUG6w2JadTzp0a+Nl8PF/GFkQ5UVIcaL4= +github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU= +github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI= +github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk= +github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= github.com/urfave/cli v1.22.1/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE= github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU= @@ -485,8 +493,8 @@ golang.org/x/crypto v0.0.0-20201002170205-7f63de1d35b0/go.mod h1:LzIPMQfyMNhhGPh golang.org/x/crypto v0.0.0-20210513164829-c07d793c2f9a/go.mod h1:P+XmwS30IXTQdn5tA2iutPOUgjI07+tq3H3K9MVA1s8= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/crypto v0.13.0 h1:mvySKfSWJ+UKUii46M40LOvyWfN0s2U+46/jDd0e6Ck= -golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= +golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc= +golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -497,8 +505,8 @@ golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u0 golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= -golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 h1:MGwJjxBy0HJshjDNfLsYO8xppfqWlA5ZT9OhtUUhTNw= -golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1/go.mod 
h1:FXUEEKJgO7OQYeo8N01OfiKP8RXMtf6e8aTskBGqWdc= +golang.org/x/exp v0.0.0-20230807204917-050eac23e9de h1:l5Za6utMv/HsBWWqzt4S8X17j+kt1uVETUX5UFhn2rE= +golang.org/x/exp v0.0.0-20230807204917-050eac23e9de/go.mod h1:FXUEEKJgO7OQYeo8N01OfiKP8RXMtf6e8aTskBGqWdc= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= @@ -545,15 +553,15 @@ golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.15.0 h1:ugBLEUaxABaB5AJqW9enI0ACdci2RUd4eP51NTBvuJ8= -golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= +golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= +golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= -golang.org/x/oauth2 v0.12.0 h1:smVPGxink+n1ZI5pkQa8y6fZT0RW0MgCO5bFpepy4B4= -golang.org/x/oauth2 v0.12.0/go.mod h1:A74bZ3aGXgCY0qaIC9Ahg6Lglin4AMAco8cIv9baba4= +golang.org/x/oauth2 v0.13.0 h1:jDDenyj+WgFtmV3zYVoi8aE2BwtXFLWOA67ZfNWftiY= +golang.org/x/oauth2 v0.13.0/go.mod h1:/JMhi4ZRXAf4HG9LiNmxvk+45+96RUlVThiH8FzNBn0= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -562,9 +570,10 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E= -golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE= +golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190130150945-aca44879d564/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -601,11 +610,11 @@ golang.org/x/sys v0.0.0-20220408201424-a24fb2fb8a0f/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o= -golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.14.0 h1:Vz7Qs629MkJkGyHxUlRHizWJRG2j8fbQKjELVSNhy7Q= +golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -623,6 +632,7 @@ golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxb golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= +golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= @@ -674,8 +684,8 @@ google.golang.org/api v0.14.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsb google.golang.org/api v0.15.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= google.golang.org/api v0.17.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.18.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= -google.golang.org/api v0.143.0 h1:o8cekTkqhywkbZT6p1UHJPZ9+9uuCAJs/KYomxZB8fA= -google.golang.org/api v0.143.0/go.mod h1:FoX9DO9hT7DLNn97OuoZAGSDuNAXdJRuGK98rSUgurk= +google.golang.org/api v0.150.0 h1:Z9k22qD289SZ8gCJrk4DrWXkNjtfvKAUo/l1ma8eBYE= +google.golang.org/api v0.150.0/go.mod h1:ccy+MJ6nrYFgE3WgRx/AMXOxOmU8Q4hSa+jjibzhxcg= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -701,12 +711,12 @@ google.golang.org/genproto 
v0.0.0-20200204135345-fa8e72b47b90/go.mod h1:GmwEX6Z4 google.golang.org/genproto v0.0.0-20200212174721-66ed5ce911ce/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200224152610-e50cd9704f63/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= -google.golang.org/genproto v0.0.0-20230913181813-007df8e322eb h1:XFBgcDwm7irdHTbz4Zk2h7Mh+eis4nfJEFQFYzJzuIA= -google.golang.org/genproto v0.0.0-20230913181813-007df8e322eb/go.mod h1:yZTlhN0tQnXo3h00fuXNCxJdLdIdnVFVBaRJ5LWBbw4= -google.golang.org/genproto/googleapis/api v0.0.0-20230913181813-007df8e322eb h1:lK0oleSc7IQsUxO3U5TjL9DWlsxpEBemh+zpB7IqhWI= -google.golang.org/genproto/googleapis/api v0.0.0-20230913181813-007df8e322eb/go.mod h1:KjSP20unUpOx5kyQUFa7k4OJg0qeJ7DEZflGDu2p6Bk= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230920204549-e6e6cdab5c13 h1:N3bU/SQDCDyD6R528GJ/PwW9KjYcJA3dgyH+MovAkIM= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230920204549-e6e6cdab5c13/go.mod h1:KSqppvjFjtoCI+KGd4PELB0qLNxdJHRGqRI09mB6pQA= +google.golang.org/genproto v0.0.0-20231016165738-49dd2c1f3d0b h1:+YaDE2r2OG8t/z5qmsh7Y+XXwCbvadxxZ0YY6mTdrVA= +google.golang.org/genproto v0.0.0-20231016165738-49dd2c1f3d0b/go.mod h1:CgAqfJo+Xmu0GwA0411Ht3OU3OntXwsGmrmjI8ioGXI= +google.golang.org/genproto/googleapis/api v0.0.0-20231016165738-49dd2c1f3d0b h1:CIC2YMXmIhYw6evmhPxBKJ4fmLbOFtXQN/GV3XOZR8k= +google.golang.org/genproto/googleapis/api v0.0.0-20231016165738-49dd2c1f3d0b/go.mod h1:IBQ646DjkDkvUIsVq/cc03FUFQ9wbZu7yE396YcL870= +google.golang.org/genproto/googleapis/rpc v0.0.0-20231030173426-d783a09b4405 h1:AB/lmRny7e2pLhFEYIbl5qkDAUt2h0ZRO4wGPhZf+ik= +google.golang.org/genproto/googleapis/rpc v0.0.0-20231030173426-d783a09b4405/go.mod h1:67X1fPuzjcrkymZzZV1vvkFeTn2Rvc6lYF9MYFGCcwE= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -716,8 +726,8 @@ google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8 google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= -google.golang.org/grpc v1.58.2 h1:SXUpjxeVF3FKrTYQI4f4KvbGD5u2xccdYdurwowix5I= -google.golang.org/grpc v1.58.2/go.mod h1:tgX3ZQDlNJGU96V6yHh1T/JeoBQ2TXdr43YbYSsCJk0= +google.golang.org/grpc v1.59.0 h1:Z5Iec2pjwb+LEOqzpB2MR12/eKFhDPhuqW91O+4bwUk= +google.golang.org/grpc v1.59.0/go.mod h1:aUPDwccQo6OTjy7Hct4AfBPD1GptF4fyUjIkQ9YtF98= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= diff --git a/sdks/go/examples/minimal_wordcount/minimal_wordcount.go b/sdks/go/examples/minimal_wordcount/minimal_wordcount.go index f5f22cae1d653..83cb390a6093c 100644 --- a/sdks/go/examples/minimal_wordcount/minimal_wordcount.go +++ b/sdks/go/examples/minimal_wordcount/minimal_wordcount.go @@ -27,6 +27,7 @@ // // Concepts: // +// 0. 
Registering transforms with Beam. // 1. Reading data from text files // 2. Specifying 'inline' transforms // 3. Counting items in a PCollection @@ -62,6 +63,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam" "github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio" + "github.com/apache/beam/sdks/v2/go/pkg/beam/register" "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/prism" "github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/stats" @@ -71,6 +73,26 @@ import ( var wordRE = regexp.MustCompile(`[a-zA-Z]+('[a-z])?`) +func splitWords(line string, emit func(string)) { + for _, word := range wordRE.FindAllString(line, -1) { + emit(word) + } +} + +func formatCounts(w string, c int) string { + return fmt.Sprintf("%s: %v", w, c) +} + +// Concept #0: Transform functions executed by Beam need to be registered +// so they can be executed by portable runners. We use the register package +// in an init block to inform Beam of the functions we will be using, so +// it can access them on workers. +func init() { + register.Function2x0(splitWords) + register.Function2x1(formatCounts) + register.Emitter1[string]() +} + func main() { // beam.Init() is an initialization hook that must be called on startup. beam.Init() @@ -91,15 +113,11 @@ func main() { lines := textio.Read(s, "gs://apache-beam-samples/shakespeare/kinglear.txt") // Concept #2: Invoke a ParDo transform on our PCollection of text lines. - // This ParDo invokes a DoFn (defined in-line) on each element that + // This ParDo invokes a DoFn (registered earlier) on each element that // tokenizes the text line into individual words. The ParDo returns a // PCollection of type string, where each element is an individual word in // Shakespeare's collected texts. - words := beam.ParDo(s, func(line string, emit func(string)) { - for _, word := range wordRE.FindAllString(line, -1) { - emit(word) - } - }, lines) + words := beam.ParDo(s, splitWords, lines) // Concept #3: Invoke the stats.Count transform on our PCollection of // individual words. The Count transform returns a new PCollection of @@ -110,9 +128,7 @@ func main() { // Use a ParDo to format our PCollection of word counts into a printable // string, suitable for writing to an output file. When each element // produces exactly one element, the DoFn can simply return it. - formatted := beam.ParDo(s, func(w string, c int) string { - return fmt.Sprintf("%s: %v", w, c) - }, counted) + formatted := beam.ParDo(s, formatCounts, counted) // Concept #4: Invoke textio.Write at the end of the pipeline to write // the contents of a PCollection (in this case, our PCollection of diff --git a/sdks/go/pkg/beam/core/core.go b/sdks/go/pkg/beam/core/core.go index ed62a2e9eac01..07326d96528dd 100644 --- a/sdks/go/pkg/beam/core/core.go +++ b/sdks/go/pkg/beam/core/core.go @@ -27,7 +27,7 @@ const ( // SdkName is the human readable name of the SDK for UserAgents. SdkName = "Apache Beam SDK for Go" // SdkVersion is the current version of the SDK. - SdkVersion = "2.52.0.dev" + SdkVersion = "2.53.0.dev" // DefaultDockerImage represents the associated image for this release. DefaultDockerImage = "apache/beam_go_sdk:" + SdkVersion diff --git a/sdks/go/pkg/beam/core/graph/graph.go b/sdks/go/pkg/beam/core/graph/graph.go index 474ab1cb37daa..f7826ccbef695 100644 --- a/sdks/go/pkg/beam/core/graph/graph.go +++ b/sdks/go/pkg/beam/core/graph/graph.go @@ -37,7 +37,7 @@ type Graph struct { // New returns an empty graph with the scope set to the root. 
func New() *Graph { - root := &Scope{0, "root", nil} + root := &Scope{id: 0, Label: "root", Parent: nil} return &Graph{root: root} } diff --git a/sdks/go/pkg/beam/core/graph/scope.go b/sdks/go/pkg/beam/core/graph/scope.go index 2fe836897c3d1..8c8c3a041a5f9 100644 --- a/sdks/go/pkg/beam/core/graph/scope.go +++ b/sdks/go/pkg/beam/core/graph/scope.go @@ -15,6 +15,8 @@ package graph +import "context" + // Scope is a syntactic Scope, such as arising from a composite Transform. It // has no semantic meaning at execution time. Used by monitoring. type Scope struct { @@ -24,6 +26,8 @@ type Scope struct { Label string // Parent is the parent scope, if nested. Parent *Scope + // Context contains optional metadata associated with this scope. + Context context.Context } // ID returns the graph-local identifier for the scope. diff --git a/sdks/go/pkg/beam/core/runtime/contextreg/contextreg.go b/sdks/go/pkg/beam/core/runtime/contextreg/contextreg.go new file mode 100644 index 0000000000000..d91141477576e --- /dev/null +++ b/sdks/go/pkg/beam/core/runtime/contextreg/contextreg.go @@ -0,0 +1,120 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package contextreg contains the global registrations of functions for extracting +// ptransform annotations or environment resource hints from context.Context attached to +// scopes. +// +// For beam internal use only. API subject to change. +package contextreg + +import ( + "context" + "maps" + "sync" +) + +var defaultReg = &Registry{} + +// Default is the default registry for context extractors. +func Default() *Registry { + return defaultReg +} + +// Registry contains a set of registrations for extracting annotations and hints from a context.Context. +// +// This type is exported to allow simpler testing of new extractors, and their interaction with the registry. +type Registry struct { + mu sync.Mutex + transforms []func(context.Context) TransformMetadata + envs []func(context.Context) EnvironmentMetadata +} + +// TransformMetadata represents additional information on transforms to be added to the Pipeline proto graph. +type TransformMetadata struct { + Annotations map[string][]byte + // DisplayData []*pipepb.DisplayData +} + +// EnvironmentMetadata represent additional information on environmental requirements to be added to the Pipeline +// proto graph. +type EnvironmentMetadata struct { + ResourceHints map[string][]byte + // DisplayData []*pipepb.DisplayData + // Dependencies []*pipepb.ArtifactInformation +} + +// TransformExtractor registers a transform metadata extractor to this registry. +// These will be set on the current composite transform scope. +// They are accessible to runners via the transform hypergraph. 
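+//
+// For example, a hypothetical extractor (the key type and annotation name
+// below are illustrative, not part of Beam) could pull a label out of the
+// context and attach it as a transform annotation:
+//
+//	type teamKey struct{}
+//
+//	Default().TransformExtractor(func(ctx context.Context) TransformMetadata {
+//		team, ok := ctx.Value(teamKey{}).(string)
+//		if !ok {
+//			return TransformMetadata{}
+//		}
+//		return TransformMetadata{
+//			Annotations: map[string][]byte{"myorg:team": []byte(team)},
+//		}
+//	})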
+func (r *Registry) TransformExtractor(ext func(context.Context) TransformMetadata) {
+	r.mu.Lock()
+	r.transforms = append(r.transforms, ext)
+	r.mu.Unlock()
+}
+
+// EnvExtractor registers an environment metadata extractor to this registry.
+// When non-empty extraction occurs, a new environment will be derived from the parent scope's environment.
+func (r *Registry) EnvExtractor(ext func(context.Context) EnvironmentMetadata) {
+	r.mu.Lock()
+	r.envs = append(r.envs, ext)
+	r.mu.Unlock()
+}
+
+// ExtractTransformMetadata runs all registered transform extractors on the provided context,
+// and returns the resulting metadata.
+//
+// A metadata field will be nil if there's no data. A nil context bypasses extractor execution.
+func (r *Registry) ExtractTransformMetadata(ctx context.Context) TransformMetadata {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	if ctx == nil {
+		return TransformMetadata{}
+	}
+	ret := TransformMetadata{
+		Annotations: map[string][]byte{},
+	}
+	for _, ext := range r.transforms {
+		k := ext(ctx)
+		maps.Copy(ret.Annotations, k.Annotations)
+	}
+	if len(ret.Annotations) == 0 {
+		ret.Annotations = nil
+	}
+	return ret
+}
+
+// ExtractEnvironmentMetadata runs all registered environment extractors on the provided context,
+// and returns the resulting metadata.
+//
+// A metadata field will be nil if there's no data. A nil context bypasses extractor execution.
+func (r *Registry) ExtractEnvironmentMetadata(ctx context.Context) EnvironmentMetadata {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	if ctx == nil {
+		return EnvironmentMetadata{}
+	}
+	ret := EnvironmentMetadata{
+		ResourceHints: map[string][]byte{},
+	}
+	for _, ext := range r.envs {
+		k := ext(ctx)
+		maps.Copy(ret.ResourceHints, k.ResourceHints)
+	}
+	if len(ret.ResourceHints) == 0 {
+		ret.ResourceHints = nil
+	}
+	return ret
+}
diff --git a/sdks/go/pkg/beam/core/runtime/contextreg/contextreg_test.go b/sdks/go/pkg/beam/core/runtime/contextreg/contextreg_test.go
new file mode 100644
index 0000000000000..dd0c5fb92c578
--- /dev/null
+++ b/sdks/go/pkg/beam/core/runtime/contextreg/contextreg_test.go
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package contextreg
+
+import (
+	"context"
+	"testing"
+)
+
+func TestPTransformExtractor(t *testing.T) {
+	reg := &Registry{}
+
+	type keyType string
+	key1 := keyType("annotation1")
+	key2 := keyType("annotation2")
+	key3 := keyType("annotation3")
+
+	reg.TransformExtractor(func(ctx context.Context) TransformMetadata {
+		v := ctx.Value(key1).(string)
+		return TransformMetadata{
+			Annotations: map[string][]byte{
+				"beam:test:annotation": []byte(v),
+			},
+		}
+	})
+	reg.TransformExtractor(func(ctx context.Context) TransformMetadata {
+		v := ctx.Value(key2).(string)
+		return TransformMetadata{
+			Annotations: map[string][]byte{
+				"beam:test:annotation2": []byte(v),
+			},
+		}
+	})
+	// Override the extraction for the result annotation to use the last set value.
+	reg.TransformExtractor(func(ctx context.Context) TransformMetadata {
+		v := ctx.Value(key3).(string)
+		return TransformMetadata{
+			Annotations: map[string][]byte{
+				"beam:test:annotation": []byte(v),
+			},
+		}
+	})
+
+	ctx := context.Background()
+	// Set all 3 distinct context values.
+	ctx = context.WithValue(ctx, key1, "never seen")
+	want2 := "want_value2"
+	ctx = context.WithValue(ctx, key2, want2)
+	want3 := "want_value3"
+	ctx = context.WithValue(ctx, key3, want3)
+
+	ptrans := reg.ExtractTransformMetadata(ctx)
+
+	key := "beam:test:annotation"
+	if got, want := string(ptrans.Annotations[key]), want3; got != want {
+		t.Errorf("extracted annotation %q = %q, want %q", key, got, want)
+	}
+	key = "beam:test:annotation2"
+	if got, want := string(ptrans.Annotations[key]), want2; got != want {
+		t.Errorf("extracted annotation %q = %q, want %q", key, got, want)
+	}
+	if got, want := len(ptrans.Annotations), 2; got != want {
+		t.Errorf("len(ptrans.Annotations) = %d, want %d - have %v", got, want, ptrans)
+	}
+}
+
+func TestHintExtractor(t *testing.T) {
+	reg := &Registry{}
+
+	type keyType string
+	hintKey := keyType("hint")
+
+	reg.EnvExtractor(func(ctx context.Context) EnvironmentMetadata {
+		v := ctx.Value(hintKey).(string)
+		return EnvironmentMetadata{
+			ResourceHints: map[string][]byte{
+				"beam:test:hint": []byte(v),
+			},
+		}
+	})
+
+	ctx := context.Background()
+	wantedHint := "hint"
+	ctx = context.WithValue(ctx, hintKey, wantedHint)
+
+	env := reg.ExtractEnvironmentMetadata(ctx)
+
+	key := "beam:test:hint"
+	if got, want := string(env.ResourceHints[key]), wantedHint; got != want {
+		t.Errorf("extracted hint %q = %q, want %q", key, got, want)
+	}
+	if got, want := len(env.ResourceHints), 1; got != want {
+		t.Errorf("len(env.ResourceHints) = %d, want %d - have %v", got, want, env)
+	}
+}
diff --git a/sdks/go/pkg/beam/core/runtime/exec/sdf.go b/sdks/go/pkg/beam/core/runtime/exec/sdf.go
index b21b47b20ae2b..3977d31dda7db 100644
--- a/sdks/go/pkg/beam/core/runtime/exec/sdf.go
+++ b/sdks/go/pkg/beam/core/runtime/exec/sdf.go
@@ -781,7 +781,7 @@ func (n *ProcessSizedElementsAndRestrictions) singleWindowSplit(ctx context.Cont
 func (n *ProcessSizedElementsAndRestrictions) multiWindowSplit(ctx context.Context, f float64, pWeState any, rWeState any) ([]*FullValue, []*FullValue, error) {
 	// Get the split point in window range, to see what window it falls in.
 	done, rem := n.rt.GetProgress()
-	cwp := done / (done + rem)                      // Progress in current window.
+	cwp := progressFraction(done, rem)              // Progress in current window.
 	p := (float64(n.currW) + cwp) / float64(n.numW) // Progress of whole element.
 	sp := p + (f * (1.0 - p))                       // Split point in range of entire element [0, 1].
wsp := sp * float64(n.numW) // Split point in window range [0, numW]. @@ -923,7 +923,7 @@ func (n *ProcessSizedElementsAndRestrictions) newSplitResult(ctx context.Context // DoFns, so 1.0 is only returned once all windows have been processed. func (n *ProcessSizedElementsAndRestrictions) GetProgress() float64 { d, r := n.rt.GetProgress() - frac := d / (d + r) + frac := progressFraction(d, r) if n.numW == 1 { return frac @@ -959,6 +959,13 @@ func (n *ProcessSizedElementsAndRestrictions) GetOutputWatermark() map[string]*t return nil } +func progressFraction(done float64, remaining float64) float64 { + if done == 0 { + return 0 + } + return done / (done + remaining) +} + // SdfFallback is an executor used when an SDF isn't expanded into steps by the // runner, indicating that the runner doesn't support splitting. It executes all // the SDF steps together in one unit. diff --git a/sdks/go/pkg/beam/core/runtime/exec/sdf_test.go b/sdks/go/pkg/beam/core/runtime/exec/sdf_test.go index a0380796e8637..460a7111b1103 100644 --- a/sdks/go/pkg/beam/core/runtime/exec/sdf_test.go +++ b/sdks/go/pkg/beam/core/runtime/exec/sdf_test.go @@ -715,6 +715,14 @@ func TestAsSplittableUnit(t *testing.T) { currWindow: 0, wantProgress: 0.5, }, + { + name: "SingleWindowZeroWork", + windows: testWindows, + doneWork: 0.0, + remainingWork: 0.0, + currWindow: 0, + wantProgress: 0.0, + }, { name: "MultipleWindows", windows: multiWindows, @@ -724,6 +732,14 @@ func TestAsSplittableUnit(t *testing.T) { // Progress should be halfway through second window. wantProgress: 1.5 / 4.0, }, + { + name: "MultipleWindowsZeroWork", + windows: multiWindows, + doneWork: 0.0, + remainingWork: 0.0, + currWindow: 1, + wantProgress: 1.0 / 4.0, + }, } for _, test := range tests { test := test @@ -776,15 +792,19 @@ func TestAsSplittableUnit(t *testing.T) { name string fn *graph.DoFn frac float64 - doneRt bool // Result that RTracker will return for IsDone. + done float64 + remaining float64 + isDoneRt bool // Result that RTracker will return for IsDone. in FullValue wantPrimaries []*FullValue wantResiduals []*FullValue }{ { - name: "SingleElem", - fn: dfn, - frac: 0.5, + name: "SingleElem", + fn: dfn, + frac: 0.5, + done: 0.0, + remaining: 1.0, in: FullValue{ Elm: &FullValue{ Elm: 1, @@ -823,9 +843,11 @@ func TestAsSplittableUnit(t *testing.T) { }}, }, { - name: "SingleElemStatefulWatermarkEstimating", - fn: statefulWeFn, - frac: 0.5, + name: "SingleElemStatefulWatermarkEstimating", + fn: statefulWeFn, + frac: 0.5, + done: 0.0, + remaining: 1.0, in: FullValue{ Elm: &FullValue{ Elm: 1, @@ -864,9 +886,11 @@ func TestAsSplittableUnit(t *testing.T) { }}, }, { - name: "KvElem", - fn: kvdfn, - frac: 0.5, + name: "KvElem", + fn: kvdfn, + frac: 0.5, + done: 0.0, + remaining: 1.0, in: FullValue{ Elm: &FullValue{ Elm: &FullValue{ @@ -914,10 +938,12 @@ func TestAsSplittableUnit(t *testing.T) { }}, }, { - name: "DoneRTracker", - fn: dfn, - doneRt: true, - frac: 0.5, + name: "DoneRTracker", + fn: dfn, + frac: 0.5, + done: 0.0, + remaining: 1.0, + isDoneRt: true, in: FullValue{ Elm: &FullValue{ Elm: 1, @@ -936,9 +962,11 @@ func TestAsSplittableUnit(t *testing.T) { { // MultiWindow split where split point lands inside currently // processing restriction tracker. - name: "MultiWindow/RestrictionSplit", - fn: dfn, - frac: 0.125, // Should be in the middle of the first (current) window. + name: "MultiWindow/RestrictionSplit", + fn: dfn, + frac: 0.125, // Should be in the middle of the first (current) window. 
+ done: 0.0, + remaining: 1.0, in: FullValue{ Elm: &FullValue{ Elm: 1, @@ -990,9 +1018,11 @@ func TestAsSplittableUnit(t *testing.T) { { // MultiWindow split where the split lands outside the current // window, and performs a window boundary split instead. - name: "MultiWindow/WindowBoundarySplit", - fn: dfn, - frac: 0.55, + name: "MultiWindow/WindowBoundarySplit", + fn: dfn, + frac: 0.55, + done: 0.0, + remaining: 1.0, in: FullValue{ Elm: &FullValue{ Elm: 1, @@ -1033,10 +1063,12 @@ func TestAsSplittableUnit(t *testing.T) { { // Tests that a MultiWindow split with a Done RTracker will // fallback to a window boundary split. - name: "MultiWindow/DoneRTrackerSplit", - fn: dfn, - frac: 0.125, - doneRt: true, + name: "MultiWindow/DoneRTrackerSplit", + fn: dfn, + frac: 0.125, + done: 0.0, + remaining: 1.0, + isDoneRt: true, in: FullValue{ Elm: &FullValue{ Elm: 1, @@ -1077,9 +1109,34 @@ func TestAsSplittableUnit(t *testing.T) { { // Test that if a window boundary split lands at the end of an // element, it results in a no-op. - name: "MultiWindow/NoResidual", - fn: dfn, - frac: 0.95, // Should round to end of element and cause a no-op. + name: "MultiWindow/NoResidual", + fn: dfn, + frac: 0.95, // Should round to end of element and cause a no-op. + done: 0.0, + remaining: 1.0, + in: FullValue{ + Elm: &FullValue{ + Elm: 1, + Elm2: &FullValue{ + Elm: &VetRestriction{ID: "Sdf"}, + Elm2: false, + }, + }, + Elm2: 1.0, + Timestamp: testTimestamp, + Windows: testMultiWindows, + }, + wantPrimaries: []*FullValue{}, + wantResiduals: []*FullValue{}, + }, + { + // Tests that an RTracker progress of 0.0 done and 0.0 remaining + // is treated as a current window progress of 0.0. + name: "MultiWindow/ZeroWork", + fn: dfn, + frac: 0.95, + done: 0.0, + remaining: 0.0, in: FullValue{ Elm: &FullValue{ Elm: 1, @@ -1104,9 +1161,9 @@ func TestAsSplittableUnit(t *testing.T) { node := &ProcessSizedElementsAndRestrictions{PDo: n} node.rt = &SplittableUnitRTracker{ VetRTracker: VetRTracker{Rest: test.in.Elm.(*FullValue).Elm2.(*FullValue).Elm.(*VetRestriction)}, - Done: 0, - Remaining: 1.0, - ThisIsDone: test.doneRt, + Done: test.done, + Remaining: test.remaining, + ThisIsDone: test.isDoneRt, } node.elm = &test.in node.numW = len(test.in.Windows) diff --git a/sdks/go/pkg/beam/core/runtime/graphx/translate.go b/sdks/go/pkg/beam/core/runtime/graphx/translate.go index ad76703e3001c..9ef28eb7809b5 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/translate.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/translate.go @@ -26,6 +26,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/coder" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/window" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/window/trigger" + "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/contextreg" v1pb "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/graphx/v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/pipelinex" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/state" @@ -154,6 +155,18 @@ type Options struct { // PipelineResourceHints for setting defaults across the whole pipeline. PipelineResourceHints resource.Hints + + // ContextReg is an override for the context extractor registry for testing. + ContextReg *contextreg.Registry +} + +// GetContextReg returns the default context registry if the option is +// unset, and the field version otherwise. 
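+//
+// This lets tests substitute their own registry via the ContextReg option
+// without mutating the process-wide default.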
+func (opts *Options) GetContextReg() *contextreg.Registry {
+	if opts.ContextReg == nil {
+		return contextreg.Default()
+	}
+	return opts.ContextReg
 }
 
 // Marshal converts a graph to a model pipeline.
@@ -273,10 +286,14 @@ func (m *marshaller) addScopeTree(s *ScopeTree) (string, error) {
 		subtransforms = append(subtransforms, id)
 	}
 
+	metadata := m.opt.GetContextReg().ExtractTransformMetadata(s.Scope.Scope.Context)
+
 	transform := &pipepb.PTransform{
 		UniqueName:    s.Scope.Name,
 		Subtransforms: subtransforms,
 		EnvironmentId: m.addDefaultEnv(),
+		Annotations:   metadata.Annotations,
+		// DisplayData: metadata.DisplayData,
 	}
 
 	if err := m.updateIfCombineComposite(s, transform); err != nil {
diff --git a/sdks/go/pkg/beam/core/runtime/graphx/translate_test.go b/sdks/go/pkg/beam/core/runtime/graphx/translate_test.go
index 2836351f26668..a331aedd585de 100644
--- a/sdks/go/pkg/beam/core/runtime/graphx/translate_test.go
+++ b/sdks/go/pkg/beam/core/runtime/graphx/translate_test.go
@@ -28,6 +28,7 @@ import (
 	"github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/coder"
 	"github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/window"
 	"github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime"
+	"github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/contextreg"
 	"github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/graphx"
 	"github.com/apache/beam/sdks/v2/go/pkg/beam/core/typex"
 	"github.com/apache/beam/sdks/v2/go/pkg/beam/core/util/protox"
@@ -165,8 +166,8 @@ func TestMarshal(t *testing.T) {
 			if err != nil {
 				t.Fatal(err)
 			}
-			if len(edges) != test.edges {
-				t.Fatal("expected a single edge")
+			if got, want := len(edges), test.edges; got != want {
+				t.Fatalf("got %v edges, want %v", got, want)
 			}
 
 			payload, err := proto.Marshal(&pipepb.DockerPayload{ContainerImage: "foo"})
@@ -192,6 +193,79 @@ func TestMarshal(t *testing.T) {
 	}
 }
 
+func TestMarshal_PTransformAnnotations(t *testing.T) {
+	var creg contextreg.Registry
+
+	const annotationKey = "myAnnotation"
+
+	// A deliberately simplistic ptransform extractor that, if a context is attached to a scope, will add an annotation to those transforms.
+	creg.TransformExtractor(func(ctx context.Context) contextreg.TransformMetadata {
+		return contextreg.TransformMetadata{
+			Annotations: map[string][]byte{
+				annotationKey: {42, 42, 42},
+			},
+		}
+	})
+
+	tests := []struct {
+		name      string
+		makeGraph func(t *testing.T, g *graph.Graph)
+
+		transforms int
+	}{
+		{
+			name: "AnnotationSetOnComposite",
+			makeGraph: func(t *testing.T, g *graph.Graph) {
+				in := newIntInput(g)
+				side := newIntInput(g)
+				s := g.NewScope(g.Root(), "sub")
+				s.Context = context.Background() // Allow the default annotation to trigger.
+ addDoFn(t, g, pickSideFn, s, []*graph.Node{in, side}, []*coder.Coder{intCoder(), intCoder()}, nil) + }, + transforms: 2, + }, + } + for _, test := range tests { + test := test + t.Run(test.name, func(t *testing.T) { + g := graph.New() + test.makeGraph(t, g) + + edges, _, err := g.Build() + if err != nil { + t.Fatal(err) + } + + payload, err := proto.Marshal(&pipepb.DockerPayload{ContainerImage: "foo"}) + if err != nil { + t.Fatal(err) + } + p, err := graphx.Marshal(edges, + &graphx.Options{Environment: &pipepb.Environment{Urn: "beam:env:docker:v1", Payload: payload}, ContextReg: &creg}) + if err != nil { + t.Fatal(err) + } + + pts := p.GetComponents().GetTransforms() + if got, want := len(pts), test.transforms; got != want { + t.Errorf("got %d transforms, want %d : %v", got, want, proto.MarshalTextString(p)) + } + for _, pt := range pts { + // Context annotations only apply to composites, and are not duplicated to leaves. + if len(pt.GetSubtransforms()) == 0 { + if _, ok := pt.GetAnnotations()[annotationKey]; ok { + t.Errorf("unexpected annotation %v on leaf transform: %v", annotationKey, pt.GetAnnotations()) + } + continue + } + if _, ok := pt.GetAnnotations()[annotationKey]; !ok { + t.Errorf("expected %q annotation, but wasn't present: %v", annotationKey, pt.GetAnnotations()) + } + } + }) + } +} + // testRT's methods can all be no-ops, we just need it to implement sdf.RTracker. type testRT struct { } diff --git a/sdks/go/pkg/beam/io/fileio/example_test.go b/sdks/go/pkg/beam/io/fileio/example_test.go index 1763dccb07120..ddf11af565545 100644 --- a/sdks/go/pkg/beam/io/fileio/example_test.go +++ b/sdks/go/pkg/beam/io/fileio/example_test.go @@ -67,19 +67,9 @@ func ExampleReadMatches() { beam.Init() p, s := beam.NewPipelineWithRoot() - pairFn := func(ctx context.Context, file fileio.ReadableFile, emit func(string, string)) error { - contents, err := file.ReadString(ctx) - if err != nil { - return err - } - emit(file.Metadata.Path, contents) - return nil - } - matches := fileio.MatchFiles(s, "gs://path/to/*.gz") files := fileio.ReadMatches(s, matches) - pairs := beam.ParDo(s, pairFn, files) - debug.Print(s, pairs) + debug.Print(s, files) if err := beamx.Run(context.Background(), p); err != nil { log.Fatalf("Failed to execute job: %v", err) diff --git a/sdks/go/pkg/beam/io/natsio/common.go b/sdks/go/pkg/beam/io/natsio/common.go new file mode 100644 index 0000000000000..53f595516987d --- /dev/null +++ b/sdks/go/pkg/beam/io/natsio/common.go @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package natsio contains transforms for interacting with NATS. 
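+//
+// A minimal write pipeline might look like the following sketch (the URI and
+// subject are placeholders):
+//
+//	msgs := beam.CreateList(s, []natsio.ProducerMessage{
+//		{Subject: "events.1", Data: []byte("hello")},
+//	})
+//	natsio.Write(s, "nats://localhost:4222", msgs)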
+package natsio + +import ( + "fmt" + + "github.com/nats-io/nats.go" + "github.com/nats-io/nats.go/jetstream" +) + +type natsFn struct { + URI string + CredsFile string + nc *nats.Conn + js jetstream.JetStream +} + +func (fn *natsFn) Setup() error { + var opts []nats.Option + if fn.CredsFile != "" { + opts = append(opts, nats.UserCredentials(fn.CredsFile)) + } + + conn, err := nats.Connect(fn.URI, opts...) + if err != nil { + return fmt.Errorf("error connecting to NATS: %v", err) + } + fn.nc = conn + + js, err := jetstream.New(fn.nc) + if err != nil { + return fmt.Errorf("error creating JetStream context: %v", err) + } + fn.js = js + + return nil +} + +func (fn *natsFn) Teardown() { + if fn.nc != nil { + fn.nc.Close() + } +} diff --git a/sdks/go/pkg/beam/io/natsio/example_test.go b/sdks/go/pkg/beam/io/natsio/example_test.go new file mode 100644 index 0000000000000..0516b8efa9213 --- /dev/null +++ b/sdks/go/pkg/beam/io/natsio/example_test.go @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package natsio_test + +import ( + "context" + "log" + + "github.com/apache/beam/sdks/v2/go/pkg/beam" + "github.com/apache/beam/sdks/v2/go/pkg/beam/io/natsio" + "github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx" + "github.com/nats-io/nats.go" +) + +func ExampleWrite() { + beam.Init() + + p, s := beam.NewPipelineWithRoot() + + uri := "nats://localhost:4222" + msgs := []natsio.ProducerMessage{ + { + Subject: "events.1", + ID: "123", + Data: []byte("hello"), + Headers: nats.Header{"key": []string{"val1"}}, + }, + { + Subject: "events.2", + ID: "124", + Data: []byte("world"), + Headers: nats.Header{"key": []string{"val2"}}, + }, + } + + input := beam.CreateList(s, msgs) + natsio.Write(s, uri, input) + + if err := beamx.Run(context.Background(), p); err != nil { + log.Fatalf("Failed to execute job: %v", err) + } +} diff --git a/sdks/go/pkg/beam/io/natsio/helper_test.go b/sdks/go/pkg/beam/io/natsio/helper_test.go new file mode 100644 index 0000000000000..cd47ed331de04 --- /dev/null +++ b/sdks/go/pkg/beam/io/natsio/helper_test.go @@ -0,0 +1,130 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package natsio + +import ( + "context" + "testing" + + "github.com/nats-io/nats-server/v2/server" + "github.com/nats-io/nats-server/v2/test" + "github.com/nats-io/nats.go" + "github.com/nats-io/nats.go/jetstream" +) + +func newServer(t *testing.T) *server.Server { + t.Helper() + + opts := &test.DefaultTestOptions + opts.Port = server.RANDOM_PORT + opts.JetStream = true + + srv := test.RunServer(opts) + t.Cleanup(srv.Shutdown) + + return srv +} + +func newConn(t *testing.T, uri string) *nats.Conn { + t.Helper() + + conn, err := nats.Connect(uri) + if err != nil { + t.Fatalf("Failed to connect to NATS: %v", err) + } + t.Cleanup(conn.Close) + + return conn +} + +func newJetStream(t *testing.T, conn *nats.Conn) jetstream.JetStream { + t.Helper() + + js, err := jetstream.New(conn) + if err != nil { + t.Fatalf("Failed to create JetStream instance: %v", err) + } + + return js +} + +func createStream( + t *testing.T, + ctx context.Context, + js jetstream.JetStream, + stream string, + subjects []string, +) jetstream.Stream { + t.Helper() + + cfg := jetstream.StreamConfig{ + Name: stream, + Subjects: subjects, + } + str, err := js.CreateStream(ctx, cfg) + if err != nil { + t.Fatalf("Failed to create stream: %v", err) + } + + t.Cleanup(func() { + if err := js.DeleteStream(ctx, stream); err != nil { + t.Fatalf("Failed to delete stream: %v", err) + } + }) + + return str +} + +func createConsumer( + t *testing.T, + ctx context.Context, + js jetstream.JetStream, + stream string, + subjects []string, +) jetstream.Consumer { + t.Helper() + + cfg := jetstream.OrderedConsumerConfig{ + FilterSubjects: subjects, + } + cons, err := js.OrderedConsumer(ctx, stream, cfg) + if err != nil { + t.Fatalf("Failed to create consumer: %v", err) + } + + return cons +} + +func fetchMessages(t *testing.T, cons jetstream.Consumer, size int) []jetstream.Msg { + t.Helper() + + msgs, err := cons.FetchNoWait(size) + if err != nil { + t.Fatalf("Failed to fetch messages: %v", err) + } + + var result []jetstream.Msg + + for msg := range msgs.Messages() { + if err := msg.Ack(); err != nil { + t.Fatalf("Failed to ack message: %v", err) + } + + result = append(result, msg) + } + + return result +} diff --git a/sdks/go/pkg/beam/io/natsio/write.go b/sdks/go/pkg/beam/io/natsio/write.go new file mode 100644 index 0000000000000..8991ef8cac167 --- /dev/null +++ b/sdks/go/pkg/beam/io/natsio/write.go @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+package natsio
+
+import (
+	"context"
+	"fmt"
+	"reflect"
+
+	"github.com/apache/beam/sdks/v2/go/pkg/beam"
+	"github.com/apache/beam/sdks/v2/go/pkg/beam/register"
+	"github.com/nats-io/nats.go"
+	"github.com/nats-io/nats.go/jetstream"
+)
+
+func init() {
+	register.DoFn3x1[context.Context, ProducerMessage, func(ack PublishAck), error](&writeFn{})
+	register.Emitter1[PublishAck]()
+
+	beam.RegisterType(reflect.TypeOf((*ProducerMessage)(nil)).Elem())
+	beam.RegisterType(reflect.TypeOf((*PublishAck)(nil)).Elem())
+}
+
+// ProducerMessage represents a message to be published to NATS.
+type ProducerMessage struct {
+	Subject string
+	ID      string
+	Headers map[string][]string
+	Data    []byte
+}
+
+// PublishAck represents an acknowledgement from NATS after publishing a message.
+type PublishAck struct {
+	Stream    string
+	Subject   string
+	ID        string
+	Sequence  uint64
+	Duplicate bool
+}
+
+// Write writes a PCollection to NATS JetStream and returns a
+// PCollection of the acknowledged messages. The ID field can be set in the
+// ProducerMessage to utilize JetStream's support for deduplication of messages.
+// Write takes a variable number of WriteOptionFn to configure the write operation:
+//   - WriteUserCredentials: path to the user credentials file. Defaults to empty.
+func Write(s beam.Scope, uri string, col beam.PCollection, opts ...WriteOptionFn) beam.PCollection {
+	s = s.Scope("natsio.Write")
+
+	option := &writeOption{}
+	for _, opt := range opts {
+		opt(option)
+	}
+
+	return beam.ParDo(s, newWriteFn(uri, option), col)
+}
+
+type writeFn struct {
+	natsFn
+}
+
+func newWriteFn(uri string, option *writeOption) *writeFn {
+	return &writeFn{
+		natsFn: natsFn{
+			URI:       uri,
+			CredsFile: option.CredsFile,
+		},
+	}
+}
+
+func (fn *writeFn) ProcessElement(
+	ctx context.Context,
+	elem ProducerMessage,
+	emit func(PublishAck),
+) error {
+	msg := &nats.Msg{
+		Subject: elem.Subject,
+		Data:    elem.Data,
+		Header:  elem.Headers,
+	}
+
+	var opts []jetstream.PublishOpt
+	if elem.ID != "" {
+		opts = append(opts, jetstream.WithMsgID(elem.ID))
+	}
+
+	ack, err := fn.js.PublishMsg(ctx, msg, opts...)
+	if err != nil {
+		return fmt.Errorf("error publishing message: %v", err)
+	}
+
+	pubAck := PublishAck{
+		Stream:    ack.Stream,
+		Subject:   elem.Subject,
+		ID:        elem.ID,
+		Sequence:  ack.Sequence,
+		Duplicate: ack.Duplicate,
+	}
+	emit(pubAck)
+
+	return nil
+}
diff --git a/sdks/go/pkg/beam/io/natsio/write_option.go b/sdks/go/pkg/beam/io/natsio/write_option.go
new file mode 100644
index 0000000000000..b1ee48cbffe41
--- /dev/null
+++ b/sdks/go/pkg/beam/io/natsio/write_option.go
@@ -0,0 +1,31 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +package natsio + +type writeOption struct { + CredsFile string +} + +// WriteOptionFn is a function that can be passed to Write to configure options for +// writing messages. +type WriteOptionFn func(option *writeOption) + +// WriteUserCredentials sets the user credentials when connecting to NATS. +func WriteUserCredentials(credsFile string) WriteOptionFn { + return func(o *writeOption) { + o.CredsFile = credsFile + } +} diff --git a/sdks/go/pkg/beam/io/natsio/write_test.go b/sdks/go/pkg/beam/io/natsio/write_test.go new file mode 100644 index 0000000000000..5e9387ece5f66 --- /dev/null +++ b/sdks/go/pkg/beam/io/natsio/write_test.go @@ -0,0 +1,252 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package natsio + +import ( + "bytes" + "context" + "testing" + + "github.com/apache/beam/sdks/v2/go/pkg/beam" + "github.com/apache/beam/sdks/v2/go/pkg/beam/testing/passert" + "github.com/apache/beam/sdks/v2/go/pkg/beam/testing/ptest" + "github.com/google/go-cmp/cmp" + "github.com/nats-io/nats.go" +) + +func TestMain(m *testing.M) { + ptest.Main(m) +} + +func TestWrite(t *testing.T) { + stream := "STREAM" + subject := "subject" + + tests := []struct { + name string + input []any + wantAcks []any + wantMsgs []jsMsg + }{ + { + name: "Write messages and deduplicate based on ID", + input: []any{ + ProducerMessage{ + Subject: subject, + ID: "1", + Data: []byte("msg1a"), + }, + ProducerMessage{ + Subject: subject, + ID: "1", + Data: []byte("msg1b"), + }, + ProducerMessage{ + Subject: subject, + ID: "2", + Data: []byte("msg2"), + }, + }, + wantAcks: []any{ + PublishAck{ + Stream: stream, + Subject: subject, + ID: "1", + Sequence: 1, + Duplicate: false, + }, + PublishAck{ + Stream: stream, + Subject: subject, + ID: "1", + Sequence: 1, + Duplicate: true, + }, + PublishAck{ + Stream: stream, + Subject: subject, + ID: "2", + Sequence: 2, + Duplicate: false, + }, + }, + wantMsgs: []jsMsg{ + testMsg{ + subject: subject, + headers: nats.Header{nats.MsgIdHdr: []string{"1"}}, + data: []byte("msg1a"), + }, + testMsg{ + subject: subject, + headers: nats.Header{nats.MsgIdHdr: []string{"2"}}, + data: []byte("msg2"), + }, + }, + }, + { + name: "Write messages without ID", + input: []any{ + ProducerMessage{ + Subject: subject, + Data: []byte("msg1a"), + }, + ProducerMessage{ + Subject: subject, + Data: []byte("msg1b"), + }, + ProducerMessage{ + Subject: subject, + Data: []byte("msg2"), + }, + }, + wantAcks: []any{ + PublishAck{ + Stream: stream, + Subject: subject, + ID: "", + Sequence: 1, + Duplicate: false, + }, + PublishAck{ + Stream: stream, + Subject: subject, + ID: "", + Sequence: 2, + Duplicate: false, + }, + PublishAck{ + Stream: stream, + Subject: subject, + ID: "", + Sequence: 3, + Duplicate: false, + }, + }, + wantMsgs: []jsMsg{ + testMsg{ + subject: subject, + 
data: []byte("msg1a"), + }, + testMsg{ + subject: subject, + data: []byte("msg1b"), + }, + testMsg{ + subject: subject, + data: []byte("msg2"), + }, + }, + }, + { + name: "Write message with headers", + input: []any{ + ProducerMessage{ + Subject: subject, + ID: "1", + Headers: map[string][]string{"key": {"val"}}, + Data: []byte("msg1"), + }, + }, + wantAcks: []any{ + PublishAck{ + Stream: stream, + Subject: subject, + ID: "1", + Sequence: 1, + Duplicate: false, + }, + }, + wantMsgs: []jsMsg{ + testMsg{ + subject: subject, + headers: nats.Header{nats.MsgIdHdr: []string{"1"}, "key": []string{"val"}}, + data: []byte("msg1"), + }, + }, + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + ctx := context.Background() + srv := newServer(t) + uri := srv.ClientURL() + conn := newConn(t, uri) + js := newJetStream(t, conn) + + subjects := []string{subject} + createStream(t, ctx, js, stream, subjects) + cons := createConsumer(t, ctx, js, stream, subjects) + + p, s := beam.NewPipelineWithRoot() + + col := beam.Create(s, tc.input...) + gotAcks := Write(s, uri, col) + + passert.Equals(s, gotAcks, tc.wantAcks...) + ptest.RunAndValidate(t, p) + + gotMsgs := fetchMessages(t, cons, len(tc.input)+1) + + if gotLen, wantLen := len(gotMsgs), len(tc.wantMsgs); gotLen != wantLen { + t.Fatalf("Len() = %v, want %v", gotLen, wantLen) + } + + for i := range gotMsgs { + if gotSubject, wantSubject := gotMsgs[i].Subject(), tc.wantMsgs[i].Subject(); gotSubject != wantSubject { + t.Errorf("msg %d: Subject() = %v, want %v", i, gotSubject, wantSubject) + } + + if gotHeaders, wantHeaders := gotMsgs[i].Headers(), tc.wantMsgs[i].Headers(); !cmp.Equal( + gotHeaders, + wantHeaders, + ) { + t.Errorf("msg %d: Headers() = %v, want %v", i, gotHeaders, wantHeaders) + } + + if gotData, wantData := gotMsgs[i].Data(), tc.wantMsgs[i].Data(); !bytes.Equal( + gotData, + wantData, + ) { + t.Errorf("msg %d: Data() = %q, want %q", i, gotData, wantData) + } + } + }) + } +} + +type jsMsg interface { + Subject() string + Headers() nats.Header + Data() []byte +} + +type testMsg struct { + subject string + headers nats.Header + data []byte +} + +func (m testMsg) Subject() string { + return m.subject +} + +func (m testMsg) Headers() nats.Header { + return m.headers +} + +func (m testMsg) Data() []byte { + return m.data +} diff --git a/sdks/go/pkg/beam/model/fnexecution_v1/beam_fn_api.pb.go b/sdks/go/pkg/beam/model/fnexecution_v1/beam_fn_api.pb.go index 1d547470ea1ad..9d14cff3c7d61 100644 --- a/sdks/go/pkg/beam/model/fnexecution_v1/beam_fn_api.pb.go +++ b/sdks/go/pkg/beam/model/fnexecution_v1/beam_fn_api.pb.go @@ -27,7 +27,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.27.1 -// protoc v4.24.0--rc1 +// protoc v4.24.4 // source: org/apache/beam/model/fn_execution/v1/beam_fn_api.proto // TODO: Consider consolidating common components in another package @@ -1883,30 +1883,57 @@ func (x *ProcessBundleSplitRequest) GetDesiredSplits() map[string]*ProcessBundle // first_residual_element. // - The current bundle, if no further splits happen, will have done exactly // the work under primary_roots and all elements up to and including the -// channel splits last_primary_element. +// channel split's last_primary_element. // // This allows the SDK to relinquish ownership of and commit to not process some // of the elements that it may have been sent (the residual) while retaining // ownership and commitment to finish the other portion (the primary). 
 //
-// For example, lets say the SDK is processing elements A B C D E and a split
-// request comes in. The SDK could return a response with a channel split
-// representing a last_primary_element of 3 (D) and first_residual_element of 4
-// (E). The SDK is now responsible for processing A B C D and the runner must
-// process E in the future. A future split request could have the SDK split the
-// elements B into B1 and B2 and C into C1 and C2 representing their primary and
-// residual roots. The SDK would return a response with a channel split
-// representing a last_primary_element of 0 (A) and first_residual_element of 3
-// (D) with primary_roots (B1, C1) and residual_roots (B2, C2). The SDK is now
-// responsible for processing A B1 C1 and the runner must process C2 D2 (and E
-// from the prior split) in the future. Yet another future split request could
-// have the SDK could split B1 further into B1a and B1b primary and residuals
-// and return C2 as a residual (assuming C2 was left unprocessed). The SDK would
-// return a response with a channel split representing a last_primary_element of
-// 0 (A) and first_residual_element of 4 (E) with primary_roots (B1a) and
-// residual_roots (B1b, C1). The SDK is now responsible for processing A B1a the
-// runner must process B1b C1 (in addition to C2, D, E from prior splits) in the
-// future.
+// Example with three splits of a single bundle:
+// Let's say the SDK is processing elements [A B C D E]. These elements make
+// up the 0-indexed channel.
+//
+// ** First Split **
+// Channel Split = [ A B C D <> E ]
+// Primary Roots = [] (No elements were split)
+// Residual Roots = []
+//
+// Say a split request comes in. The SDK could return a response with a channel
+// split representing a last_primary_element of 3 (D) and
+// first_residual_element of 4 (E). The SDK is now responsible for processing A
+// B C D and the runner must process E in the future.
+//
+// (A B C D) | (E)
+//
+// ** Second Split **
+// Channel Split = [ A < B C > D E ]
+// Primary Roots = [B1 C1]
+// Residual Roots = [B2 C2]
+//
+// A future split request could have the SDK split the elements B into B1 and
+// B2 and C into C1 and C2 representing their primary and residual roots. The
+// SDK would return a response with a channel split representing a
+// last_primary_element of 0 (A) and first_residual_element of 3 (D) with
+// primary_roots (B1, C1) and residual_roots (B2, C2). The SDK is now
+// responsible for processing A B1 C1 and the runner must process B2 C2 D (and
+// E from the prior split) in the future.
+//
+// (A B1 C1) | (B2 C2 D)
+//
+// ** Third Split **
+// Channel Split = [ A < B C > D E ]
+// Primary Roots = [B1a]
+// Residual Roots = [B1b C1]
+//
+// Yet another future split request could have the SDK split B1 further into
+// B1a and B1b primary and residuals and return C1 as a residual (assuming
+// C1 was left unprocessed). The SDK would return a response with a channel
+// split representing a last_primary_element of 0 (A) and
+// first_residual_element of 3 (D) with primary_roots (B1a) and residual_roots
+// (B1b, C1). The SDK is now responsible for processing A B1a, and the runner
+// must process B1b C1 (in addition to B2, C2, D, and E from prior splits) in
+// the future.
+// +// (A B1a) | (B1b C1) // // For more rigorous definitions see https://s.apache.org/beam-breaking-fusion type ProcessBundleSplitResponse struct { diff --git a/sdks/go/pkg/beam/model/fnexecution_v1/beam_fn_api_grpc.pb.go b/sdks/go/pkg/beam/model/fnexecution_v1/beam_fn_api_grpc.pb.go index ac9e402750c4d..cd53ea805705e 100644 --- a/sdks/go/pkg/beam/model/fnexecution_v1/beam_fn_api_grpc.pb.go +++ b/sdks/go/pkg/beam/model/fnexecution_v1/beam_fn_api_grpc.pb.go @@ -17,7 +17,7 @@ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.1.0 -// - protoc v4.24.0--rc1 +// - protoc v4.24.4 // source: org/apache/beam/model/fn_execution/v1/beam_fn_api.proto package fnexecution_v1 diff --git a/sdks/go/pkg/beam/model/fnexecution_v1/beam_provision_api.pb.go b/sdks/go/pkg/beam/model/fnexecution_v1/beam_provision_api.pb.go index a24609b2fd059..26cf245f72069 100644 --- a/sdks/go/pkg/beam/model/fnexecution_v1/beam_provision_api.pb.go +++ b/sdks/go/pkg/beam/model/fnexecution_v1/beam_provision_api.pb.go @@ -22,7 +22,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.27.1 -// protoc v4.24.0--rc1 +// protoc v4.24.4 // source: org/apache/beam/model/fn_execution/v1/beam_provision_api.proto package fnexecution_v1 diff --git a/sdks/go/pkg/beam/model/fnexecution_v1/beam_provision_api_grpc.pb.go b/sdks/go/pkg/beam/model/fnexecution_v1/beam_provision_api_grpc.pb.go index f9c6f5681399b..9064b348b4c04 100644 --- a/sdks/go/pkg/beam/model/fnexecution_v1/beam_provision_api_grpc.pb.go +++ b/sdks/go/pkg/beam/model/fnexecution_v1/beam_provision_api_grpc.pb.go @@ -17,7 +17,7 @@ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.1.0 -// - protoc v4.24.0--rc1 +// - protoc v4.24.4 // source: org/apache/beam/model/fn_execution/v1/beam_provision_api.proto package fnexecution_v1 diff --git a/sdks/go/pkg/beam/model/jobmanagement_v1/beam_artifact_api.pb.go b/sdks/go/pkg/beam/model/jobmanagement_v1/beam_artifact_api.pb.go index 6a7663d77e9cf..85bb2e368970d 100644 --- a/sdks/go/pkg/beam/model/jobmanagement_v1/beam_artifact_api.pb.go +++ b/sdks/go/pkg/beam/model/jobmanagement_v1/beam_artifact_api.pb.go @@ -22,7 +22,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.27.1 -// protoc v4.24.0--rc1 +// protoc v4.24.4 // source: org/apache/beam/model/job_management/v1/beam_artifact_api.proto package jobmanagement_v1 diff --git a/sdks/go/pkg/beam/model/jobmanagement_v1/beam_artifact_api_grpc.pb.go b/sdks/go/pkg/beam/model/jobmanagement_v1/beam_artifact_api_grpc.pb.go index 6b381b96f3d10..28e43e21fbbdf 100644 --- a/sdks/go/pkg/beam/model/jobmanagement_v1/beam_artifact_api_grpc.pb.go +++ b/sdks/go/pkg/beam/model/jobmanagement_v1/beam_artifact_api_grpc.pb.go @@ -17,7 +17,7 @@ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.1.0 -// - protoc v4.24.0--rc1 +// - protoc v4.24.4 // source: org/apache/beam/model/job_management/v1/beam_artifact_api.proto package jobmanagement_v1 diff --git a/sdks/go/pkg/beam/model/jobmanagement_v1/beam_expansion_api.pb.go b/sdks/go/pkg/beam/model/jobmanagement_v1/beam_expansion_api.pb.go index 0f33c7ab9e3c8..8f7ca43ec0f5b 100644 --- a/sdks/go/pkg/beam/model/jobmanagement_v1/beam_expansion_api.pb.go +++ b/sdks/go/pkg/beam/model/jobmanagement_v1/beam_expansion_api.pb.go @@ -22,7 +22,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. 
// versions: // protoc-gen-go v1.27.1 -// protoc v4.24.0--rc1 +// protoc v4.24.4 // source: org/apache/beam/model/job_management/v1/beam_expansion_api.proto package jobmanagement_v1 diff --git a/sdks/go/pkg/beam/model/jobmanagement_v1/beam_expansion_api_grpc.pb.go b/sdks/go/pkg/beam/model/jobmanagement_v1/beam_expansion_api_grpc.pb.go index e2cc3c4f77ec3..f1c3782f5fb80 100644 --- a/sdks/go/pkg/beam/model/jobmanagement_v1/beam_expansion_api_grpc.pb.go +++ b/sdks/go/pkg/beam/model/jobmanagement_v1/beam_expansion_api_grpc.pb.go @@ -17,7 +17,7 @@ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.1.0 -// - protoc v4.24.0--rc1 +// - protoc v4.24.4 // source: org/apache/beam/model/job_management/v1/beam_expansion_api.proto package jobmanagement_v1 diff --git a/sdks/go/pkg/beam/model/jobmanagement_v1/beam_job_api.pb.go b/sdks/go/pkg/beam/model/jobmanagement_v1/beam_job_api.pb.go index d93130d26d9f7..62e0b313ec2da 100644 --- a/sdks/go/pkg/beam/model/jobmanagement_v1/beam_job_api.pb.go +++ b/sdks/go/pkg/beam/model/jobmanagement_v1/beam_job_api.pb.go @@ -22,7 +22,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.27.1 -// protoc v4.24.0--rc1 +// protoc v4.24.4 // source: org/apache/beam/model/job_management/v1/beam_job_api.proto package jobmanagement_v1 diff --git a/sdks/go/pkg/beam/model/jobmanagement_v1/beam_job_api_grpc.pb.go b/sdks/go/pkg/beam/model/jobmanagement_v1/beam_job_api_grpc.pb.go index 08da7e4643c3a..38f2c85a1c1cd 100644 --- a/sdks/go/pkg/beam/model/jobmanagement_v1/beam_job_api_grpc.pb.go +++ b/sdks/go/pkg/beam/model/jobmanagement_v1/beam_job_api_grpc.pb.go @@ -17,7 +17,7 @@ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.1.0 -// - protoc v4.24.0--rc1 +// - protoc v4.24.4 // source: org/apache/beam/model/job_management/v1/beam_job_api.proto package jobmanagement_v1 diff --git a/sdks/go/pkg/beam/model/pipeline_v1/beam_runner_api.pb.go b/sdks/go/pkg/beam/model/pipeline_v1/beam_runner_api.pb.go index b20a5dccbe05b..49df2b5c2e597 100644 --- a/sdks/go/pkg/beam/model/pipeline_v1/beam_runner_api.pb.go +++ b/sdks/go/pkg/beam/model/pipeline_v1/beam_runner_api.pb.go @@ -22,7 +22,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.27.1 -// protoc v4.24.0--rc1 +// protoc v4.24.4 // source: org/apache/beam/model/pipeline/v1/beam_runner_api.proto package pipeline_v1 @@ -1857,6 +1857,10 @@ const ( // SDKs should convert the size to bytes, but can allow users to specify human-friendly units (e.g. GiB). // Payload: ASCII encoded string of the base 10 representation of an integer number of bytes. StandardResourceHints_MIN_RAM_BYTES StandardResourceHints_Enum = 1 + // Describes desired number of CPUs available in transform's execution environment. + // SDKs should accept and validate a positive integer count. + // Payload: ASCII encoded string of the base 10 representation of an integer number of CPUs. + StandardResourceHints_CPU_COUNT StandardResourceHints_Enum = 2 ) // Enum value maps for StandardResourceHints_Enum. 
@@ -1864,10 +1868,12 @@ var ( StandardResourceHints_Enum_name = map[int32]string{ 0: "ACCELERATOR", 1: "MIN_RAM_BYTES", + 2: "CPU_COUNT", } StandardResourceHints_Enum_value = map[string]int32{ "ACCELERATOR": 0, "MIN_RAM_BYTES": 1, + "CPU_COUNT": 2, } ) @@ -9223,42 +9229,45 @@ var file_org_apache_beam_model_pipeline_v1_beam_runner_api_proto_rawDesc = []byt 0x65, 0x63, 0x75, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x53, 0x74, 0x61, 0x67, 0x65, 0x50, 0x61, 0x79, 0x6c, 0x6f, 0x61, 0x64, 0x2e, 0x54, 0x69, 0x6d, 0x65, 0x72, 0x49, 0x64, 0x48, 0x00, 0x52, 0x05, 0x74, 0x69, 0x6d, 0x65, 0x72, 0x42, 0x08, 0x0a, 0x06, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x22, - 0x8f, 0x01, 0x0a, 0x15, 0x53, 0x74, 0x61, 0x6e, 0x64, 0x61, 0x72, 0x64, 0x52, 0x65, 0x73, 0x6f, - 0x75, 0x72, 0x63, 0x65, 0x48, 0x69, 0x6e, 0x74, 0x73, 0x22, 0x76, 0x0a, 0x04, 0x45, 0x6e, 0x75, - 0x6d, 0x12, 0x34, 0x0a, 0x0b, 0x41, 0x43, 0x43, 0x45, 0x4c, 0x45, 0x52, 0x41, 0x54, 0x4f, 0x52, - 0x10, 0x00, 0x1a, 0x23, 0xa2, 0xb4, 0xfa, 0xc2, 0x05, 0x1d, 0x62, 0x65, 0x61, 0x6d, 0x3a, 0x72, - 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x73, 0x3a, 0x61, 0x63, 0x63, 0x65, 0x6c, 0x65, 0x72, - 0x61, 0x74, 0x6f, 0x72, 0x3a, 0x76, 0x31, 0x12, 0x38, 0x0a, 0x0d, 0x4d, 0x49, 0x4e, 0x5f, 0x52, - 0x41, 0x4d, 0x5f, 0x42, 0x59, 0x54, 0x45, 0x53, 0x10, 0x01, 0x1a, 0x25, 0xa2, 0xb4, 0xfa, 0xc2, - 0x05, 0x1f, 0x62, 0x65, 0x61, 0x6d, 0x3a, 0x72, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x73, - 0x3a, 0x6d, 0x69, 0x6e, 0x5f, 0x72, 0x61, 0x6d, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x3a, 0x76, - 0x31, 0x32, 0x8f, 0x01, 0x0a, 0x11, 0x54, 0x65, 0x73, 0x74, 0x53, 0x74, 0x72, 0x65, 0x61, 0x6d, - 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x12, 0x7a, 0x0a, 0x06, 0x45, 0x76, 0x65, 0x6e, 0x74, - 0x73, 0x12, 0x30, 0x2e, 0x6f, 0x72, 0x67, 0x2e, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2e, 0x62, - 0x65, 0x61, 0x6d, 0x2e, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, - 0x6e, 0x65, 0x2e, 0x76, 0x31, 0x2e, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x73, 0x52, 0x65, 0x71, 0x75, - 0x65, 0x73, 0x74, 0x1a, 0x3a, 0x2e, 0x6f, 0x72, 0x67, 0x2e, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, + 0xc2, 0x01, 0x0a, 0x15, 0x53, 0x74, 0x61, 0x6e, 0x64, 0x61, 0x72, 0x64, 0x52, 0x65, 0x73, 0x6f, + 0x75, 0x72, 0x63, 0x65, 0x48, 0x69, 0x6e, 0x74, 0x73, 0x22, 0xa8, 0x01, 0x0a, 0x04, 0x45, 0x6e, + 0x75, 0x6d, 0x12, 0x34, 0x0a, 0x0b, 0x41, 0x43, 0x43, 0x45, 0x4c, 0x45, 0x52, 0x41, 0x54, 0x4f, + 0x52, 0x10, 0x00, 0x1a, 0x23, 0xa2, 0xb4, 0xfa, 0xc2, 0x05, 0x1d, 0x62, 0x65, 0x61, 0x6d, 0x3a, + 0x72, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x73, 0x3a, 0x61, 0x63, 0x63, 0x65, 0x6c, 0x65, + 0x72, 0x61, 0x74, 0x6f, 0x72, 0x3a, 0x76, 0x31, 0x12, 0x38, 0x0a, 0x0d, 0x4d, 0x49, 0x4e, 0x5f, + 0x52, 0x41, 0x4d, 0x5f, 0x42, 0x59, 0x54, 0x45, 0x53, 0x10, 0x01, 0x1a, 0x25, 0xa2, 0xb4, 0xfa, + 0xc2, 0x05, 0x1f, 0x62, 0x65, 0x61, 0x6d, 0x3a, 0x72, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, + 0x73, 0x3a, 0x6d, 0x69, 0x6e, 0x5f, 0x72, 0x61, 0x6d, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x73, 0x3a, + 0x76, 0x31, 0x12, 0x30, 0x0a, 0x09, 0x43, 0x50, 0x55, 0x5f, 0x43, 0x4f, 0x55, 0x4e, 0x54, 0x10, + 0x02, 0x1a, 0x21, 0xa2, 0xb4, 0xfa, 0xc2, 0x05, 0x1b, 0x62, 0x65, 0x61, 0x6d, 0x3a, 0x72, 0x65, + 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x73, 0x3a, 0x63, 0x70, 0x75, 0x5f, 0x63, 0x6f, 0x75, 0x6e, + 0x74, 0x3a, 0x76, 0x31, 0x32, 0x8f, 0x01, 0x0a, 0x11, 0x54, 0x65, 0x73, 0x74, 0x53, 0x74, 0x72, + 0x65, 0x61, 0x6d, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x12, 0x7a, 0x0a, 0x06, 0x45, 0x76, + 0x65, 0x6e, 0x74, 0x73, 0x12, 0x30, 0x2e, 0x6f, 0x72, 0x67, 
0x2e, 0x61, 0x70, 0x61, 0x63, 0x68, + 0x65, 0x2e, 0x62, 0x65, 0x61, 0x6d, 0x2e, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x70, 0x69, 0x70, + 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x2e, 0x76, 0x31, 0x2e, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x73, 0x52, + 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x3a, 0x2e, 0x6f, 0x72, 0x67, 0x2e, 0x61, 0x70, 0x61, + 0x63, 0x68, 0x65, 0x2e, 0x62, 0x65, 0x61, 0x6d, 0x2e, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x70, + 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x2e, 0x76, 0x31, 0x2e, 0x54, 0x65, 0x73, 0x74, 0x53, + 0x74, 0x72, 0x65, 0x61, 0x6d, 0x50, 0x61, 0x79, 0x6c, 0x6f, 0x61, 0x64, 0x2e, 0x45, 0x76, 0x65, + 0x6e, 0x74, 0x22, 0x00, 0x30, 0x01, 0x3a, 0x3f, 0x0a, 0x08, 0x62, 0x65, 0x61, 0x6d, 0x5f, 0x75, + 0x72, 0x6e, 0x12, 0x21, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, + 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x45, 0x6e, 0x75, 0x6d, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x4f, 0x70, + 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0xc4, 0xa6, 0xaf, 0x58, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, + 0x62, 0x65, 0x61, 0x6d, 0x55, 0x72, 0x6e, 0x3a, 0x49, 0x0a, 0x0d, 0x62, 0x65, 0x61, 0x6d, 0x5f, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x12, 0x21, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, + 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x45, 0x6e, 0x75, 0x6d, 0x56, + 0x61, 0x6c, 0x75, 0x65, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0xc5, 0xa6, 0xaf, 0x58, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x62, 0x65, 0x61, 0x6d, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x61, + 0x6e, 0x74, 0x42, 0x78, 0x0a, 0x21, 0x6f, 0x72, 0x67, 0x2e, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2e, 0x62, 0x65, 0x61, 0x6d, 0x2e, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x70, 0x69, 0x70, 0x65, - 0x6c, 0x69, 0x6e, 0x65, 0x2e, 0x76, 0x31, 0x2e, 0x54, 0x65, 0x73, 0x74, 0x53, 0x74, 0x72, 0x65, - 0x61, 0x6d, 0x50, 0x61, 0x79, 0x6c, 0x6f, 0x61, 0x64, 0x2e, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x22, - 0x00, 0x30, 0x01, 0x3a, 0x3f, 0x0a, 0x08, 0x62, 0x65, 0x61, 0x6d, 0x5f, 0x75, 0x72, 0x6e, 0x12, - 0x21, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, - 0x66, 0x2e, 0x45, 0x6e, 0x75, 0x6d, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x4f, 0x70, 0x74, 0x69, 0x6f, - 0x6e, 0x73, 0x18, 0xc4, 0xa6, 0xaf, 0x58, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x62, 0x65, 0x61, - 0x6d, 0x55, 0x72, 0x6e, 0x3a, 0x49, 0x0a, 0x0d, 0x62, 0x65, 0x61, 0x6d, 0x5f, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x61, 0x6e, 0x74, 0x12, 0x21, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, - 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x45, 0x6e, 0x75, 0x6d, 0x56, 0x61, 0x6c, 0x75, - 0x65, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0xc5, 0xa6, 0xaf, 0x58, 0x20, 0x01, 0x28, - 0x09, 0x52, 0x0c, 0x62, 0x65, 0x61, 0x6d, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x42, - 0x78, 0x0a, 0x21, 0x6f, 0x72, 0x67, 0x2e, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2e, 0x62, 0x65, - 0x61, 0x6d, 0x2e, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, - 0x65, 0x2e, 0x76, 0x31, 0x42, 0x09, 0x52, 0x75, 0x6e, 0x6e, 0x65, 0x72, 0x41, 0x70, 0x69, 0x5a, - 0x48, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x61, 0x70, 0x61, 0x63, - 0x68, 0x65, 0x2f, 0x62, 0x65, 0x61, 0x6d, 0x2f, 0x73, 0x64, 0x6b, 0x73, 0x2f, 0x76, 0x32, 0x2f, - 0x67, 0x6f, 0x2f, 0x70, 0x6b, 0x67, 0x2f, 0x62, 0x65, 0x61, 0x6d, 0x2f, 0x6d, 0x6f, 0x64, 0x65, - 0x6c, 0x2f, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x76, 0x31, 0x3b, 0x70, 0x69, - 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x76, 0x31, 0x62, 0x06, 0x70, 0x72, 0x6f, 
0x74, 0x6f, - 0x33, + 0x6c, 0x69, 0x6e, 0x65, 0x2e, 0x76, 0x31, 0x42, 0x09, 0x52, 0x75, 0x6e, 0x6e, 0x65, 0x72, 0x41, + 0x70, 0x69, 0x5a, 0x48, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x61, + 0x70, 0x61, 0x63, 0x68, 0x65, 0x2f, 0x62, 0x65, 0x61, 0x6d, 0x2f, 0x73, 0x64, 0x6b, 0x73, 0x2f, + 0x76, 0x32, 0x2f, 0x67, 0x6f, 0x2f, 0x70, 0x6b, 0x67, 0x2f, 0x62, 0x65, 0x61, 0x6d, 0x2f, 0x6d, + 0x6f, 0x64, 0x65, 0x6c, 0x2f, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x76, 0x31, + 0x3b, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x76, 0x31, 0x62, 0x06, 0x70, 0x72, + 0x6f, 0x74, 0x6f, 0x33, } var ( diff --git a/sdks/go/pkg/beam/model/pipeline_v1/beam_runner_api_grpc.pb.go b/sdks/go/pkg/beam/model/pipeline_v1/beam_runner_api_grpc.pb.go index d5e65f7b768d4..20a30cf4dd011 100644 --- a/sdks/go/pkg/beam/model/pipeline_v1/beam_runner_api_grpc.pb.go +++ b/sdks/go/pkg/beam/model/pipeline_v1/beam_runner_api_grpc.pb.go @@ -17,7 +17,7 @@ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.1.0 -// - protoc v4.24.0--rc1 +// - protoc v4.24.4 // source: org/apache/beam/model/pipeline/v1/beam_runner_api.proto package pipeline_v1 diff --git a/sdks/go/pkg/beam/model/pipeline_v1/endpoints.pb.go b/sdks/go/pkg/beam/model/pipeline_v1/endpoints.pb.go index 74348ddc3b336..2dfaffa2bff00 100644 --- a/sdks/go/pkg/beam/model/pipeline_v1/endpoints.pb.go +++ b/sdks/go/pkg/beam/model/pipeline_v1/endpoints.pb.go @@ -21,7 +21,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.27.1 -// protoc v4.24.0--rc1 +// protoc v4.24.4 // source: org/apache/beam/model/pipeline/v1/endpoints.proto package pipeline_v1 diff --git a/sdks/go/pkg/beam/model/pipeline_v1/external_transforms.pb.go b/sdks/go/pkg/beam/model/pipeline_v1/external_transforms.pb.go index 0bc21a56685eb..edbe82264f5ea 100644 --- a/sdks/go/pkg/beam/model/pipeline_v1/external_transforms.pb.go +++ b/sdks/go/pkg/beam/model/pipeline_v1/external_transforms.pb.go @@ -21,7 +21,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.27.1 -// protoc v4.24.0--rc1 +// protoc v4.24.4 // source: org/apache/beam/model/pipeline/v1/external_transforms.proto package pipeline_v1 diff --git a/sdks/go/pkg/beam/model/pipeline_v1/metrics.pb.go b/sdks/go/pkg/beam/model/pipeline_v1/metrics.pb.go index ec3e0d704a803..60edad2363be7 100644 --- a/sdks/go/pkg/beam/model/pipeline_v1/metrics.pb.go +++ b/sdks/go/pkg/beam/model/pipeline_v1/metrics.pb.go @@ -21,7 +21,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.27.1 -// protoc v4.24.0--rc1 +// protoc v4.24.4 // source: org/apache/beam/model/pipeline/v1/metrics.proto package pipeline_v1 diff --git a/sdks/go/pkg/beam/model/pipeline_v1/schema.pb.go b/sdks/go/pkg/beam/model/pipeline_v1/schema.pb.go index 717fbbfb7a691..4bc6a57044cd0 100644 --- a/sdks/go/pkg/beam/model/pipeline_v1/schema.pb.go +++ b/sdks/go/pkg/beam/model/pipeline_v1/schema.pb.go @@ -24,7 +24,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. 
 // versions:
 // 	protoc-gen-go v1.27.1
-// 	protoc        v4.24.0--rc1
+// 	protoc        v4.24.4
 // source: org/apache/beam/model/pipeline/v1/schema.proto

 package pipeline_v1
diff --git a/sdks/go/pkg/beam/model/pipeline_v1/standard_window_fns.pb.go b/sdks/go/pkg/beam/model/pipeline_v1/standard_window_fns.pb.go
index dccd7d4275032..e0522806df73a 100644
--- a/sdks/go/pkg/beam/model/pipeline_v1/standard_window_fns.pb.go
+++ b/sdks/go/pkg/beam/model/pipeline_v1/standard_window_fns.pb.go
@@ -22,7 +22,7 @@
 // Code generated by protoc-gen-go. DO NOT EDIT.
 // versions:
 // 	protoc-gen-go v1.27.1
-// 	protoc        v4.24.0--rc1
+// 	protoc        v4.24.4
 // source: org/apache/beam/model/pipeline/v1/standard_window_fns.proto

 package pipeline_v1
diff --git a/sdks/go/pkg/beam/options/resource/hint.go b/sdks/go/pkg/beam/options/resource/hint.go
index 1538fe65def28..d823f4feafa9c 100644
--- a/sdks/go/pkg/beam/options/resource/hint.go
+++ b/sdks/go/pkg/beam/options/resource/hint.go
@@ -196,3 +196,40 @@ func (h acceleratorHint) MergeWithOuter(outer Hint) Hint {
 func (h acceleratorHint) String() string {
 	return fmt.Sprintf("accelerator=%v", h.value)
 }
+
+// CPUCount hints that this scope should be put in a machine with at least this many CPUs or vCPUs.
+//
+// Hints are advisory only and runners may not respect them.
+//
+// See https://beam.apache.org/documentation/runtime/resource-hints/ for more information about
+// resource hints.
+func CPUCount(v uint64) Hint {
+	return CPUCountHint{value: v}
+}
+
+type CPUCountHint struct {
+	value uint64
+}
+
+func (CPUCountHint) URN() string {
+	return "beam:resources:cpu_count:v1"
+}
+
+func (h CPUCountHint) Payload() []byte {
+	// Go strings are utf8, and if the string is ascii,
+	// byte conversion handles that directly.
+	return []byte(strconv.FormatUint(h.value, 10))
+}
+
+// MergeWithOuter by keeping the maximum of the two cpu counts.
+func (h CPUCountHint) MergeWithOuter(outer Hint) Hint {
+	// Intentional runtime panic from type assertion to catch hint merge errors.
+	if outer.(CPUCountHint).value > h.value {
+		return outer
+	}
+	return h
+}
+
+func (h CPUCountHint) String() string {
+	return fmt.Sprintf("cpu_count=%v", h.value)
+}
diff --git a/sdks/go/pkg/beam/options/resource/hint_test.go b/sdks/go/pkg/beam/options/resource/hint_test.go
index cf24b47b6c916..7c2a1df792941 100644
--- a/sdks/go/pkg/beam/options/resource/hint_test.go
+++ b/sdks/go/pkg/beam/options/resource/hint_test.go
@@ -111,6 +111,38 @@ func TestParseMinRAMHint_panic(t *testing.T) {
 	ParseMinRAM("a bad byte string")
 }
+
+func TestCPUCountHint_MergeWith(t *testing.T) {
+	low := CPUCountHint{value: 2}
+	high := CPUCountHint{value: 128}
+
+	if got, want := low.MergeWithOuter(high), high; got != want {
+		t.Errorf("%v.MergeWith(%v) = %v, want %v", low, high, got, want)
+	}
+	if got, want := high.MergeWithOuter(low), high; got != want {
+		t.Errorf("%v.MergeWith(%v) = %v, want %v", high, low, got, want)
+	}
+}
+
+func TestCPUCountHint_Payload(t *testing.T) {
+	tests := []struct {
+		value   uint64
+		payload string
+	}{
+		{0, "0"},
+		{2, "2"},
+		{11, "11"},
+		{2003, "2003"},
+		{1.2e7, "12000000"},
+	}
+
+	for _, test := range tests {
+		h := CPUCountHint{value: test.value}
+		if got, want := h.Payload(), []byte(test.payload); !bytes.Equal(got, want) {
+			t.Errorf("%v.Payload() = %v, want %v", h, got, want)
+		}
+	}
+}
+
 // We copy the URN from the proto for use as a constant rather than perform a direct look up
 // each time, or increase initialization time.
However we do need to validate that they are // correct, and match the standard hint urns, so that's done here. @@ -130,7 +162,11 @@ func TestStandardHintUrns(t *testing.T) { }, { h: MinRAMBytes(2e9), urn: getStandardURN(pipepb.StandardResourceHints_MIN_RAM_BYTES), + }, { + h: CPUCount(4), + urn: getStandardURN(pipepb.StandardResourceHints_CPU_COUNT), }} + for _, test := range tests { if got, want := test.h.URN(), test.urn; got != want { t.Errorf("Checked urn for %T, got %q, want %q", test.h, got, want) @@ -154,12 +190,12 @@ func (h customHint) MergeWithOuter(outer Hint) Hint { } func TestHints_Equal(t *testing.T) { - hs := NewHints(MinRAMBytes(2e9), Accelerator("type:pants;count1;install-pajamas")) + hs := NewHints(MinRAMBytes(2e9), Accelerator("type:pants;count1;install-pajamas"), CPUCount(4)) if got, want := hs.Equal(hs), true; got != want { t.Errorf("Self equal test: hs.Equal(hs) = %v, want %v", got, want) } - eq := NewHints(MinRAMBytes(2e9), Accelerator("type:pants;count1;install-pajamas")) + eq := NewHints(MinRAMBytes(2e9), Accelerator("type:pants;count1;install-pajamas"), CPUCount(4)) if got, want := hs.Equal(eq), true; got != want { t.Errorf("identical equal test: hs.Equal(eq) = %v, want %v", got, want) } @@ -223,12 +259,13 @@ func TestHints_MergeWithOuter(t *testing.T) { func TestHints_Payloads(t *testing.T) { { - hs := NewHints(MinRAMBytes(2e9), Accelerator("type:jeans;count1;")) + hs := NewHints(MinRAMBytes(2e9), Accelerator("type:jeans;count1;"), CPUCount(4)) got := hs.Payloads() want := map[string][]byte{ "beam:resources:min_ram_bytes:v1": []byte("2000000000"), "beam:resources:accelerator:v1": []byte("type:jeans;count1;"), + "beam:resources:cpu_count:v1": []byte("4"), } if !reflect.DeepEqual(got, want) { t.Errorf("hs.Payloads() = %v, want %v", got, want) @@ -248,7 +285,7 @@ func TestHints_Payloads(t *testing.T) { func TestHints_NilHints(t *testing.T) { var hs1, hs2 Hints - hs := NewHints(MinRAMBytes(2e9), Accelerator("type:pants;count1;install-pajamas")) + hs := NewHints(MinRAMBytes(2e9), Accelerator("type:pants;count1;install-pajamas"), CPUCount(4)) if got, want := hs1.Equal(hs2), true; got != want { t.Errorf("nils equal test: (nil).Equal(nil) = %v, want %v", got, want) diff --git a/sdks/go/pkg/beam/pardo.go b/sdks/go/pkg/beam/pardo.go index d18945834d6d7..629ce329c9bae 100644 --- a/sdks/go/pkg/beam/pardo.go +++ b/sdks/go/pkg/beam/pardo.go @@ -157,11 +157,14 @@ func ParDo0(s Scope, dofn any, col PCollection, opts ...Option) { // struct may also define Setup, StartBundle, FinishBundle and Teardown methods. // The struct is JSON-serialized and may contain construction-time values. // +// Functions and types used as DoFns must be registered with beam using the +// beam `register` package, so they may execute on distributed workers. +// Functions must not be anonymous or closures, or they will fail at execution time. +// // Conceptually, when a ParDo transform is executed, the elements of the input // PCollection are first divided up into some number of "bundles". These are -// farmed off to distributed worker machines (or run locally, if using the -// direct runner). For each bundle of input elements processing proceeds as -// follows: +// farmed off to distributed worker machines (or locally on a local runner instance). 
+// For each bundle of input elements processing proceeds as follows:
 //
 //  - If a struct, a fresh instance of the argument DoFn is created on a
 //    worker from json serialization, and the Setup method is called on this
@@ -187,10 +190,11 @@ func ParDo0(s Scope, dofn any, col PCollection, opts ...Option) {
 //
 // For example:
 //
+//	func stringLen(word string) int { return len(word) }
+//	func init() { register.Function1x1(stringLen) }
+//
 //	words := beam.ParDo(s, &Foo{...}, ...)
-//	lengths := beam.ParDo(s, func (word string) int) {
-//	      return len(word)
-//	}, words)
+//	lengths := beam.ParDo(s, stringLen, words)
 //
 // Each output element has the same timestamp and is in the same windows as its
 // corresponding input element. The timestamp can be accessed and/or emitted by
@@ -207,28 +211,34 @@ func ParDo0(s Scope, dofn any, col PCollection, opts ...Option) {
 // options, and their contents accessible to each of the DoFn operations. For
 // example:
 //
+//	func filterLessThanCutoff(word string, cutoff int, emit func(string)) {
+//		if len(word) < cutoff {
+//			emit(word)
+//		}
+//	}
+//	func init() { register.Function3x0(filterLessThanCutoff) }
+//
 //	words := ...
 //	cutoff := ...  // Singleton PCollection
-//	smallWords := beam.ParDo(s, func (word string, cutoff int, emit func(string)) {
-//	    if len(word) < cutoff {
-//	        emit(word)
-//	    }
-//	}, words, beam.SideInput{Input: cutoff})
+//	smallWords := beam.ParDo(s, filterLessThanCutoff, words, beam.SideInput{Input: cutoff})
 //
 // # Additional Outputs
 //
 // Optionally, a ParDo transform can produce zero or multiple output
 // PCollections. Note the use of ParDo2 to specify 2 outputs. For example:
 //
+//	func partitionAtCutoff(word string, cutoff int, small, big func(string)) {
+//		if len(word) < cutoff {
+//			small(word)
+//		} else {
+//			big(word)
+//		}
+//	}
+//	func init() { register.Function4x0(partitionAtCutoff) }
+//
 //	words := ...
 //	cutoff := ...  // Singleton PCollection
-//	small, big := beam.ParDo2(s, func (word string, cutoff int, small, big func(string)) {
-//	    if len(word) < cutoff {
-//	        small(word)
-//	    } else {
-//	        big(word)
-//	    }
-//	}, words, beam.SideInput{Input: cutoff})
+//	small, big := beam.ParDo2(s, partitionAtCutoff, words, beam.SideInput{Input: cutoff})
 //
 // By default, the Coders for the elements of each output PCollection are
 // inferred from the concrete type.
diff --git a/sdks/go/pkg/beam/partition.go b/sdks/go/pkg/beam/partition.go
index 37498ddbc0bd5..1c79965ea63b2 100644
--- a/sdks/go/pkg/beam/partition.go
+++ b/sdks/go/pkg/beam/partition.go
@@ -39,6 +39,19 @@ var (
 //
 // A PartitionFn has the signature `func(T) int.`
 //
+//	func lenToTen(s string) int {
+//		if len(s) > 9 {
+//			return 10
+//		}
+//		return len(s)
+//	}
+//
+//	// Partition functions must be registered with Beam, and must not be closures.
+//	func init() { register.Function1x1(lenToTen) }
+//
+//	// The number of partitions goes up to 11 since we can return 0 through 10.
+//	wordsByLength := beam.Partition(s, 11, lenToTen, inputStrings)
+//
 // T is permitted to be a KV.
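+//
+//	// The returned slice is indexed by partition number, so individual
+//	// partitions can be consumed directly. (Illustrative sketch only:
+//	// logShortWords stands in for any DoFn registered via the register package.)
+//	shortWords := wordsByLength[3] // words of exactly length 3
+//	beam.ParDo0(s, logShortWords, shortWords)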
func Partition(s Scope, n int, fn any, col PCollection) []PCollection { s = s.Scope(fmt.Sprintf("Partition(%v)", n)) diff --git a/sdks/go/pkg/beam/pipeline.go b/sdks/go/pkg/beam/pipeline.go index b3a2a10dc1baa..c591eeb33722c 100644 --- a/sdks/go/pkg/beam/pipeline.go +++ b/sdks/go/pkg/beam/pipeline.go @@ -16,6 +16,8 @@ package beam import ( + "context" + "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/metrics" ) @@ -47,6 +49,19 @@ func (s Scope) Scope(name string) Scope { return Scope{scope: scope, real: s.real} } +// WithContext creates a named subscope with an attached context for the +// represented composite transform. Values from that context may be +// extracted and added to the composite PTransform or generate a new +// environment for scoped transforms. +// +// If you're not sure whether these apply to your transform, use Scope +// instead, and do not set a context. +func (s Scope) WithContext(ctx context.Context, name string) Scope { + newS := s.Scope(name) + newS.scope.Context = ctx + return newS +} + func (s Scope) String() string { if !s.IsValid() { return "" diff --git a/sdks/go/pkg/beam/runners/direct/direct_test.go b/sdks/go/pkg/beam/runners/direct/direct_test.go index a8108580aa2e9..bcfce612da338 100644 --- a/sdks/go/pkg/beam/runners/direct/direct_test.go +++ b/sdks/go/pkg/beam/runners/direct/direct_test.go @@ -29,6 +29,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/mtime" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/window" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/metrics" + "github.com/apache/beam/sdks/v2/go/pkg/beam/register" "github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/filter" "github.com/google/go-cmp/cmp" ) @@ -542,3 +543,100 @@ func TestMain(m *testing.M) { beam.Init() os.Exit(m.Run()) } + +func init() { + // Basic Registration + // beam.RegisterFunction(identity) + // beam.RegisterType(reflect.TypeOf((*source)(nil))) + // beam.RegisterType(reflect.TypeOf((*discard)(nil))) + + // Generic registration + register.Function2x0(identity) + register.DoFn2x0[[]byte, func(int)]((*source)(nil)) + register.DoFn1x0[int]((*discard)(nil)) + register.Emitter1[int]() +} + +type source struct { + Count int +} + +func (fn *source) ProcessElement(_ []byte, emit func(int)) { + for i := 0; i < fn.Count; i++ { + emit(i) + } +} + +func identity(v int, emit func(int)) { + emit(v) +} + +type discard struct { + processed int +} + +func (fn *discard) ProcessElement(int) { + fn.processed++ +} + +// BenchmarkPipe checks basic throughput and exec overhead with everything registered. 
+//
+// Just registered: ~700-900ns per call, 330B per DoFn, across 5 allocs per DoFn
+//
+// goos: linux
+// goarch: amd64
+// pkg: github.com/apache/beam/sdks/v2/go/pkg/beam/runners/direct
+// cpu: 12th Gen Intel(R) Core(TM) i7-1260P
+// BenchmarkPipe/dofns=0-16      1657698    763.0 ns/op    10.49 MB/s    763.0 ns/elm     320 B/op      6 allocs/op
+// BenchmarkPipe/dofns=1-16       832784     1294 ns/op    12.37 MB/s     1294 ns/elm     656 B/op     11 allocs/op
+// BenchmarkPipe/dofns=2-16       633345     1798 ns/op    13.35 MB/s    899.0 ns/elm     992 B/op     16 allocs/op
+// BenchmarkPipe/dofns=3-16       471106     2446 ns/op    13.08 MB/s    815.4 ns/elm    1329 B/op     21 allocs/op
+// BenchmarkPipe/dofns=5-16       340099     3634 ns/op    13.21 MB/s    726.8 ns/elm    2001 B/op     31 allocs/op
+// BenchmarkPipe/dofns=10-16      183429     6957 ns/op    12.65 MB/s    695.7 ns/elm    3683 B/op     56 allocs/op
+// BenchmarkPipe/dofns=100-16      17956    65986 ns/op    12.25 MB/s    659.9 ns/elm   33975 B/op    506 allocs/op
+//
+// Optimized w/ Generic reg: ~200-300ns per call, 150B per DoFn, across 2 allocs per DoFn
+//
+// goos: linux
+// goarch: amd64
+// pkg: github.com/apache/beam/sdks/v2/go/pkg/beam/runners/direct
+// cpu: 12th Gen Intel(R) Core(TM) i7-1260P
+// BenchmarkPipe/dofns=0-16      9319206    131.5 ns/op    60.85 MB/s    131.5 ns/elm     152 B/op      2 allocs/op
+// BenchmarkPipe/dofns=1-16      4465477    268.3 ns/op    59.63 MB/s    268.3 ns/elm     304 B/op      3 allocs/op
+// BenchmarkPipe/dofns=2-16      2876710    431.9 ns/op    55.56 MB/s    216.0 ns/elm     456 B/op      5 allocs/op
+// BenchmarkPipe/dofns=3-16      2096349    562.1 ns/op    56.93 MB/s    187.4 ns/elm     608 B/op      7 allocs/op
+// BenchmarkPipe/dofns=5-16      1347927    823.8 ns/op    58.27 MB/s    164.8 ns/elm     912 B/op     11 allocs/op
+// BenchmarkPipe/dofns=10-16      737594     1590 ns/op    55.36 MB/s    159.0 ns/elm    1672 B/op     21 allocs/op
+// BenchmarkPipe/dofns=100-16      60728    19696 ns/op    41.02 MB/s    197.0 ns/elm   15357 B/op    201 allocs/op
+func BenchmarkPipe(b *testing.B) {
+	makeBench := func(numDoFns int) func(b *testing.B) {
+		return func(b *testing.B) {
+			b.ReportAllocs()
+			b.SetBytes(8 * int64(numDoFns+1))
+
+			disc := &discard{}
+			p, s := beam.NewPipelineWithRoot()
+			imp := beam.Impulse(s)
+			src := beam.ParDo(s, &source{Count: b.N}, imp)
+			iden := src
+			for i := 0; i < numDoFns; i++ {
+				iden = beam.ParDo(s, identity, iden)
+			}
+			beam.ParDo0(s, disc, iden)
+			if _, err := Execute(context.TODO(), p); err != nil {
+				b.Fatal(err)
+			}
+			if disc.processed != b.N {
+				b.Fatalf("processed didn't match bench number: got %v want %v", disc.processed, b.N)
+			}
+			d := b.Elapsed()
+			div := numDoFns
+			if div == 0 {
+				div = 1
+			}
+			div = div * b.N
+			b.ReportMetric(float64(d)/float64(div), "ns/elm")
+		}
+	}
+	for _, numDoFns := range []int{0, 1, 2, 3, 5, 10, 100} {
+		b.Run(fmt.Sprintf("dofns=%d", numDoFns), makeBench(numDoFns))
+	}
+}
diff --git a/sdks/go/pkg/beam/runners/prism/README.md b/sdks/go/pkg/beam/runners/prism/README.md
index 7ad9dc1d45796..0be9ca5617dcb 100644
--- a/sdks/go/pkg/beam/runners/prism/README.md
+++ b/sdks/go/pkg/beam/runners/prism/README.md
@@ -152,6 +152,8 @@ can have features selectively disabled to ensure
 * Progress tracking
 * Channel Splitting
 * Dynamic Splitting
+* FnAPI Optimizations
+  * Fusion

 ## Next feature short list (unordered)

@@ -165,7 +167,6 @@ See https://github.com/apache/beam/issues/24789 for current status.
* Support SDK Containers via Testcontainers * Cross Language Transforms * FnAPI Optimizations - * Fusion * Data with ProcessBundleRequest & Response This is not a comprehensive feature set, but a set of goals to best diff --git a/sdks/go/pkg/beam/runners/prism/internal/execute.go b/sdks/go/pkg/beam/runners/prism/internal/execute.go index c1ac6ea4488c2..5e07e161dd5c1 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/execute.go +++ b/sdks/go/pkg/beam/runners/prism/internal/execute.go @@ -104,7 +104,6 @@ func makeWorker(env string, j *jobservices.Job) (*worker.W, error) { type transformExecuter interface { ExecuteUrns() []string - ExecuteWith(t *pipepb.PTransform) string ExecuteTransform(stageID, tid string, t *pipepb.PTransform, comps *pipepb.Components, watermark mtime.Time, data [][]byte) *worker.B } @@ -166,11 +165,6 @@ func executePipeline(ctx context.Context, wks map[string]*worker.W, j *jobservic urn := t.GetSpec().GetUrn() stage.exe = proc.transformExecuters[urn] - // Stopgap until everythinng's moved to handlers. - stage.envID = t.GetEnvironmentId() - if stage.exe != nil { - stage.envID = stage.exe.ExecuteWith(t) - } stage.ID = fmt.Sprintf("stage-%03d", i) wk := wks[stage.envID] diff --git a/sdks/go/pkg/beam/runners/prism/internal/execute_test.go b/sdks/go/pkg/beam/runners/prism/internal/execute_test.go index 1a5ae7989a061..ce821bef89855 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/execute_test.go +++ b/sdks/go/pkg/beam/runners/prism/internal/execute_test.go @@ -37,7 +37,7 @@ import ( "github.com/apache/beam/sdks/v2/go/test/integration/primitives" ) -func initRunner(t *testing.T) { +func initRunner(t testing.TB) { t.Helper() if *jobopts.Endpoint == "" { s := jobservices.NewServer(0, internal.RunPipeline) @@ -64,7 +64,7 @@ func execute(ctx context.Context, p *beam.Pipeline) (beam.PipelineResult, error) return universal.Execute(ctx, p) } -func executeWithT(ctx context.Context, t *testing.T, p *beam.Pipeline) (beam.PipelineResult, error) { +func executeWithT(ctx context.Context, t testing.TB, p *beam.Pipeline) (beam.PipelineResult, error) { t.Log("startingTest - ", t.Name()) s1 := rand.NewSource(time.Now().UnixNano()) r1 := rand.New(s1) @@ -587,3 +587,104 @@ func init() { func TestMain(m *testing.M) { ptest.MainWithDefault(m, "testlocal") } + +func init() { + // Basic Registration + // beam.RegisterFunction(identity) + // beam.RegisterType(reflect.TypeOf((*source)(nil))) + // beam.RegisterType(reflect.TypeOf((*discard)(nil))) + + // Generic registration + register.Function2x0(identity) + register.DoFn2x0[[]byte, func(int)]((*source)(nil)) + register.DoFn1x0[int]((*discard)(nil)) + register.Emitter1[int]() +} + +type source struct { + Count int +} + +func (fn *source) ProcessElement(_ []byte, emit func(int)) { + for i := 0; i < fn.Count; i++ { + emit(i) + } +} + +func identity(v int, emit func(int)) { + emit(v) +} + +type discard struct { + processed int +} + +func (fn *discard) ProcessElement(int) { + fn.processed++ +} + +// BenchmarkPipe checks basic throughput and exec overhead with everything registered. 
+//
+// No fusion (all elements encoded) (generic registration):
+//
+// ~2000ns per call, 2000B per DoFn, across 22 allocs per DoFn
+// (using Basic registration adds 3 allocs per DoFn, ~200 bytes, and ~200-400ns/elm)
+//
+// goos: linux
+// goarch: amd64
+// pkg: github.com/apache/beam/sdks/v2/go/pkg/beam/runners/direct
+// cpu: 12th Gen Intel(R) Core(TM) i7-1260P
+// BenchmarkPipe/dofns=0-16       885811     1333 ns/op     1333 ns/elm     1993 B/op     22 allocs/op
+// BenchmarkPipe/dofns=1-16       457683     2636 ns/op     2636 ns/elm     3986 B/op     44 allocs/op
+// BenchmarkPipe/dofns=2-16       283699     3975 ns/op     1988 ns/elm     6138 B/op     66 allocs/op
+// BenchmarkPipe/dofns=3-16       212767     5689 ns/op     1896 ns/elm     8504 B/op     88 allocs/op
+// BenchmarkPipe/dofns=5-16       121842     8279 ns/op     1656 ns/elm    11994 B/op    132 allocs/op
+// BenchmarkPipe/dofns=10-16       22059    52877 ns/op     5288 ns/elm    30614 B/op    443 allocs/op
+// BenchmarkPipe/dofns=100-16       6614   166364 ns/op     1664 ns/elm   192961 B/op   2261 allocs/op
+//
+// With fusion (generic registration):
+// ~200ns per call, 150B per DoFn, across 2 allocs per DoFn
+// AKA comparable to the Direct Runner, as expected.
+//
+// goos: linux
+// goarch: amd64
+// pkg: github.com/apache/beam/sdks/v2/go/pkg/beam/runners/direct
+// cpu: 12th Gen Intel(R) Core(TM) i7-1260P
+// BenchmarkPipe/dofns=0-16      7660638    145.8 ns/op    145.8 ns/elm     152 B/op      2 allocs/op
+// BenchmarkPipe/dofns=1-16      3676358    313.3 ns/op    313.3 ns/elm     304 B/op      4 allocs/op
+// BenchmarkPipe/dofns=2-16      2242688    507.4 ns/op    253.7 ns/elm     457 B/op      6 allocs/op
+// BenchmarkPipe/dofns=3-16      1726969    662.6 ns/op    220.9 ns/elm     610 B/op      8 allocs/op
+// BenchmarkPipe/dofns=5-16      1198765     1005 ns/op    201.0 ns/elm     915 B/op     12 allocs/op
+// BenchmarkPipe/dofns=10-16      631459     1874 ns/op    187.4 ns/elm    1679 B/op     22 allocs/op
+// BenchmarkPipe/dofns=100-16      57926    19890 ns/op    198.9 ns/elm   15660 B/op    206 allocs/op
+func BenchmarkPipe(b *testing.B) {
+	initRunner(b)
+	makeBench := func(numDoFns int) func(b *testing.B) {
+		return func(b *testing.B) {
+			b.ReportAllocs()
+			disc := &discard{}
+			p, s := beam.NewPipelineWithRoot()
+			imp := beam.Impulse(s)
+			src := beam.ParDo(s, &source{Count: b.N}, imp)
+			iden := src
+			for i := 0; i < numDoFns; i++ {
+				iden = beam.ParDo(s, identity, iden)
+			}
+			beam.ParDo0(s, disc, iden)
+			_, err := execute(context.Background(), p)
+			if err != nil {
+				b.Fatal(err)
+			}
+			d := b.Elapsed()
+			div := numDoFns
+			if div == 0 {
+				div = 1
+			}
+			div = div * b.N
+			b.ReportMetric(float64(d)/float64(div), "ns/elm")
+		}
+	}
+	for _, numDoFns := range []int{0, 1, 2, 3, 5, 10, 100} {
+		b.Run(fmt.Sprintf("dofns=%d", numDoFns), makeBench(numDoFns))
+	}
+}
diff --git a/sdks/go/pkg/beam/runners/prism/internal/handlecombine.go b/sdks/go/pkg/beam/runners/prism/internal/handlecombine.go
index ff9bd1e1c88a1..3f31ad77fd53a 100644
--- a/sdks/go/pkg/beam/runners/prism/internal/handlecombine.go
+++ b/sdks/go/pkg/beam/runners/prism/internal/handlecombine.go
@@ -58,10 +58,16 @@ func (*combine) PrepareUrns() []string {
 }

-// PrepareTransform returns lifted combines and removes the leaves if enabled. Otherwise returns nothing.
-func (h *combine) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipepb.Components) (*pipepb.Components, []string) {
+// PrepareTransform returns lifted combines and removes the leaves if enabled. Otherwise returns the transform unchanged.
+func (h *combine) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipepb.Components) prepareResult {
 	// If we aren't lifting, the "default impl" for combines should be sufficient.
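+	// Returning a prepareResult whose SubbedComps holds only the original
+	// transform substitutes the transform for itself, leaving the graph
+	// unchanged.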
if !h.config.EnableLifting { - return nil, nil + return prepareResult{ + SubbedComps: &pipepb.Components{ + Transforms: map[string]*pipepb.PTransform{ + tid: t, + }, + }, + } } // To lift a combine, the spec should contain a CombinePayload. @@ -197,7 +203,7 @@ func (h *combine) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipe liftEID: tform(liftEID, urns.TransformPreCombine, pcolInID, liftedNID, t.GetEnvironmentId()), gbkEID: tform(gbkEID, urns.TransformGBK, liftedNID, groupedNID, ""), mergeEID: tform(mergeEID, urns.TransformMerge, groupedNID, mergedNID, t.GetEnvironmentId()), - extractEID: tform(mergeEID, urns.TransformExtract, mergedNID, pcolOutID, t.GetEnvironmentId()), + extractEID: tform(extractEID, urns.TransformExtract, mergedNID, pcolOutID, t.GetEnvironmentId()), }, } @@ -205,5 +211,8 @@ func (h *combine) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipe // TODO recurse through sub transforms to remove? // We don't need to remove the composite, since we don't add it in // when we return the new transforms, so it's not in the topology. - return newComps, t.GetSubtransforms() + return prepareResult{ + SubbedComps: newComps, + RemovedLeaves: t.GetSubtransforms(), + } } diff --git a/sdks/go/pkg/beam/runners/prism/internal/handlepardo.go b/sdks/go/pkg/beam/runners/prism/internal/handlepardo.go index 2ac5ca5bbf595..e9d422ca107d8 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/handlepardo.go +++ b/sdks/go/pkg/beam/runners/prism/internal/handlepardo.go @@ -58,7 +58,7 @@ func (*pardo) PrepareUrns() []string { // PrepareTransform handles special processing with respect to ParDos, since their handling is dependant on supported features // and requirements. -func (h *pardo) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipepb.Components) (*pipepb.Components, []string) { +func (h *pardo) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipepb.Components) prepareResult { // ParDos are a pain in the butt. // Combines, by comparison, are dramatically simpler. @@ -89,21 +89,22 @@ func (h *pardo) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipepb // so they're not included here. Any nearly any ParDo can have them. // At their simplest, we don't need to do anything special at pre-processing time, and simply pass through as normal. - return &pipepb.Components{ - Transforms: map[string]*pipepb.PTransform{ - tid: t, + return prepareResult{ + SubbedComps: &pipepb.Components{ + Transforms: map[string]*pipepb.PTransform{ + tid: t, + }, }, - }, nil + } } // Side inputs add to topology and make fusion harder to deal with // (side input producers can't be in the same stage as their consumers) - // But we don't have fusion yet, so no worries. // State, Timers, Stable Input, Time Sorted Input, and some parts of SDF - // Are easier to deal including a fusion break. But We can do that with a - // runner specific transform for stable input, and another for timesorted - // input. + // Are easier to deal with by including a fusion break. But we can do that with a + // runner specific transform for stable input, and another for time sorted input. + // TODO add // SplittableDoFns have 3 required phases and a 4th optional phase. 
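+	// (For reference, the required phases are pair-with-restriction,
+	// split-and-size-restrictions, and process-sized-elements-and-restrictions;
+	// truncate-sized-restriction is the optional fourth, used when draining.)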
// @@ -235,10 +236,16 @@ func (h *pardo) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipepb EnvironmentId: t.GetEnvironmentId(), }, } - - return &pipepb.Components{ - Coders: coders, - Pcollections: pcols, - Transforms: tforms, - }, t.GetSubtransforms() + return prepareResult{ + SubbedComps: &pipepb.Components{ + Coders: coders, + Pcollections: pcols, + Transforms: tforms, + }, + RemovedLeaves: t.GetSubtransforms(), + // Force ProcessSized to be a root to ensure SDFs are able to split + // between elements or within elements. + // Also this is where a transform would be stateful anyway. + ForcedRoots: []string{eProcessID}, + } } diff --git a/sdks/go/pkg/beam/runners/prism/internal/handlerunner.go b/sdks/go/pkg/beam/runners/prism/internal/handlerunner.go index 3f699e47e6752..59e926754821b 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/handlerunner.go +++ b/sdks/go/pkg/beam/runners/prism/internal/handlerunner.go @@ -67,13 +67,55 @@ func (*runner) ConfigCharacteristic() reflect.Type { var _ transformPreparer = (*runner)(nil) func (*runner) PrepareUrns() []string { - return []string{urns.TransformReshuffle} + return []string{urns.TransformReshuffle, urns.TransformFlatten} } // PrepareTransform handles special processing with respect runner transforms, like reshuffle. -func (h *runner) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipepb.Components) (*pipepb.Components, []string) { +func (h *runner) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipepb.Components) prepareResult { + switch t.GetSpec().GetUrn() { + case urns.TransformFlatten: + return h.handleFlatten(tid, t, comps) + case urns.TransformReshuffle: + return h.handleReshuffle(tid, t, comps) + default: + panic("unknown urn to Prepare: " + t.GetSpec().GetUrn()) + } +} + +func (h *runner) handleFlatten(tid string, t *pipepb.PTransform, comps *pipepb.Components) prepareResult { + if !h.config.SDKFlatten { + t.EnvironmentId = "" // force the flatten to be a runner transform due to configuration. + forcedRoots := []string{tid} // Have runner side transforms be roots. + + // Force runner flatten consumers to be roots. + // This resolves merges between two runner transforms trying + // to execute together. + outColID := getOnlyValue(t.GetOutputs()) + for ctid, t := range comps.GetTransforms() { + for _, gi := range t.GetInputs() { + if gi == outColID { + forcedRoots = append(forcedRoots, ctid) + } + } + } + + // Return the new components which is the transforms consumer + return prepareResult{ + // We sub this flatten with itself, to not drop it. + SubbedComps: &pipepb.Components{ + Transforms: map[string]*pipepb.PTransform{ + tid: t, + }, + }, + RemovedLeaves: nil, + ForcedRoots: forcedRoots, + } + } + return prepareResult{} +} + +func (h *runner) handleReshuffle(tid string, t *pipepb.PTransform, comps *pipepb.Components) prepareResult { // TODO: Implement the windowing strategy the "backup" transforms used for Reshuffle. - // TODO: Implement a fusion break for reshuffles. if h.config.SDKReshuffle { panic("SDK side reshuffle not yet supported") @@ -106,12 +148,15 @@ func (h *runner) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipep // We need to remove the consumers of the output PCollection. toRemove := []string{} + // We need to force the consumers to be stage root, + // because reshuffle should be a fusion break. 
+ forcedRoots := []string{} - for _, t := range comps.GetTransforms() { + for tid, t := range comps.GetTransforms() { for li, gi := range t.GetInputs() { if gi == outColID { - // The whole s t.GetInputs()[li] = inColID + forcedRoots = append(forcedRoots, tid) } } } @@ -120,7 +165,11 @@ func (h *runner) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipep toRemove = append(toRemove, t.GetSubtransforms()...) // Return the new components which is the transforms consumer - return nil, toRemove + return prepareResult{ + SubbedComps: nil, // Replace the reshuffle with nothing. + RemovedLeaves: toRemove, + ForcedRoots: forcedRoots, + } } var _ transformExecuter = (*runner)(nil) diff --git a/sdks/go/pkg/beam/runners/prism/internal/jobservices/metrics.go b/sdks/go/pkg/beam/runners/prism/internal/jobservices/metrics.go index 6db16191de93e..e0caec55881ee 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/jobservices/metrics.go +++ b/sdks/go/pkg/beam/runners/prism/internal/jobservices/metrics.go @@ -475,8 +475,8 @@ func (m *metricsStore) AddShortIDs(resp *fnpb.MonitoringInfosMetadataResponse) { m.shortIDsToKeys[short] = key } for d, payloads := range m.unprocessedPayloads { - m.contributeMetrics(durability(d), payloads) m.unprocessedPayloads[d] = nil + m.contributeMetrics(durability(d), payloads) } } diff --git a/sdks/go/pkg/beam/runners/prism/internal/jobservices/server.go b/sdks/go/pkg/beam/runners/prism/internal/jobservices/server.go index bf2db814813c3..647e9ad962830 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/jobservices/server.go +++ b/sdks/go/pkg/beam/runners/prism/internal/jobservices/server.go @@ -17,6 +17,7 @@ package jobservices import ( "fmt" + "math" "net" "sync" @@ -60,7 +61,9 @@ func NewServer(port int, execute func(*Job)) *Server { execute: execute, } slog.Info("Serving JobManagement", slog.String("endpoint", s.Endpoint())) - var opts []grpc.ServerOption + opts := []grpc.ServerOption{ + grpc.MaxRecvMsgSize(math.MaxInt32), + } s.server = grpc.NewServer(opts...) jobpb.RegisterJobServiceServer(s.server, s) jobpb.RegisterArtifactStagingServiceServer(s.server, s) diff --git a/sdks/go/pkg/beam/runners/prism/internal/preprocess.go b/sdks/go/pkg/beam/runners/prism/internal/preprocess.go index bca40709626d9..fb244cb4fbbbc 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/preprocess.go +++ b/sdks/go/pkg/beam/runners/prism/internal/preprocess.go @@ -23,6 +23,7 @@ import ( pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/prism/internal/urns" "golang.org/x/exp/maps" + "golang.org/x/exp/slices" "golang.org/x/exp/slog" "google.golang.org/protobuf/encoding/prototext" ) @@ -35,7 +36,13 @@ type transformPreparer interface { PrepareUrns() []string // PrepareTransform takes a PTransform proto and returns a set of new Components, and a list of // transformIDs leaves to remove and ignore from graph processing. - PrepareTransform(tid string, t *pipepb.PTransform, comps *pipepb.Components) (*pipepb.Components, []string) + PrepareTransform(tid string, t *pipepb.PTransform, comps *pipepb.Components) prepareResult +} + +type prepareResult struct { + SubbedComps *pipepb.Components + RemovedLeaves []string + ForcedRoots []string } // preprocessor retains configuration for preprocessing the @@ -73,6 +80,7 @@ func (p *preprocessor) preProcessGraph(comps *pipepb.Components) []*stage { // TODO move this out of this part of the pre-processor? 
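+	// Track the leaf transforms to execute, the composite-internal transforms
+	// to ignore, and the transforms that handlers force to be stage roots.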
leaves := map[string]struct{}{} ignore := map[string]struct{}{} + forcedRoots := map[string]bool{} for tid, t := range ts { if _, ok := ignore[tid]; ok { continue @@ -106,30 +114,33 @@ func (p *preprocessor) preProcessGraph(comps *pipepb.Components) []*stage { continue } - subs, toRemove := h.PrepareTransform(tid, t, comps) + prepResult := h.PrepareTransform(tid, t, comps) // Clear out unnecessary leaves from this composite for topological sort handling. - for _, key := range toRemove { + for _, key := range prepResult.RemovedLeaves { ignore[key] = struct{}{} delete(leaves, key) } + for _, key := range prepResult.ForcedRoots { + forcedRoots[key] = true + } // ts should be a clone, so we should be able to add new transforms into the map. - for tid, t := range subs.GetTransforms() { + for tid, t := range prepResult.SubbedComps.GetTransforms() { leaves[tid] = struct{}{} ts[tid] = t } - for cid, c := range subs.GetCoders() { + for cid, c := range prepResult.SubbedComps.GetCoders() { comps.GetCoders()[cid] = c } - for nid, n := range subs.GetPcollections() { + for nid, n := range prepResult.SubbedComps.GetPcollections() { comps.GetPcollections()[nid] = n } // It's unlikely for these to change, but better to handle them now, to save a headache later. - for wid, w := range subs.GetWindowingStrategies() { + for wid, w := range prepResult.SubbedComps.GetWindowingStrategies() { comps.GetWindowingStrategies()[wid] = w } - for envid, env := range subs.GetEnvironments() { + for envid, env := range prepResult.SubbedComps.GetEnvironments() { comps.GetEnvironments()[envid] = env } } @@ -141,167 +152,217 @@ func (p *preprocessor) preProcessGraph(comps *pipepb.Components) []*stage { topological := pipelinex.TopologicalSort(ts, keptLeaves) slog.Debug("topological transform ordering", slog.Any("topological", topological)) - // Basic Fusion Behavior - // - // Fusion is the practice of executing associated DoFns in the same stage. - // This often leads to more efficient processing, since costly encode/decode or - // serialize/deserialize operations can be elided. In Beam, any PCollection can - // in principle serve as a place for serializing and deserializing elements. - // - // In particular, Fusion is a stage for optimizing pipeline execution, and was - // described in the FlumeJava paper, in section 4. - // https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/35650.pdf - // - // Per the FlumeJava paper, there are two primary opportunities for Fusion, - // Producer+Consumer fusion and Sibling fusion. - // - // Producer+Consumer fusion is when the producer of a PCollection and the consumers of - // that PCollection are combined into a single stage. Sibling fusion is when two consumers - // of the same pcollection are fused into the same step. These processes can continue until - // graph structure or specific transforms dictate that fusion may not proceed futher. - // - // Examples of fusion breaks include GroupByKeys, or requiring side inputs to complete - // processing for downstream processing, since the producer and consumer of side inputs - // cannot be in the same fused stage. - // - // Additionally, at this phase, we can consider different optimizations for execution. - // For example "Flatten unzipping". In practice, there's no requirement for any stages - // to have an explicit "Flatten" present in the graph. A flatten can be "unzipped", - // duplicating the consumming transforms after the flatten, until a subsequent fusion break. 
-	// This enables additional parallelism by allowing sources to operate in their own independant
-	// stages. Beam supports this naturally with the separation of work into independant
-	// bundles for execution.
-
-	return defaultFusion(topological, comps)
-}
+	facts := computeFacts(topological, comps)
+	facts.forcedRoots = forcedRoots
-// defaultFusion is the base strategy for prism, that doesn't seek to optimize execution
-// with fused stages. Input is the set of leaf nodes we're going to execute, topologically
-// sorted, and the pipeline components.
-//
-// Default fusion behavior: Don't. Prism is intended to test all of Beam, which often
-// means for testing purposes, to execute pipelines without optimization.
-//
-// Special Exception to unfused Go SDK pipelines.
-//
-// If a transform, after a GBK step, has a single input with a KV<k, iter<v>> coder
-// and a single output O with a KV<k, iter<w>> coder, and if then it must be fused with
-// the consumers of O.
-func defaultFusion(topological []string, comps *pipepb.Components) []*stage {
-	var stages []*stage
-
-	// TODO figure out a better place to source the PCol Parents/Consumers analysis
-	// so we don't keep repeating it.
-
-	pcolParents, pcolConsumers := computPColFacts(topological, comps)
-
-	// Explicitly list the pcollectionID we want to fuse along.
-	fuseWithConsumers := map[string]string{}
-	for _, tid := range topological {
-		t := comps.GetTransforms()[tid]
-
-		// See if this transform has a single input and output
-		if len(t.GetInputs()) != 1 || len(t.GetOutputs()) != 1 {
-			continue
-		}
-		inputID := getOnlyValue(t.GetInputs())
-		outputID := getOnlyValue(t.GetOutputs())
-
-		parentLink := pcolParents[inputID]
-
-		parent := comps.GetTransforms()[parentLink.transform]
-
-		// Check if the input source is a GBK
-		if parent.GetSpec().GetUrn() != urns.TransformGBK {
-			continue
-		}
-
-		// Check if the coder is a KV<k, iter<v>>
-		iCID := comps.GetPcollections()[inputID].GetCoderId()
-		oCID := comps.GetPcollections()[outputID].GetCoderId()
-
-		if checkForExpandCoderPattern(iCID, oCID, comps) {
-			fuseWithConsumers[tid] = outputID
-		}
-	}
-
-	// Since we iterate in topological order, we're guaranteed to process producers before consumers.
-	consumed := map[string]bool{} // Checks if we've already handled a transform already due to fusion.
-	for _, tid := range topological {
-		if consumed[tid] {
-			continue
-		}
-		stg := &stage{
-			transforms: []string{tid},
-		}
-		// TODO validate that fused stages have the same environment.
-		stg.envID = comps.GetTransforms()[tid].EnvironmentId
-
-		stages = append(stages, stg)
-
-		pcolID, ok := fuseWithConsumers[tid]
-		if !ok {
-			continue
-		}
-		cs := pcolConsumers[pcolID]
-
-		for _, c := range cs {
-			stg.transforms = append(stg.transforms, c.transform)
-			consumed[c.transform] = true
-		}
-	}
+	return greedyFusion(topological, comps, facts)
+}
-	for _, stg := range stages {
-		prepareStage(stg, comps, pcolConsumers)
-	}
-	return stages
+// TODO(lostluck): Be able to toggle this in variants.
+// Most likely, re-implement in terms of simply marking all transforms as forced roots.
+// Commented out to avoid the unused staticcheck lint, but it's worth keeping until the docs
+// and implementation are re-added.
+
+// // defaultFusion is the base strategy for prism, that doesn't seek to optimize execution
+// // with fused stages. Input is the set of leaf nodes we're going to execute, topologically
+// // sorted, and the pipeline components.
+// //
+// // Default fusion behavior: Don't. Prism is intended to test all of Beam, which often
+// // means for testing purposes, to execute pipelines without optimization.
+// //
+// // Special Exception to unfused Go SDK pipelines.
+// //
+// // If a transform, after a GBK step, has a single input with a KV<k, iter<v>> coder
+// // and a single output O with a KV<k, iter<w>> coder, then it must be fused with
+// // the consumers of O.
+// func defaultFusion(topological []string, comps *pipepb.Components, facts fusionFacts) []*stage {
+// 	// Basic Fusion Behavior
+// 	//
+// 	// Fusion is the practice of executing associated DoFns in the same stage.
+// 	// This often leads to more efficient processing, since costly encode/decode or
+// 	// serialize/deserialize operations can be elided. In Beam, any PCollection can
+// 	// in principle serve as a place for serializing and deserializing elements.
+// 	//
+// 	// In particular, Fusion is a strategy for optimizing pipeline execution, and was
+// 	// described in the FlumeJava paper, in section 4.
+// 	// https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/35650.pdf
+// 	//
+// 	// Per the FlumeJava paper, there are two primary opportunities for Fusion,
+// 	// Producer+Consumer fusion and Sibling fusion.
+// 	//
+// 	// Producer+Consumer fusion is when the producer of a PCollection and the consumers of
+// 	// that PCollection are combined into a single stage. Sibling fusion is when two consumers
+// 	// of the same pcollection are fused into the same step. These processes can continue until
+// 	// graph structure or specific transforms dictate that fusion may not proceed further.
+// 	//
+// 	// Examples of fusion breaks include GroupByKeys, or requiring side inputs to complete
+// 	// processing for downstream processing, since the producer and consumer of side inputs
+// 	// cannot be in the same fused stage.
+// 	//
+// 	// Additionally, at this phase, we can consider different optimizations for execution.
+// 	// For example "Flatten unzipping". In practice, there's no requirement for any stages
+// 	// to have an explicit "Flatten" present in the graph. A flatten can be "unzipped",
+// 	// duplicating the consuming transforms after the flatten, until a subsequent fusion break.
+// 	// This enables additional parallelism by allowing sources to operate in their own independent
+// 	// stages. Beam supports this naturally with the separation of work into independent
+// 	// bundles for execution.
+
+// 	// Explicitly list the pcollectionID we want to fuse along.
+// 	fuseWithConsumers := map[string]string{}
+// 	for _, tid := range topological {
+// 		t := comps.GetTransforms()[tid]
+
+// 		// See if this transform has a single input and output
+// 		if len(t.GetInputs()) != 1 || len(t.GetOutputs()) != 1 {
+// 			continue
+// 		}
+// 		inputID := getOnlyValue(t.GetInputs())
+// 		outputID := getOnlyValue(t.GetOutputs())
+
+// 		producerLink := facts.pcolProducers[inputID]
+
+// 		producer := comps.GetTransforms()[producerLink.transform]
+
+// 		// Check if the input source is a GBK
+// 		if producer.GetSpec().GetUrn() != urns.TransformGBK {
+// 			continue
+// 		}
+
+// 		// Check if the coder is a KV<k, iter<v>>
+// 		iCID := comps.GetPcollections()[inputID].GetCoderId()
+// 		oCID := comps.GetPcollections()[outputID].GetCoderId()
+
+// 		if checkForExpandCoderPattern(iCID, oCID, comps) {
+// 			fuseWithConsumers[tid] = outputID
+// 		}
+// 	}
+
+// 	var stages []*stage
+// 	// Since we iterate in topological order, we're guaranteed to process producers before consumers.
+// consumed := map[string]bool{} // Checks if we've already handled a transform already due to fusion. +// for _, tid := range topological { +// if consumed[tid] { +// continue +// } +// stg := &stage{ +// transforms: []string{tid}, +// } +// // TODO validate that fused stages have the same environment. +// stg.envID = comps.GetTransforms()[tid].EnvironmentId + +// stages = append(stages, stg) + +// pcolID, ok := fuseWithConsumers[tid] +// if !ok { +// continue +// } +// cs := facts.pcolConsumers[pcolID] + +// for _, c := range cs { +// stg.transforms = append(stg.transforms, c.transform) +// consumed[c.transform] = true +// } +// } + +// for _, stg := range stages { +// prepareStage(stg, comps, facts) +// } +// return stages +// } + +// // We need to see that both coders have this pattern: KV> +// func checkForExpandCoderPattern(in, out string, comps *pipepb.Components) bool { +// isKV := func(id string) bool { +// return comps.GetCoders()[id].GetSpec().GetUrn() == urns.CoderKV +// } +// getComp := func(id string, i int) string { +// return comps.GetCoders()[id].GetComponentCoderIds()[i] +// } +// isIter := func(id string) bool { +// return comps.GetCoders()[id].GetSpec().GetUrn() == urns.CoderIterable +// } +// if !isKV(in) || !isKV(out) { +// return false +// } +// // Are the keys identical? +// if getComp(in, 0) != getComp(out, 0) { +// return false +// } +// // Are both values iterables? +// if isIter(getComp(in, 1)) && isIter(getComp(out, 1)) { +// // If so we have the ExpandCoderPattern from the Go SDK. Hurray! +// return true +// } +// return false +// } + +type fusionFacts struct { + pcolProducers map[string]link // global pcol ID to transform link that produces it. + pcolConsumers map[string][]link // global pcol ID to all consumers of that pcollection + usedAsSideInput map[string]bool // global pcol ID and if it's used as a side input + + directSideInputs map[string]map[string]bool // global transform ID and all direct side input pcollections. + downstreamSideInputs map[string]map[string]bool // global transform ID and all transitive side input pcollections. + + forcedRoots map[string]bool // transforms forced to be roots (not computed in computeFacts) } -// computPColFacts computes a map of PCollectionIDs to their parent transforms, and a map of -// PCollectionIDs to their consuming transforms. -func computPColFacts(topological []string, comps *pipepb.Components) (map[string]link, map[string][]link) { - pcolParents := map[string]link{} - pcolConsumers := map[string][]link{} +// computeFacts computes facts about the given set of transforms and components that +// are useful for fusion. +func computeFacts(topological []string, comps *pipepb.Components) fusionFacts { + ret := fusionFacts{ + pcolProducers: map[string]link{}, + pcolConsumers: map[string][]link{}, + usedAsSideInput: map[string]bool{}, + directSideInputs: map[string]map[string]bool{}, // direct set + downstreamSideInputs: map[string]map[string]bool{}, // transitive set + } // Use the topological ids so each PCollection only has a single - // parent. We've already pruned out composites at this stage. + // producer. We've already pruned out composites at this stage. 
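+	// Illustrative sketch (hypothetical IDs, not from a real pipeline): given
+	//   "tA" -> "pcolA" -> "tB", with "pcolA" also read as a side input by "tC",
+	// the loop below records:
+	//   ret.pcolProducers["pcolA"] = link{transform: "tA", ...}
+	//   ret.pcolConsumers["pcolA"] = [{transform: "tB"}, {transform: "tC"}]
+	//   ret.usedAsSideInput["pcolA"] = true, ret.directSideInputs["tC"]["pcolA"] = true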
 	for _, tID := range topological {
 		t := comps.GetTransforms()[tID]
 		for local, global := range t.GetOutputs() {
-			pcolParents[global] = link{transform: tID, local: local, global: global}
+			ret.pcolProducers[global] = link{transform: tID, local: local, global: global}
+		}
+		sis, err := getSideInputs(t)
+		if err != nil {
+			panic(err)
 		}
+		directSIs := map[string]bool{}
+		ret.directSideInputs[tID] = directSIs
 		for local, global := range t.GetInputs() {
-			pcolConsumers[global] = append(pcolConsumers[global], link{transform: tID, local: local, global: global})
+			ret.pcolConsumers[global] = append(ret.pcolConsumers[global], link{transform: tID, local: local, global: global})
+			if _, ok := sis[local]; ok {
+				ret.usedAsSideInput[global] = true
+				directSIs[global] = true
+			}
 		}
 	}
-	return pcolParents, pcolConsumers
+	for _, tID := range topological {
+		computeDownstreamSideInputs(tID, comps, ret)
+	}
+
+	return ret
 }

-// We need to see that both coders have this pattern: KV<K, Iterable<V>>
-func checkForExpandCoderPattern(in, out string, comps *pipepb.Components) bool {
-	isKV := func(id string) bool {
-		return comps.GetCoders()[id].GetSpec().GetUrn() == urns.CoderKV
-	}
-	getComp := func(id string, i int) string {
-		return comps.GetCoders()[id].GetComponentCoderIds()[i]
-	}
-	isIter := func(id string) bool {
-		return comps.GetCoders()[id].GetSpec().GetUrn() == urns.CoderIterable
-	}
-	if !isKV(in) || !isKV(out) {
-		return false
-	}
-	// Are the keys identical?
-	if getComp(in, 0) != getComp(out, 0) {
-		return false
+func computeDownstreamSideInputs(tID string, comps *pipepb.Components, facts fusionFacts) map[string]bool {
+	if dssi, ok := facts.downstreamSideInputs[tID]; ok {
+		return dssi
 	}
-	// Are both values iterables?
-	if isIter(getComp(in, 1)) && isIter(getComp(out, 1)) {
-		// If so we have the ExpandCoderPattern from the Go SDK. Hurray!
-		return true
+	dssi := map[string]bool{}
+	for _, o := range comps.GetTransforms()[tID].GetOutputs() {
+		if facts.usedAsSideInput[o] {
+			dssi[o] = true
+		}
+		for _, consumer := range facts.pcolConsumers[o] {
+			// Recurse on the consuming transform's ID.
+			cdssi := computeDownstreamSideInputs(consumer.transform, comps, facts)
+			maps.Copy(dssi, cdssi)
+		}
 	}
-	return false
+	facts.downstreamSideInputs[tID] = dssi
+	return dssi
 }

 // prepareStage does the final pre-processing step for stages:
@@ -309,7 +370,7 @@ func checkForExpandCoderPattern(in, out string, comps *pipepb.Components) bool {
 // 1. Determining the single parallel input (may be 0 for impulse stages).
 // 2. Determining all outputs to the stages.
 // 3. Determining all side inputs.
-// 4 validating that no side input is fed by an internal PCollection.
+// 4. Validating that no side input is fed by an internal PCollection.
 // 4. Check that all transforms are in the same environment or are environment agnostic. (TODO for xlang)
 // 5. Validate that only the primary input consuming transform are stateful. (Might be able to relax this)
 //
@@ -320,22 +381,22 @@ func checkForExpandCoderPattern(in, out string, comps *pipepb.Components) bool {
 // Finally, it takes this information and caches it in the stage for simpler descriptor construction downstream.
 //
 // Note, this is very similar to the work done WRT composites in pipelinex.Normalize.
-func prepareStage(stg *stage, comps *pipepb.Components, pipelineConsumers map[string][]link) {
+func prepareStage(stg *stage, comps *pipepb.Components, pipelineFacts fusionFacts) {
 	// Collect all PCollections involved in this stage.
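+	// Note: these facts are computed over only this stage's transforms, not the
+	// whole pipeline. Any PCollection consumed here but produced by no transform
+	// in the stage must therefore cross the stage boundary, which is what the
+	// code below uses to classify it as the main input or a side input.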
-	pcolParents, pcolConsumers := computPColFacts(stg.transforms, comps)
+	stageFacts := computeFacts(stg.transforms, comps)

 	transformSet := map[string]bool{}
 	for _, tid := range stg.transforms {
 		transformSet[tid] = true
 	}

-	// Now we can see which consumers (inputs) aren't covered by the parents (outputs).
+	// Now we can see which consumers (inputs) aren't covered by the producers (outputs).
 	mainInputs := map[string]string{}
 	var sideInputs []link
 	inputs := map[string]bool{}
-	for pid, plinks := range pcolConsumers {
+	for pid, plinks := range stageFacts.pcolConsumers {
 		// Check if this PCollection is generated in this bundle.
-		if _, ok := pcolParents[pid]; ok {
+		if _, ok := stageFacts.pcolProducers[pid]; ok {
 			// It is, so we will ignore for now.
 			continue
 		}
@@ -354,10 +415,10 @@ func prepareStage(stg *stage, comps *pipepb.Components, pipelineConsumers map[st
 	outputs := map[string]link{}
 	var internal []string
 	// Look at all PCollections produced in this stage.
-	for pid, link := range pcolParents {
+	for pid, link := range stageFacts.pcolProducers {
 		// Look at all consumers of this PCollection in the pipeline
 		isInternal := true
-		for _, l := range pipelineConsumers[pid] {
+		for _, l := range pipelineFacts.pcolConsumers[pid] {
 			// If the consuming transform isn't in the stage, it's an output.
 			if !transformSet[l.transform] {
 				isInternal = false
@@ -384,10 +445,149 @@
 	if l := len(mainInputs); l == 1 {
 		stg.primaryInput = getOnlyValue(mainInputs)
 	} else if l > 1 {
-		// Quick check that this is a lone flatten node, which is handled runner side anyway
-		// and only sent SDK side as part of a fused stage.
-		if !(len(stg.transforms) == 1 && comps.GetTransforms()[stg.transforms[0]].GetSpec().GetUrn() == urns.TransformFlatten) {
-			panic("expected flatten node, but wasn't")
+		// Quick check that this is led by a flatten node, and that it's handled runner side.
+		t := comps.GetTransforms()[stg.transforms[0]]
+		if !(t.GetSpec().GetUrn() == urns.TransformFlatten && t.GetEnvironmentId() == "") {
+			panic("expected runner flatten node, but wasn't")
+		}
+	}
+}
+
+// greedyFusion produces a pipeline as tightly fused as possible.
+//
+// Fusion is a critical optimization for performance of pipeline execution.
+// Thus it's important for SDKs to be capable of executing transforms in a fused state.
+//
+// However, not all transforms can be fused into the same stage together.
+// Further, some transforms must be at the root of a stage.
+//
+// # Fusion Restrictions
+//
+// Environments: Transforms that aren't in the same environment can't be
+// fused together *unless* their environments can also be fused together.
+// E.g. Resource hints can often be ignored for local runners.
+//
+// Side Inputs: A transform S consuming a PCollection as a side input can't
+// be fused with the transform P that produces that PCollection. Further,
+// no transform S+ descended from S can be fused with transform P.
+//
+// Splittable DoFns: An expanded Splittable DoFn transform's Process Sized
+// Elements and Restrictions component must be the root of a stage.
+//
+// State and Timers: Stateful Transforms (transforms using State and Timers)
+// must be the root of a stage, since they are required to be keyed.
+// A sequence of Key Preserving stateful transforms could be fused.
+//
+// TODO: Sink/Unzip Flattens so they vanish from the graph.
+//
+// This approach is largely cribbed from the Python approach at
+// fn_api_runner/translations.py. That implementation is very set oriented &
+// eagerly adds data source/sink transforms, while prism does so later in
+// stage construction.
+func greedyFusion(topological []string, comps *pipepb.Components, facts fusionFacts) []*stage {
+	fused := map[int]int{}
+	stageAssignments := map[string]int{}
+
+	stageEnvs := map[int]string{}
+	forcedRoots := map[int]bool{}
+	directSIs := map[int]map[string]bool{}
+	downstreamSIs := map[int]map[string]bool{}
+
+	var index int
+	replacements := func(tID string) int {
+		sID, ok := stageAssignments[tID]
+		if !ok { // No stage exists yet.
+			sID = index
+			index++
+
+			t := comps.GetTransforms()[tID]
+			stageAssignments[tID] = sID
+			stageEnvs[sID] = t.GetEnvironmentId()
+			forcedRoots[sID] = facts.forcedRoots[tID]
+			directSIs[sID] = maps.Clone(facts.directSideInputs[tID])
+			downstreamSIs[sID] = maps.Clone(facts.downstreamSideInputs[tID])
+		}
+
+		var oldIDs []int
+		rep, ok := fused[sID]
+		for ok {
+			oldIDs = append(oldIDs, sID)
+			sID = rep
+			rep, ok = fused[sID]
+		}
+		// Update the assignment & fusions for path shortening.
+		stageAssignments[tID] = sID
+		for _, old := range oldIDs {
+			fused[old] = sID
+		}
+		return sID
+	}
+
+	overlap := func(downstream, consumer map[string]bool) bool {
+		for si := range consumer {
+			if downstream[si] {
+				return true
+			}
+		}
+		return false
+	}
+
+	// To start, every transform is in its own stage.
+	// So we map a transformID to a stageID.
+	// We go through each PCollection (facts.pcolProducers) and
+	// try to fuse the producer to each consumer of that PCollection.
+	//
+	// If we can fuse, the consumer takes on the producer's stageID,
+	// and the assignments are updated.
+
+	// Use the topological sort instead?
+
+	keys := maps.Keys(facts.pcolProducers)
+	slices.Sort(keys)
+	for _, pcol := range keys {
+		producer := facts.pcolProducers[pcol]
+		for _, consumer := range facts.pcolConsumers[pcol] {
+			pID := replacements(producer.transform) // Get current stage for producer
+			cID := replacements(consumer.transform) // Get current stage for consumer
+
+			// See if there's anything preventing fusion:
+			if pID == cID {
+				continue // Already fused together.
+			}
+			if stageEnvs[pID] != stageEnvs[cID] {
+				continue // Not the same environment.
+			}
+			if forcedRoots[cID] {
+				continue // Forced root.
+			}
+			if overlap(downstreamSIs[pID], directSIs[cID]) {
+				continue // Side input conflict.
+			}
+
+			// In principle, we can fuse!
+			fused[cID] = pID // Set the consumer to be in the producer's stage.
+			// Copy the consumer's direct and downstream side input sets into the producer.
+ maps.Copy(directSIs[pID], directSIs[cID]) + maps.Copy(downstreamSIs[pID], downstreamSIs[cID]) + } + } + + var stages []*stage + fusedToStages := map[int]*stage{} + for _, tID := range topological { + sID := replacements(tID) + s := fusedToStages[sID] + if s == nil { + s = &stage{ + envID: stageEnvs[sID], + } + fusedToStages[sID] = s + stages = append(stages, s) + } + s.transforms = append(s.transforms, tID) } + for _, stg := range stages { + prepareStage(stg, comps, facts) + } + return stages } diff --git a/sdks/go/pkg/beam/runners/prism/internal/preprocess_test.go b/sdks/go/pkg/beam/runners/prism/internal/preprocess_test.go index ba39d024e7160..8d0d6accdf6dc 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/preprocess_test.go +++ b/sdks/go/pkg/beam/runners/prism/internal/preprocess_test.go @@ -31,6 +31,7 @@ func Test_preprocessor_preProcessGraph(t *testing.T) { wantComponents *pipepb.Components wantStages []*stage + forcedRoots []string }{ { name: "noPreparer", @@ -57,7 +58,8 @@ func Test_preprocessor_preProcessGraph(t *testing.T) { }, }, }, { - name: "preparer", + name: "preparer", + forcedRoots: []string{"e1_early", "e1_late"}, input: &pipepb.Components{ Transforms: map[string]*pipepb.PTransform{ "e1": { @@ -125,7 +127,9 @@ func Test_preprocessor_preProcessGraph(t *testing.T) { } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - pre := newPreprocessor([]transformPreparer{&testPreparer{}}) + pre := newPreprocessor([]transformPreparer{&testPreparer{ + ForcedRoots: test.forcedRoots, + }}) gotStages := pre.preProcessGraph(test.input) if diff := cmp.Diff(test.wantStages, gotStages, cmp.AllowUnexported(stage{}, link{}), cmpopts.EquateEmpty()); diff != "" { @@ -139,47 +143,53 @@ func Test_preprocessor_preProcessGraph(t *testing.T) { } } -type testPreparer struct{} +type testPreparer struct { + ForcedRoots []string +} func (p *testPreparer) PrepareUrns() []string { return []string{"test_urn"} } -func (p *testPreparer) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipepb.Components) (*pipepb.Components, []string) { - return &pipepb.Components{ - Transforms: map[string]*pipepb.PTransform{ - "e1_early": { - UniqueName: "e1_early", - Spec: &pipepb.FunctionSpec{ - Urn: "defaultUrn", +func (p *testPreparer) PrepareTransform(tid string, t *pipepb.PTransform, comps *pipepb.Components) prepareResult { + return prepareResult{ + ForcedRoots: p.ForcedRoots, + SubbedComps: &pipepb.Components{ + Transforms: map[string]*pipepb.PTransform{ + "e1_early": { + UniqueName: "e1_early", + Spec: &pipepb.FunctionSpec{ + Urn: "defaultUrn", + }, + Outputs: map[string]string{"i0": "pcol1"}, + EnvironmentId: "env1", + }, + "e1_late": { + UniqueName: "e1_late", + Spec: &pipepb.FunctionSpec{ + Urn: "defaultUrn", + }, + Inputs: map[string]string{"i0": "pcol1"}, + EnvironmentId: "env1", }, - Outputs: map[string]string{"i0": "pcol1"}, - EnvironmentId: "env1", }, - "e1_late": { - UniqueName: "e1_late", - Spec: &pipepb.FunctionSpec{ - Urn: "defaultUrn", + Pcollections: map[string]*pipepb.PCollection{ + "pcol1": { + UniqueName: "pcol1", + CoderId: "coder1", + WindowingStrategyId: "ws1", }, - Inputs: map[string]string{"i0": "pcol1"}, - EnvironmentId: "env1", }, - }, - Pcollections: map[string]*pipepb.PCollection{ - "pcol1": { - UniqueName: "pcol1", - CoderId: "coder1", - WindowingStrategyId: "ws1", + Coders: map[string]*pipepb.Coder{ + "coder1": {Spec: &pipepb.FunctionSpec{Urn: "coder1"}}, + }, + WindowingStrategies: map[string]*pipepb.WindowingStrategy{ + "ws1": {WindowCoderId: "global"}, + }, 
+				Environments: map[string]*pipepb.Environment{
+					"env1": {Urn: "env1"},
+				},
+			},
-		Coders: map[string]*pipepb.Coder{
-			"coder1": {Spec: &pipepb.FunctionSpec{Urn: "coder1"}},
-		},
-		WindowingStrategies: map[string]*pipepb.WindowingStrategy{
-			"ws1": {WindowCoderId: "global"},
-		},
-		Environments: map[string]*pipepb.Environment{
-			"env1": {Urn: "env1"},
-		},
-	}, []string{"e1"}
+		RemovedLeaves: []string{"e1"},
+	}
 }
diff --git a/sdks/go/pkg/beam/runners/prism/internal/separate_test.go b/sdks/go/pkg/beam/runners/prism/internal/separate_test.go
index 97ae494e4abb7..1be3d3e708417 100644
--- a/sdks/go/pkg/beam/runners/prism/internal/separate_test.go
+++ b/sdks/go/pkg/beam/runners/prism/internal/separate_test.go
@@ -141,6 +141,7 @@ func TestSeparation(t *testing.T) {
 			count := 10
 			imp := beam.Impulse(s)
 			ints := beam.ParDo(s, emitTenFn, imp)
+			ints = beam.Reshuffle(s, ints)
 			out := beam.ParDo(s, &sepHarness{
 				Base: sepHarnessBase{
 					WatcherID: ws.newWatcher(3),
@@ -379,7 +380,7 @@ func (fn *sepHarnessBase) setup() error {
 	sepWaitMap[fn.WatcherID] = c
 	go func(id int, c chan struct{}) {
 		for {
-			time.Sleep(time.Second * 1) // Check counts every second.
+			time.Sleep(time.Millisecond * 50) // Check counts frequently.
 			sepClientMu.Lock()
 			var unblock bool
 			err := sepClient.Call("Watchers.Check", &Args{WatcherID: id}, &unblock)
diff --git a/sdks/go/pkg/beam/runners/prism/internal/stage.go b/sdks/go/pkg/beam/runners/prism/internal/stage.go
index 4ce3ce7ffeb6e..4925405bb4eff 100644
--- a/sdks/go/pkg/beam/runners/prism/internal/stage.go
+++ b/sdks/go/pkg/beam/runners/prism/internal/stage.go
@@ -128,12 +128,32 @@ func (s *stage) Execute(ctx context.Context, j *jobservices.Job, wk *worker.W, d
 	previousIndex := int64(-2)
 	var splitsDone bool
 	progTick := time.NewTicker(100 * time.Millisecond)
+	defer progTick.Stop()
+	var dataFinished, bundleFinished bool
+	// If we have no data outputs, we still need to have progress & splits
+	// while waiting for bundle completion.
+	if b.OutputCount == 0 {
+		dataFinished = true
+	}
+	var resp *fnpb.ProcessBundleResponse
 progress:
 	for {
 		select {
+		case <-ctx.Done():
+			return context.Cause(ctx)
+		case resp = <-b.Resp:
+			bundleFinished = true
+			if b.BundleErr != nil {
+				return b.BundleErr
+			}
+			if dataFinished && bundleFinished {
+				break progress // exit progress loop on close.
+			}
 		case <-dataReady:
-			progTick.Stop()
-			break progress // exit progress loop on close.
+			dataFinished = true
+			if dataFinished && bundleFinished {
+				break progress // exit progress loop on close.
+			}
 		case <-progTick.C:
 			resp, err := b.Progress(ctx, wk)
 			if err != nil {
@@ -145,9 +165,10 @@ progress:
 				md := wk.MonitoringMetadata(ctx, unknownIDs)
 				j.AddMetricShortIDs(md)
 			}
-			slog.Debug("progress report", "bundle", rb, "index", index)
+			slog.Debug("progress report", "bundle", rb, "index", index, "prevIndex", previousIndex)
 			// Progress for the bundle hasn't advanced. Try splitting.
 			if previousIndex == index && !splitsDone {
+				slog.Debug("splitting report", "bundle", rb, "index", index)
 				sr, err := b.Split(ctx, wk, 0.5 /* fraction of remainder */, nil /* allowed splits */)
 				if err != nil {
 					slog.Warn("SDK Error from split, aborting splits", "bundle", rb, "error", err.Error())
@@ -187,16 +208,6 @@ progress:
 	// Tentative Data is ready, commit it to the main datastore.
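+	// Reaching this point requires both completion signals above: the data
+	// channel closed (dataFinished) and the ProcessBundleResponse arrived
+	// (bundleFinished), so output data and final metrics are both available.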
slog.Debug("Execute: commiting data", "bundle", rb, slog.Any("outputsWithData", maps.Keys(b.OutputData.Raw)), slog.Any("outputs", maps.Keys(s.OutputsToCoders))) - var resp *fnpb.ProcessBundleResponse - select { - case resp = <-b.Resp: - if b.BundleErr != nil { - return b.BundleErr - } - case <-ctx.Done(): - return context.Cause(ctx) - } - // Tally metrics immeadiately so they're available before // pipeline termination. unknownIDs := j.ContributeFinalMetrics(resp) @@ -279,6 +290,9 @@ func buildDescriptor(stg *stage, comps *pipepb.Components, wk *worker.W, ds *wor for _, tid := range stg.transforms { transforms[tid] = comps.GetTransforms()[tid] } + if len(transforms) == 0 { + return fmt.Errorf("buildDescriptor: invalid stage - no transforms at all %v", stg.ID) + } // Start with outputs, since they're simple and uniform. sink2Col := map[string]string{} @@ -287,7 +301,7 @@ func buildDescriptor(stg *stage, comps *pipepb.Components, wk *worker.W, ds *wor col := comps.GetPcollections()[o.global] wOutCid, err := makeWindowedValueCoder(o.global, comps, coders) if err != nil { - return fmt.Errorf("buildDescriptor: failed to handle coder on stage %v for output %+v, pcol %q %v:\n%w", stg.ID, o, o.global, prototext.Format(col), err) + return fmt.Errorf("buildDescriptor: failed to handle coder on stage %v for output %+v, pcol %q %v:\n%w %v", stg.ID, o, o.global, prototext.Format(col), err, stg.transforms) } sinkID := o.transform + "_" + o.local ed := collectionPullDecoder(col.GetCoderId(), coders, comps) @@ -343,7 +357,7 @@ func buildDescriptor(stg *stage, comps *pipepb.Components, wk *worker.W, ds *wor col := comps.GetPcollections()[stg.primaryInput] wInCid, err := makeWindowedValueCoder(stg.primaryInput, comps, coders) if err != nil { - return fmt.Errorf("buildDescriptor: failed to handle coder on stage %v for primary input, pcol %q %v:\n%w", stg.ID, stg.primaryInput, prototext.Format(col), err) + return fmt.Errorf("buildDescriptor: failed to handle coder on stage %v for primary input, pcol %q %v:\n%w\n%v", stg.ID, stg.primaryInput, prototext.Format(col), err, stg.transforms) } ed := collectionPullDecoder(col.GetCoderId(), coders, comps) diff --git a/sdks/go/pkg/beam/runners/prism/internal/unimplemented_test.go b/sdks/go/pkg/beam/runners/prism/internal/unimplemented_test.go index 5f8d387599982..b8a04a7306b2a 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/unimplemented_test.go +++ b/sdks/go/pkg/beam/runners/prism/internal/unimplemented_test.go @@ -108,7 +108,6 @@ func TestImplemented(t *testing.T) { {pipeline: primitives.Flatten}, {pipeline: primitives.FlattenDup}, {pipeline: primitives.Checkpoints}, - {pipeline: primitives.CoGBK}, {pipeline: primitives.ReshuffleKV}, } diff --git a/sdks/go/pkg/beam/runners/prism/internal/web/web.go b/sdks/go/pkg/beam/runners/prism/internal/web/web.go index 7bfbe19a910b7..765f0b50c836d 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/web/web.go +++ b/sdks/go/pkg/beam/runners/prism/internal/web/web.go @@ -188,6 +188,10 @@ func (h *jobDetailsHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { trs := pipeResp.GetPipeline().GetComponents().GetTransforms() col2T, topo := preprocessTransforms(trs) + counters := toTransformMap(results.AllMetrics().Counters()) + distributions := toTransformMap(results.AllMetrics().Distributions()) + msecs := toTransformMap(results.AllMetrics().Msecs()) + data.Transforms = make([]pTransform, 0, len(trs)) for _, id := range topo { pt := trs[id] @@ -224,6 +228,38 @@ func (h *jobDetailsHandler) ServeHTTP(w http.ResponseWriter, r 
*http.Request) {
 			strMets = append(strMets, outMets...)
 		}

+		var msecMets []string
+		// TODO: Figure out where uniquename or id is being used in prism. It should be all global transform IDs to facilitate lookups.
+		for _, msec := range msecs[id] {
+			msecMets = append(msecMets, fmt.Sprintf("\n- %+v", msec.Result()))
+		}
+		for _, msec := range msecs[pt.GetUniqueName()] {
+			msecMets = append(msecMets, fmt.Sprintf("\n- %+v", msec.Result()))
+		}
+		if len(msecMets) > 0 {
+			strMets = append(strMets, "Profiling metrics")
+			strMets = append(strMets, msecMets...)
+		}
+
+		var userMetrics []string
+		for _, ctr := range counters[id] {
+			userMetrics = append(userMetrics, fmt.Sprintf("\n- %s.%s: %v", ctr.Namespace(), ctr.Name(), ctr.Result()))
+		}
+		for _, dist := range distributions[id] {
+			userMetrics = append(userMetrics, fmt.Sprintf("\n- %s.%s: %+v", dist.Namespace(), dist.Name(), dist.Result()))
+		}
+		for _, ctr := range counters[pt.GetUniqueName()] {
+			userMetrics = append(userMetrics, fmt.Sprintf("\n- %s.%s: %v", ctr.Namespace(), ctr.Name(), ctr.Result()))
+		}
+		for _, dist := range distributions[pt.GetUniqueName()] {
+			userMetrics = append(userMetrics, fmt.Sprintf("\n- %s.%s: %+v", dist.Namespace(), dist.Name(), dist.Result()))
+		}
+
+		if len(userMetrics) > 0 {
+			strMets = append(strMets, "User metrics")
+			strMets = append(strMets, userMetrics...)
+		}
+
 		data.Transforms = append(data.Transforms, pTransform{
 			ID:        id,
 			Transform: pt,
@@ -234,6 +270,14 @@ func (h *jobDetailsHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 	renderPage(jobPage, &data, w)
 }

+func toTransformMap[E interface{ Transform() string }](mets []E) map[string][]E {
+	ret := map[string][]E{}
+	for _, met := range mets {
+		ret[met.Transform()] = append(ret[met.Transform()], met)
+	}
+	return ret
+}
+
 type pcolParent struct {
 	L string
 	T *pipepb.PTransform
@@ -244,7 +288,10 @@ type pcolParent struct {
 func preprocessTransforms(trs map[string]*pipepb.PTransform) (map[string]pcolParent, []string) {
 	ret := map[string]pcolParent{}
 	var leaves []string
-	for id, t := range trs {
+	keys := maps.Keys(trs)
+	sort.Strings(keys)
+	for _, id := range keys {
+		t := trs[id]
 		// Skip composites at this time.
 		if len(t.GetSubtransforms()) > 0 {
 			continue
diff --git a/sdks/go/pkg/beam/runners/prism/internal/worker/bundle.go b/sdks/go/pkg/beam/runners/prism/internal/worker/bundle.go
index 573bdf4aeb9db..fab8cbc141f0b 100644
--- a/sdks/go/pkg/beam/runners/prism/internal/worker/bundle.go
+++ b/sdks/go/pkg/beam/runners/prism/internal/worker/bundle.go
@@ -16,6 +16,7 @@ package worker

 import (
+	"bytes"
 	"context"
 	"fmt"
 	"sync/atomic"
@@ -126,22 +127,21 @@ func (b *B) ProcessOn(ctx context.Context, wk *W) <-chan struct{} {
 	}

 	// TODO: make batching decisions.
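+	// The change below coalesces all buffered input into a single Elements
+	// message rather than streaming one message per buffered element.
+	// Illustrative effect, assuming two buffered elements a and b:
+	//   before: {Data: a, IsLast: false}, {Data: b, IsLast: true}
+	//   after:  {Data: a+b, IsLast: true}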
- for i, d := range b.InputData { - select { - case wk.DataReqs <- &fnpb.Elements{ - Data: []*fnpb.Elements_Data{ - { - InstructionId: b.InstID, - TransformId: b.InputTransformID, - Data: d, - IsLast: i+1 == len(b.InputData), - }, + dataBuf := bytes.Join(b.InputData, []byte{}) + select { + case wk.DataReqs <- &fnpb.Elements{ + Data: []*fnpb.Elements_Data{ + { + InstructionId: b.InstID, + TransformId: b.InputTransformID, + Data: dataBuf, + IsLast: true, }, - }: - case <-ctx.Done(): - b.DataDone() - return b.DataWait - } + }, + }: + case <-ctx.Done(): + b.DataDone() + return b.DataWait } return b.DataWait } diff --git a/sdks/go/pkg/beam/transforms/filter/filter.go b/sdks/go/pkg/beam/transforms/filter/filter.go index 997eec5eb4ef3..699ec9c4c7925 100644 --- a/sdks/go/pkg/beam/transforms/filter/filter.go +++ b/sdks/go/pkg/beam/transforms/filter/filter.go @@ -40,10 +40,15 @@ var ( // the filter function returns false. It returns a PCollection of the same type // as the input. For example: // +// func lessThanThree(s string) bool { +// return len(s) < 3 +// } +// +// // Filter functions must be registered with Beam, and must not be closures. +// func init() { register.Function1x1(lessThanThree) } +// // words := beam.Create(s, "a", "b", "long", "alsolong") -// short := filter.Include(s, words, func(s string) bool { -// return len(s) < 3 -// }) +// short := filter.Include(s, words, lessThanThree) // // Here, "short" will contain "a" and "b" at runtime. func Include(s beam.Scope, col beam.PCollection, fn any) beam.PCollection { @@ -58,10 +63,15 @@ func Include(s beam.Scope, col beam.PCollection, fn any) beam.PCollection { // the filter function returns true. It returns a PCollection of the same type // as the input. For example: // +// func lessThanThree(s string) bool { +// return len(s) < 3 +// } +// +// // Filter functions must be registered with Beam, and must not be closures. +// func init() { register.Function1x1(lessThanThree) } +// // words := beam.Create(s, "a", "b", "long", "alsolong") -// long := filter.Exclude(s, words, func(s string) bool { -// return len(s) < 3 -// }) +// long := filter.Exclude(s, words, lessThanThree) // // Here, "long" will contain "long" and "alsolong" at runtime. func Exclude(s beam.Scope, col beam.PCollection, fn any) beam.PCollection { diff --git a/sdks/go/test/integration/integration.go b/sdks/go/test/integration/integration.go index f3cffd1761109..5b7473fb561a4 100644 --- a/sdks/go/test/integration/integration.go +++ b/sdks/go/test/integration/integration.go @@ -38,7 +38,6 @@ package integration import ( "fmt" "math/rand" - "os" "regexp" "strings" "testing" @@ -304,12 +303,6 @@ func CheckFilters(t *testing.T) { panic("ptest.Main() has not been called: please override TestMain to ensure that the integration test runs properly.") } - // TODO(https://github.com/apache/beam/issues/28227): Grant github-actions service account permission to healthcare.fhirStores.create. - var user = os.Getenv("USER") - if user == "github-actions" { - dataflowFilters = append(dataflowFilters, "TestFhirIO.*") - } - // Check for sickbaying first. 
n := t.Name() for _, f := range sickbay { diff --git a/sdks/go/test/integration/io/fhirio/fhirio_test.go b/sdks/go/test/integration/io/fhirio/fhirio_test.go index 03e3654d5c49b..01f6db9324f31 100644 --- a/sdks/go/test/integration/io/fhirio/fhirio_test.go +++ b/sdks/go/test/integration/io/fhirio/fhirio_test.go @@ -96,9 +96,9 @@ func setupFhirStore(t *testing.T, shouldPopulateStore bool) (fhirStoreInfo, func var resourcePaths [][]byte if shouldPopulateStore { - resourcePaths = populateStore(createdFhirStorePath) - if len(resourcePaths) == 0 { - t.Fatal("No data got populated to test") + resourcePaths, err = populateStore(createdFhirStorePath) + if err != nil { + t.Fatal(err) } } @@ -127,11 +127,13 @@ func deleteStore(storePath string) (*healthcare.Empty, error) { // Populates fhir store with data. Note that failure to populate some data is not // detrimental to the tests, so it is fine to ignore. -func populateStore(storePath string) [][]byte { +func populateStore(storePath string) ([][]byte, error) { resourcePaths := make([][]byte, 0) + bufferedErrors := make([]string, 0) for _, bundle := range readPrettyBundles() { response, err := storeService.ExecuteBundle(storePath, strings.NewReader(bundle)).Do() if err != nil { + bufferedErrors = append(bufferedErrors, err.Error()) continue } @@ -145,23 +147,30 @@ func populateStore(storePath string) [][]byte { } err = json.NewDecoder(response.Body).Decode(&body) if err != nil { + bufferedErrors = append(bufferedErrors, err.Error()) continue } for _, entry := range body.Entry { bundleFailedToBeCreated := !strings.Contains(entry.Response.Status, "201") if bundleFailedToBeCreated { + bufferedErrors = append(bufferedErrors, fmt.Sprintf("Bundle creation failed with: %v", entry.Response)) continue } resourcePath, err := extractResourcePathFrom(entry.Response.Location) if err != nil { + bufferedErrors = append(bufferedErrors, err.Error()) continue } resourcePaths = append(resourcePaths, resourcePath) } } - return resourcePaths + if len(resourcePaths) == 0 { + return nil, fmt.Errorf("failed to populate fhir store with any data. Errors with requests: %s", bufferedErrors) + } + + return resourcePaths, nil } func readPrettyBundles() []string { diff --git a/sdks/java/bom/build.gradle b/sdks/java/bom/build.gradle index 1559d6b6bea57..da24346842059 100644 --- a/sdks/java/bom/build.gradle +++ b/sdks/java/bom/build.gradle @@ -76,7 +76,7 @@ tasks.whenTaskAdded { task -> expand(version: project.version, modules: bomModuleNames) } } - } else if (task.name.startsWith('publishMavenJavaPublication')) { + } else if (task.name.startsWith('publishMavenJavaPublication') || task.name.startsWith('signMavenJavaPublication')) { task.dependsOn copyPom } } diff --git a/sdks/java/build-tools/src/main/resources/beam/checkstyle/suppressions.xml b/sdks/java/build-tools/src/main/resources/beam/checkstyle/suppressions.xml index 7037f0543f4fa..c30c48f824456 100644 --- a/sdks/java/build-tools/src/main/resources/beam/checkstyle/suppressions.xml +++ b/sdks/java/build-tools/src/main/resources/beam/checkstyle/suppressions.xml @@ -87,6 +87,8 @@ + + diff --git a/sdks/java/container/Dockerfile b/sdks/java/container/Dockerfile index e0fa8d4a0a6f9..9c266ea132b81 100644 --- a/sdks/java/container/Dockerfile +++ b/sdks/java/container/Dockerfile @@ -15,8 +15,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
###############################################################################
+ARG base_image
 ARG java_version
-FROM eclipse-temurin:${java_version}
+FROM ${base_image}:${java_version}
 LABEL Author "Apache Beam <dev@beam.apache.org>"
 ARG TARGETOS
 ARG TARGETARCH
diff --git a/sdks/java/container/agent/build.gradle b/sdks/java/container/agent/build.gradle
index df3780e454465..44338915bf285 100644
--- a/sdks/java/container/agent/build.gradle
+++ b/sdks/java/container/agent/build.gradle
@@ -20,10 +20,13 @@ plugins {
   id 'org.apache.beam.module'
 }

+// the order of these checks is intentional
 if (project.hasProperty('java11Home')) {
   javaVersion = "1.11"
 } else if (project.hasProperty('java17Home')) {
   javaVersion = "1.17"
+} else if (project.hasProperty('java21Home')) {
+  javaVersion = "1.21"
 }

 applyJavaNature(
@@ -42,16 +45,22 @@ jar {
   }
 }

+// the order of these checks is intentional
 if (project.hasProperty('java11Home')) {
-  def java11Home = project.findProperty('java11Home')
   project.tasks.withType(JavaCompile) {
-    options.fork = true
-    options.forkOptions.javaHome = java11Home as File
-    options.compilerArgs += ['-Xlint:-path']
+    setJavaVerOptions(options, '11')
   }
 } else if (project.hasProperty('java17Home')) {
   project.tasks.withType(JavaCompile) {
-    setJava17Options(options)
+    setJavaVerOptions(options, '17')
+
+    checkerFramework {
+      skipCheckerFramework = true
+    }
+  }
+} else if (project.hasProperty('java21Home')) {
+  project.tasks.withType(JavaCompile) {
+    setJavaVerOptions(options, '21')

     checkerFramework {
       skipCheckerFramework = true
@@ -62,7 +71,7 @@ if (project.hasProperty('java11Home')) {
 // Module classes requires JDK > 8
 project.tasks.each {
   it.onlyIf {
-    project.hasProperty('java11Home') || project.hasProperty('java17Home')
+    project.hasProperty('java11Home') || project.hasProperty('java17Home') || project.hasProperty('java21Home')
         || JavaVersion.VERSION_1_8.compareTo(JavaVersion.current()) < 0
   }
 }
diff --git a/sdks/java/container/build.gradle b/sdks/java/container/build.gradle
index 4c4b6aaa31fd7..ac4104638b657 100644
--- a/sdks/java/container/build.gradle
+++ b/sdks/java/container/build.gradle
@@ -31,7 +31,7 @@ description = "Apache Beam :: SDKs :: Java :: Container"

 configurations {
   dockerDependency
-  sdkHarnessLauncher
+  javaHarnessLauncher
 }

 dependencies {
@@ -76,11 +76,12 @@ task downloadCloudProfilerAgent(type: Exec) {
 }

 artifacts {
-  sdkHarnessLauncher file: file('./build/target'), builtBy: goBuild
+  javaHarnessLauncher file: file('./build/target'), builtBy: goBuild
 }

 task pushAll {
   dependsOn ":sdks:java:container:java8:docker"
   dependsOn ":sdks:java:container:java11:docker"
   dependsOn ":sdks:java:container:java17:docker"
+  dependsOn ":sdks:java:container:java21:docker"
 }
diff --git a/sdks/java/container/common.gradle b/sdks/java/container/common.gradle
index bf4c122ca91fe..c242f714543e4 100644
--- a/sdks/java/container/common.gradle
+++ b/sdks/java/container/common.gradle
@@ -29,19 +29,20 @@ applyDockerNature()

 if (!project.hasProperty('imageJavaVersion')) {
   throw new GradleException('imageJavaVersion project property must be set')
 }
+def javaBaseImage = project.findProperty('javaBaseImage') ?: 'eclipse-temurin'
 def imageJavaVersion = project.findProperty('imageJavaVersion')

 description = "Apache Beam :: SDKs :: Java :: Container :: Java ${imageJavaVersion} Container"

 configurations {
   dockerDependency
-  sdkHarnessLauncher
+  javaHarnessLauncher
   pulledLicenses
 }

 dependencies {
   dockerDependency project(path: ":sdks:java:container", configuration: "dockerDependency")
-  sdkHarnessLauncher project(path: ":sdks:java:container", configuration: "sdkHarnessLauncher")
+  javaHarnessLauncher project(path: ":sdks:java:container", configuration: "javaHarnessLauncher")
 }

 task copyDockerfileDependencies(type: Copy) {
@@ -63,25 +64,27 @@ task copyDockerfileDependencies(type: Copy) {

 task copySdkHarnessLauncher(type: Copy) {
   dependsOn ":sdks:java:container:downloadCloudProfilerAgent"
+  // if licenses are required, they should be present before this task runs.
+  mustRunAfter ":sdks:java:container:pullLicenses"

-  from configurations.sdkHarnessLauncher
+  from configurations.javaHarnessLauncher
   into "build/target"
 }

 task copyJavaThirdPartyLicenses(type: Copy) {
-  from("${project(':sdks:java:container').buildDir}/target/third_party_licenses")
+  from project(':sdks:java:container').layout.buildDirectory.dir('target/third_party_licenses')
   into "build/target/third_party_licenses"
   dependsOn ':sdks:java:container:pullLicenses'
 }

 task copyGolangLicenses(type: Copy) {
-  from "${project(':release:go-licenses:java').buildDir}/output"
+  from project(':release:go-licenses:java').layout.buildDirectory.dir('output')
   into "build/target/go-licenses"
   dependsOn ':release:go-licenses:java:createLicenses'
 }

 task copyJdkOptions(type: Copy) {
-  if (imageJavaVersion == "17" || imageJavaVersion == "11") {
+  if (["11", "17", "21"].contains(imageJavaVersion)) {
     from "option-jamm.json"
   }
   from "java${imageJavaVersion}-security.properties"
@@ -94,11 +97,25 @@ task skipPullLicenses(type: Exec) {
   args "-c", "mkdir -p build/target/go-licenses build/target/options build/target/third_party_licenses && touch build/target/go-licenses/skip && touch build/target/third_party_licenses/skip"
 }

+// The Java11+ containers depend on the java agent project. Compiling it needs a compatible JDK:
+// lower bound 11, upper bound imageJavaVersion.
 task validateJavaHome {
-  if (imageJavaVersion == "11" || imageJavaVersion == "17") {
+  def requiredForVer = ["11", "17", "21"]
+  if (requiredForVer.contains(imageJavaVersion)) {
     doFirst {
-      if (!project.hasProperty('java17Home') && !project.hasProperty('java11Home')) {
-        throw new GradleException('java17Home or java11Home property required. Re-run with -Pjava17Home or -Pjava11Home')
+      boolean propertyFound = false
+      // allow building the agent with any compatible Java version (11 up to imageJavaVersion)
+      for (def checkVer : requiredForVer) {
+        if (project.hasProperty("java${checkVer}Home")) {
+          propertyFound = true
+        }
+        if (checkVer == imageJavaVersion) {
+          // cannot build the agent with a higher version than the docker Java version
+          break
+        }
+      }
+      if (!propertyFound) {
+        throw new GradleException("java${imageJavaVersion}Home or compatible properties required for imageJavaVersion=${imageJavaVersion}.
Re-run with -Pjava${imageJavaVersion}Home") } } } @@ -120,8 +137,9 @@ docker { dockerfile project.file("../Dockerfile") files "./build/" buildArgs([ - 'pull_licenses': project.rootProject.hasProperty(["docker-pull-licenses"]) || - project.rootProject.hasProperty(["isRelease"]), + 'pull_licenses': project.rootProject.hasProperty("docker-pull-licenses") || + project.rootProject.hasProperty("isRelease"), + 'base_image': javaBaseImage, 'java_version': imageJavaVersion, ]) buildx useBuildx @@ -130,8 +148,8 @@ docker { push pushContainers } -if (project.rootProject.hasProperty(["docker-pull-licenses"]) || - project.rootProject.hasProperty(["isRelease"])) { +if (project.rootProject.hasProperty("docker-pull-licenses") || + project.rootProject.hasProperty("isRelease")) { project.logger.lifecycle('docker-pull-licenses set, creating go-licenses') dockerPrepare.dependsOn copyJavaThirdPartyLicenses dockerPrepare.dependsOn copyGolangLicenses diff --git a/sdks/java/container/java21/build.gradle b/sdks/java/container/java21/build.gradle new file mode 100644 index 0000000000000..75c84c0c7cdfa --- /dev/null +++ b/sdks/java/container/java21/build.gradle @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +project.ext { + imageJavaVersion = '21' +} + +// Load the main build script which contains all build logic. 
+apply from: "../common.gradle" + +dependencies { + dockerDependency project(path: ":sdks:java:container:agent") +} \ No newline at end of file diff --git a/sdks/java/container/java21/option-jamm.json b/sdks/java/container/java21/option-jamm.json new file mode 100644 index 0000000000000..5647ff66be5c9 --- /dev/null +++ b/sdks/java/container/java21/option-jamm.json @@ -0,0 +1,12 @@ +{ + "name": "jamm", + "enabled": true, + "options": { + "java_arguments": [ + "--add-modules=jamm", + "--module-path=/opt/apache/beam/jars/jamm.jar", + "--add-opens=java.base/java.lang=jamm", + "--add-opens=java.base/java.util=jamm" + ] + } +} \ No newline at end of file diff --git a/sdks/java/container/license_scripts/dep_urls_java.yaml b/sdks/java/container/license_scripts/dep_urls_java.yaml index ca47184d2a229..8a028f459727c 100644 --- a/sdks/java/container/license_scripts/dep_urls_java.yaml +++ b/sdks/java/container/license_scripts/dep_urls_java.yaml @@ -46,7 +46,7 @@ jaxen: '1.1.6': type: "3-Clause BSD" libraries-bom: - '26.22.0': + '26.23.0': license: "https://raw.githubusercontent.com/GoogleCloudPlatform/cloud-opensource-java/master/LICENSE" type: "Apache License 2.0" paranamer: diff --git a/sdks/java/core/build.gradle b/sdks/java/core/build.gradle index a0f9b739dac69..7c788eaba49dd 100644 --- a/sdks/java/core/build.gradle +++ b/sdks/java/core/build.gradle @@ -36,7 +36,6 @@ applyJavaNature( relocate "org.antlr.v4", getJavaRelocatedPath("org.antlr.v4") }, ) -applyAvroNature() applyAntlrNature() generateGrammarSource { @@ -89,7 +88,6 @@ dependencies { shadow library.java.jackson_annotations shadow library.java.jackson_databind shadow library.java.slf4j_api - shadow library.java.avro shadow library.java.snappy_java shadow library.java.joda_time implementation enforcedPlatform(library.java.google_cloud_platform_libraries_bom) @@ -114,7 +112,6 @@ dependencies { shadowTest "com.esotericsoftware.kryo:kryo:2.21" shadowTest library.java.quickcheck_core shadowTest library.java.quickcheck_generators - shadowTest library.java.avro_tests shadowTest library.java.zstd_jni shadowTest library.java.commons_logging shadowTest library.java.log4j @@ -124,5 +121,6 @@ dependencies { } project.tasks.compileTestJava { - options.compilerArgs += ['-Xlint:-rawtypes'] // generated avro uses rawtypes without suppression + // TODO: fix other places with warnings in tests and delete this option + options.compilerArgs += ['-Xlint:-rawtypes'] } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/AvroCoder.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/AvroCoder.java deleted file mode 100644 index ee252ba70f851..0000000000000 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/AvroCoder.java +++ /dev/null @@ -1,820 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.sdk.coders;
-
-import com.google.errorprone.annotations.FormatMethod;
-import com.google.errorprone.annotations.FormatString;
-import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.Serializable;
-import java.lang.reflect.Field;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Objects;
-import java.util.Set;
-import java.util.SortedMap;
-import java.util.SortedSet;
-import org.apache.avro.AvroRuntimeException;
-import org.apache.avro.Conversion;
-import org.apache.avro.LogicalType;
-import org.apache.avro.Schema;
-import org.apache.avro.generic.GenericDatumReader;
-import org.apache.avro.generic.GenericDatumWriter;
-import org.apache.avro.generic.GenericRecord;
-import org.apache.avro.generic.IndexedRecord;
-import org.apache.avro.io.BinaryDecoder;
-import org.apache.avro.io.BinaryEncoder;
-import org.apache.avro.io.DatumReader;
-import org.apache.avro.io.DatumWriter;
-import org.apache.avro.io.DecoderFactory;
-import org.apache.avro.io.EncoderFactory;
-import org.apache.avro.reflect.AvroEncode;
-import org.apache.avro.reflect.AvroName;
-import org.apache.avro.reflect.AvroSchema;
-import org.apache.avro.reflect.ReflectData;
-import org.apache.avro.reflect.ReflectDatumReader;
-import org.apache.avro.reflect.ReflectDatumWriter;
-import org.apache.avro.reflect.Union;
-import org.apache.avro.specific.SpecificData;
-import org.apache.avro.specific.SpecificDatumReader;
-import org.apache.avro.specific.SpecificDatumWriter;
-import org.apache.avro.specific.SpecificRecord;
-import org.apache.avro.util.ClassUtils;
-import org.apache.avro.util.Utf8;
-import org.apache.beam.sdk.util.EmptyOnDeserializationThreadLocal;
-import org.apache.beam.sdk.values.TypeDescriptor;
-import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier;
-import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Suppliers;
-import org.checkerframework.checker.nullness.qual.Nullable;
-import org.joda.time.DateTime;
-import org.joda.time.DateTimeZone;
-
-/**
- * A {@link Coder} using Avro binary format.
- *
- * <p>Each instance of {@code AvroCoder<T>} encapsulates an Avro schema for objects of type {@code
- * T}.
- *
- * <p>The Avro schema may be provided explicitly via {@link AvroCoder#of(Class, Schema)} or omitted
- * via {@link AvroCoder#of(Class)}, in which case it will be inferred using Avro's {@link
- * org.apache.avro.reflect.ReflectData}.
- *
- * <p>For complete details about schema generation and how it can be controlled please see the
- * {@link org.apache.avro.reflect} package. Only concrete classes with a no-argument constructor can
- * be mapped to Avro records. All inherited fields that are not static or transient are included.
- * Fields are not permitted to be null unless annotated by {@link Nullable} or a {@link Union}
- * schema containing {@code "null"}.
- *
- * <p>To use, specify the {@code Coder} type on a PCollection:
- *
- * <pre>{@code
- * PCollection<MyCustomElement> records =
- *     input.apply(...)
- *          .setCoder(AvroCoder.of(MyCustomElement.class));
- * }</pre>
- *
- * <p>or annotate the element class using {@code @DefaultCoder}.
- *
- * <pre>{@code @DefaultCoder(AvroCoder.class)
- * public class MyCustomElement {
- *     ...
- * }
- * }</pre>
- *
The implementation attempts to determine if the Avro encoding of the given type will satisfy - * the criteria of {@link Coder#verifyDeterministic} by inspecting both the type and the Schema - * provided or generated by Avro. Only coders that are deterministic can be used in {@link - * org.apache.beam.sdk.transforms.GroupByKey} operations. - * - * @param the type of elements handled by this coder - * @deprecated Avro related classes are deprecated in module beam-sdks-java-core and - * will be eventually removed. Please, migrate to a new module - * beam-sdks-java-extensions-avro by importing - * org.apache.beam.sdk.extensions.avro.coders.AvroCoder instead of this one. - */ -@SuppressWarnings({ - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) -@Deprecated -public class AvroCoder extends CustomCoder { - - /** - * Returns an {@code AvroCoder} instance for the provided element type. - * - * @param the element type - */ - public static AvroCoder of(TypeDescriptor type) { - return of(type, true); - } - - /** - * Returns an {@code AvroCoder} instance for the provided element type, respecting whether to use - * Avro's Reflect* or Specific* suite for encoding and decoding. - * - * @param the element type - */ - public static AvroCoder of(TypeDescriptor type, boolean useReflectApi) { - @SuppressWarnings("unchecked") - Class clazz = (Class) type.getRawType(); - return of(clazz, useReflectApi); - } - - /** - * Returns an {@code AvroCoder} instance for the provided element class. - * - * @param the element type - */ - public static AvroCoder of(Class clazz) { - return of(clazz, true); - } - - /** - * Returns an {@code AvroGenericCoder} instance for the Avro schema. The implicit type is - * GenericRecord. - */ - public static AvroGenericCoder of(Schema schema) { - return AvroGenericCoder.of(schema); - } - - /** - * Returns an {@code AvroCoder} instance for the given class, respecting whether to use Avro's - * Reflect* or Specific* suite for encoding and decoding. - * - * @param the element type - */ - public static AvroCoder of(Class type, boolean useReflectApi) { - ClassLoader cl = type.getClassLoader(); - SpecificData data = useReflectApi ? new ReflectData(cl) : new SpecificData(cl); - return of(type, data.getSchema(type), useReflectApi); - } - - /** - * Returns an {@code AvroCoder} instance for the provided element type using the provided Avro - * schema. - * - *

The schema must correspond to the type provided. - * - * @param the element type - */ - public static AvroCoder of(Class type, Schema schema) { - return of(type, schema, true); - } - - /** - * Returns an {@code AvroCoder} instance for the given class and schema, respecting whether to use - * Avro's Reflect* or Specific* suite for encoding and decoding. - * - * @param the element type - */ - public static AvroCoder of(Class type, Schema schema, boolean useReflectApi) { - return new AvroCoder<>(type, schema, useReflectApi); - } - - /** - * Returns a {@link CoderProvider} which uses the {@link AvroCoder} if possible for all types. - * - *

It is unsafe to register this as a {@link CoderProvider} because Avro will reflectively - * accept dangerous types such as {@link Object}. - * - *

This method is invoked reflectively from {@link DefaultCoder}. - */ - @SuppressWarnings("unused") - public static CoderProvider getCoderProvider() { - return new AvroCoderProvider(); - } - - /** - * A {@link CoderProvider} that constructs an {@link AvroCoder} for Avro compatible classes. - * - *

It is unsafe to register this as a {@link CoderProvider} because Avro will reflectively - * accept dangerous types such as {@link Object}. - */ - static class AvroCoderProvider extends CoderProvider { - @Override - public Coder coderFor( - TypeDescriptor typeDescriptor, List> componentCoders) - throws CannotProvideCoderException { - try { - return AvroCoder.of(typeDescriptor); - } catch (AvroRuntimeException e) { - throw new CannotProvideCoderException( - String.format("%s is not compatible with Avro", typeDescriptor), e); - } - } - } - - private final Class type; - private final boolean useReflectApi; - private final SerializableSchemaSupplier schemaSupplier; - private final TypeDescriptor typeDescriptor; - - private final List nonDeterministicReasons; - - // Factories allocated by .get() are thread-safe and immutable. - private static final EncoderFactory ENCODER_FACTORY = EncoderFactory.get(); - private static final DecoderFactory DECODER_FACTORY = DecoderFactory.get(); - - /** - * A {@link Serializable} object that holds the {@link String} version of a {@link Schema}. This - * is paired with the {@link SerializableSchemaSupplier} via {@link Serializable}'s usage of the - * {@link #readResolve} method. - */ - private static class SerializableSchemaString implements Serializable { - private final String schema; - - private SerializableSchemaString(String schema) { - this.schema = schema; - } - - private Object readResolve() throws IOException, ClassNotFoundException { - return new SerializableSchemaSupplier(new Schema.Parser().parse(schema)); - } - } - - /** - * A {@link Serializable} object that delegates to the {@link SerializableSchemaString} via {@link - * Serializable}'s usage of the {@link #writeReplace} method. Kryo doesn't utilize Java's - * serialization and hence is able to encode the {@link Schema} object directly. - */ - private static class SerializableSchemaSupplier implements Serializable, Supplier { - // writeReplace makes this object serializable. This is a limitation of FindBugs as discussed - // here: - // http://stackoverflow.com/questions/26156523/is-writeobject-not-neccesary-using-the-serialization-proxy-pattern - @SuppressFBWarnings("SE_BAD_FIELD") - private final Schema schema; - - private SerializableSchemaSupplier(Schema schema) { - this.schema = schema; - } - - private Object writeReplace() { - return new SerializableSchemaString(schema.toString()); - } - - @Override - public Schema get() { - return schema; - } - } - - /** - * A {@link Serializable} object that lazily supplies a {@link ReflectData} built from the - * appropriate {@link ClassLoader} for the type encoded by this {@link AvroCoder}. - */ - private static class SerializableReflectDataSupplier - implements Serializable, Supplier { - - private final Class clazz; - - private SerializableReflectDataSupplier(Class clazz) { - this.clazz = clazz; - } - - @Override - public ReflectData get() { - ReflectData reflectData = new ReflectData(clazz.getClassLoader()); - reflectData.addLogicalTypeConversion(new JodaTimestampConversion()); - return reflectData; - } - } - - // Cache the old encoder/decoder and let the factories reuse them when possible. To be threadsafe, - // these are ThreadLocal. This code does not need to be re-entrant as AvroCoder does not use - // an inner coder. 
- private final EmptyOnDeserializationThreadLocal decoder; - private final EmptyOnDeserializationThreadLocal encoder; - private final EmptyOnDeserializationThreadLocal> writer; - private final EmptyOnDeserializationThreadLocal> reader; - - // Lazily re-instantiated after deserialization - private final Supplier reflectData; - - protected AvroCoder(Class type, Schema schema) { - this(type, schema, false); - } - - protected AvroCoder(Class type, Schema schema, boolean useReflectApi) { - this.type = type; - this.useReflectApi = useReflectApi; - this.schemaSupplier = new SerializableSchemaSupplier(schema); - typeDescriptor = TypeDescriptor.of(type); - nonDeterministicReasons = new AvroDeterminismChecker().check(TypeDescriptor.of(type), schema); - - // Decoder and Encoder start off null for each thread. They are allocated and potentially - // reused inside encode/decode. - this.decoder = new EmptyOnDeserializationThreadLocal<>(); - this.encoder = new EmptyOnDeserializationThreadLocal<>(); - - this.reflectData = Suppliers.memoize(new SerializableReflectDataSupplier(getType())); - - // Reader and writer are allocated once per thread per Coder - this.reader = - new EmptyOnDeserializationThreadLocal>() { - private final AvroCoder myCoder = AvroCoder.this; - - @Override - public DatumReader initialValue() { - if (myCoder.getType().equals(GenericRecord.class)) { - return new GenericDatumReader<>(myCoder.getSchema()); - } else if (SpecificRecord.class.isAssignableFrom(myCoder.getType()) && !useReflectApi) { - return new SpecificDatumReader<>(myCoder.getType()); - } - return new ReflectDatumReader<>( - myCoder.getSchema(), myCoder.getSchema(), myCoder.reflectData.get()); - } - }; - - this.writer = - new EmptyOnDeserializationThreadLocal>() { - private final AvroCoder myCoder = AvroCoder.this; - - @Override - public DatumWriter initialValue() { - if (myCoder.getType().equals(GenericRecord.class)) { - return new GenericDatumWriter<>(myCoder.getSchema()); - } else if (SpecificRecord.class.isAssignableFrom(myCoder.getType()) && !useReflectApi) { - return new SpecificDatumWriter<>(myCoder.getType()); - } - return new ReflectDatumWriter<>(myCoder.getSchema(), myCoder.reflectData.get()); - } - }; - } - - /** Returns the type this coder encodes/decodes. */ - public Class getType() { - return type; - } - - public boolean useReflectApi() { - return useReflectApi; - } - - @Override - public void encode(T value, OutputStream outStream) throws IOException { - // Get a BinaryEncoder instance from the ThreadLocal cache and attempt to reuse it. - BinaryEncoder encoderInstance = ENCODER_FACTORY.directBinaryEncoder(outStream, encoder.get()); - // Save the potentially-new instance for reuse later. - encoder.set(encoderInstance); - writer.get().write(value, encoderInstance); - // Direct binary encoder does not buffer any data and need not be flushed. - } - - @Override - public T decode(InputStream inStream) throws IOException { - // Get a BinaryDecoder instance from the ThreadLocal cache and attempt to reuse it. - BinaryDecoder decoderInstance = DECODER_FACTORY.directBinaryDecoder(inStream, decoder.get()); - // Save the potentially-new instance for later. - decoder.set(decoderInstance); - return reader.get().read(null, decoderInstance); - } - - /** - * @throws NonDeterministicException when the type may not be deterministically encoded using the - * given {@link Schema}, the {@code directBinaryEncoder}, and the {@link ReflectDatumWriter} - * or {@link GenericDatumWriter}. 
- */ - @Override - public void verifyDeterministic() throws NonDeterministicException { - if (!nonDeterministicReasons.isEmpty()) { - throw new NonDeterministicException(this, nonDeterministicReasons); - } - } - - /** Returns the schema used by this coder. */ - public Schema getSchema() { - return schemaSupplier.get(); - } - - @Override - public TypeDescriptor getEncodedTypeDescriptor() { - return typeDescriptor; - } - - /** - * Helper class encapsulating the various pieces of state maintained by the recursive walk used - * for checking if the encoding will be deterministic. - */ - private static class AvroDeterminismChecker { - - // Reasons that the original type are not deterministic. This accumulates - // the actual output. - private List reasons = new ArrayList<>(); - - // Types that are currently "open". Used to make sure we don't have any - // recursive types. Note that we assume that all occurrences of a given type - // are equal, rather than tracking pairs of type + schema. - private Set> activeTypes = new HashSet<>(); - - // Similarly to how we record active types, we record the schemas we visit - // to make sure we don't encounter recursive fields. - private Set activeSchemas = new HashSet<>(); - - /** Report an error in the current context. */ - @FormatMethod - private void reportError(String context, @FormatString String fmt, Object... args) { - String message = String.format(fmt, args); - reasons.add(context + ": " + message); - } - - /** - * Classes that are serialized by Avro as a String include - * - *

    - *
- * <ul>
- *   <li>Subtypes of CharSequence (including String, Avro's mutable Utf8, etc.)
- *   <li>Several predefined classes (BigDecimal, BigInteger, URI, URL)
- *   <li>Classes annotated with @Stringable (uses their #toString() and a String constructor)
- * </ul>
- *
Rather than determine which of these cases are deterministic, we list some classes that - * definitely are, and treat any others as non-deterministic. - */ - private static final Set> DETERMINISTIC_STRINGABLE_CLASSES = new HashSet<>(); - - static { - // CharSequences: - DETERMINISTIC_STRINGABLE_CLASSES.add(String.class); - DETERMINISTIC_STRINGABLE_CLASSES.add(Utf8.class); - - // Explicitly Stringable: - DETERMINISTIC_STRINGABLE_CLASSES.add(java.math.BigDecimal.class); - DETERMINISTIC_STRINGABLE_CLASSES.add(java.math.BigInteger.class); - DETERMINISTIC_STRINGABLE_CLASSES.add(java.net.URI.class); - DETERMINISTIC_STRINGABLE_CLASSES.add(java.net.URL.class); - - // Classes annotated with @Stringable: - } - - /** Return true if the given type token is a subtype of *any* of the listed parents. */ - private static boolean isSubtypeOf(TypeDescriptor type, Class... parents) { - for (Class parent : parents) { - if (type.isSubtypeOf(TypeDescriptor.of(parent))) { - return true; - } - } - return false; - } - - protected AvroDeterminismChecker() {} - - // The entry point for the check. Should not be recursively called. - public List check(TypeDescriptor type, Schema schema) { - recurse(type.getRawType().getName(), type, schema); - return reasons; - } - - // This is the method that should be recursively called. It sets up the path - // and visited types correctly. - private void recurse(String context, TypeDescriptor type, Schema schema) { - if (type.getRawType().isAnnotationPresent(AvroSchema.class)) { - reportError(context, "Custom schemas are not supported -- remove @AvroSchema."); - return; - } - - if (!activeTypes.add(type)) { - reportError(context, "%s appears recursively", type); - return; - } - - // If the record isn't a true class, but rather a GenericRecord, SpecificRecord, etc. - // with a specified schema, then we need to make the decision based on the generated - // implementations. - if (isSubtypeOf(type, IndexedRecord.class)) { - checkIndexedRecord(context, schema, null); - } else { - doCheck(context, type, schema); - } - - activeTypes.remove(type); - } - - private void doCheck(String context, TypeDescriptor type, Schema schema) { - switch (schema.getType()) { - case ARRAY: - checkArray(context, type, schema); - break; - case ENUM: - // Enums should be deterministic, since they depend only on the ordinal. - break; - case FIXED: - // Depending on the implementation of GenericFixed, we don't know how - // the given field will be encoded. So, we assume that it isn't - // deterministic. - reportError(context, "FIXED encodings are not guaranteed to be deterministic"); - break; - case MAP: - checkMap(context, type, schema); - break; - case RECORD: - if (!(type.getType() instanceof Class)) { - reportError(context, "Cannot determine type from generic %s due to erasure", type); - return; - } - checkRecord(type, schema); - break; - case UNION: - checkUnion(context, type, schema); - break; - case STRING: - checkString(context, type); - break; - case BOOLEAN: - case BYTES: - case DOUBLE: - case INT: - case FLOAT: - case LONG: - case NULL: - // For types that Avro encodes using one of the above primitives, we assume they are - // deterministic. - break; - default: - // In any other case (eg., new types added to Avro) we cautiously return - // false. 
- reportError(context, "Unknown schema type %s may be non-deterministic", schema.getType()); - break; - } - } - - private void checkString(String context, TypeDescriptor type) { - // For types that are encoded as strings, we need to make sure they're in an approved - // list. For other types that are annotated @Stringable, Avro will just use the - // #toString() methods, which has no guarantees of determinism. - if (!DETERMINISTIC_STRINGABLE_CLASSES.contains(type.getRawType())) { - reportError(context, "%s may not have deterministic #toString()", type); - } - } - - private static final Schema AVRO_NULL_SCHEMA = Schema.create(Schema.Type.NULL); - - private void checkUnion(String context, TypeDescriptor type, Schema schema) { - final List unionTypes = schema.getTypes(); - - if (!type.getRawType().isAnnotationPresent(Union.class)) { - // First check for @Nullable field, which shows up as a union of field type and null. - if (unionTypes.size() == 2 && unionTypes.contains(AVRO_NULL_SCHEMA)) { - // Find the Schema that is not NULL and recursively check that it is deterministic. - Schema nullableFieldSchema = - unionTypes.get(0).equals(AVRO_NULL_SCHEMA) ? unionTypes.get(1) : unionTypes.get(0); - doCheck(context, type, nullableFieldSchema); - return; - } - - // Otherwise report a schema error. - reportError(context, "Expected type %s to have @Union annotation", type); - return; - } - - // Errors associated with this union will use the base class as their context. - String baseClassContext = type.getRawType().getName(); - - // For a union, we need to make sure that each possible instantiation is deterministic. - for (Schema concrete : unionTypes) { - @SuppressWarnings("unchecked") - TypeDescriptor unionType = TypeDescriptor.of(ReflectData.get().getClass(concrete)); - - recurse(baseClassContext, unionType, concrete); - } - } - - private void checkRecord(TypeDescriptor type, Schema schema) { - // For a record, we want to make sure that all the fields are deterministic. - Class clazz = type.getRawType(); - for (Schema.Field fieldSchema : schema.getFields()) { - Field field = getField(clazz, fieldSchema.name()); - String fieldContext = field.getDeclaringClass().getName() + "#" + field.getName(); - - if (field.isAnnotationPresent(AvroEncode.class)) { - reportError( - fieldContext, "Custom encoders may be non-deterministic -- remove @AvroEncode"); - continue; - } - - if (!IndexedRecord.class.isAssignableFrom(field.getType()) - && field.isAnnotationPresent(AvroSchema.class)) { - // TODO: We should be able to support custom schemas on POJO fields, but we shouldn't - // need to, so we just allow it in the case of IndexedRecords. - reportError( - fieldContext, "Custom schemas are only supported for subtypes of IndexedRecord."); - continue; - } - - TypeDescriptor fieldType = type.resolveType(field.getGenericType()); - recurse(fieldContext, fieldType, fieldSchema.schema()); - } - } - - private void checkIndexedRecord( - String context, Schema schema, @Nullable String specificClassStr) { - - if (!activeSchemas.add(schema)) { - reportError(context, "%s appears recursively", schema.getName()); - return; - } - - switch (schema.getType()) { - case ARRAY: - // Generic Records use GenericData.Array to implement arrays, which is - // essentially an ArrayList, and therefore ordering is deterministic. - // The array is thus deterministic if the elements are deterministic. 
- checkIndexedRecord(context, schema.getElementType(), null); - break; - case ENUM: - // Enums are deterministic because they encode as a single integer. - break; - case FIXED: - // In the case of GenericRecords, FIXED is deterministic because it - // encodes/decodes as a Byte[]. - break; - case MAP: - reportError( - context, - "GenericRecord and SpecificRecords use a HashMap to represent MAPs," - + " so it is non-deterministic"); - break; - case RECORD: - for (Schema.Field field : schema.getFields()) { - checkIndexedRecord( - schema.getName() + "." + field.name(), - field.schema(), - field.getProp(SpecificData.CLASS_PROP)); - } - break; - case STRING: - // GenericDatumWriter#findStringClass will use a CharSequence or a String - // for each string, so it is deterministic. - - // SpecificCompiler#getStringType will use java.lang.String, org.apache.avro.util.Utf8, - // or java.lang.CharSequence, unless SpecificData.CLASS_PROP overrides that. - if (specificClassStr != null) { - Class specificClass; - try { - specificClass = ClassUtils.forName(specificClassStr); - if (!DETERMINISTIC_STRINGABLE_CLASSES.contains(specificClass)) { - reportError( - context, - "Specific class %s is not known to be deterministic", - specificClassStr); - } - } catch (ClassNotFoundException e) { - reportError( - context, "Specific class %s is not known to be deterministic", specificClassStr); - } - } - break; - case UNION: - for (Schema subschema : schema.getTypes()) { - checkIndexedRecord(subschema.getName(), subschema, null); - } - break; - case BOOLEAN: - case BYTES: - case DOUBLE: - case INT: - case FLOAT: - case LONG: - case NULL: - // For types that Avro encodes using one of the above primitives, we assume they are - // deterministic. - break; - default: - reportError(context, "Unknown schema type %s may be non-deterministic", schema.getType()); - break; - } - - activeSchemas.remove(schema); - } - - private void checkMap(String context, TypeDescriptor type, Schema schema) { - if (!isSubtypeOf(type, SortedMap.class)) { - reportError(context, "%s may not be deterministically ordered", type); - } - - // Avro (currently) asserts that all keys are strings. - // In case that changes, we double check that the key was a string: - Class keyType = type.resolveType(Map.class.getTypeParameters()[0]).getRawType(); - if (!String.class.equals(keyType)) { - reportError(context, "map keys should be Strings, but was %s", keyType); - } - - recurse(context, type.resolveType(Map.class.getTypeParameters()[1]), schema.getValueType()); - } - - private void checkArray(String context, TypeDescriptor type, Schema schema) { - TypeDescriptor elementType = null; - if (type.isArray()) { - // The type is an array (with ordering)-> deterministic iff the element is deterministic. - elementType = type.getComponentType(); - } else if (isSubtypeOf(type, Collection.class)) { - if (isSubtypeOf(type, List.class, SortedSet.class)) { - // Ordered collection -> deterministic iff the element is deterministic - elementType = type.resolveType(Collection.class.getTypeParameters()[0]); - } else { - // Not an ordered collection -> not deterministic - reportError(context, "%s may not be deterministically ordered", type); - return; - } - } else { - // If it was an unknown type encoded as an array, be conservative and assume - // that we don't know anything about the order. - reportError(context, "encoding %s as an ARRAY was unexpected", type); - return; - } - - // If we get here, it's either a deterministically-ordered Collection, or - // an array. 
Either way, the type is deterministic iff the element type is - // deterministic. - recurse(context, elementType, schema.getElementType()); - } - - /** - * Extract a field from a class. We need to look at the declared fields so that we can see - * private fields. We may need to walk up to the parent to get classes from the parent. - */ - private static Field getField(Class originalClazz, String name) { - Class clazz = originalClazz; - while (clazz != null) { - for (Field field : clazz.getDeclaredFields()) { - AvroName avroName = field.getAnnotation(AvroName.class); - if (avroName != null && name.equals(avroName.value())) { - return field; - } else if (avroName == null && name.equals(field.getName())) { - return field; - } - } - clazz = clazz.getSuperclass(); - } - - throw new IllegalArgumentException("Unable to get field " + name + " from " + originalClazz); - } - } - - @Override - public boolean equals(@Nullable Object other) { - if (other == this) { - return true; - } - if (!(other instanceof AvroCoder)) { - return false; - } - AvroCoder that = (AvroCoder) other; - return Objects.equals(this.schemaSupplier.get(), that.schemaSupplier.get()) - && Objects.equals(this.typeDescriptor, that.typeDescriptor) - && this.useReflectApi == that.useReflectApi; - } - - @Override - public int hashCode() { - return Objects.hash(schemaSupplier.get(), typeDescriptor, useReflectApi); - } - - /** - * Conversion for DateTime. - * - *

This is a copy from Avro 1.8's TimestampConversion, which is renamed in Avro 1.9. Defining - * own copy gives flexibility for Beam Java SDK to work with Avro 1.8 and 1.9 at runtime. - * - * @see BEAM-9144: Beam's own Avro - * TimeConversion class in beam-sdk-java-core - */ - public static class JodaTimestampConversion extends Conversion { - @Override - public Class getConvertedType() { - return DateTime.class; - } - - @Override - public String getLogicalTypeName() { - return "timestamp-millis"; - } - - @Override - public DateTime fromLong(Long millisFromEpoch, Schema schema, LogicalType type) { - return new DateTime(millisFromEpoch, DateTimeZone.UTC); - } - - @Override - public Long toLong(DateTime timestamp, Schema schema, LogicalType type) { - return timestamp.getMillis(); - } - } -} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java index 04bf8af4d1878..08e25c6b77e77 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java @@ -112,7 +112,10 @@ public String toString() { } /** - * Encodes the given value of type {@code T} onto the given output stream. + * Encodes the given value of type {@code T} onto the given output stream. Multiple elements can + * be encoded next to each other on the output stream, each coder should encode information to + * know how many bytes to read when decoding. A common approach is to prefix the encoding with the + * element's encoded length. * * @throws IOException if writing to the {@code OutputStream} fails for some reason * @throws CoderException if the value could not be encoded for some reason @@ -134,7 +137,9 @@ public void encode(T value, OutputStream outStream, Context context) /** * Decodes a value of type {@code T} from the given input stream in the given context. Returns the - * decoded value. + * decoded value. Multiple elements can be encoded next to each other on the input stream, each + * coder should encode information to know how many bytes to read when decoding. A common approach + * is to prefix the encoding with the element's encoded length. 
* * @throws IOException if reading from the {@code InputStream} fails for some reason * @throws CoderException if the value could not be decoded for some reason diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/CoderProviders.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/CoderProviders.java index 8e47f4f2bc9cf..e0a3199d0c69d 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/CoderProviders.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/CoderProviders.java @@ -178,7 +178,12 @@ public CoderProviderForCoder(TypeDescriptor type, Coder coder) { @Override public Coder coderFor(TypeDescriptor type, List> componentCoders) throws CannotProvideCoderException { - if (!this.type.equals(type)) { + boolean isTypeEqual = this.type.equals(type); + boolean isAutoValueConcrete = + type.getRawType().getName().contains("AutoValue_") + && this.type.getRawType().isAssignableFrom(type.getRawType()); + + if (!isTypeEqual && !isAutoValueConcrete) { throw new CannotProvideCoderException( String.format( "Unable to provide coder for %s, this factory can only provide coders for %s", diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/DefaultCoder.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/DefaultCoder.java index 52718fcde2afe..782a77cde6852 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/DefaultCoder.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/DefaultCoder.java @@ -88,6 +88,14 @@ public Coder coderFor( Class clazz = typeDescriptor.getRawType(); DefaultCoder defaultAnnotation = clazz.getAnnotation(DefaultCoder.class); + if (defaultAnnotation == null) { + // check if the superclass has DefaultCoder annotation if the class is generated using + // AutoValue + if (clazz.getName().contains("AutoValue_")) { + clazz = clazz.getSuperclass(); + defaultAnnotation = clazz.getAnnotation(DefaultCoder.class); + } + } if (defaultAnnotation == null) { throw new CannotProvideCoderException( String.format("Class %s does not have a @DefaultCoder annotation.", clazz.getName())); diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/AvroIO.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/AvroIO.java deleted file mode 100644 index 5593d5e70ea94..0000000000000 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/AvroIO.java +++ /dev/null @@ -1,2031 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.beam.sdk.io; - -import static org.apache.beam.sdk.io.FileIO.ReadMatches.DirectoryTreatment; -import static org.apache.beam.sdk.io.ReadAllViaFileBasedSource.ReadFileRangesFnExceptionHandler; -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull; - -import com.google.auto.value.AutoValue; -import java.io.IOException; -import java.io.Serializable; -import java.nio.channels.Channels; -import java.nio.channels.WritableByteChannel; -import java.util.Map; -import org.apache.avro.Schema; -import org.apache.avro.file.CodecFactory; -import org.apache.avro.file.DataFileConstants; -import org.apache.avro.file.DataFileWriter; -import org.apache.avro.generic.GenericDatumWriter; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; -import org.apache.avro.reflect.ReflectData; -import org.apache.avro.reflect.ReflectDatumWriter; -import org.apache.beam.sdk.coders.AvroCoder; -import org.apache.beam.sdk.coders.CannotProvideCoderException; -import org.apache.beam.sdk.coders.Coder; -import org.apache.beam.sdk.coders.CoderRegistry; -import org.apache.beam.sdk.coders.StringUtf8Coder; -import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; -import org.apache.beam.sdk.io.FileIO.MatchConfiguration; -import org.apache.beam.sdk.io.FileIO.ReadableFile; -import org.apache.beam.sdk.io.fs.EmptyMatchTreatment; -import org.apache.beam.sdk.io.fs.ResourceId; -import org.apache.beam.sdk.options.ValueProvider; -import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider; -import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; -import org.apache.beam.sdk.schemas.utils.AvroUtils; -import org.apache.beam.sdk.transforms.Create; -import org.apache.beam.sdk.transforms.PTransform; -import org.apache.beam.sdk.transforms.SerializableFunction; -import org.apache.beam.sdk.transforms.SerializableFunctions; -import org.apache.beam.sdk.transforms.Watch.Growth.TerminationCondition; -import org.apache.beam.sdk.transforms.display.DisplayData; -import org.apache.beam.sdk.values.PBegin; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.PDone; -import org.apache.beam.sdk.values.TypeDescriptors; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Function; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Suppliers; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; -import org.checkerframework.checker.nullness.qual.Nullable; -import org.joda.time.Duration; - -/** - * {@link PTransform}s for reading and writing Avro files. - * - *

- * <h3>Reading Avro files</h3>
- *

To read a {@link PCollection} from one or more Avro files with the same schema known at - * pipeline construction time, use {@link #read}, using {@link AvroIO.Read#from} to specify the - * filename or filepattern to read from. If the filepatterns to be read are themselves in a {@link - * PCollection} you can use {@link FileIO} to match them and {@link AvroIO#readFiles} to read them. - * If the schema is unknown at pipeline construction time, use {@link #parseGenericRecords} or - * {@link #parseFilesGenericRecords}. - * - *

Many configuration options below apply to several or all of these transforms. - * - *

See {@link FileSystems} for information on supported file systems and filepatterns. - * - *

- * <h3>Filepattern expansion and watching</h3>
- *

By default, the filepatterns are expanded only once. {@link Read#watchForNewFiles} or the - * combination of {@link FileIO.Match#continuously(Duration, TerminationCondition)} and {@link - * AvroIO#readFiles(Class)} allow streaming of new files matching the filepattern(s). - * - *

By default, {@link #read} prohibits filepatterns that match no files, and {@link - * AvroIO#readFiles(Class)} allows them in case the filepattern contains a glob wildcard character. - * Use {@link Read#withEmptyMatchTreatment} or {@link - * FileIO.Match#withEmptyMatchTreatment(EmptyMatchTreatment)} plus {@link AvroIO#readFiles(Class)} - * to configure this behavior. - * - *
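A minimal sketch of configuring this on a single read (reusing the doc's example class and bucket path; ALLOW is one of the EmptyMatchTreatment values):

    PCollection<AvroAutoGenClass> records =
        p.apply(
            AvroIO.read(AvroAutoGenClass.class)
                .from("gs://my_bucket/path/to/records-*.avro")
                // Do not fail at expansion time if the pattern matches no files.
                .withEmptyMatchTreatment(EmptyMatchTreatment.ALLOW));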

- * <h3>Reading records of a known schema</h3>
- *

To read specific records, such as Avro-generated classes, use {@link #read(Class)}. To read - * {@link GenericRecord GenericRecords}, use {@link #readGenericRecords(Schema)} which takes a - * {@link Schema} object, or {@link #readGenericRecords(String)} which takes an Avro schema in a - * JSON-encoded string form. An exception will be thrown if a record doesn't match the specified - * schema. Likewise, to read a {@link PCollection} of filepatterns, apply {@link FileIO} matching - * plus {@link #readFilesGenericRecords}. - * - *

- * <p>For example:
- *
- * <pre>{@code
- * Pipeline p = ...;
- *
- * // Read Avro-generated classes from files on GCS
- * PCollection<AvroAutoGenClass> records =
- *     p.apply(AvroIO.read(AvroAutoGenClass.class).from("gs://my_bucket/path/to/records-*.avro"));
- *
- * // Read GenericRecords of the given schema from files on GCS
- * Schema schema = new Schema.Parser().parse(new File("schema.avsc"));
- * PCollection<GenericRecord> records =
- *     p.apply(AvroIO.readGenericRecords(schema)
- *                .from("gs://my_bucket/path/to/records-*.avro"));
- * }</pre>
- * - *

- * <h3>Reading records of an unknown schema</h3>
- *

To read records from files whose schema is unknown at pipeline construction time or differs - * between files, use {@link #parseGenericRecords} - in this case, you will need to specify a - * parsing function for converting each {@link GenericRecord} into a value of your custom type. - * Likewise, to read a {@link PCollection} of filepatterns with unknown schema, use {@link FileIO} - * matching plus {@link #parseFilesGenericRecords(SerializableFunction)}. - * - *

- * <p>For example:
- *
- * <pre>{@code
- * Pipeline p = ...;
- *
- * PCollection<Foo> records =
- *     p.apply(AvroIO.parseGenericRecords(new SerializableFunction<GenericRecord, Foo>() {
- *       public Foo apply(GenericRecord record) {
- *         // If needed, access the schema of the record using record.getSchema()
- *         return ...;
- *       }
- *     }));
- * }</pre>
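If a coder cannot be inferred for the parse function's output, one can be supplied explicitly. A hedged sketch, assuming the placeholder type Foo implements Serializable and parseFn is the function from the example above:

    PCollection<Foo> records =
        p.apply(
            AvroIO.parseGenericRecords(parseFn)
                .from("gs://my_bucket/path/to/records-*.avro")
                // Any Coder<Foo> works; SerializableCoder is just a convenient choice.
                .withCoder(SerializableCoder.of(Foo.class)));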
- * - *

- * <h3>Reading from a {@link PCollection} of filepatterns</h3>
- *
- * <pre>{@code
- * Pipeline p = ...;
- *
- * PCollection<String> filepatterns = p.apply(...);
- * PCollection<AvroAutoGenClass> records =
- *     filepatterns.apply(AvroIO.readAll(AvroAutoGenClass.class));
- * PCollection<AvroAutoGenClass> records =
- *     filepatterns
- *         .apply(FileIO.matchAll())
- *         .apply(FileIO.readMatches())
- *         .apply(AvroIO.readFiles(AvroAutoGenClass.class));
- * PCollection<GenericRecord> genericRecords =
- *     filepatterns.apply(AvroIO.readAllGenericRecords(schema));
- * PCollection<Foo> records =
- *     filepatterns
- *         .apply(FileIO.matchAll())
- *         .apply(FileIO.readMatches())
- *         .apply(AvroIO.parseFilesGenericRecords(new SerializableFunction...);
- * }</pre>
- * - *

- * <h3>Streaming new files matching a filepattern</h3>
- *
- * <pre>{@code
- * Pipeline p = ...;
- *
- * PCollection<AvroAutoGenClass> lines = p.apply(AvroIO
- *     .read(AvroAutoGenClass.class)
- *     .from("gs://my_bucket/path/to/records-*.avro")
- *     .watchForNewFiles(
- *       // Check for new files every minute
- *       Duration.standardMinutes(1),
- *       // Stop watching the filepattern if no new files appear within an hour
- *       afterTimeSinceNewOutput(Duration.standardHours(1))));
- * }</pre>
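The FileIO-based alternative mentioned earlier looks roughly like this (same polling and termination settings; afterTimeSinceNewOutput comes from Watch.Growth):

    PCollection<AvroAutoGenClass> records =
        p.apply(
                FileIO.match()
                    .filepattern("gs://my_bucket/path/to/records-*.avro")
                    .continuously(
                        Duration.standardMinutes(1),
                        Watch.Growth.afterTimeSinceNewOutput(Duration.standardHours(1))))
            .apply(FileIO.readMatches())
            .apply(AvroIO.readFiles(AvroAutoGenClass.class));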
- * - *

- * <h3>Reading a very large number of files</h3>
- *

If it is known that the filepattern will match a very large number of files (e.g. tens of - * thousands or more), use {@link Read#withHintMatchesManyFiles} for better performance and - * scalability. Note that it may decrease performance if the filepattern matches only a small number - * of files. - * - *
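A sketch of the hint on a concrete read (the pattern is the doc's running example):

    PCollection<AvroAutoGenClass> records =
        p.apply(
            AvroIO.read(AvroAutoGenClass.class)
                .from("gs://my_bucket/path/to/records-*.avro")
                // Routes the read through FileIO.matchAll() plus readFiles()
                // instead of a single file-based source.
                .withHintMatchesManyFiles());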

- * <h3>Inferring Beam schemas from Avro files</h3>
- *

If you want to use SQL or schema based operations on an Avro-based PCollection, you must - * configure the read transform to infer the Beam schema and automatically setup the Beam related - * coders by doing: - * - *

- * <pre>{@code
- * PCollection<AvroAutoGenClass> records =
- *     p.apply(AvroIO.read(...).from(...).withBeamSchemas(true));
- * }</pre>
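With the Beam schema inferred, schema-aware transforms apply directly. A hedged sketch, assuming the Avro schema happens to contain a field named "id":

    PCollection<Row> ids = records.apply(Select.fieldNames("id"));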
- * - *

- * <h3>Inferring Beam schemas from Avro PCollections</h3>
- *

- * <p>If you created an Avro-based PCollection by other means, e.g. reading records from Kafka or as
- * the output of another PTransform, you may be interested in making your PCollection schema-aware
- * so you can use the Schema-based APIs or Beam's SqlTransform.

If you are using Avro specific records (generated classes from an Avro schema), you can - * register a schema provider for the specific Avro class to make any PCollection of these objects - * schema-aware. - * - *

- * <pre>{@code
- * pipeline.getSchemaRegistry().registerSchemaProvider(AvroAutoGenClass.class, AvroAutoGenClass.getClassSchema());
- * }</pre>
- * - * You can also manually set an Avro-backed Schema coder for a PCollection using {@link - * org.apache.beam.sdk.schemas.utils.AvroUtils#schemaCoder(Class, Schema)} to make it schema-aware. - * - *
- * <pre>{@code
- * PCollection<AvroAutoGenClass> records = ...
- * AvroCoder<AvroAutoGenClass> coder = (AvroCoder<AvroAutoGenClass>) records.getCoder();
- * records.setCoder(AvroUtils.schemaCoder(coder.getType(), coder.getSchema()));
- * }</pre>
- * - *

If you are using GenericRecords you may need to set a specific Beam schema coder for each - * PCollection to match their internal Avro schema. - * - *

- * <pre>{@code
- * org.apache.avro.Schema avroSchema = ...
- * PCollection<GenericRecord> records = ...
- * records.setCoder(AvroUtils.schemaCoder(avroSchema));
- * }</pre>
- * - *

- * <h3>Writing Avro files</h3>
- *

- * <p>To write a {@link PCollection} to one or more Avro files, use {@link AvroIO.Write}, using
- * {@code AvroIO.write().to(String)} to specify the output filename prefix. The default {@link
- * DefaultFilenamePolicy} will use this prefix, in conjunction with a {@link ShardNameTemplate} (set
- * via {@link Write#withShardNameTemplate(String)}) and an optional filename suffix (set via {@link
- * Write#withSuffix(String)}) to generate output filenames in a sharded way. You can override this
- * default write filename policy using {@link Write#to(FileBasedSink.FilenamePolicy)} to specify a
- * custom file naming policy.
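A sketch of the default policy's knobs used together (prefix, template, and suffix values are illustrative):

    records.apply(
        AvroIO.write(AvroAutoGenClass.class)
            .to("gs://my_bucket/path/to/events")
            .withShardNameTemplate("-SSS-of-NNN")
            .withSuffix(".avro")
            .withNumShards(10));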

By default, {@link AvroIO.Write} produces output files that are compressed using the {@link - * org.apache.avro.file.Codec CodecFactory.snappyCodec()}. This default can be changed or overridden - * using {@link AvroIO.Write#withCodec}. - * - *
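For example, a minimal sketch that switches to deflate compression (level 9 is an arbitrary choice):

    records.apply(
        AvroIO.write(AvroAutoGenClass.class)
            .to("gs://my_bucket/path/to/records")
            .withSuffix(".avro")
            // Replaces the default CodecFactory.snappyCodec().
            .withCodec(CodecFactory.deflateCodec(9)));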

- * <h3>Writing specific or generic records</h3>
- *

To write specific records, such as Avro-generated classes, use {@link #write(Class)}. To write - * {@link GenericRecord GenericRecords}, use either {@link #writeGenericRecords(Schema)} which takes - * a {@link Schema} object, or {@link #writeGenericRecords(String)} which takes a schema in a - * JSON-encoded string form. An exception will be thrown if a record doesn't match the specified - * schema. - * - *

- * <p>For example:
- *
- * <pre>{@code
- * // A simple Write to a local file (only runs locally):
- * PCollection<AvroAutoGenClass> records = ...;
- * records.apply(AvroIO.write(AvroAutoGenClass.class).to("/path/to/file.avro"));
- *
- * // A Write to a sharded GCS file (runs locally and using remote execution):
- * Schema schema = new Schema.Parser().parse(new File("schema.avsc"));
- * PCollection<GenericRecord> records = ...;
- * records.apply("WriteToAvro", AvroIO.writeGenericRecords(schema)
- *     .to("gs://my_bucket/path/to/numbers")
- *     .withSuffix(".avro"));
- * }</pre>
- * - *

- * <h3>Writing windowed or unbounded data</h3>
- *

- * <p>By default, all input is put into the global window before writing. If per-window writes are
- * desired - for example, when using a streaming runner - {@link AvroIO.Write#withWindowedWrites()}
- * will cause windowing and triggering to be preserved. When producing windowed writes with a
- * streaming runner that supports triggers, the number of output shards must be set explicitly using
- * {@link AvroIO.Write#withNumShards(int)}; some runners may set this for you to a runner-chosen
- * value, so you may not need to set it yourself. A {@link FileBasedSink.FilenamePolicy} must be set,
- * and unique windows and triggers must produce unique filenames.
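A hedged sketch of a windowed write (window size and shard count are illustrative):

    records
        .apply(Window.into(FixedWindows.of(Duration.standardMinutes(5))))
        .apply(
            AvroIO.write(AvroAutoGenClass.class)
                .to("gs://my_bucket/path/to/windowed/events")
                .withWindowedWrites()
                // Streaming runners that support triggers need an explicit shard count.
                .withNumShards(3)
                .withSuffix(".avro"));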

- * <h3>Writing data to multiple destinations</h3>
- *

The following shows a more-complex example of AvroIO.Write usage, generating dynamic file - * destinations as well as a dynamic Avro schema per file. In this example, a PCollection of user - * events (e.g. actions on a website) is written out to Avro files. Each event contains the user id - * as an integer field. We want events for each user to go into a specific directory for that user, - * and each user's data should be written with a specific schema for that user; a side input is - * used, so the schema can be calculated in a different stage. - * - *

- * <pre>{@code
- * // This is the user class that controls dynamic destinations for this avro write. The input to
- * // AvroIO.Write will be UserEvent, and we will be writing GenericRecords to the file (in order
- * // to have dynamic schemas). Everything is per userid, so we define a dynamic destination type
- * // of Integer.
- * class UserDynamicAvroDestinations
- *     extends DynamicAvroDestinations<UserEvent, Integer, GenericRecord> {
- *   private final PCollectionView<Map<Integer, String>> userToSchemaMap;
- *   public UserDynamicAvroDestinations(PCollectionView<Map<Integer, String>> userToSchemaMap) {
- *     this.userToSchemaMap = userToSchemaMap;
- *   }
- *   public GenericRecord formatRecord(UserEvent record) {
- *     return formatUserRecord(record, getSchema(record.getUserId()));
- *   }
- *   public Schema getSchema(Integer userId) {
- *     return new Schema.Parser().parse(sideInput(userToSchemaMap).get(userId));
- *   }
- *   public Integer getDestination(UserEvent record) {
- *     return record.getUserId();
- *   }
- *   public Integer getDefaultDestination() {
- *     return 0;
- *   }
- *   public FilenamePolicy getFilenamePolicy(Integer userId) {
- *     return DefaultFilenamePolicy.fromParams(new Params().withBaseFilename(baseDir + "/user-"
- *     + userId + "/events"));
- *   }
- *   public List<PCollectionView<?>> getSideInputs() {
- *     return ImmutableList.<PCollectionView<?>>of(userToSchemaMap);
- *   }
- * }
- * PCollection<UserEvent> events = ...;
- * PCollectionView<Map<Integer, String>> userToSchemaMap = events.apply(
- *     "ComputePerUserSchemas", new ComputePerUserSchemas());
- * events.apply("WriteAvros", AvroIO.<UserEvent>writeCustomTypeToGenericRecords()
- *     .to(new UserDynamicAvroDestinations(userToSchemaMap)));
- * }</pre>
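When per-record destinations are not needed, the same custom input type can be written with a plain format function. A hedged sketch reusing names from the example above (formatUserRecord and schema are assumed from context):

    events.apply(
        "WriteAvros",
        AvroIO.<UserEvent>writeCustomTypeToGenericRecords()
            .to("gs://my_bucket/path/to/events")
            .withSchema(schema)
            .withFormatFunction(event -> formatUserRecord(event, schema))
            .withSuffix(".avro"));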
- * - * @deprecated Avro related classes are deprecated in module beam-sdks-java-core and - * will be eventually removed. Please, migrate to a new module - * beam-sdks-java-extensions-avro by importing - * org.apache.beam.sdk.extensions.avro.io.AvroIO instead of this one. - */ -@SuppressWarnings({ - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) -@Deprecated -public class AvroIO { - /** - * Reads records of the given type from an Avro file (or multiple Avro files matching a pattern). - * - *

The schema must be specified using one of the {@code withSchema} functions. - */ - public static Read read(Class recordClass) { - return new AutoValue_AvroIO_Read.Builder() - .setMatchConfiguration(MatchConfiguration.create(EmptyMatchTreatment.DISALLOW)) - .setRecordClass(recordClass) - .setSchema(ReflectData.get().getSchema(recordClass)) - .setInferBeamSchema(false) - .setHintMatchesManyFiles(false) - .build(); - } - - /** - * Like {@link #read}, but reads each file in a {@link PCollection} of {@link ReadableFile}, - * returned by {@link FileIO#readMatches}. - * - *

You can read {@link GenericRecord} by using {@code #readFiles(GenericRecord.class)} or - * {@code #readFiles(new Schema.Parser().parse(schema))} if the schema is a String. - */ - public static ReadFiles readFiles(Class recordClass) { - return new AutoValue_AvroIO_ReadFiles.Builder() - .setRecordClass(recordClass) - .setSchema(ReflectData.get().getSchema(recordClass)) - .setInferBeamSchema(false) - .setDesiredBundleSizeBytes(DEFAULT_BUNDLE_SIZE_BYTES) - .setUsesReshuffle(ReadAllViaFileBasedSource.DEFAULT_USES_RESHUFFLE) - .setFileExceptionHandler(new ReadFileRangesFnExceptionHandler()) - .build(); - } - - /** - * Like {@link #read}, but reads each filepattern in the input {@link PCollection}. - * - * @deprecated You can achieve The functionality of {@link #readAll} using {@link FileIO} matching - * plus {@link #readFiles(Class)}. This is the preferred method to make composition explicit. - * {@link ReadAll} will not receive upgrades and will be removed in a future version of Beam. - */ - @Deprecated - public static ReadAll readAll(Class recordClass) { - return new AutoValue_AvroIO_ReadAll.Builder() - .setMatchConfiguration(MatchConfiguration.create(EmptyMatchTreatment.ALLOW_IF_WILDCARD)) - .setRecordClass(recordClass) - .setSchema(ReflectData.get().getSchema(recordClass)) - .setInferBeamSchema(false) - .setDesiredBundleSizeBytes(DEFAULT_BUNDLE_SIZE_BYTES) - .build(); - } - - /** Reads Avro file(s) containing records of the specified schema. */ - public static Read readGenericRecords(Schema schema) { - return new AutoValue_AvroIO_Read.Builder() - .setMatchConfiguration(MatchConfiguration.create(EmptyMatchTreatment.DISALLOW)) - .setRecordClass(GenericRecord.class) - .setSchema(schema) - .setInferBeamSchema(false) - .setHintMatchesManyFiles(false) - .build(); - } - - /** - * Like {@link #readGenericRecords(Schema)}, but for a {@link PCollection} of {@link - * ReadableFile}, for example, returned by {@link FileIO#readMatches}. - */ - public static ReadFiles readFilesGenericRecords(Schema schema) { - return new AutoValue_AvroIO_ReadFiles.Builder() - .setRecordClass(GenericRecord.class) - .setSchema(schema) - .setInferBeamSchema(false) - .setDesiredBundleSizeBytes(DEFAULT_BUNDLE_SIZE_BYTES) - .setUsesReshuffle(ReadAllViaFileBasedSource.DEFAULT_USES_RESHUFFLE) - .setFileExceptionHandler(new ReadFileRangesFnExceptionHandler()) - .build(); - } - - /** - * Like {@link #readGenericRecords(Schema)}, but for a {@link PCollection} of {@link - * ReadableFile}, for example, returned by {@link FileIO#readMatches}. - * - * @deprecated You can achieve The functionality of {@link #readAllGenericRecords(Schema)} using - * {@link FileIO} matching plus {@link #readFilesGenericRecords(Schema)}. This is the - * preferred method to make composition explicit. {@link ReadAll} will not receive upgrades - * and will be removed in a future version of Beam. - */ - @Deprecated - public static ReadAll readAllGenericRecords(Schema schema) { - return new AutoValue_AvroIO_ReadAll.Builder() - .setMatchConfiguration(MatchConfiguration.create(EmptyMatchTreatment.ALLOW_IF_WILDCARD)) - .setRecordClass(GenericRecord.class) - .setSchema(schema) - .setInferBeamSchema(false) - .setDesiredBundleSizeBytes(DEFAULT_BUNDLE_SIZE_BYTES) - .build(); - } - - /** - * Reads Avro file(s) containing records of the specified schema. The schema is specified as a - * JSON-encoded string. 
- */ - public static Read readGenericRecords(String schema) { - return readGenericRecords(new Schema.Parser().parse(schema)); - } - - /** Like {@link #readGenericRecords(String)}, but for {@link ReadableFile} collections. */ - public static ReadFiles readFilesGenericRecords(String schema) { - return readFilesGenericRecords(new Schema.Parser().parse(schema)); - } - - /** - * Like {@link #readGenericRecords(String)}, but reads each filepattern in the input {@link - * PCollection}. - * - * @deprecated You can achieve The functionality of {@link #readAllGenericRecords(String)} using - * {@link FileIO} matching plus {@link #readFilesGenericRecords(String)}. This is the - * preferred method to make composition explicit. {@link ReadAll} will not receive upgrades - * and will be removed in a future version of Beam. - */ - @Deprecated - public static ReadAll readAllGenericRecords(String schema) { - return readAllGenericRecords(new Schema.Parser().parse(schema)); - } - - /** - * Reads Avro file(s) containing records of an unspecified schema and converting each record to a - * custom type. - */ - public static Parse parseGenericRecords(SerializableFunction parseFn) { - return new AutoValue_AvroIO_Parse.Builder() - .setMatchConfiguration(MatchConfiguration.create(EmptyMatchTreatment.DISALLOW)) - .setParseFn(parseFn) - .setHintMatchesManyFiles(false) - .build(); - } - - /** - * Like {@link #parseGenericRecords(SerializableFunction)}, but reads each {@link ReadableFile} in - * the input {@link PCollection}. - */ - public static ParseFiles parseFilesGenericRecords( - SerializableFunction parseFn) { - return new AutoValue_AvroIO_ParseFiles.Builder() - .setParseFn(parseFn) - .setDesiredBundleSizeBytes(DEFAULT_BUNDLE_SIZE_BYTES) - .setUsesReshuffle(ReadAllViaFileBasedSource.DEFAULT_USES_RESHUFFLE) - .setFileExceptionHandler(new ReadFileRangesFnExceptionHandler()) - .build(); - } - - /** - * Like {@link #parseGenericRecords(SerializableFunction)}, but reads each filepattern in the - * input {@link PCollection}. - * - * @deprecated You can achieve The functionality of {@link - * #parseAllGenericRecords(SerializableFunction)} using {@link FileIO} matching plus {@link - * #parseFilesGenericRecords(SerializableFunction)} ()}. This is the preferred method to make - * composition explicit. {@link ParseAll} will not receive upgrades and will be removed in a - * future version of Beam. - */ - @Deprecated - public static ParseAll parseAllGenericRecords( - SerializableFunction parseFn) { - return new AutoValue_AvroIO_ParseAll.Builder() - .setMatchConfiguration(MatchConfiguration.create(EmptyMatchTreatment.ALLOW_IF_WILDCARD)) - .setParseFn(parseFn) - .setDesiredBundleSizeBytes(DEFAULT_BUNDLE_SIZE_BYTES) - .build(); - } - - /** - * Writes a {@link PCollection} to an Avro file (or multiple Avro files matching a sharding - * pattern). - */ - public static Write write(Class recordClass) { - return new Write<>( - AvroIO.defaultWriteBuilder() - .setGenericRecords(false) - .setSchema(ReflectData.get().getSchema(recordClass)) - .build()); - } - - /** Writes Avro records of the specified schema. */ - public static Write writeGenericRecords(Schema schema) { - return new Write<>( - AvroIO.defaultWriteBuilder() - .setGenericRecords(true) - .setSchema(schema) - .build()); - } - - /** - * A {@link PTransform} that writes a {@link PCollection} to an avro file (or multiple avro files - * matching a sharding pattern), with each element of the input collection encoded into its own - * record of type OutputT. - * - *

This version allows you to apply {@link AvroIO} writes to a PCollection of a custom type - * {@link UserT}. A format mechanism that converts the input type {@link UserT} to the output type - * that will be written to the file must be specified. If using a custom {@link - * DynamicAvroDestinations} object this is done using {@link - * DynamicAvroDestinations#formatRecord}, otherwise the {@link - * AvroIO.TypedWrite#withFormatFunction} can be used to specify a format function. - * - *

The advantage of using a custom type is that is it allows a user-provided {@link - * DynamicAvroDestinations} object, set via {@link AvroIO.Write#to(DynamicAvroDestinations)} to - * examine the custom type when choosing a destination. - * - *

If the output type is {@link GenericRecord} use {@link #writeCustomTypeToGenericRecords()} - * instead. - */ - public static TypedWrite writeCustomType() { - return AvroIO.defaultWriteBuilder().setGenericRecords(false).build(); - } - - /** - * Similar to {@link #writeCustomType()}, but specialized for the case where the output type is - * {@link GenericRecord}. A schema must be specified either in {@link - * DynamicAvroDestinations#getSchema} or if not using dynamic destinations, by using {@link - * TypedWrite#withSchema(Schema)}. - */ - public static TypedWrite writeCustomTypeToGenericRecords() { - return AvroIO.defaultWriteBuilder().setGenericRecords(true).build(); - } - - /** - * Writes Avro records of the specified schema. The schema is specified as a JSON-encoded string. - */ - public static Write writeGenericRecords(String schema) { - return writeGenericRecords(new Schema.Parser().parse(schema)); - } - - private static TypedWrite.Builder defaultWriteBuilder() { - return new AutoValue_AvroIO_TypedWrite.Builder() - .setFilenameSuffix(null) - .setShardTemplate(null) - .setNumShards(0) - .setCodec(TypedWrite.DEFAULT_SERIALIZABLE_CODEC) - .setMetadata(ImmutableMap.of()) - .setWindowedWrites(false) - .setNoSpilling(false) - .setSyncInterval(DataFileConstants.DEFAULT_SYNC_INTERVAL); - } - - private static PCollection setBeamSchema( - PCollection pc, Class clazz, @Nullable Schema schema) { - return pc.setCoder(AvroUtils.schemaCoder(clazz, schema)); - } - - /** - * 64MB is a reasonable value that allows to amortize the cost of opening files, but is not so - * large as to exhaust a typical runner's maximum amount of output per ProcessElement call. - */ - private static final long DEFAULT_BUNDLE_SIZE_BYTES = 64 * 1024 * 1024L; - - /** Implementation of {@link #read} and {@link #readGenericRecords}. */ - @AutoValue - public abstract static class Read extends PTransform> { - - abstract @Nullable ValueProvider getFilepattern(); - - abstract MatchConfiguration getMatchConfiguration(); - - abstract @Nullable Class getRecordClass(); - - abstract @Nullable Schema getSchema(); - - abstract boolean getInferBeamSchema(); - - abstract boolean getHintMatchesManyFiles(); - - abstract Builder toBuilder(); - - @AutoValue.Builder - abstract static class Builder { - abstract Builder setFilepattern(ValueProvider filepattern); - - abstract Builder setMatchConfiguration(MatchConfiguration matchConfiguration); - - abstract Builder setRecordClass(Class recordClass); - - abstract Builder setSchema(Schema schema); - - abstract Builder setInferBeamSchema(boolean infer); - - abstract Builder setHintMatchesManyFiles(boolean hintManyFiles); - - abstract Read build(); - } - - /** - * Reads from the given filename or filepattern. - * - *

If it is known that the filepattern will match a very large number of files (at least tens - * of thousands), use {@link #withHintMatchesManyFiles} for better performance and scalability. - */ - public Read from(ValueProvider filepattern) { - return toBuilder().setFilepattern(filepattern).build(); - } - - /** Like {@link #from(ValueProvider)}. */ - public Read from(String filepattern) { - return from(StaticValueProvider.of(filepattern)); - } - - /** Sets the {@link MatchConfiguration}. */ - public Read withMatchConfiguration(MatchConfiguration matchConfiguration) { - return toBuilder().setMatchConfiguration(matchConfiguration).build(); - } - - /** Configures whether or not a filepattern matching no files is allowed. */ - public Read withEmptyMatchTreatment(EmptyMatchTreatment treatment) { - return withMatchConfiguration(getMatchConfiguration().withEmptyMatchTreatment(treatment)); - } - - /** - * Continuously watches for new files matching the filepattern, polling it at the given - * interval, until the given termination condition is reached. The returned {@link PCollection} - * is unbounded. If {@code matchUpdatedFiles} is set, also watches for files with timestamp - * change. - * - *

This works only in runners supporting splittable {@link - * org.apache.beam.sdk.transforms.DoFn}. - */ - public Read watchForNewFiles( - Duration pollInterval, - TerminationCondition terminationCondition, - boolean matchUpdatedFiles) { - return withMatchConfiguration( - getMatchConfiguration() - .continuously(pollInterval, terminationCondition, matchUpdatedFiles)); - } - - /** - * Same as {@link Read#watchForNewFiles(Duration, TerminationCondition, boolean)} with {@code - * matchUpdatedFiles=false}. - */ - public Read watchForNewFiles( - Duration pollInterval, TerminationCondition terminationCondition) { - return watchForNewFiles(pollInterval, terminationCondition, false); - } - - /** - * Hints that the filepattern specified in {@link #from(String)} matches a very large number of - * files. - * - *

This hint may cause a runner to execute the transform differently, in a way that improves - * performance for this case, but it may worsen performance if the filepattern matches only a - * small number of files (e.g., in a runner that supports dynamic work rebalancing, it will - * happen less efficiently within individual files). - */ - public Read withHintMatchesManyFiles() { - return toBuilder().setHintMatchesManyFiles(true).build(); - } - - /** - * If set to true, a Beam schema will be inferred from the AVRO schema. This allows the output - * to be used by SQL and by the schema-transform library. - */ - public Read withBeamSchemas(boolean withBeamSchemas) { - return toBuilder().setInferBeamSchema(withBeamSchemas).build(); - } - - @Override - @SuppressWarnings("unchecked") - public PCollection expand(PBegin input) { - checkNotNull(getFilepattern(), "filepattern"); - checkNotNull(getSchema(), "schema"); - - if (getMatchConfiguration().getWatchInterval() == null && !getHintMatchesManyFiles()) { - PCollection read = - input.apply( - "Read", - org.apache.beam.sdk.io.Read.from( - createSource( - getFilepattern(), - getMatchConfiguration().getEmptyMatchTreatment(), - getRecordClass(), - getSchema(), - null))); - return getInferBeamSchema() ? setBeamSchema(read, getRecordClass(), getSchema()) : read; - } - - // All other cases go through FileIO + ReadFiles - ReadFiles readFiles = - (getRecordClass() == GenericRecord.class) - ? (ReadFiles) readFilesGenericRecords(getSchema()) - : readFiles(getRecordClass()); - return input - .apply("Create filepattern", Create.ofProvider(getFilepattern(), StringUtf8Coder.of())) - .apply("Match All", FileIO.matchAll().withConfiguration(getMatchConfiguration())) - .apply( - "Read Matches", - FileIO.readMatches().withDirectoryTreatment(DirectoryTreatment.PROHIBIT)) - .apply("Via ReadFiles", readFiles); - } - - @Override - public void populateDisplayData(DisplayData.Builder builder) { - super.populateDisplayData(builder); - builder - .add( - DisplayData.item("inferBeamSchema", getInferBeamSchema()) - .withLabel("Infer Beam Schema")) - .addIfNotNull(DisplayData.item("schema", String.valueOf(getSchema()))) - .addIfNotNull(DisplayData.item("recordClass", getRecordClass()).withLabel("Record Class")) - .addIfNotNull( - DisplayData.item("filePattern", getFilepattern()).withLabel("Input File Pattern")) - .include("matchConfiguration", getMatchConfiguration()); - } - - @SuppressWarnings("unchecked") - private static AvroSource createSource( - ValueProvider filepattern, - EmptyMatchTreatment emptyMatchTreatment, - Class recordClass, - Schema schema, - AvroSource.@Nullable DatumReaderFactory readerFactory) { - AvroSource source = - AvroSource.from(filepattern).withEmptyMatchTreatment(emptyMatchTreatment); - - if (readerFactory != null) { - source = source.withDatumReaderFactory(readerFactory); - } - return recordClass == GenericRecord.class - ? (AvroSource) source.withSchema(schema) - : source.withSchema(recordClass); - } - } - - ///////////////////////////////////////////////////////////////////////////// - - /** Implementation of {@link #readFiles}. 
*/ - @AutoValue - public abstract static class ReadFiles - extends PTransform, PCollection> { - - abstract @Nullable Class getRecordClass(); - - abstract @Nullable Schema getSchema(); - - abstract boolean getUsesReshuffle(); - - abstract ReadFileRangesFnExceptionHandler getFileExceptionHandler(); - - abstract long getDesiredBundleSizeBytes(); - - abstract boolean getInferBeamSchema(); - - abstract AvroSource.@Nullable DatumReaderFactory getDatumReaderFactory(); - - abstract Builder toBuilder(); - - @AutoValue.Builder - abstract static class Builder { - abstract Builder setRecordClass(Class recordClass); - - abstract Builder setSchema(Schema schema); - - abstract Builder setUsesReshuffle(boolean usesReshuffle); - - abstract Builder setFileExceptionHandler( - ReadFileRangesFnExceptionHandler exceptionHandler); - - abstract Builder setDesiredBundleSizeBytes(long desiredBundleSizeBytes); - - abstract Builder setInferBeamSchema(boolean infer); - - abstract Builder setDatumReaderFactory(AvroSource.DatumReaderFactory factory); - - abstract ReadFiles build(); - } - - /** - * Set a value for the bundle size for parallel reads. Default is 64 MB. You may want to use a - * lower value (e.g. 1 MB) for streaming applications. - */ - public ReadFiles withDesiredBundleSizeBytes(long desiredBundleSizeBytes) { - return toBuilder().setDesiredBundleSizeBytes(desiredBundleSizeBytes).build(); - } - - /** Specifies if a Reshuffle should run before file reads occur. */ - public ReadFiles withUsesReshuffle(boolean usesReshuffle) { - return toBuilder().setUsesReshuffle(usesReshuffle).build(); - } - - /** Specifies if exceptions should be logged only for streaming pipelines. */ - public ReadFiles withFileExceptionHandler( - ReadFileRangesFnExceptionHandler exceptionHandler) { - return toBuilder().setFileExceptionHandler(exceptionHandler).build(); - } - - /** - * If set to true, a Beam schema will be inferred from the AVRO schema. This allows the output - * to be used by SQL and by the schema-transform library. - */ - public ReadFiles withBeamSchemas(boolean withBeamSchemas) { - return toBuilder().setInferBeamSchema(withBeamSchemas).build(); - } - - public ReadFiles withDatumReaderFactory(AvroSource.DatumReaderFactory factory) { - return toBuilder().setDatumReaderFactory(factory).build(); - } - - @Override - public PCollection expand(PCollection input) { - checkNotNull(getSchema(), "schema"); - PCollection read = - input.apply( - "Read all via FileBasedSource", - new ReadAllViaFileBasedSource<>( - getDesiredBundleSizeBytes(), - new CreateSourceFn<>( - getRecordClass(), getSchema().toString(), getDatumReaderFactory()), - AvroCoder.of(getRecordClass(), getSchema()), - getUsesReshuffle(), - getFileExceptionHandler())); - return getInferBeamSchema() ? setBeamSchema(read, getRecordClass(), getSchema()) : read; - } - - @Override - public void populateDisplayData(DisplayData.Builder builder) { - super.populateDisplayData(builder); - builder - .add( - DisplayData.item("inferBeamSchema", getInferBeamSchema()) - .withLabel("Infer Beam Schema")) - .addIfNotNull(DisplayData.item("schema", String.valueOf(getSchema()))) - .addIfNotNull( - DisplayData.item("recordClass", getRecordClass()).withLabel("Record Class")); - } - } - - ///////////////////////////////////////////////////////////////////////////// - - /** - * Implementation of {@link #readAll}. - * - * @deprecated See {@link #readAll(Class)} for details. 
- */ - @Deprecated - @AutoValue - public abstract static class ReadAll extends PTransform, PCollection> { - abstract MatchConfiguration getMatchConfiguration(); - - abstract @Nullable Class getRecordClass(); - - abstract @Nullable Schema getSchema(); - - abstract long getDesiredBundleSizeBytes(); - - abstract boolean getInferBeamSchema(); - - abstract Builder toBuilder(); - - @AutoValue.Builder - abstract static class Builder { - abstract Builder setMatchConfiguration(MatchConfiguration matchConfiguration); - - abstract Builder setRecordClass(Class recordClass); - - abstract Builder setSchema(Schema schema); - - abstract Builder setDesiredBundleSizeBytes(long desiredBundleSizeBytes); - - abstract Builder setInferBeamSchema(boolean infer); - - abstract ReadAll build(); - } - - /** Sets the {@link MatchConfiguration}. */ - public ReadAll withMatchConfiguration(MatchConfiguration configuration) { - return toBuilder().setMatchConfiguration(configuration).build(); - } - - /** Like {@link Read#withEmptyMatchTreatment}. */ - public ReadAll withEmptyMatchTreatment(EmptyMatchTreatment treatment) { - return withMatchConfiguration(getMatchConfiguration().withEmptyMatchTreatment(treatment)); - } - - /** Like {@link Read#watchForNewFiles}. */ - public ReadAll watchForNewFiles( - Duration pollInterval, TerminationCondition terminationCondition) { - return withMatchConfiguration( - getMatchConfiguration().continuously(pollInterval, terminationCondition)); - } - - /** - * Set a value for the bundle size for parallel reads. Default is 64 MB. You may want to use a - * lower value (e.g. 1 MB) for streaming applications. - */ - public ReadAll withDesiredBundleSizeBytes(long desiredBundleSizeBytes) { - return toBuilder().setDesiredBundleSizeBytes(desiredBundleSizeBytes).build(); - } - - /** - * If set to true, a Beam schema will be inferred from the AVRO schema. This allows the output - * to be used by SQL and by the schema-transform library. - */ - public ReadAll withBeamSchemas(boolean withBeamSchemas) { - return toBuilder().setInferBeamSchema(withBeamSchemas).build(); - } - - @Override - public PCollection expand(PCollection input) { - checkNotNull(getSchema(), "schema"); - PCollection read = - input - .apply(FileIO.matchAll().withConfiguration(getMatchConfiguration())) - .apply(FileIO.readMatches().withDirectoryTreatment(DirectoryTreatment.PROHIBIT)) - .apply(readFiles(getRecordClass())); - return getInferBeamSchema() ? 
setBeamSchema(read, getRecordClass(), getSchema()) : read; - } - - @Override - public void populateDisplayData(DisplayData.Builder builder) { - super.populateDisplayData(builder); - builder - .add( - DisplayData.item("inferBeamSchema", getInferBeamSchema()) - .withLabel("Infer Beam Schema")) - .addIfNotNull(DisplayData.item("schema", String.valueOf(getSchema()))) - .addIfNotNull(DisplayData.item("recordClass", getRecordClass()).withLabel("Record Class")) - .include("matchConfiguration", getMatchConfiguration()); - } - } - - private static class CreateSourceFn - implements SerializableFunction> { - private final Class recordClass; - private final Supplier schemaSupplier; - private final AvroSource.DatumReaderFactory readerFactory; - - CreateSourceFn( - Class recordClass, String jsonSchema, AvroSource.DatumReaderFactory readerFactory) { - this.recordClass = recordClass; - this.schemaSupplier = - Suppliers.memoize( - Suppliers.compose(new JsonToSchema(), Suppliers.ofInstance(jsonSchema))); - this.readerFactory = readerFactory; - } - - @Override - public FileBasedSource apply(String input) { - return Read.createSource( - StaticValueProvider.of(input), - EmptyMatchTreatment.DISALLOW, - recordClass, - schemaSupplier.get(), - readerFactory); - } - - private static class JsonToSchema implements Function, Serializable { - @Override - public Schema apply(String input) { - return new Schema.Parser().parse(input); - } - } - } - - ///////////////////////////////////////////////////////////////////////////// - - /** Implementation of {@link #parseGenericRecords}. */ - @AutoValue - public abstract static class Parse extends PTransform> { - - abstract @Nullable ValueProvider getFilepattern(); - - abstract MatchConfiguration getMatchConfiguration(); - - abstract SerializableFunction getParseFn(); - - abstract @Nullable Coder getCoder(); - - abstract boolean getHintMatchesManyFiles(); - - abstract Builder toBuilder(); - - @AutoValue.Builder - abstract static class Builder { - abstract Builder setFilepattern(ValueProvider filepattern); - - abstract Builder setMatchConfiguration(MatchConfiguration matchConfiguration); - - abstract Builder setParseFn(SerializableFunction parseFn); - - abstract Builder setCoder(Coder coder); - - abstract Builder setHintMatchesManyFiles(boolean hintMatchesManyFiles); - - abstract Parse build(); - } - - /** Reads from the given filename or filepattern. */ - public Parse from(String filepattern) { - return from(StaticValueProvider.of(filepattern)); - } - - /** Like {@link #from(String)}. */ - public Parse from(ValueProvider filepattern) { - return toBuilder().setFilepattern(filepattern).build(); - } - - /** Sets the {@link MatchConfiguration}. */ - public Parse withMatchConfiguration(MatchConfiguration configuration) { - return toBuilder().setMatchConfiguration(configuration).build(); - } - - /** Like {@link Read#withEmptyMatchTreatment}. */ - public Parse withEmptyMatchTreatment(EmptyMatchTreatment treatment) { - return withMatchConfiguration(getMatchConfiguration().withEmptyMatchTreatment(treatment)); - } - - /** Like {@link Read#watchForNewFiles}. */ - public Parse watchForNewFiles( - Duration pollInterval, TerminationCondition terminationCondition) { - return withMatchConfiguration( - getMatchConfiguration().continuously(pollInterval, terminationCondition)); - } - - /** Sets a coder for the result of the parse function. */ - public Parse withCoder(Coder coder) { - return toBuilder().setCoder(coder).build(); - } - - /** Like {@link Read#withHintMatchesManyFiles()}. 
*/ - public Parse withHintMatchesManyFiles() { - return toBuilder().setHintMatchesManyFiles(true).build(); - } - - @Override - public PCollection expand(PBegin input) { - checkNotNull(getFilepattern(), "filepattern"); - Coder coder = inferCoder(getCoder(), getParseFn(), input.getPipeline().getCoderRegistry()); - - if (getMatchConfiguration().getWatchInterval() == null && !getHintMatchesManyFiles()) { - return input.apply( - org.apache.beam.sdk.io.Read.from( - AvroSource.from(getFilepattern()).withParseFn(getParseFn(), coder))); - } - - // All other cases go through FileIO + ParseFilesGenericRecords. - return input - .apply("Create filepattern", Create.ofProvider(getFilepattern(), StringUtf8Coder.of())) - .apply("Match All", FileIO.matchAll().withConfiguration(getMatchConfiguration())) - .apply( - "Read Matches", - FileIO.readMatches().withDirectoryTreatment(DirectoryTreatment.PROHIBIT)) - .apply("Via ParseFiles", parseFilesGenericRecords(getParseFn()).withCoder(coder)); - } - - private static Coder inferCoder( - @Nullable Coder explicitCoder, - SerializableFunction parseFn, - CoderRegistry coderRegistry) { - if (explicitCoder != null) { - return explicitCoder; - } - // If a coder was not specified explicitly, infer it from parse fn. - try { - return coderRegistry.getCoder(TypeDescriptors.outputOf(parseFn)); - } catch (CannotProvideCoderException e) { - throw new IllegalArgumentException( - "Unable to infer coder for output of parseFn. Specify it explicitly using withCoder().", - e); - } - } - - @Override - public void populateDisplayData(DisplayData.Builder builder) { - super.populateDisplayData(builder); - builder - .addIfNotNull( - DisplayData.item("filePattern", getFilepattern()).withLabel("Input File Pattern")) - .add(DisplayData.item("parseFn", getParseFn().getClass()).withLabel("Parse function")) - .include("matchConfiguration", getMatchConfiguration()); - } - } - - ///////////////////////////////////////////////////////////////////////////// - - /** Implementation of {@link #parseFilesGenericRecords}. */ - @AutoValue - public abstract static class ParseFiles - extends PTransform, PCollection> { - abstract SerializableFunction getParseFn(); - - abstract @Nullable Coder getCoder(); - - abstract boolean getUsesReshuffle(); - - abstract ReadFileRangesFnExceptionHandler getFileExceptionHandler(); - - abstract long getDesiredBundleSizeBytes(); - - abstract Builder toBuilder(); - - @AutoValue.Builder - abstract static class Builder { - abstract Builder setParseFn(SerializableFunction parseFn); - - abstract Builder setCoder(Coder coder); - - abstract Builder setUsesReshuffle(boolean usesReshuffle); - - abstract Builder setFileExceptionHandler( - ReadFileRangesFnExceptionHandler exceptionHandler); - - abstract Builder setDesiredBundleSizeBytes(long desiredBundleSizeBytes); - - abstract ParseFiles build(); - } - - /** Specifies the coder for the result of the {@code parseFn}. */ - public ParseFiles withCoder(Coder coder) { - return toBuilder().setCoder(coder).build(); - } - - /** Specifies if a Reshuffle should run before file reads occur. */ - public ParseFiles withUsesReshuffle(boolean usesReshuffle) { - return toBuilder().setUsesReshuffle(usesReshuffle).build(); - } - - /** Specifies if exceptions should be logged only for streaming pipelines. */ - public ParseFiles withFileExceptionHandler( - ReadFileRangesFnExceptionHandler exceptionHandler) { - return toBuilder().setFileExceptionHandler(exceptionHandler).build(); - } - - /** - * Set a value for the bundle size for parallel reads. 
Default is 64 MB. You may want to use a - * lower value (e.g. 1 MB) for streaming applications. - */ - public ParseFiles withDesiredBundleSizeBytes(long desiredBundleSizeBytes) { - return toBuilder().setDesiredBundleSizeBytes(desiredBundleSizeBytes).build(); - } - - @Override - public PCollection expand(PCollection input) { - final Coder coder = - Parse.inferCoder(getCoder(), getParseFn(), input.getPipeline().getCoderRegistry()); - final SerializableFunction parseFn = getParseFn(); - final SerializableFunction> createSource = - new CreateParseSourceFn<>(parseFn, coder); - return input.apply( - "Parse Files via FileBasedSource", - new ReadAllViaFileBasedSource<>( - getDesiredBundleSizeBytes(), - createSource, - coder, - getUsesReshuffle(), - getFileExceptionHandler())); - } - - @Override - public void populateDisplayData(DisplayData.Builder builder) { - super.populateDisplayData(builder); - builder.add(DisplayData.item("parseFn", getParseFn().getClass()).withLabel("Parse function")); - } - - private static class CreateParseSourceFn - implements SerializableFunction> { - private final SerializableFunction parseFn; - private final Coder coder; - - CreateParseSourceFn(SerializableFunction parseFn, Coder coder) { - this.parseFn = parseFn; - this.coder = coder; - } - - @Override - public FileBasedSource apply(String input) { - return AvroSource.from(input).withParseFn(parseFn, coder); - } - } - } - - ///////////////////////////////////////////////////////////////////////////// - - /** - * Implementation of {@link #parseAllGenericRecords}. - * - * @deprecated See {@link #parseAllGenericRecords(SerializableFunction)} for details. - */ - @Deprecated - @AutoValue - public abstract static class ParseAll extends PTransform, PCollection> { - abstract MatchConfiguration getMatchConfiguration(); - - abstract SerializableFunction getParseFn(); - - abstract @Nullable Coder getCoder(); - - abstract long getDesiredBundleSizeBytes(); - - abstract Builder toBuilder(); - - @AutoValue.Builder - abstract static class Builder { - abstract Builder setMatchConfiguration(MatchConfiguration matchConfiguration); - - abstract Builder setParseFn(SerializableFunction parseFn); - - abstract Builder setCoder(Coder coder); - - abstract Builder setDesiredBundleSizeBytes(long desiredBundleSizeBytes); - - abstract ParseAll build(); - } - - /** Sets the {@link MatchConfiguration}. */ - public ParseAll withMatchConfiguration(MatchConfiguration configuration) { - return toBuilder().setMatchConfiguration(configuration).build(); - } - - /** Like {@link Read#withEmptyMatchTreatment}. */ - public ParseAll withEmptyMatchTreatment(EmptyMatchTreatment treatment) { - return withMatchConfiguration(getMatchConfiguration().withEmptyMatchTreatment(treatment)); - } - - /** Like {@link Read#watchForNewFiles(Duration, TerminationCondition, boolean)}. */ - public ParseAll watchForNewFiles( - Duration pollInterval, - TerminationCondition terminationCondition, - boolean matchUpdatedFiles) { - return withMatchConfiguration( - getMatchConfiguration() - .continuously(pollInterval, terminationCondition, matchUpdatedFiles)); - } - - /** Like {@link Read#watchForNewFiles(Duration, TerminationCondition)}. */ - public ParseAll watchForNewFiles( - Duration pollInterval, TerminationCondition terminationCondition) { - return watchForNewFiles(pollInterval, terminationCondition, false); - } - - /** Specifies the coder for the result of the {@code parseFn}. 
*/ - public ParseAll withCoder(Coder coder) { - return toBuilder().setCoder(coder).build(); - } - - /** - * Set a value for the bundle size for parallel reads. Default is 64 MB. You may want to use a - * lower value (e.g. 1 MB) for streaming applications. - */ - public ParseAll withDesiredBundleSizeBytes(long desiredBundleSizeBytes) { - return toBuilder().setDesiredBundleSizeBytes(desiredBundleSizeBytes).build(); - } - - @Override - public PCollection expand(PCollection input) { - return input - .apply(FileIO.matchAll().withConfiguration(getMatchConfiguration())) - .apply(FileIO.readMatches().withDirectoryTreatment(DirectoryTreatment.PROHIBIT)) - .apply( - "Parse all via FileBasedSource", - parseFilesGenericRecords(getParseFn()).withCoder(getCoder())); - } - - @Override - public void populateDisplayData(DisplayData.Builder builder) { - super.populateDisplayData(builder); - builder - .add(DisplayData.item("parseFn", getParseFn().getClass()).withLabel("Parse function")) - .include("matchConfiguration", getMatchConfiguration()); - } - } - - ///////////////////////////////////////////////////////////////////////////// - - /** Implementation of {@link #write}. */ - @AutoValue - public abstract static class TypedWrite - extends PTransform, WriteFilesResult> { - static final CodecFactory DEFAULT_CODEC = CodecFactory.snappyCodec(); - static final SerializableAvroCodecFactory DEFAULT_SERIALIZABLE_CODEC = - new SerializableAvroCodecFactory(DEFAULT_CODEC); - - abstract @Nullable SerializableFunction getFormatFunction(); - - abstract @Nullable ValueProvider getFilenamePrefix(); - - abstract @Nullable String getShardTemplate(); - - abstract @Nullable String getFilenameSuffix(); - - abstract @Nullable ValueProvider getTempDirectory(); - - abstract int getNumShards(); - - abstract boolean getGenericRecords(); - - abstract int getSyncInterval(); - - abstract @Nullable Schema getSchema(); - - abstract boolean getWindowedWrites(); - - abstract boolean getNoSpilling(); - - abstract @Nullable FilenamePolicy getFilenamePolicy(); - - abstract @Nullable DynamicAvroDestinations - getDynamicDestinations(); - - abstract AvroSink.@Nullable DatumWriterFactory getDatumWriterFactory(); - - /** - * The codec used to encode the blocks in the Avro file. String value drawn from those in - * https://avro.apache.org/docs/1.7.7/api/java/org/apache/avro/file/CodecFactory.html - */ - abstract SerializableAvroCodecFactory getCodec(); - /** Avro file metadata. 
*/ - abstract ImmutableMap getMetadata(); - - abstract Builder toBuilder(); - - @AutoValue.Builder - abstract static class Builder { - abstract Builder setFormatFunction( - @Nullable SerializableFunction formatFunction); - - abstract Builder setFilenamePrefix( - ValueProvider filenamePrefix); - - abstract Builder setFilenameSuffix( - @Nullable String filenameSuffix); - - abstract Builder setTempDirectory( - ValueProvider tempDirectory); - - abstract Builder setNumShards(int numShards); - - abstract Builder setShardTemplate( - @Nullable String shardTemplate); - - abstract Builder setGenericRecords(boolean genericRecords); - - abstract Builder setSyncInterval(int syncInterval); - - abstract Builder setSchema(Schema schema); - - abstract Builder setWindowedWrites(boolean windowedWrites); - - abstract Builder setNoSpilling(boolean noSpilling); - - abstract Builder setFilenamePolicy( - FilenamePolicy filenamePolicy); - - abstract Builder setCodec(SerializableAvroCodecFactory codec); - - abstract Builder setMetadata( - ImmutableMap metadata); - - abstract Builder setDynamicDestinations( - DynamicAvroDestinations dynamicDestinations); - - abstract Builder setDatumWriterFactory( - AvroSink.DatumWriterFactory datumWriterFactory); - - abstract TypedWrite build(); - } - - /** - * Writes to file(s) with the given output prefix. See {@link FileSystems} for information on - * supported file systems. - * - *

The name of the output files will be determined by the {@link FilenamePolicy} used. - * - *
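 * <p>A minimal usage sketch (illustrative only; {@code schema} and the input {@code records}
 * collection are assumed, not defined in this file):
 * <pre>{@code
 * PCollection<GenericRecord> records = ...; // assumed input
 * records.apply(
 *     AvroIO.writeGenericRecords(schema) // schema: assumed Avro schema
 *         .to("/path/to/output")
 *         .withSuffix(".avro"));
 * }</pre>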

By default, a {@link DefaultFilenamePolicy} will build output filenames using the - * specified prefix, a shard name template (see {@link #withShardNameTemplate(String)}), and a - * common suffix (if supplied using {@link #withSuffix(String)}). This default can be overridden - * using {@link #to(FilenamePolicy)}. - */ - public TypedWrite to(String outputPrefix) { - return to(FileBasedSink.convertToFileResourceIfPossible(outputPrefix)); - } - - /** - * Writes to file(s) with the given output prefix. See {@link FileSystems} for information on - * supported file systems. This prefix is used by the {@link DefaultFilenamePolicy} to generate - * filenames. - * - *

By default, a {@link DefaultFilenamePolicy} will build output filenames using the - * specified prefix, a shard name template (see {@link #withShardNameTemplate(String)}), and a - * common suffix (if supplied using {@link #withSuffix(String)}). - * - *

This default policy can be overridden using {@link #to(FilenamePolicy)}, in which case - * {@link #withShardNameTemplate(String)} and {@link #withSuffix(String)} should not be set. - * Custom filename policies do not automatically see this prefix - you should explicitly pass - * the prefix into your {@link FilenamePolicy} object if you need this. - * - *

If {@link #withTempDirectory} has not been called, this filename prefix will be used to - * infer a directory for temporary files. - */ - public TypedWrite to(ResourceId outputPrefix) { - return toResource(StaticValueProvider.of(outputPrefix)); - } - - private static class OutputPrefixToResourceId - implements SerializableFunction { - @Override - public ResourceId apply(String input) { - return FileBasedSink.convertToFileResourceIfPossible(input); - } - } - - /** Like {@link #to(String)}. */ - public TypedWrite to(ValueProvider outputPrefix) { - return toResource( - NestedValueProvider.of( - outputPrefix, - // The function cannot be created as an anonymous class here since the enclosed class - // may contain unserializable members. - new OutputPrefixToResourceId())); - } - - /** Like {@link #to(ResourceId)}. */ - public TypedWrite toResource( - ValueProvider outputPrefix) { - return toBuilder().setFilenamePrefix(outputPrefix).build(); - } - - /** - * Writes to files named according to the given {@link FileBasedSink.FilenamePolicy}. A - * directory for temporary files must be specified using {@link #withTempDirectory}. - */ - public TypedWrite to(FilenamePolicy filenamePolicy) { - return toBuilder().setFilenamePolicy(filenamePolicy).build(); - } - - /** - * Use a {@link DynamicAvroDestinations} object to vend {@link FilenamePolicy} objects. These - * objects can examine the input record when creating a {@link FilenamePolicy}. A directory for - * temporary files must be specified using {@link #withTempDirectory}. - * - * @deprecated Use {@link FileIO#write()} or {@link FileIO#writeDynamic()} instead. - */ - @Deprecated - public TypedWrite to( - DynamicAvroDestinations dynamicDestinations) { - return toBuilder() - .setDynamicDestinations((DynamicAvroDestinations) dynamicDestinations) - .build(); - } - - /** - * Sets the approximate number of uncompressed bytes to write in each block for the AVRO - * container format. - */ - public TypedWrite withSyncInterval(int syncInterval) { - return toBuilder().setSyncInterval(syncInterval).build(); - } - - /** - * Sets the output schema. Can only be used when the output type is {@link GenericRecord} and - * when not using {@link #to(DynamicAvroDestinations)}. - */ - public TypedWrite withSchema(Schema schema) { - return toBuilder().setSchema(schema).build(); - } - - /** - * Specifies a format function to convert {@link UserT} to the output type. If {@link - * #to(DynamicAvroDestinations)} is used, {@link DynamicAvroDestinations#formatRecord} must be - * used instead. - */ - public TypedWrite withFormatFunction( - @Nullable SerializableFunction formatFunction) { - return toBuilder().setFormatFunction(formatFunction).build(); - } - - /** Set the base directory used to generate temporary files. */ - public TypedWrite withTempDirectory( - ValueProvider tempDirectory) { - return toBuilder().setTempDirectory(tempDirectory).build(); - } - - /** Set the base directory used to generate temporary files. */ - public TypedWrite withTempDirectory(ResourceId tempDirectory) { - return withTempDirectory(StaticValueProvider.of(tempDirectory)); - } - - /** - * Uses the given {@link ShardNameTemplate} for naming output files. This option may only be - * used when using one of the default filename-prefix to() overrides. - * - *

See {@link DefaultFilenamePolicy} for how the prefix, shard name template, and suffix are - * used. - */ - public TypedWrite withShardNameTemplate(String shardTemplate) { - return toBuilder().setShardTemplate(shardTemplate).build(); - } - - /** - * Configures the filename suffix for written files. This option may only be used when using one - * of the default filename-prefix to() overrides. - * - *

See {@link DefaultFilenamePolicy} for how the prefix, shard name template, and suffix are - * used. - */ - public TypedWrite withSuffix(String filenameSuffix) { - return toBuilder().setFilenameSuffix(filenameSuffix).build(); - } - - /** - * Configures the number of output shards produced overall (when using unwindowed writes) or - * per-window (when using windowed writes). - * - *

For unwindowed writes, constraining the number of shards is likely to reduce the - * performance of a pipeline. Setting this value is not recommended unless you require a - * specific number of output files. - * - * @param numShards the number of shards to use, or 0 to let the system decide. - */ - public TypedWrite withNumShards(int numShards) { - checkArgument(numShards >= 0); - return toBuilder().setNumShards(numShards).build(); - } - - /** - * Forces a single file as output and empty shard name template. This option is only compatible - * with unwindowed writes. - * - *

For unwindowed writes, constraining the number of shards is likely to reduce the - * performance of a pipeline. Setting this value is not recommended unless you require a - * specific number of output files. - * - *

This is equivalent to {@code .withNumShards(1).withShardNameTemplate("")}. - */ - public TypedWrite withoutSharding() { - return withNumShards(1).withShardNameTemplate(""); - } - - /** - * Preserves windowing of input elements and writes them to files based on the element's window. - * - *
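 * <p>An illustrative sketch of a windowed write ({@code records}, {@code schema}, and the
 * one-minute window size are assumed, not taken from this file):
 * <pre>{@code
 * records
 *     .apply(Window.<GenericRecord>into(FixedWindows.of(Duration.standardMinutes(1))))
 *     .apply(
 *         AvroIO.writeGenericRecords(schema) // schema: assumed
 *             .to("/path/to/output")
 *             .withWindowedWrites()
 *             .withNumShards(1));
 * }</pre>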

If using {@link #to(FileBasedSink.FilenamePolicy)}, filenames will be generated using - * {@link FilenamePolicy#windowedFilename}. See also {@link WriteFiles#withWindowedWrites()}. - */ - public TypedWrite withWindowedWrites() { - return toBuilder().setWindowedWrites(true).build(); - } - - /** See {@link WriteFiles#withNoSpilling()}. */ - public TypedWrite withNoSpilling() { - return toBuilder().setNoSpilling(true).build(); - } - - /** Writes to Avro file(s) compressed using the specified codec. */ - public TypedWrite withCodec(CodecFactory codec) { - return toBuilder().setCodec(new SerializableAvroCodecFactory(codec)).build(); - } - - /** - * Specifies an {@link AvroSink.DatumWriterFactory} to use for creating {@link - * org.apache.avro.io.DatumWriter} instances. - */ - public TypedWrite withDatumWriterFactory( - AvroSink.DatumWriterFactory datumWriterFactory) { - return toBuilder().setDatumWriterFactory(datumWriterFactory).build(); - } - - /** - * Writes to Avro file(s) with the specified metadata. - * - *
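 * <p>For example (illustrative only; {@code write} stands for an {@code AvroIO.Write} under
 * construction, and the keys and values are made up):
 * <pre>{@code
 * write.withMetadata(ImmutableMap.<String, Object>of("origin", "example", "rowCount", 10L));
 * }</pre>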

Supported value types are String, Long, and byte[]. - */ - public TypedWrite withMetadata(Map metadata) { - Map badKeys = Maps.newLinkedHashMap(); - for (Map.Entry entry : metadata.entrySet()) { - Object v = entry.getValue(); - if (!(v instanceof String || v instanceof Long || v instanceof byte[])) { - badKeys.put(entry.getKey(), v.getClass().getSimpleName()); - } - } - checkArgument( - badKeys.isEmpty(), - "Metadata value type must be one of String, Long, or byte[]. Found %s", - badKeys); - return toBuilder().setMetadata(ImmutableMap.copyOf(metadata)).build(); - } - - DynamicAvroDestinations resolveDynamicDestinations() { - DynamicAvroDestinations dynamicDestinations = - getDynamicDestinations(); - if (dynamicDestinations == null) { - // In this case DestinationT is Void. - FilenamePolicy usedFilenamePolicy = getFilenamePolicy(); - if (usedFilenamePolicy == null) { - usedFilenamePolicy = - DefaultFilenamePolicy.fromStandardParameters( - getFilenamePrefix(), - getShardTemplate(), - getFilenameSuffix(), - getWindowedWrites()); - } - dynamicDestinations = - (DynamicAvroDestinations) - constantDestinations( - usedFilenamePolicy, - getSchema(), - getMetadata(), - getCodec().getCodec(), - getFormatFunction(), - getDatumWriterFactory()); - } - return dynamicDestinations; - } - - @Override - public WriteFilesResult expand(PCollection input) { - checkArgument( - getFilenamePrefix() != null || getTempDirectory() != null, - "Need to set either the filename prefix or the tempDirectory of a AvroIO.Write " - + "transform."); - if (getFilenamePolicy() != null) { - checkArgument( - getShardTemplate() == null && getFilenameSuffix() == null, - "shardTemplate and filenameSuffix should only be used with the default " - + "filename policy"); - } - if (getDynamicDestinations() != null) { - checkArgument( - getFormatFunction() == null, - "A format function should not be specified " - + "with DynamicDestinations. Use DynamicDestinations.formatRecord instead"); - } else { - checkArgument( - getSchema() != null, "Unless using DynamicDestinations, .withSchema() is required."); - } - - ValueProvider tempDirectory = getTempDirectory(); - if (tempDirectory == null) { - tempDirectory = getFilenamePrefix(); - } - WriteFiles write = - WriteFiles.to( - new AvroSink<>( - tempDirectory, - resolveDynamicDestinations(), - getGenericRecords(), - getSyncInterval())); - if (getNumShards() > 0) { - write = write.withNumShards(getNumShards()); - } - if (getWindowedWrites()) { - write = write.withWindowedWrites(); - } - if (getNoSpilling()) { - write = write.withNoSpilling(); - } - return input.apply("Write", write); - } - - @Override - public void populateDisplayData(DisplayData.Builder builder) { - super.populateDisplayData(builder); - resolveDynamicDestinations().populateDisplayData(builder); - builder - .addIfNotDefault( - DisplayData.item("numShards", getNumShards()).withLabel("Maximum Output Shards"), 0) - .addIfNotNull( - DisplayData.item("tempDirectory", getTempDirectory()) - .withLabel("Directory for temporary files")); - } - } - - /** - * This class is used as the default return value of {@link AvroIO#write} - * - *
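 * <p>A minimal sketch, assuming a hypothetical Avro-compatible class {@code MyRecord}:
 * <pre>{@code
 * PCollection<MyRecord> output = ...; // assumed input
 * output.apply(AvroIO.write(MyRecord.class).to("/path/to/file").withSuffix(".avro"));
 * }</pre>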

All methods in this class delegate to the appropriate method of {@link AvroIO.TypedWrite}. - * This class exists for backwards compatibility, and will be removed in Beam 3.0. - */ - public static class Write extends PTransform, PDone> { - @VisibleForTesting final TypedWrite inner; - - Write(TypedWrite inner) { - this.inner = inner; - } - - /** See {@link TypedWrite#to(String)}. */ - public Write to(String outputPrefix) { - return new Write<>( - inner - .to(FileBasedSink.convertToFileResourceIfPossible(outputPrefix)) - .withFormatFunction(SerializableFunctions.identity())); - } - - /** See {@link TypedWrite#to(ResourceId)} . */ - public Write to(ResourceId outputPrefix) { - return new Write<>( - inner.to(outputPrefix).withFormatFunction(SerializableFunctions.identity())); - } - - /** See {@link TypedWrite#to(ValueProvider)}. */ - public Write to(ValueProvider outputPrefix) { - return new Write<>( - inner.to(outputPrefix).withFormatFunction(SerializableFunctions.identity())); - } - - /** See {@link TypedWrite#to(ResourceId)}. */ - public Write toResource(ValueProvider outputPrefix) { - return new Write<>( - inner.toResource(outputPrefix).withFormatFunction(SerializableFunctions.identity())); - } - - /** See {@link TypedWrite#to(FilenamePolicy)}. */ - public Write to(FilenamePolicy filenamePolicy) { - return new Write<>( - inner.to(filenamePolicy).withFormatFunction(SerializableFunctions.identity())); - } - - /** - * See {@link TypedWrite#to(DynamicAvroDestinations)}. - * - * @deprecated Use {@link FileIO#write()} or {@link FileIO#writeDynamic()} instead. - */ - @Deprecated - public Write to(DynamicAvroDestinations dynamicDestinations) { - return new Write<>(inner.to(dynamicDestinations).withFormatFunction(null)); - } - - /** See {@link TypedWrite#withSyncInterval}. */ - public Write withSyncInterval(int syncInterval) { - return new Write<>(inner.withSyncInterval(syncInterval)); - } - - /** See {@link TypedWrite#withSchema}. */ - public Write withSchema(Schema schema) { - return new Write<>(inner.withSchema(schema)); - } - - /** See {@link TypedWrite#withTempDirectory(ValueProvider)}. */ - public Write withTempDirectory(ValueProvider tempDirectory) { - return new Write<>(inner.withTempDirectory(tempDirectory)); - } - - /** See {@link TypedWrite#withTempDirectory(ResourceId)}. */ - public Write withTempDirectory(ResourceId tempDirectory) { - return new Write<>(inner.withTempDirectory(tempDirectory)); - } - - /** See {@link TypedWrite#withShardNameTemplate}. */ - public Write withShardNameTemplate(String shardTemplate) { - return new Write<>(inner.withShardNameTemplate(shardTemplate)); - } - - /** See {@link TypedWrite#withSuffix}. */ - public Write withSuffix(String filenameSuffix) { - return new Write<>(inner.withSuffix(filenameSuffix)); - } - - /** See {@link TypedWrite#withNumShards}. */ - public Write withNumShards(int numShards) { - return new Write<>(inner.withNumShards(numShards)); - } - - /** See {@link TypedWrite#withoutSharding}. */ - public Write withoutSharding() { - return new Write<>(inner.withoutSharding()); - } - - /** See {@link TypedWrite#withWindowedWrites}. */ - public Write withWindowedWrites() { - return new Write<>(inner.withWindowedWrites()); - } - - /** See {@link TypedWrite#withCodec}. */ - public Write withCodec(CodecFactory codec) { - return new Write<>(inner.withCodec(codec)); - } - - /** See {@link TypedWrite#withDatumWriterFactory}. 
*/ - public Write withDatumWriterFactory(AvroSink.DatumWriterFactory datumWriterFactory) { - return new Write<>(inner.withDatumWriterFactory(datumWriterFactory)); - } - - /** - * Specifies that output filenames should be returned by the transform. - * - *

The nested {@link TypedWrite} transform always has access to output filenames; however, due - * to backwards-compatibility concerns, {@link Write} cannot return them. This method simply - * returns the inner {@link TypedWrite} transform, which has {@link WriteFilesResult} as its - * output type, allowing access to output files. - * - *
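 * <p>A sketch of retrieving the filenames ({@code records} and {@code schema} are assumed,
 * not defined here):
 * <pre>{@code
 * WriteFilesResult<Void> result =
 *     records.apply(
 *         AvroIO.writeGenericRecords(schema) // schema: assumed
 *             .to("/path/to/output")
 *             .withOutputFilenames());
 * PCollection<KV<Void, String>> filenames = result.getPerDestinationOutputFilenames();
 * }</pre>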

The supplied {@code DestinationT} type must be: the same as that supplied in {@link - * #to(DynamicAvroDestinations)} if that method was used, or {@code Void} otherwise. - */ - public TypedWrite withOutputFilenames() { - return (TypedWrite) inner; - } - - /** See {@link TypedWrite#withMetadata} . */ - public Write withMetadata(Map metadata) { - return new Write<>(inner.withMetadata(metadata)); - } - - @Override - public PDone expand(PCollection input) { - input.apply(inner); - return PDone.in(input.getPipeline()); - } - - @Override - public void populateDisplayData(DisplayData.Builder builder) { - inner.populateDisplayData(builder); - } - } - - /** - * Returns a {@link DynamicAvroDestinations} that always returns the same {@link FilenamePolicy}, - * schema, metadata, and codec. - */ - public static DynamicAvroDestinations constantDestinations( - FilenamePolicy filenamePolicy, - Schema schema, - Map metadata, - CodecFactory codec, - SerializableFunction formatFunction) { - return constantDestinations(filenamePolicy, schema, metadata, codec, formatFunction, null); - } - - /** - * Returns a {@link DynamicAvroDestinations} that always returns the same {@link FilenamePolicy}, - * schema, metadata, and codec. - */ - public static DynamicAvroDestinations constantDestinations( - FilenamePolicy filenamePolicy, - Schema schema, - Map metadata, - CodecFactory codec, - SerializableFunction formatFunction, - AvroSink.@Nullable DatumWriterFactory datumWriterFactory) { - return new ConstantAvroDestination<>( - filenamePolicy, schema, metadata, codec, formatFunction, datumWriterFactory); - } - ///////////////////////////////////////////////////////////////////////////// - - /** - * Formats an element of a user type into a record with the given schema. - * - * @deprecated Users can achieve the same by providing this transform in a {@link - * org.apache.beam.sdk.transforms.ParDo} before using write in AvroIO {@link #write(Class)}. - */ - @Deprecated - public interface RecordFormatter extends Serializable { - GenericRecord formatRecord(ElementT element, Schema schema); - } - - /** - * A {@link Sink} for use with {@link FileIO#write} and {@link FileIO#writeDynamic}, writing - * elements of the given generated class, like {@link #write(Class)}. - */ - public static Sink sink(final Class clazz) { - return new AutoValue_AvroIO_Sink.Builder() - .setJsonSchema(ReflectData.get().getSchema(clazz).toString()) - .setMetadata(ImmutableMap.of()) - .setCodec(TypedWrite.DEFAULT_SERIALIZABLE_CODEC) - .build(); - } - - /** - * A {@link Sink} for use with {@link FileIO#write} and {@link FileIO#writeDynamic}, writing - * elements with a given (common) schema, like {@link #writeGenericRecords(Schema)}. - */ - public static Sink sink(Schema schema) { - return sink(schema.toString()); - } - - /** - * A {@link Sink} for use with {@link FileIO#write} and {@link FileIO#writeDynamic}, writing - * elements with a given (common) schema, like {@link #writeGenericRecords(String)}. - */ - public static Sink sink(String jsonSchema) { - return new AutoValue_AvroIO_Sink.Builder() - .setJsonSchema(jsonSchema) - .setMetadata(ImmutableMap.of()) - .setCodec(TypedWrite.DEFAULT_SERIALIZABLE_CODEC) - .build(); - } - - /** - * A {@link Sink} for use with {@link FileIO#write} and {@link FileIO#writeDynamic}, writing - * elements by converting each one to a {@link GenericRecord} with a given (common) schema, like - * {@link #writeCustomTypeToGenericRecords()}. - * - * @deprecated RecordFormatter will be removed in future versions. 
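 *     A non-deprecated alternative is {@link FileIO#write} with {@link #sink(Schema)}; a
 *     sketch, assuming {@code records} and {@code schema} (both hypothetical):
 *     <pre>{@code
 *     records.apply(
 *         FileIO.<GenericRecord>write()
 *             .via(AvroIO.sink(schema)) // schema: assumed
 *             .to("/path/to/dir")
 *             .withSuffix(".avro"));
 *     }</pre>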
- */ - @Deprecated - public static Sink sinkViaGenericRecords( - Schema schema, RecordFormatter formatter) { - return new AutoValue_AvroIO_Sink.Builder() - .setRecordFormatter(formatter) - .setJsonSchema(schema.toString()) - .setMetadata(ImmutableMap.of()) - .setCodec(TypedWrite.DEFAULT_SERIALIZABLE_CODEC) - .build(); - } - - /** Implementation of {@link #sink} and {@link #sinkViaGenericRecords}. */ - @AutoValue - public abstract static class Sink implements FileIO.Sink { - /** @deprecated RecordFormatter will be removed in future versions. */ - @Deprecated - abstract @Nullable RecordFormatter getRecordFormatter(); - - abstract @Nullable String getJsonSchema(); - - abstract Map getMetadata(); - - abstract SerializableAvroCodecFactory getCodec(); - - abstract Builder toBuilder(); - - @AutoValue.Builder - abstract static class Builder { - /** @deprecated RecordFormatter will be removed in future versions. */ - @Deprecated - abstract Builder setRecordFormatter(RecordFormatter formatter); - - abstract Builder setJsonSchema(String jsonSchema); - - abstract Builder setMetadata(Map metadata); - - abstract Builder setCodec(SerializableAvroCodecFactory codec); - - abstract Sink build(); - } - - /** Specifies to put the given metadata into each generated file. By default, empty. */ - public Sink withMetadata(Map metadata) { - return toBuilder().setMetadata(metadata).build(); - } - - /** - * Specifies to use the given {@link CodecFactory} for each generated file. By default, {@code - * CodecFactory.snappyCodec()}. - */ - public Sink withCodec(CodecFactory codec) { - return toBuilder().setCodec(new SerializableAvroCodecFactory(codec)).build(); - } - - private transient @Nullable Schema schema; - private transient @Nullable DataFileWriter reflectWriter; - private transient @Nullable DataFileWriter genericWriter; - - @Override - public void open(WritableByteChannel channel) throws IOException { - this.schema = new Schema.Parser().parse(getJsonSchema()); - DataFileWriter writer; - if (getRecordFormatter() == null) { - writer = reflectWriter = new DataFileWriter<>(new ReflectDatumWriter<>(schema)); - } else { - writer = genericWriter = new DataFileWriter<>(new GenericDatumWriter<>(schema)); - } - writer.setCodec(getCodec().getCodec()); - for (Map.Entry entry : getMetadata().entrySet()) { - Object v = entry.getValue(); - if (v instanceof String) { - writer.setMeta(entry.getKey(), (String) v); - } else if (v instanceof Long) { - writer.setMeta(entry.getKey(), (Long) v); - } else if (v instanceof byte[]) { - writer.setMeta(entry.getKey(), (byte[]) v); - } else { - throw new IllegalStateException( - "Metadata value type must be one of String, Long, or byte[]. Found " - + v.getClass().getSimpleName()); - } - } - writer.create(schema, Channels.newOutputStream(channel)); - } - - @Override - public void write(ElementT element) throws IOException { - if (getRecordFormatter() == null) { - reflectWriter.append(element); - } else { - genericWriter.append(getRecordFormatter().formatRecord(element, schema)); - } - } - - @Override - public void flush() throws IOException { - MoreObjects.firstNonNull(reflectWriter, genericWriter).flush(); - } - } - - /** Disallow construction of utility class. 
*/ - private AvroIO() {} -} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/AvroSchemaIOProvider.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/AvroSchemaIOProvider.java deleted file mode 100644 index 43498235992a7..0000000000000 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/AvroSchemaIOProvider.java +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.sdk.io; - -import com.google.auto.service.AutoService; -import java.io.Serializable; -import org.apache.avro.generic.GenericRecord; -import org.apache.beam.sdk.annotations.Internal; -import org.apache.beam.sdk.io.AvroIO.Write; -import org.apache.beam.sdk.schemas.Schema; -import org.apache.beam.sdk.schemas.Schema.FieldType; -import org.apache.beam.sdk.schemas.io.SchemaIO; -import org.apache.beam.sdk.schemas.io.SchemaIOProvider; -import org.apache.beam.sdk.schemas.transforms.Convert; -import org.apache.beam.sdk.schemas.utils.AvroUtils; -import org.apache.beam.sdk.transforms.PTransform; -import org.apache.beam.sdk.transforms.windowing.FixedWindows; -import org.apache.beam.sdk.transforms.windowing.Window; -import org.apache.beam.sdk.values.PBegin; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.PCollection.IsBounded; -import org.apache.beam.sdk.values.PDone; -import org.apache.beam.sdk.values.POutput; -import org.apache.beam.sdk.values.Row; -import org.checkerframework.checker.nullness.qual.Nullable; -import org.joda.time.Duration; - -/** - * An implementation of {@link SchemaIOProvider} for reading and writing Avro files with {@link - * AvroIO}. - * - * @deprecated Avro related classes are deprecated in module beam-sdks-java-core and - * will be eventually removed. Please, migrate to a new module - * beam-sdks-java-extensions-avro by importing - * org.apache.beam.sdk.extensions.avro.io.AvroSchemaIOProvider instead of this one. - */ -@Internal -@AutoService(SchemaIOProvider.class) -@SuppressWarnings({ - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) -@Deprecated -public class AvroSchemaIOProvider implements SchemaIOProvider { - /** Returns an id that uniquely represents this IO. */ - @Override - public String identifier() { - return "avro"; - } - - /** - * Returns the expected schema of the configuration object. Note this is distinct from the schema - * of the data source itself. No configuration expected for Avro. - */ - @Override - public Schema configurationSchema() { - return Schema.builder().addNullableField("writeWindowSizeSeconds", FieldType.INT64).build(); - } - - /** - * Produce a SchemaIO given a String representing the data's location, the schema of the data that - * resides there, and some IO-specific configuration object. 
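 * For illustration (the window size and paths are hypothetical, and {@code dataSchema} is
 * assumed), a configuration row for this provider could be built as:
 * <pre>{@code
 * AvroSchemaIOProvider provider = new AvroSchemaIOProvider();
 * Row config =
 *     Row.withSchema(provider.configurationSchema())
 *         .withFieldValue("writeWindowSizeSeconds", 60L) // hypothetical value
 *         .build();
 * SchemaIO avroIO = provider.from("/path/to/files", config, dataSchema);
 * }</pre>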
- */ - @Override - public AvroSchemaIO from(String location, Row configuration, Schema dataSchema) { - return new AvroSchemaIO(location, dataSchema, configuration); - } - - @Override - public boolean requiresDataSchema() { - return true; - } - - @Override - public PCollection.IsBounded isBounded() { - // This supports streaming now as well but there's no option for this. The move to - // SchemaTransform will remove the need to provide this. - return PCollection.IsBounded.BOUNDED; - } - - /** An abstraction to create schema aware IOs. */ - private static class AvroSchemaIO implements SchemaIO, Serializable { - protected final Schema dataSchema; - protected final String location; - protected final @Nullable Duration windowSize; - - private AvroSchemaIO(String location, Schema dataSchema, Row configuration) { - this.dataSchema = dataSchema; - this.location = location; - if (configuration.getInt64("writeWindowSizeSeconds") != null) { - windowSize = Duration.standardSeconds(configuration.getInt64("writeWindowSizeSeconds")); - } else { - windowSize = null; - } - } - - @Override - public Schema schema() { - return dataSchema; - } - - @Override - public PTransform> buildReader() { - return new PTransform>() { - @Override - public PCollection expand(PBegin begin) { - return begin - .apply( - "AvroIORead", - AvroIO.readGenericRecords(AvroUtils.toAvroSchema(dataSchema, null, null)) - .withBeamSchemas(true) - .from(location)) - .apply("ToRows", Convert.toRows()); - } - }; - } - - @Override - public PTransform, POutput> buildWriter() { - return new PTransform, POutput>() { - @Override - public PDone expand(PCollection input) { - PCollection asRecords = - input.apply("ToGenericRecords", Convert.to(GenericRecord.class)); - Write avroWrite = - AvroIO.writeGenericRecords(AvroUtils.toAvroSchema(dataSchema, null, null)) - .to(location); - if (input.isBounded() == IsBounded.UNBOUNDED || windowSize != null) { - asRecords = - asRecords.apply( - Window.into( - FixedWindows.of( - windowSize == null ? Duration.standardMinutes(1) : windowSize))); - avroWrite = avroWrite.withWindowedWrites().withNumShards(1); - } else { - avroWrite = avroWrite.withoutSharding(); - } - return asRecords.apply("AvroIOWrite", avroWrite); - } - }; - } - } -} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/AvroSink.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/AvroSink.java deleted file mode 100644 index bc92113925cd7..0000000000000 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/AvroSink.java +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.beam.sdk.io; - -import java.io.Serializable; -import java.nio.channels.Channels; -import java.nio.channels.WritableByteChannel; -import java.util.Map; -import org.apache.avro.Schema; -import org.apache.avro.file.CodecFactory; -import org.apache.avro.file.DataFileWriter; -import org.apache.avro.generic.GenericDatumWriter; -import org.apache.avro.io.DatumWriter; -import org.apache.avro.reflect.ReflectDatumWriter; -import org.apache.beam.sdk.io.fs.ResourceId; -import org.apache.beam.sdk.options.ValueProvider; -import org.apache.beam.sdk.util.MimeTypes; -import org.checkerframework.checker.nullness.qual.Nullable; - -/** - * A {@link FileBasedSink} for Avro files. - * - * @deprecated Avro related classes are deprecated in module beam-sdks-java-core and - * will be eventually removed. Please, migrate to a new module - * beam-sdks-java-extensions-avro by importing - * org.apache.beam.sdk.extensions.avro.io.AvroSink instead of this one. - */ -@SuppressWarnings({ - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) -@Deprecated -public class AvroSink - extends FileBasedSink { - private final boolean genericRecords; - private final int syncInterval; - - @FunctionalInterface - public interface DatumWriterFactory extends Serializable { - DatumWriter apply(Schema writer); - } - - AvroSink( - ValueProvider outputPrefix, - DynamicAvroDestinations dynamicDestinations, - boolean genericRecords, - int syncInterval) { - // Avro handles compression internally using the codec. - super(outputPrefix, dynamicDestinations, Compression.UNCOMPRESSED); - this.genericRecords = genericRecords; - this.syncInterval = syncInterval; - } - - @Override - public DynamicAvroDestinations getDynamicDestinations() { - return (DynamicAvroDestinations) super.getDynamicDestinations(); - } - - @Override - public WriteOperation createWriteOperation() { - return new AvroWriteOperation<>(this, genericRecords, syncInterval); - } - - /** A {@link WriteOperation WriteOperation} for Avro files. */ - private static class AvroWriteOperation - extends WriteOperation { - private final DynamicAvroDestinations dynamicDestinations; - private final boolean genericRecords; - private final int syncInterval; - - private AvroWriteOperation( - AvroSink sink, boolean genericRecords, int syncInterval) { - super(sink); - this.dynamicDestinations = sink.getDynamicDestinations(); - this.genericRecords = genericRecords; - this.syncInterval = syncInterval; - } - - @Override - public Writer createWriter() throws Exception { - return new AvroWriter<>(this, dynamicDestinations, genericRecords, syncInterval); - } - } - - /** A {@link Writer Writer} for Avro files. */ - private static class AvroWriter extends Writer { - - // Initialized in prepareWrite - private @Nullable DataFileWriter dataFileWriter; - - private final DynamicAvroDestinations dynamicDestinations; - private final boolean genericRecords; - private final int syncInterval; - - public AvroWriter( - WriteOperation writeOperation, - DynamicAvroDestinations dynamicDestinations, - boolean genericRecords, - int syncInterval) { - super(writeOperation, MimeTypes.BINARY); - this.dynamicDestinations = dynamicDestinations; - this.genericRecords = genericRecords; - this.syncInterval = syncInterval; - } - - @SuppressWarnings("deprecation") // uses internal test functionality. 
- @Override - protected void prepareWrite(WritableByteChannel channel) throws Exception { - DestinationT destination = getDestination(); - CodecFactory codec = dynamicDestinations.getCodec(destination); - Schema schema = dynamicDestinations.getSchema(destination); - Map metadata = dynamicDestinations.getMetadata(destination); - DatumWriter datumWriter; - DatumWriterFactory datumWriterFactory = - dynamicDestinations.getDatumWriterFactory(destination); - - if (datumWriterFactory == null) { - datumWriter = - genericRecords ? new GenericDatumWriter<>(schema) : new ReflectDatumWriter<>(schema); - } else { - datumWriter = datumWriterFactory.apply(schema); - } - - dataFileWriter = new DataFileWriter<>(datumWriter).setCodec(codec); - for (Map.Entry entry : metadata.entrySet()) { - Object v = entry.getValue(); - if (v instanceof String) { - dataFileWriter.setMeta(entry.getKey(), (String) v); - } else if (v instanceof Long) { - dataFileWriter.setMeta(entry.getKey(), (Long) v); - } else if (v instanceof byte[]) { - dataFileWriter.setMeta(entry.getKey(), (byte[]) v); - } else { - throw new IllegalStateException( - "Metadata value type must be one of String, Long, or byte[]. Found " - + v.getClass().getSimpleName()); - } - } - dataFileWriter.setSyncInterval(syncInterval); - dataFileWriter.create(schema, Channels.newOutputStream(channel)); - } - - @Override - public void write(OutputT value) throws Exception { - dataFileWriter.append(value); - } - - @Override - protected void finishWrite() throws Exception { - dataFileWriter.flush(); - } - } -} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/AvroSource.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/AvroSource.java deleted file mode 100644 index f19375a7bcdce..0000000000000 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/AvroSource.java +++ /dev/null @@ -1,773 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.beam.sdk.io; - -import static org.apache.beam.sdk.io.FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE; -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull; -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; - -import java.io.IOException; -import java.io.InputStream; -import java.io.InvalidObjectException; -import java.io.ObjectInputStream; -import java.io.ObjectStreamException; -import java.io.Serializable; -import java.nio.ByteBuffer; -import java.nio.channels.Channels; -import java.nio.channels.ReadableByteChannel; -import java.nio.channels.SeekableByteChannel; -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import java.util.Iterator; -import java.util.Map; -import java.util.WeakHashMap; -import javax.annotation.concurrent.GuardedBy; -import org.apache.avro.Schema; -import org.apache.avro.file.DataFileConstants; -import org.apache.avro.file.DataFileReader; -import org.apache.avro.file.SeekableInput; -import org.apache.avro.generic.GenericDatumReader; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.io.BinaryDecoder; -import org.apache.avro.io.DatumReader; -import org.apache.avro.io.DecoderFactory; -import org.apache.avro.reflect.ReflectData; -import org.apache.avro.reflect.ReflectDatumReader; -import org.apache.beam.sdk.PipelineRunner; -import org.apache.beam.sdk.coders.AvroCoder; -import org.apache.beam.sdk.coders.Coder; -import org.apache.beam.sdk.io.fs.EmptyMatchTreatment; -import org.apache.beam.sdk.io.fs.MatchResult.Metadata; -import org.apache.beam.sdk.io.fs.ResourceId; -import org.apache.beam.sdk.options.PipelineOptions; -import org.apache.beam.sdk.options.ValueProvider; -import org.apache.beam.sdk.transforms.SerializableFunction; -import org.apache.beam.sdk.util.VarInt; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; -import org.checkerframework.checker.nullness.qual.Nullable; - -// CHECKSTYLE.OFF: JavadocStyle -/** - * Do not use in pipelines directly: most users should use {@link AvroIO.Read}. - * - *

A {@link FileBasedSource} for reading Avro files. - * - *

To read a {@link PCollection} of objects from one or more Avro files, use {@link - * AvroSource#from} to specify the path(s) of the files to read. The {@link AvroSource} that is - * returned will read objects of type {@link GenericRecord} with the schema(s) that were written at - * file creation. To further configure the {@link AvroSource} to read with a user-defined schema, or - * to return records of a type other than {@link GenericRecord}, use {@link - * AvroSource#withSchema(Schema)} (using an Avro {@link Schema}), {@link - * AvroSource#withSchema(String)} (using a JSON schema), or {@link AvroSource#withSchema(Class)} (to - * return objects of the Avro-generated class specified). - * - *

An {@link AvroSource} can be read from using the {@link Read} transform. For example: - * - *

<pre>{@code
- * AvroSource<MyType> source = AvroSource.from(file.toPath()).withSchema(MyType.class);
- * PCollection<MyType> records = Read.from(source);
- * }</pre>
- * - *
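 * <p>A further sketch, assuming a hypothetical POJO {@code Foo} with a constructor taking a
 * {@link GenericRecord}, reading via a parse function instead of a fixed schema:
 * <pre>{@code
 * AvroSource<Foo> source =
 *     AvroSource.from("/path/to/*.avro")
 *         .withParseFn(r -> new Foo(r), AvroCoder.of(Foo.class)); // Foo: assumed
 * }</pre>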

This class's implementation is based on the Avro 1.7.7 specification and implements - * parsing of some parts of Avro Object Container Files. The rationale for doing so is that the Avro - * API does not provide efficient ways of computing the precise offsets of blocks within a file, - * which is necessary to support dynamic work rebalancing. However, whenever it is possible to use - * the Avro API in a way that supports maintaining precise offsets, this class uses the Avro API. - * - *

Avro Object Container files store records in blocks. Each block contains a collection of - * records. Blocks may be encoded (e.g., with bzip2, deflate, snappy, etc.). Blocks are delineated - * from one another by a 16-byte sync marker. - * - *

An {@link AvroSource} for a subrange of a single file contains records in the blocks such that - * the start offset of the block is greater than or equal to the start offset of the source and less - * than the end offset of the source. - * - *

To use XZ-encoded Avro files, please include an explicit dependency on {@code xz-1.8.jar}, - * which has been marked as optional in the Maven {@code sdk/pom.xml}. - * - *

<pre>{@code
- * <dependency>
- *   <groupId>org.tukaani</groupId>
- *   <artifactId>xz</artifactId>
- *   <version>1.8</version>
- * </dependency>
- * }</pre>
- * - *

<h3>Permissions</h3>

- * - *

Permission requirements depend on the {@link PipelineRunner} that is used to execute the - * pipeline. Please refer to the documentation of corresponding {@link PipelineRunner}s for more - * details. - * - * @param The type of records to be read from the source. - * @deprecated Avro related classes are deprecated in module beam-sdks-java-core and - * will be eventually removed. Please, migrate to a new module - * beam-sdks-java-extensions-avro by importing - * org.apache.beam.sdk.extensions.avro.io.AvroSource instead of this one. - */ -// CHECKSTYLE.ON: JavadocStyle - -@SuppressWarnings({ - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) -@Deprecated -public class AvroSource extends BlockBasedSource { - // Default minimum bundle size (chosen as two default-size Avro blocks to attempt to - // ensure that every source has at least one block of records). - // The default sync interval is 64k. - private static final long DEFAULT_MIN_BUNDLE_SIZE = 2L * DataFileConstants.DEFAULT_SYNC_INTERVAL; - - @FunctionalInterface - public interface DatumReaderFactory extends Serializable { - DatumReader apply(Schema writer, Schema reader); - } - - private static final DatumReaderFactory GENERIC_DATUM_READER_FACTORY = GenericDatumReader::new; - - private static final DatumReaderFactory REFLECT_DATUM_READER_FACTORY = ReflectDatumReader::new; - - // Use cases of AvroSource are: - // 1) AvroSource Reading GenericRecord records with a specified schema. - // 2) AvroSource Reading records of a generated Avro class Foo. - // 3) AvroSource Reading GenericRecord records with an unspecified schema - // and converting them to type T. - // | Case 1 | Case 2 | Case 3 | - // type | GenericRecord | Foo | GenericRecord | - // readerSchemaString | non-null | non-null | null | - // parseFn | null | null | non-null | - // outputCoder | null | null | non-null | - // readerFactory | either | either | either | - private static class Mode implements Serializable { - private final Class type; - - // The JSON schema used to decode records. 
- private @Nullable String readerSchemaString; - - private final @Nullable SerializableFunction parseFn; - - private final @Nullable Coder outputCoder; - - private final @Nullable DatumReaderFactory readerFactory; - - private Mode( - Class type, - @Nullable String readerSchemaString, - @Nullable SerializableFunction parseFn, - @Nullable Coder outputCoder, - @Nullable DatumReaderFactory readerFactory) { - this.type = type; - this.readerSchemaString = internSchemaString(readerSchemaString); - this.parseFn = parseFn; - this.outputCoder = outputCoder; - this.readerFactory = readerFactory; - } - - private void readObject(ObjectInputStream is) throws IOException, ClassNotFoundException { - is.defaultReadObject(); - readerSchemaString = internSchemaString(readerSchemaString); - } - - private Coder getOutputCoder() { - if (parseFn == null) { - return AvroCoder.of((Class) type, internOrParseSchemaString(readerSchemaString)); - } else { - return outputCoder; - } - } - - private void validate() { - if (parseFn == null) { - checkArgument( - readerSchemaString != null, - "schema must be specified using withSchema() when not using a parse fn"); - } - } - - private Mode withReaderFactory(DatumReaderFactory factory) { - return new Mode<>(type, readerSchemaString, parseFn, outputCoder, factory); - } - - private DatumReader createReader(Schema writerSchema, Schema readerSchema) { - DatumReaderFactory factory = this.readerFactory; - if (factory == null) { - factory = - (type == GenericRecord.class) - ? GENERIC_DATUM_READER_FACTORY - : REFLECT_DATUM_READER_FACTORY; - } - return factory.apply(writerSchema, readerSchema); - } - } - - private static Mode readGenericRecordsWithSchema( - String schema, @Nullable DatumReaderFactory factory) { - return new Mode<>(GenericRecord.class, schema, null, null, factory); - } - - private static Mode readGeneratedClasses( - Class clazz, @Nullable DatumReaderFactory factory) { - return new Mode<>(clazz, ReflectData.get().getSchema(clazz).toString(), null, null, factory); - } - - private static Mode parseGenericRecords( - SerializableFunction parseFn, - Coder outputCoder, - @Nullable DatumReaderFactory factory) { - return new Mode<>(GenericRecord.class, null, parseFn, outputCoder, factory); - } - - private final Mode mode; - - /** - * Reads from the given file name or pattern ("glob"). The returned source needs to be further - * configured by calling {@link #withSchema} to return a type other than {@link GenericRecord}. - */ - public static AvroSource from(ValueProvider fileNameOrPattern) { - return new AvroSource<>( - fileNameOrPattern, - EmptyMatchTreatment.DISALLOW, - DEFAULT_MIN_BUNDLE_SIZE, - readGenericRecordsWithSchema(null /* will need to be specified in withSchema */, null)); - } - - public static AvroSource from(Metadata metadata) { - return new AvroSource<>( - metadata, - DEFAULT_MIN_BUNDLE_SIZE, - 0, - metadata.sizeBytes(), - readGenericRecordsWithSchema(null /* will need to be specified in withSchema */, null)); - } - - /** Like {@link #from(ValueProvider)}. */ - public static AvroSource from(String fileNameOrPattern) { - return from(ValueProvider.StaticValueProvider.of(fileNameOrPattern)); - } - - public AvroSource withEmptyMatchTreatment(EmptyMatchTreatment emptyMatchTreatment) { - return new AvroSource<>( - getFileOrPatternSpecProvider(), emptyMatchTreatment, getMinBundleSize(), mode); - } - - /** Reads files containing records that conform to the given schema. 
*/ - public AvroSource withSchema(String schema) { - checkArgument(schema != null, "schema can not be null"); - return new AvroSource<>( - getFileOrPatternSpecProvider(), - getEmptyMatchTreatment(), - getMinBundleSize(), - readGenericRecordsWithSchema(schema, mode.readerFactory)); - } - - /** Like {@link #withSchema(String)}. */ - public AvroSource withSchema(Schema schema) { - checkArgument(schema != null, "schema can not be null"); - return withSchema(schema.toString()); - } - - /** Reads files containing records of the given class. */ - public AvroSource withSchema(Class clazz) { - checkArgument(clazz != null, "clazz can not be null"); - if (getMode() == SINGLE_FILE_OR_SUBRANGE) { - return new AvroSource<>( - getSingleFileMetadata(), - getMinBundleSize(), - getStartOffset(), - getEndOffset(), - readGeneratedClasses(clazz, mode.readerFactory)); - } - return new AvroSource<>( - getFileOrPatternSpecProvider(), - getEmptyMatchTreatment(), - getMinBundleSize(), - readGeneratedClasses(clazz, mode.readerFactory)); - } - - /** - * Reads {@link GenericRecord} of unspecified schema and maps them to instances of a custom type - * using the given {@code parseFn} and encoded using the given coder. - */ - public AvroSource withParseFn( - SerializableFunction parseFn, Coder coder) { - checkArgument(parseFn != null, "parseFn can not be null"); - checkArgument(coder != null, "coder can not be null"); - if (getMode() == SINGLE_FILE_OR_SUBRANGE) { - return new AvroSource<>( - getSingleFileMetadata(), - getMinBundleSize(), - getStartOffset(), - getEndOffset(), - parseGenericRecords(parseFn, coder, mode.readerFactory)); - } - return new AvroSource<>( - getFileOrPatternSpecProvider(), - getEmptyMatchTreatment(), - getMinBundleSize(), - parseGenericRecords(parseFn, coder, mode.readerFactory)); - } - - /** - * Sets the minimum bundle size. Refer to {@link OffsetBasedSource} for a description of {@code - * minBundleSize} and its use. - */ - public AvroSource withMinBundleSize(long minBundleSize) { - if (getMode() == SINGLE_FILE_OR_SUBRANGE) { - return new AvroSource<>( - getSingleFileMetadata(), minBundleSize, getStartOffset(), getEndOffset(), mode); - } - return new AvroSource<>( - getFileOrPatternSpecProvider(), getEmptyMatchTreatment(), minBundleSize, mode); - } - - public AvroSource withDatumReaderFactory(DatumReaderFactory factory) { - Mode newMode = mode.withReaderFactory(factory); - if (getMode() == SINGLE_FILE_OR_SUBRANGE) { - return new AvroSource<>( - getSingleFileMetadata(), getMinBundleSize(), getStartOffset(), getEndOffset(), newMode); - } - return new AvroSource<>( - getFileOrPatternSpecProvider(), getEmptyMatchTreatment(), getMinBundleSize(), newMode); - } - - /** Constructor for FILEPATTERN mode. */ - private AvroSource( - ValueProvider fileNameOrPattern, - EmptyMatchTreatment emptyMatchTreatment, - long minBundleSize, - Mode mode) { - super(fileNameOrPattern, emptyMatchTreatment, minBundleSize); - this.mode = mode; - } - - /** Constructor for SINGLE_FILE_OR_SUBRANGE mode. */ - private AvroSource( - Metadata metadata, long minBundleSize, long startOffset, long endOffset, Mode mode) { - super(metadata, minBundleSize, startOffset, endOffset); - this.mode = mode; - } - - @Override - public void validate() { - super.validate(); - mode.validate(); - } - - /** - * Used by the Dataflow worker. Do not introduce new usages. Do not delete without confirming that - * Dataflow ValidatesRunner tests pass. 
- * - * @deprecated Used by Dataflow worker - */ - @Deprecated - public BlockBasedSource createForSubrangeOfFile(String fileName, long start, long end) - throws IOException { - return createForSubrangeOfFile(FileSystems.matchSingleFileSpec(fileName), start, end); - } - - @Override - public BlockBasedSource createForSubrangeOfFile(Metadata fileMetadata, long start, long end) { - return new AvroSource<>(fileMetadata, getMinBundleSize(), start, end, mode); - } - - @Override - protected BlockBasedReader createSingleFileReader(PipelineOptions options) { - return new AvroReader<>(this); - } - - @Override - public Coder getOutputCoder() { - return mode.getOutputCoder(); - } - - @VisibleForTesting - @Nullable - String getReaderSchemaString() { - return mode.readerSchemaString; - } - - /** Avro file metadata. */ - @VisibleForTesting - static class AvroMetadata { - private final byte[] syncMarker; - private final String codec; - private final String schemaString; - - AvroMetadata(byte[] syncMarker, String codec, String schemaString) { - this.syncMarker = checkNotNull(syncMarker, "syncMarker"); - this.codec = checkNotNull(codec, "codec"); - this.schemaString = internSchemaString(checkNotNull(schemaString, "schemaString")); - } - - /** - * The JSON-encoded schema - * string for the file. - */ - public String getSchemaString() { - return schemaString; - } - - /** - * The codec of the - * file. - */ - public String getCodec() { - return codec; - } - - /** - * The 16-byte sync marker for the file. See the documentation for Object Container - * File for more information. - */ - public byte[] getSyncMarker() { - return syncMarker; - } - } - - /** - * Reads the {@link AvroMetadata} from the header of an Avro file. - * - *
This method parses the header of an Avro Object Container - * File. - * - * @throws IOException if the file is an invalid format. - */ - @VisibleForTesting - static AvroMetadata readMetadataFromFile(ResourceId fileResource) throws IOException { - String codec = null; - String schemaString = null; - byte[] syncMarker; - try (InputStream stream = Channels.newInputStream(FileSystems.open(fileResource))) { - BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(stream, null); - - // The header of an object container file begins with a four-byte magic number, followed - // by the file metadata (including the schema and codec), encoded as a map. Finally, the - // header ends with the file's 16-byte sync marker. - // See https://avro.apache.org/docs/1.7.7/spec.html#Object+Container+Files for details on - // the encoding of container files. - - // Read the magic number. - byte[] magic = new byte[DataFileConstants.MAGIC.length]; - decoder.readFixed(magic); - if (!Arrays.equals(magic, DataFileConstants.MAGIC)) { - throw new IOException("Missing Avro file signature: " + fileResource); - } - - // Read the metadata to find the codec and schema. - ByteBuffer valueBuffer = ByteBuffer.allocate(512); - long numRecords = decoder.readMapStart(); - while (numRecords > 0) { - for (long recordIndex = 0; recordIndex < numRecords; recordIndex++) { - String key = decoder.readString(); - // readBytes() clears the buffer and returns a buffer where: - // - position is the start of the bytes read - // - limit is the end of the bytes read - valueBuffer = decoder.readBytes(valueBuffer); - byte[] bytes = new byte[valueBuffer.remaining()]; - valueBuffer.get(bytes); - if (key.equals(DataFileConstants.CODEC)) { - codec = new String(bytes, StandardCharsets.UTF_8); - } else if (key.equals(DataFileConstants.SCHEMA)) { - schemaString = new String(bytes, StandardCharsets.UTF_8); - } - } - numRecords = decoder.mapNext(); - } - if (codec == null) { - codec = DataFileConstants.NULL_CODEC; - } - - // Finally, read the sync marker. - syncMarker = new byte[DataFileConstants.SYNC_SIZE]; - decoder.readFixed(syncMarker); - } - checkState(schemaString != null, "No schema present in Avro file metadata %s", fileResource); - return new AvroMetadata(syncMarker, codec, schemaString); - } - - // A logical reference cache used to store schemas and schema strings to allow us to - // "intern" values and reduce the number of copies of equivalent objects. - private static final Map schemaLogicalReferenceCache = new WeakHashMap<>(); - private static final Map schemaStringLogicalReferenceCache = new WeakHashMap<>(); - - // We avoid String.intern() because depending on the JVM, these may be added to the PermGenSpace - // which we want to avoid otherwise we could run out of PermGenSpace. 
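(The intent of the two caches above can be sketched as follows; both helpers are package-private, so this assumes a caller in the same package, and schemaJson is illustrative.)

    // Parsing the same schema text twice yields one shared, interned Schema
    // instance, so equivalent deserialized sources do not hold duplicate copies.
    Schema first = AvroSource.internOrParseSchemaString(schemaJson);
    Schema second = AvroSource.internOrParseSchemaString(schemaJson);
    assert first == second; // reference equality, not merely equals()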
- private static synchronized String internSchemaString(String schema) { - String internSchema = schemaStringLogicalReferenceCache.get(schema); - if (internSchema != null) { - return internSchema; - } - schemaStringLogicalReferenceCache.put(schema, schema); - return schema; - } - - static synchronized Schema internOrParseSchemaString(String schemaString) { - Schema schema = schemaLogicalReferenceCache.get(schemaString); - if (schema != null) { - return schema; - } - Schema.Parser parser = new Schema.Parser(); - schema = parser.parse(schemaString); - schemaLogicalReferenceCache.put(schemaString, schema); - return schema; - } - - // Reading the object from Java serialization typically does not go through the constructor, - // we use readResolve to replace the constructed instance with one which uses the constructor - // allowing us to intern any schemas. - @SuppressWarnings("unused") - private Object readResolve() throws ObjectStreamException { - switch (getMode()) { - case SINGLE_FILE_OR_SUBRANGE: - return new AvroSource<>( - getSingleFileMetadata(), getMinBundleSize(), getStartOffset(), getEndOffset(), mode); - case FILEPATTERN: - return new AvroSource<>( - getFileOrPatternSpecProvider(), getEmptyMatchTreatment(), getMinBundleSize(), mode); - default: - throw new InvalidObjectException( - String.format("Unknown mode %s for AvroSource %s", getMode(), this)); - } - } - - /** - * A {@link BlockBasedSource.Block} of Avro records. - * - * @param The type of records stored in the block. - */ - static class AvroBlock extends Block { - - // The current record in the block. Initialized in readNextRecord. - private @Nullable T currentRecord; - - // The index of the current record in the block. - private long currentRecordIndex = 0; - - private final Iterator iterator; - - private final SerializableFunction parseFn; - - private final long numRecordsInBlock; - - AvroBlock( - Iterator iter, SerializableFunction parseFn, long numRecordsInBlock) { - this.iterator = iter; - this.parseFn = parseFn; - this.numRecordsInBlock = numRecordsInBlock; - } - - @Override - public T getCurrentRecord() { - return currentRecord; - } - - @Override - public boolean readNextRecord() { - if (currentRecordIndex >= numRecordsInBlock) { - return false; - } - - Object record = iterator.next(); - currentRecord = (parseFn == null) ? ((T) record) : parseFn.apply((GenericRecord) record); - currentRecordIndex++; - return true; - } - - @Override - public double getFractionOfBlockConsumed() { - return ((double) currentRecordIndex) / numRecordsInBlock; - } - } - - /** - * A {@link BlockBasedSource.BlockBasedReader} for reading blocks from Avro files. - * - *
An Avro Object Container File consists of a header followed by a 16-byte sync marker and then - * a sequence of blocks, where each block begins with two encoded longs representing the total - * number of records in the block and the block's size in bytes, followed by the block's - * (optionally-encoded) records. Each block is terminated by a 16-byte sync marker. - * - * @param The type of records contained in the block. - */ - public static class AvroReader extends BlockBasedReader { - - private static class SeekableChannelInput implements SeekableInput { - - private final SeekableByteChannel channel; - private final InputStream input; - - SeekableChannelInput(SeekableByteChannel channel) { - this.channel = channel; - this.input = Channels.newInputStream(channel); - } - - @Override - public void seek(long p) throws IOException { - channel.position(p); - } - - @Override - public long tell() throws IOException { - return channel.position(); - } - - @Override - public long length() throws IOException { - return channel.size(); - } - - @Override - public int read(byte[] b, int off, int len) throws IOException { - return input.read(b, off, len); - } - - @Override - public void close() throws IOException { - channel.close(); - } - } - - // The current block. - // Initialized in readNextRecord. - private @Nullable AvroBlock currentBlock; - - private @Nullable DataFileReader dataFileReader; - - // A lock used to synchronize block offsets for getRemainingParallelism - private final Object progressLock = new Object(); - - // Offset of the current block. - @GuardedBy("progressLock") - private long currentBlockOffset = 0; - - // Size of the current block. - @GuardedBy("progressLock") - private long currentBlockSizeBytes = 0; - - /** Reads Avro records of type {@code T} from the specified source. */ - public AvroReader(AvroSource source) { - super(source); - } - - @Override - public synchronized AvroSource getCurrentSource() { - return (AvroSource) super.getCurrentSource(); - } - - // Precondition: the stream is positioned after the sync marker in the current (about to be - // previous) block. currentBlockSize equals the size of the current block, or zero if this - // reader was just started. - // - // Postcondition: same as above, but for the new current (formerly next) block. - @Override - public boolean readNextBlock() { - if (!dataFileReader.hasNext()) { - return false; - } - - long headerLength = - (long) VarInt.getLength(dataFileReader.getBlockCount()) - + VarInt.getLength(dataFileReader.getBlockSize()) - + DataFileConstants.SYNC_SIZE; - - currentBlock = - new AvroBlock<>( - dataFileReader, getCurrentSource().mode.parseFn, dataFileReader.getBlockCount()); - - // Atomically update both the position and offset of the new block. - synchronized (progressLock) { - currentBlockOffset = dataFileReader.previousSync(); - // Total block size includes the header, block content, and trailing sync marker. 
- currentBlockSizeBytes = dataFileReader.getBlockSize() + headerLength; - } - - return true; - } - - @Override - public AvroBlock getCurrentBlock() { - return currentBlock; - } - - @Override - public long getCurrentBlockOffset() { - synchronized (progressLock) { - return currentBlockOffset; - } - } - - @Override - public long getCurrentBlockSize() { - synchronized (progressLock) { - return currentBlockSizeBytes; - } - } - - @Override - public long getSplitPointsRemaining() { - if (isDone()) { - return 0; - } - synchronized (progressLock) { - if (currentBlockOffset + currentBlockSizeBytes >= getCurrentSource().getEndOffset()) { - // This block is known to be the last block in the range. - return 1; - } - } - return super.getSplitPointsRemaining(); - } - - // Postcondition: the stream is positioned at the beginning of the first block after the start - // of the current source, and currentBlockOffset is that position. Additionally, - // currentBlockSizeBytes will be set to 0 indicating that the previous block was empty. - @Override - protected void startReading(ReadableByteChannel channel) throws IOException { - SeekableChannelInput seekableChannelInput = - new SeekableChannelInput((SeekableByteChannel) channel); - // the channel needs to be at the beginning of the file in order for the DataFileReader to - // read the header, etc, we'll seek it back to where it should be after creating the DFR. - seekableChannelInput.seek(0); - - Schema readerSchema = null; - String readerSchemaString = this.getCurrentSource().getReaderSchemaString(); - if (readerSchemaString != null) { - readerSchema = AvroSource.internOrParseSchemaString(readerSchemaString); - } - // the DataFileReader will call setSchema with the writer schema when created. - DatumReader reader = this.getCurrentSource().mode.createReader(readerSchema, readerSchema); - - dataFileReader = new DataFileReader<>(seekableChannelInput, reader); - - long startOffset = getCurrentSource().getStartOffset(); - if (startOffset != 0) { - // the start offset may be in the middle of a sync marker, by rewinding SYNC_SIZE bytes we - // ensure that we won't miss the block if so. - dataFileReader.sync(Math.max(0, startOffset - DataFileConstants.SYNC_SIZE)); - } - - synchronized (progressLock) { - currentBlockOffset = dataFileReader.previousSync(); - currentBlockSizeBytes = 0; - } - } - } -} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/ConstantAvroDestination.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/ConstantAvroDestination.java deleted file mode 100644 index dc7fee0d7ad5f..0000000000000 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/ConstantAvroDestination.java +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.beam.sdk.io; - -import java.io.Serializable; -import java.util.Map; -import org.apache.avro.Schema; -import org.apache.avro.file.CodecFactory; -import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; -import org.apache.beam.sdk.transforms.SerializableFunction; -import org.apache.beam.sdk.transforms.display.DisplayData; -import org.apache.beam.sdk.transforms.display.HasDisplayData; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Function; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Suppliers; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.BaseEncoding; -import org.checkerframework.checker.nullness.qual.Nullable; - -/** - * Always returns a constant {@link FilenamePolicy}, {@link Schema}, metadata, and codec. - * - * @deprecated Avro related classes are deprecated in module beam-sdks-java-core and - * will be eventually removed. Please, migrate to a new module - * beam-sdks-java-extensions-avro by importing - * org.apache.beam.sdk.extensions.avro.io.ConstantAvroDestination instead of this one. - */ -@SuppressWarnings({ - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) -@Deprecated -class ConstantAvroDestination - extends DynamicAvroDestinations { - private static class SchemaFunction implements Serializable, Function { - @Override - public Schema apply(String input) { - return new Schema.Parser().parse(input); - } - } - - // This should be a multiple of 4 to not get a partial encoded byte. - private static final int METADATA_BYTES_MAX_LENGTH = 40; - private final FilenamePolicy filenamePolicy; - private final Supplier schema; - private final Map metadata; - private final SerializableAvroCodecFactory codec; - private final SerializableFunction formatFunction; - private final AvroSink.DatumWriterFactory datumWriterFactory; - - private class Metadata implements HasDisplayData { - @Override - public void populateDisplayData(DisplayData.Builder builder) { - for (Map.Entry entry : metadata.entrySet()) { - DisplayData.Type type = DisplayData.inferType(entry.getValue()); - if (type != null) { - builder.add(DisplayData.item(entry.getKey(), type, entry.getValue())); - } else { - String base64 = BaseEncoding.base64().encode((byte[]) entry.getValue()); - String repr = - base64.length() <= METADATA_BYTES_MAX_LENGTH - ? 
base64 - : base64.substring(0, METADATA_BYTES_MAX_LENGTH) + "..."; - builder.add(DisplayData.item(entry.getKey(), repr)); - } - } - } - } - - public ConstantAvroDestination( - FilenamePolicy filenamePolicy, - Schema schema, - Map metadata, - CodecFactory codec, - SerializableFunction formatFunction) { - this(filenamePolicy, schema, metadata, codec, formatFunction, null); - } - - public ConstantAvroDestination( - FilenamePolicy filenamePolicy, - Schema schema, - Map metadata, - CodecFactory codec, - SerializableFunction formatFunction, - AvroSink.@Nullable DatumWriterFactory datumWriterFactory) { - this.filenamePolicy = filenamePolicy; - this.schema = Suppliers.compose(new SchemaFunction(), Suppliers.ofInstance(schema.toString())); - this.metadata = metadata; - this.codec = new SerializableAvroCodecFactory(codec); - this.formatFunction = formatFunction; - this.datumWriterFactory = datumWriterFactory; - } - - @Override - public OutputT formatRecord(UserT record) { - return formatFunction.apply(record); - } - - @Override - public @Nullable Void getDestination(UserT element) { - return (Void) null; - } - - @Override - public @Nullable Void getDefaultDestination() { - return (Void) null; - } - - @Override - public FilenamePolicy getFilenamePolicy(Void destination) { - return filenamePolicy; - } - - @Override - public Schema getSchema(Void destination) { - return schema.get(); - } - - @Override - public Map getMetadata(Void destination) { - return metadata; - } - - @Override - public CodecFactory getCodec(Void destination) { - return codec.getCodec(); - } - - @Override - public AvroSink.@Nullable DatumWriterFactory getDatumWriterFactory(Void destination) { - return datumWriterFactory; - } - - @Override - public void populateDisplayData(DisplayData.Builder builder) { - filenamePolicy.populateDisplayData(builder); - builder.add(DisplayData.item("schema", schema.get().toString()).withLabel("Record Schema")); - builder.addIfNotDefault( - DisplayData.item("codec", codec.getCodec().toString()).withLabel("Avro Compression Codec"), - AvroIO.TypedWrite.DEFAULT_SERIALIZABLE_CODEC.toString()); - builder.include("Metadata", new Metadata()); - } -} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/CountingSource.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/CountingSource.java index 23896c8cc962a..9d30efb2f1136 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/CountingSource.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/CountingSource.java @@ -20,13 +20,19 @@ import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull; +import java.io.DataInputStream; +import java.io.DataOutputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; import java.util.List; import java.util.NoSuchElementException; import java.util.Objects; -import org.apache.beam.sdk.coders.AvroCoder; +import org.apache.beam.sdk.coders.BigEndianLongCoder; import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.CustomCoder; import org.apache.beam.sdk.coders.DefaultCoder; +import org.apache.beam.sdk.coders.InstantCoder; import org.apache.beam.sdk.coders.VarLongCoder; import org.apache.beam.sdk.io.UnboundedSource.UnboundedReader; import org.apache.beam.sdk.metrics.Counter; @@ -354,7 +360,7 @@ public UnboundedReader createReader(PipelineOptions options, CounterMark c @Override 
public Coder getCheckpointMarkCoder() { - return AvroCoder.of(CountingSource.CounterMark.class); + return new CounterMarkCoder(); } @Override @@ -485,7 +491,7 @@ public long getSplitBacklogBytes() { * The checkpoint for an unbounded {@link CountingSource} is simply the last value produced. The * associated source object encapsulates the information needed to produce the next value. */ - @DefaultCoder(AvroCoder.class) + @DefaultCoder(CounterMarkCoder.class) public static class CounterMark implements UnboundedSource.CheckpointMark { /** The last value emitted. */ private final long lastEmitted; @@ -519,4 +525,22 @@ private CounterMark() { @Override public void finalizeCheckpoint() throws IOException {} } + + /** A custom coder for {@code CounterMark}. */ + public static class CounterMarkCoder extends CustomCoder { + @Override + public void encode(CounterMark value, OutputStream outStream) throws IOException { + DataOutputStream stream = new DataOutputStream(outStream); + BigEndianLongCoder.of().encode(value.lastEmitted, stream); + InstantCoder.of().encode(value.startTime, stream); + } + + @Override + public CounterMark decode(InputStream inStream) throws IOException { + DataInputStream stream = new DataInputStream(inStream); + long lastEmitted = BigEndianLongCoder.of().decode(stream); + Instant startTime = InstantCoder.of().decode(stream); + return new CounterMark(lastEmitted, startTime); + } + } } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/DynamicAvroDestinations.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/DynamicAvroDestinations.java deleted file mode 100644 index 8094fe30d8532..0000000000000 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/DynamicAvroDestinations.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.sdk.io; - -import java.util.Map; -import org.apache.avro.Schema; -import org.apache.avro.file.CodecFactory; -import org.apache.beam.sdk.io.FileBasedSink.DynamicDestinations; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; -import org.checkerframework.checker.nullness.qual.Nullable; - -/** - * A specialization of {@link DynamicDestinations} for {@link AvroIO}. In addition to dynamic file - * destinations, this allows specifying other AVRO properties (schema, metadata, codec, datum - * writer) per destination. - * - * @deprecated Avro related classes are deprecated in module beam-sdks-java-core and - * will be eventually removed. Please, migrate to a new module - * beam-sdks-java-extensions-avro by importing - * org.apache.beam.sdk.extensions.avro.io.DynamicAvroDestinations instead of this one. 
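(An aside on the CounterMarkCoder added to CountingSource above: the new fixed-width encoding can be sanity-checked with a round trip along these lines. This sketch ignores IOException handling and assumes the public two-argument CounterMark constructor that decode() itself uses.)

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import org.joda.time.Instant;

    CountingSource.CounterMarkCoder coder = new CountingSource.CounterMarkCoder();
    CountingSource.CounterMark mark = new CountingSource.CounterMark(42L, Instant.now());

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    coder.encode(mark, out); // an 8-byte big-endian long, then the encoded Instant
    CountingSource.CounterMark decoded =
        coder.decode(new ByteArrayInputStream(out.toByteArray()));
    // decoded carries the same last-emitted value and start time as mark.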
- */ -@Deprecated -public abstract class DynamicAvroDestinations - extends DynamicDestinations { - /** Return an AVRO schema for a given destination. */ - public abstract Schema getSchema(DestinationT destination); - - /** Return AVRO file metadata for a given destination. */ - public Map getMetadata(DestinationT destination) { - return ImmutableMap.of(); - } - - /** Return an AVRO codec for a given destination. */ - public CodecFactory getCodec(DestinationT destination) { - return AvroIO.TypedWrite.DEFAULT_CODEC; - } - - /** - * Return a {@link AvroSink.DatumWriterFactory} for a given destination. If provided, it will be - * used to created {@link org.apache.avro.io.DatumWriter} instances as required. - */ - public AvroSink.@Nullable DatumWriterFactory getDatumWriterFactory( - DestinationT destinationT) { - return null; - } -} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileIO.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileIO.java index 2d28279f90b64..76fc1a70b78c5 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileIO.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileIO.java @@ -227,7 +227,7 @@ * {@link Sink}, e.g. write different elements to Avro files in different directories with different * schemas. * - *
This feature is supported by {@link #writeDynamic}. Use {@link Write#by} to specify how to + *
This feature is supported by {@link #writeDynamic}. Use {@link Write#by} to specify how to * partition the elements into groups ("destinations"). Then elements will be grouped by * destination, and {@link Write#withNaming(Contextful)} and {@link Write#via(Contextful)} will be * applied separately within each group, i.e. different groups will be written using the file naming diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/SerializableAvroCodecFactory.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/SerializableAvroCodecFactory.java deleted file mode 100644 index 29e14ae06668d..0000000000000 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/SerializableAvroCodecFactory.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.sdk.io; - -import static org.apache.avro.file.DataFileConstants.BZIP2_CODEC; -import static org.apache.avro.file.DataFileConstants.DEFLATE_CODEC; -import static org.apache.avro.file.DataFileConstants.NULL_CODEC; -import static org.apache.avro.file.DataFileConstants.SNAPPY_CODEC; -import static org.apache.avro.file.DataFileConstants.XZ_CODEC; -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull; -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; - -import java.io.Externalizable; -import java.io.IOException; -import java.io.ObjectInput; -import java.io.ObjectOutput; -import java.util.Arrays; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import org.apache.avro.file.CodecFactory; -import org.checkerframework.checker.nullness.qual.Nullable; - -/** - * A wrapper that allows {@link org.apache.avro.file.CodecFactory}s to be serialized using Java's - * standard serialization mechanisms. - * - * @deprecated Avro related classes are deprecated in module beam-sdks-java-core and - * will be eventually removed. Please, migrate to a new module - * beam-sdks-java-extensions-avro by importing - * org.apache.beam.sdk.extensions.avro.io.SerializableAvroCodecFactory instead of this - * one. 
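(Stepping back to the FileIO javadoc amended above: the dynamic-destinations feature it describes is typically wired up as in this sketch. The Transaction type and its getBank() accessor are illustrative, as is the output path; the FileIO calls follow the public API that the javadoc documents.)

    static void writePerBank(PCollection<Transaction> transactions) {
      transactions.apply(
          FileIO.<String, Transaction>writeDynamic()
              .by(txn -> txn.getBank()) // the destination group for each element
              .withDestinationCoder(StringUtf8Coder.of())
              .via(Contextful.fn(Transaction::toString), TextIO.sink())
              .to("/tmp/output") // illustrative base directory
              .withNaming(bank -> FileIO.Write.defaultNaming("txns-" + bank, ".txt")));
    }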
- */ -@SuppressWarnings({ - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) -@Deprecated -class SerializableAvroCodecFactory implements Externalizable { - private static final long serialVersionUID = 7445324844109564303L; - private static final List noOptAvroCodecs = - Arrays.asList(NULL_CODEC, SNAPPY_CODEC, BZIP2_CODEC); - private static final Pattern deflatePattern = Pattern.compile(DEFLATE_CODEC + "-(?-?\\d)"); - private static final Pattern xzPattern = Pattern.compile(XZ_CODEC + "-(?\\d)"); - - private @Nullable CodecFactory codecFactory; - - // For java.io.Externalizable - public SerializableAvroCodecFactory() {} - - public SerializableAvroCodecFactory(CodecFactory codecFactory) { - checkNotNull(codecFactory, "Codec can't be null"); - checkState(checkIsSupportedCodec(codecFactory), "%s is not supported", codecFactory); - this.codecFactory = codecFactory; - } - - private boolean checkIsSupportedCodec(CodecFactory codecFactory) { - final String codecStr = codecFactory.toString(); - return noOptAvroCodecs.contains(codecStr) - || deflatePattern.matcher(codecStr).matches() - || xzPattern.matcher(codecStr).matches(); - } - - @Override - public void writeExternal(ObjectOutput out) throws IOException { - out.writeUTF(codecFactory.toString()); - } - - @Override - public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException { - final String codecStr = in.readUTF(); - - switch (codecStr) { - case NULL_CODEC: - case SNAPPY_CODEC: - case BZIP2_CODEC: - codecFactory = CodecFactory.fromString(codecStr); - return; - } - - Matcher deflateMatcher = deflatePattern.matcher(codecStr); - if (deflateMatcher.find()) { - codecFactory = CodecFactory.deflateCodec(Integer.parseInt(deflateMatcher.group("level"))); - return; - } - - Matcher xzMatcher = xzPattern.matcher(codecStr); - if (xzMatcher.find()) { - codecFactory = CodecFactory.xzCodec(Integer.parseInt(xzMatcher.group("level"))); - return; - } - - throw new IllegalStateException(codecStr + " is not supported"); - } - - public CodecFactory getCodec() { - return codecFactory; - } - - @Override - public String toString() { - checkNotNull(codecFactory, "Inner CodecFactory is null, please use non default constructor"); - return codecFactory.toString(); - } -} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/TextIO.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/TextIO.java index 33beff23b311e..2c7a4fc5d4f5c 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/TextIO.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/TextIO.java @@ -191,6 +191,7 @@ public static Read read() { return new AutoValue_TextIO_Read.Builder() .setCompression(Compression.AUTO) .setHintMatchesManyFiles(false) + .setSkipHeaderLines(0) .setMatchConfiguration(MatchConfiguration.create(EmptyMatchTreatment.DISALLOW)) .build(); } @@ -214,6 +215,7 @@ public static Read read() { public static ReadAll readAll() { return new AutoValue_TextIO_ReadAll.Builder() .setCompression(Compression.AUTO) + .setSkipHeaderLines(0) .setMatchConfiguration(MatchConfiguration.create(EmptyMatchTreatment.ALLOW_IF_WILDCARD)) .build(); } @@ -228,6 +230,7 @@ public static ReadFiles readFiles() { // but is not so large as to exhaust a typical runner's maximum amount of output per // ProcessElement call. 
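// (Sketch of the user-facing effect of the skipHeaderLines option being threaded
// through TextIO here; the pipeline variable p and the file pattern are illustrative.)
//
//   PCollection<String> lines =
//       p.apply(TextIO.read().from("/tmp/input/*.csv").withSkipHeaderLines(1));
//
// Each matched file then has its first line discarded before records are emitted.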
.setDesiredBundleSizeBytes(DEFAULT_BUNDLE_SIZE_BYTES) + .setSkipHeaderLines(0) .build(); } @@ -286,6 +289,8 @@ public abstract static class Read extends PTransform @SuppressWarnings("mutable") // this returns an array that can be mutated by the caller abstract byte @Nullable [] getDelimiter(); + abstract int getSkipHeaderLines(); + abstract Builder toBuilder(); @AutoValue.Builder @@ -300,6 +305,8 @@ abstract static class Builder { abstract Builder setDelimiter(byte @Nullable [] delimiter); + abstract Builder setSkipHeaderLines(int skipHeaderLines); + abstract Read build(); } @@ -396,6 +403,10 @@ public Read withDelimiter(byte[] delimiter) { return toBuilder().setDelimiter(delimiter).build(); } + public Read withSkipHeaderLines(int skipHeaderLines) { + return toBuilder().setSkipHeaderLines(skipHeaderLines).build(); + } + static boolean isSelfOverlapping(byte[] s) { // s self-overlaps if v exists such as s = vu = wv with u and w non empty for (int i = 1; i < s.length - 1; ++i) { @@ -422,7 +433,9 @@ public PCollection expand(PBegin input) { FileIO.readMatches() .withCompression(getCompression()) .withDirectoryTreatment(DirectoryTreatment.PROHIBIT)) - .apply("Via ReadFiles", readFiles().withDelimiter(getDelimiter())); + .apply( + "Via ReadFiles", + readFiles().withDelimiter(getDelimiter()).withSkipHeaderLines(getSkipHeaderLines())); } // Helper to create a source specific to the requested compression type. @@ -431,7 +444,8 @@ protected FileBasedSource getSource() { new TextSource( getFilepattern(), getMatchConfiguration().getEmptyMatchTreatment(), - getDelimiter())) + getDelimiter(), + getSkipHeaderLines())) .withCompression(getCompression()); } @@ -468,6 +482,8 @@ public abstract static class ReadAll @SuppressWarnings("mutable") // this returns an array that can be mutated by the caller abstract byte @Nullable [] getDelimiter(); + abstract int getSkipHeaderLines(); + abstract Builder toBuilder(); @AutoValue.Builder @@ -478,6 +494,8 @@ abstract static class Builder { abstract Builder setDelimiter(byte @Nullable [] delimiter); + abstract Builder setSkipHeaderLines(int skipHeaderLines); + abstract ReadAll build(); } @@ -560,6 +578,8 @@ public abstract static class ReadFiles @SuppressWarnings("mutable") // this returns an array that can be mutated by the caller abstract byte @Nullable [] getDelimiter(); + abstract int getSkipHeaderLines(); + abstract Builder toBuilder(); @AutoValue.Builder @@ -568,6 +588,8 @@ abstract static class Builder { abstract Builder setDelimiter(byte @Nullable [] delimiter); + abstract Builder setSkipHeaderLines(int skipHeaderLines); + abstract ReadFiles build(); } @@ -581,13 +603,17 @@ public ReadFiles withDelimiter(byte[] delimiter) { return toBuilder().setDelimiter(delimiter).build(); } + public ReadFiles withSkipHeaderLines(int skipHeaderLines) { + return toBuilder().setSkipHeaderLines(skipHeaderLines).build(); + } + @Override public PCollection expand(PCollection input) { return input.apply( "Read all via FileBasedSource", new ReadAllViaFileBasedSource<>( getDesiredBundleSizeBytes(), - new CreateTextSourceFn(getDelimiter()), + new CreateTextSourceFn(getDelimiter(), getSkipHeaderLines()), StringUtf8Coder.of())); } @@ -602,15 +628,20 @@ public void populateDisplayData(DisplayData.Builder builder) { private static class CreateTextSourceFn implements SerializableFunction> { private byte[] delimiter; + private int skipHeaderLines; - private CreateTextSourceFn(byte[] delimiter) { + private CreateTextSourceFn(byte[] delimiter, int skipHeaderLines) { this.delimiter = 
delimiter; + this.skipHeaderLines = skipHeaderLines; } @Override public FileBasedSource apply(String input) { return new TextSource( - StaticValueProvider.of(input), EmptyMatchTreatment.DISALLOW, delimiter); + StaticValueProvider.of(input), + EmptyMatchTreatment.DISALLOW, + delimiter, + skipHeaderLines); } } } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/TextRowCountEstimator.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/TextRowCountEstimator.java index 32b7fb12f414c..8542ce011098c 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/TextRowCountEstimator.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/TextRowCountEstimator.java @@ -46,6 +46,8 @@ public abstract class TextRowCountEstimator { @SuppressWarnings("mutable") public abstract byte @Nullable [] getDelimiters(); + public abstract int getSkipHeaderLines(); + public abstract String getFilePattern(); public abstract Compression getCompression(); @@ -62,7 +64,8 @@ public static TextRowCountEstimator.Builder builder() { .setNumSampledBytesPerFile(DEFAULT_NUM_BYTES_PER_FILE) .setCompression(DEFAULT_COMPRESSION) .setDirectoryTreatment(DEFAULT_DIRECTORY_TREATMENT) - .setEmptyMatchTreatment(DEFAULT_EMPTY_MATCH_TREATMENT); + .setEmptyMatchTreatment(DEFAULT_EMPTY_MATCH_TREATMENT) + .setSkipHeaderLines(0); } /** @@ -114,7 +117,8 @@ public Double estimateRowCount(PipelineOptions pipelineOptions) new TextSource( ValueProvider.StaticValueProvider.of(file.getMetadata().resourceId().toString()), getEmptyMatchTreatment(), - getDelimiters()); + getDelimiters(), + getSkipHeaderLines()); FileBasedSource source = CompressedSource.from(textSource).withCompression(file.getCompression()); try (BoundedSource.BoundedReader reader = @@ -160,6 +164,8 @@ public abstract Builder setDirectoryTreatment( public abstract Builder setDelimiters(byte @Nullable [] delimiters); + public abstract Builder setSkipHeaderLines(int skipHeaderLines); + public abstract Builder setFilePattern(String filePattern); public abstract Builder setEmptyMatchTreatment(EmptyMatchTreatment emptyMatchTreatment); diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/TextSource.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/TextSource.java index bef30dffa8ac7..3d62c677950a0 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/TextSource.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/TextSource.java @@ -56,26 +56,43 @@ public class TextSource extends FileBasedSource { byte[] delimiter; + int skipHeaderLines; + public TextSource( - ValueProvider fileSpec, EmptyMatchTreatment emptyMatchTreatment, byte[] delimiter) { + ValueProvider fileSpec, + EmptyMatchTreatment emptyMatchTreatment, + byte[] delimiter, + int skipHeaderLines) { super(fileSpec, emptyMatchTreatment, 1L); this.delimiter = delimiter; + this.skipHeaderLines = skipHeaderLines; } - public TextSource(MatchResult.Metadata metadata, long start, long end, byte[] delimiter) { + public TextSource( + ValueProvider fileSpec, EmptyMatchTreatment emptyMatchTreatment, byte[] delimiter) { + this(fileSpec, emptyMatchTreatment, delimiter, 0); + } + + public TextSource( + MatchResult.Metadata metadata, long start, long end, byte[] delimiter, int skipHeaderLines) { super(metadata, 1L, start, end); this.delimiter = delimiter; + this.skipHeaderLines = skipHeaderLines; + } + + public TextSource(MatchResult.Metadata metadata, long start, long end, byte[] delimiter) { + this(metadata, start, end, delimiter, 0); } @Override protected FileBasedSource 
createForSubrangeOfFile( MatchResult.Metadata metadata, long start, long end) { - return new TextSource(metadata, start, end, delimiter); + return new TextSource(metadata, start, end, delimiter, skipHeaderLines); } @Override protected FileBasedReader createSingleFileReader(PipelineOptions options) { - return new TextBasedReader(this, delimiter); + return new TextBasedReader(this, delimiter, skipHeaderLines); } @Override @@ -98,6 +115,7 @@ static class TextBasedReader extends FileBasedReader { private static final byte LF = '\n'; private final byte @Nullable [] delimiter; + private final int skipHeaderLines; private final ByteArrayOutputStream str; private final byte[] buffer; private final ByteBuffer byteBuffer; @@ -112,11 +130,16 @@ static class TextBasedReader extends FileBasedReader { private boolean skipLineFeedAtStart; // skip an LF if at the start of the next buffer private TextBasedReader(TextSource source, byte[] delimiter) { + this(source, delimiter, 0); + } + + private TextBasedReader(TextSource source, byte[] delimiter, int skipHeaderLines) { super(source); this.buffer = new byte[READ_BUFFER_SIZE]; this.str = new ByteArrayOutputStream(); this.byteBuffer = ByteBuffer.wrap(buffer); this.delimiter = delimiter; + this.skipHeaderLines = skipHeaderLines; } @Override @@ -171,21 +194,42 @@ protected void startReading(ReadableByteChannel channel) throws IOException { } else { startOfNextRecord = bufferPosn = (int) requiredPosition; } + skipHeader(skipHeaderLines, true); } else { - ((SeekableByteChannel) channel).position(requiredPosition); - startOfNextRecord = requiredPosition; + skipHeader(skipHeaderLines, false); + if (requiredPosition > startOfNextRecord) { + ((SeekableByteChannel) channel).position(requiredPosition); + startOfNextRecord = requiredPosition; + bufferLength = bufferPosn = 0; + } + // Read and discard the next record ensuring that startOfNextRecord and bufferPosn point + // to the beginning of the next record. + readNextRecord(); + currentValue = null; } - // Read and discard the next record ensuring that startOfNextRecord and bufferPosn point - // to the beginning of the next record. - readNextRecord(); - currentValue = null; } else { // Check to see if we start with the UTF_BOM bytes skipping them if present. if (fileStartsWithBom()) { startOfNextRecord = bufferPosn = UTF8_BOM.size(); } + skipHeader(skipHeaderLines, false); + } + } + + private void skipHeader(int headerLines, boolean skipFirstLine) throws IOException { + if (headerLines == 1) { + readNextRecord(); + } else if (headerLines > 1) { + // this will be expensive + ((SeekableByteChannel) inChannel).position(0); + for (int line = 0; line < headerLines; ++line) { + readNextRecord(); + } + } else if (headerLines == 0 && skipFirstLine) { + readNextRecord(); } + currentValue = null; } private boolean fileStartsWithBom() throws IOException { diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/MetricsContainer.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/MetricsContainer.java index e93f8677b814d..f48b9195c37cb 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/MetricsContainer.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/MetricsContainer.java @@ -33,6 +33,14 @@ public interface MetricsContainer extends Serializable { */ Counter getCounter(MetricName metricName); + /** + * Return the {@link Counter} that should be used for implementing the given per-worker {@code metricName} + * in this container. 
+ */ + default Counter getPerWorkerCounter(MetricName metricName) { + return NoOpCounter.getInstance(); + } + /** * Return the {@link Distribution} that should be used for implementing the given {@code * metricName} in this container. @@ -52,6 +60,14 @@ public interface MetricsContainer extends Serializable { default Histogram getHistogram(MetricName metricName, HistogramData.BucketType bucketType) { throw new RuntimeException("Histogram metric is not supported yet."); } + /** + * Return the {@link Histogram} that should be used for implementing the given per-worker {@code + * metricName} in this container. + */ + default Histogram getPerWorkerHistogram( + MetricName metricName, HistogramData.BucketType bucketType) { + return NoOpHistogram.getInstance(); + } /** Return the cumulative values for any metrics in this container as MonitoringInfos. */ default Iterable getMonitoringInfos() { diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/NoOpCounter.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/NoOpCounter.java new file mode 100644 index 0000000000000..ab4fa685f9c20 --- /dev/null +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/NoOpCounter.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.metrics; + +/** + * A no-op implementation of Counter. This class exists to provide a default if an implementation of + * MetricsContainer does not override a Counter getter. + */ +public class NoOpCounter implements Counter { + + private static final NoOpCounter singleton = new NoOpCounter(); + private static final MetricName name = MetricName.named(NoOpCounter.class, "singleton"); + + private NoOpCounter() {} + + @Override + public void inc() {} + + @Override + public void inc(long n) {} + + @Override + public void dec() {} + + @Override + public void dec(long n) {} + + @Override + public MetricName getName() { + return name; + } + + public static NoOpCounter getInstance() { + return singleton; + } +} diff --git a/.test-infra/jenkins/job_ReleaseCandidate_Python.groovy b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/NoOpHistogram.java similarity index 54% rename from .test-infra/jenkins/job_ReleaseCandidate_Python.groovy rename to sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/NoOpHistogram.java index b337d6c03837d..a088223ffe2b8 100644 --- a/.test-infra/jenkins/job_ReleaseCandidate_Python.groovy +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/NoOpHistogram.java @@ -15,24 +15,28 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +package org.apache.beam.sdk.metrics; -import CommonJobProperties as commonJobProperties +/** + * A no-op implementation of Histogram. 
This class exists to provide a default if an implementation + * of MetricsContainer does not override a Histogram getter. + */ +public class NoOpHistogram implements Histogram { + + private static final NoOpHistogram singleton = new NoOpHistogram(); + private static final MetricName name = MetricName.named(NoOpHistogram.class, "singleton"); -job('beam_PostRelease_Python_Candidate') { - description('Runs verification of the Python release candidate.') + private NoOpHistogram() {} - // Set common parameters. - commonJobProperties.setTopLevelMainJobProperties(delegate, 'master', 360) + @Override + public void update(double value) {} - // Allows triggering this build against pull requests. - commonJobProperties.enablePhraseTriggeringFromPullRequest( - delegate, - 'Python SDK Release Candidates Validation', - 'Run Python ReleaseCandidate') + @Override + public MetricName getName() { + return name; + } - // Execute shell command to test Python SDK. - steps { - shell('cd ' + commonJobProperties.checkoutDir + - ' && bash release/src/main/python-release/python_release_automation.sh') + public static NoOpHistogram getInstance() { + return singleton; } } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/AvroRecordSchema.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/AvroRecordSchema.java deleted file mode 100644 index 19027cd4527f1..0000000000000 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/AvroRecordSchema.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.sdk.schemas; - -import static org.apache.beam.sdk.schemas.utils.AvroUtils.toBeamSchema; - -import java.util.List; -import org.apache.avro.reflect.ReflectData; -import org.apache.beam.sdk.schemas.utils.AvroUtils; -import org.apache.beam.sdk.values.TypeDescriptor; - -/** - * A {@link SchemaProvider} for AVRO generated SpecificRecords and POJOs. - * - *
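(One note on the metrics defaults added above, before this next file: because getPerWorkerCounter() and getPerWorkerHistogram() fall back to no-op singletons, instrumentation can call them unconditionally. A sketch, with an illustrative metric name and a container supplied by the caller:)

    static void recordArrival(MetricsContainer container) {
      MetricName name = MetricName.named("example-namespace", "arrivals");
      Counter counter = container.getPerWorkerCounter(name);
      counter.inc(); // dropped silently unless the runner's container overrides the default
    }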
This provider infers a schema from generated SpecificRecord objects, and creates schemas and - * rows that bind to the appropriate fields. This provider also infers schemas from Java POJO - * objects, creating a schema that matches that inferred by the AVRO libraries. - * - * @deprecated Avro related classes are deprecated in module beam-sdks-java-core and - * will be eventually removed. Please, migrate to a new module - * beam-sdks-java-extensions-avro by importing - * org.apache.beam.sdk.extensions.avro.schemas.AvroRecordSchema instead of this one. - */ -@SuppressWarnings({ - "rawtypes" // TODO(https://github.com/apache/beam/issues/20447) -}) -@Deprecated -public class AvroRecordSchema extends GetterBasedSchemaProvider { - @Override - public Schema schemaFor(TypeDescriptor typeDescriptor) { - return toBeamSchema(ReflectData.get().getSchema(typeDescriptor.getRawType())); - } - - @Override - public List fieldValueGetters(Class targetClass, Schema schema) { - return AvroUtils.getGetters(targetClass, schema); - } - - @Override - public List fieldValueTypeInformations( - Class targetClass, Schema schema) { - return AvroUtils.getFieldTypes(targetClass, schema); - } - - @Override - public SchemaUserTypeCreator schemaTypeCreator(Class targetClass, Schema schema) { - return AvroUtils.getCreator(targetClass, schema); - } -} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/io/Providers.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/io/Providers.java index 35b89bb701800..ed3abcd1ba836 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/io/Providers.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/io/Providers.java @@ -40,27 +40,12 @@ private Providers() {} public static Map loadProviders(Class klass) { Map providers = new HashMap<>(); for (T provider : ServiceLoader.load(klass)) { - // Avro provider is treated as a special case since two Avro providers may want to be loaded - - // from "core" (deprecated) and from "extensions/avro" (actual) - but only one must succeed. - // TODO: we won't need this check once all Avro providers from "core" will be - // removed - if (provider.identifier().equals("avro")) { - // Avro provider from "extensions/avro" must have a priority. - if (provider.getClass().getName().startsWith("org.apache.beam.sdk.extensions.avro")) { - // Load Avro provider from "extensions/avro" by any case. - providers.put(provider.identifier(), provider); - } else { - // Load Avro provider from "core" if it was not loaded from Avro extension before. 
- providers.putIfAbsent(provider.identifier(), provider); - } - } else { - checkState( - !providers.containsKey(provider.identifier()), - "Duplicate providers exist with identifier `%s` for class %s.", - provider.identifier(), - klass); - providers.put(provider.identifier(), provider); - } + checkState( + !providers.containsKey(provider.identifier()), + "Duplicate providers exist with identifier `%s` for class %s.", + provider.identifier(), + klass); + providers.put(provider.identifier(), provider); } return providers; } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/io/payloads/AvroPayloadSerializerProvider.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/io/payloads/AvroPayloadSerializerProvider.java deleted file mode 100644 index e8f99b33c0ddd..0000000000000 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/io/payloads/AvroPayloadSerializerProvider.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.sdk.schemas.io.payloads; - -import com.google.auto.service.AutoService; -import java.util.Map; -import org.apache.beam.sdk.annotations.Internal; -import org.apache.beam.sdk.schemas.Schema; -import org.apache.beam.sdk.schemas.utils.AvroUtils; - -/** - * @deprecated Avro related classes are deprecated in module beam-sdks-java-core and - * will be eventually removed. Please, migrate to a new module - * beam-sdks-java-extensions-avro by importing - * org.apache.beam.sdk.extensions.avro.schemas.io.payloads.AvroPayloadSerializerProvider - * instead of this one. - */ -@Internal -@Deprecated -@AutoService(PayloadSerializerProvider.class) -public class AvroPayloadSerializerProvider implements PayloadSerializerProvider { - @Override - public String identifier() { - return "avro"; - } - - @Override - public PayloadSerializer getSerializer(Schema schema, Map tableParams) { - return PayloadSerializer.of( - AvroUtils.getRowToAvroBytesFunction(schema), AvroUtils.getAvroBytesToRowFunction(schema)); - } -} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/ErrorHandling.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/ErrorHandling.java new file mode 100644 index 0000000000000..7fa29708c9ff3 --- /dev/null +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/ErrorHandling.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.schemas.transforms.providers; + +import com.google.auto.value.AutoValue; +import javax.annotation.Nullable; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.annotations.SchemaFieldDescription; +import org.apache.beam.sdk.values.Row; + +@AutoValue +public abstract class ErrorHandling { + @SchemaFieldDescription("The name of the output PCollection containing failed writes.") + public abstract String getOutput(); + + public static Builder builder() { + return new AutoValue_ErrorHandling.Builder(); + } + + @AutoValue.Builder + public abstract static class Builder { + public abstract Builder setOutput(String output); + + public abstract ErrorHandling build(); + } + + public static boolean hasOutput(@Nullable ErrorHandling errorHandling) { + return getOutputOrNull(errorHandling) != null; + } + + public static @Nullable String getOutputOrNull(@Nullable ErrorHandling errorHandling) { + return errorHandling == null ? null : errorHandling.getOutput(); + } + + public static Schema errorSchema(Schema inputSchema) { + return Schema.of( + Schema.Field.of("failed_row", Schema.FieldType.row(inputSchema)), + Schema.Field.of("error_message", Schema.FieldType.STRING)); + } + + public static Schema errorSchemaBytes() { + return Schema.of( + Schema.Field.of("failed_row", Schema.FieldType.BYTES), + Schema.Field.of("error_message", Schema.FieldType.STRING)); + } + + @SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) + }) + public static Row errorRecord(Schema errorSchema, Row inputRow, Throwable th) { + return Row.withSchema(errorSchema) + .withFieldValue("failed_row", inputRow) + .withFieldValue("error_message", th.getMessage()) + .build(); + } + + @SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) + }) + public static Row errorRecord(Schema errorSchema, byte[] inputBytes, Throwable th) { + return Row.withSchema(errorSchema) + .withFieldValue("failed_row", inputBytes) + .withFieldValue("error_message", th.getMessage()) + .build(); + } +} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/JavaExplodeTransformProvider.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/JavaExplodeTransformProvider.java new file mode 100644 index 0000000000000..48ce5e33d9fac --- /dev/null +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/JavaExplodeTransformProvider.java @@ -0,0 +1,225 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
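(Before this next file: the ErrorHandling helpers above compose as in the following sketch, where inputSchema, inputRow, and the exception are illustrative.)

    Schema errorSchema = ErrorHandling.errorSchema(inputSchema);
    Row errorRow =
        ErrorHandling.errorRecord(errorSchema, inputRow, new RuntimeException("parse failed"));
    // errorRow has two fields: "failed_row" holding the original Row, and
    // "error_message" holding "parse failed".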
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.schemas.transforms.providers; + +import com.google.auto.service.AutoService; +import com.google.auto.value.AutoValue; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import javax.annotation.Nullable; +import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.annotations.DefaultSchema; +import org.apache.beam.sdk.schemas.transforms.SchemaTransform; +import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; +import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionRowTuple; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.primitives.Booleans; + +/** + * An implementation of {@link TypedSchemaTransformProvider} for Explode. + * + *
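+ * <p>For illustration, a configuration that flattens an iterable field "items" into one
+ * output row per element could be built as follows (the field name is hypothetical):
+ *
+ * <pre>{@code
+ * JavaExplodeTransformProvider.Configuration config =
+ *     JavaExplodeTransformProvider.Configuration.builder()
+ *         .setFields(Collections.singletonList("items"))
+ *         .setCrossProduct(false)
+ *         .build();
+ * }</pre>
+ *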

Internal only: This class is actively being worked on, and it will likely change. We + * provide no backwards compatibility guarantees, and it should not be implemented outside the Beam + * repository. + */ +@SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) +}) +@AutoService(SchemaTransformProvider.class) +public class JavaExplodeTransformProvider + extends TypedSchemaTransformProvider { + protected static final String INPUT_ROWS_TAG = "input"; + protected static final String OUTPUT_ROWS_TAG = "output"; + + @Override + protected Class configurationClass() { + return Configuration.class; + } + + @Override + protected SchemaTransform from(Configuration configuration) { + return new ExplodeTransform(configuration); + } + + @Override + public String identifier() { + return "beam:schematransform:org.apache.beam:yaml:explode:v1"; + } + + @Override + public List inputCollectionNames() { + return Collections.singletonList(INPUT_ROWS_TAG); + } + + @Override + public List outputCollectionNames() { + return Collections.singletonList(OUTPUT_ROWS_TAG); + } + + @DefaultSchema(AutoValueSchema.class) + @AutoValue + public abstract static class Configuration { + @Nullable + public abstract List getFields(); + + @Nullable + public abstract Boolean getCrossProduct(); + + public static Builder builder() { + return new AutoValue_JavaExplodeTransformProvider_Configuration.Builder(); + } + + @AutoValue.Builder + public abstract static class Builder { + + public abstract Builder setFields(List fields); + + public abstract Builder setCrossProduct(@Nullable Boolean append); + + public abstract Configuration build(); + } + } + + /** A {@link SchemaTransform} for Explode. */ + protected static class ExplodeTransform extends SchemaTransform { + + private final Configuration configuration; + + ExplodeTransform(Configuration configuration) { + this.configuration = configuration; + } + + @Override + public PCollectionRowTuple expand(PCollectionRowTuple input) { + Schema inputSchema = input.get(INPUT_ROWS_TAG).getSchema(); + Schema.Builder outputSchemaBuilder = new Schema.Builder(); + for (Schema.Field field : inputSchema.getFields()) { + if (configuration.getFields().contains(field.getName())) { + if (field.getType().getCollectionElementType() == null) { + throw new IllegalArgumentException( + String.format( + "Exploded field %s must be an iterable type, got %s.", + field.getName(), field.getType())); + } else { + outputSchemaBuilder = + outputSchemaBuilder.addField( + field.getName(), field.getType().getCollectionElementType()); + } + } else { + outputSchemaBuilder = outputSchemaBuilder.addField(field); + } + } + Schema outputSchema = outputSchemaBuilder.build(); + + PCollection result = + input + .get(INPUT_ROWS_TAG) + .apply( + "Explode", + ParDo.of( + createDoFn( + configuration.getFields(), + configuration.getCrossProduct(), + outputSchema))); + result.setRowSchema(outputSchema); + + return PCollectionRowTuple.of(OUTPUT_ROWS_TAG, result); + } + + private static DoFn createDoFn( + List fields, Boolean crossProductObj, Schema outputSchema) { + boolean crossProduct; + if (crossProductObj == null) { + if (fields.size() > 1) { + throw new IllegalArgumentException( + "boolean cross product parameter required to explode more than one field"); + } + crossProduct = false; + } else { + crossProduct = crossProductObj; + } + int numFields = outputSchema.getFields().size(); + boolean[] toExplode = + Booleans.toArray( + IntStream.range(0, numFields) + .mapToObj(index -> 
fields.contains(outputSchema.getField(index).getName())) + .collect(Collectors.toList())); + if (crossProduct) { + return new DoFn() { + @ProcessElement + public void processElement(@Element Row inputRow, OutputReceiver out) { + emitCrossProduct(inputRow, 0, new Object[numFields], out); + } + + private void emitCrossProduct( + Row inputRow, int index, Object[] current, OutputReceiver out) { + if (index == numFields) { + out.output(Row.withSchema(outputSchema).attachValues(ImmutableList.copyOf(current))); + } else if (toExplode[index]) { + for (Object value : inputRow.getIterable(index)) { + current[index] = value; + emitCrossProduct(inputRow, index + 1, current, out); + } + } else { + current[index] = inputRow.getValue(index); + emitCrossProduct(inputRow, index + 1, current, out); + } + } + }; + } else { + return new DoFn() { + @ProcessElement + public void processElement(@Element Row inputRow, OutputReceiver out) { + @SuppressWarnings("rawtypes") + Iterator[] iterators = new Iterator[numFields]; + for (int i = 0; i < numFields; i++) { + if (toExplode[i]) { + iterators[i] = inputRow.getIterable(i).iterator(); + } + } + while (IntStream.range(0, numFields) + .anyMatch(index -> toExplode[index] && iterators[index].hasNext())) { + Row.Builder builder = Row.withSchema(outputSchema); + for (int i = 0; i < numFields; i++) { + if (toExplode[i]) { + if (iterators[i].hasNext()) { + builder.addValue(iterators[i].next()); + } else { + builder.addValue(null); + } + } else { + builder.addValue(inputRow.getValue(i)); + } + } + out.output(builder.build()); + } + } + }; + } + } + } +} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/JavaFilterTransformProvider.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/JavaFilterTransformProvider.java new file mode 100644 index 0000000000000..4ae8d2e41b303 --- /dev/null +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/JavaFilterTransformProvider.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.schemas.transforms.providers; + +import com.google.auto.service.AutoService; +import com.google.auto.value.AutoValue; +import java.net.MalformedURLException; +import java.util.Collections; +import java.util.List; +import javax.annotation.Nullable; +import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.annotations.DefaultSchema; +import org.apache.beam.sdk.schemas.transforms.SchemaTransform; +import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; +import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.values.PCollectionRowTuple; +import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.sdk.values.TupleTagList; + +/** + * An implementation of {@link TypedSchemaTransformProvider} for Filter for the java language. + * + *
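+ * <p>For illustration, a configuration that keeps rows whose "count" field is positive and
+ * routes failing rows to an "errors" output could look like this (names are hypothetical):
+ *
+ * <pre>{@code
+ * JavaFilterTransformProvider.Configuration config =
+ *     JavaFilterTransformProvider.Configuration.builder()
+ *         .setLanguage("java")
+ *         .setKeep(JavaRowUdf.Configuration.builder().setExpression("count > 0").build())
+ *         .setErrorHandling(ErrorHandling.builder().setOutput("errors").build())
+ *         .build();
+ * }</pre>
+ *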

Internal only: This class is actively being worked on, and it will likely change. We + * provide no backwards compatibility guarantees, and it should not be implemented outside the Beam + * repository. + */ +@SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) +}) +@AutoService(SchemaTransformProvider.class) +public class JavaFilterTransformProvider + extends TypedSchemaTransformProvider { + protected static final String INPUT_ROWS_TAG = "input"; + protected static final String OUTPUT_ROWS_TAG = "output"; + + @Override + protected Class configurationClass() { + return Configuration.class; + } + + @Override + protected SchemaTransform from(Configuration configuration) { + return new JavaFilterTransform(configuration); + } + + @Override + public String identifier() { + return "beam:schematransform:org.apache.beam:yaml:filter-java:v1"; + } + + @Override + public List inputCollectionNames() { + return Collections.singletonList(INPUT_ROWS_TAG); + } + + @Override + public List outputCollectionNames() { + return Collections.singletonList(OUTPUT_ROWS_TAG); + } + + @DefaultSchema(AutoValueSchema.class) + @AutoValue + public abstract static class Configuration { + @Nullable + public abstract String getLanguage(); + + public abstract JavaRowUdf.Configuration getKeep(); + + @Nullable + public abstract ErrorHandling getErrorHandling(); + + public static Builder builder() { + return new AutoValue_JavaFilterTransformProvider_Configuration.Builder(); + } + + @AutoValue.Builder + public abstract static class Builder { + + public abstract Builder setLanguage(String language); + + public abstract Builder setKeep(JavaRowUdf.Configuration keep); + + public abstract Builder setErrorHandling(ErrorHandling errorHandling); + + public abstract Configuration build(); + } + } + + /** A {@link SchemaTransform} for Filter-java. 
*/ + protected static class JavaFilterTransform extends SchemaTransform { + + private final Configuration configuration; + + JavaFilterTransform(Configuration configuration) { + this.configuration = configuration; + } + + @Override + public PCollectionRowTuple expand(PCollectionRowTuple input) { + Schema inputSchema = input.get(INPUT_ROWS_TAG).getSchema(); + JavaRowUdf keepFn; + try { + keepFn = new JavaRowUdf(this.configuration.getKeep(), inputSchema); + } catch (MalformedURLException + | ReflectiveOperationException + | StringCompiler.CompileException exn) { + throw new RuntimeException(exn); + } + if (!keepFn.getOutputType().withNullable(false).equals(Schema.FieldType.BOOLEAN)) { + throw new RuntimeException( + String.format( + "KeepFn %s must return a boolean, but returns %s instead.", + this.configuration.getKeep(), keepFn.getOutputType())); + } + boolean handleErrors = ErrorHandling.hasOutput(configuration.getErrorHandling()); + Schema errorSchema = ErrorHandling.errorSchema(inputSchema); + + PCollectionTuple pcolls = + input + .get(INPUT_ROWS_TAG) + .apply( + "Filter", + ParDo.of(createDoFn(keepFn, errorSchema, handleErrors)) + .withOutputTags(filteredValues, TupleTagList.of(errorValues))); + pcolls.get(filteredValues).setRowSchema(inputSchema); + pcolls.get(errorValues).setRowSchema(errorSchema); + + PCollectionRowTuple result = + PCollectionRowTuple.of(OUTPUT_ROWS_TAG, pcolls.get(filteredValues)); + if (handleErrors) { + result = result.and(configuration.getErrorHandling().getOutput(), pcolls.get(errorValues)); + } + return result; + } + + private static final TupleTag filteredValues = new TupleTag() {}; + private static final TupleTag errorValues = new TupleTag() {}; + + private static DoFn createDoFn( + JavaRowUdf keepFn, Schema errorSchema, boolean handleErrors) { + return new DoFn() { + @ProcessElement + public void processElement(@Element Row inputRow, MultiOutputReceiver out) { + boolean keep = false; + try { + keep = (boolean) keepFn.getFunction().apply(inputRow); + } catch (Exception exn) { + if (handleErrors) { + out.get(errorValues).output(ErrorHandling.errorRecord(errorSchema, inputRow, exn)); + } else { + throw new RuntimeException(exn); + } + } + if (keep) { + out.get(filteredValues).output(inputRow); + } + } + }; + } + } +} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/JavaMapToFieldsTransformProvider.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/JavaMapToFieldsTransformProvider.java new file mode 100644 index 0000000000000..2e2042aef05d7 --- /dev/null +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/JavaMapToFieldsTransformProvider.java @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.schemas.transforms.providers; + +import com.google.auto.service.AutoService; +import com.google.auto.value.AutoValue; +import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import javax.annotation.Nullable; +import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.annotations.DefaultSchema; +import org.apache.beam.sdk.schemas.transforms.SchemaTransform; +import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; +import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.values.PCollectionRowTuple; +import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.sdk.values.TupleTagList; + +/** + * An implementation of {@link TypedSchemaTransformProvider} for MapToFields for the java language. + * + *
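+ * <p>For illustration, a configuration that keeps all input fields and appends a computed
+ * "total" field could look like this (field names are hypothetical):
+ *
+ * <pre>{@code
+ * JavaMapToFieldsTransformProvider.Configuration config =
+ *     JavaMapToFieldsTransformProvider.Configuration.builder()
+ *         .setLanguage("java")
+ *         .setAppend(true)
+ *         .setFields(ImmutableMap.of(
+ *             "total",
+ *             JavaRowUdf.Configuration.builder().setExpression("price * count").build()))
+ *         .build();
+ * }</pre>
+ *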

Internal only: This class is actively being worked on, and it will likely change. We + * provide no backwards compatibility guarantees, and it should not be implemented outside the Beam + * repository. + */ +@SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) +}) +@AutoService(SchemaTransformProvider.class) +public class JavaMapToFieldsTransformProvider + extends TypedSchemaTransformProvider { + protected static final String INPUT_ROWS_TAG = "input"; + protected static final String OUTPUT_ROWS_TAG = "output"; + + @Override + protected Class configurationClass() { + return Configuration.class; + } + + @Override + protected SchemaTransform from(Configuration configuration) { + return new JavaMapToFieldsTransform(configuration); + } + + @Override + public String identifier() { + return "beam:schematransform:org.apache.beam:yaml:map_to_fields-java:v1"; + } + + @Override + public List inputCollectionNames() { + return Collections.singletonList(INPUT_ROWS_TAG); + } + + @Override + public List outputCollectionNames() { + return Collections.singletonList(OUTPUT_ROWS_TAG); + } + + @DefaultSchema(AutoValueSchema.class) + @AutoValue + public abstract static class Configuration { + @Nullable + public abstract String getLanguage(); + + @Nullable + public abstract Boolean getAppend(); + + @Nullable + public abstract List getDrop(); + + public abstract Map getFields(); + + @Nullable + public abstract ErrorHandling getErrorHandling(); + + public static Builder builder() { + return new AutoValue_JavaMapToFieldsTransformProvider_Configuration.Builder(); + } + + @AutoValue.Builder + public abstract static class Builder { + + public abstract Builder setLanguage(String language); + + public abstract Builder setAppend(Boolean append); + + public abstract Builder setDrop(List drop); + + public abstract Builder setFields(Map fields); + + public abstract Builder setErrorHandling(ErrorHandling errorHandling); + + public abstract Configuration build(); + } + } + + /** A {@link SchemaTransform} for MapToFields-java. */ + protected static class JavaMapToFieldsTransform extends SchemaTransform { + + private final Configuration configuration; + + JavaMapToFieldsTransform(Configuration configuration) { + this.configuration = configuration; + } + + @Override + public PCollectionRowTuple expand(PCollectionRowTuple input) { + Schema inputSchema = input.get(INPUT_ROWS_TAG).getSchema(); + Schema.Builder outputSchemaBuilder = new Schema.Builder(); + // TODO(yaml): Consider allowing the full java schema naming syntax + // (perhaps as a different dialect/language). + boolean append = configuration.getAppend() != null && configuration.getAppend(); + List toDrop = + configuration.getDrop() == null ? 
Collections.emptyList() : configuration.getDrop(); + List udfs = new ArrayList<>(); + if (append) { + for (Schema.Field field : inputSchema.getFields()) { + if (!toDrop.contains(field.getName())) { + try { + udfs.add( + new JavaRowUdf( + JavaRowUdf.Configuration.builder().setExpression(field.getName()).build(), + inputSchema)); + } catch (MalformedURLException + | ReflectiveOperationException + | StringCompiler.CompileException exn) { + throw new RuntimeException(exn); + } + outputSchemaBuilder = outputSchemaBuilder.addField(field); + } + } + } + for (Map.Entry entry : + configuration.getFields().entrySet()) { + if (!"java".equals(configuration.getLanguage())) { + String expr = entry.getValue().getExpression(); + if (expr == null || !inputSchema.hasField(expr)) { + throw new IllegalArgumentException( + "Unknown field or missing language specification for '" + entry.getKey() + "'"); + } + } + try { + JavaRowUdf udf = new JavaRowUdf(entry.getValue(), inputSchema); + udfs.add(udf); + outputSchemaBuilder = outputSchemaBuilder.addField(entry.getKey(), udf.getOutputType()); + } catch (MalformedURLException + | ReflectiveOperationException + | StringCompiler.CompileException exn) { + throw new RuntimeException(exn); + } + } + Schema outputSchema = outputSchemaBuilder.build(); + boolean handleErrors = ErrorHandling.hasOutput(configuration.getErrorHandling()); + Schema errorSchema = ErrorHandling.errorSchema(inputSchema); + + PCollectionTuple pcolls = + input + .get(INPUT_ROWS_TAG) + .apply( + "MapToFields", + ParDo.of(createDoFn(udfs, outputSchema, errorSchema, handleErrors)) + .withOutputTags(mappedValues, TupleTagList.of(errorValues))); + pcolls.get(mappedValues).setRowSchema(outputSchema); + pcolls.get(errorValues).setRowSchema(errorSchema); + + PCollectionRowTuple result = + PCollectionRowTuple.of(OUTPUT_ROWS_TAG, pcolls.get(mappedValues)); + if (handleErrors) { + result = result.and(configuration.getErrorHandling().getOutput(), pcolls.get(errorValues)); + } + return result; + } + + private static final TupleTag mappedValues = new TupleTag() {}; + private static final TupleTag errorValues = new TupleTag() {}; + + private static DoFn createDoFn( + List udfs, Schema outputSchema, Schema errorSchema, boolean handleErrors) { + return new DoFn() { + @ProcessElement + public void processElement(@Element Row inputRow, MultiOutputReceiver out) { + Row outputRow; + try { + Row.Builder builder = Row.withSchema(outputSchema); + for (JavaRowUdf udf : udfs) { + builder.addValue(udf.getFunction().apply(inputRow)); + } + outputRow = builder.build(); + } catch (Exception exn) { + if (handleErrors) { + out.get(errorValues).output(ErrorHandling.errorRecord(errorSchema, inputRow, exn)); + outputRow = null; + } else { + throw new RuntimeException(exn); + } + } + if (outputRow != null) { + out.get(mappedValues).output(outputRow); + } + } + }; + } + } +} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/JavaRowUdf.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/JavaRowUdf.java new file mode 100644 index 0000000000000..2ec0a9a60cd6c --- /dev/null +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/JavaRowUdf.java @@ -0,0 +1,346 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.schemas.transforms.providers; + +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; + +import com.google.auto.value.AutoValue; +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.lang.reflect.Modifier; +import java.lang.reflect.Type; +import java.math.BigDecimal; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLClassLoader; +import java.nio.channels.FileChannel; +import java.nio.channels.ReadableByteChannel; +import java.nio.file.StandardOpenOption; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import javax.annotation.Nullable; +import org.apache.beam.sdk.io.FileSystems; +import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.FieldValueTypeInformation; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.annotations.DefaultSchema; +import org.apache.beam.sdk.schemas.annotations.SchemaFieldDescription; +import org.apache.beam.sdk.schemas.utils.StaticSchemaInference; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TypeDescriptor; +import org.apache.beam.sdk.values.TypeDescriptors; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.ByteStreams; + +public class JavaRowUdf implements Serializable { + private final Configuration config; + private final Schema inputSchema; + private final Schema.FieldType outputType; + + // Transient so we don't have to worry about issues serializing these dynamically created classes. + // While this is lazily computed, it is always computed on class construction, so any errors + // should still be caught at construction time, and lazily re-computed before any use. 
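+ // Put differently: the compiled Function itself is not java-serializable, so after
+ // deserialization on a worker, getFunction() lazily recompiles it from the serializable
+ // `config` and `inputSchema` fields below.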
+ @SuppressFBWarnings("SE_TRANSIENT_FIELD_NOT_RESTORED") + private transient Function function; + + // Find or implement the inverse of StaticSchemaInference.fieldFromType + @DefaultSchema(AutoValueSchema.class) + @AutoValue + public abstract static class Configuration implements Serializable { + @SchemaFieldDescription("Source code of a java expression in terms of the schema fields.") + @Nullable + public abstract String getExpression(); + + @SchemaFieldDescription( + "Source code of a public class implementing Function for some schema-compatible T.") + @Nullable + public abstract String getCallable(); + + @SchemaFieldDescription("Path to a jar file implementing the function referenced in name.") + @Nullable + public abstract String getPath(); + + @SchemaFieldDescription( + "Fully qualified name of either a class implementing Function (e.g. com.pkg.MyFunction), " + + "or a method taking a single Row argument (e.g. com.pkg.MyClass::methodName). " + + "If a method is passed, it must either be static or belong to a class with a public nullary constructor.") + @Nullable + public abstract String getName(); + + public void validate() { + checkArgument( + Strings.isNullOrEmpty(getPath()) || !Strings.isNullOrEmpty(getName()), + "Specifying a path only allows if a name is provided."); + int totalArgs = + (Strings.isNullOrEmpty(getExpression()) ? 0 : 1) + + (Strings.isNullOrEmpty(getCallable()) ? 0 : 1) + + (Strings.isNullOrEmpty(getName()) ? 0 : 1); + checkArgument( + totalArgs == 1, "Exactly one of expression, callable, or name must be provided."); + } + + public static Configuration.Builder builder() { + return new AutoValue_JavaRowUdf_Configuration.Builder(); + } + + @AutoValue.Builder + public abstract static class Builder { + public abstract Configuration.Builder setExpression(String expression); + + public abstract Configuration.Builder setCallable(String callable); + + public abstract Configuration.Builder setPath(String path); + + public abstract Configuration.Builder setName(String name); + + public abstract Configuration build(); + } + } + + public JavaRowUdf(Configuration config, Schema inputSchema) + throws MalformedURLException, ReflectiveOperationException, StringCompiler.CompileException { + this.config = config; + this.inputSchema = inputSchema; + FunctionAndType functionAndType = createFunction(config, inputSchema); + this.outputType = functionAndType.outputType; + this.function = functionAndType.function; + } + + public Schema.FieldType getOutputType() { + return outputType; + } + + public Function getFunction() + throws MalformedURLException, ReflectiveOperationException, StringCompiler.CompileException { + if (function == null) { + FunctionAndType functionAndType = createFunction(config, inputSchema); + assert functionAndType.outputType.equals(outputType); + function = functionAndType.function; + } + return function; + } + + private static class FunctionAndType { + public final Schema.FieldType outputType; + public final Function function; + + public FunctionAndType(Function function) { + this(outputOf(function), function); + } + + public FunctionAndType(Type outputType, Function function) { + this(TypeDescriptor.of(outputType), function); + } + + public FunctionAndType(TypeDescriptor outputType, Function function) { + this( + StaticSchemaInference.fieldFromType(outputType, new EmptyFieldValueTypeSupplier()), + function); + } + + public FunctionAndType(Schema.FieldType outputType, Function function) { + this.outputType = outputType; + this.function = function; + } + + public 
static TypeDescriptor outputOf(Function fn) { + return TypeDescriptors.extractFromTypeParameters( + fn, + Function.class, + new TypeDescriptors.TypeVariableExtractor, OutputT>() {}); + } + } + + @SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) + }) + private static FunctionAndType createFunction(Configuration config, Schema inputSchema) + throws ReflectiveOperationException, StringCompiler.CompileException, MalformedURLException { + config.validate(); + if (!Strings.isNullOrEmpty(config.getExpression())) { + return createFunctionFromExpression(config.getExpression(), inputSchema); + } else if (!Strings.isNullOrEmpty(config.getCallable())) { + return createFuctionFromCallable(config.getCallable()); + } else if (!Strings.isNullOrEmpty(config.getName())) { + return createFunctionFromName(config.getName(), config.getPath()); + } else { + throw new UnsupportedOperationException(config.toString()); + } + } + + private static FunctionAndType createFunctionFromExpression(String expression, Schema inputSchema) + throws StringCompiler.CompileException, ReflectiveOperationException { + if (inputSchema.hasField(expression)) { + final int ix = inputSchema.indexOf(expression); + return new FunctionAndType( + inputSchema.getField(expression).getType(), (Row row) -> row.getValue(ix)); + } else { + Map fieldTypes = new HashMap<>(); + for (Schema.Field field : inputSchema.getFields()) { + if (expression.indexOf(field.getName()) != -1) { + fieldTypes.put(field.getName(), typeFromFieldType(field.getType())); + } + } + Type type = StringCompiler.guessExpressionType(expression, fieldTypes); + StringBuilder source = new StringBuilder(); + source.append("import java.util.function.Function;\n"); + source.append("import " + Row.class.getTypeName() + ";\n"); + source.append("public class Eval implements Function {\n"); + source.append(" public Object apply(Row __row__) {\n"); + for (Map.Entry fieldEntry : fieldTypes.entrySet()) { + source.append( + String.format( + " %s %s = (%s) __row__.getValue(%s);%n", + fieldEntry.getValue().getTypeName(), + fieldEntry.getKey(), + fieldEntry.getValue().getTypeName(), + inputSchema.indexOf(fieldEntry.getKey()))); + } + source.append(" return " + expression + ";\n"); + source.append(" }\n"); + source.append("}\n"); + return new FunctionAndType( + type, (Function) StringCompiler.getInstance("Eval", source.toString())); + } + } + + @SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) + }) + private static FunctionAndType createFuctionFromCallable(String callable) + throws StringCompiler.CompileException, ReflectiveOperationException { + Matcher matcher = + Pattern.compile("\\bpublic\\s+class\\s+(\\S+)", Pattern.MULTILINE).matcher(callable); + Preconditions.checkArgument(matcher.find(), "No public class defined in callable source."); + return new FunctionAndType( + (Function) StringCompiler.getInstance(matcher.group(1), callable.toString())); + } + + @SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) + }) + private static FunctionAndType createFunctionFromName(String name, String path) + throws ReflectiveOperationException, MalformedURLException { + if (path != null && !new File(path).exists()) { + try (ReadableByteChannel inChannel = + FileSystems.open(FileSystems.matchNewResource(path, false))) { + File tmpJar = File.createTempFile("map-to-fields-" + name, ".jar"); + try (FileChannel outChannel = FileChannel.open(tmpJar.toPath(), StandardOpenOption.WRITE)) { + 
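+ // Copy the (possibly remote) jar into the local temp file, since the
+ // URLClassLoader constructed below can only load from a local file:// URL.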
ByteStreams.copy(inChannel, outChannel); + } + path = tmpJar.getPath(); + } catch (IOException exn) { + throw new RuntimeException(exn); + } + } + ClassLoader classLoader = + path == null + ? ClassLoader.getSystemClassLoader() + : new URLClassLoader( + new URL[] {new URL("file://" + path)}, ClassLoader.getSystemClassLoader()); + String className, methodName = null; + if (name.indexOf("::") == -1) { + className = name; + methodName = null; + } else { + String[] parts = name.split("::", 2); + className = parts[0]; + methodName = parts[1]; + } + if (methodName == null) { + return new FunctionAndType( + (Function) + classLoader.loadClass(className).getDeclaredConstructor().newInstance()); + } else { + Class clazz = classLoader.loadClass(className); + Method method = clazz.getMethod(methodName, Row.class); + Object base = + Modifier.isStatic(method.getModifiers()) + ? null + : clazz.getDeclaredConstructor().newInstance(); + return new FunctionAndType( + method.getGenericReturnType(), + (Row row) -> { + try { + return method.invoke(base, row); + } catch (IllegalAccessException | InvocationTargetException exn) { + throw new RuntimeException(exn); + } + }); + } + } + + private static class EmptyFieldValueTypeSupplier + implements org.apache.beam.sdk.schemas.utils.FieldValueTypeSupplier { + @Override + public List get(Class clazz) { + return Collections.emptyList(); + } + } + + private static final Map NULLABLE_PRIMITIVES = + ImmutableMap.builder() + .put(Schema.TypeName.BYTE, Byte.class) + .put(Schema.TypeName.INT16, Short.class) + .put(Schema.TypeName.INT32, Integer.class) + .put(Schema.TypeName.INT64, Long.class) + .put(Schema.TypeName.FLOAT, Float.class) + .put(Schema.TypeName.DOUBLE, Double.class) + .put(Schema.TypeName.BOOLEAN, Boolean.class) + .put(Schema.TypeName.BYTES, byte[].class) + .put(Schema.TypeName.STRING, String.class) + .put(Schema.TypeName.DECIMAL, BigDecimal.class) + .build(); + + private static final Map NON_NULLABLE_PRIMITIVES = + ImmutableMap.builder() + .put(Schema.TypeName.BYTE, byte.class) + .put(Schema.TypeName.INT16, short.class) + .put(Schema.TypeName.INT32, int.class) + .put(Schema.TypeName.INT64, long.class) + .put(Schema.TypeName.FLOAT, float.class) + .put(Schema.TypeName.DOUBLE, double.class) + .put(Schema.TypeName.BOOLEAN, boolean.class) + .put(Schema.TypeName.BYTES, byte[].class) + .put(Schema.TypeName.STRING, String.class) + .put(Schema.TypeName.DECIMAL, BigDecimal.class) + .build(); + + private static Type typeFromFieldType(Schema.FieldType fieldType) { + Map primitivesMap = + fieldType.getNullable() ? NULLABLE_PRIMITIVES : NON_NULLABLE_PRIMITIVES; + if (primitivesMap.containsKey(fieldType.getTypeName())) { + return primitivesMap.get(fieldType.getTypeName()); + } else if (fieldType.getRowSchema() != null) { + return Row.class; + } else { + throw new UnsupportedOperationException(fieldType.toString()); + } + } +} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/StringCompiler.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/StringCompiler.java new file mode 100644 index 0000000000000..04730dce80c00 --- /dev/null +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/StringCompiler.java @@ -0,0 +1,262 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.schemas.transforms.providers; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.lang.reflect.Method; +import java.lang.reflect.Type; +import java.net.URI; +import java.net.URL; +import java.net.URLClassLoader; +import java.security.AccessController; +import java.security.PrivilegedAction; +import java.security.SecureClassLoader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.function.Supplier; +import java.util.jar.Attributes; +import java.util.jar.Manifest; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; +import javax.tools.Diagnostic; +import javax.tools.DiagnosticCollector; +import javax.tools.FileObject; +import javax.tools.ForwardingJavaFileManager; +import javax.tools.JavaCompiler; +import javax.tools.JavaFileObject; +import javax.tools.SimpleJavaFileObject; +import javax.tools.StandardJavaFileManager; +import javax.tools.ToolProvider; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Suppliers; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; + +public class StringCompiler { + @SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) + }) + private static final Supplier classpathSupplier = + Suppliers.memoize( + () -> { + List cp = new ArrayList<>(); + cp.add(System.getProperty("java.class.path")); + // Javac doesn't properly handle manifest classpath spec. + ClassLoader cl = StringCompiler.class.getClassLoader(); + if (cl == null) { + cl = ClassLoader.getSystemClassLoader(); + } + if (cl instanceof URLClassLoader) { + for (URL url : ((URLClassLoader) cl).getURLs()) { + File file = new File(url.getFile()); + if (file.exists() && !file.isDirectory()) { + try (ZipFile zipFile = new ZipFile(new File(url.getFile()))) { + ZipEntry manifestEntry = zipFile.getEntry("META-INF/MANIFEST.MF"); + if (manifestEntry != null) { + Manifest manifest = new Manifest(zipFile.getInputStream(manifestEntry)); + cp.add(manifest.getMainAttributes().getValue(Attributes.Name.CLASS_PATH)); + } + } catch (IOException exn) { + throw new RuntimeException(exn); + } + } + } + } + return String.join(System.getProperty("path.separator"), cp); + }); + + public static class CompileException extends Exception { + private final DiagnosticCollector diagnostics; + + public CompileException(DiagnosticCollector diagnostics) { + super(diagnostics.getDiagnostics().toString()); + this.diagnostics = diagnostics; + } + + public DiagnosticCollector getDiagnostics() { + return diagnostics; + } + } + + // TODO(XXX): swap args? 
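+ // Usage sketch (the class name and source below are illustrative, not part of this API):
+ //
+ //   Class<?> clazz = StringCompiler.getClass(
+ //       "Greeter", "public class Greeter { public String greet() { return \"hi\"; } }");
+ //   Object greeter = StringCompiler.getInstance("Greeter", /* same source */ "...");
+ //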
+ public static Class getClass(String name, String source) + throws CompileException, ClassNotFoundException { + JavaCompiler compiler = ToolProvider.getSystemJavaCompiler(); + InMemoryFileManager fileManager = + new InMemoryFileManager(compiler.getStandardFileManager(null, null, null)); + DiagnosticCollector diagnostics = new DiagnosticCollector<>(); + JavaCompiler.CompilationTask task = + compiler.getTask( + null, + fileManager, + diagnostics, + ImmutableList.of("-classpath", classpathSupplier.get()), + null, + Collections.singletonList(new InMemoryFileManager.InputJavaFileObject(name, source))); + boolean result = task.call(); + if (!result) { + throw new CompileException(diagnostics); + } else { + return (Class) fileManager.getClassLoader().loadClass(name); + } + } + + public static Object getInstance(String name, String source) + throws CompileException, ReflectiveOperationException { + return getClass(name, source).getDeclaredConstructor().newInstance(); + } + + public static Type guessExpressionType(String expression, Map inputTypes) + throws StringCompiler.CompileException, ClassNotFoundException { + + String expectedError = "cannot be converted to __TypeGuesserHelper__.BadReturnType"; + + try { + StringCompiler.getClass( + "__TypeGuesserHelper__", typeGuesserSource(expression, inputTypes, "BadReturnType")); + // Must have returned null. + return Void.class; + } catch (StringCompiler.CompileException exn) { + // Use the error message to derive the actual type. + for (Diagnostic d : exn.getDiagnostics().getDiagnostics()) { + String msg = d.getMessage(Locale.ROOT); + int expectedErrorIndex = msg.indexOf(expectedError); + if (expectedErrorIndex != -1) { + String typeSource = + msg.substring( + 1 + "incompatible types: ".length() + msg.lastIndexOf('\n', expectedErrorIndex), + expectedErrorIndex); + Class clazz = + StringCompiler.getClass( + "__TypeGuesserHelper__", typeGuesserSource(expression, inputTypes, typeSource)); + for (Method method : clazz.getMethods()) { + if (method.getName().equals("method")) { + return method.getGenericReturnType(); + } + } + // We should never get here. + throw new RuntimeException("Unable to locate declared method."); + } + } + // Must have been some other error. 
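+ // No diagnostic matched the probe message, so the expression itself failed to
+ // compile; surface the original compilation error.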
+ throw exn; + } + } + + private static String typeGuesserSource( + String expression, Map inputTypes, String returnType) { + StringBuilder source = new StringBuilder(); + source.append("class __TypeGuesserHelper__ {\n"); + source.append(" private static class BadReturnType { private BadReturnType() {} }\n"); + source.append(" public static " + returnType + " method(\n"); + boolean first = true; + for (Map.Entry arg : inputTypes.entrySet()) { + if (first) { + first = false; + } else { + source.append(", "); + } + source.append(arg.getValue().getTypeName() + " " + arg.getKey()); + } + source.append(" ) {\n"); + source.append(" return " + expression + ";\n"); + source.append(" }\n"); + source.append("}\n"); + return source.toString(); + } + + private static class InMemoryFileManager + extends ForwardingJavaFileManager { + + private Map outputFileObjects = new HashMap<>(); + + public InMemoryFileManager(StandardJavaFileManager standardManager) { + super(standardManager); + } + + @Override + public JavaFileObject getJavaFileForOutput( + Location location, String className, JavaFileObject.Kind kind, FileObject sibling) { + + OutputJavaFileObject classAsBytes = new OutputJavaFileObject(className, kind); + outputFileObjects.put(className, classAsBytes); + return classAsBytes; + } + + public ClassLoader getClassLoader() { + return AccessController.doPrivileged( + (PrivilegedAction) + () -> + new SecureClassLoader() { + @Override + protected Class findClass(String name) throws ClassNotFoundException { + OutputJavaFileObject fileObject = outputFileObjects.get(name); + if (fileObject == null) { + throw new ClassNotFoundException(name); + } else { + byte[] classBytes = fileObject.getBytes(); + return defineClass(name, classBytes, 0, classBytes.length); + } + } + }); + } + + @Override + public ClassLoader getClassLoader(Location location) { + return getClassLoader(); + } + + private static class InputJavaFileObject extends SimpleJavaFileObject { + private String source; + + public InputJavaFileObject(String name, String source) { + super( + URI.create("input:///" + name.replace('.', '/') + Kind.SOURCE.extension), Kind.SOURCE); + this.source = source; + } + + @Override + public CharSequence getCharContent(boolean ignoreEncodingErrors) { + return source; + } + } + + private static class OutputJavaFileObject extends SimpleJavaFileObject { + + private ByteArrayOutputStream content = new ByteArrayOutputStream(); + + public OutputJavaFileObject(String name, Kind kind) { + super(URI.create("output:///" + name.replace('.', '/') + kind.extension), kind); + } + + public byte[] getBytes() { + return content.toByteArray(); + } + + @Override + public OutputStream openOutputStream() { + return content; + } + } + } +} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/package-info.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/package-info.java new file mode 100644 index 0000000000000..6c5d1cb7c5701 --- /dev/null +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/package-info.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * Defines transforms that work on PCollections with schemas.
+ *
+ *

For further details, see the documentation for each class in this package. + */ +@DefaultAnnotation(NonNull.class) +package org.apache.beam.sdk.schemas.transforms.providers; + +import edu.umd.cs.findbugs.annotations.DefaultAnnotation; +import org.checkerframework.checker.nullness.qual.NonNull; diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/AvroByteBuddyUtils.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/AvroByteBuddyUtils.java deleted file mode 100644 index ab17907f0b136..0000000000000 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/AvroByteBuddyUtils.java +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.sdk.schemas.utils; - -import static org.apache.beam.sdk.util.ByteBuddyUtils.getClassLoadingStrategy; - -import java.lang.reflect.Constructor; -import java.lang.reflect.InvocationTargetException; -import java.lang.reflect.Type; -import java.util.Map; -import net.bytebuddy.ByteBuddy; -import net.bytebuddy.asm.AsmVisitorWrapper; -import net.bytebuddy.description.type.TypeDescription.ForLoadedType; -import net.bytebuddy.dynamic.DynamicType; -import net.bytebuddy.implementation.MethodCall; -import net.bytebuddy.implementation.bytecode.StackManipulation; -import net.bytebuddy.implementation.bytecode.assign.TypeCasting; -import net.bytebuddy.implementation.bytecode.collection.ArrayAccess; -import net.bytebuddy.implementation.bytecode.constant.IntegerConstant; -import net.bytebuddy.implementation.bytecode.member.MethodVariableAccess; -import net.bytebuddy.jar.asm.ClassWriter; -import net.bytebuddy.matcher.ElementMatchers; -import org.apache.avro.specific.SpecificRecord; -import org.apache.beam.sdk.schemas.Schema; -import org.apache.beam.sdk.schemas.SchemaUserTypeCreator; -import org.apache.beam.sdk.schemas.utils.AvroUtils.AvroTypeConversionFactory; -import org.apache.beam.sdk.schemas.utils.ByteBuddyUtils.InjectPackageStrategy; -import org.apache.beam.sdk.schemas.utils.ByteBuddyUtils.TypeConversion; -import org.apache.beam.sdk.schemas.utils.ByteBuddyUtils.TypeConversionsFactory; -import org.apache.beam.sdk.schemas.utils.ReflectUtils.ClassWithSchema; -import org.apache.beam.sdk.util.common.ReflectHelpers; -import org.apache.beam.sdk.values.TypeDescriptor; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; - -/** - * @deprecated Avro related classes are deprecated in module beam-sdks-java-core and - * will be eventually removed. Please, migrate to a new module - * beam-sdks-java-extensions-avro by importing - * org.apache.beam.sdk.extensions.avro.schemas.utils.AvroByteBuddyUtils instead of this - * one. 
- */ -@SuppressWarnings({ - "nullness", // TODO(https://github.com/apache/beam/issues/20497) - "rawtypes" -}) -@Deprecated -class AvroByteBuddyUtils { - private static final ByteBuddy BYTE_BUDDY = new ByteBuddy(); - - // Cache the generated constructors. - private static final Map CACHED_CREATORS = - Maps.newConcurrentMap(); - - static SchemaUserTypeCreator getCreator( - Class clazz, Schema schema) { - return CACHED_CREATORS.computeIfAbsent( - ClassWithSchema.create(clazz, schema), c -> createCreator(clazz, schema)); - } - - private static SchemaUserTypeCreator createCreator(Class clazz, Schema schema) { - Constructor baseConstructor = null; - Constructor[] constructors = clazz.getDeclaredConstructors(); - for (Constructor constructor : constructors) { - // TODO: This assumes that Avro only generates one constructor with this many fields. - if (constructor.getParameterCount() == schema.getFieldCount()) { - baseConstructor = constructor; - } - } - if (baseConstructor == null) { - throw new RuntimeException("No matching constructor found for class " + clazz); - } - - // Generate a method call to create and invoke the SpecificRecord's constructor. . - MethodCall construct = MethodCall.construct(baseConstructor); - for (int i = 0; i < baseConstructor.getParameterTypes().length; ++i) { - Class baseType = baseConstructor.getParameterTypes()[i]; - construct = construct.with(readAndConvertParameter(baseType, i), baseType); - } - - try { - DynamicType.Builder builder = - BYTE_BUDDY - .with(new InjectPackageStrategy(clazz)) - .subclass(SchemaUserTypeCreator.class) - .method(ElementMatchers.named("create")) - .intercept(construct); - - return builder - .visit(new AsmVisitorWrapper.ForDeclaredMethods().writerFlags(ClassWriter.COMPUTE_FRAMES)) - .make() - .load( - ReflectHelpers.findClassLoader(clazz.getClassLoader()), - getClassLoadingStrategy(clazz)) - .getLoaded() - .getDeclaredConstructor() - .newInstance(); - } catch (InstantiationException - | IllegalAccessException - | NoSuchMethodException - | InvocationTargetException e) { - throw new RuntimeException( - "Unable to generate a getter for class " + clazz + " with schema " + schema); - } - } - - private static StackManipulation readAndConvertParameter( - Class constructorParameterType, int index) { - TypeConversionsFactory typeConversionsFactory = new AvroTypeConversionFactory(); - - // The types in the AVRO-generated constructor might be the types returned by Beam's Row class, - // so we have to convert the types used by Beam's Row class. - // We know that AVRO generates constructor parameters in the same order as fields - // in the schema, so we can just add the parameters sequentially. - TypeConversion convertType = typeConversionsFactory.createTypeConversion(true); - - // Map the AVRO-generated type to the one Beam will use. - ForLoadedType convertedType = - new ForLoadedType((Class) convertType.convert(TypeDescriptor.of(constructorParameterType))); - - // This will run inside the generated creator. Read the parameter and convert it to the - // type required by the SpecificRecord constructor. - StackManipulation readParameter = - new StackManipulation.Compound( - MethodVariableAccess.REFERENCE.loadFrom(1), - IntegerConstant.forValue(index), - ArrayAccess.REFERENCE.load(), - TypeCasting.to(convertedType)); - - // Convert to the parameter accepted by the SpecificRecord constructor. 
- return typeConversionsFactory - .createSetterConversions(readParameter) - .convert(TypeDescriptor.of(constructorParameterType)); - } -} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/AvroUtils.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/AvroUtils.java deleted file mode 100644 index cd69f139ae614..0000000000000 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/AvroUtils.java +++ /dev/null @@ -1,1396 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.sdk.schemas.utils; - -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.lang.reflect.Method; -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.stream.Collectors; -import javax.annotation.Nonnull; -import net.bytebuddy.description.type.TypeDescription.ForLoadedType; -import net.bytebuddy.implementation.bytecode.Duplication; -import net.bytebuddy.implementation.bytecode.StackManipulation; -import net.bytebuddy.implementation.bytecode.StackManipulation.Compound; -import net.bytebuddy.implementation.bytecode.TypeCreation; -import net.bytebuddy.implementation.bytecode.assign.TypeCasting; -import net.bytebuddy.implementation.bytecode.member.MethodInvocation; -import net.bytebuddy.matcher.ElementMatchers; -import org.apache.avro.AvroRuntimeException; -import org.apache.avro.Conversions; -import org.apache.avro.LogicalType; -import org.apache.avro.LogicalTypes; -import org.apache.avro.Schema.Type; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericFixed; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.GenericRecordBuilder; -import org.apache.avro.reflect.AvroIgnore; -import org.apache.avro.reflect.AvroName; -import org.apache.avro.reflect.ReflectData; -import org.apache.avro.specific.SpecificData; -import org.apache.avro.specific.SpecificRecord; -import org.apache.avro.util.Utf8; -import org.apache.beam.sdk.coders.AvroCoder; -import org.apache.beam.sdk.coders.AvroCoder.JodaTimestampConversion; -import org.apache.beam.sdk.schemas.AvroRecordSchema; -import org.apache.beam.sdk.schemas.FieldValueGetter; -import org.apache.beam.sdk.schemas.FieldValueTypeInformation; -import 
org.apache.beam.sdk.schemas.Schema; -import org.apache.beam.sdk.schemas.Schema.Field; -import org.apache.beam.sdk.schemas.Schema.FieldType; -import org.apache.beam.sdk.schemas.Schema.TypeName; -import org.apache.beam.sdk.schemas.SchemaCoder; -import org.apache.beam.sdk.schemas.SchemaUserTypeCreator; -import org.apache.beam.sdk.schemas.logicaltypes.EnumerationType; -import org.apache.beam.sdk.schemas.logicaltypes.FixedBytes; -import org.apache.beam.sdk.schemas.logicaltypes.FixedString; -import org.apache.beam.sdk.schemas.logicaltypes.OneOfType; -import org.apache.beam.sdk.schemas.logicaltypes.SqlTypes; -import org.apache.beam.sdk.schemas.logicaltypes.VariableBytes; -import org.apache.beam.sdk.schemas.logicaltypes.VariableString; -import org.apache.beam.sdk.schemas.utils.ByteBuddyUtils.ConvertType; -import org.apache.beam.sdk.schemas.utils.ByteBuddyUtils.ConvertValueForGetter; -import org.apache.beam.sdk.schemas.utils.ByteBuddyUtils.ConvertValueForSetter; -import org.apache.beam.sdk.schemas.utils.ByteBuddyUtils.TypeConversion; -import org.apache.beam.sdk.schemas.utils.ByteBuddyUtils.TypeConversionsFactory; -import org.apache.beam.sdk.transforms.SerializableFunction; -import org.apache.beam.sdk.transforms.SimpleFunction; -import org.apache.beam.sdk.values.Row; -import org.apache.beam.sdk.values.TypeDescriptor; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.CaseFormat; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; -import org.checkerframework.checker.nullness.qual.Nullable; -import org.joda.time.Days; -import org.joda.time.Duration; -import org.joda.time.Instant; -import org.joda.time.ReadableInstant; - -/** - * Utils to convert AVRO records to Beam rows. Imposes a mapping between common avro types and Beam - * portable schemas (https://s.apache.org/beam-schemas): - * - *

- *   Avro                Beam Field Type
- *   INT         <-----> INT32
- *   LONG        <-----> INT64
- *   FLOAT       <-----> FLOAT
- *   DOUBLE      <-----> DOUBLE
- *   BOOLEAN     <-----> BOOLEAN
- *   STRING      <-----> STRING
- *   BYTES       <-----> BYTES
- *               <------ LogicalType(urn="beam:logical_type:var_bytes:v1")
- *   FIXED       <-----> LogicalType(urn="beam:logical_type:fixed_bytes:v1")
- *   ARRAY       <-----> ARRAY
- *   ENUM        <-----> LogicalType(EnumerationType)
- *   MAP         <-----> MAP
- *   RECORD      <-----> ROW
- *   UNION       <-----> LogicalType(OneOfType)
- *   LogicalTypes.Date              <-----> LogicalType(DATE)
- *                                  <------ LogicalType(urn="beam:logical_type:date:v1")
- *   LogicalTypes.TimestampMillis   <-----> DATETIME
- *   LogicalTypes.Decimal           <-----> DECIMAL
- * 
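As a concrete illustration of the mapping above, a minimal sketch (a hypothetical example class; it assumes the replacement org.apache.beam.sdk.extensions.avro.schemas.utils.AvroUtils named in the deprecation note below, which exposes the same toBeamSchema entry point):

import org.apache.avro.SchemaBuilder;
import org.apache.beam.sdk.extensions.avro.schemas.utils.AvroUtils;
import org.apache.beam.sdk.schemas.Schema;

public class AvroBeamMappingExample {
  public static void main(String[] args) {
    // Exercises the INT/LONG/STRING rows of the table above.
    org.apache.avro.Schema avroSchema =
        SchemaBuilder.record("User")
            .fields()
            .requiredInt("age") // INT <-----> INT32
            .requiredLong("visits") // LONG <-----> INT64
            .requiredString("name") // STRING <-----> STRING
            .endRecord();
    // The resulting Beam schema has fields age INT32, visits INT64, name STRING.
    Schema beamSchema = AvroUtils.toBeamSchema(avroSchema);
    System.out.println(beamSchema);
  }
}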
- * - * For SQL CHAR/VARCHAR types, an Avro schema - * - *
- *   LogicalType({"type":"string","logicalType":"char","maxLength":MAX_LENGTH}) or
- *   LogicalType({"type":"string","logicalType":"varchar","maxLength":MAX_LENGTH})
- * 
- * - * is used. - * - * @deprecated Avro related classes are deprecated in module beam-sdks-java-core and - * will be eventually removed. Please, migrate to a new module - * beam-sdks-java-extensions-avro by importing - * org.apache.beam.sdk.extensions.avro.schemas.utils.AvroUtils instead of this one. - */ -@SuppressWarnings({ - "nullness", // TODO(https://github.com/apache/beam/issues/20497) - "rawtypes" -}) -@Deprecated -public class AvroUtils { - static { - // This works around a bug in the Avro library (AVRO-1891) around SpecificRecord's handling - // of DateTime types. - SpecificData.get().addLogicalTypeConversion(new JodaTimestampConversion()); - GenericData.get().addLogicalTypeConversion(new JodaTimestampConversion()); - } - - /** Unwrap an AVRO schema into the base type an whether it is nullable. */ - public static class TypeWithNullability { - final org.apache.avro.Schema type; - final boolean nullable; - - public static TypeWithNullability create(org.apache.avro.Schema avroSchema) { - return new TypeWithNullability(avroSchema); - } - - TypeWithNullability(org.apache.avro.Schema avroSchema) { - if (avroSchema.getType() == org.apache.avro.Schema.Type.UNION) { - List types = avroSchema.getTypes(); - - // optional fields in AVRO have form of: - // {"name": "foo", "type": ["null", "something"]} - - // don't need recursion because nested unions aren't supported in AVRO - List nonNullTypes = - types.stream() - .filter(x -> x.getType() != org.apache.avro.Schema.Type.NULL) - .collect(Collectors.toList()); - - if (nonNullTypes.size() == types.size() || nonNullTypes.isEmpty()) { - // union without `null` or all 'null' union, keep as is. - type = avroSchema; - nullable = false; - } else if (nonNullTypes.size() > 1) { - type = org.apache.avro.Schema.createUnion(nonNullTypes); - nullable = true; - } else { - // One non-null type. - type = nonNullTypes.get(0); - nullable = true; - } - } else { - type = avroSchema; - nullable = false; - } - } - - public Boolean isNullable() { - return nullable; - } - - public org.apache.avro.Schema getType() { - return type; - } - } - - /** Wrapper for fixed byte fields. */ - public static class FixedBytesField { - private final int size; - - private FixedBytesField(int size) { - this.size = size; - } - - /** Create a {@link FixedBytesField} with the specified size. */ - public static FixedBytesField withSize(int size) { - return new FixedBytesField(size); - } - - /** Create a {@link FixedBytesField} from a Beam {@link FieldType}. */ - public static @Nullable FixedBytesField fromBeamFieldType(FieldType fieldType) { - if (fieldType.getTypeName().isLogicalType() - && fieldType.getLogicalType().getIdentifier().equals(FixedBytes.IDENTIFIER)) { - int length = fieldType.getLogicalType(FixedBytes.class).getLength(); - return new FixedBytesField(length); - } else { - return null; - } - } - - /** Create a {@link FixedBytesField} from an AVRO type. */ - public static @Nullable FixedBytesField fromAvroType(org.apache.avro.Schema type) { - if (type.getType().equals(Type.FIXED)) { - return new FixedBytesField(type.getFixedSize()); - } else { - return null; - } - } - - /** Get the size. */ - public int getSize() { - return size; - } - - /** Convert to a Beam type. */ - public FieldType toBeamType() { - return Schema.FieldType.logicalType(FixedBytes.of(size)); - } - - /** Convert to an AVRO type. 
*/ - public org.apache.avro.Schema toAvroType(String name, String namespace) { - return org.apache.avro.Schema.createFixed(name, null, namespace, size); - } - } - - public static class AvroConvertType extends ConvertType { - public AvroConvertType(boolean returnRawType) { - super(returnRawType); - } - - @Override - protected java.lang.reflect.Type convertDefault(TypeDescriptor type) { - if (type.isSubtypeOf(TypeDescriptor.of(GenericFixed.class))) { - return byte[].class; - } else { - return super.convertDefault(type); - } - } - } - - public static class AvroConvertValueForGetter extends ConvertValueForGetter { - AvroConvertValueForGetter(StackManipulation readValue) { - super(readValue); - } - - @Override - protected TypeConversionsFactory getFactory() { - return new AvroTypeConversionFactory(); - } - - @Override - protected StackManipulation convertDefault(TypeDescriptor type) { - if (type.isSubtypeOf(TypeDescriptor.of(GenericFixed.class))) { - // Generate the following code: - // return value.bytes(); - return new Compound( - readValue, - MethodInvocation.invoke( - new ForLoadedType(GenericFixed.class) - .getDeclaredMethods() - .filter( - ElementMatchers.named("bytes") - .and(ElementMatchers.returns(new ForLoadedType(byte[].class)))) - .getOnly())); - } - return super.convertDefault(type); - } - } - - public static class AvroConvertValueForSetter extends ConvertValueForSetter { - AvroConvertValueForSetter(StackManipulation readValue) { - super(readValue); - } - - @Override - protected TypeConversionsFactory getFactory() { - return new AvroTypeConversionFactory(); - } - - @Override - protected StackManipulation convertDefault(TypeDescriptor type) { - final ForLoadedType byteArrayType = new ForLoadedType(byte[].class); - if (type.isSubtypeOf(TypeDescriptor.of(GenericFixed.class))) { - // Generate the following code: - // return new T((byte[]) value); - ForLoadedType loadedType = new ForLoadedType(type.getRawType()); - return new Compound( - TypeCreation.of(loadedType), - Duplication.SINGLE, - // Load the parameter and cast it to a byte[]. - readValue, - TypeCasting.to(byteArrayType), - // Create a new instance that wraps this byte[]. - MethodInvocation.invoke( - loadedType - .getDeclaredMethods() - .filter( - ElementMatchers.isConstructor() - .and(ElementMatchers.takesArguments(byteArrayType))) - .getOnly())); - } - return super.convertDefault(type); - } - } - - static class AvroTypeConversionFactory implements TypeConversionsFactory { - - @Override - public TypeConversion createTypeConversion(boolean returnRawTypes) { - return new AvroConvertType(returnRawTypes); - } - - @Override - public TypeConversion createGetterConversions(StackManipulation readValue) { - return new AvroConvertValueForGetter(readValue); - } - - @Override - public TypeConversion createSetterConversions(StackManipulation readValue) { - return new AvroConvertValueForSetter(readValue); - } - } - - /** Get Beam Field from avro Field. */ - public static Schema.Field toBeamField(org.apache.avro.Schema.Field field) { - TypeWithNullability nullableType = new TypeWithNullability(field.schema()); - FieldType beamFieldType = toFieldType(nullableType); - return Field.of(field.name(), beamFieldType); - } - - /** Get Avro Field from Beam Field. 
*/ - public static org.apache.avro.Schema.Field toAvroField(Schema.Field field, String namespace) { - org.apache.avro.Schema fieldSchema = - getFieldSchema(field.getType(), field.getName(), namespace); - return new org.apache.avro.Schema.Field( - field.getName(), fieldSchema, field.getDescription(), (Object) null); - } - - private AvroUtils() {} - - /** - * Converts AVRO schema to Beam row schema. - * - * @param schema schema of type RECORD - */ - public static Schema toBeamSchema(org.apache.avro.Schema schema) { - Schema.Builder builder = Schema.builder(); - - for (org.apache.avro.Schema.Field field : schema.getFields()) { - Field beamField = toBeamField(field); - if (field.doc() != null) { - beamField = beamField.withDescription(field.doc()); - } - builder.addField(beamField); - } - - return builder.build(); - } - - /** Converts a Beam Schema into an AVRO schema. */ - public static org.apache.avro.Schema toAvroSchema( - Schema beamSchema, @Nullable String name, @Nullable String namespace) { - final String schemaName = Strings.isNullOrEmpty(name) ? "topLevelRecord" : name; - final String schemaNamespace = namespace == null ? "" : namespace; - String childNamespace = - !"".equals(schemaNamespace) ? schemaNamespace + "." + schemaName : schemaName; - List fields = Lists.newArrayList(); - for (Schema.Field field : beamSchema.getFields()) { - org.apache.avro.Schema.Field recordField = toAvroField(field, childNamespace); - fields.add(recordField); - } - return org.apache.avro.Schema.createRecord(schemaName, null, schemaNamespace, false, fields); - } - - public static org.apache.avro.Schema toAvroSchema(Schema beamSchema) { - return toAvroSchema(beamSchema, null, null); - } - - /** - * Strict conversion from AVRO to Beam, strict because it doesn't do widening or narrowing during - * conversion. If Schema is not provided, one is inferred from the AVRO schema. - */ - public static Row toBeamRowStrict(GenericRecord record, @Nullable Schema schema) { - if (schema == null) { - schema = toBeamSchema(record.getSchema()); - } - - Row.Builder builder = Row.withSchema(schema); - org.apache.avro.Schema avroSchema = record.getSchema(); - - for (Schema.Field field : schema.getFields()) { - Object value = record.get(field.getName()); - org.apache.avro.Schema fieldAvroSchema = avroSchema.getField(field.getName()).schema(); - builder.addValue(convertAvroFieldStrict(value, fieldAvroSchema, field.getType())); - } - - return builder.build(); - } - - /** - * Convert from a Beam Row to an AVRO GenericRecord. The Avro Schema is inferred from the Beam - * schema on the row. - */ - public static GenericRecord toGenericRecord(Row row) { - return toGenericRecord(row, null); - } - - /** - * Convert from a Beam Row to an AVRO GenericRecord. If a Schema is not provided, one is inferred - * from the Beam schema on the row. - */ - public static GenericRecord toGenericRecord( - Row row, org.apache.avro.@Nullable Schema avroSchema) { - Schema beamSchema = row.getSchema(); - // Use the provided AVRO schema if present, otherwise infer an AVRO schema from the row - // schema. - if (avroSchema != null && avroSchema.getFields().size() != beamSchema.getFieldCount()) { - throw new IllegalArgumentException( - "AVRO schema doesn't match row schema. Row schema " - + beamSchema - + ". 
AVRO schema + " - + avroSchema); - } - if (avroSchema == null) { - avroSchema = toAvroSchema(beamSchema); - } - - GenericRecordBuilder builder = new GenericRecordBuilder(avroSchema); - for (int i = 0; i < beamSchema.getFieldCount(); ++i) { - Schema.Field field = beamSchema.getField(i); - builder.set( - field.getName(), - genericFromBeamField( - field.getType(), avroSchema.getField(field.getName()).schema(), row.getValue(i))); - } - return builder.build(); - } - - @SuppressWarnings("unchecked") - public static SerializableFunction getToRowFunction( - Class clazz, org.apache.avro.@Nullable Schema schema) { - if (GenericRecord.class.equals(clazz)) { - Schema beamSchema = toBeamSchema(schema); - return (SerializableFunction) getGenericRecordToRowFunction(beamSchema); - } else { - return new AvroRecordSchema().toRowFunction(TypeDescriptor.of(clazz)); - } - } - - @SuppressWarnings("unchecked") - public static SerializableFunction getFromRowFunction(Class clazz) { - return GenericRecord.class.equals(clazz) - ? (SerializableFunction) getRowToGenericRecordFunction(null) - : new AvroRecordSchema().fromRowFunction(TypeDescriptor.of(clazz)); - } - - public static @Nullable Schema getSchema( - Class clazz, org.apache.avro.@Nullable Schema schema) { - if (schema != null) { - return schema.getType().equals(Type.RECORD) ? toBeamSchema(schema) : null; - } - if (GenericRecord.class.equals(clazz)) { - throw new IllegalArgumentException("No schema provided for getSchema(GenericRecord)"); - } - return new AvroRecordSchema().schemaFor(TypeDescriptor.of(clazz)); - } - - /** Returns a function mapping encoded AVRO {@link GenericRecord}s to Beam {@link Row}s. */ - public static SimpleFunction getAvroBytesToRowFunction(Schema beamSchema) { - return new AvroBytesToRowFn(beamSchema); - } - - private static class AvroBytesToRowFn extends SimpleFunction { - private final AvroCoder coder; - private final Schema beamSchema; - - AvroBytesToRowFn(Schema beamSchema) { - org.apache.avro.Schema avroSchema = toAvroSchema(beamSchema); - coder = AvroCoder.of(avroSchema); - this.beamSchema = beamSchema; - } - - @Override - public Row apply(byte[] bytes) { - try { - ByteArrayInputStream inputStream = new ByteArrayInputStream(bytes); - GenericRecord record = coder.decode(inputStream); - return AvroUtils.toBeamRowStrict(record, beamSchema); - } catch (Exception e) { - throw new AvroRuntimeException( - "Could not decode avro record from given bytes " - + new String(bytes, StandardCharsets.UTF_8), - e); - } - } - } - - /** Returns a function mapping Beam {@link Row}s to encoded AVRO {@link GenericRecord}s. 
*/ - public static SimpleFunction getRowToAvroBytesFunction(Schema beamSchema) { - return new RowToAvroBytesFn(beamSchema); - } - - private static class RowToAvroBytesFn extends SimpleFunction { - private final transient org.apache.avro.Schema avroSchema; - private final AvroCoder coder; - - RowToAvroBytesFn(Schema beamSchema) { - avroSchema = toAvroSchema(beamSchema); - coder = AvroCoder.of(avroSchema); - } - - @Override - public byte[] apply(Row row) { - try { - GenericRecord record = toGenericRecord(row, avroSchema); - ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); - coder.encode(record, outputStream); - return outputStream.toByteArray(); - } catch (Exception e) { - throw new AvroRuntimeException( - String.format("Could not encode avro from given row: %s", row), e); - } - } - } - - /** - * Returns a function mapping AVRO {@link GenericRecord}s to Beam {@link Row}s for use in {@link - * org.apache.beam.sdk.values.PCollection#setSchema}. - */ - public static SerializableFunction getGenericRecordToRowFunction( - @Nullable Schema schema) { - return new GenericRecordToRowFn(schema); - } - - private static class GenericRecordToRowFn implements SerializableFunction { - private final Schema schema; - - GenericRecordToRowFn(Schema schema) { - this.schema = schema; - } - - @Override - public Row apply(GenericRecord input) { - return toBeamRowStrict(input, schema); - } - - @Override - public boolean equals(@Nullable Object other) { - if (this == other) { - return true; - } - if (other == null || getClass() != other.getClass()) { - return false; - } - GenericRecordToRowFn that = (GenericRecordToRowFn) other; - return Objects.equals(this.schema, that.schema); - } - - @Override - public int hashCode() { - return Objects.hash(schema); - } - } - - /** - * Returns a function mapping Beam {@link Row}s to AVRO {@link GenericRecord}s for use in {@link - * org.apache.beam.sdk.values.PCollection#setSchema}. - */ - public static SerializableFunction getRowToGenericRecordFunction( - org.apache.avro.@Nullable Schema avroSchema) { - return new RowToGenericRecordFn(avroSchema); - } - - private static class RowToGenericRecordFn implements SerializableFunction { - private transient org.apache.avro.Schema avroSchema; - - RowToGenericRecordFn(org.apache.avro.@Nullable Schema avroSchema) { - this.avroSchema = avroSchema; - } - - @Override - public GenericRecord apply(Row input) { - return toGenericRecord(input, avroSchema); - } - - @Override - public boolean equals(@Nullable Object other) { - if (this == other) { - return true; - } - if (other == null || getClass() != other.getClass()) { - return false; - } - RowToGenericRecordFn that = (RowToGenericRecordFn) other; - return Objects.equals(this.avroSchema, that.avroSchema); - } - - @Override - public int hashCode() { - return Objects.hash(avroSchema); - } - - private void writeObject(ObjectOutputStream out) throws IOException { - final String avroSchemaAsString = (avroSchema == null) ? null : avroSchema.toString(); - out.writeObject(avroSchemaAsString); - } - - private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException { - final String avroSchemaAsString = (String) in.readObject(); - avroSchema = - (avroSchemaAsString == null) - ? null - : new org.apache.avro.Schema.Parser().parse(avroSchemaAsString); - } - } - - /** - * Returns an {@code SchemaCoder} instance for the provided element type. 
- * - * @param the element type - */ - public static SchemaCoder schemaCoder(TypeDescriptor type) { - @SuppressWarnings("unchecked") - Class clazz = (Class) type.getRawType(); - org.apache.avro.Schema avroSchema = new ReflectData(clazz.getClassLoader()).getSchema(clazz); - Schema beamSchema = toBeamSchema(avroSchema); - return SchemaCoder.of( - beamSchema, type, getToRowFunction(clazz, avroSchema), getFromRowFunction(clazz)); - } - - /** - * Returns an {@code SchemaCoder} instance for the provided element class. - * - * @param the element type - */ - public static SchemaCoder schemaCoder(Class clazz) { - return schemaCoder(TypeDescriptor.of(clazz)); - } - - /** - * Returns an {@code SchemaCoder} instance for the Avro schema. The implicit type is - * GenericRecord. - */ - public static SchemaCoder schemaCoder(org.apache.avro.Schema schema) { - Schema beamSchema = toBeamSchema(schema); - return SchemaCoder.of( - beamSchema, - TypeDescriptor.of(GenericRecord.class), - getGenericRecordToRowFunction(beamSchema), - getRowToGenericRecordFunction(schema)); - } - - /** - * Returns an {@code SchemaCoder} instance for the provided element type using the provided Avro - * schema. - * - *

If the type argument is GenericRecord, the schema may be arbitrary. Otherwise, the schema - * must correspond to the type provided. - * - * @param the element type - */ - public static SchemaCoder schemaCoder(Class clazz, org.apache.avro.Schema schema) { - return SchemaCoder.of( - getSchema(clazz, schema), - TypeDescriptor.of(clazz), - getToRowFunction(clazz, schema), - getFromRowFunction(clazz)); - } - - /** - * Returns an {@code SchemaCoder} instance based on the provided AvroCoder for the element type. - * - * @param the element type - */ - public static SchemaCoder schemaCoder(AvroCoder avroCoder) { - return schemaCoder(avroCoder.getType(), avroCoder.getSchema()); - } - - private static final class AvroSpecificRecordFieldValueTypeSupplier - implements FieldValueTypeSupplier { - @Override - public List get(Class clazz) { - throw new RuntimeException("Unexpected call."); - } - - @Override - public List get(Class clazz, Schema schema) { - Map mapping = getMapping(schema); - List methods = ReflectUtils.getMethods(clazz); - List types = Lists.newArrayList(); - for (int i = 0; i < methods.size(); ++i) { - Method method = methods.get(i); - if (ReflectUtils.isGetter(method)) { - FieldValueTypeInformation fieldValueTypeInformation = - FieldValueTypeInformation.forGetter(method, i); - String name = mapping.get(fieldValueTypeInformation.getName()); - if (name != null) { - types.add(fieldValueTypeInformation.withName(name)); - } - } - } - - // Return the list ordered by the schema fields. - return StaticSchemaInference.sortBySchema(types, schema); - } - - private Map getMapping(Schema schema) { - Map mapping = Maps.newHashMap(); - for (Field field : schema.getFields()) { - String fieldName = field.getName(); - String getter; - if (fieldName.contains("_")) { - if (Character.isLowerCase(fieldName.charAt(0))) { - // field_name -> fieldName - getter = CaseFormat.LOWER_UNDERSCORE.to(CaseFormat.LOWER_CAMEL, fieldName); - } else { - // FIELD_NAME -> fIELDNAME - // must remove underscore and then convert to match compiled Avro schema getter name - getter = CaseFormat.UPPER_CAMEL.to(CaseFormat.LOWER_CAMEL, fieldName.replace("_", "")); - } - } else if (Character.isUpperCase(fieldName.charAt(0))) { - // FieldName -> fieldName - getter = CaseFormat.UPPER_CAMEL.to(CaseFormat.LOWER_CAMEL, fieldName); - } else { - // If the field is in camel case already, then it's the identity mapping. - getter = fieldName; - } - mapping.put(getter, fieldName); - // The Avro compiler might add a $ at the end of a getter to disambiguate. - mapping.put(getter + "$", fieldName); - } - return mapping; - } - } - - private static final class AvroPojoFieldValueTypeSupplier implements FieldValueTypeSupplier { - @Override - public List get(Class clazz) { - List classFields = ReflectUtils.getFields(clazz); - Map types = Maps.newHashMap(); - for (int i = 0; i < classFields.size(); ++i) { - java.lang.reflect.Field f = classFields.get(i); - if (!f.isAnnotationPresent(AvroIgnore.class)) { - FieldValueTypeInformation typeInformation = FieldValueTypeInformation.forField(f, i); - AvroName avroname = f.getAnnotation(AvroName.class); - if (avroname != null) { - typeInformation = typeInformation.withName(avroname.value()); - } - types.put(typeInformation.getName(), typeInformation); - } - } - return Lists.newArrayList(types.values()); - } - } - - /** Get field types for an AVRO-generated SpecificRecord or a POJO. 
*/ - public static List getFieldTypes(Class clazz, Schema schema) { - if (TypeDescriptor.of(clazz).isSubtypeOf(TypeDescriptor.of(SpecificRecord.class))) { - return JavaBeanUtils.getFieldTypes( - clazz, schema, new AvroSpecificRecordFieldValueTypeSupplier()); - } else { - return POJOUtils.getFieldTypes(clazz, schema, new AvroPojoFieldValueTypeSupplier()); - } - } - - /** Get generated getters for an AVRO-generated SpecificRecord or a POJO. */ - public static List getGetters(Class clazz, Schema schema) { - if (TypeDescriptor.of(clazz).isSubtypeOf(TypeDescriptor.of(SpecificRecord.class))) { - return JavaBeanUtils.getGetters( - clazz, - schema, - new AvroSpecificRecordFieldValueTypeSupplier(), - new AvroTypeConversionFactory()); - } else { - return POJOUtils.getGetters( - clazz, schema, new AvroPojoFieldValueTypeSupplier(), new AvroTypeConversionFactory()); - } - } - - /** Get an object creator for an AVRO-generated SpecificRecord. */ - public static SchemaUserTypeCreator getCreator(Class clazz, Schema schema) { - if (TypeDescriptor.of(clazz).isSubtypeOf(TypeDescriptor.of(SpecificRecord.class))) { - return AvroByteBuddyUtils.getCreator((Class) clazz, schema); - } else { - return POJOUtils.getSetFieldCreator( - clazz, schema, new AvroPojoFieldValueTypeSupplier(), new AvroTypeConversionFactory()); - } - } - - /** Converts AVRO schema to Beam field. */ - private static Schema.FieldType toFieldType(TypeWithNullability type) { - Schema.FieldType fieldType = null; - org.apache.avro.Schema avroSchema = type.type; - - LogicalType logicalType = LogicalTypes.fromSchema(avroSchema); - if (logicalType != null) { - if (logicalType instanceof LogicalTypes.Decimal) { - fieldType = FieldType.DECIMAL; - } else if (logicalType instanceof LogicalTypes.TimestampMillis) { - // TODO: There is a desire to move Beam schema DATETIME to a micros representation. When - // this is done, this logical type needs to be changed. 
- fieldType = FieldType.DATETIME; - } else if (logicalType instanceof LogicalTypes.Date) { - fieldType = FieldType.DATETIME; - } - } - - if (fieldType == null) { - switch (type.type.getType()) { - case RECORD: - fieldType = Schema.FieldType.row(toBeamSchema(avroSchema)); - break; - - case ENUM: - fieldType = FieldType.logicalType(EnumerationType.create(type.type.getEnumSymbols())); - break; - - case ARRAY: - Schema.FieldType elementType = - toFieldType(new TypeWithNullability(avroSchema.getElementType())); - fieldType = Schema.FieldType.array(elementType); - break; - - case MAP: - fieldType = - Schema.FieldType.map( - Schema.FieldType.STRING, - toFieldType(new TypeWithNullability(avroSchema.getValueType()))); - break; - - case FIXED: - fieldType = FixedBytesField.fromAvroType(type.type).toBeamType(); - break; - - case STRING: - fieldType = Schema.FieldType.STRING; - break; - - case BYTES: - fieldType = Schema.FieldType.BYTES; - break; - - case INT: - fieldType = Schema.FieldType.INT32; - break; - - case LONG: - fieldType = Schema.FieldType.INT64; - break; - - case FLOAT: - fieldType = Schema.FieldType.FLOAT; - break; - - case DOUBLE: - fieldType = Schema.FieldType.DOUBLE; - break; - - case BOOLEAN: - fieldType = Schema.FieldType.BOOLEAN; - break; - - case UNION: - fieldType = - FieldType.logicalType( - OneOfType.create( - avroSchema.getTypes().stream() - .map(x -> Field.of(x.getName(), toFieldType(new TypeWithNullability(x)))) - .collect(Collectors.toList()))); - break; - case NULL: - throw new IllegalArgumentException("Can't convert 'null' to FieldType"); - - default: - throw new AssertionError("Unexpected AVRO Schema.Type: " + avroSchema.getType()); - } - } - fieldType = fieldType.withNullable(type.nullable); - return fieldType; - } - - private static org.apache.avro.Schema getFieldSchema( - Schema.FieldType fieldType, String fieldName, String namespace) { - org.apache.avro.Schema baseType; - switch (fieldType.getTypeName()) { - case BYTE: - case INT16: - case INT32: - baseType = org.apache.avro.Schema.create(Type.INT); - break; - - case INT64: - baseType = org.apache.avro.Schema.create(Type.LONG); - break; - - case DECIMAL: - baseType = - LogicalTypes.decimal(Integer.MAX_VALUE) - .addToSchema(org.apache.avro.Schema.create(Type.BYTES)); - break; - - case FLOAT: - baseType = org.apache.avro.Schema.create(Type.FLOAT); - break; - - case DOUBLE: - baseType = org.apache.avro.Schema.create(Type.DOUBLE); - break; - - case STRING: - baseType = org.apache.avro.Schema.create(Type.STRING); - break; - - case DATETIME: - // TODO: There is a desire to move Beam schema DATETIME to a micros representation. When - // this is done, this logical type needs to be changed. - baseType = - LogicalTypes.timestampMillis().addToSchema(org.apache.avro.Schema.create(Type.LONG)); - break; - - case BOOLEAN: - baseType = org.apache.avro.Schema.create(Type.BOOLEAN); - break; - - case BYTES: - baseType = org.apache.avro.Schema.create(Type.BYTES); - break; - - case LOGICAL_TYPE: - String identifier = fieldType.getLogicalType().getIdentifier(); - if (FixedBytes.IDENTIFIER.equals(identifier)) { - FixedBytesField fixedBytesField = - checkNotNull(FixedBytesField.fromBeamFieldType(fieldType)); - baseType = fixedBytesField.toAvroType("fixed", namespace + "." 
+ fieldName); - } else if (VariableBytes.IDENTIFIER.equals(identifier)) { - // treat VARBINARY as bytes as that is what avro supports - baseType = org.apache.avro.Schema.create(Type.BYTES); - } else if (FixedString.IDENTIFIER.equals(identifier) - || "CHAR".equals(identifier) - || "NCHAR".equals(identifier)) { - baseType = - buildHiveLogicalTypeSchema("char", (int) fieldType.getLogicalType().getArgument()); - } else if (VariableString.IDENTIFIER.equals(identifier) - || "NVARCHAR".equals(identifier) - || "VARCHAR".equals(identifier) - || "LONGNVARCHAR".equals(identifier) - || "LONGVARCHAR".equals(identifier)) { - baseType = - buildHiveLogicalTypeSchema("varchar", (int) fieldType.getLogicalType().getArgument()); - } else if (EnumerationType.IDENTIFIER.equals(identifier)) { - EnumerationType enumerationType = fieldType.getLogicalType(EnumerationType.class); - baseType = - org.apache.avro.Schema.createEnum(fieldName, "", "", enumerationType.getValues()); - } else if (OneOfType.IDENTIFIER.equals(identifier)) { - OneOfType oneOfType = fieldType.getLogicalType(OneOfType.class); - baseType = - org.apache.avro.Schema.createUnion( - oneOfType.getOneOfSchema().getFields().stream() - .map(x -> getFieldSchema(x.getType(), x.getName(), namespace)) - .collect(Collectors.toList())); - } else if ("DATE".equals(identifier) || SqlTypes.DATE.getIdentifier().equals(identifier)) { - baseType = LogicalTypes.date().addToSchema(org.apache.avro.Schema.create(Type.INT)); - } else if ("TIME".equals(identifier)) { - baseType = LogicalTypes.timeMillis().addToSchema(org.apache.avro.Schema.create(Type.INT)); - } else { - throw new RuntimeException( - "Unhandled logical type " + fieldType.getLogicalType().getIdentifier()); - } - break; - - case ARRAY: - case ITERABLE: - baseType = - org.apache.avro.Schema.createArray( - getFieldSchema(fieldType.getCollectionElementType(), fieldName, namespace)); - break; - - case MAP: - if (fieldType.getMapKeyType().getTypeName().isStringType()) { - // Avro only supports string keys in maps. - baseType = - org.apache.avro.Schema.createMap( - getFieldSchema(fieldType.getMapValueType(), fieldName, namespace)); - } else { - throw new IllegalArgumentException("Avro only supports maps with string keys"); - } - break; - - case ROW: - baseType = toAvroSchema(fieldType.getRowSchema(), fieldName, namespace); - break; - - default: - throw new IllegalArgumentException("Unexpected type " + fieldType); - } - return fieldType.getNullable() ? 
ReflectData.makeNullable(baseType) : baseType; - } - - private static @Nullable Object genericFromBeamField( - Schema.FieldType fieldType, org.apache.avro.Schema avroSchema, @Nullable Object value) { - TypeWithNullability typeWithNullability = new TypeWithNullability(avroSchema); - if (!fieldType.getNullable().equals(typeWithNullability.nullable)) { - throw new IllegalArgumentException( - "FieldType " - + fieldType - + " and AVRO schema " - + avroSchema - + " don't have matching nullability"); - } - - if (value == null) { - return value; - } - - switch (fieldType.getTypeName()) { - case BYTE: - case INT16: - case INT32: - case INT64: - case FLOAT: - case DOUBLE: - case BOOLEAN: - return value; - - case STRING: - return new Utf8((String) value); - - case DECIMAL: - BigDecimal decimal = (BigDecimal) value; - LogicalType logicalType = typeWithNullability.type.getLogicalType(); - return new Conversions.DecimalConversion().toBytes(decimal, null, logicalType); - - case DATETIME: - if (typeWithNullability.type.getType() == Type.INT) { - ReadableInstant instant = (ReadableInstant) value; - return (int) Days.daysBetween(Instant.EPOCH, instant).getDays(); - } else if (typeWithNullability.type.getType() == Type.LONG) { - ReadableInstant instant = (ReadableInstant) value; - return (long) instant.getMillis(); - } else { - throw new IllegalArgumentException( - "Can't represent " + fieldType + " as " + typeWithNullability.type.getType()); - } - - case BYTES: - return ByteBuffer.wrap((byte[]) value); - - case LOGICAL_TYPE: - String identifier = fieldType.getLogicalType().getIdentifier(); - if (FixedBytes.IDENTIFIER.equals(identifier)) { - FixedBytesField fixedBytesField = - checkNotNull(FixedBytesField.fromBeamFieldType(fieldType)); - byte[] byteArray = (byte[]) value; - if (byteArray.length != fixedBytesField.getSize()) { - throw new IllegalArgumentException("Incorrectly sized byte array."); - } - return GenericData.get().createFixed(null, (byte[]) value, typeWithNullability.type); - } else if (VariableBytes.IDENTIFIER.equals(identifier)) { - return GenericData.get().createFixed(null, (byte[]) value, typeWithNullability.type); - } else if (FixedString.IDENTIFIER.equals(identifier) - || "CHAR".equals(identifier) - || "NCHAR".equals(identifier)) { - return new Utf8((String) value); - } else if (VariableString.IDENTIFIER.equals(identifier) - || "NVARCHAR".equals(identifier) - || "VARCHAR".equals(identifier) - || "LONGNVARCHAR".equals(identifier) - || "LONGVARCHAR".equals(identifier)) { - return new Utf8((String) value); - } else if (EnumerationType.IDENTIFIER.equals(identifier)) { - EnumerationType enumerationType = fieldType.getLogicalType(EnumerationType.class); - return GenericData.get() - .createEnum( - enumerationType.toString((EnumerationType.Value) value), - typeWithNullability.type); - } else if (OneOfType.IDENTIFIER.equals(identifier)) { - OneOfType oneOfType = fieldType.getLogicalType(OneOfType.class); - OneOfType.Value oneOfValue = (OneOfType.Value) value; - FieldType innerFieldType = oneOfType.getFieldType(oneOfValue); - if (typeWithNullability.nullable && oneOfValue.getValue() == null) { - return null; - } else { - return genericFromBeamField( - innerFieldType.withNullable(false), - typeWithNullability.type.getTypes().get(oneOfValue.getCaseType().getValue()), - oneOfValue.getValue()); - } - } else if ("DATE".equals(identifier)) { - // "Date" is backed by joda.time.Instant - return Days.daysBetween(Instant.EPOCH, (Instant) value).getDays(); - } else if 
(SqlTypes.DATE.getIdentifier().equals(identifier)) { - // portable SqlTypes.DATE is backed by java.time.LocalDate - return ((java.time.LocalDate) value).toEpochDay(); - } else if ("TIME".equals(identifier)) { - return (int) ((Instant) value).getMillis(); - } else { - throw new RuntimeException("Unhandled logical type " + identifier); - } - - case ARRAY: - case ITERABLE: - Iterable iterable = (Iterable) value; - List translatedArray = Lists.newArrayListWithExpectedSize(Iterables.size(iterable)); - - for (Object arrayElement : iterable) { - translatedArray.add( - genericFromBeamField( - fieldType.getCollectionElementType(), - typeWithNullability.type.getElementType(), - arrayElement)); - } - return translatedArray; - - case MAP: - Map map = Maps.newHashMap(); - Map valueMap = (Map) value; - for (Map.Entry entry : valueMap.entrySet()) { - Utf8 key = new Utf8((String) entry.getKey()); - map.put( - key, - genericFromBeamField( - fieldType.getMapValueType(), - typeWithNullability.type.getValueType(), - entry.getValue())); - } - return map; - - case ROW: - return toGenericRecord((Row) value, typeWithNullability.type); - - default: - throw new IllegalArgumentException("Unsupported type " + fieldType); - } - } - - /** - * Strict conversion from AVRO to Beam, strict because it doesn't do widening or narrowing during - * conversion. - * - * @param value {@link GenericRecord} or any nested value - * @param avroSchema schema for value - * @param fieldType target beam field type - * @return value converted for {@link Row} - */ - @SuppressWarnings("unchecked") - public static @Nullable Object convertAvroFieldStrict( - @Nullable Object value, - @Nonnull org.apache.avro.Schema avroSchema, - @Nonnull Schema.FieldType fieldType) { - if (value == null) { - return null; - } - - TypeWithNullability type = new TypeWithNullability(avroSchema); - LogicalType logicalType = LogicalTypes.fromSchema(type.type); - if (logicalType != null) { - if (logicalType instanceof LogicalTypes.Decimal) { - ByteBuffer byteBuffer = (ByteBuffer) value; - BigDecimal bigDecimal = - new Conversions.DecimalConversion() - .fromBytes(byteBuffer.duplicate(), type.type, logicalType); - return convertDecimal(bigDecimal, fieldType); - } else if (logicalType instanceof LogicalTypes.TimestampMillis) { - if (value instanceof ReadableInstant) { - return convertDateTimeStrict(((ReadableInstant) value).getMillis(), fieldType); - } else { - return convertDateTimeStrict((Long) value, fieldType); - } - } else if (logicalType instanceof LogicalTypes.Date) { - if (value instanceof ReadableInstant) { - int epochDays = Days.daysBetween(Instant.EPOCH, (ReadableInstant) value).getDays(); - return convertDateStrict(epochDays, fieldType); - } else if (value instanceof java.time.LocalDate) { - return convertDateStrict((int) ((java.time.LocalDate) value).toEpochDay(), fieldType); - } else { - return convertDateStrict((Integer) value, fieldType); - } - } - } - - switch (type.type.getType()) { - case FIXED: - return convertFixedStrict((GenericFixed) value, fieldType); - - case BYTES: - return convertBytesStrict((ByteBuffer) value, fieldType); - - case STRING: - return convertStringStrict((CharSequence) value, fieldType); - - case INT: - return convertIntStrict((Integer) value, fieldType); - - case LONG: - return convertLongStrict((Long) value, fieldType); - - case FLOAT: - return convertFloatStrict((Float) value, fieldType); - - case DOUBLE: - return convertDoubleStrict((Double) value, fieldType); - - case BOOLEAN: - return convertBooleanStrict((Boolean) value, 
fieldType); - - case RECORD: - return convertRecordStrict((GenericRecord) value, fieldType); - - case ENUM: - // enums are either Java enums, or GenericEnumSymbol, - // they don't share common interface, but override toString() - return convertEnumStrict(value, fieldType); - - case ARRAY: - return convertArrayStrict((List) value, type.type.getElementType(), fieldType); - - case MAP: - return convertMapStrict( - (Map) value, type.type.getValueType(), fieldType); - - case UNION: - return convertUnionStrict(value, type.type, fieldType); - - case NULL: - throw new IllegalArgumentException("Can't convert 'null' to non-nullable field"); - - default: - throw new AssertionError("Unexpected AVRO Schema.Type: " + type.type.getType()); - } - } - - private static Object convertRecordStrict(GenericRecord record, Schema.FieldType fieldType) { - checkTypeName(fieldType.getTypeName(), Schema.TypeName.ROW, "record"); - return toBeamRowStrict(record, fieldType.getRowSchema()); - } - - private static Object convertBytesStrict(ByteBuffer bb, Schema.FieldType fieldType) { - checkTypeName(fieldType.getTypeName(), Schema.TypeName.BYTES, "bytes"); - - byte[] bytes = new byte[bb.remaining()]; - bb.duplicate().get(bytes); - return bytes; - } - - private static Object convertFixedStrict(GenericFixed fixed, Schema.FieldType fieldType) { - checkTypeName(fieldType.getTypeName(), TypeName.LOGICAL_TYPE, "fixed"); - checkArgument(FixedBytes.IDENTIFIER.equals(fieldType.getLogicalType().getIdentifier())); - return fixed.bytes().clone(); // clone because GenericFixed is mutable - } - - private static Object convertStringStrict(CharSequence value, Schema.FieldType fieldType) { - checkTypeName(fieldType.getTypeName(), Schema.TypeName.STRING, "string"); - return value.toString(); - } - - private static Object convertIntStrict(Integer value, Schema.FieldType fieldType) { - checkTypeName(fieldType.getTypeName(), Schema.TypeName.INT32, "int"); - return value; - } - - private static Object convertLongStrict(Long value, Schema.FieldType fieldType) { - checkTypeName(fieldType.getTypeName(), Schema.TypeName.INT64, "long"); - return value; - } - - private static Object convertDecimal(BigDecimal value, Schema.FieldType fieldType) { - checkTypeName(fieldType.getTypeName(), TypeName.DECIMAL, "decimal"); - return value; - } - - private static Object convertDateStrict(Integer epochDays, Schema.FieldType fieldType) { - checkTypeName(fieldType.getTypeName(), TypeName.DATETIME, "date"); - return Instant.EPOCH.plus(Duration.standardDays(epochDays)); - } - - private static Object convertDateTimeStrict(Long value, Schema.FieldType fieldType) { - checkTypeName(fieldType.getTypeName(), TypeName.DATETIME, "dateTime"); - return new Instant(value); - } - - private static Object convertFloatStrict(Float value, Schema.FieldType fieldType) { - checkTypeName(fieldType.getTypeName(), Schema.TypeName.FLOAT, "float"); - return value; - } - - private static Object convertDoubleStrict(Double value, Schema.FieldType fieldType) { - checkTypeName(fieldType.getTypeName(), Schema.TypeName.DOUBLE, "double"); - return value; - } - - private static Object convertBooleanStrict(Boolean value, Schema.FieldType fieldType) { - checkTypeName(fieldType.getTypeName(), Schema.TypeName.BOOLEAN, "boolean"); - return value; - } - - private static Object convertEnumStrict(Object value, Schema.FieldType fieldType) { - checkTypeName(fieldType.getTypeName(), TypeName.LOGICAL_TYPE, "enum"); - checkArgument(fieldType.getLogicalType().getIdentifier().equals(EnumerationType.IDENTIFIER)); 
- EnumerationType enumerationType = fieldType.getLogicalType(EnumerationType.class); - return enumerationType.valueOf(value.toString()); - } - - private static Object convertUnionStrict( - Object value, org.apache.avro.Schema unionAvroSchema, Schema.FieldType fieldType) { - checkTypeName(fieldType.getTypeName(), TypeName.LOGICAL_TYPE, "oneOfType"); - checkArgument(fieldType.getLogicalType().getIdentifier().equals(OneOfType.IDENTIFIER)); - OneOfType oneOfType = fieldType.getLogicalType(OneOfType.class); - int fieldNumber = GenericData.get().resolveUnion(unionAvroSchema, value); - FieldType baseFieldType = oneOfType.getOneOfSchema().getField(fieldNumber).getType(); - Object convertedValue = - convertAvroFieldStrict(value, unionAvroSchema.getTypes().get(fieldNumber), baseFieldType); - return oneOfType.createValue(fieldNumber, convertedValue); - } - - private static Object convertArrayStrict( - List values, org.apache.avro.Schema elemAvroSchema, Schema.FieldType fieldType) { - checkTypeName(fieldType.getTypeName(), Schema.TypeName.ARRAY, "array"); - - List ret = new ArrayList<>(values.size()); - Schema.FieldType elemFieldType = fieldType.getCollectionElementType(); - - for (Object value : values) { - ret.add(convertAvroFieldStrict(value, elemAvroSchema, elemFieldType)); - } - - return ret; - } - - private static Object convertMapStrict( - Map values, - org.apache.avro.Schema valueAvroSchema, - Schema.FieldType fieldType) { - checkTypeName(fieldType.getTypeName(), Schema.TypeName.MAP, "map"); - checkNotNull(fieldType.getMapKeyType()); - checkNotNull(fieldType.getMapValueType()); - - if (!fieldType.getMapKeyType().equals(Schema.FieldType.STRING)) { - throw new IllegalArgumentException( - "Can't convert 'string' map keys to " + fieldType.getMapKeyType()); - } - - Map ret = new HashMap<>(); - - for (Map.Entry value : values.entrySet()) { - ret.put( - convertStringStrict(value.getKey(), fieldType.getMapKeyType()), - convertAvroFieldStrict(value.getValue(), valueAvroSchema, fieldType.getMapValueType())); - } - - return ret; - } - - private static void checkTypeName(Schema.TypeName got, Schema.TypeName expected, String label) { - checkArgument( - got.equals(expected), "Can't convert '%s' to %s, expected: %s", label, got, expected); - } - - /** - * Helper factory to build Avro Logical types schemas for SQL *CHAR types. This method represents - * the logical as Hive does. - */ - private static org.apache.avro.Schema buildHiveLogicalTypeSchema( - String hiveLogicalType, int size) { - String schemaJson = - String.format( - "{\"type\": \"string\", \"logicalType\": \"%s\", \"maxLength\": %s}", - hiveLogicalType, size); - return new org.apache.avro.Schema.Parser().parse(schemaJson); - } -} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/JsonUtils.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/JsonUtils.java index a724664ceaf18..18f5813c6cc93 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/JsonUtils.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/JsonUtils.java @@ -58,7 +58,7 @@ * } * *

Note: This functionality has been tested with {@code everit-json-schema} version - * 1.14.1. + * 1.14.2. * *

JSON-Schema supported features

* diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/resourcehints/ResourceHints.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/resourcehints/ResourceHints.java index afd6a6ccb151f..85cb2df9deab1 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/resourcehints/ResourceHints.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/resourcehints/ResourceHints.java @@ -49,6 +49,8 @@ public class ResourceHints { private static final String MIN_RAM_URN = "beam:resources:min_ram_bytes:v1"; private static final String ACCELERATOR_URN = "beam:resources:accelerator:v1"; + private static final String CPU_COUNT_URN = "beam:resources:cpu_count:v1"; + // TODO: reference this from a common location in all packages that use this. private static String getUrn(ProtocolMessageEnum value) { return value.getValueDescriptor().getOptions().getExtension(RunnerApi.beamUrn); @@ -57,6 +59,7 @@ private static String getUrn(ProtocolMessageEnum value) { static { checkState(MIN_RAM_URN.equals(getUrn(StandardResourceHints.Enum.MIN_RAM_BYTES))); checkState(ACCELERATOR_URN.equals(getUrn(StandardResourceHints.Enum.ACCELERATOR))); + checkState(CPU_COUNT_URN.equals(getUrn(StandardResourceHints.Enum.CPU_COUNT))); } private static ImmutableMap hintNameToUrn = @@ -64,12 +67,15 @@ private static String getUrn(ProtocolMessageEnum value) { .put("minRam", MIN_RAM_URN) .put("min_ram", MIN_RAM_URN) // Courtesy alias. .put("accelerator", ACCELERATOR_URN) + .put("cpuCount", CPU_COUNT_URN) + .put("cpu_count", CPU_COUNT_URN) // Courtesy alias. .build(); private static ImmutableMap> parsers = ImmutableMap.>builder() .put(MIN_RAM_URN, s -> new BytesHint(BytesHint.parse(s))) .put(ACCELERATOR_URN, s -> new StringHint(s)) + .put(CPU_COUNT_URN, s -> new IntHint(IntHint.parse(s))) .build(); private static final ResourceHints EMPTY = new ResourceHints(ImmutableMap.of()); @@ -212,6 +218,46 @@ public int hashCode() { } } + /*package*/ static class IntHint extends ResourceHint { + private final int value; + + @Override + public boolean equals(@Nullable Object other) { + if (other == null) { + return false; + } else if (this == other) { + return true; + } else if (other instanceof IntHint) { + return ((IntHint) other).value == value; + } else { + return false; + } + } + + @Override + public int hashCode() { + return Integer.hashCode(value); + } + + public IntHint(int value) { + this.value = value; + } + + public static int parse(String s) { + return Integer.parseInt(s, 10); + } + + @Override + public ResourceHint mergeWithOuter(ResourceHint outer) { + return new IntHint(Math.max(value, ((IntHint) outer).value)); + } + + @Override + public byte[] toBytes() { + return String.valueOf(value).getBytes(Charsets.US_ASCII); + } + } + /** * Sets desired minimal available RAM size to have in transform's execution environment. * @@ -264,6 +310,23 @@ public ResourceHints withHint(String urn, ResourceHint hint) { return new ResourceHints(newHints.build()); } + /** + * Sets desired minimal CPU or vCPU count to have in transform's execution environment. + * + * @param cpuCount specifies a positive CPU count. + */ + public ResourceHints withCPUCount(int cpuCount) { + if (cpuCount <= 0) { + LOG.error( + "Encountered invalid non-positive cpu count hint value {}.\n" + + "The value is ignored. 
In the future, the method will require a Long object " + + "and throw an IllegalArgumentException for invalid values.", + cpuCount); + return this; + } + return withHint(CPU_COUNT_URN, new IntHint(cpuCount)); + } + public Map hints() { return hints; } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/HistogramData.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/HistogramData.java index b28e1cfd5af2e..dd2193d9d3354 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/HistogramData.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/HistogramData.java @@ -23,6 +23,7 @@ import java.util.Arrays; import java.util.Objects; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.math.DoubleMath; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.math.IntMath; import org.checkerframework.checker.nullness.qual.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -77,6 +78,55 @@ public static HistogramData linear(double start, double width, int numBuckets) { return new HistogramData(LinearBuckets.of(start, width, numBuckets)); } + /** + * Returns a histogram object with exponential boundaries. The input parameter {@code scale} + * determines a coefficient 'base' which specifies bucket boundaries. + * + *<pre>
+   * base = 2**(2**(-scale)) e.g.
+   * scale=1 => base=2**(1/2)=sqrt(2)
+   * scale=0 => base=2**(1)=2
+   * scale=-1 => base=2**(2)=4
+   * 
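A worked check of the formula (a sketch; base is a hypothetical helper mirroring the Math.pow(2, Math.pow(2, -scale)) computation in ExponentialBuckets.of further down):

static double base(int scale) {
  // base = 2**(2**(-scale))
  return Math.pow(2, Math.pow(2, -scale));
}
// base(1)  = 2^(1/2) ≈ 1.4142
// base(0)  = 2^1     = 2.0
// base(-1) = 2^2     = 4.0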
+ * + * This bucketing strategy makes it simple/numerically stable to compute bucket indexes for + * datapoints. + * + *
+   * Bucket boundaries are given by the following table where n=numBuckets.
+   * | Bucket Index  | Bucket Boundaries   |
+   * |---------------|---------------------|
+   * | Underflow     | (-inf, 0)           |
+   * | 0             | [0, base)           |
+   * | 1             | [base, base^2)      |
+   * | 2             | [base^2, base^3)    |
+   * | i             | [base^i, base^(i+1))|
+   * | n-1           | [base^(n-1), base^n)|
+   * | Overflow      | [base^n, inf)       |
+   * 
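For example, with scale=0 (base=2) a recorded value of 10 falls in bucket floor(log_2(10)) = 3, i.e. [2^3, 2^4) = [8, 16) in the table above. A plain-Java sketch of that index computation (the implementation below uses IntMath.log2 with RoundingMode.FLOOR for this case):

int index = (int) Math.floor(Math.log(10.0) / Math.log(2.0)); // 3 -> bucket [8, 16)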
+ * + *
+   * Example scale/boundaries:
+   * When scale=1, buckets 0,1,2...i have lower bounds 0, 2^(1/2), 2^(2/2), ... 2^(i/2).
+   * When scale=0, buckets 0,1,2...i have lower bounds 0, 2, 2^2, ... 2^(i).
+   * When scale=-1, buckets 0,1,2...i have lower bounds 0, 4, 4^2, ... 4^(i).
+   * 
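A usage sketch of the factory method added by this hunk:

// scale=0 gives base=2: buckets [0,2), [2,4), [4,8), ... (at most 32 of them).
HistogramData histogram = HistogramData.exponential(0, 32);
histogram.record(1.0, 5.0, 100.0); // varargs record(double...) defined earlier in this class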
+ * + * Scale parameter is similar to + * OpenTelemetry's notion of ExponentialHistogram. Bucket boundaries are modified to make them + * compatible with GCP's exponential histogram. + * + * @param numBuckets The number of buckets. Clipped so that the largest bucket's lower bound is + * not greater than 2^32-1 (uint32 max). + * @param scale Integer between [-3, 3] which determines bucket boundaries. Larger values imply + * more fine grained buckets. + * @return a new Histogram instance. + */ + public static HistogramData exponential(int scale, int numBuckets) { + return new HistogramData(ExponentialBuckets.of(scale, numBuckets)); + } + public void record(double... values) { for (double value : values) { record(value); @@ -227,6 +277,150 @@ public interface BucketType extends Serializable { double getAccumulatedBucketSize(int endIndex); } + @AutoValue + public abstract static class ExponentialBuckets implements BucketType { + + // Minimum scale factor. Bucket boundaries can grow at a rate of at most: 2^(2^3)=2^8=256 + private static final int MINIMUM_SCALE = -3; + + // Minimum scale factor. Bucket boundaries must grow at a rate of at least 2^(2^-3)=2^(1/8) + private static final int MAXIMUM_SCALE = 3; + + // Maximum number of buckets that is supported when 'scale' is zero. + private static final int ZERO_SCALE_MAX_NUM_BUCKETS = 32; + + public abstract double getBase(); + + public abstract int getScale(); + + /** + * Set to 2**scale which is equivalent to 1/log_2(base). Precomputed to use in {@code + * getBucketIndexPositiveScale} + */ + public abstract double getInvLog2GrowthFactor(); + + @Override + public abstract int getNumBuckets(); + + /* Precomputed since this value is used everytime a datapoint is recorded. */ + @Override + public abstract double getRangeTo(); + + public static ExponentialBuckets of(int scale, int numBuckets) { + if (scale < MINIMUM_SCALE) { + throw new IllegalArgumentException( + String.format("Scale should be greater than %d: %d", MINIMUM_SCALE, scale)); + } + + if (scale > MAXIMUM_SCALE) { + throw new IllegalArgumentException( + String.format("Scale should be less than %d: %d", MAXIMUM_SCALE, scale)); + } + if (numBuckets <= 0) { + throw new IllegalArgumentException( + String.format("numBuckets should be positive: %d", numBuckets)); + } + + double invLog2GrowthFactor = Math.pow(2, scale); + double base = Math.pow(2, Math.pow(2, -scale)); + int clippedNumBuckets = ExponentialBuckets.computeNumberOfBuckets(scale, numBuckets); + double rangeTo = Math.pow(base, clippedNumBuckets); + return new AutoValue_HistogramData_ExponentialBuckets( + base, scale, invLog2GrowthFactor, clippedNumBuckets, rangeTo); + } + + /** + * numBuckets is clipped so that the largest bucket's lower bound is not greater than 2^32-1 + * (uint32 max). This value is log_base(2^32) which simplifies as follows: + * + *
+     * log_base(2^32)
+     * = log_2(2^32)/log_2(base)
+     * = 32/(2**-scale)
+     * = 32*(2**scale)
+     * 
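Concretely, the clip 32*(2**scale) works out as follows (matching computeNumberOfBuckets below):

// scale =  2 -> at most 32 << 2 = 128 buckets
// scale =  0 -> at most 32 buckets (bucket 31 covers [2^31, 2^32))
// scale = -1 -> at most 32 >> 1 = 16 buckets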
+ */ + private static int computeNumberOfBuckets(int scale, int inputNumBuckets) { + if (scale == 0) { + // When base=2 then the bucket at index 31 contains [2^31, 2^32). + return Math.min(ZERO_SCALE_MAX_NUM_BUCKETS, inputNumBuckets); + } else if (scale > 0) { + // When scale is positive 32*(2**scale) is equivalent to a right bit-shift. + return Math.min(inputNumBuckets, ZERO_SCALE_MAX_NUM_BUCKETS << scale); + } else { + // When scale is negative 32*(2**scale) is equivalent to a left bit-shift. + return Math.min(inputNumBuckets, ZERO_SCALE_MAX_NUM_BUCKETS >> -scale); + } + } + + @Override + public int getBucketIndex(double value) { + if (value < getBase()) { + return 0; + } + + // When scale is non-positive, 'base' and 'bucket boundaries' will be integers. + // In this scenario `value` and `floor(value)` will belong to the same bucket. + int index; + if (getScale() > 0) { + index = getBucketIndexPositiveScale(value); + } else if (getScale() < 0) { + index = getBucketIndexNegativeScale(DoubleMath.roundToInt(value, RoundingMode.FLOOR)); + } else { + index = getBucketIndexZeroScale(DoubleMath.roundToInt(value, RoundingMode.FLOOR)); + } + // Ensure that a valid index is returned in the off chance of a numerical instability error. + return Math.max(Math.min(index, getNumBuckets() - 1), 0); + } + + private int getBucketIndexZeroScale(int value) { + return IntMath.log2(value, RoundingMode.FLOOR); + } + + private int getBucketIndexNegativeScale(int value) { + return getBucketIndexZeroScale(value) >> (-getScale()); + } + + // This method is valid for all 'scale' values but we fallback to more efficient methods for + // non-positive scales. + // For a value>base we would like to find an i s.t. : + // base^i <= value < base^(i+1) + // i <= log_base(value) < i+1 + // i = floor(log_base(value)) + // i = floor(log_2(value)/log_2(base)) + private int getBucketIndexPositiveScale(double value) { + return DoubleMath.roundToInt( + getInvLog2GrowthFactor() * DoubleMath.log2(value), RoundingMode.FLOOR); + } + + @Override + public double getBucketSize(int index) { + if (index < 0) { + return 0; + } + if (index == 0) { + return getBase(); + } + + // bucketSize = (base)^(i+1) - (base)^i + // = (base)^i(base - 1) + return Math.pow(getBase(), index) * (getBase() - 1); + } + + @Override + public double getAccumulatedBucketSize(int endIndex) { + if (endIndex < 0) { + return 0; + } + return Math.pow(getBase(), endIndex + 1); + } + + @Override + public double getRangeFrom() { + return 0; + } + } + @AutoValue public abstract static class LinearBuckets implements BucketType { public abstract double getStart(); diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/values/Row.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/values/Row.java index 9f5546597c90f..f5c6c7fcf34a4 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/values/Row.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/values/Row.java @@ -786,12 +786,12 @@ public FieldValueBuilder withFieldValues(Map values) { // withFieldValue or // withFieldValues. - public Builder addValue(@Nullable Object values) { - this.values.add(values); + public Builder addValue(@Nullable Object value) { + this.values.add(value); return this; } - public Builder addValues(List values) { + public Builder addValues(List<@Nullable Object> values) { this.values.addAll(values); return this; } @@ -822,7 +822,7 @@ public Builder addIterable(Iterable values) { // method is largely // used internal to Beam. 
@Internal - public Row attachValues(List attachedValues) { + public Row attachValues(List<@Nullable Object> attachedValues) { checkState(this.values.isEmpty()); return new RowWithStorage(schema, attachedValues); } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/AvroCoderTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/AvroCoderTest.java deleted file mode 100644 index 2508a4d363613..0000000000000 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/AvroCoderTest.java +++ /dev/null @@ -1,1106 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.sdk.coders; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.containsString; -import static org.hamcrest.Matchers.equalTo; -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; - -import com.esotericsoftware.kryo.Kryo; -import com.esotericsoftware.kryo.io.Input; -import com.esotericsoftware.kryo.io.Output; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashSet; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.SortedMap; -import java.util.SortedSet; -import java.util.TreeMap; -import java.util.TreeSet; -import org.apache.avro.AvroRuntimeException; -import org.apache.avro.Schema; -import org.apache.avro.SchemaBuilder; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.reflect.AvroName; -import org.apache.avro.reflect.AvroSchema; -import org.apache.avro.reflect.ReflectData; -import org.apache.avro.reflect.Stringable; -import org.apache.avro.reflect.Union; -import org.apache.avro.specific.SpecificData; -import org.apache.avro.specific.SpecificRecord; -import org.apache.avro.util.Utf8; -import org.apache.beam.sdk.coders.Coder.Context; -import org.apache.beam.sdk.coders.Coder.NonDeterministicException; -import org.apache.beam.sdk.schemas.TestAvro; -import org.apache.beam.sdk.schemas.TestAvroNested; -import org.apache.beam.sdk.schemas.TestEnum; -import org.apache.beam.sdk.schemas.fixed4; -import org.apache.beam.sdk.testing.CoderProperties; -import org.apache.beam.sdk.testing.InterceptingUrlClassLoader; -import org.apache.beam.sdk.testing.NeedsRunner; -import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.testing.TestPipeline; -import org.apache.beam.sdk.transforms.Create; -import 
org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.util.CoderUtils; -import org.apache.beam.sdk.util.InstanceBuilder; -import org.apache.beam.sdk.util.SerializableUtils; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.TypeDescriptor; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; -import org.checkerframework.checker.nullness.qual.Nullable; -import org.hamcrest.Description; -import org.hamcrest.Matcher; -import org.hamcrest.Matchers; -import org.hamcrest.TypeSafeMatcher; -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; -import org.joda.time.LocalDate; -import org.junit.Rule; -import org.junit.Test; -import org.junit.experimental.categories.Category; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; -import org.objenesis.strategy.StdInstantiatorStrategy; - -/** Tests for {@link AvroCoder}. */ -@RunWith(JUnit4.class) -public class AvroCoderTest { - - public static final DateTime DATETIME_A = - new DateTime().withDate(1994, 10, 31).withZone(DateTimeZone.UTC); - public static final DateTime DATETIME_B = - new DateTime().withDate(1997, 4, 25).withZone(DateTimeZone.UTC); - private static final TestAvroNested AVRO_NESTED_SPECIFIC_RECORD = new TestAvroNested(true, 42); - private static final TestAvro AVRO_SPECIFIC_RECORD = - new TestAvro( - true, - 43, - 44L, - 44.1f, - 44.2d, - "mystring", - ByteBuffer.wrap(new byte[] {1, 2, 3, 4}), - new fixed4(new byte[] {1, 2, 3, 4}), - new LocalDate(1979, 3, 14), - new DateTime().withDate(1979, 3, 14).withTime(1, 2, 3, 4), - TestEnum.abc, - AVRO_NESTED_SPECIFIC_RECORD, - ImmutableList.of(AVRO_NESTED_SPECIFIC_RECORD, AVRO_NESTED_SPECIFIC_RECORD), - ImmutableMap.of("k1", AVRO_NESTED_SPECIFIC_RECORD, "k2", AVRO_NESTED_SPECIFIC_RECORD)); - - @DefaultCoder(AvroCoder.class) - private static class Pojo { - public String text; - public int count; - - @AvroSchema("{\"type\": \"long\", \"logicalType\": \"timestamp-millis\"}") - public DateTime timestamp; - - // Empty constructor required for Avro decoding. - @SuppressWarnings("unused") - public Pojo() {} - - public Pojo(String text, int count, DateTime timestamp) { - this.text = text; - this.count = count; - this.timestamp = timestamp; - } - - // auto-generated - @Override - public boolean equals(@Nullable Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - - Pojo pojo = (Pojo) o; - - if (count != pojo.count) { - return false; - } - if (text != null ? !text.equals(pojo.text) : pojo.text != null) { - return false; - } - if (timestamp != null ? 
!timestamp.equals(pojo.timestamp) : pojo.timestamp != null) { - return false; - } - - return true; - } - - @Override - public int hashCode() { - return 0; - } - - @Override - public String toString() { - return "Pojo{" - + "text='" - + text - + '\'' - + ", count=" - + count - + ", timestamp=" - + timestamp - + '}'; - } - } - - private static class GetTextFn extends DoFn { - @ProcessElement - public void processElement(ProcessContext c) { - c.output(c.element().text); - } - } - - @Rule public TestPipeline pipeline = TestPipeline.create(); - - @Test - public void testAvroCoderEncoding() throws Exception { - AvroCoder coder = AvroCoder.of(Pojo.class); - CoderProperties.coderSerializable(coder); - AvroCoder copy = SerializableUtils.clone(coder); - - Pojo pojo = new Pojo("foo", 3, DATETIME_A); - Pojo equalPojo = new Pojo("foo", 3, DATETIME_A); - Pojo otherPojo = new Pojo("bar", -19, DATETIME_B); - CoderProperties.coderConsistentWithEquals(coder, pojo, equalPojo); - CoderProperties.coderConsistentWithEquals(copy, pojo, equalPojo); - CoderProperties.coderConsistentWithEquals(coder, pojo, otherPojo); - CoderProperties.coderConsistentWithEquals(copy, pojo, otherPojo); - } - - /** - * Tests that {@link AvroCoder} works around issues in Avro where cache classes might be from the - * wrong ClassLoader, causing confusing "Cannot cast X to X" error messages. - */ - @SuppressWarnings("ReturnValueIgnored") - @Test - public void testTwoClassLoaders() throws Exception { - ClassLoader contextClassLoader = Thread.currentThread().getContextClassLoader(); - ClassLoader loader1 = - new InterceptingUrlClassLoader(contextClassLoader, AvroCoderTestPojo.class.getName()); - ClassLoader loader2 = - new InterceptingUrlClassLoader(contextClassLoader, AvroCoderTestPojo.class.getName()); - - Class pojoClass1 = loader1.loadClass(AvroCoderTestPojo.class.getName()); - Class pojoClass2 = loader2.loadClass(AvroCoderTestPojo.class.getName()); - - Object pojo1 = InstanceBuilder.ofType(pojoClass1).withArg(String.class, "hello").build(); - Object pojo2 = InstanceBuilder.ofType(pojoClass2).withArg(String.class, "goodbye").build(); - - // Confirm incompatibility - try { - pojoClass2.cast(pojo1); - fail("Expected ClassCastException; without it, this test is vacuous"); - } catch (ClassCastException e) { - // g2g - } - - // The first coder is expected to populate the Avro SpecificData cache - // The second coder is expected to be corrupted if the caching is done wrong. - AvroCoder avroCoder1 = (AvroCoder) AvroCoder.of(pojoClass1); - AvroCoder avroCoder2 = (AvroCoder) AvroCoder.of(pojoClass2); - - Object cloned1 = CoderUtils.clone(avroCoder1, pojo1); - Object cloned2 = CoderUtils.clone(avroCoder2, pojo2); - - // Confirming that the uncorrupted coder is fine - pojoClass1.cast(cloned1); - - // Confirmed to fail prior to the fix - pojoClass2.cast(cloned2); - } - - /** - * Confirm that we can serialize and deserialize an AvroCoder object and still decode after. - * (https://github.com/apache/beam/issues/18022). 
- * - * @throws Exception - */ - @Test - public void testTransientFieldInitialization() throws Exception { - Pojo value = new Pojo("Hello", 42, DATETIME_A); - AvroCoder coder = AvroCoder.of(Pojo.class); - - // Serialization of object - ByteArrayOutputStream bos = new ByteArrayOutputStream(); - ObjectOutputStream out = new ObjectOutputStream(bos); - out.writeObject(coder); - - // De-serialization of object - ByteArrayInputStream bis = new ByteArrayInputStream(bos.toByteArray()); - ObjectInputStream in = new ObjectInputStream(bis); - AvroCoder copied = (AvroCoder) in.readObject(); - - CoderProperties.coderDecodeEncodeEqual(copied, value); - } - - /** - * Confirm that we can serialize and deserialize an AvroCoder object using Kryo. (BEAM-626). - * - * @throws Exception - */ - @Test - public void testKryoSerialization() throws Exception { - Pojo value = new Pojo("Hello", 42, DATETIME_A); - AvroCoder coder = AvroCoder.of(Pojo.class); - - // Kryo instantiation - Kryo kryo = new Kryo(); - kryo.setInstantiatorStrategy(new StdInstantiatorStrategy()); - - // Serialization of object without any memoization - ByteArrayOutputStream coderWithoutMemoizationBos = new ByteArrayOutputStream(); - try (Output output = new Output(coderWithoutMemoizationBos)) { - kryo.writeObject(output, coder); - } - - // Force thread local memoization to store values. - CoderProperties.coderDecodeEncodeEqual(coder, value); - - // Serialization of object with memoized fields - ByteArrayOutputStream coderWithMemoizationBos = new ByteArrayOutputStream(); - try (Output output = new Output(coderWithMemoizationBos)) { - kryo.writeObject(output, coder); - } - - // Copy empty and memoized variants of the Coder - ByteArrayInputStream bisWithoutMemoization = - new ByteArrayInputStream(coderWithoutMemoizationBos.toByteArray()); - AvroCoder copiedWithoutMemoization = - (AvroCoder) kryo.readObject(new Input(bisWithoutMemoization), AvroCoder.class); - ByteArrayInputStream bisWithMemoization = - new ByteArrayInputStream(coderWithMemoizationBos.toByteArray()); - AvroCoder copiedWithMemoization = - (AvroCoder) kryo.readObject(new Input(bisWithMemoization), AvroCoder.class); - - CoderProperties.coderDecodeEncodeEqual(copiedWithoutMemoization, value); - CoderProperties.coderDecodeEncodeEqual(copiedWithMemoization, value); - } - - @Test - public void testPojoEncoding() throws Exception { - Pojo value = new Pojo("Hello", 42, DATETIME_A); - AvroCoder coder = AvroCoder.of(Pojo.class); - - CoderProperties.coderDecodeEncodeEqual(coder, value); - } - - @Test - public void testSpecificRecordEncoding() throws Exception { - AvroCoder coder = - AvroCoder.of(TestAvro.class, AVRO_SPECIFIC_RECORD.getSchema(), false); - - assertTrue(SpecificRecord.class.isAssignableFrom(coder.getType())); - CoderProperties.coderDecodeEncodeEqual(coder, AVRO_SPECIFIC_RECORD); - } - - @Test - public void testReflectRecordEncoding() throws Exception { - AvroCoder coder = AvroCoder.of(TestAvro.class, true); - AvroCoder coderWithSchema = - AvroCoder.of(TestAvro.class, AVRO_SPECIFIC_RECORD.getSchema(), true); - - assertTrue(SpecificRecord.class.isAssignableFrom(coder.getType())); - assertTrue(SpecificRecord.class.isAssignableFrom(coderWithSchema.getType())); - - CoderProperties.coderDecodeEncodeEqual(coder, AVRO_SPECIFIC_RECORD); - CoderProperties.coderDecodeEncodeEqual(coderWithSchema, AVRO_SPECIFIC_RECORD); - } - - @Test - public void testDisableReflectionEncoding() { - try { - AvroCoder.of(Pojo.class, false); - fail("When userReclectApi is disable, schema should not be generated 
through reflection"); - } catch (AvroRuntimeException e) { - String message = - "avro.shaded.com.google.common.util.concurrent.UncheckedExecutionException: " - + "org.apache.avro.AvroRuntimeException: " - + "Not a Specific class: class org.apache.beam.sdk.coders.AvroCoderTest$Pojo"; - assertEquals(message, e.getMessage()); - } - } - - @Test - public void testGenericRecordEncoding() throws Exception { - String schemaString = - "{\"namespace\": \"example.avro\",\n" - + " \"type\": \"record\",\n" - + " \"name\": \"User\",\n" - + " \"fields\": [\n" - + " {\"name\": \"name\", \"type\": \"string\"},\n" - + " {\"name\": \"favorite_number\", \"type\": [\"int\", \"null\"]},\n" - + " {\"name\": \"favorite_color\", \"type\": [\"string\", \"null\"]}\n" - + " ]\n" - + "}"; - Schema schema = new Schema.Parser().parse(schemaString); - - GenericRecord before = new GenericData.Record(schema); - before.put("name", "Bob"); - before.put("favorite_number", 256); - // Leave favorite_color null - - AvroCoder coder = AvroCoder.of(GenericRecord.class, schema); - - CoderProperties.coderDecodeEncodeEqual(coder, before); - assertEquals(schema, coder.getSchema()); - } - - @Test - public void testEncodingNotBuffered() throws Exception { - // This test ensures that the coder doesn't read ahead and buffer data. - // Reading ahead causes a problem if the stream consists of records of different - // types. - Pojo before = new Pojo("Hello", 42, DATETIME_A); - - AvroCoder coder = AvroCoder.of(Pojo.class); - SerializableCoder intCoder = SerializableCoder.of(Integer.class); - - ByteArrayOutputStream outStream = new ByteArrayOutputStream(); - - Context context = Context.NESTED; - coder.encode(before, outStream, context); - intCoder.encode(10, outStream, context); - - ByteArrayInputStream inStream = new ByteArrayInputStream(outStream.toByteArray()); - - Pojo after = coder.decode(inStream, context); - assertEquals(before, after); - - Integer intAfter = intCoder.decode(inStream, context); - assertEquals(Integer.valueOf(10), intAfter); - } - - @Test - @Category(NeedsRunner.class) - public void testDefaultCoder() throws Exception { - // Use MyRecord as input and output types without explicitly specifying - // a coder (this uses the default coders, which may not be AvroCoder). - PCollection output = - pipeline - .apply(Create.of(new Pojo("hello", 1, DATETIME_A), new Pojo("world", 2, DATETIME_B))) - .apply(ParDo.of(new GetTextFn())); - - PAssert.that(output).containsInAnyOrder("hello", "world"); - pipeline.run(); - } - - @Test - public void testAvroCoderIsSerializable() throws Exception { - AvroCoder coder = AvroCoder.of(Pojo.class); - - // Check that the coder is serializable using the regular JSON approach. - SerializableUtils.ensureSerializable(coder); - } - - @Test - public void testAvroSpecificCoderIsSerializable() throws Exception { - AvroCoder coder = AvroCoder.of(TestAvro.class, false); - - // Check that the coder is serializable using the regular JSON approach. 
- SerializableUtils.ensureSerializable(coder); - } - - private void assertDeterministic(AvroCoder coder) { - try { - coder.verifyDeterministic(); - } catch (NonDeterministicException e) { - fail("Expected " + coder + " to be deterministic, but got:\n" + e); - } - } - - private void assertNonDeterministic(AvroCoder coder, Matcher reason1) { - try { - coder.verifyDeterministic(); - fail("Expected " + coder + " to be non-deterministic."); - } catch (NonDeterministicException e) { - assertThat(e.getReasons(), Matchers.iterableWithSize(1)); - assertThat(e.getReasons(), Matchers.contains(reason1)); - } - } - - @Test - public void testDeterministicInteger() { - assertDeterministic(AvroCoder.of(Integer.class)); - } - - @Test - public void testDeterministicInt() { - assertDeterministic(AvroCoder.of(int.class)); - } - - private static class SimpleDeterministicClass { - @SuppressWarnings("unused") - private Integer intField; - - @SuppressWarnings("unused") - private char charField; - - @SuppressWarnings("unused") - private Integer[] intArray; - - @SuppressWarnings("unused") - private Utf8 utf8field; - } - - @Test - public void testDeterministicSimple() { - assertDeterministic(AvroCoder.of(SimpleDeterministicClass.class)); - } - - private static class UnorderedMapClass { - @SuppressWarnings("unused") - private Map mapField; - } - - private Matcher reason(final String prefix, final String messagePart) { - return new TypeSafeMatcher(String.class) { - @Override - public void describeTo(Description description) { - description.appendText( - String.format("Reason starting with '%s:' containing '%s'", prefix, messagePart)); - } - - @Override - protected boolean matchesSafely(String item) { - return item.startsWith(prefix + ":") && item.contains(messagePart); - } - }; - } - - private Matcher reasonClass(Class clazz, String message) { - return reason(clazz.getName(), message); - } - - private Matcher reasonField(Class clazz, String field, String message) { - return reason(clazz.getName() + "#" + field, message); - } - - @Test - public void testDeterministicUnorderedMap() { - assertNonDeterministic( - AvroCoder.of(UnorderedMapClass.class), - reasonField( - UnorderedMapClass.class, - "mapField", - "java.util.Map " - + "may not be deterministically ordered")); - } - - private static class NonDeterministicArray { - @SuppressWarnings("unused") - private UnorderedMapClass[] arrayField; - } - - @Test - public void testDeterministicNonDeterministicArray() { - assertNonDeterministic( - AvroCoder.of(NonDeterministicArray.class), - reasonField( - UnorderedMapClass.class, - "mapField", - "java.util.Map" - + " may not be deterministically ordered")); - } - - private static class SubclassOfUnorderedMapClass extends UnorderedMapClass {} - - @Test - public void testDeterministicNonDeterministicChild() { - // Super class has non deterministic fields. - assertNonDeterministic( - AvroCoder.of(SubclassOfUnorderedMapClass.class), - reasonField(UnorderedMapClass.class, "mapField", "may not be deterministically ordered")); - } - - private static class SubclassHidingParent extends UnorderedMapClass { - @SuppressWarnings("unused") - @AvroName("mapField2") // AvroName is not enough - private int mapField; - } - - @Test - public void testAvroProhibitsShadowing() { - // This test verifies that Avro won't serialize a class with two fields of - // the same name. This is important for our error reporting, and also how - // we lookup a field. 
- try { - ReflectData.get().getSchema(SubclassHidingParent.class); - fail("Expected AvroTypeException"); - } catch (AvroRuntimeException e) { - assertThat(e.getMessage(), containsString("mapField")); - assertThat(e.getMessage(), containsString("two fields named")); - } - } - - private static class FieldWithAvroName { - @AvroName("name") - @SuppressWarnings("unused") - private int someField; - } - - @Test - public void testDeterministicWithAvroName() { - assertDeterministic(AvroCoder.of(FieldWithAvroName.class)); - } - - @Test - public void testDeterminismSortedMap() { - assertDeterministic(AvroCoder.of(StringSortedMapField.class)); - } - - private static class StringSortedMapField { - @SuppressWarnings("unused") - SortedMap sortedMapField; - } - - @Test - public void testDeterminismTreeMapValue() { - // The value is non-deterministic, so we should fail. - assertNonDeterministic( - AvroCoder.of(TreeMapNonDetValue.class), - reasonField( - UnorderedMapClass.class, - "mapField", - "java.util.Map " - + "may not be deterministically ordered")); - } - - private static class TreeMapNonDetValue { - @SuppressWarnings("unused") - TreeMap nonDeterministicField; - } - - @Test - public void testDeterminismUnorderedMap() { - // LinkedHashMap is not deterministically ordered, so we should fail. - assertNonDeterministic( - AvroCoder.of(LinkedHashMapField.class), - reasonField( - LinkedHashMapField.class, - "nonDeterministicMap", - "java.util.LinkedHashMap " - + "may not be deterministically ordered")); - } - - private static class LinkedHashMapField { - @SuppressWarnings("unused") - LinkedHashMap nonDeterministicMap; - } - - @Test - public void testDeterminismCollection() { - assertNonDeterministic( - AvroCoder.of(StringCollection.class), - reasonField( - StringCollection.class, - "stringCollection", - "java.util.Collection may not be deterministically ordered")); - } - - private static class StringCollection { - @SuppressWarnings("unused") - Collection stringCollection; - } - - @Test - public void testDeterminismList() { - assertDeterministic(AvroCoder.of(StringList.class)); - assertDeterministic(AvroCoder.of(StringArrayList.class)); - } - - private static class StringList { - @SuppressWarnings("unused") - List stringCollection; - } - - private static class StringArrayList { - @SuppressWarnings("unused") - ArrayList stringCollection; - } - - @Test - public void testDeterminismSet() { - assertDeterministic(AvroCoder.of(StringSortedSet.class)); - assertDeterministic(AvroCoder.of(StringTreeSet.class)); - assertNonDeterministic( - AvroCoder.of(StringHashSet.class), - reasonField( - StringHashSet.class, - "stringCollection", - "java.util.HashSet may not be deterministically ordered")); - } - - private static class StringSortedSet { - @SuppressWarnings("unused") - SortedSet stringCollection; - } - - private static class StringTreeSet { - @SuppressWarnings("unused") - TreeSet stringCollection; - } - - private static class StringHashSet { - @SuppressWarnings("unused") - HashSet stringCollection; - } - - @Test - public void testDeterminismCollectionValue() { - assertNonDeterministic( - AvroCoder.of(OrderedSetOfNonDetValues.class), - reasonField(UnorderedMapClass.class, "mapField", "may not be deterministically ordered")); - assertNonDeterministic( - AvroCoder.of(ListOfNonDetValues.class), - reasonField(UnorderedMapClass.class, "mapField", "may not be deterministically ordered")); - } - - private static class OrderedSetOfNonDetValues { - @SuppressWarnings("unused") - SortedSet set; - } - - private static class 
ListOfNonDetValues { - @SuppressWarnings("unused") - List set; - } - - @Test - public void testDeterminismUnion() { - assertDeterministic(AvroCoder.of(DeterministicUnionBase.class)); - assertNonDeterministic( - AvroCoder.of(NonDeterministicUnionBase.class), - reasonField(UnionCase3.class, "mapField", "may not be deterministically ordered")); - } - - @Test - public void testDeterminismStringable() { - assertDeterministic(AvroCoder.of(String.class)); - assertNonDeterministic( - AvroCoder.of(StringableClass.class), - reasonClass(StringableClass.class, "may not have deterministic #toString()")); - } - - @Stringable - private static class StringableClass {} - - @Test - public void testDeterminismCyclicClass() { - assertNonDeterministic( - AvroCoder.of(Cyclic.class), - reasonField(Cyclic.class, "cyclicField", "appears recursively")); - assertNonDeterministic( - AvroCoder.of(CyclicField.class), - reasonField(Cyclic.class, "cyclicField", Cyclic.class.getName() + " appears recursively")); - assertNonDeterministic( - AvroCoder.of(IndirectCycle1.class), - reasonField( - IndirectCycle2.class, - "field2", - IndirectCycle1.class.getName() + " appears recursively")); - } - - private static class Cyclic { - @SuppressWarnings("unused") - int intField; - - @SuppressWarnings("unused") - Cyclic cyclicField; - } - - private static class CyclicField { - @SuppressWarnings("unused") - Cyclic cyclicField2; - } - - private static class IndirectCycle1 { - @SuppressWarnings("unused") - IndirectCycle2 field1; - } - - private static class IndirectCycle2 { - @SuppressWarnings("unused") - IndirectCycle1 field2; - } - - @Test - public void testDeterminismHasGenericRecord() { - assertDeterministic(AvroCoder.of(HasGenericRecord.class)); - } - - private static class HasGenericRecord { - @AvroSchema( - "{\"name\": \"bar\", \"type\": \"record\", \"fields\": [" - + "{\"name\": \"foo\", \"type\": \"int\"}]}") - GenericRecord genericRecord; - } - - @Test - public void testDeterminismHasCustomSchema() { - assertNonDeterministic( - AvroCoder.of(HasCustomSchema.class), - reasonField( - HasCustomSchema.class, - "withCustomSchema", - "Custom schemas are only supported for subtypes of IndexedRecord.")); - } - - private static class HasCustomSchema { - @AvroSchema( - "{\"name\": \"bar\", \"type\": \"record\", \"fields\": [" - + "{\"name\": \"foo\", \"type\": \"int\"}]}") - int withCustomSchema; - } - - @Test - public void testAvroCoderTreeMapDeterminism() throws Exception, NonDeterministicException { - TreeMapField size1 = new TreeMapField(); - TreeMapField size2 = new TreeMapField(); - - // Different order for entries - size1.field.put("hello", "world"); - size1.field.put("another", "entry"); - - size2.field.put("another", "entry"); - size2.field.put("hello", "world"); - - AvroCoder coder = AvroCoder.of(TreeMapField.class); - coder.verifyDeterministic(); - - ByteArrayOutputStream outStream1 = new ByteArrayOutputStream(); - ByteArrayOutputStream outStream2 = new ByteArrayOutputStream(); - - Context context = Context.NESTED; - coder.encode(size1, outStream1, context); - coder.encode(size2, outStream2, context); - - assertArrayEquals(outStream1.toByteArray(), outStream2.toByteArray()); - } - - private static class TreeMapField { - private TreeMap field = new TreeMap<>(); - } - - @Union({UnionCase1.class, UnionCase2.class}) - private abstract static class DeterministicUnionBase {} - - @Union({UnionCase1.class, UnionCase2.class, UnionCase3.class}) - private abstract static class NonDeterministicUnionBase {} - - private static class 
UnionCase1 extends DeterministicUnionBase {} - - private static class UnionCase2 extends DeterministicUnionBase { - @SuppressWarnings("unused") - String field; - } - - private static class UnionCase3 extends NonDeterministicUnionBase { - @SuppressWarnings("unused") - private Map mapField; - } - - @Test - public void testAvroCoderSimpleSchemaDeterminism() { - assertDeterministic(AvroCoder.of(SchemaBuilder.record("someRecord").fields().endRecord())); - assertDeterministic( - AvroCoder.of( - SchemaBuilder.record("someRecord") - .fields() - .name("int") - .type() - .intType() - .noDefault() - .endRecord())); - assertDeterministic( - AvroCoder.of( - SchemaBuilder.record("someRecord") - .fields() - .name("string") - .type() - .stringType() - .noDefault() - .endRecord())); - - assertNonDeterministic( - AvroCoder.of( - SchemaBuilder.record("someRecord") - .fields() - .name("map") - .type() - .map() - .values() - .stringType() - .noDefault() - .endRecord()), - reason("someRecord.map", "HashMap to represent MAPs")); - - assertDeterministic( - AvroCoder.of( - SchemaBuilder.record("someRecord") - .fields() - .name("array") - .type() - .array() - .items() - .stringType() - .noDefault() - .endRecord())); - - assertDeterministic( - AvroCoder.of( - SchemaBuilder.record("someRecord") - .fields() - .name("enum") - .type() - .enumeration("anEnum") - .symbols("s1", "s2") - .enumDefault("s1") - .endRecord())); - - assertDeterministic( - AvroCoder.of( - SchemaBuilder.unionOf() - .intType() - .and() - .record("someRecord") - .fields() - .nullableString("someField", "") - .endRecord() - .endUnion())); - } - - @Test - public void testAvroCoderStrings() { - // Custom Strings in Records - assertDeterministic( - AvroCoder.of( - SchemaBuilder.record("someRecord") - .fields() - .name("string") - .prop(SpecificData.CLASS_PROP, "java.lang.String") - .type() - .stringType() - .noDefault() - .endRecord())); - assertNonDeterministic( - AvroCoder.of( - SchemaBuilder.record("someRecord") - .fields() - .name("string") - .prop(SpecificData.CLASS_PROP, "unknownString") - .type() - .stringType() - .noDefault() - .endRecord()), - reason("someRecord.string", "unknownString is not known to be deterministic")); - - // Custom Strings in Unions - assertNonDeterministic( - AvroCoder.of( - SchemaBuilder.unionOf() - .intType() - .and() - .record("someRecord") - .fields() - .name("someField") - .prop(SpecificData.CLASS_PROP, "unknownString") - .type() - .stringType() - .noDefault() - .endRecord() - .endUnion()), - reason("someRecord.someField", "unknownString is not known to be deterministic")); - } - - @Test - public void testAvroCoderNestedRecords() { - // Nested Record - assertDeterministic( - AvroCoder.of( - SchemaBuilder.record("nestedRecord") - .fields() - .name("subRecord") - .type() - .record("subRecord") - .fields() - .name("innerField") - .type() - .stringType() - .noDefault() - .endRecord() - .noDefault() - .endRecord())); - } - - @Test - public void testAvroCoderCyclicRecords() { - // Recursive record - assertNonDeterministic( - AvroCoder.of( - SchemaBuilder.record("cyclicRecord") - .fields() - .name("cycle") - .type("cyclicRecord") - .noDefault() - .endRecord()), - reason("cyclicRecord.cycle", "cyclicRecord appears recursively")); - } - - private static class NullableField { - @SuppressWarnings("unused") - private @Nullable String nullable; - } - - @Test - public void testNullableField() { - assertDeterministic(AvroCoder.of(NullableField.class)); - } - - private static class NullableNonDeterministicField { - 
@SuppressWarnings("unused") - private @Nullable NonDeterministicArray nullableNonDetArray; - } - - private static class NullableCyclic { - @SuppressWarnings("unused") - private @Nullable NullableCyclic nullableNullableCyclicField; - } - - private static class NullableCyclicField { - @SuppressWarnings("unused") - private @Nullable Cyclic nullableCyclicField; - } - - @Test - public void testNullableNonDeterministicField() { - assertNonDeterministic( - AvroCoder.of(NullableCyclic.class), - reasonField( - NullableCyclic.class, - "nullableNullableCyclicField", - NullableCyclic.class.getName() + " appears recursively")); - assertNonDeterministic( - AvroCoder.of(NullableCyclicField.class), - reasonField(Cyclic.class, "cyclicField", Cyclic.class.getName() + " appears recursively")); - assertNonDeterministic( - AvroCoder.of(NullableNonDeterministicField.class), - reasonField(UnorderedMapClass.class, "mapField", " may not be deterministically ordered")); - } - - /** - * Tests that a parameterized class can have an automatically generated schema if the generic - * field is annotated with a union tag. - */ - @Test - public void testGenericClassWithUnionAnnotation() throws Exception { - // Cast is safe as long as the same coder is used for encoding and decoding. - @SuppressWarnings({"unchecked", "rawtypes"}) - AvroCoder> coder = - (AvroCoder) AvroCoder.of(GenericWithAnnotation.class); - - assertThat( - coder.getSchema().getField("onlySomeTypesAllowed").schema().getType(), - equalTo(Schema.Type.UNION)); - - CoderProperties.coderDecodeEncodeEqual(coder, new GenericWithAnnotation<>("hello")); - } - - private static class GenericWithAnnotation { - @AvroSchema("[\"string\", \"int\"]") - private T onlySomeTypesAllowed; - - public GenericWithAnnotation(T value) { - onlySomeTypesAllowed = value; - } - - // For deserialization only - @SuppressWarnings("unused") - protected GenericWithAnnotation() {} - - @Override - public boolean equals(@Nullable Object other) { - return other instanceof GenericWithAnnotation - && onlySomeTypesAllowed.equals(((GenericWithAnnotation) other).onlySomeTypesAllowed); - } - - @Override - public int hashCode() { - return Objects.hash(getClass(), onlySomeTypesAllowed); - } - } - - @Test - public void testAvroCoderForGenerics() throws Exception { - Schema fooSchema = AvroCoder.of(Foo.class).getSchema(); - Schema schema = - new Schema.Parser() - .parse( - "{" - + "\"type\":\"record\"," - + "\"name\":\"SomeGeneric\"," - + "\"namespace\":\"ns\"," - + "\"fields\":[" - + " {\"name\":\"foo\", \"type\":" - + fooSchema.toString() - + "}" - + "]}"); - @SuppressWarnings("rawtypes") - AvroCoder coder = AvroCoder.of(SomeGeneric.class, schema); - - assertNonDeterministic(coder, reasonField(SomeGeneric.class, "foo", "erasure")); - } - - @Test - public void testEncodedTypeDescriptor() throws Exception { - AvroCoder coder = AvroCoder.of(Pojo.class); - assertThat(coder.getEncodedTypeDescriptor(), equalTo(TypeDescriptor.of(Pojo.class))); - } - - private static class SomeGeneric { - @SuppressWarnings("unused") - private T foo; - } - - private static class Foo { - @SuppressWarnings("unused") - String id; - } -} diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/CoderRegistryTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/CoderRegistryTest.java index 5a8d7e83c4425..36966c2d35d61 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/CoderRegistryTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/CoderRegistryTest.java @@ -463,7 +463,7 
@@ public void testCoderPrecedence() throws Exception {
     CoderRegistry registry = CoderRegistry.createDefault();

     // DefaultCoder precedes CoderProviderRegistrar
-    assertEquals(AvroCoder.of(MyValueA.class), registry.getCoder(MyValueA.class));
+    assertEquals(MockDefaultCoder.of(MyValueA.class), registry.getCoder(MyValueA.class));

     // CoderProviderRegistrar precedes SerializableCoder
     assertEquals(MyValueBCoder.INSTANCE, registry.getCoder(MyValueB.class));
@@ -472,7 +472,7 @@ public void testCoderPrecedence() throws Exception {
     assertEquals(SerializableCoder.of(MyValueC.class), registry.getCoder(MyValueC.class));
   }

-  @DefaultCoder(AvroCoder.class)
+  @DefaultCoder(MockDefaultCoder.class)
   private static class MyValueA implements Serializable {}

   private static class MyValueB implements Serializable {}
diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/DefaultCoderTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/DefaultCoderTest.java
index d120ec07f571e..62d6d7e1d0491 100644
--- a/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/DefaultCoderTest.java
+++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/DefaultCoderTest.java
@@ -39,7 +39,7 @@ public class DefaultCoderTest {

   @Rule public ExpectedException thrown = ExpectedException.none();

-  @DefaultCoder(AvroCoder.class)
+  @DefaultCoder(MockDefaultCoder.class)
   private static class AvroRecord {}

   private static class SerializableBase implements Serializable {}
@@ -111,7 +111,7 @@ public <T> Coder<T> coderFor(
   public void testCodersWithoutComponents() throws Exception {
     CoderRegistry registry = CoderRegistry.createDefault();
     registry.registerCoderProvider(new DefaultCoderProvider());
-    assertThat(registry.getCoder(AvroRecord.class), instanceOf(AvroCoder.class));
+    assertThat(registry.getCoder(AvroRecord.class), instanceOf(MockDefaultCoder.class));
     assertThat(registry.getCoder(SerializableRecord.class), instanceOf(SerializableCoder.class));
     assertThat(registry.getCoder(CustomRecord.class), instanceOf(CustomSerializableCoder.class));
     assertThat(
@@ -125,7 +125,7 @@ public void testDefaultCoderInCollection() throws Exception {
     Coder<List<AvroRecord>> avroRecordCoder = registry.getCoder(new TypeDescriptor<List<AvroRecord>>() {});
     assertThat(avroRecordCoder, instanceOf(ListCoder.class));
-    assertThat(((ListCoder) avroRecordCoder).getElemCoder(), instanceOf(AvroCoder.class));
+    assertThat(((ListCoder) avroRecordCoder).getElemCoder(), instanceOf(MockDefaultCoder.class));
     assertThat(
         registry.getCoder(new TypeDescriptor<List<SerializableRecord>>() {}),
         Matchers.equalTo(ListCoder.of(SerializableCoder.of(SerializableRecord.class))));
diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/MockDefaultCoder.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/MockDefaultCoder.java
new file mode 100644
index 0000000000000..4f20e86bf6bb2
--- /dev/null
+++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/MockDefaultCoder.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.coders;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.List;
+import org.apache.beam.sdk.values.TypeDescriptor;
+
+/**
+ * Used only for tests.
+ *
+ * @param <T>
+ */
+class MockDefaultCoder<T> extends CustomCoder<T> {
+  private static final MockDefaultCoder INSTANCE = new MockDefaultCoder();
+
+  @Override
+  public void encode(T value, OutputStream outStream) throws IOException {}
+
+  @Override
+  public T decode(InputStream inStream) throws IOException {
+    return null;
+  }
+
+  public static <T> MockDefaultCoder<T> of(Class<T> clazz) {
+    return INSTANCE;
+  }
+
+  public static CoderProvider getCoderProvider() {
+    return new MockAvroCoderProvider();
+  }
+
+  static class MockAvroCoderProvider extends CoderProvider {
+    @Override
+    public <T> Coder<T> coderFor(
+        TypeDescriptor<T> typeDescriptor, List<? extends Coder<?>> componentCoders) {
+      return (Coder<T>) MockDefaultCoder.INSTANCE;
+    }
+  }
+}
diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/AvroIOTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/AvroIOTest.java
deleted file mode 100644
index 0126dfb0c02fb..0000000000000
--- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/AvroIOTest.java
+++ /dev/null
@@ -1,1627 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
-
- *     http://www.apache.org/licenses/LICENSE-2.0
-
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
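Before the deleted AvroIOTest listing continues, a short sketch of how the new MockDefaultCoder is meant to be exercised by the precedence tests above. This is illustrative only: it assumes the sketch lives in the same org.apache.beam.sdk.coders package (the class is package-private) and elides the JUnit scaffolding around the real assertions.

```java
package org.apache.beam.sdk.coders;

import java.io.Serializable;

public class MockDefaultCoderSketch {
  // @DefaultCoder on the class takes precedence over registered
  // CoderProviderRegistrars, which in turn precede the SerializableCoder fallback.
  @DefaultCoder(MockDefaultCoder.class)
  private static class MyValue implements Serializable {}

  public static void main(String[] args) throws CannotProvideCoderException {
    CoderRegistry registry = CoderRegistry.createDefault();
    Coder<MyValue> coder = registry.getCoder(MyValue.class);
    // The lookup resolves through MockDefaultCoder.of(Class), so the no-op
    // singleton is returned rather than an Avro-backed coder.
    System.out.println(coder.getClass().getSimpleName()); // MockDefaultCoder
  }
}
```

Swapping in a no-op coder keeps the precedence assertions meaningful while removing the core SDK tests' dependency on Avro, which is consistent with the AvroCoderTest and AvroIOTest deletions elsewhere in this patch.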
- */ -package org.apache.beam.sdk.io; - -import static org.apache.avro.file.DataFileConstants.SNAPPY_CODEC; -import static org.apache.beam.sdk.io.Compression.AUTO; -import static org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions.RESOLVE_FILE; -import static org.apache.beam.sdk.transforms.Contextful.fn; -import static org.apache.beam.sdk.transforms.Requirements.requiresSideInputs; -import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem; -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects.firstNonNull; -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.containsInAnyOrder; -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.io.Serializable; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Random; -import java.util.stream.Collectors; -import org.apache.avro.Schema; -import org.apache.avro.SchemaBuilder; -import org.apache.avro.file.CodecFactory; -import org.apache.avro.file.DataFileReader; -import org.apache.avro.file.DataFileStream; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericDatumReader; -import org.apache.avro.generic.GenericDatumWriter; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.GenericRecordBuilder; -import org.apache.avro.io.DatumWriter; -import org.apache.avro.io.Encoder; -import org.apache.avro.reflect.ReflectData; -import org.apache.avro.reflect.ReflectDatumReader; -import org.apache.beam.sdk.coders.AvroCoder; -import org.apache.beam.sdk.coders.Coder; -import org.apache.beam.sdk.coders.CoderException; -import org.apache.beam.sdk.coders.DefaultCoder; -import org.apache.beam.sdk.coders.KvCoder; -import org.apache.beam.sdk.coders.StringUtf8Coder; -import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; -import org.apache.beam.sdk.io.FileBasedSink.OutputFileHints; -import org.apache.beam.sdk.io.fs.ResourceId; -import org.apache.beam.sdk.options.ValueProvider; -import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; -import org.apache.beam.sdk.testing.NeedsRunner; -import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.testing.TestPipeline; -import org.apache.beam.sdk.testing.TestStream; -import org.apache.beam.sdk.testing.UsesTestStream; -import org.apache.beam.sdk.testing.UsesUnboundedSplittableParDo; -import org.apache.beam.sdk.transforms.Create; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.transforms.PTransform; -import org.apache.beam.sdk.transforms.SerializableFunction; -import org.apache.beam.sdk.transforms.SimpleFunction; -import org.apache.beam.sdk.transforms.View; -import org.apache.beam.sdk.transforms.Watch; -import org.apache.beam.sdk.transforms.display.DisplayData; -import org.apache.beam.sdk.transforms.windowing.AfterPane; -import org.apache.beam.sdk.transforms.windowing.BoundedWindow; -import org.apache.beam.sdk.transforms.windowing.FixedWindows; -import org.apache.beam.sdk.transforms.windowing.IntervalWindow; -import 
org.apache.beam.sdk.transforms.windowing.PaneInfo; -import org.apache.beam.sdk.transforms.windowing.Repeatedly; -import org.apache.beam.sdk.transforms.windowing.Window; -import org.apache.beam.sdk.util.SerializableUtils; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.PCollectionView; -import org.apache.beam.sdk.values.TimestampedValue; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ArrayListMultimap; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterators; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Multimap; -import org.checkerframework.checker.nullness.qual.Nullable; -import org.joda.time.Duration; -import org.joda.time.Instant; -import org.junit.Rule; -import org.junit.Test; -import org.junit.experimental.categories.Category; -import org.junit.rules.ExpectedException; -import org.junit.rules.TemporaryFolder; -import org.junit.rules.Timeout; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; -import org.junit.runners.Parameterized; - -/** Tests for AvroIO Read and Write transforms. */ -@SuppressWarnings({ - "rawtypes", // TODO(https://github.com/apache/beam/issues/20447) -}) -public class AvroIOTest implements Serializable { - /** Unit tests. 
*/ - @RunWith(JUnit4.class) - public static class SimpleTests implements Serializable { - @Test - public void testAvroIOGetName() { - assertEquals("AvroIO.Read", AvroIO.read(String.class).from("/tmp/foo*/baz").getName()); - assertEquals("AvroIO.Write", AvroIO.write(String.class).to("/tmp/foo/baz").getName()); - } - - @Test - public void testWriteWithDefaultCodec() { - AvroIO.Write write = AvroIO.write(String.class).to("/tmp/foo/baz"); - assertEquals(CodecFactory.snappyCodec().toString(), write.inner.getCodec().toString()); - } - - @Test - public void testWriteWithCustomCodec() { - AvroIO.Write write = - AvroIO.write(String.class).to("/tmp/foo/baz").withCodec(CodecFactory.snappyCodec()); - assertEquals(SNAPPY_CODEC, write.inner.getCodec().toString()); - } - - @Test - public void testWriteWithSerDeCustomDeflateCodec() { - AvroIO.Write write = - AvroIO.write(String.class).to("/tmp/foo/baz").withCodec(CodecFactory.deflateCodec(9)); - - assertEquals( - CodecFactory.deflateCodec(9).toString(), - SerializableUtils.clone(write.inner.getCodec()).getCodec().toString()); - } - - @Test - public void testWriteWithSerDeCustomXZCodec() { - AvroIO.Write write = - AvroIO.write(String.class).to("/tmp/foo/baz").withCodec(CodecFactory.xzCodec(9)); - - assertEquals( - CodecFactory.xzCodec(9).toString(), - SerializableUtils.clone(write.inner.getCodec()).getCodec().toString()); - } - - @Test - public void testReadDisplayData() { - AvroIO.Read read = AvroIO.read(String.class).from("/foo.*"); - - DisplayData displayData = DisplayData.from(read); - assertThat(displayData, hasDisplayItem("filePattern", "/foo.*")); - } - } - - /** NeedsRunner tests. */ - @RunWith(Parameterized.class) - @Category(NeedsRunner.class) - public static class NeedsRunnerTests implements Serializable { - @Rule public transient TestPipeline writePipeline = TestPipeline.create(); - - @Rule public transient TestPipeline readPipeline = TestPipeline.create(); - - @Rule public transient TestPipeline windowedAvroWritePipeline = TestPipeline.create(); - - @Rule public transient TemporaryFolder tmpFolder = new TemporaryFolder(); - - @Rule public transient ExpectedException expectedException = ExpectedException.none(); - - @Rule public transient Timeout globalTimeout = Timeout.seconds(1200); - - @Parameterized.Parameters(name = "{index}: {0}") - public static Collection params() { - return Arrays.asList(new Object[][] {{true}, {false}}); - } - - @Parameterized.Parameter public boolean withBeamSchemas; - - @DefaultCoder(AvroCoder.class) - static class GenericClass { - int intField; - String stringField; - - GenericClass() {} - - GenericClass(int intField, String stringField) { - this.intField = intField; - this.stringField = stringField; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(getClass()) - .add("intField", intField) - .add("stringField", stringField) - .toString(); - } - - @Override - public int hashCode() { - return Objects.hash(intField, stringField); - } - - @Override - public boolean equals(@Nullable Object other) { - if (other == null || !(other instanceof GenericClass)) { - return false; - } - GenericClass o = (GenericClass) other; - return intField == o.intField && Objects.equals(stringField, o.stringField); - } - } - - private static class ParseGenericClass - implements SerializableFunction { - @Override - public GenericClass apply(GenericRecord input) { - return new GenericClass((int) input.get("intField"), input.get("stringField").toString()); - } - - @Test - public void testWriteDisplayData() { - 
AvroIO.Write write = - AvroIO.write(GenericClass.class) - .to("/foo") - .withShardNameTemplate("-SS-of-NN-") - .withSuffix("bar") - .withNumShards(100) - .withCodec(CodecFactory.deflateCodec(6)); - - DisplayData displayData = DisplayData.from(write); - - assertThat(displayData, hasDisplayItem("filePrefix", "/foo")); - assertThat(displayData, hasDisplayItem("shardNameTemplate", "-SS-of-NN-")); - assertThat(displayData, hasDisplayItem("fileSuffix", "bar")); - assertThat( - displayData, - hasDisplayItem( - "schema", - "{\"type\":\"record\",\"name\":\"GenericClass\",\"namespace\":\"org.apache.beam.sdk.io" - + ".AvroIOTest$\",\"fields\":[{\"name\":\"intField\",\"type\":\"int\"}," - + "{\"name\":\"stringField\",\"type\":\"string\"}]}")); - assertThat(displayData, hasDisplayItem("numShards", 100)); - assertThat(displayData, hasDisplayItem("codec", CodecFactory.deflateCodec(6).toString())); - } - } - - private enum Sharding { - RUNNER_DETERMINED, - WITHOUT_SHARDING, - FIXED_3_SHARDS - } - - private enum WriteMethod { - AVROIO_WRITE, - AVROIO_SINK_WITH_CLASS, - AVROIO_SINK_WITH_SCHEMA, - /** @deprecated Test code for the deprecated {AvroIO.RecordFormatter}. */ - @Deprecated - AVROIO_SINK_WITH_FORMATTER - } - - private static final String SCHEMA_STRING = - "{\"namespace\": \"example.avro\",\n" - + " \"type\": \"record\",\n" - + " \"name\": \"AvroGeneratedUser\",\n" - + " \"fields\": [\n" - + " {\"name\": \"name\", \"type\": \"string\"},\n" - + " {\"name\": \"favorite_number\", \"type\": [\"int\", \"null\"]},\n" - + " {\"name\": \"favorite_color\", \"type\": [\"string\", \"null\"]}\n" - + " ]\n" - + "}"; - - private static final Schema SCHEMA = new Schema.Parser().parse(SCHEMA_STRING); - - @Test - @Category(NeedsRunner.class) - public void testWriteThenReadJavaClass() throws Throwable { - List values = - ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar")); - File outputFile = tmpFolder.newFile("output.avro"); - - writePipeline - .apply(Create.of(values)) - .apply( - AvroIO.write(GenericClass.class) - .to(writePipeline.newProvider(outputFile.getAbsolutePath())) - .withoutSharding()); - writePipeline.run(); - - PAssert.that( - readPipeline.apply( - "Read", - AvroIO.read(GenericClass.class) - .withBeamSchemas(withBeamSchemas) - .from(readPipeline.newProvider(outputFile.getAbsolutePath())))) - .containsInAnyOrder(values); - - readPipeline.run(); - } - - @Test - @Category(NeedsRunner.class) - public void testReadWithFilename() throws Throwable { - List values = - ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar")); - File outputFile = tmpFolder.newFile("output.avro"); - - writePipeline - .apply(Create.of(values)) - .apply( - AvroIO.write(GenericClass.class) - .to(writePipeline.newProvider(outputFile.getAbsolutePath())) - .withoutSharding()); - writePipeline.run(); - - SerializableFunction> createSource = - input -> - AvroSource.from(ValueProvider.StaticValueProvider.of(input)) - .withSchema(GenericClass.class); - - final PCollection> lines = - readPipeline - .apply(Create.of(Collections.singletonList(outputFile.getAbsolutePath()))) - .apply(FileIO.matchAll()) - .apply(FileIO.readMatches().withCompression(AUTO)) - .apply( - new ReadAllViaFileBasedSourceWithFilename<>( - 10, - createSource, - KvCoder.of(StringUtf8Coder.of(), AvroCoder.of(GenericClass.class)))); - - PAssert.that(lines) - .containsInAnyOrder( - values.stream() - .map(v -> KV.of(outputFile.getAbsolutePath(), v)) - .collect(Collectors.toList())); - readPipeline.run(); - } - - @Test - 
@Category(NeedsRunner.class) - public void testWriteThenReadCustomType() throws Throwable { - List values = Arrays.asList(0L, 1L, 2L); - File outputFile = tmpFolder.newFile("output.avro"); - - writePipeline - .apply(Create.of(values)) - .apply( - AvroIO.writeCustomType() - .to(writePipeline.newProvider(outputFile.getAbsolutePath())) - .withFormatFunction(new CreateGenericClass()) - .withSchema(ReflectData.get().getSchema(GenericClass.class)) - .withoutSharding()); - writePipeline.run(); - - PAssert.that( - readPipeline - .apply( - "Read", - AvroIO.read(GenericClass.class) - .withBeamSchemas(withBeamSchemas) - .from(readPipeline.newProvider(outputFile.getAbsolutePath()))) - .apply( - MapElements.via( - new SimpleFunction() { - @Override - public Long apply(GenericClass input) { - return (long) input.intField; - } - }))) - .containsInAnyOrder(values); - - readPipeline.run(); - } - - private void testWriteThenReadGeneratedClass( - AvroIO.Write writeTransform, AvroIO.Read readTransform) throws Exception { - File outputFile = tmpFolder.newFile("output.avro"); - - List values = - ImmutableList.of( - (T) new AvroGeneratedUser("Bob", 256, null), - (T) new AvroGeneratedUser("Alice", 128, null), - (T) new AvroGeneratedUser("Ted", null, "white")); - - writePipeline - .apply(Create.of(values)) - .apply( - writeTransform - .to(writePipeline.newProvider(outputFile.getAbsolutePath())) - .withoutSharding()); - writePipeline.run(); - - PAssert.that( - readPipeline.apply( - "Read", - readTransform.from(readPipeline.newProvider(outputFile.getAbsolutePath())))) - .containsInAnyOrder(values); - - readPipeline.run(); - } - - @Test - @Category(NeedsRunner.class) - public void testWriteThenReadGeneratedClassWithClass() throws Throwable { - testWriteThenReadGeneratedClass( - AvroIO.write(AvroGeneratedUser.class), - AvroIO.read(AvroGeneratedUser.class).withBeamSchemas(withBeamSchemas)); - } - - @Test - @Category(NeedsRunner.class) - public void testWriteThenReadGeneratedClassWithSchema() throws Throwable { - testWriteThenReadGeneratedClass( - AvroIO.writeGenericRecords(SCHEMA), - AvroIO.readGenericRecords(SCHEMA).withBeamSchemas(withBeamSchemas)); - } - - @Test - @Category(NeedsRunner.class) - public void testWriteThenReadGeneratedClassWithSchemaString() throws Throwable { - testWriteThenReadGeneratedClass( - AvroIO.writeGenericRecords(SCHEMA.toString()), - AvroIO.readGenericRecords(SCHEMA.toString()).withBeamSchemas(withBeamSchemas)); - } - - @Test - @Category(NeedsRunner.class) - public void testWriteSingleFileThenReadUsingAllMethods() throws Throwable { - List values = - ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar")); - File outputFile = tmpFolder.newFile("output.avro"); - - writePipeline - .apply(Create.of(values)) - .apply( - AvroIO.write(GenericClass.class).to(outputFile.getAbsolutePath()).withoutSharding()); - writePipeline.run(); - - // Test the same data using all versions of read(). 
- PCollection path = - readPipeline.apply("Create path", Create.of(outputFile.getAbsolutePath())); - PAssert.that( - readPipeline.apply( - "Read", - AvroIO.read(GenericClass.class) - .withBeamSchemas(withBeamSchemas) - .from(outputFile.getAbsolutePath()))) - .containsInAnyOrder(values); - PAssert.that( - readPipeline.apply( - "Read withHintMatchesManyFiles", - AvroIO.read(GenericClass.class) - .withBeamSchemas(withBeamSchemas) - .from(outputFile.getAbsolutePath()) - .withHintMatchesManyFiles())) - .containsInAnyOrder(values); - PAssert.that( - path.apply("MatchAllReadFiles", FileIO.matchAll()) - .apply("ReadMatchesReadFiles", FileIO.readMatches().withCompression(AUTO)) - .apply( - "ReadFiles", - AvroIO.readFiles(GenericClass.class) - .withBeamSchemas(withBeamSchemas) - .withDesiredBundleSizeBytes(10))) - .containsInAnyOrder(values); - PAssert.that( - path.apply( - "ReadAll", - AvroIO.readAll(GenericClass.class) - .withBeamSchemas(withBeamSchemas) - .withDesiredBundleSizeBytes(10))) - .containsInAnyOrder(values); - PAssert.that( - readPipeline.apply( - "Parse", - AvroIO.parseGenericRecords(new ParseGenericClass()) - .from(outputFile.getAbsolutePath()) - .withCoder(AvroCoder.of(GenericClass.class)))) - .containsInAnyOrder(values); - PAssert.that( - readPipeline.apply( - "Parse withHintMatchesManyFiles", - AvroIO.parseGenericRecords(new ParseGenericClass()) - .from(outputFile.getAbsolutePath()) - .withCoder(AvroCoder.of(GenericClass.class)) - .withHintMatchesManyFiles())) - .containsInAnyOrder(values); - PAssert.that( - path.apply("MatchAllParseFilesGenericRecords", FileIO.matchAll()) - .apply( - "ReadMatchesParseFilesGenericRecords", - FileIO.readMatches() - .withDirectoryTreatment(FileIO.ReadMatches.DirectoryTreatment.PROHIBIT)) - .apply( - "ParseFilesGenericRecords", - AvroIO.parseFilesGenericRecords(new ParseGenericClass()) - .withCoder(AvroCoder.of(GenericClass.class)) - .withUsesReshuffle(false) - .withDesiredBundleSizeBytes(10))) - .containsInAnyOrder(values); - PAssert.that( - path.apply("MatchAllParseFilesGenericRecordsWithShuffle", FileIO.matchAll()) - .apply( - "ReadMatchesParseFilesGenericRecordsWithShuffle", - FileIO.readMatches() - .withDirectoryTreatment(FileIO.ReadMatches.DirectoryTreatment.PROHIBIT)) - .apply( - "ParseFilesGenericRecordsWithShuffle", - AvroIO.parseFilesGenericRecords(new ParseGenericClass()) - .withCoder(AvroCoder.of(GenericClass.class)) - .withUsesReshuffle(true) - .withDesiredBundleSizeBytes(10))) - .containsInAnyOrder(values); - PAssert.that( - path.apply( - "ParseAllGenericRecords", - AvroIO.parseAllGenericRecords(new ParseGenericClass()) - .withCoder(AvroCoder.of(GenericClass.class)) - .withDesiredBundleSizeBytes(10))) - .containsInAnyOrder(values); - - readPipeline.run(); - } - - @Test - @Category(NeedsRunner.class) - public void testWriteThenReadMultipleFilepatterns() { - List firstValues = new ArrayList<>(); - List secondValues = new ArrayList<>(); - for (int i = 0; i < 10; ++i) { - firstValues.add(new GenericClass(i, "a" + i)); - secondValues.add(new GenericClass(i, "b" + i)); - } - writePipeline - .apply("Create first", Create.of(firstValues)) - .apply( - "Write first", - AvroIO.write(GenericClass.class) - .to(tmpFolder.getRoot().getAbsolutePath() + "/first") - .withNumShards(2)); - writePipeline - .apply("Create second", Create.of(secondValues)) - .apply( - "Write second", - AvroIO.write(GenericClass.class) - .to(tmpFolder.getRoot().getAbsolutePath() + "/second") - .withNumShards(3)); - writePipeline.run(); - - // Test readFiles(), readAll(), 
parseFilesGenericRecords() and parseAllGenericRecords(). - PCollection paths = - readPipeline.apply( - "Create paths", - Create.of( - tmpFolder.getRoot().getAbsolutePath() + "/first*", - tmpFolder.getRoot().getAbsolutePath() + "/second*")); - PAssert.that( - paths - .apply("MatchAllReadFiles", FileIO.matchAll()) - .apply("ReadMatchesReadFiles", FileIO.readMatches().withCompression(AUTO)) - .apply( - "ReadFiles", - AvroIO.readFiles(GenericClass.class) - .withBeamSchemas(withBeamSchemas) - .withDesiredBundleSizeBytes(10))) - .containsInAnyOrder(Iterables.concat(firstValues, secondValues)); - PAssert.that( - paths.apply( - "ReadAll", - AvroIO.readAll(GenericClass.class) - .withBeamSchemas(withBeamSchemas) - .withDesiredBundleSizeBytes(10))) - .containsInAnyOrder(Iterables.concat(firstValues, secondValues)); - PAssert.that( - paths - .apply("MatchAllParseFilesGenericRecords", FileIO.matchAll()) - .apply( - "ReadMatchesParseFilesGenericRecords", - FileIO.readMatches() - .withDirectoryTreatment(FileIO.ReadMatches.DirectoryTreatment.PROHIBIT)) - .apply( - "ParseFilesGenericRecords", - AvroIO.parseFilesGenericRecords(new ParseGenericClass()) - .withCoder(AvroCoder.of(GenericClass.class)) - .withDesiredBundleSizeBytes(10))) - .containsInAnyOrder(Iterables.concat(firstValues, secondValues)); - PAssert.that( - paths.apply( - "ParseAllGenericRecords", - AvroIO.parseAllGenericRecords(new ParseGenericClass()) - .withCoder(AvroCoder.of(GenericClass.class)) - .withDesiredBundleSizeBytes(10))) - .containsInAnyOrder(Iterables.concat(firstValues, secondValues)); - - readPipeline.run(); - } - - private static class CreateGenericClass extends SimpleFunction { - @Override - public GenericClass apply(Long i) { - return new GenericClass(i.intValue(), "value" + i); - } - } - - @Test - @Category({NeedsRunner.class, UsesUnboundedSplittableParDo.class}) - public void testContinuouslyWriteAndReadMultipleFilepatterns() { - SimpleFunction mapFn = new CreateGenericClass(); - List firstValues = new ArrayList<>(); - List secondValues = new ArrayList<>(); - for (int i = 0; i < 7; ++i) { - (i < 3 ? firstValues : secondValues).add(mapFn.apply((long) i)); - } - // Configure windowing of the input so that it fires every time a new element is generated, - // so that files are written continuously. - Window window = - Window.into(FixedWindows.of(Duration.millis(100))) - .withAllowedLateness(Duration.ZERO) - .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1))) - .discardingFiredPanes(); - readPipeline - .apply("Sequence first", GenerateSequence.from(0).to(3).withRate(1, Duration.millis(300))) - .apply("Window first", window) - .apply("Map first", MapElements.via(mapFn)) - .apply( - "Write first", - AvroIO.write(GenericClass.class) - .to(tmpFolder.getRoot().getAbsolutePath() + "/first") - .withNumShards(2) - .withWindowedWrites()); - readPipeline - .apply( - "Sequence second", GenerateSequence.from(3).to(7).withRate(1, Duration.millis(300))) - .apply("Window second", window) - .apply("Map second", MapElements.via(mapFn)) - .apply( - "Write second", - AvroIO.write(GenericClass.class) - .to(tmpFolder.getRoot().getAbsolutePath() + "/second") - .withNumShards(3) - .withWindowedWrites()); - - // Test read(), readFiles(), readAll(), parse(), parseFilesGenericRecords() and - // parseAllGenericRecords() with watchForNewFiles(). 
- PAssert.that( - readPipeline.apply( - "Read", - AvroIO.read(GenericClass.class) - .withBeamSchemas(withBeamSchemas) - .from(tmpFolder.getRoot().getAbsolutePath() + "/first*") - .watchForNewFiles( - Duration.millis(100), - Watch.Growth.afterTimeSinceNewOutput(Duration.standardSeconds(3))))) - .containsInAnyOrder(firstValues); - PAssert.that( - readPipeline.apply( - "Parse", - AvroIO.parseGenericRecords(new ParseGenericClass()) - .from(tmpFolder.getRoot().getAbsolutePath() + "/first*") - .watchForNewFiles( - Duration.millis(100), - Watch.Growth.afterTimeSinceNewOutput(Duration.standardSeconds(3))))) - .containsInAnyOrder(firstValues); - - PCollection paths = - readPipeline.apply( - "Create paths", - Create.of( - tmpFolder.getRoot().getAbsolutePath() + "/first*", - tmpFolder.getRoot().getAbsolutePath() + "/second*")); - PAssert.that( - paths - .apply( - "Match All Read files", - FileIO.matchAll() - .continuously( - Duration.millis(100), - Watch.Growth.afterTimeSinceNewOutput(Duration.standardSeconds(3)))) - .apply( - "Read Matches Read files", - FileIO.readMatches() - .withDirectoryTreatment(FileIO.ReadMatches.DirectoryTreatment.PROHIBIT)) - .apply( - "Read files", - AvroIO.readFiles(GenericClass.class) - .withBeamSchemas(withBeamSchemas) - .withDesiredBundleSizeBytes(10))) - .containsInAnyOrder(Iterables.concat(firstValues, secondValues)); - PAssert.that( - paths.apply( - "Read all", - AvroIO.readAll(GenericClass.class) - .withBeamSchemas(withBeamSchemas) - .watchForNewFiles( - Duration.millis(100), - Watch.Growth.afterTimeSinceNewOutput(Duration.standardSeconds(3))) - .withDesiredBundleSizeBytes(10))) - .containsInAnyOrder(Iterables.concat(firstValues, secondValues)); - PAssert.that( - paths - .apply( - "Match All ParseFilesGenericRecords", - FileIO.matchAll() - .continuously( - Duration.millis(100), - Watch.Growth.afterTimeSinceNewOutput(Duration.standardSeconds(3)))) - .apply( - "Match Matches ParseFilesGenericRecords", - FileIO.readMatches() - .withDirectoryTreatment(FileIO.ReadMatches.DirectoryTreatment.PROHIBIT)) - .apply( - "ParseFilesGenericRecords", - AvroIO.parseFilesGenericRecords(new ParseGenericClass()) - .withCoder(AvroCoder.of(GenericClass.class)) - .withDesiredBundleSizeBytes(10))) - .containsInAnyOrder(Iterables.concat(firstValues, secondValues)); - PAssert.that( - paths.apply( - "ParseAllGenericRecords", - AvroIO.parseAllGenericRecords(new ParseGenericClass()) - .withCoder(AvroCoder.of(GenericClass.class)) - .watchForNewFiles( - Duration.millis(100), - Watch.Growth.afterTimeSinceNewOutput(Duration.standardSeconds(3))) - .withDesiredBundleSizeBytes(10))) - .containsInAnyOrder(Iterables.concat(firstValues, secondValues)); - readPipeline.run(); - } - - @Test - @SuppressWarnings("unchecked") - @Category(NeedsRunner.class) - public void testCompressedWriteAndReadASingleFile() throws Throwable { - List values = - ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar")); - File outputFile = tmpFolder.newFile("output.avro"); - - writePipeline - .apply(Create.of(values)) - .apply( - AvroIO.write(GenericClass.class) - .to(outputFile.getAbsolutePath()) - .withoutSharding() - .withCodec(CodecFactory.deflateCodec(9))); - writePipeline.run(); - - PAssert.that( - readPipeline.apply( - AvroIO.read(GenericClass.class) - .withBeamSchemas(withBeamSchemas) - .from(outputFile.getAbsolutePath()))) - .containsInAnyOrder(values); - readPipeline.run(); - - try (DataFileStream dataFileStream = - new DataFileStream(new FileInputStream(outputFile), new GenericDatumReader())) { - 
assertEquals("deflate", dataFileStream.getMetaString("avro.codec")); - } - } - - @Test - @SuppressWarnings("unchecked") - @Category(NeedsRunner.class) - public void testWriteThenReadASingleFileWithNullCodec() throws Throwable { - List values = - ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar")); - File outputFile = tmpFolder.newFile("output.avro"); - - writePipeline - .apply(Create.of(values)) - .apply( - AvroIO.write(GenericClass.class) - .to(outputFile.getAbsolutePath()) - .withoutSharding() - .withCodec(CodecFactory.nullCodec())); - writePipeline.run(); - - PAssert.that( - readPipeline.apply( - AvroIO.read(GenericClass.class) - .withBeamSchemas(withBeamSchemas) - .from(outputFile.getAbsolutePath()))) - .containsInAnyOrder(values); - readPipeline.run(); - - try (DataFileStream dataFileStream = - new DataFileStream(new FileInputStream(outputFile), new GenericDatumReader())) { - assertEquals("null", dataFileStream.getMetaString("avro.codec")); - } - } - - @DefaultCoder(AvroCoder.class) - static class GenericClassV2 { - int intField; - String stringField; - @org.apache.avro.reflect.Nullable String nullableField; - - GenericClassV2() {} - - GenericClassV2(int intValue, String stringValue, String nullableValue) { - this.intField = intValue; - this.stringField = stringValue; - this.nullableField = nullableValue; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(getClass()) - .add("intField", intField) - .add("stringField", stringField) - .add("nullableField", nullableField) - .toString(); - } - - @Override - public int hashCode() { - return Objects.hash(intField, stringField, nullableField); - } - - @Override - public boolean equals(@Nullable Object other) { - if (!(other instanceof GenericClassV2)) { - return false; - } - GenericClassV2 o = (GenericClassV2) other; - return intField == o.intField - && Objects.equals(stringField, o.stringField) - && Objects.equals(nullableField, o.nullableField); - } - } - - /** - * Tests that {@code AvroIO} can read an upgraded version of an old class, as long as the schema - * resolution process succeeds. This test covers the case when a new, {@code @Nullable} field - * has been added. - * - *
<p>
For more information, see http://avro.apache.org/docs/1.7.7/spec.html#Schema+Resolution - */ - @Test - @Category(NeedsRunner.class) - public void testWriteThenReadSchemaUpgrade() throws Throwable { - List values = - ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar")); - File outputFile = tmpFolder.newFile("output.avro"); - - writePipeline - .apply(Create.of(values)) - .apply( - AvroIO.write(GenericClass.class).to(outputFile.getAbsolutePath()).withoutSharding()); - writePipeline.run(); - - List expected = - ImmutableList.of(new GenericClassV2(3, "hi", null), new GenericClassV2(5, "bar", null)); - - PAssert.that( - readPipeline.apply( - AvroIO.read(GenericClassV2.class) - .withBeamSchemas(withBeamSchemas) - .from(outputFile.getAbsolutePath()))) - .containsInAnyOrder(expected); - readPipeline.run(); - } - - private static class WindowedFilenamePolicy extends FilenamePolicy { - final ResourceId outputFilePrefix; - - WindowedFilenamePolicy(ResourceId outputFilePrefix) { - this.outputFilePrefix = outputFilePrefix; - } - - @Override - public ResourceId windowedFilename( - int shardNumber, - int numShards, - BoundedWindow window, - PaneInfo paneInfo, - OutputFileHints outputFileHints) { - String filenamePrefix = - outputFilePrefix.isDirectory() ? "" : firstNonNull(outputFilePrefix.getFilename(), ""); - - IntervalWindow interval = (IntervalWindow) window; - String windowStr = - String.format("%s-%s", interval.start().toString(), interval.end().toString()); - String filename = - String.format( - "%s-%s-%s-of-%s-pane-%s%s%s.avro", - filenamePrefix, - windowStr, - shardNumber, - numShards, - paneInfo.getIndex(), - paneInfo.isLast() ? "-last" : "", - outputFileHints.getSuggestedFilenameSuffix()); - return outputFilePrefix.getCurrentDirectory().resolve(filename, RESOLVE_FILE); - } - - @Override - public ResourceId unwindowedFilename( - int shardNumber, int numShards, OutputFileHints outputFileHints) { - throw new UnsupportedOperationException("Expecting windowed outputs only"); - } - - @Override - public void populateDisplayData(DisplayData.Builder builder) { - builder.add( - DisplayData.item("fileNamePrefix", outputFilePrefix.toString()) - .withLabel("File Name Prefix")); - } - } - - @Test - @Category({NeedsRunner.class, UsesTestStream.class}) - public void testWriteWindowed() throws Throwable { - testWindowedAvroIOWriteUsingMethod(WriteMethod.AVROIO_WRITE); - } - - @Test - @Category({NeedsRunner.class, UsesTestStream.class}) - public void testWindowedAvroIOWriteViaSink() throws Throwable { - testWindowedAvroIOWriteUsingMethod(WriteMethod.AVROIO_SINK_WITH_CLASS); - } - - void testWindowedAvroIOWriteUsingMethod(WriteMethod method) throws IOException { - Path baseDir = Files.createTempDirectory(tmpFolder.getRoot().toPath(), "testwrite"); - final String baseFilename = baseDir.resolve("prefix").toString(); - - Instant base = new Instant(0); - ArrayList allElements = new ArrayList<>(); - ArrayList> firstWindowElements = new ArrayList<>(); - ArrayList firstWindowTimestamps = - Lists.newArrayList( - base.plus(Duration.ZERO), base.plus(Duration.standardSeconds(10)), - base.plus(Duration.standardSeconds(20)), base.plus(Duration.standardSeconds(30))); - - Random random = new Random(); - for (int i = 0; i < 100; ++i) { - GenericClass item = new GenericClass(i, String.valueOf(i)); - allElements.add(item); - firstWindowElements.add( - TimestampedValue.of( - item, firstWindowTimestamps.get(random.nextInt(firstWindowTimestamps.size())))); - } - - ArrayList> secondWindowElements = new 
ArrayList<>(); - ArrayList secondWindowTimestamps = - Lists.newArrayList( - base.plus(Duration.standardSeconds(60)), base.plus(Duration.standardSeconds(70)), - base.plus(Duration.standardSeconds(80)), base.plus(Duration.standardSeconds(90))); - for (int i = 100; i < 200; ++i) { - GenericClass item = new GenericClass(i, String.valueOf(i)); - allElements.add(new GenericClass(i, String.valueOf(i))); - secondWindowElements.add( - TimestampedValue.of( - item, secondWindowTimestamps.get(random.nextInt(secondWindowTimestamps.size())))); - } - - TimestampedValue[] firstWindowArray = - firstWindowElements.toArray(new TimestampedValue[100]); - TimestampedValue[] secondWindowArray = - secondWindowElements.toArray(new TimestampedValue[100]); - - TestStream values = - TestStream.create(AvroCoder.of(GenericClass.class)) - .advanceWatermarkTo(new Instant(0)) - .addElements( - firstWindowArray[0], - Arrays.copyOfRange(firstWindowArray, 1, firstWindowArray.length)) - .advanceWatermarkTo(new Instant(0).plus(Duration.standardMinutes(1))) - .addElements( - secondWindowArray[0], - Arrays.copyOfRange(secondWindowArray, 1, secondWindowArray.length)) - .advanceWatermarkToInfinity(); - - final PTransform, WriteFilesResult> write; - switch (method) { - case AVROIO_WRITE: - { - FilenamePolicy policy = - new WindowedFilenamePolicy( - FileBasedSink.convertToFileResourceIfPossible(baseFilename)); - write = - AvroIO.write(GenericClass.class) - .to(policy) - .withTempDirectory( - StaticValueProvider.of( - FileSystems.matchNewResource(baseDir.toString(), true))) - .withWindowedWrites() - .withNumShards(2) - .withOutputFilenames(); - break; - } - - case AVROIO_SINK_WITH_CLASS: - { - write = - FileIO.write() - .via(AvroIO.sink(GenericClass.class)) - .to(baseDir.toString()) - .withPrefix("prefix") - .withSuffix(".avro") - .withTempDirectory(baseDir.toString()) - .withNumShards(2); - break; - } - - default: - throw new UnsupportedOperationException(); - } - windowedAvroWritePipeline - .apply(values) - .apply(Window.into(FixedWindows.of(Duration.standardMinutes(1)))) - .apply(write); - windowedAvroWritePipeline.run(); - - // Validate that the data written matches the expected elements in the expected order - List expectedFiles = new ArrayList<>(); - for (int shard = 0; shard < 2; shard++) { - for (int window = 0; window < 2; window++) { - Instant windowStart = new Instant(0).plus(Duration.standardMinutes(window)); - IntervalWindow iw = new IntervalWindow(windowStart, Duration.standardMinutes(1)); - String baseAndWindow = baseFilename + "-" + iw.start() + "-" + iw.end(); - switch (method) { - case AVROIO_WRITE: - expectedFiles.add(new File(baseAndWindow + "-" + shard + "-of-2-pane-0-last.avro")); - break; - case AVROIO_SINK_WITH_CLASS: - expectedFiles.add(new File(baseAndWindow + "-0000" + shard + "-of-00002.avro")); - break; - default: - throw new UnsupportedOperationException("Unknown write method " + method); - } - } - } - - List actualElements = new ArrayList<>(); - for (File outputFile : expectedFiles) { - assertTrue("Expected output file " + outputFile.getAbsolutePath(), outputFile.exists()); - try (DataFileReader reader = - new DataFileReader<>( - outputFile, - new ReflectDatumReader<>(ReflectData.get().getSchema(GenericClass.class)))) { - Iterators.addAll(actualElements, reader); - } - outputFile.delete(); - } - assertThat(actualElements, containsInAnyOrder(allElements.toArray())); - } - - private static final String SCHEMA_TEMPLATE_STRING = - "{\"namespace\": \"example.avro\",\n" - + " \"type\": \"record\",\n" - + " 
\"name\": \"$$TestTemplateSchema\",\n" - + " \"fields\": [\n" - + " {\"name\": \"$$full\", \"type\": \"string\"},\n" - + " {\"name\": \"$$suffix\", \"type\": [\"string\", \"null\"]}\n" - + " ]\n" - + "}"; - - private static String schemaFromPrefix(String prefix) { - return SCHEMA_TEMPLATE_STRING.replace("$$", prefix); - } - - private static GenericRecord createRecord(String record, String prefix, Schema schema) { - GenericRecord genericRecord = new GenericData.Record(schema); - genericRecord.put(prefix + "full", record); - genericRecord.put(prefix + "suffix", record.substring(1)); - return genericRecord; - } - - private static class TestDynamicDestinations - extends DynamicAvroDestinations { - final ResourceId baseDir; - final PCollectionView> schemaView; - - TestDynamicDestinations(ResourceId baseDir, PCollectionView> schemaView) { - this.baseDir = baseDir; - this.schemaView = schemaView; - } - - @Override - public Schema getSchema(String destination) { - // Return a per-destination schema. - String schema = sideInput(schemaView).get(destination); - return new Schema.Parser().parse(schema); - } - - @Override - public List> getSideInputs() { - return ImmutableList.of(schemaView); - } - - @Override - public GenericRecord formatRecord(String record) { - String prefix = record.substring(0, 1); - return createRecord(record, prefix, getSchema(prefix)); - } - - @Override - public String getDestination(String element) { - // Destination is based on first character of string. - return element.substring(0, 1); - } - - @Override - public String getDefaultDestination() { - return ""; - } - - @Override - public FilenamePolicy getFilenamePolicy(String destination) { - return DefaultFilenamePolicy.fromStandardParameters( - StaticValueProvider.of(baseDir.resolve("file_" + destination, RESOLVE_FILE)), - "-SSSSS-of-NNNNN", - ".avro", - false); - } - } - - /** - * Example of a {@link Coder} for a collection of Avro records with different schemas. - * - *
<p>
All the schemas are known at pipeline construction, and are keyed internally on the prefix - * character (lower byte only for UTF-8 data). - */ - private static class AvroMultiplexCoder extends Coder { - - /** Lookup table for the possible schemas, keyed on the prefix character. */ - private final Map> coderMap = Maps.newHashMap(); - - protected AvroMultiplexCoder(Map schemaMap) { - for (Map.Entry entry : schemaMap.entrySet()) { - coderMap.put( - entry.getKey().charAt(0), AvroCoder.of(new Schema.Parser().parse(entry.getValue()))); - } - } - - @Override - public void encode(GenericRecord value, OutputStream outStream) throws IOException { - char prefix = value.getSchema().getName().charAt(0); - outStream.write(prefix); // Only reads and writes the low byte. - coderMap.get(prefix).encode(value, outStream); - } - - @Override - public GenericRecord decode(InputStream inStream) throws CoderException, IOException { - char prefix = (char) inStream.read(); - return coderMap.get(prefix).decode(inStream); - } - - @Override - public List> getCoderArguments() { - return Collections.emptyList(); - } - - @Override - public void verifyDeterministic() throws NonDeterministicException { - for (AvroCoder internalCoder : coderMap.values()) { - internalCoder.verifyDeterministic(); - } - } - } - - private void testDynamicDestinationsUnwindowedWithSharding( - WriteMethod writeMethod, Sharding sharding) throws Exception { - final ResourceId baseDir = - FileSystems.matchNewResource( - Files.createTempDirectory(tmpFolder.getRoot().toPath(), "testDynamicDestinations") - .toString(), - true); - - List elements = Lists.newArrayList("aaaa", "aaab", "baaa", "baab", "caaa", "caab"); - Multimap expectedElements = ArrayListMultimap.create(); - Map schemaMap = Maps.newHashMap(); - for (String element : elements) { - String prefix = element.substring(0, 1); - String jsonSchema = schemaFromPrefix(prefix); - schemaMap.put(prefix, jsonSchema); - expectedElements.put( - prefix, createRecord(element, prefix, new Schema.Parser().parse(jsonSchema))); - } - final PCollectionView> schemaView = - writePipeline.apply("createSchemaView", Create.of(schemaMap)).apply(View.asMap()); - - PCollection input = - writePipeline.apply("createInput", Create.of(elements).withCoder(StringUtf8Coder.of())); - - switch (writeMethod) { - case AVROIO_WRITE: - { - AvroIO.TypedWrite write = - AvroIO.writeCustomTypeToGenericRecords() - .to(new TestDynamicDestinations(baseDir, schemaView)) - .withTempDirectory(baseDir); - - switch (sharding) { - case RUNNER_DETERMINED: - break; - case WITHOUT_SHARDING: - write = write.withoutSharding(); - break; - case FIXED_3_SHARDS: - write = write.withNumShards(3); - break; - default: - throw new IllegalArgumentException("Unknown sharding " + sharding); - } - - input.apply(write); - break; - } - - case AVROIO_SINK_WITH_SCHEMA: - { - FileIO.Write write = - FileIO.writeDynamic() - .by( - fn( - (element, c) -> { - c.sideInput(schemaView); // Ignore result - return element.getSchema().getName().substring(0, 1); - }, - requiresSideInputs(schemaView))) - .via( - fn( - (dest, c) -> { - Schema schema = - new Schema.Parser().parse(c.sideInput(schemaView).get(dest)); - return AvroIO.sink(schema); - }, - requiresSideInputs(schemaView))) - .to(baseDir.toString()) - .withNaming( - fn( - (dest, c) -> { - c.sideInput(schemaView); // Ignore result - return FileIO.Write.defaultNaming("file_" + dest, ".avro"); - }, - requiresSideInputs(schemaView))) - .withTempDirectory(baseDir.toString()) - .withDestinationCoder(StringUtf8Coder.of()) - 
.withIgnoreWindowing(); - switch (sharding) { - case RUNNER_DETERMINED: - break; - case WITHOUT_SHARDING: - write = write.withNumShards(1); - break; - case FIXED_3_SHARDS: - write = write.withNumShards(3); - break; - default: - throw new IllegalArgumentException("Unknown sharding " + sharding); - } - - MapElements toRecord = - MapElements.via( - new SimpleFunction() { - @Override - public GenericRecord apply(String element) { - String prefix = element.substring(0, 1); - GenericRecord record = - new GenericData.Record( - new Schema.Parser().parse(schemaFromPrefix(prefix))); - record.put(prefix + "full", element); - record.put(prefix + "suffix", element.substring(1)); - return record; - } - }); - - input.apply(toRecord).setCoder(new AvroMultiplexCoder(schemaMap)).apply(write); - break; - } - - case AVROIO_SINK_WITH_FORMATTER: - { - final AvroIO.RecordFormatter formatter = - (element, schema) -> { - String prefix = element.substring(0, 1); - GenericRecord record = new GenericData.Record(schema); - record.put(prefix + "full", element); - record.put(prefix + "suffix", element.substring(1)); - return record; - }; - FileIO.Write write = - FileIO.writeDynamic() - .by( - fn( - (element, c) -> { - c.sideInput(schemaView); // Ignore result - return element.substring(0, 1); - }, - requiresSideInputs(schemaView))) - .via( - fn( - (dest, c) -> { - Schema schema = - new Schema.Parser().parse(c.sideInput(schemaView).get(dest)); - return AvroIO.sinkViaGenericRecords(schema, formatter); - }, - requiresSideInputs(schemaView))) - .to(baseDir.toString()) - .withNaming( - fn( - (dest, c) -> { - c.sideInput(schemaView); // Ignore result - return FileIO.Write.defaultNaming("file_" + dest, ".avro"); - }, - requiresSideInputs(schemaView))) - .withTempDirectory(baseDir.toString()) - .withDestinationCoder(StringUtf8Coder.of()) - .withIgnoreWindowing(); - switch (sharding) { - case RUNNER_DETERMINED: - break; - case WITHOUT_SHARDING: - write = write.withNumShards(1); - break; - case FIXED_3_SHARDS: - write = write.withNumShards(3); - break; - default: - throw new IllegalArgumentException("Unknown sharding " + sharding); - } - - input.apply(write); - break; - } - default: - throw new UnsupportedOperationException("Unknown write method " + writeMethod); - } - - writePipeline.run(); - - // Validate that the data written matches the expected elements in the expected order. 
- - for (String prefix : expectedElements.keySet()) { - String shardPattern; - switch (sharding) { - case RUNNER_DETERMINED: - shardPattern = "-*"; - break; - case WITHOUT_SHARDING: - shardPattern = "-00000-of-00001"; - break; - case FIXED_3_SHARDS: - shardPattern = "-*-of-00003"; - break; - default: - throw new IllegalArgumentException("Unknown sharding " + sharding); - } - String expectedFilepattern = - baseDir.resolve("file_" + prefix + shardPattern + ".avro", RESOLVE_FILE).toString(); - - PCollection records = - readPipeline.apply( - "read_" + prefix, - AvroIO.readGenericRecords(schemaFromPrefix(prefix)) - .withBeamSchemas(withBeamSchemas) - .from(expectedFilepattern)); - PAssert.that(records).containsInAnyOrder(expectedElements.get(prefix)); - } - readPipeline.run(); - } - - @Test - @Category(NeedsRunner.class) - public void testDynamicDestinationsRunnerDeterminedSharding() throws Exception { - testDynamicDestinationsUnwindowedWithSharding( - WriteMethod.AVROIO_WRITE, Sharding.RUNNER_DETERMINED); - } - - @Test - @Category(NeedsRunner.class) - public void testDynamicDestinationsWithoutSharding() throws Exception { - testDynamicDestinationsUnwindowedWithSharding( - WriteMethod.AVROIO_WRITE, Sharding.WITHOUT_SHARDING); - } - - @Test - @Category(NeedsRunner.class) - public void testDynamicDestinationsWithNumShards() throws Exception { - testDynamicDestinationsUnwindowedWithSharding( - WriteMethod.AVROIO_WRITE, Sharding.FIXED_3_SHARDS); - } - - @Test - @Category(NeedsRunner.class) - public void testDynamicDestinationsViaSinkRunnerDeterminedSharding() throws Exception { - testDynamicDestinationsUnwindowedWithSharding( - WriteMethod.AVROIO_SINK_WITH_SCHEMA, Sharding.RUNNER_DETERMINED); - } - - @Test - @Category(NeedsRunner.class) - public void testDynamicDestinationsViaSinkWithoutSharding() throws Exception { - testDynamicDestinationsUnwindowedWithSharding( - WriteMethod.AVROIO_SINK_WITH_SCHEMA, Sharding.WITHOUT_SHARDING); - } - - @Test - @Category(NeedsRunner.class) - public void testDynamicDestinationsViaSinkWithNumShards() throws Exception { - testDynamicDestinationsUnwindowedWithSharding( - WriteMethod.AVROIO_SINK_WITH_SCHEMA, Sharding.FIXED_3_SHARDS); - } - - @Test - @Category(NeedsRunner.class) - public void testDynamicDestinationsViaSinkWithFormatterRunnerDeterminedSharding() - throws Exception { - testDynamicDestinationsUnwindowedWithSharding( - WriteMethod.AVROIO_SINK_WITH_FORMATTER, Sharding.RUNNER_DETERMINED); - } - - @Test - @Category(NeedsRunner.class) - public void testDynamicDestinationsViaSinkWithFormatterWithoutSharding() throws Exception { - testDynamicDestinationsUnwindowedWithSharding( - WriteMethod.AVROIO_SINK_WITH_FORMATTER, Sharding.WITHOUT_SHARDING); - } - - @Test - @Category(NeedsRunner.class) - public void testDynamicDestinationsViaSinkWithFormatterWithNumShards() throws Exception { - testDynamicDestinationsUnwindowedWithSharding( - WriteMethod.AVROIO_SINK_WITH_FORMATTER, Sharding.FIXED_3_SHARDS); - } - - @Test - @SuppressWarnings("unchecked") - @Category(NeedsRunner.class) - public void testMetadata() throws Exception { - List values = - ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar")); - File outputFile = tmpFolder.newFile("output.avro"); - - writePipeline - .apply(Create.of(values)) - .apply( - AvroIO.write(GenericClass.class) - .to(outputFile.getAbsolutePath()) - .withoutSharding() - .withMetadata( - ImmutableMap.of( - "stringKey", - "stringValue", - "longKey", - 100L, - "bytesKey", - "bytesValue".getBytes(Charsets.UTF_8)))); - 
writePipeline.run(); - - try (DataFileStream dataFileStream = - new DataFileStream(new FileInputStream(outputFile), new GenericDatumReader())) { - assertEquals("stringValue", dataFileStream.getMetaString("stringKey")); - assertEquals(100L, dataFileStream.getMetaLong("longKey")); - assertArrayEquals( - "bytesValue".getBytes(Charsets.UTF_8), dataFileStream.getMeta("bytesKey")); - } - } - - // using AvroCoder#createDatumReader for tests. - private void runTestWrite(String[] expectedElements, int numShards) throws IOException { - File baseOutputFile = new File(tmpFolder.getRoot(), "prefix"); - String outputFilePrefix = baseOutputFile.getAbsolutePath(); - - AvroIO.Write write = - AvroIO.write(String.class).to(outputFilePrefix).withSuffix(".avro"); - if (numShards > 1) { - write = write.withNumShards(numShards); - } else { - write = write.withoutSharding(); - } - writePipeline.apply(Create.of(ImmutableList.copyOf(expectedElements))).apply(write); - writePipeline.run(); - - String shardNameTemplate = - firstNonNull( - write.inner.getShardTemplate(), - DefaultFilenamePolicy.DEFAULT_UNWINDOWED_SHARD_TEMPLATE); - - assertTestOutputs(expectedElements, numShards, outputFilePrefix, shardNameTemplate); - } - - static void assertTestOutputs( - String[] expectedElements, int numShards, String outputFilePrefix, String shardNameTemplate) - throws IOException { - // Validate that the data written matches the expected elements in the expected order - List expectedFiles = new ArrayList<>(); - for (int i = 0; i < numShards; i++) { - expectedFiles.add( - new File( - DefaultFilenamePolicy.constructName( - FileBasedSink.convertToFileResourceIfPossible(outputFilePrefix), - shardNameTemplate, - ".avro", - i, - numShards, - null, - null) - .toString())); - } - - List actualElements = new ArrayList<>(); - for (File outputFile : expectedFiles) { - assertTrue("Expected output file " + outputFile.getName(), outputFile.exists()); - try (DataFileReader reader = - new DataFileReader<>( - outputFile, new ReflectDatumReader(ReflectData.get().getSchema(String.class)))) { - Iterators.addAll(actualElements, reader); - } - } - assertThat(actualElements, containsInAnyOrder(expectedElements)); - } - - @Test - @Category(NeedsRunner.class) - public void testAvroSinkWrite() throws Exception { - String[] expectedElements = new String[] {"first", "second", "third"}; - - runTestWrite(expectedElements, 1); - } - - @Test - @Category(NeedsRunner.class) - public void testAvroSinkShardedWrite() throws Exception { - String[] expectedElements = new String[] {"first", "second", "third", "fourth", "fifth"}; - - runTestWrite(expectedElements, 4); - } - - @Test - @Category(NeedsRunner.class) - public void testAvroSinkWriteWithCustomFactory() throws Exception { - Integer[] expectedElements = new Integer[] {1, 2, 3, 4, 5}; - - File baseOutputFile = new File(tmpFolder.getRoot(), "prefix"); - String outputFilePrefix = baseOutputFile.getAbsolutePath(); - - Schema recordSchema = SchemaBuilder.record("root").fields().requiredInt("i1").endRecord(); - - AvroIO.TypedWrite write = - AvroIO.writeCustomType() - .to(outputFilePrefix) - .withSchema(recordSchema) - .withFormatFunction(f -> f) - .withDatumWriterFactory( - f -> - new DatumWriter() { - private DatumWriter inner = new GenericDatumWriter<>(f); - - @Override - public void setSchema(Schema schema) { - inner.setSchema(schema); - } - - @Override - public void write(Integer datum, Encoder out) throws IOException { - GenericRecord record = - new GenericRecordBuilder(f).set("i1", datum).build(); - 
inner.write(record, out); - } - }) - .withSuffix(".avro"); - - write = write.withoutSharding(); - - writePipeline.apply(Create.of(ImmutableList.copyOf(expectedElements))).apply(write); - writePipeline.run(); - - File expectedFile = - new File( - DefaultFilenamePolicy.constructName( - FileBasedSink.convertToFileResourceIfPossible(outputFilePrefix), - "", - ".avro", - 1, - 1, - null, - null) - .toString()); - - assertTrue("Expected output file " + expectedFile.getName(), expectedFile.exists()); - DataFileReader dataFileReader = - new DataFileReader<>(expectedFile, new GenericDatumReader<>(recordSchema)); - - List actualRecords = new ArrayList<>(); - Iterators.addAll(actualRecords, dataFileReader); - - GenericRecord[] expectedRecords = - Arrays.stream(expectedElements) - .map(i -> new GenericRecordBuilder(recordSchema).set("i1", i).build()) - .toArray(GenericRecord[]::new); - - assertThat(actualRecords, containsInAnyOrder(expectedRecords)); - } - - // TODO: for Write only, test withSuffix, - // withShardNameTemplate and withoutSharding. - } -} diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/AvroSchemaIOProviderTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/AvroSchemaIOProviderTest.java deleted file mode 100644 index cf68633dc7ae3..0000000000000 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/AvroSchemaIOProviderTest.java +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.sdk.io; - -import static org.junit.Assert.assertEquals; - -import java.io.File; -import java.time.Duration; -import java.util.Arrays; -import java.util.List; -import org.apache.beam.sdk.coders.RowCoder; -import org.apache.beam.sdk.io.fs.MatchResult; -import org.apache.beam.sdk.schemas.Schema; -import org.apache.beam.sdk.schemas.io.SchemaIO; -import org.apache.beam.sdk.testing.NeedsRunner; -import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.testing.TestPipeline; -import org.apache.beam.sdk.testing.TestStream; -import org.apache.beam.sdk.transforms.Create; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.Row; -import org.apache.beam.sdk.values.TimestampedValue; -import org.joda.time.Instant; -import org.junit.Rule; -import org.junit.Test; -import org.junit.experimental.categories.Category; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** Test for AvroSchemaIOProvider. 
*/ -@RunWith(JUnit4.class) -public class AvroSchemaIOProviderTest { - @Rule public TestPipeline writePipeline = TestPipeline.create(); - @Rule public TestPipeline readPipeline = TestPipeline.create(); - @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); - - private static final Schema SCHEMA = - Schema.builder().addInt64Field("age").addStringField("age_str").build(); - - private Row createRow(long l) { - return Row.withSchema(SCHEMA).addValues(l, Long.valueOf(l).toString()).build(); - } - - @Test - @Category({NeedsRunner.class}) - public void testWriteAndReadTable() { - File destinationFile = new File(tempFolder.getRoot(), "person-info.avro"); - - AvroSchemaIOProvider provider = new AvroSchemaIOProvider(); - Row configuration = Row.withSchema(provider.configurationSchema()).addValue(null).build(); - SchemaIO io = provider.from(destinationFile.getAbsolutePath(), configuration, SCHEMA); - - List rowsList = Arrays.asList(createRow(1L), createRow(3L), createRow(4L)); - PCollection rows = - writePipeline.apply("Create", Create.of(rowsList).withCoder(RowCoder.of(SCHEMA))); - rows.apply(io.buildWriter()); - writePipeline.run(); - - PCollection read = readPipeline.begin().apply(io.buildReader()); - PAssert.that(read).containsInAnyOrder(rowsList); - readPipeline.run(); - } - - @Test - @Category({NeedsRunner.class}) - public void testStreamingWriteDefault() throws Exception { - File destinationFile = new File(tempFolder.getRoot(), "person-info"); - - AvroSchemaIOProvider provider = new AvroSchemaIOProvider(); - Row config = Row.withSchema(provider.configurationSchema()).addValue(null).build(); - SchemaIO writeIO = provider.from(destinationFile.getAbsolutePath(), config, SCHEMA); - - TestStream createEvents = - TestStream.create(RowCoder.of(SCHEMA)) - .addElements(TimestampedValue.of(createRow(1L), new Instant(1L))) - .addElements(TimestampedValue.of(createRow(2L), Instant.ofEpochSecond(120L))) - .advanceWatermarkToInfinity(); - - writePipeline.apply("create", createEvents).apply("write", writeIO.buildWriter()); - writePipeline.run(); - - // Verify we wrote two files. - String wildcardPath = destinationFile.getAbsolutePath() + "*"; - MatchResult result = FileSystems.match(wildcardPath); - assertEquals(2, result.metadata().size()); - - // Verify results of the files. - SchemaIO readIO = provider.from(wildcardPath, config, SCHEMA); - PCollection read = readPipeline.begin().apply("read", readIO.buildReader()); - PAssert.that(read).containsInAnyOrder(createRow(1L), createRow(2L)); - readPipeline.run(); - } - - @Test - @Category({NeedsRunner.class}) - public void testStreamingCustomWindowSize() throws Exception { - File destinationFile = new File(tempFolder.getRoot(), "person-info"); - - AvroSchemaIOProvider provider = new AvroSchemaIOProvider(); - Row config = - Row.withSchema(provider.configurationSchema()) - .addValue(Duration.ofMinutes(4).getSeconds()) - .build(); - SchemaIO writeIO = provider.from(destinationFile.getAbsolutePath(), config, SCHEMA); - - TestStream createEvents = - TestStream.create(RowCoder.of(SCHEMA)) - .addElements(TimestampedValue.of(createRow(1L), new Instant(1L))) - .addElements(TimestampedValue.of(createRow(2L), Instant.ofEpochSecond(120L))) - .advanceWatermarkToInfinity(); - - writePipeline.apply("create", createEvents).apply("write", writeIO.buildWriter()); - writePipeline.run(); - - // Verify we wrote one file. 
- String wildcardPath = destinationFile.getAbsolutePath() + "*"; - MatchResult result = FileSystems.match(wildcardPath); - assertEquals(1, result.metadata().size()); - - // Verify results of the files. - SchemaIO readIO = provider.from(wildcardPath, config, SCHEMA); - PCollection read = readPipeline.begin().apply("read", readIO.buildReader()); - PAssert.that(read).containsInAnyOrder(createRow(1L), createRow(2L)); - readPipeline.run(); - } - - @Test - @Category({NeedsRunner.class}) - public void testBatchCustomWindowSize() throws Exception { - File destinationFile = new File(tempFolder.getRoot(), "person-info"); - - AvroSchemaIOProvider provider = new AvroSchemaIOProvider(); - Row config = - Row.withSchema(provider.configurationSchema()) - .addValue(Duration.ofMinutes(4).getSeconds()) - .build(); - SchemaIO writeIO = provider.from(destinationFile.getAbsolutePath(), config, SCHEMA); - - List rowsList = Arrays.asList(createRow(1L), createRow(3L), createRow(4L)); - PCollection rows = - writePipeline.apply("Create", Create.of(rowsList).withCoder(RowCoder.of(SCHEMA))); - - rows.apply("write", writeIO.buildWriter()); - writePipeline.run(); - - // Verify we wrote one file. - String wildcardPath = destinationFile.getAbsolutePath() + "*"; - MatchResult result = FileSystems.match(wildcardPath); - assertEquals(1, result.metadata().size()); - - // Verify results of the files. - SchemaIO readIO = provider.from(wildcardPath, config, SCHEMA); - PCollection read = readPipeline.begin().apply("read", readIO.buildReader()); - PAssert.that(read).containsInAnyOrder(rowsList); - readPipeline.run(); - } -} diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/AvroSourceTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/AvroSourceTest.java deleted file mode 100644 index 577fdb19f0138..0000000000000 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/AvroSourceTest.java +++ /dev/null @@ -1,843 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.beam.sdk.io; - -import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem; -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.containsInAnyOrder; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertSame; -import static org.junit.Assert.assertTrue; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.NoSuchElementException; -import java.util.Objects; -import java.util.Random; -import java.util.stream.Collectors; -import org.apache.avro.Schema; -import org.apache.avro.file.CodecFactory; -import org.apache.avro.file.DataFileConstants; -import org.apache.avro.file.DataFileWriter; -import org.apache.avro.generic.GenericDatumReader; -import org.apache.avro.generic.GenericDatumWriter; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.io.DatumWriter; -import org.apache.avro.io.Decoder; -import org.apache.avro.reflect.AvroDefault; -import org.apache.avro.reflect.ReflectData; -import org.apache.avro.reflect.ReflectDatumWriter; -import org.apache.beam.sdk.coders.AvroCoder; -import org.apache.beam.sdk.coders.DefaultCoder; -import org.apache.beam.sdk.io.AvroSource.AvroMetadata; -import org.apache.beam.sdk.io.BlockBasedSource.BlockBasedReader; -import org.apache.beam.sdk.io.BoundedSource.BoundedReader; -import org.apache.beam.sdk.io.fs.MatchResult.Metadata; -import org.apache.beam.sdk.options.PipelineOptions; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.testing.SourceTestUtils; -import org.apache.beam.sdk.transforms.display.DisplayData; -import org.apache.beam.sdk.util.SerializableUtils; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects; -import org.checkerframework.checker.nullness.qual.Nullable; -import org.hamcrest.Matchers; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.ExpectedException; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** Tests for AvroSource. */ -@RunWith(JUnit4.class) -public class AvroSourceTest { - @Rule public TemporaryFolder tmpFolder = new TemporaryFolder(); - - @Rule public ExpectedException expectedException = ExpectedException.none(); - - private enum SyncBehavior { - SYNC_REGULAR, // Sync at regular, user defined intervals - SYNC_RANDOM, // Sync at random intervals - SYNC_DEFAULT // Sync at default intervals (i.e., no manual syncing). - } - - private static final int DEFAULT_RECORD_COUNT = 1000; - - /** - * Generates an input Avro file containing the given records in the temporary directory and - * returns the full path of the file. - */ - private String generateTestFile( - String filename, - List elems, - SyncBehavior syncBehavior, - int syncInterval, - AvroCoder coder, - String codec) - throws IOException { - Random random = new Random(0); - File tmpFile = tmpFolder.newFile(filename); - String path = tmpFile.toString(); - - FileOutputStream os = new FileOutputStream(tmpFile); - DatumWriter datumWriter = - coder.getType().equals(GenericRecord.class) - ? 
new GenericDatumWriter<>(coder.getSchema()) - : new ReflectDatumWriter<>(coder.getSchema()); - try (DataFileWriter writer = new DataFileWriter<>(datumWriter)) { - writer.setCodec(CodecFactory.fromString(codec)); - writer.create(coder.getSchema(), os); - - int recordIndex = 0; - int syncIndex = syncBehavior == SyncBehavior.SYNC_RANDOM ? random.nextInt(syncInterval) : 0; - - for (T elem : elems) { - writer.append(elem); - recordIndex++; - - switch (syncBehavior) { - case SYNC_REGULAR: - if (recordIndex == syncInterval) { - recordIndex = 0; - writer.sync(); - } - break; - case SYNC_RANDOM: - if (recordIndex == syncIndex) { - recordIndex = 0; - writer.sync(); - syncIndex = random.nextInt(syncInterval); - } - break; - case SYNC_DEFAULT: - default: - } - } - } - return path; - } - - @Test - public void testReadWithDifferentCodecs() throws Exception { - // Test reading files generated using all codecs. - String[] codecs = { - DataFileConstants.NULL_CODEC, - DataFileConstants.BZIP2_CODEC, - DataFileConstants.DEFLATE_CODEC, - DataFileConstants.SNAPPY_CODEC, - DataFileConstants.XZ_CODEC, - }; - // As Avro's default block size is 64KB, write 64K records to ensure at least one full block. - // We could make this smaller than 64KB assuming each record is at least B bytes, but then the - // test could silently stop testing the failure condition from BEAM-422. - List expected = createRandomRecords(1 << 16); - - for (String codec : codecs) { - String filename = - generateTestFile( - codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec); - AvroSource source = AvroSource.from(filename).withSchema(Bird.class); - List actual = SourceTestUtils.readFromSource(source, null); - assertThat(expected, containsInAnyOrder(actual.toArray())); - } - } - - @Test - public void testSplitAtFraction() throws Exception { - // A reduced dataset is enough here. - List expected = createFixedRecords(DEFAULT_RECORD_COUNT); - // Create an AvroSource where each block is 1/10th of the total set of records. - String filename = - generateTestFile( - "tmp.avro", - expected, - SyncBehavior.SYNC_REGULAR, - DEFAULT_RECORD_COUNT / 10 /* max records per block */, - AvroCoder.of(FixedRecord.class), - DataFileConstants.NULL_CODEC); - File file = new File(filename); - - AvroSource source = AvroSource.from(filename).withSchema(FixedRecord.class); - List> splits = source.split(file.length() / 3, null); - for (BoundedSource subSource : splits) { - int items = SourceTestUtils.readFromSource(subSource, null).size(); - // Shouldn't split while unstarted. 
- SourceTestUtils.assertSplitAtFractionFails(subSource, 0, 0.0, null); - SourceTestUtils.assertSplitAtFractionFails(subSource, 0, 0.7, null); - SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, 1, 0.7, null); - SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent( - subSource, DEFAULT_RECORD_COUNT / 100, 0.7, null); - SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent( - subSource, DEFAULT_RECORD_COUNT / 10, 0.1, null); - SourceTestUtils.assertSplitAtFractionFails( - subSource, DEFAULT_RECORD_COUNT / 10 + 1, 0.1, null); - SourceTestUtils.assertSplitAtFractionFails(subSource, DEFAULT_RECORD_COUNT / 3, 0.3, null); - SourceTestUtils.assertSplitAtFractionFails(subSource, items, 0.9, null); - SourceTestUtils.assertSplitAtFractionFails(subSource, items, 1.0, null); - SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, items, 0.999, null); - } - } - - @Test - public void testGetProgressFromUnstartedReader() throws Exception { - List records = createFixedRecords(DEFAULT_RECORD_COUNT); - String filename = - generateTestFile( - "tmp.avro", - records, - SyncBehavior.SYNC_DEFAULT, - 1000, - AvroCoder.of(FixedRecord.class), - DataFileConstants.NULL_CODEC); - File file = new File(filename); - - AvroSource source = AvroSource.from(filename).withSchema(FixedRecord.class); - try (BoundedSource.BoundedReader reader = source.createReader(null)) { - assertEquals(Double.valueOf(0.0), reader.getFractionConsumed()); - } - - List> splits = source.split(file.length() / 3, null); - for (BoundedSource subSource : splits) { - try (BoundedSource.BoundedReader reader = subSource.createReader(null)) { - assertEquals(Double.valueOf(0.0), reader.getFractionConsumed()); - } - } - } - - @Test - public void testProgress() throws Exception { - // 5 records, 2 per block. - List records = createFixedRecords(5); - String filename = - generateTestFile( - "tmp.avro", - records, - SyncBehavior.SYNC_REGULAR, - 2, - AvroCoder.of(FixedRecord.class), - DataFileConstants.NULL_CODEC); - - AvroSource source = AvroSource.from(filename).withSchema(FixedRecord.class); - try (BoundedSource.BoundedReader readerOrig = source.createReader(null)) { - assertThat(readerOrig, Matchers.instanceOf(BlockBasedReader.class)); - BlockBasedReader reader = (BlockBasedReader) readerOrig; - - // Before starting - assertEquals(0.0, reader.getFractionConsumed(), 1e-6); - assertEquals(0, reader.getSplitPointsConsumed()); - assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining()); - - // First 2 records are in the same block. - assertTrue(reader.start()); - assertTrue(reader.isAtSplitPoint()); - assertEquals(0, reader.getSplitPointsConsumed()); - assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining()); - // continued - assertTrue(reader.advance()); - assertFalse(reader.isAtSplitPoint()); - assertEquals(0, reader.getSplitPointsConsumed()); - assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining()); - - // Second block -> parallelism consumed becomes 1. 
- assertTrue(reader.advance()); - assertTrue(reader.isAtSplitPoint()); - assertEquals(1, reader.getSplitPointsConsumed()); - assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining()); - // continued - assertTrue(reader.advance()); - assertFalse(reader.isAtSplitPoint()); - assertEquals(1, reader.getSplitPointsConsumed()); - assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining()); - - // Third and final block -> parallelism consumed becomes 2, remaining becomes 1. - assertTrue(reader.advance()); - assertTrue(reader.isAtSplitPoint()); - assertEquals(2, reader.getSplitPointsConsumed()); - assertEquals(1, reader.getSplitPointsRemaining()); - - // Done - assertFalse(reader.advance()); - assertEquals(3, reader.getSplitPointsConsumed()); - assertEquals(0, reader.getSplitPointsRemaining()); - assertEquals(1.0, reader.getFractionConsumed(), 1e-6); - } - } - - @Test - public void testProgressEmptySource() throws Exception { - // 0 records, 20 per block. - List records = Collections.emptyList(); - String filename = - generateTestFile( - "tmp.avro", - records, - SyncBehavior.SYNC_REGULAR, - 2, - AvroCoder.of(FixedRecord.class), - DataFileConstants.NULL_CODEC); - - AvroSource source = AvroSource.from(filename).withSchema(FixedRecord.class); - try (BoundedSource.BoundedReader readerOrig = source.createReader(null)) { - assertThat(readerOrig, Matchers.instanceOf(BlockBasedReader.class)); - BlockBasedReader reader = (BlockBasedReader) readerOrig; - - // before starting - assertEquals(0.0, reader.getFractionConsumed(), 1e-6); - assertEquals(0, reader.getSplitPointsConsumed()); - assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining()); - - // confirm empty - assertFalse(reader.start()); - - // after reading empty source - assertEquals(0, reader.getSplitPointsConsumed()); - assertEquals(0, reader.getSplitPointsRemaining()); - assertEquals(1.0, reader.getFractionConsumed(), 1e-6); - } - } - - @Test - public void testGetCurrentFromUnstartedReader() throws Exception { - List records = createFixedRecords(DEFAULT_RECORD_COUNT); - String filename = - generateTestFile( - "tmp.avro", - records, - SyncBehavior.SYNC_DEFAULT, - 1000, - AvroCoder.of(FixedRecord.class), - DataFileConstants.NULL_CODEC); - - AvroSource source = AvroSource.from(filename).withSchema(FixedRecord.class); - try (BlockBasedSource.BlockBasedReader reader = - (BlockBasedSource.BlockBasedReader) source.createReader(null)) { - assertEquals(null, reader.getCurrentBlock()); - - expectedException.expect(NoSuchElementException.class); - expectedException.expectMessage("No block has been successfully read from"); - reader.getCurrent(); - } - } - - @Test - public void testSplitAtFractionExhaustive() throws Exception { - // A small-sized input is sufficient, because the test verifies that splitting is non-vacuous. - List expected = createFixedRecords(20); - String filename = - generateTestFile( - "tmp.avro", - expected, - SyncBehavior.SYNC_REGULAR, - 5, - AvroCoder.of(FixedRecord.class), - DataFileConstants.NULL_CODEC); - - AvroSource source = AvroSource.from(filename).withSchema(FixedRecord.class); - SourceTestUtils.assertSplitAtFractionExhaustive(source, null); - } - - @Test - public void testSplitsWithSmallBlocks() throws Exception { - PipelineOptions options = PipelineOptionsFactory.create(); - // Test reading from an object file with many small random-sized blocks. - // The file itself doesn't have to be big; we can use a decreased record count. 
- List expected = createRandomRecords(DEFAULT_RECORD_COUNT); - String filename = - generateTestFile( - "tmp.avro", - expected, - SyncBehavior.SYNC_RANDOM, - DEFAULT_RECORD_COUNT / 20 /* max records/block */, - AvroCoder.of(Bird.class), - DataFileConstants.NULL_CODEC); - File file = new File(filename); - - // Small minimum bundle size - AvroSource source = - AvroSource.from(filename).withSchema(Bird.class).withMinBundleSize(100L); - - // Assert that the source produces the expected records - assertEquals(expected, SourceTestUtils.readFromSource(source, options)); - - List> splits; - int nonEmptySplits; - - // Split with the minimum bundle size - splits = source.split(100L, options); - assertTrue(splits.size() > 2); - SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); - nonEmptySplits = 0; - for (BoundedSource subSource : splits) { - if (SourceTestUtils.readFromSource(subSource, options).size() > 0) { - nonEmptySplits += 1; - } - } - assertTrue(nonEmptySplits > 2); - - // Split with larger bundle size - splits = source.split(file.length() / 4, options); - assertTrue(splits.size() > 2); - SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); - nonEmptySplits = 0; - for (BoundedSource subSource : splits) { - if (SourceTestUtils.readFromSource(subSource, options).size() > 0) { - nonEmptySplits += 1; - } - } - assertTrue(nonEmptySplits > 2); - - // Split with the file length - splits = source.split(file.length(), options); - assertTrue(splits.size() == 1); - SourceTestUtils.assertSourcesEqualReferenceSource(source, splits, options); - } - - @Test - public void testMultipleFiles() throws Exception { - String baseName = "tmp-"; - List expected = new ArrayList<>(); - for (int i = 0; i < 10; i++) { - List contents = createRandomRecords(DEFAULT_RECORD_COUNT / 10); - expected.addAll(contents); - generateTestFile( - baseName + i, - contents, - SyncBehavior.SYNC_DEFAULT, - 0, - AvroCoder.of(Bird.class), - DataFileConstants.NULL_CODEC); - } - - AvroSource source = - AvroSource.from(new File(tmpFolder.getRoot().toString(), baseName + "*").toString()) - .withSchema(Bird.class); - List actual = SourceTestUtils.readFromSource(source, null); - assertThat(actual, containsInAnyOrder(expected.toArray())); - } - - @Test - public void testCreationWithSchema() throws Exception { - List expected = createRandomRecords(100); - String filename = - generateTestFile( - "tmp.avro", - expected, - SyncBehavior.SYNC_DEFAULT, - 0, - AvroCoder.of(Bird.class), - DataFileConstants.NULL_CODEC); - - // Create a source with a schema object - Schema schema = ReflectData.get().getSchema(Bird.class); - AvroSource source = AvroSource.from(filename).withSchema(schema); - List records = SourceTestUtils.readFromSource(source, null); - assertEqualsWithGeneric(expected, records); - - // Create a source with a JSON schema - String schemaString = ReflectData.get().getSchema(Bird.class).toString(); - source = AvroSource.from(filename).withSchema(schemaString); - records = SourceTestUtils.readFromSource(source, null); - assertEqualsWithGeneric(expected, records); - } - - @Test - public void testSchemaUpdate() throws Exception { - List birds = createRandomRecords(100); - String filename = - generateTestFile( - "tmp.avro", - birds, - SyncBehavior.SYNC_DEFAULT, - 0, - AvroCoder.of(Bird.class), - DataFileConstants.NULL_CODEC); - - AvroSource source = AvroSource.from(filename).withSchema(FancyBird.class); - List actual = SourceTestUtils.readFromSource(source, null); - - List expected = new 
ArrayList<>(); - for (Bird bird : birds) { - expected.add( - new FancyBird( - bird.number, bird.species, bird.quality, bird.quantity, null, "MAXIMUM OVERDRIVE")); - } - - assertThat(actual, containsInAnyOrder(expected.toArray())); - } - - @Test - public void testSchemaStringIsInterned() throws Exception { - List birds = createRandomRecords(100); - String filename = - generateTestFile( - "tmp.avro", - birds, - SyncBehavior.SYNC_DEFAULT, - 0, - AvroCoder.of(Bird.class), - DataFileConstants.NULL_CODEC); - Metadata fileMetadata = FileSystems.matchSingleFileSpec(filename); - String schema = AvroSource.readMetadataFromFile(fileMetadata.resourceId()).getSchemaString(); - // Add "" to the schema to make sure it is not interned. - AvroSource sourceA = AvroSource.from(filename).withSchema("" + schema); - AvroSource sourceB = AvroSource.from(filename).withSchema("" + schema); - assertSame(sourceA.getReaderSchemaString(), sourceB.getReaderSchemaString()); - - // Ensure that deserialization still goes through interning - AvroSource sourceC = SerializableUtils.clone(sourceB); - assertSame(sourceA.getReaderSchemaString(), sourceC.getReaderSchemaString()); - } - - @Test - public void testParseFn() throws Exception { - List expected = createRandomRecords(100); - String filename = - generateTestFile( - "tmp.avro", - expected, - SyncBehavior.SYNC_DEFAULT, - 0, - AvroCoder.of(Bird.class), - DataFileConstants.NULL_CODEC); - - AvroSource source = - AvroSource.from(filename) - .withParseFn( - input -> - new Bird( - (long) input.get("number"), - input.get("species").toString(), - input.get("quality").toString(), - (long) input.get("quantity")), - AvroCoder.of(Bird.class)); - List actual = SourceTestUtils.readFromSource(source, null); - assertThat(actual, containsInAnyOrder(expected.toArray())); - } - - @Test - public void testDatumReaderFactoryWithGenericRecord() throws Exception { - List inputBirds = createRandomRecords(100); - - String filename = - generateTestFile( - "tmp.avro", - inputBirds, - SyncBehavior.SYNC_DEFAULT, - 0, - AvroCoder.of(Bird.class), - DataFileConstants.NULL_CODEC); - - AvroSource.DatumReaderFactory factory = - (writer, reader) -> - new GenericDatumReader(writer, reader) { - @Override - protected Object readString(Object old, Decoder in) throws IOException { - return super.readString(old, in) + "_custom"; - } - }; - - AvroSource source = - AvroSource.from(filename) - .withParseFn( - input -> - new Bird( - (long) input.get("number"), - input.get("species").toString(), - input.get("quality").toString(), - (long) input.get("quantity")), - AvroCoder.of(Bird.class)) - .withDatumReaderFactory(factory); - List actual = SourceTestUtils.readFromSource(source, null); - List expected = - inputBirds.stream() - .map(b -> new Bird(b.number, b.species + "_custom", b.quality + "_custom", b.quantity)) - .collect(Collectors.toList()); - - assertThat(actual, containsInAnyOrder(expected.toArray())); - } - - private void assertEqualsWithGeneric(List expected, List actual) { - assertEquals(expected.size(), actual.size()); - for (int i = 0; i < expected.size(); i++) { - Bird fixed = expected.get(i); - GenericRecord generic = actual.get(i); - assertEquals(fixed.number, generic.get("number")); - assertEquals(fixed.quality, generic.get("quality").toString()); // From Avro util.Utf8 - assertEquals(fixed.quantity, generic.get("quantity")); - assertEquals(fixed.species, generic.get("species").toString()); - } - } - - @Test - public void testDisplayData() { - AvroSource source = - 
AvroSource.from("foobar.txt").withSchema(Bird.class).withMinBundleSize(1234); - - DisplayData displayData = DisplayData.from(source); - assertThat(displayData, hasDisplayItem("filePattern", "foobar.txt")); - assertThat(displayData, hasDisplayItem("minBundleSize", 1234)); - } - - @Test - public void testReadMetadataWithCodecs() throws Exception { - // Test reading files generated using all codecs. - String[] codecs = { - DataFileConstants.NULL_CODEC, - DataFileConstants.BZIP2_CODEC, - DataFileConstants.DEFLATE_CODEC, - DataFileConstants.SNAPPY_CODEC, - DataFileConstants.XZ_CODEC - }; - List expected = createRandomRecords(DEFAULT_RECORD_COUNT); - - for (String codec : codecs) { - String filename = - generateTestFile( - codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec); - - Metadata fileMeta = FileSystems.matchSingleFileSpec(filename); - AvroMetadata metadata = AvroSource.readMetadataFromFile(fileMeta.resourceId()); - assertEquals(codec, metadata.getCodec()); - } - } - - @Test - public void testReadSchemaString() throws Exception { - List expected = createRandomRecords(DEFAULT_RECORD_COUNT); - String codec = DataFileConstants.NULL_CODEC; - String filename = - generateTestFile( - codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec); - Metadata fileMeta = FileSystems.matchSingleFileSpec(filename); - AvroMetadata metadata = AvroSource.readMetadataFromFile(fileMeta.resourceId()); - // By default, parse validates the schema, which is what we want. - Schema schema = new Schema.Parser().parse(metadata.getSchemaString()); - assertEquals(4, schema.getFields().size()); - } - - @Test - public void testCreateFromMetadata() throws Exception { - List expected = createRandomRecords(DEFAULT_RECORD_COUNT); - String codec = DataFileConstants.NULL_CODEC; - String filename = - generateTestFile( - codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec); - Metadata fileMeta = FileSystems.matchSingleFileSpec(filename); - - AvroSource source = AvroSource.from(fileMeta); - AvroSource sourceWithSchema = source.withSchema(Bird.class); - AvroSource sourceWithSchemaWithMinBundleSize = sourceWithSchema.withMinBundleSize(1234); - - assertEquals(FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE, source.getMode()); - assertEquals(FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE, sourceWithSchema.getMode()); - assertEquals( - FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE, sourceWithSchemaWithMinBundleSize.getMode()); - } - - /** - * Class that will encode to a fixed size: 16 bytes. - * - *
<p>
Each object has a 15-byte array. Avro encodes an object of this type as a byte array, so - * each encoded object will consist of 1 byte that encodes the length of the array, followed by 15 - * bytes. - */ - @DefaultCoder(AvroCoder.class) - public static class FixedRecord { - private byte[] value = new byte[15]; - - public FixedRecord() { - this(0); - } - - public FixedRecord(int i) { - value[0] = (byte) i; - value[1] = (byte) (i >> 8); - value[2] = (byte) (i >> 16); - value[3] = (byte) (i >> 24); - } - - public int asInt() { - return value[0] | (value[1] << 8) | (value[2] << 16) | (value[3] << 24); - } - - @Override - public boolean equals(@Nullable Object o) { - if (o instanceof FixedRecord) { - FixedRecord other = (FixedRecord) o; - return this.asInt() == other.asInt(); - } - return false; - } - - @Override - public int hashCode() { - return toString().hashCode(); - } - - @Override - public String toString() { - return Integer.toString(this.asInt()); - } - } - - /** Create a list of count 16-byte records. */ - private static List createFixedRecords(int count) { - List records = new ArrayList<>(); - for (int i = 0; i < count; i++) { - records.add(new FixedRecord(i)); - } - return records; - } - - /** Class used as the record type in tests. */ - @DefaultCoder(AvroCoder.class) - static class Bird { - long number; - String species; - String quality; - long quantity; - - public Bird() {} - - public Bird(long number, String species, String quality, long quantity) { - this.number = number; - this.species = species; - this.quality = quality; - this.quantity = quantity; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(Bird.class) - .addValue(number) - .addValue(species) - .addValue(quantity) - .addValue(quality) - .toString(); - } - - @Override - public boolean equals(@Nullable Object obj) { - if (obj instanceof Bird) { - Bird other = (Bird) obj; - return Objects.equals(species, other.species) - && Objects.equals(quality, other.quality) - && quantity == other.quantity - && number == other.number; - } - return false; - } - - @Override - public int hashCode() { - return Objects.hash(number, species, quality, quantity); - } - } - - /** - * Class used as the record type in tests. - * - *
<p>
Contains nullable fields and fields with default values. Can be read using a file written - * with the Bird schema. - */ - @DefaultCoder(AvroCoder.class) - public static class FancyBird { - long number; - String species; - String quality; - long quantity; - - @org.apache.avro.reflect.Nullable String habitat; - - @AvroDefault("\"MAXIMUM OVERDRIVE\"") - String fancinessLevel; - - public FancyBird() {} - - public FancyBird( - long number, - String species, - String quality, - long quantity, - String habitat, - String fancinessLevel) { - this.number = number; - this.species = species; - this.quality = quality; - this.quantity = quantity; - this.habitat = habitat; - this.fancinessLevel = fancinessLevel; - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(FancyBird.class) - .addValue(number) - .addValue(species) - .addValue(quality) - .addValue(quantity) - .addValue(habitat) - .addValue(fancinessLevel) - .toString(); - } - - @Override - public boolean equals(@Nullable Object obj) { - if (obj instanceof FancyBird) { - FancyBird other = (FancyBird) obj; - return Objects.equals(species, other.species) - && Objects.equals(quality, other.quality) - && quantity == other.quantity - && number == other.number - && Objects.equals(fancinessLevel, other.fancinessLevel) - && Objects.equals(habitat, other.habitat); - } - return false; - } - - @Override - public int hashCode() { - return Objects.hash(number, species, quality, quantity, habitat, fancinessLevel); - } - } - - /** Create a list of n random records. */ - private static List createRandomRecords(long n) { - String[] qualities = { - "miserable", "forelorn", "fidgity", "squirrelly", "fanciful", "chipper", "lazy" - }; - String[] species = {"pigeons", "owls", "gulls", "hawks", "robins", "jays"}; - Random random = new Random(0); - - List records = new ArrayList<>(); - for (long i = 0; i < n; i++) { - Bird bird = new Bird(); - bird.quality = qualities[random.nextInt(qualities.length)]; - bird.species = species[random.nextInt(species.length)]; - bird.number = i; - bird.quantity = random.nextLong(); - records.add(bird); - } - return records; - } -} diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/ReadTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/ReadTest.java index 04d9b8f12f7c4..aa528c4f08f4f 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/ReadTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/ReadTest.java @@ -38,7 +38,6 @@ import java.util.function.Consumer; import java.util.stream.Collectors; import java.util.stream.LongStream; -import org.apache.beam.sdk.coders.AvroCoder; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.CoderException; import org.apache.beam.sdk.coders.CustomCoder; @@ -343,7 +342,7 @@ public Coder getOutputCoder() { @Override public Coder getCheckpointMarkCoder() { - return AvroCoder.of(CountingSource.CounterMark.class); + return new CountingSource.CounterMarkCoder(); } } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/SerializableAvroCodecFactoryTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/SerializableAvroCodecFactoryTest.java deleted file mode 100644 index 4383a16dd28ca..0000000000000 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/SerializableAvroCodecFactoryTest.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.sdk.io; - -import static org.apache.avro.file.DataFileConstants.BZIP2_CODEC; -import static org.apache.avro.file.DataFileConstants.DEFLATE_CODEC; -import static org.apache.avro.file.DataFileConstants.NULL_CODEC; -import static org.apache.avro.file.DataFileConstants.SNAPPY_CODEC; -import static org.apache.avro.file.DataFileConstants.XZ_CODEC; -import static org.junit.Assert.assertEquals; - -import java.util.Arrays; -import java.util.List; -import org.apache.avro.file.CodecFactory; -import org.apache.beam.sdk.util.SerializableUtils; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** Tests of SerializableAvroCodecFactory. */ -@RunWith(JUnit4.class) -public class SerializableAvroCodecFactoryTest { - private final List avroCodecs = - Arrays.asList(NULL_CODEC, SNAPPY_CODEC, DEFLATE_CODEC, XZ_CODEC, BZIP2_CODEC); - - @Test - public void testDefaultCodecsIn() throws Exception { - for (String codec : avroCodecs) { - SerializableAvroCodecFactory codecFactory = - new SerializableAvroCodecFactory(CodecFactory.fromString(codec)); - - assertEquals(CodecFactory.fromString(codec).toString(), codecFactory.getCodec().toString()); - } - } - - @Test - public void testDefaultCodecsSerDe() throws Exception { - for (String codec : avroCodecs) { - SerializableAvroCodecFactory codecFactory = - new SerializableAvroCodecFactory(CodecFactory.fromString(codec)); - - SerializableAvroCodecFactory serdeC = SerializableUtils.clone(codecFactory); - - assertEquals(CodecFactory.fromString(codec).toString(), serdeC.getCodec().toString()); - } - } - - @Test - public void testDeflateCodecSerDeWithLevels() throws Exception { - for (int i = 0; i < 10; ++i) { - SerializableAvroCodecFactory codecFactory = - new SerializableAvroCodecFactory(CodecFactory.deflateCodec(i)); - - SerializableAvroCodecFactory serdeC = SerializableUtils.clone(codecFactory); - - assertEquals(CodecFactory.deflateCodec(i).toString(), serdeC.getCodec().toString()); - } - } - - @Test - public void testXZCodecSerDeWithLevels() throws Exception { - for (int i = 0; i < 10; ++i) { - SerializableAvroCodecFactory codecFactory = - new SerializableAvroCodecFactory(CodecFactory.xzCodec(i)); - - SerializableAvroCodecFactory serdeC = SerializableUtils.clone(codecFactory); - - assertEquals(CodecFactory.xzCodec(i).toString(), serdeC.getCodec().toString()); - } - } - - @Test(expected = NullPointerException.class) - public void testNullCodecToString() throws Exception { - // use default CTR (available cause Serializable) - SerializableAvroCodecFactory codec = new SerializableAvroCodecFactory(); - assertEquals("null", codec.toString()); - } -} diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOReadTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOReadTest.java index 
379345b1001e6..84c05ee6c9062 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOReadTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOReadTest.java @@ -245,15 +245,24 @@ private static File createZipFile( } private static TextSource prepareSource( - TemporaryFolder temporaryFolder, byte[] data, @Nullable byte[] delimiter) throws IOException { + TemporaryFolder temporaryFolder, byte[] data, @Nullable byte[] delimiter, int skipHeaderLines) + throws IOException { Path path = temporaryFolder.newFile().toPath(); Files.write(path, data); - return getTextSource(path.toString(), delimiter); + return getTextSource(path.toString(), delimiter, skipHeaderLines); } - public static TextSource getTextSource(String path, @Nullable byte[] delimiter) { + public static TextSource getTextSource( + String path, @Nullable byte[] delimiter, int skipHeaderLines) { return new TextSource( - ValueProvider.StaticValueProvider.of(path), EmptyMatchTreatment.DISALLOW, delimiter); + ValueProvider.StaticValueProvider.of(path), + EmptyMatchTreatment.DISALLOW, + delimiter, + skipHeaderLines); + } + + public static TextSource getTextSource(String path, @Nullable byte[] delimiter) { + return getTextSource(path, delimiter, 0); } private static String getFileSuffix(Compression compression) { @@ -384,7 +393,7 @@ public void testReadLinesWithDefaultDelimiterAndZeroAndOneLengthReturningChannel Files.write(path, line.getBytes(UTF_8)); Metadata metadata = FileSystems.matchSingleFileSpec(path.toString()); FileBasedSource source = - getTextSource(path.toString(), null) + getTextSource(path.toString(), null, 0) .createForSubrangeOfFile(metadata, 0, metadata.sizeBytes()); FileBasedReader reader = source.createSingleFileReader(PipelineOptionsFactory.create()); @@ -433,7 +442,49 @@ public void testSplittingSource() throws Exception { } private TextSource prepareSource(byte[] data) throws IOException { - return TextIOReadTest.prepareSource(tempFolder, data, null); + return TextIOReadTest.prepareSource(tempFolder, data, null, 0); + } + + private void runTestReadWithData(byte[] data, List expectedResults) throws Exception { + TextSource source = prepareSource(data); + List actual = SourceTestUtils.readFromSource(source, PipelineOptionsFactory.create()); + assertThat( + actual, containsInAnyOrder(new ArrayList<>(expectedResults).toArray(new String[0]))); + } + } + + /** Tests for reading files with/without header. 
*/ + @RunWith(Parameterized.class) + public static class SkippingHeaderTest { + private static final ImmutableList EXPECTED = ImmutableList.of("asdf", "hjkl", "xyz"); + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); + + @Parameterized.Parameters(name = "{index}: {0}") + public static Iterable data() { + return ImmutableList.builder() + .add(new Object[] {"\n\n\n", ImmutableList.of("", ""), 1}) + .add(new Object[] {"\n", ImmutableList.of(), 1}) + .add(new Object[] {"header\nasdf\nhjkl\nxyz\n", EXPECTED, 1}) + .add(new Object[] {"header1\nheader2\nasdf\nhjkl\nxyz\n", EXPECTED, 2}) + .build(); + } + + @Parameterized.Parameter(0) + public String line; + + @Parameterized.Parameter(1) + public ImmutableList expected; + + @Parameterized.Parameter(2) + public int skipHeaderLines; + + @Test + public void testReadLines() throws Exception { + runTestReadWithData(line.getBytes(UTF_8), expected); + } + + private TextSource prepareSource(byte[] data) throws IOException { + return TextIOReadTest.prepareSource(tempFolder, data, null, skipHeaderLines); } private void runTestReadWithData(byte[] data, List expectedResults) throws Exception { @@ -477,7 +528,8 @@ public static Iterable data() { @Test public void testReadLinesWithCustomDelimiter() throws Exception { SourceTestUtils.assertSplitAtFractionExhaustive( - TextIOReadTest.prepareSource(tempFolder, testCase.getBytes(UTF_8), new byte[] {'|', '*'}), + TextIOReadTest.prepareSource( + tempFolder, testCase.getBytes(UTF_8), new byte[] {'|', '*'}, 0), PipelineOptionsFactory.create()); } @@ -489,7 +541,7 @@ public void testReadLinesWithCustomDelimiterAndZeroAndOneLengthReturningChannel( Files.write(path, testCase.getBytes(UTF_8)); Metadata metadata = FileSystems.matchSingleFileSpec(path.toString()); FileBasedSource source = - getTextSource(path.toString(), delimiter) + getTextSource(path.toString(), delimiter, 0) .createForSubrangeOfFile(metadata, 0, metadata.sizeBytes()); FileBasedReader reader = source.createSingleFileReader(PipelineOptionsFactory.create()); @@ -743,7 +795,7 @@ public void testTextIOGetName() { } private TextSource prepareSource(byte[] data) throws IOException { - return TextIOReadTest.prepareSource(tempFolder, data, null); + return TextIOReadTest.prepareSource(tempFolder, data, null, 0); } @Test @@ -977,7 +1029,8 @@ public void testReadFilesWithFilename() throws IOException { new TextSource( ValueProvider.StaticValueProvider.of(input), EmptyMatchTreatment.DISALLOW, - new byte[] {'\n'}); + new byte[] {'\n'}, + 0); PCollection> lines = p.apply( @@ -1102,7 +1155,7 @@ public void processElement(ProcessContext c) { ValueProvider.StaticValueProvider.of(file.getMetadata().resourceId().getFilename()); // Create a TextSource, passing null as the delimiter to use the default // delimiters ('\n', '\r', or '\r\n'). - TextSource textSource = new TextSource(filenameProvider, null, null); + TextSource textSource = new TextSource(filenameProvider, null, null, 0); try { BoundedSource.BoundedReader reader = textSource diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/AvroSchemaTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/AvroSchemaTest.java deleted file mode 100644 index c32dbd9944a1d..0000000000000 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/AvroSchemaTest.java +++ /dev/null @@ -1,496 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.sdk.schemas; - -import static org.junit.Assert.assertEquals; - -import java.nio.ByteBuffer; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.GenericRecordBuilder; -import org.apache.avro.reflect.AvroIgnore; -import org.apache.avro.reflect.AvroName; -import org.apache.avro.reflect.AvroSchema; -import org.apache.avro.util.Utf8; -import org.apache.beam.sdk.schemas.Schema.FieldType; -import org.apache.beam.sdk.schemas.logicaltypes.EnumerationType; -import org.apache.beam.sdk.schemas.logicaltypes.FixedBytes; -import org.apache.beam.sdk.schemas.transforms.Group; -import org.apache.beam.sdk.schemas.utils.AvroUtils; -import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.testing.TestPipeline; -import org.apache.beam.sdk.testing.ValidatesRunner; -import org.apache.beam.sdk.transforms.Create; -import org.apache.beam.sdk.transforms.SerializableFunction; -import org.apache.beam.sdk.util.SerializableUtils; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.Row; -import org.apache.beam.sdk.values.TypeDescriptor; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; -import org.checkerframework.checker.nullness.qual.Nullable; -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; -import org.joda.time.Days; -import org.joda.time.LocalDate; -import org.junit.Rule; -import org.junit.Test; -import org.junit.experimental.categories.Category; - -/** Tests for AVRO schema classes. */ -public class AvroSchemaTest { - /** A test POJO that corresponds to our AVRO schema. */ - public static class AvroSubPojo { - @AvroName("BOOL_NON_NULLABLE") - public boolean boolNonNullable; - - @AvroName("int") - @org.apache.avro.reflect.Nullable - public Integer anInt; - - public AvroSubPojo(boolean boolNonNullable, Integer anInt) { - this.boolNonNullable = boolNonNullable; - this.anInt = anInt; - } - - public AvroSubPojo() {} - - @Override - public boolean equals(@Nullable Object o) { - if (this == o) { - return true; - } - if (!(o instanceof AvroSubPojo)) { - return false; - } - AvroSubPojo that = (AvroSubPojo) o; - return boolNonNullable == that.boolNonNullable && Objects.equals(anInt, that.anInt); - } - - @Override - public int hashCode() { - return Objects.hash(boolNonNullable, anInt); - } - - @Override - public String toString() { - return "AvroSubPojo{" + "boolNonNullable=" + boolNonNullable + ", anInt=" + anInt + '}'; - } - } - - /** A test POJO that corresponds to our AVRO schema. 
*/ - public static class AvroPojo { - public @AvroName("bool_non_nullable") boolean boolNonNullable; - - @org.apache.avro.reflect.Nullable - public @AvroName("int") Integer anInt; - - @org.apache.avro.reflect.Nullable - public @AvroName("long") Long aLong; - - @AvroName("float") - @org.apache.avro.reflect.Nullable - public Float aFloat; - - @AvroName("double") - @org.apache.avro.reflect.Nullable - public Double aDouble; - - @org.apache.avro.reflect.Nullable public String string; - @org.apache.avro.reflect.Nullable public ByteBuffer bytes; - - @AvroSchema("{\"type\": \"fixed\", \"size\": 4, \"name\": \"fixed4\"}") - public byte[] fixed; - - @AvroSchema("{\"type\": \"int\", \"logicalType\": \"date\"}") - public LocalDate date; - - @AvroSchema("{\"type\": \"long\", \"logicalType\": \"timestamp-millis\"}") - public DateTime timestampMillis; - - @AvroSchema("{\"name\": \"TestEnum\", \"type\": \"enum\", \"symbols\": [\"abc\",\"cde\"] }") - public TestEnum testEnum; - - @org.apache.avro.reflect.Nullable public AvroSubPojo row; - @org.apache.avro.reflect.Nullable public List array; - @org.apache.avro.reflect.Nullable public Map map; - @AvroIgnore String extraField; - - @Override - public boolean equals(@Nullable Object o) { - if (this == o) { - return true; - } - if (!(o instanceof AvroPojo)) { - return false; - } - AvroPojo avroPojo = (AvroPojo) o; - return boolNonNullable == avroPojo.boolNonNullable - && Objects.equals(anInt, avroPojo.anInt) - && Objects.equals(aLong, avroPojo.aLong) - && Objects.equals(aFloat, avroPojo.aFloat) - && Objects.equals(aDouble, avroPojo.aDouble) - && Objects.equals(string, avroPojo.string) - && Objects.equals(bytes, avroPojo.bytes) - && Arrays.equals(fixed, avroPojo.fixed) - && Objects.equals(date, avroPojo.date) - && Objects.equals(timestampMillis, avroPojo.timestampMillis) - && Objects.equals(testEnum, avroPojo.testEnum) - && Objects.equals(row, avroPojo.row) - && Objects.equals(array, avroPojo.array) - && Objects.equals(map, avroPojo.map); - } - - @Override - public int hashCode() { - return Objects.hash( - boolNonNullable, - anInt, - aLong, - aFloat, - aDouble, - string, - bytes, - Arrays.hashCode(fixed), - date, - timestampMillis, - testEnum, - row, - array, - map); - } - - public AvroPojo( - boolean boolNonNullable, - int anInt, - long aLong, - float aFloat, - double aDouble, - String string, - ByteBuffer bytes, - byte[] fixed, - LocalDate date, - DateTime timestampMillis, - TestEnum testEnum, - AvroSubPojo row, - List array, - Map map) { - this.boolNonNullable = boolNonNullable; - this.anInt = anInt; - this.aLong = aLong; - this.aFloat = aFloat; - this.aDouble = aDouble; - this.string = string; - this.bytes = bytes; - this.fixed = fixed; - this.date = date; - this.timestampMillis = timestampMillis; - this.testEnum = testEnum; - this.row = row; - this.array = array; - this.map = map; - this.extraField = ""; - } - - public AvroPojo() {} - - @Override - public String toString() { - return "AvroPojo{" - + "boolNonNullable=" - + boolNonNullable - + ", anInt=" - + anInt - + ", aLong=" - + aLong - + ", aFloat=" - + aFloat - + ", aDouble=" - + aDouble - + ", string='" - + string - + '\'' - + ", bytes=" - + bytes - + ", fixed=" - + Arrays.toString(fixed) - + ", date=" - + date - + ", timestampMillis=" - + timestampMillis - + ", testEnum=" - + testEnum - + ", row=" - + row - + ", array=" - + array - + ", map=" - + map - + ", extraField='" - + extraField - + '\'' - + '}'; - } - } - - private static final Schema SUBSCHEMA = - Schema.builder() - 
.addField("BOOL_NON_NULLABLE", FieldType.BOOLEAN) - .addNullableField("int", FieldType.INT32) - .build(); - private static final FieldType SUB_TYPE = FieldType.row(SUBSCHEMA).withNullable(true); - - private static final EnumerationType TEST_ENUM_TYPE = EnumerationType.create("abc", "cde"); - - private static final Schema SCHEMA = - Schema.builder() - .addField("bool_non_nullable", FieldType.BOOLEAN) - .addNullableField("int", FieldType.INT32) - .addNullableField("long", FieldType.INT64) - .addNullableField("float", FieldType.FLOAT) - .addNullableField("double", FieldType.DOUBLE) - .addNullableField("string", FieldType.STRING) - .addNullableField("bytes", FieldType.BYTES) - .addField("fixed", FieldType.logicalType(FixedBytes.of(4))) - .addField("date", FieldType.DATETIME) - .addField("timestampMillis", FieldType.DATETIME) - .addField("TestEnum", FieldType.logicalType(TEST_ENUM_TYPE)) - .addNullableField("row", SUB_TYPE) - .addNullableField("array", FieldType.array(SUB_TYPE)) - .addNullableField("map", FieldType.map(FieldType.STRING, SUB_TYPE)) - .build(); - - private static final Schema POJO_SCHEMA = - Schema.builder() - .addField("bool_non_nullable", FieldType.BOOLEAN) - .addNullableField("int", FieldType.INT32) - .addNullableField("long", FieldType.INT64) - .addNullableField("float", FieldType.FLOAT) - .addNullableField("double", FieldType.DOUBLE) - .addNullableField("string", FieldType.STRING) - .addNullableField("bytes", FieldType.BYTES) - .addField("fixed", FieldType.logicalType(FixedBytes.of(4))) - .addField("date", FieldType.DATETIME) - .addField("timestampMillis", FieldType.DATETIME) - .addField("testEnum", FieldType.logicalType(TEST_ENUM_TYPE)) - .addNullableField("row", SUB_TYPE) - .addNullableField("array", FieldType.array(SUB_TYPE.withNullable(false))) - .addNullableField("map", FieldType.map(FieldType.STRING, SUB_TYPE.withNullable(false))) - .build(); - - private static final byte[] BYTE_ARRAY = new byte[] {1, 2, 3, 4}; - private static final DateTime DATE_TIME = - new DateTime().withDate(1979, 3, 14).withTime(1, 2, 3, 4); - private static final LocalDate DATE = new LocalDate(1979, 3, 14); - private static final TestAvroNested AVRO_NESTED_SPECIFIC_RECORD = new TestAvroNested(true, 42); - private static final TestAvro AVRO_SPECIFIC_RECORD = - new TestAvro( - true, - 43, - 44L, - (float) 44.1, - (double) 44.2, - "mystring", - ByteBuffer.wrap(BYTE_ARRAY), - new fixed4(BYTE_ARRAY), - DATE, - DATE_TIME, - TestEnum.abc, - AVRO_NESTED_SPECIFIC_RECORD, - ImmutableList.of(AVRO_NESTED_SPECIFIC_RECORD, AVRO_NESTED_SPECIFIC_RECORD), - ImmutableMap.of("k1", AVRO_NESTED_SPECIFIC_RECORD, "k2", AVRO_NESTED_SPECIFIC_RECORD)); - private static final GenericRecord AVRO_NESTED_GENERIC_RECORD = - new GenericRecordBuilder(TestAvroNested.SCHEMA$) - .set("BOOL_NON_NULLABLE", true) - .set("int", 42) - .build(); - private static final GenericRecord AVRO_GENERIC_RECORD = - new GenericRecordBuilder(TestAvro.SCHEMA$) - .set("bool_non_nullable", true) - .set("int", 43) - .set("long", 44L) - .set("float", (float) 44.1) - .set("double", (double) 44.2) - .set("string", new Utf8("mystring")) - .set("bytes", ByteBuffer.wrap(BYTE_ARRAY)) - .set( - "fixed", - GenericData.get() - .createFixed( - null, BYTE_ARRAY, org.apache.avro.Schema.createFixed("fixed4", "", "", 4))) - .set("date", (int) Days.daysBetween(new LocalDate(1970, 1, 1), DATE).getDays()) - .set("timestampMillis", DATE_TIME.getMillis()) - .set("TestEnum", TestEnum.abc) - .set("row", AVRO_NESTED_GENERIC_RECORD) - .set("array", 
ImmutableList.of(AVRO_NESTED_GENERIC_RECORD, AVRO_NESTED_GENERIC_RECORD)) - .set( - "map", - ImmutableMap.of( - new Utf8("k1"), AVRO_NESTED_GENERIC_RECORD, - new Utf8("k2"), AVRO_NESTED_GENERIC_RECORD)) - .build(); - - private static final Row NESTED_ROW = Row.withSchema(SUBSCHEMA).addValues(true, 42).build(); - private static final Row ROW = - Row.withSchema(SCHEMA) - .addValues( - true, - 43, - 44L, - (float) 44.1, - (double) 44.2, - "mystring", - ByteBuffer.wrap(BYTE_ARRAY), - BYTE_ARRAY, - DATE.toDateTimeAtStartOfDay(DateTimeZone.UTC), - DATE_TIME, - TEST_ENUM_TYPE.valueOf("abc"), - NESTED_ROW, - ImmutableList.of(NESTED_ROW, NESTED_ROW), - ImmutableMap.of("k1", NESTED_ROW, "k2", NESTED_ROW)) - .build(); - - @Test - public void testSpecificRecordSchema() { - assertEquals(SCHEMA, new AvroRecordSchema().schemaFor(TypeDescriptor.of(TestAvro.class))); - } - - @Test - public void testPojoSchema() { - assertEquals(POJO_SCHEMA, new AvroRecordSchema().schemaFor(TypeDescriptor.of(AvroPojo.class))); - } - - @Test - public void testSpecificRecordToRow() { - SerializableFunction toRow = - new AvroRecordSchema().toRowFunction(TypeDescriptor.of(TestAvro.class)); - assertEquals(ROW, toRow.apply(AVRO_SPECIFIC_RECORD)); - } - - @Test - public void testRowToSpecificRecord() { - SerializableFunction fromRow = - new AvroRecordSchema().fromRowFunction(TypeDescriptor.of(TestAvro.class)); - assertEquals(AVRO_SPECIFIC_RECORD, fromRow.apply(ROW)); - } - - @Test - public void testGenericRecordToRow() { - SerializableFunction toRow = - AvroUtils.getGenericRecordToRowFunction(SCHEMA); - assertEquals(ROW, toRow.apply(AVRO_GENERIC_RECORD)); - } - - @Test - public void testRowToGenericRecord() { - SerializableFunction fromRow = - AvroUtils.getRowToGenericRecordFunction(TestAvro.SCHEMA$); - assertEquals(AVRO_GENERIC_RECORD, fromRow.apply(ROW)); - } - - private static final AvroSubPojo SUB_POJO = new AvroSubPojo(true, 42); - private static final AvroPojo AVRO_POJO = - new AvroPojo( - true, - 43, - 44L, - (float) 44.1, - (double) 44.2, - "mystring", - ByteBuffer.wrap(BYTE_ARRAY), - BYTE_ARRAY, - DATE, - DATE_TIME, - TestEnum.abc, - SUB_POJO, - ImmutableList.of(SUB_POJO, SUB_POJO), - ImmutableMap.of("k1", SUB_POJO, "k2", SUB_POJO)); - - private static final Row ROW_FOR_POJO = - Row.withSchema(POJO_SCHEMA) - .addValues( - true, - 43, - 44L, - (float) 44.1, - (double) 44.2, - "mystring", - ByteBuffer.wrap(BYTE_ARRAY), - BYTE_ARRAY, - DATE.toDateTimeAtStartOfDay(DateTimeZone.UTC), - DATE_TIME, - TEST_ENUM_TYPE.valueOf("abc"), - NESTED_ROW, - ImmutableList.of(NESTED_ROW, NESTED_ROW), - ImmutableMap.of("k1", NESTED_ROW, "k2", NESTED_ROW)) - .build(); - - @Test - public void testPojoRecordToRow() { - SerializableFunction toRow = - new AvroRecordSchema().toRowFunction(TypeDescriptor.of(AvroPojo.class)); - assertEquals(ROW_FOR_POJO, toRow.apply(AVRO_POJO)); - } - - @Test - public void testRowToPojo() { - SerializableFunction fromRow = - new AvroRecordSchema().fromRowFunction(TypeDescriptor.of(AvroPojo.class)); - assertEquals(AVRO_POJO, fromRow.apply(ROW_FOR_POJO)); - } - - @Test - public void testPojoRecordToRowSerializable() { - SerializableUtils.ensureSerializableRoundTrip( - new AvroRecordSchema().toRowFunction(TypeDescriptor.of(AvroPojo.class))); - } - - @Test - public void testPojoRecordFromRowSerializable() { - SerializableUtils.ensureSerializableRoundTrip( - new AvroRecordSchema().fromRowFunction(TypeDescriptor.of(AvroPojo.class))); - } - - @Rule public final transient TestPipeline pipeline = TestPipeline.create(); - - 
@Test - @Category(ValidatesRunner.class) - public void testAvroPipelineGroupBy() { - PCollection input = pipeline.apply(Create.of(ROW_FOR_POJO).withRowSchema(POJO_SCHEMA)); - - PCollection output = input.apply(Group.byFieldNames("string")); - Schema keySchema = Schema.builder().addStringField("string").build(); - Schema outputSchema = - Schema.builder() - .addRowField("key", keySchema) - .addIterableField("value", FieldType.row(POJO_SCHEMA)) - .build(); - PAssert.that(output) - .containsInAnyOrder( - Row.withSchema(outputSchema) - .addValue(Row.withSchema(keySchema).addValue("mystring").build()) - .addIterable(ImmutableList.of(ROW_FOR_POJO)) - .build()); - - pipeline.run(); - } -} diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaCoderTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaCoderTest.java index 962c8f7183497..cc57d382af0c3 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaCoderTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaCoderTest.java @@ -26,7 +26,6 @@ import java.util.Collection; import java.util.Objects; import java.util.function.Supplier; -import org.apache.avro.reflect.AvroSchema; import org.apache.beam.sdk.coders.Coder.NonDeterministicException; import org.apache.beam.sdk.coders.RowCoder; import org.apache.beam.sdk.schemas.Schema.Field; @@ -196,43 +195,6 @@ public int hashCode() { } } - @DefaultSchema(AvroRecordSchema.class) - private static class SimpleAvro { - public String string; - public Integer int32; - public Long int64; - - @AvroSchema("{\"type\": \"long\", \"logicalType\": \"timestamp-millis\"}") - public DateTime datetime; - - public SimpleAvro(String string, Integer int32, Long int64, DateTime datetime) { - this.string = string; - this.int32 = int32; - this.int64 = int64; - this.datetime = datetime; - } - - @Override - public boolean equals(@Nullable Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - SimpleAvro that = (SimpleAvro) o; - return string.equals(that.string) - && int32.equals(that.int32) - && int64.equals(that.int64) - && datetime.equals(that.datetime); - } - - @Override - public int hashCode() { - return Objects.hash(string, int32, int64, datetime); - } - } - private static final SchemaRegistry REGISTRY = SchemaRegistry.createDefault(); private static SchemaCoder coderFrom(TypeDescriptor typeDescriptor) throws NoSuchSchemaException { @@ -316,23 +278,6 @@ public static Collection data() throws NoSuchSchemaException { new DateTime().withDate(1989, 3, 14).withTime(10, 30, 0, 0))), true }, - new Object[] { - coderFrom(TypeDescriptor.of(SimpleAvro.class)), - ImmutableList.>of( - () -> - new SimpleAvro( - "foo", - 9001, - 0L, - new DateTime().withDate(1979, 3, 14).withTime(10, 30, 0, 0)), - () -> - new SimpleAvro( - "bar", - 9002, - 1L, - new DateTime().withDate(1989, 3, 14).withTime(10, 30, 0, 0))), - true - }, new Object[] { RowCoder.of(LOGICAL_NANOS_SCHEMA), ImmutableList.>of( diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/io/AvroPayloadSerializerProviderTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/io/AvroPayloadSerializerProviderTest.java deleted file mode 100644 index 2fbcc76f18ff9..0000000000000 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/io/AvroPayloadSerializerProviderTest.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license 
agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.sdk.schemas.io; - -import static org.junit.Assert.assertEquals; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.GenericRecordBuilder; -import org.apache.beam.sdk.coders.AvroCoder; -import org.apache.beam.sdk.schemas.Schema; -import org.apache.beam.sdk.schemas.io.payloads.AvroPayloadSerializerProvider; -import org.apache.beam.sdk.schemas.utils.AvroUtils; -import org.apache.beam.sdk.values.Row; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -@RunWith(JUnit4.class) -public class AvroPayloadSerializerProviderTest { - private static final Schema SCHEMA = - Schema.builder().addInt64Field("abc").addStringField("xyz").build(); - private static final org.apache.avro.Schema AVRO_SCHEMA = AvroUtils.toAvroSchema(SCHEMA); - private static final AvroCoder AVRO_CODER = AvroCoder.of(AVRO_SCHEMA); - private static final Row DESERIALIZED = - Row.withSchema(SCHEMA).withFieldValue("abc", 3L).withFieldValue("xyz", "qqq").build(); - private static final GenericRecord SERIALIZED = - new GenericRecordBuilder(AVRO_SCHEMA).set("abc", 3L).set("xyz", "qqq").build(); - - private final AvroPayloadSerializerProvider provider = new AvroPayloadSerializerProvider(); - - @Test - public void serialize() throws Exception { - byte[] bytes = provider.getSerializer(SCHEMA, ImmutableMap.of()).serialize(DESERIALIZED); - GenericRecord record = AVRO_CODER.decode(new ByteArrayInputStream(bytes)); - assertEquals(3L, record.get("abc")); - assertEquals("qqq", record.get("xyz").toString()); - } - - @Test - public void deserialize() throws Exception { - ByteArrayOutputStream os = new ByteArrayOutputStream(); - AVRO_CODER.encode(SERIALIZED, os); - Row row = provider.getSerializer(SCHEMA, ImmutableMap.of()).deserialize(os.toByteArray()); - assertEquals(DESERIALIZED, row); - } -} diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/ConvertTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/ConvertTest.java index 603482edaaa05..32d32e8918eb2 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/ConvertTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/ConvertTest.java @@ -20,12 +20,10 @@ import java.util.Arrays; import java.util.Map; import java.util.Objects; -import org.apache.avro.generic.GenericRecord; import org.apache.beam.sdk.schemas.JavaFieldSchema; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.FieldType; import org.apache.beam.sdk.schemas.annotations.DefaultSchema; -import 
org.apache.beam.sdk.schemas.utils.AvroUtils; import org.apache.beam.sdk.testing.NeedsRunner; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; @@ -134,9 +132,6 @@ public int hashCode() { .addValue(ImmutableMap.of("first", EXPECTED_ROW1_NESTED, "second", EXPECTED_ROW1_NESTED)) .build(); - private static final GenericRecord EXPECTED_GENERICRECORD1 = - AvroUtils.toGenericRecord(EXPECTED_ROW1, AvroUtils.toAvroSchema(EXPECTED_SCHEMA1)); - /** Test outer POJO. Different but equivalent schema. * */ @DefaultSchema(JavaFieldSchema.class) public static class POJO2 { @@ -248,13 +243,4 @@ public void testFromRowsUnboxingPrimitive() { PAssert.that(longs).containsInAnyOrder((Long) EXPECTED_ROW1.getValue("field2")); pipeline.run(); } - - @Test - @Category(NeedsRunner.class) - public void testToGenericRecords() { - PCollection records = - pipeline.apply(Create.of(new POJO1())).apply(Convert.to(GenericRecord.class)); - PAssert.that(records).containsInAnyOrder(EXPECTED_GENERICRECORD1); - pipeline.run(); - } } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/providers/JavaExplodeTransformProviderTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/providers/JavaExplodeTransformProviderTest.java new file mode 100644 index 0000000000000..7c8166d92b9a6 --- /dev/null +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/providers/JavaExplodeTransformProviderTest.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.beam.sdk.schemas.transforms.providers;
+
+import java.util.List;
+import org.apache.beam.sdk.schemas.Schema;
+import org.apache.beam.sdk.testing.NeedsRunner;
+import org.apache.beam.sdk.testing.PAssert;
+import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.transforms.Create;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.PCollectionRowTuple;
+import org.apache.beam.sdk.values.Row;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+public class JavaExplodeTransformProviderTest {
+  @Rule public TestPipeline pipeline = TestPipeline.create();
+
+  private static final Schema INPUT_SCHEMA =
+      Schema.of(
+          Schema.Field.of("a", Schema.FieldType.iterable(Schema.FieldType.INT32)),
+          Schema.Field.of("b", Schema.FieldType.DOUBLE),
+          Schema.Field.of("c", Schema.FieldType.array(Schema.FieldType.STRING)));
+
+  private static final Schema OUTPUT_SCHEMA =
+      Schema.of(
+          Schema.Field.of("a", Schema.FieldType.INT32),
+          Schema.Field.of("b", Schema.FieldType.DOUBLE),
+          Schema.Field.of("c", Schema.FieldType.STRING));
+
+  private static final List<Row> INPUT_ROWS =
+      ImmutableList.of(
+          Row.withSchema(INPUT_SCHEMA)
+              .addValues(ImmutableList.of(1, 2), 1.5, ImmutableList.of("x", "y"))
+              .build());
+
+  @Test
+  @Category(NeedsRunner.class)
+  public void testCrossProduct() {
+    PCollection<Row> input = pipeline.apply(Create.of(INPUT_ROWS)).setRowSchema(INPUT_SCHEMA);
+
+    PCollection<Row> exploded =
+        PCollectionRowTuple.of(JavaExplodeTransformProvider.INPUT_ROWS_TAG, input)
+            .apply(
+                new JavaExplodeTransformProvider()
+                    .from(
+                        JavaExplodeTransformProvider.Configuration.builder()
+                            .setFields(ImmutableList.of("a", "c"))
+                            .setCrossProduct(true)
+                            .build()))
+            .get(JavaExplodeTransformProvider.OUTPUT_ROWS_TAG);
+
+    PAssert.that(exploded)
+        .containsInAnyOrder(
+            Row.withSchema(OUTPUT_SCHEMA).addValues(1, 1.5, "x").build(),
+            Row.withSchema(OUTPUT_SCHEMA).addValues(2, 1.5, "x").build(),
+            Row.withSchema(OUTPUT_SCHEMA).addValues(1, 1.5, "y").build(),
+            Row.withSchema(OUTPUT_SCHEMA).addValues(2, 1.5, "y").build());
+
+    pipeline.run();
+  }
+
+  @Test
+  @Category(NeedsRunner.class)
+  public void testZipProduct() {
+    PCollection<Row> input = pipeline.apply(Create.of(INPUT_ROWS)).setRowSchema(INPUT_SCHEMA);
+
+    PCollection<Row> exploded =
+        PCollectionRowTuple.of(JavaExplodeTransformProvider.INPUT_ROWS_TAG, input)
+            .apply(
+                new JavaExplodeTransformProvider()
+                    .from(
+                        JavaExplodeTransformProvider.Configuration.builder()
+                            .setFields(ImmutableList.of("a", "c"))
+                            .setCrossProduct(false)
+                            .build()))
+            .get(JavaExplodeTransformProvider.OUTPUT_ROWS_TAG);
+
+    PAssert.that(exploded)
+        .containsInAnyOrder(
+            Row.withSchema(OUTPUT_SCHEMA).addValues(1, 1.5, "x").build(),
+            Row.withSchema(OUTPUT_SCHEMA).addValues(2, 1.5, "y").build());
+
+    pipeline.run();
+  }
+}
diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/providers/JavaFilterTransformProviderTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/providers/JavaFilterTransformProviderTest.java
new file mode 100644
index 0000000000000..b269b12f1554e
--- /dev/null
+++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/providers/JavaFilterTransformProviderTest.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.schemas.transforms.providers; + +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.testing.NeedsRunner; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionRowTuple; +import org.apache.beam.sdk.values.Row; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +public class JavaFilterTransformProviderTest { + @Rule public TestPipeline pipeline = TestPipeline.create(); + + @Test + @Category(NeedsRunner.class) + public void testFilter() { + Schema inputSchema = + Schema.of( + Schema.Field.of("a", Schema.FieldType.STRING), + Schema.Field.of("b", Schema.FieldType.INT32), + Schema.Field.of("c", Schema.FieldType.DOUBLE)); + + PCollection input = + pipeline + .apply( + Create.of( + Row.withSchema(inputSchema).addValues("foo", 2, 0.5).build(), + Row.withSchema(inputSchema).addValues("bar", 4, 0.25).build())) + .setRowSchema(inputSchema); + + PCollection renamed = + PCollectionRowTuple.of(JavaFilterTransformProvider.INPUT_ROWS_TAG, input) + .apply( + new JavaFilterTransformProvider() + .from( + JavaFilterTransformProvider.Configuration.builder() + .setKeep( + JavaRowUdf.Configuration.builder() + .setExpression("b + c > 3") + .build()) + .build())) + .get(JavaFilterTransformProvider.OUTPUT_ROWS_TAG); + + PAssert.that(renamed) + .containsInAnyOrder( + Row.withSchema(inputSchema) + .withFieldValue("a", "bar") + .withFieldValue("b", 4) + .withFieldValue("c", 0.25) + .build()); + + pipeline.run(); + } + + @Test + @Category(NeedsRunner.class) + public void testErrorHandling() { + Schema inputSchema = Schema.of(Schema.Field.of("s", Schema.FieldType.STRING)); + + PCollection input = + pipeline + .apply( + Create.of( + Row.withSchema(inputSchema).addValues("short").build(), + Row.withSchema(inputSchema).addValues("looooooooooooong").build())) + .setRowSchema(inputSchema); + + PCollectionRowTuple result = + PCollectionRowTuple.of(JavaFilterTransformProvider.INPUT_ROWS_TAG, input) + .apply( + new JavaFilterTransformProvider() + .from( + JavaFilterTransformProvider.Configuration.builder() + .setLanguage("java") + .setKeep( + JavaRowUdf.Configuration.builder() + .setExpression("s.charAt(7) == 'o'") + .build()) + .setErrorHandling(ErrorHandling.builder().setOutput("errors").build()) + .build())); + + PCollection good = result.get(JavaFilterTransformProvider.OUTPUT_ROWS_TAG); + PAssert.that(good) + .containsInAnyOrder( + Row.withSchema(inputSchema).withFieldValue("s", "looooooooooooong").build()); + + PCollection errors = result.get("errors"); + Schema errorSchema = errors.getSchema(); + PAssert.that(errors) + .containsInAnyOrder( + 
Row.withSchema(errorSchema) + .withFieldValue( + "failed_row", Row.withSchema(inputSchema).addValues("short").build()) + .withFieldValue("error_message", "String index out of range: 7") + .build()); + pipeline.run(); + } +} diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/providers/JavaMapToFieldsTransformProviderTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/providers/JavaMapToFieldsTransformProviderTest.java new file mode 100644 index 0000000000000..6ad6f353a4dd5 --- /dev/null +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/providers/JavaMapToFieldsTransformProviderTest.java @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.schemas.transforms.providers; + +import java.util.Collections; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.testing.NeedsRunner; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionRowTuple; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link JavaMapToFieldsTransformProvider}. 
*/ +@RunWith(JUnit4.class) +public class JavaMapToFieldsTransformProviderTest { + @Rule public TestPipeline pipeline = TestPipeline.create(); + + @Test + @Category(NeedsRunner.class) + public void testRenameFields() { + Schema inputSchema = + Schema.of( + Schema.Field.of("a", Schema.FieldType.STRING), + Schema.Field.of("b", Schema.FieldType.INT32), + Schema.Field.of("c", Schema.FieldType.DOUBLE)); + + PCollection input = + pipeline + .apply( + Create.of( + Row.withSchema(inputSchema).addValues("foo", 2, 0.5).build(), + Row.withSchema(inputSchema).addValues("bar", 4, 0.25).build())) + .setRowSchema(inputSchema); + + PCollection renamed = + PCollectionRowTuple.of(JavaMapToFieldsTransformProvider.INPUT_ROWS_TAG, input) + .apply( + new JavaMapToFieldsTransformProvider() + .from( + JavaMapToFieldsTransformProvider.Configuration.builder() + .setFields( + ImmutableMap.of( + "newC", + JavaRowUdf.Configuration.builder().setExpression("c").build(), + "newA", + JavaRowUdf.Configuration.builder().setExpression("a").build())) + .build())) + .get(JavaMapToFieldsTransformProvider.OUTPUT_ROWS_TAG); + + Schema outputSchema = renamed.getSchema(); + + PAssert.that(renamed) + .containsInAnyOrder( + Row.withSchema(outputSchema) + .withFieldValue("newC", 0.5) + .withFieldValue("newA", "foo") + .build(), + Row.withSchema(outputSchema) + .withFieldValue("newC", 0.25) + .withFieldValue("newA", "bar") + .build()); + + pipeline.run(); + } + + @Test + @Category(NeedsRunner.class) + public void testAppendAndDropFields() { + Schema inputSchema = + Schema.of( + Schema.Field.of("a", Schema.FieldType.INT32), + Schema.Field.of("b", Schema.FieldType.DOUBLE)); + + PCollection input = + pipeline + .apply( + Create.of( + Row.withSchema(inputSchema).addValues(2, 0.5).build(), + Row.withSchema(inputSchema).addValues(4, 0.25).build())) + .setRowSchema(inputSchema); + + PCollection renamed = + PCollectionRowTuple.of(JavaMapToFieldsTransformProvider.INPUT_ROWS_TAG, input) + .apply( + new JavaMapToFieldsTransformProvider() + .from( + JavaMapToFieldsTransformProvider.Configuration.builder() + .setLanguage("java") + .setAppend(true) + .setDrop(Collections.singletonList("b")) + .setFields( + ImmutableMap.of( + "sum", + JavaRowUdf.Configuration.builder() + .setExpression("a+b") + .build())) + .build())) + .get(JavaMapToFieldsTransformProvider.OUTPUT_ROWS_TAG); + + Schema outputSchema = renamed.getSchema(); + + PAssert.that(renamed) + .containsInAnyOrder( + Row.withSchema(outputSchema).withFieldValue("a", 2).withFieldValue("sum", 2.5).build(), + Row.withSchema(outputSchema) + .withFieldValue("a", 4) + .withFieldValue("sum", 4.25) + .build()); + + pipeline.run(); + } + + @Test + @Category(NeedsRunner.class) + public void testErrorHandling() { + Schema inputSchema = Schema.of(Schema.Field.of("x", Schema.FieldType.INT32)); + + PCollection input = + pipeline + .apply( + Create.of( + Row.withSchema(inputSchema).addValues(4).build(), + Row.withSchema(inputSchema).addValues(-1).build())) + .setRowSchema(inputSchema); + + PCollectionRowTuple result = + PCollectionRowTuple.of(JavaMapToFieldsTransformProvider.INPUT_ROWS_TAG, input) + .apply( + new JavaMapToFieldsTransformProvider() + .from( + JavaMapToFieldsTransformProvider.Configuration.builder() + .setLanguage("java") + .setFields( + ImmutableMap.of( + "sqrt", + JavaRowUdf.Configuration.builder() + .setCallable( + "import java.util.function.Function;" + + "import org.apache.beam.sdk.values.Row;" + + "public class Sqrt implements Function {" + + " public Double apply(Row row) {" + + " int x = 
row.getInt32(\"x\");" + + " if (x < 0) {" + + " throw new ArithmeticException(\"negative value\");" + + " } else {" + + " return Math.sqrt(x);" + + " }" + + " }" + + "}") + .build())) + .setErrorHandling(ErrorHandling.builder().setOutput("errors").build()) + .build())); + + PCollection sqrts = result.get(JavaMapToFieldsTransformProvider.OUTPUT_ROWS_TAG); + Schema outputSchema = sqrts.getSchema(); + PAssert.that(sqrts) + .containsInAnyOrder(Row.withSchema(outputSchema).withFieldValue("sqrt", 2.0).build()); + + PCollection errors = result.get("errors"); + Schema errorSchema = errors.getSchema(); + PAssert.that(errors) + .containsInAnyOrder( + Row.withSchema(errorSchema) + .withFieldValue("failed_row", Row.withSchema(inputSchema).addValues(-1).build()) + .withFieldValue("error_message", "negative value") + .build()); + pipeline.run(); + } +} diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/providers/JavaRowUdfTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/providers/JavaRowUdfTest.java new file mode 100644 index 0000000000000..78ee36e7ca54f --- /dev/null +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/providers/JavaRowUdfTest.java @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.schemas.transforms.providers; + +import static org.junit.Assert.assertEquals; + +import java.net.MalformedURLException; +import java.util.function.Function; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.values.Row; +import org.junit.Test; + +public class JavaRowUdfTest { + + public static final Schema TEST_SCHEMA = + Schema.of( + Schema.Field.of("anInt32", Schema.FieldType.INT32).withNullable(true), + Schema.Field.of("anInt64", Schema.FieldType.INT64).withNullable(true), + Schema.Field.of("aDouble", Schema.FieldType.DOUBLE).withNullable(true)); + + @Test + public void testExpressionUdf() + throws MalformedURLException, ReflectiveOperationException, StringCompiler.CompileException { + JavaRowUdf udf = + new JavaRowUdf( + JavaRowUdf.Configuration.builder().setExpression("anInt32 + anInt64").build(), + TEST_SCHEMA); + assertEquals(Schema.FieldType.INT64, udf.getOutputType()); + assertEquals( + 5L, + udf.getFunction() + .apply( + Row.withSchema(TEST_SCHEMA) + .withFieldValue("anInt32", 2) + .withFieldValue("anInt64", 3L) + .build())); + } + + @Test + public void testFieldNameExpressionUdf() + throws MalformedURLException, ReflectiveOperationException, StringCompiler.CompileException { + JavaRowUdf udf = + new JavaRowUdf( + JavaRowUdf.Configuration.builder().setExpression("anInt32").build(), TEST_SCHEMA); + assertEquals(Schema.FieldType.INT32.withNullable(true), udf.getOutputType()); + assertEquals( + 2, + udf.getFunction() + .apply( + Row.withSchema(TEST_SCHEMA) + .withFieldValue("anInt32", 2) + .withFieldValue("anInt64", 3L) + .build())); + } + + @Test + public void testCallableUdf() + throws MalformedURLException, ReflectiveOperationException, StringCompiler.CompileException { + JavaRowUdf udf = + new JavaRowUdf( + JavaRowUdf.Configuration.builder() + .setCallable( + String.join( + "\n", + "import org.apache.beam.sdk.values.Row;", + "import java.util.function.Function;", + "public class MyFunction implements Function {", + " public Double apply(Row row) { return 1.0 / row.getDouble(\"aDouble\"); }", + "}")) + .build(), + TEST_SCHEMA); + assertEquals(Schema.FieldType.DOUBLE, udf.getOutputType()); + assertEquals( + 0.25, + udf.getFunction() + .apply(Row.withSchema(TEST_SCHEMA).withFieldValue("aDouble", 4.0).build())); + } + + public static class TestFunction implements Function { + @Override + public Double apply(Row row) { + return 1.0 / row.getDouble("aDouble"); + } + } + + public static double staticTestMethod(Row row) { + return 1.0 / row.getDouble("aDouble"); + } + + public static class TestClassWithMethod { + public double testMethod(Row row) { + return 1.0 / row.getDouble("aDouble"); + } + } + + @Test + public void testNamedFunctionUdf() + throws MalformedURLException, ReflectiveOperationException, StringCompiler.CompileException { + JavaRowUdf udf = + new JavaRowUdf( + JavaRowUdf.Configuration.builder() + .setName(getClass().getTypeName() + "$TestFunction") + .build(), + TEST_SCHEMA); + assertEquals(Schema.FieldType.DOUBLE, udf.getOutputType()); + assertEquals( + 0.25, + udf.getFunction() + .apply(Row.withSchema(TEST_SCHEMA).withFieldValue("aDouble", 4.0).build())); + } + + @Test + public void testClassMethodUdf() + throws MalformedURLException, ReflectiveOperationException, StringCompiler.CompileException { + JavaRowUdf udf = + new JavaRowUdf( + JavaRowUdf.Configuration.builder() + .setName(getClass().getTypeName() + "$TestClassWithMethod::testMethod") + .build(), + TEST_SCHEMA); + assertEquals(Schema.FieldType.DOUBLE, 
udf.getOutputType()); + assertEquals( + 0.25, + udf.getFunction() + .apply(Row.withSchema(TEST_SCHEMA).withFieldValue("aDouble", 4.0).build())); + } + + @Test + public void testStaticMethodUdf() + throws MalformedURLException, ReflectiveOperationException, StringCompiler.CompileException { + JavaRowUdf udf = + new JavaRowUdf( + JavaRowUdf.Configuration.builder() + .setName(getClass().getTypeName() + "::staticTestMethod") + .build(), + TEST_SCHEMA); + assertEquals(Schema.FieldType.DOUBLE, udf.getOutputType()); + assertEquals( + 0.25, + udf.getFunction() + .apply(Row.withSchema(TEST_SCHEMA).withFieldValue("aDouble", 4.0).build())); + } +} diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/providers/StringCompilerTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/providers/StringCompilerTest.java new file mode 100644 index 0000000000000..0c7bb4fa0538f --- /dev/null +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/transforms/providers/StringCompilerTest.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.schemas.transforms.providers; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.util.function.Function; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.junit.Test; + +public class StringCompilerTest { + + public static final String SQUARE_SOURCE = + "import java.util.function.Function;" + + "public class Square implements Function {" + + " public Integer apply(Integer x) { return x * x; }" + + "}"; + + @Test + public void testGetClass() throws Exception { + Class clazz = StringCompiler.getClass("Square", SQUARE_SOURCE); + assertTrue(Function.class.isAssignableFrom(clazz)); + assertEquals("Square", clazz.getSimpleName()); + } + + @Test + public void testGetInstance() throws Exception { + Function square = + (Function) StringCompiler.getInstance("Square", SQUARE_SOURCE); + assertEquals(4, (int) square.apply(2)); + } + + @Test + public void testGuessExpressionType() throws Exception { + assertEquals( + double.class, + StringCompiler.guessExpressionType( + "a+b", ImmutableMap.of("a", int.class, "b", double.class))); + assertEquals( + double.class, + StringCompiler.guessExpressionType( + "a > 0 ? 
a : b", ImmutableMap.of("a", int.class, "b", double.class))); + assertEquals( + double.class, + StringCompiler.guessExpressionType("a * Math.random()", ImmutableMap.of("a", int.class))); + assertEquals( + int.class, + StringCompiler.guessExpressionType("(int) a", ImmutableMap.of("a", double.class))); + assertEquals( + long.class, + StringCompiler.guessExpressionType( + "a.getInt64(\"foo\")+b", ImmutableMap.of("a", Row.class, "b", int.class))); + } +} diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/utils/AvroGenerators.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/utils/AvroGenerators.java deleted file mode 100644 index 5b3127a80ab50..0000000000000 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/utils/AvroGenerators.java +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.sdk.schemas.utils; - -import com.pholser.junit.quickcheck.generator.GenerationStatus; -import com.pholser.junit.quickcheck.generator.Generator; -import com.pholser.junit.quickcheck.random.SourceOfRandomness; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.function.Function; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import org.apache.avro.Schema; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Joiner; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ObjectArrays; - -/** QuickCheck generators for AVRO. */ -class AvroGenerators { - - /** Generates arbitrary AVRO schemas. 
*/ - public static class SchemaGenerator extends BaseSchemaGenerator { - - public static final SchemaGenerator INSTANCE = new SchemaGenerator(); - - private static final ImmutableList PRIMITIVE_TYPES = - ImmutableList.of( - Schema.Type.STRING, - Schema.Type.BYTES, - Schema.Type.INT, - Schema.Type.LONG, - Schema.Type.FLOAT, - Schema.Type.DOUBLE, - Schema.Type.BOOLEAN); - - private static final ImmutableList ALL_TYPES = - ImmutableList.builder() - .addAll(PRIMITIVE_TYPES) - .add(Schema.Type.FIXED) - .add(Schema.Type.ENUM) - .add(Schema.Type.RECORD) - .add(Schema.Type.ARRAY) - .add(Schema.Type.MAP) - .add(Schema.Type.UNION) - .add(Schema.Type.ARRAY) - .build(); - - private static final int MAX_NESTING = 10; - - @Override - public Schema generate(SourceOfRandomness random, GenerationStatus status) { - Schema.Type type; - - if (nesting(status) >= MAX_NESTING) { - type = random.choose(PRIMITIVE_TYPES); - } else { - type = random.choose(ALL_TYPES); - } - - if (PRIMITIVE_TYPES.contains(type)) { - return Schema.create(type); - } else { - nestingInc(status); - - if (type == Schema.Type.FIXED) { - int size = random.choose(Arrays.asList(1, 5, 12)); - return Schema.createFixed("fixed_" + branch(status), "", "", size); - } else if (type == Schema.Type.UNION) { - // only nullable fields, everything else isn't supported in row conversion code - return UnionSchemaGenerator.INSTANCE.generate(random, status); - } else if (type == Schema.Type.ENUM) { - return EnumSchemaGenerator.INSTANCE.generate(random, status); - } else if (type == Schema.Type.RECORD) { - return RecordSchemaGenerator.INSTANCE.generate(random, status); - } else if (type == Schema.Type.MAP) { - return Schema.createMap(generate(random, status)); - } else if (type == Schema.Type.ARRAY) { - return Schema.createArray(generate(random, status)); - } else { - throw new AssertionError("Unexpected AVRO type: " + type); - } - } - } - } - - public static class RecordSchemaGenerator extends BaseSchemaGenerator { - - public static final RecordSchemaGenerator INSTANCE = new RecordSchemaGenerator(); - - @Override - public Schema generate(SourceOfRandomness random, GenerationStatus status) { - List fields = - IntStream.range(0, random.nextInt(0, status.size()) + 1) - .mapToObj( - i -> { - // deterministically avoid collisions in record names - branchPush(status, String.valueOf(i)); - Schema.Field field = - createField(i, SchemaGenerator.INSTANCE.generate(random, status)); - branchPop(status); - return field; - }) - .collect(Collectors.toList()); - - return Schema.createRecord("record_" + branch(status), "", "example", false, fields); - } - - private Schema.Field createField(int i, Schema schema) { - return new Schema.Field("field_" + i, schema, null, (Object) null); - } - } - - static class UnionSchemaGenerator extends BaseSchemaGenerator { - - public static final UnionSchemaGenerator INSTANCE = new UnionSchemaGenerator(); - - @Override - public Schema generate(SourceOfRandomness random, GenerationStatus status) { - Map schemaMap = - IntStream.range(0, random.nextInt(0, status.size()) + 1) - .mapToObj( - i -> { - // deterministically avoid collisions in record names - branchPush(status, String.valueOf(i)); - Schema schema = - SchemaGenerator.INSTANCE - // nested unions aren't supported in AVRO - .filter(x -> x.getType() != Schema.Type.UNION) - .generate(random, status); - branchPop(status); - return schema; - }) - // AVRO requires uniqueness by full name - .collect(Collectors.toMap(Schema::getFullName, Function.identity(), (x, y) -> x)); - - List schemas = 
new ArrayList<>(schemaMap.values()); - - if (random.nextBoolean()) { - org.apache.avro.Schema nullSchema = org.apache.avro.Schema.create(Schema.Type.NULL); - schemas.add(nullSchema); - Collections.shuffle(schemas, random.toJDKRandom()); - } - - return Schema.createUnion(schemas); - } - } - - static class EnumSchemaGenerator extends BaseSchemaGenerator { - - public static final EnumSchemaGenerator INSTANCE = new EnumSchemaGenerator(); - - private static final Schema FRUITS = - Schema.createEnum("Fruit", "", "example", Arrays.asList("banana", "apple", "pear")); - - private static final Schema STATUS = - Schema.createEnum("Status", "", "example", Arrays.asList("OK", "ERROR", "WARNING")); - - @Override - public Schema generate(final SourceOfRandomness random, final GenerationStatus status) { - return random.choose(Arrays.asList(FRUITS, STATUS)); - } - } - - abstract static class BaseSchemaGenerator extends Generator { - - private static final GenerationStatus.Key NESTING_KEY = - new GenerationStatus.Key<>("nesting", Integer.class); - - private static final GenerationStatus.Key BRANCH_KEY = - new GenerationStatus.Key<>("branch", String[].class); - - BaseSchemaGenerator() { - super(org.apache.avro.Schema.class); - } - - void branchPush(GenerationStatus status, String value) { - String[] current = status.valueOf(BRANCH_KEY).orElse(new String[0]); - String[] next = ObjectArrays.concat(current, value); - - status.setValue(BRANCH_KEY, next); - } - - void branchPop(GenerationStatus status) { - String[] current = status.valueOf(BRANCH_KEY).orElse(new String[0]); - String[] next = Arrays.copyOf(current, current.length - 1); - - status.setValue(BRANCH_KEY, next); - } - - String branch(GenerationStatus status) { - return Joiner.on("_").join(status.valueOf(BRANCH_KEY).orElse(new String[0])); - } - - int nesting(GenerationStatus status) { - return status.valueOf(NESTING_KEY).orElse(0); - } - - void nestingInc(GenerationStatus status) { - status.setValue(NESTING_KEY, nesting(status) + 1); - } - } -} diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/utils/AvroUtilsTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/utils/AvroUtilsTest.java deleted file mode 100644 index 5abffd308650a..0000000000000 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/utils/AvroUtilsTest.java +++ /dev/null @@ -1,914 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.beam.sdk.schemas.utils; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -import com.pholser.junit.quickcheck.From; -import com.pholser.junit.quickcheck.Property; -import com.pholser.junit.quickcheck.runner.JUnitQuickcheck; -import java.math.BigDecimal; -import java.nio.ByteBuffer; -import java.sql.JDBCType; -import java.util.List; -import java.util.Map; -import org.apache.avro.Conversions; -import org.apache.avro.LogicalType; -import org.apache.avro.LogicalTypes; -import org.apache.avro.RandomData; -import org.apache.avro.Schema.Type; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.GenericRecordBuilder; -import org.apache.avro.reflect.ReflectData; -import org.apache.avro.util.Utf8; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.coders.AvroCoder; -import org.apache.beam.sdk.io.AvroGeneratedUser; -import org.apache.beam.sdk.schemas.Schema; -import org.apache.beam.sdk.schemas.Schema.Field; -import org.apache.beam.sdk.schemas.Schema.FieldType; -import org.apache.beam.sdk.schemas.logicaltypes.EnumerationType; -import org.apache.beam.sdk.schemas.logicaltypes.OneOfType; -import org.apache.beam.sdk.schemas.logicaltypes.SqlTypes; -import org.apache.beam.sdk.schemas.utils.AvroGenerators.RecordSchemaGenerator; -import org.apache.beam.sdk.schemas.utils.AvroUtils.TypeWithNullability; -import org.apache.beam.sdk.testing.CoderProperties; -import org.apache.beam.sdk.transforms.Create; -import org.apache.beam.sdk.transforms.SimpleFunction; -import org.apache.beam.sdk.util.SerializableUtils; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.Row; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; -import org.checkerframework.checker.nullness.qual.NonNull; -import org.checkerframework.checker.nullness.qual.Nullable; -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; -import org.joda.time.Days; -import org.joda.time.Instant; -import org.joda.time.LocalTime; -import org.junit.Test; -import org.junit.runner.RunWith; - -/** Tests for conversion between AVRO records and Beam rows. 
*/ -@RunWith(JUnitQuickcheck.class) -@SuppressWarnings({ - "rawtypes", // TODO(https://github.com/apache/beam/issues/20447) -}) -public class AvroUtilsTest { - - private static final org.apache.avro.Schema NULL_SCHEMA = - org.apache.avro.Schema.create(Type.NULL); - - @Property(trials = 1000) - @SuppressWarnings("unchecked") - public void supportsAnyAvroSchema( - @From(RecordSchemaGenerator.class) org.apache.avro.Schema avroSchema) { - - Schema schema = AvroUtils.toBeamSchema(avroSchema); - Iterable iterable = new RandomData(avroSchema, 10); - List records = Lists.newArrayList((Iterable) iterable); - - for (GenericRecord record : records) { - AvroUtils.toBeamRowStrict(record, schema); - } - } - - @Property(trials = 1000) - @SuppressWarnings("unchecked") - public void avroToBeamRoundTrip( - @From(RecordSchemaGenerator.class) org.apache.avro.Schema avroSchema) { - - Schema schema = AvroUtils.toBeamSchema(avroSchema); - Iterable iterable = new RandomData(avroSchema, 10); - List records = Lists.newArrayList((Iterable) iterable); - - for (GenericRecord record : records) { - Row row = AvroUtils.toBeamRowStrict(record, schema); - GenericRecord out = AvroUtils.toGenericRecord(row, avroSchema); - assertEquals(record, out); - } - } - - @Test - public void testUnwrapNullableSchema() { - org.apache.avro.Schema avroSchema = - org.apache.avro.Schema.createUnion( - org.apache.avro.Schema.create(Type.NULL), org.apache.avro.Schema.create(Type.STRING)); - - TypeWithNullability typeWithNullability = new TypeWithNullability(avroSchema); - assertTrue(typeWithNullability.nullable); - assertEquals(org.apache.avro.Schema.create(Type.STRING), typeWithNullability.type); - } - - @Test - public void testUnwrapNullableSchemaReordered() { - org.apache.avro.Schema avroSchema = - org.apache.avro.Schema.createUnion( - org.apache.avro.Schema.create(Type.STRING), org.apache.avro.Schema.create(Type.NULL)); - - TypeWithNullability typeWithNullability = new TypeWithNullability(avroSchema); - assertTrue(typeWithNullability.nullable); - assertEquals(org.apache.avro.Schema.create(Type.STRING), typeWithNullability.type); - } - - @Test - public void testUnwrapNullableSchemaToUnion() { - org.apache.avro.Schema avroSchema = - org.apache.avro.Schema.createUnion( - org.apache.avro.Schema.create(Type.STRING), - org.apache.avro.Schema.create(Type.LONG), - org.apache.avro.Schema.create(Type.NULL)); - - TypeWithNullability typeWithNullability = new TypeWithNullability(avroSchema); - assertTrue(typeWithNullability.nullable); - assertEquals( - org.apache.avro.Schema.createUnion( - org.apache.avro.Schema.create(Type.STRING), org.apache.avro.Schema.create(Type.LONG)), - typeWithNullability.type); - } - - @Test - public void testNullableArrayFieldToBeamArrayField() { - org.apache.avro.Schema.Field avroField = - new org.apache.avro.Schema.Field( - "arrayField", - ReflectData.makeNullable( - org.apache.avro.Schema.createArray(org.apache.avro.Schema.create(Type.INT))), - "", - null); - - Field expectedBeamField = Field.nullable("arrayField", FieldType.array(FieldType.INT32)); - - Field beamField = AvroUtils.toBeamField(avroField); - assertEquals(expectedBeamField, beamField); - } - - @Test - public void testNullableBeamArrayFieldToAvroField() { - Field beamField = Field.nullable("arrayField", FieldType.array(FieldType.INT32)); - - org.apache.avro.Schema.Field expectedAvroField = - new org.apache.avro.Schema.Field( - "arrayField", - ReflectData.makeNullable( - org.apache.avro.Schema.createArray(org.apache.avro.Schema.create(Type.INT))), - "", - 
null); - - org.apache.avro.Schema.Field avroField = AvroUtils.toAvroField(beamField, "ignored"); - assertEquals(expectedAvroField, avroField); - } - - private static List getAvroSubSchemaFields() { - List fields = Lists.newArrayList(); - fields.add( - new org.apache.avro.Schema.Field( - "bool", org.apache.avro.Schema.create(Type.BOOLEAN), "", null)); - fields.add( - new org.apache.avro.Schema.Field("int", org.apache.avro.Schema.create(Type.INT), "", null)); - return fields; - } - - private static org.apache.avro.Schema getAvroSubSchema(String name) { - return org.apache.avro.Schema.createRecord( - name, null, "topLevelRecord", false, getAvroSubSchemaFields()); - } - - private static org.apache.avro.Schema getAvroSchema() { - List fields = Lists.newArrayList(); - fields.add( - new org.apache.avro.Schema.Field( - "bool", org.apache.avro.Schema.create(Type.BOOLEAN), "", (Object) null)); - fields.add( - new org.apache.avro.Schema.Field( - "int", org.apache.avro.Schema.create(Type.INT), "", (Object) null)); - fields.add( - new org.apache.avro.Schema.Field( - "long", org.apache.avro.Schema.create(Type.LONG), "", (Object) null)); - fields.add( - new org.apache.avro.Schema.Field( - "float", org.apache.avro.Schema.create(Type.FLOAT), "", (Object) null)); - fields.add( - new org.apache.avro.Schema.Field( - "double", org.apache.avro.Schema.create(Type.DOUBLE), "", (Object) null)); - fields.add( - new org.apache.avro.Schema.Field( - "string", org.apache.avro.Schema.create(Type.STRING), "", (Object) null)); - fields.add( - new org.apache.avro.Schema.Field( - "bytes", org.apache.avro.Schema.create(Type.BYTES), "", (Object) null)); - fields.add( - new org.apache.avro.Schema.Field( - "decimal", - LogicalTypes.decimal(Integer.MAX_VALUE) - .addToSchema(org.apache.avro.Schema.create(Type.BYTES)), - "", - (Object) null)); - fields.add( - new org.apache.avro.Schema.Field( - "timestampMillis", - LogicalTypes.timestampMillis().addToSchema(org.apache.avro.Schema.create(Type.LONG)), - "", - (Object) null)); - fields.add(new org.apache.avro.Schema.Field("row", getAvroSubSchema("row"), "", (Object) null)); - fields.add( - new org.apache.avro.Schema.Field( - "array", - org.apache.avro.Schema.createArray(getAvroSubSchema("array")), - "", - (Object) null)); - fields.add( - new org.apache.avro.Schema.Field( - "map", org.apache.avro.Schema.createMap(getAvroSubSchema("map")), "", (Object) null)); - return org.apache.avro.Schema.createRecord("topLevelRecord", null, null, false, fields); - } - - private static Schema getBeamSubSchema() { - return new Schema.Builder() - .addField(Field.of("bool", FieldType.BOOLEAN)) - .addField(Field.of("int", FieldType.INT32)) - .build(); - } - - private Schema getBeamSchema() { - Schema subSchema = getBeamSubSchema(); - return new Schema.Builder() - .addField(Field.of("bool", FieldType.BOOLEAN)) - .addField(Field.of("int", FieldType.INT32)) - .addField(Field.of("long", FieldType.INT64)) - .addField(Field.of("float", FieldType.FLOAT)) - .addField(Field.of("double", FieldType.DOUBLE)) - .addField(Field.of("string", FieldType.STRING)) - .addField(Field.of("bytes", FieldType.BYTES)) - .addField(Field.of("decimal", FieldType.DECIMAL)) - .addField(Field.of("timestampMillis", FieldType.DATETIME)) - .addField(Field.of("row", FieldType.row(subSchema))) - .addField(Field.of("array", FieldType.array(FieldType.row(subSchema)))) - .addField(Field.of("map", FieldType.map(FieldType.STRING, FieldType.row(subSchema)))) - .build(); - } - - private static final byte[] BYTE_ARRAY = new byte[] {1, 2, 3, 4}; - 
private static final DateTime DATE_TIME = - new DateTime().withDate(1979, 3, 14).withTime(1, 2, 3, 4).withZone(DateTimeZone.UTC); - private static final BigDecimal BIG_DECIMAL = new BigDecimal(3600); - - private Row getBeamRow() { - Row subRow = Row.withSchema(getBeamSubSchema()).addValues(true, 42).build(); - return Row.withSchema(getBeamSchema()) - .addValue(true) - .addValue(43) - .addValue(44L) - .addValue((float) 44.1) - .addValue((double) 44.2) - .addValue("string") - .addValue(BYTE_ARRAY) - .addValue(BIG_DECIMAL) - .addValue(DATE_TIME) - .addValue(subRow) - .addValue(ImmutableList.of(subRow, subRow)) - .addValue(ImmutableMap.of("k1", subRow, "k2", subRow)) - .build(); - } - - private static GenericRecord getSubGenericRecord(String name) { - return new GenericRecordBuilder(getAvroSubSchema(name)) - .set("bool", true) - .set("int", 42) - .build(); - } - - private static GenericRecord getGenericRecord() { - - LogicalType decimalType = - LogicalTypes.decimal(Integer.MAX_VALUE) - .addToSchema(org.apache.avro.Schema.create(Type.BYTES)) - .getLogicalType(); - ByteBuffer encodedDecimal = - new Conversions.DecimalConversion().toBytes(BIG_DECIMAL, null, decimalType); - - return new GenericRecordBuilder(getAvroSchema()) - .set("bool", true) - .set("int", 43) - .set("long", 44L) - .set("float", (float) 44.1) - .set("double", (double) 44.2) - .set("string", new Utf8("string")) - .set("bytes", ByteBuffer.wrap(BYTE_ARRAY)) - .set("decimal", encodedDecimal) - .set("timestampMillis", DATE_TIME.getMillis()) - .set("row", getSubGenericRecord("row")) - .set("array", ImmutableList.of(getSubGenericRecord("array"), getSubGenericRecord("array"))) - .set( - "map", - ImmutableMap.of( - new Utf8("k1"), - getSubGenericRecord("map"), - new Utf8("k2"), - getSubGenericRecord("map"))) - .build(); - } - - @Test - public void testFromAvroSchema() { - assertEquals(getBeamSchema(), AvroUtils.toBeamSchema(getAvroSchema())); - } - - @Test - public void testFromBeamSchema() { - Schema beamSchema = getBeamSchema(); - org.apache.avro.Schema avroSchema = AvroUtils.toAvroSchema(beamSchema); - assertEquals(getAvroSchema(), avroSchema); - } - - @Test - public void testAvroSchemaFromBeamSchemaCanBeParsed() { - org.apache.avro.Schema convertedSchema = AvroUtils.toAvroSchema(getBeamSchema()); - org.apache.avro.Schema validatedSchema = - new org.apache.avro.Schema.Parser().parse(convertedSchema.toString()); - assertEquals(convertedSchema, validatedSchema); - } - - @Test - public void testAvroSchemaFromBeamSchemaWithFieldCollisionCanBeParsed() { - - // Two similar schemas, the only difference is the "street" field type in the nested record. - Schema contact = - new Schema.Builder() - .addField(Field.of("name", FieldType.STRING)) - .addField( - Field.of( - "address", - FieldType.row( - new Schema.Builder() - .addField(Field.of("street", FieldType.STRING)) - .addField(Field.of("city", FieldType.STRING)) - .build()))) - .build(); - - Schema contactMultiline = - new Schema.Builder() - .addField(Field.of("name", FieldType.STRING)) - .addField( - Field.of( - "address", - FieldType.row( - new Schema.Builder() - .addField(Field.of("street", FieldType.array(FieldType.STRING))) - .addField(Field.of("city", FieldType.STRING)) - .build()))) - .build(); - - // Ensure that no collisions happen between two sibling fields with same-named child fields - // (with different schemas, between a parent field and a sub-record field with the same name, - // and artificially with the generated field name. 
- Schema beamSchema = - new Schema.Builder() - .addField(Field.of("home", FieldType.row(contact))) - .addField(Field.of("work", FieldType.row(contactMultiline))) - .addField(Field.of("address", FieldType.row(contact))) - .addField(Field.of("topLevelRecord", FieldType.row(contactMultiline))) - .build(); - - org.apache.avro.Schema convertedSchema = AvroUtils.toAvroSchema(beamSchema); - org.apache.avro.Schema validatedSchema = - new org.apache.avro.Schema.Parser().parse(convertedSchema.toString()); - assertEquals(convertedSchema, validatedSchema); - } - - @Test - public void testNullableFieldInAvroSchema() { - List fields = Lists.newArrayList(); - fields.add( - new org.apache.avro.Schema.Field( - "int", ReflectData.makeNullable(org.apache.avro.Schema.create(Type.INT)), "", null)); - fields.add( - new org.apache.avro.Schema.Field( - "array", - org.apache.avro.Schema.createArray( - ReflectData.makeNullable(org.apache.avro.Schema.create(Type.BYTES))), - "", - null)); - fields.add( - new org.apache.avro.Schema.Field( - "map", - org.apache.avro.Schema.createMap( - ReflectData.makeNullable(org.apache.avro.Schema.create(Type.INT))), - "", - null)); - fields.add( - new org.apache.avro.Schema.Field( - "enum", - ReflectData.makeNullable( - org.apache.avro.Schema.createEnum( - "fruit", "", "", ImmutableList.of("banana", "apple", "pear"))), - "", - null)); - - org.apache.avro.Schema avroSchema = - org.apache.avro.Schema.createRecord("topLevelRecord", null, null, false, fields); - - Schema expectedSchema = - Schema.builder() - .addNullableField("int", FieldType.INT32) - .addArrayField("array", FieldType.BYTES.withNullable(true)) - .addMapField("map", FieldType.STRING, FieldType.INT32.withNullable(true)) - .addField( - "enum", - FieldType.logicalType(EnumerationType.create("banana", "apple", "pear")) - .withNullable(true)) - .build(); - assertEquals(expectedSchema, AvroUtils.toBeamSchema(avroSchema)); - - Map nullMap = Maps.newHashMap(); - nullMap.put("k1", null); - GenericRecord genericRecord = - new GenericRecordBuilder(avroSchema) - .set("int", null) - .set("array", Lists.newArrayList((Object) null)) - .set("map", nullMap) - .set("enum", null) - .build(); - Row expectedRow = - Row.withSchema(expectedSchema) - .addValue(null) - .addValue(Lists.newArrayList((Object) null)) - .addValue(nullMap) - .addValue(null) - .build(); - assertEquals(expectedRow, AvroUtils.toBeamRowStrict(genericRecord, expectedSchema)); - } - - @Test - public void testNullableFieldsInBeamSchema() { - Schema beamSchema = - Schema.builder() - .addNullableField("int", FieldType.INT32) - .addArrayField("array", FieldType.INT32.withNullable(true)) - .addMapField("map", FieldType.STRING, FieldType.INT32.withNullable(true)) - .build(); - - List fields = Lists.newArrayList(); - fields.add( - new org.apache.avro.Schema.Field( - "int", ReflectData.makeNullable(org.apache.avro.Schema.create(Type.INT)), "", null)); - fields.add( - new org.apache.avro.Schema.Field( - "array", - org.apache.avro.Schema.createArray( - ReflectData.makeNullable(org.apache.avro.Schema.create(Type.INT))), - "", - null)); - fields.add( - new org.apache.avro.Schema.Field( - "map", - org.apache.avro.Schema.createMap( - ReflectData.makeNullable(org.apache.avro.Schema.create(Type.INT))), - "", - null)); - org.apache.avro.Schema avroSchema = - org.apache.avro.Schema.createRecord("topLevelRecord", null, null, false, fields); - assertEquals(avroSchema, AvroUtils.toAvroSchema(beamSchema)); - - Map nullMapUtf8 = Maps.newHashMap(); - nullMapUtf8.put(new Utf8("k1"), null); - Map 
nullMapString = Maps.newHashMap(); - nullMapString.put("k1", null); - - GenericRecord expectedGenericRecord = - new GenericRecordBuilder(avroSchema) - .set("int", null) - .set("array", Lists.newArrayList((Object) null)) - .set("map", nullMapUtf8) - .build(); - Row row = - Row.withSchema(beamSchema) - .addValue(null) - .addValue(Lists.newArrayList((Object) null)) - .addValue(nullMapString) - .build(); - assertEquals(expectedGenericRecord, AvroUtils.toGenericRecord(row, avroSchema)); - } - - @Test - public void testUnionFieldInAvroSchema() { - - List fields = Lists.newArrayList(); - List unionFields = Lists.newArrayList(); - - unionFields.add(org.apache.avro.Schema.create(Type.INT)); - unionFields.add(org.apache.avro.Schema.create(Type.STRING)); - - fields.add( - new org.apache.avro.Schema.Field( - "union", org.apache.avro.Schema.createUnion(unionFields), "", null)); - org.apache.avro.Schema avroSchema = - org.apache.avro.Schema.createRecord("topLevelRecord", null, null, false, fields); - OneOfType oneOfType = - OneOfType.create(Field.of("int", FieldType.INT32), Field.of("string", FieldType.STRING)); - - Schema expectedSchema = Schema.builder().addLogicalTypeField("union", oneOfType).build(); - assertEquals(expectedSchema, AvroUtils.toBeamSchema(avroSchema)); - GenericRecord genericRecord = new GenericRecordBuilder(avroSchema).set("union", 23423).build(); - Row expectedRow = - Row.withSchema(expectedSchema).addValue(oneOfType.createValue(0, 23423)).build(); - assertEquals(expectedRow, AvroUtils.toBeamRowStrict(genericRecord, expectedSchema)); - } - - @Test - public void testUnionFieldInBeamSchema() { - OneOfType oneOfType = - OneOfType.create(Field.of("int", FieldType.INT32), Field.of("string", FieldType.STRING)); - - Schema beamSchema = Schema.builder().addLogicalTypeField("union", oneOfType).build(); - List fields = Lists.newArrayList(); - List unionFields = Lists.newArrayList(); - - unionFields.add(org.apache.avro.Schema.create(Type.INT)); - unionFields.add(org.apache.avro.Schema.create(Type.STRING)); - fields.add( - new org.apache.avro.Schema.Field( - "union", org.apache.avro.Schema.createUnion(unionFields), "", null)); - org.apache.avro.Schema avroSchema = - org.apache.avro.Schema.createRecord("topLevelRecord", null, null, false, fields); - GenericRecord expectedGenericRecord = - new GenericRecordBuilder(avroSchema).set("union", 23423).build(); - Row row = Row.withSchema(beamSchema).addValue(oneOfType.createValue(0, 23423)).build(); - assertEquals(expectedGenericRecord, AvroUtils.toGenericRecord(row, avroSchema)); - } - - @Test - public void testJdbcLogicalVarCharRowDataToAvroSchema() { - String expectedAvroSchemaJson = - "{ " - + " \"name\": \"topLevelRecord\", " - + " \"type\": \"record\", " - + " \"fields\": [{ " - + " \"name\": \"my_varchar_field\", " - + " \"type\": {\"type\": \"string\", \"logicalType\": \"varchar\", \"maxLength\": 10}" - + " }, " - + " { " - + " \"name\": \"my_longvarchar_field\", " - + " \"type\": {\"type\": \"string\", \"logicalType\": \"varchar\", \"maxLength\": 50}" - + " }, " - + " { " - + " \"name\": \"my_nvarchar_field\", " - + " \"type\": {\"type\": \"string\", \"logicalType\": \"varchar\", \"maxLength\": 10}" - + " }, " - + " { " - + " \"name\": \"my_longnvarchar_field\", " - + " \"type\": {\"type\": \"string\", \"logicalType\": \"varchar\", \"maxLength\": 50}" - + " }, " - + " { " - + " \"name\": \"fixed_length_char_field\", " - + " \"type\": {\"type\": \"string\", \"logicalType\": \"char\", \"maxLength\": 25}" - + " } " - + " ] " - + "}"; - - Schema 
beamSchema = - Schema.builder() - .addField( - Field.of( - "my_varchar_field", FieldType.logicalType(JdbcType.StringType.varchar(10)))) - .addField( - Field.of( - "my_longvarchar_field", - FieldType.logicalType(JdbcType.StringType.longvarchar(50)))) - .addField( - Field.of( - "my_nvarchar_field", FieldType.logicalType(JdbcType.StringType.nvarchar(10)))) - .addField( - Field.of( - "my_longnvarchar_field", - FieldType.logicalType(JdbcType.StringType.longnvarchar(50)))) - .addField( - Field.of( - "fixed_length_char_field", - FieldType.logicalType(JdbcType.StringType.fixedLengthChar(25)))) - .build(); - - assertEquals( - new org.apache.avro.Schema.Parser().parse(expectedAvroSchemaJson), - AvroUtils.toAvroSchema(beamSchema)); - } - - @Test - public void testJdbcLogicalVarCharRowDataToGenericRecord() { - Schema beamSchema = - Schema.builder() - .addField( - Field.of( - "my_varchar_field", FieldType.logicalType(JdbcType.StringType.varchar(10)))) - .addField( - Field.of( - "my_longvarchar_field", - FieldType.logicalType(JdbcType.StringType.longvarchar(50)))) - .addField( - Field.of( - "my_nvarchar_field", FieldType.logicalType(JdbcType.StringType.nvarchar(10)))) - .addField( - Field.of( - "my_longnvarchar_field", - FieldType.logicalType(JdbcType.StringType.longnvarchar(50)))) - .build(); - - Row rowData = - Row.withSchema(beamSchema) - .addValue("varchar_value") - .addValue("longvarchar_value") - .addValue("nvarchar_value") - .addValue("longnvarchar_value") - .build(); - - org.apache.avro.Schema avroSchema = AvroUtils.toAvroSchema(beamSchema); - GenericRecord expectedRecord = - new GenericRecordBuilder(avroSchema) - .set("my_varchar_field", "varchar_value") - .set("my_longvarchar_field", "longvarchar_value") - .set("my_nvarchar_field", "nvarchar_value") - .set("my_longnvarchar_field", "longnvarchar_value") - .build(); - - assertEquals(expectedRecord, AvroUtils.toGenericRecord(rowData, avroSchema)); - } - - @Test - public void testJdbcLogicalDateAndTimeRowDataToAvroSchema() { - String expectedAvroSchemaJson = - "{ " - + " \"name\": \"topLevelRecord\", " - + " \"type\": \"record\", " - + " \"fields\": [{ " - + " \"name\": \"my_date_field\", " - + " \"type\": { \"type\": \"int\", \"logicalType\": \"date\" }" - + " }, " - + " { " - + " \"name\": \"my_time_field\", " - + " \"type\": { \"type\": \"int\", \"logicalType\": \"time-millis\" }" - + " }" - + " ] " - + "}"; - - Schema beamSchema = - Schema.builder() - .addField(Field.of("my_date_field", FieldType.logicalType(JdbcType.DATE))) - .addField(Field.of("my_time_field", FieldType.logicalType(JdbcType.TIME))) - .build(); - - assertEquals( - new org.apache.avro.Schema.Parser().parse(expectedAvroSchemaJson), - AvroUtils.toAvroSchema(beamSchema)); - } - - @Test - public void testJdbcLogicalDateAndTimeRowDataToGenericRecord() { - // Test Fixed clock at - DateTime testDateTime = DateTime.parse("2021-05-29T11:15:16.234Z"); - - Schema beamSchema = - Schema.builder() - .addField(Field.of("my_date_field", FieldType.logicalType(JdbcType.DATE))) - .addField(Field.of("my_time_field", FieldType.logicalType(JdbcType.TIME))) - .build(); - - Row rowData = - Row.withSchema(beamSchema) - .addValue(testDateTime.toLocalDate().toDateTime(LocalTime.MIDNIGHT).toInstant()) - .addValue(Instant.ofEpochMilli(testDateTime.toLocalTime().millisOfDay().get())) - .build(); - - int daysFromEpoch = - Days.daysBetween( - Instant.EPOCH, - testDateTime.toLocalDate().toDateTime(LocalTime.MIDNIGHT).toInstant()) - .getDays(); - int timeSinceMidNight = 
testDateTime.toLocalTime().getMillisOfDay(); - - org.apache.avro.Schema avroSchema = AvroUtils.toAvroSchema(beamSchema); - GenericRecord expectedRecord = - new GenericRecordBuilder(avroSchema) - .set("my_date_field", daysFromEpoch) - .set("my_time_field", timeSinceMidNight) - .build(); - - assertEquals(expectedRecord, AvroUtils.toGenericRecord(rowData, avroSchema)); - } - - @Test - public void testSqlTypesToGenericRecord() { - // SqlTypes to LogicalTypes.date conversion is one direction - java.time.LocalDate localDate = java.time.LocalDate.of(1979, 3, 14); - - Schema beamSchema = - Schema.builder() - .addField(Field.of("local_date", FieldType.logicalType(SqlTypes.DATE))) - .build(); - - Row rowData = Row.withSchema(beamSchema).addValue(localDate).build(); - - org.apache.avro.Schema avroSchema = AvroUtils.toAvroSchema(beamSchema); - GenericRecord expectedRecord = - new GenericRecordBuilder(avroSchema).set("local_date", localDate.toEpochDay()).build(); - - assertEquals(expectedRecord, AvroUtils.toGenericRecord(rowData, avroSchema)); - } - - @Test - public void testBeamRowToGenericRecord() { - GenericRecord genericRecord = AvroUtils.toGenericRecord(getBeamRow(), null); - assertEquals(getAvroSchema(), genericRecord.getSchema()); - assertEquals(getGenericRecord(), genericRecord); - } - - @Test - public void testBeamRowToGenericRecordInferSchema() { - GenericRecord genericRecord = AvroUtils.toGenericRecord(getBeamRow()); - assertEquals(getAvroSchema(), genericRecord.getSchema()); - assertEquals(getGenericRecord(), genericRecord); - } - - @Test - public void testRowToGenericRecordFunction() { - SerializableUtils.ensureSerializable(AvroUtils.getRowToGenericRecordFunction(NULL_SCHEMA)); - SerializableUtils.ensureSerializable(AvroUtils.getRowToGenericRecordFunction(null)); - } - - @Test - public void testGenericRecordToBeamRow() { - GenericRecord genericRecord = getGenericRecord(); - Row row = AvroUtils.toBeamRowStrict(getGenericRecord(), null); - assertEquals(getBeamRow(), row); - - // Alternatively, a timestamp-millis logical type can have a joda datum. 
- genericRecord.put("timestampMillis", new DateTime(genericRecord.get("timestampMillis"))); - row = AvroUtils.toBeamRowStrict(getGenericRecord(), null); - assertEquals(getBeamRow(), row); - } - - @Test - public void testGenericRecordToRowFunction() { - SerializableUtils.ensureSerializable(AvroUtils.getGenericRecordToRowFunction(Schema.of())); - SerializableUtils.ensureSerializable(AvroUtils.getGenericRecordToRowFunction(null)); - } - - @Test - public void testAvroSchemaCoders() { - Pipeline pipeline = Pipeline.create(); - org.apache.avro.Schema schema = - org.apache.avro.Schema.createRecord( - "TestSubRecord", - "TestSubRecord doc", - "org.apache.beam.sdk.schemas.utils", - false, - getAvroSubSchemaFields()); - GenericRecord record = - new GenericRecordBuilder(getAvroSubSchema("simple")) - .set("bool", true) - .set("int", 42) - .build(); - - PCollection records = - pipeline.apply(Create.of(record).withCoder(AvroCoder.of(schema))); - assertFalse(records.hasSchema()); - records.setCoder(AvroUtils.schemaCoder(schema)); - assertTrue(records.hasSchema()); - CoderProperties.coderSerializable(records.getCoder()); - - AvroGeneratedUser user = new AvroGeneratedUser("foo", 42, "green"); - PCollection users = - pipeline.apply(Create.of(user).withCoder(AvroCoder.of(AvroGeneratedUser.class))); - assertFalse(users.hasSchema()); - users.setCoder(AvroUtils.schemaCoder((AvroCoder) users.getCoder())); - assertTrue(users.hasSchema()); - CoderProperties.coderSerializable(users.getCoder()); - } - - @Test - public void testAvroBytesToRowAndRowToAvroBytesFunctions() { - Schema schema = - Schema.builder() - .addInt32Field("f_int") - .addInt64Field("f_long") - .addDoubleField("f_double") - .addStringField("f_string") - .build(); - - SimpleFunction toBytesFn = AvroUtils.getRowToAvroBytesFunction(schema); - SimpleFunction toRowFn = AvroUtils.getAvroBytesToRowFunction(schema); - - Row row = Row.withSchema(schema).attachValues(1, 1L, 1d, "string"); - - byte[] serializedRow = toBytesFn.apply(row); - Row deserializedRow = toRowFn.apply(serializedRow); - - assertEquals(row, deserializedRow); - } - - @Test - public void testNullSchemas() { - assertEquals( - AvroUtils.getFromRowFunction(GenericRecord.class), - AvroUtils.getFromRowFunction(GenericRecord.class)); - } - - /** Helper class that simulate JDBC Logical types. 
*/ - private static class JdbcType implements Schema.LogicalType { - - private static final JdbcType DATE = - new JdbcType<>(JDBCType.DATE, FieldType.STRING, FieldType.DATETIME, ""); - private static final JdbcType TIME = - new JdbcType<>(JDBCType.TIME, FieldType.STRING, FieldType.DATETIME, ""); - - private final String identifier; - private final FieldType argumentType; - private final FieldType baseType; - private final Object argument; - - private static class StringType extends JdbcType { - - private static StringType fixedLengthChar(int size) { - return new StringType(JDBCType.CHAR, size); - } - - private static StringType varchar(int size) { - return new StringType(JDBCType.VARCHAR, size); - } - - private static StringType longvarchar(int size) { - return new StringType(JDBCType.LONGVARCHAR, size); - } - - private static StringType nvarchar(int size) { - return new StringType(JDBCType.NVARCHAR, size); - } - - private static StringType longnvarchar(int size) { - return new StringType(JDBCType.LONGNVARCHAR, size); - } - - private StringType(JDBCType type, int size) { - super(type, FieldType.INT32, FieldType.STRING, size); - } - } - - private JdbcType( - JDBCType jdbcType, FieldType argumentType, FieldType baseType, Object argument) { - this.identifier = jdbcType.getName(); - this.argumentType = argumentType; - this.baseType = baseType; - this.argument = argument; - } - - @Override - public String getIdentifier() { - return identifier; - } - - @Override - public @Nullable FieldType getArgumentType() { - return argumentType; - } - - @Override - public FieldType getBaseType() { - return baseType; - } - - @Override - @SuppressWarnings("TypeParameterUnusedInFormals") - public @Nullable T1 getArgument() { - return (T1) argument; - } - - @Override - public @NonNull T toBaseType(@NonNull T input) { - return input; - } - - @Override - public @NonNull T toInputType(@NonNull T base) { - return base; - } - } -} diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/utils/TestJavaBeans.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/utils/TestJavaBeans.java index 2e616aa2321b4..b5ad6f989d9e4 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/utils/TestJavaBeans.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/utils/TestJavaBeans.java @@ -1362,13 +1362,13 @@ public int hashCode() { @DefaultSchema(JavaBeanSchema.class) public static class ParameterNullableBean { - @org.apache.avro.reflect.Nullable private Float value; + @Nullable private Float value; - public @org.apache.avro.reflect.Nullable Float getValue() { + public @Nullable Float getValue() { return value; } - public void setValue(@org.apache.avro.reflect.Nullable Float value) { + public void setValue(@Nullable Float value) { this.value = value; } } @@ -1379,14 +1379,14 @@ public void setValue(@org.apache.avro.reflect.Nullable Float value) { @DefaultSchema(JavaBeanSchema.class) public static class FieldWithDescriptionBean { - @org.apache.avro.reflect.Nullable private Float value; + @Nullable private Float value; @SchemaFieldDescription("This value is the value stored in the object as a float.") - public @org.apache.avro.reflect.Nullable Float getValue() { + public @Nullable Float getValue() { return value; } - public void setValue(@org.apache.avro.reflect.Nullable Float value) { + public void setValue(@Nullable Float value) { this.value = value; } } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/resourcehints/ResourceHintsTest.java 
b/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/resourcehints/ResourceHintsTest.java
index 3cc5221763742..c7643f718aa54 100644
--- a/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/resourcehints/ResourceHintsTest.java
+++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/resourcehints/ResourceHintsTest.java
@@ -92,10 +92,13 @@ public void testFromOptions() {
             .withHint("beam:resources:bar", new ResourceHints.StringHint("foo")));
     options =
         PipelineOptionsFactory.fromArgs(
-                "--resourceHints=min_ram=1KB", "--resourceHints=accelerator=foo")
+                "--resourceHints=min_ram=1KB",
+                "--resourceHints=accelerator=foo",
+                "--resourceHints=cpu_count=4")
             .as(ResourceHintsOptions.class);
-    assertEquals(
-        ResourceHints.fromOptions(options),
-        ResourceHints.create().withMinRam(1000).withAccelerator("foo"));
+    ResourceHints fromOptions = ResourceHints.fromOptions(options);
+    ResourceHints expect =
+        ResourceHints.create().withMinRam(1000).withAccelerator("foo").withCPUCount(4);
+    assertEquals(fromOptions, expect);
   }
 }
diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/HistogramDataTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/HistogramDataTest.java
index b6e4d989a8f39..133bf787de301 100644
--- a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/HistogramDataTest.java
+++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/HistogramDataTest.java
@@ -18,8 +18,10 @@
 package org.apache.beam.sdk.util;
 
 import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.closeTo;
 import static org.hamcrest.Matchers.equalTo;
 
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.math.IntMath;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
@@ -200,4 +202,134 @@ public void testIncrementBucketCountByIndex() {
     assertThat(data.getTopBucketCount(), equalTo(4L));
     assertThat(data.getTotalCount(), equalTo(10L));
   }
+
+  // The following tests cover exponential buckets.
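+  // A sketch of the index math the tests below rely on (derived from the bucket ranges
+  // documented in each test; bucketIndex is a hypothetical helper for illustration only,
+  // not part of the HistogramData API): for a value v > 0 and scale s, the bucket index
+  // is floor(log2(v) * 2^s), so consecutive bucket boundaries grow by a factor of
+  // 2^(1 / 2^s).
+  //
+  //   static int bucketIndex(double v, int scale) {
+  //     double log2 = Math.log(v) / Math.log(2); // log2(v)
+  //     return (int) Math.floor(log2 * Math.pow(2, scale)); // rescale the exponent grid
+  //   }
+  //
+  // For example, bucketIndex(32, 1) == 10 and bucketIndex(45, 1) == 10, matching the
+  // [2^5, 2^5.5) range asserted for bucket 10 in testExponentialBuckets_PositiveScaleRecord.
+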
+  @Test
+  public void testExponentialBuckets_PositiveScaleRecord() {
+    // Buckets will be:
+    // Index        Range
+    // Underflow    (-inf, 0)
+    // 0            [0, sqrt(2))
+    // 1            [sqrt(2), 2)
+    // i            [2^(i/2), 2^((i+1)/2))
+    HistogramData data = HistogramData.exponential(1, 40);
+
+    data.record(-1);
+    assertThat(data.getBottomBucketCount(), equalTo(1L));
+
+    data.record(0, 1);
+    assertThat(data.getCount(0), equalTo(2L));
+
+    data.record(2);
+    assertThat(data.getTotalCount(), equalTo(4L));
+    assertThat(data.getCount(2), equalTo(1L));
+
+    // 10th bucket contains range [2^5, 2^5.5) ~= [32, 45.25)
+    for (int i = 32; i <= 45; i++) {
+      data.record(i);
+    }
+    assertThat(data.getCount(10), equalTo(14L));
+
+    // 30th bucket contains range [2^15, 2^15.5) ~= [32768, 46340.9)
+    for (int i = 32768; i < 32768 + 100; i++) {
+      data.record(i);
+    }
+    assertThat(data.getCount(30), equalTo(100L));
+    for (int i = 46340; i > 46340 - 100; i--) {
+      data.record(i);
+    }
+    assertThat(data.getCount(30), equalTo(200L));
+  }
+
+  @Test
+  public void testExponentialBuckets_ZeroScaleRecord() {
+    // Buckets will be:
+    // Index        Range
+    // Underflow    (-inf, 0)
+    // 0            [0, 2)
+    // 1            [2, 2^2)
+    // i            [2^i, 2^(i+1))
+    HistogramData data = HistogramData.exponential(0, 20);
+
+    data.record(-1);
+    assertThat(data.getBottomBucketCount(), equalTo(1L));
+
+    data.record(0, 1);
+    assertThat(data.getCount(0), equalTo(2L));
+
+    data.record(4, 5, 6, 7);
+    assertThat(data.getCount(2), equalTo(4L));
+
+    for (int i = 32; i < 64; i++) {
+      data.record(i);
+    }
+    assertThat(data.getCount(5), equalTo(32L));
+
+    for (int i = IntMath.pow(2, 16); i < IntMath.pow(2, 16) + 100; i++) {
+      data.record(i);
+    }
+    assertThat(data.getCount(16), equalTo(100L));
+
+    Long expectedTotalCount = Long.valueOf(100 + 32 + 4 + 2 + 1);
+    assertThat(data.getTotalCount(), equalTo(expectedTotalCount));
+  }
+
+  @Test
+  public void testExponentialBuckets_NegativeScalesRecord() {
+    // Buckets will be:
+    // Index        Range
+    // Underflow    (-inf, 0)
+    // 0            [0, 4)
+    // 1            [4, 4^2)
+    // i            [4^i, 4^(i+1))
+    HistogramData data = HistogramData.exponential(-1, 20);
+
+    data.record(-1);
+    assertThat(data.getBottomBucketCount(), equalTo(1L));
+
+    data.record(0, 1, 2);
+    assertThat(data.getCount(0), equalTo(3L));
+
+    data.record(16, 17, 32, 33, 62, 63);
+    assertThat(data.getCount(2), equalTo(6L));
+
+    for (int i = IntMath.pow(4, 5); i < IntMath.pow(4, 5) + 20; i++) {
+      data.record(i);
+    }
+    assertThat(data.getCount(5), equalTo(20L));
+
+    Long expectedTotalCount = Long.valueOf(20 + 6 + 3 + 1);
+    assertThat(data.getTotalCount(), equalTo(expectedTotalCount));
+  }
+
+  @Test
+  public void testExponentialBuckets_BucketSize() {
+    HistogramData zeroScaleBucket = HistogramData.exponential(0, 20);
+    assertThat(zeroScaleBucket.getBucketType().getBucketSize(0), equalTo(2.0));
+    // 10th bucket contains [2^10, 2^11).
+    assertThat(zeroScaleBucket.getBucketType().getBucketSize(10), equalTo(1024.0));
+
+    HistogramData positiveScaleBucket = HistogramData.exponential(1, 20);
+    assertThat(positiveScaleBucket.getBucketType().getBucketSize(0), equalTo(Math.sqrt(2)));
+    // 10th bucket contains [2^5, 2^5.5).
+    assertThat(positiveScaleBucket.getBucketType().getBucketSize(10), closeTo(13.2, .1));
+
+    HistogramData negativeScaleBucket = HistogramData.exponential(-1, 20);
+    assertThat(negativeScaleBucket.getBucketType().getBucketSize(0), equalTo(4.0));
+    // 10th bucket contains [2^20, 2^22).
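+    // That bucket's width is therefore 2^22 - 2^20 = 3 * 2^20 = 3145728, checked next.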
+    assertThat(negativeScaleBucket.getBucketType().getBucketSize(10), equalTo(3145728.0));
+  }
+
+  @Test
+  public void testExponentialBuckets_NumBuckets() {
+    // Validate that numBuckets clipping works as intended.
+    HistogramData zeroScaleBucket = HistogramData.exponential(0, 200);
+    assertThat(zeroScaleBucket.getBucketType().getNumBuckets(), equalTo(32));
+
+    HistogramData positiveScaleBucket = HistogramData.exponential(3, 500);
+    assertThat(positiveScaleBucket.getBucketType().getNumBuckets(), equalTo(32 * 8));
+
+    HistogramData negativeScaleBucket = HistogramData.exponential(-3, 500);
+    assertThat(negativeScaleBucket.getBucketType().getNumBuckets(), equalTo(4));
+  }
 }
diff --git a/sdks/java/expansion-service/src/main/java/org/apache/beam/sdk/expansion/service/ExpansionService.java b/sdks/java/expansion-service/src/main/java/org/apache/beam/sdk/expansion/service/ExpansionService.java
index ec53e3f11e43d..fe02533ed0d06 100644
--- a/sdks/java/expansion-service/src/main/java/org/apache/beam/sdk/expansion/service/ExpansionService.java
+++ b/sdks/java/expansion-service/src/main/java/org/apache/beam/sdk/expansion/service/ExpansionService.java
@@ -626,7 +626,7 @@ private Map loadRegisteredTransforms() {
       // Needed to find which transform was new...
       SdkComponents sdkComponents =
           rehydratedComponents
-              .getSdkComponents(Collections.emptyList())
+              .getSdkComponents(request.getRequirementsList())
               .withNewIdPrefix(request.getNamespace());
       sdkComponents.registerEnvironment(
           Environments.createOrGetDefaultEnvironment(
diff --git a/sdks/java/expansion-service/src/test/java/org/apache/beam/sdk/expansion/service/ExpansionServiceSchemaTransformProviderTest.java b/sdks/java/expansion-service/src/test/java/org/apache/beam/sdk/expansion/service/ExpansionServiceSchemaTransformProviderTest.java
index d7a665eabe0f9..696fed0f8ff45 100644
--- a/sdks/java/expansion-service/src/test/java/org/apache/beam/sdk/expansion/service/ExpansionServiceSchemaTransformProviderTest.java
+++ b/sdks/java/expansion-service/src/test/java/org/apache/beam/sdk/expansion/service/ExpansionServiceSchemaTransformProviderTest.java
@@ -20,6 +20,7 @@
 import static org.apache.beam.runners.core.construction.BeamUrns.getUrn;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertTrue;
 
 import com.google.auto.service.AutoService;
 import java.util.ArrayList;
@@ -305,7 +306,7 @@ public void testSchemaTransformDiscovery() {
         ExpansionApi.DiscoverSchemaTransformRequest.newBuilder().build();
     ExpansionApi.DiscoverSchemaTransformResponse response =
         expansionService.discover(discoverRequest);
-    assertEquals(2, response.getSchemaTransformConfigsCount());
+    assertTrue(response.getSchemaTransformConfigsCount() >= 2);
   }
 
   private void verifyLeafTransforms(ExpansionApi.ExpansionResponse response, int count) {
diff --git a/sdks/java/extensions/avro/src/test/java/org/apache/beam/sdk/extensions/avro/coders/DefaultCoderTest.java b/sdks/java/extensions/avro/src/test/java/org/apache/beam/sdk/extensions/avro/coders/DefaultCoderTest.java
deleted file mode 100644
index 82991f191887f..0000000000000
--- a/sdks/java/extensions/avro/src/test/java/org/apache/beam/sdk/extensions/avro/coders/DefaultCoderTest.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.sdk.extensions.avro.coders; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.instanceOf; - -import java.util.List; -import org.apache.beam.sdk.coders.Coder; -import org.apache.beam.sdk.coders.CoderRegistry; -import org.apache.beam.sdk.coders.DefaultCoder; -import org.apache.beam.sdk.coders.DefaultCoder.DefaultCoderProviderRegistrar.DefaultCoderProvider; -import org.apache.beam.sdk.coders.ListCoder; -import org.apache.beam.sdk.values.TypeDescriptor; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.ExpectedException; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** Tests for {@link DefaultCoder}. */ -@RunWith(JUnit4.class) -public class DefaultCoderTest { - - @Rule public ExpectedException thrown = ExpectedException.none(); - - @DefaultCoder(AvroCoder.class) - private static class AvroRecord {} - - @Test - public void testCodersWithoutComponents() throws Exception { - CoderRegistry registry = CoderRegistry.createDefault(); - registry.registerCoderProvider(new DefaultCoderProvider()); - assertThat(registry.getCoder(AvroRecord.class), instanceOf(AvroCoder.class)); - } - - @Test - public void testDefaultCoderInCollection() throws Exception { - CoderRegistry registry = CoderRegistry.createDefault(); - registry.registerCoderProvider(new DefaultCoderProvider()); - Coder> avroRecordCoder = - registry.getCoder(new TypeDescriptor>() {}); - assertThat(avroRecordCoder, instanceOf(ListCoder.class)); - assertThat(((ListCoder) avroRecordCoder).getElemCoder(), instanceOf(AvroCoder.class)); - } -} diff --git a/sdks/java/extensions/google-cloud-platform-core/src/main/java/org/apache/beam/sdk/extensions/gcp/util/GceMetadataUtil.java b/sdks/java/extensions/google-cloud-platform-core/src/main/java/org/apache/beam/sdk/extensions/gcp/util/GceMetadataUtil.java index b853ab792e082..fd49b759fd6df 100644 --- a/sdks/java/extensions/google-cloud-platform-core/src/main/java/org/apache/beam/sdk/extensions/gcp/util/GceMetadataUtil.java +++ b/sdks/java/extensions/google-cloud-platform-core/src/main/java/org/apache/beam/sdk/extensions/gcp/util/GceMetadataUtil.java @@ -30,40 +30,60 @@ import org.apache.http.params.BasicHttpParams; import org.apache.http.params.HttpConnectionParams; import org.apache.http.params.HttpParams; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** */ public class GceMetadataUtil { private static final String BASE_METADATA_URL = "http://metadata/computeMetadata/v1/"; + private static final Logger LOG = LoggerFactory.getLogger(GceMetadataUtil.class); + static String fetchMetadata(String key) { + String requestUrl = BASE_METADATA_URL + key; int timeoutMillis = 5000; final HttpParams httpParams = new BasicHttpParams(); HttpConnectionParams.setConnectionTimeout(httpParams, timeoutMillis); - HttpClient client = new DefaultHttpClient(httpParams); - HttpGet request = new HttpGet(BASE_METADATA_URL + 
key);
-    request.setHeader("Metadata-Flavor", "Google");
-
+    String ret = "";
     try {
+      HttpClient client = new DefaultHttpClient(httpParams);
+
+      HttpGet request = new HttpGet(requestUrl);
+      request.setHeader("Metadata-Flavor", "Google");
+
       HttpResponse response = client.execute(request);
-      if (response.getStatusLine().getStatusCode() != 200) {
-        // May mean its running on a non DataflowRunner, in which case it's perfectly normal.
-        return "";
+      if (response.getStatusLine().getStatusCode() == 200) {
+        InputStream in = response.getEntity().getContent();
+        try (final Reader reader = new InputStreamReader(in, StandardCharsets.UTF_8)) {
+          ret = CharStreams.toString(reader);
+        }
       }
-      InputStream in = response.getEntity().getContent();
-      try (final Reader reader = new InputStreamReader(in, StandardCharsets.UTF_8)) {
-        return CharStreams.toString(reader);
-      }
-    } catch (IOException e) {
-      // May mean its running on a non DataflowRunner, in which case it's perfectly normal.
+    } catch (IOException ignored) {
     }
-    return "";
+
+    // The return value can be an empty string, which may mean it's running on a non-Dataflow runner.
+    LOG.debug("Fetched GCE Metadata at '{}' and got '{}'", requestUrl, ret);
+
+    return ret;
+  }
+
+  private static String fetchVmInstanceMetadata(String instanceMetadataKey) {
+    return GceMetadataUtil.fetchMetadata("instance/" + instanceMetadataKey);
   }
 
   private static String fetchCustomGceMetadata(String customMetadataKey) {
-    return GceMetadataUtil.fetchMetadata("instance/attributes/" + customMetadataKey);
+    return GceMetadataUtil.fetchVmInstanceMetadata("attributes/" + customMetadataKey);
   }
 
   public static String fetchDataflowJobId() {
     return GceMetadataUtil.fetchCustomGceMetadata("job_id");
   }
+
+  public static String fetchDataflowJobName() {
+    return GceMetadataUtil.fetchCustomGceMetadata("job_name");
+  }
+
+  public static String fetchDataflowWorkerId() {
+    return GceMetadataUtil.fetchVmInstanceMetadata("id");
+  }
 }
diff --git a/sdks/java/extensions/protobuf/build.gradle b/sdks/java/extensions/protobuf/build.gradle
index 2696f8886ddd8..568d4f220867d 100644
--- a/sdks/java/extensions/protobuf/build.gradle
+++ b/sdks/java/extensions/protobuf/build.gradle
@@ -35,6 +35,8 @@ ext.summary = "Add support to Apache Beam for Google Protobuf."
 dependencies {
   implementation library.java.byte_buddy
   implementation library.java.vendored_guava_32_1_2_jre
+  implementation library.java.commons_compress
+  implementation library.java.slf4j_api
   implementation project(path: ":sdks:java:core", configuration: "shadow")
   implementation library.java.protobuf_java
   testImplementation project(path: ":sdks:java:core", configuration: "shadowTest")
diff --git a/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/ProtoByteUtils.java b/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/ProtoByteUtils.java
new file mode 100644
index 0000000000000..f156fed0f38c7
--- /dev/null
+++ b/sdks/java/extensions/protobuf/src/main/java/org/apache/beam/sdk/extensions/protobuf/ProtoByteUtils.java
@@ -0,0 +1,233 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.extensions.protobuf; + +import static java.util.stream.Collectors.toList; +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; + +import com.google.protobuf.DescriptorProtos; +import com.google.protobuf.Descriptors; +import com.google.protobuf.DynamicMessage; +import com.google.protobuf.InvalidProtocolBufferException; +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; +import java.nio.channels.Channels; +import java.nio.channels.ReadableByteChannel; +import java.util.List; +import org.apache.beam.sdk.io.FileSystems; +import org.apache.beam.sdk.io.fs.MatchResult; +import org.apache.beam.sdk.io.fs.ResourceId; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.transforms.SimpleFunction; +import org.apache.beam.sdk.values.Row; +import org.apache.commons.compress.utils.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Utility class for working with Protocol Buffer (Proto) data in the context of Apache Beam. This + * class provides methods to retrieve Beam Schemas from Proto messages, convert Proto bytes to Beam + * Rows, and vice versa. It also includes utilities for handling Protocol Buffer schemas and related + * file operations. + * + *
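<p>For example, a minimal usage sketch (the descriptor-set path and message name below are
 + * illustrative placeholders, not part of this change):
 + *
 + * <pre>{@code
 + * Schema beamSchema =
 + *     ProtoByteUtils.getBeamSchemaFromProto("/tmp/file_descriptor.pb", "MyMessage");
 + * SerializableFunction<byte[], Row> bytesToRow =
 + *     ProtoByteUtils.getProtoBytesToRowFunction("/tmp/file_descriptor.pb", "MyMessage");
 + * SerializableFunction<Row, byte[]> rowToBytes =
 + *     ProtoByteUtils.getRowToProtoBytes("/tmp/file_descriptor.pb", "MyMessage");
 + * }</pre>
 + *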
<p>
Users can utilize the methods in this class to facilitate the integration of Proto data + * processing within Apache Beam pipelines, allowing for the seamless transformation of Proto + * messages to Beam Rows and vice versa. + */ +public class ProtoByteUtils { + + private static final Logger LOG = LoggerFactory.getLogger(ProtoByteUtils.class); + + /** + * Retrieves a Beam Schema from a Protocol Buffer message. + * + * @param fileDescriptorPath The path to the File Descriptor Set file. + * @param messageName The name of the Protocol Buffer message. + * @return The Beam Schema representing the Protocol Buffer message. + */ + public static Schema getBeamSchemaFromProto(String fileDescriptorPath, String messageName) { + ProtoSchemaInfo dpd = getProtoDomain(fileDescriptorPath); + ProtoDomain protoDomain = dpd.getProtoDomain(); + return ProtoDynamicMessageSchema.forDescriptor(protoDomain, messageName).getSchema(); + } + + public static SerializableFunction<byte[], Row> getProtoBytesToRowFunction( + String fileDescriptorPath, String messageName) { + + ProtoSchemaInfo dynamicProtoDomain = getProtoDomain(fileDescriptorPath); + ProtoDomain protoDomain = dynamicProtoDomain.getProtoDomain(); + @SuppressWarnings("unchecked") + ProtoDynamicMessageSchema<DynamicMessage> protoDynamicMessageSchema = + ProtoDynamicMessageSchema.forDescriptor(protoDomain, messageName); + return new SimpleFunction<byte[], Row>() { + @Override + public Row apply(byte[] input) { + try { + final Descriptors.Descriptor descriptor = + protoDomain + .getFileDescriptor(dynamicProtoDomain.getFileName()) + .findMessageTypeByName(messageName); + DynamicMessage dynamicMessage = DynamicMessage.parseFrom(descriptor, input); + SerializableFunction<DynamicMessage, Row> res = + protoDynamicMessageSchema.getToRowFunction(); + return res.apply(dynamicMessage); + } catch (InvalidProtocolBufferException e) { + LOG.error("Error parsing to DynamicMessage", e); + throw new RuntimeException(e); + } + } + }; + } + + public static SerializableFunction<Row, byte[]> getRowToProtoBytes( + String fileDescriptorPath, String messageName) { + ProtoSchemaInfo dynamicProtoDomain = getProtoDomain(fileDescriptorPath); + ProtoDomain protoDomain = dynamicProtoDomain.getProtoDomain(); + @SuppressWarnings("unchecked") + ProtoDynamicMessageSchema<DynamicMessage> protoDynamicMessageSchema = + ProtoDynamicMessageSchema.forDescriptor(protoDomain, messageName); + + return new SimpleFunction<Row, byte[]>() { + @Override + public byte[] apply(Row input) { + SerializableFunction<Row, DynamicMessage> res = + protoDynamicMessageSchema.getFromRowFunction(); + return res.apply(input).toByteArray(); + } + }; + } + + /** + * Retrieves a ProtoSchemaInfo containing schema information for the specified Protocol Buffer + * file. + * + * @param fileDescriptorPath The path to the File Descriptor Set file. + * @return ProtoSchemaInfo containing the associated ProtoDomain and File Name. + * @throws RuntimeException if an error occurs during schema retrieval. + */ + private static ProtoSchemaInfo getProtoDomain(String fileDescriptorPath) { + byte[] from = getFileAsBytes(fileDescriptorPath); + try { + DescriptorProtos.FileDescriptorSet descriptorSet = + DescriptorProtos.FileDescriptorSet.parseFrom(from); + return new ProtoSchemaInfo( + descriptorSet.getFile(0).getName(), ProtoDomain.buildFrom(descriptorSet)); + } catch (InvalidProtocolBufferException e) { + throw new RuntimeException(e); + } + } + + /** + * Reads the contents of a file specified by its path and returns them as a byte array. + * + * @param fileDescriptorPath The path to the file to read. + * @return Byte array containing the file contents.
+ * @throws RuntimeException if an error occurs during file reading. + */ + private static byte[] getFileAsBytes(String fileDescriptorPath) { + ReadableByteChannel channel = getFileByteChannel(fileDescriptorPath); + try (InputStream inputStream = Channels.newInputStream(channel)) { + return IOUtils.toByteArray(inputStream); + } catch (IOException e) { + throw new RuntimeException("Error when reading: " + fileDescriptorPath, e); + } + } + + /** + * Retrieves a ReadableByteChannel for a file specified by its path. + * + * @param filePath The path to the file to obtain a ReadableByteChannel for. + * @return ReadableByteChannel for the specified file. + * @throws RuntimeException if an error occurs while finding or opening the file. + */ + private static ReadableByteChannel getFileByteChannel(String filePath) { + try { + MatchResult result = FileSystems.match(filePath); + checkArgument( + result.status() == MatchResult.Status.OK && !result.metadata().isEmpty(), + "Failed to match any files with the pattern: " + filePath); + + List<ResourceId> rId = + result.metadata().stream().map(MatchResult.Metadata::resourceId).collect(toList()); + + checkArgument(rId.size() == 1, "Expected exactly 1 file, but got " + rId.size() + " files."); + return FileSystems.open(rId.get(0)); + } catch (IOException e) { + throw new RuntimeException("Error when finding: " + filePath, e); + } + } + + /** + * Represents metadata associated with a Protocol Buffer schema, including the File Name and + * ProtoDomain. + */ + static class ProtoSchemaInfo implements Serializable { + private String fileName; + private ProtoDomain protoDomain; + + /** + * Constructs a ProtoSchemaInfo with the specified File Name and ProtoDomain. + * + * @param fileName The name of the associated Protocol Buffer file. + * @param protoDomain The ProtoDomain containing schema information. + */ + public ProtoSchemaInfo(String fileName, ProtoDomain protoDomain) { + this.fileName = fileName; + this.protoDomain = protoDomain; + } + + /** + * Sets the ProtoDomain associated with this ProtoSchemaInfo. + * + * @param protoDomain The ProtoDomain to set. + */ + @SuppressWarnings("unused") + public void setProtoDomain(ProtoDomain protoDomain) { + this.protoDomain = protoDomain; + } + + /** + * Gets the ProtoDomain associated with this ProtoSchemaInfo. + * + * @return The ProtoDomain containing schema information. + */ + public ProtoDomain getProtoDomain() { + return protoDomain; + } + + /** + * Gets the File Name associated with this ProtoSchemaInfo. + * + * @return The name of the associated Protocol Buffer file. + */ + public String getFileName() { + return fileName; + } + + /** + * Sets the File Name associated with this ProtoSchemaInfo. + * + * @param fileName The name of the Protocol Buffer file to set. + */ + public void setFileName(String fileName) { + this.fileName = fileName; + } + } +} diff --git a/sdks/java/extensions/protobuf/src/test/java/org/apache/beam/sdk/extensions/protobuf/ProtoByteUtilsTest.java b/sdks/java/extensions/protobuf/src/test/java/org/apache/beam/sdk/extensions/protobuf/ProtoByteUtilsTest.java new file mode 100644 index 0000000000000..2a4cb4b5d5fb9 --- /dev/null +++ b/sdks/java/extensions/protobuf/src/test/java/org/apache/beam/sdk/extensions/protobuf/ProtoByteUtilsTest.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.extensions.protobuf; + +import java.util.Objects; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.values.Row; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class ProtoByteUtilsTest { + + private static final String DESCRIPTOR_PATH = + Objects.requireNonNull( + ProtoByteUtilsTest.class.getResource( + "/proto_byte/file_descriptor/proto_byte_utils.pb")) + .getPath(); + + private static final String MESSAGE_NAME = "MyMessage"; + + private static final Schema SCHEMA = + Schema.builder() + .addField("id", Schema.FieldType.INT32) + .addField("name", Schema.FieldType.STRING) + .addField("active", Schema.FieldType.BOOLEAN) + .addField( + "address", + Schema.FieldType.row( + Schema.builder() + .addField("city", Schema.FieldType.STRING) + .addField("street", Schema.FieldType.STRING) + .addField("state", Schema.FieldType.STRING) + .addField("zip_code", Schema.FieldType.STRING) + .build())) + .build(); + + @Test + public void testProtoSchemaToBeamSchema() { + Schema schema = ProtoByteUtils.getBeamSchemaFromProto(DESCRIPTOR_PATH, MESSAGE_NAME); + Assert.assertEquals(schema.getFieldNames(), SCHEMA.getFieldNames()); + } + + @Test + public void testProtoBytesToRowFunctionGenerateSerializableFunction() { + SerializableFunction<byte[], Row> protoBytesToRowFunction = + ProtoByteUtils.getProtoBytesToRowFunction(DESCRIPTOR_PATH, MESSAGE_NAME); + Assert.assertNotNull(protoBytesToRowFunction); + } + + @Test(expected = java.lang.RuntimeException.class) + public void testProtoBytesToRowFunctionReturnsRowFailure() { + // Create a proto bytes to row function + SerializableFunction<byte[], Row> protoBytesToRowFunction = + ProtoByteUtils.getProtoBytesToRowFunction(DESCRIPTOR_PATH, MESSAGE_NAME); + + // Create some test input bytes that are not matching + byte[] inputBytes = new byte[] {1, 2, 3, 4, 5}; + + // Call the proto bytes to row function that should fail because the input does not match + protoBytesToRowFunction.apply(inputBytes); + } + + @Test + public void testRowToProtoFunction() { + Row row = + Row.withSchema(SCHEMA) + .withFieldValue("id", 1234) + .withFieldValue("name", "Doe") + .withFieldValue("active", false) + .withFieldValue("address.city", "seattle") + .withFieldValue("address.street", "fake street") + .withFieldValue("address.zip_code", "TO-1234") + .withFieldValue("address.state", "wa") + .build(); + + Assert.assertNotNull( + ProtoByteUtils.getRowToProtoBytes(DESCRIPTOR_PATH, MESSAGE_NAME).apply(row)); + } +} diff --git a/sdks/java/extensions/protobuf/src/test/resources/README.md b/sdks/java/extensions/protobuf/src/test/resources/README.md index 79083f5142b04..de9cb742788bf 100644 --- a/sdks/java/extensions/protobuf/src/test/resources/README.md +++ b/sdks/java/extensions/protobuf/src/test/resources/README.md @@ -32,3 +32,9 @@
protoc \ --include_imports \ sdks/java/extensions/protobuf/src/test/resources/test/option/v1/simple.proto ``` +```bash +protoc \ + -Isdks/java/extensions/protobuf/src/test/resources/ \ + --descriptor_set_out=sdks/java/extensions/protobuf/src/test/resources/proto_byte/file_descriptor/proto_byte_utils.pb \ + sdks/java/extensions/protobuf/src/test/resources/proto_byte/proto_byte_utils.proto +``` diff --git a/sdks/java/extensions/protobuf/src/test/resources/proto_byte/file_descriptor/proto_byte_utils.pb b/sdks/java/extensions/protobuf/src/test/resources/proto_byte/file_descriptor/proto_byte_utils.pb new file mode 100644 index 0000000000000..67e93cc177cce --- /dev/null +++ b/sdks/java/extensions/protobuf/src/test/resources/proto_byte/file_descriptor/proto_byte_utils.pb @@ -0,0 +1,13 @@ + + +test_proto.proto" + MyMessage +id (Rid +name ( Rname +active (Ractive, +address ( 2.MyMessage.AddressRaddressf +Address +street ( Rstreet +city ( Rcity +state ( Rstate +zip_code ( RzipCodebproto3 \ No newline at end of file diff --git a/.test-infra/jenkins/job_PreCommit_Java_Spark3_Versions.groovy b/sdks/java/extensions/protobuf/src/test/resources/proto_byte/proto_byte_utils.proto similarity index 63% rename from .test-infra/jenkins/job_PreCommit_Java_Spark3_Versions.groovy rename to sdks/java/extensions/protobuf/src/test/resources/proto_byte/proto_byte_utils.proto index f13c4c0a1e2b6..aead141f4b9a8 100644 --- a/.test-infra/jenkins/job_PreCommit_Java_Spark3_Versions.groovy +++ b/sdks/java/extensions/protobuf/src/test/resources/proto_byte/proto_byte_utils.proto @@ -16,22 +16,20 @@ * limitations under the License. */ -import PrecommitJobBuilder +syntax = "proto3"; -PrecommitJobBuilder builder = new PrecommitJobBuilder( - scope: this, - nameBase: 'Java_Spark3_Versions', - gradleTask: ':runners:spark:3:sparkVersionsTest', - gradleSwitches: [ - '-PdisableSpotlessCheck=true' - ], // spotless checked in separate pre-commit - triggerPathPatterns: [ - '^runners/spark/.*$', - ], - timeoutMins: 120, - ) -builder.build { - publishers { - archiveJunit('**/build/test-results/**/*.xml') +message MyMessage { + int32 id = 1; + string name = 2; + bool active = 3; + + // Nested field + message Address { + string street = 1; + string city = 2; + string state = 3; + string zip_code = 4; } -} \ No newline at end of file + + Address address = 4; +} diff --git a/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonExternalTransform.java b/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonExternalTransform.java index 4a5f4f12a07ad..5ba3484964c1e 100644 --- a/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonExternalTransform.java +++ b/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonExternalTransform.java @@ -495,6 +495,20 @@ public OutputT expand(InputT input) { boolean pythonAvailable = isPythonAvailable(); boolean dockerAvailable = isDockerAvailable(); + File requirementsFile = null; + if (!extraPackages.isEmpty()) { + requirementsFile = File.createTempFile("requirements", ".txt"); + requirementsFile.deleteOnExit(); + try (Writer fout = + new OutputStreamWriter( + new FileOutputStream(requirementsFile.getAbsolutePath()), Charsets.UTF_8)) { + for (String pkg : extraPackages) { + fout.write(pkg); + fout.write('\n'); + } + } + } + // We use the transform service if either of the following is true. // * It was explicitly requested. 
// * Python executable is not available in the system but Docker is available. @@ -514,19 +528,16 @@ public OutputT expand(InputT input) { projectName, port); - TransformServiceLauncher service = TransformServiceLauncher.forProject(projectName, port); + String pythonRequirementsFile = + requirementsFile != null ? requirementsFile.getAbsolutePath() : null; + TransformServiceLauncher service = + TransformServiceLauncher.forProject(projectName, port, pythonRequirementsFile); service.setBeamVersion(ReleaseInfo.getReleaseInfo().getSdkVersion()); - // TODO(https://github.com/apache/beam/issues/26833): add support for installing extra - // packages. - if (!extraPackages.isEmpty()) { - throw new RuntimeException( - "Transform Service does not support installing extra packages yet"); - } try { // Starting the transform service. service.start(); // Waiting the service to be ready. - service.waitTillUp(15000); + service.waitTillUp(-1); // Expanding the transform. output = apply(input, String.format("localhost:%s", port), payload); } finally { @@ -539,17 +550,7 @@ public OutputT expand(InputT input) { ImmutableList.Builder args = ImmutableList.builder(); args.add( "--port=" + port, "--fully_qualified_name_glob=*", "--pickle_library=cloudpickle"); - if (!extraPackages.isEmpty()) { - File requirementsFile = File.createTempFile("requirements", ".txt"); - requirementsFile.deleteOnExit(); - try (Writer fout = - new OutputStreamWriter( - new FileOutputStream(requirementsFile.getAbsolutePath()), Charsets.UTF_8)) { - for (String pkg : extraPackages) { - fout.write(pkg); - fout.write('\n'); - } - } + if (requirementsFile != null) { args.add("--requirements_file=" + requirementsFile.getAbsolutePath()); } PythonService service = diff --git a/sdks/java/extensions/schemaio-expansion-service/build.gradle b/sdks/java/extensions/schemaio-expansion-service/build.gradle index d23330d73c224..68fb67e93e37e 100644 --- a/sdks/java/extensions/schemaio-expansion-service/build.gradle +++ b/sdks/java/extensions/schemaio-expansion-service/build.gradle @@ -32,14 +32,27 @@ applyJavaNature( dependencies { implementation project(path: ":sdks:java:expansion-service") permitUnusedDeclared project(path: ":sdks:java:expansion-service") // BEAM-11761 + implementation project(":sdks:java:extensions:google-cloud-platform-core") + permitUnusedDeclared project(path: ":sdks:java:extensions:google-cloud-platform-core") // BEAM-11761 + implementation project(":sdks:java:io:csv") + permitUnusedDeclared project(path: ":sdks:java:io:csv") // BEAM-11761 implementation project(":sdks:java:io:jdbc") permitUnusedDeclared project(":sdks:java:io:jdbc") // BEAM-11761 + implementation project(":sdks:java:io:json") + permitUnusedDeclared project(path: ":sdks:java:io:json") // BEAM-11761 implementation library.java.postgres permitUnusedDeclared library.java.postgres // BEAM-11761 implementation project(path: ":model:pipeline", configuration: "shadow") implementation project(path: ":sdks:java:core", configuration: "shadow") implementation library.java.vendored_grpc_1_54_0 implementation library.java.vendored_guava_32_1_2_jre + // Stage default drivers for JdbcIO schema transforms + implementation 'mysql:mysql-connector-java:8.0.22' + permitUnusedDeclared 'mysql:mysql-connector-java:8.0.22' // BEAM-11761 + implementation 'com.oracle.database.jdbc:ojdbc8:23.2.0.0' + permitUnusedDeclared 'com.oracle.database.jdbc:ojdbc8:23.2.0.0' // BEAM-11761 + implementation 'com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre11' + permitUnusedDeclared 
'com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre11' // BEAM-11761 testImplementation library.java.junit testImplementation library.java.mockito_core } diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/kafka/BeamKafkaTable.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/kafka/BeamKafkaTable.java index f1ec20831a4cd..ab1817f6d75c2 100644 --- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/kafka/BeamKafkaTable.java +++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/kafka/BeamKafkaTable.java @@ -110,7 +110,7 @@ public PCollection buildIOReader(PBegin begin) { .setRowSchema(getSchema()); } - KafkaIO.Read createKafkaRead() { + protected KafkaIO.Read createKafkaRead() { KafkaIO.Read kafkaRead; if (topics != null) { kafkaRead = diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/kafka/KafkaTestTable.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/kafka/KafkaTestTable.java index 44b4dbe21acaf..158b0345bd8b0 100644 --- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/kafka/KafkaTestTable.java +++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/kafka/KafkaTestTable.java @@ -61,7 +61,7 @@ public KafkaTestTable(Schema beamSchema, List topics, int partitionsPerT } @Override - KafkaIO.Read createKafkaRead() { + protected KafkaIO.Read createKafkaRead() { return super.createKafkaRead().withConsumerFactoryFn(this::mkMockConsumer); } diff --git a/sdks/java/extensions/timeseries/build.gradle b/sdks/java/extensions/timeseries/build.gradle index 79d3957e6197f..86bf89d729202 100644 --- a/sdks/java/extensions/timeseries/build.gradle +++ b/sdks/java/extensions/timeseries/build.gradle @@ -29,4 +29,5 @@ dependencies { implementation project(path: ":sdks:java:core", configuration: "shadow") testImplementation library.java.junit testRuntimeOnly project(path: ":runners:direct-java", configuration: "shadow") + testImplementation project(path: ":sdks:java:extensions:avro", configuration: "testRuntimeMigration") } diff --git a/sdks/java/extensions/zetasketch/build.gradle b/sdks/java/extensions/zetasketch/build.gradle index 3c745408e9379..bb532ad08aa15 100644 --- a/sdks/java/extensions/zetasketch/build.gradle +++ b/sdks/java/extensions/zetasketch/build.gradle @@ -42,6 +42,7 @@ dependencies { testImplementation project(":sdks:java:extensions:google-cloud-platform-core") testImplementation library.java.google_api_services_bigquery testImplementation library.java.proto_google_cloud_bigquery_storage_v1 + testImplementation project(path: ":sdks:java:extensions:avro", configuration: "testRuntimeMigration") testRuntimeOnly library.java.slf4j_simple testRuntimeOnly project(path: ":runners:direct-java", configuration: "shadow") testRuntimeOnly project(":runners:google-cloud-dataflow-java") diff --git a/sdks/java/fn-execution/src/main/java/org/apache/beam/sdk/fn/data/BeamFnDataGrpcMultiplexer.java b/sdks/java/fn-execution/src/main/java/org/apache/beam/sdk/fn/data/BeamFnDataGrpcMultiplexer.java index c05a86fdb1b7b..3f83af4a8d6ba 100644 --- a/sdks/java/fn-execution/src/main/java/org/apache/beam/sdk/fn/data/BeamFnDataGrpcMultiplexer.java +++ b/sdks/java/fn-execution/src/main/java/org/apache/beam/sdk/fn/data/BeamFnDataGrpcMultiplexer.java @@ -257,7 +257,7 @@ public void 
onError(Throwable t) { @Override public void onCompleted() { - LOG.warn( + LOG.info( "Hanged up for {}.", apiServiceDescriptor == null ? "unknown endpoint" : apiServiceDescriptor); } diff --git a/sdks/java/harness/build.gradle b/sdks/java/harness/build.gradle index 25d6b2ac4040f..937e870a22f4e 100644 --- a/sdks/java/harness/build.gradle +++ b/sdks/java/harness/build.gradle @@ -30,6 +30,7 @@ dependencies { provided project(path: ":model:pipeline", configuration: "shadow") provided project(path: ":sdks:java:core", configuration: "shadow") provided project(path: ":sdks:java:transform-service:launcher", configuration: "shadow") + provided library.java.avro provided library.java.joda_time provided library.java.slf4j_api provided library.java.vendored_grpc_1_54_0 @@ -47,18 +48,24 @@ applyJavaNature( automaticModuleName: 'org.apache.beam.fn.harness', testShadowJar: true, shadowJarValidationExcludes: [ + "avro/shaded/com/google/**", + "com/thoughtworks/paranamer/**", "junit/**", "io/github/classgraph/**", "nonapi/io/github/classgraph/**", + "org/apache/avro/**", "org/apache/beam/fn/harness/**", "org/apache/beam/model/fnexecution/**", "org/apache/beam/runners/core/**", "org/apache/beam/runners/core/construction/**", "org/apache/beam/sdk/extensions/avro/**", "org/apache/beam/sdk/fn/**", + "org/apache/commons/**", "org/checkerframework/**", + "org/codehaus/jackson/**", "org/hamcrest/**", "org/junit/**", + "org/tukaani/xz/**", ], shadowClosure: { @@ -87,4 +94,5 @@ dependencies { shadowTestRuntimeClasspath project(path: ":sdks:java:core", configuration: "shadowTest") testImplementation project(path: ":sdks:java:fn-execution", configuration: "testRuntimeMigration") shadowTestRuntimeClasspath library.java.slf4j_jdk14 + permitUnusedDeclared library.java.avro } diff --git a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/FnApiDoFnRunner.java b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/FnApiDoFnRunner.java index 1800e997b2d83..ddf52125b2e48 100644 --- a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/FnApiDoFnRunner.java +++ b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/FnApiDoFnRunner.java @@ -2503,6 +2503,10 @@ private class NonWindowObservingProcessBundleContext @Override public void output(OutputT output) { // Don't need to check timestamp since we can always output using the input timestamp. 
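+ // A sketch of the failure this guards against (assuming a user DoFn that stashes the
+ // context or emits from an async callback): once processElement() returns, currentElement
+ // is cleared, and calling output() would otherwise fail below with a bare NPE.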
+ if (currentElement == null) { + throw new IllegalStateException( + "Attempting to emit an element outside of a @ProcessElement context."); + } outputTo(mainOutputConsumer, currentElement.withValue(output)); } diff --git a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/FnHarness.java b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/FnHarness.java index e103da4d6007d..cada9b12f60b4 100644 --- a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/FnHarness.java +++ b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/FnHarness.java @@ -384,9 +384,10 @@ private BeamFnApi.ProcessBundleDescriptor loadDescriptor(String id) { } processBundleHandler.shutdown(); } catch (Exception e) { - System.out.println("Shutting down harness due to exception: " + e.toString()); + LOG.error("Shutting down harness due to exception", e); + e.printStackTrace(); } finally { - System.out.println("Shutting SDK harness down."); + LOG.info("Shutting SDK harness down."); executionStateSampler.stop(); executorService.shutdown(); } diff --git a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/control/ExecutionStateSampler.java b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/control/ExecutionStateSampler.java index a82ce92768201..5509d6380ef6d 100644 --- a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/control/ExecutionStateSampler.java +++ b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/control/ExecutionStateSampler.java @@ -250,6 +250,8 @@ public class ExecutionStateTracker implements BundleProgressReporter { private final AtomicReference<@Nullable Thread> trackedThread; // Read by multiple threads, read and written by the ExecutionStateSampler thread lazily. private final AtomicLong lastTransitionTime; + // Used to throttle lull logging. + private long lastLullReport; // Read and written by the bundle processing thread frequently. private long numTransitions; // Read by the ExecutionStateSampler, written by the bundle processing thread lazily and @@ -333,31 +335,41 @@ private void takeSample(long currentTimeMillis, long millisSinceLastSample) { transitionsAtLastSample = transitionsAtThisSample; } else { long lullTimeMs = currentTimeMillis - lastTransitionTime.get(); - Thread thread = trackedThread.get(); if (lullTimeMs > MAX_LULL_TIME_MS) { - if (thread == null) { - LOG.warn( - String.format( - "Operation ongoing in bundle %s for at least %s without outputting or completing (stack trace unable to be generated).", - processBundleId.get(), - DURATION_FORMATTER.print(Duration.millis(lullTimeMs).toPeriod()))); - } else if (currentExecutionState == null) { - LOG.warn( - String.format( - "Operation ongoing in bundle %s for at least %s without outputting or completing:%n at %s", - processBundleId.get(), - DURATION_FORMATTER.print(Duration.millis(lullTimeMs).toPeriod()), - Joiner.on("\n at ").join(thread.getStackTrace()))); - } else { - LOG.warn( - String.format( - "Operation ongoing in bundle %s for PTransform{id=%s, name=%s, state=%s} for at least %s without outputting or completing:%n at %s", - processBundleId.get(), - currentExecutionState.ptransformId, - currentExecutionState.ptransformUniqueName, - currentExecutionState.stateName, - DURATION_FORMATTER.print(Duration.millis(lullTimeMs).toPeriod()), - Joiner.on("\n at ").join(thread.getStackTrace()))); + if (lullTimeMs < lastLullReport // This must be a new report. + || lullTimeMs > 1.2 * lastLullReport // Exponential backoff. 
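+ // i.e. re-log only once the lull has grown by a further 20% since the last report.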
+ || lullTimeMs + > MAX_LULL_TIME_MS + lastLullReport // At least once every MAX_LULL_TIME_MS. + ) { + lastLullReport = lullTimeMs; + Thread thread = trackedThread.get(); + if (thread == null) { + LOG.warn( + String.format( + "Operation ongoing in bundle %s for at least %s without outputting " + + "or completing (stack trace unable to be generated).", + processBundleId.get(), + DURATION_FORMATTER.print(Duration.millis(lullTimeMs).toPeriod()))); + } else if (currentExecutionState == null) { + LOG.warn( + String.format( + "Operation ongoing in bundle %s for at least %s without outputting " + + "or completing:%n at %s", + processBundleId.get(), + DURATION_FORMATTER.print(Duration.millis(lullTimeMs).toPeriod()), + Joiner.on("\n at ").join(thread.getStackTrace()))); + } else { + LOG.warn( + String.format( + "Operation ongoing in bundle %s for PTransform{id=%s, name=%s, state=%s} " + + "for at least %s without outputting or completing:%n at %s", + processBundleId.get(), + currentExecutionState.ptransformId, + currentExecutionState.ptransformUniqueName, + currentExecutionState.stateName, + DURATION_FORMATTER.print(Duration.millis(lullTimeMs).toPeriod()), + Joiner.on("\n at ").join(thread.getStackTrace()))); + } } } } diff --git a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/logging/BeamFnLoggingClient.java b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/logging/BeamFnLoggingClient.java index 8fa074b047683..c16296be717da 100644 --- a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/logging/BeamFnLoggingClient.java +++ b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/logging/BeamFnLoggingClient.java @@ -408,11 +408,15 @@ public void publish(LogRecord record) { if (severity == null) { return; } + if (record == null) { + return; + } + String messageString = getFormatter().formatMessage(record); BeamFnApi.LogEntry.Builder builder = BeamFnApi.LogEntry.newBuilder() .setSeverity(severity) - .setMessage(getFormatter().formatMessage(record)) + .setMessage(messageString == null ? "null" : messageString) .setThread(Integer.toString(record.getThreadID())) .setTimestamp( Timestamp.newBuilder() diff --git a/sdks/java/io/cassandra/src/main/java/org/apache/beam/sdk/io/cassandra/CassandraIO.java b/sdks/java/io/cassandra/src/main/java/org/apache/beam/sdk/io/cassandra/CassandraIO.java index d33642b9c3ab7..1429253d1948a 100644 --- a/sdks/java/io/cassandra/src/main/java/org/apache/beam/sdk/io/cassandra/CassandraIO.java +++ b/sdks/java/io/cassandra/src/main/java/org/apache/beam/sdk/io/cassandra/CassandraIO.java @@ -24,6 +24,7 @@ import com.datastax.driver.core.ConsistencyLevel; import com.datastax.driver.core.PlainTextAuthProvider; import com.datastax.driver.core.QueryOptions; +import com.datastax.driver.core.SSLOptions; import com.datastax.driver.core.Session; import com.datastax.driver.core.SocketOptions; import com.datastax.driver.core.policies.DCAwareRoundRobinPolicy; @@ -192,6 +193,9 @@ public abstract static class Read extends PTransform> @Nullable abstract ValueProvider> ringRanges(); + @Nullable + abstract ValueProvider sslOptions(); + abstract Builder builder(); /** Specify the hosts of the Apache Cassandra instances. */ @@ -385,6 +389,22 @@ public Read withRingRanges(ValueProvider> ringRange) { return builder().setRingRanges(ringRange).build(); } + /** + * Optionally, specify {@link SSLOptions} configuration to utilize SSL. 
See + * https://docs.datastax.com/en/developer/java-driver/3.11/manual/ssl/#jsse-programmatic + */ + public Read withSsl(SSLOptions sslOptions) { + return withSsl(ValueProvider.StaticValueProvider.of(sslOptions)); + } + + /** + * Optionally, specify {@link SSLOptions} configuration to utilize SSL. See + * https://docs.datastax.com/en/developer/java-driver/3.11/manual/ssl/#jsse-programmatic + */ + public Read withSsl(ValueProvider sslOptions) { + return builder().setSslOptions(sslOptions).build(); + } + @Override public PCollection expand(PBegin input) { checkArgument((hosts() != null && port() != null), "WithHosts() and withPort() are required"); @@ -422,7 +442,8 @@ private static Set getRingRanges(Read read) { read.localDc(), read.consistencyLevel(), read.connectTimeout(), - read.readTimeout())) { + read.readTimeout(), + read.sslOptions())) { if (isMurmur3Partitioner(cluster)) { LOG.info("Murmur3Partitioner detected, splitting"); Integer splitCount; @@ -495,6 +516,8 @@ abstract static class Builder { abstract Builder setRingRanges(ValueProvider> ringRange); + abstract Builder setSslOptions(ValueProvider sslOptions); + abstract Read autoBuild(); public Read build() { @@ -543,6 +566,8 @@ public abstract static class Write extends PTransform, PDone> abstract @Nullable ValueProvider readTimeout(); + abstract @Nullable ValueProvider sslOptions(); + abstract @Nullable SerializableFunction mapperFactoryFn(); abstract Builder builder(); @@ -725,6 +750,22 @@ public Write withMapperFactoryFn(SerializableFunction mapper return builder().setMapperFactoryFn(mapperFactoryFn).build(); } + /** + * Optionally, specify {@link SSLOptions} configuration to utilize SSL. See + * https://docs.datastax.com/en/developer/java-driver/3.11/manual/ssl/#jsse-programmatic + */ + public Write withSsl(SSLOptions sslOptions) { + return withSsl(ValueProvider.StaticValueProvider.of(sslOptions)); + } + + /** + * Optionally, specify {@link SSLOptions} configuration to utilize SSL. 
See + * https://docs.datastax.com/en/developer/java-driver/3.11/manual/ssl/#jsse-programmatic + */ + public Write withSsl(ValueProvider sslOptions) { + return builder().setSslOptions(sslOptions).build(); + } + @Override public void validate(PipelineOptions pipelineOptions) { checkState( @@ -799,6 +840,8 @@ abstract static class Builder { abstract Optional> mapperFactoryFn(); + abstract Builder setSslOptions(ValueProvider sslOptions); + abstract Write autoBuild(); // not public public Write build() { @@ -880,7 +923,8 @@ static Cluster getCluster( ValueProvider localDc, ValueProvider consistencyLevel, ValueProvider connectTimeout, - ValueProvider readTimeout) { + ValueProvider readTimeout, + ValueProvider sslOptions) { Cluster.Builder builder = Cluster.builder().addContactPoints(hosts.get().toArray(new String[0])).withPort(port.get()); @@ -913,6 +957,10 @@ static Cluster getCluster( socketOptions.setReadTimeoutMillis(readTimeout.get()); } + if (sslOptions != null) { + builder.withSSL(sslOptions.get()); + } + return builder.build(); } @@ -941,7 +989,8 @@ private static class Mutator { spec.localDc(), spec.consistencyLevel(), spec.connectTimeout(), - spec.readTimeout()); + spec.readTimeout(), + spec.sslOptions()); this.session = cluster.connect(spec.keyspace().get()); this.mapperFactoryFn = spec.mapperFactoryFn(); this.mutateFutures = new ArrayList<>(); diff --git a/sdks/java/io/cassandra/src/main/java/org/apache/beam/sdk/io/cassandra/ConnectionManager.java b/sdks/java/io/cassandra/src/main/java/org/apache/beam/sdk/io/cassandra/ConnectionManager.java index 21e7d257dcaa3..962e8ad8ec004 100644 --- a/sdks/java/io/cassandra/src/main/java/org/apache/beam/sdk/io/cassandra/ConnectionManager.java +++ b/sdks/java/io/cassandra/src/main/java/org/apache/beam/sdk/io/cassandra/ConnectionManager.java @@ -71,7 +71,8 @@ static Session getSession(Read read) { read.localDc(), read.consistencyLevel(), read.connectTimeout(), - read.readTimeout())); + read.readTimeout(), + read.sslOptions())); return sessionMap.computeIfAbsent( readToSessionHash(read), k -> cluster.connect(Objects.requireNonNull(read.keyspace()).get())); diff --git a/sdks/java/io/clickhouse/build.gradle b/sdks/java/io/clickhouse/build.gradle index d61dcbe2660af..d711fb7fa3165 100644 --- a/sdks/java/io/clickhouse/build.gradle +++ b/sdks/java/io/clickhouse/build.gradle @@ -66,6 +66,7 @@ dependencies { testImplementation library.java.testcontainers_clickhouse testRuntimeOnly library.java.slf4j_jdk14 testRuntimeOnly project(path: ":runners:direct-java", configuration: "shadow") + testImplementation project(path: ":sdks:java:extensions:avro", configuration: "testRuntimeMigration") } processTestResources { diff --git a/sdks/java/io/contextualtextio/build.gradle b/sdks/java/io/contextualtextio/build.gradle index c54ce15bb6c01..0557a1dfa259b 100644 --- a/sdks/java/io/contextualtextio/build.gradle +++ b/sdks/java/io/contextualtextio/build.gradle @@ -33,6 +33,7 @@ dependencies { implementation library.java.vendored_grpc_1_54_0 testImplementation project(path: ":sdks:java:core", configuration: "shadowTest") + testImplementation library.java.commons_compress testImplementation library.java.guava_testlib testImplementation library.java.junit testImplementation library.java.hamcrest diff --git a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/providers/CsvWriteTransformProvider.java b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/providers/CsvWriteTransformProvider.java new file mode 100644 index 0000000000000..4e07a06197f57 --- /dev/null 
+++ b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/providers/CsvWriteTransformProvider.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.csv.providers; + +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; + +import com.google.auto.service.AutoService; +import com.google.auto.value.AutoValue; +import java.util.Collections; +import java.util.List; +import org.apache.beam.sdk.io.WriteFilesResult; +import org.apache.beam.sdk.io.csv.CsvIO; +import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.Schema.Field; +import org.apache.beam.sdk.schemas.Schema.FieldType; +import org.apache.beam.sdk.schemas.annotations.DefaultSchema; +import org.apache.beam.sdk.schemas.annotations.SchemaFieldDescription; +import org.apache.beam.sdk.schemas.transforms.SchemaTransform; +import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; +import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.values.PCollectionRowTuple; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TypeDescriptors; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; +import org.apache.commons.csv.CSVFormat; + +/** + * An implementation of {@link TypedSchemaTransformProvider} for {@link CsvIO#write}. + * + *
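<p>A configuration sketch (the output path is an illustrative placeholder):
 + *
 + * <pre>{@code
 + * CsvWriteConfiguration config =
 + *     CsvWriteConfiguration.builder().setPath("gs://my-bucket/output/users").build();
 + * }</pre>
 + *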
<p>
Internal only: This class is actively being worked on, and it will likely change. We + * provide no backwards compatibility guarantees, and it should not be implemented outside the Beam + * repository. + */ +@SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) +}) +@AutoService(SchemaTransformProvider.class) +public class CsvWriteTransformProvider + extends TypedSchemaTransformProvider { + private static final String INPUT_ROWS_TAG = "input"; + private static final String WRITE_RESULTS = "output"; + + @Override + protected Class configurationClass() { + return CsvWriteConfiguration.class; + } + + @Override + protected SchemaTransform from(CsvWriteConfiguration configuration) { + return new CsvWriteTransform(configuration); + } + + @Override + public String identifier() { + return String.format("beam:schematransform:org.apache.beam:csv_write:v1"); + } + + @Override + public List inputCollectionNames() { + return Collections.singletonList(INPUT_ROWS_TAG); + } + + @Override + public List outputCollectionNames() { + return Collections.singletonList(WRITE_RESULTS); + } + + /** Configuration for writing to BigQuery with Storage Write API. */ + @DefaultSchema(AutoValueSchema.class) + @AutoValue + public abstract static class CsvWriteConfiguration { + + public void validate() { + checkArgument( + !Strings.isNullOrEmpty(this.getPath()), "Path for a CSV Write must be specified."); + } + + public static Builder builder() { + return new AutoValue_CsvWriteTransformProvider_CsvWriteConfiguration.Builder(); + } + + @SchemaFieldDescription("The file path to write to.") + public abstract String getPath(); + + /** Builder for {@link CsvWriteConfiguration}. */ + @AutoValue.Builder + public abstract static class Builder { + + public abstract Builder setPath(String path); + + /** Builds a {@link CsvWriteConfiguration} instance. */ + public abstract CsvWriteConfiguration build(); + } + } + + /** A {@link SchemaTransform} for {@link CsvIO#write}. */ + protected static class CsvWriteTransform extends SchemaTransform { + + private final CsvWriteConfiguration configuration; + + CsvWriteTransform(CsvWriteConfiguration configuration) { + configuration.validate(); + this.configuration = configuration; + } + + @Override + public PCollectionRowTuple expand(PCollectionRowTuple input) { + WriteFilesResult result = + input + .get(INPUT_ROWS_TAG) + .apply(CsvIO.writeRows(configuration.getPath(), CSVFormat.DEFAULT).withSuffix("")); + Schema outputSchema = Schema.of(Field.of("filename", FieldType.STRING)); + return PCollectionRowTuple.of( + WRITE_RESULTS, + result + .getPerDestinationOutputFilenames() + .apply( + "Collect filenames", + MapElements.into(TypeDescriptors.rows()) + .via( + (destinationAndRow) -> + Row.withSchema(outputSchema) + .withFieldValue("filename", destinationAndRow.getValue()) + .build())) + .setRowSchema(outputSchema)); + } + } +} diff --git a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/providers/package-info.java b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/providers/package-info.java new file mode 100644 index 0000000000000..646e69b7cb8c0 --- /dev/null +++ b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/providers/package-info.java @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Transforms for reading and writing CSV files. */ +package org.apache.beam.sdk.io.csv.providers; diff --git a/sdks/java/io/google-cloud-platform/build.gradle b/sdks/java/io/google-cloud-platform/build.gradle index 560b27aae1628..b0122035a0152 100644 --- a/sdks/java/io/google-cloud-platform/build.gradle +++ b/sdks/java/io/google-cloud-platform/build.gradle @@ -186,13 +186,15 @@ task integrationTest(type: Test, dependsOn: processTestResources) { def gcpProject = project.findProperty('gcpProject') ?: 'apache-beam-testing' def gcpTempRoot = project.findProperty('gcpTempRoot') ?: 'gs://temp-storage-for-end-to-end-tests' def firestoreDb = project.findProperty('firestoreDb') ?: 'firestoredb' - def host = project.findProperty('host') ?: 'batch-firestore.googleapis.com:443' + def firestoreHost = project.findProperty('firestoreHost') ?: 'batch-firestore.googleapis.com:443' + def bigtableChangeStreamInstanceId = project.findProperty('bigtableChangeStreamInstanceId') ?: 'beam-test' systemProperty "beamTestPipelineOptions", JsonOutput.toJson([ "--runner=DirectRunner", "--project=${gcpProject}", "--tempRoot=${gcpTempRoot}", "--firestoreDb=${firestoreDb}", - "--host=${host}", + "--firestoreHost=${firestoreHost}", + "--bigtableChangeStreamInstanceId=${bigtableChangeStreamInstanceId}", ]) // Disable Gradle cache: these ITs interact with live service that should always be considered "out of date" @@ -202,10 +204,8 @@ task integrationTest(type: Test, dependsOn: processTestResources) { exclude '**/BigQueryIOReadIT.class' exclude '**/BigQueryIOStorageQueryIT.class' exclude '**/BigQueryIOStorageReadIT.class' - exclude '**/BigQueryIOStorageReadTableRowIT.class' exclude '**/BigQueryIOStorageWriteIT.class' exclude '**/BigQueryToTableIT.class' - exclude '**/BigQueryIOJsonTest.class' maxParallelForks 4 classpath = sourceSets.test.runtimeClasspath @@ -222,14 +222,14 @@ task integrationTestKms(type: Test) { def gcpTempRoot = project.findProperty('gcpTempRootKms') ?: 'gs://temp-storage-for-end-to-end-tests-cmek' def dataflowKmsKey = project.findProperty('dataflowKmsKey') ?: "projects/apache-beam-testing/locations/global/keyRings/beam-it/cryptoKeys/test" def firestoreDb = project.findProperty('firestoreDb') ?: 'firestoredb' - def host = project.findProperty('host') ?: 'batch-firestore.googleapis.com:443' + def firestoreHost = project.findProperty('firestoreHost') ?: 'batch-firestore.googleapis.com:443' systemProperty "beamTestPipelineOptions", JsonOutput.toJson([ "--runner=DirectRunner", "--project=${gcpProject}", "--tempRoot=${gcpTempRoot}", "--dataflowKmsKey=${dataflowKmsKey}", "--firestoreDb=${firestoreDb}", - "--host=${host}", + "--firestoreHost=${firestoreHost}", ]) // Disable Gradle cache: these ITs interact with live service that should always be considered "out of date" @@ -244,6 +244,48 @@ task integrationTestKms(type: Test) { } } +/* + Integration tests for BigQueryIO that run on BigQuery's early rollout region (us-east7) + 
with the intended purpose of catching breaking changes from new BigQuery releases. + If these tests fail here but not in `Java_GCP_IO_Direct`, there may be a new BigQuery change + that is breaking the connector. If this is the case, we should verify with the appropriate + BigQuery infrastructure API team. + + To test in a BigQuery location, we just need to create our datasets in that location. + */ +task bigQueryEarlyRolloutIntegrationTest(type: Test, dependsOn: processTestResources) { + group = "Verification" + def gcpProject = project.findProperty('gcpProject') ?: 'apache-beam-testing' + def gcpTempRoot = project.findProperty('gcpTempRoot') ?: 'gs://temp-storage-for-bigquery-day0-tests' + systemProperty "beamTestPipelineOptions", JsonOutput.toJson([ + "--runner=DirectRunner", + "--project=${gcpProject}", + "--tempRoot=${gcpTempRoot}", + "--bigQueryLocation=us-east7", + ]) + + outputs.upToDateWhen { false } + + // export and direct read + include '**/BigQueryToTableIT.class' + include '**/BigQueryIOJsonIT.class' + include '**/BigQueryIOStorageReadTableRowIT.class' + // storage write api + include '**/StorageApiDirectWriteProtosIT.class' + include '**/StorageApiSinkFailedRowsIT.class' + include '**/StorageApiSinkRowUpdateIT.class' + include '**/StorageApiSinkSchemaUpdateIT.class' + include '**/TableRowToStorageApiProtoIT.class' + // file loads + include '**/BigQuerySchemaUpdateOptionsIT.class' + include '**/BigQueryTimePartitioningClusteringIT.class' + include '**/FileLoadsStreamingIT.class' + + maxParallelForks 4 + classpath = sourceSets.test.runtimeClasspath + testClassesDirs = sourceSets.test.output.classesDirs +} + // path(s) for Cloud Spanner related classes def spannerIncludes = [ '**/org/apache/beam/sdk/io/gcp/spanner/**', @@ -267,8 +309,8 @@ task spannerCodeCoverageReport(type: JacocoReport, dependsOn: test) { sourceDirectories.setFrom(files(project.sourceSets.main.allSource.srcDirs)) executionData.setFrom(file("${buildDir}/jacoco/test.exec")) reports { - html.enabled true - html.destination file("${buildDir}/reports/jacoco/spanner/") + html.getRequired().set(true) + html.getOutputLocation().set(file("${buildDir}/reports/jacoco/spanner/")) } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/AppendClientInfo.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/AppendClientInfo.java index 46c25d47e7a89..9210f305eca72 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/AppendClientInfo.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/AppendClientInfo.java @@ -20,6 +20,7 @@ import com.google.api.services.bigquery.model.TableRow; import com.google.auto.value.AutoValue; import com.google.auto.value.extension.memoized.Memoized; +import com.google.cloud.bigquery.storage.v1.AppendRowsRequest; import com.google.cloud.bigquery.storage.v1.TableSchema; import com.google.protobuf.ByteString; import com.google.protobuf.DescriptorProtos; @@ -106,7 +107,8 @@ public AppendClientInfo withNoAppendClient() { public AppendClientInfo withAppendClient( BigQueryServices.DatasetService datasetService, Supplier getStreamName, - boolean useConnectionPool) + boolean useConnectionPool, + AppendRowsRequest.MissingValueInterpretation missingValueInterpretation) throws Exception { if (getStreamAppendClient() != null) { return this; @@ -115,7 +117,8 @@ public AppendClientInfo withAppendClient( return toBuilder() 
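// The missing-value interpretation configured on the sink is threaded through to the
// underlying StreamAppendClient below.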
.setStreamName(streamName) .setStreamAppendClient( - datasetService.getStreamAppendClient(streamName, getDescriptor(), useConnectionPool)) + datasetService.getStreamAppendClient( + streamName, getDescriptor(), useConnectionPool, missingValueInterpretation)) .build(); } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java index 58d769312444d..2ea5d1c292021 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java @@ -37,6 +37,7 @@ import com.google.api.services.bigquery.model.TableSchema; import com.google.api.services.bigquery.model.TimePartitioning; import com.google.auto.value.AutoValue; +import com.google.cloud.bigquery.storage.v1.AppendRowsRequest; import com.google.cloud.bigquery.storage.v1.CreateReadSessionRequest; import com.google.cloud.bigquery.storage.v1.DataFormat; import com.google.cloud.bigquery.storage.v1.ReadSession; @@ -485,8 +486,11 @@ *
<h3>Upserts and deletes</h3>
* * The connector also supports streaming row updates to BigQuery, with the following qualifications: - * - The CREATE_IF_NEEDED CreateDisposition is not supported. Tables must be precreated with primary - * keys. - Only the STORAGE_WRITE_API_AT_LEAST_ONCE method is supported. + * + *
<p>
- Only the STORAGE_WRITE_API_AT_LEAST_ONCE method is supported. + * + *
<p>
- If the table is not previously created and CREATE_IF_NEEDED is used, a primary key must be + * specified using {@link Write#withPrimaryKey}. * *
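<p>A sketch of applying upserts and deletes (the table, schema, and key column are
 * illustrative; {@code mutations} is assumed to be a {@code PCollection<RowMutation>}):
 *
 * <pre>{@code
 * mutations.apply(
 *     BigQueryIO.applyRowMutations()
 *         .to("my-project:my_dataset.my_table")
 *         .withSchema(tableSchema)
 *         .withPrimaryKey(ImmutableList.of("id"))
 *         .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED));
 * }</pre>
 *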
<p>
Two types of updates are supported. UPSERT replaces the row with the matching primary key or * inserts the row if none exists. DELETE removes the row with the matching primary key. Row inserts @@ -534,8 +538,8 @@ * } * *
<p>
Note that in order to use inserts or deletes, the table must be set up with a primary key. If - * the table is not previously created and CREATE_IF_NEEDED is used, a primary key must be - * specified. + * the table is not previously created and CREATE_IF_NEEDED is used, a primary key must be specified + * using {@link Write#withPrimaryKey}. */ @SuppressWarnings({ "nullness" // TODO(https://github.com/apache/beam/issues/20506) }) @@ -2143,6 +2147,8 @@ public static Write write() { .setMaxRetryJobs(1000) .setPropagateSuccessfulStorageApiWrites(false) .setDirectWriteProtos(true) + .setDefaultMissingValueInterpretation( + AppendRowsRequest.MissingValueInterpretation.DEFAULT_VALUE) .build(); } @@ -2164,9 +2170,10 @@ public static Write writeTableRows() { * apply row updates; directly calling {@link Write#withRowMutationInformationFn} is preferred * when writing non TableRows types (e.g. {@link #writeGenericRecords} or a custom user type). * - *
<p>
This is only supported when using the {@link Write.Method#STORAGE_API_AT_LEAST_ONCE} insert - * method and {@link Write.CreateDisposition#CREATE_NEVER}. The tables must be precreated with a - * primary key. + *
<p>
This is supported when using the {@link Write.Method#STORAGE_API_AT_LEAST_ONCE} insert + * method, and with either {@link Write.CreateDisposition#CREATE_NEVER} or {@link + * Write.CreateDisposition#CREATE_IF_NEEDED}. For CREATE_IF_NEEDED, a primary key must be + * specified using {@link Write#withPrimaryKey}. */ public static Write applyRowMutations() { return BigQueryIO.write() @@ -2327,6 +2334,8 @@ public enum Method { abstract @Nullable List getPrimaryKey(); + abstract AppendRowsRequest.MissingValueInterpretation getDefaultMissingValueInterpretation(); + abstract Boolean getOptimizeWrites(); abstract Boolean getUseBeamSchema(); @@ -2429,6 +2438,9 @@ abstract Builder setPropagateSuccessfulStorageApiWrites( abstract Builder setPrimaryKey(@Nullable List primaryKey); + abstract Builder setDefaultMissingValueInterpretation( + AppendRowsRequest.MissingValueInterpretation missingValueInterpretation); + abstract Builder setOptimizeWrites(Boolean optimizeWrites); abstract Builder setUseBeamSchema(Boolean useBeamSchema); @@ -2499,6 +2511,8 @@ public enum WriteDisposition { *
<p>
The replacement may occur in multiple steps - for instance by first removing the * existing table, then creating a replacement, then filling it in. This is not an atomic * operation, and external programs may see the table in any of these intermediate steps. + * + *
<p>
Note: This write disposition is only supported for the FILE_LOADS write method. */ WRITE_TRUNCATE, @@ -2816,9 +2830,10 @@ public Write withMethod(Write.Method method) { * function that determines how a row is applied to BigQuery (upsert, or delete) along with a * sequence number for ordering operations. * - *
<p>
This is only supported when using the {@link Write.Method#STORAGE_API_AT_LEAST_ONCE} - * insert method and {@link Write.CreateDisposition#CREATE_NEVER}. The tables must be precreated - * with a primary key. + *
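<p>A sketch with a hypothetical user type {@code MyEvent} carrying its own change kind and
 + * sequence number (all names here are illustrative):
 + *
 + * <pre>{@code
 + * events.apply(
 + *     BigQueryIO.<MyEvent>write()
 + *         .to("my-project:my_dataset.events")
 + *         .useBeamSchema()
 + *         .withMethod(Write.Method.STORAGE_API_AT_LEAST_ONCE)
 + *         .withRowMutationInformationFn(
 + *             e ->
 + *                 RowMutationInformation.of(
 + *                     e.isDelete()
 + *                         ? RowMutationInformation.MutationType.DELETE
 + *                         : RowMutationInformation.MutationType.UPSERT,
 + *                     e.getSequenceNumber())));
 + * }</pre>
 + *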
<p>
This is supported when using the {@link Write.Method#STORAGE_API_AT_LEAST_ONCE} insert + * method, and with either {@link Write.CreateDisposition#CREATE_NEVER} or {@link + * Write.CreateDisposition#CREATE_IF_NEEDED}. For CREATE_IF_NEEDED, a primary key must be + * specified using {@link Write#withPrimaryKey}. */ public Write withRowMutationInformationFn( SerializableFunction<T, RowMutationInformation> updateFn) { @@ -2962,6 +2977,21 @@ public Write withPrimaryKey(List<String> primaryKey) { return toBuilder().setPrimaryKey(primaryKey).build(); } + /** + * Specify how missing values should be interpreted when there is a default value in the schema. + * Options are to take the default value or to write an explicit null (not an option if the + * field is also required). Note: this is only used when using one of the storage write API + * insert methods. + */ + public Write withDefaultMissingValueInterpretation( + AppendRowsRequest.MissingValueInterpretation missingValueInterpretation) { + checkArgument( + missingValueInterpretation == AppendRowsRequest.MissingValueInterpretation.DEFAULT_VALUE + || missingValueInterpretation + == AppendRowsRequest.MissingValueInterpretation.NULL_VALUE); + return toBuilder().setDefaultMissingValueInterpretation(missingValueInterpretation).build(); + } + /** * If true, enables new codepaths that are expected to use less resources while writing to * BigQuery. Not enabled by default in order to maintain backwards compatibility. @@ -3250,7 +3280,7 @@ && getStorageApiTriggeringFrequency(bqOptions) != null) { checkArgument(getNumFileShards() == 0, "Number of file shards" + error); if (getStorageApiTriggeringFrequency(bqOptions) != null) { - LOG.warn("Storage API triggering frequency" + error); + LOG.warn("Setting a triggering frequency" + error); } if (getStorageApiNumStreams(bqOptions) != 0) { LOG.warn("Setting the number of Storage API streams" + error); @@ -3266,6 +3296,8 @@ && getStorageApiTriggeringFrequency(bqOptions) != null) { checkArgument( !getAutoSchemaUpdate(), "withAutoSchemaUpdate only supported when using STORAGE_WRITE_API or STORAGE_API_AT_LEAST_ONCE."); + } else if (getWriteDisposition() == WriteDisposition.WRITE_TRUNCATE) { + LOG.error("The Storage API sink does not support the WRITE_TRUNCATE write disposition."); } if (getRowMutationInformationFn() != null) { checkArgument(getMethod() == Method.STORAGE_API_AT_LEAST_ONCE); @@ -3681,7 +3713,8 @@ private WriteResult continueExpandTyped( getAutoSchemaUpdate(), getIgnoreUnknownValues(), getPropagateSuccessfulStorageApiWrites(), - getRowMutationInformationFn() != null); + getRowMutationInformationFn() != null, + getDefaultMissingValueInterpretation()); return input.apply("StorageApiLoads", storageApiLoads); } else { throw new RuntimeException("Unexpected write method " + method); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOMetadata.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOMetadata.java index ee64a7ab9ddba..1893418dedb34 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOMetadata.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOMetadata.java @@ -28,8 +28,15 @@ final class BigQueryIOMetadata { private @Nullable String beamJobId; - private BigQueryIOMetadata(@Nullable String beamJobId) { + private @Nullable String beamJobName; + + private @Nullable String beamWorkerId; + + private BigQueryIOMetadata(
@Nullable String beamJobId, @Nullable String beamJobName, @Nullable String beamWorkerId) { this.beamJobId = beamJobId; + this.beamJobName = beamJobName; + this.beamWorkerId = beamWorkerId; } private static final Pattern VALID_CLOUD_LABEL_PATTERN = @@ -41,17 +48,24 @@ private BigQueryIOMetadata(@Nullable String beamJobId) { */ public static BigQueryIOMetadata create() { String dataflowJobId = GceMetadataUtil.fetchDataflowJobId(); + String dataflowJobName = GceMetadataUtil.fetchDataflowJobName(); + String dataflowWorkerId = GceMetadataUtil.fetchDataflowWorkerId(); + // If a Dataflow job id is returned on GCE metadata. Then it means // this program is running on a Dataflow GCE VM. - boolean isDataflowRunner = dataflowJobId != null && !dataflowJobId.isEmpty(); + boolean isDataflowRunner = !dataflowJobId.isEmpty(); String beamJobId = null; + String beamJobName = null; + String beamWorkerId = null; if (isDataflowRunner) { if (BigQueryIOMetadata.isValidCloudLabel(dataflowJobId)) { beamJobId = dataflowJobId; + beamJobName = dataflowJobName; + beamWorkerId = dataflowWorkerId; } } - return new BigQueryIOMetadata(beamJobId); + return new BigQueryIOMetadata(beamJobId, beamJobName, beamWorkerId); } public Map addAdditionalJobLabels(Map jobLabels) { @@ -68,6 +82,20 @@ public Map addAdditionalJobLabels(Map jobLabels) return this.beamJobId; } + /* + * Returns the beam job name. Can be null if it is not running on Dataflow. + */ + public @Nullable String getBeamJobName() { + return this.beamJobName; + } + + /* + * Returns the beam worker id. Can be null if it is not running on Dataflow. + */ + public @Nullable String getBeamWorkerId() { + return this.beamWorkerId; + } + /** * Returns true if label_value is a valid cloud label string. This function can return false in * cases where the label value is valid. However, it will not return true in a case where the diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServices.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServices.java index 1cc9049a542d9..c9c96eb35f3fd 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServices.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServices.java @@ -30,6 +30,7 @@ import com.google.api.services.bigquery.model.Table; import com.google.api.services.bigquery.model.TableReference; import com.google.api.services.bigquery.model.TableRow; +import com.google.cloud.bigquery.storage.v1.AppendRowsRequest; import com.google.cloud.bigquery.storage.v1.AppendRowsResponse; import com.google.cloud.bigquery.storage.v1.BatchCommitWriteStreamsResponse; import com.google.cloud.bigquery.storage.v1.CreateReadSessionRequest; @@ -213,7 +214,10 @@ WriteStream createWriteStream(String tableUrn, WriteStream.Type type) * first. */ StreamAppendClient getStreamAppendClient( - String streamName, DescriptorProtos.DescriptorProto descriptor, boolean useConnectionPool) + String streamName, + DescriptorProtos.DescriptorProto descriptor, + boolean useConnectionPool, + AppendRowsRequest.MissingValueInterpretation missingValueInterpretation) throws Exception; /** Flush a given stream up to the given offset. The stream must have type BUFFERED. 
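For context, a minimal usage sketch of the new default missing-value interpretation as seen from the public API. The pipeline, destination table, and input row below are hypothetical; only withDefaultMissingValueInterpretation and the two accepted MissingValueInterpretation constants come from this change:

    import com.google.api.services.bigquery.model.TableRow;
    import com.google.cloud.bigquery.storage.v1.AppendRowsRequest;
    import org.apache.beam.sdk.Pipeline;
    import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
    import org.apache.beam.sdk.io.gcp.bigquery.TableRowJsonCoder;
    import org.apache.beam.sdk.transforms.Create;

    public class MissingValueInterpretationExample {
      public static void main(String[] args) {
        Pipeline p = Pipeline.create();
        // The "age" column is deliberately left unset on this row.
        p.apply(Create.of(new TableRow().set("name", "a")).withCoder(TableRowJsonCoder.of()))
            .apply(
                BigQueryIO.writeTableRows()
                    .to("my-project:my_dataset.my_table") // hypothetical destination
                    .withMethod(BigQueryIO.Write.Method.STORAGE_WRITE_API)
                    // Missing fields take the column's default value expression
                    // rather than an explicit NULL.
                    .withDefaultMissingValueInterpretation(
                        AppendRowsRequest.MissingValueInterpretation.DEFAULT_VALUE));
        p.run().waitUntilFinish();
      }
    }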
*/ diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java index 17b5c5ebd99b1..b6d5eefe715e0 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java @@ -61,6 +61,7 @@ import com.google.api.services.bigquery.model.TableRow; import com.google.auth.Credentials; import com.google.auth.http.HttpCredentialsAdapter; +import com.google.cloud.bigquery.storage.v1.AppendRowsRequest; import com.google.cloud.bigquery.storage.v1.AppendRowsResponse; import com.google.cloud.bigquery.storage.v1.BatchCommitWriteStreamsRequest; import com.google.cloud.bigquery.storage.v1.BatchCommitWriteStreamsResponse; @@ -154,7 +155,7 @@ "nullness", // TODO(https://github.com/apache/beam/issues/20506) "keyfor" }) -class BigQueryServicesImpl implements BigQueryServices { +public class BigQueryServicesImpl implements BigQueryServices { private static final Logger LOG = LoggerFactory.getLogger(BigQueryServicesImpl.class); // The maximum number of retries to execute a BigQuery RPC. @@ -549,7 +550,7 @@ public void close() throws Exception {} } @VisibleForTesting - static class DatasetServiceImpl implements DatasetService { + public static class DatasetServiceImpl implements DatasetService { // Backoff: 200ms * 1.5 ^ n, n=[1,5] private static final FluentBackoff INSERT_BACKOFF_FACTORY = FluentBackoff.DEFAULT.withInitialBackoff(Duration.millis(200)).withMaxRetries(5); @@ -610,7 +611,7 @@ static class DatasetServiceImpl implements DatasetService { this.executor = null; } - private DatasetServiceImpl(BigQueryOptions bqOptions) { + public DatasetServiceImpl(BigQueryOptions bqOptions) { this.errorExtractor = new ApiErrorExtractor(); this.client = newBigQueryClient(bqOptions).build(); this.newWriteClient = newBigQueryWriteClient(bqOptions); @@ -1352,7 +1353,10 @@ public WriteStream createWriteStream(String tableUrn, WriteStream.Type type) @Override public StreamAppendClient getStreamAppendClient( - String streamName, DescriptorProtos.DescriptorProto descriptor, boolean useConnectionPool) + String streamName, + DescriptorProtos.DescriptorProto descriptor, + boolean useConnectionPool, + AppendRowsRequest.MissingValueInterpretation missingValueInterpretation) throws Exception { ProtoSchema protoSchema = ProtoSchema.newBuilder().setProtoDescriptor(descriptor).build(); @@ -1364,6 +1368,15 @@ public StreamAppendClient getStreamAppendClient( .setChannelsPerCpu(2) .build(); + String traceId = + String.format( + "Dataflow:%s:%s:%s", + bqIOMetadata.getBeamJobName() == null + ? options.getJobName() + : bqIOMetadata.getBeamJobName(), + bqIOMetadata.getBeamJobId() == null ? "" : bqIOMetadata.getBeamJobId(), + bqIOMetadata.getBeamWorkerId() == null ? "" : bqIOMetadata.getBeamWorkerId()); + StreamWriter streamWriter = StreamWriter.newBuilder(streamName, newWriteClient) .setExecutorProvider( @@ -1374,11 +1387,8 @@ public StreamAppendClient getStreamAppendClient( .setEnableConnectionPool(useConnectionPool) .setMaxInflightRequests(storageWriteMaxInflightRequests) .setMaxInflightBytes(storageWriteMaxInflightBytes) - .setTraceId( - "Dataflow:" - + (bqIOMetadata.getBeamJobId() != null - ? 
bqIOMetadata.getBeamJobId() - : options.getJobName())) + .setTraceId(traceId) + .setDefaultMissingValueInterpretation(missingValueInterpretation) .build(); return new StreamAppendClient() { private int pins = 0; diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtils.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtils.java index 00ee815c3c930..fa5ffae0909d4 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtils.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtils.java @@ -64,6 +64,7 @@ import org.apache.beam.sdk.transforms.SerializableFunctions; import org.apache.beam.sdk.util.Preconditions; import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; @@ -94,8 +95,15 @@ public class BigQueryUtils { // For parsing the format used to refer to tables parameters in BigQueryIO. // "{project_id}:{dataset_id}.{table_id}" or // "{project_id}.{dataset_id}.{table_id}" + // following documentation in + // https://cloud.google.com/resource-manager/docs/creating-managing-projects#before_you_begin, + // https://cloud.google.com/bigquery/docs/datasets#dataset-naming, and + // https://cloud.google.com/bigquery/docs/tables#table_naming private static final Pattern SIMPLE_TABLE_PATTERN = - Pattern.compile("^(?[^\\.:]+)[\\.:](?[^\\.:]+)[\\.](?[^\\.:]+)$"); + Pattern.compile( + "^(?[a-z][a-z0-9.\\-:]{4,28}[a-z0-9])[\\:.]" + + "(?[a-zA-Z0-9_]{1,1024})[\\.]" + + "(?
[\\p{L}\\p{M}\\p{N}\\p{Pc}\\p{Pd}\\p{Zs}$]{1,1024})$"); /** Options for how to convert BigQuery data to Beam data. */ @AutoValue @@ -1000,6 +1008,25 @@ private static Object convertAvroNumeric(Object value) { return null; } + /** + * @param tableReference - a BigQueryTableIdentifier that may or may not include the project. + * @return a String representation of the table destination in the form: + * `myproject.mydataset.mytable` + */ + public static @Nullable String toTableSpec(TableReference tableReference) { + if (tableReference.getDatasetId() == null || tableReference.getTableId() == null) { + throw new IllegalArgumentException( + String.format( + "Table reference [%s] must include at least a dataset and a table.", tableReference)); + } + String tableSpec = + String.format("%s.%s", tableReference.getDatasetId(), tableReference.getTableId()); + if (!Strings.isNullOrEmpty(tableReference.getProjectId())) { + tableSpec = String.format("%s.%s", tableReference.getProjectId(), tableSpec); + } + return tableSpec; + } + private static @Nullable ServiceCallMetric callMetricForMethod( @Nullable TableReference tableReference, String method) { if (tableReference != null) { diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiLoads.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiLoads.java index f9f57f71ba2ca..0227b80201292 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiLoads.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiLoads.java @@ -18,6 +18,7 @@ package org.apache.beam.sdk.io.gcp.bigquery; import com.google.api.services.bigquery.model.TableRow; +import com.google.cloud.bigquery.storage.v1.AppendRowsRequest; import java.nio.ByteBuffer; import java.util.concurrent.ThreadLocalRandom; import javax.annotation.Nullable; @@ -63,9 +64,10 @@ public class StorageApiLoads private final boolean allowAutosharding; private final boolean autoUpdateSchema; private final boolean ignoreUnknownValues; - private final boolean usesCdc; + private final AppendRowsRequest.MissingValueInterpretation defaultMissingValueInterpretation; + public StorageApiLoads( Coder destinationCoder, StorageApiDynamicDestinations dynamicDestinations, @@ -80,7 +82,8 @@ public StorageApiLoads( boolean autoUpdateSchema, boolean ignoreUnknownValues, boolean propagateSuccessfulStorageApiWrites, - boolean usesCdc) { + boolean usesCdc, + AppendRowsRequest.MissingValueInterpretation defaultMissingValueInterpretation) { this.destinationCoder = destinationCoder; this.dynamicDestinations = dynamicDestinations; this.rowUpdateFn = rowUpdateFn; @@ -97,6 +100,7 @@ public StorageApiLoads( this.successfulWrittenRowsTag = new TupleTag<>("successfulPublishedRowsTag"); } this.usesCdc = usesCdc; + this.defaultMissingValueInterpretation = defaultMissingValueInterpretation; } public TupleTag getFailedRowsTag() { @@ -156,7 +160,8 @@ public WriteResult expandInconsistent( ignoreUnknownValues, createDisposition, kmsKey, - usesCdc)); + usesCdc, + defaultMissingValueInterpretation)); PCollection insertErrors = PCollectionList.of(convertMessagesResult.get(failedRowsTag)) @@ -243,7 +248,8 @@ public WriteResult expandTriggered( failedRowsTag, successfulWrittenRowsTag, autoUpdateSchema, - ignoreUnknownValues)); + ignoreUnknownValues, + defaultMissingValueInterpretation)); PCollection insertErrors = 
PCollectionList.of(convertMessagesResult.get(failedRowsTag)) @@ -331,7 +337,8 @@ public WriteResult expandUntriggered( ignoreUnknownValues, createDisposition, kmsKey, - usesCdc)); + usesCdc, + defaultMissingValueInterpretation)); PCollection insertErrors = PCollectionList.of(convertMessagesResult.get(failedRowsTag)) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteRecordsInconsistent.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteRecordsInconsistent.java index 6a9997fffdb4a..022ee1fbed08d 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteRecordsInconsistent.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteRecordsInconsistent.java @@ -18,6 +18,7 @@ package org.apache.beam.sdk.io.gcp.bigquery; import com.google.api.services.bigquery.model.TableRow; +import com.google.cloud.bigquery.storage.v1.AppendRowsRequest; import javax.annotation.Nullable; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.transforms.PTransform; @@ -49,6 +50,7 @@ public class StorageApiWriteRecordsInconsistent private final BigQueryIO.Write.CreateDisposition createDisposition; private final @Nullable String kmsKey; private final boolean usesCdc; + private final AppendRowsRequest.MissingValueInterpretation defaultMissingValueInterpretation; public StorageApiWriteRecordsInconsistent( StorageApiDynamicDestinations dynamicDestinations, @@ -61,7 +63,8 @@ public StorageApiWriteRecordsInconsistent( boolean ignoreUnknownValues, BigQueryIO.Write.CreateDisposition createDisposition, @Nullable String kmsKey, - boolean usesCdc) { + boolean usesCdc, + AppendRowsRequest.MissingValueInterpretation defaultMissingValueInterpretation) { this.dynamicDestinations = dynamicDestinations; this.bqServices = bqServices; this.failedRowsTag = failedRowsTag; @@ -73,6 +76,7 @@ public StorageApiWriteRecordsInconsistent( this.createDisposition = createDisposition; this.kmsKey = kmsKey; this.usesCdc = usesCdc; + this.defaultMissingValueInterpretation = defaultMissingValueInterpretation; } @Override @@ -103,7 +107,8 @@ public PCollectionTuple expand(PCollection private final BigQueryIO.Write.CreateDisposition createDisposition; private final @Nullable String kmsKey; private final boolean usesCdc; + private final AppendRowsRequest.MissingValueInterpretation defaultMissingValueInterpretation; /** * The Guava cache object is thread-safe. 
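As an aside, the behavior of the BigQueryUtils.toTableSpec helper added above can be summarized with a small sketch; the reference values are hypothetical:

    import com.google.api.services.bigquery.model.TableReference;
    import org.apache.beam.sdk.io.gcp.bigquery.BigQueryUtils;

    TableReference ref =
        new TableReference()
            .setProjectId("my-project") // optional; when absent the project is omitted
            .setDatasetId("my_dataset")
            .setTableId("my_table");
    // Yields "my-project.my_dataset.my_table"; throws IllegalArgumentException if
    // the dataset or table id is missing.
    String spec = BigQueryUtils.toTableSpec(ref);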
However our protocol requires that client pin the @@ -166,7 +168,8 @@ public StorageApiWriteUnshardedRecords( boolean ignoreUnknownValues, BigQueryIO.Write.CreateDisposition createDisposition, @Nullable String kmsKey, - boolean usesCdc) { + boolean usesCdc, + AppendRowsRequest.MissingValueInterpretation defaultMissingValueInterpretation) { this.dynamicDestinations = dynamicDestinations; this.bqServices = bqServices; this.failedRowsTag = failedRowsTag; @@ -178,6 +181,7 @@ public StorageApiWriteUnshardedRecords( this.createDisposition = createDisposition; this.kmsKey = kmsKey; this.usesCdc = usesCdc; + this.defaultMissingValueInterpretation = defaultMissingValueInterpretation; } @Override @@ -210,7 +214,8 @@ public PCollectionTuple expand(PCollection private final BigQueryIO.Write.CreateDisposition createDisposition; private final @Nullable String kmsKey; private final boolean usesCdc; + private final AppendRowsRequest.MissingValueInterpretation defaultMissingValueInterpretation; static class AppendRowsContext extends RetryManager.Operation.Context { long offset; @@ -390,7 +396,8 @@ AppendClientInfo generateClient(@Nullable TableSchema updatedSchema) throws Exce .withAppendClient( Preconditions.checkStateNotNull(maybeDatasetService), () -> streamName, - usingMultiplexing)); + usingMultiplexing, + defaultMissingValueInterpretation)); Preconditions.checkStateNotNull(appendClientInfo.get().getStreamAppendClient()); return null; }, @@ -704,7 +711,19 @@ long flush( retrieveErrorDetails(contexts)); failedContext.failureCount += 1; - invalidateWriteStream(); + boolean quotaError = false; + Throwable error = failedContext.getError(); + Status.Code statusCode = Status.Code.OK; + if (error != null) { + statusCode = Status.fromThrowable(error).getCode(); + quotaError = statusCode.equals(Status.Code.RESOURCE_EXHAUSTED); + } + + if (!quotaError) { + // This forces us to close and reopen all gRPC connections to Storage API on error, + // which empirically fixes random stuckness issues. + invalidateWriteStream(); + } // Maximum number of times we retry before we fail the work item. if (failedContext.failureCount > 5) { @@ -713,8 +732,6 @@ long flush( // The following errors are known to be persistent, so always fail the work item in // this case. 
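The retry change in this flush path (and the matching one in StorageApiWritesShardedRecords below) hinges on classifying the failure's gRPC status. A condensed, illustrative sketch of that check; the helper name is not part of the diff:

    import io.grpc.Status;
    import org.checkerframework.checker.nullness.qual.Nullable;

    // RESOURCE_EXHAUSTED signals quota push-back; in that case the existing gRPC
    // connections are kept alive instead of being invalidated and reopened.
    static boolean isQuotaError(@Nullable Throwable error) {
      return error != null
          && Status.fromThrowable(error).getCode() == Status.Code.RESOURCE_EXHAUSTED;
    }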
- Throwable error = Preconditions.checkStateNotNull(failedContext.getError()); - Status.Code statusCode = Status.fromThrowable(error).getCode(); if (statusCode.equals(Status.Code.OUT_OF_RANGE) || statusCode.equals(Status.Code.ALREADY_EXISTS)) { throw new RuntimeException( @@ -839,7 +856,8 @@ void postFlush() { boolean ignoreUnknownValues, BigQueryIO.Write.CreateDisposition createDisposition, @Nullable String kmsKey, - boolean usesCdc) { + boolean usesCdc, + AppendRowsRequest.MissingValueInterpretation defaultMissingValueInterpretation) { this.messageConverters = new TwoLevelMessageConverterCache<>(operationName); this.dynamicDestinations = dynamicDestinations; this.bqServices = bqServices; @@ -855,6 +873,7 @@ void postFlush() { this.createDisposition = createDisposition; this.kmsKey = kmsKey; this.usesCdc = usesCdc; + this.defaultMissingValueInterpretation = defaultMissingValueInterpretation; } boolean shouldFlush() { diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritesShardedRecords.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritesShardedRecords.java index efcf87eac7a32..f4982396e9d5f 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritesShardedRecords.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritesShardedRecords.java @@ -22,6 +22,7 @@ import com.google.api.core.ApiFuture; import com.google.api.core.ApiFutures; import com.google.api.services.bigquery.model.TableRow; +import com.google.cloud.bigquery.storage.v1.AppendRowsRequest; import com.google.cloud.bigquery.storage.v1.AppendRowsResponse; import com.google.cloud.bigquery.storage.v1.Exceptions; import com.google.cloud.bigquery.storage.v1.Exceptions.StreamFinalizedException; @@ -125,6 +126,7 @@ public class StorageApiWritesShardedRecords failedRowsCoder; private final boolean autoUpdateSchema; private final boolean ignoreUnknownValues; + private final AppendRowsRequest.MissingValueInterpretation defaultMissingValueInterpretation; private final Duration streamIdleTime = DEFAULT_STREAM_IDLE_TIME; private final TupleTag failedRowsTag; @@ -217,7 +219,8 @@ public StorageApiWritesShardedRecords( TupleTag failedRowsTag, @Nullable TupleTag successfulRowsTag, boolean autoUpdateSchema, - boolean ignoreUnknownValues) { + boolean ignoreUnknownValues, + AppendRowsRequest.MissingValueInterpretation defaultMissingValueInterpretation) { this.dynamicDestinations = dynamicDestinations; this.createDisposition = createDisposition; this.kmsKey = kmsKey; @@ -229,6 +232,7 @@ public StorageApiWritesShardedRecords( this.succussfulRowsCoder = successfulRowsCoder; this.autoUpdateSchema = autoUpdateSchema; this.ignoreUnknownValues = ignoreUnknownValues; + this.defaultMissingValueInterpretation = defaultMissingValueInterpretation; } @Override @@ -494,7 +498,11 @@ public void process( client.unpin(); client.close(); })) - .withAppendClient(datasetService, getOrCreateStream, false); + .withAppendClient( + datasetService, + getOrCreateStream, + false, + defaultMissingValueInterpretation); // This pin is "owned" by the cache. 
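The "pin" the comment above refers to follows the discipline sketched below, a paraphrase of the protocol described in the Javadoc rather than code from this diff:

    // The cache holds one pin for as long as the client is cached; each worker
    // using the client takes an additional pin, so close() only happens after
    // the final unpin.
    StreamAppendClient client = appendClientInfo.getStreamAppendClient();
    client.pin();
    try {
      // ... appendRows(...) calls for this bundle ...
    } finally {
      client.unpin(); // the underlying writer may close once all pins are released
    }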
Preconditions.checkStateNotNull(info.getStreamAppendClient()).pin(); return info; @@ -554,7 +562,11 @@ public void process( appendClientInfo.set( appendClientInfo .get() - .withAppendClient(datasetService, getOrCreateStream, false)); + .withAppendClient( + datasetService, + getOrCreateStream, + false, + defaultMissingValueInterpretation)); StreamAppendClient streamAppendClient = Preconditions.checkArgumentNotNull( appendClientInfo.get().getStreamAppendClient()); @@ -599,7 +611,11 @@ public void process( appendClientInfo.set( appendClientInfo .get() - .withAppendClient(datasetService, getOrCreateStream, false)); + .withAppendClient( + datasetService, + getOrCreateStream, + false, + defaultMissingValueInterpretation)); return Preconditions.checkStateNotNull(appendClientInfo.get().getStreamAppendClient()) .appendRows(context.offset, context.protoRows); } catch (Exception e) { @@ -673,7 +689,7 @@ public void process( boolean offsetMismatch = statusCode.equals(Code.OUT_OF_RANGE) || statusCode.equals(Code.ALREADY_EXISTS); - // Invalidate the StreamWriter and force a new one to be created. + boolean quotaError = statusCode.equals(Code.RESOURCE_EXHAUSTED); if (!offsetMismatch) { // Don't log errors for expected offset mismatch. These will be logged as warnings // below. @@ -681,13 +697,19 @@ public void process( "Got error " + failedContext.getError() + " closing " + failedContext.streamName); } - // TODO: Only do this on explicit NOT_FOUND errors once BigQuery reliably produces them. try { + // TODO: Only do this on explicit NOT_FOUND errors once BigQuery reliably produces + // them. tryCreateTable.call(); } catch (Exception e) { throw new RuntimeException(e); } - clearClients.accept(failedContexts); + + if (!quotaError) { + // This forces us to close and reopen all gRPC connections to Storage API on error, + // which empirically fixes random stuckness issues. + clearClients.accept(failedContexts); + } appendFailures.inc(); boolean explicitStreamFinalized = diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowToStorageApiProto.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowToStorageApiProto.java index c31886da61447..4d714aaaf777b 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowToStorageApiProto.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowToStorageApiProto.java @@ -213,11 +213,22 @@ private static String getPrettyFieldName(SchemaInformation schema) { .put(TableFieldSchema.Type.JSON, "JSON") .build(); - public static TableFieldSchema.Mode modeToProtoMode(String mode) { - return Optional.ofNullable(mode) - .map(Mode::valueOf) - .map(m -> MODE_MAP_JSON_PROTO.get(m)) - .orElse(TableFieldSchema.Mode.NULLABLE); + public static TableFieldSchema.Mode modeToProtoMode( + @Nullable String defaultValueExpression, String mode) { + TableFieldSchema.Mode resultMode = + Optional.ofNullable(mode) + .map(Mode::valueOf) + .map(MODE_MAP_JSON_PROTO::get) + .orElse(TableFieldSchema.Mode.NULLABLE); + if (defaultValueExpression == null) { + return resultMode; + } else { + // If there is a default value expression, treat this field as if it were nullable or + // repeated. + return resultMode.equals(TableFieldSchema.Mode.REPEATED) + ? 
resultMode + : TableFieldSchema.Mode.NULLABLE; + } } public static String protoModeToJsonMode(TableFieldSchema.Mode protoMode) { @@ -310,7 +321,7 @@ public static TableFieldSchema tableFieldToProtoTableField( if (field.getMaxLength() != null) { builder.setMaxLength(field.getMaxLength()); } - builder.setMode(modeToProtoMode(field.getMode())); + builder.setMode(modeToProtoMode(field.getDefaultValueExpression(), field.getMode())); if (field.getPrecision() != null) { builder.setPrecision(field.getPrecision()); } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TestBigQueryOptions.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TestBigQueryOptions.java index 3574c12ee3a99..4d8095c1879d8 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TestBigQueryOptions.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/TestBigQueryOptions.java @@ -24,10 +24,17 @@ /** {@link TestPipelineOptions} for {@link TestBigQuery}. */ public interface TestBigQueryOptions extends TestPipelineOptions, BigQueryOptions, GcpOptions { + String BIGQUERY_EARLY_ROLLOUT_REGION = "us-east7"; @Description("Dataset used in the integration tests. Default is integ_test") @Default.String("integ_test") String getTargetDataset(); void setTargetDataset(String value); + + @Description("Region to perform BigQuery operations in.") + @Default.String("") + String getBigQueryLocation(); + + void setBigQueryLocation(String location); } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java index 92a0af2054827..ad978e95016a7 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java @@ -457,6 +457,25 @@ public Read withTableId(String tableId) { return withTableId(StaticValueProvider.of(tableId)); } + /** + * Returns a new {@link BigtableIO.Read} that will read using the specified app profile id. + * + *

Does not modify this object. + */ + public Read withAppProfileId(ValueProvider appProfileId) { + BigtableConfig config = getBigtableConfig(); + return toBuilder().setBigtableConfig(config.withAppProfileId(appProfileId)).build(); + } + + /** + * Returns a new {@link BigtableIO.Read} that will read using the specified app profile id. + * + *

Does not modify this object. + */ + public Read withAppProfileId(String appProfileId) { + return withAppProfileId(StaticValueProvider.of(appProfileId)); + } + /** * WARNING: Should be used only to specify additional parameters for connection to the Cloud * Bigtable, instanceId and projectId should be provided over {@link #withInstanceId} and {@link @@ -837,6 +856,31 @@ public Write withTableId(String tableId) { return withTableId(StaticValueProvider.of(tableId)); } + /** + * Returns a new {@link BigtableIO.Write} that will write using the specified app profile id. + * + *

Remember that in order to use single-row transactions, this must use a single-cluster + * routing policy. + * + *

Does not modify this object. + */ + public Write withAppProfileId(ValueProvider appProfileId) { + BigtableConfig config = getBigtableConfig(); + return toBuilder().setBigtableConfig(config.withAppProfileId(appProfileId)).build(); + } + + /** + * Returns a new {@link BigtableIO.Write} that will write using the specified app profile id. + * + *

Remember that in order to use single-row transactions, this must use a single-cluster + * routing policy. + * + *

Does not modify this object. + */ + public Write withAppProfileId(String appProfileId) { + return withAppProfileId(StaticValueProvider.of(appProfileId)); + } + /** * WARNING: Should be used only to specify additional parameters for connection to the Cloud * Bigtable, instanceId and projectId should be provided over {@link #withInstanceId} and {@link @@ -1326,7 +1370,11 @@ public List split(long desiredBundleSizeBytes, PipelineOptions o long maximumNumberOfSplits = 4000; long sizeEstimate = getEstimatedSizeBytes(options); desiredBundleSizeBytes = - Math.max(sizeEstimate / maximumNumberOfSplits, desiredBundleSizeBytes); + Math.max( + sizeEstimate / maximumNumberOfSplits, + // BoundedReadEvaluatorFactory may provide us with a desiredBundleSizeBytes of 0 + // https://github.com/apache/beam/issues/28793 + Math.max(1, desiredBundleSizeBytes)); // Delegate to testable helper. List splits = diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/changestreams/BigtableChangeStreamTestOptions.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/changestreams/BigtableChangeStreamTestOptions.java new file mode 100644 index 0000000000000..71303a0e84acd --- /dev/null +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/changestreams/BigtableChangeStreamTestOptions.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
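A minimal usage sketch for the new app-profile support on BigtableIO; the project, instance, table, and profile ids are hypothetical. For writes, recall from the Javadoc above that single-row transactions require a single-cluster routing policy on the chosen profile:

    import com.google.bigtable.v2.Row;
    import org.apache.beam.sdk.Pipeline;
    import org.apache.beam.sdk.io.gcp.bigtable.BigtableIO;
    import org.apache.beam.sdk.values.PCollection;

    Pipeline p = Pipeline.create();
    PCollection<Row> rows =
        p.apply(
            BigtableIO.read()
                .withProjectId("my-project")
                .withInstanceId("my-instance")
                .withTableId("my-table")
                .withAppProfileId("batch-profile")); // routes reads through this profile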
+ */ +package org.apache.beam.sdk.io.gcp.bigtable.changestreams; + +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.testing.TestPipelineOptions; + +public interface BigtableChangeStreamTestOptions extends TestPipelineOptions { + @Description("Instance ID for Bigtable Change Stream") + @Default.String("beam-test") + String getBigtableChangeStreamInstanceId(); + + void setBigtableChangeStreamInstanceId(String value); +} diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/common/GcpIoPipelineOptionsRegistrar.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/common/GcpIoPipelineOptionsRegistrar.java index 1ed9ed6cb6c35..6cfc03c9eaa7f 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/common/GcpIoPipelineOptionsRegistrar.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/common/GcpIoPipelineOptionsRegistrar.java @@ -20,6 +20,8 @@ import com.google.auto.service.AutoService; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryOptions; +import org.apache.beam.sdk.io.gcp.bigquery.TestBigQueryOptions; +import org.apache.beam.sdk.io.gcp.bigtable.changestreams.BigtableChangeStreamTestOptions; import org.apache.beam.sdk.io.gcp.firestore.FirestoreOptions; import org.apache.beam.sdk.io.gcp.pubsub.PubsubOptions; import org.apache.beam.sdk.options.PipelineOptions; @@ -36,6 +38,8 @@ public Iterable> getPipelineOptions() { .add(BigQueryOptions.class) .add(PubsubOptions.class) .add(FirestoreOptions.class) + .add(TestBigQueryOptions.class) + .add(BigtableChangeStreamTestOptions.class) .build(); } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/firestore/FirestoreOptions.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/firestore/FirestoreOptions.java index 1be6568372d9c..a292a106e51f5 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/firestore/FirestoreOptions.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/firestore/FirestoreOptions.java @@ -66,7 +66,7 @@ public interface FirestoreOptions extends PipelineOptions { */ @Description("Firestore endpoint (host and port)") @Default.String("batch-firestore.googleapis.com:443") - String getHost(); + String getFirestoreHost(); /** * Define a host port pair to allow connecting to a Cloud Firestore instead of the default live @@ -74,5 +74,5 @@ public interface FirestoreOptions extends PipelineOptions { * * @param host the host and port to connect to */ - void setHost(String host); + void setFirestoreHost(String host); } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/firestore/FirestoreStatefulComponentFactory.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/firestore/FirestoreStatefulComponentFactory.java index 21c29c485d1e4..4e8c11f7072cb 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/firestore/FirestoreStatefulComponentFactory.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/firestore/FirestoreStatefulComponentFactory.java @@ -93,7 +93,7 @@ FirestoreStub getFirestoreStub(PipelineOptions options) { GcpOptions gcpOptions = options.as(GcpOptions.class); builder 
.setCredentialsProvider(FixedCredentialsProvider.create(gcpOptions.getGcpCredential())) - .setEndpoint(firestoreOptions.getHost()); + .setEndpoint(firestoreOptions.getFirestoreHost()); headers.put( "x-goog-request-params", "project_id=" diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformConfiguration.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformConfiguration.java index befb22ca6dc2c..6e665baaf6b1c 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformConfiguration.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformConfiguration.java @@ -19,6 +19,7 @@ import com.google.api.client.util.Clock; import com.google.auto.value.AutoValue; +import java.util.List; import javax.annotation.Nullable; import org.apache.beam.sdk.io.gcp.pubsub.PubsubTestClient.PubsubTestClientFactory; import org.apache.beam.sdk.schemas.AutoValueSchema; @@ -60,12 +61,61 @@ public abstract class PubsubReadSchemaTransformConfiguration { + "For JSON data, this is a schema defined with JSON-schema syntax (https://json-schema.org/).") public abstract String getSchema(); + @SchemaFieldDescription( + "Any additional Pub/Sub attributes that should be populated as String fields in the output rows.") + public abstract @Nullable List getAttributes(); + + @SchemaFieldDescription( + "The name of an additional field to populate with the full map of Pub/Sub attributes.") + public abstract @Nullable String getAttributesMap(); + + @SchemaFieldDescription( + "When reading from Cloud Pub/Sub where unique record identifiers are provided as Pub/Sub message attributes, " + + "specifies the name of the attribute containing the unique identifier. " + + "The value of the attribute can be any string that uniquely identifies this record. " + + "Pub/Sub cannot guarantee that no duplicate data will be delivered on the Pub/Sub stream. " + + "If idAttribute is not provided, Beam cannot guarantee that no duplicate data will be delivered, " + + "and deduplication of the stream will be strictly best effort.") + public abstract @Nullable String getIdAttribute(); + + @SchemaFieldDescription( + "Specifies the name of the attribute that contains the timestamp, if any. " + + "The timestamp value is expected to be represented in the attribute as either " + + "(1) a numerical value representing the number of milliseconds since the Unix epoch. " + + "For example, if using the Joda time classes, " + + "Instant.getMillis() returns the correct value for this attribute, " + + "or (2) a String in RFC 3339 format. For example, 2015-10-29T23:41:41.123Z. " + + "The sub-second component of the timestamp is optional, and digits beyond the first three " + + "(i.e., time units smaller than milliseconds) will be ignored.") + public abstract @Nullable String getTimestampAttribute(); + + @SchemaFieldDescription("Specifies how to handle errors.") + public abstract @Nullable ErrorHandling getErrorHandling(); + // Used for testing only. public abstract @Nullable PubsubTestClientFactory getClientFactory(); // Used for testing only. 
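Assuming the pre-existing setTopic and setFormat builder methods (not shown in this hunk), the new configuration fields compose roughly as in this hypothetical sketch; the topic name and JSON schema are illustrative:

    import java.util.Arrays;
    import org.apache.beam.sdk.io.gcp.pubsub.PubsubReadSchemaTransformConfiguration;

    PubsubReadSchemaTransformConfiguration config =
        PubsubReadSchemaTransformConfiguration.builder()
            .setTopic("projects/my-project/topics/my-topic") // hypothetical topic
            .setFormat("JSON")
            .setSchema("{\"type\":\"object\",\"properties\":{\"name\":{\"type\":\"string\"}}}")
            .setAttributes(Arrays.asList("source")) // promoted to String fields
            .setAttributesMap("attrs") // map field holding all attributes
            .setIdAttribute("message_id") // best-effort deduplication key
            .setErrorHandling(
                PubsubReadSchemaTransformConfiguration.ErrorHandling.builder()
                    .setOutput("errors") // name of the failed-rows output
                    .build())
            .build();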
public abstract @Nullable Clock getClock(); + @AutoValue + public abstract static class ErrorHandling { + @SchemaFieldDescription("The name of the output PCollection containing failed reads.") + public abstract String getOutput(); + + public static PubsubReadSchemaTransformConfiguration.ErrorHandling.Builder builder() { + return new AutoValue_PubsubReadSchemaTransformConfiguration_ErrorHandling.Builder(); + } + + @AutoValue.Builder + public abstract static class Builder { + public abstract PubsubReadSchemaTransformConfiguration.ErrorHandling.Builder setOutput( + String output); + + public abstract PubsubReadSchemaTransformConfiguration.ErrorHandling build(); + } + } + public static Builder builder() { return new AutoValue_PubsubReadSchemaTransformConfiguration.Builder(); } @@ -80,6 +130,16 @@ public abstract static class Builder { public abstract Builder setSchema(String schema); + public abstract Builder setAttributes(@Nullable List attributes); + + public abstract Builder setAttributesMap(@Nullable String attributesMap); + + public abstract Builder setIdAttribute(@Nullable String idAttribute); + + public abstract Builder setTimestampAttribute(@Nullable String timestampAttribute); + + public abstract Builder setErrorHandling(@Nullable ErrorHandling errorHandling); + // Used for testing only. public abstract Builder setClientFactory(@Nullable PubsubTestClientFactory clientFactory); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformProvider.java index 61a4cf68c9872..c1f6b2b31754a 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformProvider.java @@ -23,8 +23,8 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.Objects; import java.util.Set; +import org.apache.beam.sdk.extensions.avro.schemas.utils.AvroUtils; import org.apache.beam.sdk.io.gcp.pubsub.PubsubTestClient.PubsubTestClientFactory; import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Metrics; @@ -32,11 +32,8 @@ import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; -import org.apache.beam.sdk.schemas.utils.AvroUtils; import org.apache.beam.sdk.schemas.utils.JsonUtils; import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.DoFn.FinishBundle; -import org.apache.beam.sdk.transforms.DoFn.ProcessElement; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.values.PCollectionRowTuple; @@ -63,7 +60,7 @@ public class PubsubReadSchemaTransformProvider extends TypedSchemaTransformProvider { - public static final String VALID_FORMATS_STR = "AVRO,JSON"; + public static final String VALID_FORMATS_STR = "RAW,AVRO,JSON"; public static final Set VALID_DATA_FORMATS = Sets.newHashSet(VALID_FORMATS_STR.split(",")); @@ -89,38 +86,43 @@ public SchemaTransform from(PubsubReadSchemaTransformConfiguration configuration "To read from Pubsub, a subscription name or a topic name must be provided. 
Not both."); } - if ((Strings.isNullOrEmpty(configuration.getSchema()) - && !Strings.isNullOrEmpty(configuration.getFormat())) - || (!Strings.isNullOrEmpty(configuration.getSchema()) - && Strings.isNullOrEmpty(configuration.getFormat()))) { - throw new IllegalArgumentException( - "A schema was provided without a data format (or viceversa). Please provide " - + "both of these parameters to read from Pubsub, or if you would like to use the Pubsub schema service," - + " please leave both of these blank."); + if (!"RAW".equals(configuration.getFormat())) { + if ((Strings.isNullOrEmpty(configuration.getSchema()) + && !Strings.isNullOrEmpty(configuration.getFormat())) + || (!Strings.isNullOrEmpty(configuration.getSchema()) + && Strings.isNullOrEmpty(configuration.getFormat()))) { + throw new IllegalArgumentException( + "A schema was provided without a data format (or viceversa). Please provide " + + "both of these parameters to read from Pubsub, or if you would like to use the Pubsub schema service," + + " please leave both of these blank."); + } } - Schema beamSchema; - SerializableFunction valueMapper; + Schema payloadSchema; + SerializableFunction payloadMapper; - if (!VALID_DATA_FORMATS.contains(configuration.getFormat())) { + String format = + configuration.getFormat() == null ? null : configuration.getFormat().toUpperCase(); + if ("RAW".equals(format)) { + payloadSchema = Schema.of(Schema.Field.of("payload", Schema.FieldType.BYTES)); + payloadMapper = input -> Row.withSchema(payloadSchema).addValue(input).build(); + } else if ("JSON".equals(format)) { + payloadSchema = JsonUtils.beamSchemaFromJsonSchema(configuration.getSchema()); + payloadMapper = JsonUtils.getJsonBytesToRowFunction(payloadSchema); + } else if ("AVRO".equals(format)) { + payloadSchema = + AvroUtils.toBeamSchema( + new org.apache.avro.Schema.Parser().parse(configuration.getSchema())); + payloadMapper = AvroUtils.getAvroBytesToRowFunction(payloadSchema); + } else { throw new IllegalArgumentException( String.format( "Format %s not supported. Only supported formats are %s", configuration.getFormat(), VALID_FORMATS_STR)); } - beamSchema = - Objects.equals(configuration.getFormat(), "JSON") - ? JsonUtils.beamSchemaFromJsonSchema(configuration.getSchema()) - : AvroUtils.toBeamSchema( - new org.apache.avro.Schema.Parser().parse(configuration.getSchema())); - valueMapper = - Objects.equals(configuration.getFormat(), "JSON") - ? 
JsonUtils.getJsonBytesToRowFunction(beamSchema) - : AvroUtils.getAvroBytesToRowFunction(beamSchema); PubsubReadSchemaTransform transform = - new PubsubReadSchemaTransform( - configuration.getTopic(), configuration.getSubscription(), beamSchema, valueMapper); + new PubsubReadSchemaTransform(configuration, payloadSchema, payloadMapper); if (configuration.getClientFactory() != null) { transform.setClientFactory(configuration.getClientFactory()); @@ -135,45 +137,101 @@ public SchemaTransform from(PubsubReadSchemaTransformConfiguration configuration private static class PubsubReadSchemaTransform extends SchemaTransform implements Serializable { final Schema beamSchema; final SerializableFunction valueMapper; - final @Nullable String topic; - final @Nullable String subscription; + final PubsubReadSchemaTransformConfiguration configuration; @Nullable PubsubTestClientFactory clientFactory; @Nullable Clock clock; PubsubReadSchemaTransform( - @Nullable String topic, - @Nullable String subscription, - Schema beamSchema, + PubsubReadSchemaTransformConfiguration configuration, + Schema payloadSchema, SerializableFunction valueMapper) { - this.topic = topic; - this.subscription = subscription; - this.beamSchema = beamSchema; + this.configuration = configuration; + Schema outputSchema; + List attributes = configuration.getAttributes(); + String attributesMap = configuration.getAttributesMap(); + if (attributes == null && attributesMap == null) { + outputSchema = payloadSchema; + } else { + Schema.Builder outputSchemaBuilder = Schema.builder(); + outputSchemaBuilder.addFields(payloadSchema.getFields()); + if (attributes != null) { + for (String attribute : attributes) { + outputSchemaBuilder.addStringField(attribute); + } + } + if (attributesMap != null) { + outputSchemaBuilder.addMapField( + attributesMap, Schema.FieldType.STRING, Schema.FieldType.STRING); + } + outputSchema = outputSchemaBuilder.build(); + } + this.beamSchema = outputSchema; this.valueMapper = valueMapper; } private static class ErrorCounterFn extends DoFn { - private Counter pubsubErrorCounter; + private final Counter pubsubErrorCounter; private Long errorsInBundle = 0L; - private SerializableFunction valueMapper; + private final SerializableFunction valueMapper; + private final @Nullable List attributes; + private final @Nullable String attributesMap; + private final Schema outputSchema; + + final boolean useErrorOutput; - ErrorCounterFn(String name, SerializableFunction valueMapper) { + ErrorCounterFn( + String name, + SerializableFunction valueMapper, + @Nullable List attributes, + @Nullable String attributesMap, + Schema outputSchema, + boolean useErrorOutput) { this.pubsubErrorCounter = Metrics.counter(PubsubReadSchemaTransformProvider.class, name); this.valueMapper = valueMapper; + this.attributes = attributes; + this.attributesMap = attributesMap; + this.outputSchema = outputSchema; + this.useErrorOutput = useErrorOutput; } @ProcessElement - public void process(@DoFn.Element PubsubMessage message, MultiOutputReceiver receiver) { + public void process(@DoFn.Element PubsubMessage message, MultiOutputReceiver receiver) + throws Exception { try { - receiver.get(OUTPUT_TAG).output(valueMapper.apply(message.getPayload())); + Row payloadRow = valueMapper.apply(message.getPayload()); + Row outputRow; + if (attributes == null && attributesMap == null) { + outputRow = payloadRow; + } else { + Row.Builder rowBuilder = Row.withSchema(outputSchema); + List<@Nullable Object> payloadValues = payloadRow.getValues(); + if (payloadValues != null) 
{ + rowBuilder.addValues(payloadValues); + } + if (attributes != null) { + for (String attribute : attributes) { + rowBuilder.addValue(message.getAttribute(attribute)); + } + } + if (attributesMap != null) { + rowBuilder.addValue(message.getAttributeMap()); + } + outputRow = rowBuilder.build(); + } + receiver.get(OUTPUT_TAG).output(outputRow); } catch (Exception e) { errorsInBundle += 1; - receiver - .get(ERROR_TAG) - .output( - Row.withSchema(ERROR_SCHEMA) - .addValues(e.toString(), message.getPayload()) - .build()); + if (useErrorOutput) { + receiver + .get(ERROR_TAG) + .output( + Row.withSchema(ERROR_SCHEMA) + .addValues(e.toString(), message.getPayload()) + .build()); + } else { + throw e; + } } } @@ -194,11 +252,14 @@ void setClock(@Nullable Clock clock) { @SuppressWarnings("nullness") PubsubIO.Read buildPubsubRead() { - PubsubIO.Read pubsubRead = PubsubIO.readMessages(); - if (!Strings.isNullOrEmpty(topic)) { - pubsubRead = pubsubRead.fromTopic(topic); + PubsubIO.Read pubsubRead = + (configuration.getAttributes() == null && configuration.getAttributesMap() == null) + ? PubsubIO.readMessages() + : PubsubIO.readMessagesWithAttributes(); + if (!Strings.isNullOrEmpty(configuration.getTopic())) { + pubsubRead = pubsubRead.fromTopic(configuration.getTopic()); } else { - pubsubRead = pubsubRead.fromSubscription(subscription); + pubsubRead = pubsubRead.fromSubscription(configuration.getSubscription()); } if (clientFactory != null && clock != null) { pubsubRead = pubsubRead.withClientFactory(clientFactory); @@ -207,26 +268,47 @@ PubsubIO.Read buildPubsubRead() { throw new IllegalArgumentException( "Both PubsubTestClientFactory and Clock need to be specified for testing, but only one is provided"); } + if (!Strings.isNullOrEmpty(configuration.getIdAttribute())) { + pubsubRead = pubsubRead.withIdAttribute(configuration.getIdAttribute()); + } + if (!Strings.isNullOrEmpty(configuration.getTimestampAttribute())) { + pubsubRead = pubsubRead.withTimestampAttribute(configuration.getTimestampAttribute()); + } return pubsubRead; } @Override public PCollectionRowTuple expand(PCollectionRowTuple input) { PubsubIO.Read pubsubRead = buildPubsubRead(); + @SuppressWarnings("nullness") + String errorOutput = + configuration.getErrorHandling() == null + ? 
null + : configuration.getErrorHandling().getOutput(); PCollectionTuple outputTuple = input .getPipeline() .apply(pubsubRead) .apply( - ParDo.of(new ErrorCounterFn("PubSub-read-error-counter", valueMapper)) + ParDo.of( + new ErrorCounterFn( + "PubSub-read-error-counter", + valueMapper, + configuration.getAttributes(), + configuration.getAttributesMap(), + beamSchema, + errorOutput != null)) .withOutputTags(OUTPUT_TAG, TupleTagList.of(ERROR_TAG))); + outputTuple.get(OUTPUT_TAG).setRowSchema(beamSchema); + outputTuple.get(ERROR_TAG).setRowSchema(ERROR_SCHEMA); - return PCollectionRowTuple.of( - "output", - outputTuple.get(OUTPUT_TAG).setRowSchema(beamSchema), - "errors", - outputTuple.get(ERROR_TAG).setRowSchema(ERROR_SCHEMA)); + PCollectionRowTuple result = PCollectionRowTuple.of("output", outputTuple.get(OUTPUT_TAG)); + if (errorOutput == null) { + return result; + } else { + return result.and(errorOutput, outputTuple.get(ERROR_TAG)); + } } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteSchemaTransformConfiguration.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteSchemaTransformConfiguration.java index 57620c968c5fd..f962e7185f1b9 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteSchemaTransformConfiguration.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteSchemaTransformConfiguration.java @@ -18,6 +18,8 @@ package org.apache.beam.sdk.io.gcp.pubsub; import com.google.auto.value.AutoValue; +import java.util.List; +import javax.annotation.Nullable; import org.apache.beam.sdk.schemas.AutoValueSchema; import org.apache.beam.sdk.schemas.annotations.DefaultSchema; import org.apache.beam.sdk.schemas.annotations.SchemaFieldDescription; @@ -41,6 +43,45 @@ public abstract class PubsubWriteSchemaTransformConfiguration { "The name of the topic to write data to. " + "Format: projects/${PROJECT}/topics/${TOPIC}") public abstract String getTopic(); + @SchemaFieldDescription( + "The set of fields to write as PubSub attributes instead of part of the payload.") + public abstract @Nullable List getAttributes(); + + @SchemaFieldDescription( + "A map field to write as PubSub attributes instead of part of the payload.") + public abstract @Nullable String getAttributesMap(); + + @SchemaFieldDescription( + "If set, will set an attribute for each Cloud Pub/Sub message with the given name and a unique value. 
" + + "This attribute can then be used in a ReadFromPubSub PTransform to deduplicate messages.") + public abstract @Nullable String getIdAttribute(); + + @SchemaFieldDescription( + "If set, will set an attribute for each Cloud Pub/Sub message with the given name and the message's " + + "publish time as the value.") + public abstract @Nullable String getTimestampAttribute(); + + @SchemaFieldDescription("Specifies how to handle errors.") + public abstract @Nullable ErrorHandling getErrorHandling(); + + @AutoValue + public abstract static class ErrorHandling { + @SchemaFieldDescription("The name of the output PCollection containing failed writes.") + public abstract String getOutput(); + + public static PubsubWriteSchemaTransformConfiguration.ErrorHandling.Builder builder() { + return new AutoValue_PubsubWriteSchemaTransformConfiguration_ErrorHandling.Builder(); + } + + @AutoValue.Builder + public abstract static class Builder { + public abstract PubsubWriteSchemaTransformConfiguration.ErrorHandling.Builder setOutput( + String output); + + public abstract PubsubWriteSchemaTransformConfiguration.ErrorHandling build(); + } + } + public static Builder builder() { return new AutoValue_PubsubWriteSchemaTransformConfiguration.Builder(); } @@ -51,6 +92,16 @@ public abstract static class Builder { public abstract Builder setTopic(String topic); + public abstract Builder setAttributes(@Nullable List attributes); + + public abstract Builder setAttributesMap(@Nullable String attributesMap); + + public abstract Builder setIdAttribute(@Nullable String idAttribute); + + public abstract Builder setTimestampAttribute(@Nullable String timestampAttribute); + + public abstract Builder setErrorHandling(@Nullable ErrorHandling errorHandling); + public abstract PubsubWriteSchemaTransformConfiguration build(); } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteSchemaTransformProvider.java index 11c3d18bd3dc1..6187f6f79d3e9 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteSchemaTransformProvider.java @@ -19,14 +19,19 @@ import com.google.auto.service.AutoService; import java.io.Serializable; +import java.nio.charset.StandardCharsets; import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.Objects; import java.util.Set; +import javax.annotation.Nullable; +import org.apache.beam.sdk.extensions.avro.schemas.utils.AvroUtils; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; -import org.apache.beam.sdk.schemas.utils.AvroUtils; import org.apache.beam.sdk.schemas.utils.JsonUtils; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.ParDo; @@ -36,6 +41,8 @@ import org.apache.beam.sdk.values.Row; import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.sdk.values.TupleTagList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; import 
org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Sets; import org.checkerframework.checker.initialization.qual.Initialized; import org.checkerframework.checker.nullness.qual.NonNull; @@ -56,7 +63,7 @@ public class PubsubWriteSchemaTransformProvider public static final TupleTag OUTPUT_TAG = new TupleTag() {}; public static final TupleTag ERROR_TAG = new TupleTag() {}; - public static final String VALID_FORMATS_STR = "AVRO,JSON"; + public static final String VALID_FORMATS_STR = "RAW,AVRO,JSON"; public static final Set VALID_DATA_FORMATS = Sets.newHashSet(VALID_FORMATS_STR.split(",")); @@ -66,68 +73,177 @@ public Class configurationClass() { } public static class ErrorFn extends DoFn { - private SerializableFunction valueMapper; - private Schema errorSchema; + private final SerializableFunction valueMapper; + private final @Nullable Set attributes; + private final @Nullable String attributesMap; + private final Schema payloadSchema; + private final Schema errorSchema; + private final boolean useErrorOutput; - ErrorFn(SerializableFunction valueMapper, Schema errorSchema) { + ErrorFn( + SerializableFunction valueMapper, + @Nullable List attributes, + @Nullable String attributesMap, + Schema payloadSchema, + Schema errorSchema, + boolean useErrorOutput) { this.valueMapper = valueMapper; + this.attributes = attributes == null ? null : ImmutableSet.copyOf(attributes); + this.attributesMap = attributesMap; + this.payloadSchema = payloadSchema; this.errorSchema = errorSchema; + this.useErrorOutput = useErrorOutput; } @ProcessElement - public void processElement(@Element Row row, MultiOutputReceiver receiver) { + public void processElement(@Element Row row, MultiOutputReceiver receiver) throws Exception { try { - receiver.get(OUTPUT_TAG).output(new PubsubMessage(valueMapper.apply(row), null)); - } catch (Exception e) { + Row payloadRow; + Map messageAttributes = null; + if (attributes == null && attributesMap == null) { + payloadRow = row; + } else { + Row.Builder payloadRowBuilder = Row.withSchema(payloadSchema); + messageAttributes = new HashMap<>(); + List fields = row.getSchema().getFields(); + for (int ix = 0; ix < fields.size(); ix++) { + String name = fields.get(ix).getName(); + if (attributes != null && attributes.contains(name)) { + messageAttributes.put(name, row.getValue(ix)); + } else if (name.equals(attributesMap)) { + Map attrs = row.getMap(ix); + if (attrs != null) { + messageAttributes.putAll(attrs); + } + } else { + payloadRowBuilder.addValue(row.getValue(ix)); + } + } + payloadRow = payloadRowBuilder.build(); + } receiver - .get(ERROR_TAG) - .output(Row.withSchema(errorSchema).addValues(e.toString(), row).build()); + .get(OUTPUT_TAG) + .output(new PubsubMessage(valueMapper.apply(payloadRow), messageAttributes)); + } catch (Exception e) { + if (useErrorOutput) { + receiver + .get(ERROR_TAG) + .output(Row.withSchema(errorSchema).addValues(e.toString(), row).build()); + } else { + throw e; + } } } } @Override public SchemaTransform from(PubsubWriteSchemaTransformConfiguration configuration) { - if (!VALID_DATA_FORMATS.contains(configuration.getFormat())) { + if (!VALID_DATA_FORMATS.contains(configuration.getFormat().toUpperCase())) { throw new IllegalArgumentException( String.format( "Format %s not supported. 
Only supported formats are %s", configuration.getFormat(), VALID_FORMATS_STR)); } - return new PubsubWriteSchemaTransform(configuration.getTopic(), configuration.getFormat()); + return new PubsubWriteSchemaTransform(configuration); } private static class PubsubWriteSchemaTransform extends SchemaTransform implements Serializable { - final String topic; - final String format; + final PubsubWriteSchemaTransformConfiguration configuration; - PubsubWriteSchemaTransform(String topic, String format) { - this.topic = topic; - this.format = format; + PubsubWriteSchemaTransform(PubsubWriteSchemaTransformConfiguration configuration) { + this.configuration = configuration; } @Override + @SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) + }) public PCollectionRowTuple expand(PCollectionRowTuple input) { + String errorOutput = + configuration.getErrorHandling() == null + ? null + : configuration.getErrorHandling().getOutput(); + final Schema errorSchema = Schema.builder() .addStringField("error") .addNullableRowField("row", input.get("input").getSchema()) .build(); - SerializableFunction fn = - format.equals("AVRO") - ? AvroUtils.getRowToAvroBytesFunction(input.get("input").getSchema()) - : JsonUtils.getRowToJsonBytesFunction(input.get("input").getSchema()); + + String format = configuration.getFormat(); + Schema beamSchema = input.get("input").getSchema(); + Schema payloadSchema; + if (configuration.getAttributes() == null && configuration.getAttributesMap() == null) { + payloadSchema = beamSchema; + } else { + Schema.Builder payloadSchemaBuilder = Schema.builder(); + for (Schema.Field f : beamSchema.getFields()) { + if (!configuration.getAttributes().contains(f.getName()) + && !f.getName().equals(configuration.getAttributesMap())) { + payloadSchemaBuilder.addField(f); + } + } + payloadSchema = payloadSchemaBuilder.build(); + } + SerializableFunction fn; + if (Objects.equals(format, "RAW")) { + if (payloadSchema.getFieldCount() != 1) { + throw new IllegalArgumentException( + String.format( + "Raw output only supported for single-field schemas, got %s", payloadSchema)); + } + if (payloadSchema.getField(0).getType().equals(Schema.FieldType.BYTES)) { + fn = row -> row.getBytes(0); + } else if (payloadSchema.getField(0).getType().equals(Schema.FieldType.STRING)) { + fn = row -> row.getString(0).getBytes(StandardCharsets.UTF_8); + } else { + throw new IllegalArgumentException( + String.format( + "Raw output only supports bytes and string fields, got %s", + payloadSchema.getField(0))); + } + } else if (Objects.equals(format, "JSON")) { + fn = JsonUtils.getRowToJsonBytesFunction(payloadSchema); + } else if (Objects.equals(format, "AVRO")) { + fn = AvroUtils.getRowToAvroBytesFunction(payloadSchema); + } else { + throw new IllegalArgumentException( + String.format( + "Format %s not supported. 
Only supported formats are %s", + format, VALID_FORMATS_STR)); + } PCollectionTuple outputTuple = input .get("input") .apply( - ParDo.of(new ErrorFn(fn, errorSchema)) + ParDo.of( + new ErrorFn( + fn, + configuration.getAttributes(), + configuration.getAttributesMap(), + payloadSchema, + errorSchema, + errorOutput != null)) .withOutputTags(OUTPUT_TAG, TupleTagList.of(ERROR_TAG))); - outputTuple.get(OUTPUT_TAG).apply(PubsubIO.writeMessages().to(topic)); + PubsubIO.Write writeTransform = + PubsubIO.writeMessages().to(configuration.getTopic()); + if (!Strings.isNullOrEmpty(configuration.getIdAttribute())) { + writeTransform = writeTransform.withIdAttribute(configuration.getIdAttribute()); + } + if (!Strings.isNullOrEmpty(configuration.getTimestampAttribute())) { + writeTransform = writeTransform.withIdAttribute(configuration.getTimestampAttribute()); + } + outputTuple.get(OUTPUT_TAG).apply(writeTransform); + outputTuple.get(ERROR_TAG).setRowSchema(errorSchema); - return PCollectionRowTuple.of("errors", outputTuple.get(ERROR_TAG).setRowSchema(errorSchema)); + if (errorOutput == null) { + return PCollectionRowTuple.empty(input.getPipeline()); + } else { + return PCollectionRowTuple.of( + errorOutput, outputTuple.get(ERROR_TAG).setRowSchema(errorSchema)); + } } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/SubscribeTransform.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/SubscribeTransform.java index f3ffbb13c2495..882294de1771e 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/SubscribeTransform.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/SubscribeTransform.java @@ -56,7 +56,7 @@ public class SubscribeTransform extends PTransform queryUnflattened( String query, String projectId, boolean typed, boolean useStandardSql) throws IOException, InterruptedException { + return queryUnflattened(query, projectId, typed, useStandardSql, null); + } + + /** + * Performs a query without flattening results. May choose a location (GCP region) to perform this + * operation in. 
+ */ + @Nonnull + public List queryUnflattened( + String query, + String projectId, + boolean typed, + boolean useStandardSql, + @Nullable String location) + throws IOException, InterruptedException { Random rnd = new Random(System.currentTimeMillis()); String temporaryDatasetId = String.format("_dataflow_temporary_dataset_%s_%s", System.nanoTime(), rnd.nextInt(1000000)); @@ -302,9 +317,11 @@ public List queryUnflattened( .setDatasetId(temporaryDatasetId) .setTableId(temporaryTableId); - createNewDataset(projectId, temporaryDatasetId); + createNewDataset(projectId, temporaryDatasetId, null, location); createNewTable( - projectId, temporaryDatasetId, new Table().setTableReference(tempTableReference)); + projectId, + temporaryDatasetId, + new Table().setTableReference(tempTableReference).setLocation(location)); JobConfigurationQuery jcQuery = new JobConfigurationQuery() @@ -325,6 +342,7 @@ public List queryUnflattened( bqClient .jobs() .getQueryResults(projectId, insertedJob.getJobReference().getJobId()) + .setLocation(location) .execute(); } while (!qResponse.getJobComplete()); @@ -395,6 +413,18 @@ public void createNewDataset(String projectId, String datasetId) public void createNewDataset( String projectId, String datasetId, @Nullable Long defaultTableExpirationMs) throws IOException, InterruptedException { + createNewDataset(projectId, datasetId, defaultTableExpirationMs, null); + } + + /** + * Creates a new dataset with the given defaultTableExpirationMs, in the specified location (GCP region). + */ + public void createNewDataset( + String projectId, + String datasetId, + @Nullable Long defaultTableExpirationMs, + @Nullable String location) + throws IOException, InterruptedException { Sleeper sleeper = Sleeper.DEFAULT; BackOff backoff = BackOffAdapter.toGcpBackOff(BACKOFF_FACTORY.backoff()); IOException lastException = null; @@ -410,7 +440,8 @@ public void createNewDataset( projectId, new Dataset() .setDatasetReference(new DatasetReference().setDatasetId(datasetId)) - .setDefaultTableExpirationMs(defaultTableExpirationMs)) + .setDefaultTableExpirationMs(defaultTableExpirationMs) + .setLocation(location)) .execute(); if (response != null) { LOG.info("Successfully created new dataset : " + response.getId()); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/testing/FakeDatasetService.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/testing/FakeDatasetService.java index 347a3513d8968..f26c38d1e3c86 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/testing/FakeDatasetService.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/testing/FakeDatasetService.java @@ -32,6 +32,7 @@ import com.google.api.services.bigquery.model.TableReference; import com.google.api.services.bigquery.model.TableRow; import com.google.api.services.bigquery.model.TableSchema; +import com.google.cloud.bigquery.storage.v1.AppendRowsRequest; import com.google.cloud.bigquery.storage.v1.AppendRowsResponse; import com.google.cloud.bigquery.storage.v1.BatchCommitWriteStreamsResponse; import com.google.cloud.bigquery.storage.v1.Exceptions; @@ -600,7 +601,10 @@ public WriteStream getWriteStream(String streamName) { @Override public StreamAppendClient getStreamAppendClient( - String streamName, DescriptorProtos.DescriptorProto descriptor, boolean useConnectionPool) + String streamName, + DescriptorProtos.DescriptorProto descriptor, + boolean useConnectionPool, + 
AppendRowsRequest.MissingValueInterpretation missingValueInterpretation) throws Exception { return new StreamAppendClient() { private Descriptor protoDescriptor; diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageQueryIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageQueryIT.java index 692a12c0f4a7c..d355d6bb93366 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageQueryIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageQueryIT.java @@ -17,6 +17,8 @@ */ package org.apache.beam.sdk.io.gcp.bigquery; +import static org.apache.beam.sdk.io.gcp.bigquery.TestBigQueryOptions.BIGQUERY_EARLY_ROLLOUT_REGION; + import java.util.Map; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; @@ -52,7 +54,13 @@ public class BigQueryIOStorageQueryIT { "1G", 11110839L, "1T", 11110839000L); - private static final String DATASET_ID = "big_query_storage"; + private static final String DATASET_ID = + TestPipeline.testingPipelineOptions() + .as(TestBigQueryOptions.class) + .getBigQueryLocation() + .equals(BIGQUERY_EARLY_ROLLOUT_REGION) + ? "big_query_storage_day0" + : "big_query_storage"; private static final String TABLE_PREFIX = "storage_read_"; private BigQueryIOStorageQueryOptions options; diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageReadIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageReadIT.java index 570938470b9de..b4f6ddb76f720 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageReadIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageReadIT.java @@ -17,6 +17,7 @@ */ package org.apache.beam.sdk.io.gcp.bigquery; +import static org.apache.beam.sdk.io.gcp.bigquery.TestBigQueryOptions.BIGQUERY_EARLY_ROLLOUT_REGION; import static org.junit.Assert.assertEquals; import com.google.cloud.bigquery.storage.v1.DataFormat; @@ -65,7 +66,13 @@ public class BigQueryIOStorageReadIT { "1T", 11110839000L, "multi_field", 11110839L); - private static final String DATASET_ID = "big_query_storage"; + private static final String DATASET_ID = + TestPipeline.testingPipelineOptions() + .as(TestBigQueryOptions.class) + .getBigQueryLocation() + .equals(BIGQUERY_EARLY_ROLLOUT_REGION) + ? 
"big_query_storage_day0" + : "big_query_storage"; private static final String TABLE_PREFIX = "storage_read_"; private BigQueryIOStorageReadOptions options; diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageReadTableRowIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageReadTableRowIT.java index 734c3af2c4d43..35e2676c70ef9 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageReadTableRowIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageReadTableRowIT.java @@ -17,6 +17,8 @@ */ package org.apache.beam.sdk.io.gcp.bigquery; +import static org.apache.beam.sdk.io.gcp.bigquery.TestBigQueryOptions.BIGQUERY_EARLY_ROLLOUT_REGION; + import com.google.api.services.bigquery.model.TableRow; import java.util.HashSet; import java.util.Set; @@ -52,7 +54,13 @@ @RunWith(JUnit4.class) public class BigQueryIOStorageReadTableRowIT { - private static final String DATASET_ID = "big_query_import_export"; + private static final String DATASET_ID = + TestPipeline.testingPipelineOptions() + .as(TestBigQueryOptions.class) + .getBigQueryLocation() + .equals(BIGQUERY_EARLY_ROLLOUT_REGION) + ? "big_query_import_export_day0" + : "big_query_import_export"; private static final String TABLE_PREFIX = "parallel_read_table_row_"; private BigQueryIOStorageReadTableRowOptions options; @@ -67,12 +75,11 @@ public interface BigQueryIOStorageReadTableRowOptions void setInputTable(String table); } - private static class TableRowToKVPairFn extends SimpleFunction> { + private static class TableRowToKVPairFn extends SimpleFunction> { @Override - public KV apply(TableRow input) { - CharSequence sampleString = (CharSequence) input.get("sample_string"); - String key = sampleString != null ? sampleString.toString() : "null"; - return KV.of(key, BigQueryHelpers.toJsonString(input)); + public KV apply(TableRow input) { + Integer rowId = Integer.parseInt((String) input.get("id")); + return KV.of(rowId, BigQueryHelpers.toJsonString(input)); } } @@ -87,7 +94,7 @@ private void setUpTestEnvironment(String tableName) { private static void runPipeline(BigQueryIOStorageReadTableRowOptions pipelineOptions) { Pipeline pipeline = Pipeline.create(pipelineOptions); - PCollection> jsonTableRowsFromExport = + PCollection> jsonTableRowsFromExport = pipeline .apply( "ExportTable", @@ -96,7 +103,7 @@ private static void runPipeline(BigQueryIOStorageReadTableRowOptions pipelineOpt .withMethod(Method.EXPORT)) .apply("MapExportedRows", MapElements.via(new TableRowToKVPairFn())); - PCollection> jsonTableRowsFromDirectRead = + PCollection> jsonTableRowsFromDirectRead = pipeline .apply( "DirectReadTable", @@ -108,16 +115,16 @@ private static void runPipeline(BigQueryIOStorageReadTableRowOptions pipelineOpt final TupleTag exportTag = new TupleTag<>(); final TupleTag directReadTag = new TupleTag<>(); - PCollection>> unmatchedRows = + PCollection>> unmatchedRows = KeyedPCollectionTuple.of(exportTag, jsonTableRowsFromExport) .and(directReadTag, jsonTableRowsFromDirectRead) .apply(CoGroupByKey.create()) .apply( ParDo.of( - new DoFn, KV>>() { + new DoFn, KV>>() { @ProcessElement - public void processElement(ProcessContext c) throws Exception { - KV element = c.element(); + public void processElement(ProcessContext c) { + KV element = c.element(); // Add all the exported rows for the key to a collection. 
Set uniqueRows = new HashSet<>(); @@ -147,20 +154,20 @@ public void processElement(ProcessContext c) throws Exception { } @Test - public void testBigQueryStorageReadTableRow1() throws Exception { - setUpTestEnvironment("1"); + public void testBigQueryStorageReadTableRow100() { + setUpTestEnvironment("100"); runPipeline(options); } @Test - public void testBigQueryStorageReadTableRow10k() throws Exception { - setUpTestEnvironment("10k"); + public void testBigQueryStorageReadTableRow1k() { + setUpTestEnvironment("1K"); runPipeline(options); } @Test - public void testBigQueryStorageReadTableRow100k() throws Exception { - setUpTestEnvironment("100k"); + public void testBigQueryStorageReadTableRow10k() { + setUpTestEnvironment("10K"); runPipeline(options); } } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageWriteIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageWriteIT.java index fc3ce0be4b691..d061898d55c77 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageWriteIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageWriteIT.java @@ -26,11 +26,11 @@ import com.google.api.services.bigquery.model.TableRow; import com.google.api.services.bigquery.model.TableSchema; import java.io.IOException; +import java.security.SecureRandom; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.io.GenerateSequence; import org.apache.beam.sdk.io.gcp.testing.BigqueryClient; -import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.MapElements; @@ -43,6 +43,8 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.joda.time.Duration; import org.joda.time.Instant; +import org.junit.AfterClass; +import org.junit.BeforeClass; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -60,24 +62,37 @@ private enum WriteMode { AT_LEAST_ONCE } - private String project; - private static final String DATASET_ID = "big_query_storage"; + private static String project; + private static final String DATASET_ID = + "big_query_storage_write_it_" + + System.currentTimeMillis() + + "_" + + new SecureRandom().nextInt(32); private static final String TABLE_PREFIX = "storage_write_"; - private BigQueryOptions bqOptions; + private static TestBigQueryOptions bqOptions; private static final BigqueryClient BQ_CLIENT = new BigqueryClient("BigQueryStorageIOWriteIT"); + @BeforeClass + public static void setup() throws Exception { + bqOptions = TestPipeline.testingPipelineOptions().as(TestBigQueryOptions.class); + project = bqOptions.as(GcpOptions.class).getProject(); + // Create one BQ dataset for all test cases. 
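The export-versus-direct-read mismatch detection above hinges on CoGroupByKey. A compact standalone sketch of the same pattern, assuming `exportedRows` and `directReadRows` are `PCollection<KV<Integer, String>>`s keyed by row id as in the test:

```java
final TupleTag<String> exportTag = new TupleTag<>();
final TupleTag<String> directReadTag = new TupleTag<>();

// Join the two reads by row id; for each key, any row present under one tag
// but absent under the other is reported as a mismatch.
PCollection<KV<Integer, CoGbkResult>> joined =
    KeyedPCollectionTuple.of(exportTag, exportedRows)
        .and(directReadTag, directReadRows)
        .apply(CoGroupByKey.create());
```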
+ BQ_CLIENT.createNewDataset(project, DATASET_ID, null, bqOptions.getBigQueryLocation()); + } + + @AfterClass + public static void cleanup() { + BQ_CLIENT.deleteDataset(project, DATASET_ID); + } + private void setUpTestEnvironment(WriteMode writeMode) { - PipelineOptionsFactory.register(BigQueryOptions.class); - bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class); - bqOptions.setProject(TestPipeline.testingPipelineOptions().as(GcpOptions.class).getProject()); bqOptions.setUseStorageWriteApi(true); if (writeMode == WriteMode.AT_LEAST_ONCE) { bqOptions.setUseStorageWriteApiAtLeastOnce(true); } bqOptions.setNumStorageWriteApiStreams(2); bqOptions.setStorageWriteApiTriggeringFrequencySec(1); - project = TestPipeline.testingPipelineOptions().as(GcpOptions.class).getProject(); } static class FillRowFn extends DoFn { diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaUpdateOptionsIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaUpdateOptionsIT.java index 611c691dca126..833a0a0829c7f 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaUpdateOptionsIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaUpdateOptionsIT.java @@ -87,7 +87,11 @@ public class BigQuerySchemaUpdateOptionsIT { @BeforeClass public static void setupTestEnvironment() throws Exception { project = TestPipeline.testingPipelineOptions().as(GcpOptions.class).getProject(); - BQ_CLIENT.createNewDataset(project, BIG_QUERY_DATASET_ID); + BQ_CLIENT.createNewDataset( + project, + BIG_QUERY_DATASET_ID, + null, + TestPipeline.testingPipelineOptions().as(TestBigQueryOptions.class).getBigQueryLocation()); } @AfterClass diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryTimePartitioningClusteringIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryTimePartitioningClusteringIT.java index 3ceb6f0966b75..da5f396e8d893 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryTimePartitioningClusteringIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryTimePartitioningClusteringIT.java @@ -24,9 +24,11 @@ import com.google.api.services.bigquery.model.TableRow; import com.google.api.services.bigquery.model.TableSchema; import com.google.api.services.bigquery.model.TimePartitioning; +import java.security.SecureRandom; import java.util.Arrays; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.io.gcp.testing.BigqueryClient; import org.apache.beam.sdk.options.Default; import org.apache.beam.sdk.options.Description; @@ -38,8 +40,10 @@ import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.values.ValueInSingleWindow; import org.checkerframework.checker.nullness.qual.Nullable; +import org.junit.AfterClass; import org.junit.Assert; import org.junit.Before; +import org.junit.BeforeClass; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -49,7 +53,15 @@ public class BigQueryTimePartitioningClusteringIT { private static final String WEATHER_SAMPLES_TABLE = "apache-beam-testing.samples.weather_stations"; - private static 
final String DATASET_NAME = "BigQueryTimePartitioningIT"; + + private static String project; + private static final BigqueryClient BQ_CLIENT = + new BigqueryClient("BigQueryTimePartitioningClusteringIT"); + private static final String DATASET_NAME = + "BigQueryTimePartitioningIT_" + + System.currentTimeMillis() + + "_" + + new SecureRandom().nextInt(32); private static final TimePartitioning TIME_PARTITIONING = new TimePartitioning().setField("date").setType("DAY"); private static final Clustering CLUSTERING = @@ -64,6 +76,16 @@ public class BigQueryTimePartitioningClusteringIT { private Bigquery bqClient; private BigQueryClusteringITOptions options; + @BeforeClass + public static void setupTestEnvironment() throws Exception { + project = TestPipeline.testingPipelineOptions().as(GcpOptions.class).getProject(); + BQ_CLIENT.createNewDataset( + project, + DATASET_NAME, + null, + TestPipeline.testingPipelineOptions().as(TestBigQueryOptions.class).getBigQueryLocation()); + } + @Before public void setUp() { PipelineOptionsFactory.register(BigQueryClusteringITOptions.class); @@ -72,6 +94,11 @@ public void setUp() { bqClient = BigqueryClient.getNewBigqueryClient(options.getAppName()); } + @AfterClass + public static void cleanup() { + BQ_CLIENT.deleteDataset(project, DATASET_NAME); + } + /** Customized PipelineOptions for BigQueryClustering Integration Test. */ public interface BigQueryClusteringITOptions extends TestPipelineOptions, ExperimentalOptions, BigQueryOptions { @@ -110,8 +137,7 @@ public ClusteredDestinations(String tableName) { @Override public TableDestination getDestination(ValueInSingleWindow element) { - return new TableDestination( - String.format("%s.%s", DATASET_NAME, tableName), null, TIME_PARTITIONING, CLUSTERING); + return new TableDestination(tableName, null, TIME_PARTITIONING, CLUSTERING); } @Override @@ -176,6 +202,7 @@ public void testE2EBigQueryClustering() throws Exception { @Test public void testE2EBigQueryClusteringTableFunction() throws Exception { String tableName = "weather_stations_clustered_table_function_" + System.currentTimeMillis(); + String destination = String.format("%s.%s", DATASET_NAME, tableName); Pipeline p = Pipeline.create(options); @@ -185,11 +212,7 @@ public void testE2EBigQueryClusteringTableFunction() throws Exception { BigQueryIO.writeTableRows() .to( (ValueInSingleWindow vsw) -> - new TableDestination( - String.format("%s.%s", DATASET_NAME, tableName), - null, - TIME_PARTITIONING, - CLUSTERING)) + new TableDestination(destination, null, TIME_PARTITIONING, CLUSTERING)) .withClustering() .withSchema(SCHEMA) .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED) @@ -206,6 +229,7 @@ public void testE2EBigQueryClusteringTableFunction() throws Exception { public void testE2EBigQueryClusteringDynamicDestinations() throws Exception { String tableName = "weather_stations_clustered_dynamic_destinations_" + System.currentTimeMillis(); + String destination = String.format("%s.%s", DATASET_NAME, tableName); Pipeline p = Pipeline.create(options); @@ -213,7 +237,7 @@ public void testE2EBigQueryClusteringDynamicDestinations() throws Exception { .apply(ParDo.of(new KeepStationNumberAndConvertDate())) .apply( BigQueryIO.writeTableRows() - .to(new ClusteredDestinations(tableName)) + .to(new ClusteredDestinations(destination)) .withClustering() .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED) .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE)); diff --git 
a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryToTableIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryToTableIT.java index d6b7f8e16412b..1abe7752b2e0b 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryToTableIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryToTableIT.java @@ -46,7 +46,6 @@ import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.options.Validation; import org.apache.beam.sdk.testing.TestPipeline; -import org.apache.beam.sdk.testing.TestPipelineOptions; import org.apache.beam.sdk.transforms.Reshuffle; import org.apache.beam.sdk.transforms.Values; import org.apache.beam.sdk.transforms.WithKeys; @@ -214,7 +213,7 @@ private void verifyStandardQueryRes(String outputTable) throws Exception { } /** Customized PipelineOption for BigQueryToTable Pipeline. */ - public interface BigQueryToTableOptions extends TestPipelineOptions, ExperimentalOptions { + public interface BigQueryToTableOptions extends TestBigQueryOptions, ExperimentalOptions { @Description("The BigQuery query to be used for creating the source") @Validation.Required @@ -252,9 +251,11 @@ public interface BigQueryToTableOptions extends TestPipelineOptions, Experimenta @BeforeClass public static void setupTestEnvironment() throws Exception { PipelineOptionsFactory.register(BigQueryToTableOptions.class); - project = TestPipeline.testingPipelineOptions().as(GcpOptions.class).getProject(); + BigQueryToTableOptions options = + TestPipeline.testingPipelineOptions().as(BigQueryToTableOptions.class); + project = options.as(GcpOptions.class).getProject(); // Create one BQ dataset for all test cases. - BQ_CLIENT.createNewDataset(project, BIG_QUERY_DATASET_ID); + BQ_CLIENT.createNewDataset(project, BIG_QUERY_DATASET_ID, null, options.getBigQueryLocation()); // Create table and insert data for new type query test cases. 
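The same setup idiom recurs across the integration tests in this patch; distilled below, with a hypothetical client name and dataset id:

```java
// Read the configured BigQuery location from TestBigQueryOptions and create
// the shared dataset there, using the overload added in this patch.
TestBigQueryOptions options =
    TestPipeline.testingPipelineOptions().as(TestBigQueryOptions.class);
String project = options.as(GcpOptions.class).getProject();
BigqueryClient client = new BigqueryClient("MyIT"); // hypothetical client name
// null => no default table expiration; location may be null for the default region.
client.createNewDataset(
    project, "my_dataset_" + System.nanoTime(), null, options.getBigQueryLocation());
```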
BQ_CLIENT.createNewTable( diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtilsTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtilsTest.java index f4074cc1a556b..d73ff5e2b7124 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtilsTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtilsTest.java @@ -19,6 +19,7 @@ import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryUtils.toTableRow; import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryUtils.toTableSchema; +import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryUtils.toTableSpec; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.contains; import static org.hamcrest.Matchers.containsInAnyOrder; @@ -994,6 +995,27 @@ public void testToBeamRow_avro_array_array_row() { assertEquals(expected, beamRow); } + @Test + public void testToTableSpec() { + TableReference withProject = + new TableReference().setProjectId("project").setDatasetId("dataset").setTableId("table"); + TableReference withoutProject = + new TableReference().setDatasetId("dataset").setTableId("table"); + TableReference withDatasetOnly = new TableReference().setDatasetId("dataset"); + TableReference withTableOnly = new TableReference().setTableId("table"); + + assertEquals("project.dataset.table", toTableSpec(withProject)); + assertEquals("dataset.table", toTableSpec(withoutProject)); + assertThrows( + "must include at least a dataset and a table", + IllegalArgumentException.class, + () -> toTableSpec(withDatasetOnly)); + assertThrows( + "must include at least a dataset and a table", + IllegalArgumentException.class, + () -> toTableSpec(withTableOnly)); + } + @Test public void testToTableReference() { { @@ -1020,6 +1042,14 @@ public void testToTableReference() { assertEquals("mytable", tr.getTableId()); } + { + // Test project that contains a dot and colon + TableReference tr = BigQueryUtils.toTableReference("project.with:domain.mydataset.mytable"); + assertEquals("project.with:domain", tr.getProjectId()); + assertEquals("mydataset", tr.getDatasetId()); + assertEquals("mytable", tr.getTableId()); + } + // Invalid scenarios assertNull(BigQueryUtils.toTableReference("")); assertNull(BigQueryUtils.toTableReference(":.")); @@ -1031,12 +1061,15 @@ public void testToTableReference() { assertNull(BigQueryUtils.toTableReference("myproject:mydataset.")); assertNull(BigQueryUtils.toTableReference("myproject:mydataset.mytable.")); assertNull(BigQueryUtils.toTableReference("myproject:mydataset:mytable:")); + assertNull(BigQueryUtils.toTableReference("myproject:my dataset:mytable:")); assertNull(BigQueryUtils.toTableReference(".invalidleadingdot.mydataset.mytable")); assertNull(BigQueryUtils.toTableReference("invalidtrailingdot.mydataset.mytable.")); assertNull(BigQueryUtils.toTableReference(":invalidleadingcolon.mydataset.mytable")); assertNull(BigQueryUtils.toTableReference("invalidtrailingcolon.mydataset.mytable:")); - assertNull(BigQueryUtils.toTableReference("myproject.mydataset.mytable.myinvalidpart")); - assertNull(BigQueryUtils.toTableReference("myproject:mydataset.mytable.myinvalidpart")); + assertNull(BigQueryUtils.toTableReference("projectendswithhyphen-.mydataset.mytable")); + assertNull( + BigQueryUtils.toTableReference( + "projectnamegoesbeyondthe30characterlimit.mydataset.mytable")); assertNull( 
BigQueryUtils.toTableReference("/projects/extraslash/datasets/mydataset/tables/mytable")); diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FileLoadsStreamingIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FileLoadsStreamingIT.java index 012afed6fb436..678708062b8d3 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FileLoadsStreamingIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/FileLoadsStreamingIT.java @@ -106,11 +106,16 @@ public static Iterable data() { private final Random randomGenerator = new Random(); + // used when test suite specifies a particular GCP location for BigQuery operations + private static String bigQueryLocation; + @BeforeClass public static void setUpTestEnvironment() throws IOException, InterruptedException { // Create one BQ dataset for all test cases. cleanUp(); - BQ_CLIENT.createNewDataset(PROJECT, BIG_QUERY_DATASET_ID); + bigQueryLocation = + TestPipeline.testingPipelineOptions().as(TestBigQueryOptions.class).getBigQueryLocation(); + BQ_CLIENT.createNewDataset(PROJECT, BIG_QUERY_DATASET_ID, null, bigQueryLocation); } @AfterClass @@ -293,7 +298,7 @@ private static void checkRowCompleteness( throws IOException, InterruptedException { List actualTableRows = BQ_CLIENT.queryUnflattened( - String.format("SELECT * FROM [%s]", tableSpec), PROJECT, true, false); + String.format("SELECT * FROM [%s]", tableSpec), PROJECT, true, false, bigQueryLocation); Schema rowSchema = BigQueryUtils.fromTableSchema(schema); List actualBeamRows = diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDirectWriteProtosIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDirectWriteProtosIT.java index 93bc4162409f1..3da93c42a4800 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDirectWriteProtosIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDirectWriteProtosIT.java @@ -80,10 +80,15 @@ private BigQueryIO.Write.Method getMethod() { : BigQueryIO.Write.Method.STORAGE_WRITE_API; } + // used when test suite specifies a particular GCP location for BigQuery operations + private static String bigQueryLocation; + @BeforeClass public static void setUpTestEnvironment() throws IOException, InterruptedException { // Create one BQ dataset for all test cases. 
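Summarizing the parsing rules that the BigQueryUtilsTest assertions above encode (a sketch of asserted behavior, not independently verified):

```java
// Valid specs yield a TableReference; invalid ones return null.
TableReference standard =
    BigQueryUtils.toTableReference("myproject.mydataset.mytable");
TableReference domainScoped =
    BigQueryUtils.toTableReference("project.with:domain.mydataset.mytable");
TableReference rejected =
    BigQueryUtils.toTableReference("projectendswithhyphen-.mydataset.mytable"); // null
```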
- BQ_CLIENT.createNewDataset(PROJECT, BIG_QUERY_DATASET_ID); + bigQueryLocation = + TestPipeline.testingPipelineOptions().as(TestBigQueryOptions.class).getBigQueryLocation(); + BQ_CLIENT.createNewDataset(PROJECT, BIG_QUERY_DATASET_ID, null, bigQueryLocation); } @AfterClass @@ -191,7 +196,7 @@ public void testDirectWriteProtos() throws Exception { void assertRowsWritten(String tableSpec, Iterable expectedItems) throws Exception { List rows = BQ_CLIENT.queryUnflattened( - String.format("SELECT * FROM %s", tableSpec), PROJECT, true, true); + String.format("SELECT * FROM %s", tableSpec), PROJECT, true, true, bigQueryLocation); assertThat(rows, containsInAnyOrder(Iterables.toArray(expectedItems, TableRow.class))); } } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkDefaultValuesIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkDefaultValuesIT.java new file mode 100644 index 0000000000000..87c3659fa0811 --- /dev/null +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkDefaultValuesIT.java @@ -0,0 +1,317 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.gcp.bigquery; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.containsInAnyOrder; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.is; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; + +import com.google.api.services.bigquery.model.Table; +import com.google.api.services.bigquery.model.TableFieldSchema; +import com.google.api.services.bigquery.model.TableReference; +import com.google.api.services.bigquery.model.TableRow; +import com.google.api.services.bigquery.model.TableSchema; +import com.google.cloud.bigquery.storage.v1.AppendRowsRequest; +import java.io.IOException; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collectors; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; +import org.apache.beam.sdk.io.gcp.testing.BigqueryClient; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.util.Preconditions; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; +import org.hamcrest.Matchers; +import org.joda.time.Duration; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class StorageApiSinkDefaultValuesIT { + private static final BigqueryClient BQ_CLIENT = + new BigqueryClient("StorageApiSinkDefaultValuesIT"); + private static final String PROJECT = + TestPipeline.testingPipelineOptions().as(GcpOptions.class).getProject(); + private static final String BIG_QUERY_DATASET_ID = + "storage_api_sink_default_values" + System.nanoTime(); + + private static String bigQueryLocation; + + @BeforeClass + public static void setUpTestEnvironment() throws IOException, InterruptedException { + // Create one BQ dataset for all test cases. + bigQueryLocation = + TestPipeline.testingPipelineOptions().as(TestBigQueryOptions.class).getBigQueryLocation(); + BQ_CLIENT.createNewDataset(PROJECT, BIG_QUERY_DATASET_ID, null, bigQueryLocation); + } + + @AfterClass + public static void cleanup() { + BQ_CLIENT.deleteDataset(PROJECT, BIG_QUERY_DATASET_ID); + } + + private static String createAndGetTablespec(TableSchema tableSchema) + throws IOException, InterruptedException { + String tableName = "table" + System.nanoTime(); + TableReference tableReference = + new TableReference() + .setProjectId(PROJECT) + .setDatasetId(BIG_QUERY_DATASET_ID) + .setTableId(tableName); + BQ_CLIENT.createNewTable( + PROJECT, + BIG_QUERY_DATASET_ID, + new Table().setSchema(tableSchema).setTableReference(tableReference)); + return PROJECT + "." + BIG_QUERY_DATASET_ID + "." 
+ tableName; + } + + @Test + public void testMissingValueSchemaKnownTakeDefault() throws IOException, InterruptedException { + runTest(true, true, false); + } + + @Test + public void testMissingRequiredValueSchemaKnownTakeDefault() + throws IOException, InterruptedException { + runTest(true, true, true); + } + + @Test + public void testMissingRequiredValueSchemaKnownTakeNull() + throws IOException, InterruptedException { + runTest(true, false, true); + } + + @Test + public void testMissingRequiredValueSchemaUnknownTakeDefault() + throws IOException, InterruptedException { + runTest(false, true, true); + } + + @Test + public void testMissingValueSchemaUnknownTakeDefault() throws IOException, InterruptedException { + runTest(false, true, false); + } + + @Test + public void testMissingValueSchemaKnownTakeNull() throws IOException, InterruptedException { + runTest(true, false, false); + } + + @Test + @Ignore // This currently appears broken in BigQuery. + public void testMissingValueSchemaUnknownTakeNull() throws IOException, InterruptedException { + runTest(false, false, false); + } + + public void runTest( + boolean sinkKnowsDefaultFields, boolean takeDefault, boolean defaultFieldsRequired) + throws IOException, InterruptedException { + boolean expectDeadLetter = !takeDefault && defaultFieldsRequired; + TableSchema bqSchema; + if (defaultFieldsRequired) { + bqSchema = + new TableSchema() + .setFields( + ImmutableList.of( + new TableFieldSchema().setName("id").setType("STRING"), + new TableFieldSchema().setName("key2").setType("STRING"), + new TableFieldSchema().setName("value").setType("STRING"), + new TableFieldSchema() + .setName("defaultrepeated") + .setType("STRING") + .setMode("REPEATED") + .setDefaultValueExpression("['a','b', 'c']"), + new TableFieldSchema() + .setName("defaultliteral") + .setType("INT64") + .setDefaultValueExpression("42") + .setMode("REQUIRED"), + new TableFieldSchema() + .setName("defaulttime") + .setType("TIMESTAMP") + .setDefaultValueExpression("CURRENT_TIMESTAMP()") + .setMode("REQUIRED"))); + } else { + bqSchema = + new TableSchema() + .setFields( + ImmutableList.of( + new TableFieldSchema().setName("id").setType("STRING"), + new TableFieldSchema().setName("key2").setType("STRING"), + new TableFieldSchema().setName("value").setType("STRING"), + new TableFieldSchema() + .setName("defaultrepeated") + .setType("STRING") + .setMode("REPEATED") + .setDefaultValueExpression("['a','b', 'c']"), + new TableFieldSchema() + .setName("defaultliteral") + .setType("INT64") + .setDefaultValueExpression("42"), + new TableFieldSchema() + .setName("defaulttime") + .setType("TIMESTAMP") + .setDefaultValueExpression("CURRENT_TIMESTAMP()"))); + } + + TableSchema sinkSchema = bqSchema; + if (!sinkKnowsDefaultFields) { + sinkSchema = + new TableSchema() + .setFields( + bqSchema.getFields().stream() + .filter(tfs -> tfs.getDefaultValueExpression() == null) + .collect(Collectors.toList())); + } + final TableRow row1 = + new TableRow() + .set("id", "row1") + .set("key2", "bar0") + .set("value", "1") + .set("defaultliteral", 12) + .set("defaultrepeated", Lists.newArrayList("foo", "bar")); + final TableRow row2 = new TableRow().set("id", "row2").set("key2", "bar1").set("value", "1"); + final TableRow row3 = new TableRow().set("id", "row3").set("key2", "bar2").set("value", "2"); + + List tableRows = Lists.newArrayList(row1, row2, row3); + + String tableSpec = createAndGetTablespec(bqSchema); + Pipeline p = Pipeline.create(); + + BigQueryIO.Write write = + BigQueryIO.writeTableRows() + 
.to(tableSpec) + .withSchema(sinkSchema) + .withNumStorageWriteApiStreams(2) + .ignoreUnknownValues() + .withTriggeringFrequency(Duration.standardSeconds(1)) + .withMethod(BigQueryIO.Write.Method.STORAGE_WRITE_API) + .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_NEVER); + if (!takeDefault) { + write = + write.withDefaultMissingValueInterpretation( + AppendRowsRequest.MissingValueInterpretation.NULL_VALUE); + } + WriteResult writeResult = + p.apply("Create rows", Create.of(tableRows)) + .setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED) + .apply("write", write); + if (expectDeadLetter) { + PAssert.that(writeResult.getFailedStorageApiInserts()) + .satisfies( + (SerializableFunction, Void>) + input -> { + assertThat(Lists.newArrayList(input).size(), is(3)); + // assertThat(input, containsInAnyOrder(tableRows)); + return null; + }); + } + p.run(); + + if (!expectDeadLetter) { + Map queryResponse = + BQ_CLIENT + .queryUnflattened( + String.format("SELECT * FROM %s", tableSpec), + PROJECT, + true, + true, + bigQueryLocation) + .stream() + .collect(Collectors.toMap(tr -> (String) tr.get("id"), Function.identity())); + assertThat(queryResponse.size(), equalTo(3)); + + TableRow resultRow1 = Preconditions.checkArgumentNotNull(queryResponse.get("row1")); + TableRow resultRow2 = Preconditions.checkArgumentNotNull(queryResponse.get("row2")); + TableRow resultRow3 = Preconditions.checkArgumentNotNull(queryResponse.get("row3")); + + if (sinkKnowsDefaultFields) { + assertThat(resultRow1.get("defaultliteral"), equalTo("12")); + assertThat( + (Collection) resultRow1.get("defaultrepeated"), + containsInAnyOrder("foo", "bar")); + if (takeDefault) { + assertNotNull(resultRow1.get("defaulttime")); + assertNotNull(resultRow2.get("defaulttime")); + assertThat(resultRow2.get("defaultliteral"), equalTo("42")); + assertThat( + (Collection) resultRow2.get("defaultrepeated"), + containsInAnyOrder("a", "b", "c")); + assertNotNull(resultRow3.get("defaulttime")); + assertThat(resultRow3.get("defaultliteral"), equalTo("42")); + assertThat( + (Collection) resultRow3.get("defaultrepeated"), + containsInAnyOrder("a", "b", "c")); + } else { + assertNull(resultRow1.get("defaulttime")); + assertNull(resultRow2.get("defaulttime")); + assertNull(resultRow2.get("defaultliteral")); + assertThat((Collection) resultRow2.get("defaultrepeated"), Matchers.empty()); + assertNull(resultRow3.get("defaulttime")); + assertNull(resultRow3.get("defaultliteral")); + assertThat((Collection) resultRow3.get("defaultrepeated"), Matchers.empty()); + } + } else { + if (takeDefault) { + assertNotNull(resultRow1.get("defaulttime")); + assertThat(resultRow1.get("defaultliteral"), equalTo("42")); + assertThat( + (Collection) resultRow1.get("defaultrepeated"), + containsInAnyOrder("a", "b", "c")); + assertNotNull(resultRow2.get("defaulttime")); + assertThat(resultRow2.get("defaultliteral"), equalTo("42")); + assertThat( + (Collection) resultRow2.get("defaultrepeated"), + containsInAnyOrder("a", "b", "c")); + assertNotNull(resultRow3.get("defaulttime")); + assertThat(resultRow3.get("defaultliteral"), equalTo("42")); + assertThat( + (Collection) resultRow3.get("defaultrepeated"), + containsInAnyOrder("a", "b", "c")); + } else { + assertNull(resultRow1.get("defaulttime")); + assertNull(resultRow1.get("defaultliteral")); + assertThat((Collection) resultRow1.get("defaultrepeated"), Matchers.empty()); + assertNull(resultRow2.get("defaulttime")); + assertNull(resultRow2.get("defaultliteral")); + assertThat((Collection) 
resultRow2.get("defaultrepeated"), Matchers.empty()); + assertNull(resultRow3.get("defaulttime")); + assertNull(resultRow3.get("defaultliteral")); + assertThat((Collection) resultRow3.get("defaultrepeated"), Matchers.empty()); + } + } + } + } +} diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkFailedRowsIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkFailedRowsIT.java index 3dcde8f39cd74..f721f57147e3d 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkFailedRowsIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkFailedRowsIT.java @@ -108,10 +108,15 @@ private BigQueryIO.Write.Method getMethod() { : BigQueryIO.Write.Method.STORAGE_WRITE_API; } + // used when test suite specifies a particular GCP location for BigQuery operations + private static String bigQueryLocation; + @BeforeClass public static void setUpTestEnvironment() throws IOException, InterruptedException { // Create one BQ dataset for all test cases. - BQ_CLIENT.createNewDataset(PROJECT, BIG_QUERY_DATASET_ID); + bigQueryLocation = + TestPipeline.testingPipelineOptions().as(TestBigQueryOptions.class).getBigQueryLocation(); + BQ_CLIENT.createNewDataset(PROJECT, BIG_QUERY_DATASET_ID, null, bigQueryLocation); } @AfterClass @@ -217,7 +222,11 @@ private void assertGoodRowsWritten(String tableSpec, Iterable goodRows TableRow queryResponse = Iterables.getOnlyElement( BQ_CLIENT.queryUnflattened( - String.format("SELECT COUNT(*) FROM %s", tableSpec), PROJECT, true, true)); + String.format("SELECT COUNT(*) FROM `%s`", tableSpec), + PROJECT, + true, + true, + bigQueryLocation)); int numRowsWritten = Integer.parseInt((String) queryResponse.get("f0_")); if (useAtLeastOnce) { assertThat(numRowsWritten, Matchers.greaterThanOrEqualTo(Iterables.size(goodRows))); diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkRowUpdateIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkRowUpdateIT.java index d5366fe296130..f8cc797a87cd5 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkRowUpdateIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkRowUpdateIT.java @@ -49,10 +49,15 @@ public class StorageApiSinkRowUpdateIT { private static final String BIG_QUERY_DATASET_ID = "storage_api_sink_rows_update" + System.nanoTime(); + // used when test suite specifies a particular GCP location for BigQuery operations + private static String bigQueryLocation; + @BeforeClass public static void setUpTestEnvironment() throws IOException, InterruptedException { // Create one BQ dataset for all test cases. 
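StorageApiSinkDefaultValuesIT above toggles how the Storage Write API materializes omitted fields. A condensed sketch of the two modes, with a hypothetical table spec; per the test's expectations, leaving the interpretation unset lets column defaults apply, while NULL_VALUE forces nulls:

```java
// Omitted fields take the column's default value expression (per the tests above).
BigQueryIO.Write<TableRow> takeDefaults =
    BigQueryIO.writeTableRows()
        .to("project.dataset.table") // hypothetical table spec
        .withMethod(BigQueryIO.Write.Method.STORAGE_WRITE_API)
        .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_NEVER);

// Omitted fields become NULL instead; rows missing REQUIRED defaulted columns
// then land in the dead-letter output, as the runTest matrix above exercises.
BigQueryIO.Write<TableRow> takeNulls =
    takeDefaults.withDefaultMissingValueInterpretation(
        AppendRowsRequest.MissingValueInterpretation.NULL_VALUE);
```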
- BQ_CLIENT.createNewDataset(PROJECT, BIG_QUERY_DATASET_ID); + bigQueryLocation = + TestPipeline.testingPipelineOptions().as(TestBigQueryOptions.class).getBigQueryLocation(); + BQ_CLIENT.createNewDataset(PROJECT, BIG_QUERY_DATASET_ID, null, bigQueryLocation); } @AfterClass @@ -129,7 +134,7 @@ private void assertRowsWritten(String tableSpec, Iterable expected) throws IOException, InterruptedException { List queryResponse = BQ_CLIENT.queryUnflattened( - String.format("SELECT * FROM %s", tableSpec), PROJECT, true, true); + String.format("SELECT * FROM %s", tableSpec), PROJECT, true, true, bigQueryLocation); assertThat(queryResponse, containsInAnyOrder(Iterables.toArray(expected, TableRow.class))); } } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkSchemaUpdateIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkSchemaUpdateIT.java index 6931b7ac9b98a..bc99a4f50f700 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkSchemaUpdateIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkSchemaUpdateIT.java @@ -121,17 +121,21 @@ public static Iterable data() { // an updated schema. If that happens consistently, just increase these two numbers // to give it more time. // Total number of rows written to the sink - private static final int TOTAL_N = 60; + private static final int TOTAL_N = 70; // Number of rows with the original schema - private static final int ORIGINAL_N = 50; + private static final int ORIGINAL_N = 60; private final Random randomGenerator = new Random(); + // used when test suite specifies a particular GCP location for BigQuery operations + private static String bigQueryLocation; + @BeforeClass public static void setUpTestEnvironment() throws IOException, InterruptedException { // Create one BQ dataset for all test cases. 
- LOG.info("Creating dataset {}.", BIG_QUERY_DATASET_ID); - BQ_CLIENT.createNewDataset(PROJECT, BIG_QUERY_DATASET_ID); + bigQueryLocation = + TestPipeline.testingPipelineOptions().as(TestBigQueryOptions.class).getBigQueryLocation(); + BQ_CLIENT.createNewDataset(PROJECT, BIG_QUERY_DATASET_ID, null, bigQueryLocation); } @AfterClass @@ -459,7 +463,8 @@ private static void checkRowCompleteness( String.format("SELECT COUNT(DISTINCT(id)), COUNT(id) FROM [%s]", tableSpec), PROJECT, true, - false)); + false, + bigQueryLocation)); int distinctCount = Integer.parseInt((String) queryResponse.get("f0_")); int totalCount = Integer.parseInt((String) queryResponse.get("f1_")); @@ -479,7 +484,7 @@ public void checkRowsWithUpdatedSchema( throws IOException, InterruptedException { List actualRows = BQ_CLIENT.queryUnflattened( - String.format("SELECT * FROM [%s]", tableSpec), PROJECT, true, false); + String.format("SELECT * FROM [%s]", tableSpec), PROJECT, true, false, bigQueryLocation); for (TableRow row : actualRows) { // Rows written to the table should not have the extra field if diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowToStorageApiProtoIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowToStorageApiProtoIT.java index 218aa7411414c..f28ae588a5ecb 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowToStorageApiProtoIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowToStorageApiProtoIT.java @@ -318,10 +318,15 @@ public class TableRowToStorageApiProtoIT { .setFields(BASE_TABLE_SCHEMA.getFields())) .build()); + // used when test suite specifies a particular GCP location for BigQuery operations + private static String bigQueryLocation; + @BeforeClass public static void setUpTestEnvironment() throws IOException, InterruptedException { // Create one BQ dataset for all test cases. 
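The checkRowCompleteness helper above boils down to comparing distinct and total id counts; a minimal sketch using the same helpers as the surrounding tests:

```java
// Distinct vs. total id counts match exactly when no row was duplicated.
TableRow counts =
    Iterables.getOnlyElement(
        BQ_CLIENT.queryUnflattened(
            String.format("SELECT COUNT(DISTINCT(id)), COUNT(id) FROM [%s]", tableSpec),
            PROJECT,
            true, // typed results
            false, // legacy SQL, hence the [bracketed] table spec
            bigQueryLocation));
int distinctCount = Integer.parseInt((String) counts.get("f0_"));
int totalCount = Integer.parseInt((String) counts.get("f1_"));
```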
- BQ_CLIENT.createNewDataset(PROJECT, BIG_QUERY_DATASET_ID); + bigQueryLocation = + TestPipeline.testingPipelineOptions().as(TestBigQueryOptions.class).getBigQueryLocation(); + BQ_CLIENT.createNewDataset(PROJECT, BIG_QUERY_DATASET_ID, null, bigQueryLocation); } @AfterClass @@ -338,7 +343,7 @@ public void testBaseTableRow() throws IOException, InterruptedException { List actualTableRows = BQ_CLIENT.queryUnflattened( - String.format("SELECT * FROM %s", tableSpec), PROJECT, true, true); + String.format("SELECT * FROM %s", tableSpec), PROJECT, true, true, bigQueryLocation); assertEquals(1, actualTableRows.size()); assertEquals(BASE_TABLE_ROW_EXPECTED, actualTableRows.get(0)); @@ -364,7 +369,7 @@ public void testNestedRichTypesAndNull() throws IOException, InterruptedExceptio List actualTableRows = BQ_CLIENT.queryUnflattened( - String.format("SELECT * FROM %s", tableSpec), PROJECT, true, true); + String.format("SELECT * FROM %s", tableSpec), PROJECT, true, true, bigQueryLocation); assertEquals(1, actualTableRows.size()); assertEquals(BASE_TABLE_ROW_EXPECTED, actualTableRows.get(0).get("nestedValue1")); diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java index bb70eb78984c4..714dc9f8619d8 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIOTest.java @@ -219,11 +219,13 @@ public void testReadBuildsCorrectly() { .withTableId("table") .withInstanceId("instance") .withProjectId("project") + .withAppProfileId("app-profile") .withBigtableOptionsConfigurator(PORT_CONFIGURATOR); assertEquals("options_project", read.getBigtableOptions().getProjectId()); assertEquals("options_instance", read.getBigtableOptions().getInstanceId()); assertEquals("instance", read.getBigtableConfig().getInstanceId().get()); assertEquals("project", read.getBigtableConfig().getProjectId().get()); + assertEquals("app-profile", read.getBigtableConfig().getAppProfileId().get()); assertEquals("table", read.getTableId()); assertEquals(PORT_CONFIGURATOR, read.getBigtableConfig().getBigtableOptionsConfigurator()); } @@ -373,12 +375,14 @@ public void testWriteBuildsCorrectly() { .withBigtableOptions(BIGTABLE_OPTIONS) .withTableId("table") .withInstanceId("instance") - .withProjectId("project"); + .withProjectId("project") + .withAppProfileId("app-profile"); assertEquals("table", write.getBigtableWriteOptions().getTableId().get()); assertEquals("options_project", write.getBigtableOptions().getProjectId()); assertEquals("options_instance", write.getBigtableOptions().getInstanceId()); assertEquals("instance", write.getBigtableConfig().getInstanceId().get()); assertEquals("project", write.getBigtableConfig().getProjectId().get()); + assertEquals("app-profile", write.getBigtableConfig().getAppProfileId().get()); } @Test @@ -766,6 +770,39 @@ public void testReadingWithSplits() throws Exception { assertSourcesEqualReferenceSource(source, splits, null /* options */); } + /** + * Regression test for [Bug]: BigtableSource + * "Desired bundle size 0 bytes must be greater than 0" #28793. 
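The new app-profile plumbing asserted in BigtableIOTest above, restated as a hedged builder sketch with hypothetical ids:

```java
// The app profile id flows into BigtableConfig alongside project and instance,
// as testReadBuildsCorrectly and testWriteBuildsCorrectly assert.
BigtableIO.Read read =
    BigtableIO.read()
        .withProjectId("project")
        .withInstanceId("instance")
        .withTableId("table")
        .withAppProfileId("app-profile");

BigtableIO.Write write =
    BigtableIO.write()
        .withProjectId("project")
        .withInstanceId("instance")
        .withTableId("table")
        .withAppProfileId("app-profile");
```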
+ */ + @Test + public void testSplittingWithDesiredBundleSizeZero() throws Exception { + final String table = "TEST-SPLIT-DESIRED-BUNDLE-SIZE-ZERO-TABLE"; + final int numRows = 10; + final int numSamples = 10; + final long bytesPerRow = 1L; + + // Set up test table data and sample row keys for size estimation and splitting. + makeTableData(table, numRows); + service.setupSampleRowKeys(table, numSamples, bytesPerRow); + + // Generate source and split it. + BigtableSource source = + new BigtableSource( + factory, + configId, + config, + BigtableReadOptions.builder() + .setTableId(StaticValueProvider.of(table)) + .setKeyRanges(ALL_KEY_RANGE) + .build(), + null /*size*/); + List splits = source.split(0, null /* options */); + + // Test num splits and split equality. + assertThat(splits, hasSize(numSamples)); + assertSourcesEqualReferenceSource(source, splits, null /* options */); + } + @Test public void testReadingWithSplitFailed() throws Exception { FailureBigtableService failureService = diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableTestUtils.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableTestUtils.java index c35b7c54c4d9e..6bd2f3b25b3c7 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableTestUtils.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableTestUtils.java @@ -31,12 +31,14 @@ import com.google.bigtable.v2.Mutation; import com.google.protobuf.ByteString; import java.util.List; +import org.apache.beam.sdk.io.gcp.bigtable.changestreams.dao.BigtableClientOverride; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.values.KV; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.primitives.Longs; +import org.joda.time.Instant; -class BigtableTestUtils { +public class BigtableTestUtils { static final String BOOL_COLUMN = "boolColumn"; static final String LONG_COLUMN = "longColumn"; @@ -144,4 +146,27 @@ private static Cell createCell(ByteString value, long timestamp, String... label } return builder.build(); } + + // We have to build the pipeline at this package level, rather than in the changestreams + // package, because endTime is package-private and a pipeline with endTime can only be created + // here. Setting endTime allows the tests to terminate predictably. 
+ public static BigtableIO.ReadChangeStream buildTestPipelineInput( + String projectId, + String instanceId, + String tableId, + String appProfileId, + String metadataTableName, + Instant startTime, + Instant endTime, + BigtableClientOverride clientOverride) { + return BigtableIO.readChangeStream() + .withProjectId(projectId) + .withInstanceId(instanceId) + .withTableId(tableId) + .withAppProfileId(appProfileId) + .withMetadataTableTableId(metadataTableName) + .withStartTime(startTime) + .withEndTime(endTime) + .withBigtableClientOverride(clientOverride); + } } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/changestreams/it/BigtableChangeStreamIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/changestreams/it/BigtableChangeStreamIT.java new file mode 100644 index 0000000000000..e6455cbfd5814 --- /dev/null +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/changestreams/it/BigtableChangeStreamIT.java @@ -0,0 +1,361 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.gcp.bigtable.changestreams.it; + +import com.google.api.gax.batching.Batcher; +import com.google.bigtable.v2.MutateRowsRequest; +import com.google.cloud.bigtable.admin.v2.BigtableTableAdminClient; +import com.google.cloud.bigtable.admin.v2.BigtableTableAdminSettings; +import com.google.cloud.bigtable.admin.v2.models.CreateTableRequest; +import com.google.cloud.bigtable.admin.v2.models.UpdateTableRequest; +import com.google.cloud.bigtable.data.v2.BigtableDataClient; +import com.google.cloud.bigtable.data.v2.BigtableDataSettings; +import com.google.cloud.bigtable.data.v2.models.ChangeStreamMutation; +import com.google.cloud.bigtable.data.v2.models.Range; +import com.google.cloud.bigtable.data.v2.models.RowMutationEntry; +import com.google.cloud.bigtable.data.v2.stub.EnhancedBigtableStubSettings; +import com.google.protobuf.ByteString; +import java.io.IOException; +import java.util.Arrays; +import java.util.stream.Collectors; +import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; +import org.apache.beam.sdk.io.common.IOITHelper; +import org.apache.beam.sdk.io.gcp.bigtable.BigtableTestUtils; +import org.apache.beam.sdk.io.gcp.bigtable.changestreams.BigtableChangeStreamTestOptions; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import org.joda.time.Duration; +import org.joda.time.Instant; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** End-to-end tests of Bigtable Change Stream. 
*/ +@SuppressWarnings("FutureReturnValueIgnored") +@RunWith(JUnit4.class) +public class BigtableChangeStreamIT { + private static final Logger LOG = LoggerFactory.getLogger(BigtableChangeStreamIT.class); + private static final String COLUMN_FAMILY1 = "CF"; + private static final String COLUMN_FAMILY2 = "CF2"; + private static final String COLUMN_QUALIFIER = "CQ"; + private static String projectId; + private static String instanceId; + private static String tableId; + private static String appProfileId; + private static String metadataTableId; + private static BigtableTableAdminClient adminClient; + private static BigtableDataClient dataClient; + private static BigtableClientIntegrationTestOverride bigtableClientOverride; + private static Batcher mutationBatcher; + private static BigtableChangeStreamTestOptions options; + private transient TestPipeline pipeline; + + @BeforeClass + public static void beforeClass() throws IOException { + options = IOITHelper.readIOTestPipelineOptions(BigtableChangeStreamTestOptions.class); + LOG.info("Pipeline options: {}", options); + projectId = options.as(GcpOptions.class).getProject(); + instanceId = options.getBigtableChangeStreamInstanceId(); + + long randomId = Instant.now().getMillis(); + tableId = "beam-change-stream-test-" + randomId; + metadataTableId = "beam-change-stream-test-md-" + randomId; + appProfileId = "default"; + + bigtableClientOverride = new BigtableClientIntegrationTestOverride(); + LOG.info(bigtableClientOverride.toString()); + + BigtableDataSettings.Builder dataSettingsBuilder = BigtableDataSettings.newBuilder(); + BigtableTableAdminSettings.Builder tableAdminSettingsBuilder = + BigtableTableAdminSettings.newBuilder(); + dataSettingsBuilder.setProjectId(projectId); + tableAdminSettingsBuilder.setProjectId(projectId); + dataSettingsBuilder.setInstanceId(instanceId); + tableAdminSettingsBuilder.setInstanceId(instanceId); + dataSettingsBuilder.setAppProfileId(appProfileId); + // TODO: Remove this later. But for now, disable direct path. 
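+ // Presumably DirectPath would bypass the endpoint override installed by
+ // BigtableClientIntegrationTestOverride, so the default gRPC transport is
+ // pinned here with DirectPath attempts turned off.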
+ dataSettingsBuilder + .stubSettings() + .setTransportChannelProvider( + EnhancedBigtableStubSettings.defaultGrpcTransportProviderBuilder() + .setAttemptDirectPath(false) + .build()); + + bigtableClientOverride.updateDataClientSettings(dataSettingsBuilder); + bigtableClientOverride.updateTableAdminClientSettings(tableAdminSettingsBuilder); + + // These clients are used to modify the table and write to it + dataClient = BigtableDataClient.create(dataSettingsBuilder.build()); + adminClient = BigtableTableAdminClient.create(tableAdminSettingsBuilder.build()); + + // Create change stream enabled table + adminClient.createTable( + CreateTableRequest.of(tableId) + .addChangeStreamRetention(org.threeten.bp.Duration.ofDays(1)) + .addFamily(COLUMN_FAMILY1) + .addFamily(COLUMN_FAMILY2)); + + mutationBatcher = dataClient.newBulkMutationBatcher(tableId); + } + + @Before + public void before() { + pipeline = TestPipeline.fromOptions(options).enableAbandonedNodeEnforcement(false); + } + + @AfterClass + public static void afterClass() { + if (adminClient != null) { + if (adminClient.exists(tableId)) { + adminClient.updateTable(UpdateTableRequest.of(tableId).disableChangeStreamRetention()); + adminClient.deleteTable(tableId); + adminClient.deleteTable(metadataTableId); + } + adminClient.close(); + } + if (dataClient != null) { + dataClient.close(); + } + } + + @Test + public void testReadBigtableChangeStream() throws InterruptedException { + Instant startTime = Instant.now(); + String rowKey = "rowKeySetCell"; + RowMutationEntry setCellEntry = + RowMutationEntry.create(rowKey).setCell(COLUMN_FAMILY1, COLUMN_QUALIFIER, "cell value 1"); + mutationBatcher.add(setCellEntry); + mutationBatcher.flush(); + Instant endTime = Instant.now().plus(Duration.standardSeconds(1)); + + PCollection changeStream = buildPipeline(startTime, endTime); + PAssert.that(changeStream).containsInAnyOrder(setCellEntry.toProto()); + pipeline.run().waitUntilFinish(); + } + + @Test + public void testDeleteRow() throws InterruptedException { + Instant startTime = Instant.now(); + String rowKeyToDelete = "rowKeyToDelete"; + RowMutationEntry setCellMutationToDelete = + RowMutationEntry.create(rowKeyToDelete) + .setCell(COLUMN_FAMILY1, COLUMN_QUALIFIER, "cell value 1"); + RowMutationEntry deleteRowMutation = RowMutationEntry.create(rowKeyToDelete).deleteRow(); + mutationBatcher.add(setCellMutationToDelete); + mutationBatcher.flush(); + mutationBatcher.add(deleteRowMutation); + mutationBatcher.flush(); + Instant endTime = Instant.now().plus(Duration.standardSeconds(1)); + + PCollection changeStream = buildPipeline(startTime, endTime); + PAssert.that(changeStream) + .containsInAnyOrder( + setCellMutationToDelete.toProto(), + // Delete row becomes one deleteFamily per family + RowMutationEntry.create(rowKeyToDelete) + .deleteFamily(COLUMN_FAMILY1) + .deleteFamily(COLUMN_FAMILY2) + .toProto()); + pipeline.run().waitUntilFinish(); + } + + @Test + public void testDeleteColumnFamily() throws InterruptedException { + Instant startTime = Instant.now(); + String cellValue = "cell value 1"; + String rowKeyMultiFamily = "rowKeyMultiFamily"; + RowMutationEntry setCells = + RowMutationEntry.create(rowKeyMultiFamily) + .setCell(COLUMN_FAMILY1, COLUMN_QUALIFIER, cellValue) + .setCell(COLUMN_FAMILY2, COLUMN_QUALIFIER, cellValue); + mutationBatcher.add(setCells); + mutationBatcher.flush(); + RowMutationEntry deleteCF2 = + RowMutationEntry.create(rowKeyMultiFamily).deleteFamily(COLUMN_FAMILY2); + mutationBatcher.add(deleteCF2); + mutationBatcher.flush(); + 
Instant endTime = Instant.now().plus(Duration.standardSeconds(1)); + + PCollection changeStream = buildPipeline(startTime, endTime); + PAssert.that(changeStream).containsInAnyOrder(setCells.toProto(), deleteCF2.toProto()); + pipeline.run().waitUntilFinish(); + } + + @Test + public void testDeleteCell() throws InterruptedException { + Instant startTime = Instant.now(); + String cellValue = "cell value 1"; + String rowKeyMultiCell = "rowKeyMultiCell"; + RowMutationEntry setCells = + RowMutationEntry.create(rowKeyMultiCell) + .setCell(COLUMN_FAMILY1, COLUMN_QUALIFIER, cellValue) + .setCell(COLUMN_FAMILY1, "CQ2", cellValue); + mutationBatcher.add(setCells); + mutationBatcher.flush(); + RowMutationEntry deleteCQ2 = + RowMutationEntry.create(rowKeyMultiCell) + // need to set timestamp range to make change stream output match + .deleteCells( + COLUMN_FAMILY1, + ByteString.copyFromUtf8("CQ2"), + Range.TimestampRange.create( + startTime.getMillis() * 1000, + startTime.plus(Duration.standardMinutes(2)).getMillis() * 1000)); + mutationBatcher.add(deleteCQ2); + mutationBatcher.flush(); + Instant endTime = Instant.now().plus(Duration.standardSeconds(1)); + + PCollection changeStream = buildPipeline(startTime, endTime); + PAssert.that(changeStream).containsInAnyOrder(setCells.toProto(), deleteCQ2.toProto()); + pipeline.run().waitUntilFinish(); + } + + @Test + public void testComplexMutation() throws InterruptedException { + Instant startTime = Instant.now(); + String rowKey = "rowKeyComplex"; + // We'll delete this in the next mutation + RowMutationEntry setCell = + RowMutationEntry.create(rowKey).setCell(COLUMN_FAMILY1, COLUMN_QUALIFIER, "cell value 1"); + mutationBatcher.add(setCell); + mutationBatcher.flush(); + RowMutationEntry complexMutation = + RowMutationEntry.create(rowKey) + .setCell(COLUMN_FAMILY1, "CQ2", "cell value 2") + .setCell(COLUMN_FAMILY1, "CQ3", "cell value 3") + // need to set timestamp range to make change stream output match + .deleteCells( + COLUMN_FAMILY1, + ByteString.copyFromUtf8(COLUMN_QUALIFIER), + Range.TimestampRange.create( + startTime.getMillis() * 1000, + startTime.plus(Duration.standardMinutes(2)).getMillis() * 1000)); + mutationBatcher.add(complexMutation); + mutationBatcher.flush(); + Instant endTime = Instant.now().plus(Duration.standardSeconds(1)); + + PCollection changeStream = buildPipeline(startTime, endTime); + PAssert.that(changeStream).containsInAnyOrder(setCell.toProto(), complexMutation.toProto()); + pipeline.run().waitUntilFinish(); + } + + @Test + public void testLargeMutation() throws InterruptedException { + Instant startTime = Instant.now(); + // test set cell w size > 1MB so it triggers chunking + char[] chars = new char[1024 * 1500]; + Arrays.fill(chars, '\u200B'); // zero-width space + String largeString = String.valueOf(chars); + String rowKeyLargeCell = "rowKeyLargeCell"; + RowMutationEntry setLargeCell = + RowMutationEntry.create(rowKeyLargeCell) + .setCell(COLUMN_FAMILY1, COLUMN_QUALIFIER, largeString); + mutationBatcher.add(setLargeCell); + mutationBatcher.flush(); + Instant endTime = Instant.now().plus(Duration.standardSeconds(1)); + + PCollection changeStream = buildPipeline(startTime, endTime); + PAssert.that(changeStream).containsInAnyOrder(setLargeCell.toProto()); + pipeline.run().waitUntilFinish(); + } + + @Test + public void testManyMutations() throws InterruptedException { + Instant startTime = Instant.now(); + // test set cell w size > 1MB so it triggers chunking + char[] chars = new char[1024 * 3]; + Arrays.fill(chars, '\u200B'); // 
zero-width space + String largeString = String.valueOf(chars); + + ImmutableList.Builder originalWrites = ImmutableList.builder(); + for (int i = 0; i < 100; ++i) { + String rowKey = "rowKey" + i; + // SetCell. + RowMutationEntry setLargeCell = + RowMutationEntry.create(rowKey).setCell(COLUMN_FAMILY1, COLUMN_QUALIFIER, largeString); + // DeleteFamily. + RowMutationEntry deleteFamily = RowMutationEntry.create(rowKey).deleteFamily(COLUMN_FAMILY1); + // DeleteCells. + RowMutationEntry deleteCells = + RowMutationEntry.create(rowKey) + // need to set timestamp range to make change stream output match + .deleteCells( + COLUMN_FAMILY1, + ByteString.copyFromUtf8(COLUMN_QUALIFIER), + Range.TimestampRange.create( + startTime.getMillis() * 1000, + startTime.plus(Duration.standardMinutes(2)).getMillis() * 1000)); + // Apply the mutations. + originalWrites.add(setLargeCell); + mutationBatcher.add(setLargeCell); + mutationBatcher.flush(); + + originalWrites.add(deleteFamily); + mutationBatcher.add(deleteFamily); + mutationBatcher.flush(); + + originalWrites.add(deleteCells); + mutationBatcher.add(deleteCells); + mutationBatcher.flush(); + } + Instant endTime = Instant.now().plus(Duration.standardSeconds(1)); + + PCollection changeStream = buildPipeline(startTime, endTime); + PAssert.that(changeStream) + .containsInAnyOrder( + originalWrites.build().stream() + .map(RowMutationEntry::toProto) + .collect(Collectors.toList())); + pipeline.run().waitUntilFinish(); + } + + private PCollection buildPipeline(Instant startTime, Instant endTime) { + return pipeline + .apply( + BigtableTestUtils.buildTestPipelineInput( + projectId, + instanceId, + tableId, + appProfileId, + metadataTableId, + startTime, + endTime, + bigtableClientOverride)) + .apply(ParDo.of(new ConvertToEntry())); + } + + private static class ConvertToEntry + extends DoFn, MutateRowsRequest.Entry> { + @ProcessElement + public void processElement( + @Element KV element, + OutputReceiver out) { + out.output(element.getValue().toRowMutationEntry().toProto()); + } + } +} diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/changestreams/it/BigtableClientIntegrationTestOverride.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/changestreams/it/BigtableClientIntegrationTestOverride.java new file mode 100644 index 0000000000000..0d6766aa20df8 --- /dev/null +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/changestreams/it/BigtableClientIntegrationTestOverride.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.gcp.bigtable.changestreams.it; + +import com.google.cloud.bigtable.admin.v2.BigtableInstanceAdminSettings; +import com.google.cloud.bigtable.admin.v2.BigtableTableAdminSettings; +import com.google.cloud.bigtable.data.v2.BigtableDataSettings; +import com.google.errorprone.annotations.CheckReturnValue; +import java.io.Serializable; +import org.apache.beam.sdk.io.gcp.bigtable.changestreams.dao.BigtableClientOverride; + +/** Implements BigtableClientOverride to override data and admin endpoints. */ +@CheckReturnValue +final class BigtableClientIntegrationTestOverride implements Serializable, BigtableClientOverride { + private static final long serialVersionUID = 4188505491566837311L; + + // The address of the admin API endpoint. + private static final String ADMIN_ENDPOINT_ENV_VAR = + getenv("BIGTABLE_ENV_ADMIN_ENDPOINT", "bigtableadmin.googleapis.com:443"); + // The address of the data API endpoint. + private static final String DATA_ENDPOINT_ENV_VAR = + getenv("BIGTABLE_ENV_DATA_ENDPOINT", "bigtable.googleapis.com:443"); + + private final String adminEndpoint; + private final String dataEndpoint; + + @Override + public String toString() { + return "BigtableClientIntegrationTestOverride{" + + "adminEndpoint=" + + adminEndpoint + + ", dataEndpoint=" + + dataEndpoint + + "}"; + } + + /** Applies the test environment settings to the builder. */ + @Override + public void updateInstanceAdminClientSettings(BigtableInstanceAdminSettings.Builder builder) { + builder.stubSettings().setEndpoint(adminEndpoint); + } + + /** Applies the test environment settings to the builder. */ + @Override + public void updateTableAdminClientSettings(BigtableTableAdminSettings.Builder builder) { + builder.stubSettings().setEndpoint(adminEndpoint); + } + + /** Applies the test environment settings to the builder. */ + @Override + public void updateDataClientSettings(BigtableDataSettings.Builder builder) { + builder.stubSettings().setEndpoint(dataEndpoint); + } + + /** Returns the value of the environment variable, or default string if not found. 
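+ * For example, {@code getenv("BIGTABLE_ENV_DATA_ENDPOINT",
+ * "bigtable.googleapis.com:443")} yields the override when the variable is
+ * set and the production data endpoint otherwise.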
*/ + private static String getenv(String name, String defaultValue) { + final String value = System.getenv(name); + if (value != null) { + return value; + } + return defaultValue; + } + + BigtableClientIntegrationTestOverride() { + adminEndpoint = ADMIN_ENDPOINT_ENV_VAR; + dataEndpoint = DATA_ENDPOINT_ENV_VAR; + } +} diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/firestore/it/FirestoreTestingHelper.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/firestore/it/FirestoreTestingHelper.java index d8c55d44f3c8e..a57dd688d4afd 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/firestore/it/FirestoreTestingHelper.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/firestore/it/FirestoreTestingHelper.java @@ -134,7 +134,7 @@ public FirestoreTestingHelper(CleanupMode cleanupMode) { .setCredentials(gcpOptions.getGcpCredential()) .setProjectId(gcpOptions.getProject()) .setDatabaseId(firestoreBeamOptions.getFirestoreDb()) - .setHost(firestoreBeamOptions.getHost()) + .setHost(firestoreBeamOptions.getFirestoreHost()) .build(); fs = firestoreOptions.getService(); rpc = (FirestoreRpc) firestoreOptions.getRpc(); diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIOTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIOTest.java index c11a071ab0eb0..f7f9f5f91b74d 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIOTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIOTest.java @@ -51,10 +51,10 @@ import org.apache.beam.sdk.coders.CoderException; import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.extensions.avro.coders.AvroCoder; +import org.apache.beam.sdk.extensions.avro.io.AvroGeneratedUser; import org.apache.beam.sdk.extensions.protobuf.Proto3SchemaMessages.Primitive; import org.apache.beam.sdk.extensions.protobuf.ProtoCoder; import org.apache.beam.sdk.extensions.protobuf.ProtoDomain; -import org.apache.beam.sdk.io.AvroGeneratedUser; import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.IncomingMessage; import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.OutgoingMessage; import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.SubscriptionPath; diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformProviderTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformProviderTest.java index 0de998f111276..dd5a9abd5ac8e 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformProviderTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformProviderTest.java @@ -28,6 +28,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.UUID; import java.util.stream.Collectors; import org.apache.beam.sdk.PipelineResult; @@ -46,6 +47,9 @@ import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; +import 
org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.junit.Rule; import org.junit.Test; import org.junit.runner.RunWith; @@ -161,6 +165,88 @@ public void testNoSchema() { p.run().waitUntilFinish(); } + @Test + public void testReadRaw() throws IOException { + PCollectionRowTuple begin = PCollectionRowTuple.empty(p); + + Schema rawSchema = Schema.of(Schema.Field.of("payload", Schema.FieldType.BYTES)); + byte[] payload = "some payload".getBytes(Charsets.UTF_8); + + try (PubsubTestClientFactory clientFactory = + clientFactory(ImmutableList.of(incomingMessageOf(payload, CLOCK.currentTimeMillis())))) { + PubsubReadSchemaTransformConfiguration config = + PubsubReadSchemaTransformConfiguration.builder() + .setFormat("RAW") + .setSchema("") + .setSubscription(SUBSCRIPTION) + .setClientFactory(clientFactory) + .setClock(CLOCK) + .build(); + SchemaTransform transform = new PubsubReadSchemaTransformProvider().from(config); + PCollectionRowTuple reads = begin.apply(transform); + + PAssert.that(reads.get("output")) + .containsInAnyOrder( + ImmutableList.of(Row.withSchema(rawSchema).addValue(payload).build())); + + p.run().waitUntilFinish(); + } catch (Exception e) { + throw e; + } + } + + @Test + public void testReadAttributes() throws IOException { + PCollectionRowTuple begin = PCollectionRowTuple.empty(p); + + Schema.builder() + .addByteArrayField("payload") + .addStringField("attr") + .addMapField("attrMap", Schema.FieldType.STRING, Schema.FieldType.STRING) + .build(); + + Schema rawSchema = + Schema.builder() + .addByteArrayField("payload") + .addStringField("attr") + .addMapField("attrMap", Schema.FieldType.STRING, Schema.FieldType.STRING) + .build(); + byte[] payload = "some payload".getBytes(Charsets.UTF_8); + String attr = "attr value"; + + try (PubsubTestClientFactory clientFactory = + clientFactory( + ImmutableList.of( + incomingMessageOf( + payload, CLOCK.currentTimeMillis(), ImmutableMap.of("attr", attr))))) { + PubsubReadSchemaTransformConfiguration config = + PubsubReadSchemaTransformConfiguration.builder() + .setFormat("RAW") + .setSchema("") + .setSubscription(SUBSCRIPTION) + .setAttributes(ImmutableList.of("attr")) + .setAttributesMap("attrMap") + .setClientFactory(clientFactory) + .setClock(CLOCK) + .build(); + SchemaTransform transform = new PubsubReadSchemaTransformProvider().from(config); + PCollectionRowTuple reads = begin.apply(transform); + + PAssert.that(reads.get("output")) + .containsInAnyOrder( + ImmutableList.of( + Row.withSchema(rawSchema) + .addValue(payload) + .addValue(attr) + .addValue(ImmutableMap.of("attr", attr)) + .build())); + + p.run().waitUntilFinish(); + } catch (Exception e) { + throw e; + } + } + @Test public void testReadAvro() throws IOException { PCollectionRowTuple begin = PCollectionRowTuple.empty(p); @@ -195,6 +281,10 @@ public void testReadAvroWithError() throws IOException { .setFormat("AVRO") .setSchema(SCHEMA) .setSubscription(SUBSCRIPTION) + .setErrorHandling( + PubsubReadSchemaTransformConfiguration.ErrorHandling.builder() + .setOutput("errors") + .build()) .setClientFactory(clientFactory) .setClock(CLOCK) .build(); @@ -253,12 +343,18 @@ private static List beamRowToMessageWithError() { private static PubsubClient.IncomingMessage incomingMessageOf( byte[] bytes, long millisSinceEpoch) { + return incomingMessageOf(bytes, millisSinceEpoch, ImmutableMap.of()); + } + + private static PubsubClient.IncomingMessage 
incomingMessageOf( + byte[] bytes, long millisSinceEpoch, Map attributes) { int nanos = Long.valueOf(millisSinceEpoch).intValue() * 1000; Timestamp timestamp = Timestamp.newBuilder().setNanos(nanos).build(); return PubsubClient.IncomingMessage.of( com.google.pubsub.v1.PubsubMessage.newBuilder() .setData(ByteString.copyFrom(bytes)) .setPublishTime(timestamp) + .putAllAttributes(attributes) .build(), millisSinceEpoch, 0, diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/MutationSizeEstimatorTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/MutationSizeEstimatorTest.java index f05159cbbe359..ebabfa8b575fd 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/MutationSizeEstimatorTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/MutationSizeEstimatorTest.java @@ -65,6 +65,11 @@ public void primitives() throws Exception { .to(Value.json("{\"key1\":\"value1\", \"key2\":\"value2\"}")) .build(); Mutation deleteDouble = Mutation.delete("test", Key.of(1223.)); + Mutation jsonb = + Mutation.newInsertOrUpdateBuilder("test") + .set("one") + .to(Value.pgJsonb("{\"key123\":\"value123\", \"key321\":\"value321\"}")) + .build(); assertThat(MutationSizeEstimator.sizeOf(int64), is(8L)); assertThat(MutationSizeEstimator.sizeOf(float64), is(8L)); @@ -74,6 +79,7 @@ public void primitives() throws Exception { assertThat(MutationSizeEstimator.sizeOf(pgNumericNaN), is(3L)); assertThat(MutationSizeEstimator.sizeOf(json), is(34L)); assertThat(MutationSizeEstimator.sizeOf(deleteDouble), is(8L)); + assertThat(MutationSizeEstimator.sizeOf(jsonb), is(42L)); } @Test @@ -131,6 +137,14 @@ public void primitiveArrays() throws Exception { ByteArray.copyFrom("some_bytes".getBytes(UTF_8)), ByteArray.copyFrom("some_bytes".getBytes(UTF_8)))) .build(); + Mutation jsonb = + Mutation.newInsertOrUpdateBuilder("test") + .set("one") + .toPgJsonbArray( + ImmutableList.of( + "{\"key123\":\"value123\", \"key321\":\"value321\"}", + "{\"key456\":\"value456\", \"key789\":600}")) + .build(); assertThat(MutationSizeEstimator.sizeOf(int64), is(24L)); assertThat(MutationSizeEstimator.sizeOf(float64), is(16L)); assertThat(MutationSizeEstimator.sizeOf(bool), is(4L)); @@ -138,6 +152,7 @@ public void primitiveArrays() throws Exception { assertThat(MutationSizeEstimator.sizeOf(pgNumeric), is(156L)); assertThat(MutationSizeEstimator.sizeOf(json), is(62L)); assertThat(MutationSizeEstimator.sizeOf(bytes), is(20L)); + assertThat(MutationSizeEstimator.sizeOf(jsonb), is(77L)); } @Test @@ -162,6 +177,8 @@ public void nullPrimitiveArrays() throws Exception { .toPgNumericArray((Iterable) null) .build(); Mutation json = Mutation.newInsertOrUpdateBuilder("test").set("one").toJsonArray(null).build(); + Mutation jsonb = + Mutation.newInsertOrUpdateBuilder("test").set("one").toPgJsonbArray(null).build(); assertThat(MutationSizeEstimator.sizeOf(int64), is(0L)); assertThat(MutationSizeEstimator.sizeOf(float64), is(0L)); @@ -169,6 +186,7 @@ public void nullPrimitiveArrays() throws Exception { assertThat(MutationSizeEstimator.sizeOf(numeric), is(0L)); assertThat(MutationSizeEstimator.sizeOf(pgNumeric), is(0L)); assertThat(MutationSizeEstimator.sizeOf(json), is(0L)); + assertThat(MutationSizeEstimator.sizeOf(jsonb), is(0L)); } @Test @@ -237,6 +255,29 @@ public void jsons() throws Exception { assertThat(MutationSizeEstimator.sizeOf(nullArray), is(0L)); } + @Test + public void pgJsonb() 
throws Exception { + Mutation empty = + Mutation.newInsertOrUpdateBuilder("test").set("one").to(Value.pgJsonb("{}")).build(); + Mutation nullValue = + Mutation.newInsertOrUpdateBuilder("test") + .set("one") + .to(Value.pgJsonb((String) null)) + .build(); + Mutation sample = + Mutation.newInsertOrUpdateBuilder("test") + .set("one") + .to(Value.pgJsonb("{\"type_name\":\"number\",\"value\":12345.123}")) + .build(); + Mutation nullArray = + Mutation.newInsertOrUpdateBuilder("test").set("one").toPgJsonbArray(null).build(); + + assertThat(MutationSizeEstimator.sizeOf(empty), is(2L)); + assertThat(MutationSizeEstimator.sizeOf(nullValue), is(0L)); + assertThat(MutationSizeEstimator.sizeOf(sample), is(40L)); + assertThat(MutationSizeEstimator.sizeOf(nullArray), is(0L)); + } + @Test public void dates() throws Exception { Mutation timestamp = diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchemaTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchemaTest.java index d65486b3938a7..7ba345a24885d 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchemaTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchemaTest.java @@ -75,10 +75,11 @@ public void testSinglePgTable() throws Exception { .addColumn("test", "maxKey", "character varying") .addColumn("test", "numericVal", "numeric") .addColumn("test", "commitTime", "spanner.commit_timestamp") + .addColumn("test", "jsonbCol", "jsonb") .build(); assertEquals(1, schema.getTables().size()); - assertEquals(4, schema.getColumns("test").size()); + assertEquals(5, schema.getColumns("test").size()); assertEquals(1, schema.getKeyParts("test").size()); assertEquals(Type.timestamp(), schema.getColumns("test").get(3).getType()); } @@ -90,6 +91,7 @@ public void testTwoPgTables() throws Exception { .addColumn("test", "pk", "character varying(48)") .addKeyPart("test", "pk", false) .addColumn("test", "maxKey", "character varying") + .addColumn("test", "jsonbCol", "jsonb") .addColumn("other", "pk", "bigint") .addKeyPart("other", "pk", true) .addColumn("other", "maxKey", "character varying") @@ -97,7 +99,7 @@ public void testTwoPgTables() throws Exception { .build(); assertEquals(2, schema.getTables().size()); - assertEquals(2, schema.getColumns("test").size()); + assertEquals(3, schema.getColumns("test").size()); assertEquals(1, schema.getKeyParts("test").size()); assertEquals(3, schema.getColumns("other").size()); diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/SpannerChangeStreamErrorTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/SpannerChangeStreamErrorTest.java index bf2ccd454bb5b..9ffa61c930781 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/SpannerChangeStreamErrorTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/SpannerChangeStreamErrorTest.java @@ -52,7 +52,9 @@ import com.google.spanner.v1.TypeCode; import io.grpc.Status; import java.io.Serializable; +import java.util.ArrayList; import java.util.Collections; +import java.util.List; import org.apache.beam.runners.direct.DirectOptions; import org.apache.beam.runners.direct.DirectRunner; import org.apache.beam.sdk.Pipeline; @@ -68,7 +70,6 @@ import 
org.joda.time.Duration; import org.junit.After; import org.junit.Before; -import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.rules.ExpectedException; @@ -114,16 +115,22 @@ public void tearDown() throws NoSuchFieldException, IllegalAccessException { } @Test - @Ignore("BEAM-12164 Reenable this test when databaseClient.getDialect returns the right message.") - public void testResourceExhaustedDoesNotRetry() { + // Error code UNAVAILABLE is retried repeatedly until the RPC times out. + public void testUnavailableExceptionRetries() throws InterruptedException { + DirectOptions options = PipelineOptionsFactory.as(DirectOptions.class); + options.setBlockOnRun(false); + options.setRunner(DirectRunner.class); + Pipeline nonBlockingPipeline = TestPipeline.create(options); + mockSpannerService.setExecuteStreamingSqlExecutionTime( - SimulatedExecutionTime.ofStickyException(Status.RESOURCE_EXHAUSTED.asRuntimeException())); + SimulatedExecutionTime.ofStickyException(Status.UNAVAILABLE.asRuntimeException())); final Timestamp startTimestamp = Timestamp.ofTimeSecondsAndNanos(0, 1000); final Timestamp endTimestamp = Timestamp.ofTimeSecondsAndNanos(startTimestamp.getSeconds(), startTimestamp.getNanos() + 1); + try { - pipeline.apply( + nonBlockingPipeline.apply( SpannerIO.readChangeStream() .withSpannerConfig(getSpannerConfig()) .withChangeStreamName(TEST_CHANGE_STREAM) @@ -131,33 +138,36 @@ public void testResourceExhaustedDoesNotRetry() { .withMetadataTable(TEST_TABLE) .withInclusiveStartAt(startTimestamp) .withInclusiveEndAt(endTimestamp)); - pipeline.run().waitUntilFinish(); + PipelineResult result = nonBlockingPipeline.run(); + while (result.getState() != RUNNING) { + Thread.sleep(50); + } + // The pipeline continues making requests to Spanner to retry the Unavailable errors. + assertNull(result.waitUntilFinish(Duration.millis(500))); } finally { - thrown.expect(SpannerException.class); // databaseClient.getDialect does not currently bubble up the correct message. // Instead, the error returned is: "DEADLINE_EXCEEDED: Operation did not complete " // "in the given time" - thrown.expectMessage("RESOURCE_EXHAUSTED - Statement: 'SELECT 'POSTGRESQL' AS DIALECT"); + thrown.expectMessage("DEADLINE_EXCEEDED"); assertThat( mockSpannerService.countRequestsOfType(ExecuteSqlRequest.class), Matchers.equalTo(0)); } } @Test - @Ignore("BEAM-12164 Reenable this test when databaseClient.getDialect returns the right message.") - public void testUnavailableExceptionRetries() throws InterruptedException { + // Error code ABORTED is retried repeatedly until it times out. 
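+ // The retried-status tests share one pattern: run a non-blocking
+ // DirectRunner pipeline, poll until it reaches RUNNING, then assert that a
+ // short waitUntilFinish returns null, because the sticky retryable error
+ // keeps the pipeline alive instead of failing it:
+ //
+ //   PipelineResult result = nonBlockingPipeline.run();
+ //   while (result.getState() != RUNNING) {
+ //     Thread.sleep(50);
+ //   }
+ //   assertNull(result.waitUntilFinish(Duration.millis(500)));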
+ public void testAbortedExceptionRetries() throws InterruptedException { + mockSpannerService.setExecuteStreamingSqlExecutionTime( + SimulatedExecutionTime.ofStickyException(Status.ABORTED.asRuntimeException())); + DirectOptions options = PipelineOptionsFactory.as(DirectOptions.class); options.setBlockOnRun(false); options.setRunner(DirectRunner.class); Pipeline nonBlockingPipeline = TestPipeline.create(options); - mockSpannerService.setExecuteStreamingSqlExecutionTime( - SimulatedExecutionTime.ofStickyException(Status.UNAVAILABLE.asRuntimeException())); - final Timestamp startTimestamp = Timestamp.ofTimeSecondsAndNanos(0, 1000); final Timestamp endTimestamp = Timestamp.ofTimeSecondsAndNanos(startTimestamp.getSeconds(), startTimestamp.getNanos() + 1); - try { nonBlockingPipeline.apply( SpannerIO.readChangeStream() @@ -171,23 +181,20 @@ public void testUnavailableExceptionRetries() throws InterruptedException { while (result.getState() != RUNNING) { Thread.sleep(50); } - // The pipeline continues making requests to Spanner to retry the Unavailable errors. + // The pipeline continues making requests to Spanner to retry the Aborted errors. assertNull(result.waitUntilFinish(Duration.millis(500))); } finally { - // databaseClient.getDialect does not currently bubble up the correct message. - // Instead, the error returned is: "DEADLINE_EXCEEDED: Operation did not complete " - // "in the given time" - thrown.expectMessage("UNAVAILABLE - Statement: 'SELECT 'POSTGRESQL' AS DIALECT"); + thrown.expectMessage("DEADLINE_EXCEEDED"); assertThat( mockSpannerService.countRequestsOfType(ExecuteSqlRequest.class), Matchers.equalTo(0)); } } @Test - @Ignore("BEAM-12164 Reenable this test when databaseClient.getDialect returns the right message.") - public void testAbortedExceptionNotRetried() { + // Error code UNKNOWN is not retried. + public void testUnknownExceptionDoesNotRetry() { mockSpannerService.setExecuteStreamingSqlExecutionTime( - SimulatedExecutionTime.ofStickyException(Status.ABORTED.asRuntimeException())); + SimulatedExecutionTime.ofStickyException(Status.UNKNOWN.asRuntimeException())); final Timestamp startTimestamp = Timestamp.ofTimeSecondsAndNanos(0, 1000); final Timestamp endTimestamp = @@ -204,19 +211,43 @@ public void testAbortedExceptionNotRetried() { pipeline.run().waitUntilFinish(); } finally { thrown.expect(SpannerException.class); - // databaseClient.getDialect does not currently bubble up the correct message. - // Instead, the error returned is: "DEADLINE_EXCEEDED: Operation did not complete " - // "in the given time" - thrown.expectMessage("ABORTED - Statement: 'SELECT 'POSTGRESQL' AS DIALECT"); + thrown.expectMessage("UNKNOWN"); assertThat( mockSpannerService.countRequestsOfType(ExecuteSqlRequest.class), Matchers.equalTo(0)); } } @Test - public void testAbortedExceptionNotRetriedithDefaultsForStreamSqlRetrySettings() { + // Error code RESOURCE_EXHAUSTED is retried repeatedly. 
+ public void testResourceExhaustedRetry() { mockSpannerService.setExecuteStreamingSqlExecutionTime( - SimulatedExecutionTime.ofStickyException(Status.ABORTED.asRuntimeException())); + SimulatedExecutionTime.ofStickyException(Status.RESOURCE_EXHAUSTED.asRuntimeException())); + + final Timestamp startTimestamp = Timestamp.ofTimeSecondsAndNanos(0, 1000); + final Timestamp endTimestamp = + Timestamp.ofTimeSecondsAndNanos(startTimestamp.getSeconds(), startTimestamp.getNanos() + 1); + + try { + pipeline.apply( + SpannerIO.readChangeStream() + .withSpannerConfig(getSpannerConfig()) + .withChangeStreamName(TEST_CHANGE_STREAM) + .withMetadataDatabase(TEST_DATABASE) + .withMetadataTable(TEST_TABLE) + .withInclusiveStartAt(startTimestamp) + .withInclusiveEndAt(endTimestamp)); + pipeline.run().waitUntilFinish(); + } finally { + thrown.expectMessage("DEADLINE_EXCEEDED"); + assertThat( + mockSpannerService.countRequestsOfType(ExecuteSqlRequest.class), Matchers.equalTo(0)); + } + } + + @Test + public void testResourceExhaustedRetryWithDefaultSettings() { + mockSpannerService.setExecuteStreamingSqlExecutionTime( + SimulatedExecutionTime.ofStickyException(Status.RESOURCE_EXHAUSTED.asRuntimeException())); final Timestamp startTimestamp = Timestamp.ofTimeSecondsAndNanos(0, 1000); final Timestamp endTimestamp = @@ -230,6 +261,7 @@ public void testAbortedExceptionNotRetriedithDefaultsForStreamSqlRetrySettings() .withProjectId(TEST_PROJECT) .withInstanceId(TEST_INSTANCE) .withDatabaseId(TEST_DATABASE); + try { pipeline.apply( SpannerIO.readChangeStream() @@ -241,24 +273,34 @@ public void testAbortedExceptionNotRetriedithDefaultsForStreamSqlRetrySettings() .withInclusiveEndAt(endTimestamp)); pipeline.run().waitUntilFinish(); } finally { - // databaseClient.getDialect does not currently bubble up the correct message. 
- // Instead, the error returned is: "DEADLINE_EXCEEDED: Operation did not complete " - // "in the given time" thrown.expect(SpannerException.class); - thrown.expectMessage("ABORTED - Statement: 'SELECT 'POSTGRESQL' AS DIALECT"); + thrown.expectMessage("RESOURCE_EXHAUSTED"); assertThat( mockSpannerService.countRequestsOfType(ExecuteSqlRequest.class), Matchers.equalTo(0)); } } @Test - public void testUnknownExceptionDoesNotRetry() { - mockSpannerService.setExecuteStreamingSqlExecutionTime( - SimulatedExecutionTime.ofStickyException(Status.UNKNOWN.asRuntimeException())); - + public void testInvalidRecordReceived() { final Timestamp startTimestamp = Timestamp.ofTimeSecondsAndNanos(0, 1000); final Timestamp endTimestamp = Timestamp.ofTimeSecondsAndNanos(startTimestamp.getSeconds(), startTimestamp.getNanos() + 1); + + mockGetDialect(); + mockTableExists(); + mockGetWatermark(startTimestamp); + ResultSet getPartitionResultSet = mockGetParentPartition(startTimestamp, endTimestamp); + mockGetPartitionsAfter( + Timestamp.ofTimeSecondsAndNanos(startTimestamp.getSeconds(), startTimestamp.getNanos() - 1), + getPartitionResultSet); + mockGetPartitionsAfter( + Timestamp.ofTimeSecondsAndNanos(startTimestamp.getSeconds(), startTimestamp.getNanos()), + ResultSet.newBuilder().setMetadata(PARTITION_METADATA_RESULT_SET_METADATA).build()); + mockGetPartitionsAfter( + Timestamp.ofTimeSecondsAndNanos(startTimestamp.getSeconds(), startTimestamp.getNanos() + 1), + ResultSet.newBuilder().setMetadata(PARTITION_METADATA_RESULT_SET_METADATA).build()); + mockInvalidChangeStreamRecordReceived(startTimestamp, endTimestamp); + try { pipeline.apply( SpannerIO.readChangeStream() @@ -271,15 +313,16 @@ public void testUnknownExceptionDoesNotRetry() { pipeline.run().waitUntilFinish(); } finally { thrown.expect(SpannerException.class); - thrown.expectMessage("UNKNOWN - Statement: 'SELECT 'POSTGRESQL' AS DIALECT"); + // DatabaseClient.getDialect returns "DEADLINE_EXCEEDED: Operation did not complete in the " + // given time" even though we mocked it out. 
+ thrown.expectMessage("DEADLINE_EXCEEDED"); assertThat( mockSpannerService.countRequestsOfType(ExecuteSqlRequest.class), Matchers.equalTo(0)); } } @Test - @Ignore("BEAM-12164 Reenable this test when databaseClient.getDialect works.") - public void testInvalidRecordReceived() { + public void testInvalidRecordReceivedWithDefaultSettings() { final Timestamp startTimestamp = Timestamp.ofTimeSecondsAndNanos(0, 1000); final Timestamp endTimestamp = Timestamp.ofTimeSecondsAndNanos(startTimestamp.getSeconds(), startTimestamp.getNanos() + 1); @@ -288,6 +331,8 @@ public void testInvalidRecordReceived() { mockTableExists(); mockGetWatermark(startTimestamp); ResultSet getPartitionResultSet = mockGetParentPartition(startTimestamp, endTimestamp); + mockchangePartitionState(startTimestamp, endTimestamp, "CREATED"); + mockchangePartitionState(startTimestamp, endTimestamp, "SCHEDULED"); mockGetPartitionsAfter( Timestamp.ofTimeSecondsAndNanos(startTimestamp.getSeconds(), startTimestamp.getNanos() - 1), getPartitionResultSet); @@ -300,9 +345,26 @@ public void testInvalidRecordReceived() { mockInvalidChangeStreamRecordReceived(startTimestamp, endTimestamp); try { + RetrySettings quickRetrySettings = + RetrySettings.newBuilder() + .setInitialRetryDelay(org.threeten.bp.Duration.ofMillis(250)) + .setMaxRetryDelay(org.threeten.bp.Duration.ofSeconds(1)) + .setRetryDelayMultiplier(5) + .setTotalTimeout(org.threeten.bp.Duration.ofSeconds(1)) + .build(); + final SpannerConfig changeStreamConfig = + SpannerConfig.create() + .withEmulatorHost(StaticValueProvider.of(SPANNER_HOST)) + .withIsLocalChannelProvider(StaticValueProvider.of(true)) + .withCommitRetrySettings(quickRetrySettings) + .withExecuteStreamingSqlRetrySettings(null) + .withProjectId(TEST_PROJECT) + .withInstanceId(TEST_INSTANCE) + .withDatabaseId(TEST_DATABASE); + pipeline.apply( SpannerIO.readChangeStream() - .withSpannerConfig(getSpannerConfig()) + .withSpannerConfig(changeStreamConfig) .withChangeStreamName(TEST_CHANGE_STREAM) .withMetadataDatabase(TEST_DATABASE) .withMetadataTable(TEST_TABLE) @@ -311,11 +373,9 @@ public void testInvalidRecordReceived() { pipeline.run().waitUntilFinish(); } finally { thrown.expect(PipelineExecutionException.class); - // DatabaseClient.getDialect returns "DEADLINE_EXCEEDED: Operation did not complete in the " - // given time" even though we mocked it out. 
thrown.expectMessage("Field not found"); assertThat( - mockSpannerService.countRequestsOfType(ExecuteSqlRequest.class), Matchers.equalTo(0)); + mockSpannerService.countRequestsOfType(ExecuteSqlRequest.class), Matchers.greaterThan(0)); } } @@ -487,6 +547,41 @@ private void mockTableExists() { StatementResult.query(tableExistsStatement, tableExistsResultSet)); } + private ResultSet mockchangePartitionState( + Timestamp startTimestamp, Timestamp after3Seconds, String state) { + List tokens = new ArrayList<>(); + tokens.add("Parent0"); + Statement getPartitionStatement = + Statement.newBuilder( + "SELECT * FROM my-metadata-table WHERE PartitionToken IN UNNEST(@partitionTokens) AND State = @state") + .bind("partitionTokens") + .toStringArray(tokens) + .bind("state") + .to(state) + .build(); + ResultSet getPartitionResultSet = + ResultSet.newBuilder() + .addRows( + ListValue.newBuilder() + .addValues(Value.newBuilder().setStringValue("Parent0")) + .addValues(Value.newBuilder().setListValue(ListValue.newBuilder().build())) + .addValues(Value.newBuilder().setStringValue(startTimestamp.toString())) + .addValues(Value.newBuilder().setStringValue(after3Seconds.toString())) + .addValues(Value.newBuilder().setStringValue("500")) + .addValues(Value.newBuilder().setStringValue(State.CREATED.name())) + .addValues(Value.newBuilder().setStringValue(startTimestamp.toString())) + .addValues(Value.newBuilder().setStringValue(startTimestamp.toString())) + .addValues(Value.newBuilder().setNullValue(NullValue.NULL_VALUE).build()) + .addValues(Value.newBuilder().setNullValue(NullValue.NULL_VALUE).build()) + .addValues(Value.newBuilder().setNullValue(NullValue.NULL_VALUE).build()) + .build()) + .setMetadata(PARTITION_METADATA_RESULT_SET_METADATA) + .build(); + mockSpannerService.putStatementResult( + StatementResult.query(getPartitionStatement, getPartitionResultSet)); + return getPartitionResultSet; + } + private void mockGetDialect() { Statement determineDialectStatement = Statement.newBuilder( diff --git a/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcIO.java b/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcIO.java index 6e7ad865cc35f..e2a4a8e1072ae 100644 --- a/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcIO.java +++ b/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcIO.java @@ -360,6 +360,7 @@ public static ReadWithPartitions read return new AutoValue_JdbcIO_ReadWithPartitions.Builder() .setPartitionColumnType(partitioningColumnType) .setNumPartitions(DEFAULT_NUM_PARTITIONS) + .setFetchSize(DEFAULT_FETCH_SIZE) .setUseBeamSchema(false) .build(); } @@ -1195,6 +1196,9 @@ public abstract static class ReadWithPartitions @Pure abstract @Nullable String getPartitionColumn(); + @Pure + abstract int getFetchSize(); + @Pure abstract boolean getUseBeamSchema(); @@ -1233,6 +1237,8 @@ abstract Builder setDataSourceProviderFn( abstract Builder setUseBeamSchema(boolean useBeamSchema); + abstract Builder setFetchSize(int fetchSize); + abstract Builder setTable(String tableName); abstract Builder setPartitionColumnType( @@ -1282,6 +1288,12 @@ public ReadWithPartitions withPartitionColumn(String partit return toBuilder().setPartitionColumn(partitionColumn).build(); } + /** The number of rows to fetch from the database in the same {@link ResultSet} round-trip. 
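+ * Defaults to DEFAULT_FETCH_SIZE when unset. A sketch of typical usage
+ * (the table and column names here are illustrative):
+ *
+ * <pre>{@code
+ * JdbcIO.<Row>readWithPartitions()
+ *     .withDataSourceConfiguration(dataSourceConfiguration)
+ *     .withTable("my_table")
+ *     .withPartitionColumn("id")
+ *     .withRowOutput()
+ *     .withFetchSize(10_000);
+ * }</pre>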
*/ + public ReadWithPartitions withFetchSize(int fetchSize) { + checkArgument(fetchSize > 0, "fetchSize can not be less than 1"); + return toBuilder().setFetchSize(fetchSize).build(); + } + /** Data output type is {@link Row}, and schema is auto-inferred from the database. */ public ReadWithPartitions withRowOutput() { return toBuilder().setUseBeamSchema(true).build(); @@ -1357,7 +1369,8 @@ && getLowerBound() instanceof Comparable) { .withRowMapper( checkStateNotNull( JdbcUtil.JdbcReadWithPartitionsHelper.getPartitionsHelper( - getPartitionColumnType())))) + getPartitionColumnType()))) + .withFetchSize(getFetchSize())) .apply( MapElements.via( new SimpleFunction< @@ -1421,6 +1434,7 @@ public KV> apply( String.format( "select * from %1$s where %2$s >= ? and %2$s < ?", table, partitionColumn)) .withRowMapper(rowMapper) + .withFetchSize(getFetchSize()) .withParameterSetter( checkStateNotNull( JdbcUtil.JdbcReadWithPartitionsHelper.getPartitionsHelper( diff --git a/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcReadSchemaTransformProvider.java b/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcReadSchemaTransformProvider.java index dbf12f35024af..3b504b1a90d4a 100644 --- a/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcReadSchemaTransformProvider.java +++ b/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcReadSchemaTransformProvider.java @@ -38,6 +38,9 @@ * An implementation of {@link org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider} for * reading from JDBC connections using {@link org.apache.beam.sdk.io.jdbc.JdbcIO}. */ +@SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) +}) @AutoService(SchemaTransformProvider.class) public class JdbcReadSchemaTransformProvider extends TypedSchemaTransformProvider< @@ -80,6 +83,11 @@ protected JdbcIO.DataSourceConfiguration dataSourceConfiguration() { dsConfig = dsConfig.withConnectionInitSqls(initialSql); } + String driverJars = config.getDriverJars(); + if (driverJars != null) { + dsConfig = dsConfig.withDriverJars(config.getDriverJars()); + } + return dsConfig; } @@ -152,6 +160,9 @@ public abstract static class JdbcReadSchemaTransformConfiguration implements Ser @Nullable public abstract Boolean getOutputParallelization(); + @Nullable + public abstract String getDriverJars(); + public void validate() throws IllegalArgumentException { if (Strings.isNullOrEmpty(getDriverClassName())) { throw new IllegalArgumentException("JDBC Driver class name cannot be blank."); @@ -199,6 +210,8 @@ public abstract static class Builder { public abstract Builder setOutputParallelization(Boolean value); + public abstract Builder setDriverJars(String value); + public abstract JdbcReadSchemaTransformConfiguration build(); } } diff --git a/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcSchemaIOProvider.java b/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcSchemaIOProvider.java index c68b33a026077..4b5dc0d7e24a9 100644 --- a/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcSchemaIOProvider.java +++ b/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcSchemaIOProvider.java @@ -134,6 +134,12 @@ public PCollection expand(PBegin input) { if (partitions != null) { readRows = readRows.withNumPartitions(partitions); } + + @Nullable Short fetchSize = config.getInt16("fetchSize"); + if (fetchSize != null) { + readRows = readRows.withFetchSize(fetchSize); + } + return input.apply(readRows); } else { diff --git 
a/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcWriteSchemaTransformProvider.java b/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcWriteSchemaTransformProvider.java index cb9d79631ca8e..e9f67969626e7 100644 --- a/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcWriteSchemaTransformProvider.java +++ b/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcWriteSchemaTransformProvider.java @@ -29,6 +29,9 @@ import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; @@ -40,6 +43,9 @@ * An implementation of {@link org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider} for * writing to a JDBC connections using {@link org.apache.beam.sdk.io.jdbc.JdbcIO}. */ +@SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) +}) @AutoService(SchemaTransformProvider.class) public class JdbcWriteSchemaTransformProvider extends TypedSchemaTransformProvider< @@ -82,6 +88,11 @@ protected JdbcIO.DataSourceConfiguration dataSourceConfiguration() { dsConfig = dsConfig.withConnectionInitSqls(initialSql); } + String driverJars = config.getDriverJars(); + if (driverJars != null) { + dsConfig = dsConfig.withDriverJars(config.getDriverJars()); + } + return dsConfig; } @@ -92,7 +103,9 @@ protected String writeStatement(Schema schema) { } else { StringBuilder statement = new StringBuilder("INSERT INTO "); statement.append(config.getLocation()); - statement.append(" VALUES("); + statement.append(" ("); + statement.append(String.join(", ", schema.getFieldNames())); + statement.append(") VALUES("); for (int i = 0; i < schema.getFieldCount() - 1; i++) { statement.append("?, "); } @@ -101,19 +114,30 @@ protected String writeStatement(Schema schema) { } } + private static class NoOutputDoFn extends DoFn { + @ProcessElement + public void process(ProcessContext c) {} + } + @Override public PCollectionRowTuple expand(PCollectionRowTuple input) { - JdbcIO.Write writeRows = + JdbcIO.WriteVoid writeRows = JdbcIO.write() .withDataSourceConfiguration(dataSourceConfiguration()) .withStatement(writeStatement(input.get("input").getSchema())) - .withPreparedStatementSetter(new JdbcUtil.BeamRowPreparedStatementSetter()); + .withPreparedStatementSetter(new JdbcUtil.BeamRowPreparedStatementSetter()) + .withResults(); Boolean autosharding = config.getAutosharding(); if (autosharding != null && autosharding) { writeRows = writeRows.withAutoSharding(); } - input.get("input").apply(writeRows); - return PCollectionRowTuple.empty(input.getPipeline()); + PCollection postWrite = + input + .get("input") + .apply(writeRows) + .apply("post-write", ParDo.of(new NoOutputDoFn<>())) + .setRowSchema(Schema.of()); + return PCollectionRowTuple.of("post_write", postWrite); } } @@ -164,6 +188,9 @@ public abstract static class JdbcWriteSchemaTransformConfiguration implements Se @Nullable public abstract Boolean getAutosharding(); + @Nullable + public abstract String getDriverJars(); + public void validate() throws IllegalArgumentException { if (Strings.isNullOrEmpty(getDriverClassName())) { throw new 
IllegalArgumentException("JDBC Driver class name cannot be blank."); @@ -211,6 +238,8 @@ public abstract Builder setConnectionInitSql( public abstract Builder setAutosharding(Boolean value); + public abstract Builder setDriverJars(String value); + public abstract JdbcWriteSchemaTransformConfiguration build(); } } diff --git a/sdks/java/io/json/build.gradle b/sdks/java/io/json/build.gradle new file mode 100644 index 0000000000000..fe1f607a3696f --- /dev/null +++ b/sdks/java/io/json/build.gradle @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +plugins { id 'org.apache.beam.module' } +applyJavaNature( + automaticModuleName: 'org.apache.beam.sdk.io.json' +) + +description = "Apache Beam :: SDKs :: Java :: IO :: JSON" +ext.summary = "IO to read and write JSON files." + +dependencies { + implementation project(path: ":sdks:java:core", configuration: "shadow") + implementation library.java.vendored_guava_32_1_2_jre + implementation library.java.everit_json_schema + testImplementation project(path: ":sdks:java:core", configuration: "shadowTest") + testImplementation library.java.junit + testRuntimeOnly project(path: ":runners:direct-java", configuration: "shadow") + testImplementation project(path: ":sdks:java:io:common", configuration: "testRuntimeMigration") +} \ No newline at end of file diff --git a/sdks/java/io/json/src/main/java/org/apache/beam/sdk/io/json/JsonIO.java b/sdks/java/io/json/src/main/java/org/apache/beam/sdk/io/json/JsonIO.java new file mode 100644 index 0000000000000..3abb29a804272 --- /dev/null +++ b/sdks/java/io/json/src/main/java/org/apache/beam/sdk/io/json/JsonIO.java @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.json; + +import static org.apache.beam.sdk.values.TypeDescriptors.rows; +import static org.apache.beam.sdk.values.TypeDescriptors.strings; + +import com.google.auto.value.AutoValue; +import org.apache.beam.sdk.coders.RowCoder; +import org.apache.beam.sdk.io.Compression; +import org.apache.beam.sdk.io.FileBasedSink; +import org.apache.beam.sdk.io.FileIO; +import org.apache.beam.sdk.io.ShardNameTemplate; +import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.io.WriteFiles; +import org.apache.beam.sdk.io.WriteFilesResult; +import org.apache.beam.sdk.io.fs.ResourceId; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.utils.JsonUtils; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.Row; + +/** + * {@link PTransform}s for reading and writing JSON files. + * + *

<h3>Reading JSON files</h3>
+ *
+ * <p>Reading from JSON files is not yet implemented in Java. Please see
+ * https://github.com/apache/beam/issues/24552.
+ *
+ * <h3>Writing JSON files</h3>
+ *
+ * <p>To write a {@link PCollection} to one or more line-delimited JSON files, use {@link
+ * JsonIO.Write}, using {@link JsonIO#writeRows} or {@link JsonIO#write}. {@link JsonIO.Write}
+ * supports writing {@link Row} or custom Java types using an inferred {@link Schema}. Examples
+ * below show both scenarios. See the Beam Programming Guide on inferring schemas for more
+ * information on how to enable Beam to infer a {@link Schema} from a custom Java type.
+ *
+ * <h3>Example usage:</h3>
+ *
+ * <p>Suppose we have the following Transaction class annotated with
+ * {@code @DefaultSchema(JavaBeanSchema.class)} so that Beam can infer its {@link Schema}:
+ *
+ * <pre>{@code @DefaultSchema(JavaBeanSchema.class)
+ * public class Transaction {
+ *   public Transaction() { … }
+ *   public Long getTransactionId() { … }
+ *   public void setTransactionId(Long transactionId) { … }
+ *   public String getBank() { … }
+ *   public void setBank(String bank) { … }
+ *   public double getPurchaseAmount() { … }
+ *   public void setPurchaseAmount(double purchaseAmount) { … }
+ * }
+ * }</pre>
+ *
+ * <p>From a {@code PCollection}, {@link JsonIO.Write} can write one or many JSON files.
+ *
+ * <pre>{@code
+ * PCollection transactions = ...
+ * transactions.apply(JsonIO.write("path/to/folder/prefix"));
+ * }</pre>
+ *
+ * <p>The resulting JSON files will look like the following. By default, {@link JsonIO.Write}
+ * writes all fields in sorted order of the field names.
+ *
+ * <pre>{@code
+ * {"bank": "A", "purchaseAmount": 10.23, "transactionId": 12345}
+ * {"bank": "B", "purchaseAmount": 54.65, "transactionId": 54321}
+ * {"bank": "C", "purchaseAmount": 11.76, "transactionId": 98765}
+ * }</pre>
+ *
+ * <p>A {@link PCollection} of {@link Row}s works just like custom Java types illustrated above,
+ * except we use {@link JsonIO#writeRows} as shown below for the same {@code Transaction} class. We
+ * derive {@code Transaction}'s {@link Schema} using a {@link
+ * org.apache.beam.sdk.schemas.annotations.DefaultSchema.DefaultSchemaProvider}. Note that
+ * hard-coding the {@link Row}s below is for illustration purposes. Developers are instead
+ * encouraged to take advantage of {@link
+ * org.apache.beam.sdk.schemas.annotations.DefaultSchema.DefaultSchemaProvider#toRowFunction}.
+ *
+ * <pre>{@code
+ * DefaultSchemaProvider defaultSchemaProvider = new DefaultSchemaProvider();
+ * Schema schema = defaultSchemaProvider.schemaFor(TypeDescriptor.of(Transaction.class));
+ * PCollection transactions = pipeline.apply(Create.of(
+ *  Row
+ *    .withSchema(schema)
+ *    .withFieldValue("bank", "A")
+ *    .withFieldValue("purchaseAmount", 10.23)
+ *    .withFieldValue("transactionId", "12345")
+ *    .build(),
+ *  Row
+ *    .withSchema(schema)
+ *    .withFieldValue("bank", "B")
+ *    .withFieldValue("purchaseAmount", 54.65)
+ *    .withFieldValue("transactionId", "54321")
+ *    .build(),
+ *  Row
+ *    .withSchema(schema)
+ *    .withFieldValue("bank", "C")
+ *    .withFieldValue("purchaseAmount", 11.76)
+ *    .withFieldValue("transactionId", "98765")
+ *    .build()
+ * );
+ *
+ * transactions.apply(
+ *  JsonIO
+ *    .writeRows("gs://bucket/path/to/folder/prefix")
+ * );
+ * }</pre>
+ *
+ * <p>Writing the transactions {@link PCollection} of {@link Row}s would yield the following JSON
+ * file content.
+ *
+ * <pre>{@code
+ * {"bank": "A", "purchaseAmount": 10.23, "transactionId": 12345}
+ * {"bank": "B", "purchaseAmount": 54.65, "transactionId": 54321}
+ * {"bank": "C", "purchaseAmount": 11.76, "transactionId": 98765}
+ * }</pre>
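+ *
+ * <p>{@link JsonIO.Write} also exposes the usual file-writing settings; a short sketch (the
+ * values shown are illustrative, not defaults):
+ *
+ * <pre>{@code
+ * transactions.apply(JsonIO.write("path/to/folder/prefix")
+ *     .withNumShards(1)                     // force a single output shard
+ *     .withCompression(Compression.GZIP));  // gzip each generated file
+ * }</pre>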
+ */ +public class JsonIO { + static final String DEFAULT_FILENAME_SUFFIX = ".json"; + + /** Instantiates a {@link Write} for writing user types in {@link JSONFormat} format. */ + public static Write write(String to) { + return new AutoValue_JsonIO_Write.Builder() + .setTextIOWrite(createDefaultTextIOWrite(to)) + .build(); + } + + /** Instantiates a {@link Write} for writing {@link Row}s in {@link JSONFormat} format. */ + public static Write writeRows(String to) { + return new AutoValue_JsonIO_Write.Builder() + .setTextIOWrite(createDefaultTextIOWrite(to)) + .build(); + } + + /** {@link PTransform} for writing JSON files. */ + @AutoValue + public abstract static class Write + extends PTransform, WriteFilesResult> { + + /** Specifies the {@link Compression} of all generated shard files. */ + public Write withCompression(Compression compression) { + return toBuilder().setTextIOWrite(getTextIOWrite().withCompression(compression)).build(); + } + + /** Whether to skip the spilling of data. See {@link WriteFiles#withNoSpilling}. */ + public Write withNoSpilling() { + return toBuilder().setTextIOWrite(getTextIOWrite().withNoSpilling()).build(); + } + + /** + * Specifies to use a given fixed number of shards per window. See {@link + * TextIO.Write#withNumShards}. + */ + public Write withNumShards(Integer numShards) { + return toBuilder().setTextIOWrite(getTextIOWrite().withNumShards(numShards)).build(); + } + + /** + * Forces a single file as output and empty shard name template. See {@link + * TextIO.Write#withoutSharding}. + */ + public Write withoutSharding() { + return toBuilder().setTextIOWrite(getTextIOWrite().withoutSharding()).build(); + } + + /** + * Uses the given {@link ShardNameTemplate} for naming output files. See {@link + * TextIO.Write#withShardNameTemplate}. + */ + public Write withShardTemplate(String shardTemplate) { + return toBuilder() + .setTextIOWrite(getTextIOWrite().withShardNameTemplate(shardTemplate)) + .build(); + } + + /** Configures the filename suffix for written files. See {@link TextIO.Write#withSuffix}. */ + public Write withSuffix(String suffix) { + return toBuilder().setTextIOWrite(getTextIOWrite().withSuffix(suffix)).build(); + } + + /** + * Set the base directory used to generate temporary files. See {@link + * TextIO.Write#withTempDirectory}. + */ + public Write withTempDirectory(ResourceId tempDirectory) { + return toBuilder().setTextIOWrite(getTextIOWrite().withTempDirectory(tempDirectory)).build(); + } + + /** + * Preserves windowing of input elements and writes them to files based on the element's window. + * See {@link TextIO.Write#withWindowedWrites}. + */ + public Write withWindowedWrites() { + return toBuilder().setTextIOWrite(getTextIOWrite().withWindowedWrites()).build(); + } + + /** + * Returns a transform for writing to text files like this one but that has the given {@link + * FileBasedSink.WritableByteChannelFactory} to be used by the {@link FileBasedSink} during + * output. See {@link TextIO.Write#withWritableByteChannelFactory}. + */ + public Write withWritableByteChannelFactory( + FileBasedSink.WritableByteChannelFactory writableByteChannelFactory) { + return toBuilder() + .setTextIOWrite( + getTextIOWrite().withWritableByteChannelFactory(writableByteChannelFactory)) + .build(); + } + + /** The underlying {@link FileIO.Write} that writes converted input to JSON formatted output. 
*/ + abstract TextIO.Write getTextIOWrite(); + + abstract Builder toBuilder(); + + @AutoValue.Builder + abstract static class Builder { + + /** + * The underlying {@link FileIO.Write} that writes converted input to JSON formatted output. + */ + abstract Builder setTextIOWrite(TextIO.Write value); + + abstract Write autoBuild(); + + final Write build() { + return autoBuild(); + } + } + + @Override + public WriteFilesResult expand(PCollection input) { + if (!input.hasSchema()) { + throw new IllegalArgumentException( + String.format( + "%s requires an input Schema. Note that only Row or user classes are supported. Consider using TextIO or FileIO directly when writing primitive types", + Write.class.getName())); + } + + Schema schema = input.getSchema(); + + RowCoder rowCoder = RowCoder.of(schema); + + PCollection rows = + input + .apply("To Rows", MapElements.into(rows()).via(input.getToRowFunction())) + .setCoder(rowCoder); + + SerializableFunction toJsonFn = + JsonUtils.getRowToJsonStringsFunction(input.getSchema()); + + PCollection json = rows.apply("To JSON", MapElements.into(strings()).via(toJsonFn)); + + return json.apply("Write JSON", getTextIOWrite().withOutputFilenames()); + } + } + + private static TextIO.Write createDefaultTextIOWrite(String to) { + return TextIO.write().to(to).withSuffix(DEFAULT_FILENAME_SUFFIX); + } +} diff --git a/sdks/java/io/json/src/main/java/org/apache/beam/sdk/io/json/package-info.java b/sdks/java/io/json/src/main/java/org/apache/beam/sdk/io/json/package-info.java new file mode 100644 index 0000000000000..1ee1918357135 --- /dev/null +++ b/sdks/java/io/json/src/main/java/org/apache/beam/sdk/io/json/package-info.java @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Transforms for reading and writing JSON files. */ +package org.apache.beam.sdk.io.json; diff --git a/sdks/java/io/json/src/main/java/org/apache/beam/sdk/io/json/providers/JsonWriteTransformProvider.java b/sdks/java/io/json/src/main/java/org/apache/beam/sdk/io/json/providers/JsonWriteTransformProvider.java new file mode 100644 index 0000000000000..9e030821e5ca1 --- /dev/null +++ b/sdks/java/io/json/src/main/java/org/apache/beam/sdk/io/json/providers/JsonWriteTransformProvider.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.json.providers; + +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; + +import com.google.auto.service.AutoService; +import com.google.auto.value.AutoValue; +import java.util.Collections; +import java.util.List; +import org.apache.beam.sdk.io.WriteFilesResult; +import org.apache.beam.sdk.io.json.JsonIO; +import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.Schema.Field; +import org.apache.beam.sdk.schemas.Schema.FieldType; +import org.apache.beam.sdk.schemas.annotations.DefaultSchema; +import org.apache.beam.sdk.schemas.annotations.SchemaFieldDescription; +import org.apache.beam.sdk.schemas.transforms.SchemaTransform; +import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; +import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.values.PCollectionRowTuple; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TypeDescriptors; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; + +/** + * An implementation of {@link TypedSchemaTransformProvider} for {@link JsonIO#write}. + * + *
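+ * <p>Configuration is expressed as a {@code JsonWriteConfiguration}; a minimal sketch (the
+ * path value is a placeholder):
+ *
+ * <pre>{@code
+ * JsonWriteConfiguration config = JsonWriteConfiguration.builder()
+ *     .setPath("gs://bucket/path/to/output")  // required; validated as non-empty
+ *     .build();
+ * }</pre>
+ *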

Internal only: This class is actively being worked on, and it will likely change. We + * provide no backwards compatibility guarantees, and it should not be implemented outside the Beam + * repository. + */ +@SuppressWarnings({ + "nullness" // TODO(https://github.com/apache/beam/issues/20497) +}) +@AutoService(SchemaTransformProvider.class) +public class JsonWriteTransformProvider + extends TypedSchemaTransformProvider { + private static final String INPUT_ROWS_TAG = "input"; + private static final String WRITE_RESULTS = "output"; + + @Override + protected Class configurationClass() { + return JsonWriteConfiguration.class; + } + + @Override + protected SchemaTransform from(JsonWriteConfiguration configuration) { + return new JsonWriteTransform(configuration); + } + + @Override + public String identifier() { + return String.format("beam:schematransform:org.apache.beam:json_write:v1"); + } + + @Override + public List inputCollectionNames() { + return Collections.singletonList(INPUT_ROWS_TAG); + } + + @Override + public List outputCollectionNames() { + return Collections.singletonList(WRITE_RESULTS); + } + + /** Configuration for writing to BigQuery with Storage Write API. */ + @DefaultSchema(AutoValueSchema.class) + @AutoValue + public abstract static class JsonWriteConfiguration { + + public void validate() { + checkArgument( + !Strings.isNullOrEmpty(this.getPath()), "Path for a JSON Write must be specified."); + } + + public static Builder builder() { + return new AutoValue_JsonWriteTransformProvider_JsonWriteConfiguration.Builder(); + } + + @SchemaFieldDescription("The file path to write to.") + public abstract String getPath(); + + /** Builder for {@link JsonWriteConfiguration}. */ + @AutoValue.Builder + public abstract static class Builder { + + public abstract Builder setPath(String path); + + /** Builds a {@link JsonWriteConfiguration} instance. */ + public abstract JsonWriteConfiguration build(); + } + } + + /** A {@link SchemaTransform} for {@link JsonIO#write}. */ + protected static class JsonWriteTransform extends SchemaTransform { + + private final JsonWriteConfiguration configuration; + + JsonWriteTransform(JsonWriteConfiguration configuration) { + configuration.validate(); + this.configuration = configuration; + } + + @Override + public PCollectionRowTuple expand(PCollectionRowTuple input) { + WriteFilesResult result = + input.get(INPUT_ROWS_TAG).apply(JsonIO.writeRows(configuration.getPath()).withSuffix("")); + Schema outputSchema = Schema.of(Field.of("filename", FieldType.STRING)); + return PCollectionRowTuple.of( + WRITE_RESULTS, + result + .getPerDestinationOutputFilenames() + .apply( + "Collect filenames", + MapElements.into(TypeDescriptors.rows()) + .via( + (destinationAndRow) -> + Row.withSchema(outputSchema) + .withFieldValue("filename", destinationAndRow.getValue()) + .build())) + .setRowSchema(outputSchema)); + } + } +} diff --git a/sdks/java/io/json/src/main/java/org/apache/beam/sdk/io/json/providers/package-info.java b/sdks/java/io/json/src/main/java/org/apache/beam/sdk/io/json/providers/package-info.java new file mode 100644 index 0000000000000..312454f8733b9 --- /dev/null +++ b/sdks/java/io/json/src/main/java/org/apache/beam/sdk/io/json/providers/package-info.java @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Transforms for reading and writing JSON files. */ +package org.apache.beam.sdk.io.json.providers; diff --git a/sdks/java/io/json/src/test/java/org/apache/beam/sdk/io/json/JsonIOWriteTest.java b/sdks/java/io/json/src/test/java/org/apache/beam/sdk/io/json/JsonIOWriteTest.java new file mode 100644 index 0000000000000..71fdcd6b3d94d --- /dev/null +++ b/sdks/java/io/json/src/test/java/org/apache/beam/sdk/io/json/JsonIOWriteTest.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.json; + +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.allPrimitiveDataTypes; +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; + +import java.io.File; +import java.io.IOException; +import java.math.BigDecimal; +import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.AllPrimitiveDataTypes; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.SerializableMatcher; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.values.PCollection; +import org.hamcrest.BaseMatcher; +import org.hamcrest.Description; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link JsonIO.Write}. 
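+ * <p>These tests run on the direct runner; a typical local invocation (assuming the standard
+ * Beam Gradle setup) is {@code ./gradlew :sdks:java:io:json:test}.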
*/ +@RunWith(JUnit4.class) +public class JsonIOWriteTest { + @Rule public TestPipeline writePipeline = TestPipeline.create(); + + @Rule public TestPipeline readPipeline = TestPipeline.create(); + + @Rule + public TestPipeline errorPipeline = TestPipeline.create().enableAbandonedNodeEnforcement(false); + + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); + + @Test + public void writesUserDefinedTypes() { + File folder = + createFolder(AllPrimitiveDataTypes.class.getSimpleName(), "writesUserDefinedTypes"); + + PCollection input = + writePipeline.apply( + Create.of( + allPrimitiveDataTypes(false, BigDecimal.TEN, 1.0, 1.0f, 1, 1L, "a"), + allPrimitiveDataTypes( + false, BigDecimal.TEN.add(BigDecimal.TEN), 2.0, 2.0f, 2, 2L, "b"), + allPrimitiveDataTypes( + false, + BigDecimal.TEN.add(BigDecimal.TEN).add(BigDecimal.TEN), + 3.0, + 3.0f, + 3, + 3L, + "c"))); + + input.apply(JsonIO.write(toFilenamePrefix(folder)).withNumShards(1)); + + writePipeline.run().waitUntilFinish(); + + PAssert.that(readPipeline.apply(TextIO.read().from(toFilenamePrefix(folder) + "*"))) + .containsInAnyOrder( + containsAll( + "\"aDouble\":1.0", + "\"aFloat\":1.0", + "\"aLong\":1", + "\"aString\":\"a\"", + "\"anInteger\":1", + "\"aDecimal\":10", + "\"aBoolean\":false"), + containsAll( + "\"aDouble\":2.0", + "\"aFloat\":2.0", + "\"aLong\":2", + "\"aString\":\"b\"", + "\"anInteger\":2", + "\"aDecimal\":20", + "\"aBoolean\":false"), + containsAll( + "\"aDouble\":3.0", + "\"aFloat\":3.0", + "\"aLong\":3", + "\"aString\":\"c\"", + "\"anInteger\":3", + "\"aDecimal\":30", + "\"aBoolean\":false")); + + readPipeline.run(); + } + + private static SerializableMatcher containsAll(String... needles) { + class Matcher extends BaseMatcher implements SerializableMatcher { + @Override + public boolean matches(Object item) { + if (!(item instanceof String)) { + return false; + } + + String haystack = (String) item; + for (String needle : needles) { + if (!haystack.contains(needle)) { + return false; + } + } + return true; + } + + @Override + public void describeTo(Description description) { + description.appendText("Contains all of: "); + description.appendValueList("[", ",", "]", needles); + } + } + return new Matcher(); + } + + private static String toFilenamePrefix(File folder) { + checkArgument(folder.isDirectory()); + return folder.getAbsolutePath() + "/out"; + } + + private File createFolder(String... paths) { + try { + return tempFolder.newFolder(paths); + } catch (IOException e) { + throw new IllegalStateException(e); + } + } +} diff --git a/sdks/java/io/kafka/README.md b/sdks/java/io/kafka/README.md index 4ecf095bec5bc..b137e0b240a9c 100644 --- a/sdks/java/io/kafka/README.md +++ b/sdks/java/io/kafka/README.md @@ -47,3 +47,13 @@ complete list. The documentation is maintained in JavaDoc for KafkaIO class. It includes usage examples and primary concepts. - [KafkaIO.java](src/main/java/org/apache/beam/sdk/io/kafka/KafkaIO.java) + +### Protobuf tests +This recreates the proto descriptor set included in this resource directory. 
+ +```bash +protoc \ + -Isdks/java/io/kafka/src/test/resources/ \ + --descriptor_set_out=sdks/java/io/kafka/src/test/resources/proto_byte/file_descriptor/proto_byte_utils.pb \ + sdks/java/io/kafka/src/test/resources/proto_byte/proto_byte_utils.proto +``` \ No newline at end of file diff --git a/sdks/java/io/kafka/build.gradle b/sdks/java/io/kafka/build.gradle index 63eba6edcd8f5..dc190ef9d8fd1 100644 --- a/sdks/java/io/kafka/build.gradle +++ b/sdks/java/io/kafka/build.gradle @@ -51,6 +51,7 @@ dependencies { permitUnusedDeclared library.java.jackson_dataformat_csv implementation project(path: ":sdks:java:core", configuration: "shadow") implementation project(":sdks:java:extensions:avro") + implementation project(":sdks:java:extensions:protobuf") implementation project(":runners:core-construction-java") implementation project(":sdks:java:expansion-service") permitUnusedDeclared project(":sdks:java:expansion-service") // BEAM-11761 @@ -82,7 +83,7 @@ dependencies { provided library.java.everit_json_schema testImplementation project(path: ":sdks:java:core", configuration: "shadowTest") testImplementation project(":sdks:java:io:synthetic") - testImplementation project(":sdks:java:extensions:avro") + testImplementation project(path: ":sdks:java:extensions:avro", configuration: "testRuntimeMigration") testImplementation project(path: ":sdks:java:io:common", configuration: "testRuntimeMigration") testImplementation project(path: ":sdks:java:testing:test-utils", configuration: "testRuntimeMigration") // For testing Cross-language transforms diff --git a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/CheckStopReadingFn.java b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/CheckStopReadingFn.java new file mode 100644 index 0000000000000..20192508491de --- /dev/null +++ b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/CheckStopReadingFn.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.kafka; + +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.kafka.common.TopicPartition; + +public interface CheckStopReadingFn extends SerializableFunction { + default void setup() {} + + default void teardown() {} +} diff --git a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/CheckStopReadingFnWrapper.java b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/CheckStopReadingFnWrapper.java new file mode 100644 index 0000000000000..e4012e9650971 --- /dev/null +++ b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/CheckStopReadingFnWrapper.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.kafka; + +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.kafka.common.TopicPartition; +import org.checkerframework.checker.nullness.qual.Nullable; + +public class CheckStopReadingFnWrapper implements CheckStopReadingFn { + private final SerializableFunction serializableFunction; + + private CheckStopReadingFnWrapper( + SerializableFunction serializableFunction) { + this.serializableFunction = serializableFunction; + } + + public static @Nullable CheckStopReadingFnWrapper of( + @Nullable SerializableFunction serializableFunction) { + return serializableFunction != null + ? new CheckStopReadingFnWrapper(serializableFunction) + : null; + } + + @Override + public Boolean apply(TopicPartition input) { + return serializableFunction.apply(input); + } +} diff --git a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIO.java b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIO.java index 7275986de8b5b..26f6c3448801b 100644 --- a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIO.java +++ b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaIO.java @@ -685,7 +685,7 @@ public abstract static class Read abstract @Nullable DeserializerProvider getValueDeserializerProvider(); @Pure - abstract @Nullable SerializableFunction getCheckStopReadingFn(); + abstract @Nullable CheckStopReadingFn getCheckStopReadingFn(); abstract Builder toBuilder(); @@ -733,8 +733,12 @@ abstract Builder setKeyDeserializerProvider( abstract Builder setValueDeserializerProvider( DeserializerProvider deserializerProvider); - abstract Builder setCheckStopReadingFn( - SerializableFunction checkStopReadingFn); + abstract Builder setCheckStopReadingFn(@Nullable CheckStopReadingFn checkStopReadingFn); + + Builder setCheckStopReadingFn( + @Nullable SerializableFunction checkStopReadingFn) { + return setCheckStopReadingFn(CheckStopReadingFnWrapper.of(checkStopReadingFn)); + } abstract Read build(); @@ -1269,13 +1273,23 @@ public Read withConsumerConfigUpdates(Map configUpdates) { return toBuilder().setConsumerConfig(config).build(); } + /** + * A custom {@link CheckStopReadingFn} that determines whether the {@link ReadFromKafkaDoFn} + * should stop reading from the given {@link TopicPartition}. + */ + public Read withCheckStopReadingFn(CheckStopReadingFn checkStopReadingFn) { + return toBuilder().setCheckStopReadingFn(checkStopReadingFn).build(); + } + /** * A custom {@link SerializableFunction} that determines whether the {@link ReadFromKafkaDoFn} * should stop reading from the given {@link TopicPartition}. 
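 *
 * <p>A minimal sketch of the new interface form (the class name is illustrative); unlike a
 * plain {@link SerializableFunction}, a {@link CheckStopReadingFn} may also override the
 * default {@code setup()} and {@code teardown()} hooks:
 *
 * <pre>{@code
 * class NeverStop implements CheckStopReadingFn {
 *   @Override
 *   public Boolean apply(TopicPartition topicPartition) {
 *     return false; // never ask the reader to stop this partition
 *   }
 * }
 * read = read.withCheckStopReadingFn(new NeverStop());
 * }</pre>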
*/ public Read withCheckStopReadingFn( SerializableFunction checkStopReadingFn) { - return toBuilder().setCheckStopReadingFn(checkStopReadingFn).build(); + return toBuilder() + .setCheckStopReadingFn(CheckStopReadingFnWrapper.of(checkStopReadingFn)) + .build(); } /** Returns a {@link PTransform} for PCollection of {@link KV}, dropping Kafka metatdata. */ @@ -1759,10 +1773,10 @@ public void populateDisplayData(DisplayData.Builder builder) { static class KafkaHeader { String key; - byte[] value; + byte @Nullable [] value; @SchemaCreate - public KafkaHeader(String key, byte[] value) { + public KafkaHeader(String key, byte @Nullable [] value) { this.key = key; this.value = value; } @@ -1947,7 +1961,7 @@ public abstract static class ReadSourceDescriptors getConsumerFactoryFn(); @Pure - abstract @Nullable SerializableFunction getCheckStopReadingFn(); + abstract @Nullable CheckStopReadingFn getCheckStopReadingFn(); @Pure abstract @Nullable SerializableFunction, Instant> @@ -1978,7 +1992,12 @@ abstract ReadSourceDescriptors.Builder setConsumerFactoryFn( SerializableFunction, Consumer> consumerFactoryFn); abstract ReadSourceDescriptors.Builder setCheckStopReadingFn( - @Nullable SerializableFunction checkStopReadingFn); + @Nullable CheckStopReadingFn checkStopReadingFn); + + ReadSourceDescriptors.Builder setCheckStopReadingFn( + @Nullable SerializableFunction checkStopReadingFn) { + return setCheckStopReadingFn(CheckStopReadingFnWrapper.of(checkStopReadingFn)); + } abstract ReadSourceDescriptors.Builder setKeyDeserializerProvider( @Nullable DeserializerProvider deserializerProvider); @@ -2096,13 +2115,24 @@ public ReadSourceDescriptors withConsumerFactoryFn( return toBuilder().setConsumerFactoryFn(consumerFactoryFn).build(); } + /** + * A custom {@link CheckStopReadingFn} that determines whether the {@link ReadFromKafkaDoFn} + * should stop reading from the given {@link TopicPartition}. + */ + public ReadSourceDescriptors withCheckStopReadingFn( + @Nullable CheckStopReadingFn checkStopReadingFn) { + return toBuilder().setCheckStopReadingFn(checkStopReadingFn).build(); + } + /** * A custom {@link SerializableFunction} that determines whether the {@link ReadFromKafkaDoFn} * should stop reading from the given {@link TopicPartition}. 
*/ public ReadSourceDescriptors withCheckStopReadingFn( @Nullable SerializableFunction checkStopReadingFn) { - return toBuilder().setCheckStopReadingFn(checkStopReadingFn).build(); + return toBuilder() + .setCheckStopReadingFn(CheckStopReadingFnWrapper.of(checkStopReadingFn)) + .build(); } /** diff --git a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaReadSchemaTransformConfiguration.java b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaReadSchemaTransformConfiguration.java index f7e0915806803..2fa365b1c7f30 100644 --- a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaReadSchemaTransformConfiguration.java +++ b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaReadSchemaTransformConfiguration.java @@ -24,6 +24,7 @@ import org.apache.beam.sdk.schemas.AutoValueSchema; import org.apache.beam.sdk.schemas.annotations.DefaultSchema; import org.apache.beam.sdk.schemas.annotations.SchemaFieldDescription; +import org.apache.beam.sdk.schemas.transforms.providers.ErrorHandling; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Sets; /** @@ -39,7 +40,7 @@ public abstract class KafkaReadSchemaTransformConfiguration { public static final Set VALID_START_OFFSET_VALUES = Sets.newHashSet("earliest", "latest"); - public static final String VALID_FORMATS_STR = "AVRO,JSON"; + public static final String VALID_FORMATS_STR = "RAW,AVRO,JSON,PROTO"; public static final Set VALID_DATA_FORMATS = Sets.newHashSet(VALID_FORMATS_STR.split(",")); @@ -86,6 +87,18 @@ public static Builder builder() { @Nullable public abstract String getSchema(); + @SchemaFieldDescription( + "The path to the Protocol Buffer File Descriptor Set file. This file is used for schema" + + " definition and message serialization.") + @Nullable + public abstract String getFileDescriptorPath(); + + @SchemaFieldDescription( + "The name of the Protocol Buffer message to be used for schema" + + " extraction and data conversion.") + @Nullable + public abstract String getMessageName(); + @SchemaFieldDescription( "What to do when there is no initial offset in Kafka or if the current offset" + " does not exist any more on the server. (1) earliest: automatically reset the offset to the earliest" @@ -105,6 +118,10 @@ public static Builder builder() { /** Sets the topic from which to read. */ public abstract String getTopic(); + @SchemaFieldDescription("This option specifies whether and where to output unwritable rows.") + @Nullable + public abstract ErrorHandling getErrorHandling(); + /** Builder for the {@link KafkaReadSchemaTransformConfiguration}. */ @AutoValue.Builder public abstract static class Builder { @@ -118,6 +135,10 @@ public abstract static class Builder { public abstract Builder setSchema(String schema); + public abstract Builder setFileDescriptorPath(String fileDescriptorPath); + + public abstract Builder setMessageName(String messageName); + public abstract Builder setFormat(String format); public abstract Builder setAutoOffsetResetConfig(String startOffset); @@ -127,6 +148,8 @@ public abstract static class Builder { /** Sets the topic from which to read. */ public abstract Builder setTopic(String value); + public abstract Builder setErrorHandling(ErrorHandling errorHandling); + /** Builds a {@link KafkaReadSchemaTransformConfiguration} instance. 
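 *
 * <p>For example, a PROTO-format configuration (a sketch; the values are placeholders that
 * mirror the provider tests):
 *
 * <pre>{@code
 * KafkaReadSchemaTransformConfiguration.builder()
 *     .setTopic("my-topic")
 *     .setBootstrapServers("host1:9092")
 *     .setFormat("PROTO")
 *     .setFileDescriptorPath("/path/to/file_descriptor_set.pb")
 *     .setMessageName("MyMessage")
 *     .build();
 * }</pre>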
*/ public abstract KafkaReadSchemaTransformConfiguration build(); } diff --git a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaReadSchemaTransformProvider.java b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaReadSchemaTransformProvider.java index 0c091bf9ba847..996976ee9a758 100644 --- a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaReadSchemaTransformProvider.java +++ b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaReadSchemaTransformProvider.java @@ -17,6 +17,8 @@ */ package org.apache.beam.sdk.io.kafka; +import static org.apache.beam.sdk.io.kafka.KafkaReadSchemaTransformConfiguration.VALID_DATA_FORMATS; + import com.google.auto.service.AutoService; import java.io.FileOutputStream; import java.io.IOException; @@ -35,6 +37,7 @@ import org.apache.avro.generic.GenericRecord; import org.apache.beam.sdk.extensions.avro.coders.AvroCoder; import org.apache.beam.sdk.extensions.avro.schemas.utils.AvroUtils; +import org.apache.beam.sdk.extensions.protobuf.ProtoByteUtils; import org.apache.beam.sdk.io.FileSystems; import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Metrics; @@ -43,12 +46,12 @@ import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; +import org.apache.beam.sdk.schemas.transforms.providers.ErrorHandling; import org.apache.beam.sdk.schemas.utils.JsonUtils; import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.DoFn.FinishBundle; -import org.apache.beam.sdk.transforms.DoFn.ProcessElement; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.transforms.SimpleFunction; import org.apache.beam.sdk.transforms.Values; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionRowTuple; @@ -76,8 +79,6 @@ public class KafkaReadSchemaTransformProvider public static final TupleTag OUTPUT_TAG = new TupleTag() {}; public static final TupleTag ERROR_TAG = new TupleTag() {}; - public static final Schema ERROR_SCHEMA = - Schema.builder().addStringField("error").addNullableByteArrayField("row").build(); final Boolean isTest; final Integer testTimeoutSecs; @@ -112,18 +113,49 @@ protected SchemaTransform from(KafkaReadSchemaTransformConfiguration configurati consumerConfigs.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, 100); consumerConfigs.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, autoOffsetReset); - if (inputSchema != null && !inputSchema.isEmpty()) { - assert Strings.isNullOrEmpty(configuration.getConfluentSchemaRegistryUrl()) - : "To read from Kafka, a schema must be provided directly or though Confluent " - + "Schema Registry, but not both."; - final Schema beamSchema = - Objects.equals(configuration.getFormat(), "JSON") - ? JsonUtils.beamSchemaFromJsonSchema(inputSchema) - : AvroUtils.toBeamSchema(new org.apache.avro.Schema.Parser().parse(inputSchema)); - SerializableFunction valueMapper = - Objects.equals(configuration.getFormat(), "JSON") - ? 
JsonUtils.getJsonBytesToRowFunction(beamSchema) - : AvroUtils.getAvroBytesToRowFunction(beamSchema); + String format = configuration.getFormat(); + boolean handleErrors = ErrorHandling.hasOutput(configuration.getErrorHandling()); + String descriptorPath = configuration.getFileDescriptorPath(); + String messageName = configuration.getMessageName(); + + if ((format != null && VALID_DATA_FORMATS.contains(format)) + || (!Strings.isNullOrEmpty(inputSchema) && !Objects.equals(format, "RAW")) + || (Objects.equals(format, "PROTO") + && !Strings.isNullOrEmpty(descriptorPath) + && !Strings.isNullOrEmpty(messageName))) { + SerializableFunction valueMapper; + Schema beamSchema; + if (format != null && format.equals("RAW")) { + if (inputSchema != null) { + throw new IllegalArgumentException( + "To read from Kafka in RAW format, you can't provide a schema."); + } + beamSchema = Schema.builder().addField("payload", Schema.FieldType.BYTES).build(); + valueMapper = getRawBytesToRowFunction(beamSchema); + } else if (format != null && format.equals("PROTO")) { + if (descriptorPath == null || messageName == null) { + throw new IllegalArgumentException( + "Expecting both descriptorPath and messageName to be non-null."); + } + valueMapper = ProtoByteUtils.getProtoBytesToRowFunction(descriptorPath, messageName); + beamSchema = ProtoByteUtils.getBeamSchemaFromProto(descriptorPath, messageName); + } else { + assert Strings.isNullOrEmpty(configuration.getConfluentSchemaRegistryUrl()) + : "To read from Kafka, a schema must be provided directly or though Confluent " + + "Schema Registry, but not both."; + if (inputSchema == null) { + throw new IllegalArgumentException( + "To read from Kafka in JSON or AVRO format, you must provide a schema."); + } + beamSchema = + Objects.equals(format, "JSON") + ? JsonUtils.beamSchemaFromJsonSchema(inputSchema) + : AvroUtils.toBeamSchema(new org.apache.avro.Schema.Parser().parse(inputSchema)); + valueMapper = + Objects.equals(format, "JSON") + ? 
JsonUtils.getJsonBytesToRowFunction(beamSchema) + : AvroUtils.getAvroBytesToRowFunction(beamSchema); + } return new SchemaTransform() { @Override public PCollectionRowTuple expand(PCollectionRowTuple input) { @@ -140,16 +172,27 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { PCollection kafkaValues = input.getPipeline().apply(kafkaRead.withoutMetadata()).apply(Values.create()); + Schema errorSchema = ErrorHandling.errorSchemaBytes(); PCollectionTuple outputTuple = kafkaValues.apply( - ParDo.of(new ErrorFn("Kafka-read-error-counter", valueMapper)) + ParDo.of( + new ErrorFn( + "Kafka-read-error-counter", valueMapper, errorSchema, handleErrors)) .withOutputTags(OUTPUT_TAG, TupleTagList.of(ERROR_TAG))); - return PCollectionRowTuple.of( - "output", - outputTuple.get(OUTPUT_TAG).setRowSchema(beamSchema), - "errors", - outputTuple.get(ERROR_TAG).setRowSchema(ERROR_SCHEMA)); + PCollectionRowTuple outputRows = + PCollectionRowTuple.of( + "output", outputTuple.get(OUTPUT_TAG).setRowSchema(beamSchema)); + + PCollection errorOutput = outputTuple.get(ERROR_TAG).setRowSchema(errorSchema); + if (handleErrors) { + ErrorHandling errorHandling = configuration.getErrorHandling(); + if (errorHandling == null) { + throw new IllegalArgumentException("You must specify an error handling option."); + } + outputRows = outputRows.and(errorHandling.getOutput(), errorOutput); + } + return outputRows; } }; } else { @@ -193,6 +236,15 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { } } + public static SerializableFunction getRawBytesToRowFunction(Schema rawSchema) { + return new SimpleFunction() { + @Override + public Row apply(byte[] input) { + return Row.withSchema(rawSchema).addValue(input).build(); + } + }; + } + @Override public String identifier() { return "beam:schematransform:org.apache.beam:kafka_read:v1"; @@ -209,25 +261,38 @@ public List outputCollectionNames() { } public static class ErrorFn extends DoFn { - private SerializableFunction valueMapper; - private Counter errorCounter; + private final SerializableFunction valueMapper; + private final Counter errorCounter; private Long errorsInBundle = 0L; + private final boolean handleErrors; + private final Schema errorSchema; - public ErrorFn(String name, SerializableFunction valueMapper) { + public ErrorFn( + String name, + SerializableFunction valueMapper, + Schema errorSchema, + boolean handleErrors) { this.errorCounter = Metrics.counter(KafkaReadSchemaTransformProvider.class, name); this.valueMapper = valueMapper; + this.handleErrors = handleErrors; + this.errorSchema = errorSchema; } @ProcessElement public void process(@DoFn.Element byte[] msg, MultiOutputReceiver receiver) { + Row mappedRow = null; try { - receiver.get(OUTPUT_TAG).output(valueMapper.apply(msg)); + mappedRow = valueMapper.apply(msg); } catch (Exception e) { + if (!handleErrors) { + throw new RuntimeException(e); + } errorsInBundle += 1; LOG.warn("Error while parsing the element", e); - receiver - .get(ERROR_TAG) - .output(Row.withSchema(ERROR_SCHEMA).addValues(e.toString(), msg).build()); + receiver.get(ERROR_TAG).output(ErrorHandling.errorRecord(errorSchema, msg, e)); + } + if (mappedRow != null) { + receiver.get(OUTPUT_TAG).output(mappedRow); } } diff --git a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaRecordCoder.java b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaRecordCoder.java index 2cb1efe657041..dbb3a053099cc 100644 --- 
a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaRecordCoder.java +++ b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaRecordCoder.java @@ -27,6 +27,7 @@ import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.IterableCoder; import org.apache.beam.sdk.coders.KvCoder; +import org.apache.beam.sdk.coders.NullableCoder; import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.coders.StructuredCoder; import org.apache.beam.sdk.coders.VarIntCoder; @@ -44,7 +45,7 @@ public class KafkaRecordCoder extends StructuredCoder> { private static final Coder longCoder = VarLongCoder.of(); private static final Coder intCoder = VarIntCoder.of(); private static final Coder>> headerCoder = - IterableCoder.of(KvCoder.of(stringCoder, ByteArrayCoder.of())); + IterableCoder.of(KvCoder.of(stringCoder, NullableCoder.of(ByteArrayCoder.of()))); private final KvCoder kvCoder; diff --git a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaWriteSchemaTransformProvider.java b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaWriteSchemaTransformProvider.java index 876ef9a49e8a2..694c3e9f2c146 100644 --- a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaWriteSchemaTransformProvider.java +++ b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/KafkaWriteSchemaTransformProvider.java @@ -27,6 +27,7 @@ import java.util.Set; import javax.annotation.Nullable; import org.apache.beam.sdk.extensions.avro.schemas.utils.AvroUtils; +import org.apache.beam.sdk.extensions.protobuf.ProtoByteUtils; import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.schemas.AutoValueSchema; @@ -41,6 +42,7 @@ import org.apache.beam.sdk.transforms.DoFn.ProcessElement; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.transforms.SimpleFunction; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.PCollectionTuple; @@ -60,7 +62,7 @@ public class KafkaWriteSchemaTransformProvider extends TypedSchemaTransformProvider< KafkaWriteSchemaTransformProvider.KafkaWriteSchemaTransformConfiguration> { - public static final String SUPPORTED_FORMATS_STR = "JSON,AVRO"; + public static final String SUPPORTED_FORMATS_STR = "RAW,JSON,AVRO,PROTO"; public static final Set SUPPORTED_FORMATS = Sets.newHashSet(SUPPORTED_FORMATS_STR.split(",")); public static final TupleTag ERROR_TAG = new TupleTag() {}; @@ -131,10 +133,30 @@ public void finish() { @Override public PCollectionRowTuple expand(PCollectionRowTuple input) { Schema inputSchema = input.get("input").getSchema(); - final SerializableFunction toBytesFn = - configuration.getFormat().equals("JSON") - ? 
JsonUtils.getRowToJsonBytesFunction(inputSchema) - : AvroUtils.getRowToAvroBytesFunction(inputSchema); + final SerializableFunction toBytesFn; + if (configuration.getFormat().equals("RAW")) { + int numFields = inputSchema.getFields().size(); + if (numFields != 1) { + throw new IllegalArgumentException("Expecting exactly one field, found " + numFields); + } + if (!inputSchema.getField(0).getType().equals(Schema.FieldType.BYTES)) { + throw new IllegalArgumentException( + "The input schema must have exactly one field of type bytes."); + } + toBytesFn = getRowToRawBytesFunction(inputSchema.getField(0).getName()); + } else if (configuration.getFormat().equals("JSON")) { + toBytesFn = JsonUtils.getRowToJsonBytesFunction(inputSchema); + } else if (configuration.getFormat().equals("PROTO")) { + String descriptorPath = configuration.getFileDescriptorPath(); + String messageName = configuration.getMessageName(); + if (descriptorPath == null || messageName == null) { + throw new IllegalArgumentException( + "Expecting both descriptorPath and messageName to be non-null."); + } + toBytesFn = ProtoByteUtils.getRowToProtoBytes(descriptorPath, messageName); + } else { + toBytesFn = AvroUtils.getRowToAvroBytesFunction(inputSchema); + } final Map configOverrides = configuration.getProducerConfigUpdates(); PCollectionTuple outputTuple = @@ -163,6 +185,19 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { } } + public static SerializableFunction getRowToRawBytesFunction(String rowFieldName) { + return new SimpleFunction() { + @Override + public byte[] apply(Row input) { + byte[] rawBytes = input.getBytes(rowFieldName); + if (rawBytes == null) { + throw new NullPointerException(); + } + return rawBytes; + } + }; + } + @Override + public @UnknownKeyFor @NonNull @Initialized String identifier() { return "beam:schematransform:org.apache.beam:kafka_write:v1"; @@ -197,6 +232,18 @@ public abstract static class KafkaWriteSchemaTransformConfiguration implements S + " of servers. | Format: host1:port1,host2:port2,...") public abstract String getBootstrapServers(); + @SchemaFieldDescription( + "The path to the Protocol Buffer File Descriptor Set file. This file is used for schema" + + " definition and message serialization.") + @Nullable + public abstract String getFileDescriptorPath(); + + @SchemaFieldDescription( + "The name of the Protocol Buffer message to be used for schema" + + " extraction and data conversion.") + @Nullable + public abstract String getMessageName(); + + @SchemaFieldDescription( + "A list of key-value pairs that act as configuration parameters for Kafka producers."
+ " Most of these configurations will not be needed, but if you need to customize your Kafka producer," @@ -218,6 +265,10 @@ public abstract static class Builder { public abstract Builder setBootstrapServers(String bootstrapServers); + public abstract Builder setFileDescriptorPath(String fileDescriptorPath); + + public abstract Builder setMessageName(String messageName); + public abstract Builder setProducerConfigUpdates(Map producerConfigUpdates); public abstract KafkaWriteSchemaTransformConfiguration build(); diff --git a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFn.java b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFn.java index 31620549ab222..4b0035aa3564c 100644 --- a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFn.java +++ b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFn.java @@ -182,7 +182,7 @@ private ReadFromKafkaDoFn(ReadSourceDescriptors transform) { private final @Nullable Map offsetConsumerConfig; - private final @Nullable SerializableFunction checkStopReadingFn; + private final @Nullable CheckStopReadingFn checkStopReadingFn; private final SerializableFunction, Consumer> consumerFactoryFn; @@ -514,6 +514,9 @@ public AverageRecordSize load(TopicPartition topicPartition) throws Exception { keyDeserializerInstance = keyDeserializerProvider.getDeserializer(consumerConfig, true); valueDeserializerInstance = valueDeserializerProvider.getDeserializer(consumerConfig, false); offsetEstimatorCache = new HashMap<>(); + if (checkStopReadingFn != null) { + checkStopReadingFn.setup(); + } } @Teardown @@ -532,6 +535,9 @@ public void teardown() throws Exception { if (offsetEstimatorCache != null) { offsetEstimatorCache.clear(); } + if (checkStopReadingFn != null) { + checkStopReadingFn.teardown(); + } } private Map overrideBootstrapServersConfig( diff --git a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/WatchForKafkaTopicPartitions.java b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/WatchForKafkaTopicPartitions.java index ed67257ca4542..0d60640316e85 100644 --- a/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/WatchForKafkaTopicPartitions.java +++ b/sdks/java/io/kafka/src/main/java/org/apache/beam/sdk/io/kafka/WatchForKafkaTopicPartitions.java @@ -63,7 +63,7 @@ class WatchForKafkaTopicPartitions extends PTransform, Consumer> kafkaConsumerFactoryFn; private final Map kafkaConsumerConfig; - private final @Nullable SerializableFunction checkStopReadingFn; + private final @Nullable CheckStopReadingFn checkStopReadingFn; private final Set topics; private final @Nullable Pattern topicPattern; private final @Nullable Instant startReadTime; @@ -73,7 +73,7 @@ public WatchForKafkaTopicPartitions( @Nullable Duration checkDuration, SerializableFunction, Consumer> kafkaConsumerFactoryFn, Map kafkaConsumerConfig, - @Nullable SerializableFunction checkStopReadingFn, + @Nullable CheckStopReadingFn checkStopReadingFn, Set topics, @Nullable Pattern topicPattern, @Nullable Instant startReadTime, @@ -104,12 +104,12 @@ public PCollection expand(PBegin input) { private static class ConvertToDescriptor extends DoFn, KafkaSourceDescriptor> { - private final @Nullable SerializableFunction checkStopReadingFn; + private final @Nullable CheckStopReadingFn checkStopReadingFn; private final @Nullable Instant startReadTime; private final @Nullable Instant stopReadTime; private ConvertToDescriptor( - @Nullable SerializableFunction 
checkStopReadingFn, + @Nullable CheckStopReadingFn checkStopReadingFn, @Nullable Instant startReadTime, @Nullable Instant stopReadTime) { this.checkStopReadingFn = checkStopReadingFn; @@ -131,6 +131,20 @@ public void processElement( topicPartition, null, startReadTime, null, stopReadTime, null)); } } + + @Setup + public void setup() throws Exception { + if (checkStopReadingFn != null) { + checkStopReadingFn.setup(); + } + } + + @Teardown + public void teardown() throws Exception { + if (checkStopReadingFn != null) { + checkStopReadingFn.teardown(); + } + } } private static class WatchPartitionFn extends PollFn { diff --git a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaDlqTest.java b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaDlqTest.java index 48fe969bc9f35..e65d9591a0bed 100644 --- a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaDlqTest.java +++ b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaDlqTest.java @@ -22,6 +22,7 @@ import java.util.List; import org.apache.beam.sdk.io.kafka.KafkaReadSchemaTransformProvider.ErrorFn; import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.transforms.providers.ErrorHandling; import org.apache.beam.sdk.schemas.utils.JsonUtils; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; @@ -47,7 +48,6 @@ public class KafkaDlqTest { private static final Schema BEAMSCHEMA = Schema.of(Schema.Field.of("name", Schema.FieldType.STRING)); - private static final Schema ERRORSCHEMA = KafkaReadSchemaTransformProvider.ERROR_SCHEMA; private static final List ROWS = Arrays.asList( @@ -75,13 +75,14 @@ public void testKafkaErrorFnSuccess() throws Exception { } catch (Exception e) { } PCollection input = p.apply(Create.of(messages)); + Schema errorSchema = ErrorHandling.errorSchemaBytes(); PCollectionTuple output = input.apply( - ParDo.of(new ErrorFn("Kafka-read-error-counter", valueMapper)) + ParDo.of(new ErrorFn("Kafka-read-error-counter", valueMapper, errorSchema, true)) .withOutputTags(OUTPUTTAG, TupleTagList.of(ERRORTAG))); output.get(OUTPUTTAG).setRowSchema(BEAMSCHEMA); - output.get(ERRORTAG).setRowSchema(ERRORSCHEMA); + output.get(ERRORTAG).setRowSchema(errorSchema); PAssert.that(output.get(OUTPUTTAG)).containsInAnyOrder(ROWS); p.run().waitUntilFinish(); @@ -98,13 +99,14 @@ public void testKafkaErrorFnFailure() throws Exception { } catch (Exception e) { } PCollection input = p.apply(Create.of(messagesWithError)); + Schema errorSchema = ErrorHandling.errorSchemaBytes(); PCollectionTuple output = input.apply( - ParDo.of(new ErrorFn("Read-Error-Counter", valueMapper)) + ParDo.of(new ErrorFn("Read-Error-Counter", valueMapper, errorSchema, true)) .withOutputTags(OUTPUTTAG, TupleTagList.of(ERRORTAG))); output.get(OUTPUTTAG).setRowSchema(BEAMSCHEMA); - output.get(ERRORTAG).setRowSchema(ERRORSCHEMA); + output.get(ERRORTAG).setRowSchema(errorSchema); PCollection count = output.get(ERRORTAG).apply("error_count", Count.globally()); diff --git a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOIT.java b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOIT.java index f600e14d30f69..2c8ace9c66c1e 100644 --- a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOIT.java +++ b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOIT.java @@ -71,7 +71,6 @@ import org.apache.beam.sdk.transforms.Keys; import org.apache.beam.sdk.transforms.MapElements; import 
org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.transforms.Values; import org.apache.beam.sdk.transforms.windowing.CalendarWindows; import org.apache.beam.sdk.transforms.windowing.FixedWindows; @@ -483,14 +482,14 @@ public void testKafkaWithDynamicPartitions() throws IOException { @Test public void testKafkaWithStopReadingFunction() { - CheckStopReadingFn checkStopReadingFn = new CheckStopReadingFn(); + AlwaysStopCheckStopReadingFn checkStopReadingFn = new AlwaysStopCheckStopReadingFn(); PipelineResult readResult = runWithStopReadingFn(checkStopReadingFn, "stop-reading"); assertEquals(-1, readElementMetric(readResult, NAMESPACE, READ_ELEMENT_METRIC_NAME)); } - private static class CheckStopReadingFn implements SerializableFunction { + private static class AlwaysStopCheckStopReadingFn implements CheckStopReadingFn { @Override public Boolean apply(TopicPartition input) { return true; @@ -640,8 +639,7 @@ public void runReadWriteKafkaViaSchemaTransforms( assertEquals(PipelineResult.State.DONE, readResult.getState()); } - private static class DelayedCheckStopReadingFn - implements SerializableFunction { + private static class DelayedCheckStopReadingFn implements CheckStopReadingFn { int checkCount = 0; @Override @@ -654,8 +652,7 @@ public Boolean apply(TopicPartition input) { } } - private PipelineResult runWithStopReadingFn( - SerializableFunction function, String topicSuffix) { + private PipelineResult runWithStopReadingFn(CheckStopReadingFn function, String topicSuffix) { writePipeline .apply("Generate records", Read.from(new SyntheticBoundedSource(sourceOptions))) .apply("Measure write time", ParDo.of(new TimeMonitor<>(NAMESPACE, WRITE_TIME_METRIC_NAME))) diff --git a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOTest.java b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOTest.java index a686c5f1ae9a0..52ab3e20f793c 100644 --- a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOTest.java +++ b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaIOTest.java @@ -68,7 +68,7 @@ import org.apache.beam.sdk.coders.BigEndianLongCoder; import org.apache.beam.sdk.coders.VarIntCoder; import org.apache.beam.sdk.coders.VarLongCoder; -import org.apache.beam.sdk.io.AvroGeneratedUser; +import org.apache.beam.sdk.extensions.avro.io.AvroGeneratedUser; import org.apache.beam.sdk.io.Read; import org.apache.beam.sdk.io.UnboundedSource; import org.apache.beam.sdk.io.UnboundedSource.UnboundedReader; diff --git a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaReadSchemaTransformProviderTest.java b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaReadSchemaTransformProviderTest.java index 6b9dde4dc9528..27fa18715c32f 100644 --- a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaReadSchemaTransformProviderTest.java +++ b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaReadSchemaTransformProviderTest.java @@ -100,7 +100,10 @@ public void testFindTransformAndMakeItWork() { "consumerConfigUpdates", "format", "confluentSchemaRegistrySubject", - "confluentSchemaRegistryUrl"), + "confluentSchemaRegistryUrl", + "errorHandling", + "fileDescriptorPath", + "messageName"), kafkaProvider.configurationSchema().getFields().stream() .map(field -> field.getName()) .collect(Collectors.toSet())); @@ -147,4 +150,74 @@ public void testBuildTransformWithJsonSchema() throws IOException { 
StandardCharsets.UTF_8)) .build()); } + + @Test + public void testBuildTransformWithRawFormat() { + ServiceLoader serviceLoader = + ServiceLoader.load(SchemaTransformProvider.class); + List providers = + StreamSupport.stream(serviceLoader.spliterator(), false) + .filter(provider -> provider.getClass() == KafkaReadSchemaTransformProvider.class) + .collect(Collectors.toList()); + KafkaReadSchemaTransformProvider kafkaProvider = + (KafkaReadSchemaTransformProvider) providers.get(0); + kafkaProvider.from( + KafkaReadSchemaTransformConfiguration.builder() + .setTopic("anytopic") + .setBootstrapServers("anybootstrap") + .setFormat("RAW") + .build()); + } + + @Test + public void testBuildTransformWithProtoFormat() { + ServiceLoader serviceLoader = + ServiceLoader.load(SchemaTransformProvider.class); + List providers = + StreamSupport.stream(serviceLoader.spliterator(), false) + .filter(provider -> provider.getClass() == KafkaReadSchemaTransformProvider.class) + .collect(Collectors.toList()); + KafkaReadSchemaTransformProvider kafkaProvider = + (KafkaReadSchemaTransformProvider) providers.get(0); + + kafkaProvider.from( + KafkaReadSchemaTransformConfiguration.builder() + .setTopic("anytopic") + .setBootstrapServers("anybootstrap") + .setFormat("PROTO") + .setMessageName("MyMessage") + .setFileDescriptorPath( + Objects.requireNonNull( + getClass().getResource("/proto_byte/file_descriptor/proto_byte_utils.pb")) + .getPath()) + .build()); + } + + @Test + public void testBuildTransformWithProtoFormatWrongMessageName() { + ServiceLoader serviceLoader = + ServiceLoader.load(SchemaTransformProvider.class); + List providers = + StreamSupport.stream(serviceLoader.spliterator(), false) + .filter(provider -> provider.getClass() == KafkaReadSchemaTransformProvider.class) + .collect(Collectors.toList()); + KafkaReadSchemaTransformProvider kafkaProvider = + (KafkaReadSchemaTransformProvider) providers.get(0); + + assertThrows( + NullPointerException.class, + () -> + kafkaProvider.from( + KafkaReadSchemaTransformConfiguration.builder() + .setTopic("anytopic") + .setBootstrapServers("anybootstrap") + .setFormat("PROTO") + .setMessageName("MyOtherMessage") + .setFileDescriptorPath( + Objects.requireNonNull( + getClass() + .getResource("/proto_byte/file_descriptor/proto_byte_utils.pb")) + .getPath()) + .build())); + } } diff --git a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaRecordCoderTest.java b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaRecordCoderTest.java index 6720d67821ae9..84d8cedb895a5 100644 --- a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaRecordCoderTest.java +++ b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaRecordCoderTest.java @@ -55,6 +55,13 @@ public void testKafkaRecordSerializableWithoutHeaders() throws IOException { verifySerialization(consumerRecord.headers()); } + @Test + public void testKafkaRecordSerializableWithNullValueHeader() throws IOException { + RecordHeaders headers = new RecordHeaders(); + headers.add("headerKey", null); + verifySerialization(headers); + } + private void verifySerialization(Headers headers) throws IOException { KafkaRecord kafkaRecord = new KafkaRecord<>( diff --git a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaWriteSchemaTransformProviderTest.java b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaWriteSchemaTransformProviderTest.java index 8d01ebe8233ca..20f474790cc71 100644 --- 
a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaWriteSchemaTransformProviderTest.java +++ b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/KafkaWriteSchemaTransformProviderTest.java @@ -17,8 +17,14 @@ */ package org.apache.beam.sdk.io.kafka; +import static org.apache.beam.sdk.io.kafka.KafkaWriteSchemaTransformProvider.getRowToRawBytesFunction; + +import java.io.UnsupportedEncodingException; import java.util.Arrays; +import java.util.Collections; import java.util.List; +import java.util.Objects; +import org.apache.beam.sdk.extensions.protobuf.ProtoByteUtils; import org.apache.beam.sdk.io.kafka.KafkaWriteSchemaTransformProvider.KafkaWriteSchemaTransform.ErrorCounterFn; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.utils.JsonUtils; @@ -47,17 +53,77 @@ public class KafkaWriteSchemaTransformProviderTest { private static final Schema BEAMSCHEMA = Schema.of(Schema.Field.of("name", Schema.FieldType.STRING)); + + private static final Schema BEAM_RAW_SCHEMA = + Schema.of(Schema.Field.of("payload", Schema.FieldType.BYTES)); private static final Schema ERRORSCHEMA = KafkaWriteSchemaTransformProvider.ERROR_SCHEMA; + private static final Schema BEAM_PROTO_SCHEMA = + Schema.builder() + .addField("id", Schema.FieldType.INT32) + .addField("name", Schema.FieldType.STRING) + .addField("active", Schema.FieldType.BOOLEAN) + .addField( + "address", + Schema.FieldType.row( + Schema.builder() + .addField("city", Schema.FieldType.STRING) + .addField("street", Schema.FieldType.STRING) + .addField("state", Schema.FieldType.STRING) + .addField("zip_code", Schema.FieldType.STRING) + .build())) + .build(); + + private static final List PROTO_ROWS = + Collections.singletonList( + Row.withSchema(BEAM_PROTO_SCHEMA) + .withFieldValue("id", 1234) + .withFieldValue("name", "Doe") + .withFieldValue("active", false) + .withFieldValue("address.city", "seattle") + .withFieldValue("address.street", "fake street") + .withFieldValue("address.zip_code", "TO-1234") + .withFieldValue("address.state", "wa") + .build()); + private static final List ROWS = Arrays.asList( Row.withSchema(BEAMSCHEMA).withFieldValue("name", "a").build(), Row.withSchema(BEAMSCHEMA).withFieldValue("name", "b").build(), Row.withSchema(BEAMSCHEMA).withFieldValue("name", "c").build()); + private static final List RAW_ROWS; + + static { + try { + RAW_ROWS = + Arrays.asList( + Row.withSchema(BEAM_RAW_SCHEMA) + .withFieldValue("payload", "a".getBytes("UTF8")) + .build(), + Row.withSchema(BEAM_RAW_SCHEMA) + .withFieldValue("payload", "b".getBytes("UTF8")) + .build(), + Row.withSchema(BEAM_RAW_SCHEMA) + .withFieldValue("payload", "c".getBytes("UTF8")) + .build()); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + } + final SerializableFunction valueMapper = JsonUtils.getRowToJsonBytesFunction(BEAMSCHEMA); + final SerializableFunction valueRawMapper = getRowToRawBytesFunction("payload"); + + final SerializableFunction protoValueRawMapper = + ProtoByteUtils.getRowToProtoBytes( + Objects.requireNonNull( + getClass().getResource("/proto_byte/file_descriptor/proto_byte_utils.pb")) + .getPath(), + "MyMessage"); + @Rule public transient TestPipeline p = TestPipeline.create(); @Test @@ -79,4 +145,36 @@ public void testKafkaErrorFnSuccess() throws Exception { PAssert.that(output.get(OUTPUT_TAG)).containsInAnyOrder(msg); p.run().waitUntilFinish(); } + + @Test + public void testKafkaErrorFnRawSuccess() throws Exception { + List> msg = + Arrays.asList( + KV.of(new byte[1], 
"a".getBytes("UTF8")), + KV.of(new byte[1], "b".getBytes("UTF8")), + KV.of(new byte[1], "c".getBytes("UTF8"))); + + PCollection input = p.apply(Create.of(RAW_ROWS)); + PCollectionTuple output = + input.apply( + ParDo.of(new ErrorCounterFn("Kafka-write-error-counter", valueRawMapper)) + .withOutputTags(OUTPUT_TAG, TupleTagList.of(ERROR_TAG))); + + output.get(ERROR_TAG).setRowSchema(ERRORSCHEMA); + + PAssert.that(output.get(OUTPUT_TAG)).containsInAnyOrder(msg); + p.run().waitUntilFinish(); + } + + @Test + public void testKafkaErrorFnProtoSuccess() { + PCollection input = p.apply(Create.of(PROTO_ROWS)); + PCollectionTuple output = + input.apply( + ParDo.of(new ErrorCounterFn("Kafka-write-error-counter", protoValueRawMapper)) + .withOutputTags(OUTPUT_TAG, TupleTagList.of(ERROR_TAG))); + + PAssert.that(output.get(ERROR_TAG).setRowSchema(ERRORSCHEMA)).empty(); + p.run().waitUntilFinish(); + } } diff --git a/sdks/java/io/kafka/src/test/resources/proto_byte/file_descriptor/proto_byte_utils.pb b/sdks/java/io/kafka/src/test/resources/proto_byte/file_descriptor/proto_byte_utils.pb new file mode 100644 index 0000000000000..67e93cc177cce --- /dev/null +++ b/sdks/java/io/kafka/src/test/resources/proto_byte/file_descriptor/proto_byte_utils.pb @@ -0,0 +1,13 @@ + + +test_proto.proto" + MyMessage +id (Rid +name ( Rname +active (Ractive, +address ( 2.MyMessage.AddressRaddressf +Address +street ( Rstreet +city ( Rcity +state ( Rstate +zip_code ( RzipCodebproto3 \ No newline at end of file diff --git a/.test-infra/jenkins/job_CleanUpDataprocResources.groovy b/sdks/java/io/kafka/src/test/resources/proto_byte/proto_byte_utils.proto similarity index 62% rename from .test-infra/jenkins/job_CleanUpDataprocResources.groovy rename to sdks/java/io/kafka/src/test/resources/proto_byte/proto_byte_utils.proto index 142b4c732c6ee..aead141f4b9a8 100644 --- a/.test-infra/jenkins/job_CleanUpDataprocResources.groovy +++ b/sdks/java/io/kafka/src/test/resources/proto_byte/proto_byte_utils.proto @@ -16,22 +16,20 @@ * limitations under the License. */ -import CommonJobProperties as commonJobProperties - - -job('Cleanup Dataproc Resources') { - description('Deletes leaked resources for all the jobs that generates flink clusters.') - - def CLEANUP_DIR = '"$WORKSPACE/src/.test-infra/dataproc"' - def CLEANUP_SCRIPT = 'cleanup.sh' - - commonJobProperties.setTopLevelMainJobProperties(delegate) - - // Sets that this is a cron job. - commonJobProperties.setCronJob(delegate, 'H */6 * * *') - - steps { - shell("cd ${CLEANUP_DIR}; ./${CLEANUP_SCRIPT} -xe") +syntax = "proto3"; + +message MyMessage { + int32 id = 1; + string name = 2; + bool active = 3; + + // Nested field + message Address { + string street = 1; + string city = 2; + string state = 3; + string zip_code = 4; } + Address address = 4; } diff --git a/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/SSLUtils.java b/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/SSLUtils.java index 1c46972893117..f68aaaea95ddd 100644 --- a/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/SSLUtils.java +++ b/sdks/java/io/mongodb/src/main/java/org/apache/beam/sdk/io/mongodb/SSLUtils.java @@ -68,12 +68,12 @@ static SSLContext ignoreSSLCertificate() { InputStream inputStream = classLoader.getResourceAsStream("resources/.keystore"); if (inputStream != null) { LOG.info("Found keystore in classpath 'resources/.keystore'. 
Loading..."); - ks.load(inputStream, "changeit".toCharArray()); } else { LOG.info( "Unable to find keystore under 'resources/.keystore' in the classpath. " + "Continuing with an empty keystore."); } + ks.load(inputStream, "changeit".toCharArray()); KeyManagerFactory kmf = KeyManagerFactory.getInstance(KeyManagerFactory.getDefaultAlgorithm()); kmf.init(ks, "changeit".toCharArray()); diff --git a/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/SSLUtilsTest.java b/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/SSLUtilsTest.java new file mode 100644 index 0000000000000..978163129f87a --- /dev/null +++ b/sdks/java/io/mongodb/src/test/java/org/apache/beam/sdk/io/mongodb/SSLUtilsTest.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.mongodb; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Test on the MongoDbIO SSLUtils. */ +@RunWith(JUnit4.class) +public class SSLUtilsTest { + @Test + public void testIgnoreSSLCertificate() { + // smoke test + SSLUtils.ignoreSSLCertificate(); + } +} diff --git a/sdks/java/io/rrio/build.gradle b/sdks/java/io/rrio/build.gradle index d65df370e0caf..bfd030ce61dc5 100644 --- a/sdks/java/io/rrio/build.gradle +++ b/sdks/java/io/rrio/build.gradle @@ -18,19 +18,40 @@ plugins { id 'org.apache.beam.module' } applyJavaNature( - automaticModuleName: 'org.apache.beam.sdk.io.rrio' + automaticModuleName: 'org.apache.beam.sdk.io.requestresponse' ) +provideIntegrationTestingDependencies() +enableJavaPerformanceTesting() description = "Apache Beam :: SDKS :: Java :: IO :: RequestResponseIO (RRIO)" ext.summary = "Support to read from and write to Web APIs" +var jedisVersion = "5.0.0" +var grpcVersion = "1.59.0" +var protobufVersion = "3.21.5" + dependencies { implementation project(path: ":sdks:java:core", configuration: "shadow") implementation library.java.joda_time implementation library.java.vendored_guava_32_1_2_jre + implementation library.java.jackson_core + implementation library.java.jackson_databind + implementation "redis.clients:jedis:$jedisVersion" testImplementation project(path: ":sdks:java:core", configuration: "shadowTest") + testImplementation project(path: ":sdks:java:io:common", configuration: "testRuntimeMigration") + testImplementation project(path: ":beam-test-infra-mock-apis") + // Vendored grpc library not fully compatible with proto autogenerated code + testImplementation "io.grpc:grpc-netty-shaded:${grpcVersion}" + testImplementation "io.grpc:grpc-protobuf:${grpcVersion}" + testImplementation "io.grpc:grpc-stub:${grpcVersion}" + testImplementation "com.google.protobuf:protobuf-java-util:${protobufVersion}" + + testImplementation 
platform(library.java.google_cloud_platform_libraries_bom) + testImplementation library.java.google_http_client testImplementation library.java.junit + testImplementation library.java.testcontainers_base + testRuntimeOnly project(path: ":runners:direct-java", configuration: "shadow") testRuntimeOnly library.java.slf4j_jdk14 -} \ No newline at end of file +} diff --git a/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/ApiIOError.java b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/ApiIOError.java new file mode 100644 index 0000000000000..cfff3bd894146 --- /dev/null +++ b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/ApiIOError.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.auto.value.AutoValue; +import java.util.Optional; +import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.annotations.DefaultSchema; +import org.apache.beam.sdk.schemas.annotations.SchemaCaseFormat; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.CaseFormat; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Throwables; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.joda.time.Instant; + +/** {@link ApiIOError} is a data class for storing details about an error. */ +@SchemaCaseFormat(CaseFormat.LOWER_UNDERSCORE) +@DefaultSchema(AutoValueSchema.class) +@AutoValue +public abstract class ApiIOError { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + /** + * Instantiate an {@link ApiIOError} from an {@link ErrorT} {@link T} element. The {@link T} + * element is converted to a JSON string. + */ + static ApiIOError of(@NonNull ErrorT e, @NonNull T element) + throws JsonProcessingException { + + String json = OBJECT_MAPPER.writeValueAsString(element); + + return ApiIOError.builder() + .setRequestAsJsonString(json) + .setMessage(Optional.ofNullable(e.getMessage()).orElse("")) + .setObservedTimestamp(Instant.now()) + .setStackTrace(Throwables.getStackTraceAsString(e)) + .build(); + } + + static Builder builder() { + return new AutoValue_ApiIOError.Builder(); + } + + /** The JSON string representation of the request associated with the error. */ + public abstract String getRequestAsJsonString(); + + /** The observed timestamp of the error. */ + public abstract Instant getObservedTimestamp(); + + /** The {@link Exception} message. */ + public abstract String getMessage(); + + /** The {@link Exception} stack trace. 
*/ + public abstract String getStackTrace(); + + @AutoValue.Builder + abstract static class Builder { + + abstract Builder setRequestAsJsonString(String value); + + abstract Builder setObservedTimestamp(Instant value); + + abstract Builder setMessage(String value); + + abstract Builder setStackTrace(String value); + + abstract ApiIOError build(); + } +} diff --git a/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/CacheRead.java b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/CacheRead.java new file mode 100644 index 0000000000000..3765d25370a66 --- /dev/null +++ b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/CacheRead.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import com.google.auto.value.AutoValue; +import java.util.Map; +import org.apache.beam.io.requestresponse.CacheRead.Result; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.PInput; +import org.apache.beam.sdk.values.POutput; +import org.apache.beam.sdk.values.PValue; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; + +/** + * {@link CacheRead} reads associated {@link ResponseT} types from {@link RequestT} types, if any + * exist. + */ +class CacheRead + extends PTransform, Result> { + + private static final TupleTag FAILURE_TAG = new TupleTag() {}; + + // TODO(damondouglas): remove suppress warnings after instance utilized. + @SuppressWarnings({"unused"}) + private final Configuration configuration; + + private CacheRead(Configuration configuration) { + this.configuration = configuration; + } + + /** Configuration details for {@link CacheRead}. */ + @AutoValue + abstract static class Configuration { + + static Builder builder() { + return new AutoValue_CacheRead_Configuration.Builder<>(); + } + + abstract Builder toBuilder(); + + @AutoValue.Builder + abstract static class Builder { + + abstract Configuration build(); + } + } + + @Override + public Result expand(PCollection input) { + return Result.of( + new TupleTag>() {}, PCollectionTuple.empty(input.getPipeline())); + } + + /** + * The {@link Result} of reading RequestT {@link PCollection} elements yielding ResponseT {@link + * PCollection} elements. 
+ */ + static class Result implements POutput { + + static Result of( + TupleTag> responseTag, PCollectionTuple pct) { + return new Result<>(responseTag, pct); + } + + private final Pipeline pipeline; + private final TupleTag> responseTag; + private final PCollection> responses; + private final PCollection failures; + + private Result(TupleTag> responseTag, PCollectionTuple pct) { + this.pipeline = pct.getPipeline(); + this.responseTag = responseTag; + this.responses = pct.get(responseTag); + this.failures = pct.get(FAILURE_TAG); + } + + PCollection> getResponses() { + return responses; + } + + PCollection getFailures() { + return failures; + } + + @Override + public Pipeline getPipeline() { + return this.pipeline; + } + + @Override + public Map, PValue> expand() { + return ImmutableMap.of( + responseTag, responses, + FAILURE_TAG, failures); + } + + @Override + public void finishSpecifyingOutput( + String transformName, PInput input, PTransform transform) {} + } +} diff --git a/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/CacheWrite.java b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/CacheWrite.java new file mode 100644 index 0000000000000..25249c3e41b42 --- /dev/null +++ b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/CacheWrite.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import com.google.auto.value.AutoValue; +import java.util.Map; +import org.apache.beam.io.requestresponse.CacheWrite.Result; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.PInput; +import org.apache.beam.sdk.values.POutput; +import org.apache.beam.sdk.values.PValue; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; + +/** + * {@link CacheWrite} writes associated {@link RequestT} and {@link ResponseT} pairs to a cache. + * Using {@link RequestT} and {@link ResponseT}'s {@link org.apache.beam.sdk.coders.Coder}, this + * transform writes encoded representations of this association. + */ +class CacheWrite + extends PTransform>, Result> { + + private static final TupleTag FAILURE_TAG = new TupleTag() {}; + + // TODO(damondouglas): remove suppress warnings after configuration is used. + @SuppressWarnings({"unused"}) + private final Configuration configuration; + + private CacheWrite(Configuration configuration) { + this.configuration = configuration; + } + + /** Configuration details for {@link CacheWrite}. 
*/ + @AutoValue + abstract static class Configuration { + + static Builder builder() { + return new AutoValue_CacheWrite_Configuration.Builder<>(); + } + + abstract Builder toBuilder(); + + @AutoValue.Builder + abstract static class Builder { + + abstract Configuration build(); + } + } + + @Override + public Result expand(PCollection> input) { + return Result.of( + new TupleTag>() {}, PCollectionTuple.empty(input.getPipeline())); + } + + /** The {@link Result} of writing a request/response {@link KV} {@link PCollection}. */ + static class Result implements POutput { + + static Result of( + TupleTag> responseTag, PCollectionTuple pct) { + return new Result<>(responseTag, pct); + } + + private final Pipeline pipeline; + private final TupleTag> responseTag; + private final PCollection> responses; + private final PCollection failures; + + private Result(TupleTag> responseTag, PCollectionTuple pct) { + this.pipeline = pct.getPipeline(); + this.responseTag = responseTag; + this.responses = pct.get(responseTag); + this.failures = pct.get(FAILURE_TAG); + } + + public PCollection> getResponses() { + return responses; + } + + public PCollection getFailures() { + return failures; + } + + @Override + public Pipeline getPipeline() { + return this.pipeline; + } + + @Override + public Map, PValue> expand() { + return ImmutableMap.of( + responseTag, responses, + FAILURE_TAG, failures); + } + + @Override + public void finishSpecifyingOutput( + String transformName, PInput input, PTransform transform) {} + } +} diff --git a/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/Call.java b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/Call.java new file mode 100644 index 0000000000000..52181af534ed3 --- /dev/null +++ b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/Call.java @@ -0,0 +1,419 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.io.requestresponse; + +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.google.auto.value.AutoValue; +import java.io.Serializable; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import org.apache.beam.io.requestresponse.Call.Result; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.util.SerializableUtils; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.PInput; +import org.apache.beam.sdk.values.POutput; +import org.apache.beam.sdk.values.PValue; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.sdk.values.TupleTagList; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.checkerframework.checker.nullness.qual.MonotonicNonNull; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.joda.time.Duration; + +/** + * {@link Call} transforms a {@link RequestT} {@link PCollection} into a {@link ResponseT} {@link + * PCollection} and {@link ApiIOError} {@link PCollection}, both wrapped in a {@link Result}. + */ +class Call<RequestT, ResponseT> + extends PTransform<@NonNull PCollection<RequestT>, @NonNull Result<ResponseT>> { + + /** + * The default {@link Duration} to wait until completion of user code. A {@link + * UserCodeTimeoutException} is thrown when {@link Caller#call}, {@link SetupTeardown#setup}, or + * {@link SetupTeardown#teardown} exceeds this timeout. + */ + static final Duration DEFAULT_TIMEOUT = Duration.standardSeconds(30L); + + /** + * Instantiates a {@link Call} {@link PTransform} with the required {@link Caller} and {@link + * ResponseT} {@link Coder}. Uses {@link SerializableUtils#ensureSerializable} to check the + * {@link Caller} for serialization errors. + */ + static <RequestT, ResponseT> Call<RequestT, ResponseT> of( + Caller<RequestT, ResponseT> caller, Coder<ResponseT> responseTCoder) { + caller = SerializableUtils.ensureSerializable(caller); + return new Call<>( + Configuration.<RequestT, ResponseT>builder() + .setCaller(caller) + .setResponseCoder(responseTCoder) + .build()); + } + + /** + * Instantiates a {@link Call} {@link PTransform} with an implementation of both the {@link + * Caller} and {@link SetupTeardown} in one class and the required {@link ResponseT} {@link + * Coder}. Uses {@link SerializableUtils#ensureSerializable} to report serialization errors.
+ */ + static < + RequestT, + ResponseT, + CallerSetupTeardownT extends Caller & SetupTeardown> + Call ofCallerAndSetupTeardown( + CallerSetupTeardownT implementsCallerAndSetupTeardown, Coder responseTCoder) { + implementsCallerAndSetupTeardown = + SerializableUtils.ensureSerializable(implementsCallerAndSetupTeardown); + return new Call<>( + Configuration.builder() + .setCaller(implementsCallerAndSetupTeardown) + .setResponseCoder(responseTCoder) + .setSetupTeardown(implementsCallerAndSetupTeardown) + .build()); + } + + private static final TupleTag FAILURE_TAG = new TupleTag() {}; + + private final Configuration configuration; + + private Call(Configuration configuration) { + this.configuration = configuration; + } + + /** + * Sets the {@link SetupTeardown} to the {@link Call} {@link PTransform} instance. Checks for + * {@link SerializableUtils#ensureSerializable} serializable errors. + */ + Call withSetupTeardown(SetupTeardown setupTeardown) { + setupTeardown = SerializableUtils.ensureSerializable(setupTeardown); + return new Call<>(configuration.toBuilder().setSetupTeardown(setupTeardown).build()); + } + + /** + * Overrides the default {@link #DEFAULT_TIMEOUT}. A {@link UserCodeTimeoutException} is thrown + * when {@link Caller#call}, {@link SetupTeardown#setup}, or {@link SetupTeardown#teardown} exceed + * the timeout. + */ + Call withTimeout(Duration timeout) { + return new Call<>(configuration.toBuilder().setTimeout(timeout).build()); + } + + @Override + public @NonNull Result expand(PCollection input) { + TupleTag responseTag = new TupleTag() {}; + + PCollectionTuple pct = + input.apply( + CallFn.class.getSimpleName(), + ParDo.of(new CallFn<>(responseTag, configuration)) + .withOutputTags(responseTag, TupleTagList.of(FAILURE_TAG))); + + return Result.of(configuration.getResponseCoder(), responseTag, pct); + } + + private static class CallFn extends DoFn { + private final TupleTag responseTag; + private final CallerWithTimeout caller; + private final SetupTeardownWithTimeout setupTeardown; + + private transient @MonotonicNonNull ExecutorService executor; + + private CallFn( + TupleTag responseTag, Configuration configuration) { + this.responseTag = responseTag; + this.caller = new CallerWithTimeout<>(configuration.getTimeout(), configuration.getCaller()); + this.setupTeardown = + new SetupTeardownWithTimeout( + configuration.getTimeout(), configuration.getSetupTeardown()); + } + + /** + * Invokes {@link SetupTeardown#setup} forwarding its {@link UserCodeExecutionException}, if + * thrown. + */ + @Setup + public void setup() throws UserCodeExecutionException { + this.executor = Executors.newSingleThreadExecutor(); + this.caller.setExecutor(executor); + this.setupTeardown.setExecutor(executor); + + // TODO(damondouglas): Incorporate repeater when https://github.com/apache/beam/issues/28926 + // resolves. + this.setupTeardown.setup(); + } + + /** + * Invokes {@link SetupTeardown#teardown} forwarding its {@link UserCodeExecutionException}, if + * thrown. + */ + @Teardown + public void teardown() throws UserCodeExecutionException { + // TODO(damondouglas): Incorporate repeater when https://github.com/apache/beam/issues/28926 + // resolves. 
+ this.setupTeardown.teardown(); + checkStateNotNull(executor).shutdown(); + try { + boolean ignored = executor.awaitTermination(3L, TimeUnit.SECONDS); + } catch (InterruptedException ignored) { + } + } + + @ProcessElement + public void process(@Element @NonNull RequestT request, MultiOutputReceiver receiver) + throws JsonProcessingException { + try { + // TODO(damondouglas): https://github.com/apache/beam/issues/29248 + ResponseT response = this.caller.call(request); + receiver.get(responseTag).output(response); + } catch (UserCodeExecutionException e) { + receiver.get(FAILURE_TAG).output(ApiIOError.of(e, request)); + } + } + } + + /** Configuration details for {@link Call}. */ + @AutoValue + abstract static class Configuration<RequestT, ResponseT> implements Serializable { + + static <RequestT, ResponseT> Builder<RequestT, ResponseT> builder() { + return new AutoValue_Call_Configuration.Builder<>(); + } + + /** The user custom code that converts a {@link RequestT} into a {@link ResponseT}. */ + abstract Caller<RequestT, ResponseT> getCaller(); + + /** The user custom code that implements setup and teardown methods. */ + abstract SetupTeardown getSetupTeardown(); + + /** + * The expected timeout of all user custom code. If user custom code exceeds this timeout, then + * a {@link UserCodeTimeoutException} is thrown. User custom code may also throw this exception + * on its own, before the configured timeout elapses. + */ + abstract Duration getTimeout(); + + /** + * The {@link Coder} for the {@link ResponseT}. Note that the {@link RequestT}'s {@link Coder} + * is derived from the input {@link PCollection} but can't be determined for the {@link + * ResponseT} and therefore requires explicit setting in the {@link Configuration}. + */ + abstract Coder<ResponseT> getResponseCoder(); + + abstract Builder<RequestT, ResponseT> toBuilder(); + + @AutoValue.Builder + abstract static class Builder<RequestT, ResponseT> { + + /** See {@link #getCaller()}. */ + abstract Builder<RequestT, ResponseT> setCaller(Caller<RequestT, ResponseT> value); + + /** See {@link #getSetupTeardown()}. */ + abstract Builder<RequestT, ResponseT> setSetupTeardown(SetupTeardown value); + + abstract Optional<SetupTeardown> getSetupTeardown(); + + /** See {@link #getTimeout()}. */ + abstract Builder<RequestT, ResponseT> setTimeout(Duration value); + + abstract Optional<Duration> getTimeout(); + + abstract Builder<RequestT, ResponseT> setResponseCoder(Coder<ResponseT> value); + + abstract Configuration<RequestT, ResponseT> autoBuild(); + + final Configuration<RequestT, ResponseT> build() { + if (!getSetupTeardown().isPresent()) { + setSetupTeardown(new NoopSetupTeardown()); + } + + if (!getTimeout().isPresent()) { + setTimeout(DEFAULT_TIMEOUT); + } + + return autoBuild(); + } + } + } + + /** + * The {@link Result} of processing request {@link PCollection} into response {@link PCollection}.
+ */ + static class Result implements POutput { + + static Result of( + Coder responseTCoder, TupleTag responseTag, PCollectionTuple pct) { + return new Result<>(responseTCoder, responseTag, pct); + } + + private final Pipeline pipeline; + private final TupleTag responseTag; + private final PCollection responses; + private final PCollection failures; + + private Result( + Coder responseTCoder, TupleTag responseTag, PCollectionTuple pct) { + this.pipeline = pct.getPipeline(); + this.responseTag = responseTag; + this.responses = pct.get(responseTag).setCoder(responseTCoder); + this.failures = pct.get(FAILURE_TAG); + } + + public PCollection getResponses() { + return responses; + } + + public PCollection getFailures() { + return failures; + } + + @Override + public @NonNull Pipeline getPipeline() { + return this.pipeline; + } + + @Override + public @NonNull Map, PValue> expand() { + return ImmutableMap.of( + responseTag, responses, + FAILURE_TAG, failures); + } + + @Override + public void finishSpecifyingOutput( + @NonNull String transformName, + @NonNull PInput input, + @NonNull PTransform transform) {} + } + + private static class NoopSetupTeardown implements SetupTeardown { + + @Override + public void setup() throws UserCodeExecutionException { + // Noop + } + + @Override + public void teardown() throws UserCodeExecutionException { + // Noop + } + } + + private static class CallerWithTimeout + implements Caller { + private final Duration timeout; + private final Caller caller; + private @MonotonicNonNull ExecutorService executor; + + private CallerWithTimeout(Duration timeout, Caller caller) { + this.timeout = timeout; + this.caller = caller; + } + + private void setExecutor(ExecutorService executor) { + this.executor = executor; + } + + @Override + public ResponseT call(RequestT request) throws UserCodeExecutionException { + Future future = checkStateNotNull(executor).submit(() -> caller.call(request)); + try { + return future.get(timeout.getMillis(), TimeUnit.MILLISECONDS); + } catch (TimeoutException | InterruptedException e) { + throw new UserCodeTimeoutException(e); + } catch (ExecutionException e) { + parseAndThrow(future, e); + } + throw new UserCodeExecutionException("could not complete request"); + } + } + + private static class SetupTeardownWithTimeout implements SetupTeardown { + private final Duration timeout; + private final SetupTeardown setupTeardown; + private @MonotonicNonNull ExecutorService executor; + + SetupTeardownWithTimeout(Duration timeout, SetupTeardown setupTeardown) { + this.timeout = timeout; + this.setupTeardown = setupTeardown; + } + + private void setExecutor(ExecutorService executor) { + this.executor = executor; + } + + @Override + public void setup() throws UserCodeExecutionException { + Callable callable = + () -> { + setupTeardown.setup(); + return null; + }; + + executeAsync(callable); + } + + @Override + public void teardown() throws UserCodeExecutionException { + Callable callable = + () -> { + setupTeardown.teardown(); + return null; + }; + + executeAsync(callable); + } + + private void executeAsync(Callable callable) throws UserCodeExecutionException { + Future future = checkStateNotNull(executor).submit(callable); + try { + future.get(timeout.getMillis(), TimeUnit.MILLISECONDS); + } catch (TimeoutException | InterruptedException e) { + future.cancel(true); + throw new UserCodeTimeoutException(e); + } catch (ExecutionException e) { + parseAndThrow(future, e); + } + } + } + + private static void parseAndThrow(Future future, ExecutionException e) + 
throws UserCodeExecutionException { + future.cancel(true); + if (e.getCause() == null) { + throw new UserCodeExecutionException(e); + } + Throwable cause = checkStateNotNull(e.getCause()); + if (cause instanceof UserCodeQuotaException) { + throw new UserCodeQuotaException(cause); + } + throw new UserCodeExecutionException(cause); + } +} diff --git a/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/CallShouldBackoff.java b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/CallShouldBackoff.java new file mode 100644 index 0000000000000..1d093f2efb126 --- /dev/null +++ b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/CallShouldBackoff.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import java.io.Serializable; + +/** Informs whether a call to an API should back off. */ +public interface CallShouldBackoff<ResponseT> extends Serializable { + + /** Update the state of whether to back off using information about the exception. */ + void update(UserCodeExecutionException exception); + + /** Update the state of whether to back off using information about the response. */ + void update(ResponseT response); + + /** Report whether to back off. */ + boolean value(); +} diff --git a/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/CallShouldBackoffBasedOnRejectionProbability.java b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/CallShouldBackoffBasedOnRejectionProbability.java new file mode 100644 index 0000000000000..62a7990d21eec --- /dev/null +++ b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/CallShouldBackoffBasedOnRejectionProbability.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import org.checkerframework.checker.nullness.qual.Nullable; + +/** Reports whether to apply backoff based on https://sre.google/sre-book/handling-overload/.
*/ +class CallShouldBackoffBasedOnRejectionProbability<ResponseT> + implements CallShouldBackoff<ResponseT> { + + // Default multiplier value recommended by https://sre.google/sre-book/handling-overload/ + private static final double DEFAULT_MULTIPLIER = 2.0; + + // The threshold is the value that the rejection probability must exceed in order to report a + // value() of true. If null, then the computation relies on a random value. + private @Nullable Double threshold; + + // The multiplier drives the impact of accepts on the rejection probability. See + // https://sre.google/sre-book/handling-overload/ for details. + private final double multiplier = DEFAULT_MULTIPLIER; + + // The number of requests observed. + private double requests = 0; + + // The number of accepted (non-erroring) requests observed. + private double accepts = 0; + + /** Sets the threshold that the rejection probability must exceed to report a value() of true. */ + CallShouldBackoffBasedOnRejectionProbability<ResponseT> setThreshold(double threshold) { + this.threshold = threshold; + return this; + } + + /** Update the state of whether to back off using information about the exception. */ + @Override + public void update(UserCodeExecutionException exception) { + this.requests++; + } + + /** Update the state of whether to back off using information about the response. */ + @Override + public void update(ResponseT response) { + this.requests++; + this.accepts++; + } + + /** Provide a threshold to evaluate backoff. */ + double getThreshold() { + if (this.threshold != null) { + return this.threshold; + } + return Math.random(); + } + + /** + * Compute the probability of API call rejection based on + * https://sre.google/sre-book/handling-overload/. + */ + double getRejectionProbability() { + double numerator = requests - multiplier * accepts; + double denominator = requests + 1; + double ratio = numerator / denominator; + return Math.max(0, ratio); + } + + /** Report whether to back off. */ + @Override + public boolean value() { + return getRejectionProbability() > getThreshold(); + } +} diff --git a/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/Caller.java b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/Caller.java new file mode 100644 index 0000000000000..da636c8637403 --- /dev/null +++ b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/Caller.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import java.io.Serializable; + +/** {@link Caller} interfaces user custom code intended for API calls. */ +public interface Caller<RequestT, ResponseT> extends Serializable { + + /** Calls a Web API with the {@link RequestT} and returns a {@link ResponseT}.
*/ + ResponseT call(RequestT request) throws UserCodeExecutionException; +} diff --git a/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/RedisClient.java b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/RedisClient.java new file mode 100644 index 0000000000000..a87f5c191e4b0 --- /dev/null +++ b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/RedisClient.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; + +import java.net.URI; +import java.nio.charset.StandardCharsets; +import org.apache.beam.sdk.transforms.DoFn; +import org.checkerframework.checker.nullness.qual.MonotonicNonNull; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.joda.time.Duration; +import redis.clients.jedis.JedisPooled; +import redis.clients.jedis.exceptions.JedisException; + +/** + * {@link RedisClient} is a convenience class that supports operations needed for caching by various + * transforms in this package. It implements the {@link SetupTeardown} interface for ease-of-use + * within a {@link DoFn} context. Unlike the underlying {@link JedisPooled} client, {@link + * RedisClient} is {@link java.io.Serializable}. + */ +class RedisClient implements SetupTeardown { + + private final URI uri; + + private transient @MonotonicNonNull JedisPooled jedis; + + /** + * Instantiates a {@link RedisClient}. The {@link URI} is expected to be of the form {@code + * redis://<host>:<port>}. + */ + RedisClient(URI uri) { + this.uri = uri; + } + + /** + * Decrement a value stored by the key, returning the resulting decremented value. Per Redis + * convention, sets the value to -1 for keys that do not exist. Naming of this method preserves + * that of the underlying {@link JedisPooled} client and performs a null check prior to execution. + */ + long decr(String key) throws UserCodeExecutionException { + try { + return getSafeClient().decr(key); + } catch (JedisException e) { + throw new UserCodeExecutionException(e); + } + } + + /** + * Get the long value stored by the key. Yields zero when the key does not exist, keeping + * consistency with Redis convention. Consider using {@link #exists} to query key existence. + */ + long getLong(String key) throws UserCodeExecutionException { + try { + return getSafeClient().decrBy(key, 0L); + } catch (JedisException e) { + throw new UserCodeExecutionException(e); + } + } + + /** Query whether the key exists.
*/ + boolean exists(String key) throws UserCodeExecutionException { + try { + return getSafeClient().exists(key); + } catch (JedisException e) { + throw new UserCodeExecutionException(e); + } + } + + /** + * Increment a value stored by the key, returning the resulting incremented value. Per Redis + * convention, sets the value to 1 for keys that do not exist. Naming of this method preserves + * that of the underlying {@link JedisPooled} client and performs a null check prior to execution. + */ + long incr(String key) throws UserCodeExecutionException { + try { + return getSafeClient().incr(key); + } catch (JedisException e) { + throw new UserCodeExecutionException(e); + } + } + + /** + * Query the size of a list identified by the key. Returns 0 if key does not exist, per Redis + * convention. Naming of this method preserves that of the underlying {@link JedisPooled} client + * and performs a null check prior to execution. + */ + long llen(String key) throws UserCodeExecutionException { + try { + return getSafeClient().llen(key); + } catch (JedisException e) { + throw new UserCodeExecutionException(e); + } + } + + /** Query whether the Redis list is empty. Calls {@link #llen} to determine this. */ + boolean isEmpty(String key) throws UserCodeExecutionException { + return this.llen(key) == 0L; + } + + /** + * Pushes items to the back ('right') of the list. Naming of this method preserves that of the + * underlying {@link JedisPooled} client and performs a null check prior to execution. + */ + void rpush(String key, byte[]... items) throws UserCodeExecutionException { + try { + getSafeClient().rpush(key.getBytes(StandardCharsets.UTF_8), items); + } catch (JedisException e) { + throw new UserCodeExecutionException(e); + } + } + + /** + * Pops an item from the front ('left') of the list. Naming of this method preserves that of the + * underlying {@link JedisPooled} client and performs a null check prior to execution. + */ + byte[] lpop(String key) throws UserCodeExecutionException { + try { + return getSafeClient().lpop(key.getBytes(StandardCharsets.UTF_8)); + } catch (JedisException e) { + throw new UserCodeExecutionException(e); + } + } + + /** + * Sets the key/value for a Duration expiry. Naming of this method preserves that of the + * underlying {@link JedisPooled} client and performs a null check prior to execution. + */ + void setex(byte[] key, byte[] value, @NonNull Duration expiry) throws UserCodeExecutionException { + try { + getSafeClient().setex(key, expiry.getStandardSeconds(), value); + } catch (JedisException e) { + throw new UserCodeExecutionException(e); + } + } + + /** + * Sets the key/value for a Duration expiry. Naming of this method preserves that of the + * underlying {@link JedisPooled} client and performs a null check prior to execution. + */ + void setex(String key, Long value, @NonNull Duration expiry) throws UserCodeExecutionException { + try { + getSafeClient().setex(key, expiry.getStandardSeconds(), String.valueOf(value)); + } catch (JedisException e) { + throw new UserCodeExecutionException(e); + } + } + + /** Overrides {@link SetupTeardown}'s {@link SetupTeardown#setup} method.
*/ + @Override + public void setup() throws UserCodeExecutionException { + try { + jedis = new JedisPooled(uri); + jedis.ping(); + } catch (JedisException e) { + String message = + String.format("Failed to connect to host: %s, error: %s", uri, e.getMessage()); + throw new UserCodeExecutionException(message, e); + } + } + + private @NonNull JedisPooled getSafeClient() { + return checkStateNotNull(jedis); + } + + /** Overrides {@link SetupTeardown}'s {@link SetupTeardown#teardown} method. */ + @Override + public void teardown() throws UserCodeExecutionException { + if (jedis != null) { + jedis.close(); + } + } +} diff --git a/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/RequestResponseIO.java b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/RequestResponseIO.java new file mode 100644 index 0000000000000..de7d26aab4bd3 --- /dev/null +++ b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/RequestResponseIO.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import com.google.auto.value.AutoValue; +import java.util.Map; +import org.apache.beam.io.requestresponse.RequestResponseIO.Result; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.PInput; +import org.apache.beam.sdk.values.POutput; +import org.apache.beam.sdk.values.PValue; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; + +/** + * {@link PTransform} for reading from and writing to Web APIs. + * + *

{@link RequestResponseIO} is recommended for interacting with external systems that offer RPCs + * that execute relatively quickly and do not offer advanced features to make RPC execution + * efficient. + * + *

For systems that offer features for more efficient reading, for example, tracking progress of + * RPCs or support for splitting RPCs (deriving two or more RPCs which, when combined, return the + * same result), consider using Apache Beam's `Splittable DoFn` interface instead. + * + *

Basic Usage

+ * + * {@link RequestResponseIO} minimally requires implementing the {@link Caller} interface: + * + *
{@code class MyCaller implements Caller<SomeRequest, SomeResponse> {
+ *    public SomeResponse call(SomeRequest request) throws UserCodeExecutionException {
+ *      // calls the API submitting SomeRequest payload and returning SomeResponse
+ *    }
+ * }}
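+ *
+ * A {@link Caller} can also distinguish failure modes for downstream handling. The sketch
+ * below is illustrative only: the {@code httpClient}, its {@code send} method, and the
+ * {@code SomeRawResponse} type are assumed placeholders, not part of this API. Throwing
+ * {@link UserCodeQuotaException} marks a failure as quota-related:
+ *
+ * {@code public SomeResponse call(SomeRequest request) throws UserCodeExecutionException {
+ *    try {
+ *      SomeRawResponse raw = httpClient.send(request); // hypothetical client call
+ *      if (raw.statusCode() == 429) {
+ *        // Too Many Requests: signal quota exhaustion distinctly from other errors.
+ *        throw new UserCodeQuotaException(new RuntimeException("HTTP 429"));
+ *      }
+ *      return SomeResponse.of(raw); // hypothetical conversion helper
+ *    } catch (IOException e) {
+ *      throw new UserCodeExecutionException(e);
+ *    }
+ *  }}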
+ * + *

Then provide your {@link Caller} implementation to {@link RequestResponseIO}'s {@link #of} + * method. + * + *

{@code  PCollection<SomeRequest> requests = ...
+ *  Result<SomeResponse> result = requests.apply(RequestResponseIO.of(new MyCaller()));
+ *  result.getResponses().apply( ... );
+ *  result.getFailures().apply( ... );
+ * }
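+ *
+ * The failure output carries schema-aware {@link ApiIOError} elements. As a minimal sketch
+ * (the use of MapElements and TypeDescriptors here is illustrative, not prescriptive), failures
+ * can be rendered to strings for logging or further processing:
+ *
+ * {@code  result.getFailures().apply(
+ *      MapElements.into(TypeDescriptors.strings())
+ *          .via(err -> err.getMessage() + ": " + err.getRequestAsJsonString()));
+ * }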
+ */ +public class RequestResponseIO + extends PTransform, Result> { + + private static final TupleTag FAILURE_TAG = new TupleTag() {}; + + // TODO(damondouglas): remove when utilized. + @SuppressWarnings({"unused"}) + private final Configuration configuration; + + private RequestResponseIO(Configuration configuration) { + this.configuration = configuration; + } + + public static RequestResponseIO of( + Caller caller) { + return new RequestResponseIO<>( + Configuration.builder().setCaller(caller).build()); + } + + /** Configuration details for {@link RequestResponseIO}. */ + @AutoValue + abstract static class Configuration { + + static Builder builder() { + return new AutoValue_RequestResponseIO_Configuration.Builder<>(); + } + + /** + * The {@link Caller} that interfaces user custom code to process a {@link RequestT} into a + * {@link ResponseT}. + */ + abstract Caller getCaller(); + + abstract Builder toBuilder(); + + @AutoValue.Builder + abstract static class Builder { + + abstract Builder setCaller(Caller value); + + abstract Configuration build(); + } + } + + @Override + public Result expand(PCollection input) { + // TODO(damondouglas; https://github.com/apache/beam/issues?q=is%3Aissue+is%3Aopen+%5BRRIO%5D): + // expand pipeline as more dependencies develop. + return Result.of(new TupleTag() {}, PCollectionTuple.empty(input.getPipeline())); + } + + /** + * The {@link Result} of processing request {@link PCollection} into response {@link PCollection} + * using custom {@link Caller} code. + */ + public static class Result implements POutput { + + static Result of(TupleTag responseTag, PCollectionTuple pct) { + return new Result<>(responseTag, pct); + } + + private final Pipeline pipeline; + private final TupleTag responseTag; + private final PCollection responses; + private final PCollection failures; + + private Result(TupleTag responseTag, PCollectionTuple pct) { + this.pipeline = pct.getPipeline(); + this.responseTag = responseTag; + this.responses = pct.get(responseTag); + this.failures = pct.get(FAILURE_TAG); + } + + public PCollection getResponses() { + return responses; + } + + public PCollection getFailures() { + return failures; + } + + @Override + public Pipeline getPipeline() { + return this.pipeline; + } + + @Override + public Map, PValue> expand() { + return ImmutableMap.of( + responseTag, responses, + FAILURE_TAG, failures); + } + + @Override + public void finishSpecifyingOutput( + String transformName, PInput input, PTransform transform) {} + } +} diff --git a/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/SetupTeardown.java b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/SetupTeardown.java new file mode 100644 index 0000000000000..be1b03105c3dc --- /dev/null +++ b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/SetupTeardown.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import java.io.Serializable; + +/** + * Provided by the user and called within {@link org.apache.beam.sdk.transforms.DoFn.Setup} and {@link + * org.apache.beam.sdk.transforms.DoFn.Teardown} lifecycle methods of {@link Call}'s {@link + * org.apache.beam.sdk.transforms.DoFn}. + */ +public interface SetupTeardown extends Serializable { + + /** Called during the {@link org.apache.beam.sdk.transforms.DoFn}'s setup lifecycle method. */ + void setup() throws UserCodeExecutionException; + + /** Called during the {@link org.apache.beam.sdk.transforms.DoFn}'s teardown lifecycle method. */ + void teardown() throws UserCodeExecutionException; +} diff --git a/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/ThrottleDequeue.java b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/ThrottleDequeue.java new file mode 100644 index 0000000000000..085b13b5e1120 --- /dev/null +++ b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/ThrottleDequeue.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import com.google.auto.value.AutoValue; +import java.util.Map; +import org.apache.beam.io.requestresponse.ThrottleDequeue.Result; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.PInput; +import org.apache.beam.sdk.values.POutput; +import org.apache.beam.sdk.values.PValue; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.joda.time.Instant; + +/** + * {@link ThrottleDequeue} dequeues {@link RequestT} elements at a fixed rate yielding a {@link + * Result} containing the dequeued {@link RequestT} {@link PCollection} and an {@link ApiIOError} + * {@link PCollection} of any errors. + */ +class ThrottleDequeue extends PTransform, Result> { + + private static final TupleTag FAILURE_TAG = new TupleTag() {}; + + // TODO(damondouglas): remove suppress warnings after instance utilized.
+ @SuppressWarnings({"unused"}) + private final Configuration configuration; + + private ThrottleDequeue(Configuration configuration) { + this.configuration = configuration; + } + + @Override + public Result expand(PCollection input) { + // TODO(damondouglas): expand in a future PR. + return new Result<>(new TupleTag() {}, PCollectionTuple.empty(input.getPipeline())); + } + + @AutoValue + abstract static class Configuration { + + @AutoValue.Builder + abstract static class Builder { + abstract Configuration build(); + } + } + + /** The {@link Result} of dequeuing {@link RequestT}s. */ + static class Result implements POutput { + + static Result of(TupleTag requestsTag, PCollectionTuple pct) { + return new Result<>(requestsTag, pct); + } + + private final Pipeline pipeline; + private final TupleTag requestsTag; + private final PCollection requests; + private final PCollection failures; + + private Result(TupleTag requestsTag, PCollectionTuple pct) { + this.pipeline = pct.getPipeline(); + this.requestsTag = requestsTag; + this.requests = pct.get(requestsTag); + this.failures = pct.get(FAILURE_TAG); + } + + @Override + public Pipeline getPipeline() { + return pipeline; + } + + @Override + public Map, PValue> expand() { + return ImmutableMap.of( + requestsTag, requests, + FAILURE_TAG, failures); + } + + @Override + public void finishSpecifyingOutput( + String transformName, PInput input, PTransform transform) {} + } +} diff --git a/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/ThrottleEnqueue.java b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/ThrottleEnqueue.java new file mode 100644 index 0000000000000..505ef86be48b3 --- /dev/null +++ b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/ThrottleEnqueue.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import com.google.auto.value.AutoValue; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.TypeDescriptor; + +/** + * {@link ThrottleEnqueue} enqueues {@link RequestT} elements yielding an {@link ApiIOError} {@link + * PCollection} of any enqueue errors. + */ +class ThrottleEnqueue extends PTransform, PCollection> { + + @SuppressWarnings({"unused"}) + private final Configuration configuration; + + private ThrottleEnqueue(Configuration configuration) { + this.configuration = configuration; + } + + /** Configuration details for {@link ThrottleEnqueue}. 
*/ + @AutoValue + abstract static class Configuration { + + static Builder builder() { + return new AutoValue_ThrottleEnqueue_Configuration.Builder<>(); + } + + abstract Builder toBuilder(); + + @AutoValue.Builder + abstract static class Builder { + + abstract Configuration build(); + } + } + + @Override + public PCollection expand(PCollection input) { + // TODO(damondouglas): expand in a future PR. + return input.getPipeline().apply(Create.empty(TypeDescriptor.of(ApiIOError.class))); + } +} diff --git a/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/ThrottleRefreshQuota.java b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/ThrottleRefreshQuota.java new file mode 100644 index 0000000000000..57e57528db4bc --- /dev/null +++ b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/ThrottleRefreshQuota.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import com.google.auto.value.AutoValue; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.TypeDescriptor; +import org.joda.time.Instant; + +/** + * {@link ThrottleRefreshQuota} refreshes a quota per {@link Instant} processing event, emitting any + * errors into an {@link ApiIOError} {@link PCollection}. + */ +class ThrottleRefreshQuota extends PTransform, PCollection> { + + // TODO: remove suppress warnings after configuration utilized. + @SuppressWarnings({"unused"}) + private final Configuration configuration; + + private ThrottleRefreshQuota(Configuration configuration) { + this.configuration = configuration; + } + + @Override + public PCollection expand(PCollection input) { + // TODO(damondouglas): expand in a later PR. + return input.getPipeline().apply(Create.empty(TypeDescriptor.of(ApiIOError.class))); + } + + @AutoValue + abstract static class Configuration { + + @AutoValue.Builder + abstract static class Builder { + abstract Configuration build(); + } + } +} diff --git a/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/ThrottleWithoutExternalResource.java b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/ThrottleWithoutExternalResource.java new file mode 100644 index 0000000000000..0648a86f28eb5 --- /dev/null +++ b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/ThrottleWithoutExternalResource.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import com.google.auto.value.AutoValue; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.values.PCollection; + +/** + * {@link ThrottleWithoutExternalResource} throttles a {@link RequestT} {@link PCollection} emitting + * a {@link RequestT} {@link PCollection} at a configured maximum rate, without using an external + * resource. + */ +// TODO(damondouglas): expand what "without external resource" means with respect to "with external +// resource" when the other throttle transforms are implemented. +// See: https://github.com/apache/beam/issues/28932 +class ThrottleWithoutExternalResource + extends PTransform, PCollection> { + + // TODO(damondouglas): remove suppress warnings when finally utilized in a future PR. + @SuppressWarnings({"unused"}) + private final Configuration configuration; + + private ThrottleWithoutExternalResource(Configuration configuration) { + this.configuration = configuration; + } + + @Override + public PCollection expand(PCollection input) { + // TODO(damondouglas): expand in a future PR. + return input; + } + + @AutoValue + abstract static class Configuration { + + @AutoValue.Builder + abstract static class Builder { + abstract Configuration build(); + } + } +} diff --git a/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/UserCodeExecutionException.java b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/UserCodeExecutionException.java new file mode 100644 index 0000000000000..be545b6da66aa --- /dev/null +++ b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/UserCodeExecutionException.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +/** Base {@link Exception} for signaling errors in user custom code.
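+ *
+ * <p>A sketch of typical usage inside a {@link Caller}, where {@code httpClient} is an assumed,
+ * user-provided API client and not part of this module:
+ *
+ * {@code  try {
+ *    return httpClient.send(request);
+ *  } catch (IOException e) {
+ *    // Wrap the checked client exception so Call can route it into the failure PCollection.
+ *    throw new UserCodeExecutionException(e);
+ *  }}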
*/ +public class UserCodeExecutionException extends Exception { + public UserCodeExecutionException(String message) { + super(message); + } + + public UserCodeExecutionException(String message, Throwable cause) { + super(message, cause); + } + + public UserCodeExecutionException(Throwable cause) { + super(cause); + } + + public UserCodeExecutionException( + String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/UserCodeQuotaException.java b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/UserCodeQuotaException.java new file mode 100644 index 0000000000000..c513a5371da77 --- /dev/null +++ b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/UserCodeQuotaException.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +/** + * Extends {@link UserCodeExecutionException} to allow user custom code to signal a quota or API + * overuse related error. + */ +public class UserCodeQuotaException extends UserCodeExecutionException { + + public UserCodeQuotaException(String message) { + super(message); + } + + public UserCodeQuotaException(String message, Throwable cause) { + super(message, cause); + } + + public UserCodeQuotaException(Throwable cause) { + super(cause); + } + + public UserCodeQuotaException( + String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/UserCodeTimeoutException.java b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/UserCodeTimeoutException.java new file mode 100644 index 0000000000000..869b8a51b73fa --- /dev/null +++ b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/UserCodeTimeoutException.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +/** An extension of {@link UserCodeExecutionException} to specifically signal a user code timeout. */ +public class UserCodeTimeoutException extends UserCodeExecutionException { + + public UserCodeTimeoutException(String message) { + super(message); + } + + public UserCodeTimeoutException(String message, Throwable cause) { + super(message, cause); + } + + public UserCodeTimeoutException(Throwable cause) { + super(cause); + } + + public UserCodeTimeoutException( + String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/package-info.java b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/package-info.java new file mode 100644 index 0000000000000..abaea0a58b323 --- /dev/null +++ b/sdks/java/io/rrio/src/main/java/org/apache/beam/io/requestresponse/package-info.java @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Package provides Beam I/O transform support for safely reading from and writing to Web APIs. */ +package org.apache.beam.io.requestresponse; diff --git a/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/CallShouldBackoffBasedOnRejectionProbabilityTest.java b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/CallShouldBackoffBasedOnRejectionProbabilityTest.java new file mode 100644 index 0000000000000..40aaa48c26925 --- /dev/null +++ b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/CallShouldBackoffBasedOnRejectionProbabilityTest.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.beam.io.requestresponse; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link CallShouldBackoffBasedOnRejectionProbability}. */ +@RunWith(JUnit4.class) +public class CallShouldBackoffBasedOnRejectionProbabilityTest { + + @Test + public void testValue() { + for (Case caze : CASES) { + CallShouldBackoffBasedOnRejectionProbability shouldBackoff = instance(); + for (boolean ar : caze.acceptRejects) { + if (ar) { + shouldBackoff.update(""); + } else { + shouldBackoff.update(new UserCodeExecutionException("")); + } + } + assertEquals(caze.toString(), caze.wantPReject, shouldBackoff.getRejectionProbability(), 0.1); + assertEquals(caze.toString(), caze.wantValue, shouldBackoff.value()); + } + } + + private static final List CASES = + Arrays.asList( + of(0, false), + of(0, false, true, true, true, true, true, true, true, true, true, true, true), + of(0, false, true), + of(0.5, false, false), + of(0.91, true, false, false, false, false, false, false, false, false, false, false)); + + private static Case of(double wantPReject, boolean wantValue, boolean... acceptRejects) { + List list = new ArrayList<>(); + for (boolean ar : acceptRejects) { + list.add(ar); + } + return new Case(list, wantPReject, wantValue); + } + + private static class Case { + private final List acceptRejects; + private final double wantPReject; + private final boolean wantValue; + + Case(List acceptRejects, double wantPReject, boolean wantValue) { + this.acceptRejects = acceptRejects; + this.wantPReject = wantPReject; + this.wantValue = wantValue; + } + + @Override + public String toString() { + return "Case{" + + "acceptRejects=" + + acceptRejects + + ", wantPReject=" + + wantPReject + + ", wantValue=" + + wantValue + + '}'; + } + } + + CallShouldBackoffBasedOnRejectionProbability instance() { + return new CallShouldBackoffBasedOnRejectionProbability().setThreshold(0.5); + } +} diff --git a/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/CallTest.java b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/CallTest.java new file mode 100644 index 0000000000000..18574b00978dc --- /dev/null +++ b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/CallTest.java @@ -0,0 +1,493 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.io.requestresponse; + +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; +import static org.apache.beam.sdk.values.TypeDescriptors.strings; +import static org.junit.Assert.assertThrows; +import static org.junit.Assert.assertTrue; + +import java.io.Serializable; +import org.apache.beam.io.requestresponse.Call.Result; +import org.apache.beam.sdk.coders.SerializableCoder; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Count; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.Filter; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Objects; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Throwables; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.UncheckedExecutionException; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.joda.time.Duration; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link Call}. */ +@RunWith(JUnit4.class) +public class CallTest { + @Rule public TestPipeline pipeline = TestPipeline.create(); + + private static final SerializableCoder<@NonNull Response> RESPONSE_CODER = + SerializableCoder.of(Response.class); + + @Test + public void givenCallerNotSerializable_throwsError() { + assertThrows( + IllegalArgumentException.class, () -> Call.of(new UnSerializableCaller(), RESPONSE_CODER)); + } + + @Test + public void givenSetupTeardownNotSerializable_throwsError() { + assertThrows( + IllegalArgumentException.class, + () -> + Call.ofCallerAndSetupTeardown( + new UnSerializableCallerWithSetupTeardown(), RESPONSE_CODER)); + } + + @Test + public void givenCallerThrowsUserCodeExecutionException_emitsIntoFailurePCollection() { + Result result = + pipeline + .apply(Create.of(new Request("a"))) + .apply(Call.of(new CallerThrowsUserCodeExecutionException(), RESPONSE_CODER)); + + PCollection failures = result.getFailures(); + PAssert.thatSingleton(countStackTracesOf(failures, UserCodeExecutionException.class)) + .isEqualTo(1L); + PAssert.thatSingleton(countStackTracesOf(failures, UserCodeQuotaException.class)).isEqualTo(0L); + PAssert.thatSingleton(countStackTracesOf(failures, UserCodeTimeoutException.class)) + .isEqualTo(0L); + + pipeline.run(); + } + + @Test + public void givenCallerThrowsQuotaException_emitsIntoFailurePCollection() { + Result result = + pipeline + .apply(Create.of(new Request("a"))) + .apply(Call.of(new CallerInvokesQuotaException(), RESPONSE_CODER)); + + PCollection failures = result.getFailures(); + PAssert.thatSingleton(countStackTracesOf(failures, UserCodeExecutionException.class)) + .isEqualTo(0L); + PAssert.thatSingleton(countStackTracesOf(failures, UserCodeQuotaException.class)).isEqualTo(1L); + PAssert.thatSingleton(countStackTracesOf(failures, UserCodeTimeoutException.class)) + .isEqualTo(0L); + + pipeline.run(); + } + + @Test + public void givenCallerTimeout_emitsFailurePCollection() { + Duration timeout = Duration.standardSeconds(1L); + Result result = + pipeline + .apply(Create.of(new Request("a"))) + .apply(Call.of(new CallerExceedsTimeout(timeout), RESPONSE_CODER).withTimeout(timeout)); + + PCollection failures = result.getFailures(); + PAssert.thatSingleton(countStackTracesOf(failures, 
UserCodeExecutionException.class)) + .isEqualTo(0L); + PAssert.thatSingleton(countStackTracesOf(failures, UserCodeQuotaException.class)).isEqualTo(0L); + PAssert.thatSingleton(countStackTracesOf(failures, UserCodeTimeoutException.class)) + .isEqualTo(1L); + + pipeline.run(); + } + + @Test + public void givenCallerThrowsTimeoutException_emitsFailurePCollection() { + Result result = + pipeline + .apply(Create.of(new Request("a"))) + .apply(Call.of(new CallerThrowsTimeout(), RESPONSE_CODER)); + + PCollection failures = result.getFailures(); + PAssert.thatSingleton(countStackTracesOf(failures, UserCodeExecutionException.class)) + .isEqualTo(1L); + PAssert.thatSingleton(countStackTracesOf(failures, UserCodeQuotaException.class)).isEqualTo(0L); + PAssert.thatSingleton(countStackTracesOf(failures, UserCodeTimeoutException.class)) + .isEqualTo(1L); + + pipeline.run(); + } + + @Test + public void givenSetupThrowsUserCodeExecutionException_throwsError() { + pipeline + .apply(Create.of(new Request(""))) + .apply( + Call.of(new ValidCaller(), RESPONSE_CODER) + .withSetupTeardown(new SetupThrowsUserCodeExecutionException())); + + assertPipelineThrows(UserCodeExecutionException.class, pipeline); + } + + @Test + public void givenSetupThrowsQuotaException_throwsError() { + pipeline + .apply(Create.of(new Request(""))) + .apply( + Call.of(new ValidCaller(), RESPONSE_CODER) + .withSetupTeardown(new SetupThrowsUserCodeQuotaException())); + + assertPipelineThrows(UserCodeQuotaException.class, pipeline); + } + + @Test + public void givenSetupTimeout_throwsError() { + Duration timeout = Duration.standardSeconds(1L); + + pipeline + .apply(Create.of(new Request(""))) + .apply( + Call.of(new ValidCaller(), RESPONSE_CODER) + .withSetupTeardown(new SetupExceedsTimeout(timeout)) + .withTimeout(timeout)); + + assertPipelineThrows(UserCodeTimeoutException.class, pipeline); + } + + @Test + public void givenSetupThrowsTimeoutException_throwsError() { + pipeline + .apply(Create.of(new Request(""))) + .apply( + Call.of(new ValidCaller(), RESPONSE_CODER) + .withSetupTeardown(new SetupThrowsUserCodeTimeoutException())); + + assertPipelineThrows(UserCodeTimeoutException.class, pipeline); + } + + @Test + public void givenTeardownThrowsUserCodeExecutionException_throwsError() { + pipeline + .apply(Create.of(new Request(""))) + .apply( + Call.of(new ValidCaller(), RESPONSE_CODER) + .withSetupTeardown(new TeardownThrowsUserCodeExecutionException())); + + // Exceptions thrown during teardown do not populate with the cause + assertThrows(IllegalStateException.class, () -> pipeline.run()); + } + + @Test + public void givenTeardownThrowsQuotaException_throwsError() { + pipeline + .apply(Create.of(new Request(""))) + .apply( + Call.of(new ValidCaller(), RESPONSE_CODER) + .withSetupTeardown(new TeardownThrowsUserCodeQuotaException())); + + // Exceptions thrown during teardown do not populate with the cause + assertThrows(IllegalStateException.class, () -> pipeline.run()); + } + + @Test + public void givenTeardownTimeout_throwsError() { + Duration timeout = Duration.standardSeconds(1L); + pipeline + .apply(Create.of(new Request(""))) + .apply( + Call.of(new ValidCaller(), RESPONSE_CODER) + .withTimeout(timeout) + .withSetupTeardown(new TeardownExceedsTimeout(timeout))); + + // Exceptions thrown during teardown do not populate with the cause + assertThrows(IllegalStateException.class, () -> pipeline.run()); + } + + @Test + public void givenTeardownThrowsTimeoutException_throwsError() { + pipeline + .apply(Create.of(new Request(""))) + 
.apply( + Call.of(new ValidCaller(), RESPONSE_CODER) + .withSetupTeardown(new TeardownThrowsUserCodeTimeoutException())); + + // Exceptions thrown during teardown do not populate with the cause + assertThrows(IllegalStateException.class, () -> pipeline.run()); + } + + @Test + public void givenValidCaller_emitValidResponse() { + Result result = + pipeline + .apply(Create.of(new Request("a"))) + .apply(Call.of(new ValidCaller(), RESPONSE_CODER)); + + PAssert.thatSingleton(result.getFailures().apply(Count.globally())).isEqualTo(0L); + PAssert.that(result.getResponses()).containsInAnyOrder(new Response("a")); + + pipeline.run(); + } + + private static class ValidCaller implements Caller { + + @Override + public Response call(Request request) throws UserCodeExecutionException { + return new Response(request.id); + } + } + + private static class UnSerializableCaller implements Caller { + + @SuppressWarnings({"unused"}) + private final UnSerializable nestedThing = new UnSerializable(); + + @Override + public Response call(Request request) throws UserCodeExecutionException { + return new Response(request.id); + } + } + + private static class UnSerializableCallerWithSetupTeardown extends UnSerializableCaller + implements SetupTeardown { + + @Override + public void setup() throws UserCodeExecutionException {} + + @Override + public void teardown() throws UserCodeExecutionException {} + } + + private static class UnSerializable {} + + private static class Request implements Serializable { + + final String id; + + Request(String id) { + this.id = id; + } + + public String getId() { + return id; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + Request request = (Request) o; + return Objects.equal(id, request.id); + } + + @Override + public int hashCode() { + return Objects.hashCode(id); + } + } + + private static class Response implements Serializable { + final String id; + + Response(String id) { + this.id = id; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + Response response = (Response) o; + return Objects.equal(id, response.id); + } + + @Override + public int hashCode() { + return Objects.hashCode(id); + } + } + + private static class CallerExceedsTimeout implements Caller { + private final Duration timeout; + + CallerExceedsTimeout(Duration timeout) { + this.timeout = timeout.plus(Duration.standardSeconds(1L)); + } + + @Override + public Response call(Request request) throws UserCodeExecutionException { + sleep(timeout); + return new Response(request.id); + } + } + + private static class CallerThrowsUserCodeExecutionException implements Caller { + + @Override + public Response call(Request request) throws UserCodeExecutionException { + throw new UserCodeExecutionException(request.id); + } + } + + private static class CallerThrowsTimeout implements Caller { + + @Override + public Response call(Request request) throws UserCodeExecutionException { + throw new UserCodeTimeoutException(""); + } + } + + private static class CallerInvokesQuotaException implements Caller { + + @Override + public Response call(Request request) throws UserCodeExecutionException { + throw new UserCodeQuotaException(request.id); + } + } + + private static class SetupExceedsTimeout implements SetupTeardown { + + private final Duration timeout; + + private SetupExceedsTimeout(Duration timeout) { + 
this.timeout = timeout.plus(Duration.standardSeconds(1L)); + } + + @Override + public void setup() throws UserCodeExecutionException { + sleep(timeout); + } + + @Override + public void teardown() throws UserCodeExecutionException {} + } + + private static class SetupThrowsUserCodeExecutionException implements SetupTeardown { + @Override + public void setup() throws UserCodeExecutionException { + throw new UserCodeExecutionException("error message"); + } + + @Override + public void teardown() throws UserCodeExecutionException {} + } + + private static class SetupThrowsUserCodeQuotaException implements SetupTeardown { + @Override + public void setup() throws UserCodeExecutionException { + throw new UserCodeQuotaException(""); + } + + @Override + public void teardown() throws UserCodeExecutionException {} + } + + private static class SetupThrowsUserCodeTimeoutException implements SetupTeardown { + @Override + public void setup() throws UserCodeExecutionException { + throw new UserCodeTimeoutException(""); + } + + @Override + public void teardown() throws UserCodeExecutionException {} + } + + private static class TeardownExceedsTimeout implements SetupTeardown { + private final Duration timeout; + + private TeardownExceedsTimeout(Duration timeout) { + this.timeout = timeout.plus(Duration.standardSeconds(1L)); + } + + @Override + public void setup() throws UserCodeExecutionException {} + + @Override + public void teardown() throws UserCodeExecutionException { + sleep(timeout); + } + } + + private static class TeardownThrowsUserCodeExecutionException implements SetupTeardown { + @Override + public void setup() throws UserCodeExecutionException {} + + @Override + public void teardown() throws UserCodeExecutionException { + throw new UserCodeExecutionException(""); + } + } + + private static class TeardownThrowsUserCodeQuotaException implements SetupTeardown { + @Override + public void setup() throws UserCodeExecutionException {} + + @Override + public void teardown() throws UserCodeExecutionException { + throw new UserCodeQuotaException(""); + } + } + + private static class TeardownThrowsUserCodeTimeoutException implements SetupTeardown { + @Override + public void setup() throws UserCodeExecutionException {} + + @Override + public void teardown() throws UserCodeExecutionException { + throw new UserCodeTimeoutException(""); + } + } + + private static void assertPipelineThrows( + Class clazz, TestPipeline p) { + + // Because we need to wrap in a timeout via a Java Future, exceptions are thrown as + // UncheckedExecutionException + UncheckedExecutionException error = assertThrows(UncheckedExecutionException.class, p::run); + + // Iterate through the causal chain to assert that ErrorT is present.
+ assertTrue( + error.toString(), Throwables.getCausalChain(error).stream().anyMatch(clazz::isInstance)); + } + + private static PCollection countStackTracesOf( + PCollection failures, Class clazz) { + return failures + .apply( + "stackTrace " + clazz.getSimpleName(), + MapElements.into(strings()).via(failure -> checkStateNotNull(failure).getStackTrace())) + .apply( + "filter " + clazz.getSimpleName(), Filter.by(input -> input.contains(clazz.getName()))) + .apply("count " + clazz.getSimpleName(), Count.globally()); + } + + private static void sleep(Duration timeout) { + try { + Thread.sleep(timeout.getMillis()); + } catch (InterruptedException ignored) { + } + } +} diff --git a/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/CallerTest.java b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/CallerTest.java new file mode 100644 index 0000000000000..93f3de474c58d --- /dev/null +++ b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/CallerTest.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; + +import org.apache.beam.sdk.Pipeline.PipelineExecutionException; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.coders.VarIntCoder; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.util.SerializableUtils; +import org.joda.time.Instant; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link Caller}. 
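+ * Verifies that implementations are serializable, are usable within a {@link DoFn}, and can
+ * signal quota and timeout errors through the {@link UserCodeExecutionException} hierarchy.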
*/ +@RunWith(JUnit4.class) +public class CallerTest { + + @Rule public TestPipeline pipeline = TestPipeline.create(); + + @Test + public void canSerializeImplementingClasses() { + SerializableUtils.serializeToByteArray(new CallerImpl()); + } + + @Test + public void canSerializeWhenUsedInDoFn() { + pipeline + .apply(Create.of(Instant.now())) + .apply(ParDo.of(new CallerUsingDoFn<>(new CallerImpl()))) + .setCoder(StringUtf8Coder.of()); + + pipeline.run(); + } + + @Test + public void canSignalQuotaException() { + pipeline + .apply(Create.of(1)) + .apply(ParDo.of(new CallerUsingDoFn<>(new CallerThrowsQuotaException()))) + .setCoder(VarIntCoder.of()); + + PipelineExecutionException executionException = + assertThrows(PipelineExecutionException.class, pipeline::run); + assertEquals(UserCodeQuotaException.class, executionException.getCause().getClass()); + } + + @Test + public void canSignalTimeoutException() { + pipeline + .apply(Create.of(1)) + .apply(ParDo.of(new CallerUsingDoFn<>(new CallerThrowsTimeoutException()))) + .setCoder(VarIntCoder.of()); + + PipelineExecutionException executionException = + assertThrows(PipelineExecutionException.class, pipeline::run); + assertEquals(UserCodeTimeoutException.class, executionException.getCause().getClass()); + } + + private static class CallerUsingDoFn extends DoFn { + private final Caller caller; + + private CallerUsingDoFn(Caller caller) { + this.caller = caller; + } + + @ProcessElement + public void process(@Element RequestT request, OutputReceiver receiver) + throws UserCodeExecutionException { + RequestT safeRequest = checkStateNotNull(request); + ResponseT response = caller.call(safeRequest); + receiver.output(response); + } + } + + private static class CallerImpl implements Caller { + + @Override + public String call(Instant request) throws UserCodeExecutionException { + return request.toString(); + } + } + + private static class CallerThrowsQuotaException implements Caller { + + @Override + public Integer call(Integer request) throws UserCodeExecutionException { + throw new UserCodeQuotaException("quota"); + } + } + + private static class CallerThrowsTimeoutException implements Caller { + + @Override + public Integer call(Integer request) throws UserCodeExecutionException { + throw new UserCodeTimeoutException("timeout"); + } + } +} diff --git a/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/EchoGRPCCallerWithSetupTeardown.java b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/EchoGRPCCallerWithSetupTeardown.java new file mode 100644 index 0000000000000..22e2ff9b1a1f7 --- /dev/null +++ b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/EchoGRPCCallerWithSetupTeardown.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.io.requestresponse; + +import io.grpc.ChannelCredentials; +import io.grpc.InsecureChannelCredentials; +import io.grpc.ManagedChannel; +import io.grpc.StatusRuntimeException; +import io.grpc.netty.shaded.io.grpc.netty.NettyChannelBuilder; +import java.net.URI; +import java.util.concurrent.TimeUnit; +import org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest; +import org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse; +import org.apache.beam.testinfra.mockapis.echo.v1.EchoServiceGrpc; +import org.apache.beam.testinfra.mockapis.echo.v1.EchoServiceGrpc.EchoServiceBlockingStub; +import org.checkerframework.checker.nullness.qual.MonotonicNonNull; + +/** + * Implements {@link Caller} and {@link SetupTeardown} to call the {@link EchoServiceGrpc}. The + * purpose of {@link EchoGRPCCallerWithSetupTeardown} is to support integration tests. + */ +class EchoGRPCCallerWithSetupTeardown implements Caller, SetupTeardown { + + static EchoGRPCCallerWithSetupTeardown of(URI uri) { + return new EchoGRPCCallerWithSetupTeardown(uri); + } + + private final URI uri; + private transient @MonotonicNonNull ManagedChannel cachedManagedChannel; + private transient @MonotonicNonNull EchoServiceBlockingStub cachedBlockingStub; + private static final ChannelCredentials DEFAULT_CREDENTIALS = InsecureChannelCredentials.create(); + + private EchoGRPCCallerWithSetupTeardown(URI uri) { + this.uri = uri; + } + + /** + * Overrides {@link Caller#call} invoking the {@link EchoServiceGrpc} with a {@link EchoRequest}, + * returning either a successful {@link EchoResponse} or throwing either a {@link + * UserCodeExecutionException}, a {@link UserCodeTimeoutException}, or a {@link + * UserCodeQuotaException}. + */ + @Override + public EchoResponse call(EchoRequest request) throws UserCodeExecutionException { + try { + return cachedBlockingStub.echo(request); + } catch (StatusRuntimeException e) { + switch (e.getStatus().getCode()) { + case RESOURCE_EXHAUSTED: + throw new UserCodeQuotaException(e); + case DEADLINE_EXCEEDED: + throw new UserCodeTimeoutException(e); + default: + throw new UserCodeExecutionException(e); + } + } + } + + /** + * Overrides {@link SetupTeardown#setup} to initialize the {@link ManagedChannel} and {@link + * EchoServiceBlockingStub}. + */ + @Override + public void setup() throws UserCodeExecutionException { + cachedManagedChannel = + NettyChannelBuilder.forTarget(uri.toString(), DEFAULT_CREDENTIALS).build(); + cachedBlockingStub = EchoServiceGrpc.newBlockingStub(cachedManagedChannel); + } + + /** Overrides {@link SetupTeardown#teardown} to shut down the {@link ManagedChannel}. */ + @Override + public void teardown() throws UserCodeExecutionException { + if (cachedManagedChannel != null && !cachedManagedChannel.isShutdown()) { + cachedManagedChannel.shutdown(); + try { + boolean ignored = cachedManagedChannel.awaitTermination(1L, TimeUnit.SECONDS); + } catch (InterruptedException ignored) { + } + } + } +} diff --git a/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/EchoGRPCCallerWithSetupTeardownTestIT.java b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/EchoGRPCCallerWithSetupTeardownTestIT.java new file mode 100644 index 0000000000000..14b6e9e6433d4 --- /dev/null +++ b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/EchoGRPCCallerWithSetupTeardownTestIT.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import static org.apache.beam.sdk.io.common.IOITHelper.readIOTestPipelineOptions; +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; + +import com.google.protobuf.ByteString; +import java.net.URI; +import org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest; +import org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse; +import org.apache.beam.testinfra.mockapis.echo.v1.EchoServiceGrpc; +import org.checkerframework.checker.nullness.qual.MonotonicNonNull; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** + * Tests for {@link EchoGRPCCallerWithSetupTeardown} on a deployed {@link EchoServiceGrpc} instance. + * See {@link EchoITOptions} for details on the required parameters and how to provide these for + * running integration tests. + */ +@RunWith(JUnit4.class) +public class EchoGRPCCallerWithSetupTeardownTestIT { + + private static @MonotonicNonNull EchoITOptions options; + private static @MonotonicNonNull EchoGRPCCallerWithSetupTeardown client; + private static final ByteString PAYLOAD = ByteString.copyFromUtf8("payload"); + + @BeforeClass + public static void setUp() throws UserCodeExecutionException { + options = readIOTestPipelineOptions(EchoITOptions.class); + if (options.getgRPCEndpointAddress().isEmpty()) { + throw new RuntimeException( + "--gRPCEndpointAddress is missing. See " + EchoITOptions.class + " for details."); + } + client = EchoGRPCCallerWithSetupTeardown.of(URI.create(options.getgRPCEndpointAddress())); + checkStateNotNull(client).setup(); + + EchoRequest request = createShouldExceedQuotaRequest(); + + // A real quota-aware endpoint, against which these tests validate integration, requires an + // allocated quota of at least 1, and the quota we expect to exceed is shared among many tests + // and across languages. The calls below therefore drive the API into a state where a quota + // exceeded error can be expected. Tests in this file detect errors in expected responses, so + // only exceptions that are not UserCodeQuotaException are rethrown here.
+ try { + EchoResponse ignored = client.call(request); + client.call(request); + client.call(request); + } catch (UserCodeExecutionException e) { + if (!(e instanceof UserCodeQuotaException)) { + throw e; + } + } + } + + @AfterClass + public static void tearDown() throws UserCodeExecutionException { + checkStateNotNull(client).teardown(); + } + + @Test + public void givenValidRequest_receivesResponse() throws UserCodeExecutionException { + EchoRequest request = createShouldNeverExceedQuotaRequest(); + EchoResponse response = client.call(request); + assertEquals(response.getId(), request.getId()); + assertEquals(response.getPayload(), request.getPayload()); + } + + @Test + public void givenExceededQuota_shouldThrow() { + assertThrows(UserCodeQuotaException.class, () -> client.call(createShouldExceedQuotaRequest())); + } + + @Test + public void givenNotFound_shouldThrow() { + UserCodeExecutionException error = + assertThrows( + UserCodeExecutionException.class, + () -> + client.call( + EchoRequest.newBuilder() + .setId("i-dont-exist-quota-id") + .setPayload(PAYLOAD) + .build())); + assertEquals( + "io.grpc.StatusRuntimeException: NOT_FOUND: error: source not found: i-dont-exist-quota-id, err resource does not exist", + error.getMessage()); + } + + private static @NonNull EchoRequest createShouldNeverExceedQuotaRequest() { + return EchoRequest.newBuilder() + .setPayload(PAYLOAD) + .setId(checkStateNotNull(options).getNeverExceedQuotaId()) + .build(); + } + + private static @NonNull EchoRequest createShouldExceedQuotaRequest() { + return EchoRequest.newBuilder() + .setPayload(PAYLOAD) + .setId(checkStateNotNull(options).getShouldExceedQuotaId()) + .build(); + } +} diff --git a/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/EchoHTTPCaller.java b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/EchoHTTPCaller.java new file mode 100644 index 0000000000000..91842f2efb27d --- /dev/null +++ b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/EchoHTTPCaller.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.io.requestresponse; + +import com.google.api.client.http.ByteArrayContent; +import com.google.api.client.http.GenericUrl; +import com.google.api.client.http.HttpMediaType; +import com.google.api.client.http.HttpRequest; +import com.google.api.client.http.HttpRequestFactory; +import com.google.api.client.http.HttpResponse; +import com.google.api.client.http.HttpResponseException; +import com.google.api.client.http.javanet.NetHttpTransport; +import com.google.protobuf.util.JsonFormat; +import java.io.IOException; +import java.net.URI; +import org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest; +import org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse; +import org.apache.beam.testinfra.mockapis.echo.v1.EchoServiceGrpc; + +/** + * Implements {@link Caller} to call the {@link EchoServiceGrpc}'s HTTP handler. The purpose of + * {@link EchoHTTPCaller} is to support integration tests. + */ +class EchoHTTPCaller implements Caller { + + static EchoHTTPCaller of(URI uri) { + return new EchoHTTPCaller(uri); + } + + private static final String PATH = "/v1/echo"; + private static final HttpRequestFactory REQUEST_FACTORY = + new NetHttpTransport().createRequestFactory(); + private static final HttpMediaType CONTENT_TYPE = new HttpMediaType("application/json"); + private static final int STATUS_CODE_TOO_MANY_REQUESTS = 429; + + private final URI uri; + + private EchoHTTPCaller(URI uri) { + this.uri = uri; + } + + /** + * Overrides {@link Caller#call} invoking the {@link EchoServiceGrpc}'s HTTP handler with a {@link + * EchoRequest}, returning either a successful {@link EchoResponse} or throwing either a {@link + * UserCodeExecutionException}, a {@link UserCodeTimeoutException}, or a {@link + * UserCodeQuotaException}. + */ + @Override + public EchoResponse call(EchoRequest request) throws UserCodeExecutionException { + try { + String json = JsonFormat.printer().omittingInsignificantWhitespace().print(request); + ByteArrayContent body = ByteArrayContent.fromString(CONTENT_TYPE.getType(), json); + HttpRequest httpRequest = REQUEST_FACTORY.buildPostRequest(getUrl(), body); + HttpResponse httpResponse = httpRequest.execute(); + String responseJson = httpResponse.parseAsString(); + EchoResponse.Builder builder = EchoResponse.newBuilder(); + JsonFormat.parser().merge(responseJson, builder); + return builder.build(); + } catch (IOException e) { + if (e instanceof HttpResponseException) { + HttpResponseException ex = (HttpResponseException) e; + if (ex.getStatusCode() == STATUS_CODE_TOO_MANY_REQUESTS) { + throw new UserCodeQuotaException(e); + } + } + throw new UserCodeExecutionException(e); + } + } + + private GenericUrl getUrl() { + String rawUrl = uri.toString(); + if (uri.getPath().isEmpty()) { + rawUrl += PATH; + } + return new GenericUrl(rawUrl); + } +} diff --git a/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/EchoHTTPCallerTestIT.java b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/EchoHTTPCallerTestIT.java new file mode 100644 index 0000000000000..fa0cb93781100 --- /dev/null +++ b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/EchoHTTPCallerTestIT.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import static org.apache.beam.sdk.io.common.IOITHelper.readIOTestPipelineOptions; +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; +import static org.junit.Assert.assertTrue; + +import com.google.protobuf.ByteString; +import java.net.URI; +import org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoRequest; +import org.apache.beam.testinfra.mockapis.echo.v1.Echo.EchoResponse; +import org.apache.beam.testinfra.mockapis.echo.v1.EchoServiceGrpc; +import org.checkerframework.checker.nullness.qual.MonotonicNonNull; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** + * Tests for {@link EchoHTTPCaller} on a deployed {@link EchoServiceGrpc} instance's HTTP + * handler. See {@link EchoITOptions} for details on the required parameters and how to provide + * these for running integration tests. + */ +@RunWith(JUnit4.class) +public class EchoHTTPCallerTestIT { + + private static @MonotonicNonNull EchoITOptions options; + private static @MonotonicNonNull EchoHTTPCaller client; + private static final ByteString PAYLOAD = ByteString.copyFromUtf8("payload"); + + @BeforeClass + public static void setUp() throws UserCodeExecutionException { + options = readIOTestPipelineOptions(EchoITOptions.class); + if (options.getHttpEndpointAddress().isEmpty()) { + throw new RuntimeException( + "--httpEndpointAddress is missing. See " + EchoITOptions.class + " for details."); + } + client = EchoHTTPCaller.of(URI.create(options.getHttpEndpointAddress())); + + EchoRequest request = createShouldExceedQuotaRequest(); + + // A real quota-aware endpoint, against which these tests validate integration, requires an + // allocated quota of at least 1, and the quota we expect to exceed is shared among many tests + // and across languages. The calls below therefore drive the API into a state where a quota + // exceeded error can be expected. Tests in this file detect errors in expected responses, so + // only exceptions that are not UserCodeQuotaException are rethrown here.
+ try { + EchoResponse ignored = client.call(request); + client.call(request); + client.call(request); + client.call(request); + } catch (UserCodeExecutionException e) { + if (!(e instanceof UserCodeQuotaException)) { + throw e; + } + } + } + + @Test + public void givenValidRequest_receivesResponse() throws UserCodeExecutionException { + EchoRequest request = createShouldNeverExceedQuotaRequest(); + EchoResponse response = client.call(request); + assertEquals(response.getId(), request.getId()); + assertEquals(response.getPayload(), request.getPayload()); + } + + @Test + public void givenExceededQuota_shouldThrow() { + assertThrows(UserCodeQuotaException.class, () -> client.call(createShouldExceedQuotaRequest())); + } + + @Test + public void givenNotFound_shouldThrow() { + UserCodeExecutionException error = + assertThrows( + UserCodeExecutionException.class, + () -> + client.call( + EchoRequest.newBuilder() + .setId("i-dont-exist-quota-id") + .setPayload(PAYLOAD) + .build())); + + assertTrue(error.getMessage().contains("404 Not Found")); + } + + private static @NonNull EchoRequest createShouldNeverExceedQuotaRequest() { + return EchoRequest.newBuilder() + .setPayload(PAYLOAD) + .setId(checkStateNotNull(options).getNeverExceedQuotaId()) + .build(); + } + + private static @NonNull EchoRequest createShouldExceedQuotaRequest() { + return EchoRequest.newBuilder() + .setPayload(PAYLOAD) + .setId(checkStateNotNull(options).getShouldExceedQuotaId()) + .build(); + } +} diff --git a/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/EchoITOptions.java b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/EchoITOptions.java new file mode 100644 index 0000000000000..a32f7a78e8265 --- /dev/null +++ b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/EchoITOptions.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.testinfra.mockapis.echo.v1.EchoServiceGrpc; + +/** + * Shared options for running integration tests on a deployed {@link EchoServiceGrpc}. See
https://github.com/apache/beam/tree/master/.test-infra/mock-apis#integration + * for details on how to acquire values required by {@link EchoITOptions}. + * + * <p>To provide these values to your integration tests: + * + * <pre> + *   ./gradlew :sdks:java:io:rrio:integrationTest -DintegrationTestPipelineOptions='[
+ *      "--grpcEndpointAddress=",
+ *      "--httpEndpointAddress="
+ *   ]'
+ * </pre>
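For completeness, the same options can be built programmatically; a minimal sketch, assuming placeholder addresses (PipelineOptionsFactory is Beam's standard entry point for parsing pipeline options):

String[] args = {
  "--grpcEndpointAddress=localhost:50051", // placeholder host:port
  "--httpEndpointAddress=http://localhost:8080" // placeholder URL
};
EchoITOptions options = PipelineOptionsFactory.fromArgs(args).as(EchoITOptions.class);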
+ */ +public interface EchoITOptions extends PipelineOptions { + @Description("The gRPC address of the Echo API endpoint, typically of the form <host>:<port>.") + String getGrpcEndpointAddress(); + + void setGrpcEndpointAddress(String value); + + @Description("The HTTP address of the Echo API endpoint; must begin with http(s)://") + String getHttpEndpointAddress(); + + void setHttpEndpointAddress(String value); + + @Description("The ID of an allocated quota that should never be exceeded.") + @Default.String("echo-should-never-exceed-quota") + String getNeverExceedQuotaId(); + + void setNeverExceedQuotaId(String value); + + @Description("The ID of an allocated quota that should be exceeded.") + @Default.String("echo-should-exceed-quota") + String getShouldExceedQuotaId(); + + void setShouldExceedQuotaId(String value); +} diff --git a/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/RedisClientTestIT.java b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/RedisClientTestIT.java new file mode 100644 index 0000000000000..1fbb320a5f23c --- /dev/null +++ b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/RedisClientTestIT.java @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.allPrimitiveDataTypes; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertThrows; +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.math.BigDecimal; +import java.net.URI; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.UUID; +import org.apache.beam.sdk.coders.SerializableCoder; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.AllPrimitiveDataTypes; +import org.apache.beam.sdk.util.SerializableUtils; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.joda.time.Duration; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.utility.DockerImageName; + +/** Integration tests for {@link RedisClient}.
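The tests below repeatedly encode keys and values with Beam coders before handing the bytes to Redis. A generic round-trip helper illustrating that pattern might look like this (a sketch, not a utility from this PR; assumes java.io and org.apache.beam.sdk.coders.Coder imports):

// Illustrative helpers: round-trip any T through a Beam Coder.
static <T> byte[] toBytes(Coder<T> coder, T value) throws IOException {
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  coder.encode(value, out); // Coder#encode writes the element to an OutputStream
  return out.toByteArray();
}

static <T> T fromBytes(Coder<T> coder, byte[] bytes) throws IOException {
  return coder.decode(new ByteArrayInputStream(bytes)); // symmetric read
}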
*/ +@RunWith(JUnit4.class) +public class RedisClientTestIT { + + private static final String CONTAINER_IMAGE_NAME = "redis:5.0.3-alpine"; + private static final Integer PORT = 6379; + + @Rule + public GenericContainer redis = + new GenericContainer<>(DockerImageName.parse(CONTAINER_IMAGE_NAME)).withExposedPorts(PORT); + + @Rule + public RedisExternalResourcesRule externalClients = + new RedisExternalResourcesRule( + () -> { + redis.start(); + return URI.create( + String.format("redis://%s:%d", redis.getHost(), redis.getFirstMappedPort())); + }); + + @Test + public void canSerialize() { + SerializableUtils.serializeToByteArray(externalClients.getActualClient()); + } + + @Test + public void wrongHostURIThrowsException() { + URI uri = URI.create("redis://1.2.3.4:6379"); + RedisClient client = new RedisClient(uri); + UserCodeExecutionException got = assertThrows(UserCodeExecutionException.class, client::setup); + String expected = + "Failed to connect to host: redis://1.2.3.4:6379, error: Failed to connect to any host resolved for DNS name."; + assertEquals(expected, got.getMessage()); + } + + @Test + public void givenCustomTypeAndCoder_setex_doesNotCorruptData() + throws IOException, UserCodeExecutionException { + + String key = UUID.randomUUID().toString(); + StringUtf8Coder keyCoder = StringUtf8Coder.of(); + + AllPrimitiveDataTypes value = + allPrimitiveDataTypes(true, BigDecimal.ONE, 1.23456, 1.23456f, 1, 1L, "🦄🦄🦄🦄"); + SerializableCoder<@NonNull AllPrimitiveDataTypes> valueCoder = + SerializableCoder.of(AllPrimitiveDataTypes.class); + + ByteArrayOutputStream keyBuffer = new ByteArrayOutputStream(); + keyCoder.encode(key, keyBuffer); + ByteArrayOutputStream valueBuffer = new ByteArrayOutputStream(); + valueCoder.encode(value, valueBuffer); + + byte[] keyBytes = keyBuffer.toByteArray(); + + externalClients + .getActualClient() + .setex(keyBytes, valueBuffer.toByteArray(), Duration.standardHours(1L)); + byte[] storedValueBytes = externalClients.getValidatingClient().get(keyBytes); + AllPrimitiveDataTypes storedValue = + valueCoder.decode(new ByteArrayInputStream(storedValueBytes)); + + assertEquals(value, storedValue); + } + + @Test + public void setex_expiresDataWhenExpected() + throws UserCodeExecutionException, InterruptedException { + Duration expiry = Duration.standardSeconds(2L); + String key = UUID.randomUUID().toString(); + byte[] keyBytes = key.getBytes(StandardCharsets.UTF_8); + externalClients.getActualClient().setex(keyBytes, keyBytes, expiry); + assertTrue(externalClients.getValidatingClient().exists(keyBytes)); + assertTrue(externalClients.getValidatingClient().ttl(keyBytes) > 0L); + Thread.sleep(expiry.getMillis()); + assertFalse(externalClients.getValidatingClient().exists(keyBytes)); + } + + @Test + public void givenCustomTypeAndCoder_rpush_doesNotCorruptData() + throws IOException, UserCodeExecutionException { + String key = UUID.randomUUID().toString(); + + AllPrimitiveDataTypes value = + allPrimitiveDataTypes(true, BigDecimal.ONE, 1.23456, 1.23456f, 1, 1L, "🦄🦄🦄🦄"); + SerializableCoder<@NonNull AllPrimitiveDataTypes> valueCoder = + SerializableCoder.of(AllPrimitiveDataTypes.class); + + ByteArrayOutputStream valueBuffer = new ByteArrayOutputStream(); + valueCoder.encode(value, valueBuffer); + + assertEquals(0L, externalClients.getActualClient().llen(key)); + externalClients.getActualClient().rpush(key, valueBuffer.toByteArray()); + assertEquals(1L, externalClients.getActualClient().llen(key)); + + byte[] storedBytes = externalClients.getActualClient().lpop(key); + + 
AllPrimitiveDataTypes storedValue = valueCoder.decode(new ByteArrayInputStream(storedBytes)); + + assertEquals(value, storedValue); + assertEquals(0L, externalClients.getActualClient().llen(key)); + } + + @Test + public void rpushAndlPopYieldsFIFOOrder() throws UserCodeExecutionException { + String key = UUID.randomUUID().toString(); + List want = ImmutableList.of("1", "2", "3", "4", "5"); + + for (String item : want) { + externalClients.getActualClient().rpush(key, item.getBytes(StandardCharsets.UTF_8)); + } + List got = new ArrayList<>(); + while (!externalClients.getActualClient().isEmpty(key)) { + byte[] bytes = externalClients.getActualClient().lpop(key); + got.add(new String(bytes, StandardCharsets.UTF_8)); + } + + assertEquals(want, got); + } + + @Test + public void givenExpired_decr_yieldsNegativeOne_andNotExists() + throws InterruptedException, UserCodeExecutionException { + String key = UUID.randomUUID().toString(); + externalClients.getActualClient().setex(key, 100L, Duration.standardSeconds(1L)); + assertTrue(externalClients.getActualClient().exists(key)); + Thread.sleep(1500L); + assertFalse(externalClients.getActualClient().exists(key)); + assertEquals(-1L, externalClients.getActualClient().decr(key)); + assertEquals(-2L, externalClients.getActualClient().decr(key)); + assertEquals(-3L, externalClients.getActualClient().decr(key)); + + key = UUID.randomUUID().toString(); + externalClients.getActualClient().setex(key, -100L, Duration.standardSeconds(1L)); + assertTrue(externalClients.getActualClient().exists(key)); + Thread.sleep(1500L); + assertFalse(externalClients.getActualClient().exists(key)); + assertEquals(-1L, externalClients.getActualClient().decr(key)); + assertEquals(-2L, externalClients.getActualClient().decr(key)); + assertEquals(-3L, externalClients.getActualClient().decr(key)); + } + + @Test + public void setThenDecrThenIncr_yieldsExpectedValue() throws UserCodeExecutionException { + String key = UUID.randomUUID().toString(); + externalClients.getActualClient().setex(key, 100L, Duration.standardHours(1L)); + assertEquals(100L, externalClients.getActualClient().getLong(key)); + for (long i = 0; i < 100L; i++) { + externalClients.getActualClient().decr(key); + } + assertEquals(0L, externalClients.getActualClient().getLong(key)); + for (long i = 0; i < 100L; i++) { + externalClients.getActualClient().incr(key); + } + assertEquals(100L, externalClients.getActualClient().getLong(key)); + } + + @Test + public void givenKeyNotExists_getLong_yieldsZero() throws UserCodeExecutionException { + assertEquals(0L, externalClients.getActualClient().getLong(UUID.randomUUID().toString())); + } +} diff --git a/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/RedisExternalResourcesRule.java b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/RedisExternalResourcesRule.java new file mode 100644 index 0000000000000..5c77dde387b0b --- /dev/null +++ b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/RedisExternalResourcesRule.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.io.requestresponse; + +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; + +import java.net.URI; +import java.util.function.Supplier; +import org.checkerframework.checker.nullness.qual.MonotonicNonNull; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.junit.rules.ExternalResource; +import redis.clients.jedis.JedisPooled; + +/** + * {@link org.junit.runners.JUnit4} {@link org.junit.Rule} for {@link JedisPooled} based clients. + */ +class RedisExternalResourcesRule extends ExternalResource { + + private final Supplier configurationSupplier; + + private @MonotonicNonNull JedisPooled validatingClient; + private @MonotonicNonNull RedisClient actualClient; + + RedisExternalResourcesRule(Supplier configurationSupplier) { + this.configurationSupplier = configurationSupplier; + } + + @Override + protected void before() throws Throwable { + URI uri = configurationSupplier.get(); + validatingClient = new JedisPooled(uri); + actualClient = new RedisClient(uri); + + validatingClient.ping(); + actualClient.setup(); + } + + @Override + protected void after() { + getValidatingClient().close(); + try { + getActualClient().teardown(); + } catch (UserCodeExecutionException e) { + throw new RuntimeException(e); + } + } + + @NonNull + JedisPooled getValidatingClient() { + return checkStateNotNull(validatingClient); + } + + public RedisClient getActualClient() { + return checkStateNotNull(actualClient); + } +} diff --git a/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/SetupTeardownTest.java b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/SetupTeardownTest.java new file mode 100644 index 0000000000000..eade6588955d4 --- /dev/null +++ b/sdks/java/io/rrio/src/test/java/org/apache/beam/io/requestresponse/SetupTeardownTest.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.io.requestresponse; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; + +import org.apache.beam.sdk.coders.VarIntCoder; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.util.SerializableUtils; +import org.apache.beam.sdk.util.UserCodeException; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.UncheckedExecutionException; +import org.junit.Rule; +import org.junit.Test; + +public class SetupTeardownTest { + @Rule public TestPipeline pipeline = TestPipeline.create(); + + @Test + public void canSerializeImplementingClasses() { + SerializableUtils.serializeToByteArray(new SetupTeardownImpl()); + } + + @Test + public void canSerializeWhenUsedInDoFn() { + pipeline + .apply(Create.of(1)) + .apply(ParDo.of(new SetupTeardownUsingDoFn(new SetupTeardownImpl()))) + .setCoder(VarIntCoder.of()); + + pipeline.run(); + } + + @Test + public void canSignalQuotaException() { + pipeline + .apply(Create.of(1)) + .apply(ParDo.of(new SetupTeardownUsingDoFn(new ThrowsQuotaException()))) + .setCoder(VarIntCoder.of()); + + UncheckedExecutionException exception = + assertThrows(UncheckedExecutionException.class, pipeline::run); + UserCodeException userCodeException = (UserCodeException) exception.getCause(); + assertEquals(UserCodeQuotaException.class, userCodeException.getCause().getClass()); + } + + @Test + public void canSignalTimeoutException() { + pipeline + .apply(Create.of(1)) + .apply(ParDo.of(new SetupTeardownUsingDoFn(new ThrowsTimeoutException()))) + .setCoder(VarIntCoder.of()); + + UncheckedExecutionException exception = + assertThrows(UncheckedExecutionException.class, pipeline::run); + UserCodeException userCodeException = (UserCodeException) exception.getCause(); + assertEquals(UserCodeTimeoutException.class, userCodeException.getCause().getClass()); + } + + private static class SetupTeardownUsingDoFn extends DoFn { + private final SetupTeardown setupTeardown; + + private SetupTeardownUsingDoFn(SetupTeardown setupTeardown) { + this.setupTeardown = setupTeardown; + } + + @Setup + public void setup() throws UserCodeExecutionException { + setupTeardown.setup(); + } + + @Teardown + public void teardown() throws UserCodeExecutionException { + setupTeardown.teardown(); + } + + @ProcessElement + public void process() {} + } + + private static class SetupTeardownImpl implements SetupTeardown { + @Override + public void setup() throws UserCodeExecutionException {} + + @Override + public void teardown() throws UserCodeExecutionException {} + } + + private static class ThrowsQuotaException implements SetupTeardown { + + @Override + public void setup() throws UserCodeExecutionException { + throw new UserCodeQuotaException("quota"); + } + + @Override + public void teardown() throws UserCodeExecutionException {} + } + + private static class ThrowsTimeoutException implements SetupTeardown { + + @Override + public void setup() throws UserCodeExecutionException { + throw new UserCodeTimeoutException("timeout"); + } + + @Override + public void teardown() throws UserCodeExecutionException {} + } +} diff --git a/sdks/java/io/snowflake/src/test/java/org/apache/beam/sdk/io/snowflake/test/unit/read/SnowflakeIOReadTest.java b/sdks/java/io/snowflake/src/test/java/org/apache/beam/sdk/io/snowflake/test/unit/read/SnowflakeIOReadTest.java index 
06cbd99f8e037..df70fd87aac92 100644 --- a/sdks/java/io/snowflake/src/test/java/org/apache/beam/sdk/io/snowflake/test/unit/read/SnowflakeIOReadTest.java +++ b/sdks/java/io/snowflake/src/test/java/org/apache/beam/sdk/io/snowflake/test/unit/read/SnowflakeIOReadTest.java @@ -24,7 +24,7 @@ import org.apache.avro.generic.GenericRecordBuilder; import org.apache.beam.sdk.Pipeline.PipelineExecutionException; import org.apache.beam.sdk.extensions.avro.coders.AvroCoder; -import org.apache.beam.sdk.io.AvroGeneratedUser; +import org.apache.beam.sdk.extensions.avro.io.AvroGeneratedUser; import org.apache.beam.sdk.io.snowflake.SnowflakeIO; import org.apache.beam.sdk.io.snowflake.services.SnowflakeServices; import org.apache.beam.sdk.io.snowflake.test.FakeSnowflakeBasicDataSource; diff --git a/sdks/java/io/splunk/build.gradle b/sdks/java/io/splunk/build.gradle index dd1b15e10dde9..41a7a409e8904 100644 --- a/sdks/java/io/splunk/build.gradle +++ b/sdks/java/io/splunk/build.gradle @@ -37,6 +37,7 @@ dependencies { implementation library.java.joda_time implementation library.java.slf4j_api implementation library.java.vendored_guava_32_1_2_jre + implementation library.java.commons_io testImplementation library.java.junit testImplementation group: 'org.mock-server', name: 'mockserver-junit-rule', version: '5.10.0' testImplementation group: 'org.mock-server', name: 'mockserver-client-java', version: '5.10.0' diff --git a/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/HttpEventPublisher.java b/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/HttpEventPublisher.java index 6c5537990bdfd..f34fcb7c4e0e1 100644 --- a/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/HttpEventPublisher.java +++ b/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/HttpEventPublisher.java @@ -22,9 +22,11 @@ import com.google.api.client.http.ByteArrayContent; import com.google.api.client.http.GZipEncoding; import com.google.api.client.http.GenericUrl; +import com.google.api.client.http.HttpBackOffIOExceptionHandler; import com.google.api.client.http.HttpBackOffUnsuccessfulResponseHandler; import com.google.api.client.http.HttpBackOffUnsuccessfulResponseHandler.BackOffRequired; import com.google.api.client.http.HttpContent; +import com.google.api.client.http.HttpIOExceptionHandler; import com.google.api.client.http.HttpMediaType; import com.google.api.client.http.HttpRequest; import com.google.api.client.http.HttpRequestFactory; @@ -139,6 +141,9 @@ HttpResponse execute(List events) throws IOException { responseHandler.setBackOffRequired(BackOffRequired.ON_SERVER_ERROR); request.setUnsuccessfulResponseHandler(responseHandler); + HttpIOExceptionHandler ioExceptionHandler = + new HttpBackOffIOExceptionHandler(getConfiguredBackOff()); + request.setIOExceptionHandler(ioExceptionHandler); setHeaders(request, token()); return request.execute(); @@ -180,6 +185,10 @@ void close() throws IOException { */ private void setHeaders(HttpRequest request, String token) { request.getHeaders().setAuthorization(String.format(AUTHORIZATION_SCHEME, token)); + + if (enableGzipHttpCompression()) { + request.getHeaders().setContentEncoding("gzip"); + } } /** diff --git a/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/SplunkEvent.java b/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/SplunkEvent.java index 7dd78e1754b4c..177900a2d09a6 100644 --- a/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/SplunkEvent.java +++ 
b/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/SplunkEvent.java @@ -20,9 +20,9 @@ import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull; import com.google.auto.value.AutoValue; +import com.google.gson.JsonObject; import com.google.gson.annotations.SerializedName; -import org.apache.beam.sdk.schemas.AutoValueSchema; -import org.apache.beam.sdk.schemas.annotations.DefaultSchema; +import org.apache.beam.sdk.coders.DefaultCoder; import org.checkerframework.checker.nullness.qual.Nullable; /** @@ -39,7 +39,7 @@ *
  • index * */ -@DefaultSchema(AutoValueSchema.class) +@DefaultCoder(SplunkEventCoder.class) @AutoValue public abstract class SplunkEvent { @@ -59,6 +59,8 @@ public static Builder newBuilder() { public abstract @Nullable String index(); + public abstract @Nullable JsonObject fields(); + public abstract @Nullable String event(); /** A builder class for creating a {@link SplunkEvent}. */ @@ -75,6 +77,8 @@ public abstract static class Builder { abstract Builder setIndex(String index); + abstract Builder setFields(JsonObject fields); + abstract Builder setEvent(String event); abstract String event(); @@ -136,6 +140,17 @@ public Builder withIndex(String index) { return setIndex(index); } + /** + * Assigns fields value to the event metadata. + * + * @param fields fields value to assign + */ + public Builder withFields(JsonObject fields) { + checkNotNull(fields, "withFields(fields) called with null input."); + + return setFields(fields); + } + /** * Assigns the event payload to be sent to the HEC endpoint. * diff --git a/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/SplunkEventCoder.java b/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/SplunkEventCoder.java new file mode 100644 index 0000000000000..35d5314ae9eed --- /dev/null +++ b/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/SplunkEventCoder.java @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.splunk; + +import com.google.gson.Gson; +import com.google.gson.JsonObject; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import org.apache.beam.sdk.coders.AtomicCoder; +import org.apache.beam.sdk.coders.BigEndianLongCoder; +import org.apache.beam.sdk.coders.CoderException; +import org.apache.beam.sdk.coders.CoderProvider; +import org.apache.beam.sdk.coders.CoderProviders; +import org.apache.beam.sdk.coders.NullableCoder; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.values.TypeDescriptor; +import org.apache.commons.io.IOUtils; + +/** A {@link org.apache.beam.sdk.coders.Coder} for {@link SplunkEvent} objects. 
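Because SplunkEvent now carries an optional JsonObject of custom fields, the coder below serializes them as a nullable JSON string alongside the other members. A quick round trip using the PR's own API (the metadata key/value are illustrative):

JsonObject fields = new JsonObject();
fields.addProperty("environment", "staging"); // illustrative custom field

SplunkEvent event = SplunkEvent.newBuilder().withEvent("test-event").withFields(fields).build();

ByteArrayOutputStream out = new ByteArrayOutputStream();
SplunkEventCoder.of().encode(event, out);
SplunkEvent decoded = SplunkEventCoder.of().decode(new ByteArrayInputStream(out.toByteArray()));
// decoded equals event, including the fields JsonObject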
*/ +public class SplunkEventCoder extends AtomicCoder<SplunkEvent> { + + private static final SplunkEventCoder SPLUNK_EVENT_CODER = new SplunkEventCoder(); + + private static final TypeDescriptor<SplunkEvent> TYPE_DESCRIPTOR = + new TypeDescriptor<SplunkEvent>() {}; + private static final StringUtf8Coder STRING_UTF_8_CODER = StringUtf8Coder.of(); + private static final NullableCoder<String> STRING_NULLABLE_CODER = + NullableCoder.of(STRING_UTF_8_CODER); + private static final NullableCoder<Long> LONG_NULLABLE_CODER = + NullableCoder.of(BigEndianLongCoder.of()); + + private static final Gson GSON = new Gson(); + + // Version markers must be >= 2 (0 and 1 are ambiguous with the nullable-field indicator of + // unversioned data). + private static final int VERSION_3 = 3; + + public static SplunkEventCoder of() { + return SPLUNK_EVENT_CODER; + } + + public static CoderProvider getCoderProvider() { + return CoderProviders.forCoder(TYPE_DESCRIPTOR, SplunkEventCoder.of()); + } + + @Override + @SuppressWarnings("nullness") + public void encode(SplunkEvent value, OutputStream out) throws IOException { + out.write(VERSION_3); + + LONG_NULLABLE_CODER.encode(value.time(), out); + STRING_NULLABLE_CODER.encode(value.host(), out); + STRING_NULLABLE_CODER.encode(value.source(), out); + STRING_NULLABLE_CODER.encode(value.sourceType(), out); + STRING_NULLABLE_CODER.encode(value.index(), out); + String fields = value.fields() == null ? null : value.fields().toString(); + STRING_NULLABLE_CODER.encode(fields, out); + STRING_UTF_8_CODER.encode(value.event(), out); + } + + @Override + public SplunkEvent decode(InputStream in) throws CoderException, IOException { + SplunkEvent.Builder builder = SplunkEvent.newBuilder(); + + int v = in.read(); + + // Versions 1 and 2 of this coder had no version marker field, but the 1st byte in the + // serialized data was always 0 or 1 (present/not present indicator for a nullable field). + // So here we assume that if the first byte is >= 2 then it's the version marker. + + if (v >= 2) { + decodeWithVersion(v, in, builder); + } else { + // It's impossible to distinguish between V1 and V2 without re-reading portions of the input + // stream twice (and without the version marker), so we must have a ByteArrayInputStream copy, + // which is guaranteed to support mark()/reset(). + + ByteArrayOutputStream os = new ByteArrayOutputStream(); + os.write(v); + IOUtils.copy(in, os); + ByteArrayInputStream streamCopy = new ByteArrayInputStream(os.toByteArray()); + + decodeVersion1or2(streamCopy, builder); + } + + return builder.build(); + } + + private void decodeWithVersion(int version, InputStream in, SplunkEvent.Builder builder) + throws IOException { + + decodeCommonFields(in, builder); + + if (version >= VERSION_3) { + String fields = STRING_NULLABLE_CODER.decode(in); + if (fields != null) { + builder.withFields(GSON.fromJson(fields, JsonObject.class)); + } + + String event = STRING_UTF_8_CODER.decode(in); + builder.withEvent(event); + } + } + + private void decodeVersion1or2(ByteArrayInputStream in, SplunkEvent.Builder builder) + throws IOException { + + decodeCommonFields(in, builder); + + in.mark(Integer.MAX_VALUE); + + // The following fields may be different between V1 and V2. + + // V1 format: <... common fields...> <event> + // V2 format: <... common fields...> <fields> <event> + + // We try to read this as V2 first. If any exception, fall back to V1. + + // Note: it's impossible to incorrectly parse V1 data with the V2 decoder (potentially causing + // corrupted fields in the message). If we try that and the 1st byte is: + // - 2 or more: decoding fails because V2 expects it to be either 0 or 1 (present indicator).
+ // - 1: this means the "event" string length is 1, so we have only 1 more byte in the stream. + // V2 decoding fails with EOF assuming 1 is the "fields" string length and reading + // at least 1 more byte. + // - 0: this means the "event" string is empty, so we have no more bytes in the stream. + // V2 decoding fails with EOF assuming 0 is the "fields" string length and reading + // the next "event" field. + + JsonObject fields = null; + String event; + + try { + // Assume V2 first. + String fieldsString = STRING_NULLABLE_CODER.decode(in); + if (fieldsString != null) { + fields = GSON.fromJson(fieldsString, JsonObject.class); + } + event = STRING_UTF_8_CODER.decode(in); + } catch (CoderException e) { + // If failed, reset the stream and parse as V1. + in.reset(); + event = STRING_UTF_8_CODER.decode(in); + } + + if (fields != null) { + builder.withFields(fields); + } + builder.withEvent(event); + } + + private void decodeCommonFields(InputStream in, SplunkEvent.Builder builder) throws IOException { + Long time = LONG_NULLABLE_CODER.decode(in); + if (time != null) { + builder.withTime(time); + } + + String host = STRING_NULLABLE_CODER.decode(in); + if (host != null) { + builder.withHost(host); + } + + String source = STRING_NULLABLE_CODER.decode(in); + if (source != null) { + builder.withSource(source); + } + + String sourceType = STRING_NULLABLE_CODER.decode(in); + if (sourceType != null) { + builder.withSourceType(sourceType); + } + + String index = STRING_NULLABLE_CODER.decode(in); + if (index != null) { + builder.withIndex(index); + } + } + + @Override + public TypeDescriptor getEncodedTypeDescriptor() { + return TYPE_DESCRIPTOR; + } + + @Override + public void verifyDeterministic() throws NonDeterministicException { + throw new NonDeterministicException( + this, "SplunkEvent can hold arbitrary instances, which may be non-deterministic."); + } +} diff --git a/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/SplunkEventWriter.java b/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/SplunkEventWriter.java index 8ec2a064ee0dd..615d4e932f4d4 100644 --- a/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/SplunkEventWriter.java +++ b/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/SplunkEventWriter.java @@ -33,8 +33,9 @@ import java.security.KeyStoreException; import java.security.NoSuchAlgorithmException; import java.security.cert.CertificateException; -import java.time.Instant; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.beam.repackaged.core.org.apache.commons.compress.utils.IOUtils; import org.apache.beam.sdk.io.FileSystems; import org.apache.beam.sdk.io.fs.MatchResult; @@ -53,8 +54,11 @@ import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.windowing.BoundedWindow; import org.apache.beam.sdk.values.KV; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.InetAddresses; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.InternetDomainName; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Duration; import org.slf4j.Logger; @@ -70,7 +74,7 @@ }) abstract class SplunkEventWriter extends DoFn, SplunkWriteError> { - 
private static final Integer DEFAULT_BATCH_COUNT = 1; + private static final Integer DEFAULT_BATCH_COUNT = 10; private static final Boolean DEFAULT_DISABLE_CERTIFICATE_VALIDATION = false; private static final Boolean DEFAULT_ENABLE_BATCH_LOGS = true; private static final Boolean DEFAULT_ENABLE_GZIP_HTTP_COMPRESSION = true; @@ -98,6 +102,13 @@ abstract class SplunkEventWriter extends DoFn, SplunkWr private static final String COUNT_STATE_NAME = "count"; private static final String TIME_ID_NAME = "expiry"; + private static final Pattern URL_PATTERN = Pattern.compile("^http(s?)://([^:]+)(:[0-9]+)?$"); + + @VisibleForTesting + protected static final String INVALID_URL_FORMAT_MESSAGE = + "Invalid url format. Url format should match PROTOCOL://HOST[:PORT], where PORT is optional. " + + "Supported Protocols are http and https. eg: http://hostname:8088"; + @StateId(BUFFER_STATE_NAME) private final StateSpec> buffer = StateSpecs.bag(); @@ -139,6 +150,7 @@ static Builder newBuilder() { public void setup() { checkArgument(url().isAccessible(), "url is required for writing events."); + checkArgument(isValidUrlFormat(url().get()), INVALID_URL_FORMAT_MESSAGE); checkArgument(token().isAccessible(), "Access token is required for writing events."); // Either user supplied or default batchCount. @@ -287,7 +299,7 @@ private void flush( response = publisher.execute(events); if (!response.isSuccessStatusCode()) { - UNSUCCESSFUL_WRITE_LATENCY_MS.update(System.nanoTime() - startTime); + UNSUCCESSFUL_WRITE_LATENCY_MS.update(nanosToMillis(System.nanoTime() - startTime)); FAILED_WRITES.inc(countState.read()); int statusCode = response.getStatusCode(); if (statusCode >= 400 && statusCode < 500) { @@ -305,7 +317,7 @@ private void flush( events, response.getStatusMessage(), response.getStatusCode(), receiver); } else { - SUCCESSFUL_WRITE_LATENCY_MS.update(Instant.now().toEpochMilli() - startTime); + SUCCESSFUL_WRITE_LATENCY_MS.update(nanosToMillis(System.nanoTime() - startTime)); SUCCESS_WRITES.inc(countState.read()); VALID_REQUESTS.inc(); SUCCESSFUL_WRITE_BATCH_SIZE.update(countState.read()); @@ -321,7 +333,7 @@ private void flush( e.getStatusCode(), e.getContent(), e.getStatusMessage()); - UNSUCCESSFUL_WRITE_LATENCY_MS.update(System.nanoTime() - startTime); + UNSUCCESSFUL_WRITE_LATENCY_MS.update(nanosToMillis(System.nanoTime() - startTime)); FAILED_WRITES.inc(countState.read()); int statusCode = e.getStatusCode(); if (statusCode >= 400 && statusCode < 500) { @@ -336,7 +348,7 @@ private void flush( } catch (IOException ioe) { LOG.error("Error writing to Splunk: {}", ioe.getMessage()); - UNSUCCESSFUL_WRITE_LATENCY_MS.update(System.nanoTime() - startTime); + UNSUCCESSFUL_WRITE_LATENCY_MS.update(nanosToMillis(System.nanoTime() - startTime)); FAILED_WRITES.inc(countState.read()); INVALID_REQUESTS.inc(); @@ -350,8 +362,21 @@ private void flush( bufferState.clear(); countState.clear(); - if (response != null) { - response.disconnect(); + // We've observed cases where errors at this point can cause the pipeline to keep retrying + // the same events over and over (e.g. from Dataflow Runner's Pub/Sub implementation). Since + // the events have either been published or wrapped for error handling, we can safely + // ignore this error, though there may or may not be a leak of some type depending on + // HttpResponse's implementation. However, any potential leak would still happen if we let + // the exception fall through, so this isn't considered a major issue. 
+ try { + if (response != null) { + response.ignore(); + } + } catch (IOException e) { + LOG.warn( + "Error ignoring response from Splunk. Messages should still have published, but there" + + " might be a connection leak.", + e); } } } @@ -426,6 +451,26 @@ public static byte[] getCertFromGcsAsBytes(String filePath) throws IOException { } } + @VisibleForTesting + static boolean isValidUrlFormat(String url) { + Matcher matcher = URL_PATTERN.matcher(url); + if (matcher.find()) { + String host = matcher.group(2); + return InetAddresses.isInetAddress(host) || InternetDomainName.isValid(host); + } + return false; + } + + /** + * Converts Nanoseconds to Milliseconds. + * + * @param ns time in nanoseconds + * @return time in milliseconds + */ + private static long nanosToMillis(long ns) { + return Math.round(((double) ns) / 1e6); + } + @AutoValue.Builder abstract static class Builder { @@ -458,6 +503,9 @@ abstract Builder setDisableCertificateValidation( */ Builder withUrl(ValueProvider url) { checkArgument(url != null, "withURL(url) called with null input."); + if (url.isAccessible()) { + checkArgument(isValidUrlFormat(url.get()), INVALID_URL_FORMAT_MESSAGE); + } return setUrl(url); } @@ -469,6 +517,7 @@ Builder withUrl(ValueProvider url) { */ Builder withUrl(String url) { checkArgument(url != null, "withURL(url) called with null input."); + checkArgument(isValidUrlFormat(url), INVALID_URL_FORMAT_MESSAGE); return setUrl(ValueProvider.StaticValueProvider.of(url)); } diff --git a/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/SplunkIO.java b/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/SplunkIO.java index bd1e716951d44..2127cc55752d3 100644 --- a/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/SplunkIO.java +++ b/sdks/java/io/splunk/src/main/java/org/apache/beam/sdk/io/splunk/SplunkIO.java @@ -159,7 +159,6 @@ public PCollection expand(PCollection input) { .withRootCaCertificatePath(rootCaCertificatePath()) .withEnableBatchLogs(enableBatchLogs()) .withEnableGzipHttpCompression(enableGzipHttpCompression()); - ; SplunkEventWriter writer = builder.build(); LOG.info("SplunkEventWriter configured"); diff --git a/sdks/java/io/splunk/src/test/java/org/apache/beam/sdk/io/splunk/SplunkEventCoderTest.java b/sdks/java/io/splunk/src/test/java/org/apache/beam/sdk/io/splunk/SplunkEventCoderTest.java new file mode 100644 index 0000000000000..8267e406960ad --- /dev/null +++ b/sdks/java/io/splunk/src/test/java/org/apache/beam/sdk/io/splunk/SplunkEventCoderTest.java @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.splunk; + +import static org.junit.Assert.assertEquals; + +import com.google.gson.JsonObject; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import org.apache.commons.codec.DecoderException; +import org.apache.commons.codec.binary.Hex; +import org.junit.Test; + +/** Unit tests for {@link SplunkEventCoder} class. */ +public class SplunkEventCoderTest { + + /** + * Test whether {@link SplunkEventCoder} is able to encode/decode a {@link SplunkEvent} correctly. + * + * @throws IOException + */ + @Test + public void testEncodeDecode() throws IOException { + + String event = "test-event"; + String host = "test-host"; + String index = "test-index"; + String source = "test-source"; + String sourceType = "test-source-type"; + Long time = 123456789L; + + SplunkEvent actualEvent = + SplunkEvent.newBuilder() + .withEvent(event) + .withHost(host) + .withIndex(index) + .withSource(source) + .withSourceType(sourceType) + .withTime(time) + .build(); + + SplunkEventCoder coder = SplunkEventCoder.of(); + try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) { + coder.encode(actualEvent, bos); + try (ByteArrayInputStream bin = new ByteArrayInputStream(bos.toByteArray())) { + SplunkEvent decodedEvent = coder.decode(bin); + assertEquals(decodedEvent, actualEvent); + } + } + } + + /** + * Test whether {@link SplunkEventCoder} is able to encode/decode a {@link SplunkEvent} with + * metadata 'fields'. + * + * @throws IOException + */ + @Test + public void testEncodeDecodeFields() throws IOException { + + String event = "test-event"; + JsonObject fields = new JsonObject(); + fields.addProperty("test-key", "test-value"); + + SplunkEvent actualEvent = SplunkEvent.newBuilder().withEvent(event).withFields(fields).build(); + + SplunkEventCoder coder = SplunkEventCoder.of(); + try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) { + coder.encode(actualEvent, bos); + try (ByteArrayInputStream bin = new ByteArrayInputStream(bos.toByteArray())) { + SplunkEvent decodedEvent = coder.decode(bin); + assertEquals(decodedEvent, actualEvent); + } + } + } + + /** + * Tests whether {@link SplunkEventCoder} is able to decode a {@link SplunkEvent} encoded using + * the older coder version 1 (commit f0ff6cc). + */ + @Test + public void testBackwardsCompatibility_canDecodeVersion1() throws IOException, DecoderException { + + SplunkEvent expectedEvent = + SplunkEvent.newBuilder() + .withEvent("e") + .withHost("h") + .withIndex("i") + .withSource("s") + .withSourceType("st") + .withTime(1234L) + .build(); + + String hex = "0100000000000004d2010168010173010273740101690165"; + SplunkEvent actualEvent = SplunkEventCoder.of().decode(fromHex(hex)); + + assertEquals(expectedEvent, actualEvent); + } + + /** + * Tests whether {@link SplunkEventCoder} is able to decode a {@link SplunkEvent} encoded using + * the older coder version 1 (commit f0ff6cc) and having an empty "event" field. + * + *
<p>
    An empty field is encoded as 00, which may look like the present/not present + * marker for the "fields" field in V2. + */ + @Test + public void testBackwardsCompatibility_canDecodeVersion1withEmptyEvent() + throws IOException, DecoderException { + + SplunkEvent expectedEvent = + SplunkEvent.newBuilder() + .withEvent("") + .withHost("h") + .withIndex("i") + .withSource("s") + .withSourceType("st") + .withTime(1234L) + .build(); + + String hex = "0100000000000004d20101680101730102737401016900"; + SplunkEvent actualEvent = SplunkEventCoder.of().decode(fromHex(hex)); + + assertEquals(expectedEvent, actualEvent); + } + + /** + * Tests whether {@link SplunkEventCoder} is able to decode a {@link SplunkEvent} encoded using + * the older coder version 1 (commit f0ff6cc) and having the "event" field of length 1. + * + *
<p>
    This is a special case when "event" is of length 1 and the first character code is 00. This + * is encoded as byte sequence 01 00 by V1 coder, which can be treated as an empty "fields" field + * by V2 decoder. + */ + @Test + public void testBackwardsCompatibility_canDecodeVersion1withEventLength1() + throws IOException, DecoderException { + + SplunkEvent expectedEvent = + SplunkEvent.newBuilder() + .withEvent(new String(new byte[] {0}, StandardCharsets.UTF_8)) + .withHost("h") + .withIndex("i") + .withSource("s") + .withSourceType("st") + .withTime(1234L) + .build(); + + String hex = "0100000000000004d2010168010173010273740101690100"; + SplunkEvent actualEvent = SplunkEventCoder.of().decode(fromHex(hex)); + + assertEquals(expectedEvent, actualEvent); + } + + /** + * Tests whether {@link SplunkEventCoder} is able to decode a {@link SplunkEvent} encoded using + * the older coder version 2 (commit 5e53040), without the newly added "fields" field. + */ + @Test + public void testBackwardsCompatibility_canDecodeVersion2() throws IOException, DecoderException { + + SplunkEvent expectedEvent = + SplunkEvent.newBuilder() + .withEvent("e") + .withHost("h") + .withIndex("i") + .withSource("s") + .withSourceType("st") + .withTime(1234L) + .build(); + + String hex = "0100000000000004d201016801017301027374010169000165"; + SplunkEvent actualEvent = SplunkEventCoder.of().decode(fromHex(hex)); + + assertEquals(expectedEvent, actualEvent); + } + + /** + * Tests whether {@link SplunkEventCoder} is able to decode a {@link SplunkEvent} encoded using + * the older coder version 2 (commit 5e53040), with the newly added "fields" field. + */ + @Test + public void testBackwardsCompatibility_canDecodeVersion2withFields() + throws IOException, DecoderException { + + JsonObject fields = new JsonObject(); + fields.addProperty("k", "v"); + + SplunkEvent expectedEvent = + SplunkEvent.newBuilder() + .withEvent("e") + .withHost("h") + .withIndex("i") + .withSource("s") + .withSourceType("st") + .withTime(1234L) + .withFields(fields) + .build(); + + String hex = "0100000000000004d20101680101730102737401016901097b226b223a2276227d0165"; + SplunkEvent actualEvent = SplunkEventCoder.of().decode(fromHex(hex)); + + assertEquals(expectedEvent, actualEvent); + } + + private static InputStream fromHex(String hex) throws DecoderException { + byte[] b = Hex.decodeHex(hex); + return new ByteArrayInputStream(b); + } +} diff --git a/sdks/java/io/splunk/src/test/java/org/apache/beam/sdk/io/splunk/SplunkEventTest.java b/sdks/java/io/splunk/src/test/java/org/apache/beam/sdk/io/splunk/SplunkEventTest.java index 29769526d248b..749086bac4354 100644 --- a/sdks/java/io/splunk/src/test/java/org/apache/beam/sdk/io/splunk/SplunkEventTest.java +++ b/sdks/java/io/splunk/src/test/java/org/apache/beam/sdk/io/splunk/SplunkEventTest.java @@ -20,6 +20,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotEquals; +import com.google.gson.JsonObject; import org.junit.Test; /** Unit tests for {@link SplunkEvent} class. 
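A note on the hex fixtures in the coder tests above: they were presumably produced by hex-dumping encoder output, and can be regenerated the same way with commons-codec, which is already on the test classpath. A sketch (the event values are arbitrary):

SplunkEvent event =
    SplunkEvent.newBuilder().withEvent("e").withHost("h").withTime(1234L).build();
ByteArrayOutputStream out = new ByteArrayOutputStream();
SplunkEventCoder.of().encode(event, out);
String hex = Hex.encodeHexString(out.toByteArray()); // paste into a fixture for fromHex(...)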
*/ @@ -34,6 +35,8 @@ public void testEquals() { String source = "test-source"; String sourceType = "test-source-type"; Long time = 123456789L; + JsonObject fields = new JsonObject(); + fields.addProperty("test-key", "test-value"); SplunkEvent actualEvent = SplunkEvent.newBuilder() @@ -43,6 +46,7 @@ public void testEquals() { .withSource(source) .withSourceType(sourceType) .withTime(time) + .withFields(fields) .create(); assertEquals( @@ -53,6 +57,7 @@ public void testEquals() { .withSource(source) .withSourceType(sourceType) .withTime(time) + .withFields(fields) .create(), actualEvent); diff --git a/sdks/java/io/splunk/src/test/java/org/apache/beam/sdk/io/splunk/SplunkEventWriterTest.java b/sdks/java/io/splunk/src/test/java/org/apache/beam/sdk/io/splunk/SplunkEventWriterTest.java index 3633844ab6d2b..f4d8c1a5e1373 100644 --- a/sdks/java/io/splunk/src/test/java/org/apache/beam/sdk/io/splunk/SplunkEventWriterTest.java +++ b/sdks/java/io/splunk/src/test/java/org/apache/beam/sdk/io/splunk/SplunkEventWriterTest.java @@ -18,6 +18,7 @@ package org.apache.beam.sdk.io.splunk; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import static org.junit.jupiter.api.Assertions.assertThrows; @@ -62,6 +63,21 @@ public static void setup() { private MockServerClient mockServerClient; + @Test + public void testMissingURLProtocol() { + assertFalse(SplunkEventWriter.isValidUrlFormat("test-url")); + } + + @Test + public void testInvalidURL() { + assertFalse(SplunkEventWriter.isValidUrlFormat("http://1.2.3")); + } + + @Test + public void testValidURL() { + assertTrue(SplunkEventWriter.isValidUrlFormat("http://test-url")); + } + @Test public void eventWriterMissingURL() { @@ -71,13 +87,51 @@ public void eventWriterMissingURL() { assertTrue(thrown.getMessage().contains("url needs to be provided")); } + @Test + public void eventWriterMissingURLProtocol() { + Exception thrown = + assertThrows( + IllegalArgumentException.class, + () -> SplunkEventWriter.newBuilder().withUrl("test-url").build()); + + assertTrue(thrown.getMessage().contains(SplunkEventWriter.INVALID_URL_FORMAT_MESSAGE)); + } + + /** Test building {@link SplunkEventWriter} with an invalid URL. */ + @Test + public void eventWriterInvalidURL() { + Exception thrown = + assertThrows( + IllegalArgumentException.class, + () -> SplunkEventWriter.newBuilder().withUrl("http://1.2.3").build()); + + assertTrue(thrown.getMessage().contains(SplunkEventWriter.INVALID_URL_FORMAT_MESSAGE)); + } + + /** + * Test building {@link SplunkEventWriter} with the 'services/collector/event' path appended to + * the URL. 
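The URL expectations above follow from the anchored pattern added to SplunkEventWriter: the regex admits only a protocol, a host, and an optional port, and the host is then further vetted via Guava's InetAddresses/InternetDomainName. A standalone illustration (the URLs are arbitrary examples):

Pattern urlPattern = Pattern.compile("^http(s?)://([^:]+)(:[0-9]+)?$"); // same regex as the PR
urlPattern.matcher("https://splunk-hec.example.com:8088").find(); // true: protocol + host + port
urlPattern.matcher("http://test-url:8088/services/collector/event").find(); // false: trailing path
urlPattern.matcher("test-url").find(); // false: missing protocol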
+ */ + @Test + public void eventWriterFullEndpoint() { + Exception thrown = + assertThrows( + IllegalArgumentException.class, + () -> + SplunkEventWriter.newBuilder() + .withUrl("http://test-url:8088/services/collector/event") + .build()); + + assertTrue(thrown.getMessage().contains(SplunkEventWriter.INVALID_URL_FORMAT_MESSAGE)); + } + @Test public void eventWriterMissingToken() { Exception thrown = assertThrows( NullPointerException.class, - () -> SplunkEventWriter.newBuilder().withUrl("test-url").build()); + () -> SplunkEventWriter.newBuilder().withUrl("http://test-url").build()); assertTrue(thrown.getMessage().contains("token needs to be provided")); } @@ -86,7 +140,7 @@ public void eventWriterMissingToken() { public void eventWriterDefaultBatchCountAndValidation() { SplunkEventWriter writer = - SplunkEventWriter.newBuilder().withUrl("test-url").withToken("test-token").build(); + SplunkEventWriter.newBuilder().withUrl("http://test-url").withToken("test-token").build(); assertNull(writer.inputBatchCount()); assertNull(writer.disableCertificateValidation()); @@ -99,7 +153,7 @@ public void eventWriterCustomBatchCountAndValidation() { Boolean certificateValidation = false; SplunkEventWriter writer = SplunkEventWriter.newBuilder() - .withUrl("test-url") + .withUrl("http://test-url") .withToken("test-token") .withInputBatchCount(StaticValueProvider.of(batchCount)) .withDisableCertificateValidation(StaticValueProvider.of(certificateValidation)) @@ -144,7 +198,6 @@ public void successfulSplunkWriteSingleBatchTest() { PCollection actual = pipeline .apply("Create Input data", Create.of(testEvents)) - // .withCoder(KvCoder.of(BigEndianIntegerCoder.of(), SplunkEventCoder.of()))) .apply( "SplunkEventWriter", ParDo.of( @@ -200,7 +253,6 @@ public void successfulSplunkWriteMultiBatchTest() { PCollection actual = pipeline .apply("Create Input data", Create.of(testEvents)) - // .withCoder(KvCoder.of(BigEndianIntegerCoder.of(), SplunkEventCoder.of()))) .apply( "SplunkEventWriter", ParDo.of( @@ -246,7 +298,6 @@ public void failedSplunkWriteSingleBatchTest() { PCollection actual = pipeline .apply("Create Input data", Create.of(testEvents)) - // .withCoder(KvCoder.of(BigEndianIntegerCoder.of(), SplunkEventCoder.of()))) .apply( "SplunkEventWriter", ParDo.of( diff --git a/sdks/java/io/splunk/src/test/java/org/apache/beam/sdk/io/splunk/SplunkIOTest.java b/sdks/java/io/splunk/src/test/java/org/apache/beam/sdk/io/splunk/SplunkIOTest.java index 32c98513ea248..d2cfd59aace2d 100644 --- a/sdks/java/io/splunk/src/test/java/org/apache/beam/sdk/io/splunk/SplunkIOTest.java +++ b/sdks/java/io/splunk/src/test/java/org/apache/beam/sdk/io/splunk/SplunkIOTest.java @@ -17,6 +17,7 @@ */ package org.apache.beam.sdk.io.splunk; +import com.google.gson.JsonObject; import java.util.List; import org.apache.beam.sdk.testing.NeedsRunner; import org.apache.beam.sdk.testing.PAssert; @@ -63,7 +64,8 @@ public void successfulSplunkIOMultiBatchNoParallelismTest() { int testPort = mockServerRule.getPort(); String url = Joiner.on(':').join("http://localhost", testPort); String token = "test-token"; - + JsonObject fields = new JsonObject(); + fields.addProperty("customfield", 1); List testEvents = ImmutableList.of( SplunkEvent.newBuilder() @@ -73,6 +75,7 @@ public void successfulSplunkIOMultiBatchNoParallelismTest() { .withSource("test-source-1") .withSourceType("test-source-type-1") .withTime(12345L) + .withFields(fields) .create(), SplunkEvent.newBuilder() .withEvent("test-event-2") @@ -81,11 +84,12 @@ public void 
successfulSplunkIOMultiBatchNoParallelismTest() { .withSource("test-source-2") .withSourceType("test-source-type-2") .withTime(12345L) + .withFields(fields) .create()); PCollection actual = pipeline - .apply("Create Input data", Create.of(testEvents)) // .withCoder(SplunkEventCoder.of())) + .apply("Create Input data", Create.of(testEvents)) .apply( "SplunkIO", SplunkIO.write(url, token).withParallelism(1).withBatchCount(testEvents.size())); @@ -132,7 +136,7 @@ public void successfulSplunkIOMultiBatchParallelismTest() { PCollection actual = pipeline - .apply("Create Input data", Create.of(testEvents)) // .withCoder(SplunkEventCoder.of())) + .apply("Create Input data", Create.of(testEvents)) .apply( "SplunkIO", SplunkIO.write(url, token) @@ -182,7 +186,7 @@ public void successfulSplunkIOSingleBatchParallelismTest() { PCollection actual = pipeline - .apply("Create Input data", Create.of(testEvents)) // .withCoder(SplunkEventCoder.of())) + .apply("Create Input data", Create.of(testEvents)) .apply( "SplunkIO", SplunkIO.write(url, token).withParallelism(testParallelism).withBatchCount(1)); diff --git a/sdks/java/io/thrift/build.gradle b/sdks/java/io/thrift/build.gradle index 6ee3314a74ff8..47c8111a0257f 100644 --- a/sdks/java/io/thrift/build.gradle +++ b/sdks/java/io/thrift/build.gradle @@ -42,6 +42,7 @@ dependencies { testImplementation library.java.junit testRuntimeOnly library.java.slf4j_jdk14 testRuntimeOnly project(path: ":runners:direct-java", configuration: "shadow") + testImplementation project(path: ":sdks:java:extensions:avro", configuration: "testRuntimeMigration") } /* Removed due to lack of Thrift on Jenkins workers. diff --git a/sdks/java/maven-archetypes/examples/build.gradle b/sdks/java/maven-archetypes/examples/build.gradle index 56b4a7c842850..1edb55a10f956 100644 --- a/sdks/java/maven-archetypes/examples/build.gradle +++ b/sdks/java/maven-archetypes/examples/build.gradle @@ -72,6 +72,13 @@ task generateSources(type: Exec) { commandLine './generate-sources.sh' } +// add dependency BeamModulePlugin defined custom tasks +// they are defined only when certain flags are provided (e.g. -Prelease; -Ppublishing, etc) +def sourcesJar = project.tasks.findByName('sourcesJar') +if (sourcesJar != null) { + sourcesJar.dependsOn generateSources +} + sourceSets { main { output.dir('src', builtBy: 'generateSources') diff --git a/sdks/java/maven-archetypes/gcp-bom-examples/build.gradle b/sdks/java/maven-archetypes/gcp-bom-examples/build.gradle index 541c91bd6adb0..f9fabcfe19b08 100644 --- a/sdks/java/maven-archetypes/gcp-bom-examples/build.gradle +++ b/sdks/java/maven-archetypes/gcp-bom-examples/build.gradle @@ -71,6 +71,12 @@ task generateSources(type: Exec) { environment "HERE", "." commandLine '../examples/generate-sources.sh' } +// add dependency BeamModulePlugin defined custom tasks +// they are defined only when certain flags are provided (e.g. 
-Prelease; -Ppublishing, etc) +def sourcesJar = project.tasks.findByName('sourcesJar') +if (sourcesJar != null) { + sourcesJar.dependsOn generateSources +} sourceSets { main { diff --git a/sdks/java/testing/jpms-tests/build.gradle b/sdks/java/testing/jpms-tests/build.gradle index 6321f874c9036..9b881d7a2a9ed 100644 --- a/sdks/java/testing/jpms-tests/build.gradle +++ b/sdks/java/testing/jpms-tests/build.gradle @@ -23,10 +23,10 @@ plugins { } // overwrite javaVersion before applyJavaNature -if (project.hasProperty("compileAndRunTestsWithJava17")) { - javaVersion = '1.17' +if (project.hasProperty("testJavaVersion")) { + javaVersion = "1.${project.getProperty('testJavaVersion')}" as String } else { - javaVersion = '1.11' + javaVersion = "1.11" } applyJavaNature( @@ -42,13 +42,14 @@ ext.summary = "E2E test for Java 9 modules" // direct compileJava to use specified java version. project.tasks.compileJava { - if (project.hasProperty("compileAndRunTestsWithJava11")) { + if (project.hasProperty('testJavaVersion')) { options.fork = true - options.forkOptions.javaHome = project.findProperty("java11Home") as File - } else if (project.hasProperty("compileAndRunTestsWithJava17")) { - options.fork = true - options.forkOptions.javaHome = project.findProperty("java17Home") as File - setJava17Options(options) + options.forkOptions.javaHome = project.findProperty("java${project.getProperty('testJavaVersion')}Home") as File + if (project.getProperty('testJavaVersion') == '17') { + setJavaVerOptions(options, '17') + } else if (project.getProperty('testJavaVersion') == '21') { + setJavaVerOptions(options, '21') + } } } @@ -117,10 +118,12 @@ plugins.withType(JavaPlugin).configureEach{ } } -// JPMS requires JDK > 8 +// JPMS requires JDK > 8. Test tasks enabled when either +// (i) testJavaVersion property specified (assumed to be >8) or; +// (ii) current Java version is greater than 8 project.tasks.each { it.onlyIf { - project.hasProperty("compileAndRunTestsWithJava17") + project.hasProperty('testJavaVersion') || JavaVersion.VERSION_1_8.compareTo(JavaVersion.current()) < 0 } } diff --git a/sdks/java/testing/test-utils/build.gradle b/sdks/java/testing/test-utils/build.gradle index 50c815dd57f7f..6e30693d8894f 100644 --- a/sdks/java/testing/test-utils/build.gradle +++ b/sdks/java/testing/test-utils/build.gradle @@ -43,24 +43,15 @@ dependencies { testRuntimeOnly project(path: ":runners:direct-java", configuration: "shadowTest") } -task verifyJavaVersion(type: Test) { - filter { - includeTestsMatching 'org.apache.beam.sdk.testutils.jvmverification.JvmVerification.verifyCodeIsCompiledWithJava8' - includeTestsMatching 'org.apache.beam.sdk.testutils.jvmverification.JvmVerification.verifyTestCodeIsCompiledWithJava11' - includeTestsMatching 'org.apache.beam.sdk.testutils.jvmverification.JvmVerification.verifyRunningJVMVersionIs11' - } - doLast { - println 'Java verified' +['11', '17', '21'].each { + tasks.create(name: "verifyJavaVersion${it}", type: Test) { + filter { + includeTestsMatching "org.apache.beam.sdk.testutils.jvmverification.JvmVerification.verifyCodeIsCompiledWithJava8" + includeTestsMatching "org.apache.beam.sdk.testutils.jvmverification.JvmVerification.verifyTestCodeIsCompiledWithJava${it}" + includeTestsMatching "org.apache.beam.sdk.testutils.jvmverification.JvmVerification.verifyRunningJVMVersionIs${it}" + } + doLast { + println 'Java verified' + } } } - -task verifyJavaVersion17(type: Test) { - filter { - includeTestsMatching 
'org.apache.beam.sdk.testutils.jvmverification.JvmVerification.verifyCodeIsCompiledWithJava8' - includeTestsMatching 'org.apache.beam.sdk.testutils.jvmverification.JvmVerification.verifyTestCodeIsCompiledWithJava17' - includeTestsMatching 'org.apache.beam.sdk.testutils.jvmverification.JvmVerification.verifyRunningJVMVersionIs17' - } - doLast { - println 'Java verified' - } -} \ No newline at end of file diff --git a/sdks/java/testing/test-utils/src/test/java/org/apache/beam/sdk/testutils/jvmverification/JvmVerification.java b/sdks/java/testing/test-utils/src/test/java/org/apache/beam/sdk/testutils/jvmverification/JvmVerification.java index ad29e8b6a1d63..a6b5d6dca6c1e 100644 --- a/sdks/java/testing/test-utils/src/test/java/org/apache/beam/sdk/testutils/jvmverification/JvmVerification.java +++ b/sdks/java/testing/test-utils/src/test/java/org/apache/beam/sdk/testutils/jvmverification/JvmVerification.java @@ -20,6 +20,7 @@ import static org.apache.beam.sdk.testutils.jvmverification.JvmVerification.Java.v11; import static org.apache.beam.sdk.testutils.jvmverification.JvmVerification.Java.v17; import static org.apache.beam.sdk.testutils.jvmverification.JvmVerification.Java.v1_8; +import static org.apache.beam.sdk.testutils.jvmverification.JvmVerification.Java.v21; import static org.junit.Assert.assertEquals; import java.io.IOException; @@ -39,6 +40,7 @@ public class JvmVerification { versionMapping.put("0034", v1_8); versionMapping.put("0037", v11); versionMapping.put("003d", v17); + versionMapping.put("0041", v21); } // bytecode @@ -62,6 +64,11 @@ public void verifyTestCodeIsCompiledWithJava17() throws IOException { assertEquals(v17, getByteCodeVersion(JvmVerification.class)); } + @Test + public void verifyTestCodeIsCompiledWithJava21() throws IOException { + assertEquals(v21, getByteCodeVersion(JvmVerification.class)); + } + // jvm @Test public void verifyRunningJVMVersionIs11() { @@ -75,6 +82,12 @@ public void verifyRunningJVMVersionIs17() { assertEquals(v17.name, version); } + @Test + public void verifyRunningJVMVersionIs21() { + final String version = getJavaSpecification(); + assertEquals(v21.name, version); + } + private static Java getByteCodeVersion(final Class clazz) throws IOException { final InputStream stream = clazz.getClassLoader().getResourceAsStream(clazz.getName().replace(".", "/") + ".class"); @@ -91,7 +104,8 @@ private static String getJavaSpecification() { enum Java { v1_8("1.8"), v11("11"), - v17("17"); + v17("17"), + v21("21"); final String name; diff --git a/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/SqlTransformRunner.java b/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/SqlTransformRunner.java index e8b85f63b36a8..1550a25b7c8f1 100644 --- a/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/SqlTransformRunner.java +++ b/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/SqlTransformRunner.java @@ -39,12 +39,16 @@ import org.apache.beam.sdk.extensions.sql.meta.provider.text.TextTable; import org.apache.beam.sdk.io.TextIO; import org.apache.beam.sdk.io.parquet.ParquetIO; +import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.SchemaCoder; import org.apache.beam.sdk.testutils.publishing.InfluxDBPublisher; import org.apache.beam.sdk.testutils.publishing.InfluxDBSettings; +import org.apache.beam.sdk.transforms.DoFn; import 
org.apache.beam.sdk.transforms.MapElements;
+import org.apache.beam.sdk.transforms.ParDo;
 import org.apache.beam.sdk.values.PCollection;
 import org.apache.beam.sdk.values.PCollectionTuple;
 import org.apache.beam.sdk.values.Row;
@@ -81,6 +85,9 @@ public class SqlTransformRunner {
 
   private static final Logger LOG = LoggerFactory.getLogger(SqlTransformRunner.class);
 
+  static final String METRICS_NAMESPACE = "tpcds";
+  static final String OUTPUT_COUNTER = "output_rows";
+
   /** This class is used to extract all SQL query identifiers. */
   static class SqlIdentifierVisitor extends SqlBasicVisitor {
     private final Set identifiers = new HashSet<>();
@@ -283,7 +290,8 @@ public static void runUsingSqlTransform(String[] args) throws Exception {
     // Make an array of pipelines, each pipeline is responsible for running a corresponding query.
     Pipeline[] pipelines = new Pipeline[queryNames.length];
-    CSVFormat csvFormat = CSVFormat.MYSQL.withDelimiter('|').withNullString("");
+    CSVFormat csvFormat =
+        CSVFormat.MYSQL.withDelimiter('|').withTrailingDelimiter().withNullString("");
 
     // Execute all queries, transform each result into a PCollection, write them into
     // the txt file and store in a GCP directory.
@@ -304,6 +312,7 @@ public static void runUsingSqlTransform(String[] args) throws Exception {
           tables
               .apply(SqlTransform.query(queryString))
               .apply(MapElements.into(TypeDescriptors.strings()).via(Row::toString))
+              .apply(ParDo.of(new CounterDoFn()))
               .apply(
                   TextIO.write()
                       .to(
@@ -395,4 +404,14 @@ private static InfluxDBSettings getInfluxSettings(final TpcdsOptions options) {
         .withRetentionPolicy(options.getInfluxRetentionPolicy())
         .get();
   }
+
+  private static class CounterDoFn extends DoFn {
+    private final Counter counter = Metrics.counter(METRICS_NAMESPACE, OUTPUT_COUNTER);
+
+    @ProcessElement
+    public void processElement(ProcessContext context) {
+      counter.inc();
+      context.output(context.element());
+    }
+  }
 }
diff --git a/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/TpcdsRun.java b/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/TpcdsRun.java
index b6235db1c1233..700de369b6091 100644
--- a/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/TpcdsRun.java
+++ b/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/TpcdsRun.java
@@ -17,10 +17,17 @@
  */
 package org.apache.beam.sdk.tpcds;
 
+import static org.apache.beam.sdk.tpcds.SqlTransformRunner.METRICS_NAMESPACE;
+import static org.apache.beam.sdk.tpcds.SqlTransformRunner.OUTPUT_COUNTER;
+
 import java.util.concurrent.Callable;
 import org.apache.beam.sdk.Pipeline;
 import org.apache.beam.sdk.PipelineResult;
 import org.apache.beam.sdk.PipelineResult.State;
+import org.apache.beam.sdk.metrics.MetricNameFilter;
+import org.apache.beam.sdk.metrics.MetricQueryResults;
+import org.apache.beam.sdk.metrics.MetricResult;
+import org.apache.beam.sdk.metrics.MetricsFilter;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -48,6 +55,35 @@ public TpcdsRunResult call() {
       // Make sure to set the job status to be successful only when pipelineResult's final state is
       // DONE.
       boolean isSuccessful = state == State.DONE;
+
+      // Check the number of output records - it MUST be greater than 0.
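+      // Note on the check below: the counter is read via getAttempted() because
+      // committed metric values are not supported by every runner, while
+      // attempted values are always queryable and are an upper bound on the
+      // committed count.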
+      if (isSuccessful) {
+        long outputRecords = 0;
+        MetricQueryResults metrics =
+            pipelineResult
+                .metrics()
+                .queryMetrics(
+                    MetricsFilter.builder()
+                        .addNameFilter(MetricNameFilter.named(METRICS_NAMESPACE, OUTPUT_COUNTER))
+                        .build());
+        if (metrics.getCounters().iterator().hasNext()) {
+          // Although it is an iterable, it should contain only one entry
+          MetricResult metricResult = metrics.getCounters().iterator().next();
+          if (metricResult.getAttempted() != null && metricResult.getAttempted() > 0) {
+            outputRecords = metricResult.getAttempted();
+          }
+        }
+
+        // A successful job is expected to produce a greater-than-zero number of output records.
+        if (outputRecords <= 0) {
+          LOG.warn(
+              "Output records counter for job \"{}\" is {}",
+              pipeline.getOptions().getJobName(),
+              outputRecords);
+          isSuccessful = false;
+        }
+      }
+
       tpcdsRunResult =
           new TpcdsRunResult(
               isSuccessful, startTimeStamp, endTimeStamp, pipeline.getOptions(), pipelineResult);
diff --git a/sdks/java/transform-service/docker-compose/.env b/sdks/java/transform-service/docker-compose/.env
index 5de5982cfa301..ed27b267fed37 100644
--- a/sdks/java/transform-service/docker-compose/.env
+++ b/sdks/java/transform-service/docker-compose/.env
@@ -12,6 +12,14 @@
 BEAM_VERSION=$BEAM_VERSION
 CREDENTIALS_VOLUME=$CREDENTIALS_VOLUME
+DEPENDENCIES_VOLUME=$DEPENDENCIES_VOLUME
+
+# A requirements file with either of the following
+# * PyPi packages
+# * Locally available packages relative to the directory provided to
+#   DEPENDENCIES_VOLUME.
+PYTHON_REQUIREMENTS_FILE_NAME=$PYTHON_REQUIREMENTS_FILE_NAME
+
 GOOGLE_APPLICATION_CREDENTIALS_FILE_NAME=application_default_credentials.json
 COMPOSE_PROJECT_NAME=apache.beam.transform.service
 TRANSFORM_SERVICE_PORT=$TRANSFORM_SERVICE_PORT
diff --git a/sdks/java/transform-service/docker-compose/docker-compose.yml b/sdks/java/transform-service/docker-compose/docker-compose.yml
index b685be10a329b..39235533b9a86 100644
--- a/sdks/java/transform-service/docker-compose/docker-compose.yml
+++ b/sdks/java/transform-service/docker-compose/docker-compose.yml
@@ -32,8 +32,9 @@ services:
   expansion-service-2:
     image: "apache/beam_python_expansion_service:${BEAM_VERSION}"
     restart: on-failure
-    command: -id expansion-service-2 -port 5001
+    command: -id expansion-service-2 -port 5001 -requirements_file ${PYTHON_REQUIREMENTS_FILE_NAME} -dependencies_dir '/dependencies_volume'
     volumes:
       - ${CREDENTIALS_VOLUME}:/credentials_volume
+      - ${DEPENDENCIES_VOLUME}:/dependencies_volume
     environment:
       - GOOGLE_APPLICATION_CREDENTIALS=/credentials_volume/${GOOGLE_APPLICATION_CREDENTIALS_FILE_NAME}
diff --git a/sdks/java/transform-service/launcher/build.gradle b/sdks/java/transform-service/launcher/build.gradle
index 83c5d60a1ef1f..0952f37109eb9 100644
--- a/sdks/java/transform-service/launcher/build.gradle
+++ b/sdks/java/transform-service/launcher/build.gradle
@@ -45,6 +45,9 @@ dependencies {
   shadow library.java.args4j
   shadow library.java.error_prone_annotations
   permitUnusedDeclared(library.java.error_prone_annotations)
+  testImplementation library.java.junit
+  testImplementation library.java.mockito_core
+  testImplementation project(path: ":sdks:java:core")
 }
 
 sourceSets {
diff --git a/sdks/java/transform-service/launcher/src/main/java/org/apache/beam/sdk/transformservice/launcher/TransformServiceLauncher.java b/sdks/java/transform-service/launcher/src/main/java/org/apache/beam/sdk/transformservice/launcher/TransformServiceLauncher.java
index f52fdfed710d2..c0a9097a762fa 100644
---
a/sdks/java/transform-service/launcher/src/main/java/org/apache/beam/sdk/transformservice/launcher/TransformServiceLauncher.java +++ b/sdks/java/transform-service/launcher/src/main/java/org/apache/beam/sdk/transformservice/launcher/TransformServiceLauncher.java @@ -17,9 +17,11 @@ */ package org.apache.beam.sdk.transformservice.launcher; +import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; @@ -28,6 +30,7 @@ import java.util.Locale; import java.util.Map; import java.util.concurrent.TimeoutException; +import java.util.stream.Stream; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.ByteStreams; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.Files; @@ -62,9 +65,9 @@ public class TransformServiceLauncher { private static final int STATUS_LOGGER_WAIT_TIME = 3000; @SuppressWarnings("argument") - private TransformServiceLauncher(@Nullable String projectName, int port) throws IOException { - LOG.info("Initializing the Beam Transform Service {}.", projectName); - + private TransformServiceLauncher( + @Nullable String projectName, int port, @Nullable String pythonRequirementsFile) + throws IOException { String tmpDirLocation = System.getProperty("java.io.tmpdir"); // We use Docker Compose project name as the name of the temporary directory to isolate // different transform service instances that may be running in the same machine. @@ -83,14 +86,14 @@ private TransformServiceLauncher(@Nullable String projectName, int port) throws ByteStreams.copy(getClass().getResourceAsStream("/.env"), fout); } + // Setting up the credentials directory. File credentialsDir = Paths.get(tmpDir, "credentials_dir").toFile(); - LOG.info( - "Creating a temporary directory for storing credentials: " - + credentialsDir.getAbsolutePath()); - if (credentialsDir.exists()) { LOG.info("Reusing the existing credentials directory " + credentialsDir.getAbsolutePath()); } else { + LOG.info( + "Creating a temporary directory for storing credentials: " + + credentialsDir.getAbsolutePath()); if (!credentialsDir.mkdir()) { throw new IOException( "Could not create a temporary directory for storing credentials: " @@ -124,10 +127,84 @@ private TransformServiceLauncher(@Nullable String projectName, int port) throws } } + // Setting up the dependencies directory. + File dependenciesDir = Paths.get(tmpDir, "dependencies_dir").toFile(); + Path updatedRequirementsFilePath = Paths.get(dependenciesDir.toString(), "requirements.txt"); + if (dependenciesDir.exists()) { + LOG.info("Reusing the existing dependencies directory " + dependenciesDir.getAbsolutePath()); + } else { + LOG.info( + "Creating a temporary directory for storing dependencies: " + + dependenciesDir.getAbsolutePath()); + if (!dependenciesDir.mkdir()) { + throw new IOException( + "Could not create a temporary directory for storing dependencies: " + + dependenciesDir.getAbsolutePath()); + } + + // We create a requirements file with extra dependencies. + // If there are no extra dependencies, we just provide an empty requirements file. + File file = updatedRequirementsFilePath.toFile(); + if (!file.createNewFile()) { + throw new IOException( + "Could not create the new requirements file " + updatedRequirementsFilePath); + } + + // Updating dependencies. 
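+      // Illustrative sketch of the rewrite performed below (hypothetical
+      // paths): a user-provided requirements file containing
+      //   pypipackage1
+      //   /tmp/artifacts/local_pkg.tar.gz
+      // becomes a dependencies_dir/requirements.txt containing
+      //   pypipackage1
+      //   local_pkg.tar.gz
+      // with local_pkg.tar.gz copied into the dependencies directory, so the
+      // expansion service containers can resolve it from the mounted volume.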
+ if (pythonRequirementsFile != null) { + Path requirementsFilePath = Paths.get(pythonRequirementsFile); + List updatedLines = new ArrayList<>(); + + try (Stream lines = java.nio.file.Files.lines(requirementsFilePath)) { + lines.forEachOrdered( + line -> { + Path dependencyFilePath = Paths.get(line); + if (java.nio.file.Files.exists(dependencyFilePath)) { + Path fileName = dependencyFilePath.getFileName(); + if (fileName == null) { + throw new IllegalArgumentException( + "Could not determine the filename of the local artifact " + + dependencyFilePath); + } + try { + java.nio.file.Files.copy( + dependencyFilePath, + Paths.get(dependenciesDir.toString(), fileName.toString())); + } catch (IOException e) { + throw new RuntimeException(e); + } + updatedLines.add(fileName.toString()); + } else { + updatedLines.add(line); + } + }); + } + + try (BufferedWriter writer = + java.nio.file.Files.newBufferedWriter(file.toPath(), StandardCharsets.UTF_8)) { + for (String line : updatedLines) { + writer.write(line); + writer.newLine(); + } + writer.flush(); + } + } + } + // Setting environment variables used by the docker-compose.yml file. environmentVariables.put("CREDENTIALS_VOLUME", credentialsDir.getAbsolutePath()); + environmentVariables.put("DEPENDENCIES_VOLUME", dependenciesDir.getAbsolutePath()); environmentVariables.put("TRANSFORM_SERVICE_PORT", String.valueOf(port)); + Path updatedRequirementsFileName = updatedRequirementsFilePath.getFileName(); + if (updatedRequirementsFileName == null) { + throw new IllegalArgumentException( + "Could not determine the file name of the updated requirements file " + + updatedRequirementsFilePath); + } + environmentVariables.put( + "PYTHON_REQUIREMENTS_FILE_NAME", updatedRequirementsFileName.toString()); + // Building the Docker Compose command. dockerComposeStartCommandPrefix.add("docker-compose"); dockerComposeStartCommandPrefix.add("-p"); @@ -136,21 +213,37 @@ private TransformServiceLauncher(@Nullable String projectName, int port) throws dockerComposeStartCommandPrefix.add(dockerComposeFile.getAbsolutePath()); } + /** + * Specifies the Beam version to get containers for the transform service. + * + *
<p>
Could be a release Beam version with containers in Docker Hub or an unreleased Beam version
+   * for which containers are available locally.
+   *
+   * @param beamVersion a Beam version to get containers from.
+   */
   public void setBeamVersion(String beamVersion) {
     environmentVariables.put("BEAM_VERSION", beamVersion);
   }
 
-  public void setPythonExtraPackages(String pythonExtraPackages) {
-    environmentVariables.put("$PYTHON_EXTRA_PACKAGES", pythonExtraPackages);
-  }
-
+  /**
+   * Initializes a client for managing transform service instances.
+   *
+   * @param projectName project name for the transform service.
+   * @param port port exposed by the transform service.
+   * @param pythonRequirementsFile a requirements file with extra dependencies for the Python
+   *     expansion services.
+   * @return an initialized client for managing the transform service.
+   * @throws IOException if the working directories for the service could not be set up.
+   */
   public static synchronized TransformServiceLauncher forProject(
-      @Nullable String projectName, int port) throws IOException {
+      @Nullable String projectName, int port, @Nullable String pythonRequirementsFile)
+      throws IOException {
     if (projectName == null || projectName.isEmpty()) {
       projectName = DEFAULT_PROJECT_NAME;
     }
     if (!launchers.containsKey(projectName)) {
-      launchers.put(projectName, new TransformServiceLauncher(projectName, port));
+      launchers.put(
+          projectName, new TransformServiceLauncher(projectName, port, pythonRequirementsFile));
     }
     return launchers.get(projectName);
   }
@@ -200,10 +293,10 @@ public synchronized void status() throws IOException {
 
   public synchronized void waitTillUp(int timeout) throws IOException, TimeoutException {
     timeout = timeout <= 0 ? DEFAULT_START_WAIT_TIME : timeout;
-    String statusFileName = getStatus();
 
     long startTime = System.currentTimeMillis();
     while (System.currentTimeMillis() - startTime < timeout) {
+      String statusFileName = getStatus();
       try {
         // We are just waiting for a local process. No need for exponential backoff.
         this.wait(1000);
@@ -226,6 +319,7 @@ public synchronized void waitTillUp(int timeout) throws IOException, TimeoutExce
 
   private synchronized String getStatus() throws IOException {
     File outputOverride = File.createTempFile("output_override", null);
+    outputOverride.deleteOnExit();
     runDockerComposeCommand(ImmutableList.of("ps"), outputOverride);
 
     return outputOverride.getAbsolutePath();
@@ -238,6 +332,8 @@ private static class ArgConfig {
     static final String PORT_ARG_NAME = "port";
     static final String BEAM_VERSION_ARG_NAME = "beam_version";
 
+    static final String PYTHON_REQUIREMENTS_FILE_ARG_NAME = "python_requirements_file";
+
     @Option(name = "--" + PROJECT_NAME_ARG_NAME, usage = "Docker compose project name")
     private String projectName = "";
 
@@ -249,6 +345,11 @@ private static class ArgConfig {
 
     @Option(name = "--" + BEAM_VERSION_ARG_NAME, usage = "Beam version to use.")
     private String beamVersion = "";
+
+    @Option(
+        name = "--" + PYTHON_REQUIREMENTS_FILE_ARG_NAME,
+        usage = "Extra Python packages in the form of a requirements file.")
+    private String pythonRequirementsFile = "";
   }
 
   public static void main(String[] args) throws IOException, TimeoutException {
@@ -288,8 +389,12 @@ public static void main(String[] args) throws IOException, TimeoutException {
                 : ("port " + Integer.toString(config.port) + ".")));
     System.out.println("===================================================");
 
+    String pythonRequirementsFile =
+        !config.pythonRequirementsFile.isEmpty() ?
config.pythonRequirementsFile : null; + TransformServiceLauncher service = - TransformServiceLauncher.forProject(config.projectName, config.port); + TransformServiceLauncher.forProject( + config.projectName, config.port, pythonRequirementsFile); if (!config.beamVersion.isEmpty()) { service.setBeamVersion(config.beamVersion); } diff --git a/sdks/java/transform-service/launcher/src/test/java/org/apache/beam/sdk/transformservice/launcher/TransformServiceLauncherTest.java b/sdks/java/transform-service/launcher/src/test/java/org/apache/beam/sdk/transformservice/launcher/TransformServiceLauncherTest.java new file mode 100644 index 0000000000000..4ef84b02061be --- /dev/null +++ b/sdks/java/transform-service/launcher/src/test/java/org/apache/beam/sdk/transformservice/launcher/TransformServiceLauncherTest.java @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.transformservice.launcher; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.UUID; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class TransformServiceLauncherTest { + + @Test + public void testLauncherCreatesCredentialsDir() throws IOException { + String projectName = UUID.randomUUID().toString(); + Path expectedTempDir = Paths.get(System.getProperty("java.io.tmpdir"), projectName); + File file = expectedTempDir.toFile(); + file.deleteOnExit(); + TransformServiceLauncher.forProject(projectName, 12345, null); + Path expectedCredentialsDir = Paths.get(expectedTempDir.toString(), "credentials_dir"); + assertTrue(expectedCredentialsDir.toFile().exists()); + } + + @Test + public void testLauncherCreatesDependenciesDir() throws IOException { + String projectName = UUID.randomUUID().toString(); + Path expectedTempDir = Paths.get(System.getProperty("java.io.tmpdir"), projectName); + File file = expectedTempDir.toFile(); + file.deleteOnExit(); + TransformServiceLauncher.forProject(projectName, 12345, null); + Path expectedCredentialsDir = Paths.get(expectedTempDir.toString(), "dependencies_dir"); + assertTrue(expectedCredentialsDir.toFile().exists()); + } + + @Test + public void testLauncherInstallsDependencies() throws IOException { + String projectName = UUID.randomUUID().toString(); + Path 
expectedTempDir = Paths.get(System.getProperty("java.io.tmpdir"), projectName); + File file = expectedTempDir.toFile(); + file.deleteOnExit(); + + File requirementsFile = + Paths.get( + System.getProperty("java.io.tmpdir"), + ("requirements" + UUID.randomUUID().toString() + ".txt")) + .toFile(); + requirementsFile.deleteOnExit(); + + try (Writer fout = + new OutputStreamWriter( + new FileOutputStream(requirementsFile.getAbsolutePath()), Charsets.UTF_8)) { + fout.write("pypipackage1\n"); + fout.write("pypipackage2\n"); + } + + TransformServiceLauncher.forProject(projectName, 12345, requirementsFile.getAbsolutePath()); + + // Confirming that the Transform Service launcher created a temporary requirements file with the + // specified set of packages. + Path expectedUpdatedRequirementsFile = + Paths.get(expectedTempDir.toString(), "dependencies_dir", "requirements.txt"); + assertTrue(expectedUpdatedRequirementsFile.toFile().exists()); + + ArrayList expectedUpdatedRequirementsFileLines = new ArrayList<>(); + try (BufferedReader bufReader = + Files.newBufferedReader(expectedUpdatedRequirementsFile, UTF_8)) { + String line = bufReader.readLine(); + while (line != null) { + expectedUpdatedRequirementsFileLines.add(line); + line = bufReader.readLine(); + } + } + + assertEquals(2, expectedUpdatedRequirementsFileLines.size()); + assertTrue(expectedUpdatedRequirementsFileLines.contains("pypipackage1")); + assertTrue(expectedUpdatedRequirementsFileLines.contains("pypipackage2")); + } + + @Test + public void testLauncherInstallsLocalDependencies() throws IOException { + String projectName = UUID.randomUUID().toString(); + Path expectedTempDir = Paths.get(System.getProperty("java.io.tmpdir"), projectName); + File file = expectedTempDir.toFile(); + file.deleteOnExit(); + + String dependency1FileName = "dep_" + UUID.randomUUID().toString(); + File dependency1 = + Paths.get(System.getProperty("java.io.tmpdir"), dependency1FileName).toFile(); + dependency1.deleteOnExit(); + try (Writer fout = + new OutputStreamWriter( + new FileOutputStream(dependency1.getAbsolutePath()), Charsets.UTF_8)) { + fout.write("tempdata\n"); + } + + String dependency2FileName = "dep_" + UUID.randomUUID().toString(); + File dependency2 = + Paths.get(System.getProperty("java.io.tmpdir"), dependency2FileName).toFile(); + dependency2.deleteOnExit(); + try (Writer fout = + new OutputStreamWriter( + new FileOutputStream(dependency2.getAbsolutePath()), Charsets.UTF_8)) { + fout.write("tempdata\n"); + } + + File requirementsFile = + Paths.get( + System.getProperty("java.io.tmpdir"), + ("requirements" + UUID.randomUUID().toString() + ".txt")) + .toFile(); + requirementsFile.deleteOnExit(); + try (Writer fout = + new OutputStreamWriter( + new FileOutputStream(requirementsFile.getAbsolutePath()), Charsets.UTF_8)) { + fout.write(dependency1.getAbsolutePath() + "\n"); + fout.write(dependency2.getAbsolutePath() + "\n"); + fout.write("pypipackage" + "\n"); + } + + TransformServiceLauncher.forProject(projectName, 12345, requirementsFile.getAbsolutePath()); + + // Confirming that the Transform Service launcher created a temporary requirements file with the + // specified set of packages. 
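+    // (For reference, a typical driver of the API exercised here would be,
+    // with hypothetical values:
+    //   TransformServiceLauncher launcher =
+    //       TransformServiceLauncher.forProject(
+    //           "my-project", 12345, "/tmp/requirements.txt");
+    //   launcher.setBeamVersion("2.52.0");
+    // The assertions below inspect the files that forProject leaves behind.)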
+ Path expectedUpdatedRequirementsFile = + Paths.get(expectedTempDir.toString(), "dependencies_dir", "requirements.txt"); + assertTrue(expectedUpdatedRequirementsFile.toFile().exists()); + + ArrayList expectedUpdatedRequirementsFileLines = new ArrayList<>(); + try (BufferedReader bufReader = + Files.newBufferedReader(expectedUpdatedRequirementsFile, UTF_8)) { + String line = bufReader.readLine(); + while (line != null) { + expectedUpdatedRequirementsFileLines.add(line); + line = bufReader.readLine(); + } + } + + // To make local packages available to the expansion service Docker containers, the temporary + // requirements file should contain names of the local packages relative to the dependencies + // volume and local packages should have been copied to the dependencies volume. + assertEquals(3, expectedUpdatedRequirementsFileLines.size()); + assertTrue(expectedUpdatedRequirementsFileLines.contains(dependency1FileName)); + assertTrue(expectedUpdatedRequirementsFileLines.contains(dependency2FileName)); + assertTrue(expectedUpdatedRequirementsFileLines.contains("pypipackage")); + + assertTrue( + Paths.get(expectedTempDir.toString(), "dependencies_dir", dependency1FileName) + .toFile() + .exists()); + assertTrue( + Paths.get(expectedTempDir.toString(), "dependencies_dir", dependency2FileName) + .toFile() + .exists()); + } +} diff --git a/sdks/java/transform-service/src/main/java/org/apache/beam/sdk/transformservice/ExpansionService.java b/sdks/java/transform-service/src/main/java/org/apache/beam/sdk/transformservice/ExpansionService.java index 17fe5472f9fcc..0a2e65099e7db 100644 --- a/sdks/java/transform-service/src/main/java/org/apache/beam/sdk/transformservice/ExpansionService.java +++ b/sdks/java/transform-service/src/main/java/org/apache/beam/sdk/transformservice/ExpansionService.java @@ -17,15 +17,22 @@ */ package org.apache.beam.sdk.transformservice; +import java.io.IOException; +import java.net.Socket; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeoutException; import org.apache.beam.model.expansion.v1.ExpansionApi; +import org.apache.beam.model.expansion.v1.ExpansionApi.ExpansionResponse; import org.apache.beam.model.expansion.v1.ExpansionServiceGrpc; import org.apache.beam.model.pipeline.v1.Endpoints; import org.apache.beam.runners.core.construction.DefaultExpansionServiceClientFactory; import org.apache.beam.runners.core.construction.ExpansionServiceClientFactory; import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.ManagedChannelBuilder; import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.stub.StreamObserver; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Throwables; import org.checkerframework.checker.nullness.qual.Nullable; @@ -40,6 +47,12 @@ public class ExpansionService extends ExpansionServiceGrpc.ExpansionServiceImplB final List endpoints; + private boolean checkedAllServices = false; + + private static final long SERVICE_CHECK_TIMEOUT_MILLIS = 60000; + + private boolean disableServiceCheck = false; + ExpansionService( List endpoints, @Nullable ExpansionServiceClientFactory clientFactory) { @@ -48,10 +61,65 @@ public class ExpansionService extends ExpansionServiceGrpc.ExpansionServiceImplB clientFactory != null ? clientFactory : DEFAULT_EXPANSION_SERVICE_CLIENT_FACTORY; } + // Waits till all expansion services are ready. 
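+  // Each endpoint is probed with a plain socket connect and retried with a
+  // geometric backoff (the sleep starts at 10 ms and grows by a factor of
+  // 1.2) until the connect succeeds or SERVICE_CHECK_TIMEOUT_MILLIS elapses.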
+  private void waitForAllServicesToBeReady() throws TimeoutException {
+    if (disableServiceCheck) {
+      // Service check disabled. Just returning.
+      return;
+    }
+
+    outer:
+    for (Endpoints.ApiServiceDescriptor endpoint : endpoints) {
+      long start = System.currentTimeMillis();
+      long duration = 10;
+      while (System.currentTimeMillis() - start < SERVICE_CHECK_TIMEOUT_MILLIS) {
+        try {
+          String url = endpoint.getUrl();
+          int portIndex = url.lastIndexOf(":");
+          if (portIndex <= 0) {
+            throw new RuntimeException(
+                "Expected the endpoint to be of the form <host>:<port> but received " + url);
+          }
+          int port = Integer.parseInt(url.substring(portIndex + 1));
+          String host = url.substring(0, portIndex);
+          new Socket(host, port).close();
+          // Current service is up. Checking the next one.
+          continue outer;
+        } catch (IOException exn) {
+          try {
+            Thread.sleep(duration);
+          } catch (InterruptedException e) {
+            // Ignore
+          }
+          duration = (long) (duration * 1.2);
+        }
+      }
+      throw new TimeoutException(
+          "Timeout waiting for the service "
+              + endpoint.getUrl()
+              + " to start up after "
+              + (System.currentTimeMillis() - start)
+              + " milliseconds.");
+    }
+  }
+
+  @VisibleForTesting
+  void disableServiceCheck() {
+    disableServiceCheck = true;
+  }
+
   @Override
   public void expand(
       ExpansionApi.ExpansionRequest request,
       StreamObserver responseObserver) {
+    if (!checkedAllServices) {
+      try {
+        waitForAllServicesToBeReady();
+      } catch (TimeoutException e) {
+        throw new RuntimeException(e);
+      }
+      checkedAllServices = true;
+    }
     try {
       responseObserver.onNext(processExpand(request));
       responseObserver.onCompleted();
@@ -68,6 +136,14 @@ public void expand(
   public void discoverSchemaTransform(
       ExpansionApi.DiscoverSchemaTransformRequest request,
       StreamObserver responseObserver) {
+    if (!checkedAllServices) {
+      try {
+        waitForAllServicesToBeReady();
+      } catch (TimeoutException e) {
+        throw new RuntimeException(e);
+      }
+      checkedAllServices = true;
+    }
     try {
       responseObserver.onNext(processDiscover(request));
       responseObserver.onCompleted();
@@ -80,18 +156,41 @@ public void discoverSchemaTransform(
     }
   }
 
-  /*package*/ ExpansionApi.ExpansionResponse processExpand(ExpansionApi.ExpansionRequest request) {
+  private ExpansionApi.ExpansionResponse getAggregatedErrorResponse(
+      Map errorResponses) {
+    StringBuilder errorMessageBuilder = new StringBuilder();
+
+    errorMessageBuilder.append(
+        "Aggregated errors from " + errorResponses.size() + " expansion services." + "\n");
+    for (Map.Entry entry : errorResponses.entrySet()) {
+      errorMessageBuilder.append(
+          "Error from expansion service "
+              + entry.getKey()
+              + ": "
+              + entry.getValue().getError()
+              + "\n");
+    }
+
+    return errorResponses
+        .values()
+        .iterator()
+        .next()
+        .toBuilder()
+        .setError(errorMessageBuilder.toString())
+        .build();
+  }
+
+  ExpansionApi.ExpansionResponse processExpand(ExpansionApi.ExpansionRequest request) {
     // Trying out expansion services in order till one succeeds.
     // If all services fail, re-raises the last error.
-    // TODO: when all services fail, return an aggregated error with errors from all services.
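+    // The aggregated error built by getAggregatedErrorResponse() above has the
+    // following shape (illustrative endpoint names):
+    //   Aggregated errors from 2 expansion services.
+    //   Error from expansion service localhost:5001: expansion error 1
+    //   Error from expansion service localhost:5002: expansion error 2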
-    ExpansionApi.ExpansionResponse lastErrorResponse = null;
+    Map errorResponses = new HashMap<>();
     RuntimeException lastException = null;
     for (Endpoints.ApiServiceDescriptor endpoint : endpoints) {
       try {
         ExpansionApi.ExpansionResponse response =
             expansionServiceClientFactory.getExpansionServiceClient(endpoint).expand(request);
         if (!response.getError().isEmpty()) {
-          lastErrorResponse = response;
+          errorResponses.put(endpoint.getUrl(), response);
           continue;
         }
         return response;
@@ -99,8 +198,8 @@ public void discoverSchemaTransform(
         lastException = e;
       }
     }
-    if (lastErrorResponse != null) {
-      return lastErrorResponse;
+    if (!errorResponses.isEmpty()) {
+      return getAggregatedErrorResponse(errorResponses);
     } else if (lastException != null) {
       throw new RuntimeException("Expansion request to transform service failed.", lastException);
     } else {
diff --git a/sdks/java/transform-service/src/test/java/org/apache/beam/sdk/transformservice/ExpansionServiceTest.java b/sdks/java/transform-service/src/test/java/org/apache/beam/sdk/transformservice/ExpansionServiceTest.java
index 298bce87f9015..9905abd1d9bae 100644
--- a/sdks/java/transform-service/src/test/java/org/apache/beam/sdk/transformservice/ExpansionServiceTest.java
+++ b/sdks/java/transform-service/src/test/java/org/apache/beam/sdk/transformservice/ExpansionServiceTest.java
@@ -60,6 +60,8 @@ public void setUp() throws Exception {
     endpoints.add(endpoint2);
     clientFactory = Mockito.mock(ExpansionServiceClientFactory.class);
     expansionService = new ExpansionService(endpoints, clientFactory);
+    // We do not run actual services in unit tests.
+    expansionService.disableServiceCheck();
   }
 
   @Test
@@ -131,7 +133,10 @@ public void testExpandFail() {
     ArgumentCaptor expansionResponseCapture =
         ArgumentCaptor.forClass(ExpansionResponse.class);
     Mockito.verify(responseObserver).onNext(expansionResponseCapture.capture());
-    assertEquals("expansion error 2", expansionResponseCapture.getValue().getError());
+
+    // Error response should contain errors from both expansion services.
+    assertTrue(expansionResponseCapture.getValue().getError().contains("expansion error 1"));
+    assertTrue(expansionResponseCapture.getValue().getError().contains("expansion error 2"));
   }
 
   @Test
diff --git a/sdks/python/.pylintrc b/sdks/python/.pylintrc
index 250932e798120..a67f00ff2f82b 100644
--- a/sdks/python/.pylintrc
+++ b/sdks/python/.pylintrc
@@ -116,7 +116,6 @@ disable =
   missing-docstring,
   modified-iterating-list,
   multiple-statements,
-  missing-timeout, #TODO(https://github.com/apache/beam/issues/28240) Enable and fix warnings
   no-self-use,
   no-else-break,
   no-else-continue,
@@ -140,7 +139,6 @@ disable =
   stop-iteration-return,
   super-init-not-called,
   superfluous-parens,
-  typevar-name-mismatch, #TODO(https://github.com/apache/beam/issues/28241) Enable and fix warnings
   try-except-raise,
   undefined-variable,
   unexpected-keyword-arg,
diff --git a/sdks/python/apache_beam/__init__.py b/sdks/python/apache_beam/__init__.py
index a4a13eab97381..85b5187185286 100644
--- a/sdks/python/apache_beam/__init__.py
+++ b/sdks/python/apache_beam/__init__.py
@@ -94,6 +94,14 @@
 from apache_beam.pvalue import PCollection
 from apache_beam.pvalue import Row
 from apache_beam.pvalue import TaggedOutput
+
+try:
+  # Add mitigation for CVE-2023-47248 while Beam allows affected versions
+  # of pyarrow.
(https://github.com/apache/beam/issues/29392) + import pyarrow_hotfix +except ImportError: + pass + # pylint: enable=wrong-import-position __version__ = version.__version__ diff --git a/sdks/python/apache_beam/coders/slow_coders_test.py b/sdks/python/apache_beam/coders/slow_coders_test.py index fe1c707a62e52..7915116a19a34 100644 --- a/sdks/python/apache_beam/coders/slow_coders_test.py +++ b/sdks/python/apache_beam/coders/slow_coders_test.py @@ -25,6 +25,9 @@ from apache_beam.coders.coders_test_common import * +@unittest.skip( + 'Remove non-cython tests.' + 'https://github.com/apache/beam/issues/28307') class SlowCoders(unittest.TestCase): def test_using_slow_impl(self): try: diff --git a/sdks/python/apache_beam/dataframe/expressions.py b/sdks/python/apache_beam/dataframe/expressions.py index ac7e1f828aca0..ae08cdaf54cd5 100644 --- a/sdks/python/apache_beam/dataframe/expressions.py +++ b/sdks/python/apache_beam/dataframe/expressions.py @@ -404,8 +404,10 @@ def allow_non_parallel_operations(allow=True): yield else: old_value, _ALLOW_NON_PARALLEL.value = _ALLOW_NON_PARALLEL.value, allow - yield - _ALLOW_NON_PARALLEL.value = old_value + try: + yield + finally: + _ALLOW_NON_PARALLEL.value = old_value class NonParallelOperation(Exception): diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py index a74ccbba041ae..b7aa130fbbd8f 100644 --- a/sdks/python/apache_beam/dataframe/frames.py +++ b/sdks/python/apache_beam/dataframe/frames.py @@ -1388,7 +1388,7 @@ def align(self, other, join, axis, level, method, **kwargs): Only the default, ``method=None``, is allowed.""" if level is not None: raise NotImplementedError('per-level align') - if method is not None: + if method is not None and method != lib.no_default: raise frame_base.WontImplementError( f"align(method={method!r}) is not supported because it is " "order sensitive. Only align(method=None) is supported.", @@ -2580,7 +2580,7 @@ def align(self, other, join, axis, copy, level, method, **kwargs): "align(copy=False) is not supported because it might be an inplace " "operation depending on the data. Please prefer the default " "align(copy=True).") - if method is not None: + if method is not None and method != lib.no_default: raise frame_base.WontImplementError( f"align(method={method!r}) is not supported because it is " "order sensitive. 
Only align(method=None) is supported.", @@ -2978,6 +2978,8 @@ def aggregate(self, func, axis, *args, **kwargs): agg = aggregate applymap = frame_base._elementwise_method('applymap', base=pd.DataFrame) + if PD_VERSION >= (2, 1): + map = frame_base._elementwise_method('map', base=pd.DataFrame) add_prefix = frame_base._elementwise_method('add_prefix', base=pd.DataFrame) add_suffix = frame_base._elementwise_method('add_suffix', base=pd.DataFrame) @@ -4594,8 +4596,9 @@ def wrapper(self, *args, **kwargs): return _unliftable_agg(meth)(self, *args, **kwargs) to_group = self._ungrouped.proxy().index - is_categorical_grouping = any(to_group.get_level_values(i).is_categorical() - for i in self._grouping_indexes) + is_categorical_grouping = any( + isinstance(to_group.get_level_values(i).dtype, pd.CategoricalDtype) + for i in self._grouping_indexes) groupby_kwargs = self._kwargs group_keys = self._group_keys @@ -4647,8 +4650,9 @@ def wrapper(self, *args, **kwargs): to_group = self._ungrouped.proxy().index group_keys = self._group_keys - is_categorical_grouping = any(to_group.get_level_values(i).is_categorical() - for i in self._grouping_indexes) + is_categorical_grouping = any( + isinstance(to_group.get_level_values(i).dtype, pd.CategoricalDtype) + for i in self._grouping_indexes) groupby_kwargs = self._kwargs project = _maybe_project_func(self._projection) diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py index 6f7a63c291642..6e32acefc61b8 100644 --- a/sdks/python/apache_beam/dataframe/frames_test.py +++ b/sdks/python/apache_beam/dataframe/frames_test.py @@ -865,6 +865,8 @@ def test_corrwith_bad_axis(self): self._run_error_test(lambda df: df.corrwith(df, axis=5), df) @unittest.skipIf(PD_VERSION < (1, 2), "na_action added in pandas 1.2.0") + @pytest.mark.filterwarnings( + "ignore:The default of observed=False is deprecated:FutureWarning") def test_applymap_na_action(self): # Replicates a doctest for na_action which is incompatible with # doctest framework @@ -875,6 +877,17 @@ def test_applymap_na_action(self): # TODO: generate proxy using naive type inference on fn check_proxy=False) + @unittest.skipIf(PD_VERSION < (2, 1), "map added in 2.1.0") + def test_map_na_action(self): + # Replicates a doctest for na_action which is incompatible with + # doctest framework + df = pd.DataFrame([[pd.NA, 2.12], [3.356, 4.567]]) + self._run_test( + lambda df: df.map(lambda x: len(str(x)), na_action='ignore'), + df, + # TODO: generate proxy using naive type inference on fn + check_proxy=False) + def test_dataframe_eval_query(self): df = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c']) self._run_test(lambda df: df.eval('foo = a + b - c'), df) @@ -1021,8 +1034,14 @@ def test_categorical_groupby(self): df = df.set_index('B') # TODO(BEAM-11190): These aggregations can be done in index partitions, but # it will require a little more complex logic - self._run_test(lambda df: df.groupby(level=0).sum(), df, nonparallel=True) - self._run_test(lambda df: df.groupby(level=0).mean(), df, nonparallel=True) + self._run_test( + lambda df: df.groupby(level=0, observed=False).sum(), + df, + nonparallel=True) + self._run_test( + lambda df: df.groupby(level=0, observed=False).mean(), + df, + nonparallel=True) def test_astype_categorical(self): df = pd.DataFrame({'A': np.arange(6), 'B': list('aabbca')}) diff --git a/sdks/python/apache_beam/dataframe/io.py b/sdks/python/apache_beam/dataframe/io.py index eb1b1b5ec15c6..fedc40c60714e 100644 --- 
a/sdks/python/apache_beam/dataframe/io.py +++ b/sdks/python/apache_beam/dataframe/io.py @@ -281,8 +281,9 @@ def expand(self, root): if not self.binary: handle = TextIOWrapper(handle) if self.incremental: - sample = next( - self.reader(handle, *self.args, **dict(self.kwargs, chunksize=100))) + with self.reader(handle, *self.args, **dict(self.kwargs, + chunksize=100)) as stream: + sample = next(stream) else: sample = self.reader(handle, *self.args, **self.kwargs) diff --git a/sdks/python/apache_beam/dataframe/pandas_docs_test.py b/sdks/python/apache_beam/dataframe/pandas_docs_test.py index d52773c955f17..8302c3cb53b63 100644 --- a/sdks/python/apache_beam/dataframe/pandas_docs_test.py +++ b/sdks/python/apache_beam/dataframe/pandas_docs_test.py @@ -145,8 +145,10 @@ def run_tests(path): def deferred_stdout(): captured = io.StringIO() old_stdout, sys.stdout = sys.stdout, captured - yield captured.getvalue - sys.stdout = old_stdout + try: + yield captured.getvalue + finally: + sys.stdout = old_stdout if __name__ == '__main__': diff --git a/sdks/python/apache_beam/examples/cookbook/bigtableio_it_test.py b/sdks/python/apache_beam/examples/cookbook/bigtableio_it_test.py index 98023fbc624c1..8cc8b3c73a104 100644 --- a/sdks/python/apache_beam/examples/cookbook/bigtableio_it_test.py +++ b/sdks/python/apache_beam/examples/cookbook/bigtableio_it_test.py @@ -174,7 +174,9 @@ def tearDown(self): if self.instance.exists(): self.instance.delete() - @pytest.mark.it_postcommit + # TODO(https://github.com/apache/beam/issues/29076): Reenable this test + # once BigTable issues are fixed. + @pytest.mark.it_postcommit_sickbay def test_bigtable_write(self): number = self.number pipeline_args = self.test_pipeline.options_list diff --git a/sdks/python/apache_beam/examples/inference/README.md b/sdks/python/apache_beam/examples/inference/README.md index 19262dead5865..cd92d9c127ee0 100644 --- a/sdks/python/apache_beam/examples/inference/README.md +++ b/sdks/python/apache_beam/examples/inference/README.md @@ -29,7 +29,6 @@ Some examples are also used in [our benchmarks](http://s.apache.org/beam-communi You must have the latest (possibly unreleased) `apache-beam` or greater installed from the Beam repo in order to run these pipelines, because some examples rely on the latest features that are actively in development. To install Beam, run the following from the `sdks/python` directory: ``` -pip install -r build-requirements.txt pip install -e .[gcp] ``` diff --git a/sdks/python/apache_beam/examples/kafkataxi/README.md b/sdks/python/apache_beam/examples/kafkataxi/README.md index c4e808cad8b4d..72a8d8f85c037 100644 --- a/sdks/python/apache_beam/examples/kafkataxi/README.md +++ b/sdks/python/apache_beam/examples/kafkataxi/README.md @@ -157,9 +157,9 @@ Install Beam and dependencies and build a Beam distribution. ```sh cd beam/sdks/python -pip install -r build-requirements.txt pip install -e '.[gcp]' -python setup.py sdist +pip install -q build +python -m build --sdist ``` Run the Beam pipeline. 
You can either use the default Kafka topic name or specify diff --git a/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/requirements.txt b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/requirements.txt index e902ead34151f..706adf9de0aa8 100644 --- a/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/requirements.txt +++ b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/requirements.txt @@ -18,4 +18,4 @@ requests==2.31.0 torch==1.13.1 torchvision==0.13.0 numpy==1.22.4 -Pillow==9.3.0 +Pillow==10.0.1 diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/mltransform_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/mltransform_test.py index 1d2197e35e4e0..0db10718295ba 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/mltransform_test.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/mltransform_test.py @@ -31,7 +31,7 @@ import tensorflow_transform as tft # pylint: disable=unused-import from apache_beam.examples.snippets.transforms.elementwise.mltransform import mltransform_scale_to_0_1 from apache_beam.examples.snippets.transforms.elementwise.mltransform import mltransform_compute_and_apply_vocabulary - from apache_beam.examples.snippets.transforms.elementwise.mltransform import mltransform_compute_and_apply_vocabulary_with_non_columnar_data + from apache_beam.examples.snippets.transforms.elementwise.mltransform import mltransform_compute_and_apply_vocabulary_with_scalar except ImportError: raise unittest.SkipTest('tensorflow_transform is not installed.') @@ -46,8 +46,8 @@ def check_mltransform_compute_and_apply_vocab(): def check_mltransform_scale_to_0_1(): expected = '''[START mltransform_scale_to_0_1] -Row(x=array([0. , 0.5714286, 0.2857143], dtype=float32), x_max=array([8.], dtype=float32), x_min=array([1.], dtype=float32)) -Row(x=array([0.42857143, 0.14285715, 1. ], dtype=float32), x_max=array([8.], dtype=float32), x_min=array([1.], dtype=float32)) +Row(x=array([0. , 0.5714286, 0.2857143], dtype=float32)) +Row(x=array([0.42857143, 0.14285715, 1. 
], dtype=float32)) [END mltransform_scale_to_0_1] '''.splitlines()[1:-1] return expected @@ -80,7 +80,7 @@ def test_mltransform_scale_to_0_1(self, mock_stdout): self.assertEqual(predicted, expected) def test_mltransform_compute_and_apply_vocab_scalar(self, mock_stdout): - mltransform_compute_and_apply_vocabulary_with_non_columnar_data() + mltransform_compute_and_apply_vocabulary_with_scalar() predicted = mock_stdout.getvalue().splitlines() expected = check_mltransform_compute_and_apply_vocabulary_with_scalar() self.assertEqual(predicted, expected) diff --git a/sdks/python/apache_beam/io/avroio.py b/sdks/python/apache_beam/io/avroio.py index d86f59e3a4111..9225acf346e4e 100644 --- a/sdks/python/apache_beam/io/avroio.py +++ b/sdks/python/apache_beam/io/avroio.py @@ -45,7 +45,13 @@ # pytype: skip-file import os from functools import partial +from typing import Any +from typing import Callable +from typing import Dict +from typing import List +from typing import Union +import fastavro from fastavro.read import block_reader from fastavro.write import Writer @@ -54,8 +60,11 @@ from apache_beam.io import filebasedsource from apache_beam.io import iobase from apache_beam.io.filesystem import CompressionTypes +from apache_beam.io.filesystems import FileSystems from apache_beam.io.iobase import Read +from apache_beam.portability.api import schema_pb2 from apache_beam.transforms import PTransform +from apache_beam.typehints import schemas __all__ = [ 'ReadFromAvro', @@ -73,7 +82,8 @@ def __init__( file_pattern=None, min_bundle_size=0, validate=True, - use_fastavro=True): + use_fastavro=True, + as_rows=False): """Initializes :class:`ReadFromAvro`. Uses source :class:`~apache_beam.io._AvroSource` to read a set of Avro @@ -140,13 +150,26 @@ def __init__( creation time. use_fastavro (bool): This flag is left for API backwards compatibility and no longer has an effect. Do not use. + as_rows (bool): Whether to return a schema'd PCollection of Beam rows. """ super().__init__() - self._source = _create_avro_source( + self._source = _FastAvroSource( file_pattern, min_bundle_size, validate=validate) + if as_rows: + path = FileSystems.match([file_pattern], [1])[0].metadata_list[0].path + with FileSystems.open(path) as fin: + avro_schema = fastavro.reader(fin).writer_schema + beam_schema = avro_schema_to_beam_schema(avro_schema) + self._post_process = avro_dict_to_beam_row(avro_schema, beam_schema) + else: + self._post_process = None def expand(self, pvalue): - return pvalue.pipeline | Read(self._source) + records = pvalue.pipeline | Read(self._source) + if self._post_process: + return records | beam.Map(self._post_process) + else: + return records def display_data(self): return {'source_dd': self._source} @@ -184,8 +207,7 @@ def __init__( name and the value being the actual data. If False, it only returns the data. """ - source_from_file = partial( - _create_avro_source, min_bundle_size=min_bundle_size) + source_from_file = partial(_FastAvroSource, min_bundle_size=min_bundle_size) self._read_all_files = filebasedsource.ReadAllFiles( True, CompressionTypes.AUTO, @@ -280,15 +302,6 @@ def advance_file_past_next_sync_marker(f, sync_marker): data = f.read(buf_size) -def _create_avro_source(file_pattern=None, min_bundle_size=0, validate=False): - return \ - _FastAvroSource( - file_pattern=file_pattern, - min_bundle_size=min_bundle_size, - validate=validate - ) - - class _FastAvroSource(filebasedsource.FileBasedSource): """A source for reading Avro files using the `fastavro` library. 
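+# Usage sketch for the row-based API added in this change (hypothetical paths):
+#
+#   with beam.Pipeline() as p:
+#     rows = p | avroio.ReadFromAvro('gs://bucket/in-*.avro', as_rows=True)
+#     _ = rows | avroio.WriteToAvro('gs://bucket/out')  # schema inferred from rows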
@@ -338,12 +351,15 @@ def split_points_unclaimed(stop_position): yield record +_create_avro_source = _FastAvroSource + + class WriteToAvro(beam.transforms.PTransform): """A ``PTransform`` for writing avro files.""" def __init__( self, file_path_prefix, - schema, + schema=None, codec='deflate', file_name_suffix='', num_shards=0, @@ -382,9 +398,10 @@ def __init__( Returns: A WriteToAvro transform usable for writing. """ - self._sink = _create_avro_sink( + self._schema = schema + self._sink_provider = lambda avro_schema: _create_avro_sink( file_path_prefix, - schema, + avro_schema, codec, file_name_suffix, num_shards, @@ -392,7 +409,21 @@ def __init__( mime_type) def expand(self, pcoll): - return pcoll | beam.io.iobase.Write(self._sink) + if self._schema: + avro_schema = self._schema + records = pcoll + else: + try: + beam_schema = schemas.schema_from_element_type(pcoll.element_type) + except TypeError as exn: + raise ValueError( + "An explicit schema is required to write non-schema'd PCollections." + ) from exn + avro_schema = beam_schema_to_avro_schema(beam_schema) + records = pcoll | beam.Map( + beam_row_to_avro_dict(avro_schema, beam_schema)) + self._sink = self._sink_provider(avro_schema) + return records | beam.io.iobase.Write(self._sink) def display_data(self): return {'sink_dd': self._sink} @@ -406,7 +437,7 @@ def _create_avro_sink( num_shards, shard_name_template, mime_type): - if "class \'avro.schema" in str(type(schema)): + if "class 'avro.schema" in str(type(schema)): raise ValueError( 'You are using Avro IO with fastavro (default with Beam on ' 'Python 3), but supplying a schema parsed by avro-python3. ' @@ -483,3 +514,205 @@ def write_record(self, writer, value): def close(self, writer): writer.flush() self.file_handle.close() + + +AVRO_PRIMITIVES_TO_BEAM_PRIMITIVES = { + 'boolean': schema_pb2.BOOLEAN, + 'int': schema_pb2.INT32, + 'long': schema_pb2.INT64, + 'float': schema_pb2.FLOAT, + 'double': schema_pb2.DOUBLE, + 'bytes': schema_pb2.BYTES, + 'string': schema_pb2.STRING, +} + +BEAM_PRIMITIVES_TO_AVRO_PRIMITIVES = { + v: k + for k, v in AVRO_PRIMITIVES_TO_BEAM_PRIMITIVES.items() +} + +_AvroSchemaType = Union[str, List, Dict] + + +def avro_type_to_beam_type(avro_type: _AvroSchemaType) -> schema_pb2.FieldType: + if isinstance(avro_type, str): + return avro_type_to_beam_type({'type': avro_type}) + elif isinstance(avro_type, list): + # Union type + return schemas.typing_to_runner_api(Any) + type_name = avro_type['type'] + if type_name in AVRO_PRIMITIVES_TO_BEAM_PRIMITIVES: + return schema_pb2.FieldType( + atomic_type=AVRO_PRIMITIVES_TO_BEAM_PRIMITIVES[type_name]) + elif type_name in ('fixed', 'enum'): + return schema_pb2.FieldType(atomic_type=schema_pb2.STRING) + elif type_name == 'array': + return schema_pb2.FieldType( + array_type=schema_pb2.ArrayType( + element_type=avro_type_to_beam_type(avro_type['items']))) + elif type_name == 'map': + return schema_pb2.FieldType( + map_type=schema_pb2.MapType( + key_type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING), + value_type=avro_type_to_beam_type(avro_type['values']))) + elif type_name == 'record': + return schema_pb2.FieldType( + row_type=schema_pb2.RowType( + schema=schema_pb2.Schema( + fields=[ + schemas.schema_field( + f['name'], avro_type_to_beam_type(f['type'])) + for f in avro_type['fields'] + ]))) + else: + raise ValueError(f'Unable to convert {avro_type} to a Beam schema.') + + +def avro_schema_to_beam_schema( + avro_schema: _AvroSchemaType) -> schema_pb2.Schema: + beam_type = avro_type_to_beam_type(avro_schema) + if 
isinstance(avro_schema, dict) and avro_schema['type'] == 'record': + return beam_type.row_type.schema + else: + return schema_pb2.Schema(fields=[schemas.schema_field('record', beam_type)]) + + +def avro_dict_to_beam_row( + avro_schema: _AvroSchemaType, + beam_schema: schema_pb2.Schema) -> Callable[[Any], Any]: + if isinstance(avro_schema, str): + return avro_dict_to_beam_row({'type': avro_schema}, beam_schema) + if avro_schema['type'] == 'record': + to_row = avro_value_to_beam_value( + schema_pb2.FieldType(row_type=schema_pb2.RowType(schema=beam_schema))) + else: + + def to_row(record): + return beam.Row(record=record) + + return beam.typehints.with_output_types( + schemas.named_tuple_from_schema(beam_schema))( + to_row) + + +def avro_value_to_beam_value( + beam_type: schema_pb2.FieldType) -> Callable[[Any], Any]: + type_info = beam_type.WhichOneof("type_info") + if type_info == "atomic_type": + return lambda value: value + elif type_info == "array_type": + element_converter = avro_value_to_beam_value( + beam_type.array_type.element_type) + return lambda value: [element_converter(e) for e in value] + elif type_info == "iterable_type": + element_converter = avro_value_to_beam_value( + beam_type.iterable_type.element_type) + return lambda value: [element_converter(e) for e in value] + elif type_info == "map_type": + if beam_type.map_type.key_type.atomic_type != schema_pb2.STRING: + raise TypeError( + f'Only strings allowed as map keys when converting from AVRO, ' + f'found {beam_type}') + value_converter = avro_value_to_beam_value(beam_type.map_type.value_type) + return lambda value: {k: value_converter(v) for (k, v) in value.items()} + elif type_info == "row_type": + converters = { + field.name: avro_value_to_beam_value(field.type) + for field in beam_type.row_type.schema.fields + } + return lambda value: beam.Row( + ** + {name: convert(value[name]) + for (name, convert) in converters.items()}) + elif type_info == "logical_type": + return lambda value: value + else: + raise ValueError(f"Unrecognized type_info: {type_info!r}") + + +def beam_schema_to_avro_schema( + beam_schema: schema_pb2.Schema) -> _AvroSchemaType: + return beam_type_to_avro_type( + schema_pb2.FieldType(row_type=schema_pb2.RowType(schema=beam_schema))) + + +def beam_type_to_avro_type(beam_type: schema_pb2.FieldType) -> _AvroSchemaType: + type_info = beam_type.WhichOneof("type_info") + if type_info == "atomic_type": + return {'type': BEAM_PRIMITIVES_TO_AVRO_PRIMITIVES[beam_type.atomic_type]} + elif type_info == "array_type": + return { + 'type': 'array', + 'items': beam_type_to_avro_type(beam_type.array_type.element_type) + } + elif type_info == "iterable_type": + return { + 'type': 'array', + 'items': beam_type_to_avro_type(beam_type.iterable_type.element_type) + } + elif type_info == "map_type": + if beam_type.map_type.key_type.atomic_type != schema_pb2.STRING: + raise TypeError( + f'Only strings allowed as map keys when converting to AVRO, ' + f'found {beam_type}') + return { + 'type': 'map', + 'values': beam_type_to_avro_type(beam_type.map_type.value_type) + } + elif type_info == "row_type": + return { + 'type': 'record', + 'name': beam_type.row_type.schema.id, + 'fields': [{ + 'name': field.name, 'type': beam_type_to_avro_type(field.type) + } for field in beam_type.row_type.schema.fields], + } + else: + raise ValueError(f"Unconvertible type: {beam_type}") + + +def beam_row_to_avro_dict( + avro_schema: _AvroSchemaType, beam_schema: schema_pb2.Schema): + if isinstance(avro_schema, str): + return beam_row_to_avro_dict({'type': 
avro_schema}, beam_schema) + if avro_schema['type'] == 'record': + return beam_value_to_avro_value( + schema_pb2.FieldType(row_type=schema_pb2.RowType(schema=beam_schema))) + else: + convert = beam_value_to_avro_value(beam_schema.fields[0].type) + return lambda row: convert(row[0]) + + +def beam_value_to_avro_value( + beam_type: schema_pb2.FieldType) -> Callable[[Any], Any]: + type_info = beam_type.WhichOneof("type_info") + if type_info == "atomic_type": + return lambda value: value + elif type_info == "array_type": + element_converter = beam_value_to_avro_value( + beam_type.array_type.element_type) + return lambda value: [element_converter(e) for e in value] + elif type_info == "iterable_type": + element_converter = beam_value_to_avro_value( + beam_type.iterable_type.element_type) + return lambda value: [element_converter(e) for e in value] + elif type_info == "map_type": + if beam_type.map_type.key_type.atomic_type != schema_pb2.STRING: + raise TypeError( + f'Only strings allowed as map keys when converting to AVRO, ' + f'found {beam_type}') + value_converter = beam_value_to_avro_value(beam_type.map_type.value_type) + return lambda value: {k: value_converter(v) for (k, v) in value.items()} + elif type_info == "row_type": + converters = { + field.name: beam_value_to_avro_value(field.type) + for field in beam_type.row_type.schema.fields + } + return lambda value: { + name: convert(getattr(value, name)) + for (name, convert) in converters.items() + } + elif type_info == "logical_type": + return lambda value: value + else: + raise ValueError(f"Unrecognized type_info: {type_info!r}") diff --git a/sdks/python/apache_beam/io/avroio_test.py b/sdks/python/apache_beam/io/avroio_test.py index ba35cf5846c05..c54ac40711b1e 100644 --- a/sdks/python/apache_beam/io/avroio_test.py +++ b/sdks/python/apache_beam/io/avroio_test.py @@ -35,8 +35,8 @@ from apache_beam.io import filebasedsource from apache_beam.io import iobase from apache_beam.io import source_test_utils +from apache_beam.io.avroio import _FastAvroSource # For testing from apache_beam.io.avroio import _create_avro_sink # For testing -from apache_beam.io.avroio import _create_avro_source # For testing from apache_beam.io.filesystems import FileSystems from apache_beam.testing.test_pipeline import TestPipeline from apache_beam.testing.util import assert_that @@ -125,7 +125,7 @@ def _write_pattern(self, num_files, return_filenames=False): def _run_avro_test( self, pattern, desired_bundle_size, perform_splitting, expected_result): - source = _create_avro_source(pattern) + source = _FastAvroSource(pattern) if perform_splitting: assert desired_bundle_size @@ -146,6 +146,20 @@ def _run_avro_test( read_records = source_test_utils.read_from_source(source, None, None) self.assertCountEqual(expected_result, read_records) + def test_schema_read_write(self): + with tempfile.TemporaryDirectory() as tmp_dirname: + path = os.path.join(tmp_dirname, 'tmp_filename') + rows = [beam.Row(a=1, b=['x', 'y']), beam.Row(a=2, b=['t', 'u'])] + stable_repr = lambda row: json.dumps(row._asdict()) + with TestPipeline() as p: + _ = p | Create(rows) | avroio.WriteToAvro(path) | beam.Map(print) + with TestPipeline() as p: + readback = ( + p + | avroio.ReadFromAvro(path + '*', as_rows=True) + | beam.Map(stable_repr)) + assert_that(readback, equal_to([stable_repr(r) for r in rows])) + def test_read_without_splitting(self): file_name = self._write_data() expected_result = self.RECORDS @@ -159,7 +173,7 @@ def test_read_with_splitting(self): def test_source_display_data(self): file_name = 
'some_avro_source' source = \ - _create_avro_source( + _FastAvroSource( file_name, validate=False, ) @@ -207,6 +221,7 @@ def test_sink_display_data(self): def test_write_display_data(self): file_name = 'some_avro_sink' write = avroio.WriteToAvro(file_name, self.SCHEMA) + write.expand(beam.PCollection(beam.Pipeline())) dd = DisplayData.create_from(write) expected_items = [ DisplayDataItemMatcher('schema', str(self.SCHEMA)), @@ -220,12 +235,12 @@ def test_write_display_data(self): def test_read_reentrant_without_splitting(self): file_name = self._write_data() - source = _create_avro_source(file_name) + source = _FastAvroSource(file_name) source_test_utils.assert_reentrant_reads_succeed((source, None, None)) def test_read_reantrant_with_splitting(self): file_name = self._write_data() - source = _create_avro_source(file_name) + source = _FastAvroSource(file_name) splits = [split for split in source.split(desired_bundle_size=100000)] assert len(splits) == 1 source_test_utils.assert_reentrant_reads_succeed( @@ -246,7 +261,7 @@ def test_split_points(self): sync_interval = 16000 file_name = self._write_data(count=num_records, sync_interval=sync_interval) - source = _create_avro_source(file_name) + source = _FastAvroSource(file_name) splits = [split for split in source.split(desired_bundle_size=float('inf'))] assert len(splits) == 1 @@ -306,7 +321,7 @@ def test_read_with_splitting_pattern(self): def test_dynamic_work_rebalancing_exhaustive(self): def compare_split_points(file_name): - source = _create_avro_source(file_name) + source = _FastAvroSource(file_name) splits = [ split for split in source.split(desired_bundle_size=float('inf')) ] @@ -334,7 +349,7 @@ def test_corrupted_file(self): f.write(corrupted_data) corrupted_file_name = f.name - source = _create_avro_source(corrupted_file_name) + source = _FastAvroSource(corrupted_file_name) with self.assertRaisesRegex(ValueError, r'expected sync marker'): source_test_utils.read_from_source(source, None, None) diff --git a/sdks/python/apache_beam/io/azure/integration_test/Dockerfile b/sdks/python/apache_beam/io/azure/integration_test/Dockerfile index e9ac396b8e174..257fa72cb6688 100644 --- a/sdks/python/apache_beam/io/azure/integration_test/Dockerfile +++ b/sdks/python/apache_beam/io/azure/integration_test/Dockerfile @@ -32,7 +32,7 @@ COPY sdks/python /app/sdks/python COPY model /app/model # This step should look like setupVirtualenv minus virtualenv creation. -RUN pip install --no-cache-dir tox -r sdks/python/build-requirements.txt +RUN pip install --no-cache-dir tox # Add Azurite's self-signed cert to the global CA cert store. 
COPY cert.pem /usr/local/share/ca-certificates/azurite.crt diff --git a/sdks/python/apache_beam/io/azure/integration_test/docker-compose.yml b/sdks/python/apache_beam/io/azure/integration_test/docker-compose.yml index f22d66feb1aa9..8e03fe27e6bd5 100644 --- a/sdks/python/apache_beam/io/azure/integration_test/docker-compose.yml +++ b/sdks/python/apache_beam/io/azure/integration_test/docker-compose.yml @@ -49,3 +49,6 @@ services: networks: azure_test_net: + driver: bridge + driver_opts: + com.docker.network.driver.mtu: 1460 diff --git a/sdks/python/apache_beam/io/fileio.py b/sdks/python/apache_beam/io/fileio.py index 23e979b44cae0..e671cfea07975 100644 --- a/sdks/python/apache_beam/io/fileio.py +++ b/sdks/python/apache_beam/io/fileio.py @@ -195,7 +195,8 @@ def __init__( self._empty_match_treatment = empty_match_treatment def expand(self, pcoll) -> beam.PCollection[filesystem.FileMetadata]: - return pcoll.pipeline | beam.Create([self._file_pattern]) | MatchAll() + return pcoll.pipeline | beam.Create([self._file_pattern]) | MatchAll( + empty_match_treatment=self._empty_match_treatment) class MatchAll(beam.PTransform): diff --git a/sdks/python/apache_beam/io/gcp/bigquery.py b/sdks/python/apache_beam/io/gcp/bigquery.py index 986919fd6b821..184138af75251 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery.py +++ b/sdks/python/apache_beam/io/gcp/bigquery.py @@ -1310,7 +1310,7 @@ def __iter__(self): def __next__(self): try: return fastavro.schemaless_reader(self.bytes_reader, self.avro_schema) - except StopIteration: + except (StopIteration, EOFError): self.read_rows_response = next(self.read_rows_iterator, None) if self.read_rows_response is not None: self.bytes_reader = io.BytesIO( diff --git a/sdks/python/apache_beam/io/gcp/bigquery_test.py b/sdks/python/apache_beam/io/gcp/bigquery_test.py index 7e9c1e634748e..95b6c2a5fa603 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_test.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_test.py @@ -49,14 +49,12 @@ from apache_beam.io.gcp.bigquery import TableRowJsonCoder from apache_beam.io.gcp.bigquery import WriteToBigQuery from apache_beam.io.gcp.bigquery import _StreamToBigQuery -from apache_beam.io.gcp.bigquery_file_loads_test import _ELEMENTS from apache_beam.io.gcp.bigquery_read_internal import _JsonToDictCoder from apache_beam.io.gcp.bigquery_read_internal import bigquery_export_destination_uri from apache_beam.io.gcp.bigquery_tools import JSON_COMPLIANCE_ERROR from apache_beam.io.gcp.bigquery_tools import BigQueryWrapper from apache_beam.io.gcp.bigquery_tools import RetryStrategy from apache_beam.io.gcp.internal.clients import bigquery -from apache_beam.io.gcp.internal.clients.bigquery import bigquery_v2_client from apache_beam.io.gcp.pubsub import ReadFromPubSub from apache_beam.io.gcp.tests import utils from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryFullResultMatcher @@ -82,6 +80,7 @@ # pylint: disable=wrong-import-order, wrong-import-position try: + from apache_beam.io.gcp.internal.clients.bigquery import bigquery_v2_client from apitools.base.py.exceptions import HttpError from google.cloud import bigquery as gcp_bigquery from google.api_core import exceptions @@ -93,6 +92,42 @@ _LOGGER = logging.getLogger(__name__) +_ELEMENTS = [ + { + 'name': 'beam', 'language': 'py' + }, + { + 'name': 'beam', 'language': 'java' + }, + { + 'name': 'beam', 'language': 'go' + }, + { + 'name': 'flink', 'language': 'java' + }, + { + 'name': 'flink', 'language': 'scala' + }, + { + 'name': 'spark', 'language': 'scala' + }, + { + 'name': 'spark', 
'language': 'py' + }, + { + 'name': 'spark', 'language': 'scala' + }, + { + 'name': 'beam', 'foundation': 'apache' + }, + { + 'name': 'flink', 'foundation': 'apache' + }, + { + 'name': 'spark', 'foundation': 'apache' + }, +] + def _load_or_default(filename): try: diff --git a/sdks/python/apache_beam/io/gcp/bigquery_tools.py b/sdks/python/apache_beam/io/gcp/bigquery_tools.py index 2f9420795288f..57acc320e838c 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_tools.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_tools.py @@ -1607,7 +1607,7 @@ def get_table_schema_from_string(schema): bigquery_v2_messages.TableSchema` instance. Args: - schema (str): The sting schema to be used if the BigQuery table to write + schema (str): The string schema to be used if the BigQuery table to write has to be created. Returns: diff --git a/sdks/python/apache_beam/io/gcp/bigquery_tools_test.py b/sdks/python/apache_beam/io/gcp/bigquery_tools_test.py index b4c84d589c07d..0383db7925f28 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_tools_test.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_tools_test.py @@ -263,8 +263,14 @@ def test_temporary_dataset_is_unique(self, patched_time_sleep): wrapper.create_temporary_dataset('project-id', 'location') self.assertTrue(client.datasets.Get.called) + @mock.patch( + 'apache_beam.io.gcp.bigquery_tools.gcp_bigquery', + return_value=mock.Mock()) + @mock.patch( + 'apitools.base.py.base_api._SkipGetCredentials', return_value=True) @mock.patch('time.sleep', return_value=None) - def test_user_agent_passed(self, sleep_mock): + def test_user_agent_passed( + self, sleep_mock, skip_get_credentials_mock, gcp_bigquery_mock): try: wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper() except: # pylint: disable=bare-except diff --git a/sdks/python/apache_beam/io/gcp/bigtableio_it_test.py b/sdks/python/apache_beam/io/gcp/bigtableio_it_test.py index f61e346cff9f8..867dca9a5e7eb 100644 --- a/sdks/python/apache_beam/io/gcp/bigtableio_it_test.py +++ b/sdks/python/apache_beam/io/gcp/bigtableio_it_test.py @@ -49,6 +49,13 @@ HttpError = None +def instance_prefix(instance): + datestr = "".join(filter(str.isdigit, str(datetime.utcnow().date()))) + instance_id = '%s-%s-%s' % (instance, datestr, secrets.token_hex(4)) + assert len(instance_id) < 34, "instance id length needs to be within [6, 33]" + return instance_id + + @pytest.mark.uses_gcp_java_expansion_service @pytest.mark.uses_transform_service @unittest.skipUnless( @@ -65,8 +72,7 @@ def setUp(self): self.project = self.test_pipeline.get_option('project') self.expansion_service = ('localhost:%s' % os.environ.get('EXPANSION_PORT')) - instance_id = '%s-%s-%s' % ( - self.INSTANCE, str(int(time.time())), secrets.token_hex(3)) + instance_id = instance_prefix(self.INSTANCE) self.client = client.Client(admin=True, project=self.project) # create cluster and instance @@ -96,7 +102,7 @@ def tearDown(self): self.table.delete() self.instance.delete() except HttpError: - _LOGGER.debug( + _LOGGER.warning( "Failed to clean up table [%s] and instance [%s]", self.table.table_id, self.instance.instance_id) @@ -160,8 +166,7 @@ def setUpClass(cls): cls.args = cls.test_pipeline.get_full_options_as_args() cls.expansion_service = ('localhost:%s' % os.environ.get('EXPANSION_PORT')) - instance_id = '%s-%s-%s' % ( - cls.INSTANCE, str(int(time.time())), secrets.token_hex(3)) + instance_id = instance_prefix(cls.INSTANCE) cls.client = client.Client(admin=True, project=cls.project) # create cluster and instance @@ -190,7 +195,7 @@ def tearDown(self): 
_LOGGER.info("Deleting table [%s]", self.table.table_id) self.table.delete() except HttpError: - _LOGGER.debug("Failed to clean up table [%s]", self.table.table_id) + _LOGGER.warning("Failed to clean up table [%s]", self.table.table_id) @classmethod def tearDownClass(cls): @@ -198,7 +203,7 @@ def tearDownClass(cls): _LOGGER.info("Deleting instance [%s]", cls.instance.instance_id) cls.instance.delete() except HttpError: - _LOGGER.debug( + _LOGGER.warning( "Failed to clean up instance [%s]", cls.instance.instance_id) def run_pipeline(self, rows): diff --git a/sdks/python/apache_beam/io/hdfs_integration_test/Dockerfile b/sdks/python/apache_beam/io/hdfs_integration_test/Dockerfile index 487d5c3487aba..ab79405633941 100644 --- a/sdks/python/apache_beam/io/hdfs_integration_test/Dockerfile +++ b/sdks/python/apache_beam/io/hdfs_integration_test/Dockerfile @@ -30,7 +30,7 @@ COPY sdks/python /app/sdks/python COPY model /app/model # This step should look like setupVirtualenv minus virtualenv creation. -RUN pip install --no-cache-dir tox -r sdks/python/build-requirements.txt +RUN pip install --no-cache-dir tox # Run wordcount, and write results to HDFS. CMD cd sdks/python && tox -e hdfs_integration_test diff --git a/sdks/python/apache_beam/io/hdfs_integration_test/docker-compose.yml b/sdks/python/apache_beam/io/hdfs_integration_test/docker-compose.yml index fed64d1e8ba9b..41410028ca34c 100644 --- a/sdks/python/apache_beam/io/hdfs_integration_test/docker-compose.yml +++ b/sdks/python/apache_beam/io/hdfs_integration_test/docker-compose.yml @@ -59,3 +59,6 @@ services: networks: test_net: + driver: bridge + driver_opts: + com.docker.network.driver.mtu: 1460 diff --git a/sdks/python/apache_beam/io/parquetio.py b/sdks/python/apache_beam/io/parquetio.py index 734dfa0bfe8e0..4696e5ae79277 100644 --- a/sdks/python/apache_beam/io/parquetio.py +++ b/sdks/python/apache_beam/io/parquetio.py @@ -31,27 +31,34 @@ # pytype: skip-file from functools import partial +from typing import Iterator from packaging import version from apache_beam.io import filebasedsink from apache_beam.io import filebasedsource from apache_beam.io.filesystem import CompressionTypes +from apache_beam.io.filesystems import FileSystems from apache_beam.io.iobase import RangeTracker from apache_beam.io.iobase import Read from apache_beam.io.iobase import Write +from apache_beam.portability.api import schema_pb2 from apache_beam.transforms import DoFn from apache_beam.transforms import ParDo from apache_beam.transforms import PTransform from apache_beam.transforms import window +from apache_beam.typehints import schemas try: import pyarrow as pa import pyarrow.parquet as pq + # pylint: disable=ungrouped-imports + from apache_beam.typehints import arrow_type_compatibility except ImportError: pa = None pq = None ARROW_MAJOR_VERSION = None + arrow_type_compatibility = None else: base_pa_version = version.parse(pa.__version__).base_version ARROW_MAJOR_VERSION, _, _ = map(int, base_pa_version.split('.')) @@ -146,6 +153,24 @@ def _flush_buffer(self): self._record_batches_byte_size = self._record_batches_byte_size + size +class _ArrowTableToBeamRows(DoFn): + def __init__(self, beam_type): + self._beam_type = beam_type + + @DoFn.yields_batches + def process(self, element) -> Iterator[pa.Table]: + yield element + + def infer_output_type(self, input_type): + return self._beam_type + + +class _BeamRowsToArrowTable(DoFn): + @DoFn.yields_elements + def process_batch(self, element: pa.Table) -> Iterator[pa.Table]: + yield element + + class 
ReadFromParquetBatched(PTransform): """A :class:`~apache_beam.transforms.ptransform.PTransform` for reading Parquet files as a `PCollection` of `pyarrow.Table`. This `PTransform` is @@ -191,7 +216,7 @@ def __init__( """ super().__init__() - self._source = _create_parquet_source( + self._source = _ParquetSource( file_pattern, min_bundle_size, validate=validate, @@ -210,7 +235,12 @@ class ReadFromParquet(PTransform): Parquet files as a `PCollection` of dictionaries. This `PTransform` is currently experimental. No backward-compatibility guarantees.""" def __init__( - self, file_pattern=None, min_bundle_size=0, validate=True, columns=None): + self, + file_pattern=None, + min_bundle_size=0, + validate=True, + columns=None, + as_rows=False): """Initializes :class:`ReadFromParquet`. Uses source ``_ParquetSource`` to read a set of Parquet files defined by @@ -255,17 +285,38 @@ def __init__( columns (List[str]): list of columns that will be read from files. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e' + as_rows (bool): whether to output a schema'd PCollection of Beam rows + rather than Python dictionaries. """ super().__init__() - self._source = _create_parquet_source( + self._source = _ParquetSource( file_pattern, min_bundle_size, validate=validate, columns=columns, ) + if as_rows: + if columns is None: + filter_schema = lambda schema: schema + else: + top_level_columns = set(c.split('.')[0] for c in columns) + filter_schema = lambda schema: schema_pb2.Schema( + fields=[f for f in schema.fields if f.name in top_level_columns]) + path = FileSystems.match([file_pattern], [1])[0].metadata_list[0].path + with FileSystems.open(path) as fin: + self._schema = filter_schema( + arrow_type_compatibility.beam_schema_from_arrow_schema( + pq.read_schema(fin))) + else: + self._schema = None def expand(self, pvalue): - return pvalue | Read(self._source) | ParDo(_ArrowTableToRowDictionaries()) + arrow_batches = pvalue | Read(self._source) + if self._schema is None: + return arrow_batches | ParDo(_ArrowTableToRowDictionaries()) + else: + return arrow_batches | ParDo( + _ArrowTableToBeamRows(schemas.named_tuple_from_schema(self._schema))) def display_data(self): return {'source_dd': self._source} @@ -305,9 +356,7 @@ def __init__( """ super().__init__() source_from_file = partial( - _create_parquet_source, - min_bundle_size=min_bundle_size, - columns=columns) + _ParquetSource, min_bundle_size=min_bundle_size, columns=columns) self._read_all_files = filebasedsource.ReadAllFiles( True, CompressionTypes.UNCOMPRESSED, @@ -333,17 +382,6 @@ def expand(self, pvalue): _ArrowTableToRowDictionaries(), with_filename=self._with_filename) -def _create_parquet_source( - file_pattern=None, min_bundle_size=0, validate=False, columns=None): - return \ - _ParquetSource( - file_pattern=file_pattern, - min_bundle_size=min_bundle_size, - validate=validate, - columns=columns, - ) - - class _ParquetUtils(object): @staticmethod def find_first_row_group_index(pf, start_offset): @@ -370,7 +408,8 @@ def get_number_of_row_groups(pf): class _ParquetSource(filebasedsource.FileBasedSource): """A source for reading Parquet files. 
""" - def __init__(self, file_pattern, min_bundle_size, validate, columns): + def __init__( + self, file_pattern, min_bundle_size=0, validate=False, columns=None): super().__init__( file_pattern=file_pattern, min_bundle_size=min_bundle_size, @@ -421,6 +460,9 @@ def split_points_unclaimed(stop_position): yield table +_create_parquet_source = _ParquetSource + + class WriteToParquet(PTransform): """A ``PTransform`` for writing parquet files. @@ -430,7 +472,7 @@ class WriteToParquet(PTransform): def __init__( self, file_path_prefix, - schema, + schema=None, row_group_buffer_size=64 * 1024 * 1024, record_batch_size=1000, codec='none', @@ -534,10 +576,19 @@ def __init__( ) def expand(self, pcoll): - return pcoll | ParDo( - _RowDictionariesToArrowTable( - self._schema, self._row_group_buffer_size, - self._record_batch_size)) | Write(self._sink) + if self._schema is None: + try: + beam_schema = schemas.schema_from_element_type(pcoll.element_type) + except TypeError as exn: + raise ValueError( + "A schema is required to write non-schema'd data.") from exn + self._sink._schema = ( + arrow_type_compatibility.arrow_schema_from_beam_schema(beam_schema)) + convert_fn = _BeamRowsToArrowTable() + else: + convert_fn = _RowDictionariesToArrowTable( + self._schema, self._row_group_buffer_size, self._record_batch_size) + return pcoll | ParDo(convert_fn) | Write(self._sink) def display_data(self): return { diff --git a/sdks/python/apache_beam/io/parquetio_test.py b/sdks/python/apache_beam/io/parquetio_test.py index df018a3a776f7..1cd5f1208cc2a 100644 --- a/sdks/python/apache_beam/io/parquetio_test.py +++ b/sdks/python/apache_beam/io/parquetio_test.py @@ -30,6 +30,7 @@ from parameterized import param from parameterized import parameterized +import apache_beam as beam from apache_beam import Create from apache_beam import Map from apache_beam.io import filebasedsource @@ -400,6 +401,21 @@ def test_sink_transform_compliant_nested_type(self): assert_that( readback, equal_to([json.dumps(r) for r in self.RECORDS_NESTED])) + def test_schema_read_write(self): + with TemporaryDirectory() as tmp_dirname: + path = os.path.join(tmp_dirname, 'tmp_filename') + rows = [beam.Row(a=1, b='x'), beam.Row(a=2, b='y')] + stable_repr = lambda row: json.dumps(row._asdict()) + with TestPipeline() as p: + _ = p | Create(rows) | WriteToParquet(path) | beam.Map(print) + with TestPipeline() as p: + # json used for stable sortability + readback = ( + p + | ReadFromParquet(path + '*', as_rows=True) + | Map(stable_repr)) + assert_that(readback, equal_to([stable_repr(r) for r in rows])) + def test_batched_read(self): with TemporaryDirectory() as tmp_dirname: path = os.path.join(tmp_dirname + "tmp_filename") diff --git a/sdks/python/apache_beam/ml/gcp/naturallanguageml_test.py b/sdks/python/apache_beam/ml/gcp/naturallanguageml_test.py index bad7443d0d944..891726cb2688e 100644 --- a/sdks/python/apache_beam/ml/gcp/naturallanguageml_test.py +++ b/sdks/python/apache_beam/ml/gcp/naturallanguageml_test.py @@ -20,11 +20,7 @@ import unittest -import mock - -import apache_beam as beam from apache_beam.metrics import MetricsFilter -from apache_beam.testing.test_pipeline import TestPipeline # Protect against environments where Google Cloud Natural Language client # is not available. 
@@ -60,21 +56,6 @@ def test_document_source(self): self.assertFalse('content' in dict_) self.assertTrue('gcs_content_uri' in dict_) - def test_annotate_test_called(self): - with mock.patch('apache_beam.ml.gcp.naturallanguageml._AnnotateTextFn' - '._get_api_client'): - p = TestPipeline() - features = [ - naturallanguageml.language_v1.AnnotateTextRequest.Features( - extract_syntax=True) - ] - _ = ( - p | beam.Create([naturallanguageml.Document('Hello, world!')]) - | naturallanguageml.AnnotateText(features)) - result = p.run() - result.wait_until_finish() - self.assertCounterEqual(result, 'api_calls', 1) - if __name__ == '__main__': unittest.main() diff --git a/sdks/python/apache_beam/ml/gcp/recommendations_ai_test_it.py b/sdks/python/apache_beam/ml/gcp/recommendations_ai_test_it.py index 0ae68a298421e..9f739de7883d5 100644 --- a/sdks/python/apache_beam/ml/gcp/recommendations_ai_test_it.py +++ b/sdks/python/apache_beam/ml/gcp/recommendations_ai_test_it.py @@ -70,9 +70,12 @@ def extract_prediction(response): recommendationengine is None, "Recommendations AI dependencies not installed.") class RecommendationAIIT(unittest.TestCase): + test_ran = False + def test_create_catalog_item(self): with TestPipeline(is_integration_test=True) as p: + RecommendationAIIT.test_ran = True output = ( p | 'Create data' >> beam.Create([CATALOG_ITEM]) | 'Create CatalogItem' >> @@ -85,6 +88,7 @@ def test_create_user_event(self): USER_EVENT = {"event_type": "page-visit", "user_info": {"visitor_id": "1"}} with TestPipeline(is_integration_test=True) as p: + RecommendationAIIT.test_ran = True output = ( p | 'Create data' >> beam.Create([USER_EVENT]) | 'Create UserEvent' >> recommendations_ai.WriteUserEvent(project=GCP_TEST_PROJECT) @@ -96,6 +100,7 @@ def test_predict(self): USER_EVENT = {"event_type": "page-visit", "user_info": {"visitor_id": "1"}} with TestPipeline(is_integration_test=True) as p: + RecommendationAIIT.test_ran = True output = ( p | 'Create data' >> beam.Create([USER_EVENT]) | 'Predict UserEvent' >> recommendations_ai.PredictUserEvent( @@ -106,6 +111,9 @@ def test_predict(self): @classmethod def tearDownClass(cls): + if not cls.test_ran: + raise unittest.SkipTest('all tests skipped') + client = recommendationengine.CatalogServiceClient() parent = ( f'projects/{GCP_TEST_PROJECT}/locations/' diff --git a/sdks/python/apache_beam/ml/inference/base.py b/sdks/python/apache_beam/ml/inference/base.py index 90d43cfddb940..fc8ac59a1fb74 100644 --- a/sdks/python/apache_beam/ml/inference/base.py +++ b/sdks/python/apache_beam/ml/inference/base.py @@ -482,6 +482,12 @@ def __init__( from the cohort. When model updates occur, the metrics will be reported in the form `--`. + + Loading multiple models at the same time can increase the risk of an out of + memory (OOM) exception. To avoid this issue, use the parameter + `max_models_per_worker_hint` to limit the number of models that are loaded + at the same time. For more information about memory management, see + `Use a keyed ModelHandler <https://beam.apache.org/documentation/ml/about-ml/#use-a-keyed-modelhandler>`_. # pylint: disable=line-too-long + Args: unkeyed: Either (a) an implementation of ModelHandler that does not @@ -491,7 +497,8 @@ models can be held in memory at one time per worker process. For example, if your worker has 8 GB of memory provisioned and your workers take up 1 GB each, you should set this to 7 to allow all models to sit - in memory with some buffer. + in memory with some buffer. For more information about memory management, + see `Use a keyed ModelHandler <https://beam.apache.org/documentation/ml/about-ml/#use-a-keyed-modelhandler>`_. 
# pylint: disable=line-too-long """ self._metrics_collectors: Dict[str, _MetricsCollector] = {} self._default_metrics_collector: _MetricsCollector = None @@ -505,7 +512,7 @@ def __init__( 'postprocessing functions defined into a keyed model handler. All ' 'pre/postprocessing functions must be defined on the outer model' 'handler.') - self._env_vars = unkeyed._env_vars + self._env_vars = getattr(unkeyed, '_env_vars', {}) self._unkeyed = unkeyed return @@ -546,7 +553,7 @@ def __init__( 'overriding the KeyedModelHandler.batch_elements_kwargs() method.', hints, batch_kwargs) - env_vars = mh._env_vars + env_vars = getattr(mh, '_env_vars', {}) if len(env_vars) > 0: logging.warning( 'mh %s defines the following _env_vars which will be ignored %s. ' @@ -809,7 +816,7 @@ def __init__(self, unkeyed: ModelHandler[ExampleT, PredictionT, ModelT]): 'pre/postprocessing functions must be defined on the outer model' 'handler.') self._unkeyed = unkeyed - self._env_vars = unkeyed._env_vars + self._env_vars = getattr(unkeyed, '_env_vars', {}) def load_model(self) -> ModelT: return self._unkeyed.load_model() @@ -888,7 +895,7 @@ def __init__( preprocess_fn: the preprocessing function to use. """ self._base = base - self._env_vars = base._env_vars + self._env_vars = getattr(base, '_env_vars', {}) self._preprocess_fn = preprocess_fn def load_model(self) -> ModelT: @@ -944,7 +951,7 @@ def __init__( postprocess_fn: the preprocessing function to use. """ self._base = base - self._env_vars = base._env_vars + self._env_vars = getattr(base, '_env_vars', {}) self._postprocess_fn = postprocess_fn def load_model(self) -> ModelT: @@ -1025,7 +1032,6 @@ def __init__( self._clock = clock self._metrics_namespace = metrics_namespace self._model_metadata_pcoll = model_metadata_pcoll - self._enable_side_input_loading = self._model_metadata_pcoll is not None self._with_exception_handling = False self._watch_model_pattern = watch_model_pattern self._kwargs = kwargs @@ -1126,12 +1132,12 @@ def expand( self._model_handler, self._clock, self._metrics_namespace, - self._enable_side_input_loading, + self._model_metadata_pcoll is not None, self._model_tag), self._inference_args, beam.pvalue.AsSingleton( self._model_metadata_pcoll, - ) if self._enable_side_input_loading else None).with_resource_hints( + ) if self._model_metadata_pcoll else None).with_resource_hints( **resource_hints) if self._with_exception_handling: diff --git a/sdks/python/apache_beam/ml/inference/base_test.py b/sdks/python/apache_beam/ml/inference/base_test.py index 1b1a7393872cc..7075810ff0f09 100644 --- a/sdks/python/apache_beam/ml/inference/base_test.py +++ b/sdks/python/apache_beam/ml/inference/base_test.py @@ -1513,6 +1513,28 @@ def test_model_manager_evicts_correct_num_of_models_after_being_incremented( mh3.load_model, tag=tag3).acquire() self.assertEqual(8, model3.predict(10)) + def test_run_inference_watch_file_pattern_side_input_label(self): + pipeline = TestPipeline() + # label of the WatchPattern transform. 
+ side_input_str = 'WatchFilePattern/ApplyGlobalWindow' + from apache_beam.ml.inference.utils import WatchFilePattern + file_pattern_side_input = ( + pipeline + | 'WatchFilePattern' >> WatchFilePattern(file_pattern='fake/path/*')) + pcoll = pipeline | 'start' >> beam.Create([1, 2, 3]) + result_pcoll = pcoll | base.RunInference( + FakeModelHandler(), model_metadata_pcoll=file_pattern_side_input) + assert side_input_str in str(result_pcoll.producer.side_inputs[0]) + + def test_run_inference_watch_file_pattern_keyword_arg_side_input_label(self): + # label of the WatchPattern transform. + side_input_str = 'WatchFilePattern/ApplyGlobalWindow' + pipeline = TestPipeline() + pcoll = pipeline | 'start' >> beam.Create([1, 2, 3]) + result_pcoll = pcoll | base.RunInference( + FakeModelHandler(), watch_model_pattern='fake/path/*') + assert side_input_str in str(result_pcoll.producer.side_inputs[0]) + + if __name__ == '__main__': unittest.main() diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index 3ec063808ae32..878d7bfc9cf27 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -573,6 +573,7 @@ def __init__( task: Union[str, PipelineTask] = "", model: str = "", *, + device: Optional[str] = None, inference_fn: PipelineInferenceFn = _default_pipeline_inference_fn, load_pipeline_args: Optional[Dict[str, Any]] = None, inference_args: Optional[Dict[str, Any]] = None, @@ -583,10 +584,6 @@ def __init__( """ Implementation of the ModelHandler interface for Hugging Face Pipelines. - **Note:** To specify which device to use (CPU/GPU), - use the load_pipeline_args with key-value as you would do in the usual - Hugging Face pipeline. Ex: load_pipeline_args={'device':0}) - Example Usage model:: pcoll | RunInference(HuggingFacePipelineModelHandler( task="fill-mask")) @@ -606,6 +603,11 @@ def __init__( task="text-generation", model="meta-llama/Llama-2-7b-hf", load_pipeline_args={'model_kwargs':{'quantization_map':config}}) + device (str): the device (`"CPU"` or `"GPU"`) on which you wish to run + the pipeline. Defaults to GPU. If GPU is not available then it falls + back to CPU. You can also use advanced options like `device_map` with + a key-value pair as you would do in the usual Hugging Face pipeline using + `load_pipeline_args`. Ex: load_pipeline_args={'device_map': 'auto'}). + inference_fn: the inference function to use during RunInference. Default is _default_pipeline_inference_fn. load_pipeline_args (Dict[str, Any]): keyword arguments to provide load @@ -638,8 +640,36 @@ def __init__( if max_batch_size is not None: self._batching_kwargs['max_batch_size'] = max_batch_size self._large_model = large_model + + # Check if the device is specified twice; specifying it both here and in + # `load_pipeline_args` is rejected as ambiguous. + self._deduplicate_device_value(device) _validate_constructor_args_hf_pipeline(self._task, self._model) + def _deduplicate_device_value(self, device: Optional[str]): + current_device = device.upper() if device else None + if (current_device and current_device != 'CPU' and current_device != 'GPU'): + raise ValueError( + f"Invalid device value: {device}. Please specify " + "either CPU or GPU. 
Defaults to GPU if no value " + "is provided.") + if 'device' not in self._load_pipeline_args: + if current_device == 'CPU': + self._load_pipeline_args['device'] = 'cpu' + else: + if is_gpu_available_torch(): + self._load_pipeline_args['device'] = 'cuda:0' + else: + _LOGGER.warning( + "HuggingFaceModelHandler specified a 'GPU' device, " + "but GPUs are not available. Switching to CPU.") + self._load_pipeline_args['device'] = 'cpu' + else: + if current_device: + raise ValueError( + '`device` was specified in both `load_pipeline_args` and the ' + '`device` parameter. Please specify the device in only one place.') + def load_model(self): """Loads and initializes the pipeline for processing.""" return pipeline( diff --git a/sdks/python/apache_beam/ml/inference/vertex_ai_inference_it_test.py b/sdks/python/apache_beam/ml/inference/vertex_ai_inference_it_test.py index 168ab031abb18..7c96dbe8b8471 100644 --- a/sdks/python/apache_beam/ml/inference/vertex_ai_inference_it_test.py +++ b/sdks/python/apache_beam/ml/inference/vertex_ai_inference_it_test.py @@ -65,8 +65,7 @@ def test_vertex_ai_run_flower_image_classification(self): test_pipeline.get_full_options_as_args(**extra_opts)) self.assertEqual(FileSystems().exists(output_file), True) - @pytest.mark.uses_vertex_ai - @pytest.mark.it_postcommit + @pytest.mark.vertex_ai_postcommit def test_vertex_ai_run_llm_text_classification(self): output_file = '/'.join([_OUTPUT_DIR, str(uuid.uuid4()), 'output.txt']) diff --git a/sdks/python/apache_beam/ml/transforms/base.py b/sdks/python/apache_beam/ml/transforms/base.py index a45928f5c8bfb..b3a30bb5f1256 100644 --- a/sdks/python/apache_beam/ml/transforms/base.py +++ b/sdks/python/apache_beam/ml/transforms/base.py @@ -25,6 +25,7 @@ from typing import TypeVar import apache_beam as beam +from apache_beam.metrics.metric import Metrics __all__ = ['MLTransform', 'ProcessHandler', 'BaseOperation'] @@ -32,8 +33,8 @@ TransformedMetadataT = TypeVar('TransformedMetadataT') # Input/Output types to the MLTransform. -ExampleT = TypeVar('ExampleT') MLTransformOutputT = TypeVar('MLTransformOutputT') +ExampleT = TypeVar('ExampleT') # Input to the apply() method of BaseOperation. OperationInputT = TypeVar('OperationInputT') @@ -66,16 +67,6 @@ def apply_transform(self, data: OperationInputT, inputs: input data. """ - @abc.abstractmethod - def get_artifacts( - self, data: OperationInputT, - output_column_prefix: str) -> Optional[Dict[str, OperationOutputT]]: - """ - If the operation generates any artifacts, they can be returned from this - method. - """ - pass - def __call__(self, data: OperationInputT, output_column_name: str) -> Dict[str, OperationOutputT]: """ @@ -83,11 +74,15 @@ def __call__(self, data: OperationInputT, This method will invoke the apply() method of the class. """ transformed_data = self.apply_transform(data, output_column_name) - artifacts = self.get_artifacts(data, output_column_name) - if artifacts: - transformed_data = {**transformed_data, **artifacts} return transformed_data + def get_counter(self): + """ + Returns a metrics counter for the operation. 
+ """ + counter_name = self.__class__.__name__ + return Metrics.counter(MLTransform, f'BeamML_{counter_name}') + class ProcessHandler(Generic[ExampleT, MLTransformOutputT], abc.ABC): """ @@ -194,6 +189,9 @@ def __init__( transforms=transforms) # type: ignore[arg-type] self._process_handler = process_handler + self.transforms = transforms + self._counter = Metrics.counter( + MLTransform, f'BeamML_{self.__class__.__name__}') def expand( self, pcoll: beam.PCollection[ExampleT] @@ -209,8 +207,11 @@ def expand( Args: pcoll: A PCollection of ExampleT type. Returns: - A PCollection of MLTransformOutputT type. + A PCollection of MLTransformOutputT type """ + _ = ( + pcoll.pipeline + | "MLTransformMetricsUsage" >> MLTransformMetricsUsage(self)) return self._process_handler.process_data(pcoll) def with_transform(self, transform: BaseOperation): @@ -230,3 +231,26 @@ def _validate_transform(self, transform): raise TypeError( 'transform must be a subclass of BaseOperation. ' 'Got: %s instead.' % type(transform)) + + +class MLTransformMetricsUsage(beam.PTransform): + def __init__(self, ml_transform: MLTransform): + self._ml_transform = ml_transform + self._ml_transform._counter.inc() + + def expand(self, pipeline): + def _increment_counters(): + # increment for MLTransform. + self._ml_transform._counter.inc() + # increment if data processing transforms are passed. + transforms = ( + self._ml_transform.transforms or + self._ml_transform._process_handler.transforms) + if transforms: + for transform in transforms: + transform.get_counter().inc() + + _ = ( + pipeline + | beam.Create([None]) + | beam.Map(lambda _: _increment_counters())) diff --git a/sdks/python/apache_beam/ml/transforms/base_test.py b/sdks/python/apache_beam/ml/transforms/base_test.py index df7a6d26b47c7..2e447964541ba 100644 --- a/sdks/python/apache_beam/ml/transforms/base_test.py +++ b/sdks/python/apache_beam/ml/transforms/base_test.py @@ -27,6 +27,7 @@ from parameterized import parameterized import apache_beam as beam +from apache_beam.metrics.metric import MetricsFilter from apache_beam.testing.util import assert_that from apache_beam.testing.util import equal_to @@ -244,6 +245,30 @@ def test_ml_transforms_on_multiple_columns_multiple_transforms(self): equal_to(expected_output_y, equals_fn=np.array_equal), label='actual_output_y') + def test_mltransform_with_counter(self): + transforms = [ + tft.ComputeAndApplyVocabulary(columns=['y']), + tft.ScaleTo01(columns=['x']) + ] + data = [{'x': [1, 2, 3], 'y': ['a', 'b', 'c']}] + with beam.Pipeline() as p: + _ = ( + p | beam.Create(data) + | base.MLTransform( + transforms=transforms, + write_artifact_location=self.artifact_location)) + scale_to_01_counter = MetricsFilter().with_name('BeamML_ScaleTo01') + vocab_counter = MetricsFilter().with_name( + 'BeamML_ComputeAndApplyVocabulary') + mltransform_counter = MetricsFilter().with_name('BeamML_MLTransform') + result = p.result + self.assertEqual( + result.metrics().query(scale_to_01_counter)['counters'][0].result, 1) + self.assertEqual( + result.metrics().query(vocab_counter)['counters'][0].result, 1) + self.assertEqual( + result.metrics().query(mltransform_counter)['counters'][0].result, 1) + if __name__ == '__main__': unittest.main() diff --git a/sdks/python/apache_beam/ml/transforms/handlers_test.py b/sdks/python/apache_beam/ml/transforms/handlers_test.py index 3342ec76cae59..327c8c76c0e9f 100644 --- a/sdks/python/apache_beam/ml/transforms/handlers_test.py +++ b/sdks/python/apache_beam/ml/transforms/handlers_test.py @@ -58,14 +58,6 @@ def 
apply_transform(self, inputs, output_column_name, **kwargs): return {output_column_name: inputs * 10} -class _FakeOperationWithArtifacts(TFTOperation): - def apply_transform(self, inputs, output_column_name, **kwargs): - return {output_column_name: inputs} - - def get_artifacts(self, data, col_name): - return {'artifact': tf.convert_to_tensor([1])} - - class IntType(NamedTuple): x: int @@ -106,16 +98,6 @@ def test_tft_operation_preprocessing_fn( actual_result = process_handler.process_data_fn(inputs) self.assertDictEqual(actual_result, expected_result) - def test_preprocessing_fn_with_artifacts(self): - process_handler = handlers.TFTProcessHandler( - transforms=[_FakeOperationWithArtifacts(columns=['x'])], - artifact_location=self.artifact_location) - inputs = {'x': [1, 2, 3]} - preprocessing_fn = process_handler.process_data_fn - actual_result = preprocessing_fn(inputs) - expected_result = {'x': [1, 2, 3], 'artifact': tf.convert_to_tensor([1])} - self.assertDictEqual(actual_result, expected_result) - def test_input_type_from_schema_named_tuple_pcoll(self): data = [{'x': 1}] with beam.Pipeline() as p: diff --git a/sdks/python/apache_beam/ml/transforms/tft.py b/sdks/python/apache_beam/ml/transforms/tft.py index 1d492642cd60e..c7b8ff0153247 100644 --- a/sdks/python/apache_beam/ml/transforms/tft.py +++ b/sdks/python/apache_beam/ml/transforms/tft.py @@ -45,9 +45,7 @@ import tensorflow as tf import tensorflow_transform as tft from apache_beam.ml.transforms.base import BaseOperation -from tensorflow_transform import analyzers from tensorflow_transform import common_types -from tensorflow_transform import tf_utils __all__ = [ 'ComputeAndApplyVocabulary', @@ -77,6 +75,8 @@ def wrapper(fn): return wrapper +# TODO: https://github.com/apache/beam/pull/29016 +# Add support for outputting artifacts to a text file in human readable form. class TFTOperation(BaseOperation[common_types.TensorType, common_types.TensorType]): def __init__(self, columns: List[str]) -> None: @@ -95,13 +95,6 @@ def __init__(self, columns: List[str]) -> None: "Columns are not specified. Please specify the column for the " " op %s" % self.__class__.__name__) - def get_artifacts(self, data: common_types.TensorType, - col_name: str) -> Dict[str, common_types.TensorType]: - """ - Returns the artifacts generated by the operation. 
- """ - return {} - @tf.function def _split_string_with_delimiter(self, data, delimiter): """ @@ -240,15 +233,6 @@ def apply_transform( } return output_dict - def get_artifacts(self, data: common_types.TensorType, - col_name: str) -> Dict[str, common_types.TensorType]: - mean_var = tft.analyzers._mean_and_var(data) - shape = [tf.shape(data)[0], 1] - return { - col_name + '_mean': tf.broadcast_to(mean_var[0], shape), - col_name + '_var': tf.broadcast_to(mean_var[1], shape), - } - @register_input_dtype(float) class ScaleTo01(TFTOperation): @@ -280,14 +264,6 @@ def __init__( self.elementwise = elementwise self.name = name - def get_artifacts(self, data: common_types.TensorType, - col_name: str) -> Dict[str, common_types.TensorType]: - shape = [tf.shape(data)[0], 1] - return { - col_name + '_min': tf.broadcast_to(tft.min(data), shape), - col_name + '_max': tf.broadcast_to(tft.max(data), shape) - } - def apply_transform( self, data: common_types.TensorType, output_column_name: str) -> Dict[str, common_types.TensorType]: @@ -368,34 +344,6 @@ def __init__( self.elementwise = elementwise self.name = name - def get_artifacts(self, data: common_types.TensorType, - col_name: str) -> Dict[str, common_types.TensorType]: - num_buckets = self.num_buckets - epsilon = self.epsilon - elementwise = self.elementwise - - if num_buckets < 1: - raise ValueError('Invalid num_buckets %d' % num_buckets) - - if isinstance(data, (tf.SparseTensor, tf.RaggedTensor)) and elementwise: - raise ValueError( - 'bucketize requires `x` to be dense if `elementwise=True`') - - x_values = tf_utils.get_values(data) - - if epsilon is None: - # See explanation in args documentation for epsilon. - epsilon = min(1.0 / num_buckets, 0.01) - - quantiles = analyzers.quantiles( - x_values, num_buckets, epsilon, reduce_instance_dims=not elementwise) - shape = [ - tf.shape(data)[0], num_buckets - 1 if num_buckets > 1 else num_buckets - ] - # These quantiles are used as the bucket boundaries in the later stages. - # Should we change the prefix _quantiles to _bucket_boundaries? - return {col_name + '_quantiles': tf.broadcast_to(quantiles, shape)} - def apply_transform( self, data: common_types.TensorType, output_column_name: str) -> Dict[str, common_types.TensorType]: @@ -572,6 +520,7 @@ def __init__( ngram_range: Tuple[int, int] = (1, 1), ngrams_separator: Optional[str] = None, compute_word_count: bool = False, + key_vocab_filename: str = 'key_vocab_mapping', name: Optional[str] = None, ): """ @@ -592,9 +541,9 @@ def __init__( n-gram sizes. seperator: A string that will be inserted between each ngram. compute_word_count: A boolean that specifies whether to compute - the unique word count and add it as an artifact to the output. - Note that the count will be computed over the entire dataset so - it will be the same value for all inputs. + the unique word count over the entire dataset. Defaults to False. + key_vocab_filename: The file name for the key vocabulary file when + compute_word_count is True. name: A name for the operation (optional). Note that original order of the input may not be preserved. 
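The word-count flow described in the BagOfWords docstring above, and implemented in the next hunk, can be sketched as follows (the artifact location is an illustrative assumption):

import apache_beam as beam
from apache_beam.ml.transforms import base
from apache_beam.ml.transforms import tft

data = [{'x': ['pie', 'pie', 'yum']}]
with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(data)
        | base.MLTransform(
            write_artifact_location='/tmp/artifacts',
            transforms=[
                tft.BagOfWords(
                    columns=['x'],
                    compute_word_count=True,
                    key_vocab_filename='my_vocab')
            ]))
# Instead of attaching counts to every output element, the counts are now
# written under <write_artifact_location>/transform_fn/assets/my_vocab,
# one "<count> <token>" entry per line.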
@@ -605,33 +554,26 @@ def __init__( self.ngrams_separator = ngrams_separator self.name = name self.split_string_by_delimiter = split_string_by_delimiter + self.key_vocab_filename = key_vocab_filename if compute_word_count: self.compute_word_count_fn = count_unqiue_words else: - self.compute_word_count_fn = lambda *args, **kwargs: {} + self.compute_word_count_fn = lambda *args, **kwargs: None if ngram_range != (1, 1) and not ngrams_separator: raise ValueError( 'ngrams_separator must be specified when ngram_range is not (1, 1)') - def get_artifacts(self, data: tf.SparseTensor, - col_name: str) -> Dict[str, tf.Tensor]: - return self.compute_word_count_fn(data, col_name) - def apply_transform(self, data: tf.SparseTensor, output_col_name: str): if self.split_string_by_delimiter: data = self._split_string_with_delimiter( data, self.split_string_by_delimiter) output = tft.bag_of_words( data, self.ngram_range, self.ngrams_separator, self.name) + # word counts are written to the key_vocab_filename + self.compute_word_count_fn(data, self.key_vocab_filename) return {output_col_name: output} -def count_unqiue_words(data: tf.SparseTensor, - output_col_name: str) -> Dict[str, tf.Tensor]: - keys, count = tft.count_per_key(data) - shape = [tf.shape(data)[0], tf.shape(keys)[0]] - return { - output_col_name + '_unique_elements': tf.broadcast_to(keys, shape), - output_col_name + '_counts': tf.broadcast_to(count, shape) - } +def count_unqiue_words(data: tf.SparseTensor, output_vocab_name: str) -> None: + tft.count_per_key(data, key_vocabulary_filename=output_vocab_name) diff --git a/sdks/python/apache_beam/ml/transforms/tft_test.py b/sdks/python/apache_beam/ml/transforms/tft_test.py index 41f59c868c3bd..38ded6a809af0 100644 --- a/sdks/python/apache_beam/ml/transforms/tft_test.py +++ b/sdks/python/apache_beam/ml/transforms/tft_test.py @@ -17,6 +17,7 @@ # pytype: skip-file +import os import shutil import tempfile import unittest @@ -38,31 +39,6 @@ if not tft: raise unittest.SkipTest('tensorflow_transform is not installed.') -z_score_expected = {'x_mean': 3.5, 'x_var': 2.9166666666666665} - - -def assert_z_score_artifacts(element): - element = element.as_dict() - assert 'x_mean' in element - assert 'x_var' in element - assert element['x_mean'] == z_score_expected['x_mean'] - assert element['x_var'] == z_score_expected['x_var'] - - -def assert_ScaleTo01_artifacts(element): - element = element.as_dict() - assert 'x_min' in element - assert 'x_max' in element - assert element['x_min'] == 1 - assert element['x_max'] == 6 - - -def assert_bucketize_artifacts(element): - element = element.as_dict() - assert 'x_quantiles' in element - assert np.array_equal( - element['x_quantiles'], np.array([3, 5], dtype=np.float32)) - class ScaleZScoreTest(unittest.TestCase): def setUp(self) -> None: @@ -100,7 +76,18 @@ def test_z_score(self): | "MLTransform" >> base.MLTransform( write_artifact_location=self.artifact_location).with_transform( tft.ScaleToZScore(columns=['x']))) - _ = (result | beam.Map(assert_z_score_artifacts)) + expected_data = [ + np.array([-1.46385], dtype=np.float32), + np.array([-0.87831], dtype=np.float32), + np.array([-0.29277], dtype=np.float32), + np.array([0.29277], dtype=np.float32), + np.array([0.87831], dtype=np.float32), + np.array([1.46385], dtype=np.float32), + ] + + actual_data = (result | beam.Map(lambda x: x.x)) + assert_that( + actual_data, equal_to(expected_data, equals_fn=np.array_equal)) def test_z_score_list_data(self): list_data = [{'x': [1, 2, 3]}, {'x': [4, 5, 6]}] @@ -111,7 +98,14 @@ def 
test_z_score_list_data(self): | "listMLTransform" >> base.MLTransform( write_artifact_location=self.artifact_location).with_transform( tft.ScaleToZScore(columns=['x']))) - _ = (list_result | beam.Map(assert_z_score_artifacts)) + + expected_data = [ + np.array([-1.46385, -0.87831, -0.29277], dtype=np.float32), + np.array([0.29277, 0.87831, 1.46385], dtype=np.float32) + ] + actual_data = (list_result | beam.Map(lambda x: x.x)) + assert_that( + actual_data, equal_to(expected_data, equals_fn=np.array_equal)) class ScaleTo01Test(unittest.TestCase): @@ -130,7 +124,6 @@ def test_ScaleTo01_list(self): | "MLTransform" >> base.MLTransform( write_artifact_location=self.artifact_location).with_transform( tft.ScaleTo01(columns=['x']))) - _ = (list_result | beam.Map(assert_ScaleTo01_artifacts)) expected_output = [ np.array([0, 0.2, 0.4], dtype=np.float32), @@ -150,7 +143,6 @@ def test_ScaleTo01(self): write_artifact_location=self.artifact_location).with_transform( tft.ScaleTo01(columns=['x']))) - _ = (result | beam.Map(assert_ScaleTo01_artifacts)) expected_output = ( np.array([0], dtype=np.float32), np.array([0.2], dtype=np.float32), @@ -179,7 +171,6 @@ def test_bucketize(self): | "MLTransform" >> base.MLTransform( write_artifact_location=self.artifact_location).with_transform( tft.Bucketize(columns=['x'], num_buckets=3))) - _ = (result | beam.Map(assert_bucketize_artifacts)) transformed_data = (result | beam.Map(lambda x: x.x)) expected_data = [ @@ -202,8 +193,6 @@ def test_bucketize_list(self): | "MLTransform" >> base.MLTransform( write_artifact_location=self.artifact_location).with_transform( tft.Bucketize(columns=['x'], num_buckets=3))) - _ = (list_result | beam.Map(assert_bucketize_artifacts)) - transformed_data = ( list_result | "TransformedColumnX" >> beam.Map(lambda ele: ele.x)) @@ -214,36 +203,6 @@ def test_bucketize_list(self): assert_that( transformed_data, equal_to(expected_data, equals_fn=np.array_equal)) - @parameterized.expand([ - (range(1, 10), [4, 7]), - (range(9, 0, -1), [4, 7]), - (range(19, 0, -1), [10]), - (range(1, 100), [25, 50, 75]), - # similar to the above but with odd number of elements - (range(1, 100, 2), [25, 51, 75]), - (range(99, 0, -1), range(10, 100, 10)) - ]) - def test_bucketize_boundaries(self, test_input, expected_boundaries): - # boundaries are outputted as artifacts for the Bucketize transform. 
- data = [{'x': [i]} for i in test_input] - num_buckets = len(expected_boundaries) + 1 - with beam.Pipeline() as p: - result = ( - p - | "Create" >> beam.Create(data) - | "MLTransform" >> base.MLTransform( - write_artifact_location=self.artifact_location).with_transform( - tft.Bucketize(columns=['x'], num_buckets=num_buckets))) - actual_boundaries = ( - result - | beam.Map(lambda x: x.as_dict()) - | beam.Map(lambda x: x['x_quantiles'])) - - def assert_boundaries(actual_boundaries): - assert np.array_equal(actual_boundaries, expected_boundaries) - - _ = (actual_boundaries | beam.Map(assert_boundaries)) - class ApplyBucketsTest(unittest.TestCase): def setUp(self) -> None: @@ -731,10 +690,6 @@ def test_bag_of_words_on_by_splitting_input_text(self): assert_that(result, equal_to(expected_data, equals_fn=np.array_equal)) def test_count_per_key_on_list(self): - def map_element_to_count(elements, counts): - d = {elements[i]: counts[i] for i in range(len(elements))} - return d - data = [{ 'x': ['I', 'like', 'pie', 'pie', 'pie'], }, { @@ -743,25 +698,28 @@ def map_element_to_count(elements, counts): 'x': ['Banana', 'Banana', 'Apple', 'Apple', 'Apple', 'Apple'] }] with beam.Pipeline() as p: - result = ( + _ = ( p | "Create" >> beam.Create(data) | "MLTransform" >> base.MLTransform( write_artifact_location=self.artifact_location, transforms=[ - tft.BagOfWords(columns=['x'], compute_word_count=True) + tft.BagOfWords( + columns=['x'], + compute_word_count=True, + key_vocab_filename='my_vocab') ])) - # the unique elements and counts are artifacts and will be - # stored in the result and same for all the elements in the - # PCollection. - result = result | beam.Map( - lambda x: map_element_to_count(x.x_unique_elements, x.x_counts)) + def validate_count_per_key(key_vocab_filename): + key_vocab_location = os.path.join( + self.artifact_location, 'transform_fn/assets', key_vocab_filename) + with open(key_vocab_location, 'r') as f: + key_vocab_list = [line.strip() for line in f] + return key_vocab_list - expected_data = [{ - b'Apple': 4, b'Banana': 2, b'I': 1, b'like': 1, b'pie': 4, b'yum': 2 - }] * 3 # since there are 3 elements in input. - assert_that(result, equal_to(expected_data)) + expected_data = ['2 yum', '4 Apple', '1 like', '1 I', '4 pie', '2 Banana'] + actual_data = validate_count_per_key('my_vocab') + self.assertEqual(expected_data, actual_data) if __name__ == '__main__': diff --git a/sdks/python/apache_beam/options/pipeline_options.py b/sdks/python/apache_beam/options/pipeline_options.py index 3fbf7eff7dd62..bbdafb5409765 100644 --- a/sdks/python/apache_beam/options/pipeline_options.py +++ b/sdks/python/apache_beam/options/pipeline_options.py @@ -515,6 +515,14 @@ def _add_argparse_args(cls, parser): 'at transform level. Interpretation of hints is defined by ' 'Beam runners.')) + parser.add_argument( + '--auto_unique_labels', + default=False, + action='store_true', + help='Whether to automatically generate unique transform labels ' + 'for every transform. The default behavior is to raise an ' + 'exception if a transform is created with a non-unique label.') + class CrossLanguageOptions(PipelineOptions): @classmethod @@ -1127,6 +1135,22 @@ def _add_argparse_args(cls, parser): dest='min_cpu_platform', type=str, help='GCE minimum CPU platform. Default is determined by GCP.') + parser.add_argument( + '--max_cache_memory_usage_mb', + dest='max_cache_memory_usage_mb', + type=int, + default=100, + help=( + 'Size of the SDK Harness cache to store user state and side ' + 'inputs in MB. Default is 100MB. 
If the cache is full, least ' + 'recently used elements will be evicted. This cache is per ' + 'each SDK Harness instance. SDK Harness is a component ' + 'responsible for executing the user code and communicating with ' + 'the runner. Depending on the runner, there may be more than one ' + 'SDK Harness process running on the same worker node. Increasing ' + 'cache size might improve performance of some pipelines, but can ' + 'lead to an increase in memory consumption and OOM errors if ' + 'workers are not appropriately provisioned.')) def validate(self, validator): errors = [] diff --git a/sdks/python/apache_beam/pipeline.py b/sdks/python/apache_beam/pipeline.py index 042b483d50f1b..ed0736250d1f2 100644 --- a/sdks/python/apache_beam/pipeline.py +++ b/sdks/python/apache_beam/pipeline.py @@ -54,6 +54,7 @@ import shutil import tempfile import unicodedata +import uuid from collections import defaultdict from typing import TYPE_CHECKING from typing import Any @@ -88,6 +89,7 @@ from apache_beam.transforms import ParDo from apache_beam.transforms import ptransform from apache_beam.transforms.display import DisplayData +from apache_beam.transforms.display import HasDisplayData from apache_beam.transforms.resources import merge_resource_hints from apache_beam.transforms.resources import resource_hints_from_options from apache_beam.transforms.sideinputs import get_sideinput_index @@ -108,7 +110,7 @@ __all__ = ['Pipeline', 'PTransformOverride'] -class Pipeline(object): +class Pipeline(HasDisplayData): """A pipeline object that manages a DAG of :class:`~apache_beam.pvalue.PValue` s and their :class:`~apache_beam.transforms.ptransform.PTransform` s. @@ -133,9 +135,12 @@ def runner_implemented_transforms(cls): common_urns.primitives.IMPULSE.urn, ]) - def __init__(self, runner=None, options=None, argv=None): - # type: (Optional[Union[str, PipelineRunner]], Optional[PipelineOptions], Optional[List[str]]) -> None - + def __init__( + self, + runner: Optional[Union[str, PipelineRunner]] = None, + options: Optional[PipelineOptions] = None, + argv: Optional[List[str]] = None, + display_data: Optional[Dict[str, Any]] = None): """Initialize a pipeline object. Args: @@ -151,6 +156,8 @@ def __init__(self, runner=None, options=None, argv=None): to be used for building a :class:`~apache_beam.options.pipeline_options.PipelineOptions` object. This will only be used if argument **options** is :data:`None`. + display_data (Dict[str, Any]): a dictionary of static data associated + with this pipeline that can be displayed when it runs. Raises: ValueError: if either the runner or options argument is not @@ -233,6 +240,11 @@ def __init__(self, runner=None, options=None, argv=None): # Records whether this pipeline contains any external transforms. self.contains_external_transforms = False + self._display_data = display_data or {} + + def display_data(self): + # type: () -> Dict[str, Any] + return self._display_data @property # type: ignore[misc] # decorated property not supported def options(self): @@ -670,13 +682,20 @@ def apply( alter_label_if_ipython(transform, pvalueish) full_label = '/'.join( - [self._current_transform().full_label, label or - transform.label]).lstrip('/') + [self._current_transform().full_label, transform.label]).lstrip('/') if full_label in self.applied_labels: - raise RuntimeError( - 'A transform with label "%s" already exists in the pipeline. 
'
-          'To apply a transform with a specified label write '
-          'pvalue | "label" >> transform' % full_label)
+      auto_unique_labels = self._options.view_as(
+          StandardOptions).auto_unique_labels
+      if auto_unique_labels:
+        # If auto_unique_labels is set, we will append a unique suffix to the
+        # label to make it unique.
+        unique_label = self._generate_unique_label(transform)
+        return self.apply(transform, pvalueish, unique_label)
+      else:
+        raise RuntimeError(
+            'A transform with label "%s" already exists in the pipeline. '
+            'To apply a transform with a specified label write '
+            'pvalue | "label" >> transform' % full_label)
     self.applied_labels.add(full_label)

     pvalueish, inputs = transform._extract_input_pvalues(pvalueish)
@@ -752,6 +771,19 @@ def apply(
       self.transforms_stack.pop()
     return pvalueish_result

+  def _generate_unique_label(
+      self,
+      transform  # type: ptransform.PTransform
+  ):
+    # type: (...) -> str
+
+    """
+    Given a transform, generate a unique label for it based on its
+    current label.
+    """
+    unique_suffix = uuid.uuid4().hex[:6]
+    return '%s_%s' % (transform.label, unique_suffix)
+
   def _infer_result_type(
       self,
       transform,  # type: ptransform.PTransform
@@ -914,7 +946,8 @@ def visit_transform(self, transform_node):
     proto = beam_runner_api_pb2.Pipeline(
         root_transform_ids=[root_transform_id],
         components=context.to_runner_api(),
-        requirements=context.requirements())
+        requirements=context.requirements(),
+        display_data=DisplayData('', self._display_data).to_proto())
     proto.components.transforms[root_transform_id].unique_name = (
         root_transform_id)
     self.merge_compatible_environments(proto)
@@ -970,7 +1003,11 @@ def from_runner_api(
     # type: (...) -> Pipeline

     """For internal use only; no backwards-compatibility guarantees."""
-    p = Pipeline(runner=runner, options=options)
+    p = Pipeline(
+        runner=runner,
+        options=options,
+        display_data={str(ix): d
+                      for ix, d in enumerate(proto.display_data)})
     from apache_beam.runners import pipeline_context
     context = pipeline_context.PipelineContext(
         proto.components, requirements=proto.requirements)
diff --git a/sdks/python/apache_beam/pipeline_test.py b/sdks/python/apache_beam/pipeline_test.py
index c9ac4ce4c13dd..113d1a99990c0 100644
--- a/sdks/python/apache_beam/pipeline_test.py
+++ b/sdks/python/apache_beam/pipeline_test.py
@@ -22,6 +22,7 @@
 import copy
 import platform
 import unittest
+import uuid

 import mock
 import pytest
@@ -266,6 +267,32 @@ def test_reuse_custom_transform_instance(self):
         'pipeline.
To apply a transform with a specified label write ' 'pvalue | "label" >> transform') + def test_auto_unique_labels(self): + + opts = PipelineOptions(["--auto_unique_labels"]) + with mock.patch.object(uuid, 'uuid4') as mock_uuid_gen: + mock_uuids = [mock.Mock(hex='UUID01XXX'), mock.Mock(hex='UUID02XXX')] + mock_uuid_gen.side_effect = mock_uuids + with TestPipeline(options=opts) as pipeline: + pcoll = pipeline | 'pcoll' >> Create([1, 2, 3]) + + def identity(x): + return x + + pcoll2 = pcoll | Map(identity) + pcoll3 = pcoll2 | Map(identity) + pcoll4 = pcoll3 | Map(identity) + assert_that(pcoll4, equal_to([1, 2, 3])) + + map_id_full_labels = { + label + for label in pipeline.applied_labels if "Map(identity)" in label + } + map_id_leaf_labels = {label.split(":")[-1] for label in map_id_full_labels} + # Only the first 6 chars of the UUID hex should be used + assert map_id_leaf_labels == set( + ["Map(identity)", "Map(identity)_UUID01", "Map(identity)_UUID02"]) + def test_reuse_cloned_custom_transform_instance(self): with TestPipeline() as pipeline: pcoll1 = pipeline | 'pc1' >> Create([1, 2, 3]) diff --git a/sdks/python/apache_beam/pvalue.py b/sdks/python/apache_beam/pvalue.py index 2e86c9eb51c77..90882651d0b24 100644 --- a/sdks/python/apache_beam/pvalue.py +++ b/sdks/python/apache_beam/pvalue.py @@ -673,6 +673,9 @@ def __init__(self, **kwargs): def as_dict(self): return dict(self.__dict__) + # For compatibility with named tuples. + _asdict = as_dict + def __iter__(self): for _, value in self.__dict__.items(): yield value diff --git a/sdks/python/apache_beam/runners/common.py b/sdks/python/apache_beam/runners/common.py index 99cd26cc40987..1cd0a30446634 100644 --- a/sdks/python/apache_beam/runners/common.py +++ b/sdks/python/apache_beam/runners/common.py @@ -765,6 +765,7 @@ def __init__(self, # Try to prepare all the arguments that can just be filled in # without any additional work. in the process function. # Also cache all the placeholders needed in the process function. + input_args = list(input_args) ( self.placeholders_for_process, self.args_for_process, @@ -1437,7 +1438,8 @@ def process(self, windowed_value): return [] def _maybe_sample_exception( - self, exn: BaseException, windowed_value: WindowedValue) -> None: + self, exn: BaseException, + windowed_value: Optional[WindowedValue]) -> None: if self.execution_context is None: return @@ -1927,6 +1929,12 @@ def validate_transform(transform_id): raise ValueError( "Incompatible input coder %s and output coder %s for transform %s" % (transform_id, input_coder, output_coder)) + elif transform_proto.spec.urn == common_urns.primitives.ASSIGN_WINDOWS.urn: + if not transform_proto.inputs: + raise ValueError("Missing input for transform: %s" % transform_proto) + elif transform_proto.spec.urn == common_urns.primitives.PAR_DO.urn: + if not transform_proto.inputs: + raise ValueError("Missing input for transform: %s" % transform_proto) for t in transform_proto.subtransforms: validate_transform(t) diff --git a/sdks/python/apache_beam/runners/dataflow/internal/names.py b/sdks/python/apache_beam/runners/dataflow/internal/names.py index 8347a597e9688..9a96baeb2a3c2 100644 --- a/sdks/python/apache_beam/runners/dataflow/internal/names.py +++ b/sdks/python/apache_beam/runners/dataflow/internal/names.py @@ -34,6 +34,6 @@ # Unreleased sdks use container image tag specified below. # Update this tag whenever there is a change that # requires changes to SDK harness container or SDK harness launcher. 
-BEAM_DEV_SDK_CONTAINER_TAG = 'beam-master-20230927' +BEAM_DEV_SDK_CONTAINER_TAG = 'beam-master-20231102' DATAFLOW_CONTAINER_IMAGE_REPOSITORY = 'gcr.io/cloud-dataflow/v1beta3' diff --git a/sdks/python/apache_beam/runners/interactive/extensions/apache-beam-jupyterlab-sidepanel/yarn.lock b/sdks/python/apache_beam/runners/interactive/extensions/apache-beam-jupyterlab-sidepanel/yarn.lock index fd09b6c9eb008..f72e23ebd403e 100644 --- a/sdks/python/apache_beam/runners/interactive/extensions/apache-beam-jupyterlab-sidepanel/yarn.lock +++ b/sdks/python/apache_beam/runners/interactive/extensions/apache-beam-jupyterlab-sidepanel/yarn.lock @@ -6767,10 +6767,10 @@ mv@2.1.1: ncp "~2.0.0" rimraf "~2.4.0" -nanoid@^3.1.28: - version "3.3.2" - resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.2.tgz#c89622fafb4381cd221421c69ec58547a1eec557" - integrity sha512-CuHBogktKwpm5g2sRgv83jEy2ijFzBwMoYA60orPDR7ynsLijJDqgsi4RDGj3OJpy3Ieb+LYwiRmIOGyytgITA== +nanoid@^3.3.6: + version "3.3.6" + resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.6.tgz#443380c856d6e9f9824267d960b4236ad583ea4c" + integrity sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA== nanomatch@^1.2.9: version "1.2.13" @@ -7279,6 +7279,11 @@ picocolors@^0.2.1: resolved "https://registry.yarnpkg.com/picocolors/-/picocolors-0.2.1.tgz#570670f793646851d1ba135996962abad587859f" integrity sha512-cMlDqaLEqfSaW8Z7N5Jw+lyIW869EzT73/F5lhtY9cLGoVxSXznfgfXMO0Z5K0o0Q2TkTXq+0KFsdnSe3jDViA== +picocolors@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/picocolors/-/picocolors-1.0.0.tgz#cb5bdc74ff3f51892236eaf79d68bc44564ab81c" + integrity sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ== + picomatch@^2.0.4, picomatch@^2.0.5: version "2.2.2" resolved "https://registry.yarnpkg.com/picomatch/-/picomatch-2.2.2.tgz#21f333e9b6b8eaff02468f5146ea406d345f4dad" @@ -7392,13 +7397,13 @@ postcss-value-parser@^4.1.0: integrity sha512-97DXOFbQJhk71ne5/Mt6cOu6yxsSfM0QGQyl0L25Gca4yGWEGJaig7l7gbCX623VqTBNGLRLaVUCnNkcedlRSQ== postcss@^8.0.2, postcss@^8.2.15: - version "8.3.9" - resolved "https://registry.yarnpkg.com/postcss/-/postcss-8.3.9.tgz#98754caa06c4ee9eb59cc48bd073bb6bd3437c31" - integrity sha512-f/ZFyAKh9Dnqytx5X62jgjhhzttjZS7hMsohcI7HEI5tjELX/HxCy3EFhsRxyzGvrzFF+82XPvCS8T9TFleVJw== + version "8.4.31" + resolved "https://registry.yarnpkg.com/postcss/-/postcss-8.4.31.tgz#92b451050a9f914da6755af352bdc0192508656d" + integrity sha512-PS08Iboia9mts/2ygV3eLpY5ghnUcfLV/EXTOW1E2qYxJKGGBUtNjN76FYHnMs36RmARn41bC0AZmn+rR0OVpQ== dependencies: - nanoid "^3.1.28" - picocolors "^0.2.1" - source-map-js "^0.6.2" + nanoid "^3.3.6" + picocolors "^1.0.0" + source-map-js "^1.0.2" prelude-ls@^1.2.1: version "1.2.1" @@ -8351,10 +8356,10 @@ source-list-map@^2.0.0: resolved "https://registry.yarnpkg.com/source-list-map/-/source-list-map-2.0.1.tgz#3993bd873bfc48479cca9ea3a547835c7c154b34" integrity sha512-qnQ7gVMxGNxsiL4lEuJwe/To8UnK7fAnmbGEEH8RpLouuKbeEm0lhbQVFIrNSuB+G7tVrAlVsZgETT5nljf+Iw== -source-map-js@^0.6.2: - version "0.6.2" - resolved "https://registry.yarnpkg.com/source-map-js/-/source-map-js-0.6.2.tgz#0bb5de631b41cfbda6cfba8bd05a80efdfd2385e" - integrity sha512-/3GptzWzu0+0MBQFrDKzw/DvvMTUORvgY6k6jd/VS6iCR4RDTKWH6v6WPwQoUO8667uQEf9Oe38DxAYWY5F/Ug== +source-map-js@^1.0.2: + version "1.0.2" + resolved "https://registry.yarnpkg.com/source-map-js/-/source-map-js-1.0.2.tgz#adbc361d9c62df380125e7f161f71c826f1e490c" + integrity 
sha512-R0XvVJ9WusLiqTCEiGCmICCMplcCkIwwR11mOSD9CR5u+IXYdiseeEuXCVAjS54zqwkLcPNnmU4OeJ6tUrWhDw== source-map-resolve@^0.5.0: version "0.5.3" diff --git a/sdks/python/apache_beam/runners/interactive/testing/integration/goldens/Linux/29c9237ddf4f3d5988a503069b4d3c47.png b/sdks/python/apache_beam/runners/interactive/testing/integration/goldens/Linux/29c9237ddf4f3d5988a503069b4d3c47.png index c748ea1a2d0a4..382063f75092d 100644 Binary files a/sdks/python/apache_beam/runners/interactive/testing/integration/goldens/Linux/29c9237ddf4f3d5988a503069b4d3c47.png and b/sdks/python/apache_beam/runners/interactive/testing/integration/goldens/Linux/29c9237ddf4f3d5988a503069b4d3c47.png differ diff --git a/sdks/python/apache_beam/runners/interactive/testing/integration/goldens/Linux/7a35f487b2a5f3a9b9852a8659eeb4bd.png b/sdks/python/apache_beam/runners/interactive/testing/integration/goldens/Linux/7a35f487b2a5f3a9b9852a8659eeb4bd.png index b21d3b606a37a..f3bf660dba0f5 100644 Binary files a/sdks/python/apache_beam/runners/interactive/testing/integration/goldens/Linux/7a35f487b2a5f3a9b9852a8659eeb4bd.png and b/sdks/python/apache_beam/runners/interactive/testing/integration/goldens/Linux/7a35f487b2a5f3a9b9852a8659eeb4bd.png differ diff --git a/sdks/python/apache_beam/runners/portability/expansion_service.py b/sdks/python/apache_beam/runners/portability/expansion_service.py index 9670ac1ad7be0..8be9d98508edb 100644 --- a/sdks/python/apache_beam/runners/portability/expansion_service.py +++ b/sdks/python/apache_beam/runners/portability/expansion_service.py @@ -54,7 +54,8 @@ def with_pipeline(component, pcoll_id=None): context = pipeline_context.PipelineContext( request.components, default_environment=self._default_environment, - namespace=request.namespace) + namespace=request.namespace, + requirements=request.requirements) producers = { pcoll_id: (context.transforms.get_by_id(t_id), pcoll_tag) for t_id, diff --git a/sdks/python/apache_beam/runners/portability/flink_uber_jar_job_server.py b/sdks/python/apache_beam/runners/portability/flink_uber_jar_job_server.py index 9a40a55c76017..3b302e334a5fa 100644 --- a/sdks/python/apache_beam/runners/portability/flink_uber_jar_job_server.py +++ b/sdks/python/apache_beam/runners/portability/flink_uber_jar_job_server.py @@ -80,8 +80,8 @@ def executable_jar(self): return job_server.JavaJarJobServer.local_jar(url) def flink_version(self): - full_version = requests.get('%s/v1/config' % - self._master_url).json()['flink-version'] + full_version = requests.get( + '%s/v1/config' % self._master_url, timeout=60).json()['flink-version'] # Only return up to minor version. 
return '.'.join(full_version.split('.')[:2]) diff --git a/sdks/python/apache_beam/runners/portability/flink_uber_jar_job_server_test.py b/sdks/python/apache_beam/runners/portability/flink_uber_jar_job_server_test.py index 1294f4653b2a7..12ba3940d3965 100644 --- a/sdks/python/apache_beam/runners/portability/flink_uber_jar_job_server_test.py +++ b/sdks/python/apache_beam/runners/portability/flink_uber_jar_job_server_test.py @@ -37,9 +37,11 @@ def temp_name(*args, **kwargs): with tempfile.NamedTemporaryFile(*args, **kwargs) as t: name = t.name - yield name - if os.path.exists(name): - os.unlink(name) + try: + yield name + finally: + if os.path.exists(name): + os.unlink(name) class FlinkUberJarJobServerTest(unittest.TestCase): diff --git a/sdks/python/apache_beam/runners/portability/portable_runner.py b/sdks/python/apache_beam/runners/portability/portable_runner.py index 9ff03ec1d0614..ab5ee9fff6f9f 100644 --- a/sdks/python/apache_beam/runners/portability/portable_runner.py +++ b/sdks/python/apache_beam/runners/portability/portable_runner.py @@ -415,7 +415,8 @@ def start_and_replace_loopback_environments(pipeline, options): portable_options.environment_config, server = ( worker_pool_main.BeamFnExternalWorkerPoolServicer.start( state_cache_size= - sdk_worker_main._get_state_cache_size(experiments), + sdk_worker_main._get_state_cache_size_bytes( + options=options), data_buffer_time_limit_ms= sdk_worker_main._get_data_buffer_time_limit_ms(experiments), use_process=use_loopback_process_worker)) diff --git a/sdks/python/apache_beam/runners/portability/spark_uber_jar_job_server_test.py b/sdks/python/apache_beam/runners/portability/spark_uber_jar_job_server_test.py index 6bb27b5746daf..a99bec840bee7 100644 --- a/sdks/python/apache_beam/runners/portability/spark_uber_jar_job_server_test.py +++ b/sdks/python/apache_beam/runners/portability/spark_uber_jar_job_server_test.py @@ -40,9 +40,11 @@ def temp_name(*args, **kwargs): with tempfile.NamedTemporaryFile(*args, **kwargs) as t: name = t.name - yield name - if os.path.exists(name): - os.unlink(name) + try: + yield name + finally: + if os.path.exists(name): + os.unlink(name) def spark_job(): diff --git a/sdks/python/apache_beam/runners/portability/stager.py b/sdks/python/apache_beam/runners/portability/stager.py index ace573de0a621..d59b3e32bc17b 100644 --- a/sdks/python/apache_beam/runners/portability/stager.py +++ b/sdks/python/apache_beam/runners/portability/stager.py @@ -771,15 +771,30 @@ def _build_setup_package(setup_file, # type: str try: os.chdir(os.path.dirname(setup_file)) if build_setup_args is None: - build_setup_args = [ - Stager._get_python_executable(), - os.path.basename(setup_file), - 'sdist', - '--dist-dir', - temp_dir - ] - _LOGGER.info('Executing command: %s', build_setup_args) - processes.check_output(build_setup_args) + # if build is installed in the user env, use it to + # build the sdist else fallback to legacy setup.py sdist call. 
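+        # NOTE ('python -m build' path): the subprocess exits non-zero when
+        # the third-party 'build' package is unavailable, which is expected
+        # to surface as the RuntimeError caught below, triggering the legacy
+        # 'setup.py sdist' fallback.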
+ try: + build_setup_args = [ + Stager._get_python_executable(), + '-m', + 'build', + '--sdist', + '--outdir', + temp_dir, + os.path.dirname(setup_file), + ] + _LOGGER.info('Executing command: %s', build_setup_args) + processes.check_output(build_setup_args) + except RuntimeError: + build_setup_args = [ + Stager._get_python_executable(), + os.path.basename(setup_file), + 'sdist', + '--dist-dir', + temp_dir + ] + _LOGGER.info('Executing command: %s', build_setup_args) + processes.check_output(build_setup_args) output_files = glob.glob(os.path.join(temp_dir, '*.tar.gz')) if not output_files: raise RuntimeError( diff --git a/sdks/python/apache_beam/runners/worker/bundle_processor.py b/sdks/python/apache_beam/runners/worker/bundle_processor.py index 935ba83709c0d..7ff0ad258bc2d 100644 --- a/sdks/python/apache_beam/runners/worker/bundle_processor.py +++ b/sdks/python/apache_beam/runners/worker/bundle_processor.py @@ -227,8 +227,13 @@ def process_encoded(self, encoded_windowed_values): if self.index == self.stop - 1: return self.index += 1 - decoded_value = self.windowed_coder_impl.decode_from_stream( - input_stream, True) + try: + decoded_value = self.windowed_coder_impl.decode_from_stream( + input_stream, True) + except Exception as exn: + raise ValueError( + "Error decoding input stream with coder " + + str(self.windowed_coder)) from exn self.output(decoded_value) def monitoring_infos(self, transform_id, tag_to_pcollection_id): diff --git a/sdks/python/apache_beam/runners/worker/logger.py b/sdks/python/apache_beam/runners/worker/logger.py index e01e3863349bf..e1c84bc6ded27 100644 --- a/sdks/python/apache_beam/runners/worker/logger.py +++ b/sdks/python/apache_beam/runners/worker/logger.py @@ -64,8 +64,10 @@ def PerThreadLoggingContext(**kwargs): """A context manager to add per thread attributes.""" stack = per_thread_worker_data.stack stack.append(kwargs) - yield - stack.pop() + try: + yield + finally: + stack.pop() class JsonLogFormatter(logging.Formatter): diff --git a/sdks/python/apache_beam/runners/worker/sdk_worker_main.py b/sdks/python/apache_beam/runners/worker/sdk_worker_main.py index d3442fcb5987f..1af0071edc14c 100644 --- a/sdks/python/apache_beam/runners/worker/sdk_worker_main.py +++ b/sdks/python/apache_beam/runners/worker/sdk_worker_main.py @@ -36,6 +36,7 @@ from apache_beam.options.pipeline_options import PipelineOptions from apache_beam.options.pipeline_options import ProfilingOptions from apache_beam.options.pipeline_options import SetupOptions +from apache_beam.options.pipeline_options import WorkerOptions from apache_beam.options.value_provider import RuntimeValueProvider from apache_beam.portability.api import endpoints_pb2 from apache_beam.runners.internal import names @@ -159,7 +160,8 @@ def create_harness(environment, dry_run=False): control_address=control_service_descriptor.url, status_address=status_service_descriptor.url, worker_id=_worker_id, - state_cache_size=_get_state_cache_size(experiments), + state_cache_size=_get_state_cache_size_bytes( + options=sdk_pipeline_options), data_buffer_time_limit_ms=_get_data_buffer_time_limit_ms(experiments), profiler_factory=profiler.Profile.factory_from_options( sdk_pipeline_options.view_as(ProfilingOptions)), @@ -239,24 +241,28 @@ def _parse_pipeline_options(options_json): return PipelineOptions.from_dictionary(_load_pipeline_options(options_json)) -def _get_state_cache_size(experiments): - """Defines the upper number of state items to cache. 
-
-  Note: state_cache_size is an experimental flag and might not be available in
-  future releases.
+def _get_state_cache_size_bytes(options):
+  """Return the maximum size of the state cache in bytes.

   Returns:
-    an int indicating the maximum number of megabytes to cache.
-      Default is 0 MB
+    an int indicating the maximum number of bytes to cache.
   """
-
+  max_cache_memory_usage_mb = options.view_as(
+      WorkerOptions).max_cache_memory_usage_mb
+  # to maintain backward compatibility
+  experiments = options.view_as(DebugOptions).experiments or []
   for experiment in experiments:
     # There should only be 1 match so returning from the loop
     if re.match(r'state_cache_size=', experiment):
+      _LOGGER.warning(
+          '--experiments=state_cache_size=X is deprecated and will be removed '
+          'in future releases. '
+          'Please use --max_cache_memory_usage_mb=X to set the cache size for '
+          'user state API and side inputs.')
       return int(
           re.match(r'state_cache_size=(?P<state_cache_size>.*)',
                    experiment).group('state_cache_size')) << 20
-  return 0
+  return max_cache_memory_usage_mb << 20


 def _get_data_buffer_time_limit_ms(experiments):
diff --git a/sdks/python/apache_beam/runners/worker/sdk_worker_main_test.py b/sdks/python/apache_beam/runners/worker/sdk_worker_main_test.py
index 00e09840787f1..498a07b70e9e6 100644
--- a/sdks/python/apache_beam/runners/worker/sdk_worker_main_test.py
+++ b/sdks/python/apache_beam/runners/worker/sdk_worker_main_test.py
@@ -234,6 +234,19 @@ def test_gcp_profiler_uses_job_name_when_enabled_as_experiment(self):
     sdk_worker_main._start_profiler(gcp_profiler_name, "version")
     sdk_worker_main._start_profiler.assert_called_with("sample_job", "version")

+  @unittest.mock.patch.dict(os.environ, {"JOB_NAME": "sample_job"}, clear=True)
+  def test_pipeline_option_max_cache_memory_usage_mb(self):
+    options = PipelineOptions(flags=['--max_cache_memory_usage_mb=50'])
+
+    cache_size = sdk_worker_main._get_state_cache_size_bytes(options)
+    self.assertEqual(cache_size, 50 << 20)
+
+  @unittest.mock.patch.dict(os.environ, {"JOB_NAME": "sample_job"}, clear=True)
+  def test_pipeline_option_max_cache_memory_usage_mb_with_experiments(self):
+    options = PipelineOptions(flags=['--experiments=state_cache_size=50'])
+    cache_size = sdk_worker_main._get_state_cache_size_bytes(options)
+    self.assertEqual(cache_size, 50 << 20)
+

 if __name__ == '__main__':
   logging.getLogger().setLevel(logging.INFO)
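Aside (illustrative, not part of the patch): the new worker option is a plain pipeline option, so a pipeline can opt into a larger state cache as below; the 256 MB value is arbitrary and the DirectRunner is assumed.

```python
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# --max_cache_memory_usage_mb is the option added above; the SDK harness
# converts it to bytes via _get_state_cache_size_bytes.
opts = PipelineOptions(['--max_cache_memory_usage_mb=256'])
with beam.Pipeline(options=opts) as p:
  _ = p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x * 2)
```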
diff --git a/sdks/python/apache_beam/testing/analyzers/README.md b/sdks/python/apache_beam/testing/analyzers/README.md
index 076f173f9d71b..cc8629f9a57a7 100644
--- a/sdks/python/apache_beam/testing/analyzers/README.md
+++ b/sdks/python/apache_beam/testing/analyzers/README.md
@@ -35,16 +35,13 @@ update already created GitHub issue or ignore performance alert by not creating

 ## Config file structure

-The config file defines the structure to run change point analysis on a given test. To add a test to the config file,
+The YAML config file defines the structure for running change point analysis on a given test. To add a test config to the YAML file,
 please follow the below structure.

-**NOTE**: The Change point analysis only supports reading the metric data from Big Query for now.
+**NOTE**: Change point analysis currently supports reading metric data from `BigQuery` only.

 ```
-# the test_1 must be a unique id.
-test_1:
-  test_description: Pytorch image classification on 50k images of size 224 x 224 with resnet 152
-  test_target: apache_beam.testing.benchmarks.inference.pytorch_image_classification_benchmarks
+test_1: # a unique id for each test config.
   metrics_dataset: beam_run_inference
   metrics_table: torch_inference_imagenet_results_resnet152
   project: apache-beam-testing
@@ -55,11 +52,15 @@ test_1:
   num_runs_in_change_point_window: 30 # optional parameter
 ```

-**NOTE**: `test_target` is optional. It is used for identifying the test that was causing the regression.
+#### Optional Parameters:

-**Note**: By default, the tool fetches metrics from BigQuery tables. `metrics_dataset`, `metrics_table`, `project` and `metric_name` should match with the values defined for performance/load tests.
-The above example uses this [test configuration](https://github.com/apache/beam/blob/0a91d139dea4276dc46176c4cdcdfce210fc50c4/.test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy#L30)
-to fill up the values required to fetch the data from source.
+These are the optional parameters that can be added to the test config in addition to the parameters mentioned above.
+
+- `test_target`: Identifies the test responsible for the regression.
+
+- `test_name`: Denotes the name of the test as stored in the BigQuery table.
+
+**Note**: The tool, by default, pulls metrics from BigQuery tables. Ensure that the values for `metrics_dataset`, `metrics_table`, `project`, and `metric_name` align with those defined for performance/load tests. The provided example utilizes this [test configuration](https://github.com/apache/beam/blob/0a91d139dea4276dc46176c4cdcdfce210fc50c4/.test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy#L30) to populate the necessary values for data retrieval.

 ### Different ways to avoid false positive change points

@@ -76,8 +77,35 @@ setting `num_runs_in_change_point_window=7` will achieve it.

 ## Register a test for performance alerts

-If a new test needs to be registered for the performance alerting tool, please add the required test parameters to the
-config file.
+If a new test needs to be registered for the performance alerting tool,
+
+- you can add it to the config file that is already present, or
+- you can define your own YAML file and call the [perf_analysis.run()](https://github.com/apache/beam/blob/a46bc12a256dcaa3ae2cc9e5d6fdcaa82b59738b/sdks/python/apache_beam/testing/analyzers/perf_analysis.py#L152) method.
+
+
+## Integrating the Perf Alert Tool with a Custom BigQuery Schema
+
+By default, the Perf Alert Tool retrieves metrics from the `apache-beam-testing` BigQuery projects. All performance and load tests within Beam utilize a standard [schema](https://github.com/apache/beam/blob/a7e12db9b5977c4a7b13554605c0300389a3d6da/sdks/python/apache_beam/testing/load_tests/load_test_metrics_utils.py#L70) for metrics publication. The tool inherently recognizes and operates with this schema when extracting metrics from BigQuery tables.
+
+To fetch data from a BigQuery dataset that does not follow Apache Beam's default schema, one can inherit the `MetricsFetcher` class and implement the abstract method `fetch_metric_data`. This method should return a `MetricContainer` holding the desired metric values and the timestamps at which they were published.
+
+```
+from apache_beam.testing.analyzers import perf_analysis
+config_file_path = <path_to_config_file>
+my_metrics_fetcher = MyMetricsFetcher() # inherited from MetricsFetcher
+perf_analysis.run(config_file_path, my_metrics_fetcher)
+```
+
+**Note**: The metrics and timestamps should be sorted by timestamp, in ascending order.
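+
+A minimal sketch of such a custom fetcher (the class name matches the snippet above; the body is an illustrative placeholder for your own schema):
+
+```
+from apache_beam.testing.analyzers.perf_analysis_utils import MetricContainer
+from apache_beam.testing.analyzers.perf_analysis_utils import MetricsFetcher
+from apache_beam.testing.analyzers.perf_analysis_utils import TestConfigContainer
+
+class MyMetricsFetcher(MetricsFetcher):
+  def fetch_metric_data(
+      self, *, test_config: TestConfigContainer) -> MetricContainer:
+    # Query your own metrics store here; values and timestamps must be
+    # sorted by timestamp in ascending order.
+    values = []  # your metric values
+    timestamps = []  # matching publication timestamps
+    return MetricContainer(values=values, timestamps=timestamps)
+```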
+
+### Configuring GitHub Parameters
+
+Out of the box, the performance alert tool targets the `apache/beam` repository when raising issues. If you wish to use this tool for another repository, you'll need to pre-set a couple of environment variables:
+
+- `REPO_OWNER`: Represents the owner of the repository (e.g., `apache`).
+- `REPO_NAME`: Specifies the repository name itself (e.g., `beam`).
+
+Before initiating the tool, also ensure that `GITHUB_TOKEN` is set to an authenticated GitHub token. This permits the tool to create GitHub issues whenever performance alerts arise.

 ## Triage performance alert issues
diff --git a/sdks/python/apache_beam/testing/analyzers/__init__.py b/sdks/python/apache_beam/testing/analyzers/__init__.py
index cce3acad34a49..136d9f5f5d8a2 100644
--- a/sdks/python/apache_beam/testing/analyzers/__init__.py
+++ b/sdks/python/apache_beam/testing/analyzers/__init__.py
@@ -14,3 +14,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+
+"""
+Performance alert tooling for Apache Beam. No backwards compatibility
+guarantees.
+"""
diff --git a/sdks/python/apache_beam/testing/analyzers/constants.py b/sdks/python/apache_beam/testing/analyzers/constants.py
index 8f8bdf13300c1..09ab5c5959082 100644
--- a/sdks/python/apache_beam/testing/analyzers/constants.py
+++ b/sdks/python/apache_beam/testing/analyzers/constants.py
@@ -72,3 +72,4 @@
 }]

 _ANOMALY_MARKER = ' <---- Anomaly'
+_EDGE_SEGMENT_SIZE = 3
diff --git a/sdks/python/apache_beam/testing/analyzers/github_issues_utils.py b/sdks/python/apache_beam/testing/analyzers/github_issues_utils.py
index e1f20baa50a68..cbbb9e5d3a2e0 100644
--- a/sdks/python/apache_beam/testing/analyzers/github_issues_utils.py
+++ b/sdks/python/apache_beam/testing/analyzers/github_issues_utils.py
@@ -21,10 +21,11 @@
 from typing import Optional
 from typing import Tuple

-import pandas as pd
 import requests

 from apache_beam.testing.analyzers import constants
+from apache_beam.testing.analyzers.perf_analysis_utils import MetricContainer
+from apache_beam.testing.analyzers.perf_analysis_utils import TestConfigContainer

 try:
   _GITHUB_TOKEN: Optional[str] = os.environ['GITHUB_TOKEN']
@@ -34,8 +35,8 @@
       'A Github Personal Access token is required '
       'to create Github Issues.')

-_BEAM_GITHUB_REPO_OWNER = 'apache'
-_BEAM_GITHUB_REPO_NAME = 'beam'
+_GITHUB_REPO_OWNER = os.environ.get('REPO_OWNER', 'apache')
+_GITHUB_REPO_NAME = os.environ.get('REPO_NAME', 'beam')

 # Adding GitHub Rest API version to the header to maintain version stability.
 # For more information, please look at
 # https://github.blog/2022-11-28-to-infinity-and-beyond-enabling-the-future-of-githubs-rest-api-with-api-versioning/ # pylint: disable=line-too-long
@@ -60,6 +61,8 @@
 _AWAITING_TRIAGE_LABEL = 'awaiting triage'
 _PERF_ALERT_LABEL = 'perf-alert'

+_REQUEST_TIMEOUT_SECS = 60
+

 def create_issue(
     title: str,
@@ -77,10 +80,10 @@ def create_issue(
     Tuple containing GitHub issue number and issue URL.
""" url = "https://api.github.com/repos/{}/{}/issues".format( - _BEAM_GITHUB_REPO_OWNER, _BEAM_GITHUB_REPO_NAME) + _GITHUB_REPO_OWNER, _GITHUB_REPO_NAME) data = { - 'owner': _BEAM_GITHUB_REPO_OWNER, - 'repo': _BEAM_GITHUB_REPO_NAME, + 'owner': _GITHUB_REPO_OWNER, + 'repo': _GITHUB_REPO_NAME, 'title': title, 'body': description, 'labels': [_AWAITING_TRIAGE_LABEL, _PERF_ALERT_LABEL] @@ -88,7 +91,10 @@ def create_issue( if labels: data['labels'].extend(labels) # type: ignore response = requests.post( - url=url, data=json.dumps(data), headers=_HEADERS).json() + url=url, + data=json.dumps(data), + headers=_HEADERS, + timeout=_REQUEST_TIMEOUT_SECS).json() return response['number'], response['html_url'] @@ -108,56 +114,57 @@ def comment_on_issue(issue_number: int, issue, and the comment URL. """ url = 'https://api.github.com/repos/{}/{}/issues/{}'.format( - _BEAM_GITHUB_REPO_OWNER, _BEAM_GITHUB_REPO_NAME, issue_number) + _GITHUB_REPO_OWNER, _GITHUB_REPO_NAME, issue_number) open_issue_response = requests.get( url, json.dumps({ - 'owner': _BEAM_GITHUB_REPO_OWNER, - 'repo': _BEAM_GITHUB_REPO_NAME, + 'owner': _GITHUB_REPO_OWNER, + 'repo': _GITHUB_REPO_NAME, 'issue_number': issue_number }, default=str), - headers=_HEADERS).json() + headers=_HEADERS, + timeout=_REQUEST_TIMEOUT_SECS).json() if open_issue_response['state'] == 'open': data = { - 'owner': _BEAM_GITHUB_REPO_OWNER, - 'repo': _BEAM_GITHUB_REPO_NAME, + 'owner': _GITHUB_REPO_OWNER, + 'repo': _GITHUB_REPO_NAME, 'body': comment_description, issue_number: issue_number, } response = requests.post( - open_issue_response['comments_url'], json.dumps(data), headers=_HEADERS) + open_issue_response['comments_url'], + json.dumps(data), + headers=_HEADERS, + timeout=_REQUEST_TIMEOUT_SECS) return True, response.json()['html_url'] return False, '' def add_awaiting_triage_label(issue_number: int): url = 'https://api.github.com/repos/{}/{}/issues/{}/labels'.format( - _BEAM_GITHUB_REPO_OWNER, _BEAM_GITHUB_REPO_NAME, issue_number) + _GITHUB_REPO_OWNER, _GITHUB_REPO_NAME, issue_number) requests.post( - url, json.dumps({'labels': [_AWAITING_TRIAGE_LABEL]}), headers=_HEADERS) + url, + json.dumps({'labels': [_AWAITING_TRIAGE_LABEL]}), + headers=_HEADERS, + timeout=_REQUEST_TIMEOUT_SECS) def get_issue_description( - test_name: str, - metric_name: str, - timestamps: List[pd.Timestamp], - metric_values: List, + test_config_container: TestConfigContainer, + metric_container: MetricContainer, change_point_index: int, max_results_to_display: int = 5, - test_description: Optional[str] = None, ) -> str: """ Args: - metric_name: Metric name used for the Change Point Analysis. - timestamps: Timestamps of the metrics when they were published to the - Database. Timestamps are expected in ascending order. - metric_values: metric values for the previous runs. - change_point_index: Index for the change point. The element in the - index of the metric_values would be the change point. - max_results_to_display: Max number of results to display from the change - point index, in both directions of the change point index. + test_config_container: TestConfigContainer containing test metadata. + metric_container: MetricContainer containing metric data. + change_point_index: Index of the change point in the metric data. + max_results_to_display: Max number of results to display from the change + point index, in both directions of the change point index. Returns: str: Description used to fill the GitHub issues description. 
@@ -167,22 +174,30 @@ def get_issue_description( description = [] - description.append(_ISSUE_DESCRIPTION_TEMPLATE.format(test_name, metric_name)) + description.append( + _ISSUE_DESCRIPTION_TEMPLATE.format( + test_config_container.test_id, test_config_container.metric_name)) + + if test_config_container.test_name: + description.append(("`test_name:` " + f'{test_config_container.test_name}')) - description.append(("`Test description:` " + - f'{test_description}') if test_description else '') + if test_config_container.test_description: + description.append( + ("`Test description:` " + f'{test_config_container.test_description}')) description.append('```') runs_to_display = [] max_timestamp_index = min( - change_point_index + max_results_to_display, len(metric_values) - 1) + change_point_index + max_results_to_display, + len(metric_container.values) - 1) min_timestamp_index = max(0, change_point_index - max_results_to_display) # run in reverse to display the most recent runs first. for i in reversed(range(min_timestamp_index, max_timestamp_index + 1)): row_template = _METRIC_INFO_TEMPLATE.format( - timestamps[i].ctime(), format(metric_values[i], '.2f')) + metric_container.timestamps[i].ctime(), + format(metric_container.values[i], '.2f')) if i == change_point_index: row_template += constants._ANOMALY_MARKER runs_to_display.append(row_template) diff --git a/sdks/python/apache_beam/testing/analyzers/io_tests_config.yaml b/sdks/python/apache_beam/testing/analyzers/io_tests_config.yaml new file mode 100644 index 0000000000000..2a33ae31797d9 --- /dev/null +++ b/sdks/python/apache_beam/testing/analyzers/io_tests_config.yaml @@ -0,0 +1,256 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +spanner_io_read: + test_description: | + SpannerIO Read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testSpannerWriteAndRead,read-spanner + metric_name: + - RunTime + - EstimatedCost + +spanner_io_read_runnerV2: + test_description: | + SpannerIO RunnerV2 Read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testSpannerWriteAndRead,read_spanner_v2 + metric_name: + - RunTime + - EstimatedCost + +spanner_io_write: + test_description: | + SpannerIO write test 100 GB. 
+ project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testSpannerWriteAndRead,write-spanner + metric_name: + - RunTime + - EstimatedCost + +spanner_io_write_runnerV2: + test_description: | + SpannerIO RunnerV2 write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testSpannerWriteAndRead,write_spanner_v2 + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_storage_api_read: + test_description: | + BigQueryIO Storage write API read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testStorageAPIWriteThenRead,read-bigquery + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_storage_api_read_runnerV2: + test_description: | + BigQueryIO RunnerV2 Storage write API read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testStorageAPIWriteThenRead,read_bigquery_v2 + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_storage_api_write: + test_description: | + BigQueryIO Storage write API write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testStorageAPIWriteThenRead,write-bigquery + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_storage_api_write_runnerV2: + test_description: | + BigQueryIO Storage write API write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testStorageAPIWriteThenRead,write_bigquery_v2 + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_avro_file_loads_read: + test_description: | + BigQueryIO Avro file loads read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testAvroFileLoadsWriteThenRead,read-bigquery + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_avro_file_loads_read_runnerV2: + test_description: | + BigQueryIO RunnerV2 Avro file loads read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testAvroFileLoadsWriteThenRead,read_bigquery_v2 + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_avro_file_loads_write: + test_description: | + BigQueryIO Avro file loads write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testAvroFileLoadsWriteThenRead,write-bigquery + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_avro_file_loads_write_runnerV2: + test_description: | + BigQueryIO RunnerV2 Avro file loads write test 100 GB. 
+ project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testAvroFileLoadsWriteThenRead,write_bigquery_v2 + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_json_file_loads_read: + test_description: | + BigQueryIO Json file loads read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testJsonFileLoadsWriteThenRead,read-bigquery + metric_name: + - RunTime + - EstimatedCost + +bigquery_io_json_file_loads_write: + test_description: | + BigQueryIO Json file loads write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testJsonFileLoadsWriteThenRead,write-bigquery + metric_name: + - RunTime + - EstimatedCost + +bigtable_io_read: + test_description: | + BigTableIO read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testBigtableWriteAndRead,read-bigtable + metric_name: + - RunTime + - EstimatedCost + +bigtable_io_write: + test_description: | + BigTableIO write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testBigtableWriteAndRead,write-bigtable + metric_name: + - RunTime + - EstimatedCost + +text_io_read: + test_description: | + TextIO read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testTextIOWriteThenRead,read-textio + metric_name: + - RunTime + - EstimatedCost + +text_io_read_runnerV2: + test_description: | + TextIO RunnerV2 read test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testTextIOWriteThenRead,read_textio_v2 + metric_name: + - RunTime + - EstimatedCost + +text_io_write: + test_description: | + TextIO write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testTextIOWriteThenRead,write-textio + metric_name: + - RunTime + - EstimatedCost + +text_io_write_runnerV2: + test_description: | + TextIO RunnerV2 write test 100 GB. + project: apache-beam-testing + metrics_dataset: performance_tests + metrics_table: io_performance_metrics + # test_name is in the format testName,pipelineName + test_name: testTextIOWriteThenRead,write_textio_v2 + metric_name: + - RunTime + - EstimatedCost diff --git a/sdks/python/apache_beam/testing/analyzers/load_test_perf_analysis.py b/sdks/python/apache_beam/testing/analyzers/load_test_perf_analysis.py new file mode 100644 index 0000000000000..ee9d04e6260ff --- /dev/null +++ b/sdks/python/apache_beam/testing/analyzers/load_test_perf_analysis.py @@ -0,0 +1,98 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import argparse
+import logging
+
+from apache_beam.testing.analyzers import constants
+from apache_beam.testing.analyzers import perf_analysis
+from apache_beam.testing.analyzers import perf_analysis_utils
+from apache_beam.testing.analyzers.perf_analysis_utils import MetricContainer
+from apache_beam.testing.analyzers.perf_analysis_utils import TestConfigContainer
+
+try:
+  from google.cloud import bigquery
+except ImportError:
+  bigquery = None  # type: ignore
+
+
+class LoadTestMetricsFetcher(perf_analysis_utils.MetricsFetcher):
+  """
+  Metrics fetcher used to get metric data from a BigQuery table. The metrics
+  are fetched and returned as a dataclass containing lists of timestamps and
+  metric_values.
+  """
+  def fetch_metric_data(
+      self, *, test_config: TestConfigContainer) -> MetricContainer:
+    if test_config.test_name:
+      test_name, pipeline_name = test_config.test_name.split(',')
+    else:
+      raise Exception("test_name not provided in config.")
+
+    query = f"""
+      SELECT timestamp, metric.value
+      FROM {test_config.project}.{test_config.metrics_dataset}.{test_config.metrics_table}
+      CROSS JOIN UNNEST(metrics) AS metric
+      WHERE test_name = "{test_name}" AND pipeline_name = "{pipeline_name}" AND metric.name = "{test_config.metric_name}"
+      ORDER BY timestamp DESC
+      LIMIT {constants._NUM_DATA_POINTS_TO_RUN_CHANGE_POINT_ANALYSIS}
+    """
+    logging.debug("Running query: %s" % query)
+    if bigquery is None:
+      raise ImportError('Bigquery dependencies are not installed.')
+    client = bigquery.Client()
+    query_job = client.query(query=query)
+    metric_data = query_job.result().to_dataframe()
+    if metric_data.empty:
+      logging.error(
+          "No results returned from BigQuery. Please check the query.")
+    return MetricContainer(
+        values=metric_data['value'].tolist(),
+        timestamps=metric_data['timestamp'].tolist(),
+    )
+
+
+if __name__ == '__main__':
+  logging.basicConfig(level=logging.INFO)
+  load_test_metrics_fetcher = LoadTestMetricsFetcher()
+
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--config_file_path',
+      required=True,
+      type=str,
+      help='Path to the config file that contains data to run the Change '
+      'Point Analysis. If you would like to use Change Point Analysis to '
+      'find performance regressions in your tests, please provide a .yml '
+      'file with the same structure as '
+      'apache_beam/testing/analyzers/tests_config.yaml.')
+  parser.add_argument(
+      '--save_alert_metadata',
+      action='store_true',
+      default=False,
+      help='Save perf alert / GH issue metadata to a BigQuery table.')
+  known_args, unknown_args = parser.parse_known_args()
+
+  if unknown_args:
+    logging.warning('Discarding unknown arguments : %s ' % unknown_args)
+
+  perf_analysis.run(
+      big_query_metrics_fetcher=load_test_metrics_fetcher,
+      config_file_path=known_args.config_file_path,
+      # Set this to true while running in production.
+      save_alert_metadata=known_args.save_alert_metadata)
diff --git a/sdks/python/apache_beam/testing/analyzers/perf_analysis.py b/sdks/python/apache_beam/testing/analyzers/perf_analysis.py
index 7f1ffbb944e9d..5802fe0414543 100644
--- a/sdks/python/apache_beam/testing/analyzers/perf_analysis.py
+++ b/sdks/python/apache_beam/testing/analyzers/perf_analysis.py
@@ -22,67 +22,117 @@

 import argparse
 import logging
-import os
 import uuid
 from datetime import datetime
 from datetime import timezone
 from typing import Any
 from typing import Dict
-from typing import Optional

 import pandas as pd

 from apache_beam.testing.analyzers import constants
+from apache_beam.testing.analyzers.perf_analysis_utils import BigQueryMetricsFetcher
+from apache_beam.testing.analyzers.perf_analysis_utils import ChangePointConfig
 from apache_beam.testing.analyzers.perf_analysis_utils import GitHubIssueMetaData
+from apache_beam.testing.analyzers.perf_analysis_utils import MetricsFetcher
+from apache_beam.testing.analyzers.perf_analysis_utils import TestConfigContainer
 from apache_beam.testing.analyzers.perf_analysis_utils import create_performance_alert
-from apache_beam.testing.analyzers.perf_analysis_utils import fetch_metric_data
 from apache_beam.testing.analyzers.perf_analysis_utils import find_latest_change_point_index
 from apache_beam.testing.analyzers.perf_analysis_utils import get_existing_issues_data
 from apache_beam.testing.analyzers.perf_analysis_utils import is_change_point_in_valid_window
-from apache_beam.testing.analyzers.perf_analysis_utils import is_perf_alert
+from apache_beam.testing.analyzers.perf_analysis_utils import is_sibling_change_point
 from apache_beam.testing.analyzers.perf_analysis_utils import publish_issue_metadata_to_big_query
 from apache_beam.testing.analyzers.perf_analysis_utils import read_test_config
-from apache_beam.testing.analyzers.perf_analysis_utils import validate_config
-from apache_beam.testing.load_tests.load_test_metrics_utils import BigQueryMetricsFetcher


-def run_change_point_analysis(params, test_name, big_query_metrics_fetcher):
+def get_test_config_container(
+    params: Dict[str, Any],
+    test_id: str,
+    metric_name: str,
+) -> TestConfigContainer:
   """
   Args:
-    params: Dict containing parameters to run change point analysis.
-    test_id: Test id for the current test.
+    params: Dict containing parameters to run change point analysis.
+    test_id: Unique id for the current test config.
+    metric_name: Name of the metric to run change point analysis on.
+  Returns:
+    TestConfigContainer object containing test config parameters.
+  """
+  return TestConfigContainer(
+      project=params['project'],
+      metrics_dataset=params['metrics_dataset'],
+      metrics_table=params['metrics_table'],
+      metric_name=metric_name,
+      test_id=test_id,
+      test_description=params['test_description'],
+      test_name=params.get('test_name', None),
+      labels=params.get('labels', None),
+  )
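An illustrative aside (not part of the patch): with `metric_name` allowed to be a list, a single config entry fans out into one container per metric. A sketch assuming the `get_test_config_container` defined just above; the dict values mirror the IO test configs elsewhere in this change:

```python
# One test config entry, two metrics -> two TestConfigContainer objects.
params = {
    'project': 'apache-beam-testing',
    'metrics_dataset': 'performance_tests',
    'metrics_table': 'io_performance_metrics',
    'test_description': 'example',
    'metric_name': ['RunTime', 'EstimatedCost'],
}
containers = [
    get_test_config_container(
        params=params, test_id='text_io_read', metric_name=m)
    for m in params['metric_name']
]
```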
+ """ + return ChangePointConfig( + min_runs_between_change_points=params.get( + 'min_runs_between_change_points', + constants._DEFAULT_MIN_RUNS_BETWEEN_CHANGE_POINTS), + num_runs_in_change_point_window=params.get( + 'num_runs_in_change_point_window', + constants._DEFAULT_NUM_RUMS_IN_CHANGE_POINT_WINDOW)) + + +def run_change_point_analysis( + test_config_container: TestConfigContainer, + big_query_metrics_fetcher: MetricsFetcher, + change_point_config: ChangePointConfig = ChangePointConfig(), + save_alert_metadata: bool = False, +): + """ + Args: + test_config_container: TestConfigContainer containing test metadata for + fetching data and running change point analysis. big_query_metrics_fetcher: BigQuery metrics fetcher used to fetch data for change point analysis. + change_point_config: ChangePointConfig containing parameters to run + change point analysis. + save_alert_metadata: bool indicating if issue metadata + should be published to BigQuery table. Returns: bool indicating if a change point is observed and alerted on GitHub. """ - logging.info("Running change point analysis for test %s" % test_name) - if not validate_config(params.keys()): - raise ValueError( - f"Please make sure all these keys {constants._PERF_TEST_KEYS} " - f"are specified for the {test_name}") + logging.info( + "Running change point analysis for test ID :%s on metric: % s" % + (test_config_container.test_id, test_config_container.metric_name)) - metric_name = params['metric_name'] + # test_name will be used to query a single test from + # multiple tests in a single BQ table. Right now, the default + # assumption is that all the test have an individual BQ table + # but this might not be case for other tests(such as IO tests where + # a single BQ tables stores all the data) + test_name = test_config_container.test_name min_runs_between_change_points = ( - constants._DEFAULT_MIN_RUNS_BETWEEN_CHANGE_POINTS) - if 'min_runs_between_change_points' in params: - min_runs_between_change_points = params['min_runs_between_change_points'] + change_point_config.min_runs_between_change_points) num_runs_in_change_point_window = ( - constants._DEFAULT_NUM_RUMS_IN_CHANGE_POINT_WINDOW) - if 'num_runs_in_change_point_window' in params: - num_runs_in_change_point_window = params['num_runs_in_change_point_window'] + change_point_config.num_runs_in_change_point_window) - metric_values, timestamps = fetch_metric_data( - params=params, - big_query_metrics_fetcher=big_query_metrics_fetcher - ) + metric_container = big_query_metrics_fetcher.fetch_metric_data( + test_config=test_config_container) + metric_container.sort_by_timestamp() + + metric_values = metric_container.values + timestamps = metric_container.timestamps change_point_index = find_latest_change_point_index( metric_values=metric_values) if not change_point_index: - logging.info("Change point is not detected for the test %s" % test_name) + logging.info( + "Change point is not detected for the test ID %s" % + test_config_container.test_id) return False # since timestamps are ordered in ascending order and # num_runs_in_change_point_window refers to the latest runs, @@ -92,22 +142,31 @@ def run_change_point_analysis(params, test_name, big_query_metrics_fetcher): if not is_change_point_in_valid_window(num_runs_in_change_point_window, latest_change_point_run): logging.info( - 'Performance regression/improvement found for the test: %s. ' + 'Performance regression/improvement found for the test ID: %s. ' 'on metric %s. 
Since the change point run %s ' 'lies outside the num_runs_in_change_point_window distance: %s, ' 'alert is not raised.' % ( - test_name, - metric_name, + test_config_container.test_id, + test_config_container.metric_name, latest_change_point_run + 1, num_runs_in_change_point_window)) return False - is_alert = True + is_valid_change_point = True last_reported_issue_number = None - issue_metadata_table_name = f'{params.get("metrics_table")}_{metric_name}' + + # create a unique table name for each test and metric combination. + # for beam load tests, metric_name and metric table are enough to + # create a unique table name. For templates/IO tests, add `test_name`. + issue_metadata_table_name = ( + f'{test_config_container.metrics_table}_{test_config_container.metric_name}' # pylint: disable=line-too-long + ) + if test_config_container.test_name: + issue_metadata_table_name = ( + f'{issue_metadata_table_name}_{test_config_container.test_name}') + existing_issue_data = get_existing_issues_data( - table_name=issue_metadata_table_name, - big_query_metrics_fetcher=big_query_metrics_fetcher) + table_name=issue_metadata_table_name) if existing_issue_data is not None: existing_issue_timestamps = existing_issue_data[ @@ -117,39 +176,50 @@ def run_change_point_analysis(params, test_name, big_query_metrics_fetcher): # convert numpy.int64 to int last_reported_issue_number = last_reported_issue_number.item() - is_alert = is_perf_alert( + is_valid_change_point = is_sibling_change_point( previous_change_point_timestamps=existing_issue_timestamps, change_point_index=change_point_index, timestamps=timestamps, - min_runs_between_change_points=min_runs_between_change_points) - if is_alert: + min_runs_between_change_points=min_runs_between_change_points, + test_id=test_config_container.test_id) + + # for testing purposes, we don't want to create an issue even if there is + # a valid change point. This is useful when we want to test the change point + # analysis logic without creating an issue. + if is_valid_change_point and save_alert_metadata: issue_number, issue_url = create_performance_alert( - metric_name, test_name, timestamps, - metric_values, change_point_index, - params.get('labels', None), - last_reported_issue_number, - test_description = params.get('test_description', None), + test_config_container=test_config_container, + metric_container=metric_container, + change_point_index=change_point_index, + existing_issue_number=last_reported_issue_number, ) issue_metadata = GitHubIssueMetaData( issue_timestamp=pd.Timestamp( datetime.now().replace(tzinfo=timezone.utc)), # BQ doesn't allow '.' 
in table name - test_name=test_name.replace('.', '_'), - metric_name=metric_name, - test_id=uuid.uuid4().hex, + test_id=test_config_container.test_id.replace('.', '_'), + test_name=test_name or uuid.uuid4().hex, + metric_name=test_config_container.metric_name, change_point=metric_values[change_point_index], issue_number=issue_number, issue_url=issue_url, - change_point_timestamp=timestamps[change_point_index]) - + change_point_timestamp=timestamps[change_point_index], + ) publish_issue_metadata_to_big_query( - issue_metadata=issue_metadata, table_name=issue_metadata_table_name) - - return is_alert + issue_metadata=issue_metadata, + table_name=issue_metadata_table_name, + project=test_config_container.project, + ) + return is_valid_change_point -def run(config_file_path: Optional[str] = None) -> None: +def run( + *, + config_file_path: str, + big_query_metrics_fetcher: MetricsFetcher = BigQueryMetricsFetcher(), + save_alert_metadata: bool = False, +) -> None: """ run is the entry point to run change point analysis on test metric data, which is read from the config file, and if there is a performance @@ -163,19 +233,25 @@ def run(config_file_path: Optional[str] = None) -> None: defined in the config file. """ - if config_file_path is None: - config_file_path = os.path.join( - os.path.dirname(os.path.abspath(__file__)), 'tests_config.yaml') - tests_config: Dict[str, Dict[str, Any]] = read_test_config(config_file_path) - big_query_metrics_fetcher = BigQueryMetricsFetcher() + for test_id, params in tests_config.items(): + # a single test config can have multiple metrics, so we need to + # iterate over all the metrics and run change point analysis + # for each metric. + metric_names = params['metric_name'] + if isinstance(metric_names, str): + metric_names = [metric_names] - for test_name, params in tests_config.items(): - run_change_point_analysis( - params=params, - test_name=test_name, - big_query_metrics_fetcher=big_query_metrics_fetcher) + for metric_name in metric_names: + test_config_container = get_test_config_container( + params=params, test_id=test_id, metric_name=metric_name) + change_point_config = get_change_point_config(params) + run_change_point_analysis( + test_config_container=test_config_container, + big_query_metrics_fetcher=big_query_metrics_fetcher, + change_point_config=change_point_config, + save_alert_metadata=save_alert_metadata) if __name__ == '__main__': @@ -184,7 +260,7 @@ def run(config_file_path: Optional[str] = None) -> None: parser = argparse.ArgumentParser() parser.add_argument( '--config_file_path', - default=None, + required=True, type=str, help='Path to the config file that contains data to run the Change Point ' 'Analysis. The default file used will be ' @@ -193,9 +269,17 @@ def run(config_file_path: Optional[str] = None) -> None: 'performance regression in the tests, ' 'please provide a .yml file in the same structure as the above ' 'mentioned file. ') + parser.add_argument( + '--save_alert_metadata', + action='store_true', + help='Save perf alert / GH issue metadata to a BigQuery table.') known_args, unknown_args = parser.parse_known_args() if unknown_args: logging.warning('Discarding unknown arguments: %s' % unknown_args) - run(known_args.config_file_path) + run( + config_file_path=known_args.config_file_path, + # Set this to True while running in production.
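+ # An illustrative invocation of this entry point (the module path and + # config path here are assumptions for illustration, not part of this + # change): + # python -m apache_beam.testing.analyzers.perf_analysis \ + # --config_file_path=<path/to/tests_config.yaml> --save_alert_metadata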
+ save_alert_metadata=known_args.save_alert_metadata # pylint: disable=line-too-long + ) diff --git a/sdks/python/apache_beam/testing/analyzers/perf_analysis_test.py b/sdks/python/apache_beam/testing/analyzers/perf_analysis_test.py index 094cd9c47ec02..4ef394d4ffab5 100644 --- a/sdks/python/apache_beam/testing/analyzers/perf_analysis_test.py +++ b/sdks/python/apache_beam/testing/analyzers/perf_analysis_test.py @@ -16,7 +16,6 @@ # # pytype: skip-file -import datetime import logging import os import re @@ -32,54 +31,55 @@ from apache_beam.io.filesystems import FileSystems from apache_beam.testing.analyzers import constants from apache_beam.testing.analyzers import github_issues_utils + from apache_beam.testing.analyzers.perf_analysis_utils import BigQueryMetricsFetcher + from apache_beam.testing.analyzers.perf_analysis_utils import MetricContainer + from apache_beam.testing.analyzers.perf_analysis_utils import TestConfigContainer from apache_beam.testing.analyzers.perf_analysis_utils import is_change_point_in_valid_window - from apache_beam.testing.analyzers.perf_analysis_utils import is_perf_alert + from apache_beam.testing.analyzers.perf_analysis_utils import is_edge_change_point + from apache_beam.testing.analyzers.perf_analysis_utils import is_sibling_change_point from apache_beam.testing.analyzers.perf_analysis_utils import e_divisive from apache_beam.testing.analyzers.perf_analysis_utils import filter_change_points_by_median_threshold from apache_beam.testing.analyzers.perf_analysis_utils import find_change_points from apache_beam.testing.analyzers.perf_analysis_utils import find_latest_change_point_index from apache_beam.testing.analyzers.perf_analysis_utils import validate_config from apache_beam.testing.load_tests import load_test_metrics_utils + except ImportError as e: - analysis = None # type: ignore + raise unittest.SkipTest('Missing dependencies to run perf analysis tests.') # mock methods. -def get_fake_data_with_no_change_point(**kwargs): +def get_fake_data_with_no_change_point(*args, **kwargs): num_samples = 20 metric_values = [1] * num_samples - timestamps = list(range(num_samples)) - return metric_values, timestamps + timestamps = [pd.Timestamp(i) for i in range(num_samples)] + return MetricContainer(metric_values, timestamps) -def get_fake_data_with_change_point(**kwargs): +def get_fake_data_with_change_point(*args, **kwargs): # change point will be at index 13. num_samples = 20 metric_values = [0] * 12 + [3] + [4] * 7 - timestamps = [i for i in range(num_samples)] - return metric_values, timestamps + timestamps = [pd.Timestamp(i) for i in range(num_samples)] + return MetricContainer(metric_values, timestamps) def get_existing_issue_data(**kwargs): # change point found at index 13. So passing 13 in the # existing issue data in mock method. return pd.DataFrame([{ - constants._CHANGE_POINT_TIMESTAMP_LABEL: 13, + constants._CHANGE_POINT_TIMESTAMP_LABEL: pd.Timestamp(13), constants._ISSUE_NUMBER: np.array([0]) }]) -@unittest.skipIf( - analysis is None, - 'Missing dependencies. 
' - 'Test dependencies are missing for the Analyzer.') class TestChangePointAnalysis(unittest.TestCase): def setUp(self) -> None: self.single_change_point_series = [0] * 10 + [1] * 10 self.multiple_change_point_series = self.single_change_point_series + [ 2 ] * 20 - self.timestamps = list(range(5)) + self.timestamps = [pd.Timestamp(i) for i in range(5)] self.params = { 'test_description': 'fake_description', 'metrics_dataset': 'fake_dataset', @@ -125,44 +125,52 @@ def test_validate_config(self): def test_duplicate_change_point(self): change_point_index = 2 min_runs_between_change_points = 1 - is_alert = is_perf_alert( + is_alert = is_sibling_change_point( previous_change_point_timestamps=[self.timestamps[0]], timestamps=self.timestamps, change_point_index=change_point_index, - min_runs_between_change_points=min_runs_between_change_points) + min_runs_between_change_points=min_runs_between_change_points, + test_id=self.test_id) self.assertTrue(is_alert) def test_duplicate_change_points_are_not_valid_alerts(self): change_point_index = 2 min_runs_between_change_points = 1 - is_alert = is_perf_alert( + is_alert = is_sibling_change_point( previous_change_point_timestamps=[self.timestamps[3]], timestamps=self.timestamps, change_point_index=change_point_index, - min_runs_between_change_points=min_runs_between_change_points) + min_runs_between_change_points=min_runs_between_change_points, + test_id=self.test_id) self.assertFalse(is_alert) - is_alert = is_perf_alert( + is_alert = is_sibling_change_point( previous_change_point_timestamps=[ self.timestamps[0], self.timestamps[3] ], timestamps=self.timestamps, change_point_index=change_point_index, - min_runs_between_change_points=min_runs_between_change_points) + min_runs_between_change_points=min_runs_between_change_points, + test_id=self.test_id) self.assertFalse(is_alert) - @mock.patch( - 'apache_beam.testing.analyzers.perf_analysis.fetch_metric_data', + @mock.patch.object( + BigQueryMetricsFetcher, + 'fetch_metric_data', get_fake_data_with_no_change_point) def test_no_alerts_when_no_change_points(self): - is_alert = analysis.run_change_point_analysis( + test_config_container = analysis.get_test_config_container( params=self.params, - test_name=self.test_id, - big_query_metrics_fetcher=None) + test_id=self.test_id, + metric_name=self.params['metric_name']) + is_alert = analysis.run_change_point_analysis( + test_config_container=test_config_container, + big_query_metrics_fetcher=BigQueryMetricsFetcher()) self.assertFalse(is_alert) - @mock.patch( - 'apache_beam.testing.analyzers.perf_analysis.fetch_metric_data', + @mock.patch.object( + BigQueryMetricsFetcher, + 'fetch_metric_data', get_fake_data_with_change_point) @mock.patch( 'apache_beam.testing.analyzers.perf_analysis.get_existing_issues_data', @@ -176,14 +184,18 @@ def test_no_alerts_when_no_change_points(self): '.create_performance_alert', return_value=(0, '')) def test_alert_on_data_with_change_point(self, *args): - is_alert = analysis.run_change_point_analysis( + test_config_container = analysis.get_test_config_container( params=self.params, - test_name=self.test_id, - big_query_metrics_fetcher=None) + test_id=self.test_id, + metric_name=self.params['metric_name']) + is_alert = analysis.run_change_point_analysis( + test_config_container=test_config_container, + big_query_metrics_fetcher=BigQueryMetricsFetcher()) self.assertTrue(is_alert) - @mock.patch( - 'apache_beam.testing.analyzers.perf_analysis.fetch_metric_data', + @mock.patch.object( + BigQueryMetricsFetcher, + 'fetch_metric_data', 
get_fake_data_with_change_point) @mock.patch( 'apache_beam.testing.analyzers.perf_analysis.get_existing_issues_data', @@ -196,23 +208,34 @@ def test_alert_on_data_with_change_point(self, *args): 'apache_beam.testing.analyzers.perf_analysis.create_performance_alert', return_value=(0, '')) def test_alert_on_data_with_reported_change_point(self, *args): - is_alert = analysis.run_change_point_analysis( + test_config_container = analysis.get_test_config_container( params=self.params, - test_name=self.test_id, - big_query_metrics_fetcher=None) + test_id=self.test_id, + metric_name=self.params['metric_name']) + is_alert = analysis.run_change_point_analysis( + test_config_container=test_config_container, + big_query_metrics_fetcher=BigQueryMetricsFetcher()) self.assertFalse(is_alert) def test_change_point_has_anomaly_marker_in_gh_description(self): - metric_values, timestamps = get_fake_data_with_change_point() - timestamps = [datetime.datetime.fromtimestamp(ts) for ts in timestamps] + metric_container = get_fake_data_with_change_point() + metric_values = metric_container.values change_point_index = find_latest_change_point_index(metric_values) - description = github_issues_utils.get_issue_description( - test_name=self.test_id, - test_description=self.params['test_description'], + test_config_container = TestConfigContainer( + project=self.params['project'], + metrics_dataset=self.params['metrics_dataset'], + metrics_table=self.params['metrics_table'], metric_name=self.params['metric_name'], - metric_values=metric_values, - timestamps=timestamps, + test_id=self.test_id, + test_description=self.params['test_description'], + test_name=self.params.get('test_name', None), + labels=self.params.get('labels', None), + ) + + description = github_issues_utils.get_issue_description( + test_config_container=test_config_container, + metric_container=metric_container, change_point_index=change_point_index, max_results_to_display=( constants._NUM_RESULTS_TO_DISPLAY_ON_ISSUE_DESCRIPTION)) @@ -241,6 +264,15 @@ def read_csv(path): metric_values, change_points) self.assertEqual(len(valid_points), 0) + def test_change_point_on_edge_segment(self): + data = [1] * 50 + [100] + change_points = find_change_points(data) + self.assertEqual(change_points, [50]) + + self.assertEqual(is_edge_change_point(change_points[0], len(data)), True) + + self.assertEqual(find_latest_change_point_index(data), None) + if __name__ == '__main__': logging.getLogger().setLevel(logging.DEBUG) diff --git a/sdks/python/apache_beam/testing/analyzers/perf_analysis_utils.py b/sdks/python/apache_beam/testing/analyzers/perf_analysis_utils.py index 0a559fc4beebd..a9015d715e908 100644 --- a/sdks/python/apache_beam/testing/analyzers/perf_analysis_utils.py +++ b/sdks/python/apache_beam/testing/analyzers/perf_analysis_utils.py @@ -14,11 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import abc import logging from dataclasses import asdict from dataclasses import dataclass from statistics import median -from typing import Any from typing import Dict from typing import List from typing import Optional @@ -30,12 +30,16 @@ from google.api_core import exceptions from apache_beam.testing.analyzers import constants -from apache_beam.testing.analyzers import github_issues_utils from apache_beam.testing.load_tests import load_test_metrics_utils -from apache_beam.testing.load_tests.load_test_metrics_utils import BigQueryMetricsFetcher from apache_beam.testing.load_tests.load_test_metrics_utils import BigQueryMetricsPublisher from signal_processing_algorithms.energy_statistics.energy_statistics import e_divisive +# pylint: disable=ungrouped-imports +try: + from google.cloud import bigquery +except ImportError: + bigquery = None # type: ignore + @dataclass(frozen=True) class GitHubIssueMetaData: @@ -54,14 +58,60 @@ class GitHubIssueMetaData: change_point: float +@dataclass +class ChangePointConfig: + """ + This class holds the change point configuration parameters. + """ + min_runs_between_change_points: int = ( + constants._DEFAULT_MIN_RUNS_BETWEEN_CHANGE_POINTS) + num_runs_in_change_point_window: int = ( + constants._DEFAULT_NUM_RUMS_IN_CHANGE_POINT_WINDOW) + + +@dataclass +class TestConfigContainer: + metric_name: str + project: str + metrics_dataset: str + metrics_table: str + test_id: str # unique id for each test config. + test_description: str + test_name: Optional[str] = None + labels: Optional[List[str]] = None + + +@dataclass +class MetricContainer: + """ + This class holds the metric values and timestamps for a given metric. + Args: + values: List of metric values. + timestamps: List of pandas timestamps corresponding to the metric values. + """ + + values: List[Union[int, float]] + timestamps: List[pd.Timestamp] + + def sort_by_timestamp(self, in_place=True): + """ + Sorts the metric values and timestamps in ascending order with respect + to timestamps. + Args: + in_place: If True, sort the metric values and timestamps in place; + otherwise, return a new sorted MetricContainer. + """ + timestamps, values = zip(*sorted(zip(self.timestamps, self.values))) + if not in_place: + return MetricContainer(values=values, timestamps=timestamps) + self.timestamps, self.values = timestamps, values + + def is_change_point_in_valid_window( num_runs_in_change_point_window: int, latest_change_point_run: int) -> bool: return num_runs_in_change_point_window > latest_change_point_run -def get_existing_issues_data( - table_name: str, big_query_metrics_fetcher: BigQueryMetricsFetcher -) -> Optional[pd.DataFrame]: +def get_existing_issues_data(table_name: str) -> Optional[pd.DataFrame]: """ Finds the most recent GitHub issue created for the test_name. If no table is found with name=test_name, returns None. @@ -73,20 +123,28 @@ def get_existing_issues_data( LIMIT 10 """ try: - df = big_query_metrics_fetcher.fetch(query=query) + if bigquery is None: + raise ImportError('Bigquery dependencies are not installed.') + client = bigquery.Client() + query_job = client.query(query=query) + existing_issue_data = query_job.result().to_dataframe() except exceptions.NotFound: # If no table is found, that means this is the first performance regression # on the current test+metric.
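+ # (google.api_core's exceptions.NotFound is what the BigQuery client + # raises when the queried issue-metadata table does not exist yet.)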
return None - return df + return existing_issue_data -def is_perf_alert( +def is_sibling_change_point( previous_change_point_timestamps: List[pd.Timestamp], change_point_index: int, timestamps: List[pd.Timestamp], - min_runs_between_change_points: int) -> bool: + min_runs_between_change_points: int, + test_id: str, +) -> bool: """ + Sibling change points are the change points that are close to each other. + Search the previous_change_point_timestamps with current observed change point sibling window and determine if it is a duplicate change point or not. @@ -105,6 +163,18 @@ def is_perf_alert( for previous_change_point_timestamp in previous_change_point_timestamps: if (sibling_change_point_min_timestamp <= previous_change_point_timestamp <= sibling_change_point_max_timestamp): + logging.info( + 'Performance regression/improvement found for the test ID: %s. ' + 'Since the change point timestamp %s ' + 'lies within the sibling change point window: %s, ' + 'alert is not raised.' % ( + test_id, + previous_change_point_timestamp.strftime('%Y-%m-%d %H:%M:%S'), + ( + sibling_change_point_min_timestamp.strftime( + '%Y-%m-%d %H:%M:%S'), + sibling_change_point_max_timestamp.strftime( + '%Y-%m-%d %H:%M:%S')))) return False return True @@ -123,33 +193,6 @@ def validate_config(keys): return constants._PERF_TEST_KEYS.issubset(keys) -def fetch_metric_data( - params: Dict[str, Any], big_query_metrics_fetcher: BigQueryMetricsFetcher -) -> Tuple[List[Union[int, float]], List[pd.Timestamp]]: - """ - Args: - params: Dict containing keys required to fetch data from a data source. - big_query_metrics_fetcher: A BigQuery metrics fetcher for fetch metrics. - Returns: - Tuple[List[Union[int, float]], List[pd.Timestamp]]: Tuple containing list - of metric_values and list of timestamps. Both are sorted in ascending - order wrt timestamps. - """ - query = f""" - SELECT * - FROM {params['project']}.{params['metrics_dataset']}.{params['metrics_table']} - WHERE CONTAINS_SUBSTR(({load_test_metrics_utils.METRICS_TYPE_LABEL}), '{params['metric_name']}') - ORDER BY {load_test_metrics_utils.SUBMIT_TIMESTAMP_LABEL} DESC - LIMIT {constants._NUM_DATA_POINTS_TO_RUN_CHANGE_POINT_ANALYSIS} - """ - metric_data: pd.DataFrame = big_query_metrics_fetcher.fetch(query=query) - metric_data.sort_values( - by=[load_test_metrics_utils.SUBMIT_TIMESTAMP_LABEL], inplace=True) - return ( - metric_data[load_test_metrics_utils.VALUE_LABEL].tolist(), - metric_data[load_test_metrics_utils.SUBMIT_TIMESTAMP_LABEL].tolist()) - - def find_change_points(metric_values: List[Union[float, int]]): return e_divisive(metric_values) @@ -170,15 +213,34 @@ def find_latest_change_point_index(metric_values: List[Union[float, int]]): if not change_points_indices: return None change_points_indices.sort() - return change_points_indices[-1] + # Remove the change points that are at the edges of the data. + # https://github.com/apache/beam/issues/28757 + # Remove this workaround once we have a good solution to deal + # with the edge change points. + change_point_index = change_points_indices[-1] + if is_edge_change_point(change_point_index, + len(metric_values), + constants._EDGE_SEGMENT_SIZE): + logging.info( + 'The change point %s is located at the edge of the data with an edge ' + 'segment size of %s. This change point will be ignored for now, ' + 'awaiting additional data. Should the change point persist after ' + 'gathering more data, an alert will be raised.' 
% + (change_point_index, constants._EDGE_SEGMENT_SIZE)) + return None + return change_point_index -def publish_issue_metadata_to_big_query(issue_metadata, table_name): +def publish_issue_metadata_to_big_query( + issue_metadata, + table_name, + project=constants._BQ_PROJECT_NAME, +): """ - Published issue_metadata to BigQuery with table name=test_name. + Publishes issue_metadata to BigQuery under the given table name. """ bq_metrics_publisher = BigQueryMetricsPublisher( - project_name=constants._BQ_PROJECT_NAME, + project_name=project, dataset=constants._BQ_DATASET, table=table_name, bq_schema=constants._SCHEMA) @@ -189,34 +251,32 @@ def publish_issue_metadata_to_big_query(issue_metadata, table_name): def create_performance_alert( - metric_name: str, - test_name: str, - timestamps: List[pd.Timestamp], - metric_values: List[Union[int, float]], + test_config_container: TestConfigContainer, + metric_container: MetricContainer, change_point_index: int, - labels: List[str], existing_issue_number: Optional[int], - test_description: Optional[str] = None) -> Tuple[int, str]: +) -> Tuple[int, str]: """ Creates performance alert on GitHub issues and returns GitHub issue number and issue URL. """ + # avoid circular imports + # pylint: disable=wrong-import-order, wrong-import-position + from apache_beam.testing.analyzers import github_issues_utils + description = github_issues_utils.get_issue_description( - test_name=test_name, - test_description=test_description, - metric_name=metric_name, - timestamps=timestamps, - metric_values=metric_values, + test_config_container=test_config_container, + metric_container=metric_container, change_point_index=change_point_index, max_results_to_display=( constants._NUM_RESULTS_TO_DISPLAY_ON_ISSUE_DESCRIPTION)) issue_number, issue_url = github_issues_utils.report_change_point_on_issues( title=github_issues_utils._ISSUE_TITLE_TEMPLATE.format( - test_name, metric_name + test_config_container.test_id, test_config_container.metric_name ), description=description, - labels=labels, + labels=test_config_container.labels, existing_issue_number=existing_issue_number) logging.info( @@ -253,3 +313,62 @@ def filter_change_points_by_median_threshold( if relative_change > threshold: valid_change_points.append(idx) return valid_change_points + + +def is_edge_change_point( + change_point_index, + data_size, + edge_segment_size=constants._EDGE_SEGMENT_SIZE): + """ + Returns True if the change point lies in the edge segment at the end + of the data; such change points are ignored until more data arrives. + Args: + change_point_index: Index of the change point. + data_size: Size of the data. + edge_segment_size: Size of the edge segment. + """ + return change_point_index > data_size - edge_segment_size + + +class MetricsFetcher(metaclass=abc.ABCMeta): + @abc.abstractmethod + def fetch_metric_data( + self, *, test_config: TestConfigContainer) -> MetricContainer: + """ + Fetches the timestamps and metric values for the given test config + from the underlying metrics store (e.g. BigQuery tables). + """ + raise NotImplementedError + + +class BigQueryMetricsFetcher(MetricsFetcher): + def fetch_metric_data( + self, *, test_config: TestConfigContainer) -> MetricContainer: + """ + Args: + test_config: TestConfigContainer containing metadata required to fetch + metric data from BigQuery. + Returns: + MetricContainer containing metric values and timestamps. 
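+ Note: the query below orders rows by submit timestamp in descending + order, so callers are expected to re-sort ascending (see + MetricContainer.sort_by_timestamp) before running change point analysis.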
+ """ + project = test_config.project + metrics_dataset = test_config.metrics_dataset + metrics_table = test_config.metrics_table + metric_name = test_config.metric_name + query = f""" + SELECT * + FROM {project}.{metrics_dataset}.{metrics_table} + WHERE CONTAINS_SUBSTR(({load_test_metrics_utils.METRICS_TYPE_LABEL}), '{metric_name}') + ORDER BY {load_test_metrics_utils.SUBMIT_TIMESTAMP_LABEL} DESC + LIMIT {constants._NUM_DATA_POINTS_TO_RUN_CHANGE_POINT_ANALYSIS} + """ + if bigquery is None: + raise ImportError('Bigquery dependencies are not installed.') + client = bigquery.Client() + query_job = client.query(query=query) + metric_data = query_job.result().to_dataframe() + # metric_data.sort_values( + # by=[load_test_metrics_utils.SUBMIT_TIMESTAMP_LABEL], inplace=True) + return MetricContainer( + values=metric_data[load_test_metrics_utils.VALUE_LABEL].tolist(), + timestamps=metric_data[ + load_test_metrics_utils.SUBMIT_TIMESTAMP_LABEL].tolist()) diff --git a/sdks/python/apache_beam/testing/analyzers/tests_config.yaml b/sdks/python/apache_beam/testing/analyzers/tests_config.yaml index f808f5e41d740..2e72cd5cc301f 100644 --- a/sdks/python/apache_beam/testing/analyzers/tests_config.yaml +++ b/sdks/python/apache_beam/testing/analyzers/tests_config.yaml @@ -16,10 +16,10 @@ # # for the unique key to define a test, please use the following format: -# {test_name}-{metric_name} +# {test_id}-{metric_name} pytorch_image_classification_benchmarks-resnet152-mean_inference_batch_latency_micro_secs: - test_description: + test_description: | Pytorch image classification on 50k images of size 224 x 224 with resnet 152. Test link - https://github.com/apache/beam/blob/42d0a6e3564d8b9c5d912428a6de18fb22a13ac1/.test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy#L63 Test dashboard - http://metrics.beam.apache.org/d/ZpS8Uf44z/python-ml-runinference-benchmarks?orgId=1&viewPanel=2 @@ -30,7 +30,7 @@ pytorch_image_classification_benchmarks-resnet152-mean_inference_batch_latency_m metric_name: mean_inference_batch_latency_micro_secs pytorch_image_classification_benchmarks-resnet101-mean_load_model_latency_milli_secs: - test_description: + test_description: | Pytorch image classification on 50k images of size 224 x 224 with resnet 101. Test link - https://github.com/apache/beam/blob/42d0a6e3564d8b9c5d912428a6de18fb22a13ac1/.test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy#L34 Test dashboard - http://metrics.beam.apache.org/d/ZpS8Uf44z/python-ml-runinference-benchmarks?orgId=1&viewPanel=7 @@ -41,7 +41,7 @@ pytorch_image_classification_benchmarks-resnet101-mean_load_model_latency_milli_ metric_name: mean_load_model_latency_milli_secs pytorch_image_classification_benchmarks-resnet101-mean_inference_batch_latency_micro_secs: - test_description: + test_description: | Pytorch image classification on 50k images of size 224 x 224 with resnet 101. Test link - https://github.com/apache/beam/blob/42d0a6e3564d8b9c5d912428a6de18fb22a13ac1/.test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy#L34 Test dashboard - http://metrics.beam.apache.org/d/ZpS8Uf44z/python-ml-runinference-benchmarks?orgId=1&viewPanel=2 @@ -52,7 +52,7 @@ pytorch_image_classification_benchmarks-resnet101-mean_inference_batch_latency_m metric_name: mean_inference_batch_latency_micro_secs pytorch_image_classification_benchmarks-resnet152-GPU-mean_inference_batch_latency_micro_secs: - test_description: + test_description: | Pytorch image classification on 50k images of size 224 x 224 with resnet 152 with Tesla T4 GPU. 
Test link - https://github.com/apache/beam/blob/42d0a6e3564d8b9c5d912428a6de18fb22a13ac1/.test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy#L151 Test dashboard - http://metrics.beam.apache.org/d/ZpS8Uf44z/python-ml-runinference-benchmarks?orgId=1&viewPanel=7 @@ -63,7 +63,7 @@ pytorch_image_classification_benchmarks-resnet152-GPU-mean_inference_batch_laten metric_name: mean_inference_batch_latency_micro_secs pytorch_image_classification_benchmarks-resnet152-GPU-mean_load_model_latency_milli_secs: - test_description: + test_description: | Pytorch image classification on 50k images of size 224 x 224 with resnet 152 with Tesla T4 GPU. Test link - https://github.com/apache/beam/blob/42d0a6e3564d8b9c5d912428a6de18fb22a13ac1/.test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy#L151 Test dashboard - http://metrics.beam.apache.org/d/ZpS8Uf44z/python-ml-runinference-benchmarks?orgId=1&viewPanel=7 @@ -74,7 +74,7 @@ pytorch_image_classification_benchmarks-resnet152-GPU-mean_load_model_latency_mi metric_name: mean_load_model_latency_milli_secs pytorch_image_classification_benchmarks-resnet152-GPU-mean_inference_batch_latency_micro_secs: - test_description: + test_description: | Pytorch image classification on 50k images of size 224 x 224 with resnet 152 with Tesla T4 GPU. Test link - https://github.com/apache/beam/blob/42d0a6e3564d8b9c5d912428a6de18fb22a13ac1/.test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy#L151). Test dashboard - http://metrics.beam.apache.org/d/ZpS8Uf44z/python-ml-runinference-benchmarks?from=now-90d&to=now&viewPanel=2 @@ -85,7 +85,7 @@ pytorch_image_classification_benchmarks-resnet152-GPU-mean_inference_batch_laten metric_name: mean_inference_batch_latency_micro_secs test_cloudml_benchmark_cirteo_no_shuffle_10GB-runtime_sec: - test_description: + test_description: | TFT Criteo test on 10 GB data with no Reshuffle. Test link - [Test link](https://github.com/apache/beam/blob/42d0a6e3564d8b9c5d912428a6de18fb22a13ac1/sdks/python/apache_beam/testing/benchmarks/cloudml/cloudml_benchmark_test.py#L82) metrics_dataset: beam_cloudml @@ -94,7 +94,7 @@ test_cloudml_benchmark_cirteo_no_shuffle_10GB-runtime_sec: metric_name: runtime_sec test_cloudml_benchmark_criteo_10GB-runtime_sec: - test_description: + test_description: | TFT Criteo test on 10 GB data. 
Test link - https://github.com/apache/beam/blob/42d0a6e3564d8b9c5d912428a6de18fb22a13ac1/sdks/python/apache_beam/testing/benchmarks/cloudml/cloudml_benchmark_test.py#LL104C7-L104C41 metrics_dataset: beam_cloudml @@ -104,7 +104,7 @@ test_cloudml_benchmark_criteo_10GB-runtime_sec: # Python Combine load tests at http://metrics.beam.apache.org/d/WNzYt13Zk/combine-load-tests?orgId=1 combine_python_batch_2gb_10_byte_records: - test_description: + test_description: | Combine Python Load Test 2 GB 10 byte records Test link - https://github.com/apache/beam/blob/5e38decf9e723a385057131b01bbd33d8c60bda3/.test-infra/jenkins/job_LoadTests_Combine_Python.groovy#L76C24-L76C65 Test dashboard - http://metrics.beam.apache.org/d/WNzYt13Zk/combine-load-tests?orgId=1&from=now-90d&to=now&var-processingType=batch&var-sdk=python&viewPanel=2 @@ -115,7 +115,7 @@ combine_python_batch_2gb_10_byte_records: project: apache-beam-testing combine_python_batch_2gb_fanout_4: - test_description: + test_description: | Combine Python Load test - 2GB Fanout 4 Test link - https://github.com/apache/beam/blob/5e38decf9e723a385057131b01bbd33d8c60bda3/.test-infra/jenkins/job_LoadTests_Combine_Python.groovy#L52 Test Dashboard - http://metrics.beam.apache.org/d/WNzYt13Zk/combine-load-tests?orgId=1&from=now-90d&to=now&var-processingType=batch&var-sdk=python&viewPanel=4 @@ -126,7 +126,8 @@ combine_python_batch_2gb_fanout_4: project: apache-beam-testing combine_python_batch_2gb_fanout_8: - test_description: Combine Python Load test - 2GB Fanout 8 + test_description: | + Combine Python Load test - 2GB Fanout 8 test_target: apache_beam.testing.load_tests.combine_test metrics_dataset: load_test metrics_table: python_dataflow_batch_combine_5 @@ -135,7 +136,7 @@ combine_python_batch_2gb_fanout_8: # Python Batch GBK load tests at http://metrics.beam.apache.org/d/UYZ-oJ3Zk/gbk-load-tests?orgId=1&var-processingType=batch&var-sdk=python&from=now-90d&to=now gbk_python_batch_load_test_2gb_of_10B_records: - test_description: + test_description: | GroupByKey Python Load test - 2GB of 10B records python | GBK | Small records (10B) Test Dashboard - http://metrics.beam.apache.org/d/UYZ-oJ3Zk/gbk-load-tests?orgId=1&var-processingType=batch&var-sdk=python&from=now-90d&to=now&viewPanel=2 @@ -147,7 +148,7 @@ gbk_python_batch_load_test_2gb_of_10B_records: project: apache-beam-testing gbk_python_batch_load_test_2gb_of_100B_records: - test_description: + test_description: | GroupByKey Python Load test - 2GB of 100B records python | GBK | Medium records (100B) Test Dashboard - http://metrics.beam.apache.org/d/UYZ-oJ3Zk/gbk-load-tests?orgId=1&var-processingType=batch&var-sdk=python&from=now-90d&to=now&viewPanel=3 @@ -159,7 +160,7 @@ gbk_python_batch_load_test_2gb_of_100B_records: project: apache-beam-testing gbk_python_batch_load_test_2gb_of_100KB_records: - test_description: + test_description: | GroupByKey Python Load test - 2GB of 100kB records python | GBK | Large records (100kB) Test Dashboard - http://metrics.beam.apache.org/d/UYZ-oJ3Zk/gbk-load-tests?orgId=1&var-processingType=batch&var-sdk=python&from=now-6M&to=now&viewPanel=4&inspect=4 @@ -173,7 +174,7 @@ gbk_python_batch_load_test_2gb_of_100KB_records: gbk_python_batch_load_test_fanout_4_times_with_2GB_10byte_records_total: # this test looks little noisy. Remove this if it causes too many false # positives. 
- test_description: + test_description: | GroupByKey Python Load test - fanout 4 times with 2GB 10-byte records total python | GBK | Fanout 4 Test Dashboard - http://metrics.beam.apache.org/d/UYZ-oJ3Zk/gbk-load-tests?orgId=1&var-processingType=batch&var-sdk=python&from=now-90d&to=now&viewPanel=5 @@ -188,7 +189,7 @@ gbk_python_batch_load_test_fanout_4_times_with_2GB_10byte_records_total: gbk_python_batch_load_test_fanout_8_times_with_2GB_10byte_records_total: # this test looks little noisy. Remove this if it causes too many false # positives. - test_description: + test_description: | GroupByKey Python Load test - fanout 8 times with 2GB 10-byte records total python | GBK | Fanout 8 Test Dashboard - http://metrics.beam.apache.org/d/UYZ-oJ3Zk/gbk-load-tests?orgId=1&var-processingType=batch&var-sdk=python&from=now-90d&to=now&viewPanel=6 @@ -201,7 +202,7 @@ gbk_python_batch_load_test_fanout_8_times_with_2GB_10byte_records_total: # Python SideInput load tests at http://metrics.beam.apache.org/d/-E9aGlFGk/side-input-load-tests?orgId=1&from=now-90d&to=now sideinpts_python_batch_1gb_1kb_10workers_1000window_1key_percent_dict: - test_description: + test_description: | python | Side Input | 1 GB dictionary, 1% of keys, 1000 fixed windows Test Link - https://github.com/apache/beam/blob/5e38decf9e723a385057131b01bbd33d8c60bda3/.test-infra/jenkins/job_LoadTests_SideInput_Python.groovy#L120 Test Dashboard - http://metrics.beam.apache.org/d/-E9aGlFGk/side-input-load-tests?orgId=1&from=now-90d&to=now&viewPanel=8 @@ -212,7 +213,7 @@ sideinpts_python_batch_1gb_1kb_10workers_1000window_1key_percent_dict: sideinpts_python_batch_1gb_1kb_10workers_1000window_99key_percent_dict: - test_description: + test_description: | python | Side Input | 1 GB dictionary, 99% of keys, 1000 fixed windows Test Link - https://github.com/apache/beam/blob/5e38decf9e723a385057131b01bbd33d8c60bda3/.test-infra/jenkins/job_LoadTests_SideInput_Python.groovy#L133 Test Dashboard - http://metrics.beam.apache.org/d/-E9aGlFGk/side-input-load-tests?orgId=1&from=now-90d&to=now&viewPanel=9 @@ -222,7 +223,7 @@ sideinpts_python_batch_1gb_1kb_10workers_1000window_99key_percent_dict: project: apache-beam-testing sideinpts_python_batch_10gb_1kb_10workers_1000window_first_iterable: - test_description: + test_description: | python | Side Input | 10 GB iterable, 1% of elements, 1000 fixed windows Test Link - https://github.com/apache/beam/blob/5e38decf9e723a385057131b01bbd33d8c60bda3/.test-infra/jenkins/job_LoadTests_SideInput_Python.groovy#L146 Test Dashboard - http://metrics.beam.apache.org/d/-E9aGlFGk/side-input-load-tests?orgId=1&from=now-90d&to=now&viewPanel=10 @@ -233,7 +234,7 @@ sideinpts_python_batch_10gb_1kb_10workers_1000window_first_iterable: sideinpts_python_batch_10gb_1kb_10workers_1000window_first_iterable: - test_description: + test_description: | python | Side Input | 10 GB iterable, all elements, 1000 fixed windows Test Link - https://github.com/apache/beam/blob/5e38decf9e723a385057131b01bbd33d8c60bda3/.test-infra/jenkins/job_LoadTests_SideInput_Python.groovy#L159 Test Dashboard - http://metrics.beam.apache.org/d/-E9aGlFGk/side-input-load-tests?orgId=1&from=now-90d&to=now&viewPanel=11 @@ -245,7 +246,7 @@ sideinpts_python_batch_10gb_1kb_10workers_1000window_first_iterable: # Python CoGBK load tests at http://metrics.beam.apache.org/d/fK0U4JqWz/cogbk-load-tests?orgId=1&var-processingType=batch&var-sdk=python cogbk_python_batch_load_test_2GB_of_100B_records_with_a_single_key: - test_description: + test_description: | CoGroupByKey Python 
Load test - 2GB of 100B records with a single key python | coGBK | 100B records with a single key Test Link - https://github.com/apache/beam/blob/5e38decf9e723a385057131b01bbd33d8c60bda3/.test-infra/jenkins/job_LoadTests_coGBK_Python.groovy#L32C25-L32C76 @@ -257,7 +258,7 @@ cogbk_python_batch_load_test_2GB_of_100B_records_with_a_single_key: project: apache-beam-testing cogbk_python_batch_load_test_2GB_of_100B_records_with_a_multiple_key: - test_description: + test_description: | CoGroupByKey Python Load test - 2GB of 100B records with multiple keys python | coGBK | 100B records with multiple keys @@ -270,7 +271,7 @@ cogbk_python_batch_load_test_2GB_of_100B_records_with_a_multiple_key: project: apache-beam-testing cogbk_python_batch_load_test_reiterate_4times_10KB_values: - test_description: + test_description: | CoGroupByKey Python Load test - reiterate 4 times 10kB values python | coGBK | reiteration 10kB value Test Link - https://github.com/apache/beam/blob/5e38decf9e723a385057131b01bbd33d8c60bda3/.test-infra/jenkins/job_LoadTests_coGBK_Python.groovy#L96 @@ -281,7 +282,7 @@ cogbk_python_batch_load_test_reiterate_4times_10KB_values: project: apache-beam-testing cogbk_python_batch_load_test_reiterate_4times_2MB_values: - test_description: + test_description: | CoGroupByKey Python Load test - reiterate 4 times 2 MB values python | coGBK | reiteration 2MB value Test Link - https://github.com/apache/beam/blob/5e38decf9e723a385057131b01bbd33d8c60bda3/.test-infra/jenkins/job_LoadTests_coGBK_Python.groovy#L128 diff --git a/sdks/python/apache_beam/testing/benchmarks/nexmark/nexmark_launcher.py b/sdks/python/apache_beam/testing/benchmarks/nexmark/nexmark_launcher.py index 2296588ec4965..bdf6f476212db 100644 --- a/sdks/python/apache_beam/testing/benchmarks/nexmark/nexmark_launcher.py +++ b/sdks/python/apache_beam/testing/benchmarks/nexmark/nexmark_launcher.py @@ -420,7 +420,8 @@ def publish_performance_influxdb(self, query_num, perf): auth = HTTPBasicAuth(user, password) try: - response = requests.post(url, params=query_str, data=payload, auth=auth) + response = requests.post( + url, params=query_str, data=payload, auth=auth, timeout=60) except requests.exceptions.RequestException as e: logging.warning('Failed to publish metrics to InfluxDB: ' + str(e)) else: diff --git a/sdks/python/apache_beam/testing/load_tests/build.gradle b/sdks/python/apache_beam/testing/load_tests/build.gradle index 144f7d12ba6c3..538d4a01bfee5 100644 --- a/sdks/python/apache_beam/testing/load_tests/build.gradle +++ b/sdks/python/apache_beam/testing/load_tests/build.gradle @@ -59,7 +59,7 @@ task run(type: Exec, dependsOn: installGcpTest) { ignoreExitValue true doLast { - if (execResult.exitValue != 0) { + if (executionResult.get().exitValue != 0) { throw new GradleException('error occurred') } } diff --git a/sdks/python/apache_beam/testing/load_tests/load_test_metrics_utils.py b/sdks/python/apache_beam/testing/load_tests/load_test_metrics_utils.py index 92a5f68351fe0..1ff46a3f7d19b 100644 --- a/sdks/python/apache_beam/testing/load_tests/load_test_metrics_utils.py +++ b/sdks/python/apache_beam/testing/load_tests/load_test_metrics_utils.py @@ -38,7 +38,6 @@ from typing import Optional from typing import Union -import pandas as pd import requests from requests.auth import HTTPBasicAuth @@ -536,7 +535,8 @@ def publish(self, results): self.options.http_auth_enabled() else None try: - response = requests.post(url, params=query_str, data=payload, auth=auth) + response = requests.post( + url, params=query_str, data=payload, 
auth=auth, timeout=60) except requests.exceptions.RequestException as e: _LOGGER.warning('Failed to publish metrics to InfluxDB: ' + str(e)) else: @@ -650,13 +650,3 @@ def __init__(self): def process(self, element): yield self.timestamp_val_fn( element, self.timestamp_fn(micros=int(self.time_fn() * 1000000))) - - -class BigQueryMetricsFetcher: - def __init__(self): - self.client = bigquery.Client() - - def fetch(self, query) -> pd.DataFrame: - query_job = self.client.query(query=query) - result = query_job.result() - return result.to_dataframe() diff --git a/sdks/python/apache_beam/transforms/core.py b/sdks/python/apache_beam/transforms/core.py index e980dccea7444..a6fd3b184d4c2 100644 --- a/sdks/python/apache_beam/transforms/core.py +++ b/sdks/python/apache_beam/transforms/core.py @@ -2258,6 +2258,10 @@ def __init__(self, pcoll, exception_handling_args, upstream_errors=()): self._exception_handling_args = exception_handling_args self._upstream_errors = upstream_errors + @property + def pipeline(self): + return self._pcoll.pipeline + @property def element_type(self): return self._pcoll.element_type @@ -2324,6 +2328,10 @@ def __init__(self, pvalue, exception_handling_args=None): else: self._pvalue = _PValueWithErrors(pvalue, exception_handling_args) + @property + def pipeline(self): + return self._pvalue.pipeline + @property def element_type(self): return self._pvalue.element_type @@ -3364,9 +3372,16 @@ def expand(self, pcoll): for name, expr in self._fields}))).as_result() def infer_output_type(self, input_type): + def extract_return_type(expr): + expr_hints = get_type_hints(expr) + if (expr_hints and expr_hints.has_simple_output_type() and + expr_hints.simple_output_type(None) != typehints.Any): + return expr_hints.simple_output_type(None) + else: + return trivial_inference.infer_return_type(expr, [input_type]) + return row_type.RowTypeConstraint.from_fields([ - (name, trivial_inference.infer_return_type(expr, [input_type])) - for (name, expr) in self._fields + (name, extract_return_type(expr)) for (name, expr) in self._fields ]) diff --git a/sdks/python/apache_beam/transforms/display.py b/sdks/python/apache_beam/transforms/display.py index b52a8fd5b6dd9..0d1dd552413e2 100644 --- a/sdks/python/apache_beam/transforms/display.py +++ b/sdks/python/apache_beam/transforms/display.py @@ -45,6 +45,7 @@ from datetime import timedelta from typing import TYPE_CHECKING from typing import List +from typing import Union from apache_beam.portability import common_urns from apache_beam.portability.api import beam_runner_api_pb2 @@ -101,7 +102,8 @@ def __init__( ): # type: (...) 
-> None self.namespace = namespace - self.items = [] # type: List[DisplayDataItem] + self.items = [ + ] # type: List[Union[DisplayDataItem, beam_runner_api_pb2.DisplayData]] self._populate_items(display_data_dict) def _populate_items(self, display_data_dict): @@ -112,26 +114,31 @@ def _populate_items(self, display_data_dict): subcomponent_display_data = DisplayData( element._get_display_data_namespace(), element.display_data()) self.items += subcomponent_display_data.items - continue - if isinstance(element, DisplayDataItem): + elif isinstance(element, DisplayDataItem): if element.should_drop(): continue element.key = key element.namespace = self.namespace self.items.append(element) - continue - # If it's not a HasDisplayData element, - # nor a dictionary, then it's a simple value - self.items.append( - DisplayDataItem(element, namespace=self.namespace, key=key)) + elif isinstance(element, beam_runner_api_pb2.DisplayData): + self.items.append(element) + + else: + # If it's not a HasDisplayData element, + # nor a dictionary, then it's a simple value + self.items.append( + DisplayDataItem(element, namespace=self.namespace, key=key)) def to_proto(self): # type: (...) -> List[beam_runner_api_pb2.DisplayData] """Returns a List of Beam proto representation of Display data.""" def create_payload(dd): + if isinstance(dd, beam_runner_api_pb2.DisplayData): + return dd + display_data_dict = None try: display_data_dict = dd.get_dict() diff --git a/sdks/python/apache_beam/transforms/external.py b/sdks/python/apache_beam/transforms/external.py index 44bf2398a6dd4..0d0b6f1e7be25 100644 --- a/sdks/python/apache_beam/transforms/external.py +++ b/sdks/python/apache_beam/transforms/external.py @@ -663,9 +663,11 @@ def get_local_namespace(cls): @contextlib.contextmanager def outer_namespace(cls, namespace): prev = cls.get_local_namespace() - cls._external_namespace.value = namespace - yield - cls._external_namespace.value = prev + try: + cls._external_namespace.value = namespace + yield + finally: + cls._external_namespace.value = prev @classmethod def _fresh_namespace(cls): diff --git a/sdks/python/apache_beam/transforms/fully_qualified_named_transform.py b/sdks/python/apache_beam/transforms/fully_qualified_named_transform.py index f9b1c12d51338..ab2cadd166a91 100644 --- a/sdks/python/apache_beam/transforms/fully_qualified_named_transform.py +++ b/sdks/python/apache_beam/transforms/fully_qualified_named_transform.py @@ -43,8 +43,10 @@ class FullyQualifiedNamedTransform(ptransform.PTransform): @contextlib.contextmanager def with_filter(cls, filter): old_filter, cls._FILTER_GLOB = cls._FILTER_GLOB, filter - yield - cls._FILTER_GLOB = old_filter + try: + yield + finally: + cls._FILTER_GLOB = old_filter def __init__(self, constructor, args, kwargs): self._constructor = constructor diff --git a/sdks/python/apache_beam/transforms/ptransform.py b/sdks/python/apache_beam/transforms/ptransform.py index 28614c6561c7f..fcff86d4c50c1 100644 --- a/sdks/python/apache_beam/transforms/ptransform.py +++ b/sdks/python/apache_beam/transforms/ptransform.py @@ -1101,6 +1101,22 @@ def __ror__(self, pvalueish, _unused=None): def expand(self, pvalue): raise RuntimeError("Should never be expanded directly.") + def __getattr__(self, attr): + transform_attr = getattr(self.transform, attr) + if callable(transform_attr): + + @wraps(transform_attr) + def wrapper(*args, **kwargs): + result = transform_attr(*args, **kwargs) + if isinstance(result, PTransform): + return _NamedPTransform(result, self.label) + else: + return result + + return 
wrapper + else: + return transform_attr + # Defined here to avoid circular import issues for Beam library transforms. def annotate_yaml(constructor): diff --git a/sdks/python/apache_beam/transforms/resources.py b/sdks/python/apache_beam/transforms/resources.py index 7bb202ab5660a..7c4160df8eddd 100644 --- a/sdks/python/apache_beam/transforms/resources.py +++ b/sdks/python/apache_beam/transforms/resources.py @@ -42,6 +42,7 @@ 'ResourceHint', 'AcceleratorHint', 'MinRamHint', + 'CpuCountHint', 'merge_resource_hints', 'parse_resource_hints', 'resource_hints_from_options', @@ -177,6 +178,21 @@ def get_merged_value( ResourceHint.register_resource_hint('minRam', MinRamHint) +class CpuCountHint(ResourceHint): + """Describes number of CPUs available in transform's execution environment.""" + urn = resource_hints.CPU_COUNT.urn + + @classmethod + def get_merged_value( + cls, outer_value, inner_value): # type: (bytes, bytes) -> bytes + return ResourceHint._use_max(outer_value, inner_value) + + +ResourceHint.register_resource_hint('cpu_count', CpuCountHint) +# Alias for interoperability with SDKs preferring camelCase. +ResourceHint.register_resource_hint('cpuCount', CpuCountHint) + + def parse_resource_hints(hints): # type: (Dict[Any, Any]) -> Dict[str, bytes] parsed_hints = {} for hint, value in hints.items(): diff --git a/sdks/python/apache_beam/transforms/resources_test.py b/sdks/python/apache_beam/transforms/resources_test.py index 939391b7adcb8..939bdcd626514 100644 --- a/sdks/python/apache_beam/transforms/resources_test.py +++ b/sdks/python/apache_beam/transforms/resources_test.py @@ -46,6 +46,11 @@ class ResourcesTest(unittest.TestCase): val='gpu', urn='beam:resources:accelerator:v1', bytestr=b'gpu'), + param( + name='cpu_count', + val='4', + urn='beam:resources:cpu_count:v1', + bytestr=b'4'), ]) def test_known_resource_hints(self, name, val, urn, bytestr): t = PTransform() @@ -56,6 +61,7 @@ def test_known_resource_hints(self, name, val, urn, bytestr): @parameterized.expand([ param(name='min_ram', val='3,500G'), param(name='accelerator', val=1), + param(name='cpu_count', val=1), param(name='unknown_hint', val=1) ]) def test_resource_hint_parsing_fails_early(self, name, val): diff --git a/sdks/python/apache_beam/transforms/util.py b/sdks/python/apache_beam/transforms/util.py index fb0e8e9789d8b..cacfdb37d7b41 100644 --- a/sdks/python/apache_beam/transforms/util.py +++ b/sdks/python/apache_beam/transforms/util.py @@ -61,6 +61,7 @@ from apache_beam.transforms.trigger import Always from apache_beam.transforms.userstate import BagStateSpec from apache_beam.transforms.userstate import CombiningValueStateSpec +from apache_beam.transforms.userstate import ReadModifyWriteStateSpec from apache_beam.transforms.userstate import TimerSpec from apache_beam.transforms.userstate import on_timer from apache_beam.transforms.window import NonMergingWindowFn @@ -392,7 +393,7 @@ def ignore_next_timing(self): def record_time(self, batch_size): start = self._clock() yield - elapsed = self._clock() - start + elapsed = float(self._clock() - start) elapsed_msec = 1e3 * elapsed + self._remainder_msecs if self._record_metrics: self._size_distribution.update(batch_size) @@ -646,6 +647,107 @@ def finish_bundle(self): self._target_batch_size = self._batch_size_estimator.next_batch_size() +def _pardo_stateful_batch_elements( + input_coder: coders.Coder, + batch_size_estimator: _BatchSizeEstimator, + max_buffering_duration_secs: int, + clock=time.time): + ELEMENT_STATE = BagStateSpec('values', input_coder) + COUNT_STATE = 
CombiningValueStateSpec('count', input_coder, CountCombineFn()) + BATCH_SIZE_STATE = ReadModifyWriteStateSpec('batch_size', input_coder) + WINDOW_TIMER = TimerSpec('window_end', TimeDomain.WATERMARK) + BUFFERING_TIMER = TimerSpec('buffering_end', TimeDomain.REAL_TIME) + BATCH_ESTIMATOR_STATE = ReadModifyWriteStateSpec( + 'batch_estimator', coders.PickleCoder()) + + class _StatefulBatchElementsDoFn(DoFn): + def process( + self, + element, + window=DoFn.WindowParam, + element_state=DoFn.StateParam(ELEMENT_STATE), + count_state=DoFn.StateParam(COUNT_STATE), + batch_size_state=DoFn.StateParam(BATCH_SIZE_STATE), + batch_estimator_state=DoFn.StateParam(BATCH_ESTIMATOR_STATE), + window_timer=DoFn.TimerParam(WINDOW_TIMER), + buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)): + window_timer.set(window.end) + # Drop the fixed key since we don't care about it + element_state.add(element[1]) + count_state.add(1) + count = count_state.read() + target_size = batch_size_state.read() + # Should only happen on the first element + if target_size is None: + batch_estimator = batch_size_estimator + target_size = batch_estimator.next_batch_size() + batch_size_state.write(target_size) + batch_estimator_state.write(batch_estimator) + + if count == 1 and max_buffering_duration_secs > 0: + # First element in batch, start buffering timer + buffering_timer.set(clock() + max_buffering_duration_secs) + + if count >= target_size: + return self.flush_batch( + element_state, + count_state, + batch_size_state, + batch_estimator_state, + buffering_timer) + + @on_timer(WINDOW_TIMER) + def on_window_timer( + self, + element_state=DoFn.StateParam(ELEMENT_STATE), + count_state=DoFn.StateParam(COUNT_STATE), + batch_size_state=DoFn.StateParam(BATCH_SIZE_STATE), + batch_estimator_state=DoFn.StateParam(BATCH_ESTIMATOR_STATE), + buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)): + return self.flush_batch( + element_state, + count_state, + batch_size_state, + batch_estimator_state, + buffering_timer) + + @on_timer(BUFFERING_TIMER) + def on_buffering_timer( + self, + element_state=DoFn.StateParam(ELEMENT_STATE), + count_state=DoFn.StateParam(COUNT_STATE), + batch_size_state=DoFn.StateParam(BATCH_SIZE_STATE), + batch_estimator_state=DoFn.StateParam(BATCH_ESTIMATOR_STATE), + buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)): + return self.flush_batch( + element_state, + count_state, + batch_size_state, + batch_estimator_state, + buffering_timer) + + def flush_batch( + self, + element_state, + count_state, + batch_size_state, + batch_estimator_state, + buffering_timer): + batch = [element for element in element_state.read()] + if not batch: + return + element_state.clear() + count_state.clear() + batch_estimator = batch_estimator_state.read() + with batch_estimator.record_time(len(batch)): + yield batch + batch_size_state.write(batch_estimator.next_batch_size()) + batch_estimator_state.write(batch_estimator) + buffering_timer.clear() + + return _StatefulBatchElementsDoFn() + + @typehints.with_input_types(T) @typehints.with_output_types(List[T]) class BatchElements(PTransform): @@ -677,6 +779,9 @@ class BatchElements(PTransform): in seconds, excluding fixed cost target_batch_duration_secs_including_fixed_cost: (optional) a target for total time per bundle, in seconds, including fixed cost + max_batch_duration_secs: (optional) the maximum amount of time to buffer + a batch before emitting. Setting this argument to be non-none uses the + stateful implementation of BatchElements. 
element_size_fn: (optional) A mapping of an element to its contribution to batch size, defaulting to every element having size 1. When provided, attempts to provide batches of optimal total size which may consist of @@ -696,6 +801,7 @@ def __init__( target_batch_overhead=.05, target_batch_duration_secs=10, target_batch_duration_secs_including_fixed_cost=None, + max_batch_duration_secs=None, *, element_size_fn=lambda x: 1, variance=0.25, @@ -712,10 +818,20 @@ def __init__( clock=clock, record_metrics=record_metrics) self._element_size_fn = element_size_fn + self._max_batch_dur = max_batch_duration_secs + self._clock = clock def expand(self, pcoll): if getattr(pcoll.pipeline.runner, 'is_streaming', False): raise NotImplementedError("Requires stateful processing (BEAM-2687)") + elif self._max_batch_dur is not None: + coder = coders.registry.get_coder(pcoll) + return pcoll | WithKeys(0) | ParDo( + _pardo_stateful_batch_elements( + coder, + self._batch_size_estimator, + self._max_batch_dur, + self._clock)) elif pcoll.windowing.is_default(): # This is the same logic as _GlobalWindowsBatchingDoFn, but optimized # for that simpler case. diff --git a/sdks/python/apache_beam/transforms/util_test.py b/sdks/python/apache_beam/transforms/util_test.py index d8a8bacb96cdf..5dfe166d3c31b 100644 --- a/sdks/python/apache_beam/transforms/util_test.py +++ b/sdks/python/apache_beam/transforms/util_test.py @@ -190,6 +190,30 @@ def sleep(self, duration): class BatchElementsTest(unittest.TestCase): + NUM_ELEMENTS = 10 + BATCH_SIZE = 5 + + @staticmethod + def _create_test_data(): + scientists = [ + "Einstein", + "Darwin", + "Copernicus", + "Pasteur", + "Curie", + "Faraday", + "Newton", + "Bohr", + "Galilei", + "Maxwell" + ] + + data = [] + for i in range(BatchElementsTest.NUM_ELEMENTS): + index = i % len(scientists) + data.append(scientists[index]) + return data + def test_constant_batch(self): # Assumes a single bundle... p = TestPipeline() @@ -461,6 +485,142 @@ def test_numpy_regression(self): self._run_regression_test( util._BatchSizeEstimator.linear_regression_numpy, True) + def test_stateful_constant_batch(self): + # Assumes a single bundle... 
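+ # With 35 input elements and min=max batch size of 10, we expect three + # full batches of 10 plus a final partial batch of 5, i.e. the mapped + # lengths [10, 10, 10, 5] asserted below.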
+ p = TestPipeline() + output = ( + p + | beam.Create(range(35)) + | util.BatchElements( + min_batch_size=10, max_batch_size=10, max_batch_duration_secs=100) + | beam.Map(len)) + assert_that(output, equal_to([10, 10, 10, 5])) + res = p.run() + res.wait_until_finish() + + def test_stateful_in_global_window(self): + with TestPipeline() as pipeline: + collection = pipeline \ + | beam.Create( + BatchElementsTest._create_test_data()) \ + | util.BatchElements( + min_batch_size=BatchElementsTest.BATCH_SIZE, + max_batch_size=BatchElementsTest.BATCH_SIZE, + max_batch_duration_secs=100) + num_batches = collection | beam.combiners.Count.Globally() + assert_that( + num_batches, + equal_to([ + int( + math.ceil( + BatchElementsTest.NUM_ELEMENTS / + BatchElementsTest.BATCH_SIZE)) + ])) + + def test_stateful_buffering_timer_in_fixed_window_streaming(self): + window_duration = 6 + max_buffering_duration_secs = 100 + + start_time = timestamp.Timestamp(0) + test_stream = ( + TestStream().add_elements([ + TimestampedValue(value, start_time + i) for i, + value in enumerate(BatchElementsTest._create_test_data()) + ]).advance_processing_time(150).advance_watermark_to( + start_time + window_duration).advance_watermark_to( + start_time + window_duration + + 1).advance_watermark_to_infinity()) + + with TestPipeline(options=StandardOptions(streaming=True)) as pipeline: + # To trigger the processing time timer, use a fake clock with start time + # being Timestamp(0). + fake_clock = FakeClock(now=start_time) + + num_elements_per_batch = ( + pipeline | test_stream + | "fixed window" >> WindowInto(FixedWindows(window_duration)) + | util.BatchElements( + min_batch_size=BatchElementsTest.BATCH_SIZE, + max_batch_size=BatchElementsTest.BATCH_SIZE, + max_batch_duration_secs=max_buffering_duration_secs, + clock=fake_clock) + | "count elements in batch" >> Map(lambda x: (None, len(x))) + | GroupByKey() + | "global window" >> WindowInto(GlobalWindows()) + | FlatMapTuple(lambda k, vs: vs)) + + # Window duration is 6 and batch size is 5, so output batch size + # should be 5 (flush because of batch size reached). + expected_0 = 5 + # There is only one element left in the window so batch size + # should be 1 (flush because of max buffering duration reached). + expected_1 = 1 + # Collection has 10 elements, there are only 4 left, so batch size should + # be 4 (flush because of end of window reached). + expected_2 = 4 + assert_that( + num_elements_per_batch, + equal_to([expected_0, expected_1, expected_2]), + "assert2") + + def test_stateful_buffering_timer_in_global_window_streaming(self): + max_buffering_duration_secs = 42 + + start_time = timestamp.Timestamp(0) + test_stream = TestStream().advance_watermark_to(start_time) + for i, value in enumerate(BatchElementsTest._create_test_data()): + test_stream.add_elements( + [TimestampedValue(value, start_time + i)]) \ + .advance_processing_time(5) + test_stream.advance_watermark_to( + start_time + BatchElementsTest.NUM_ELEMENTS + 1) \ + .advance_watermark_to_infinity() + + with TestPipeline(options=StandardOptions(streaming=True)) as pipeline: + # Set a batch size larger than the total number of elements. + # Since we're in a global window, we would have been waiting + # for all the elements to arrive without the buffering time limit. + batch_size = BatchElementsTest.NUM_ELEMENTS * 2 + + # To trigger the processing time timer, use a fake clock with start time + # being Timestamp(0). 
Since the fake clock never really advances during + # the pipeline execution, the timer is always set to the same + # value, so it fires on every element after the first firing. + fake_clock = FakeClock(now=start_time) + + num_elements_per_batch = ( + pipeline | test_stream + | WindowInto( + GlobalWindows(), + trigger=Repeatedly(AfterCount(1)), + accumulation_mode=trigger.AccumulationMode.DISCARDING) + | util.BatchElements( + min_batch_size=batch_size, + max_batch_size=batch_size, + max_batch_duration_secs=max_buffering_duration_secs, + clock=fake_clock) + | 'count elements in batch' >> Map(lambda x: (None, len(x))) + | GroupByKey() + | FlatMapTuple(lambda k, vs: vs)) + + # We will flush twice: when the max buffering duration is reached and when + # the global window ends. + assert_that(num_elements_per_batch, equal_to([9, 1])) + + def test_stateful_grows_to_max_batch(self): + # Assumes a single bundle... + with TestPipeline() as p: + res = ( + p + | beam.Create(range(164)) + | util.BatchElements( + min_batch_size=1, + max_batch_size=50, + max_batch_duration_secs=100, + clock=FakeClock()) + | beam.Map(len)) + assert_that(res, equal_to([1, 1, 2, 4, 8, 16, 32, 50, 50])) + class IdentityWindowTest(unittest.TestCase): def test_window_preserved(self): diff --git a/sdks/python/apache_beam/typehints/arrow_type_compatibility.py b/sdks/python/apache_beam/typehints/arrow_type_compatibility.py index c8e425f0e96a7..34a37a886bab5 100644 --- a/sdks/python/apache_beam/typehints/arrow_type_compatibility.py +++ b/sdks/python/apache_beam/typehints/arrow_type_compatibility.py @@ -311,9 +311,9 @@ def from_typehints(element_type, element_type = RowTypeConstraint.from_user_type(element_type) if element_type is None: raise TypeError( - "Element type must be compatible with Beam Schemas (" - "https://beam.apache.org/documentation/programming-guide/#schemas) " - "for batch type pa.Table.") + f"Element type {element_type} must be compatible with Beam Schemas " + "(https://beam.apache.org/documentation/programming-guide/#schemas)" + " for batch type pa.Table.") return PyarrowBatchConverter(element_type) diff --git a/sdks/python/apache_beam/typehints/arrow_type_compatibility_test.py b/sdks/python/apache_beam/typehints/arrow_type_compatibility_test.py index e708b151d9056..1e9ab3f27bd9c 100644 --- a/sdks/python/apache_beam/typehints/arrow_type_compatibility_test.py +++ b/sdks/python/apache_beam/typehints/arrow_type_compatibility_test.py @@ -206,7 +206,7 @@ class ArrowBatchConverterErrorsTest(unittest.TestCase): ( pa.Table, Any, - r'Element type must be compatible with Beam Schemas', + r'Element type .* must be compatible with Beam Schemas', ), ]) def test_construction_errors( diff --git a/sdks/python/apache_beam/typehints/decorators_test.py b/sdks/python/apache_beam/typehints/decorators_test.py index 239c9bd570789..3baf9fa8322fc 100644 --- a/sdks/python/apache_beam/typehints/decorators_test.py +++ b/sdks/python/apache_beam/typehints/decorators_test.py @@ -38,6 +38,7 @@ T = TypeVariable('T') # Name is 'T' so it converts to a beam type with the same name. # mypy requires that the name of the variable match, so we must ignore this.
+# pylint: disable=typevar-name-mismatch T_typing = typing.TypeVar('T') # type: ignore diff --git a/sdks/python/apache_beam/typehints/native_type_compatibility.py b/sdks/python/apache_beam/typehints/native_type_compatibility.py index e916f34146f17..b2960ba0c7b77 100644 --- a/sdks/python/apache_beam/typehints/native_type_compatibility.py +++ b/sdks/python/apache_beam/typehints/native_type_compatibility.py @@ -45,6 +45,12 @@ frozenset: typing.FrozenSet, } +_CONVERTED_COLLECTIONS = [ + collections.abc.Set, + collections.abc.MutableSet, + collections.abc.Collection, +] + def _get_args(typ): """Returns a list of arguments to the given type. @@ -113,6 +119,10 @@ def _match_is_exactly_iterable(user_type): return getattr(user_type, '__origin__', None) is expected_origin +def _match_is_exactly_collection(user_type): + return getattr(user_type, '__origin__', None) is collections.abc.Collection + + def match_is_named_tuple(user_type): return ( _safe_issubclass(user_type, typing.Tuple) and @@ -148,6 +158,15 @@ def _match_is_union(user_type): return False +def match_is_set(user_type): + if _safe_issubclass(user_type, typing.Set): + return True + elif getattr(user_type, '__origin__', None) is not None: + return _safe_issubclass(user_type.__origin__, collections.abc.Set) + else: + return False + + def is_any(typ): return typ is typing.Any @@ -232,6 +251,7 @@ def convert_to_beam_type(typ): None) == 'collections.abc': typ = convert_collections_to_typing(typ) + typ_module = getattr(typ, '__module__', None) if isinstance(typ, typing.TypeVar): # This is a special case, as it's not parameterized by types. # Also, identity must be preserved through conversion (i.e. the same @@ -254,8 +274,13 @@ def convert_to_beam_type(typ): # TODO(https://github.com/apache/beam/issues/20076): Currently unhandled. _LOGGER.info('Converting NewType type hint to Any: "%s"', typ) return typehints.Any - elif getattr(typ, '__module__', None) != 'typing': - # Only translate types from the typing module. + elif (typ_module != 'typing') and (typ_module != 'collections.abc'): + # Only translate types from the typing and collections.abc modules. + return typ + if (typ_module == 'collections.abc' and + typ.__origin__ not in _CONVERTED_COLLECTIONS): + # TODO(https://github.com/apache/beam/issues/29135): + # Support more collections types return typ type_map = [ @@ -278,13 +303,12 @@ def convert_to_beam_type(typ): match=_match_issubclass(typing.List), arity=1, beam_type=typehints.List), - _TypeMapEntry( - match=_match_issubclass(typing.Set), arity=1, - beam_type=typehints.Set), + # FrozenSets are a specific instance of a set, so we check this first. _TypeMapEntry( match=_match_issubclass(typing.FrozenSet), arity=1, beam_type=typehints.FrozenSet), + _TypeMapEntry(match=match_is_set, arity=1, beam_type=typehints.Set), # NamedTuple is a subclass of Tuple, but it needs special handling. # We just convert it to Any for now. # This MUST appear before the entry for the normal Tuple. @@ -303,6 +327,10 @@ def convert_to_beam_type(typ): match=_match_issubclass(typing.Iterator), arity=1, beam_type=typehints.Iterator), + _TypeMapEntry( + match=_match_is_exactly_collection, + arity=1, + beam_type=typehints.Collection), ] # Find the first matching entry. 
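An illustrative sketch (not part of the patch) of how the `collections.abc` conversions added above are expected to behave, assuming a Python 3.9+ runtime so that built-in and `collections.abc` generics are subscriptable; this mirrors the new test cases in the next file:

```python
import collections.abc

from apache_beam.typehints import typehints
from apache_beam.typehints.native_type_compatibility import convert_to_beam_type

# Set and MutableSet both collapse to typehints.Set; frozenset keeps its more
# specific hint because the FrozenSet entry is matched before match_is_set.
assert convert_to_beam_type(collections.abc.Set[str]) == typehints.Set[str]
assert convert_to_beam_type(
    collections.abc.MutableSet[int]) == typehints.Set[int]
assert convert_to_beam_type(frozenset[int]) == typehints.FrozenSet[int]

# Collection maps to the new typehints.Collection hint, while collections.abc
# types outside _CONVERTED_COLLECTIONS (e.g. Mapping) pass through unchanged.
assert convert_to_beam_type(
    collections.abc.Collection[int]) == typehints.Collection[int]
assert convert_to_beam_type(
    collections.abc.Mapping[str, int]) == collections.abc.Mapping[str, int]
```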
diff --git a/sdks/python/apache_beam/typehints/native_type_compatibility_test.py b/sdks/python/apache_beam/typehints/native_type_compatibility_test.py index b9280c57a393a..2e6db6a7733c6 100644 --- a/sdks/python/apache_beam/typehints/native_type_compatibility_test.py +++ b/sdks/python/apache_beam/typehints/native_type_compatibility_test.py @@ -20,6 +20,7 @@ # pytype: skip-file import collections.abc +import enum import sys import typing import unittest @@ -54,6 +55,11 @@ class _TestPair(typing.NamedTuple('TestTuple', [('first', T), ('second', T)]), pass +class _TestEnum(enum.Enum): + FOO = enum.auto() + BAR = enum.auto() + + class NativeTypeCompatibilityTest(unittest.TestCase): def test_convert_to_beam_type(self): test_cases = [ @@ -106,6 +112,7 @@ def test_convert_to_beam_type(self): typehints.List[_TestGeneric[int]]), ('nested generic with any', typing.List[_TestPair[typing.Any]], typehints.List[_TestPair[typing.Any]]), + ('raw enum', _TestEnum, _TestEnum), ] for test_case in test_cases: @@ -122,20 +129,22 @@ def test_convert_to_beam_type(self): def test_convert_to_beam_type_with_builtin_types(self): if sys.version_info >= (3, 9): - test_cases = [('builtin dict', dict[str, int], typehints.Dict[str, int]), - ('builtin list', list[str], typehints.List[str]), - ('builtin tuple', tuple[str], typehints.Tuple[str]), - ('builtin set', set[str], typehints.Set[str]), - ( - 'nested builtin', - dict[str, list[tuple[float]]], - typehints.Dict[str, - typehints.List[typehints.Tuple[float]]]), - ( - 'builtin nested tuple', - tuple[str, list], - typehints.Tuple[str, typehints.List[typehints.Any]], - )] + test_cases = [ + ('builtin dict', dict[str, int], typehints.Dict[str, int]), + ('builtin list', list[str], typehints.List[str]), + ('builtin tuple', tuple[str], + typehints.Tuple[str]), ('builtin set', set[str], typehints.Set[str]), + ('builtin frozenset', frozenset[int], typehints.FrozenSet[int]), + ( + 'nested builtin', + dict[str, list[tuple[float]]], + typehints.Dict[str, typehints.List[typehints.Tuple[float]]]), + ( + 'builtin nested tuple', + tuple[str, list], + typehints.Tuple[str, typehints.List[typehints.Any]], + ) + ] for test_case in test_cases: description = test_case[0] @@ -171,6 +180,24 @@ def test_convert_to_beam_type_with_collections_types(self): 'mapping not caught', collections.abc.Mapping[str, int], collections.abc.Mapping[str, int]), + ('set', collections.abc.Set[str], typehints.Set[str]), + ('mutable set', collections.abc.MutableSet[int], typehints.Set[int]), + ( + 'enum set', + collections.abc.Set[_TestEnum], + typehints.Set[_TestEnum]), + ( + 'enum mutable set', + collections.abc.MutableSet[_TestEnum], + typehints.Set[_TestEnum]), + ( + 'collection enum', + collections.abc.Collection[_TestEnum], + typehints.Collection[_TestEnum]), + ( + 'collection of tuples', + collections.abc.Collection[tuple[str, int]], + typehints.Collection[typehints.Tuple[str, int]]), ] for test_case in test_cases: diff --git a/sdks/python/apache_beam/typehints/schemas.py b/sdks/python/apache_beam/typehints/schemas.py index 229a8af20bb6e..ea836430e8e2e 100644 --- a/sdks/python/apache_beam/typehints/schemas.py +++ b/sdks/python/apache_beam/typehints/schemas.py @@ -93,6 +93,7 @@ from apache_beam.typehints.native_type_compatibility import _match_is_exactly_mapping from apache_beam.typehints.native_type_compatibility import _match_is_optional from apache_beam.typehints.native_type_compatibility import _safe_issubclass +from apache_beam.typehints.native_type_compatibility import convert_to_typing_type from 
apache_beam.typehints.native_type_compatibility import extract_optional_type from apache_beam.typehints.native_type_compatibility import match_is_named_tuple from apache_beam.typehints.schema_registry import SCHEMA_REGISTRY @@ -284,6 +285,9 @@ def typing_to_runner_api(self, type_: type) -> schema_pb2.FieldType: if row_type_constraint is not None: return self.typing_to_runner_api(row_type_constraint) + if isinstance(type_, typehints.TypeConstraint): + type_ = convert_to_typing_type(type_) + # All concrete types (other than NamedTuple sub-classes) should map to # a supported primitive type. if type_ in PRIMITIVE_TO_ATOMIC_TYPE: diff --git a/sdks/python/apache_beam/typehints/typehints.py b/sdks/python/apache_beam/typehints/typehints.py index 238bf8c321d63..4fd4b97e82cde 100644 --- a/sdks/python/apache_beam/typehints/typehints.py +++ b/sdks/python/apache_beam/typehints/typehints.py @@ -82,6 +82,7 @@ 'Dict', 'Set', 'FrozenSet', + 'Collection', 'Iterable', 'Iterator', 'Generator', @@ -1017,6 +1018,66 @@ def __getitem__(self, type_param): FrozenSetTypeConstraint = FrozenSetHint.FrozenSetTypeConstraint + +class CollectionHint(CompositeTypeHint): + """A Collection type-hint. + + Collection[X] defines a type-hint for a collection of homogeneous types. 'X' + may be either a built-in Python type or another nested TypeConstraint. + + This represents a collections.abc.Collection type, which implements + __contains__, __iter__, and __len__. This acts as a parent type for + sets but has fewer guarantees for mixins. + """ + class CollectionTypeConstraint(SequenceTypeConstraint): + def __init__(self, type_param): + super().__init__(type_param, abc.Collection) + + def __repr__(self): + return 'Collection[%s]' % repr(self.inner_type) + + @staticmethod + def _is_subclass_constraint(sub): + return isinstance( + sub, + ( + CollectionTypeConstraint, + FrozenSetTypeConstraint, + SetTypeConstraint, + ListConstraint)) + + def _consistent_with_check_(self, sub): + if self._is_subclass_constraint(sub): + return is_consistent_with(sub.inner_type, self.inner_type) + elif isinstance(sub, TupleConstraint): + if not sub.tuple_types: + # The empty tuple is consistent with Collection[T] for any T. + return True + # Each element in the heterogeneous tuple must be consistent with + # the collection type. + # E.g. Tuple[A, B] < Collection[C] if A < C and B < C. + return all( + is_consistent_with(elem, self.inner_type) + for elem in sub.tuple_types) + # TODO(https://github.com/apache/beam/issues/29135): allow for + # consistency checks with Mapping types + elif isinstance(sub, DictConstraint): + return True + elif not isinstance(sub, TypeConstraint): + if getattr(sub, '__origin__', None) is not None and getattr( + sub, '__args__', None) is not None: + return issubclass(sub, abc.Collection) and is_consistent_with( + sub.__args__, self.inner_type) + return False + + def __getitem__(self, type_param): + validate_composite_type_param( + type_param, error_msg_prefix='Parameter to a Collection hint') + return self.CollectionTypeConstraint(type_param) + + +CollectionTypeConstraint = CollectionHint.CollectionTypeConstraint + + class IterableHint(CompositeTypeHint): """An Iterable type-hint.
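A small sketch (not part of the patch) of the one-way consistency rules that `CollectionTypeConstraint._consistent_with_check_` above implements, mirroring the typehints_test cases added below:

```python
from apache_beam.typehints import typehints
from apache_beam.typehints.typehints import is_consistent_with

# Sets, frozensets, lists and tuples may all be used where a Collection is
# expected.
assert is_consistent_with(typehints.Set[int], typehints.Collection[int])
assert is_consistent_with(typehints.Tuple[int, int], typehints.Collection[int])

# The reverse does not hold: a Collection only promises __contains__,
# __iter__ and __len__, so it cannot stand in for a Set.
assert not is_consistent_with(typehints.Collection[int], typehints.Set[int])

# Every Collection is an Iterable, but not every Iterable is a Collection.
assert is_consistent_with(typehints.Collection[int], typehints.Iterable[int])
assert not is_consistent_with(
    typehints.Iterable[int], typehints.Collection[int])
```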
@@ -1187,6 +1248,7 @@ def __getitem__(self, type_params): Dict = DictHint() Set = SetHint() FrozenSet = FrozenSetHint() +Collection = CollectionHint() Iterable = IterableHint() Iterator = IteratorHint() Generator = GeneratorHint() diff --git a/sdks/python/apache_beam/typehints/typehints_test.py b/sdks/python/apache_beam/typehints/typehints_test.py index a1a1913bded71..c395893a23ba3 100644 --- a/sdks/python/apache_beam/typehints/typehints_test.py +++ b/sdks/python/apache_beam/typehints/typehints_test.py @@ -19,6 +19,7 @@ # pytype: skip-file +import collections.abc import functools import sys import typing @@ -845,6 +846,18 @@ class SetHintTestCase(BaseSetHintTest.CommonTests): beam_type = typehints.Set string_type = 'Set' + def test_builtin_compatibility(self): + if sys.version_info >= (3, 9): + self.assertCompatible(set[int], collections.abc.Set[int]) + self.assertCompatible(set[int], collections.abc.MutableSet[int]) + + def test_collections_compatibility(self): + if sys.version_info >= (3, 9): + self.assertCompatible( + collections.abc.Set[int], collections.abc.MutableSet[int]) + self.assertCompatible( + collections.abc.MutableSet[int], collections.abc.Set[int]) + class FrozenSetHintTestCase(BaseSetHintTest.CommonTests): py_type = frozenset @@ -852,6 +865,35 @@ class FrozenSetHintTestCase(BaseSetHintTest.CommonTests): string_type = 'FrozenSet' +class CollectionHintTestCase(TypeHintTestCase): + def test_type_constraint_compatibility(self): + self.assertCompatible(typehints.Collection[int], typehints.Set[int]) + self.assertCompatible(typehints.Iterable[int], typehints.Collection[int]) + self.assertCompatible(typehints.Collection[int], typehints.FrozenSet[int]) + self.assertCompatible( + typehints.Collection[typehints.Any], typehints.Collection[int]) + self.assertCompatible(typehints.Collection[int], typehints.Tuple[int]) + self.assertCompatible(typehints.Any, typehints.Collection[str]) + self.assertCompatible(typehints.Collection[str], typehints.List[str]) + + def test_one_way_compatibility(self): + self.assertNotCompatible(typehints.Set[int], typehints.Collection[int]) + self.assertNotCompatible( + typehints.FrozenSet[int], typehints.Collection[int]) + self.assertNotCompatible(typehints.Tuple[int], typehints.Collection[int]) + self.assertNotCompatible(typehints.Collection[int], typehints.Iterable[int]) + self.assertNotCompatible(typehints.List[str], typehints.Collection[str]) + + def test_getitem_invalid_composite_type_param(self): + with self.assertRaises(TypeError) as e: + typehints.Collection[5] + self.assertEqual( + 'Parameter to a Collection hint must be a ' + 'non-sequence, a type, or a TypeConstraint. 
5 is ' + 'an instance of int.', + e.exception.args[0]) + + class IterableHintTestCase(TypeHintTestCase): def test_getitem_invalid_composite_type_param(self): with self.assertRaises(TypeError) as e: @@ -880,6 +922,7 @@ def test_compatibility(self): self.assertCompatible( typehints.Iterable[typehints.Any], typehints.List[typehints.Tuple[int, bool]]) + self.assertCompatible(typehints.Iterable[int], typehints.Collection[int]) def test_tuple_compatibility(self): self.assertCompatible(typehints.Iterable[int], typehints.Tuple[int, ...]) diff --git a/sdks/python/apache_beam/utils/subprocess_server.py b/sdks/python/apache_beam/utils/subprocess_server.py index f566c3ea29146..f6e214046f37d 100644 --- a/sdks/python/apache_beam/utils/subprocess_server.py +++ b/sdks/python/apache_beam/utils/subprocess_server.py @@ -36,6 +36,7 @@ import grpc +from apache_beam.io.filesystems import FileSystems from apache_beam.version import __version__ as beam_version _LOGGER = logging.getLogger(__name__) @@ -272,7 +273,10 @@ def local_jar(cls, url, cache_dir=None): os.makedirs(cache_dir) # TODO: Clean up this cache according to some policy. try: - url_read = urlopen(url) + try: + url_read = FileSystems.open(url) + except ValueError: + url_read = urlopen(url) with open(cached_jar + '.tmp', 'wb') as jar_write: shutil.copyfileobj(url_read, jar_write, length=1 << 20) os.rename(cached_jar + '.tmp', cached_jar) diff --git a/sdks/python/apache_beam/utils/transform_service_launcher.py b/sdks/python/apache_beam/utils/transform_service_launcher.py index 33feab9bf29c9..ac492513aba5c 100644 --- a/sdks/python/apache_beam/utils/transform_service_launcher.py +++ b/sdks/python/apache_beam/utils/transform_service_launcher.py @@ -86,6 +86,7 @@ def __init__(self, project_name, port, beam_version=None): compose_file = os.path.join(temp_dir, 'docker-compose.yml') + # Creating the credentials volume. credentials_dir = os.path.join(temp_dir, 'credentials_dir') if not os.path.exists(credentials_dir): os.mkdir(credentials_dir) @@ -111,11 +112,24 @@ def __init__(self, project_name, port, beam_version=None): 'credentials file at the expected location %s.' % application_default_path_file) + # Creating the dependencies volume. 
+ dependencies_dir = os.path.join(temp_dir, 'dependencies_dir') + if not os.path.exists(dependencies_dir): + os.mkdir(dependencies_dir) + self._environmental_variables = {} self._environmental_variables['CREDENTIALS_VOLUME'] = credentials_dir + self._environmental_variables['DEPENDENCIES_VOLUME'] = dependencies_dir self._environmental_variables['TRANSFORM_SERVICE_PORT'] = str(port) self._environmental_variables['BEAM_VERSION'] = beam_version + # Setting an empty requirements file + requirements_file_name = os.path.join(dependencies_dir, 'requirements.txt') + with open(requirements_file_name, 'w') as _: + pass + self._environmental_variables['PYTHON_REQUIREMENTS_FILE_NAME'] = ( + 'requirements.txt') + self._docker_compose_start_command_prefix = [] self._docker_compose_start_command_prefix.append('docker-compose') self._docker_compose_start_command_prefix.append('-p') diff --git a/sdks/python/apache_beam/version.py b/sdks/python/apache_beam/version.py index a69e3839fff3e..fa890eab50059 100644 --- a/sdks/python/apache_beam/version.py +++ b/sdks/python/apache_beam/version.py @@ -17,4 +17,4 @@ """Apache Beam SDK version information and utilities.""" -__version__ = '2.52.0.dev' +__version__ = '2.53.0.dev' diff --git a/sdks/python/apache_beam/yaml/README.md b/sdks/python/apache_beam/yaml/README.md index 3ba78784c997c..247b42b6839a2 100644 --- a/sdks/python/apache_beam/yaml/README.md +++ b/sdks/python/apache_beam/yaml/README.md @@ -166,41 +166,42 @@ Here we read two sources, join them, and write two outputs. ``` pipeline: - - type: ReadFromCsv - name: ReadLeft - config: - path: /path/to/left*.csv + transforms: + - type: ReadFromCsv + name: ReadLeft + config: + path: /path/to/left*.csv - - type: ReadFromCsv - name: ReadRight - config: - path: /path/to/right*.csv + - type: ReadFromCsv + name: ReadRight + config: + path: /path/to/right*.csv - - type: Sql - config: - query: select left.col1, right.col2 from left join right using (col3) - input: - left: ReadLeft - right: ReadRight - - - type: WriteToJson - name: WriteAll - input: Sql - config: - path: /path/to/all.json + - type: Sql + config: + query: select left.col1, right.col2 from left join right using (col3) + input: + left: ReadLeft + right: ReadRight - - type: Filter - name: FilterToBig - input: Sql - config: - language: python - keep: "col2 > 100" + - type: WriteToJson + name: WriteAll + input: Sql + config: + path: /path/to/all.json - - type: WriteToCsv - name: WriteBig - input: FilterToBig - config: - path: /path/to/big.csv + - type: Filter + name: FilterToBig + input: Sql + config: + language: python + keep: "col2 > 100" + + - type: WriteToCsv + name: WriteBig + input: FilterToBig + config: + path: /path/to/big.csv ``` One can, however, nest `chains` within a non-linear pipeline. @@ -209,49 +210,50 @@ that has a single input and contains its own sink. 
``` pipeline: - - type: ReadFromCsv - name: ReadLeft - config: - path: /path/to/left*.csv + transforms: + - type: ReadFromCsv + name: ReadLeft + config: + path: /path/to/left*.csv - - type: ReadFromCsv - name: ReadRight - config: - path: /path/to/right*.csv + - type: ReadFromCsv + name: ReadRight + config: + path: /path/to/right*.csv - - type: Sql - config: - query: select left.col1, right.col2 from left join right using (col3) - input: - left: ReadLeft - right: ReadRight - - - type: WriteToJson - name: WriteAll - input: Sql - config: - path: /path/to/all.json + - type: Sql + config: + query: select left.col1, right.col2 from left join right using (col3) + input: + left: ReadLeft + right: ReadRight - - type: chain - name: ExtraProcessingForBigRows - input: Sql - transforms: - - type: Filter - config: - language: python - keep: "col2 > 100" - - type: Filter - config: - language: python - keep: "len(col1) > 10" - - type: Filter - config: - language: python - keep: "col1 > 'z'" - sink: - type: WriteToCsv + - type: WriteToJson + name: WriteAll + input: Sql config: - path: /path/to/big.csv + path: /path/to/all.json + + - type: chain + name: ExtraProcessingForBigRows + input: Sql + transforms: + - type: Filter + config: + language: python + keep: "col2 > 100" + - type: Filter + config: + language: python + keep: "len(col1) > 10" + - type: Filter + config: + language: python + keep: "col1 > 'z'" + sink: + type: WriteToCsv + config: + path: /path/to/big.csv ``` ## Windowing @@ -329,25 +331,26 @@ a join per window. ``` pipeline: - - type: ReadFromPubSub - name: ReadLeft - config: - topic: leftTopic + transforms: + - type: ReadFromPubSub + name: ReadLeft + config: + topic: leftTopic - - type: ReadFromPubSub - name: ReadRight - config: - topic: rightTopic + - type: ReadFromPubSub + name: ReadRight + config: + topic: rightTopic - - type: Sql - config: - query: select left.col1, right.col2 from left join right using (col3) - input: - left: ReadLeft - right: ReadRight - windowing: - type: fixed - size: 60 + - type: Sql + config: + query: select left.col1, right.col2 from left join right using (col3) + input: + left: ReadLeft + right: ReadRight + windowing: + type: fixed + size: 60 ``` For a transform with no inputs, the specified windowing is instead applied to @@ -480,7 +483,7 @@ The Beam yaml parser is currently included as part of the Apache Beam Python SDK This can be installed (e.g. within a virtual environment) as ``` -pip install apache_beam +pip install apache_beam[yaml,gcp] ``` In addition, several of the provided transforms (such as SQL) are implemented diff --git a/sdks/python/apache_beam/yaml/cache_provider_artifacts.py b/sdks/python/apache_beam/yaml/cache_provider_artifacts.py index 6c96dd3b0fd92..5efc6d04355ba 100644 --- a/sdks/python/apache_beam/yaml/cache_provider_artifacts.py +++ b/sdks/python/apache_beam/yaml/cache_provider_artifacts.py @@ -16,6 +16,7 @@ # import logging +import sys import time from apache_beam.version import __version__ as beam_version @@ -37,7 +38,8 @@ def cache_provider_artifacts(): if '.dev' not in beam_version: # Also cache a base python venv for fast cloning. 
t = time.time() - artifacts = yaml_provider.PypiExpansionService._create_venv_to_clone() + artifacts = yaml_provider.PypiExpansionService._create_venv_to_clone( + sys.executable) logging.info('Cached %s in %0.03f seconds.', artifacts, time.time() - t) diff --git a/sdks/python/apache_beam/yaml/json_utils.py b/sdks/python/apache_beam/yaml/json_utils.py new file mode 100644 index 0000000000000..e11d18720617d --- /dev/null +++ b/sdks/python/apache_beam/yaml/json_utils.py @@ -0,0 +1,219 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Utilities for converting between JSON and Beam Schema'd data. + +For internal use, no backward compatibility guarantees. +""" + +import json +from typing import Any +from typing import Callable +from typing import Dict +from typing import Optional + +import jsonschema + +import apache_beam as beam +from apache_beam.portability.api import schema_pb2 +from apache_beam.typehints import schemas + +JSON_ATOMIC_TYPES_TO_BEAM = { + 'boolean': schema_pb2.BOOLEAN, + 'integer': schema_pb2.INT64, + 'number': schema_pb2.DOUBLE, + 'string': schema_pb2.STRING, +} + + +def json_schema_to_beam_schema( + json_schema: Dict[str, Any]) -> schema_pb2.Schema: + """Returns a Beam schema equivalent for the given Json schema.""" + def maybe_nullable(beam_type, nullable): + if nullable: + beam_type.nullable = True + return beam_type + + json_type = json_schema.get('type', None) + if json_type != 'object': + raise ValueError(f'Expected object type, got {json_type}.') + if 'properties' not in json_schema: + # Technically this is a valid (vacuous) schema, but as it's not generally + # meaningful, throw an informative error instead. + # (We could add a flag to allow this degenerate case.) + raise ValueError(f'Missing properties for {json_schema}.') + required = set(json_schema.get('required', [])) + return schema_pb2.Schema( + fields=[ + schemas.schema_field( + name, + maybe_nullable(json_type_to_beam_type(t), name not in required)) + for (name, t) in json_schema['properties'].items() + ]) + + +def json_type_to_beam_type(json_type: Dict[str, Any]) -> schema_pb2.FieldType: + """Returns a Beam schema type for the given Json (schema) type.""" + if not isinstance(json_type, dict) or 'type' not in json_type: + raise ValueError(f'Malformed type {json_type}.') + type_name = json_type['type'] + if type_name in JSON_ATOMIC_TYPES_TO_BEAM: + return schema_pb2.FieldType( + atomic_type=JSON_ATOMIC_TYPES_TO_BEAM[type_name]) + elif type_name == 'array': + return schema_pb2.FieldType( + array_type=schema_pb2.ArrayType( + element_type=json_type_to_beam_type(json_type['items']))) + elif type_name == 'object': + if 'properties' in json_type: + return schema_pb2.FieldType( + row_type=schema_pb2.RowType( + schema=json_schema_to_beam_schema(json_type))) + elif 'additionalProperties' in json_type: + return schema_pb2.FieldType( + map_type=schema_pb2.MapType( + key_type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING), + value_type=json_type_to_beam_type( + json_type['additionalProperties']))) + else: + raise ValueError( + f'Object type must have either properties or additionalProperties, ' + f'got {json_type}.') + else: + raise ValueError(f'Unable to convert {json_type} to a Beam schema.') + + +def json_to_row(beam_type: schema_pb2.FieldType) -> Callable[[Any], Any]: + """Returns a callable converting Json objects to Beam rows of the given type. + + The input to the returned callable is expected to conform to the Json schema + corresponding to this Beam type. + """ + type_info = beam_type.WhichOneof("type_info") + if type_info == "atomic_type": + return lambda value: value + elif type_info == "array_type": + element_converter = json_to_row(beam_type.array_type.element_type) + return lambda value: [element_converter(e) for e in value] + elif type_info == "iterable_type": + element_converter = json_to_row(beam_type.iterable_type.element_type) + return lambda value: [element_converter(e) for e in value] + elif type_info == "map_type": + if beam_type.map_type.key_type.atomic_type != schema_pb2.STRING: + raise TypeError( + f'Only strings allowed as map keys when converting from JSON, ' + f'found {beam_type}') + value_converter = json_to_row(beam_type.map_type.value_type) + return lambda value: {k: value_converter(v) for (k, v) in value.items()} + elif type_info == "row_type": + converters = { + field.name: json_to_row(field.type) + for field in beam_type.row_type.schema.fields + } + return lambda value: beam.Row( + ** + {name: convert(value[name]) + for (name, convert) in converters.items()}) + elif type_info == "logical_type": + return lambda value: value + else: + raise ValueError(f"Unrecognized type_info: {type_info!r}") + + +def json_parser( + beam_schema: schema_pb2.Schema, + json_schema: Optional[Dict[str, + Any]] = None) -> Callable[[bytes], beam.Row]: + """Returns a callable converting Json strings to Beam rows of the given type. + + The input to the returned callable is expected to conform to the Json schema + corresponding to this Beam type. + """ + if json_schema is None: + validate_fn = None + else: + cls = jsonschema.validators.validator_for(json_schema) + cls.check_schema(json_schema) + validate_fn = _PicklableFromConstructor( + lambda: jsonschema.validators.validator_for(json_schema) + (json_schema).validate) + + to_row = json_to_row( + schema_pb2.FieldType(row_type=schema_pb2.RowType(schema=beam_schema))) + + def parse(s: bytes): + o = json.loads(s) + if validate_fn is not None: + validate_fn(o) + return to_row(o) + + return parse + + +class _PicklableFromConstructor: + def __init__(self, constructor): + self._constructor = constructor + self._value = None + + def __call__(self, o): + if self._value is None: + self._value = self._constructor() + return self._value(o) + + def __getstate__(self): + return {'_constructor': self._constructor, '_value': None} + + +def row_to_json(beam_type: schema_pb2.FieldType) -> Callable[[Any], Any]: + """Returns a callable converting rows of the given type to Json objects.""" + type_info = beam_type.WhichOneof("type_info") + if type_info == "atomic_type": + return lambda value: value + elif type_info == "array_type": + element_converter = row_to_json(beam_type.array_type.element_type) + return lambda value: [element_converter(e) for e in value] + elif type_info == "iterable_type": + element_converter = row_to_json(beam_type.iterable_type.element_type) + return lambda value: [element_converter(e) for e in value] + elif type_info == "map_type": + if beam_type.map_type.key_type.atomic_type != schema_pb2.STRING: + raise TypeError( + f'Only strings allowed as map keys when converting to JSON, ' + f'found {beam_type}') + value_converter = row_to_json(beam_type.map_type.value_type) + return lambda value: {k: value_converter(v) for (k, v) in value.items()} + elif type_info == "row_type": + converters = { + field.name: row_to_json(field.type) + for field in beam_type.row_type.schema.fields + } + return lambda row: { + name: convert(getattr(row, name)) + for (name, convert) in converters.items() + } + elif type_info == "logical_type": + return lambda value: value + else: + raise ValueError(f"Unrecognized type_info: {type_info!r}") + + +def json_formater( + beam_schema: schema_pb2.Schema) -> Callable[[beam.Row], bytes]: + """Returns a callable converting rows of the given schema to Json strings.""" + convert = row_to_json( + schema_pb2.FieldType(row_type=schema_pb2.RowType(schema=beam_schema))) + return lambda row: json.dumps(convert(row), sort_keys=True).encode('utf-8') diff --git a/sdks/python/apache_beam/yaml/main.py b/sdks/python/apache_beam/yaml/main.py index eb0695f337b42..331b9e7b36166 100644 --- a/sdks/python/apache_beam/yaml/main.py +++ b/sdks/python/apache_beam/yaml/main.py @@ -20,6 +20,7 @@ import yaml import apache_beam as beam +from apache_beam.io.filesystems import FileSystems from apache_beam.typehints.schemas import LogicalType from apache_beam.typehints.schemas import MillisInstant from apache_beam.yaml import yaml_transform @@ -43,27 +44,30 @@ def _pipeline_spec_from_args(known_args): raise ValueError( "Exactly one of pipeline_spec or pipeline_spec_file must be set.") elif known_args.pipeline_spec_file: - with open(known_args.pipeline_spec_file) as fin: - pipeline_yaml = fin.read() + with FileSystems.open(known_args.pipeline_spec_file) as fin: + pipeline_yaml = fin.read().decode() elif known_args.pipeline_spec: pipeline_yaml = known_args.pipeline_spec else: raise ValueError( "Exactly one of pipeline_spec or pipeline_spec_file must be set.") - return yaml.load(pipeline_yaml,
Loader=yaml_transform.SafeLineLoader) + return pipeline_yaml def run(argv=None): yaml_transform._LOGGER.setLevel('INFO') known_args, pipeline_args = _configure_parser(argv) - pipeline_spec = _pipeline_spec_from_args(known_args) + pipeline_yaml = _pipeline_spec_from_args(known_args) + pipeline_spec = yaml.load(pipeline_yaml, Loader=yaml_transform.SafeLineLoader) - with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( - pipeline_args, - pickle_library='cloudpickle', - **yaml_transform.SafeLineLoader.strip_metadata(pipeline_spec.get( - 'options', {})))) as p: + with beam.Pipeline( # linebreak for better yapf formatting + options=beam.options.pipeline_options.PipelineOptions( + pipeline_args, + pickle_library='cloudpickle', + **yaml_transform.SafeLineLoader.strip_metadata(pipeline_spec.get( + 'options', {}))), + display_data={'yaml': pipeline_yaml}) as p: print("Building pipeline...") yaml_transform.expand_pipeline(p, pipeline_spec) print("Running pipeline...") diff --git a/sdks/python/apache_beam/yaml/options.py b/sdks/python/apache_beam/yaml/options.py new file mode 100644 index 0000000000000..e80141c40b1d3 --- /dev/null +++ b/sdks/python/apache_beam/yaml/options.py @@ -0,0 +1,36 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from apache_beam.options import pipeline_options + + +class YamlOptions(pipeline_options.PipelineOptions): + @classmethod + def _add_argparse_args(cls, parser): + parser.add_argument( + '--yaml_experimental_features', + dest='yaml_experimental_features', + action='append', + default=[], + help=('Enable yaml features ahead of them being declared stable.')) + + @classmethod + def check_enabled(cls, pipeline, feature, description=None): + if feature not in pipeline._options.view_as(cls).yaml_experimental_features: + raise ValueError( + f'{description or feature} unsupported because ' + f'{feature} is not set in --yaml_experimental_features option.') diff --git a/sdks/python/apache_beam/yaml/pipeline.schema.yaml b/sdks/python/apache_beam/yaml/pipeline.schema.yaml index ef0d9fe0f2621..40f576c1618b7 100644 --- a/sdks/python/apache_beam/yaml/pipeline.schema.yaml +++ b/sdks/python/apache_beam/yaml/pipeline.schema.yaml @@ -15,7 +15,7 @@ # limitations under the License. 
# -$schema: 'http://json-schema.org/schema#' +$schema: 'http://json-schema.org/draft-07/schema#' $id: https://github.com/apache/beam/tree/master/sdks/python/apache_beam/yaml/pipeline.schema.yaml $defs: @@ -115,6 +115,23 @@ $defs: - $ref: '#/$defs/nestedTransform' - $ref: '#/$defs/implicitInputOutputs' + - if: + not: + anyOf: + - properties: { type: { const: composite }} + - properties: { type: { const: chain }} + then: + properties: + type: {} + name: {} + input: {} + output: {} + windowing: {} + config: { type: object } + __line__: {} + __uuid__: {} + additionalProperties: false + windowing: {} # TODO provider: @@ -128,27 +145,43 @@ $defs: properties: { __line__: {}} additionalProperties: type: string + config: { type: object } + __line__: {} + __uuid__: {} + additionalProperties: false required: - type - transforms + - config type: object properties: pipeline: - anyOf: - - type: array - items: - $ref: '#/$defs/transform' - - $ref: '#/$defs/transform' + allOf: + # These are the only top-level properties defined in pipeline. - type: object properties: - transforms: - type: array - items: - $ref: '#/$defs/transform' + type: + oneOf: + - { const: composite } + - { const: chain } + windowing: + $ref: '#/$defs/windowing' + transforms: {} + extra_transforms: {} + sink: {} + source: {} __line__: {} __uuid__: {} additionalProperties: false + # This defines the allowable contents of the attributes above. + - $ref: '#/$defs/nestedTransform' + # A chain-type transform, like a chain composite, must have implicit io. + - if: + properties: { type: { const: chain }} + required: [type] + then: + $ref: '#/$defs/implicitInputOutputs' providers: type: array items: diff --git a/sdks/python/apache_beam/yaml/readme_test.py b/sdks/python/apache_beam/yaml/readme_test.py index d918d18e11dd0..7f2d193bf35f5 100644 --- a/sdks/python/apache_beam/yaml/readme_test.py +++ b/sdks/python/apache_beam/yaml/readme_test.py @@ -26,13 +26,13 @@ import tempfile import unittest +import mock import yaml from yaml.loader import SafeLoader import apache_beam as beam from apache_beam.options.pipeline_options import PipelineOptions from apache_beam.typehints import trivial_inference -from apache_beam.yaml import yaml_mapping from apache_beam.yaml import yaml_provider from apache_beam.yaml import yaml_transform @@ -200,7 +200,10 @@ def test(self): if write in test_yaml: spec = replace_recursive(spec, write, 'path', env.output_file()) modified_yaml = yaml.dump(spec) - options = {'pickle_library': 'cloudpickle'} + options = { + 'pickle_library': 'cloudpickle', + 'yaml_experimental_features': ['Combine'] + } if RENDER_DIR is not None: options['runner'] = 'apache_beam.runners.render.RenderRunner' options['render_output'] = [ @@ -208,13 +211,12 @@ def test(self): ] options['render_leaf_composite_nodes'] = ['.*'] test_provider = TestProvider(TEST_TRANSFORMS) - test_sql_mapping_provider = yaml_mapping.SqlMappingProvider(test_provider) - p = beam.Pipeline(options=PipelineOptions(**options)) - yaml_transform.expand_pipeline( - p, - modified_yaml, - yaml_provider.merge_providers( - [test_provider, test_sql_mapping_provider])) + with mock.patch( + 'apache_beam.yaml.yaml_provider.SqlBackedProvider.sql_provider', + lambda self: test_provider): + p = beam.Pipeline(options=PipelineOptions(**options)) + yaml_transform.expand_pipeline( + p, modified_yaml, yaml_provider.merge_providers([test_provider])) if test_type == 'BUILD': return p.run().wait_until_finish() @@ -270,6 +272,9 @@ def createTestSuite(name, path): 'ErrorHandlingTest', 
os.path.join(os.path.dirname(__file__), 'yaml_errors.md')) +CombineTest = createTestSuite( + 'CombineTest', os.path.join(os.path.dirname(__file__), 'yaml_combine.md')) + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--render_dir', default=None) diff --git a/sdks/python/apache_beam/yaml/standard_io.yaml b/sdks/python/apache_beam/yaml/standard_io.yaml index 9ad4f53ba1f62..b19c1e5b063ef 100644 --- a/sdks/python/apache_beam/yaml/standard_io.yaml +++ b/sdks/python/apache_beam/yaml/standard_io.yaml @@ -46,6 +46,40 @@ config: gradle_target: 'sdks:java:extensions:sql:expansion-service:shadowJar' +- type: renaming + transforms: + 'ReadFromKafka': 'ReadFromKafka' + 'WriteToKafka': 'WriteToKafka' + config: + mappings: + 'ReadFromKafka': + 'schema': 'schema' + 'consumer_config': 'consumerConfigUpdates' + 'format': 'format' + 'topic': 'topic' + 'bootstrap_servers': 'bootstrapServers' + 'confluent_schema_registry_url': 'confluentSchemaRegistryUrl' + 'confluent_schema_registry_subject': 'confluentSchemaRegistrySubject' + 'auto_offset_reset_config': 'autoOffsetResetConfig' + 'error_handling': 'errorHandling' + 'file_descriptor_path': 'fileDescriptorPath' + 'message_name': 'messageName' + 'WriteToKafka': + 'format': 'format' + 'topic': 'topic' + 'bootstrap_servers': 'bootstrapServers' + 'producer_config_updates': 'ProducerConfigUpdates' + 'error_handling': 'errorHandling' + 'file_descriptor_path': 'fileDescriptorPath' + 'message_name': 'messageName' + underlying_provider: + type: beamJar + transforms: + 'ReadFromKafka': 'beam:schematransform:org.apache.beam:kafka_read:v1' + 'WriteToKafka': 'beam:schematransform:org.apache.beam:kafka_write:v1' + config: + gradle_target: 'sdks:java:io:expansion-service:shadowJar' + - type: python transforms: 'ReadFromBigQuery': 'apache_beam.yaml.yaml_io.read_from_bigquery' @@ -67,6 +101,10 @@ 'WriteToCsv': 'WriteToCsv' 'ReadFromJson': 'ReadFromJson' 'WriteToJson': 'WriteToJson' + 'ReadFromParquet': 'ReadFromParquet' + 'WriteToParquet': 'WriteToParquet' + 'ReadFromAvro': 'ReadFromAvro' + 'WriteToAvro': 'WriteToAvro' config: mappings: 'ReadFromCsv': @@ -77,6 +115,19 @@ path: 'path' 'WriteToJson': path: 'path' + 'ReadFromParquet': + path: 'file_pattern' + 'WriteToParquet': + path: 'file_path_prefix' + 'ReadFromAvro': + path: 'file_pattern' + 'WriteToAvro': + path: 'file_path_prefix' + defaults: + 'ReadFromParquet': + as_rows: True + 'ReadFromAvro': + as_rows: True underlying_provider: type: python transforms: @@ -84,3 +135,88 @@ 'WriteToCsv': 'apache_beam.io.WriteToCsv' 'ReadFromJson': 'apache_beam.io.ReadFromJson' 'WriteToJson': 'apache_beam.io.WriteToJson' + 'ReadFromParquet': 'apache_beam.io.ReadFromParquet' + 'WriteToParquet': 'apache_beam.io.WriteToParquet' + 'ReadFromAvro': 'apache_beam.io.ReadFromAvro' + 'WriteToAvro': 'apache_beam.io.WriteToAvro' + +- type: beamJar + transforms: + 'WriteToCsv': 'beam:schematransform:org.apache.beam:csv_write:v1' + 'WriteToJson': 'beam:schematransform:org.apache.beam:json_write:v1' + config: + gradle_target: 'sdks:java:extensions:schemaio-expansion-service:shadowJar' + +- type: renaming + transforms: + 'ReadFromJdbc': 'ReadFromJdbc' + 'WriteToJdbc': 'WriteToJdbc' + 'ReadFromMySql': 'ReadFromJdbc' + 'WriteToMySql': 'WriteToJdbc' + 'ReadFromPostgres': 'ReadFromJdbc' + 'WriteToPostgres': 'WriteToJdbc' + 'ReadFromOracle': 'ReadFromJdbc' + 'WriteToOracle': 'WriteToJdbc' + 'ReadFromSqlServer': 'ReadFromJdbc' + 'WriteToSqlServer': 'WriteToJdbc' + config: + mappings: + 'ReadFromJdbc': + driver_class_name: 
'driverClassName' + url: 'jdbcUrl' + username: 'username' + password: 'password' + table: 'location' + query: 'readQuery' + driver_jars: 'driverJars' + connection_properties: 'connectionProperties' + connection_init_sql: 'connectionInitSql' + 'WriteToJdbc': + driver_class_name: 'driverClassName' + url: 'jdbcUrl' + username: 'username' + password: 'password' + table: 'location' + driver_jars: 'driverJars' + connection_properties: 'connectionProperties' + connection_init_sql: 'connectionInitSql' + 'ReadFromMySql': 'ReadFromJdbc' + 'WriteToMySql': 'WriteToJdbc' + 'ReadFromPostgres': 'ReadFromJdbc' + 'WriteToPostgres': 'WriteToJdbc' + 'ReadFromOracle': 'ReadFromJdbc' + 'WriteToOracle': 'WriteToJdbc' + 'ReadFromSqlServer': 'ReadFromJdbc' + 'WriteToSqlServer': 'WriteToJdbc' + defaults: + 'ReadFromMySql': + driverClassName: 'com.mysql.jdbc.Driver' + 'WriteToMySql': + driverClassName: 'com.mysql.jdbc.Driver' + 'ReadFromPostgres': + driverClassName: 'org.postgresql.Driver' + 'WriteToPostgres': + driverClassName: 'org.postgresql.Driver' + 'ReadFromOracle': + driverClassName: 'oracle.jdbc.driver.OracleDriver' + 'WriteToOracle': + driverClassName: 'oracle.jdbc.driver.OracleDriver' + 'ReadFromSqlServer': + driverClassName: 'com.microsoft.sqlserver.jdbc.SQLServerDriver' + 'WriteToSqlServer': + driverClassName: 'com.microsoft.sqlserver.jdbc.SQLServerDriver' + underlying_provider: + type: beamJar + transforms: + 'ReadFromJdbc': 'beam:schematransform:org.apache.beam:jdbc_read:v1' + 'WriteToJdbc': 'beam:schematransform:org.apache.beam:jdbc_write:v1' + 'ReadFromMySql': 'beam:schematransform:org.apache.beam:jdbc_read:v1' + 'WriteToMySql': 'beam:schematransform:org.apache.beam:jdbc_write:v1' + 'ReadFromPostgres': 'beam:schematransform:org.apache.beam:jdbc_read:v1' + 'WriteToPostgres': 'beam:schematransform:org.apache.beam:jdbc_write:v1' + 'ReadFromOracle': 'beam:schematransform:org.apache.beam:jdbc_read:v1' + 'WriteToOracle': 'beam:schematransform:org.apache.beam:jdbc_write:v1' + 'ReadFromSqlServer': 'beam:schematransform:org.apache.beam:jdbc_read:v1' + 'WriteToSqlServer': 'beam:schematransform:org.apache.beam:jdbc_write:v1' + config: + gradle_target: 'sdks:java:extensions:schemaio-expansion-service:shadowJar' diff --git a/sdks/python/apache_beam/yaml/standard_providers.yaml b/sdks/python/apache_beam/yaml/standard_providers.yaml index cdb4036f98c21..c612d44120815 100644 --- a/sdks/python/apache_beam/yaml/standard_providers.yaml +++ b/sdks/python/apache_beam/yaml/standard_providers.yaml @@ -24,3 +24,41 @@ version: BEAM_VERSION transforms: Sql: 'beam:external:java:sql:v1' + MapToFields-java: "beam:schematransform:org.apache.beam:yaml:map_to_fields-java:v1" + MapToFields-generic: "beam:schematransform:org.apache.beam:yaml:map_to_fields-java:v1" + +- type: renaming + transforms: + 'MapToFields-java': 'MapToFields-java' + 'MapToFields-generic': 'MapToFields-java' + 'Filter-java': 'Filter-java' + 'Explode': 'Explode' + config: + mappings: + 'MapToFields-generic': + language: 'language' + append: 'append' + drop: 'drop' + fields: 'fields' + error_handling: 'errorHandling' + 'MapToFields-java': + language: 'language' + append: 'append' + drop: 'drop' + fields: 'fields' + error_handling: 'errorHandling' + 'Filter-java': + language: 'language' + keep: 'keep' + error_handling: 'errorHandling' + 'Explode': + fields: 'fields' + cross_product: 'crossProduct' + underlying_provider: + type: beamJar + transforms: + MapToFields-java: "beam:schematransform:org.apache.beam:yaml:map_to_fields-java:v1" + Filter-java: 
"beam:schematransform:org.apache.beam:yaml:filter-java:v1" + Explode: "beam:schematransform:org.apache.beam:yaml:explode:v1" + config: + gradle_target: 'sdks:java:extensions:sql:expansion-service:shadowJar' diff --git a/sdks/python/apache_beam/yaml/yaml_combine.md b/sdks/python/apache_beam/yaml/yaml_combine.md new file mode 100644 index 0000000000000..e2fef304fb0a1 --- /dev/null +++ b/sdks/python/apache_beam/yaml/yaml_combine.md @@ -0,0 +1,166 @@ + + +# Beam YAML Aggregations + +Beam YAML has EXPERIMENTAL ability to do aggregations to group and combine +values across records. The is accomplished via the `Combine` transform type. +Currently `Combine` needs to be in the `yaml_experimental_features` +option to use this transform. + +For example, one can write + +``` +- type: Combine + config: + group_by: col1 + combine: + total: + value: col2 + fn: + type: sum +``` + +If the function has no configuration requirements, it can be provided directly +as a string + +``` +- type: Combine + config: + group_by: col1 + combine: + total: + value: col2 + fn: sum +``` + +This can be simplified further if the output field name is the same as the input +field name + +``` +- type: Combine + config: + group_by: col1 + combine: + col2: sum +``` + +One can aggregate over may fields at once + +``` +- type: Combine + config: + group_by: col1 + combine: + col2: sum + col3: max +``` + +and/or group by more than one field + +``` +- type: Combine + config: + group_by: [col1, col2] + combine: + col3: sum +``` + +or none at all (which will result in a global combine with a single output) + +``` +- type: Combine + config: + group_by: [] + combine: + col2: sum + col3: max +``` + +## Windowed aggregation + +As with all transforms, `Combine` can take a windowing parameter + +``` +- type: Combine + windowing: + type: fixed + size: 60 + config: + group_by: col1 + combine: + col2: sum + col3: max +``` + +If no windowing specification is provided, it inherits the windowing +parameters from upstream, e.g. + +``` +- type: WindowInto + windowing: + type: fixed + size: 60 +- type: Combine + config: + group_by: col1 + combine: + col2: sum + col3: max +``` + +is equivalent to the previous example. + + +## Custom aggregation functions + +One can use aggregation functions defined in Python by setting the language +parameter. + +``` +- type: Combine + config: + language: python + group_by: col1 + combine: + biggest: + value: "col2 + col2" + fn: + type: 'apache_beam.transforms.combiners.TopCombineFn' + config: + n: 10 +``` + +## SQL-style aggregations + +By setting the language to SQL, one can provide full SQL snippets as the +combine fn. + +``` +- type: Combine + config: + language: sql + group_by: col1 + combine: + num_values: "count(*)" + total: "sum(col2)" +``` + +One can of course also use the `Sql` transform type and provide a query +directly. diff --git a/sdks/python/apache_beam/yaml/yaml_combine.py b/sdks/python/apache_beam/yaml/yaml_combine.py new file mode 100644 index 0000000000000..ef4974cff351f --- /dev/null +++ b/sdks/python/apache_beam/yaml/yaml_combine.py @@ -0,0 +1,205 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""This module defines the basic Combine operation.""" + +from typing import Any +from typing import Iterable +from typing import Mapping +from typing import Optional + +import apache_beam as beam +from apache_beam import typehints +from apache_beam.typehints import row_type +from apache_beam.typehints import trivial_inference +from apache_beam.typehints.decorators import get_type_hints +from apache_beam.typehints.schemas import named_fields_from_element_type +from apache_beam.utils import python_callable +from apache_beam.yaml import options +from apache_beam.yaml import yaml_mapping +from apache_beam.yaml import yaml_provider + +BUILTIN_COMBINE_FNS = { + 'sum': sum, + 'max': max, + 'min': min, + 'all': all, + 'any': any, + 'mean': beam.transforms.combiners.MeanCombineFn(), + 'count': beam.transforms.combiners.CountCombineFn(), +} + + +def normalize_combine(spec): + """Expands various shorthand specs for combine (which can otherwise be quite + verbose for simple cases). We do this here so that it doesn't need to be done + per language. The following are all equivalent:: + + dest: fn_type + + dest: + value: dest + fn: fn_type + + dest: + value: dest + fn: + type: fn_type + """ + from apache_beam.yaml.yaml_transform import SafeLineLoader + if spec['type'] == 'Combine': + config = spec.get('config') + if isinstance(config.get('group_by'), str): + config['group_by'] = [config['group_by']] + + def normalize_agg(dest, agg): + if isinstance(agg, str): + agg = {'fn': agg} + if 'value' not in agg and config.get('language') != 'sql': + agg['value'] = dest + if isinstance(agg['fn'], str): + agg['fn'] = {'type': agg['fn']} + return agg + + if 'combine' not in config: + raise ValueError('Missing combine parameter in Combine config.') + config['combine'] = { + dest: normalize_agg(dest, agg) + for (dest, + agg) in SafeLineLoader.strip_metadata(config['combine']).items() + } + return spec + + +class PyJsYamlCombine(beam.PTransform): + def __init__( + self, + group_by: Iterable[str], + combine: Mapping[str, Mapping[str, Any]], + language: Optional[str] = None): + self._group_by = group_by + self._combine = combine + self._language = language + + def expand(self, pcoll): + options.YamlOptions.check_enabled(pcoll.pipeline, 'Combine') + input_types = dict(named_fields_from_element_type(pcoll.element_type)) + all_fields = list(input_types.keys()) + unknown_keys = set(self._group_by) - set(all_fields) + if unknown_keys: + raise ValueError(f'Unknown grouping columns: {list(unknown_keys)}') + + def create_combine_fn(fn_spec): + if 'type' not in fn_spec: + raise ValueError(f'CombineFn spec missing type: {fn_spec}') + elif fn_spec['type'] in BUILTIN_COMBINE_FNS: + return BUILTIN_COMBINE_FNS[fn_spec['type']] + elif self._language == 'python': + # TODO(yaml): Support output_type here as well. + fn = python_callable.PythonCallableWithSource.load_from_source( + fn_spec['type']) + if 'config' in fn_spec: + fn = fn(**fn_spec['config']) + return fn + else: + raise TypeError(f'Unknown CombineFn: {fn_spec}') + + def extract_return_type(expr): + if isinstance(expr, str) and expr in input_types: + return input_types[expr] + expr_hints = get_type_hints(expr) + if (expr_hints and expr_hints.has_simple_output_type() and + expr_hints.simple_output_type(None) != typehints.Any): + return expr_hints.simple_output_type(None) + elif callable(expr): + return trivial_inference.infer_return_type(expr, [pcoll.element_type]) + else: + return Any + + # TODO(yaml): Support error handling. + transform = beam.GroupBy(*self._group_by) + output_types = [(k, input_types[k]) for k in self._group_by] + + for output, agg in self._combine.items(): + expr = yaml_mapping._as_callable( + all_fields, agg['value'], 'Combine', self._language) + fn = create_combine_fn(agg['fn']) + transform = transform.aggregate_field(expr, fn, output) + + # TODO(yaml): See if this logic can be pushed into GroupBy itself. + expr_type = extract_return_type(expr) + if isinstance(fn, beam.CombineFn): + # TODO(yaml): Better inference on CombineFns whose output types are + # functions of their input types + combined_type = extract_return_type(fn) + elif fn in (sum, min, max): + combined_type = expr_type + elif fn in (any, all): + combined_type = bool + else: + combined_type = Any + output_types.append((output, combined_type)) + + return pcoll | transform.with_output_types( + row_type.RowTypeConstraint.from_fields(output_types)) + + +@beam.ptransform.ptransform_fn +def _SqlCombineTransform( + pcoll, sql_transform_constructor, group_by, combine, language=None): + options.YamlOptions.check_enabled(pcoll.pipeline, 'Combine') + all_fields = [ + x for x, _ in named_fields_from_element_type(pcoll.element_type) + ] + unknown_keys = set(group_by) - set(all_fields) + if unknown_keys: + raise ValueError(f'Unknown grouping columns: {list(unknown_keys)}') + + def combine_col(dest, fn_spec): + if 'value' in fn_spec or 'config' in fn_spec['fn']: + expr = '%s(%s)' % ( + fn_spec['fn']['type'], + ', '.join([fn_spec['value']] + + list(fn_spec['fn'].get('config', {}).values()))) + else: + expr = fn_spec['fn']['type'] + return f'{expr} as {dest}' + + return pcoll | sql_transform_constructor( + 'SELECT %s FROM PCOLLECTION GROUP BY %s' % ( + ', '.join( + list(group_by) + + [combine_col(dest, fn_spec) + for dest, fn_spec in combine.items()]), + ', '.join(group_by), + )) + + +def create_combine_providers(): + return [ + yaml_provider.InlineProvider({ + 'Combine-generic': PyJsYamlCombine, + 'Combine-python': PyJsYamlCombine, + 'Combine-javascript': PyJsYamlCombine, + }), + yaml_provider.SqlBackedProvider({ + 'Combine-generic': _SqlCombineTransform, + 'Combine-sql': _SqlCombineTransform, + 'Combine-calcite': _SqlCombineTransform, + }), + ] diff --git a/sdks/python/apache_beam/yaml/yaml_combine_test.py b/sdks/python/apache_beam/yaml/yaml_combine_test.py new file mode 100644 index 0000000000000..ef696c89379f5 --- /dev/null +++ b/sdks/python/apache_beam/yaml/yaml_combine_test.py @@ -0,0 +1,173 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import logging +import unittest + +import apache_beam as beam +from apache_beam.testing.util import assert_that +from apache_beam.testing.util import equal_to +from apache_beam.yaml.yaml_transform import YamlTransform + +DATA = [ + beam.Row(a='x', b=1, c=101), + beam.Row(a='x', b=1, c=102), + beam.Row(a='y', b=1, c=103), + beam.Row(a='y', b=2, c=104), +] + + +class YamlCombineTest(unittest.TestCase): + def test_multiple_aggregations(self): + with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( + pickle_library='cloudpickle', yaml_experimental_features=['Combine' + ])) as p: + elements = p | beam.Create(DATA) + result = elements | YamlTransform( + ''' + type: Combine + config: + group_by: a + combine: + b: sum + c: max + ''') + assert_that( + result | beam.Map(lambda x: beam.Row(**x._asdict())), + equal_to([ + beam.Row(a='x', b=2, c=102), + beam.Row(a='y', b=3, c=104), + ])) + + def test_multiple_keys(self): + with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( + pickle_library='cloudpickle', yaml_experimental_features=['Combine' + ])) as p: + elements = p | beam.Create(DATA) + result = elements | YamlTransform( + ''' + type: Combine + config: + group_by: [a, b] + combine: + c: sum + ''') + assert_that( + result | beam.Map(lambda x: beam.Row(**x._asdict())), + equal_to([ + beam.Row(a='x', b=1, c=203), + beam.Row(a='y', b=1, c=103), + beam.Row(a='y', b=2, c=104), + ])) + + def test_no_keys(self): + with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( + pickle_library='cloudpickle', yaml_experimental_features=['Combine' + ])) as p: + elements = p | beam.Create(DATA) + result = elements | YamlTransform( + ''' + type: Combine + config: + group_by: [] + combine: + c: sum + ''') + assert_that( + result | beam.Map(lambda x: beam.Row(**x._asdict())), + equal_to([ + beam.Row(c=410), + ])) + + def test_multiple_combines(self): + with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( + pickle_library='cloudpickle', yaml_experimental_features=['Combine' + ])) as p: + elements = p | beam.Create(DATA) + result = elements | YamlTransform( + ''' + type: Combine + config: + group_by: a + combine: + min_c: + fn: min + value: c + max_c: + fn: max + value: c + ''') + assert_that( + result | beam.Map(lambda x: beam.Row(**x._asdict())), + equal_to([ + beam.Row(a='x', min_c=101, max_c=102), + beam.Row(a='y', min_c=103, max_c=104), + ])) + + def test_expression(self): + with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( + pickle_library='cloudpickle', yaml_experimental_features=['Combine' + ])) as p: + elements = p | beam.Create(DATA) + result = elements | YamlTransform( + ''' + type: Combine + config: + language: python + group_by: a + combine: + max: + fn: max + value: b + c + ''') + assert_that( + result | beam.Map(lambda x: beam.Row(**x._asdict())), + equal_to([ + beam.Row(a='x', max=103), + beam.Row(a='y', max=106), + ])) + + def 
test_config(self): + with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( + pickle_library='cloudpickle', yaml_experimental_features=['Combine' + ])) as p: + elements = p | beam.Create(DATA) + result = elements | YamlTransform( + ''' + type: Combine + config: + language: python + group_by: b + combine: + biggest: + fn: + type: 'apache_beam.transforms.combiners.TopCombineFn' + config: + n: 2 + value: c + ''') + assert_that( + result | beam.Map(lambda x: beam.Row(**x._asdict())), + equal_to([ + beam.Row(b=1, biggest=[103, 102]), + beam.Row(b=2, biggest=[104]), + ])) + + +if __name__ == '__main__': + logging.getLogger().setLevel(logging.INFO) + unittest.main() diff --git a/sdks/python/apache_beam/yaml/yaml_errors.md b/sdks/python/apache_beam/yaml/yaml_errors.md index e7a60f750a106..aec6023936747 100644 --- a/sdks/python/apache_beam/yaml/yaml_errors.md +++ b/sdks/python/apache_beam/yaml/yaml_errors.md @@ -67,6 +67,10 @@ Note that with `error_handling` declared, `MapToFields.my_error_output` logging the bad records to stdout would be sufficient (though not recommended for a robust pipeline). +Note also that the exact format of the error outputs is still being finalized. +They can be safely printed and written to outputs, but their precise schema +may change in a future version of Beam and should not yet be depended on. + Some transforms allow for extra arguments in their error_handling config, e.g. for Python functions one can give a `threshold` which limits the relative number of records that can be bad before considering the entire pipeline a failure diff --git a/sdks/python/apache_beam/yaml/yaml_io.py b/sdks/python/apache_beam/yaml/yaml_io.py index 4a1d124900571..bf4009719b804 100644 --- a/sdks/python/apache_beam/yaml/yaml_io.py +++ b/sdks/python/apache_beam/yaml/yaml_io.py @@ -23,22 +23,28 @@ implementations of the same transforms, the configs must be kept in sync. 
""" +import io import os from typing import Any +from typing import Callable from typing import Iterable from typing import List from typing import Mapping from typing import Optional +from typing import Tuple +import fastavro import yaml import apache_beam as beam import apache_beam.io as beam_io from apache_beam.io import ReadFromBigQuery from apache_beam.io import WriteToBigQuery +from apache_beam.io import avroio from apache_beam.io.gcp.bigquery import BigQueryDisposition from apache_beam.portability.api import schema_pb2 from apache_beam.typehints import schemas +from apache_beam.yaml import json_utils from apache_beam.yaml import yaml_mapping from apache_beam.yaml import yaml_provider @@ -131,18 +137,32 @@ def raise_exception(failed_row_with_error): return WriteToBigQueryHandlingErrors() -def _create_parser(format, schema): +def _create_parser( + format, + schema: Any) -> Tuple[schema_pb2.Schema, Callable[[bytes], beam.Row]]: if format == 'raw': if schema: raise ValueError('raw format does not take a schema') return ( schema_pb2.Schema(fields=[schemas.schema_field('payload', bytes)]), lambda payload: beam.Row(payload=payload)) + elif format == 'json': + beam_schema = json_utils.json_schema_to_beam_schema(schema) + return beam_schema, json_utils.json_parser(beam_schema, schema) + elif format == 'avro': + beam_schema = avroio.avro_schema_to_beam_schema(schema) + covert_to_row = avroio.avro_dict_to_beam_row(schema, beam_schema) + return ( + beam_schema, + lambda record: covert_to_row( + fastavro.schemaless_reader(io.BytesIO(record), schema))) else: raise ValueError(f'Unknown format: {format}') -def _create_formatter(format, schema, beam_schema): +def _create_formatter( + format, schema: Any, + beam_schema: schema_pb2.Schema) -> Callable[[beam.Row], bytes]: if format == 'raw': if schema: raise ValueError('raw format does not take a schema') @@ -150,6 +170,19 @@ def _create_formatter(format, schema, beam_schema): if len(field_names) != 1: raise ValueError(f'Expecting exactly one field, found {field_names}') return lambda row: getattr(row, field_names[0]) + elif format == 'json': + return json_utils.json_formater(beam_schema) + elif format == 'avro': + avro_schema = schema or avroio.beam_schema_to_avro_schema(beam_schema) + from_row = avroio.beam_row_to_avro_dict(avro_schema, beam_schema) + + def formatter(row): + buffer = io.BytesIO() + fastavro.schemaless_writer(buffer, avro_schema, from_row(row)) + buffer.seek(0) + return buffer.read() + + return formatter else: raise ValueError(f'Unknown format: {format}') @@ -182,6 +215,8 @@ def read_from_pubsub( - raw: Produces records with a single `payload` field whose contents are the raw bytes of the pubsub message. + - avro: Parses records with a given avro schema. + - json: Parses records with a given json schema. schema: Schema specification for the given format. attributes: List of attribute keys whose values will be flattened into the @@ -276,8 +311,12 @@ def write_to_pubsub( formats are - raw: Expects a message with a single field (excluding - attribute-related fields )whose contents are used as the raw bytes + attribute-related fields) whose contents are used as the raw bytes of the pubsub message. + - avro: Encodes records with a given avro schema, which may be inferred + from the input PCollection schema. + - json: Formats records with a given json schema, which may be inferred + from the input PCollection schema. schema: Schema specification for the given format. 
attributes: List of attribute keys whose values will be pulled out as diff --git a/sdks/python/apache_beam/yaml/yaml_io_test.py b/sdks/python/apache_beam/yaml/yaml_io_test.py index ab6298661c157..54fbac0fbeb29 100644 --- a/sdks/python/apache_beam/yaml/yaml_io_test.py +++ b/sdks/python/apache_beam/yaml/yaml_io_test.py @@ -15,9 +15,12 @@ # limitations under the License. # +import io +import json import logging import unittest +import fastavro import mock import apache_beam as beam @@ -167,6 +170,175 @@ def test_read_with_id_attribute(self): result, equal_to([beam.Row(payload=b'msg1'), beam.Row(payload=b'msg2')])) + _avro_schema = { + 'type': 'record', + 'name': 'ec', + 'fields': [{ + 'name': 'label', 'type': 'string' + }, { + 'name': 'rank', 'type': 'int' + }] + } + + def _encode_avro(self, data): + buffer = io.BytesIO() + fastavro.schemaless_writer(buffer, self._avro_schema, data) + buffer.seek(0) + return buffer.read() + + def test_read_avro(self): + + with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( + pickle_library='cloudpickle')) as p: + with mock.patch( + 'apache_beam.io.ReadFromPubSub', + FakeReadFromPubSub( + topic='my_topic', + messages=[PubsubMessage(self._encode_avro({'label': '37a', + 'rank': 1}), {}), + PubsubMessage(self._encode_avro({'label': '389a', + 'rank': 2}), {})])): + result = p | YamlTransform( + ''' + type: ReadFromPubSub + config: + topic: my_topic + format: avro + schema: %s + ''' % json.dumps(self._avro_schema)) + assert_that( + result, + equal_to( + [beam.Row(label='37a', rank=1), # linebreak + beam.Row(label='389a', rank=2)])) + + def test_read_json(self): + with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( + pickle_library='cloudpickle')) as p: + with mock.patch('apache_beam.io.ReadFromPubSub', + FakeReadFromPubSub( + topic='my_topic', + messages=[PubsubMessage( + b'{"generator": {"x": 0, "y": 0}, "rank": 1}', + {'weierstrass': 'y^2+y=x^3-x', 'label': '37a'}) + ])): + result = p | YamlTransform( + ''' + type: ReadFromPubSub + config: + topic: my_topic + format: json + schema: + type: object + properties: + generator: + type: object + properties: + x: {type: integer} + y: {type: integer} + rank: {type: integer} + attributes: [label] + attributes_map: other + ''') + assert_that( + result, + equal_to([ + beam.Row( + generator=beam.Row(x=0, y=0), + rank=1, + label='37a', + other={ + 'label': '37a', 'weierstrass': 'y^2+y=x^3-x' + }) + ])) + + def test_read_json_with_error_handling(self): + with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( + pickle_library='cloudpickle')) as p: + with mock.patch( + 'apache_beam.io.ReadFromPubSub', + FakeReadFromPubSub(topic='my_topic', + messages=[PubsubMessage('{"some_int": 123}', + attributes={}), + PubsubMessage('unparsable', + attributes={})])): + result = p | YamlTransform( + ''' + type: ReadFromPubSub + config: + topic: my_topic + format: json + schema: + type: object + properties: + some_int: {type: integer} + error_handling: + output: errors + ''') + assert_that( + result['good'], + equal_to([beam.Row(some_int=123)]), + label='CheckGood') + assert_that( + result['errors'] | beam.Map(lambda error: error.element), + equal_to(['unparsable']), + label='CheckErrors') + + def test_read_json_without_error_handling(self): + with self.assertRaises(Exception): + with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( + pickle_library='cloudpickle')) as p: + with mock.patch( + 'apache_beam.io.ReadFromPubSub', + FakeReadFromPubSub(topic='my_topic', 
+ messages=[PubsubMessage('{"some_int": 123}', + attributes={}), + PubsubMessage('unparsable', + attributes={})])): + _ = p | YamlTransform( + ''' + type: ReadFromPubSub + config: + topic: my_topic + format: json + schema: + type: object + properties: + some_int: {type: integer} + ''') + + def test_read_json_with_bad_schema(self): + with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( + pickle_library='cloudpickle')) as p: + with mock.patch('apache_beam.io.ReadFromPubSub', + FakeReadFromPubSub( + topic='my_topic', + messages=[PubsubMessage('{"some_int": 123}', + attributes={}), + PubsubMessage('{"some_int": "NOT"}', + attributes={})])): + result = p | YamlTransform( + ''' + type: ReadFromPubSub + config: + topic: my_topic + format: json + schema: + type: object + properties: + some_int: {type: integer} + error_handling: + output: errors + ''') + assert_that( + result['good'], + equal_to([beam.Row(some_int=123)]), + label='CheckGood') + assert_that( + result['errors'] | beam.Map(lambda error: error.element), + equal_to(['{"some_int": "NOT"}']), + label='CheckErrors') + def test_simple_write(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( pickle_library='cloudpickle')) as p: @@ -179,7 +351,6 @@ def test_simple_write(self): | YamlTransform( ''' type: WriteToPubSub - input: input config: topic: my_topic format: raw @@ -201,7 +372,6 @@ def test_write_with_attribute(self): ]) | YamlTransform( ''' type: WriteToPubSub - input: input config: topic: my_topic format: raw @@ -224,7 +394,6 @@ def test_write_with_attribute_map(self): ]) | YamlTransform( ''' type: WriteToPubSub - input: input config: topic: my_topic format: raw @@ -244,13 +413,61 @@ def test_write_with_id_attribute(self): | YamlTransform( ''' type: WriteToPubSub - input: input config: topic: my_topic format: raw id_attribute: some_attr ''')) + def test_write_avro(self): + with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( + pickle_library='cloudpickle')) as p: + with mock.patch( + 'apache_beam.io.WriteToPubSub', + FakeWriteToPubSub( + topic='my_topic', + messages=[PubsubMessage(self._encode_avro({'label': '37a', + 'rank': 1}), {}), + PubsubMessage(self._encode_avro({'label': '389a', + 'rank': 2}), {})])): + _ = ( + p | beam.Create( + [beam.Row(label='37a', rank=1), beam.Row(label='389a', rank=2)]) + | YamlTransform( + ''' + type: WriteToPubSub + config: + topic: my_topic + format: avro + ''')) + + def test_write_json(self): + with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( + pickle_library='cloudpickle')) as p: + with mock.patch('apache_beam.io.WriteToPubSub', + FakeWriteToPubSub( + topic='my_topic', + messages=[PubsubMessage( + b'{"generator": {"x": 0, "y": 0}, "rank": 1}', + {'weierstrass': 'y^2+y=x^3-x', 'label': '37a'}) + ])): + _ = ( + p | beam.Create([ + beam.Row( + label='37a', + generator=beam.Row(x=0, y=0), + rank=1, + other={'weierstrass': 'y^2+y=x^3-x'}) + ]) | YamlTransform( + ''' + type: WriteToPubSub + config: + topic: my_topic + format: json + attributes: [label] + attributes_map: other + ''')) + if __name__ == '__main__': logging.getLogger().setLevel(logging.INFO) diff --git a/sdks/python/apache_beam/yaml/yaml_mapping.md b/sdks/python/apache_beam/yaml/yaml_mapping.md index 653b4abe8b89e..e760b691e13d6 100644 --- a/sdks/python/apache_beam/yaml/yaml_mapping.md +++ b/sdks/python/apache_beam/yaml/yaml_mapping.md @@ -200,3 +200,47 @@ criteria. This can be accomplished with a `Filter` transform, e.g. 
     language: sql
     keep: "col2 > 0"
 ```
+
+## Types
+
+Beam will try to infer the types involved in the mappings, but sometimes this
+is not possible. In these cases one can explicitly denote the expected output
+type, e.g.
+
+```
+- type: MapToFields
+  config:
+    language: python
+    fields:
+      new_col:
+        expression: "col1.upper()"
+        output_type: string
+```
+
+The expected type is given in json schema notation, with the addition that
+a top-level basic type may be given as a literal string rather than requiring
+a `{type: 'basic_type_name'}` nesting.
+
+```
+- type: MapToFields
+  config:
+    language: python
+    fields:
+      new_col:
+        expression: "col1.upper()"
+        output_type: string
+      another_col:
+        expression: "beam.Row(a=col1, b=[col2])"
+        output_type:
+          type: 'object'
+          properties:
+            a:
+              type: 'string'
+            b:
+              type: 'array'
+              items:
+                type: 'number'
+```
+
+This can be especially useful for resolving errors involving the inability to
+handle the `beam:logical:pythonsdk_any:v1` type.
diff --git a/sdks/python/apache_beam/yaml/yaml_mapping.py b/sdks/python/apache_beam/yaml/yaml_mapping.py
index 889f7f1ee3092..42af11ae2456a 100644
--- a/sdks/python/apache_beam/yaml/yaml_mapping.py
+++ b/sdks/python/apache_beam/yaml/yaml_mapping.py
@@ -17,24 +17,32 @@
 """This module defines the basic MapToFields operation."""
 
 import itertools
+from collections import abc
 from typing import Any
 from typing import Callable
 from typing import Collection
 from typing import Dict
-from typing import Iterable
 from typing import Mapping
 from typing import Optional
 from typing import Union
 
 import js2py
+from js2py import base
+from js2py.constructors import jsdate
+from js2py.internals import simplex
 
 import apache_beam as beam
 from apache_beam.io.filesystems import FileSystems
+from apache_beam.portability.api import schema_pb2
 from apache_beam.typehints import row_type
+from apache_beam.typehints import schemas
 from apache_beam.typehints import trivial_inference
 from apache_beam.typehints.schemas import named_fields_from_element_type
 from apache_beam.utils import python_callable
+from apache_beam.yaml import json_utils
+from apache_beam.yaml import options
 from apache_beam.yaml import yaml_provider
+from apache_beam.yaml.yaml_provider import dicts_to_rows
 
 
 def _check_mapping_arguments(
@@ -71,19 +79,67 @@ def __setstate__(self, state):
     self.__dict__.update(state)
 
 
+# TODO(yaml) Improve type inferencing for JS UDFs
+def py_value_to_js_dict(py_value):
+  if ((isinstance(py_value, tuple) and hasattr(py_value, '_asdict')) or
+      isinstance(py_value, beam.Row)):
+    py_value = py_value._asdict()
+  if isinstance(py_value, dict):
+    return {key: py_value_to_js_dict(value) for key, value in py_value.items()}
+  elif not isinstance(py_value, str) and isinstance(py_value, abc.Iterable):
+    return [py_value_to_js_dict(value) for value in list(py_value)]
+  else:
+    return py_value
+
+
 # TODO(yaml) Consider adding optional language version parameter to support
 # ECMAScript 5 and 6
 def _expand_javascript_mapping_func(
     original_fields, expression=None, callable=None, path=None, name=None):
+
+  js_array_type = (
+      base.PyJsArray,
+      base.PyJsArrayBuffer,
+      base.PyJsInt8Array,
+      base.PyJsUint8Array,
+      base.PyJsUint8ClampedArray,
+      base.PyJsInt16Array,
+      base.PyJsUint16Array,
+      base.PyJsInt32Array,
+      base.PyJsUint32Array,
+      base.PyJsFloat32Array,
+      base.PyJsFloat64Array)
+
+  def _js_object_to_py_object(obj):
+    if isinstance(obj, (base.PyJsNumber, base.PyJsString, base.PyJsBoolean)):
+      return base.to_python(obj)
+    elif isinstance(obj, js_array_type):
+      return [_js_object_to_py_object(value) for value in obj.to_list()]
+    elif isinstance(obj, jsdate.PyJsDate):
+      return obj.to_utc_dt()
+    elif isinstance(obj, (base.PyJsNull, base.PyJsUndefined)):
+      return None
+    elif isinstance(obj, base.PyJsError):
+      raise RuntimeError(obj['message'])
+    elif isinstance(obj, base.PyJsObject):
+      return {
+          key: _js_object_to_py_object(value['value'])
+          for (key, value) in obj.own.items()
+      }
+    elif isinstance(obj, base.JsObjectWrapper):
+      return _js_object_to_py_object(obj._obj)
+
+    return obj
+
   if expression:
-    args = ', '.join(original_fields)
-    js_func = f'function fn({args}) {{return ({expression})}}'
-    js_callable = _CustomJsObjectWrapper(js2py.eval_js(js_func))
-    return lambda __row__: js_callable(*__row__._asdict().values())
+    source = '\n'.join(['function(__row__) {'] + [
+        f'  {name} = __row__.{name}'
+        for name in original_fields if name in expression
+    ] + ['  return (' + expression + ')'] + ['}'])
+    js_func = _CustomJsObjectWrapper(js2py.eval_js(source))
 
   elif callable:
-    js_callable = _CustomJsObjectWrapper(js2py.eval_js(callable))
-    return lambda __row__: js_callable(__row__._asdict())
+    js_func = _CustomJsObjectWrapper(js2py.eval_js(callable))
 
   else:
     if not path.endswith('.js'):
@@ -91,8 +147,19 @@ def _expand_javascript_mapping_func(
     udf_code = FileSystems.open(path).read().decode()
     js = js2py.EvalJs()
     js.eval(udf_code)
-    js_callable = _CustomJsObjectWrapper(getattr(js, name))
-    return lambda __row__: js_callable(__row__._asdict())
+    js_func = _CustomJsObjectWrapper(getattr(js, name))
+
+  def js_wrapper(row):
+    row_as_dict = py_value_to_js_dict(row)
+    try:
+      js_result = js_func(row_as_dict)
+    except simplex.JsException as exn:
+      raise RuntimeError(
+          f"Error evaluating javascript expression: "
+          f"{exn.mes['message']}") from exn
+    return dicts_to_rows(_js_object_to_py_object(js_result))
+
+  return js_wrapper
 
 
 def _expand_python_mapping_func(
@@ -120,11 +187,48 @@ def _expand_python_mapping_func(
     return python_callable.PythonCallableWithSource(source)
 
 
+def _validator(beam_type: schema_pb2.FieldType) -> Callable[[Any], bool]:
+  """Returns a callable checking that values conform to the given Beam type."""
+  type_info = beam_type.WhichOneof("type_info")
+  if type_info == "atomic_type":
+    if beam_type.atomic_type == schema_pb2.BOOLEAN:
+      return lambda x: isinstance(x, bool)
+    elif beam_type.atomic_type == schema_pb2.INT64:
+      return lambda x: isinstance(x, int)
+    elif beam_type.atomic_type == schema_pb2.DOUBLE:
+      return lambda x: isinstance(x, (int, float))
+    elif beam_type.atomic_type == schema_pb2.STRING:
+      return lambda x: isinstance(x, str)
+    else:
+      raise ValueError(
+          f'Unknown or unsupported atomic type: {beam_type.atomic_type}')
+  elif type_info == "array_type":
+    element_validator = _validator(beam_type.array_type.element_type)
+    return lambda value: all(element_validator(e) for e in value)
+  elif type_info == "iterable_type":
+    element_validator = _validator(beam_type.iterable_type.element_type)
+    return lambda value: all(element_validator(e) for e in value)
+  elif type_info == "map_type":
+    key_validator = _validator(beam_type.map_type.key_type)
+    value_validator = _validator(beam_type.map_type.value_type)
+    return lambda value: all(
+        key_validator(k) and value_validator(v) for (k, v) in value.items())
+  elif type_info == "row_type":
+    validators = {
+        field.name: _validator(field.type)
+        for field in beam_type.row_type.schema.fields
+    }
+    return lambda row: all(
+        validator(getattr(row, name))
+        for (name, validator) in validators.items())
+  else:
+    raise ValueError(f"Unrecognized type_info: {type_info!r}")
+
+
 def _as_callable(original_fields, expr, transform_name, language):
   if expr in original_fields:
     return expr
-  # TODO(yaml): support a type parameter
   # TODO(yaml): support an imports parameter
   # TODO(yaml): support a requirements parameter (possibly at a higher level)
   if isinstance(expr, str):
@@ -132,20 +236,36 @@ def _as_callable(original_fields, expr, transform_name, language):
   if not isinstance(expr, dict):
     raise ValueError(
         f"Ambiguous expression type (perhaps missing quoting?): {expr}")
-  elif len(expr) != 1 and ('path' not in expr or 'name' not in expr):
-    raise ValueError(f"Ambiguous expression type: {list(expr.keys())}")
-
+  explicit_type = expr.pop('output_type', None)
   _check_mapping_arguments(transform_name, **expr)
 
   if language == "javascript":
-    return _expand_javascript_mapping_func(original_fields, **expr)
+    func = _expand_javascript_mapping_func(original_fields, **expr)
   elif language == "python":
-    return _expand_python_mapping_func(original_fields, **expr)
+    func = _expand_python_mapping_func(original_fields, **expr)
   else:
     raise ValueError(
        f'Unknown language for mapping transform: {language}. '
        'Supported languages are "javascript" and "python."')
+  if explicit_type:
+    if isinstance(explicit_type, str):
+      explicit_type = {'type': explicit_type}
+    beam_type = json_utils.json_type_to_beam_type(explicit_type)
+    validator = _validator(beam_type)
+
+    @beam.typehints.with_output_types(schemas.typing_from_runner_api(beam_type))
+    def checking_func(row):
+      result = func(row)
+      if not validator(result):
+        raise TypeError(f'{result} violates schema {explicit_type}')
+      return result
+
+    return checking_func
+
+  else:
+    return func
+
 
 def exception_handling_args(error_handling_spec):
   if error_handling_spec:
@@ -183,8 +303,6 @@ def expand(pcoll, error_handling=None, **kwargs):
   return expand
 
 
-# TODO(yaml): This should be available in all environments, in which case
-# we choose the one that matches best.
class _Explode(beam.PTransform): def __init__( self, @@ -233,11 +351,12 @@ def explode_zip(base, fields): copy[field] = values[ix] yield beam.Row(**copy) + cross_product = self._cross_product return ( pcoll | beam.FlatMap( lambda row: - (explode_cross_product if self._cross_product else explode_zip) + (explode_cross_product if cross_product else explode_zip) ({name: getattr(row, name) for name in all_fields}, to_explode))) @@ -257,6 +376,9 @@ def with_exception_handling(self, **kwargs): @maybe_with_exception_handling_transform_fn def _PyJsFilter( pcoll, keep: Union[str, Dict[str, str]], language: Optional[str] = None): + if language == 'javascript': + options.YamlOptions.check_enabled(pcoll.pipeline, 'javascript') + try: input_schema = dict(named_fields_from_element_type(pcoll.element_type)) except (TypeError, ValueError) as exn: @@ -327,6 +449,9 @@ def normalize_fields(pcoll, fields, drop=(), append=False, language='generic'): def _PyJsMapToFields(pcoll, language='generic', **mapping_args): input_schema, fields = normalize_fields( pcoll, language=language, **mapping_args) + if language == 'javascript': + options.YamlOptions.check_enabled(pcoll.pipeline, 'javascript') + original_fields = list(input_schema.keys()) return pcoll | beam.Select( @@ -336,62 +461,14 @@ def _PyJsMapToFields(pcoll, language='generic', **mapping_args): }) -class SqlMappingProvider(yaml_provider.Provider): - def __init__(self, sql_provider=None): - if sql_provider is None: - sql_provider = yaml_provider.beam_jar( - urns={'Sql': 'beam:external:java:sql:v1'}, - gradle_target='sdks:java:extensions:sql:expansion-service:shadowJar') - self._sql_provider = sql_provider - - def available(self): - return self._sql_provider.available() - - def cache_artifacts(self): - return self._sql_provider.cache_artifacts() - - def provided_transforms(self) -> Iterable[str]: - return [ - 'Filter-sql', - 'Filter-calcite', - 'MapToFields-sql', - 'MapToFields-calcite' - ] - - def create_transform( - self, - typ: str, - args: Mapping[str, Any], - yaml_create_transform: Callable[ - [Mapping[str, Any], Iterable[beam.PCollection]], beam.PTransform] - ) -> beam.PTransform: - if typ.startswith('Filter-'): - return _SqlFilterTransform( - self._sql_provider, yaml_create_transform, **args) - if typ.startswith('MapToFields-'): - return _SqlMapToFieldsTransform( - self._sql_provider, yaml_create_transform, **args) - else: - raise NotImplementedError(typ) - - def underlying_provider(self): - return self._sql_provider - - def to_json(self): - return {'type': "SqlMappingProvider"} - - @beam.ptransform.ptransform_fn -def _SqlFilterTransform( - pcoll, sql_provider, yaml_create_transform, keep, language): - return pcoll | sql_provider.create_transform( - 'Sql', {'query': f'SELECT * FROM PCOLLECTION WHERE {keep}'}, - yaml_create_transform) +def _SqlFilterTransform(pcoll, sql_transform_constructor, keep, language): + return pcoll | sql_transform_constructor( + f'SELECT * FROM PCOLLECTION WHERE {keep}') @beam.ptransform.ptransform_fn -def _SqlMapToFieldsTransform( - pcoll, sql_provider, yaml_create_transform, **mapping_args): +def _SqlMapToFieldsTransform(pcoll, sql_transform_constructor, **mapping_args): _, fields = normalize_fields(pcoll, **mapping_args) def extract_expr(name, v): @@ -407,8 +484,7 @@ def extract_expr(name, v): for (name, expr) in fields.items() ] query = "SELECT " + ", ".join(selects) + " FROM PCOLLECTION" - return pcoll | sql_provider.create_transform( - 'Sql', {'query': query}, yaml_create_transform) + return pcoll | 
sql_transform_constructor(query) def create_mapping_providers(): @@ -424,5 +500,10 @@ def create_mapping_providers(): 'MapToFields-javascript': _PyJsMapToFields, 'MapToFields-generic': _PyJsMapToFields, }), - SqlMappingProvider(), + yaml_provider.SqlBackedProvider({ + 'Filter-sql': _SqlFilterTransform, + 'Filter-calcite': _SqlFilterTransform, + 'MapToFields-sql': _SqlMapToFieldsTransform, + 'MapToFields-calcite': _SqlMapToFieldsTransform, + }), ] diff --git a/sdks/python/apache_beam/yaml/yaml_mapping_test.py b/sdks/python/apache_beam/yaml/yaml_mapping_test.py index 55032aeae52e9..0de2f7022550c 100644 --- a/sdks/python/apache_beam/yaml/yaml_mapping_test.py +++ b/sdks/python/apache_beam/yaml/yaml_mapping_test.py @@ -40,7 +40,6 @@ def test_basic(self): result = elements | YamlTransform( ''' type: MapToFields - input: input config: language: python fields: @@ -62,7 +61,6 @@ def test_drop(self): result = elements | YamlTransform( ''' type: MapToFields - input: input config: fields: {} append: true @@ -83,7 +81,6 @@ def test_filter(self): result = elements | YamlTransform( ''' type: Filter - input: input config: language: python keep: "rank > 0" @@ -106,7 +103,6 @@ def test_explode(self): result = elements | YamlTransform( ''' type: chain - input: input transforms: - type: MapToFields config: @@ -136,6 +132,27 @@ def test_explode(self): beam.Row(a=3, b='y', c=.125, range=2), ])) + def test_validate_explicit_types(self): + with self.assertRaisesRegex(TypeError, r'.*violates schema.*'): + with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( + pickle_library='cloudpickle')) as p: + elements = p | beam.Create([ + beam.Row(a=2, b='abc', c=.25), + beam.Row(a=3, b='xy', c=.125), + ]) + result = elements | YamlTransform( + ''' + type: MapToFields + input: input + config: + language: python + fields: + bad: + expression: "a + c" + output_type: string # This is a lie. 
+ ''') + self.assertEqual(result.element_type._fields[0][1], str) + YamlMappingDocTest = createTestSuite( 'YamlMappingDocTest', diff --git a/sdks/python/apache_beam/yaml/yaml_provider.py b/sdks/python/apache_beam/yaml/yaml_provider.py index 630e63c31d8a2..01e39b770c9b0 100644 --- a/sdks/python/apache_beam/yaml/yaml_provider.py +++ b/sdks/python/apache_beam/yaml/yaml_provider.py @@ -28,7 +28,6 @@ import subprocess import sys import urllib.parse -import uuid from typing import Any from typing import Callable from typing import Dict @@ -453,6 +452,45 @@ def create_transform(self, type, args, yaml_create_transform): return self._transform_factories[type](yaml_create_transform, **args) +class SqlBackedProvider(Provider): + def __init__( + self, + transforms: Mapping[str, Callable[..., beam.PTransform]], + sql_provider: Optional[Provider] = None): + self._transforms = transforms + if sql_provider is None: + sql_provider = beam_jar( + urns={'Sql': 'beam:external:java:sql:v1'}, + gradle_target='sdks:java:extensions:sql:expansion-service:shadowJar') + self._sql_provider = sql_provider + + def sql_provider(self): + return self._sql_provider + + def provided_transforms(self): + return self._transforms.keys() + + def available(self): + return self.sql_provider().available() + + def cache_artifacts(self): + return self.sql_provider().cache_artifacts() + + def underlying_provider(self): + return self.sql_provider() + + def to_json(self): + return {'type': "SqlBackedProvider"} + + def create_transform( + self, typ: str, args: Mapping[str, Any], + yaml_create_transform: Any) -> beam.PTransform: + return self._transforms[typ]( + lambda query: self.sql_provider().create_transform( + 'Sql', {'query': query}, yaml_create_transform), + **args) + + PRIMITIVE_NAMES_TO_ATOMIC_TYPE = { py_type.__name__: schema_type for (py_type, schema_type) in schemas.PRIMITIVE_TO_ATOMIC_TYPE.items() @@ -460,6 +498,13 @@ def create_transform(self, type, args, yaml_create_transform): } +def element_to_rows(e): + if isinstance(e, dict): + return dicts_to_rows(e) + else: + return beam.Row(element=dicts_to_rows(e)) + + def dicts_to_rows(o): if isinstance(o, dict): return beam.Row(**{k: dicts_to_rows(v) for k, v in o.items()}) @@ -487,47 +532,7 @@ def create(elements: Iterable[Any], reshuffle: bool = True): reshuffle (optional): Whether to introduce a reshuffle if there is more than one element in the collection. Defaults to True. """ - return beam.Create(dicts_to_rows(elements), reshuffle) - - def with_schema(**args): - # TODO: This is preliminary. 
- def parse_type(spec): - if spec in PRIMITIVE_NAMES_TO_ATOMIC_TYPE: - return schema_pb2.FieldType( - atomic_type=PRIMITIVE_NAMES_TO_ATOMIC_TYPE[spec]) - elif isinstance(spec, list): - if len(spec) != 1: - raise ValueError("Use single-element lists to denote list types.") - else: - return schema_pb2.FieldType( - iterable_type=schema_pb2.IterableType( - element_type=parse_type(spec[0]))) - elif isinstance(spec, dict): - return schema_pb2.FieldType( - iterable_type=schema_pb2.RowType(schema=parse_schema(spec[0]))) - else: - raise ValueError("Unknown schema type: {spec}") - - def parse_schema(spec): - return schema_pb2.Schema( - fields=[ - schema_pb2.Field(name=key, type=parse_type(value), id=ix) - for (ix, (key, value)) in enumerate(spec.items()) - ], - id=str(uuid.uuid4())) - - named_tuple = schemas.named_tuple_from_schema(parse_schema(args)) - names = list(args.keys()) - - def extract_field(x, name): - if isinstance(x, dict): - return x[name] - else: - return getattr(x, name) - - return 'WithSchema(%s)' % ', '.join(names) >> beam.Map( - lambda x: named_tuple(*[extract_field(x, name) for name in names]) - ).with_output_types(named_tuple) + return beam.Create([element_to_rows(e) for e in elements], reshuffle) # Or should this be posargs, args? # pylint: disable=dangerous-default-value @@ -589,7 +594,6 @@ def log_and_return(x): 'Create': create, 'LogForTesting': lambda: beam.Map(log_and_return), 'PyTransform': fully_qualified_named_transform, - 'WithSchemaExperimental': with_schema, 'Flatten': Flatten, 'WindowInto': WindowInto, }, @@ -684,7 +688,7 @@ def __exit__(self, *args): @ExternalProvider.register_provider_type('renaming') class RenamingProvider(Provider): - def __init__(self, transforms, mappings, underlying_provider): + def __init__(self, transforms, mappings, underlying_provider, defaults=None): if isinstance(underlying_provider, dict): underlying_provider = ExternalProvider.provider_from_spec( underlying_provider) @@ -693,7 +697,24 @@ def __init__(self, transforms, mappings, underlying_provider): for transform in transforms.keys(): if transform not in mappings: raise ValueError(f'Missing transform {transform} in mappings.') - self._mappings = mappings + self._mappings = self.expand_mappings(mappings) + self._defaults = defaults or {} + + @staticmethod + def expand_mappings(mappings): + if not isinstance(mappings, dict): + raise ValueError( + "RenamingProvider mappings must be dict of transform " + "mappings.") + for key, value in mappings.items(): + if isinstance(value, str): + if value not in mappings.keys(): + raise ValueError( + "RenamingProvider transform mappings must be dict or " + "specify transform that has mappings within same " + "provider.") + mappings[key] = mappings[value] + return mappings def available(self) -> bool: return self._underlying_provider.available() @@ -731,6 +752,9 @@ def create_transform( mappings.get(key, key): value for key, value in args.items() } + for key, value in self._defaults.get(typ, {}).items(): + if key not in remapped_args: + remapped_args[key] = value return self._underlying_provider.create_transform( self._transforms[typ], remapped_args, yaml_create_transform) @@ -741,6 +765,9 @@ def _affinity(self, other): def underlying_provider(self): return self._underlying_provider.underlying_provider() + def cache_artifacts(self): + self._underlying_provider.cache_artifacts() + def parse_providers(provider_specs): providers = collections.defaultdict(list) @@ -770,6 +797,7 @@ def merge_providers(*provider_sets): def standard_providers(): + from 
apache_beam.yaml.yaml_combine import create_combine_providers from apache_beam.yaml.yaml_mapping import create_mapping_providers from apache_beam.yaml.yaml_io import io_providers with open(os.path.join(os.path.dirname(__file__), @@ -779,6 +807,7 @@ def standard_providers(): return merge_providers( create_builtin_provider(), create_mapping_providers(), + create_combine_providers(), io_providers(), parse_providers(standard_providers)) diff --git a/sdks/python/apache_beam/yaml/yaml_transform.py b/sdks/python/apache_beam/yaml/yaml_transform.py index fa30c1830809b..ff5547db034c1 100644 --- a/sdks/python/apache_beam/yaml/yaml_transform.py +++ b/sdks/python/apache_beam/yaml/yaml_transform.py @@ -32,8 +32,10 @@ from yaml.loader import SafeLoader import apache_beam as beam +from apache_beam.options.pipeline_options import GoogleCloudOptions from apache_beam.transforms.fully_qualified_named_transform import FullyQualifiedNamedTransform from apache_beam.yaml import yaml_provider +from apache_beam.yaml.yaml_combine import normalize_combine __all__ = ["YamlTransform"] @@ -225,6 +227,8 @@ def get_pcollection(self, name): outputs = self.get_outputs(transform) if output in outputs: return outputs[output] + elif len(outputs) == 1 and outputs[next(iter(outputs))].tag == output: + return outputs[next(iter(outputs))] else: raise ValueError( f'Unknown output {repr(output)} ' @@ -522,7 +526,7 @@ def is_not_output_of_last_transform(new_transforms, value): raise TypeError( f"Chain at {identify_object(spec)} missing transforms property.") has_explicit_outputs = 'output' in spec - composite_spec = normalize_inputs_outputs(spec) + composite_spec = normalize_inputs_outputs(tag_explicit_inputs(spec)) new_transforms = [] for ix, transform in enumerate(composite_spec['transforms']): if any(io in transform for io in ('input', 'output')): @@ -539,6 +543,8 @@ def is_not_output_of_last_transform(new_transforms, value): pass elif is_explicitly_empty(composite_spec['input']): transform['input'] = composite_spec['input'] + elif is_empty(composite_spec['input']): + del composite_spec['input'] else: transform['input'] = { key: key @@ -883,7 +889,7 @@ def ensure_transforms_have_providers(spec): return spec def preprocess_langauges(spec): - if spec['type'] in ('Filter', 'MapToFields'): + if spec['type'] in ('Filter', 'MapToFields', 'Combine'): language = spec.get('config', {}).get('language', 'generic') new_type = spec['type'] + '-' + language if known_transforms and new_type not in known_transforms: @@ -898,6 +904,7 @@ def preprocess_langauges(spec): for phase in [ ensure_transforms_have_types, + normalize_combine, preprocess_langauges, ensure_transforms_have_providers, preprocess_source_sink, @@ -931,24 +938,43 @@ def __init__(self, spec, providers={}): # pylint: disable=dangerous-default-val self._providers = yaml_provider.merge_providers( providers, yaml_provider.standard_providers()) self._spec = preprocess(spec, known_transforms=self._providers.keys()) + self._was_chain = spec['type'] == 'chain' def expand(self, pcolls): if isinstance(pcolls, beam.pvalue.PBegin): root = pcolls + pipeline = root.pipeline pcolls = {} elif isinstance(pcolls, beam.PCollection): root = pcolls.pipeline + pipeline = root pcolls = {'input': pcolls} + if not self._spec['input']: + self._spec['input'] = {'input': 'input'} + if self._was_chain and self._spec['transforms']: + # This should have been copied as part of the composite-to-chain. 
+ self._spec['transforms'][0]['input'] = self._spec['input'] else: root = next(iter(pcolls.values())).pipeline + pipeline = root + if not self._spec['input']: + self._spec['input'] = {name: name for name in pcolls.keys()} + python_provider = yaml_provider.InlineProvider({}) + + options = pipeline.options.view_as(GoogleCloudOptions) + options.labels = ["yaml=true"] + result = expand_transform( self._spec, Scope( root, pcolls, - transforms=[], + transforms=[self._spec], providers=self._providers, - input_providers={})) + input_providers={ + pcoll: python_provider + for pcoll in pcolls.values() + })) if len(result) == 1: return only_element(result.values()) else: diff --git a/sdks/python/apache_beam/yaml/yaml_transform_test.py b/sdks/python/apache_beam/yaml/yaml_transform_test.py index 63f2e0e7facd0..ce608578b6002 100644 --- a/sdks/python/apache_beam/yaml/yaml_transform_test.py +++ b/sdks/python/apache_beam/yaml/yaml_transform_test.py @@ -43,6 +43,16 @@ def expand(self, p): | beam.Map(lambda x: beam.transforms.window.TimestampedValue(x, x))) +class CreateInts(beam.PTransform): + _yaml_requires_inputs = False + + def __init__(self, elements): + self._elements = elements + + def expand(self, p): + return p | beam.Create(self._elements) + + class SumGlobally(beam.PTransform): def expand(self, pcoll): return pcoll | beam.CombineGlobally(sum).without_defaults() @@ -54,18 +64,18 @@ def __init__(self, limit, error_handling): self._error_handling = error_handling def expand(self, pcoll): - def raise_on_big(element): - if len(element) > self._limit: - raise ValueError(element) + def raise_on_big(row): + if len(row.element) > self._limit: + raise ValueError(row.element) else: - return element + return row.element good, bad = pcoll | beam.Map(raise_on_big).with_exception_handling() return {'small_elements': good, self._error_handling['output']: bad} TEST_PROVIDERS = { - 'CreateInts': lambda elements: beam.Create(elements), + 'CreateInts': CreateInts, 'CreateTimestamped': CreateTimestamped, 'SumGlobally': SumGlobally, 'SizeLimiter': SizeLimiter, @@ -201,7 +211,7 @@ def test_implicit_flatten(self): - type: PyMap input: [CreateBig, CreateSmall] config: - fn: "lambda x: x * x" + fn: "lambda x: x.element * x.element" output: PyMap ''', providers=TEST_PROVIDERS) @@ -263,7 +273,7 @@ def test_name_is_not_ambiguous(self): - type: PyMap name: PyMap config: - fn: "lambda elem: elem * elem" + fn: "lambda row: row.element * row.element" input: Create output: PyMap ''', @@ -421,11 +431,14 @@ def test_mapping_errors(self): - type: Create config: elements: [0, 1, 2, 4] - - type: PyMap + - type: MapToFields name: ToRow input: Create config: - fn: "lambda x: beam.Row(num=x, str='a' * x or 'bbb')" + language: python + fields: + num: element + str: "'a' * element or 'bbb'" - type: Filter input: ToRow config: @@ -585,7 +598,8 @@ class AnnotatingProvider(yaml_provider.InlineProvider): """ def __init__(self, name, transform_names): super().__init__({ - transform_name: lambda: beam.Map(lambda x: (x or ()) + (name, )) + transform_name: + lambda: beam.Map(lambda x: (x if type(x) == tuple else ()) + (name, )) for transform_name in transform_names.strip().split() }) self._name = name @@ -718,7 +732,7 @@ def __init__(self, a, b): def expand(self, pcoll): a = self._a b = self._b - return pcoll | beam.Map(lambda x: a * x + b) + return pcoll | beam.Map(lambda x: a * x.element + b) if __name__ == '__main__': diff --git a/sdks/python/apache_beam/yaml/yaml_transform_unit_test.py b/sdks/python/apache_beam/yaml/yaml_transform_unit_test.py 
index 5d5e5850fd73d..d1886ba4dcfbc 100644 --- a/sdks/python/apache_beam/yaml/yaml_transform_unit_test.py +++ b/sdks/python/apache_beam/yaml/yaml_transform_unit_test.py @@ -244,12 +244,10 @@ def test_chain_as_composite(self): expected = f''' type: composite name: Chain - input: {{}} transforms: - type: Create config: elements: [0,1,2] - input: {{}} - type: PyMap config: fn: 'lambda x: x*x' diff --git a/sdks/python/apache_beam/yaml/yaml_udf_test.py b/sdks/python/apache_beam/yaml/yaml_udf_test.py index 5e9faa08253cd..5f5ee1147ded4 100644 --- a/sdks/python/apache_beam/yaml/yaml_udf_test.py +++ b/sdks/python/apache_beam/yaml/yaml_udf_test.py @@ -25,20 +25,27 @@ from apache_beam.options import pipeline_options from apache_beam.testing.util import assert_that from apache_beam.testing.util import equal_to +from apache_beam.yaml.yaml_mapping import py_value_to_js_dict +from apache_beam.yaml.yaml_provider import dicts_to_rows from apache_beam.yaml.yaml_transform import YamlTransform def AsRows(): - return beam.Map(lambda named_tuple: beam.Row(**named_tuple._asdict())) + return beam.Map( + lambda named_tuple: dicts_to_rows(py_value_to_js_dict(named_tuple))) class YamlUDFMappingTest(unittest.TestCase): def __init__(self, method_name='runYamlMappingTest'): super().__init__(method_name) self.data = [ - beam.Row(label='11a', conductor=11, rank=0), - beam.Row(label='37a', conductor=37, rank=1), - beam.Row(label='389a', conductor=389, rank=2), + beam.Row( + label='11a', conductor=11, row=beam.Row(rank=0, values=[1, 2, 3])), + beam.Row( + label='37a', conductor=37, row=beam.Row(rank=1, values=[4, 5, 6])), + beam.Row( + label='389a', conductor=389, row=beam.Row(rank=2, values=[7, 8, + 9])), ] def setUp(self): @@ -50,26 +57,47 @@ def tearDown(self): def test_map_to_fields_filter_inline_js(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( - pickle_library='cloudpickle')) as p: + pickle_library='cloudpickle', yaml_experimental_features=['javascript' + ])) as p: elements = p | beam.Create(self.data) result = elements | YamlTransform( ''' type: MapToFields - input: input config: language: javascript fields: label: - callable: "function label_map(x) {return x.label + 'x'}" + callable: | + function label_map(x) { + return x.label + 'x' + } conductor: - callable: "function conductor_map(x) {return x.conductor + 1}" + callable: | + function conductor_map(x) { + return x.conductor + 1 + } + row: + callable: | + function row_map(x) { + x.row.values.push(x.row.rank + 10) + return x.row + } ''') assert_that( result, equal_to([ - beam.Row(label='11ax', conductor=12), - beam.Row(label='37ax', conductor=38), - beam.Row(label='389ax', conductor=390), + beam.Row( + label='11ax', + conductor=12, + row=beam.Row(rank=0, values=[1, 2, 3, 10])), + beam.Row( + label='37ax', + conductor=38, + row=beam.Row(rank=1, values=[4, 5, 6, 11])), + beam.Row( + label='389ax', + conductor=390, + row=beam.Row(rank=2, values=[7, 8, 9, 12])), ])) def test_map_to_fields_filter_inline_py(self): @@ -79,7 +107,6 @@ def test_map_to_fields_filter_inline_py(self): result = elements | YamlTransform( ''' type: MapToFields - input: input config: language: python fields: @@ -87,33 +114,44 @@ def test_map_to_fields_filter_inline_py(self): callable: "lambda x: x.label + 'x'" conductor: callable: "lambda x: x.conductor + 1" + sum: + callable: "lambda x: sum(x.row.values)" ''') assert_that( result, equal_to([ - beam.Row(label='11ax', conductor=12), - beam.Row(label='37ax', conductor=38), - beam.Row(label='389ax', conductor=390), + 
beam.Row(label='11ax', conductor=12, sum=6), + beam.Row(label='37ax', conductor=38, sum=15), + beam.Row(label='389ax', conductor=390, sum=24), ])) def test_filter_inline_js(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( - pickle_library='cloudpickle')) as p: + pickle_library='cloudpickle', yaml_experimental_features=['javascript' + ])) as p: elements = p | beam.Create(self.data) result = elements | YamlTransform( ''' type: Filter - input: input config: language: javascript keep: - callable: "function filter(x) {return x.rank > 0}" + callable: | + function filter(x) { + return x.row.rank > 0 + } ''') assert_that( result | AsRows(), equal_to([ - beam.Row(label='37a', conductor=37, rank=1), - beam.Row(label='389a', conductor=389, rank=2), + beam.Row( + label='37a', + conductor=37, + row=beam.Row(rank=1, values=[4, 5, 6])), + beam.Row( + label='389a', + conductor=389, + row=beam.Row(rank=2, values=[7, 8, 9])), ])) def test_filter_inline_py(self): @@ -123,36 +161,44 @@ def test_filter_inline_py(self): result = elements | YamlTransform( ''' type: Filter - input: input config: language: python keep: - callable: "lambda x: x.rank > 0" + callable: "lambda x: x.row.rank > 0" ''') assert_that( result | AsRows(), equal_to([ - beam.Row(label='37a', conductor=37, rank=1), - beam.Row(label='389a', conductor=389, rank=2), + beam.Row( + label='37a', + conductor=37, + row=beam.Row(rank=1, values=[4, 5, 6])), + beam.Row( + label='389a', + conductor=389, + row=beam.Row(rank=2, values=[7, 8, 9])), ])) def test_filter_expression_js(self): with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( - pickle_library='cloudpickle')) as p: + pickle_library='cloudpickle', yaml_experimental_features=['javascript' + ])) as p: elements = p | beam.Create(self.data) result = elements | YamlTransform( ''' type: Filter - input: input config: language: javascript keep: - expression: "label.toUpperCase().indexOf('3') == -1 && conductor" + expression: "label.toUpperCase().indexOf('3') == -1 && row.rank < 1" ''') assert_that( result | AsRows(), equal_to([ - beam.Row(label='11a', conductor=11, rank=0), + beam.Row( + label='11a', + conductor=11, + row=beam.Row(rank=0, values=[1, 2, 3])), ])) def test_filter_expression_py(self): @@ -162,7 +208,6 @@ def test_filter_expression_py(self): result = elements | YamlTransform( ''' type: Filter - input: input config: language: python keep: @@ -171,17 +216,20 @@ def test_filter_expression_py(self): assert_that( result | AsRows(), equal_to([ - beam.Row(label='11a', conductor=11, rank=0), + beam.Row( + label='11a', + conductor=11, + row=beam.Row(rank=0, values=[1, 2, 3])), ])) def test_filter_inline_js_file(self): data = ''' function f(x) { - return x.rank > 0 + return x.row.rank > 0 } function g(x) { - return x.rank > 1 + return x.row.rank > 1 } '''.replace(' ', '') @@ -189,12 +237,12 @@ def test_filter_inline_js_file(self): self.fs.create(path).write(data.encode('utf8')) with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( - pickle_library='cloudpickle')) as p: + pickle_library='cloudpickle', yaml_experimental_features=['javascript' + ])) as p: elements = p | beam.Create(self.data) result = elements | YamlTransform( f''' type: Filter - input: input config: language: javascript keep: @@ -204,17 +252,23 @@ def test_filter_inline_js_file(self): assert_that( result | AsRows(), equal_to([ - beam.Row(label='37a', conductor=37, rank=1), - beam.Row(label='389a', conductor=389, rank=2), + beam.Row( + label='37a', + conductor=37, + 
row=beam.Row(rank=1, values=[4, 5, 6])), + beam.Row( + label='389a', + conductor=389, + row=beam.Row(rank=2, values=[7, 8, 9])), ])) def test_filter_inline_py_file(self): data = ''' def f(x): - return x.rank > 0 + return x.row.rank > 0 def g(x): - return x.rank > 1 + return x.row.rank > 1 '''.replace(' ', '') path = os.path.join(self.tmpdir, 'udf.py') @@ -226,7 +280,6 @@ def g(x): result = elements | YamlTransform( f''' type: Filter - input: input config: language: python keep: @@ -236,8 +289,14 @@ def g(x): assert_that( result | AsRows(), equal_to([ - beam.Row(label='37a', conductor=37, rank=1), - beam.Row(label='389a', conductor=389, rank=2), + beam.Row( + label='37a', + conductor=37, + row=beam.Row(rank=1, values=[4, 5, 6])), + beam.Row( + label='389a', + conductor=389, + row=beam.Row(rank=2, values=[7, 8, 9])), ])) diff --git a/sdks/python/build.gradle b/sdks/python/build.gradle index 762bed268d63f..7795e77e39634 100644 --- a/sdks/python/build.gradle +++ b/sdks/python/build.gradle @@ -30,7 +30,8 @@ def buildPython = tasks.register("buildPython") { logger.info('Building Python Dependencies') exec { executable 'sh' - args '-c', ". ${envdir}/bin/activate && python setup.py build --build-base ${buildDir}" + // args '-c', ". ${envdir}/bin/activate && python setup.py build --build-base ${buildDir}" + args '-c', ". ${envdir}/bin/activate && pip install -e ." } } } @@ -46,7 +47,7 @@ def sdist = tasks.register("sdist") { // Build artifact exec { executable 'sh' - args '-c', ". ${envdir}/bin/activate && python setup.py -q sdist --formats zip,gztar --dist-dir ${buildDir}" + args '-c', ". ${envdir}/bin/activate && pip install -U build && python -m build --sdist --outdir=${buildDir}" } def collection = fileTree(buildDir){ include "**/*${project.sdk_version}*.tar.gz" exclude 'srcs/**'} @@ -96,7 +97,6 @@ platform_identifiers_map.each { platform, idsuffix -> exec { environment CIBW_BUILD: "cp${pyversion}-${idsuffix}" environment CIBW_ENVIRONMENT: "SETUPTOOLS_USE_DISTUTILS=stdlib" - environment CIBW_BEFORE_BUILD: "pip install cython==0.29.36 numpy --config-settings=setup-args='-Dallow-noblas=true' && pip install --upgrade setuptools" // note: sync cibuildwheel version with GitHub Action // .github/workflow/build_wheel.yml:build_wheels "Install cibuildwheel" step executable 'sh' @@ -110,6 +110,7 @@ platform_identifiers_map.each { platform, idsuffix -> } } + /*************************************************************************************************/ // Non-testing builds and analysis tasks diff --git a/sdks/python/container/Dockerfile b/sdks/python/container/Dockerfile index 73d83343e0334..a49933ee6604f 100644 --- a/sdks/python/container/Dockerfile +++ b/sdks/python/container/Dockerfile @@ -45,7 +45,7 @@ RUN \ && \ rm -rf /var/lib/apt/lists/* && \ - pip install --upgrade setuptools && \ + pip install --upgrade pip setuptools wheel && \ # Install required packages for Beam Python SDK and common dependencies used by users. # use --no-deps to ensure the list includes all transitive dependencies. diff --git a/sdks/python/container/base_image_requirements_manual.txt b/sdks/python/container/base_image_requirements_manual.txt index e952b2126604c..f2f3ea44b44c7 100644 --- a/sdks/python/container/base_image_requirements_manual.txt +++ b/sdks/python/container/base_image_requirements_manual.txt @@ -43,3 +43,4 @@ nose==1.3.7 # For Dataflow internal testing. TODO: remove this. python-snappy;python_version<"3.11" # Optimizes execution of some Beam codepaths. 
scipy scikit-learn +build>=1.0,<2 # tool to build sdist from setup.py in stager. \ No newline at end of file diff --git a/sdks/python/container/build.gradle b/sdks/python/container/build.gradle index 06b1ea918c7f8..161d343b303ab 100644 --- a/sdks/python/container/build.gradle +++ b/sdks/python/container/build.gradle @@ -25,7 +25,7 @@ int max_python_version=11 configurations { sdkSourceTarball - sdkHarnessLauncher + pythonHarnessLauncher } dependencies { @@ -82,5 +82,5 @@ tasks.register("generatePythonRequirementsAll") { } artifacts { - sdkHarnessLauncher file: file('./build/target/launcher'), builtBy: goBuild + pythonHarnessLauncher file: file('./build/target/launcher'), builtBy: goBuild } diff --git a/sdks/python/container/common.gradle b/sdks/python/container/common.gradle index bb706aa5c5d83..4996dae52db57 100644 --- a/sdks/python/container/common.gradle +++ b/sdks/python/container/common.gradle @@ -22,12 +22,12 @@ description = "Apache Beam :: SDKs :: Python :: Container :: Python ${pythonVers configurations { sdkSourceTarball - sdkHarnessLauncher + pythonHarnessLauncher } dependencies { sdkSourceTarball project(path: ":sdks:python", configuration: "distTarBall") - sdkHarnessLauncher project(path: ":sdks:python:container", configuration: "sdkHarnessLauncher") + pythonHarnessLauncher project(path: ":sdks:python:container", configuration: "pythonHarnessLauncher") } def generatePythonRequirements = tasks.register("generatePythonRequirements") { @@ -59,9 +59,13 @@ def copyLicenseScripts = tasks.register("copyLicenseScripts", Copy){ } def copyLauncherDependencies = tasks.register("copyLauncherDependencies", Copy) { - from configurations.sdkHarnessLauncher + from configurations.pythonHarnessLauncher into "build/target/launcher" - if(configurations.sdkHarnessLauncher.isEmpty()) { + + // Avoid seemingly gradle bug stated in https://github.com/apache/beam/issues/29220 + mustRunAfter "copyLicenses" + + if(configurations.pythonHarnessLauncher.isEmpty()) { throw new StopExecutionException(); } } @@ -93,7 +97,7 @@ dockerPrepare.dependsOn copyLauncherDependencies dockerPrepare.dependsOn copyDockerfileDependencies dockerPrepare.dependsOn copyLicenseScripts -if (project.rootProject.hasProperty(["docker-pull-licenses"])) { +if (project.rootProject.hasProperty("docker-pull-licenses")) { def copyGolangLicenses = tasks.register("copyGolangLicenses", Copy) { from "${project(':release:go-licenses:py').buildDir}/output" into "build/target/go-licenses" diff --git a/sdks/python/container/py310/base_image_requirements.txt b/sdks/python/container/py310/base_image_requirements.txt index e8a666e7b9d07..a9f94104374e0 100644 --- a/sdks/python/container/py310/base_image_requirements.txt +++ b/sdks/python/container/py310/base_image_requirements.txt @@ -24,10 +24,11 @@ attrs==23.1.0 beautifulsoup4==4.12.2 bs4==0.0.1 +build==1.0.3 cachetools==5.3.1 certifi==2023.7.22 -cffi==1.15.1 -charset-normalizer==3.2.0 +cffi==1.16.0 +charset-normalizer==3.3.0 click==8.1.7 cloudpickle==2.2.1 crcmod==1.7 @@ -40,17 +41,17 @@ docker==6.1.3 docopt==0.6.2 exceptiongroup==1.1.3 execnet==2.0.2 -fastavro==1.8.3 +fastavro==1.8.4 fasteners==0.19 freezegun==1.2.2 future==0.18.3 -google-api-core==2.11.1 -google-api-python-client==2.100.0 +google-api-core==2.12.0 +google-api-python-client==2.104.0 google-apitools==0.5.31 -google-auth==2.23.0 +google-auth==2.23.3 google-auth-httplib2==0.1.1 -google-cloud-aiplatform==1.33.1 -google-cloud-bigquery==3.11.4 +google-cloud-aiplatform==1.35.0 +google-cloud-bigquery==3.12.0 
google-cloud-bigquery-storage==2.22.0 google-cloud-bigtable==2.21.0 google-cloud-core==2.3.3 @@ -63,24 +64,26 @@ google-cloud-pubsublite==1.8.3 google-cloud-recommendations-ai==0.10.5 google-cloud-resource-manager==1.10.4 google-cloud-spanner==3.40.1 -google-cloud-storage==2.11.0 +google-cloud-storage==2.12.0 google-cloud-videointelligence==2.11.4 -google-cloud-vision==3.4.4 +google-cloud-vision==3.4.5 google-crc32c==1.5.0 google-resumable-media==2.6.0 -googleapis-common-protos==1.60.0 -greenlet==2.0.2 +googleapis-common-protos==1.61.0 +greenlet==3.0.0 grpc-google-iam-v1==0.12.6 -grpcio==1.58.0 -grpcio-status==1.58.0 -guppy3==3.1.3 -hdfs==2.7.2 +grpcio==1.59.0 +grpcio-status==1.59.0 +guppy3==3.1.4 +hdfs==2.7.3 httplib2==0.22.0 -hypothesis==6.87.0 +hypothesis==6.88.1 idna==3.4 iniconfig==2.0.0 joblib==1.3.2 Js2Py==0.74 +jsonschema==4.19.1 +jsonschema-specifications==2023.7.1 mmh3==4.0.1 mock==5.1.0 nltk==3.8.1 @@ -88,16 +91,17 @@ nose==1.3.7 numpy==1.24.4 oauth2client==4.1.3 objsize==0.6.1 -orjson==3.9.7 +orjson==3.9.9 overrides==6.5.0 -packaging==23.1 +packaging==23.2 pandas==1.5.3 parameterized==0.9.0 pluggy==1.3.0 proto-plus==1.22.3 -protobuf==4.24.3 -psycopg2-binary==2.9.7 +protobuf==4.25.0 +psycopg2-binary==2.9.9 pyarrow==11.0.0 +pyarrow-hotfix==0.4 pyasn1==0.5.0 pyasn1-modules==0.3.0 pycparser==2.21 @@ -107,20 +111,23 @@ pyjsparser==2.7.1 pymongo==4.5.0 PyMySQL==1.1.0 pyparsing==3.1.1 +pyproject_hooks==1.0.0 pytest==7.4.2 -pytest-timeout==2.1.0 +pytest-timeout==2.2.0 pytest-xdist==3.3.1 python-dateutil==2.8.2 python-snappy==0.6.1 pytz==2023.3.post1 PyYAML==6.0.1 -regex==2023.8.8 +referencing==0.30.2 +regex==2023.10.3 requests==2.31.0 requests-mock==1.11.0 +rpds-py==0.10.6 rsa==4.9 scikit-learn==1.3.1 -scipy==1.11.2 -Shapely==1.8.5.post1 +scipy==1.11.3 +shapely==2.0.2 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.5 @@ -132,9 +139,9 @@ threadpoolctl==3.2.0 tomli==2.0.1 tqdm==4.66.1 typing_extensions==4.8.0 -tzlocal==5.0.1 +tzlocal==5.1 uritemplate==4.1.1 -urllib3==1.26.16 -websocket-client==1.6.3 +urllib3==2.0.7 +websocket-client==1.6.4 wrapt==1.15.0 zstandard==0.21.0 diff --git a/sdks/python/container/py311/base_image_requirements.txt b/sdks/python/container/py311/base_image_requirements.txt index 2e5d834926bd2..865b856683a4d 100644 --- a/sdks/python/container/py311/base_image_requirements.txt +++ b/sdks/python/container/py311/base_image_requirements.txt @@ -24,10 +24,11 @@ attrs==23.1.0 beautifulsoup4==4.12.2 bs4==0.0.1 +build==1.0.3 cachetools==5.3.1 certifi==2023.7.22 -cffi==1.15.1 -charset-normalizer==3.2.0 +cffi==1.16.0 +charset-normalizer==3.3.0 click==8.1.7 cloudpickle==2.2.1 crcmod==1.7 @@ -39,16 +40,16 @@ dnspython==2.4.2 docker==6.1.3 docopt==0.6.2 execnet==2.0.2 -fastavro==1.8.3 +fastavro==1.8.4 fasteners==0.19 freezegun==1.2.2 future==0.18.3 -google-api-core==2.11.1 +google-api-core==2.12.0 google-apitools==0.5.31 -google-auth==2.23.0 +google-auth==2.23.3 google-auth-httplib2==0.1.1 -google-cloud-aiplatform==1.33.1 -google-cloud-bigquery==3.11.4 +google-cloud-aiplatform==1.35.0 +google-cloud-bigquery==3.12.0 google-cloud-bigquery-storage==2.22.0 google-cloud-bigtable==2.21.0 google-cloud-core==2.3.3 @@ -60,24 +61,26 @@ google-cloud-pubsublite==1.8.3 google-cloud-recommendations-ai==0.10.5 google-cloud-resource-manager==1.10.4 google-cloud-spanner==3.40.1 -google-cloud-storage==2.11.0 +google-cloud-storage==2.12.0 google-cloud-videointelligence==2.11.4 -google-cloud-vision==3.4.4 +google-cloud-vision==3.4.5 google-crc32c==1.5.0 google-resumable-media==2.6.0 
-googleapis-common-protos==1.60.0 -greenlet==2.0.2 +googleapis-common-protos==1.61.0 +greenlet==3.0.0 grpc-google-iam-v1==0.12.6 -grpcio==1.58.0 -grpcio-status==1.58.0 -guppy3==3.1.3 -hdfs==2.7.2 +grpcio==1.59.0 +grpcio-status==1.59.0 +guppy3==3.1.4 +hdfs==2.7.3 httplib2==0.22.0 -hypothesis==6.87.0 +hypothesis==6.88.1 idna==3.4 iniconfig==2.0.0 joblib==1.3.2 Js2Py==0.74 +jsonschema==4.19.1 +jsonschema-specifications==2023.7.1 mmh3==4.0.1 mock==5.1.0 nltk==3.8.1 @@ -85,16 +88,17 @@ nose==1.3.7 numpy==1.24.4 oauth2client==4.1.3 objsize==0.6.1 -orjson==3.9.7 +orjson==3.9.9 overrides==6.5.0 -packaging==23.1 +packaging==23.2 pandas==1.5.3 parameterized==0.9.0 pluggy==1.3.0 proto-plus==1.22.3 -protobuf==4.24.3 -psycopg2-binary==2.9.7 +protobuf==4.25.0 +psycopg2-binary==2.9.9 pyarrow==11.0.0 +pyarrow-hotfix==0.4 pyasn1==0.5.0 pyasn1-modules==0.3.0 pycparser==2.21 @@ -104,19 +108,22 @@ pyjsparser==2.7.1 pymongo==4.5.0 PyMySQL==1.1.0 pyparsing==3.1.1 +pyproject_hooks==1.0.0 pytest==7.4.2 -pytest-timeout==2.1.0 +pytest-timeout==2.2.0 pytest-xdist==3.3.1 python-dateutil==2.8.2 pytz==2023.3.post1 PyYAML==6.0.1 -regex==2023.8.8 +referencing==0.30.2 +regex==2023.10.3 requests==2.31.0 requests-mock==1.11.0 +rpds-py==0.10.6 rsa==4.9 scikit-learn==1.3.1 -scipy==1.11.2 -Shapely==1.8.5.post1 +scipy==1.11.3 +shapely==2.0.2 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.5 @@ -127,8 +134,8 @@ testcontainers==3.7.1 threadpoolctl==3.2.0 tqdm==4.66.1 typing_extensions==4.8.0 -tzlocal==5.0.1 -urllib3==1.26.16 -websocket-client==1.6.3 +tzlocal==5.1 +urllib3==2.0.7 +websocket-client==1.6.4 wrapt==1.15.0 zstandard==0.21.0 diff --git a/sdks/python/container/py38/base_image_requirements.txt b/sdks/python/container/py38/base_image_requirements.txt index ed5d35fc64578..5dffff5f80d9a 100644 --- a/sdks/python/container/py38/base_image_requirements.txt +++ b/sdks/python/container/py38/base_image_requirements.txt @@ -25,10 +25,11 @@ attrs==23.1.0 backports.zoneinfo==0.2.1 beautifulsoup4==4.12.2 bs4==0.0.1 +build==1.0.3 cachetools==5.3.1 certifi==2023.7.22 -cffi==1.15.1 -charset-normalizer==3.2.0 +cffi==1.16.0 +charset-normalizer==3.3.0 click==8.1.7 cloudpickle==2.2.1 crcmod==1.7 @@ -41,17 +42,17 @@ docker==6.1.3 docopt==0.6.2 exceptiongroup==1.1.3 execnet==2.0.2 -fastavro==1.8.3 +fastavro==1.8.4 fasteners==0.19 freezegun==1.2.2 future==0.18.3 -google-api-core==2.11.1 -google-api-python-client==2.100.0 +google-api-core==2.12.0 +google-api-python-client==2.104.0 google-apitools==0.5.31 -google-auth==2.23.0 +google-auth==2.23.3 google-auth-httplib2==0.1.1 -google-cloud-aiplatform==1.33.1 -google-cloud-bigquery==3.11.4 +google-cloud-aiplatform==1.35.0 +google-cloud-bigquery==3.12.0 google-cloud-bigquery-storage==2.22.0 google-cloud-bigtable==2.21.0 google-cloud-core==2.3.3 @@ -64,24 +65,28 @@ google-cloud-pubsublite==1.8.3 google-cloud-recommendations-ai==0.10.5 google-cloud-resource-manager==1.10.4 google-cloud-spanner==3.40.1 -google-cloud-storage==2.11.0 +google-cloud-storage==2.12.0 google-cloud-videointelligence==2.11.4 -google-cloud-vision==3.4.4 +google-cloud-vision==3.4.5 google-crc32c==1.5.0 google-resumable-media==2.6.0 -googleapis-common-protos==1.60.0 -greenlet==2.0.2 +googleapis-common-protos==1.61.0 +greenlet==3.0.0 grpc-google-iam-v1==0.12.6 -grpcio==1.58.0 -grpcio-status==1.58.0 -guppy3==3.1.3 -hdfs==2.7.2 +grpcio==1.59.0 +grpcio-status==1.59.0 +guppy3==3.1.4 +hdfs==2.7.3 httplib2==0.22.0 -hypothesis==6.87.0 +hypothesis==6.88.1 idna==3.4 +importlib-metadata==6.8.0 +importlib-resources==6.1.0 iniconfig==2.0.0 
joblib==1.3.2 Js2Py==0.74 +jsonschema==4.19.1 +jsonschema-specifications==2023.7.1 mmh3==4.0.1 mock==5.1.0 nltk==3.8.1 @@ -89,16 +94,18 @@ nose==1.3.7 numpy==1.24.4 oauth2client==4.1.3 objsize==0.6.1 -orjson==3.9.7 +orjson==3.9.9 overrides==6.5.0 -packaging==23.1 +packaging==23.2 pandas==1.5.3 parameterized==0.9.0 +pkgutil_resolve_name==1.3.10 pluggy==1.3.0 proto-plus==1.22.3 -protobuf==4.24.3 -psycopg2-binary==2.9.7 +protobuf==4.25.0 +psycopg2-binary==2.9.9 pyarrow==11.0.0 +pyarrow-hotfix==0.4 pyasn1==0.5.0 pyasn1-modules==0.3.0 pycparser==2.21 @@ -108,20 +115,23 @@ pyjsparser==2.7.1 pymongo==4.5.0 PyMySQL==1.1.0 pyparsing==3.1.1 +pyproject_hooks==1.0.0 pytest==7.4.2 -pytest-timeout==2.1.0 +pytest-timeout==2.2.0 pytest-xdist==3.3.1 python-dateutil==2.8.2 python-snappy==0.6.1 pytz==2023.3.post1 PyYAML==6.0.1 -regex==2023.8.8 +referencing==0.30.2 +regex==2023.10.3 requests==2.31.0 requests-mock==1.11.0 +rpds-py==0.10.6 rsa==4.9 scikit-learn==1.3.1 scipy==1.10.1 -Shapely==1.8.5.post1 +shapely==2.0.2 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.5 @@ -133,9 +143,10 @@ threadpoolctl==3.2.0 tomli==2.0.1 tqdm==4.66.1 typing_extensions==4.8.0 -tzlocal==5.0.1 +tzlocal==5.1 uritemplate==4.1.1 -urllib3==1.26.17 -websocket-client==1.6.3 +urllib3==2.0.7 +websocket-client==1.6.4 wrapt==1.15.0 +zipp==3.17.0 zstandard==0.21.0 diff --git a/sdks/python/container/py39/base_image_requirements.txt b/sdks/python/container/py39/base_image_requirements.txt index ff6ba0945e14c..1b8ad7a2e748f 100644 --- a/sdks/python/container/py39/base_image_requirements.txt +++ b/sdks/python/container/py39/base_image_requirements.txt @@ -24,10 +24,11 @@ attrs==23.1.0 beautifulsoup4==4.12.2 bs4==0.0.1 +build==1.0.3 cachetools==5.3.1 certifi==2023.7.22 -cffi==1.15.1 -charset-normalizer==3.2.0 +cffi==1.16.0 +charset-normalizer==3.3.0 click==8.1.7 cloudpickle==2.2.1 crcmod==1.7 @@ -40,17 +41,17 @@ docker==6.1.3 docopt==0.6.2 exceptiongroup==1.1.3 execnet==2.0.2 -fastavro==1.8.3 +fastavro==1.8.4 fasteners==0.19 freezegun==1.2.2 future==0.18.3 -google-api-core==2.11.1 -google-api-python-client==2.100.0 +google-api-core==2.12.0 +google-api-python-client==2.104.0 google-apitools==0.5.31 -google-auth==2.23.0 +google-auth==2.23.3 google-auth-httplib2==0.1.1 -google-cloud-aiplatform==1.33.1 -google-cloud-bigquery==3.11.4 +google-cloud-aiplatform==1.35.0 +google-cloud-bigquery==3.12.0 google-cloud-bigquery-storage==2.22.0 google-cloud-bigtable==2.21.0 google-cloud-core==2.3.3 @@ -63,24 +64,27 @@ google-cloud-pubsublite==1.8.3 google-cloud-recommendations-ai==0.10.5 google-cloud-resource-manager==1.10.4 google-cloud-spanner==3.40.1 -google-cloud-storage==2.11.0 +google-cloud-storage==2.12.0 google-cloud-videointelligence==2.11.4 -google-cloud-vision==3.4.4 +google-cloud-vision==3.4.5 google-crc32c==1.5.0 google-resumable-media==2.6.0 -googleapis-common-protos==1.60.0 -greenlet==2.0.2 +googleapis-common-protos==1.61.0 +greenlet==3.0.0 grpc-google-iam-v1==0.12.6 -grpcio==1.58.0 -grpcio-status==1.58.0 -guppy3==3.1.3 -hdfs==2.7.2 +grpcio==1.59.0 +grpcio-status==1.59.0 +guppy3==3.1.4 +hdfs==2.7.3 httplib2==0.22.0 -hypothesis==6.87.0 +hypothesis==6.88.1 idna==3.4 +importlib-metadata==6.8.0 iniconfig==2.0.0 joblib==1.3.2 Js2Py==0.74 +jsonschema==4.19.1 +jsonschema-specifications==2023.7.1 mmh3==4.0.1 mock==5.1.0 nltk==3.8.1 @@ -88,16 +92,17 @@ nose==1.3.7 numpy==1.24.4 oauth2client==4.1.3 objsize==0.6.1 -orjson==3.9.7 +orjson==3.9.9 overrides==6.5.0 -packaging==23.1 +packaging==23.2 pandas==1.5.3 parameterized==0.9.0 pluggy==1.3.0 
proto-plus==1.22.3 -protobuf==4.24.3 -psycopg2-binary==2.9.7 +protobuf==4.25.0 +psycopg2-binary==2.9.9 pyarrow==11.0.0 +pyarrow-hotfix==0.4 pyasn1==0.5.0 pyasn1-modules==0.3.0 pycparser==2.21 @@ -107,20 +112,23 @@ pyjsparser==2.7.1 pymongo==4.5.0 PyMySQL==1.1.0 pyparsing==3.1.1 +pyproject_hooks==1.0.0 pytest==7.4.2 -pytest-timeout==2.1.0 +pytest-timeout==2.2.0 pytest-xdist==3.3.1 python-dateutil==2.8.2 python-snappy==0.6.1 pytz==2023.3.post1 PyYAML==6.0.1 -regex==2023.8.8 +referencing==0.30.2 +regex==2023.10.3 requests==2.31.0 requests-mock==1.11.0 +rpds-py==0.10.6 rsa==4.9 scikit-learn==1.3.1 -scipy==1.11.2 -Shapely==1.8.5.post1 +scipy==1.11.3 +shapely==2.0.2 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.5 @@ -132,9 +140,10 @@ threadpoolctl==3.2.0 tomli==2.0.1 tqdm==4.66.1 typing_extensions==4.8.0 -tzlocal==5.0.1 +tzlocal==5.1 uritemplate==4.1.1 -urllib3==1.26.16 -websocket-client==1.6.3 +urllib3==2.0.7 +websocket-client==1.6.4 wrapt==1.15.0 +zipp==3.17.0 zstandard==0.21.0 diff --git a/sdks/python/expansion-service-container/boot.go b/sdks/python/expansion-service-container/boot.go index 90a97c35425ad..ba56b349c4eab 100644 --- a/sdks/python/expansion-service-container/boot.go +++ b/sdks/python/expansion-service-container/boot.go @@ -18,8 +18,10 @@ package main import ( + "bufio" "flag" "fmt" + "io/ioutil" "log" "os" "path/filepath" @@ -31,16 +33,15 @@ import ( ) var ( - id = flag.String("id", "", "Local identifier (required)") - port = flag.Int("port", 0, "Port for the expansion service (required)") + id = flag.String("id", "", "Local identifier (required)") + port = flag.Int("port", 0, "Port for the expansion service (required)") + requirements_file = flag.String("requirements_file", "", "A requirement file with extra packages to be made available to the transforms being expanded. Path should be relative to the 'dependencies_dir'") + dependencies_dir = flag.String("dependencies_dir", "", "A directory that stores locally available extra packages.") ) const ( expansionServiceEntrypoint = "apache_beam.runners.portability.expansion_service_main" venvDirectory = "beam_venv" // This should match the venv directory name used in the Dockerfile. - requirementsFile = "requirements.txt" - beamSDKArtifact = "apache-beam-sdk.tar.gz" - beamSDKOptions = "[gcp,dataframe]" ) func main() { @@ -58,6 +59,79 @@ func main() { } } +func getLines(fileNameToRead string) ([]string, error) { + fileToRead, err := os.Open(fileNameToRead) + if err != nil { + return nil, err + } + defer fileToRead.Close() + + sc := bufio.NewScanner(fileToRead) + lines := make([]string, 0) + + // Read through 'tokens' until an EOF is encountered. + for sc.Scan() { + lines = append(lines, sc.Text()) + } + + if err := sc.Err(); err != nil { + return nil, err + } + return lines, nil +} + +func installExtraPackages(requirementsFile string) error { + extraPackages, err := getLines(requirementsFile) + if err != nil { + return err + } + + for _, extraPackage := range extraPackages { + log.Printf("Installing extra package %v", extraPackage) + // We expect 'pip' command in virtual env to be already available at the top of the PATH. 
+ args := []string{"install", extraPackage} + if err := execx.Execute("pip", args...); err != nil { + return fmt.Errorf("Could not install the package %s: %s", extraPackage, err) + } + } + return nil +} + +func getUpdatedRequirementsFile(oldRequirementsFileName string, dependenciesDir string) (string, error) { + oldExtraPackages, err := getLines(filepath.Join(dependenciesDir, oldRequirementsFileName)) + if err != nil { + return "", err + } + var updatedExtraPackages = make([]string, 0) + for _, extraPackage := range oldExtraPackages { + // TODO update + potentialLocalFilePath := filepath.Join(dependenciesDir, extraPackage) + _, err := os.Stat(potentialLocalFilePath) + if err == nil { + // Package exists locally so using that. + extraPackage = potentialLocalFilePath + log.Printf("Using locally available extra package %v", extraPackage) + } + updatedExtraPackages = append(updatedExtraPackages, extraPackage) + } + + updatedRequirementsFile, err := ioutil.TempFile("/opt/apache/beam", "requirements*.txt") + if err != nil { + return "", err + } + + updatedRequirementsFileName := updatedRequirementsFile.Name() + + datawriter := bufio.NewWriter(updatedRequirementsFile) + for _, extraPackage := range updatedExtraPackages { + _, _ = datawriter.WriteString(extraPackage + "\n") + } + datawriter.Flush() + updatedRequirementsFile.Close() + + return updatedRequirementsFileName, nil +} + func launchExpansionServiceProcess() error { pythonVersion, err := expansionx.GetPythonVersion() if err != nil { @@ -70,6 +144,24 @@ func launchExpansionServiceProcess() error { os.Setenv("PATH", strings.Join([]string{filepath.Join(dir, "bin"), os.Getenv("PATH")}, ":")) args := []string{"-m", expansionServiceEntrypoint, "-p", strconv.Itoa(*port), "--fully_qualified_name_glob", "*"} + + if *requirements_file != "" { + log.Printf("Received the requirements file %v", *requirements_file) + updatedRequirementsFileName, err := getUpdatedRequirementsFile(*requirements_file, *dependencies_dir) + if err != nil { + return err + } + defer os.Remove(updatedRequirementsFileName) + log.Printf("Updated requirements file is %v", updatedRequirementsFileName) + // Provide the requirements file to the expansion service so that packages get staged by runners. + args = append(args, "--requirements_file", updatedRequirementsFileName) + // Install packages locally so that they can be used by the expansion service during transform + // expansion if needed. + err = installExtraPackages(updatedRequirementsFileName) + if err != nil { + return err + } + } if err := execx.Execute(pythonVersion, args...); err != nil { return fmt.Errorf("could not start the expansion service: %s", err) } diff --git a/sdks/python/gen_protos.py b/sdks/python/gen_protos.py index 94d80c8d263b9..2b488af0afb5d 100644 --- a/sdks/python/gen_protos.py +++ b/sdks/python/gen_protos.py @@ -18,7 +18,7 @@ """ Generates Python proto modules and grpc stubs for Beam protos. """ - +import argparse import contextlib import glob import inspect @@ -27,9 +27,7 @@ import platform import re import shutil -import subprocess import sys -import time from collections import defaultdict from importlib import import_module @@ -60,7 +58,7 @@ NO_PROMISES_NOTICE = """ \"\"\" For internal use only; no backwards-compatibility guarantees. -Automatically generated when running setup.py sdist or build[_py]. +Automatically generated when running python -m build. 
\"\"\" """ @@ -321,43 +319,6 @@ def find_by_ext(root_dir, ext): if file.endswith(ext): yield clean_path(os.path.join(root, file)) - -def ensure_grpcio_exists(): - try: - from grpc_tools import protoc # pylint: disable=unused-import - except ImportError: - return _install_grpcio_tools() - - -def _install_grpcio_tools(): - """ - Though wheels are available for grpcio-tools, setup_requires uses - easy_install which doesn't understand them. This means that it is - compiled from scratch (which is expensive as it compiles the full - protoc compiler). Instead, we attempt to install a wheel in a temporary - directory and add it to the path as needed. - See https://github.com/pypa/setuptools/issues/377 - """ - install_path = os.path.join(PYTHON_SDK_ROOT, '.eggs', 'grpcio-wheels') - logging.warning('Installing grpcio-tools into %s', install_path) - start = time.time() - subprocess.check_call([ - sys.executable, - '-m', - 'pip', - 'install', - '--target', - install_path, - '--upgrade', - '-r', - os.path.join(PYTHON_SDK_ROOT, 'build-requirements.txt') - ]) - logging.warning( - 'Installing grpcio-tools took %0.2f seconds.', time.time() - start) - - return install_path - - def build_relative_import(root_path, import_path, start_file_path): tail_path = import_path.replace('.', os.path.sep) source_path = os.path.join(root_path, tail_path) @@ -511,33 +472,31 @@ def generate_proto_files(force=False): if not os.path.exists(PYTHON_OUTPUT_PATH): os.mkdir(PYTHON_OUTPUT_PATH) - grpcio_install_loc = ensure_grpcio_exists() protoc_gen_mypy = _find_protoc_gen_mypy() - with PythonPath(grpcio_install_loc): - from grpc_tools import protoc - builtin_protos = pkg_resources.resource_filename('grpc_tools', '_proto') - args = ( - [sys.executable] + # expecting to be called from command line - ['--proto_path=%s' % builtin_protos] + - ['--proto_path=%s' % d - for d in proto_dirs] + ['--python_out=%s' % PYTHON_OUTPUT_PATH] + - ['--plugin=protoc-gen-mypy=%s' % protoc_gen_mypy] + - # new version of mypy-protobuf converts None to zero default value - # and remove Optional from the param type annotation. This causes - # some mypy errors. So to mitigate and fall back to old behavior, - # use `relax_strict_optional_primitives` flag. more at - # https://github.com/nipunn1313/mypy-protobuf/tree/main#relax_strict_optional_primitives # pylint:disable=line-too-long - ['--mypy_out=relax_strict_optional_primitives:%s' % PYTHON_OUTPUT_PATH - ] + - # TODO(robertwb): Remove the prefix once it's the default. - ['--grpc_python_out=grpc_2_0:%s' % PYTHON_OUTPUT_PATH] + proto_files) - - LOG.info('Regenerating Python proto definitions (%s).' % regenerate_reason) - ret_code = protoc.main(args) - if ret_code: - raise RuntimeError( - 'Protoc returned non-zero status (see logs for details): ' - '%s' % ret_code) + from grpc_tools import protoc + builtin_protos = pkg_resources.resource_filename('grpc_tools', '_proto') + args = ( + [sys.executable] + # expecting to be called from command line + ['--proto_path=%s' % builtin_protos] + + ['--proto_path=%s' % d + for d in proto_dirs] + ['--python_out=%s' % PYTHON_OUTPUT_PATH] + + ['--plugin=protoc-gen-mypy=%s' % protoc_gen_mypy] + + # new version of mypy-protobuf converts None to zero default value + # and remove Optional from the param type annotation. This causes + # some mypy errors. So to mitigate and fall back to old behavior, + # use `relax_strict_optional_primitives` flag. 
more at + # https://github.com/nipunn1313/mypy-protobuf/tree/main#relax_strict_optional_primitives # pylint:disable=line-too-long + ['--mypy_out=relax_strict_optional_primitives:%s' % PYTHON_OUTPUT_PATH + ] + + # TODO(robertwb): Remove the prefix once it's the default. + ['--grpc_python_out=grpc_2_0:%s' % PYTHON_OUTPUT_PATH] + proto_files) + + LOG.info('Regenerating Python proto definitions (%s).' % regenerate_reason) + ret_code = protoc.main(args) + if ret_code: + raise RuntimeError( + 'Protoc returned non-zero status (see logs for details): ' + '%s' % ret_code) # copy resource files for path in MODEL_RESOURCES: @@ -548,7 +507,7 @@ def generate_proto_files(force=False): # force relative import paths for proto files compiled_import_re = re.compile('^from (.*) import (.*)$') for file_path in find_by_ext(PYTHON_OUTPUT_PATH, - ('_pb2.py', '_pb2_grpc.py', '_pb2.pyi')): + ('_pb2.py', '_pb2_grpc.py', '_pb2.pyi')): proto_packages.add(os.path.dirname(file_path)) lines = [] with open(file_path, encoding='utf-8') as f: @@ -566,12 +525,14 @@ def generate_proto_files(force=False): f.writelines(lines) generate_init_files_lite(PYTHON_OUTPUT_PATH) - with PythonPath(grpcio_install_loc): - for proto_package in proto_packages: - generate_urn_files(proto_package, PYTHON_OUTPUT_PATH) + for proto_package in proto_packages: + generate_urn_files(proto_package, PYTHON_OUTPUT_PATH) generate_init_files_full(PYTHON_OUTPUT_PATH) if __name__ == '__main__': - generate_proto_files(force=True) + parser = argparse.ArgumentParser() + parser.add_argument('--no-force', dest='force', action='store_false') + args = parser.parse_args() + generate_proto_files(force=args.force) diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml new file mode 100644 index 0000000000000..d185c45f61910 --- /dev/null +++ b/sdks/python/pyproject.toml @@ -0,0 +1,36 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# since we rely on setuptools and according to https://peps.python.org/pep-0518/#build-system-table +# this is the minimum requirements for the build system to execute. +[build-system] +requires = [ + "setuptools", + "wheel>=0.36.0", + "grpcio-tools==1.53.0", + "mypy-protobuf==3.5.0", + # Avoid https://github.com/pypa/virtualenv/issues/2006 + "distlib==0.3.7", + # Numpy headers + "numpy>=1.14.3,<1.25", # Update setup.py as well. + # having cython here will create wheels that are platform dependent. + "cython==0.29.36", +] + + +# legacy installation is needed to generate `apache_beam.portability.api` package. 
+build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/sdks/python/scripts/run_integration_test.sh b/sdks/python/scripts/run_integration_test.sh index 6ad592080ae2e..5ac3627a09608 100755 --- a/sdks/python/scripts/run_integration_test.sh +++ b/sdks/python/scripts/run_integration_test.sh @@ -79,6 +79,7 @@ SUITE="" COLLECT_MARKERS= REQUIREMENTS_FILE="" ARCH="" +PY_VERSION="" # Default test (pytest) options. # Run WordCountIT.test_wordcount_it by default if no test options are @@ -169,6 +170,11 @@ case $key in shift # past argument shift # past value ;; + --py_version) + PY_VERSION="$2" + shift # past argument + shift # past value + ;; *) # unknown option echo "Unknown option: $1" exit 1 @@ -242,6 +248,9 @@ if [[ -z $PIPELINE_OPTS ]]; then if [[ "$ARCH" == "ARM" ]]; then opts+=("--machine_type=t2a-standard-1") + + IMAGE_NAME="beam_python${PY_VERSION}_sdk" + opts+=("--sdk_container_image=us.gcr.io/$PROJECT/$USER/$IMAGE_NAME:$MULTIARCH_TAG") fi if [[ ! -z "$KMS_KEY_NAME" ]]; then diff --git a/sdks/python/scripts/run_pytest.sh b/sdks/python/scripts/run_pytest.sh index 01f2318164c4f..ad35b48972b6b 100755 --- a/sdks/python/scripts/run_pytest.sh +++ b/sdks/python/scripts/run_pytest.sh @@ -42,10 +42,10 @@ echo "posargs: $posargs" # Run with pytest-xdist and without. pytest -o junit_suite_name=${envname} \ - --junitxml=pytest_${envname}.xml -m 'not no_xdist' -n 6 ${pytest_args} --pyargs ${posargs} + --junitxml=pytest_${envname}.xml -m 'not no_xdist' -n 6 --import-mode=importlib ${pytest_args} --pyargs ${posargs} status1=$? pytest -o junit_suite_name=${envname}_no_xdist \ - --junitxml=pytest_${envname}_no_xdist.xml -m 'no_xdist' ${pytest_args} --pyargs ${posargs} + --junitxml=pytest_${envname}_no_xdist.xml -m 'no_xdist' --import-mode=importlib ${pytest_args} --pyargs ${posargs} status2=$? # Exit with error if no tests were run in either suite (status code 5). diff --git a/sdks/python/scripts/run_snapshot_publish.sh b/sdks/python/scripts/run_snapshot_publish.sh index 6379e6f210843..bc379077349d5 100755 --- a/sdks/python/scripts/run_snapshot_publish.sh +++ b/sdks/python/scripts/run_snapshot_publish.sh @@ -21,7 +21,7 @@ BUCKET=gs://beam-python-nightly-snapshots VERSION=$(awk '/__version__/{print $3}' $WORKSPACE/sdks/python/apache_beam/version.py) VERSION=$(echo $VERSION | cut -c 2- | rev | cut -c 2- | rev) time=$(date +"%Y-%m-%dT%H:%M:%S") -SNAPSHOT="apache-beam-$VERSION-$time.zip" +SNAPSHOT="apache-beam-$VERSION-$time.tar.gz" DEP_SNAPSHOT_ROOT="$BUCKET/dependency_requirements_snapshot" DEP_SNAPSHOT_FILE_NAME="beam-py-requirements-$time.txt" @@ -30,8 +30,8 @@ DEP_SNAPSHOT_FILE_NAME="beam-py-requirements-$time.txt" # and located under Gradle build directory. cd $WORKSPACE/sdks/python/build -# Rename the file to be apache-beam-{VERSION}-{datetime}.zip -for file in "apache-beam-$VERSION*.zip"; do +# Rename the file to be apache-beam-{VERSION}-{datetime}.tar.gz +for file in "apache-beam-$VERSION*.tar.gz"; do mv $file $SNAPSHOT done diff --git a/sdks/python/scripts/run_tox.sh b/sdks/python/scripts/run_tox.sh index ebbacf5494eaf..ac60f26b32bad 100755 --- a/sdks/python/scripts/run_tox.sh +++ b/sdks/python/scripts/run_tox.sh @@ -53,12 +53,21 @@ if [[ "$JENKINS_HOME" != "" ]]; then export PY_COLORS=1 fi -if [[ ! 
-z $2 ]]; then +# Determine if the second argument is SDK_LOCATION or posargs +if [[ -f "$1" ]]; then # Check if the argument corresponds to a file SDK_LOCATION="$1" - shift; - tox -c tox.ini run --recreate -e "$TOX_ENVIRONMENT" --installpkg "$SDK_LOCATION" -- "$@" -else - tox -c tox.ini run --recreate -e "$TOX_ENVIRONMENT" + shift +fi + +# If SDK_LOCATION is identified and there are still arguments left, those are posargs. +if [[ ! -z "$SDK_LOCATION" ]]; then + if [[ $# -gt 0 ]]; then # There are posargs + tox -c tox.ini run --recreate -e "$TOX_ENVIRONMENT" --installpkg "$SDK_LOCATION" -- "$@" + else + tox -c tox.ini run --recreate -e "$TOX_ENVIRONMENT" --installpkg "$SDK_LOCATION" + fi +else # No SDK_LOCATION; all arguments are posargs + tox -c tox.ini run --recreate -e "$TOX_ENVIRONMENT" -- "$@" fi exit_code=$? diff --git a/sdks/python/setup.py b/sdks/python/setup.py index c5b3d5e9fcd96..529c3ee38bd9e 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -18,6 +18,7 @@ """Apache Beam SDK for Python setup file.""" import os +import subprocess import sys import warnings # Pylint and isort disagree here. @@ -62,7 +63,6 @@ def get_project_path(self): return os.path.join(project_path, to_filename(ei_cmd.egg_name)) def run(self): - import subprocess args = ['mypy', self.get_project_path()] result = subprocess.call(args) if result != 0: @@ -140,9 +140,15 @@ def cythonize(*args, **kwargs): # [BEAM-8181] pyarrow cannot be installed on 32-bit Windows platforms. if sys.platform == 'win32' and sys.maxsize <= 2**32: - pyarrow_dependency = '' + pyarrow_dependency = [''] else: - pyarrow_dependency = 'pyarrow>=3.0.0,<12.0.0' + pyarrow_dependency = [ + 'pyarrow>=3.0.0,<12.0.0', + # NOTE(https://github.com/apache/beam/issues/29392): We can remove this + # once Beam increases the pyarrow lower bound to a version that fixes CVE. + 'pyarrow-hotfix<1' + ] + # Exclude pandas<=1.4.2 since it doesn't work with numpy 1.24.x. # Exclude 1.5.0 and 1.5.1 because of @@ -151,16 +157,40 @@ def cythonize(*args, **kwargs): 'pandas>=1.4.3,!=1.5.0,!=1.5.1,<2.1;python_version>="3.8"', ] +def find_by_ext(root_dir, ext): + for root, _, files in os.walk(root_dir): + for file in files: + if file.endswith(ext): + yield os.path.realpath(os.path.join(root, file)) # We must generate protos after setup_requires are installed. def generate_protos_first(): try: - # pylint: disable=wrong-import-position - import gen_protos - gen_protos.generate_proto_files() - - except ImportError: - warnings.warn("Could not import gen_protos, skipping proto generation.") + # Pyproject toml build happens in isolated environemnts. In those envs, + # gen_protos is unable to get imported. so we run a subprocess call. + cwd = os.path.abspath(os.path.dirname(__file__)) + # when pip install <>.tar.gz gets called, if gen_protos.py is not available + # in the sdist,then the proto files would have already been generated. So we + # skip proto generation in that case. + if not os.path.exists(os.path.join(cwd, 'gen_protos.py')): + # make sure we already generated protos + pb2_files = list(find_by_ext(os.path.join( + cwd, 'apache_beam', 'portability', 'api'), '_pb2.py')) + if not pb2_files: + raise RuntimeError('protobuf files are not generated. 
' + 'Please generate pb2 files') + + warnings.warn('Skipping proto generation as they are already generated.') + return + out = subprocess.run([ + sys.executable, + os.path.join(cwd, 'gen_protos.py'), + '--no-force' + ], capture_output=True, check=True) + print(out.stdout) + except subprocess.CalledProcessError as err: + raise RuntimeError('Could not generate protos due to error: %s', + err.stderr) def get_portability_package_data(): @@ -188,6 +218,27 @@ def get_portability_package_data(): # structure must exist before the call to setuptools.find_packages() # executes below. generate_protos_first() + + # generate cythonize extensions only if we are building a wheel or + # building an extension or running in editable mode. + cythonize_cmds = ('bdist_wheel', 'build_ext', 'editable_wheel') + if any(cmd in sys.argv for cmd in cythonize_cmds): + extensions = cythonize([ + 'apache_beam/**/*.pyx', + 'apache_beam/coders/coder_impl.py', + 'apache_beam/metrics/cells.py', + 'apache_beam/metrics/execution.py', + 'apache_beam/runners/common.py', + 'apache_beam/runners/worker/logger.py', + 'apache_beam/runners/worker/opcounters.py', + 'apache_beam/runners/worker/operations.py', + 'apache_beam/transforms/cy_combiners.py', + 'apache_beam/transforms/stats.py', + 'apache_beam/utils/counters.py', + 'apache_beam/utils/windowed_value.py', + ]) + else: + extensions = [] # Keep all dependencies inlined in the setup call, otherwise Dependabot won't # be able to parse it. setuptools.setup( @@ -213,21 +264,7 @@ def get_portability_package_data(): *get_portability_package_data() ] }, - ext_modules=cythonize([ - 'apache_beam/**/*.pyx', - 'apache_beam/coders/coder_impl.py', - 'apache_beam/metrics/cells.py', - 'apache_beam/metrics/execution.py', - 'apache_beam/runners/common.py', - 'apache_beam/runners/worker/logger.py', - 'apache_beam/runners/worker/opcounters.py', - 'apache_beam/runners/worker/operations.py', - 'apache_beam/transforms/cy_combiners.py', - 'apache_beam/transforms/stats.py', - 'apache_beam/utils/counters.py', - 'apache_beam/utils/windowed_value.py', - ], - language_level=3), + ext_modules=extensions, install_requires=[ 'crcmod>=1.7,<2.0', 'orjson>=3.9.7,<4', @@ -248,9 +285,10 @@ def get_portability_package_data(): 'hdfs>=2.1.0,<3.0.0', 'httplib2>=0.8,<0.23.0', 'js2py>=0.74,<1', + 'jsonschema>=4.0.0,<5.0.0', # numpy can have breaking changes in minor versions. # Use a strict upper bound. - 'numpy>=1.14.3,<1.25.0', # Update build-requirements.txt as well. + 'numpy>=1.14.3,<1.25.0', # Update pyproject.toml as well. 'objsize>=0.6.1,<0.7.0', 'packaging>=22.0', 'pymongo>=3.8.0,<5.0.0', @@ -265,7 +303,7 @@ def get_portability_package_data(): # # 3. Exclude protobuf 4 versions that leak memory, see: # https://github.com/apache/beam/issues/28246 - 'protobuf>=3.20.3,<4.25.0,!=4.0.*,!=4.21.*,!=4.22.0,!=4.23.*,!=4.24.0,!=4.24.1,!=4.24.2', # pylint: disable=line-too-long + 'protobuf>=3.20.3,<4.26.0,!=4.0.*,!=4.21.*,!=4.22.0,!=4.23.*,!=4.24.*', # pylint: disable=line-too-long 'pydot>=1.2.0,<2', 'python-dateutil>=2.8.0,<3', 'pytz>=2018.3', @@ -276,7 +314,7 @@ def get_portability_package_data(): # Dynamic dependencies must be specified in a separate list, otherwise # Dependabot won't be able to parse the main list. Any dynamic # dependencies will not receive updates from Dependabot. - ] + [pyarrow_dependency], + ] + pyarrow_dependency, python_requires=python_requires, # BEAM-8840: Do NOT use tests_require or setup_requires. 
extras_require={ @@ -349,9 +387,11 @@ def get_portability_package_data(): 'interactive_test': [ # headless chrome based integration tests 'needle>=0.5.0,<1', - 'chromedriver-binary>=100,<114', + 'chromedriver-binary>=117,<118', # use a fixed major version of PIL for different python versions 'pillow>=7.1.1,<10', + # urllib 2.x is a breaking change for the headless chrome tests + 'urllib3<2,>=1.21.1' ], 'aws': ['boto3>=1.9,<2'], 'azure': [ @@ -364,6 +404,9 @@ def get_portability_package_data(): 'dask >= 2022.6', 'distributed >= 2022.6', ], + 'yaml': [ + 'pyyaml>=3.12,<7.0.0', + ] + dataframe_dependency }, zip_safe=False, # PyPI package information. diff --git a/sdks/python/test-suites/dataflow/common.gradle b/sdks/python/test-suites/dataflow/common.gradle index 7766cf3a377c6..a713b82400e75 100644 --- a/sdks/python/test-suites/dataflow/common.gradle +++ b/sdks/python/test-suites/dataflow/common.gradle @@ -144,7 +144,9 @@ task postCommitIT { } task postCommitArmIT { + def pyversion = "${project.ext.pythonVersion.replace('.', '')}" dependsOn 'initializeForDataflowJob' + dependsOn ":sdks:python:container:py${pyversion}:docker" doLast { def testOpts = basicPytestOpts + ["--numprocesses=8", "--dist=loadfile"] @@ -153,6 +155,7 @@ task postCommitArmIT { "sdk_location": project.ext.sdkLocation, "suite": "postCommitIT-df${pythonVersionSuffix}", "collect": "it_postcommit", + "py_version": project.ext.pythonVersion, "arch": "ARM" ] def cmdArgs = mapToArgString(argMap) diff --git a/sdks/python/test-suites/tox/common.gradle b/sdks/python/test-suites/tox/common.gradle index ee183dff40649..3fdd0c0c553b8 100644 --- a/sdks/python/test-suites/tox/common.gradle +++ b/sdks/python/test-suites/tox/common.gradle @@ -29,18 +29,12 @@ test.dependsOn "testPy${pythonVersionSuffix}Cloud" // toxTask "testPy${pythonVersionSuffix}Dask", "py${pythonVersionSuffix}-dask", "${posargs}" // test.dependsOn "testPy${pythonVersionSuffix}Dask" -toxTask "testPy${pythonVersionSuffix}Cython", "py${pythonVersionSuffix}-cython", "${posargs}" -test.dependsOn "testPy${pythonVersionSuffix}Cython" toxTask "testPy38CloudCoverage", "py38-cloudcoverage", "${posargs}" test.dependsOn "testPy38CloudCoverage" project.tasks.register("preCommitPy${pythonVersionSuffix}") { - // Since codecoverage reports will always be generated for py38, - // all tests will be exercised. - if (pythonVersionSuffix.equals('38')) { - dependsOn = ["testPy38Cython"] - } else { - dependsOn = ["testPy${pythonVersionSuffix}Cloud", "testPy${pythonVersionSuffix}Cython"] - } + // Since codecoverage reports will always be generated for py38, + // all tests will be exercised. + dependsOn = ["testPy${pythonVersionSuffix}Cloud", "testPython${pythonVersionSuffix}"] } \ No newline at end of file diff --git a/sdks/python/test-suites/tox/py310/build.gradle b/sdks/python/test-suites/tox/py310/build.gradle index ea10fde831c63..f1e40a17951fc 100644 --- a/sdks/python/test-suites/tox/py310/build.gradle +++ b/sdks/python/test-suites/tox/py310/build.gradle @@ -28,5 +28,3 @@ pythonVersion = '3.10' apply from: "../common.gradle" -// TODO(https://github.com/apache/beam/issues/20051): Remove this once tox uses isolated builds. 
-testPy310Cython.mustRunAfter testPython310, testPy310Cloud diff --git a/sdks/python/test-suites/tox/py311/build.gradle b/sdks/python/test-suites/tox/py311/build.gradle index 1bb3766500bb9..fabf9fd4365a7 100644 --- a/sdks/python/test-suites/tox/py311/build.gradle +++ b/sdks/python/test-suites/tox/py311/build.gradle @@ -28,5 +28,3 @@ pythonVersion = '3.11' apply from: "../common.gradle" -// TODO(https://github.com/apache/beam/issues/20051): Remove this once tox uses isolated builds. -testPy311Cython.mustRunAfter testPython311, testPy311Cloud diff --git a/sdks/python/test-suites/tox/py38/build.gradle b/sdks/python/test-suites/tox/py38/build.gradle index ab9ce6dddc26d..f01caa3607d0d 100644 --- a/sdks/python/test-suites/tox/py38/build.gradle +++ b/sdks/python/test-suites/tox/py38/build.gradle @@ -43,8 +43,6 @@ lint.dependsOn mypyPy38 apply from: "../common.gradle" -// TODO(https://github.com/apache/beam/issues/20051): Remove this once tox uses isolated builds. -testPy38Cython.mustRunAfter testPython38, testPy38CloudCoverage // PyCoverage Precommit runs test suites that evaluate test coverage and compatibility of @@ -157,6 +155,8 @@ task archiveFilesToLint(type: Zip) { include "**/*.md" include "**/build.gradle" include '**/build.gradle.kts' + exclude '**/build/**' // intermediate build directory + exclude 'website/www/site/themes/docsy/**' // fork to google/docsy exclude "**/node_modules/*" exclude "**/.gogradle/*" } diff --git a/sdks/python/test-suites/tox/py39/build.gradle b/sdks/python/test-suites/tox/py39/build.gradle index 380cc1486daad..5bb73b60a5d2f 100644 --- a/sdks/python/test-suites/tox/py39/build.gradle +++ b/sdks/python/test-suites/tox/py39/build.gradle @@ -27,6 +27,3 @@ applyPythonNature() pythonVersion = '3.9' apply from: "../common.gradle" - -// TODO(https://github.com/apache/beam/issues/20051): Remove this once tox uses isolated builds. -testPy39Cython.mustRunAfter testPython39, testPy39Cloud diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index a1bbc8001f51c..79a3bfd3ae81a 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -17,7 +17,7 @@ [tox] # new environments will be excluded by default unless explicitly added to envlist. -envlist = py38,py39,py310,py311,py38-{cloud,cython,docs,lint,mypy,cloudcoverage,dask},py39-{cloud,cython},py310-{cloud,cython,dask},py311-{cloud,cython,dask},whitespacelint +envlist = py38,py39,py310,py311,py38-{cloud,docs,lint,mypy,cloudcoverage,dask},py39-{cloud},py310-{cloud,dask},py311-{cloud,dask},whitespacelint toxworkdir = {toxinidir}/target/{env:ENV_NAME:.tox} [pycodestyle] @@ -44,9 +44,6 @@ allowlist_externals = curl ./codecov chmod -deps = - cython: cython==0.29.33 - -r build-requirements.txt setenv = RUN_SKIPPED_PY3_TESTS=0 # Use an isolated tmp dir for tests that get slowed down by scanning /tmp. 
@@ -67,6 +64,7 @@ commands_pre = bash {toxinidir}/scripts/run_tox_cleanup.sh commands_post = bash {toxinidir}/scripts/run_tox_cleanup.sh + commands = false {envname} is misconfigured [testenv:py{38,39,310,311}] @@ -81,28 +79,18 @@ commands = install_command = {envbindir}/python.exe {envbindir}/pip.exe install --retries 10 {opts} {packages} list_dependencies_command = {envbindir}/python.exe {envbindir}/pip.exe freeze -[testenv:py{38,39,310,311}-cython] -# cython tests are only expected to work in linux (2.x and 3.x) -# If we want to add other platforms in the future, it should be: -# `platform = linux2|darwin|...` -# See https://docs.python.org/2/library/sys.html#sys.platform for platform codes -platform = linux -commands = - # TODO(https://github.com/apache/beam/issues/20051): Remove this build_ext invocation once local source no longer - # shadows the installed apache_beam. - python setup.py build_ext --inplace - python apache_beam/examples/complete/autocomplete_test.py - bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" - [testenv:py{38,39,310,311}-cloud] +; extras = test,gcp,interactive,dataframe,aws,azure extras = test,gcp,interactive,dataframe,aws,azure commands = + python apache_beam/examples/complete/autocomplete_test.py bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" [testenv:py{38,39,310,311}-dask] extras = test,dask commands = bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" + [testenv:py38-cloudcoverage] deps = pytest-cov==3.0.0 @@ -124,7 +112,6 @@ commands = setenv = # keep the version of pylint in sync with the 'rev' in .pre-commit-config.yaml deps = - -r build-requirements.txt astroid<2.17.0,>=2.15.6 pycodestyle==2.8.0 pylint==2.17.5 @@ -143,7 +130,6 @@ commands = [testenv:py38-mypy] deps = - -r build-requirements.txt mypy==0.790 dask==2022.01.0 distributed==2022.01.0 @@ -173,7 +159,6 @@ commands = # Used by hdfs_integration_test.sh. Do not run this directly, as it depends on # nodes defined in hdfs_integration_test/docker-compose.yml. deps = - -r build-requirements.txt holdup==1.8.0 extras = gcp @@ -206,7 +191,6 @@ commands_pre = # Do not run this directly, as it depends on nodes defined in # azure/integration_test/docker-compose.yml. deps = - -r build-requirements.txt extras = azure passenv = REQUESTS_CA_BUNDLE @@ -338,15 +322,15 @@ commands = # Run all DataFrame API unit tests bash {toxinidir}/scripts/run_pytest.sh {envname} 'apache_beam/dataframe' -[testenv:py{38,39}-tft-113] +[testenv:py{38,39}-tft-{113,114}] deps = 113: tensorflow_transform>=1.13.0,<1.14.0 + 114: tensorflow_transform>=1.14.0,<1.15.0 commands = - bash {toxinidir}/scripts/run_pytest.sh {envname} 'apache_beam/ml/transforms' + bash {toxinidir}/scripts/run_pytest.sh {envname} 'apache_beam/ml/transforms apache_beam/examples/snippets/transforms/elementwise/mltransform_test.py' [testenv:py{38,39,310,311}-pytorch-{19,110,111,112,113}] deps = - -r build-requirements.txt 19: torch>=1.9.0,<1.10.0 110: torch>=1.10.0,<1.11.0 111: torch>=1.11.0,<1.12.0 @@ -364,7 +348,6 @@ commands = [testenv:py{38,39,310}-pytorch-200] deps = - -r build-requirements.txt 200: torch>=2.0.0,<2.1.0 extras = test,gcp # Don't set TMPDIR to avoid "AF_UNIX path too long" errors in certain tests. 
@@ -398,7 +381,6 @@ commands = [testenv:py{38,39,310}-tensorflow-212] deps = - -r build-requirements.txt 212: tensorflow>=2.12rc1,<2.13 extras = test,gcp commands = @@ -410,7 +392,6 @@ commands = [testenv:py{38,39,310}-xgboost-{160,170}] deps = - -r build-requirements.txt 160: xgboost>=1.6.0,<1.7.0 datatable==1.0.0 @@ -427,7 +408,6 @@ commands = [testenv:py{38,39,310,311}-transformers-{428,429,430}] deps = - -r build-requirements.txt 428: transformers>=4.28.0,<4.29.0 429: transformers>=4.29.0,<4.30.0 430: transformers>=4.30.0,<4.31.0 @@ -445,7 +425,6 @@ commands = [testenv:py{38,311}-vertex-ai] deps = - -r build-requirements.txt tensorflow==2.12.0 extras = test,gcp commands = diff --git a/sdks/typescript/package.json b/sdks/typescript/package.json index b582b3d5c07f2..918846a79add3 100644 --- a/sdks/typescript/package.json +++ b/sdks/typescript/package.json @@ -1,6 +1,6 @@ { "name": "apache-beam", - "version": "2.52.0-SNAPSHOT", + "version": "2.53.0-SNAPSHOT", "devDependencies": { "@google-cloud/bigquery": "^5.12.0", "@types/mocha": "^9.0.0", diff --git a/settings.gradle.kts b/settings.gradle.kts index f4901d7df92be..85ef793b8e120 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -24,8 +24,8 @@ pluginManagement { } plugins { - id("com.gradle.enterprise") version "3.13.2" - id("com.gradle.common-custom-user-data-gradle-plugin") version "1.10" + id("com.gradle.enterprise") version "3.15.1" + id("com.gradle.common-custom-user-data-gradle-plugin") version "1.11.3" } @@ -175,6 +175,7 @@ include(":sdks:java:container:agent") include(":sdks:java:container:java8") include(":sdks:java:container:java11") include(":sdks:java:container:java17") +include(":sdks:java:container:java21") include(":sdks:java:core") include(":sdks:java:core:jmh") include(":sdks:java:expansion-service") @@ -244,6 +245,7 @@ include(":sdks:java:io:hbase") include(":sdks:java:io:hcatalog") include(":sdks:java:io:jdbc") include(":sdks:java:io:jms") +include(":sdks:java:io:json") include(":sdks:java:io:kafka") include(":sdks:java:io:kinesis") include(":sdks:java:io:kinesis:expansion-service") @@ -324,12 +326,16 @@ include(":runners:google-cloud-dataflow-java:worker:windmill") // no dots allowed for project paths include("beam-test-infra-metrics") project(":beam-test-infra-metrics").projectDir = file(".test-infra/metrics") +include("beam-test-infra-mock-apis") +project(":beam-test-infra-mock-apis").projectDir = file(".test-infra/mock-apis") include("beam-test-infra-pipelines") project(":beam-test-infra-pipelines").projectDir = file(".test-infra/pipelines") include("beam-test-tools") project(":beam-test-tools").projectDir = file(".test-infra/tools") include("beam-test-jenkins") project(":beam-test-jenkins").projectDir = file(".test-infra/jenkins") +include("beam-test-gha") +project(":beam-test-gha").projectDir = file(".github") include("beam-validate-runner") project(":beam-validate-runner").projectDir = file(".test-infra/validate-runner") include("com.google.api.gax.batching") diff --git a/website/www/site/config.toml b/website/www/site/config.toml index 6a1907e60591b..c7b0cd3412e67 100644 --- a/website/www/site/config.toml +++ b/website/www/site/config.toml @@ -104,7 +104,7 @@ github_project_repo = "https://github.com/apache/beam" [params] description = "Apache Beam is an open source, unified model and set of language-specific SDKs for defining and executing data processing workflows, and also data ingestion and integration flows, supporting Enterprise Integration Patterns (EIPs) and Domain Specific Languages (DSLs). 
Dataflow pipelines simplify the mechanics of large-scale batch and streaming data processing and can run on a number of runtimes like Apache Flink, Apache Spark, and Google Cloud Dataflow (a cloud service). Beam also brings DSL in different languages, allowing users to easily implement their data integration processes." -release_latest = "2.50.0" +release_latest = "2.51.0" # The repository and branch where the files live in Github or Colab. This is used # to serve and stage from your local branch, but publish to the master branch. # e.g. https://github.com/{{< param branch_repo >}}/path/to/notebook.ipynb diff --git a/website/www/site/content/en/blog/apache-beam-flink-and-kubernetes.md b/website/www/site/content/en/blog/apache-beam-flink-and-kubernetes.md new file mode 100644 index 0000000000000..b50d475ed7bce --- /dev/null +++ b/website/www/site/content/en/blog/apache-beam-flink-and-kubernetes.md @@ -0,0 +1,403 @@ +--- +title: "Build a scalable, self-managed streaming infrastructure with Beam and Flink" +date: 2023-11-03 09:00:00 -0400 +categories: + - blog +authors: + - talat +--- + + +In this blog series, [Talat Uyarer (Architect / Senior Principal Engineer)](https://www.linkedin.com/in/talatuyarer/), [Rishabh Kedia (Principal Engineer)](https://www.linkedin.com/in/rishabhkedia/), and [David He (Engineering Director)](https://www.linkedin.com/in/davidqhe/) describe how we built a self-managed streaming platform by using Apache Beam and Flink. In this part of the series, we describe why and how we built a large-scale, self-managed streaming infrastructure and services based on Flink by migrating from a cloud-managed streaming service. We also outline what we learned about operational scalability and observability, performance, and cost effectiveness. We summarize techniques that we found useful in our journey. + + + +# Build a scalable, self-managed streaming infrastructure with Flink - part 1 + +## Introduction + +Palo Alto Networks (PANW) is a leader in cybersecurity, providing products, services, and solutions to our customers. Data is the center of our products and services. We stream and store exabytes of data in our data lake, with near real-time ingestion, data transformation, data insertion into data stores, and forwarding of data to our internal ML-based systems and external SIEMs. We support multi-tenancy in each component so that we can isolate tenants and provide optimal performance and SLAs. Stream processing plays a critical role in these pipelines. + +In the second part of the series, we provide a more thorough description of the core building blocks of our streaming infrastructure, such as the autoscaler. We also give more details about our customizations, which enabled us to build a high-performance, large-scale streaming system. Finally, we explain how we solved challenging problems. + + +## The importance of self-managed streaming infrastructure + +We built a large-scale data platform on Google Cloud. We used Dataflow as a managed streaming service. With Dataflow, we used the streaming engine to run our Apache Beam applications, along with observability tools such as Cloud Logging and Cloud Monitoring. For more details, see [1]. The system can handle 15 million events per second, or one trillion events and four petabytes of data volume daily. We run about 30,000 Dataflow jobs. Each job can have one worker or hundreds of workers, depending on the customer’s event throughput.
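+
+To make this concrete, the following is a minimal sketch of the kind of Beam streaming job these pipelines run: read log events from a Kafka topic, filter and reshape them, and stream the rows into BigQuery (a firewall-log example of this pattern is described in the next section). The broker, topic, and table names are hypothetical, and a production job would add schemas, windowing, and error handling:
+
+```java
+import com.google.api.services.bigquery.model.TableRow;
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
+import org.apache.beam.sdk.io.kafka.KafkaIO;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.apache.beam.sdk.transforms.Filter;
+import org.apache.beam.sdk.transforms.MapElements;
+import org.apache.beam.sdk.transforms.Values;
+import org.apache.beam.sdk.values.TypeDescriptor;
+import org.apache.kafka.common.serialization.StringDeserializer;
+
+public class FirewallLogsToBigQuery {
+  public static void main(String[] args) {
+    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
+    p.apply("ReadFromKafka", KafkaIO.<String, String>read()
+            .withBootstrapServers("kafka:9092")            // hypothetical broker
+            .withTopic("firewall-logs")                    // hypothetical topic
+            .withKeyDeserializer(StringDeserializer.class)
+            .withValueDeserializer(StringDeserializer.class)
+            .withoutMetadata())
+        .apply(Values.create())                            // keep only the event payload
+        .apply("KeepTrafficEvents", Filter.by((String line) -> line.contains("TRAFFIC")))
+        .apply("ToTableRow", MapElements.into(TypeDescriptor.of(TableRow.class))
+            .via((String line) -> new TableRow().set("raw_event", line)))
+        .apply("WriteToBigQuery", BigQueryIO.writeTableRows()
+            .to("my-project:logs.firewall_events")         // hypothetical, pre-created table
+            .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_NEVER));
+    p.run();
+  }
+}
+```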
+ +We support various applications using different endpoints: BigQuery data store, HTTPS-based external SIEMs or internal endpoints, Syslog-based SIEMs, and Google Cloud Storage endpoints. Our customers and products rely on this data platform to manage cybersecurity postures and responses. Our streaming infrastructure makes it easy to add, update, and delete use cases through a streaming job subscription. For example, a customer might want to ingest log events from a firewall device into the data lake, buffered in Kafka topics. A streaming job is subscribed to extract and filter the data, transform the data format, and do a streaming insert to our BigQuery data warehouse endpoint in real time. The customer can use our visualization and dashboard products to view traffic or threats captured by this firewall. The following diagram illustrates the event producer, the use case subscription workflow, and the key components of the streaming platform: + + + + + +This managed, Dataflow-based streaming infrastructure runs fine, but with some caveats: + + + +1. Cost is high, because it is a managed service. For the same resources used in a Dataflow application, such as vCPU and memory, the cost is much higher than running the same Beam application code on an open source streaming engine such as Flink. +2. It's not easy to achieve our latency and SLA goals, because it's difficult to extend features, such as autoscaling based on different applications, endpoints, or different parameters within one application. +3. The pipeline only runs on Google Cloud. + +The uniqueness of PANW’s streaming use cases is another reason that we use a self-managed service. We support multi-tenancy. A tenant (a customer) can ingest data at a very high rate (>100,000 requests per second) or at a very low rate (<100 requests per second). A Dataflow job runs on VMs instead of Kubernetes and requires a minimum of one vCPU core. With a small tenant, this wastes resources. Our streaming infrastructure supports thousands of jobs, and CPU utilization is more efficient if we do not have to dedicate one core to each job. It is natural for us to use a streaming engine running on Kubernetes, so that we can allocate minimal resources for a small tenant, for example, a Google Kubernetes Engine (GKE) pod with ½ vCPU core or less. + + +## The choice of Apache Flink and Kubernetes + +In an effort to handle the problems already stated and to find the most efficient solution, we evaluated various streaming frameworks, including Apache Samza, Apache Flink, and Apache Spark, against Dataflow. + +### Performance + + + +* One notable factor was Apache Flink’s native Kubernetes support. Unlike Samza, which lacked native Kubernetes support and required Apache ZooKeeper for coordination, Flink seamlessly integrated with Kubernetes. This integration eliminated unnecessary complexities. In terms of performance, both Samza and Flink were close competitors. +* Apache Spark, while popular, proved to be significantly slower in our tests. A presentation at the Beam Summit revealed that Apache Beam’s Spark Runner was approximately ten times slower than native Apache Spark [3]. We could not afford such a drastic performance hit. Rewriting our entire Beam codebase with native Spark was not a viable option, especially given the extensive codebase we had built over the past four years with Apache Beam. + +### Community + + + +The robustness of community support played a pivotal role in our decision making.
Dataflow provided excellent support, but we needed assurance in our choice of an open-source framework. Apache Flink’s vibrant community and active contributions from multiple companies offered a level of confidence that was unmatched. This collaborative environment meant that bug identification and fixes were ongoing processes. In fact, in our journey, we have patched our system using many Flink fixes from the community: + +* We fixed Google Cloud Storage file-reading exceptions by merging the Flink 1.15 open source fix [FLINK-26063](https://issues.apache.org/jira/browse/FLINK-26063?page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel&focusedCommentId=17504555#comment-17504555) (we are using 1.13). +* We fixed an issue with workers restarting for stateful jobs by applying [FLINK-31963](https://issues.apache.org/jira/browse/FLINK-31963). + +We also contributed to the community during our journey by finding and fixing bugs in the open source code. For details, see [FLINK-32700](https://issues.apache.org/jira/browse/FLINK-32700) for the Flink Kubernetes Operator. We also created new GKE auth support for Kubernetes clients and merged it on GitHub [4]. + +### Integration + + + +The seamless integration of Apache Flink with Kubernetes provided us with a flexible and scalable platform for orchestration. The synergy between Apache Flink and Kubernetes not only optimized our data processing workflows but also future-proofed our system. + + +## Architecture and deployment workflow + +In the realm of real-time data processing and analytics, Apache Flink distinguishes itself as a powerful and versatile framework. When combined with Kubernetes, the industry-standard container orchestration system, Flink applications can scale horizontally and gain robust management capabilities. We explore a cutting-edge design where Apache Flink and Kubernetes synergize seamlessly, thanks to the Apache Flink Kubernetes Operator. + +At its core, the Flink Kubernetes Operator serves as a control plane, mirroring the knowledge and actions of a human operator managing Flink deployments. Unlike traditional methods, the Operator automates critical activities, from starting and stopping applications to handling upgrades and errors. Its versatile feature set includes fully automated job lifecycle management, support for different Flink versions, and multiple deployment modes, such as application clusters and session jobs. Moreover, the Operator's operational prowess extends to metrics, logging, and even dynamic scaling by using the Job Autoscaler. + + +### Build a seamless deployment workflow + +Imagine a robust system where Flink jobs are deployed effortlessly, monitored diligently, and managed proactively. Our team created this workflow by integrating Apache Flink, the Apache Flink Kubernetes Operator, and Kubernetes. Central to this setup is our custom-built Apache Flink Kubernetes Operator Client Library. This library acts as a bridge, enabling atomic operations such as starting, stopping, updating, and canceling Flink jobs. + + + + + + + +### The deployment process + +In our code, the client provides Apache Beam pipeline options, which include essential information such as the Kubernetes cluster's API endpoint, authentication details, the Google Cloud Storage/S3 temporary location for uploading the JAR file, and worker type specifications. The Kubernetes Operator Library uses this information to orchestrate a seamless deployment process. The following sections explain the steps taken; most of the core steps are automated in our code base.
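+
+The client library's API is internal to our code base, so the snippet below uses hypothetical class and method names; it is only meant to illustrate the shape of the submission call that carries these pipeline options:
+
+```java
+// Hypothetical client-side submission; all names here are illustrative,
+// not the actual internal API of the operator client library.
+FlinkOperatorClient client = FlinkOperatorClient.builder()
+    .kubernetesApiEndpoint("https://k8s.example.internal")  // cluster API endpoint
+    .authToken(gkeAuthToken)                                // authentication details
+    .jarStagingLocation("gs://my-bucket/staging")           // Cloud Storage/S3 temp location
+    .build();
+
+// Atomic operations exposed by the library: start, stop, update, cancel.
+String jobId = client.startJob(
+    FlinkJobSpec.builder()
+        .applicationName("firewall-logs-to-bigquery")
+        .workerType("n1-standard-1")   // resource settings derive from this name
+        .pipelineOptions(beamPipelineOptions)
+        .build());
+```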
The following sections explain the steps taken. Most of the core steps are automated in our code base.
+
+**Step 1:**
+
+* The client wants to start a job for a customer and a specific application.
+
+**Step 2:**
+
+* **Generate a unique job ID:** The library generates a unique job ID, which is set as a Kubernetes label. This identifier helps track and manage the deployed Flink job.
+* **Configuration and code upload:** The library uploads all necessary configurations and user code to a designated location on Google Cloud Storage or Amazon S3. This step ensures that the Flink application's resources are available for deployment.
+* **YAML payload generation:** After the upload process completes, the library constructs a YAML payload. This payload contains crucial deployment information, including resource settings based on the specified worker type.
+
+We used a convention for naming our worker VM instance types. Our convention is similar to the naming convention that Google Cloud uses. The name `n1-standard-1` refers to a specific, predefined VM machine type. Let’s break down what each component of the name means:
+
+* **n1** indicates the CPU type of the instance. In this case, it refers to the Intel-based instances in the N1 series. Google Cloud has multiple generations of instances with varying hardware and performance characteristics.
+* **standard** signifies the machine type family. Standard machine types offer a balanced ratio of 1 virtual CPU (vCPU) and 4 GB of memory for the Task Manager, and 0.5 vCPU and 2 GB of memory for the Job Manager.
+* **1** represents the number of vCPUs available in the instance. In the case of `n1-standard-1`, it means the instance has 1 vCPU.
+
+**Step 3:**
+
+* **Calling the Kubernetes API with Fabric8:** To initiate the deployment, the library interacts with the Kubernetes API using Fabric8. Fabric8 initially lacked support for authentication in Google Kubernetes Engine or Amazon Elastic Kubernetes Service (EKS). To address this limitation, our team implemented the necessary authentication support, which can be found in our merge request on GitHub PR [4].
+
+**Step 4:**
+
+* **Flink Operator deployment:** When it receives the YAML payload, the Flink Operator takes charge of deploying the various components of the Flink job. Tasks include provisioning resources and managing the deployment of the Flink Job Manager, Task Manager, and Job Service.
+
+**Step 5:**
+
+* **Job submission and execution:** When the Flink Job Manager is running, it fetches the JAR file and configurations from the designated Google Cloud Storage or S3 location. With all necessary resources in place, it submits the Flink job to the standalone Flink cluster for execution.
+
+**Step 6:**
+
+* **Continuous monitoring:** Post-deployment, our operator continuously monitors the status of the running Flink job. This real-time feedback loop enables us to promptly address any issues that arise, ensuring the overall health and optimal performance of our Flink applications.
+
+In summary, our deployment process leverages Apache Beam pipeline options, integrates seamlessly with Kubernetes and the Flink Operator, and employs custom logic to handle configuration uploads and authentication. This end-to-end workflow ensures a reliable and efficient deployment of Flink applications in Kubernetes clusters while maintaining vigilant monitoring for smooth operation. The following sequence diagram shows the steps.
+
+
+## Develop an autoscaler
+
+Having an autoscaler is critical to having a self-managed streaming service. There are not enough resources available on the internet to learn how to build your own autoscaler, which made this part of the workflow difficult.
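+
+Part 2 of this blog covers the autoscaler in depth. As a flavor of the sizing reasoning involved, a minimal backlog-based heuristic might look like the following sketch (illustrative arithmetic only, not our production logic):
+
+```java
+// Illustrative scaling heuristic: derive the desired number of task managers
+// from the observed input rate, per-worker throughput, and the backlog that
+// must be drained within a target time.
+public final class AutoscalerSketch {
+  public static int desiredTaskManagers(
+      double inputRatePerSec,      // events/sec arriving in Kafka
+      double perWorkerRatePerSec,  // events/sec one task manager can process
+      double backlogEvents,        // events currently lagging in Kafka
+      double drainTargetSec) {     // how fast we want the backlog drained
+    double required =
+        (inputRatePerSec + backlogEvents / drainTargetSec) / perWorkerRatePerSec;
+    // Never scale below one worker; round up so the SLA is not missed.
+    return (int) Math.max(1, Math.ceil(required));
+  }
+}
+```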
+
+The autoscaler scales up the number of task managers to drain the lag and to keep up with the throughput. It also scales down to the minimum number of resources required to process the incoming traffic, to reduce costs. We need to do this frequently while keeping the processing disruption to a minimum.
+
+We extensively tuned the autoscaler to meet the SLA for latency. This tuning involved a cost trade-off. We also made the autoscaler application-specific to meet specific needs for certain applications. Every decision has a hidden cost. The second part of this blog provides more details about the autoscaler.
+
+
+## Create a client library for streaming job development
+
+To deploy a job using the Flink Kubernetes Operator, you need to know how Kubernetes works. The following steps explain how to create a single Flink job.
+
+1. Define a YAML file with proper specifications. The following YAML file provides an example.
+
+```yaml
+apiVersion: flink.apache.org/v1beta1
+kind: FlinkDeployment
+metadata:
+  name: basic-reactive-example
+spec:
+  image: flink:1.13
+  flinkVersion: v1_13
+  flinkConfiguration:
+    scheduler-mode: REACTIVE
+    taskmanager.numberOfTaskSlots: "2"
+    state.savepoints.dir: file:///flink-data/savepoints
+    state.checkpoints.dir: file:///flink-data/checkpoints
+    high-availability: org.apache.flink.kubernetes.highavailability.KubernetesHaServicesFactory
+    high-availability.storageDir: file:///flink-data/ha
+  serviceAccount: flink
+  jobManager:
+    resource:
+      memory: "2048m"
+      cpu: 1
+  taskManager:
+    resource:
+      memory: "2048m"
+      cpu: 1
+  podTemplate:
+    spec:
+      containers:
+        - name: flink-main-container
+          volumeMounts:
+            - mountPath: /flink-data
+              name: flink-volume
+      volumes:
+        - name: flink-volume
+          hostPath:
+            # directory location on host
+            path: /tmp/flink
+            # this field is optional
+            type: Directory
+  job:
+    jarURI: local:///opt/flink/examples/streaming/StateMachineExample.jar
+    parallelism: 2
+    upgradeMode: savepoint
+    state: running
+    savepointTriggerNonce: 0
+  mode: standalone
+```
+
+2. SSH into your Flink cluster and run the following command:
+
+```
+kubectl create -f job1.yaml
+```
+
+3. Use the following command to check the status of the job:
+
+```
+kubectl get flinkdeployment job1
+```
+
+This process impacts our scalability. Because we frequently update our jobs, we can't manually follow these steps for every running job. Doing so would be highly error prone and time consuming. One wrong space in the YAML can cause the deployment to fail. This approach also acts as a barrier to innovation, because you need to know Kubernetes to interact with Flink jobs.
+
+We built a library to provide an interface for any teams and applications that want to start, delete, update, or get the status of their jobs (a sketch of the interface follows the task list below).
+
+This library extends the Fabric8 client and the FlinkDeployment CRD. The FlinkDeployment CRD is exposed by the Flink Kubernetes Operator. A CRD lets you store and retrieve structured data. By extending the CRD, we get access to POJOs, making it easier to manipulate the YAML file.
+
+The library supports the following tasks:
+
+1. Authentication, to ensure that you are allowed to perform actions on the Flink cluster.
+2. Validation, which fetches the template from AWS/Google Cloud Storage and validates user input against policies, rules, and the YAML format.
+3. Action execution, which converts the Java call into the corresponding Kubernetes operation.
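+
+The following is a minimal sketch of what the library's surface might look like. The wrapper class name is an illustrative assumption rather than our exact API, and the `FlinkDeployment` POJO's package varies by operator version; the key idea is that Fabric8's typed resource client turns the CRD into a plain Java object:
+
+```java
+import io.fabric8.kubernetes.client.KubernetesClient;
+// Package name varies by Flink Kubernetes Operator version.
+import org.apache.flink.kubernetes.operator.crd.FlinkDeployment;
+
+// Illustrative wrapper around Fabric8 and the FlinkDeployment CRD.
+public class FlinkJobClient {
+  private final KubernetesClient client;
+
+  public FlinkJobClient(KubernetesClient client) {
+    this.client = client;
+  }
+
+  // Reads the FlinkDeployment back as a POJO instead of raw YAML.
+  public FlinkDeployment getJob(String namespace, String jobId) {
+    return client.resources(FlinkDeployment.class)
+        .inNamespace(namespace)
+        .withName(jobId)
+        .get();
+  }
+
+  // Cancels the job by deleting its FlinkDeployment; the operator then
+  // tears down the Job Manager and Task Manager resources.
+  public void deleteJob(String namespace, String jobId) {
+    client.resources(FlinkDeployment.class)
+        .inNamespace(namespace)
+        .withName(jobId)
+        .delete();
+  }
+}
+```
+
+Starting and updating a job follow the same pattern: the library builds a `FlinkDeployment` object from the validated template and submits it through the same typed client.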
During this process, we learned the following lessons:
+
+1. App-specific operator service: At our large scale, a single operator was unable to handle such a large number of jobs. Kubernetes calls started to time out and fail. To solve this problem, we created multiple operators (about 4) in high-traffic regions to handle each application.
+2. Kube call caching: To prevent overloading, we cached the results of Kubernetes calls for thirty to sixty seconds.
+3. Label support: Providing label support to search jobs using client-specific variables reduced the load on Kube and improved the job search speed by 5x.
+
+The following are some of the biggest wins we achieved by exposing the library:
+
+1. Standardized job management: Users can start, delete, and get status updates for their Flink jobs in a Kubernetes environment using a single library.
+2. Abstracted Kubernetes complexity: Teams no longer need to worry about the inner workings of Kubernetes or how to format job deployment YAML files. The library handles these details internally.
+3. Simplified upgrades: With the underlying Kubernetes infrastructure abstracted away, the library brings robustness and fault tolerance to Flink job management, ensuring minimal downtime and efficient recovery.
+
+
+## Observability and alerting
+
+Observability is important when running a production system at a large scale. We have about 30,000 streaming jobs at PANW. Each job serves a customer for a specific application. Each job also reads data from multiple topics in Kafka, performs transformations, and then writes the data to various sinks and endpoints.
+
+Issues can occur anywhere in the pipeline or its endpoints, such as the customer API, BigQuery, and so on. We want to make sure the latency of streaming meets the SLA. Therefore, understanding whether a job is healthy and meeting its SLA, and alerting and intervening when needed, is very challenging.
+
+To achieve our operational goals, we built a sophisticated observability and alerting capability. We provide three kinds of observability and debugging tools, described in the following sections.
+
+
+### Flink job list and job insights from Prometheus and Grafana
+
+Each Flink job sends various metrics to our Prometheus with cardinality details, such as the application name, customer ID, and region, so that we can look at each job. Critical metrics include the input traffic rate, output throughput, backlogs in Kafka, timestamp-based latency, task CPU usage, task numbers, OOM counts, and so on.
+
+The following charts provide a few examples. The charts provide details about the ingestion traffic rate to Kafka for a specific customer, the streaming job’s overall throughput, each vCPU’s throughput, backlogs in Kafka, and worker autoscaling based on the observed backlog.
+
+The following chart shows streaming latency based on the timestamp watermark. In addition to the number of events in Kafka as backlogs, it is important to know the time latency for end-to-end streaming so that we can define and monitor the SLA. The latency is defined as the time taken for streaming processing, from the ingestion timestamp to the timestamp at which data is sent to the streaming endpoint. A watermark is the last processed event’s time. With the watermark, we track P100 latency. We track each event’s stream latency, so that we can understand issues in each Kafka topic and partition or in each Flink job pipeline.
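+
+As an illustration of how a per-event latency metric can be reported from Beam user code (a sketch under assumed event handling, not our exact implementation), a `DoFn` can compute the delta between the element's ingestion timestamp and the current processing time and publish it through Beam's `Metrics` API, which the Flink runner exposes to Prometheus:
+
+```java
+import org.apache.beam.sdk.metrics.Distribution;
+import org.apache.beam.sdk.metrics.Metrics;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.joda.time.Instant;
+
+// Sketch: report per-event latency from the ingestion timestamp to now.
+// Assumes the element timestamp carries the event's ingestion time.
+public class LatencyReportingFn extends DoFn<String, String> {
+  private final Distribution latencyMs =
+      Metrics.distribution(LatencyReportingFn.class, "event-latency-ms");
+
+  @ProcessElement
+  public void processElement(ProcessContext c) {
+    latencyMs.update(Instant.now().getMillis() - c.timestamp().getMillis());
+    c.output(c.element());
+  }
+}
+```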
The following example shows each event stream and its latency:
+
+
+### Flink open source UI
+
+We use and extend the Apache Flink dashboard UI to monitor jobs and tasks, such as the checkpoint duration, size, and failures. One important extension we added is a job history page that lets us see a job's start and update timeline and details, which helps us to debug issues.
+
+
+### Dashboards and alerting for backlog and latency
+
+We have about 30,000 jobs, and we want to closely monitor them and receive alerts for jobs in abnormal states so that we can intervene. We created dashboards for each application so that we can show the list of jobs with the highest latency and create thresholds for alerts. The following example shows the timestamp-based latency dashboard for one application. We can set an alert to fire if the latency stays above a threshold, such as 10 minutes, for a sustained period:
+
+The following example shows more backlog-based dashboards:
+
+The alerts are based on thresholds, and we frequently check metrics. If a threshold is met and the condition persists for a certain amount of time, we alert our internal Slack channels or PagerDuty for immediate attention. We tune the alerting so that the accuracy is high.
+
+
+## Cost optimization strategies and tuning
+
+We also moved to a self-managed streaming service to improve cost efficiency. Several minor tunings have allowed us to reduce costs by half, and we have more opportunities for improvement.
+
+The following list includes a few tips that have helped us:
+
+- Use Google Cloud Storage as checkpointing storage.
+- Reduce the write frequency to Google Cloud Storage.
+- Use appropriate machine types. For example, in Google Cloud, N2D machines are 15% less expensive than N2 machines.
+- Autoscale tasks to use optimal resources while maintaining the latency SLA.
+
+The following sections provide more details about the first two tips.
+
+### Google Cloud Storage and checkpointing
+
+We use Google Cloud Storage as our checkpoint store because it is cost-effective, scalable, and durable. When working with Google Cloud Storage, the following design considerations and best practices can help you optimize scaling and performance:
+
+* Use data partitioning methods like range partitioning, which divides data based on specific attributes, and hash partitioning, which distributes data evenly using hash functions.
+* Avoid sequential key names, especially timestamps, to avoid hotspots and uneven data distribution. Instead, introduce random prefixes for object distribution.
+* Use a hierarchical folder structure to improve data management and reduce the number of objects in a single directory.
+* Combine small files into larger ones to improve read throughput. Minimizing the number of small files reduces inefficient storage use and metadata operations.
+
+### Tune the frequency of writing to Google Cloud Storage
+
+Scaling jobs efficiently was one of our primary challenges. Stateless jobs, which are relatively simple, still present hurdles, especially in scenarios where Flink needs to coordinate an overwhelming number of workers. To overcome this challenge, we increased the `state.storage.fs.memory-threshold` setting from the default of 20 KB to 1 MB. This configuration allowed us to combine small checkpoint files into larger ones at the Job Manager level and to reduce metadata calls.
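+
+A minimal sketch of how these two settings can be applied when building the Flink environment programmatically (the bucket name is illustrative; the same keys can instead live in `flink-conf.yaml`):
+
+```java
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+
+public class CheckpointConfigSketch {
+  public static void main(String[] args) {
+    Configuration conf = new Configuration();
+    // Write checkpoints to Google Cloud Storage (illustrative bucket name).
+    conf.setString("state.checkpoints.dir", "gs://example-bucket/checkpoints");
+    // Inline checkpoint files smaller than 1 MB into the checkpoint metadata
+    // file, cutting the number of small objects and metadata calls on GCS.
+    conf.setString("state.storage.fs.memory-threshold", "1mb");
+    StreamExecutionEnvironment env =
+        StreamExecutionEnvironment.getExecutionEnvironment(conf);
+    // ... define and execute the streaming job with env ...
+  }
+}
+```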
Optimizing the performance of Google Cloud Storage operations was another challenge. Although Google Cloud Storage is excellent for streaming large amounts of data, it has limitations when it comes to handling high-frequency I/O requests. To mitigate this issue, we introduced random prefixes in key names, avoided sequential key names, and optimized our Google Cloud Storage sharding techniques. These methods significantly enhanced our Google Cloud Storage performance, enabling the smooth operation of our stateless jobs.
+
+The following chart shows the reduction in Google Cloud Storage writes after changing the memory threshold:
+
+
+# Conclusion
+
+Palo Alto Networks® Cortex Data Lake has fully migrated from the Dataflow streaming engine to a self-managed, Flink-based streaming infrastructure. We have achieved our goals: the system runs more cost-efficiently (cutting costs by more than half), and the infrastructure runs on multiple clouds, such as Google Cloud and AWS. We have learned how to build a large-scale, reliable production system based on open source software, and we see great potential to tailor the system to our specific needs, because we have the freedom to customize the open source code and configuration. The upcoming Part 2 post will provide more details about autoscaling and performance tuning. We hope our experience is helpful for readers who explore similar solutions for their own organizations.
+
+
+# Additional Resources
+
+The following links to related presentations provide further reading for readers interested in implementing similar solutions and building a self-managed streaming infrastructure.
+
+[1] Streaming framework at PANW, published at Apache Beam: [https://beam.apache.org/case-studies/paloalto/](https://beam.apache.org/case-studies/paloalto/)
+
+[2] PANW presentation at Beam Summit 2023: [https://youtu.be/IsGW8IU3NfA?feature=shared](https://youtu.be/IsGW8IU3NfA?feature=shared)
+
+[3] Benchmark presented at Beam Summit 2021: [https://2021.beamsummit.org/sessions/tpc-ds-and-apache-beam/](https://2021.beamsummit.org/sessions/tpc-ds-and-apache-beam/)
+
+[4] PANW open source contribution to Flink for GKE Auth support: [https://github.com/fabric8io/kubernetes-client/pull/4185](https://github.com/fabric8io/kubernetes-client/pull/4185)
+
+
+# Acknowledgements
+
+Building the new infrastructure and migrating applications serving a large customer base from a cloud-provider-managed streaming infrastructure to a self-managed, Flink-based infrastructure at scale was a major effort. Thanks to the Palo Alto Networks CDL streaming team who helped make this happen: Kishore Pola, Andrew Park, Hemant Kumar, Manan Mangal, Helen Jiang, Mandy Wang, Praveen Kumar Pasupuleti, JM Teo, Rishabh Kedia, Talat Uyarer, Naitk Dani, and David He.
diff --git a/website/www/site/content/en/blog/beam-2.51.0.md b/website/www/site/content/en/blog/beam-2.51.0.md
new file mode 100644
index 0000000000000..aaa4142bae625
--- /dev/null
+++ b/website/www/site/content/en/blog/beam-2.51.0.md
@@ -0,0 +1,210 @@
+---
+title: "Apache Beam 2.51.0"
+date: 2023-10-11 09:00:00 -0400
+categories:
+  - blog
+  - release
+authors:
+  - klk
+---
+
+We are happy to present the new 2.51.0 release of Beam.
+This release includes both improvements and new functionality.
+See the [download page](/get-started/downloads/#2510-2023-10-03) for this release.
+
+For more information on changes in 2.51.0, check out the [detailed release notes](https://github.com/apache/beam/milestone/15).
+
+## New Features / Improvements
+
+* In Python, [RunInference](https://beam.apache.org/documentation/sdks/python-machine-learning/#why-use-the-runinference-api) now supports loading many models in the same transform using a [KeyedModelHandler](https://beam.apache.org/documentation/sdks/python-machine-learning/#use-a-keyed-modelhandler) ([#27628](https://github.com/apache/beam/issues/27628)).
+* In Python, the [VertexAIModelHandlerJSON](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.vertex_ai_inference.html#apache_beam.ml.inference.vertex_ai_inference.VertexAIModelHandlerJSON) now supports passing in inference_args. These will be passed through to the Vertex endpoint as parameters.
+* Added support to run `mypy` on user pipelines ([#27906](https://github.com/apache/beam/issues/27906))
+
+
+## Breaking Changes
+
+* Removed fastjson library dependency for Beam SQL. Table property is changed to be based on jackson ObjectNode (Java) ([#24154](https://github.com/apache/beam/issues/24154)).
+* Removed TensorFlow from Beam Python container images [PR](https://github.com/apache/beam/pull/28424). If you have been negatively affected by this change, please comment on [#20605](https://github.com/apache/beam/issues/20605).
+* Removed the parameter `t reflect.Type` from `parquetio.Write`. The element type is derived from the input PCollection (Go) ([#28490](https://github.com/apache/beam/issues/28490))
+* Refactor BeamSqlSeekableTable.setUp adding a parameter joinSubsetType. [#28283](https://github.com/apache/beam/issues/28283)
+
+
+## Bugfixes
+
+* Fixed exception chaining issue in GCS connector (Python) ([#26769](https://github.com/apache/beam/issues/26769#issuecomment-1700422615)).
+* Fixed streaming inserts exception handling, GoogleAPICallErrors are now retried according to retry strategy and routed to failed rows where appropriate rather than causing a pipeline error (Python) ([#21080](https://github.com/apache/beam/issues/21080)).
+* Fixed a bug in Python SDK's cross-language Bigtable sink that mishandled records that don't have an explicit timestamp set: [#28632](https://github.com/apache/beam/issues/28632).
+
+
+## Security Fixes
+* Python containers updated, fixing [CVE-2021-30474](https://nvd.nist.gov/vuln/detail/CVE-2021-30474), [CVE-2021-30475](https://nvd.nist.gov/vuln/detail/CVE-2021-30475), [CVE-2021-30473](https://nvd.nist.gov/vuln/detail/CVE-2021-30473), [CVE-2020-36133](https://nvd.nist.gov/vuln/detail/CVE-2020-36133), [CVE-2020-36131](https://nvd.nist.gov/vuln/detail/CVE-2020-36131), [CVE-2020-36130](https://nvd.nist.gov/vuln/detail/CVE-2020-36130), and [CVE-2020-36135](https://nvd.nist.gov/vuln/detail/CVE-2020-36135)
+* Used go 1.21.1 to build, fixing [CVE-2023-39320](https://security-tracker.debian.org/tracker/CVE-2023-39320)
+
+
+## Known Issues
+
+* Python pipelines using BigQuery Storage Read API must pin `fastavro` dependency to 1.8.3
+  or earlier: [#28811](https://github.com/apache/beam/issues/28811)
+
+## List of Contributors
+
+According to git shortlog, the following people contributed to the 2.51.0 release. Thank you to all contributors!
+ +Adam Whitmore + +Ahmed Abualsaud + +Ahmet Altay + +Aleksandr Dudko + +Alexey Romanenko + +Anand Inguva + +Andrey Devyatkin + +Arvind Ram + +Arwin Tio + +BjornPrime + +Bruno Volpato + +Bulat + +Celeste Zeng + +Chamikara Jayalath + +Clay Johnson + +Damon + +Danny McCormick + +David Cavazos + +Dip Patel + +Hai Joey Tran + +Hao Xu + +Haruka Abe + +Jack Dingilian + +Jack McCluskey + +Jeff Kinard + +Jeffrey Kinard + +Joey Tran + +Johanna Öjeling + +Julien Tournay + +Kenneth Knowles + +Kerry Donny-Clark + +Mattie Fu + +Melissa Pashniak + +Michel Davit + +Moritz Mack + +Pranav Bhandari + +Rebecca Szper + +Reeba Qureshi + +Reuven Lax + +Ritesh Ghorse + +Robert Bradshaw + +Robert Burke + +Ruwann + +Ryan Tam + +Sam Rohde + +Sereana Seim + +Svetak Sundhar + +Tim Grein + +Udi Meiri + +Valentyn Tymofieiev + +Vitaly Terentyev + +Vlado Djerek + +Xinyu Liu + +Yi Hu + +Zbynek Konecny + +Zechen Jiang + +bzablocki + +caneff + +dependabot[bot] + +gDuperran + +gabry.wu + +johnjcasey + +kberezin-nshl + +kennknowles + +liferoad + +lostluck + +magicgoody + +martin trieu + +mosche + +olalamichelle + +tvalentyn + +xqhu + +Łukasz Spyra diff --git a/website/www/site/content/en/blog/beam-sql-with-notebooks.md b/website/www/site/content/en/blog/beam-sql-with-notebooks.md index 4f7c428613a12..d7d80f4db7f56 100644 --- a/website/www/site/content/en/blog/beam-sql-with-notebooks.md +++ b/website/www/site/content/en/blog/beam-sql-with-notebooks.md @@ -420,7 +420,7 @@ import json import requests # The covidtracking project has stopped collecting new data, current data ends on 2021-03-07 -json_current='https://covidtracking.com/api/v1/states/current.json' +json_current='https://api.covidtracking.com/v1/states/current.json' def get_json_data(url): with requests.Session() as session: diff --git a/website/www/site/content/en/blog/beamquest.md b/website/www/site/content/en/blog/beamquest.md index eea893bf82274..dde6376b40771 100644 --- a/website/www/site/content/en/blog/beamquest.md +++ b/website/www/site/content/en/blog/beamquest.md @@ -34,6 +34,6 @@ Individuals aren’t the only ones who can benefit from completing this quest - Data Processing is a key part of AI/ML workflows. Given the recent advancements in artificial intelligence, now’s the time to jump into the world of data processing! Get started on your journey [here](https://www.cloudskillsboost.google/quests/310). -We are currently offering this quest **FREE OF CHARGE** until **July 8, 2023** for the **first 2,000** people. To obtain your badge for **FREE**, use the [Access Code](https://www.cloudskillsboost.google/catalog?qlcampaign=1h-swiss-19), create an account, and search ["Getting Started with Apache Beam"](https://www.cloudskillsboost.google/quests/310). +We are currently offering this quest **FREE OF CHARGE**. To obtain your badge for **FREE**, use the [Access Code](https://www.cloudskillsboost.google/catalog?qlcampaign=1h-swiss-19), create an account, and search ["Getting Started with Apache Beam"](https://www.cloudskillsboost.google/quests/310). If the code does not work, please email [dev@beam.apache.org](dev@beam.apache.org) to obtain a free code. PS: Once you earn your badge, please [share it on social media](https://support.google.com/qwiklabs/answer/9222527?hl=en&sjid=14905615709060962899-NA)! 
diff --git a/website/www/site/content/en/blog/contributor-spotlight-johanna-ojeling.md b/website/www/site/content/en/blog/contributor-spotlight-johanna-ojeling.md
new file mode 100644
index 0000000000000..717f591eca927
--- /dev/null
+++ b/website/www/site/content/en/blog/contributor-spotlight-johanna-ojeling.md
@@ -0,0 +1,63 @@
+---
+title: "Contributor Spotlight: Johanna Öjeling"
+date: 2023-11-11 15:00:00 -0800
+categories:
+  - blog
+authors:
+  - altay
+---
+
+Johanna Öjeling is a Senior Software Engineer at [Normative](https://normative.io/). She started using Apache Beam in 2020 at her previous company [Datatonic](http://datatonic.com) and began contributing in 2022 in a personal capacity. We interviewed Johanna to learn more about her interests, and we hope that her story will inspire a new and diverse set of future contributors to participate in OSS projects.
+
+**What areas of interest are you passionate about in your career?**
+
+My core interest lies in distributed and data-intensive systems, and I enjoy working on challenges related to performance, scalability and maintainability. I also feel strongly about developer experience, and like to build tools and frameworks that make developers happier and more productive. Aside from that, I take pleasure in mentoring and coaching other software engineers to grow their skills and pursue a fulfilling career.
+
+**What motivated you to make your first contribution?**
+
+I was already a user of the Apache Beam Java and Python SDKs and Google Cloud Dataflow in my previous job, and had started to play around with the Go SDK to learn Go. When I noticed that a feature I wanted was missing, it seemed like a great opportunity to implement it. I had been curious about developing open source software for some time, but did not have a good idea until then of what to contribute.
+
+**In which way have you contributed to Apache Beam?**
+
+I have primarily worked on the Go SDK with implementation of new features, bug fixes, tests, documentation and code reviews. Some examples include a MongoDB I/O connector with dynamically scalable reads and writes, a file I/O connector supporting continuous file discovery, and an Amazon S3 file system implementation.
+
+**How has your open source engagement impacted your personal or professional growth?**
+
+Contributing to open source is one of the best decisions I have made professionally. The Beam community has been incredibly welcoming and appreciative, and it has been rewarding to collaborate with talented people around the world to create software that is free for anyone to benefit from. Open source has opened up new opportunities to challenge myself, dive deeper into technologies I like, and learn from highly skilled professionals. To me, it has served as an outlet for creativity, problem solving and purposeful work.
+
+**How have you noticed contributing to open source is different from contributing to closed source/proprietary software?**
+
+My observation has been that there are higher requirements for software quality in open source, and it is more important to get things right the first time. My closed source software experience is from startups/scale-ups where speed is prioritized. When not working on public-facing APIs or libraries, one can also more easily change things, whereas we need to be mindful about breaking changes in Beam. I care about software quality and value the high standards the Beam committers hold.
+ +**What do you like to do with your spare time when you're not contributing to Beam?** + +Coding is a passion of mine so I tend to spend a lot of my free time on hobby projects, reading books and articles, listening to talks and attending events. When I was younger I loved learning foreign languages and studied English, French, German and Spanish. Later I discovered an interest in computer science and switched focus to programming languages. I decided to change careers to software engineering and have tried to learn as much as possible ever since. I love that it never ends. + +**What future features/improvements are you most excited about, or would you like to see on Beam?** + +The multi-language pipeline support is an impressive feature of Beam, and I like that new SDKs such as TypeScript and Swift are emerging, which enables developers to write pipelines in their preferred language. Naturally, I am also excited to see where the Go SDK is headed and how we can make use of newer features of the Go language. + +**What types of contributions or support do you think the Beam community needs more of?** + +Many data and machine learning engineers feel more comfortable with Python than Java and wish the Python SDK were as feature rich as the Java SDK. This presents great opportunities for Python developers to start contributing to Beam. As an SDK author, one can take advantage of Beam's multiple SDKs. When I have developed in Go I have often studied the Java and Python implementations to get ideas for how to solve specific problems and make sure the Go SDK follows a similar pattern. + +**What advice would you give to someone who wants to contribute but does not know where to begin?** + +Start with asking yourself what prior knowledge you have and what you would like to learn, then look for opportunities that match that. The contribution guidelines will tell you where to find open issues and what the process looks like. There are tasks labeled as "good first issue" which can be a good starting point. I was quite nervous about making my first contribution and had my mentor pre-review my PR. There was no need to worry though, as people will be grateful for your effort to improve the project. The pride I felt when a committer approved my PR and welcomed me to Beam is something I still remember. + +**What advice would you give to the Beam community? What could we improve?** + +We can make it easier for new community members to get involved by providing more examples of tasks that we need help with, both in the form of code and non-code contributions. I will take it as an action point myself to label more issues accordingly and tailor the descriptions for newcomers. However, this is contingent on community members visiting the GitHub project. To address this, we could also proactively promote opportunities through social channels and the user mailing list. + +*We thank Johanna for the interview and for her contributions! 
If you would like to learn more about contributing to Beam, you can find more information here: https://beam.apache.org/contribute/.*
diff --git a/website/www/site/content/en/blog/dyi-content-discovery-platform-genai-beam.md b/website/www/site/content/en/blog/dyi-content-discovery-platform-genai-beam.md
index 8057374591d73..fd967e318a070 100644
--- a/website/www/site/content/en/blog/dyi-content-discovery-platform-genai-beam.md
+++ b/website/www/site/content/en/blog/dyi-content-discovery-platform-genai-beam.md
@@ -1,7 +1,7 @@
 ---
 layout: post
 title: "DIY GenAI Content Discovery Platform with Apache Beam"
-date: 2023-09-27 00:00:01 -0800
+date: 2023-10-02 00:00:01 -0800
 categories:
   - blog
 authors:
diff --git a/website/www/site/content/en/blog/validate-beam-release.md b/website/www/site/content/en/blog/validate-beam-release.md
index 60d5ddb0a791c..e4335530cfe36 100644
--- a/website/www/site/content/en/blog/validate-beam-release.md
+++ b/website/www/site/content/en/blog/validate-beam-release.md
@@ -112,10 +112,10 @@ works well.
 For Go SDK releases, you can fetch the Go SDK RC using [`go get`](https://golang.org/ref/mod#go-get),
 by requesting the specific pre-release version.
 
-For example, to request the first release candidate for 2.34.0:
+For example, to request the first release candidate for 2.44.0:
 
 ```
-go get -d github.com/apache/beam/sdks/v2@v2.34.0-RC1
+go get -d github.com/apache/beam/sdks/v2@v2.44.0-RC1
 ```
 
 With that, the Beam version in your `go.mod` will be the specified release candidate.
@@ -123,4 +123,4 @@ You can go ahead and run your tests to verify that everything
 works well. You may need to also specify the RC's matching container when running a job.
 Use the `--environment_config` flag to specify the release candidate container:
 
-eg. `--environment_config=apache/beam_go_sdk:2.34.0_rc1`
+eg. `--environment_config=apache/beam_go_sdk:2.44.0rc1`
diff --git a/website/www/site/content/en/case-studies/linkedin.md b/website/www/site/content/en/case-studies/linkedin.md
index 5ed8b65a55787..b16c1f162157f 100644
--- a/website/www/site/content/en/case-studies/linkedin.md
+++ b/website/www/site/content/en/case-studies/linkedin.md
@@ -1,8 +1,17 @@
 ---
-title: "Linkedin"
-icon: /images/logos/powered-by/linkedin.png
-hasNav: true
-hasLink: "https://www.youtube.com/watch?v=rBfwjbrMJTE&list=PL4dEBWmGSIU9OkXQU2OAXmITPLhiMSPRp&index=33"
+title: "Revolutionizing Real-Time Stream Processing: 4 Trillion Events Daily at LinkedIn"
+name: "LinkedIn"
+icon: "/images/logos/powered-by/linkedin.png"
+category: "study"
+cardTitle: "Revolutionizing Real-Time Stream Processing: 4 Trillion Events Daily at LinkedIn"
+cardDescription: "Apache Beam serves as the backbone of LinkedIn's streaming infrastructure, handling the near real-time processing of an astounding 4 trillion events daily through 3,000+ pipelines and thus powering personalized experiences for LinkedIn’s vast network of over 950 million members worldwide. The adoption of Apache Beam brought about a series of impressive enhancements, including 2x cost optimization depending on the use case, a dramatic acceleration from days to minutes in labeling abuse, and a more than 6% improvement in detecting logged-in scraping profiles."
+authorName: "Bingfeng Xia"
+coauthorName: "Xinyu Liu"
+authorPosition: "Engineering Manager @LinkedIn"
+coauthorPosition: "Senior Staff Engineer @LinkedIn"
+authorImg: /images/case-study/linkedin/bingfeng-xia.jpg
+coauthorImg: /images/case-study/linkedin/xinyu-liu.jpg
+publishDate: 2023-08-10T00:12:00+00:00
 ---
+

    +
    + +
    +
    +

    + “Apache Beam empowers LinkedIn to create timely recommendations and personalized experiences by leveraging the freshest data and processing it in real-time, ultimately benefiting LinkedIn's vast network of over 950 million members worldwide.” +

    +
    +
    + +
    +
    +
    + Bingfeng Xia +
    +
    + Engineering Manager @LinkedIn +
    +
    +
    +
    +
    +
+
+# Revolutionizing Real-Time Stream Processing: 4 Trillion Events Daily at LinkedIn
+
+## Background
+
+At LinkedIn, Apache Beam plays a pivotal role in stream processing infrastructures that process over 4 trillion events daily through more than 3,000 pipelines across multiple production data centers. This robust framework empowers near real-time data processing for critical services and platforms, ranging from machine learning and notifications to anti-abuse AI modeling. With over 950 million members, ensuring that our platform runs smoothly is critical to connecting members to opportunities worldwide.
+
+In this case study, LinkedIn's Bingfeng Xia, Engineering Manager, and Xinyu Liu, Senior Staff Engineer, shed light on how the Apache Beam programming model's unified, portable, and user-friendly data processing framework has enabled a multitude of sophisticated use cases and revolutionized Stream Processing at LinkedIn. This technology has [optimized cost-to-serve by 2x](https://engineering.linkedin.com/blog/2023/unified-streaming-and-batch-pipelines-at-linkedin--reducing-proc) by unifying stream and batch processing through Apache Samza and Apache Spark runners, enabled real-time ML feature generation, reduced time-to-production for new pipelines from months to days, allowed for processing time-series events at over 3 million queries per second, and more. For our members, this means that we’re able to serve more accurate job recommendations, improve feed recommendations, and identify fake profiles at a faster rate.
+
+
+## LinkedIn Open-Source Ecosystem and Journey to Beam
+
+LinkedIn has a rich history of actively contributing to the open-source community, demonstrating its commitment by creating, managing, and utilizing various open-source software projects. The LinkedIn engineering team has [open-sourced over 75 projects](https://engineering.linkedin.com/content/engineering/en-us/open-source) across multiple categories, with several gaining widespread adoption and becoming part of [the Apache Software Foundation](https://www.apache.org/).
+
+To enable the ingestion and real-time processing of enormous volumes of data, LinkedIn built a custom stream processing ecosystem largely with tools developed in-house (and subsequently open-sourced). In 2010, they introduced [Apache Kafka](https://kafka.apache.org/), a pivotal Big Data ingestion backbone for LinkedIn’s real-time infrastructure. To transition from batch-oriented processing and respond to Kafka events within minutes or seconds, they built an in-house distributed event streaming framework, [Apache Samza](https://samza.apache.org/). This framework, along with Apache Spark for batch processing, formed the basis of LinkedIn’s [lambda architecture](https://en.wikipedia.org/wiki/Lambda_architecture) for data processing jobs. Over time, LinkedIn's engineering team expanded the stream processing ecosystem with more proprietary tools like [Brooklin](https://github.com/linkedin/Brooklin/), facilitating data streaming across multiple stores and messaging systems, and [Venice](https://github.com/linkedin/venice), serving as a storage system for ingesting batch and stream processing job outputs, among others.
+
+Though the stream processing ecosystem with Apache Samza at its core enabled large-scale stateful data processing, LinkedIn’s ever-evolving demands required higher scalability and efficiency, as well as lower latency for the streaming pipelines.
The lambda architecture approach led to operational complexity and inefficiencies, because it required maintaining two different codebases and two different engines for batch and streaming data. To address these challenges, data engineers sought a higher level of stream processing abstraction and out-of-the-box support for advanced aggregations and transformations. Additionally, they needed the ability to experiment with streaming pipelines in batch mode. There was also a growing need for multi-language support within the predominantly Java-based teams, due to emerging machine learning use cases that required Python.
+
+The release of [Apache Beam](/about/) in 2016 proved to be a game-changer for LinkedIn. Apache Beam offers an open-source, advanced unified programming model for both batch and stream processing, making it possible to create a large-scale common data infrastructure across various applications. With support for Python, Go, and Java SDKs and a rich, versatile API layer, Apache Beam provided the ideal solution for building sophisticated multi-language pipelines and running them on any engine.
+
    +

    + When we started looking at Apache Beam, we realized it was a very attractive data processing framework for LinkedIn’s demands: not only does it provide an advanced API, but it also allows for converging stream and batch processing and multi-language support. Everything we were looking for and out-of-the-box. +

    +
    +
    + +
    +
    +
    + Xinyu Liu +
    +
    + Senior Staff Engineer @LinkedIn +
    +
    +
    +
    + +Recognizing the advantages of Apache Beam's unified data processing API, advanced capabilities, and multi-language support, LinkedIn began onboarding its first use cases and developed the [Apache Samza runner for Beam](/documentation/runners/samza/) in 2018. By 2019, Apache Beam pipelines were powering several critical use cases, and the programming model and framework saw extensive adoption across LinkedIn teams. Xinyu Liu showcased the benefits of migrating to Apache Beam pipelines during [Beam Summit Europe 2019](https://www.youtube.com/watch?v=uQcpr34RUKY&t=1694s). + +
    + + scheme + +
    + +## Apache Beam Use Cases at LinkedIn + +### Unified Streaming And Batch Pipelines + +Some of the first use cases that LinkedIn migrated to Apache Beam pipelines involved both real-time computations and periodic backfilling. One example was LinkedIn's standardization process. Standardization consists of a series of pipelines that use complex AI models to map LinkedIn user inputs, such as job titles, skills, or education history, into predefined internal IDs. For example, a LinkedIn member who lists their current position as "Chief Data Scientist" has their job title standardized for relevant job recommendations. + + +LinkedIn's standardization process requires both real-time processing to reflect immediate user updates and periodic backfilling to refresh data when new AI models are introduced. Before adopting Apache Beam, running backfilling as a streaming job required over 5,000 GB-hours in memory and nearly 4,000 hours in total CPU time. This heavy load led to extended backfilling times and scaling issues, causing the backfilling pipeline to act as a "noisy neighbor" to colocated streaming pipelines and failing to meet latency and throughput requirements. Although LinkedIn engineers considered migrating the backfilling logic to a batch Spark pipeline, they abandoned the idea due to the unnecessary overhead of maintaining two different codebases. + +
    +

    + We came to the question: is it possible to only maintain one codebase but with the ability to run it as either a batch job or streaming job? The unified Apache Beam model was the solution. +

    +
    +
    + +
    +
    +
    + Bingfeng Xia +
    +
    + Engineering Manager @LinkedIn +
    +
    +
    +
+
+The Apache Beam APIs enabled LinkedIn engineers to implement business logic once within a unified Apache Beam pipeline that efficiently handles both real-time standardization and backfilling. Apache Beam offers [PipelineOptions](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/options/PipelineOptions.html), enabling the configuration and customization of various aspects, such as the pipeline runner and runner-specific configurations. The extensibility of Apache Beam transforms allowed LinkedIn to [create a custom composite transform](https://beam.apache.org/documentation/programming-guide/#composite-transforms) to abstract away I/O differences and switch target processing on the fly based on data source type (bounded or unbounded). In addition, Apache Beam’s abstraction of the underlying infrastructure and the ability to "write once, run anywhere" empowered LinkedIn to seamlessly switch between data processing engines. Depending on the target processing type (streaming or batch), the unified Apache Beam standardization pipeline can be deployed through the Samza cluster as a streaming job or through the Spark cluster as a batch backfilling job.
+
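+As a simplified sketch of that pattern (illustrative only, not LinkedIn's actual code), a composite transform can hide whether events come from an unbounded Kafka topic or a bounded backfill snapshot, so everything downstream stays identical in both modes:
+
+```java
+import org.apache.beam.sdk.io.TextIO;
+import org.apache.beam.sdk.io.kafka.KafkaIO;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.Values;
+import org.apache.beam.sdk.values.PBegin;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.kafka.common.serialization.StringDeserializer;
+
+// Illustrative source-abstracting composite transform. The broker address,
+// topic, and snapshot path are placeholder values.
+class ReadMemberEvents extends PTransform<PBegin, PCollection<String>> {
+  private final boolean streaming;
+
+  ReadMemberEvents(boolean streaming) {
+    this.streaming = streaming;
+  }
+
+  @Override
+  public PCollection<String> expand(PBegin input) {
+    if (streaming) {
+      // Unbounded source for the real-time standardization job.
+      return input
+          .apply(KafkaIO.<String, String>read()
+              .withBootstrapServers("kafka:9092")
+              .withTopic("member-events")
+              .withKeyDeserializer(StringDeserializer.class)
+              .withValueDeserializer(StringDeserializer.class)
+              .withoutMetadata())
+          .apply(Values.create());
+    }
+    // Bounded source for the weekly backfilling job.
+    return input.apply(TextIO.read().from("gs://example/member-events/*.json"));
+  }
+}
+```
+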
    + + scheme + +
+
+Hundreds of streaming Apache Beam jobs now power real-time standardization, listening to events 24/7, enriching streams with additional data from remote tables, performing necessary processing, and writing results to output databases. The batch Apache Beam backfilling job runs weekly, effectively handling 950 million member profiles at a rate of over 40,000 profiles per second. Apache Beam feeds data points into sophisticated AI and machine learning models and joins complex data, such as job types and work experiences, to standardize user data for search indexing or for running recommendation models.
+
+The migration of backfilling logic to a unified Apache Beam pipeline and its execution in batch mode resulted in a significant 50% improvement in memory and CPU usage efficiency (from ~5000 GB-hours and ~4000 CPU hours to ~2000 GB-hours and ~1700 CPU hours) and an impressive 94% acceleration in processing time (from 7.5 hours to 25 minutes). More details about this use case can be found on [LinkedIn’s engineering blog](https://engineering.linkedin.com/blog/2023/unified-streaming-and-batch-pipelines-at-linkedin--reducing-proc).
+
+### Anti-Abuse & Near Real-Time AI Modeling
+
+LinkedIn is firmly committed to creating a trusted environment for its members, and this dedication extends to safeguarding against various types of abuse on the platform. To achieve this, the Anti-Abuse AI Team at LinkedIn plays a crucial role in creating, deploying, and maintaining AI and deep learning models that can detect and prevent different forms of abuse, such as fake account creation, member profile scraping, automated spam, and account takeovers.
+
+Apache Beam fortifies LinkedIn’s internal anti-abuse platform, Chronos, enabling abuse detection and prevention in near real-time. Chronos relies on two streaming Apache Beam pipelines: the Filter pipeline and the Model pipeline. The Filter pipeline reads user activity events from Kafka, extracts relevant fields, aggregates and filters the events, and then generates filtered Kafka messages for downstream AI processing. Subsequently, the Model pipeline consumes these filtered messages, aggregates member activity within specific time windows, triggers AI scoring models, and writes the resulting abuse scores to various internal applications, services, and stores for offline processing.
+
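+A condensed sketch of the windowed-aggregation step that such a Model pipeline performs (illustrative, not LinkedIn's implementation): activity events keyed by member ID are counted over fixed windows before being handed to the scoring model. The five-minute window size is an assumption for illustration.
+
+```java
+import org.apache.beam.sdk.transforms.Count;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.windowing.FixedWindows;
+import org.apache.beam.sdk.transforms.windowing.Window;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+import org.joda.time.Duration;
+
+// Illustrative: count activity events per member ID in 5-minute windows.
+class WindowedActivityCounts
+    extends PTransform<PCollection<KV<String, String>>, PCollection<KV<String, Long>>> {
+
+  @Override
+  public PCollection<KV<String, Long>> expand(PCollection<KV<String, String>> events) {
+    return events
+        .apply(Window.<KV<String, String>>into(FixedWindows.of(Duration.standardMinutes(5))))
+        .apply(Count.perKey());
+  }
+}
+```
+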
    + + scheme + +
+
+The flexibility of Apache Beam's pluggable architecture and the availability of various I/O options seamlessly integrated the anti-abuse pipelines with Kafka and key-value stores. LinkedIn has dramatically reduced the time it takes to label abusive actions, cutting it down from 1 day to just 5 minutes, and processes time-series events at an impressive rate of over 3 million queries per second. Apache Beam empowered near real-time processing, significantly bolstering LinkedIn's anti-abuse defenses. The nearline defenses are able to catch scrapers within minutes after they start to scrape, which leads to a more than 6% improvement in detecting logged-in scraping profiles.
+
    +

+ Apache Beam enabled revolutionary, phenomenal performance improvements - the anti-abuse processing accelerated from 1 day to 5 minutes. We have seen more than 6% improvement in detecting logged-in scraping profiles.
+

    +
    +
    + +
    +
    +
    + Xinyu Liu +
    +
    + Senior Staff Engineer @LinkedIn +
    +
    +
    +
    + +### Notifications Platform + +As a social media network, LinkedIn heavily relies on instant notifications to drive member engagement. To achieve this, Apache Beam and Apache Samza together power LinkedIn’s large-scale Notifications Platform that generates notification content, pinpoints the target audience, and ensures the timely and relevant distribution of content. + +The streaming Apache Beam pipelines have intricate business logic and handle enormous volumes of data in a near real-time fashion. The pipelines consume, aggregate, partition, and process events from over 950 million LinkedIn members and feed the data to downstream machine learning models. The ML models perform distributed targeting and scalable scoring on the order of millions of candidate notifications per second based on the recipient member’s historical actions and make personalized decisions for the recipient for each notification on the fly. As a result, LinkedIn members receive timely, relevant, and actionable activity-based notifications, such as connection invites, job recommendations, daily news digests, and other activities within their social network, through the right channels. + +The advanced Apache Beam API offers complex aggregation and filtering capabilities out-of-the-box, and its programming model allows for the creation of reusable components. These features enable LinkedIn to expedite development and streamline the scaling of the Notifications platform as they transition more notification use cases from Samza to Beam pipelines. + +
    +

    + LinkedIn’s user engagement is greatly driven by how timely we can send relevant notifications. Apache Beam enabled a scalable, near real-time infrastructure behind this business-critical use case. +

    +
    +
    + +
    +
    +
    + Bingfeng Xia +
    +
    + Engineering Manager @LinkedIn +
    +
    +
    +
    + +### Real-Time ML Feature Generation + +LinkedIn's core functionalities, such as job recommendations and search feed, heavily rely on ML models that consume thousands of features related to various entities like companies, job postings, and members. However, before the adoption of Apache Beam, the original offline ML feature generation pipeline suffered from a delay of 24 to 48 hours between member actions and the impact of those actions on the recommendation system. This delay resulted in missed opportunities, because the system lacked sufficient data about infrequent members and failed to capture the short-term intent and preferences of frequent members. In response to the growing demand for a scalable, real-time ML feature generation platform, LinkedIn turned to Apache Beam to address the challenge. + +Using Managed Beam as the foundation, LinkedIn developed a hosted platform for ML feature generation. The ML platform provides AI engineers with real-time features and an efficient pipeline authoring experience, all while abstracting away deployment and operational complexities. AI engineers create feature definitions and deploy them using Managed Beam. When LinkedIn members take actions on the platform, the streaming Apache Beam pipeline generates fresher machine learning features by filtering, processing, and aggregating the events emitted to Kafka in real-time and writes them to the feature store. Additionally, LinkedIn introduced other Apache Beam pipelines responsible for retrieving the data from the feature store, processing it, and feeding it into the recommendation system. + +
    + + scheme + +
    + +The powerful Apache Beam Stream Processing platform played a pivotal role in eliminating the delay between member actions and data availability, achieving an impressive end-to-end pipeline latency of just a few seconds. This significant improvement allowed LinkedIn's ML models to take advantage of up-to-date information and deliver more personalized and timely recommendations to our members, leading to significant gains in business metrics. + +### Managed Stream Processing Platform + +As LinkedIn's data infrastructure grew to encompass over 3,000 Apache Beam pipelines, catering to a diverse range of business use cases, LinkedIn's AI and data engineering teams found themselves overwhelmed with managing these streaming applications 24/7. The AI engineers encountered several technical challenges while creating new pipelines, including the intricacy of integrating multiple streaming tools and infrastructures into their frameworks, and limited knowledge of the underlying infrastructure when it came to deployment, monitoring, and operations. These challenges led to a time-consuming pipeline development cycle, often lasting one to two months. Apache Beam enabled LinkedIn to create Managed Beam, a managed Stream Processing platform that is designed to streamline and automate internal processes. This platform makes it easier and faster for teams to develop and operate sophisticated streaming applications while reducing the burden of on-call support. + +
    + + scheme + +
    + +The Apache Beam SDK empowered LinkedIn engineers to create custom workflow components as reusable sub-DAGs (Directed Acyclic Graphs) and expose them as standard PTransforms. These PTransforms serve as ready-to-use building blocks for new pipelines, significantly speeding up the authoring and testing process for LinkedIn AI engineers. By abstracting the low-level details of underlying engines and runtime environments, Apache Beam allows engineers to focus solely on business logic, further accelerating time to development. + +When the pipelines are ready for deployment, Managed Beam's central control plane comes into play, providing essential features like a deployment UI, operational dashboard, administrative tools, and automated pipeline lifecycle management. + +Apache Beam's abstraction facilitated the isolation of user code from framework evolution during build, deployment, and runtime. To ensure the separation of runner processes from user-defined functions (UDFs), Managed Beam packages the pipeline business logic and the framework logic as two separate JAR files: framework-less artifacts and framework artifacts. During pipeline execution on a YARN cluster, these pipeline artifacts run in a Samza container as two distinct processes, communicating through gRPC. This setup enabled LinkedIn to take advantage of automated framework upgrades, scalable UDF execution, log separation for easier troubleshooting, and multi-language APIs, fostering flexibility and efficiency. + +
    + + scheme + +
    + +Apache Beam also underpinned Managed Beam's autosizing controller tool, which automates hardware resource tuning and provides auto-remediation for streaming pipelines. Streaming Apache Beam pipelines self-report diagnostic information, such as metrics and key deployment logs, in the form of Kafka topics. Additionally, LinkedIn's internal monitoring tools report runtime errors, such as heartbeat failures, out-of-memory events, and processing lags. The Apache Beam diagnostics processor pipeline aggregates, repartitions, and windows these diagnostic events before passing them to the autosizing controller and writing them to Apache Pinot, LinkedIn's OLAP store for Managed Beam's operational and analytics dashboards. Based on the pre-processed and time-windowed diagnostic data, the autosizing controller generates sizing actions or restarting actions, and then forwards them to the Managed Beam control plane. The Managed Beam control plane then scales LinkedIn's streaming applications and clusters. + +
    +

    + Apache Beam helped streamline operations management and enabled fully-automated autoscaling, significantly reducing the time to onboard new applications. Previously, onboarding required a lot of manual 'trial and error' iterations and deep knowledge of the internal system and metrics. +

    +
    +
    + +
    +
    +
    + Bingfeng Xia +
    +
    + Engineering Manager @LinkedIn +
    +
    +
    +
+
+The extensibility, pluggability, portability, and abstraction of Apache Beam formed the backbone of LinkedIn's Managed Beam platform. The Managed Beam platform accelerated the time to author, test, and stabilize streaming pipelines from months to days, facilitated fast experimentation, and almost entirely eliminated operational costs for AI engineers.
+
+## Summary
+
+Apache Beam played a pivotal role in revolutionizing and scaling LinkedIn's data infrastructure. Beam's powerful streaming capabilities enable real-time processing for critical business use cases, at a scale of over 4 trillion events daily through more than 3,000 pipelines.
+
+The versatility of Apache Beam empowered LinkedIn’s engineering teams to optimize their data processing for various business use cases:
+- Apache Beam's unified and portable framework allowed LinkedIn to consolidate streaming and batch processing into unified pipelines. These unified pipelines resulted in a 2x optimization in cost-to-serve, a 2x improvement in processing performance, and a 2x improvement in memory and CPU usage efficiency.
+- LinkedIn's anti-abuse platform leveraged Apache Beam to process user activity events from Kafka in near-real-time, achieving a remarkable acceleration from days to minutes in labeling abusive actions. The nearline defenses are able to catch scrapers within minutes after they start to scrape, which leads to a more than 6% improvement in detecting logged-in scraping profiles.
+- By adopting Apache Beam, LinkedIn was able to transition from an offline ML feature generation pipeline with a 24- to 48-hour delay to a real-time platform with an end-to-end pipeline latency at the millisecond or second level.
+- Apache Beam’s abstraction and powerful programming model enabled LinkedIn to create a fully managed stream processing platform, thus facilitating easier authoring, testing, and deployment and accelerating time-to-production for new pipelines from months to days.
+
+Apache Beam boasts seamless plug-and-play capabilities, integrating smoothly with Apache Kafka, Apache Pinot, and other core technologies at LinkedIn, all while ensuring optimal performance at scale. As LinkedIn continues experimenting with new engines and tooling, the Apache Beam portability future-proofs our ecosystem against any changes in the underlying infrastructure.
+
    +

    + By enabling a scalable, near real-time infrastructure behind business-critical use cases, Apache Beam empowers LinkedIn to leverage the freshest data and process it in real-time to create timely recommendations and personalized experiences, ultimately benefiting LinkedIn's vast network of over 950 million members worldwide. +

+ [case-study quote-block markup lost in extraction]
+ Xinyu Liu
+ Senior Staff Engineer @LinkedIn

    + +{{< case_study_feedback "LinkedIn" >}} + +
    +
    diff --git a/website/www/site/content/en/documentation/dsls/dataframes/overview.md b/website/www/site/content/en/documentation/dsls/dataframes/overview.md index fa1ab0169f678..bc04c3cc6f7a2 100644 --- a/website/www/site/content/en/documentation/dsls/dataframes/overview.md +++ b/website/www/site/content/en/documentation/dsls/dataframes/overview.md @@ -18,7 +18,7 @@ limitations under the License. # Beam DataFrames overview -{{< button-colab url="https://colab.research.google.com/github/apache/beam/blob/master/examples/notebooks/tour-of-beam/dataframes.ipynb" >}} +{{< button-colab url="https://colab.research.google.com/github/apache/beam/blob/master/examples/notebooks/interactive-overview/dataframes.ipynb" >}} The Apache Beam Python SDK provides a DataFrame API for working with pandas-like [DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) objects. The feature lets you convert a PCollection to a DataFrame and then interact with the DataFrame using the standard methods available on the pandas DataFrame API. The DataFrame API is built on top of the pandas implementation, and pandas DataFrame methods are invoked on subsets of the datasets in parallel. The big difference between Beam DataFrames and pandas DataFrames is that operations are deferred by the Beam API, to support the Beam parallel processing model. (To learn more about differences between the DataFrame implementations, see [Differences from pandas](/documentation/dsls/dataframes/differences-from-pandas/).) @@ -107,4 +107,3 @@ pc1, pc2 = {'a': pc} | DataframeTransform(lambda a: expr1, expr2) [pydoc_to_dataframe]: https://beam.apache.org/releases/pydoc/current/apache_beam.dataframe.convert.html#apache_beam.dataframe.convert.to_dataframe [pydoc_to_pcollection]: https://beam.apache.org/releases/pydoc/current/apache_beam.dataframe.convert.html#apache_beam.dataframe.convert.to_pcollection -{{< button-colab url="https://colab.research.google.com/github/apache/beam/blob/master/examples/notebooks/tour-of-beam/dataframes.ipynb" >}} diff --git a/website/www/site/content/en/documentation/io/built-in/google-bigquery.md b/website/www/site/content/en/documentation/io/built-in/google-bigquery.md index 7a31b63a3c96e..769b057413450 100644 --- a/website/www/site/content/en/documentation/io/built-in/google-bigquery.md +++ b/website/www/site/content/en/documentation/io/built-in/google-bigquery.md @@ -261,7 +261,7 @@ BigQuery's exported JSON format. {{< paragraph class="language-py" >}} ***Note:*** `BigQuerySource()` is deprecated as of Beam SDK 2.25.0. Before 2.25.0, to read from -a BigQuery table using the Beam SDK, you will apply a `Read` transform on a `BigQuerySource`. For example, +a BigQuery table using the Beam SDK, apply a `Read` transform on a `BigQuerySource`. For example, `beam.io.Read(beam.io.BigQuerySource(table_spec))`. {{< /paragraph >}} @@ -397,8 +397,8 @@ for the destination table(s): whether the destination table must exist or can be created by the write operation. * The destination table's write disposition. The write disposition specifies - whether the data you write will replace an existing table, append rows to an - existing table, or write only to an empty table. + whether the data you write replaces an existing table, appends rows to an + existing table, or writes only to an empty table. In addition, if your write operation creates a new BigQuery table, you must also supply a table schema for the destination table. 
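To make the create and write dispositions concrete, a minimal Python sketch (illustrative only; the table spec is a placeholder, and the string schema uses the simple `'name:TYPE'` form) might look like this:

```python
import apache_beam as beam

quotes | beam.io.WriteToBigQuery(
    'my-project:my_dataset.my_table',      # placeholder table spec
    schema='source:STRING, quote:STRING',  # needed if the table may be created
    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
```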
@@ -512,7 +512,7 @@ use a string that contains a JSON-serialized `TableSchema` object. To create a table schema in Python, you can either use a `TableSchema` object, or use a string that defines a list of fields. Single string based schemas do not support nested fields, repeated fields, or specifying a BigQuery mode for -fields (the mode will always be set to `NULLABLE`). +fields (the mode is always set to `NULLABLE`). {{< /paragraph >}} #### Using a TableSchema @@ -539,7 +539,7 @@ To create and use a table schema as a `TableSchema` object, follow these steps. 2. Create and append a `TableFieldSchema` object for each field in your table. -3. Next, use the `schema` parameter to provide your table schema when you apply +3. Use the `schema` parameter to provide your table schema when you apply a write transform. Set the parameter’s value to the `TableSchema` object. {{< /paragraph >}} @@ -728,8 +728,8 @@ The following examples use this `PCollection` that contains quotes. The `writeTableRows` method writes a `PCollection` of BigQuery `TableRow` objects to a BigQuery table. Each element in the `PCollection` represents a single row in the table. This example uses `writeTableRows` to write elements to a -`PCollection`. The write operation creates a table if needed; if the -table already exists, it will be replaced. +`PCollection`. The write operation creates a table if needed. If the +table already exists, it is replaced. {{< /paragraph >}} {{< highlight java >}} @@ -745,7 +745,7 @@ table already exists, it will be replaced. {{< paragraph class="language-py" >}} The following example code shows how to apply a `WriteToBigQuery` transform to write a `PCollection` of dictionaries to a BigQuery table. The write operation -creates a table if needed; if the table already exists, it will be replaced. +creates a table if needed. If the table already exists, it is replaced. {{< /paragraph >}} {{< highlight py >}} @@ -759,8 +759,8 @@ The `write` transform writes a `PCollection` of custom typed objects to a BigQue table. Use `.withFormatFunction(SerializableFunction)` to provide a formatting function that converts each input element in the `PCollection` into a `TableRow`. This example uses `write` to write a `PCollection`. The -write operation creates a table if needed; if the table already exists, it will -be replaced. +write operation creates a table if needed. If the table already exists, it is +replaced. {{< /paragraph >}} {{< highlight java >}} @@ -786,7 +786,7 @@ BigQuery Storage Write API for Python SDK currently has some limitations on supp {{< /paragraph >}} {{< paragraph class="language-py" >}} -**Note:** If you want to run WriteToBigQuery with Storage Write API from the source code, you need to run `./gradlew :sdks:java:io:google-cloud-platform:expansion-service:build` to build the expansion-service jar. If you are running from a released Beam SDK, the jar will already be included. +**Note:** If you want to run WriteToBigQuery with Storage Write API from the source code, you need to run `./gradlew :sdks:java:io:google-cloud-platform:expansion-service:build` to build the expansion-service jar. If you are running from a released Beam SDK, the jar is already included. **Note:** Auto sharding is not currently supported for Python's Storage Write API exactly-once mode on DataflowRunner. @@ -877,34 +877,33 @@ Similar to streaming inserts, `STORAGE_WRITE_API` supports dynamically determini the number of parallel streams to write to BigQuery (starting 2.42.0). 
You can explicitly enable this using [`withAutoSharding`](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.Write.html#withAutoSharding--). -***Note:*** `STORAGE_WRITE_API` will default to dynamic sharding when +`STORAGE_WRITE_API` defaults to dynamic sharding when `numStorageWriteApiStreams` is set to 0 or is unspecified. -***Note:*** Auto sharding with `STORAGE_WRITE_API` is supported on Dataflow's legacy runner, but **not** on Runner V2 +***Note:*** Auto sharding with `STORAGE_WRITE_API` is supported on Dataflow's legacy runner, but **not** on Dataflow Runner V2. {{< /paragraph >}} -When using `STORAGE_WRITE_API`, the PCollection returned by -[`WriteResult.getFailedInserts`](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/io/gcp/bigquery/WriteResult.html#getFailedInserts--) -will not contain the failed rows. If there are data validation errors, the -transform will throw a `RuntimeException`. +When using `STORAGE_WRITE_API`, the `PCollection` returned by +[`WriteResult.getFailedStorageApiInserts`](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/io/gcp/bigquery/WriteResult.html#getFailedStorageApiInserts--) +contains the rows that failed to be written to the Storage Write API sink. #### At-least-once semantics If your use case allows for potential duplicate records in the target table, you can use the [`STORAGE_API_AT_LEAST_ONCE`](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.Write.Method.html#STORAGE_API_AT_LEAST_ONCE) -method. Because this method doesn’t persist the records to be written to -BigQuery into its shuffle storage (needed to provide the exactly-once semantics -of the `STORAGE_WRITE_API` method), it is cheaper and results in lower latency -for most pipelines. If you use `STORAGE_API_AT_LEAST_ONCE`, you don’t need to +method. This method doesn’t persist the records to be written to +BigQuery into its shuffle storage, which is needed to provide the exactly-once semantics +of the `STORAGE_WRITE_API` method. Therefore, for most pipelines, this method is +less expensive and results in lower latency. +If you use `STORAGE_API_AT_LEAST_ONCE`, you don’t need to specify the number of streams, and you can’t specify the triggering frequency. Auto sharding is not applicable for `STORAGE_API_AT_LEAST_ONCE`. -When using `STORAGE_API_AT_LEAST_ONCE`, the PCollection returned by -[`WriteResult.getFailedInserts`](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/io/gcp/bigquery/WriteResult.html#getFailedInserts--) -will not contain the failed rows. If there are data validation errors, the -transform will throw a `RuntimeException`. +When using `STORAGE_API_AT_LEAST_ONCE`, the `PCollection` returned by +[`WriteResult.getFailedStorageApiInserts`](https://beam.apache.org/releases/javadoc/current/org/apache/beam/sdk/io/gcp/bigquery/WriteResult.html#getFailedStorageApiInserts--) +contains the rows that failed to be written to the Storage Write API sink. #### Quotas diff --git a/website/www/site/content/en/documentation/io/connectors.md b/website/www/site/content/en/documentation/io/connectors.md index 59b8898aa2265..ab8ccf935cb71 100644 --- a/website/www/site/content/en/documentation/io/connectors.md +++ b/website/www/site/content/en/documentation/io/connectors.md @@ -50,7 +50,10 @@ This table provides a consolidated, at-a-glance overview of the available built- [table markup lost in extraction: this and the following connectors.md hunks change several connector entries from "Not available" to "✔ native", "✔ native (sink)", or "✔ via X-language"]
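Returning to the Storage Write API discussion above, a Python counterpart might look like the following sketch. `table_spec` and `table_schema` are placeholders, and the `use_at_least_once` flag is an assumption about the Python analogue of `STORAGE_API_AT_LEAST_ONCE`:

```python
import apache_beam as beam

quotes | beam.io.WriteToBigQuery(
    table_spec,           # placeholder, e.g. 'my-project:my_dataset.my_table'
    schema=table_schema,  # placeholder schema
    method=beam.io.WriteToBigQuery.Method.STORAGE_WRITE_API,
    # Assumption: True opts into at-least-once semantics, trading exactly-once
    # guarantees for lower cost and latency.
    use_at_least_once=False)
```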
diff --git a/website/www/site/content/en/documentation/ml/multi-language-inference.md b/website/www/site/content/en/documentation/ml/multi-language-inference.md index 0d7a972e07657..1480b37ab4841 100644 --- a/website/www/site/content/en/documentation/ml/multi-language-inference.md +++ b/website/www/site/content/en/documentation/ml/multi-language-inference.md @@ -99,7 +99,7 @@ Finally, we postprocess the model predictions in the `Postprocess` DoFn. The `Po The custom Python code needs to be written in a local package and be compiled as a tarball. This package can then be used by the Java pipeline. The following example shows how to compile the Python package into a tarball: ```bash - python setup.py sdist + pip install --upgrade build && python -m build --sdist ``` In order to run this, a `setup.py` is required. The path to the tarball will be used as an argument in the pipeline options of the Java pipeline. diff --git a/website/www/site/content/en/documentation/ml/multi-model-pipelines.md b/website/www/site/content/en/documentation/ml/multi-model-pipelines.md index 569a51b8db55f..c42c8b8ae6611 100644 --- a/website/www/site/content/en/documentation/ml/multi-model-pipelines.md +++ b/website/www/site/content/en/documentation/ml/multi-model-pipelines.md @@ -95,3 +95,61 @@ captions. The solution consists of two open-source models: 2. **A caption ranking model ([CLIP](https://github.com/openai/CLIP))** that uses the image and candidate captions to rank the captions in the order in which they best describe the image. +## Use multiple differently-trained models + +You can use a `KeyedModelHandler` to load several different models into the `RunInference` transform. +Use the associated key to determine which model to use with which data. +The following example loads a model by using `config1`. That model is used for inference for all examples associated +with `key1`. It loads a second model by using `config2`. That model is used for all examples associated with `key2` and `key3`. + +``` +from apache_beam.ml.inference.base import KeyedModelHandler, KeyModelMapping +keyed_model_handler = KeyedModelHandler([ + KeyModelMapping(['key1'], PytorchModelHandlerTensor(<config1>)), + KeyModelMapping(['key2', 'key3'], PytorchModelHandlerTensor(<config2>)) +]) +with pipeline as p: + data = p | beam.Create([ + ('key1', torch.tensor([[1,2,3],[4,5,6],...])), + ('key2', torch.tensor([[1,2,3],[4,5,6],...])), + ('key3', torch.tensor([[1,2,3],[4,5,6],...])), + ]) + predictions = data | RunInference(keyed_model_handler) +``` + +For a more detailed example, see the notebook +[Run ML inference with multiple differently-trained models](https://colab.sandbox.google.com/github/apache/beam/blob/master/examples/notebooks/beam-ml/per_key_models.ipynb). + +Loading multiple models at the same time increases the risk of out of memory errors (OOMs). By default, `KeyedModelHandler` doesn't +limit the number of models loaded into memory at the same time. If the models don't all fit into memory, +your pipeline might fail with an out of memory error.
To avoid this issue, use the `max_models_per_worker_hint` parameter +to set the maximum number of models that can be loaded into memory at the same time. + +The following example loads at most two models per SDK worker process at a time. It unloads models that aren't +currently in use. + +``` +mhs = [ + KeyModelMapping(['key1'], PytorchModelHandlerTensor(<config1>)), + KeyModelMapping(['key2', 'key3'], PytorchModelHandlerTensor(<config2>)), + KeyModelMapping(['key4'], PytorchModelHandlerTensor(<config3>)), + KeyModelMapping(['key5', 'key6', 'key7'], PytorchModelHandlerTensor(<config4>)), +] +keyed_model_handler = KeyedModelHandler(mhs, max_models_per_worker_hint=2) +``` + +Runners that have multiple SDK worker processes on a given machine load at most +`max_models_per_worker_hint * <num worker processes>` models onto the machine. + +Leave enough space for the models and any additional memory needs from other transforms. +Because the memory might not be released immediately after a model is offloaded, +leaving an additional buffer is recommended. + +**Note**: Having many models but a small `max_models_per_worker_hint` can cause _memory thrashing_, where +a large amount of execution time is used to swap models in and out of memory. To reduce the likelihood and impact +of memory thrashing, if you're using a distributed runner, insert a +[`GroupByKey`](https://beam.apache.org/documentation/transforms/python/aggregation/groupbykey/) transform before your +inference step. The `GroupByKey` transform reduces thrashing by ensuring that elements with the same key and model are +collocated on the same worker. + +For more information, see [`KeyedModelHandler`](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.base.html#apache_beam.ml.inference.base.KeyedModelHandler). diff --git a/website/www/site/content/en/documentation/ml/preprocess-data.md b/website/www/site/content/en/documentation/ml/preprocess-data.md index cb79afff60368..2b291b9c75a58 100644 --- a/website/www/site/content/en/documentation/ml/preprocess-data.md +++ b/website/www/site/content/en/documentation/ml/preprocess-data.md @@ -105,7 +105,7 @@ artifacts. When you use the `write_artifact_location` parameter, the `MLTransform` class runs the specified transformations on the dataset and then creates artifacts from these transformations. The artifacts are stored in the location that you specify in -the `write_artifact_location` parameter or in the `MLTransform` output. +the `write_artifact_location` parameter. Write mode is useful when you want to store the results of your transformations for future use. For example, if you apply the same transformations on a @@ -120,8 +120,7 @@ The following examples demonstrate how write mode works. The `ComputeAndApplyVocabulary` transform outputs the indices of the vocabulary to the vocabulary file. - The `ScaleToZScore` transform calculates the mean and variance over the entire dataset - and then normalizes the entire dataset using the mean and variance. The - mean and variance are outputted by the `MLTransform` operation. + and then normalizes the entire dataset using the mean and variance. When you use the `write_artifact_location` parameter, these values are stored as a `tensorflow` graph in the location specified by the `write_artifact_location` parameter value.
You can reuse the values in read mode diff --git a/website/www/site/content/en/documentation/programming-guide.md b/website/www/site/content/en/documentation/programming-guide.md index 98dd045f4281f..564b01a7146e4 100644 --- a/website/www/site/content/en/documentation/programming-guide.md +++ b/website/www/site/content/en/documentation/programming-guide.md @@ -1212,10 +1212,13 @@ Here is a sequence diagram that shows the lifecycle of the DoFn during the execution of the ParDo transform. The comments give useful information to pipeline developers such as the constraints that apply to the objects or particular cases such as failover or - instance reuse. They also give instantiation use cases. Two key points - to note are that (1) teardown is done on a best effort basis and thus - isn't guaranteed and (2) the number of DoFn instances is runner - dependent. + instance reuse. They also give instantiation use cases. Three key points + to note: + 1. Teardown is done on a best-effort basis and thus + isn't guaranteed. + 2. The number of DoFn instances created at runtime is runner-dependent. + 3. For the Python SDK, the pipeline contents, such as the DoFn user code, + are [serialized into bytecode](https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/#pickling-and-managing-the-main-session). Therefore, `DoFn`s should not reference objects that are not serializable, such as locks. To manage a single instance of an object across multiple `DoFn` instances in the same process, use utilities in the [shared.py](https://beam.apache.org/releases/pydoc/current/apache_beam.utils.shared.html) module. ![This is a sequence diagram that shows the lifecycle of the DoFn](/images/dofn-sequence-diagram.svg) diff --git a/website/www/site/content/en/documentation/runtime/environments.md b/website/www/site/content/en/documentation/runtime/environments.md index 624b5aa6b8810..452fb6141e63a 100644 --- a/website/www/site/content/en/documentation/runtime/environments.md +++ b/website/www/site/content/en/documentation/runtime/environments.md @@ -115,14 +115,13 @@ This method requires building image artifacts from Beam source. For additional i ./gradlew :sdks:java:container:java11:docker ./gradlew :sdks:java:container:java17:docker ./gradlew :sdks:go:container:docker - ./gradlew :sdks:python:container:py36:docker ./gradlew :sdks:python:container:py38:docker ./gradlew :sdks:python:container:py39:docker ./gradlew :sdks:python:container:py310:docker ./gradlew :sdks:python:container:py311:docker # Shortcut for building all Python SDKs - ./gradlew :sdks:python:container buildAll + ./gradlew :sdks:python:container:buildAll ``` 4. Verify the images you built were created by running `docker images`. diff --git a/website/www/site/content/en/documentation/sdks/python-machine-learning.md b/website/www/site/content/en/documentation/sdks/python-machine-learning.md index 5e0cf483ff3ea..a700806f14c6a 100644 --- a/website/www/site/content/en/documentation/sdks/python-machine-learning.md +++ b/website/www/site/content/en/documentation/sdks/python-machine-learning.md @@ -197,9 +197,9 @@ For more information on resource hints, see [Resource hints](/documentation/runt This section suggests patterns and best practices that you can use to make your inference pipelines simpler, more robust, and more efficient.
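Picking up the `shared.py` pointer in the programming-guide hunk above, a minimal sketch (the wrapped resource is hypothetical) could look like this:

```python
import apache_beam as beam
from apache_beam.utils import shared


class _Resource(object):
    # Plain dicts and lists can't be weakly referenced, which shared.py
    # requires, so wrap the expensive, non-picklable object in a class.
    def __init__(self):
        self.value = 'expensive-to-build object'  # hypothetical


class UseSharedResource(beam.DoFn):
    def __init__(self, shared_handle):
        self._shared_handle = shared_handle

    def setup(self):
        # Every DoFn instance in this worker process acquires the same object.
        self._resource = self._shared_handle.acquire(_Resource)

    def process(self, element):
        yield (element, self._resource.value)


# Usage sketch: handle = shared.Shared(); pcoll | beam.ParDo(UseSharedResource(handle))
```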
-### Use a keyed ModelHandler +### Use a keyed ModelHandler object -If a key is attached to the examples, wrap the `KeyedModelHandler` around the `ModelHandler` object: +If a key is attached to the examples, wrap `KeyedModelHandler` around the `ModelHandler` object: ``` from apache_beam.ml.inference.base import KeyedModelHandler @@ -213,7 +213,61 @@ with pipeline as p: predictions = data | RunInference(keyed_model_handler) -If you are unsure if your data is keyed, you can also use `MaybeKeyedModelHandler`. +If you are unsure if your data is keyed, you can use `MaybeKeyedModelHandler`. + +You can also use a `KeyedModelHandler` to load several different models based on their associated key. +The following example loads a model by using `config1`. That model is used for inference for all examples associated +with `key1`. It loads a second model by using `config2`. That model is used for all examples associated with `key2` and `key3`. + +``` +from apache_beam.ml.inference.base import KeyedModelHandler, KeyModelMapping +keyed_model_handler = KeyedModelHandler([ + KeyModelMapping(['key1'], PytorchModelHandlerTensor(<config1>)), + KeyModelMapping(['key2', 'key3'], PytorchModelHandlerTensor(<config2>)) +]) +with pipeline as p: + data = p | beam.Create([ + ('key1', torch.tensor([[1,2,3],[4,5,6],...])), + ('key2', torch.tensor([[1,2,3],[4,5,6],...])), + ('key3', torch.tensor([[1,2,3],[4,5,6],...])), + ]) + predictions = data | RunInference(keyed_model_handler) +``` + +For a more detailed example, see the notebook +[Run ML inference with multiple differently-trained models](https://colab.sandbox.google.com/github/apache/beam/blob/master/examples/notebooks/beam-ml/per_key_models.ipynb). + +Loading multiple models at the same time increases the risk of out of memory errors (OOMs). By default, `KeyedModelHandler` doesn't +limit the number of models loaded into memory at the same time. If the models don't all fit into memory, +your pipeline might fail with an out of memory error. To avoid this issue, use the `max_models_per_worker_hint` parameter +to set the maximum number of models that can be loaded into memory at the same time. + +The following example loads at most two models per SDK worker process at a time. It unloads models that aren't +currently in use. + +``` +mhs = [ + KeyModelMapping(['key1'], PytorchModelHandlerTensor(<config1>)), + KeyModelMapping(['key2', 'key3'], PytorchModelHandlerTensor(<config2>)), + KeyModelMapping(['key4'], PytorchModelHandlerTensor(<config3>)), + KeyModelMapping(['key5', 'key6', 'key7'], PytorchModelHandlerTensor(<config4>)), +] +keyed_model_handler = KeyedModelHandler(mhs, max_models_per_worker_hint=2) +``` + +Runners that have multiple SDK worker processes on a given machine load at most +`max_models_per_worker_hint * <num worker processes>` models onto the machine. + +Leave enough space for the models and any additional memory needs from other transforms. +Because the memory might not be released immediately after a model is offloaded, +leaving an additional buffer is recommended. + +**Note**: Having many models but a small `max_models_per_worker_hint` can cause _memory thrashing_, where +a large amount of execution time is used to swap models in and out of memory. To reduce the likelihood and impact +of memory thrashing, if you're using a distributed runner, insert a +[`GroupByKey`](https://beam.apache.org/documentation/transforms/python/aggregation/groupbykey/) transform before your +inference step. The `GroupByKey` transform reduces thrashing by ensuring that elements with the same key and model are +collocated on the same worker; a minimal sketch of this step follows below.
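A minimal sketch of that collocation step (names are illustrative; `keyed_examples` is a `PCollection` of key/example pairs):

```python
import apache_beam as beam

collocated = (
    keyed_examples                       # PCollection of (key, example) pairs
    | 'Collocate' >> beam.GroupByKey()   # elements with the same key land together
    | 'Ungroup' >> beam.FlatMap(
        lambda kv: ((kv[0], example) for example in kv[1])))
# predictions = collocated | RunInference(keyed_model_handler)
```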
For more information, see [`KeyedModelHandler`](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.base.html#apache_beam.ml.inference.base.KeyedModelHandler). diff --git a/website/www/site/content/en/documentation/sdks/python-pipeline-dependencies.md b/website/www/site/content/en/documentation/sdks/python-pipeline-dependencies.md index 378032ab6b588..c99c0b9c7cf8f 100644 --- a/website/www/site/content/en/documentation/sdks/python-pipeline-dependencies.md +++ b/website/www/site/content/en/documentation/sdks/python-pipeline-dependencies.md @@ -66,16 +66,17 @@ If your pipeline uses packages that are not available publicly (e.g. packages th This command lists all packages that are installed on your machine, regardless of where they were installed from. -2. Run your pipeline with the following command-line option: + 1. Run your pipeline with the following command-line option: - --extra_package /path/to/package/package-name + --extra_package /path/to/package/package-name where package-name is the package's tarball. You can build the package tarball using a command-line tool called [build](https://setuptools.pypa.io/en/latest/userguide/quickstart.html#install-build). - python setup.py sdist + # Install build using pip + pip install --upgrade build + python -m build --sdist - See the [sdist documentation](https://docs.python.org/3/distutils/sourcedist.html) for more details on this command. + See the [build documentation](https://pypa-build.readthedocs.io/en/latest/index.html) for more details on this command. ## Multiple File Dependencies diff --git a/website/www/site/content/en/documentation/sdks/python-unrecoverable-errors.md b/website/www/site/content/en/documentation/sdks/python-unrecoverable-errors.md index 4e5d94ce8a8db..4fbb739e7ec7b 100644 --- a/website/www/site/content/en/documentation/sdks/python-unrecoverable-errors.md +++ b/website/www/site/content/en/documentation/sdks/python-unrecoverable-errors.md @@ -16,46 +16,58 @@ See the License for the specific language governing permissions and limitations under the License. --> -# Unrecoverable Errors in Beam Python +# Unrecoverable errors in Beam Python -## What is an Unrecoverable Error? +Unrecoverable errors are issues that occur at job start-up time and +prevent jobs from ever running successfully. The problem usually stems +from a misconfiguration. This page provides context about +common errors and troubleshooting information. -An unrecoverable error is an issue at job start-up time that will -prevent a job from ever running successfully, usually due to some kind -of misconfiguration. Solving these issues when they occur is key to -successfully running a Beam Python pipeline. +## Job submission or Python runtime version mismatch {#python-version-mismatch} -## Common Unrecoverable Errors +If the Python version that you use to submit your job doesn't match the +Python version used to build the worker container, the job doesn't run. +The job fails immediately after job submission. -### Job Submission/Runtime Python Version Mismatch +To resolve this issue, ensure that the Python version used to submit the job +matches the Python container version. -If the Python version used for job submission does not match the -Python version used to build the worker container, the job will not -execute.
Ensure that the Python version being used for job submission -and the container Python version match. +## Dependency resolution failures with pip {#dependency-resolution-failures} -### PIP Dependency Resolution Failures +During worker start-up, the worker might fail and, depending on the +runner, try to restart. -During worker start-up, dependencies are checked and installed in -the worker container before accepting work. If a pipeline requires -additional dependencies not already present in the runtime environment, -they are installed here. If there’s an issue during this process -(e.g. a dependency version cannot be found, or a worker cannot -connect to PyPI) the worker will fail and may try to restart -depending on the runner. Ensure that dependency versions provided in -your requirements.txt file exist and can be installed locally before -submitting jobs. +Before workers accept work, dependencies are checked and installed in +the worker container. If a pipeline requires +dependencies not already present in the runtime environment, +they are installed at this time. +When a problem occurs during this process, you might encounter +dependency resolution failures. -### Dependency Version Mismatches +Examples of problems include the following: -When additional dependencies like `torch`, `transformers`, etc. are not -specified via a requirements_file or preinstalled in a custom container -then the worker might fail to deserialize (unpickle) the user code. -This can result in `ModuleNotFound` errors. If dependencies are installed -but their versions don't match the versions in submission environment, -pipeline might have `AttributeError` messages. +- A dependency version can't be found. +- A worker can't connect to PyPI. -Ensure that the required dependencies at runtime and in the submission -environment are the same along with their versions. For better visibility, -debug logs are added specifying the dependencies at both stages starting in -Beam 2.52.0. For more information, see: https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/#control-dependencies \ No newline at end of file +To resolve this issue, before submitting your job, ensure that the +dependency versions provided in your `requirements.txt` file exist +and that you can install them locally. + +## Dependency version mismatches {#dependency-version} + +When your pipeline has dependency version mismatches, you might +see `ModuleNotFound` errors or `AttributeError` messages. + +- The `ModuleNotFound` errors occur when additional dependencies, + such as `torch` and `transformers`, are neither specified in a + `requirements_file` nor preinstalled in a custom container. + In this case, the worker might fail to deserialize (unpickle) the user code. + +- Your pipeline might have `AttributeError` messages when dependencies + are installed but their versions don't match the versions in the submission environment. + +To resolve these problems, ensure that the required dependencies and their versions are the same +at runtime and in the submission environment. To help you identify these issues, +in Apache Beam 2.52.0 and later versions, debug logs specify the dependencies at both stages. +For more information, see +[Control the dependencies the pipeline uses](https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/#control-dependencies).
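One way to keep the submission and runtime environments aligned, per the guidance above, is to pin versions in a `requirements.txt` file and pass it through the pipeline's setup options (a sketch):

```python
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions

options = PipelineOptions()
# Workers install these pinned versions during start-up. Verify that the file
# installs cleanly in a fresh local virtualenv before submitting the job.
options.view_as(SetupOptions).requirements_file = 'requirements.txt'
```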
\ No newline at end of file diff --git a/website/www/site/content/en/get-started/_index.md b/website/www/site/content/en/get-started/_index.md index c436129b066af..8aa6ff626c423 100644 --- a/website/www/site/content/en/get-started/_index.md +++ b/website/www/site/content/en/get-started/_index.md @@ -21,17 +21,18 @@ limitations under the License. # Get Started with Apache Beam -Learn to use Beam to create data processing pipelines that run on supported processing back-ends: +Learn how to use Beam to create data processing pipelines that run on supported processing back-ends. -## [Tour of Beam](https://tour.beam.apache.org) +## Tour of Beam -Learn Beam with an interactive tour with learning topics covering core Beam concepts -from simple ones to more advanced ones. +[Learn Beam with an interactive tour](https://tour.beam.apache.org). +Topics include core Beam concepts, from simple to advanced. You can try examples, do exercises, and solve challenges along the learning journey. -## [Beam Overview](/get-started/beam-overview) +## Beam Overview -Learn about the Beam model, the currently available Beam SDKs and Runners, and Beam's native I/O connectors. +Read the [Apache Beam Overview](/get-started/beam-overview) to learn about the Beam model, +the currently available Beam SDKs and runners, and Beam's native I/O connectors. ## Quickstarts for Java, Python, Go, and TypeScript @@ -49,10 +50,15 @@ See detailed walkthroughs of complete Beam pipelines. - [WordCount](/get-started/wordcount-example): Simple example pipelines that demonstrate basic Beam programming, including debugging and testing - [Mobile Gaming](/get-started/mobile-gaming-example): A series of more advanced pipelines that demonstrate use cases in the mobile gaming domain -## [Downloads and Releases](/get-started/downloads) +## Downloads and Releases -Find download links and information on the latest Beam releases, including versioning and release notes. +Find download links and information about the latest Beam releases, including versioning and release notes, +on the [Apache Beam Downloads](/get-started/downloads) page. -## [Support](/get-started/support) +## Support -Find resources, such as mailing lists and issue tracking, to help you use Beam. Ask questions and discuss topics via [Stack Overflow](https://stackoverflow.com/questions/tagged/apache-beam) or on Beam's [Slack Channel](https://apachebeam.slack.com). +- Find resources to help you use Beam, such as mailing lists and issue tracking, + on the [Support](/get-started/support) page. + - Ask questions and discuss topics on + [Stack Overflow](https://stackoverflow.com/questions/tagged/apache-beam) + or in the Beam [Slack Channel](https://apachebeam.slack.com). diff --git a/website/www/site/content/en/get-started/downloads.md b/website/www/site/content/en/get-started/downloads.md index 9a753dafe32ee..cc71f3101eb12 100644 --- a/website/www/site/content/en/get-started/downloads.md +++ b/website/www/site/content/en/get-started/downloads.md @@ -19,7 +19,7 @@ See the License for the specific language governing permissions and limitations under the License. --> -# Apache Beam™ Downloads +# Apache Beam® Downloads > Beam SDK {{< param release_latest >}} is the latest released version. @@ -96,10 +96,18 @@ versions denoted `0.x.y`. ## Releases +### 2.51.0 (2023-10-11) +Official [source code download](https://downloads.apache.org/beam/2.51.0/apache-beam-2.51.0-source-release.zip). +[SHA-512](https://downloads.apache.org/beam/2.51.0/apache-beam-2.51.0-source-release.zip.sha512). 
+[signature](https://downloads.apache.org/beam/2.51.0/apache-beam-2.51.0-source-release.zip.asc). + +[Release notes](https://github.com/apache/beam/releases/tag/v2.51.0) +[Blog post](/blog/beam-2.51.0). + ### 2.50.0 (2023-08-30) -Official [source code download](https://downloads.apache.org/beam/2.50.0/apache-beam-2.50.0-source-release.zip). -[SHA-512](https://downloads.apache.org/beam/2.50.0/apache-beam-2.50.0-source-release.zip.sha512). -[signature](https://downloads.apache.org/beam/2.50.0/apache-beam-2.50.0-source-release.zip.asc). +Official [source code download](https://archive.apache.org/dist/beam/2.50.0/apache-beam-2.50.0-source-release.zip). +[SHA-512](https://archive.apache.org/dist/beam/2.50.0/apache-beam-2.50.0-source-release.zip.sha512). +[signature](https://archive.apache.org/dist/beam/2.50.0/apache-beam-2.50.0-source-release.zip.asc). [Release notes](https://github.com/apache/beam/releases/tag/v2.50.0) [Blog post](/blog/beam-2.50.0). diff --git a/website/www/site/content/en/get-started/quickstart-go.md b/website/www/site/content/en/get-started/quickstart-go.md index 2f0bad49659c6..dd8f3ba586c6b 100644 --- a/website/www/site/content/en/get-started/quickstart-go.md +++ b/website/www/site/content/en/get-started/quickstart-go.md @@ -28,7 +28,7 @@ If you're interested in contributing to the Apache Beam Go codebase, see the [Co The Beam SDK for Go requires `go` version 1.20 or newer. It can be downloaded [here](https://golang.org/). Check what go version you have by running: {{< highlight >}} -$ go version +go version {{< /highlight >}} If you are unfamiliar with Go, see the [Get Started With Go Tutorial](https://go.dev/doc/tutorial/getting-started). @@ -43,12 +43,12 @@ required arguments described in the examples. For example, to run `wordcount`, run: {{< runner direct >}} -$ go run github.com/apache/beam/sdks/v2/go/examples/wordcount@latest --input "gs://apache-beam-samples/shakespeare/kinglear.txt" --output counts -$ less counts +go run github.com/apache/beam/sdks/v2/go/examples/wordcount@latest --input "gs://apache-beam-samples/shakespeare/kinglear.txt" --output counts +less counts {{< /runner >}} {{< runner dataflow >}} -$ go run github.com/apache/beam/sdks/v2/go/examples/wordcount@latest --input gs://dataflow-samples/shakespeare/kinglear.txt \ +go run github.com/apache/beam/sdks/v2/go/examples/wordcount@latest --input gs://dataflow-samples/shakespeare/kinglear.txt \ --output gs://<your-gcs-bucket>/counts \ --runner dataflow \ --project your-gcp-project \ @@ -60,10 +60,10 @@ $ go run github.com/apache/beam/sdks/v2/go/examples/wordcount@latest --input gs: {{< runner spark >}} # Build and run the Spark job server from Beam source. # -PsparkMasterUrl is optional. If it is unset the job will be run inside an embedded Spark cluster. -$ ./gradlew :runners:spark:3:job-server:runShadow -PsparkMasterUrl=spark://localhost:7077 +./gradlew :runners:spark:3:job-server:runShadow -PsparkMasterUrl=spark://localhost:7077 # In a separate terminal, run: -$ go run github.com/apache/beam/sdks/v2/go/examples/wordcount@latest --input \ +go run github.com/apache/beam/sdks/v2/go/examples/wordcount@latest --input \ --output counts \ --runner spark \ --endpoint localhost:8099
# Java SDK Roadmap -## Next Java LTS version support (Java 17) +## Next Java LTS version support (Java 21) Work to support the next LTS release of Java is in progress. For more details -about the scope and info on the various tasks please see the JIRA ticket. +about the scope and info on the various tasks, please see the GitHub issue. -- JIRA: [BEAM-12240](https://issues.apache.org/jira/browse/BEAM-12240) +- GitHub: [#28120](https://github.com/apache/beam/issues/28120) diff --git a/website/www/site/data/authors.yml b/website/www/site/data/authors.yml index 2776132cf586b..13c31c4f4782d 100644 --- a/website/www/site/data/authors.yml +++ b/website/www/site/data/authors.yml @@ -275,3 +275,6 @@ pabs: namitasharma: name: Namita Sharma email: namitasharma@google.com +talat: + name: Talat Uyarer + email: talat@apache.org diff --git a/website/www/site/data/en/quotes.yaml b/website/www/site/data/en/quotes.yaml index 5cd09eca90767..3a5225b3f29a4 100644 --- a/website/www/site/data/en/quotes.yaml +++ b/website/www/site/data/en/quotes.yaml @@ -11,6 +11,11 @@ # limitations under the License. #Cards with quotes will be displayed by the order listed, e.g., first card will display the first quote +- text: Apache Beam fuels LinkedIn's streaming infrastructure, processing 4 trillion events daily through 3K+ pipelines in near-real time. Beam enabled unified pipelines, yielding 2x cost savings and remarkable improvements for many use cases. + icon: icons/quote-icon.svg + logoUrl: images/logos/powered-by/linkedin.png + linkUrl: case-studies/linkedin/index.html + linkText: Learn more - text: With Apache Beam, OCTO accelerated the migration of one of France’s largest grocery retailers to streaming processing for transactional data, achieving 5x reduced infrastructure costs and 4x improved performance.
icon: icons/quote-icon.svg logoUrl: images/logos/powered-by/octo.png diff --git a/website/www/site/layouts/shortcodes/flink_java_pipeline_options.html b/website/www/site/layouts/shortcodes/flink_java_pipeline_options.html index ba8b597aaeeb0..87d69ee60fe32 100644 --- a/website/www/site/layouts/shortcodes/flink_java_pipeline_options.html +++ b/website/www/site/layouts/shortcodes/flink_java_pipeline_options.html @@ -52,6 +52,11 @@ +<tr> + <td><code>enableStableInputDrain</code></td> + <td>Allow drain operation for flink pipelines that contain RequiresStableInput operator. Note that at the time of draining, the RequiresStableInput contract might be violated if there are any processing-related failures in the DoFn operator.</td> + <td>Default: <code>false</code></td> +</tr> diff --git a/website/www/site/layouts/shortcodes/flink_python_pipeline_options.html b/website/www/site/layouts/shortcodes/flink_python_pipeline_options.html index 5293f35e6a1e3..27ae27ad05a3e 100644 --- a/website/www/site/layouts/shortcodes/flink_python_pipeline_options.html +++ b/website/www/site/layouts/shortcodes/flink_python_pipeline_options.html @@ -52,6 +52,11 @@ +<tr> + <td><code>enable_stable_input_drain</code></td> + <td>Allow drain operation for flink pipelines that contain RequiresStableInput operator. Note that at the time of draining, the RequiresStableInput contract might be violated if there are any processing-related failures in the DoFn operator.</td> + <td>Default: <code>false</code></td> +</tr> diff --git a/website/www/site/static/.htaccess b/website/www/site/static/.htaccess index a2ef056a262cb..216b415cab10d 100644 --- a/website/www/site/static/.htaccess +++ b/website/www/site/static/.htaccess @@ -22,3 +22,7 @@ RewriteRule ^(.*)$ https://beam.apache.org/$1 [L,R=301] RedirectMatch permanent "/documentation/sdks/(javadoc|pydoc)(.*)" "https://beam.apache.org/releases/$1$2" RedirectMatch "/contribute/design-documents" "https://cwiki.apache.org/confluence/display/BEAM/Design+Documents" + +RedirectMatch "/contribute/release-guide" "https://github.com/apache/beam/blob/master/contributor-docs/release-guide.md" + +RedirectMatch "/contribute/committer-guide" "https://github.com/apache/beam/blob/master/contributor-docs/committer-guide.md" \ No newline at end of file diff --git a/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/autoscaling-metrics.png b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/autoscaling-metrics.png new file mode 100644 index 0000000000000..bf723d5d5a7af Binary files /dev/null and b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/autoscaling-metrics.png differ diff --git a/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/backlog-graph.png b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/backlog-graph.png new file mode 100644 index 0000000000000..ab0f0c9b7f572 Binary files /dev/null and b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/backlog-graph.png differ diff --git a/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/fko-library.png b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/fko-library.png new file mode 100644 index 0000000000000..2dac61702c6ae Binary files /dev/null and b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/fko-library.png differ diff --git a/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/flink-checkpoint-ui.png b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/flink-checkpoint-ui.png new file mode 100644 index 0000000000000..c9e95990da6e6 Binary files /dev/null and b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/flink-checkpoint-ui.png differ diff --git a/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/flink-deployment-yaml.png b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/flink-deployment-yaml.png new file mode 100644 index 0000000000000..0932b96a96da1 Binary files /dev/null and b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/flink-deployment-yaml.png differ diff --git
a/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/gcs-write-graph.png b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/gcs-write-graph.png new file mode 100644 index 0000000000000..02266e118a7a8 Binary files /dev/null and b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/gcs-write-graph.png differ diff --git a/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/image1.png b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/image1.png new file mode 100644 index 0000000000000..c6d35e7cb63bd Binary files /dev/null and b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/image1.png differ diff --git a/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/job-metrics.png b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/job-metrics.png new file mode 100644 index 0000000000000..1e11248a2f0c9 Binary files /dev/null and b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/job-metrics.png differ diff --git a/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/job-start-activity-diagram.png b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/job-start-activity-diagram.png new file mode 100644 index 0000000000000..3e7ede2c59f9b Binary files /dev/null and b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/job-start-activity-diagram.png differ diff --git a/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/latency-graph.png b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/latency-graph.png new file mode 100644 index 0000000000000..3257f9b66df16 Binary files /dev/null and b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/latency-graph.png differ diff --git a/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/stream-service-changes.png b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/stream-service-changes.png new file mode 100644 index 0000000000000..5431792d56be6 Binary files /dev/null and b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/stream-service-changes.png differ diff --git a/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/watermark-metrics.png b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/watermark-metrics.png new file mode 100644 index 0000000000000..968b0abe0c219 Binary files /dev/null and b/website/www/site/static/images/blog/apache-beam-flink-and-kubernetes/watermark-metrics.png differ diff --git a/website/www/site/static/images/case-study/linkedin/bingfeng-xia.jpg b/website/www/site/static/images/case-study/linkedin/bingfeng-xia.jpg new file mode 100644 index 0000000000000..ca07935b689b6 Binary files /dev/null and b/website/www/site/static/images/case-study/linkedin/bingfeng-xia.jpg differ diff --git a/website/www/site/static/images/case-study/linkedin/scheme-1.png b/website/www/site/static/images/case-study/linkedin/scheme-1.png new file mode 100644 index 0000000000000..535f2d5f316f8 Binary files /dev/null and b/website/www/site/static/images/case-study/linkedin/scheme-1.png differ diff --git a/website/www/site/static/images/case-study/linkedin/scheme-2.png b/website/www/site/static/images/case-study/linkedin/scheme-2.png new file mode 100644 index 0000000000000..2ab4e42810195 Binary files /dev/null and b/website/www/site/static/images/case-study/linkedin/scheme-2.png differ diff --git 
a/website/www/site/static/images/case-study/linkedin/scheme-3.png b/website/www/site/static/images/case-study/linkedin/scheme-3.png new file mode 100644 index 0000000000000..a7d1dd01b88ed Binary files /dev/null and b/website/www/site/static/images/case-study/linkedin/scheme-3.png differ diff --git a/website/www/site/static/images/case-study/linkedin/scheme-4.png b/website/www/site/static/images/case-study/linkedin/scheme-4.png new file mode 100644 index 0000000000000..3873b3a20b1f1 Binary files /dev/null and b/website/www/site/static/images/case-study/linkedin/scheme-4.png differ diff --git a/website/www/site/static/images/case-study/linkedin/scheme-5.png b/website/www/site/static/images/case-study/linkedin/scheme-5.png new file mode 100644 index 0000000000000..e28537a18a8b1 Binary files /dev/null and b/website/www/site/static/images/case-study/linkedin/scheme-5.png differ diff --git a/website/www/site/static/images/case-study/linkedin/scheme-6.png b/website/www/site/static/images/case-study/linkedin/scheme-6.png new file mode 100644 index 0000000000000..1dadc4c9126e5 Binary files /dev/null and b/website/www/site/static/images/case-study/linkedin/scheme-6.png differ diff --git a/website/www/site/static/images/case-study/linkedin/xinyu-liu.jpg b/website/www/site/static/images/case-study/linkedin/xinyu-liu.jpg new file mode 100644 index 0000000000000..89813af2b09df Binary files /dev/null and b/website/www/site/static/images/case-study/linkedin/xinyu-liu.jpg differ