.github/workflows/benchmark.yml

#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#

# Title under which this workflow is listed.
name: Run benchmarks

# Manually dispatched only. Each input below is read back elsewhere in this
# file through github.event.inputs.<name>.
on:
  workflow_dispatch:
    inputs:
      # Handed verbatim to org.apache.spark.benchmark.Benchmarks as its last
      # argument; also decides whether the TPC-DS dataset job runs.
      class:
        required: true
        default: "*"
        description: "Benchmark class"
      # Selects the Java version installed by actions/setup-java.
      jdk:
        required: true
        default: "17"
        description: "JDK version: 17 or 21"
      # Becomes the -Pscala-<version> profile of the sbt build.
      scala:
        required: true
        default: "2.13"
        description: "Scala version: 2.13"
      # Exported to the benchmark job as SPARK_BENCHMARK_FAILFAST.
      failfast:
        required: true
        default: "true"
        description: "Failfast: true or false"
      # Number of parallel matrix entries the benchmark job is split into.
      num-splits:
        required: true
        default: "1"
        description: "Number of job splits"

jobs:
  # Publishes the JSON array [1,...,num-splits] as the `matrix` output; the
  # benchmark job expands it into one matrix entry per split.
  matrix-gen:
    name: Generate matrix for job splits
    # The ubuntu-20.04 hosted runner image has been retired by GitHub, so a
    # still-supported image is required for this job to be scheduled at all.
    runs-on: ubuntu-22.04
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    env:
      SPARK_BENCHMARK_NUM_SPLITS: ${{ github.event.inputs.num-splits }}
    steps:
    - name: Generate matrix
      id: set-matrix
      run: |
        # Fail here with a readable message instead of emitting "matrix=[]",
        # which would only surface later as an opaque matrix error.
        if ! [[ "$SPARK_BENCHMARK_NUM_SPLITS" =~ ^[1-9][0-9]*$ ]]; then
          echo "::error::num-splits must be a positive integer, got '$SPARK_BENCHMARK_NUM_SPLITS'"
          exit 1
        fi
        echo "matrix=[$(seq -s, 1 "$SPARK_BENCHMARK_NUM_SPLITS")]" >> "$GITHUB_OUTPUT"

  # Any TPC-DS related updates on this job need to be applied to tpcds-1g job of build_and_test.yml as well
  #
  # Produces ./tpcds-sf-1 and stores it in the Actions cache so the benchmark job
  # can restore it. Runs only when the requested class mentions
  # TPCDSQueryBenchmark or contains a '*' wildcard.
  tpcds-1g-gen:
    name: "Generate an input dataset for TPCDSQueryBenchmark with SF=1"
    if: contains(github.event.inputs.class, 'TPCDSQueryBenchmark') || contains(github.event.inputs.class, '*')
    # The ubuntu-20.04 hosted runner image has been retired by GitHub, so a
    # still-supported image is required for this job to be scheduled at all.
    runs-on: ubuntu-22.04
    env:
      SPARK_LOCAL_IP: localhost
    steps:
      - name: Checkout Spark repository
        uses: actions/checkout@v4
        # In order to get diff files
        with:
          fetch-depth: 0
      - name: Cache Scala, SBT and Maven
        uses: actions/cache@v3
        with:
          path: |
            build/apache-maven-*
            build/scala-*
            build/*.jar
            ~/.sbt
          key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
          restore-keys: |
            build-
      - name: Cache Coursier local repository
        uses: actions/cache@v3
        with:
          path: ~/.cache/coursier
          key: benchmark-coursier-${{ github.event.inputs.jdk }}-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
          restore-keys: |
            benchmark-coursier-${{ github.event.inputs.jdk }}
      # The key hashes this workflow file and TPCDSSchema.scala, so editing
      # either one forces the dataset to be regenerated.
      - name: Cache TPC-DS generated data
        id: cache-tpcds-sf-1
        uses: actions/cache@v3
        with:
          path: ./tpcds-sf-1
          key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}
      # Everything below is skipped on a cache hit: the dataset already exists.
      - name: Checkout tpcds-kit repository
        if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
        uses: actions/checkout@v4
        with:
          repository: databricks/tpcds-kit
          ref: 2a5078a782192ddb6efbcead8de9973d6ab4f069
          path: ./tpcds-kit
      - name: Build tpcds-kit
        if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
        run: cd tpcds-kit/tools && make OS=LINUX
      - name: Install Java ${{ github.event.inputs.jdk }}
        if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
        uses: actions/setup-java@v4
        with:
          distribution: zulu
          java-version: ${{ github.event.inputs.jdk }}
      - name: Generate TPC-DS (SF=1) table data
        if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
        # Folded scalar: the lines below are joined with single spaces into one
        # shell command, so the quoted sbt argument stays a single word.
        run: >-
          build/sbt "sql/Test/runMain org.apache.spark.sql.GenTPCDSData
          --dsdgenDir $(pwd)/tpcds-kit/tools
          --location $(pwd)/tpcds-sf-1
          --scaleFactor 1
          --numPartitions 1
          --overwrite"

  # Builds Spark, runs the requested benchmark class(es) for one matrix split,
  # and uploads a tarball of the changed and newly created files as an artifact.
  benchmark:
    name: "Run benchmarks: ${{ github.event.inputs.class }} (JDK ${{ github.event.inputs.jdk }}, Scala ${{ github.event.inputs.scala }}, ${{ matrix.split }} out of ${{ github.event.inputs.num-splits }} splits)"
    # tpcds-1g-gen is skipped for non-TPC-DS classes, and a skipped dependency
    # would skip this job under the default success() gate. Unlike always(),
    # this condition still honours workflow cancellation and does not start
    # when the split matrix could not be generated.
    if: ${{ !cancelled() && needs.matrix-gen.result == 'success' }}
    needs: [matrix-gen, tpcds-1g-gen]
    # The ubuntu-20.04 hosted runner image has been retired by GitHub, so a
    # still-supported image is required for this job to be scheduled at all.
    runs-on: ubuntu-22.04
    strategy:
      # Let the remaining splits finish even if one of them fails.
      fail-fast: false
      matrix:
        split: ${{fromJSON(needs.matrix-gen.outputs.matrix)}}
    env:
      SPARK_BENCHMARK_FAILFAST: ${{ github.event.inputs.failfast }}
      SPARK_BENCHMARK_NUM_SPLITS: ${{ github.event.inputs.num-splits }}
      SPARK_BENCHMARK_CUR_SPLIT: ${{ matrix.split }}
      SPARK_GENERATE_BENCHMARK_FILES: "1"
      SPARK_LOCAL_IP: localhost
      # To prevent spark.test.home not being set. See more detail in SPARK-36007.
      SPARK_HOME: ${{ github.workspace }}
      SPARK_TPCDS_DATA: ${{ github.workspace }}/tpcds-sf-1
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v4
      # In order to get diff files
      with:
        fetch-depth: 0
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v3
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v3
      with:
        path: ~/.cache/coursier
        key: benchmark-coursier-${{ github.event.inputs.jdk }}-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          benchmark-coursier-${{ github.event.inputs.jdk }}
    - name: Install Java ${{ github.event.inputs.jdk }}
      uses: actions/setup-java@v4
      with:
        distribution: zulu
        java-version: ${{ github.event.inputs.jdk }}
    # Same path and key as the cache written by tpcds-1g-gen, so this restores
    # the dataset that job generated.
    - name: Cache TPC-DS generated data
      if: contains(github.event.inputs.class, 'TPCDSQueryBenchmark') || contains(github.event.inputs.class, '*')
      id: cache-tpcds-sf-1
      uses: actions/cache@v3
      with:
        path: ./tpcds-sf-1
        key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}
    - name: Run benchmarks
      # User-supplied inputs reach the script through environment variables
      # instead of being expanded into the script text, so shell metacharacters
      # in an input are treated as data rather than executed.
      env:
        BENCHMARK_CLASS: ${{ github.event.inputs.class }}
        BENCHMARK_JDK_VERSION: ${{ github.event.inputs.jdk }}
        BENCHMARK_SCALA_VERSION: ${{ github.event.inputs.scala }}
      run: |
        ./build/sbt "-Pscala-${BENCHMARK_SCALA_VERSION}" -Pyarn -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pspark-ganglia-lgpl Test/package
        # Make less noisy
        cp conf/log4j2.properties.template conf/log4j2.properties
        sed -i 's/rootLogger.level = info/rootLogger.level = warn/g' conf/log4j2.properties
        # In benchmark, we use local as master so set driver memory only. Note that GitHub Actions has 7 GB memory limit.
        bin/spark-submit \
          --driver-memory 6g --class org.apache.spark.benchmark.Benchmarks \
          --jars "$(find . -name '*-SNAPSHOT-tests.jar' -o -name '*avro*-SNAPSHOT.jar' | paste -sd ',' -),$(find ~/.cache/coursier -name 'curator-test-*.jar')" \
          "$(find . -name 'spark-core*-SNAPSHOT-tests.jar')" \
          "$BENCHMARK_CLASS"
        # To keep the directory structure and file permissions, tar them
        # See also https://github.com/actions/upload-artifact#maintaining-file-permissions-and-case-sensitive-files
        echo "Preparing the benchmark results:"
        # The two file lists are intentionally left unquoted so each path becomes its own tar argument.
        tar -cvf "benchmark-results-${BENCHMARK_JDK_VERSION}-${BENCHMARK_SCALA_VERSION}.tar" $(git diff --name-only) $(git ls-files --others --exclude=tpcds-sf-1 --exclude-standard)
    - name: Upload benchmark results
      uses: actions/upload-artifact@v4
      with:
        # The split number keeps artifact names unique across matrix entries.
        name: benchmark-results-${{ github.event.inputs.jdk }}-${{ github.event.inputs.scala }}-${{ matrix.split }}
        path: benchmark-results-${{ github.event.inputs.jdk }}-${{ github.event.inputs.scala }}.tar