[ZEPPELIN-5969] Remove Hadoop2 and move to Hadoop3 shaded client (apache#4691)

* Drop hadoop2 in github actions

* Update docs

* Drop hadoop2 support

* Remove hadoop2 integration tests

* findbugs use the same version in all modules

* Use hadoop3.3 for tests

* Move to scala 2.12

* Try to fix flink

* Usage of metals

* Remove duplicate version and groupid

* Fix Flink with Hadoop3

* fix log

* R

* fix

* fix

* fix

* fix

* hadoop-3.3

* fix

* fix

* Address comments

* address comments

---------

Co-authored-by: Philipp Dallig <[email protected]>
pan3793 and Reamer authored Apr 3, 2024
1 parent f2253f1 commit fa6e3ee
Showing 39 changed files with 693 additions and 1,964 deletions.
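
The thrust of the change: the per-module Hadoop 2 jars (`hadoop-common`, `hadoop-client`, `hadoop-yarn-client`) are replaced by the two shaded Hadoop 3 client artifacts, `hadoop-client-api` and `hadoop-client-runtime`. As a rough sketch of the coordinates involved, they can be resolved locally like this (3.3.6 is an assumed example version; the build takes the real value from the `hadoop.version` property):

```bash
# Sketch only: fetch the shaded Hadoop 3 client artifacts this commit standardizes on.
# The version 3.3.6 is an illustrative assumption, not the value pinned by the build.
./mvnw dependency:get -Dartifact=org.apache.hadoop:hadoop-client-api:3.3.6
./mvnw dependency:get -Dartifact=org.apache.hadoop:hadoop-client-runtime:3.3.6
```
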
20 changes: 10 additions & 10 deletions .github/workflows/core.yml
@@ -40,7 +40,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- hadoop: [hadoop2, hadoop3]
+ hadoop: [hadoop3]
java: [ 8, 11 ]
steps:
- name: Checkout
@@ -183,7 +183,7 @@ jobs:
R -e "IRkernel::installspec()"
- name: install environment
run: |
- ./mvnw install -DskipTests -pl python,rlang,zeppelin-jupyter-interpreter -am -Phadoop2 ${MAVEN_ARGS}
+ ./mvnw install -DskipTests -pl python,rlang,zeppelin-jupyter-interpreter -am -Phadoop3 ${MAVEN_ARGS}
- name: run tests with ${{ matrix.python }}
run: |
./mvnw test -pl python,rlang,zeppelin-jupyter-interpreter -DfailIfNoTests=false ${MAVEN_ARGS}
@@ -221,7 +221,7 @@ jobs:
${{ runner.os }}-zeppelin-
- name: install environment
run: |
- ./mvnw install -DskipTests -Phadoop2 -Pintegration -pl zeppelin-interpreter-integration,zeppelin-web,spark-submit,spark/scala-2.12,spark/scala-2.13,markdown,flink-cmd,flink/flink-scala-2.12,jdbc,shell -am -Pflink-117 ${MAVEN_ARGS}
+ ./mvnw install -DskipTests -Phadoop3 -Pintegration -pl zeppelin-interpreter-integration,zeppelin-web,spark-submit,spark/scala-2.12,spark/scala-2.13,markdown,flink-cmd,flink/flink-scala-2.12,jdbc,shell -am -Pflink-117 ${MAVEN_ARGS}
./mvnw package -pl zeppelin-plugins -amd -DskipTests ${MAVEN_ARGS}
- name: Setup conda environment with python 3.9 and R
uses: conda-incubator/setup-miniconda@v2
@@ -238,7 +238,7 @@
run: |
R -e "IRkernel::installspec()"
- name: run tests
- run: ./mvnw test -pl zeppelin-interpreter-integration -Phadoop2 -Pintegration -DfailIfNoTests=false -Dtest=ZeppelinClientIntegrationTest,ZeppelinClientWithAuthIntegrationTest,ZSessionIntegrationTest,ShellIntegrationTest,JdbcIntegrationTest
+ run: ./mvnw test -pl zeppelin-interpreter-integration -Phadoop3 -Pintegration -DfailIfNoTests=false -Dtest=ZeppelinClientIntegrationTest,ZeppelinClientWithAuthIntegrationTest,ZSessionIntegrationTest,ShellIntegrationTest,JdbcIntegrationTest
- name: Print zeppelin logs
if: always()
run: if [ -d "logs" ]; then cat logs/*; fi
@@ -278,7 +278,7 @@ jobs:
${{ runner.os }}-zeppelin-
- name: install environment for flink
run: |
- ./mvnw install -DskipTests -am -pl flink/flink-scala-2.12,flink-cmd,zeppelin-interpreter-integration -Pflink-${{ matrix.flink }} -Phadoop2 -Pintegration ${MAVEN_ARGS}
+ ./mvnw install -DskipTests -am -pl flink/flink-scala-2.12,flink-cmd,zeppelin-interpreter-integration -Pflink-${{ matrix.flink }} -Phadoop3 -Pintegration ${MAVEN_ARGS}
./mvnw clean package -pl zeppelin-plugins -amd -DskipTests ${MAVEN_ARGS}
- name: Setup conda environment with python ${{ matrix.python }} and R
uses: conda-incubator/setup-miniconda@v2
@@ -292,7 +292,7 @@
auto-activate-base: false
use-mamba: true
- name: run tests for flink
- run: ./mvnw verify -pl flink/flink-scala-2.12,flink-cmd,zeppelin-interpreter-integration -Pflink-${{ matrix.flink }} -am -Phadoop2 -Pintegration -DfailIfNoTests=false -Dtest=org.apache.zeppelin.flink.*Test,FlinkIntegrationTest${{ matrix.flink }} ${MAVEN_ARGS}
+ run: ./mvnw verify -pl flink/flink-scala-2.12,flink-cmd,zeppelin-interpreter-integration -Pflink-${{ matrix.flink }} -am -Phadoop3 -Pintegration -DfailIfNoTests=false -Dtest=org.apache.zeppelin.flink.*Test,FlinkIntegrationTest${{ matrix.flink }} ${MAVEN_ARGS}
- name: Print zeppelin logs
if: always()
run: if [ -d "logs" ]; then cat logs/*; fi
@@ -328,7 +328,7 @@ jobs:
${{ runner.os }}-zeppelin-
- name: install environment
run: |
- ./mvnw install -DskipTests -pl zeppelin-interpreter-integration,zeppelin-web,spark-submit,spark/scala-2.12,spark/scala-2.13,markdown -am -Phadoop2 -Pintegration ${MAVEN_ARGS}
+ ./mvnw install -DskipTests -pl zeppelin-interpreter-integration,zeppelin-web,spark-submit,spark/scala-2.12,spark/scala-2.13,markdown -am -Phadoop3 -Pintegration ${MAVEN_ARGS}
./mvnw clean package -pl zeppelin-plugins -amd -DskipTests ${MAVEN_ARGS}
- name: Setup conda environment with python 3.9 and R
uses: conda-incubator/setup-miniconda@v2
@@ -382,7 +382,7 @@ jobs:
restore-keys: |
${{ runner.os }}-zeppelin-
- name: install environment
- run: ./mvnw install -DskipTests -pl spark-submit,spark/scala-2.12,spark/scala-2.13 -am -Phadoop2 ${MAVEN_ARGS}
+ run: ./mvnw install -DskipTests -pl spark-submit,spark/scala-2.12,spark/scala-2.13 -am -Phadoop3 ${MAVEN_ARGS}
- name: Setup conda environment with python ${{ matrix.python }} and R
uses: conda-incubator/setup-miniconda@v2
with:
@@ -400,11 +400,11 @@
- name: run spark-3.2 tests with scala-2.12 and python-${{ matrix.python }}
run: |
rm -rf spark/interpreter/metastore_db
- ./mvnw verify -pl spark-submit,spark/interpreter -am -Dtest=org/apache/zeppelin/spark/* -Pspark-3.2 -Pspark-scala-2.12 -Phadoop2 -Pintegration -DfailIfNoTests=false ${MAVEN_ARGS}
+ ./mvnw verify -pl spark-submit,spark/interpreter -am -Dtest=org/apache/zeppelin/spark/* -Pspark-3.2 -Pspark-scala-2.12 -Phadoop3 -Pintegration -DfailIfNoTests=false ${MAVEN_ARGS}
- name: run spark-3.2 tests with scala-2.13 and python-${{ matrix.python }}
run: |
rm -rf spark/interpreter/metastore_db
- ./mvnw verify -pl spark-submit,spark/interpreter -am -Dtest=org/apache/zeppelin/spark/* -Pspark-3.2 -Pspark-scala-2.13 -Phadoop2 -Pintegration -DfailIfNoTests=false ${MAVEN_ARGS}
+ ./mvnw verify -pl spark-submit,spark/interpreter -am -Dtest=org/apache/zeppelin/spark/* -Pspark-3.2 -Pspark-scala-2.13 -Phadoop3 -Pintegration -DfailIfNoTests=false ${MAVEN_ARGS}
- name: run spark-3.3 tests with scala-2.12 and python-${{ matrix.python }}
run: |
rm -rf spark/interpreter/metastore_db
8 changes: 4 additions & 4 deletions .github/workflows/frontend.yml
@@ -53,9 +53,9 @@ jobs:
restore-keys: |
${{ runner.os }}-zeppelin-
- name: Install application
- run: ./mvnw clean install -DskipTests -am -pl zeppelin-web -Pspark-scala-2.12 -Pspark-3.4 -Phadoop2 -Pweb-dist ${MAVEN_ARGS}
+ run: ./mvnw clean install -DskipTests -am -pl zeppelin-web -Pspark-scala-2.12 -Pspark-3.4 -Phadoop3 -Pweb-dist ${MAVEN_ARGS}
- name: Run headless test
- run: xvfb-run --auto-servernum --server-args="-screen 0 1024x768x24" ./mvnw verify -pl zeppelin-web -Pspark-scala-2.12 -Pspark-3.4 -Phadoop2 -Pweb-dist -Pweb-e2e ${MAVEN_ARGS}
+ run: xvfb-run --auto-servernum --server-args="-screen 0 1024x768x24" ./mvnw verify -pl zeppelin-web -Pspark-scala-2.12 -Pspark-3.4 -Phadoop3 -Pweb-dist -Pweb-e2e ${MAVEN_ARGS}
- name: Print zeppelin logs
if: always()
run: if [ -d "logs" ]; then cat logs/*; fi
@@ -128,10 +128,10 @@ jobs:
R -e "IRkernel::installspec()"
- name: Install Environment
run: |
- ./mvnw clean install -DskipTests -am -pl zeppelin-integration -Pintegration -Pspark-scala-2.12 -Pspark-3.4 -Phadoop2 -Pweb-dist ${MAVEN_ARGS}
+ ./mvnw clean install -DskipTests -am -pl zeppelin-integration -Pintegration -Pspark-scala-2.12 -Pspark-3.4 -Phadoop3 -Pweb-dist ${MAVEN_ARGS}
- name: run tests
run: |
- source ./testing/downloadSpark.sh "3.4.1" "3" && echo "SPARK_HOME: ${SPARK_HOME}" && xvfb-run --auto-servernum --server-args="-screen 0 1600x1024x16" ./mvnw verify -DfailIfNoTests=false -pl zeppelin-integration -Pintegration -Pspark-scala-2.12 -Pspark-3.4 -Phadoop2 -Pweb-dist -Pusing-source-tree ${MAVEN_ARGS}
+ source ./testing/downloadSpark.sh "3.4.1" "3" && echo "SPARK_HOME: ${SPARK_HOME}" && xvfb-run --auto-servernum --server-args="-screen 0 1600x1024x16" ./mvnw verify -DfailIfNoTests=false -pl zeppelin-integration -Pintegration -Pspark-scala-2.12 -Pspark-3.4 -Phadoop3 -Pweb-dist -Pusing-source-tree ${MAVEN_ARGS}
- name: Print zeppelin logs
if: always()
run: if [ -d "logs" ]; then cat logs/*; fi
2 changes: 1 addition & 1 deletion .github/workflows/quick.yml
@@ -41,7 +41,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- hadoop: [hadoop2, hadoop3]
+ hadoop: [hadoop3]
steps:
- name: Checkout
uses: actions/checkout@v3
4 changes: 4 additions & 0 deletions .gitignore
@@ -10,6 +10,10 @@
/interpreter/*
!/interpreter/lib

+ # metals
+ .bloop
+ .metals

# interpreter temp files
derby.log
spark/metastore_db
16 changes: 14 additions & 2 deletions alluxio/pom.xml
@@ -68,15 +68,27 @@
<artifactId>alluxio-minicluster</artifactId>
<version>${alluxio.version}</version>
<scope>test</scope>
+ <exclusions>
+ <exclusion>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-client</artifactId>
+ </exclusion>
+ </exclusions>
</dependency>

<dependency>
<groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-common</artifactId>
- <version>3.2.4</version>
+ <artifactId>hadoop-client-api</artifactId>
+ <version>${hadoop.version}</version>
<scope>test</scope>
</dependency>

+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-client-runtime</artifactId>
+ <version>${hadoop.version}</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
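
A quick way to check that the `hadoop-client` exclusion above took effect is to inspect the module's resolved Hadoop dependencies — a sketch, assuming the module directory is `alluxio` and its siblings have already been installed (e.g. via `./mvnw install -DskipTests`):

```bash
# Show only the org.apache.hadoop artifacts on the alluxio module's classpath;
# after this change the test scope should pull hadoop-client-api / hadoop-client-runtime
# instead of the transitive hadoop-client from alluxio-minicluster.
./mvnw -pl alluxio dependency:tree -Dincludes=org.apache.hadoop
```
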

<build>
2 changes: 1 addition & 1 deletion docs/interpreter/flink.md
@@ -301,7 +301,7 @@ You can also add and set other Flink properties which are not listed in the table
</tr>
<tr>
<td>zeppelin.flink.hive.version</td>
- <td>2.3.4</td>
+ <td>2.3.7</td>
<td>Hive version that you would like to connect</td>
</tr>
<tr>
1 change: 0 additions & 1 deletion docs/setup/basics/how_to_build.md
@@ -123,7 +123,6 @@ Set hadoop major version (default hadoop3).
Available profiles are

```
- -Phadoop2
-Phadoop3
```
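
With the hadoop2 profile gone, a representative local build against the shaded Hadoop 3 client looks like the following — a sketch using only flags that appear elsewhere in this change:

```bash
# Build all modules against the Hadoop 3 shaded client (the only remaining Hadoop profile)
./mvnw clean install -DskipTests -Phadoop3
```
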

16 changes: 8 additions & 8 deletions docs/setup/deployment/flink_and_spark_cluster.md
@@ -225,16 +225,16 @@ Building from source is recommended where possible, for simplicity in this tutorial
To download the Flink Binary use `wget`

```bash
wget "http://mirror.cogentco.com/pub/apache/flink/flink-1.1.3/flink-1.1.3-bin-hadoop24-scala_2.10.tgz"
tar -xzvf flink-1.1.3-bin-hadoop24-scala_2.10.tgz
wget "http://mirror.cogentco.com/pub/apache/flink/flink-1.16.2/flink-1.16.2-bin-scala_2.12.tgz"
tar -xzvf flink-1.16.2-bin-scala_2.12.tgz
```

- This will download Flink 1.1.3, compatible with Hadoop 2.4. You do not have to install Hadoop for this binary to work, but if you are using Hadoop, please change `24` to your appropriate version.
+ This will download Flink 1.16.2.

Start the Flink Cluster.

```bash
- flink-1.1.3/bin/start-cluster.sh
+ flink-1.16.2/bin/start-cluster.sh
```
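
Once the standalone cluster is up, a quick smoke test is to hit the REST endpoint and submit one of the bundled examples — a sketch, assuming the default web UI port 8081 and the example jars shipped in the binary distribution:

```bash
# Confirm the standalone cluster is reachable (default REST/web UI port is 8081)
curl -s http://localhost:8081/overview
# Submit a bundled example job to the running cluster
flink-1.16.2/bin/flink run flink-1.16.2/examples/streaming/WordCount.jar
```
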

###### Building From source
@@ -295,12 +295,12 @@ Using binaries is also
To download the Spark Binary use `wget`

```bash
wget "http://d3kbcqa49mib13.cloudfront.net/spark-1.6.3-bin-hadoop2.6.tgz"
tar -xzvf spark-1.6.3-bin-hadoop2.6.tgz
mv spark-1.6.3-bin-hadoop2.6 spark
wget "https://dlcdn.apache.org/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz"
tar -xzvf spark-3.4.1-bin-hadoop3.tgz
mv spark-3.4.1-bin-hadoop3 spark
```

- This will download Spark 1.6.3, compatible with Hadoop 2.6. You do not have to install Hadoop for this binary to work, but if you are using Hadoop, please change `2.6` to your appropriate version.
+ This will download Spark 3.4.1, compatible with Hadoop 3. You do not have to install Hadoop for this binary to work, but if you are using Hadoop, please change `3` to your appropriate version.
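
Before wiring the binary into Zeppelin, a quick sanity check — a sketch, assuming the `spark` directory created by the commands above:

```bash
# Point SPARK_HOME at the unpacked distribution and confirm the version it reports
export SPARK_HOME="$(pwd)/spark"
"$SPARK_HOME/bin/spark-submit" --version
```
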

###### Building From source

9 changes: 1 addition & 8 deletions flink-cmd/pom.xml
@@ -44,14 +44,7 @@

<dependency>
<groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-common</artifactId>
- <version>${hadoop.version}</version>
- <scope>provided</scope>
- </dependency>
-
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-yarn-client</artifactId>
+ <artifactId>hadoop-client-runtime</artifactId>
<version>${hadoop.version}</version>
<scope>provided</scope>
</dependency>