From c8ac2f3984e39c5a9e7479d9b6f33d59da1f9068 Mon Sep 17 00:00:00 2001
From: "Doroszlai, Attila" <6454655+adoroszlai@users.noreply.github.com>
Date: Thu, 19 Dec 2024 10:19:39 +0100
Subject: [PATCH] [ZEPPELIN-6157] Download artifacts from CDN if available

## What changes were proposed in this pull request?

Artifacts currently available on the CDN (`dlcdn.apache.org`) may be removed without notice when new releases appear. To avoid broken links, the build scripts therefore use permanent addresses at `archive.apache.org`. Downloads from `archive.apache.org`, however, can be slow:

```
Thu, 05 Dec 2024 08:39:53 GMT [INFO] --- download:1.6.0:wget (download-sparkr-files) @ r ---
Thu, 05 Dec 2024 08:39:54 GMT Warning: No signatures were supplied, skipping file validation
Thu, 05 Dec 2024 08:39:54 GMT [INFO] Read Timeout is set to 60000 milliseconds (apprx 1 minutes)
Thu, 05 Dec 2024 08:45:46 GMT [INFO] Expanding: /home/runner/work/zeppelin/zeppelin/rlang/target/spark-3.5.3-bin-without-hadoop.tgz into /home/runner/work/zeppelin/zeppelin/rlang/target
```

Apache Infra's [`closer.lua` script](https://infra.apache.org/release-download-pages.html#closer) redirects to the CDN or to the archive, depending on where the artifact is available. This change replaces the `archive.apache.org` URLs, and one instance of `dist.apache.org`, with their `closer.lua` equivalents. One drawback: the output filename now has to be passed to `wget` explicitly, because `wget` otherwise derives the local filename from the URL, `?action=download` query string included.

https://issues.apache.org/jira/browse/ZEPPELIN-6157

## How was this patch tested?

Tried some of the URLs locally, both from the CLI (`curl -L --head`) and in a regular build (`mvn -DskipTests clean package`); a sketch of the CLI check is shown below the CI links.

Full CI:
- quick: https://github.com/adoroszlai/zeppelin/actions/runs/12319072153
- frontend: https://github.com/adoroszlai/zeppelin/actions/runs/12319072142
- core: https://github.com/adoroszlai/zeppelin/actions/runs/12319072156

Closes #4901 from adoroszlai/ZEPPELIN-6157.
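As a minimal sketch of that manual check (the Spark artifact name is only an example, taken from the log above), the redirect behaviour and the `wget` filename issue can be seen from the CLI:

```bash
# Ask closer.lua for the artifact; -L follows the redirect, --head fetches
# only the response headers of each hop, so the final Location shows whether
# a mirror/CDN copy or archive.apache.org is being served.
curl -L --head "https://www.apache.org/dyn/closer.lua/spark/spark-3.5.3/spark-3.5.3-bin-without-hadoop.tgz?action=download"

# Without -O, wget would save the file as
# "spark-3.5.3-bin-without-hadoop.tgz?action=download",
# hence the explicit output filename throughout this patch:
wget -O spark-3.5.3-bin-without-hadoop.tgz \
  "https://www.apache.org/dyn/closer.lua/spark/spark-3.5.3/spark-3.5.3-bin-without-hadoop.tgz?action=download"
```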
Signed-off-by: Philipp Dallig
---
 docs/quickstart/kubernetes.md                             | 2 +-
 docs/setup/deployment/flink_and_spark_cluster.md          | 4 ++--
 flink/flink-scala-2.12/pom.xml                            | 3 ++-
 rlang/pom.xml                                             | 3 ++-
 scripts/docker/interpreter/Dockerfile                     | 2 +-
 .../spark-cluster-managers/spark_yarn_cluster/Dockerfile  | 4 ++--
 scripts/docker/zeppelin/bin/Dockerfile                    | 2 +-
 spark/interpreter/pom.xml                                 | 6 ++++--
 spark/pom.xml                                             | 4 ++--
 testing/downloadLivy.sh                                   | 6 ++++--
 testing/downloadSpark.sh                                  | 8 +++++---
 11 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/docs/quickstart/kubernetes.md b/docs/quickstart/kubernetes.md
index 470614f2f04..f60003f40a2 100644
--- a/docs/quickstart/kubernetes.md
+++ b/docs/quickstart/kubernetes.md
@@ -179,7 +179,7 @@ $ mv zeppelin-distribution/target/zeppelin-*-bin.tgz scripts/docker/zeppelin/bin
 # Find following section and comment out
 #RUN echo "$LOG_TAG Download Zeppelin binary" && \
-#    wget -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz http://archive.apache.org/dist/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz && \
+#    wget -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz "https://www.apache.org/dyn/closer.lua/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz?action=download" && \
 #    tar -zxvf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
 #    rm -rf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
 #    mv /zeppelin-${Z_VERSION}-bin-all ${ZEPPELIN_HOME}
diff --git a/docs/setup/deployment/flink_and_spark_cluster.md b/docs/setup/deployment/flink_and_spark_cluster.md
index df5df80d9ad..070b2af0f59 100644
--- a/docs/setup/deployment/flink_and_spark_cluster.md
+++ b/docs/setup/deployment/flink_and_spark_cluster.md
@@ -215,7 +215,7 @@ Building from source is recommended where possible, for simplicity in this tuto
 To download the Flink Binary use `wget`
 
 ```bash
-wget "https://archive.apache.org/dist/flink/flink-1.17.1/flink-1.17.1-bin-scala_2.12.tgz"
+wget -O flink-1.17.1-bin-scala_2.12.tgz "https://www.apache.org/dyn/closer.lua/flink/flink-1.17.1/flink-1.17.1-bin-scala_2.12.tgz?action=download"
 tar -xzvf flink-1.17.1-bin-scala_2.12.tgz
 ```
@@ -285,7 +285,7 @@ Using binaries is also
 To download the Spark Binary use `wget`
 
 ```bash
-wget "https://archive.apache.org/dist/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz"
+wget -O spark-3.5.2-bin-hadoop3.tgz "https://www.apache.org/dyn/closer.lua/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz?action=download"
 tar -xzvf spark-3.5.2-bin-hadoop3.tgz
 mv spark-3.5.2-bin-hadoop3 spark
 ```
diff --git a/flink/flink-scala-2.12/pom.xml b/flink/flink-scala-2.12/pom.xml
index f1939861c13..e624f0d3fbb 100644
--- a/flink/flink-scala-2.12/pom.xml
+++ b/flink/flink-scala-2.12/pom.xml
@@ -42,7 +42,7 @@
     10.14.2.0
     5.3.0
-    <flink.bin.download.url>https://archive.apache.org/dist/flink/flink-${flink.version}/flink-${flink.version}-bin-scala_${flink.scala.binary.version}.tgz</flink.bin.download.url>
+    <flink.bin.download.url>https://www.apache.org/dyn/closer.lua/flink/flink-${flink.version}/flink-${flink.version}-bin-scala_${flink.scala.binary.version}.tgz?action=download</flink.bin.download.url>
@@ -1056,6 +1056,7 @@
             <url>${flink.bin.download.url}</url>
             <unpack>true</unpack>
             <outputDirectory>${project.build.directory}</outputDirectory>
+            <outputFileName>flink-${flink.version}-bin-scala_${flink.scala.binary.version}.tgz</outputFileName>
diff --git a/rlang/pom.xml b/rlang/pom.xml
index f70af86c8f6..38852e39b3d 100644
--- a/rlang/pom.xml
+++ b/rlang/pom.xml
@@ -38,7 +38,7 @@
     <spark.archive>spark-${spark.version}</spark.archive>
-    <spark.bin.download.url>https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz</spark.bin.download.url>
+    <spark.bin.download.url>https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz?action=download</spark.bin.download.url>
     zeppelin-interpreter-r
@@ -154,6 +154,7 @@
             <url>${spark.bin.download.url}</url>
             <unpack>true</unpack>
             <outputDirectory>${project.build.directory}</outputDirectory>
+            <outputFileName>${spark.archive}-bin-without-hadoop.tgz</outputFileName>
diff --git a/scripts/docker/interpreter/Dockerfile b/scripts/docker/interpreter/Dockerfile
index ab7f9668e1a..2de94c88c7f 100644
--- a/scripts/docker/interpreter/Dockerfile
+++ b/scripts/docker/interpreter/Dockerfile
@@ -30,7 +30,7 @@ RUN apt-get update && apt-get install -y curl unzip wget grep sed vim tzdata &&
 RUN rm -rf /opt/zeppelin
 RUN rm -rf /spark
-RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz
+RUN wget -O spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz "https://www.apache.org/dyn/closer.lua/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz?action=download"
 RUN tar zxvf spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz
 RUN mv spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME} /opt/spark
 RUN rm spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz
diff --git a/scripts/docker/spark-cluster-managers/spark_yarn_cluster/Dockerfile b/scripts/docker/spark-cluster-managers/spark_yarn_cluster/Dockerfile
index da3df1c10da..01b15308fdc 100644
--- a/scripts/docker/spark-cluster-managers/spark_yarn_cluster/Dockerfile
+++ b/scripts/docker/spark-cluster-managers/spark_yarn_cluster/Dockerfile
@@ -42,7 +42,7 @@ ENV PATH $PATH:$JAVA_HOME/bin
 RUN yum install -y curl which tar sudo openssh-server openssh-clients rsync
 
 # hadoop
-RUN curl -s https://archive.apache.org/dist/hadoop/core/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz | tar -xz -C /usr/local/
+RUN curl -s "https://www.apache.org/dyn/closer.lua/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz?action=download" | tar -xz -C /usr/local/
 RUN cd /usr/local && ln -s ./hadoop-$HADOOP_VERSION hadoop
 
 ENV HADOOP_PREFIX /usr/local/hadoop
@@ -72,7 +72,7 @@ RUN rm /usr/local/hadoop/lib/native/*
 RUN curl -Ls http://dl.bintray.com/sequenceiq/sequenceiq-bin/hadoop-native-64-$HADOOP_VERSION.tar|tar -x -C /usr/local/hadoop/lib/native/
 
 # install spark
-RUN curl -s http://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_PROFILE.tgz | tar -xz -C /usr/local/
+RUN curl -s "https://www.apache.org/dyn/closer.lua/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_PROFILE.tgz?action=download" | tar -xz -C /usr/local/
 RUN cd /usr/local && ln -s spark-$SPARK_VERSION-bin-hadoop$HADOOP_PROFILE spark
 
 ENV SPARK_HOME /usr/local/spark
diff --git a/scripts/docker/zeppelin/bin/Dockerfile b/scripts/docker/zeppelin/bin/Dockerfile
index a04c077a085..40a3026711a 100644
--- a/scripts/docker/zeppelin/bin/Dockerfile
+++ b/scripts/docker/zeppelin/bin/Dockerfile
@@ -65,7 +65,7 @@ ENV PATH /opt/conda/envs/python_3_with_R/bin:/opt/conda/bin:$PATH
 RUN echo "$LOG_TAG Download Zeppelin binary" && \
     mkdir -p ${ZEPPELIN_HOME} && \
-    wget -nv -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz https://archive.apache.org/dist/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz && \
+    wget -nv -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz "https://www.apache.org/dyn/closer.lua/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz?action=download" && \
     tar --strip-components=1 -zxvf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz -C ${ZEPPELIN_HOME} && \
     rm -f /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
     chown -R root:root ${ZEPPELIN_HOME} && \
diff --git a/spark/interpreter/pom.xml b/spark/interpreter/pom.xml
index 2fbfc042b7f..f77ca360175 100644
--- a/spark/interpreter/pom.xml
+++ b/spark/interpreter/pom.xml
@@ -48,10 +48,10 @@
     <spark.archive>spark-${spark.version}</spark.archive>
-    <spark.src.download.url>https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}.tgz</spark.src.download.url>
+    <spark.src.download.url>https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}.tgz?action=download</spark.src.download.url>
-    <spark.bin.download.url>https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz</spark.bin.download.url>
+    <spark.bin.download.url>https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz?action=download</spark.bin.download.url>
     ${spark.scala.version}
@@ -280,6 +280,7 @@
             <unpack>true</unpack>
             <url>${spark.src.download.url}</url>
             <outputDirectory>${project.build.directory}</outputDirectory>
+            <outputFileName>${spark.archive}.tgz</outputFileName>
@@ -295,6 +296,7 @@
             <url>${spark.bin.download.url}</url>
             <unpack>true</unpack>
             <outputDirectory>${project.build.directory}</outputDirectory>
+            <outputFileName>${spark.archive}-bin-without-hadoop.tgz</outputFileName>
diff --git a/spark/pom.xml b/spark/pom.xml
index 9e5c9738115..5f122432d83 100644
--- a/spark/pom.xml
+++ b/spark/pom.xml
@@ -45,10 +45,10 @@
     <spark.archive>spark-${spark.version}</spark.archive>
-    <spark.src.download.url>https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}.tgz</spark.src.download.url>
+    <spark.src.download.url>https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}.tgz?action=download</spark.src.download.url>
-    <spark.bin.download.url>https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz</spark.bin.download.url>
+    <spark.bin.download.url>https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz?action=download</spark.bin.download.url>
diff --git a/testing/downloadLivy.sh b/testing/downloadLivy.sh
index f09837a7574..fadd9973ee3 100755
--- a/testing/downloadLivy.sh
+++ b/testing/downloadLivy.sh
@@ -45,12 +45,14 @@ ZEPPELIN_HOME="$(cd "${FWDIR}/.."; pwd)"
 # None
 # Arguments:
 #   url - source URL
+#   file - output filename
 # Returns:
 #   None
 #######################################
 download_with_retry() {
   local url="$1"
-  wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 "${url}"
+  local file="${2:-$(basename $url)}"
+  wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 --output-document "${file}" "${url}"
   if [[ "$?" -ne 0 ]]; then
     echo "3 download attempts for ${url} failed"
   fi
@@ -72,7 +74,7 @@ if [[ ! -d "${LIVY_HOME}" ]]; then
   # download livy from archive if not cached
   echo "${LIVY_VERSION} being downloaded from archives"
   STARTTIME=`date +%s`
-  download_with_retry "https://dist.apache.org/repos/dist/release/incubator/livy/${LIVY_VERSION}/${LIVY_ARCHIVE}.zip"
+  download_with_retry "https://www.apache.org/dyn/closer.lua/incubator/livy/${LIVY_VERSION}/${LIVY_ARCHIVE}.zip?action=download" "${LIVY_ARCHIVE}.zip"
   ENDTIME=`date +%s`
   DOWNLOADTIME="$((ENDTIME-STARTTIME))"
 fi
diff --git a/testing/downloadSpark.sh b/testing/downloadSpark.sh
index 9c19e82bbc9..118097b0009 100755
--- a/testing/downloadSpark.sh
+++ b/testing/downloadSpark.sh
@@ -38,12 +38,14 @@ ZEPPELIN_HOME="$(cd "${FWDIR}/.."; pwd)"
 # None
 # Arguments:
 #   url - source URL
+#   file - output filename
 # Returns:
 #   None
 #######################################
 download_with_retry() {
   local url="$1"
-  wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 "${url}"
+  local file="${2:-$(basename $url)}"
+  wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 --output-document "${file}" "${url}"
   if [[ "$?" -ne 0 ]]; then
     echo "3 download attempts for ${url} failed"
   fi
-d "${SPARK_HOME}" ]]; then # download spark from archive if not cached echo "${SPARK_VERSION} being downloaded from archives" STARTTIME=`date +%s` - #timeout -s KILL "${MAX_DOWNLOAD_TIME_SEC}" wget "http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz" - download_with_retry "http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz" + #timeout -s KILL "${MAX_DOWNLOAD_TIME_SEC}" wget -O "${SPARK_ARCHIVE}.tgz" "https://www.apache.org/dyn/closer.lua/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz?action=download" + download_with_retry "https://www.apache.org/dyn/closer.lua/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz?action=download" "${SPARK_ARCHIVE}.tgz" ENDTIME=`date +%s` DOWNLOADTIME="$((ENDTIME-STARTTIME))" fi