Skip to content

Commit

Permalink
upgrade
Browse files Browse the repository at this point in the history
  • Loading branch information
BePPPower committed May 21, 2024
1 parent af9f5a2 commit 466bddc
Show file tree
Hide file tree
Showing 5 changed files with 227 additions and 15 deletions.
7 changes: 7 additions & 0 deletions thirdparty/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,13 @@ Now there will be 2 set of libhdfs, one is without kerberos, the other is with k

## v20211215

## v20240521
- Modified: arrow 7.0.0 -> 13.0.0
- Modified: jemalloc for arrow 5.2.1 -> 5.3.0
- Modified: xsimd aeec9c8 -> 9.0.1
- Added: c-ares -> 1.19.1
- Added: grpc -> 1.54.3

### Changes

- Added: cyrus-sasl
Expand Down
72 changes: 71 additions & 1 deletion thirdparty/build-thirdparty.sh
Original file line number Diff line number Diff line change
Expand Up @@ -958,6 +958,8 @@ build_arrow() {
export ARROW_ZLIB_URL="${TP_SOURCE_DIR}/${ZLIB_NAME}"
export ARROW_XSIMD_URL="${TP_SOURCE_DIR}/${XSIMD_NAME}"
export ARROW_ORC_URL="${TP_SOURCE_DIR}/${ORC_NAME}"
export ARROW_GRPC_URL="${TP_SOURCE_DIR}/${GRPC_NAME}"
export ARROW_PROTOBUF_URL="${TP_SOURCE_DIR}/${PROTOBUF_NAME}"

if [[ "${KERNEL}" != 'Darwin' ]]; then
ldflags="-L${TP_LIB_DIR} -static-libstdc++ -static-libgcc"
Expand All @@ -973,22 +975,38 @@ build_arrow() {
-DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \
-DCMAKE_INSTALL_LIBDIR=lib64 \
-DARROW_BOOST_USE_SHARED=OFF \
-DARROW_WITH_GRPC=ON \
-DgRPC_SOURCE=SYSTEM \
-DgRPC_ROOT="${TP_INSTALL_DIR}" \
-DARROW_WITH_PROTOBUF=ON \
-DProtobuf_SOURCE=SYSTEM \
-DProtobuf_LIB="${TP_INSTALL_DIR}/lib/libprotoc.a" -DProtobuf_INCLUDE_DIR="${TP_INSTALL_DIR}/include" \
-DARROW_FLIGHT=ON \
-DARROW_FLIGHT_SQL=ON \
-DBoost_USE_STATIC_RUNTIME=ON \
-DARROW_GFLAGS_USE_SHARED=OFF \
-Dgflags_ROOT="${TP_INSTALL_DIR}" \
-DGLOG_ROOT="${TP_INSTALL_DIR}" \
-DRE2_ROOT="${TP_INSTALL_DIR}" \
-DZLIB_SOURCE=SYSTEM \
-DZLIB_LIBRARY="${TP_INSTALL_DIR}/lib/libz.a" -DZLIB_INCLUDE_DIR="${TP_INSTALL_DIR}/include" \
-DRapidJSON_SOURCE=SYSTEM \
-DRapidJSON_ROOT="${TP_INSTALL_DIR}" \
-DORC_ROOT="${TP_INSTALL_DIR}" \
-Dxsimd_SOURCE=BUNDLED \
-DBrotli_SOURCE=BUNDLED \
-DARROW_LZ4_USE_SHARED=OFF \
-DLZ4_LIB="${TP_INSTALL_DIR}/lib/liblz4.a" -DLZ4_INCLUDE_DIR="${TP_INSTALL_DIR}/include/lz4" \
-DLz4_SOURCE=SYSTEM \
-DARROW_ZSTD_USE_SHARED=OFF \
-DZSTD_LIB="${TP_INSTALL_DIR}/lib/libzstd.a" -DZSTD_INCLUDE_DIR="${TP_INSTALL_DIR}/include" \
-Dzstd_SOURCE=SYSTEM \
-DSnappy_LIB="${TP_INSTALL_DIR}/lib/libsnappy.a" -DSnappy_INCLUDE_DIR="${TP_INSTALL_DIR}/include" \
-DSnappy_SOURCE=SYSTEM \
-DBOOST_ROOT="${TP_INSTALL_DIR}" --no-warn-unused-cli \
-Djemalloc_SOURCE=BUNDLED \
-DARROW_THRIFT_USE_SHARED=OFF \
-DThrift_SOURCE=SYSTEM \
-DThrift_ROOT="${TP_INSTALL_DIR}" ..

"${BUILD_SYSTEM}" -j "${PARALLEL}"
Expand Down Expand Up @@ -1662,6 +1680,56 @@ build_libdeflate() {
"${BUILD_SYSTEM}" install
}

# c-ares
# Builds c-ares as a static-only, position-independent library and installs it
# into ${TP_INSTALL_DIR}. It must be built before grpc, which is configured to
# pick up an already-installed c-ares package.
build_cares() {
    check_if_source_exist "${CARES_SOURCE}"
    cd "${TP_SOURCE_DIR}/${CARES_SOURCE}"

    # Out-of-source build directory inside the unpacked c-ares tree.
    mkdir -p build
    cd build
    cmake -DCMAKE_BUILD_TYPE=Release \
        -DCARES_STATIC=ON \
        -DCARES_SHARED=OFF \
        -DCARES_STATIC_PIC=ON \
        -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" ..
    # Honor the configured parallelism, consistent with the other build steps,
    # instead of compiling serially.
    make -j "${PARALLEL}"
    make install
}

# grpc
# Configures, builds and installs gRPC (Release, PIC, tests disabled) into
# ${TP_INSTALL_DIR}. Every dependency provider (c-ares, abseil, protobuf, re2,
# OpenSSL, zlib) is set to "package" and pointed at ${TP_INSTALL_DIR}, so those
# libraries have to be installed there before this function runs.
build_grpc() {
    check_if_source_exist "${GRPC_SOURCE}"
    cd "${TP_SOURCE_DIR}/${GRPC_SOURCE}"

    mkdir -p cmake/build
    cd cmake/build

    # CMake options, kept in the order they are passed on the command line.
    local -a grpc_cmake_opts=(
        -DgRPC_INSTALL=ON
        -DgRPC_BUILD_TESTS=OFF
        -DCMAKE_BUILD_TYPE=Release
        -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}"
        # c-ares
        -DgRPC_CARES_PROVIDER=package
        -Dc-ares_DIR="${TP_INSTALL_DIR}"
        # abseil
        -DgRPC_ABSL_PROVIDER=package
        -Dabsl_DIR="${TP_INSTALL_DIR}"
        # protobuf
        -DgRPC_PROTOBUF_PROVIDER=package
        -DProtobuf_DIR="${TP_INSTALL_DIR}"
        # re2
        -DgRPC_RE2_PROVIDER=package
        -Dre2_DIR:STRING="${TP_INSTALL_DIR}"
        # OpenSSL
        -DgRPC_SSL_PROVIDER=package
        -DOPENSSL_ROOT_DIR="${TP_INSTALL_DIR}"
        # zlib
        -DgRPC_ZLIB_PROVIDER=package
        -DZLIB_ROOT="${TP_INSTALL_DIR}"
        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
    )

    cmake "${grpc_cmake_opts[@]}" ../..

    make -j "${PARALLEL}"
    make install

    # Workaround kept for reference, currently disabled: for grpc > v1.55 the
    # installed gRPCConfig.cmake uses find_dependency, which the CMake version in
    # use reportedly does not support (the original note says "cmake 2.22";
    # NOTE(review): presumably 3.22 - confirm). Remove this note once the CMake
    # version has been upgraded.
    # sed -i 's/find_dependency/find_package/g' "${TP_INSTALL_DIR}"/lib64/cmake/grpc/gRPCConfig.cmake
}

if [[ "${#packages[@]}" -eq 0 ]]; then
packages=(
libunixodbc
Expand All @@ -1673,9 +1741,9 @@ if [[ "${#packages[@]}" -eq 0 ]]; then
lzo2
zstd
boost # must before thrift
protobuf
gflags
gtest
protobuf # after gtest
glog
rapidjson
snappy
Expand All @@ -1693,6 +1761,8 @@ if [[ "${#packages[@]}" -eq 0 ]]; then
librdkafka
flatbuffers
orc
cares
grpc # after cares, protobuf
arrow
abseil
s2
Expand Down
4 changes: 2 additions & 2 deletions thirdparty/download-thirdparty.sh
Original file line number Diff line number Diff line change
Expand Up @@ -323,10 +323,10 @@ fi
echo "Finished patching ${OPENTELEMETRY_SOURCE}"

# arrow patch is used to get the raw orc reader for filter prune.
if [[ "${ARROW_SOURCE}" == "apache-arrow-7.0.0" ]]; then
if [[ "${ARROW_SOURCE}" == "arrow-apache-arrow-13.0.0" ]]; then
cd "${TP_SOURCE_DIR}/${ARROW_SOURCE}"
if [[ ! -f "${PATCHED_MARK}" ]]; then
patch -p1 <"${TP_PATCH_DIR}/apache-arrow-7.0.0.patch"
patch -p1 <"${TP_PATCH_DIR}/apache-arrow-13.0.0.patch"
touch "${PATCHED_MARK}"
fi
cd -
Expand Down
120 changes: 120 additions & 0 deletions thirdparty/patches/apache-arrow-13.0.0.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc
index 2466e7433..46b4402d4 100644
--- a/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -47,9 +47,6 @@
#include "arrow/util/visibility.h"
#include "orc/Exceptions.hh"

-// alias to not interfere with nested orc namespace
-namespace liborc = orc;
-
#define ORC_THROW_NOT_OK(s) \
do { \
Status _s = (s); \
@@ -202,6 +199,8 @@ class ORCFileReader::Impl {
return Init();
}

+ virtual liborc::Reader* GetRawORCReader() { return reader_.get(); }
+
Status Init() {
int64_t nstripes = reader_->getNumberOfStripes();
stripes_.resize(nstripes);
@@ -479,6 +478,31 @@ class ORCFileReader::Impl {
return Status::OK();
}

+ Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
+ int64_t batch_size, const std::vector<std::string>& include_names) {
+ if (current_row_ >= NumberOfRows()) {
+ return nullptr;
+ }
+
+ liborc::RowReaderOptions opts = default_row_reader_options();
+ if (!include_names.empty()) {
+ RETURN_NOT_OK(SelectNames(&opts, include_names));
+ }
+ StripeInformation stripe_info({0, 0, 0, 0});
+ RETURN_NOT_OK(SelectStripeWithRowNumber(&opts, current_row_, &stripe_info));
+ ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts));
+ std::unique_ptr<liborc::RowReader> row_reader;
+
+ ORC_BEGIN_CATCH_NOT_OK
+ row_reader = reader_->createRowReader(opts);
+ row_reader->seekToRow(current_row_);
+ current_row_ = stripe_info.first_row_id + stripe_info.num_rows;
+ ORC_END_CATCH_NOT_OK
+
+ return std::make_shared<OrcStripeReader>(std::move(row_reader), schema, batch_size,
+ pool_);
+ }
+
Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
int64_t batch_size, const std::vector<int>& include_indices) {
if (current_row_ >= NumberOfRows()) {
@@ -544,6 +568,8 @@ Result<std::unique_ptr<ORCFileReader>> ORCFileReader::Open(
return std::move(result);
}

+liborc::Reader* ORCFileReader::GetRawORCReader() { return impl_->GetRawORCReader(); }
+
Result<std::shared_ptr<const KeyValueMetadata>> ORCFileReader::ReadMetadata() {
return impl_->ReadMetadata();
}
@@ -605,6 +631,11 @@ Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader(
return impl_->NextStripeReader(batch_size, include_indices);
}

+Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader(
+ int64_t batch_size, const std::vector<std::string>& include_names) {
+ return impl_->NextStripeReader(batch_size, include_names);
+}
+
int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); }

int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); }
diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h
index 013be7860..7fd06bcb8 100644
--- a/cpp/src/arrow/adapters/orc/adapter.h
+++ b/cpp/src/arrow/adapters/orc/adapter.h
@@ -30,6 +30,10 @@
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
+#include "orc/Reader.hh"
+
+// alias to not interfere with nested orc namespace
+namespace liborc = orc;

namespace arrow {
namespace adapters {
@@ -53,6 +57,9 @@ class ARROW_EXPORT ORCFileReader {
public:
~ORCFileReader();

+ /// \brief Get ORC reader from inside.
+ liborc::Reader* GetRawORCReader();
+
/// \brief Creates a new ORC reader
///
/// \param[in] file the data source
@@ -174,6 +181,19 @@ class ARROW_EXPORT ORCFileReader {
Result<std::shared_ptr<RecordBatchReader>> GetRecordBatchReader(
int64_t batch_size, const std::vector<std::string>& include_names);

+ /// \brief Get a stripe level record batch iterator with specified row count
+ /// in each record batch. NextStripeReader serves as a fine grain
+ /// alternative to ReadStripe which may cause OOM issue by loading
+ /// the whole stripes into memory.
+ ///
+ /// \param[in] batch_size Get a stripe level record batch iterator with specified row
+ /// count in each record batch.
+ ///
+ /// \param[in] include_names the selected field names to read
+ /// \return the returned stripe reader
+ Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
+ int64_t batch_size, const std::vector<std::string>& include_names);
+
/// \brief The number of stripes in the file
int64_t NumberOfStripes();
39 changes: 27 additions & 12 deletions thirdparty/vars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -238,11 +238,24 @@ FLATBUFFERS_NAME=flatbuffers-2.0.0.tar.gz
FLATBUFFERS_SOURCE=flatbuffers-2.0.0
FLATBUFFERS_MD5SUM="a27992324c3cbf86dd888268a23d17bd"

# c-ares
CARES_DOWNLOAD="https://github.com/c-ares/c-ares/releases/download/cares-1_19_1/c-ares-1.19.1.tar.gz"
CARES_NAME="c-ares-1.19.1.tar.gz"
CARES_SOURCE=c-ares-1.19.1
CARES_MD5SUM="dafc5825a92dc907e144570e4e75a908"

# grpc
# grpc v1.55 and above require protobuf >= 22
GRPC_DOWNLOAD="https://github.com/grpc/grpc/archive/refs/tags/v1.54.3.tar.gz"
GRPC_NAME="grpc-v1.54.3.tar.gz"
GRPC_SOURCE=grpc-1.54.3
GRPC_MD5SUM="af00a2edeae0f02bb25917cc3473b7de"

# arrow
ARROW_DOWNLOAD="https://archive.apache.org/dist/arrow/arrow-7.0.0/apache-arrow-7.0.0.tar.gz"
ARROW_NAME="apache-arrow-7.0.0.tar.gz"
ARROW_SOURCE="apache-arrow-7.0.0"
ARROW_MD5SUM="316ade159901646849b3b4760fa52816"
ARROW_DOWNLOAD="https://github.com/apache/arrow/archive/refs/tags/apache-arrow-13.0.0.tar.gz"
ARROW_NAME="apache-arrow-13.0.0.tar.gz"
ARROW_SOURCE="arrow-apache-arrow-13.0.0"
ARROW_MD5SUM="8ec1ec6a119514bcaea1cf7aabc9df1f"

# Abseil
ABSEIL_DOWNLOAD="https://github.com/abseil/abseil-cpp/archive/refs/tags/20220623.1.tar.gz"
Expand Down Expand Up @@ -287,10 +300,10 @@ ORC_SOURCE=orc-1.7.2
ORC_MD5SUM="6cab37935eacdec7d078d327746a8578"

# jemalloc for arrow
JEMALLOC_ARROW_DOWNLOAD="https://github.com/jemalloc/jemalloc/releases/download/5.2.1/jemalloc-5.2.1.tar.bz2"
JEMALLOC_ARROW_NAME="jemalloc-5.2.1.tar.bz2"
JEMALLOC_ARROW_SOURCE="jemalloc-5.2.1"
JEMALLOC_ARROW_MD5SUM="3d41fbf006e6ebffd489bdb304d009ae"
JEMALLOC_ARROW_DOWNLOAD="https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2"
JEMALLOC_ARROW_NAME="jemalloc-5.3.0.tar.bz2"
JEMALLOC_ARROW_SOURCE="jemalloc-5.3.0"
JEMALLOC_ARROW_MD5SUM="09a8328574dab22a7df848eae6dbbf53"

# jemalloc for doris
JEMALLOC_DORIS_DOWNLOAD="https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2"
Expand Down Expand Up @@ -399,10 +412,10 @@ BENCHMARK_MD5SUM="8ddf8571d3f6198d37852bcbd964f817"

# xsimd
# for arrow-13.0.0; if arrow is upgraded, this version may also need to be changed
XSIMD_DOWNLOAD="https://github.com/xtensor-stack/xsimd/archive/aeec9c872c8b475dedd7781336710f2dd2666cb2.tar.gz"
XSIMD_NAME=xsimd-aeec9c872c8b475dedd7781336710f2dd2666cb2.tar.gz
XSIMD_SOURCE=xsimd-aeec9c872c8b475dedd7781336710f2dd2666cb2
XSIMD_MD5SUM="d024855f71c0a2837a6918c0f8f66245"
XSIMD_DOWNLOAD="https://github.com/xtensor-stack/xsimd/archive/refs/tags/9.0.1.tar.gz"
XSIMD_NAME="xsimd-9.0.1.tar.gz"
XSIMD_SOURCE=xsimd-9.0.1
XSIMD_MD5SUM="59f38fe3364acd7ed137771258812d6c"

# simdjson
SIMDJSON_DOWNLOAD="https://github.com/simdjson/simdjson/archive/refs/tags/v3.0.1.tar.gz"
Expand Down Expand Up @@ -505,6 +518,8 @@ export TP_ARCHIVES=(
'CYRUS_SASL'
'LIBRDKAFKA'
'FLATBUFFERS'
'CARES'
'GRPC'
'ARROW'
'BROTLI'
'ZSTD'
Expand Down

0 comments on commit 466bddc

Please sign in to comment.