-
Notifications
You must be signed in to change notification settings - Fork 3.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
227 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc | ||
index 2466e7433..46b4402d4 100644 | ||
--- a/cpp/src/arrow/adapters/orc/adapter.cc | ||
+++ b/cpp/src/arrow/adapters/orc/adapter.cc | ||
@@ -47,9 +47,6 @@ | ||
#include "arrow/util/visibility.h" | ||
#include "orc/Exceptions.hh" | ||
|
||
-// alias to not interfere with nested orc namespace | ||
-namespace liborc = orc; | ||
- | ||
#define ORC_THROW_NOT_OK(s) \ | ||
do { \ | ||
Status _s = (s); \ | ||
@@ -202,6 +199,8 @@ class ORCFileReader::Impl { | ||
return Init(); | ||
} | ||
|
||
+ virtual liborc::Reader* GetRawORCReader() { return reader_.get(); } | ||
+ | ||
Status Init() { | ||
int64_t nstripes = reader_->getNumberOfStripes(); | ||
stripes_.resize(nstripes); | ||
@@ -479,6 +478,31 @@ class ORCFileReader::Impl { | ||
return Status::OK(); | ||
} | ||
|
||
+ Result<std::shared_ptr<RecordBatchReader>> NextStripeReader( | ||
+ int64_t batch_size, const std::vector<std::string>& include_names) { | ||
+ if (current_row_ >= NumberOfRows()) { | ||
+ return nullptr; | ||
+ } | ||
+ | ||
+ liborc::RowReaderOptions opts = default_row_reader_options(); | ||
+ if (!include_names.empty()) { | ||
+ RETURN_NOT_OK(SelectNames(&opts, include_names)); | ||
+ } | ||
+ StripeInformation stripe_info({0, 0, 0, 0}); | ||
+ RETURN_NOT_OK(SelectStripeWithRowNumber(&opts, current_row_, &stripe_info)); | ||
+ ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); | ||
+ std::unique_ptr<liborc::RowReader> row_reader; | ||
+ | ||
+ ORC_BEGIN_CATCH_NOT_OK | ||
+ row_reader = reader_->createRowReader(opts); | ||
+ row_reader->seekToRow(current_row_); | ||
+ current_row_ = stripe_info.first_row_id + stripe_info.num_rows; | ||
+ ORC_END_CATCH_NOT_OK | ||
+ | ||
+ return std::make_shared<OrcStripeReader>(std::move(row_reader), schema, batch_size, | ||
+ pool_); | ||
+ } | ||
+ | ||
Result<std::shared_ptr<RecordBatchReader>> NextStripeReader( | ||
int64_t batch_size, const std::vector<int>& include_indices) { | ||
if (current_row_ >= NumberOfRows()) { | ||
@@ -544,6 +568,8 @@ Result<std::unique_ptr<ORCFileReader>> ORCFileReader::Open( | ||
return std::move(result); | ||
} | ||
|
||
+liborc::Reader* ORCFileReader::GetRawORCReader() { return impl_->GetRawORCReader(); } | ||
+ | ||
Result<std::shared_ptr<const KeyValueMetadata>> ORCFileReader::ReadMetadata() { | ||
return impl_->ReadMetadata(); | ||
} | ||
@@ -605,6 +631,11 @@ Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader( | ||
return impl_->NextStripeReader(batch_size, include_indices); | ||
} | ||
|
||
+Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader( | ||
+ int64_t batch_size, const std::vector<std::string>& include_names) { | ||
+ return impl_->NextStripeReader(batch_size, include_names); | ||
+} | ||
+ | ||
int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); } | ||
|
||
int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); } | ||
diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h | ||
index 013be7860..7fd06bcb8 100644 | ||
--- a/cpp/src/arrow/adapters/orc/adapter.h | ||
+++ b/cpp/src/arrow/adapters/orc/adapter.h | ||
@@ -30,6 +30,10 @@ | ||
#include "arrow/type_fwd.h" | ||
#include "arrow/util/macros.h" | ||
#include "arrow/util/visibility.h" | ||
+#include "orc/Reader.hh" | ||
+ | ||
+// alias to not interfere with nested orc namespace | ||
+namespace liborc = orc; | ||
|
||
namespace arrow { | ||
namespace adapters { | ||
@@ -53,6 +57,9 @@ class ARROW_EXPORT ORCFileReader { | ||
public: | ||
~ORCFileReader(); | ||
|
||
+ /// \brief Get ORC reader from inside. | ||
+ liborc::Reader* GetRawORCReader(); | ||
+ | ||
/// \brief Creates a new ORC reader | ||
/// | ||
/// \param[in] file the data source | ||
@@ -174,6 +181,19 @@ class ARROW_EXPORT ORCFileReader { | ||
Result<std::shared_ptr<RecordBatchReader>> GetRecordBatchReader( | ||
int64_t batch_size, const std::vector<std::string>& include_names); | ||
|
||
+ /// \brief Get a stripe level record batch iterator with specified row count | ||
+ /// in each record batch. NextStripeReader serves as a fine grain | ||
+ /// alternative to ReadStripe which may cause OOM issue by loading | ||
+ /// the whole stripes into memory. | ||
+ /// | ||
+ /// \param[in] batch_size Get a stripe level record batch iterator with specified row | ||
+ /// count in each record batch. | ||
+ /// | ||
+ /// \param[in] include_names the selected field names to read | ||
+ /// \return the returned stripe reader | ||
+ Result<std::shared_ptr<RecordBatchReader>> NextStripeReader( | ||
+ int64_t batch_size, const std::vector<std::string>& include_names); | ||
+ | ||
/// \brief The number of stripes in the file | ||
int64_t NumberOfStripes(); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters