From c477fae7b082c121e8e929eed0039a06afe16f86 Mon Sep 17 00:00:00 2001
From: "langshi.cls"
Date: Thu, 18 May 2023 11:25:40 +0800
Subject: [PATCH] [DATA] Implement zero-copy string dtype and accelerate shuffle.

1. Implement a zero-copy approach to reading string data from Arrow into TF.
2. Accelerate shuffling of string-typed columns in ParquetDataset.

Preliminary benchmark results (300 columns, `batch_size`=1000, on an
`Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz` with 128 logical cores):

| Dataset | list type | shuffling | throughput (samples/s) | speedup over TFRecord |
| --- | --- | --- | --- | --- |
| TFRecord | N | N | 1404.23 | 1.0 |
| HbParquet | N | N | 41137.53 | 29.3 |
| HbParquet-ZeroCopy | N | N | 51335.40 | 36.56 |
| TFRecord | N | Y | 1343.10 | 1.0 |
| HbParquet | N | Y | 6629.60 | 4.9 |
| HbParquet-ZeroCopy | N | Y | 10941.25 | 8.1 |
| TFRecord | Y | N | 1352.05 | 1.0 |
| HbParquet | Y | N | 2307.33 | 1.71 |
| HbParquet-ZeroCopy | Y | N | 2869.98 | 2.12 |
| TFRecord | Y | Y | 1367.96 | 1.0 |
| HbParquet | Y | Y | 1080.03 | 0.79 |
| HbParquet-ZeroCopy | Y | Y | 1454.02 | 1.06 |

Signed-off-by: langshi.cls
---
 ...e.developer-tf1.15-py3.6-cu100-ubuntu18.04 |   2 +-
 ...e.developer-tf1.15-py3.8-cu121-ubuntu20.04 |   4 +-
 .../benchmarks/data_benchmark_parquet.py      |  45 +-
 .../benchmarks/data_benchmark_tfrecord.py     |  97 +++-
 hybridbackend/tensorflow/common/arrow.cc      | 102 ++++-
 hybridbackend/tensorflow/common/arrow.h       |  27 ++
 .../tensorflow/data/rebatch/buffer.h          |  58 ++-
 .../tensorflow/data/rebatch/rebatch_buffer.cc | 431 ++++++++++++++----
 .../data/rebatch/rebatch_dataset_v2.cc        |   8 +-
 .../data/tests/parquet_dataset_string_test.py |  32 +-
 10 files changed, 639 insertions(+), 167 deletions(-)

diff --git a/build/dockerfiles/Dockerfile.developer-tf1.15-py3.6-cu100-ubuntu18.04 b/build/dockerfiles/Dockerfile.developer-tf1.15-py3.6-cu100-ubuntu18.04
index 574ff387..cf40db33 100644
--- a/build/dockerfiles/Dockerfile.developer-tf1.15-py3.6-cu100-ubuntu18.04
+++ b/build/dockerfiles/Dockerfile.developer-tf1.15-py3.6-cu100-ubuntu18.04
@@ -169,7 +169,7 @@ ENV HYBRIDBACKEND_WITH_CUDA=ON \
     HYBRIDBACKEND_WITH_NCCL=OFF \
     HYBRIDBACKEND_WITH_ARROW_ZEROCOPY=ON \
     HYBRIDBACKEND_WITH_TENSORFLOW_HALF=OFF \
-    HYBRIDBACKEND_WITH_TENSORFLOW_DISTRO=1015 \
+    HYBRIDBACKEND_WITH_TENSORFLOW_DISTRO=77661015 \
     HYBRIDBACKEND_USE_CXX11_ABI=0 \
     HYBRIDBACKEND_WHEEL_ALIAS=-tf115-cu100 \
     HYBRIDBACKEND_WHEEL_REQUIRES="tensorflow_gpu>=1.15,<2.0"
diff --git a/build/dockerfiles/Dockerfile.developer-tf1.15-py3.8-cu121-ubuntu20.04 b/build/dockerfiles/Dockerfile.developer-tf1.15-py3.8-cu121-ubuntu20.04
index bec3b082..e1c60974 100644
--- a/build/dockerfiles/Dockerfile.developer-tf1.15-py3.8-cu121-ubuntu20.04
+++ b/build/dockerfiles/Dockerfile.developer-tf1.15-py3.8-cu121-ubuntu20.04
@@ -121,9 +121,9 @@ COPY --from=devel_tools /opt/tools /usr/local
 ENV HYBRIDBACKEND_WITH_CUDA=ON \
     HYBRIDBACKEND_WITH_NCCL=ON \
     HYBRIDBACKEND_WITH_ARROW_ZEROCOPY=ON \
-    HYBRIDBACKEND_WITH_TENSORFLOW_DISTRO=1015 \
+    HYBRIDBACKEND_WITH_TENSORFLOW_DISTRO=77661015 \
     HYBRIDBACKEND_USE_CXX11_ABI=0 \
     HYBRIDBACKEND_USE_RUFF=1 \
     HYBRIDBACKEND_WHEEL_ALIAS=-tf115-cu121 \
     TENSORFLOW_INCLUDE=/opt/tensorflow/tensorflow-source \
-    LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib:/usr/local/cuda/lib64
\ No newline at end of file
+    LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib:/usr/local/cuda/lib64
diff --git a/hybridbackend/tensorflow/benchmarks/data_benchmark_parquet.py b/hybridbackend/tensorflow/benchmarks/data_benchmark_parquet.py
index 9c914fb9..aa27d0f3 100644
---
a/hybridbackend/tensorflow/benchmarks/data_benchmark_parquet.py +++ b/hybridbackend/tensorflow/benchmarks/data_benchmark_parquet.py @@ -40,12 +40,31 @@ def benchmark(params): tf.logging.info('Started generating mock file ...') workspace = tempfile.mkdtemp() params.filenames = [os.path.join(workspace, 'benchmark.parquet')] - df = pd.DataFrame( - np.random.randint( - 0, 100, - size=(params.batch_size * 100, len(params.fields)), - dtype=np.int64), - columns=params.fields) + if params.use_string_data: + df = pd.DataFrame( + np.array([ + [ + *[ + np.array(list(map(str, np.random.randint( + 0, 9, + size=(np.random.randint(10, 30),), + dtype=np.int64)))) + for _ in xrange(len(params.fields))]] + for _ in xrange(params.batch_size * 100)], dtype=object), + columns=params.fields) + elif params.use_fixed_len_string_data: + df = pd.DataFrame( + np.array([ + ['abcdefghijklmnoprstu' for _ in xrange(len(params.fields))] + for _ in xrange(params.batch_size * 100)], dtype=np.str), + columns=params.fields) + else: + df = pd.DataFrame( + np.random.randint( + 0, 100, + size=(params.batch_size * 100, len(params.fields)), + dtype=np.int64), + columns=params.fields) df.to_parquet(params.filenames[0]) tf.logging.info(f'Mock file {params.filenames[0]} generated.') with tf.Graph().as_default(): @@ -66,7 +85,14 @@ def benchmark(params): ds = ds.batch(params.batch_size, drop_remainder=True) batch = tf.data.make_one_shot_iterator(ds).get_next() train_op = tf.group(list(batch.values()) + [step.assign_add(1)]) - with tf.train.MonitoredTrainingSession('') as sess: + chief_only_hooks = [] + if params.profile_every_n_iter is not None: + chief_only_hooks.append( + tf.train.ProfilerHook( + save_steps=params.profile_every_n_iter, + output_dir=params.output_dir)) + with tf.train.MonitoredTrainingSession( + '', chief_only_hooks=chief_only_hooks) as sess: count = 0 prev_ts = time.time() try: @@ -100,8 +126,13 @@ def benchmark(params): parser = argparse.ArgumentParser() parser.add_argument('--baseline', default=False, action='store_true') parser.add_argument('--shuffle', default=False, action='store_true') + parser.add_argument('--use-string-data', default=False, action='store_true') + parser.add_argument( + '--use-fixed-len-string-data', default=False, action='store_true') parser.add_argument('--batch-size', type=int, default=64000) parser.add_argument('--num-steps', type=int, default=None) + parser.add_argument('--output-dir', default='./outputs') + parser.add_argument('--profile-every-n-iter', type=int, default=None) parser.add_argument( '--fields', nargs='+', default=[f'f{c}' for c in xrange(200)]) parser.add_argument('filenames', nargs='*') diff --git a/hybridbackend/tensorflow/benchmarks/data_benchmark_tfrecord.py b/hybridbackend/tensorflow/benchmarks/data_benchmark_tfrecord.py index 4c75e4f9..33305708 100644 --- a/hybridbackend/tensorflow/benchmarks/data_benchmark_tfrecord.py +++ b/hybridbackend/tensorflow/benchmarks/data_benchmark_tfrecord.py @@ -38,19 +38,46 @@ def benchmark(params): tf.logging.info('Started generating mock file ...') workspace = tempfile.mkdtemp() params.filenames = [os.path.join(workspace, 'benchmark.tfrecord')] - df = pd.DataFrame( - np.random.randint( - 0, 100, - size=(params.batch_size * 100, len(params.fields)), - dtype=np.int64), - columns=params.fields) + if params.use_string_data: + df = pd.DataFrame( + np.array([ + [ + *[ + np.array(list(map(str, np.random.randint( + 0, 9, + size=(np.random.randint(10, 30),), + dtype=np.int64)))) + for _ in xrange(len(params.fields))]] + for _ in 
xrange(params.batch_size * 100)], dtype=object), + columns=params.fields) + elif params.use_fixed_len_string_data: + df = pd.DataFrame( + np.array([ + ['abcdefghijklmnoprstu' for _ in xrange(len(params.fields))] + for _ in xrange(params.batch_size * 100)], dtype=np.str), + columns=params.fields) + else: + df = pd.DataFrame( + np.random.randint( + 0, 100, + size=(params.batch_size * 100, len(params.fields)), + dtype=np.int64), + columns=params.fields) writer = tf.python_io.TFRecordWriter(params.filenames[0]) - for row in tq(range(params.samples)): - feats = tf.train.Features( - feature={ - f: tf.train.Feature( - int64_list=tf.train.Int64List(value=[df[f][row]])) - for f in params.fields}) + for row in tq(range(params.batch_size * 100)): + if params.use_string_data or params.use_fixed_len_string_data: + feats = tf.train.Features( + feature={ + f: tf.train.Feature( + bytes_list=tf.train.BytesList( + value=[bytes(val, 'utf-8') for val in df[f][row]])) + for f in params.fields}) + else: + feats = tf.train.Features( + feature={ + f: tf.train.Feature( + int64_list=tf.train.Int64List(value=[df[f][row]])) + for f in params.fields}) example = tf.train.Example(features=feats) writer.write(example.SerializeToString()) writer.close() @@ -58,19 +85,42 @@ def benchmark(params): with tf.Graph().as_default(): step = tf.train.get_or_create_global_step() ds = tf.data.TFRecordDataset(params.filenames) + if params.shuffle: + ds = ds.shuffle(params.batch_size * 10) ds = ds.batch(params.batch_size, drop_remainder=True) - ds = ds.map( - lambda line: tf.parse_example( - line, {f: tf.FixedLenFeature([1], tf.int64) for f in params.fields})) + if params.use_string_data or params.use_fixed_len_string_data: + ds = ds.map( + lambda line: tf.parse_example( + line, {f: tf.VarLenFeature(tf.string) for f in params.fields})) + else: + ds = ds.map( + lambda line: tf.parse_example( + line, {f: tf.FixedLenFeature([1], tf.int64) for f in params.fields})) batch = tf.data.make_one_shot_iterator(ds).get_next() - train_op = tf.group(batch + [step.assign_add(1)]) - with tf.train.MonitoredTrainingSession('') as sess: + train_op = tf.group(list(batch.values()) + [step.assign_add(1)]) + chief_only_hooks = [] + if params.profile_every_n_iter is not None: + chief_only_hooks.append( + tf.train.ProfilerHook( + save_steps=params.profile_every_n_iter, + output_dir=params.output_dir)) + with tf.train.MonitoredTrainingSession( + '', chief_only_hooks=chief_only_hooks) as sess: count = 0 prev_ts = time.time() try: - while not sess.should_stop(): - sess.run(train_op) - count += 1 + with tq() as pbar: + should_stop = False + while not sess.should_stop() and not should_stop: + prev_sess_run = time.time() + sess.run(train_op) + sess_run_duration = time.time() - prev_sess_run + pbar.set_description( + f'{params.batch_size / sess_run_duration:6.2f} samples/sec') + pbar.update(1) + count += 1 + if params.num_steps is not None: + should_stop = count >= params.num_steps except tf.errors.OutOfRangeError: pass duration = time.time() - prev_ts @@ -87,7 +137,14 @@ def benchmark(params): os.environ['CUDA_VISIBLE_DEVICES'] = '' tf.logging.set_verbosity(tf.logging.INFO) parser = argparse.ArgumentParser() + parser.add_argument('--shuffle', default=False, action='store_true') + parser.add_argument('--use-string-data', default=False, action='store_true') + parser.add_argument( + '--use-fixed-len-string-data', default=False, action='store_true') parser.add_argument('--batch-size', type=int, default=64000) + parser.add_argument('--num-steps', type=int, default=None) 
+ parser.add_argument('--output-dir', default='./outputs') + parser.add_argument('--profile-every-n-iter', type=int, default=None) parser.add_argument( '--fields', nargs='+', default=[f'f{c}' for c in xrange(200)]) parser.add_argument('filenames', nargs='*') diff --git a/hybridbackend/tensorflow/common/arrow.cc b/hybridbackend/tensorflow/common/arrow.cc index 27591121..0fcdb6e1 100644 --- a/hybridbackend/tensorflow/common/arrow.cc +++ b/hybridbackend/tensorflow/common/arrow.cc @@ -24,6 +24,7 @@ limitations under the License. #include #include +#include "hybridbackend/common/env.h" #include "hybridbackend/tensorflow/common/arrow.h" #include "hybridbackend/tensorflow/common/eigen.h" #endif @@ -31,8 +32,72 @@ limitations under the License. namespace tensorflow { namespace hybridbackend { +namespace { +inline bool ZeroCopyStringForRebatchDisabled() { + static const bool kZeroCopyStringForRebatchDisabled = + ::hybridbackend::EnvVarGetBool("HB_ZERO_COPY_STRING_FOR_REBATCH_DISABLED", + false); + return kZeroCopyStringForRebatchDisabled; +} +} // namespace + #if HYBRIDBACKEND_ARROW +#if HYBRIDBACKEND_ARROW_ZEROCOPY +#if (TF_MAJOR_VERSION * 1000L + TF_MINOR_VERSION) < 1014L +ArrowStringTensorBuffer::ArrowStringTensorBuffer( + const std::shared_ptr& value_data_buf, + const std::shared_ptr& value_offsets_buf, + const uint8_t* raw_data, const int32_t* raw_value_offsets) + : value_data_buf_(value_data_buf), + value_offsets_buf_(value_offsets_buf), + raw_data_(raw_data), + raw_value_offsets_(raw_value_offsets) {} + +void ArrowStringTensorBuffer::data() const { return this; } + +#else +ArrowStringTensorBuffer::ArrowStringTensorBuffer( + const std::shared_ptr& value_data_buf, + const std::shared_ptr& value_offsets_buf, + const uint8_t* raw_data, const int32_t* raw_value_offsets) + : TensorBuffer(this), + value_data_buf_(value_data_buf), + value_offsets_buf_(value_offsets_buf), + raw_data_(raw_data), + raw_value_offsets_(raw_value_offsets) {} +#endif + +size_t ArrowStringTensorBuffer::size() const { + LOG(ERROR) << "When using zero copy string for rebatch, please and a " + "hb.data.rebatch(batch_size) following hb.data.ParquetDataset "; + return 0; +} + +TensorBuffer* ArrowStringTensorBuffer::root_buffer() { return this; } + +void ArrowStringTensorBuffer::FillAllocationDescription( + AllocationDescription* proto) const { + proto->set_requested_bytes(sizeof(tstring)); + proto->set_allocator_name("ZerocopyArrowStringTensorBuffer"); +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + // NOTE: vanilla tensorflow from community has no `data()` method of + // class `Tensor`, thus we have to leverage the FillAllocationDescription + // API to obtain the underlying ArrowStringTensorBuffer. 
+ proto->set_ptr(reinterpret_cast(this)); +#endif +} + +bool ArrowStringTensorBuffer::OwnsMemory() const { return false; } + +const uint8_t* ArrowStringTensorBuffer::GetValue(int64_t i, + int32_t* out_length) { + const int32_t pos = raw_value_offsets_[i]; + *out_length = raw_value_offsets_[i + 1] - pos; + return raw_data_ + pos; +} +#endif + namespace { #if HYBRIDBACKEND_ARROW_ZEROCOPY class ArrowPrimitiveTensorBuffer : public TensorBuffer { @@ -143,15 +208,34 @@ ::arrow::Status MakeStringTensorFromArrowArray( &actual_shape))) { return ::arrow::Status::Invalid("Field shape is not fully defined"); } - - *tensor = Tensor(DT_STRING, actual_shape); - auto tensor_vec = tensor->vec(); - - for (auto i = 0; i < total_num_elems; ++i) { - int string_size; - auto string_data = array.GetValue(i, &string_size); - tensor_vec(i).assign(reinterpret_cast(string_data), - string_size); + if (ZeroCopyStringForRebatchDisabled()) { + *tensor = Tensor(DT_STRING, actual_shape); + auto tensor_vec = tensor->vec(); + + for (auto i = 0; i < total_num_elems; ++i) { + int string_size; + auto string_data = array.GetValue(i, &string_size); + tensor_vec(i).assign(reinterpret_cast(string_data), + string_size); + } + } else { +#if HYBRIDBACKEND_ARROW_ZEROCOPY + ArrowStringTensorBuffer* tensor_buffer = new ArrowStringTensorBuffer( + array.value_data(), array.value_offsets(), array.raw_data(), + array.raw_value_offsets()); + core::ScopedUnref unref(tensor_buffer); + *tensor = Tensor(DT_STRING, actual_shape, tensor_buffer); +#else + *tensor = Tensor(DT_STRING, actual_shape); + auto tensor_vec = tensor->vec(); + + for (auto i = 0; i < total_num_elems; ++i) { + int string_size; + auto string_data = array.GetValue(i, &string_size); + tensor_vec(i).assign(reinterpret_cast(string_data), + string_size); + } +#endif } return ::arrow::Status::OK(); } diff --git a/hybridbackend/tensorflow/common/arrow.h b/hybridbackend/tensorflow/common/arrow.h index b946b0c9..55765449 100644 --- a/hybridbackend/tensorflow/common/arrow.h +++ b/hybridbackend/tensorflow/common/arrow.h @@ -19,6 +19,7 @@ limitations under the License. #include #if HYBRIDBACKEND_ARROW +#include #include #include #include @@ -34,6 +35,7 @@ limitations under the License. #include #include +#include #define TF_RETURN_IF_ARROW_ERROR(...) 
\ do { \ @@ -89,6 +91,31 @@ MATCH_TYPE_AND_ARROW_ENUM(float, ::arrow::Type::FLOAT); MATCH_TYPE_AND_ARROW_ENUM(double, ::arrow::Type::DOUBLE); MATCH_TYPE_AND_ARROW_ENUM(string, ::arrow::Type::STRING); +#if HYBRIDBACKEND_ARROW_ZEROCOPY +class ArrowStringTensorBuffer : public TensorBuffer { + public: + ArrowStringTensorBuffer() = delete; + explicit ArrowStringTensorBuffer( + const std::shared_ptr& value_data_buf, + const std::shared_ptr& value_offsets_buf, + const uint8_t* raw_data, const int32_t* raw_value_offsets); +#if (TF_MAJOR_VERSION * 1000L + TF_MINOR_VERSION) < 1014L + void* data() const override; +#endif + const uint8_t* GetValue(int64_t i, int32_t* out_length); + size_t size() const override; + TensorBuffer* root_buffer() override; + void FillAllocationDescription(AllocationDescription* proto) const override; + bool OwnsMemory() const override; + + private: + std::shared_ptr<::arrow::Buffer> value_data_buf_; + std::shared_ptr<::arrow::Buffer> value_offsets_buf_; + const uint8_t* raw_data_; + const int32_t* raw_value_offsets_; +}; +#endif + Status MakeDataTypeAndRaggedRankFromArrowDataType( const std::shared_ptr<::arrow::DataType>& arrow_dtype, DataType* dtype, int32* ragged_rank); diff --git a/hybridbackend/tensorflow/data/rebatch/buffer.h b/hybridbackend/tensorflow/data/rebatch/buffer.h index 4efbb725..b52f177c 100644 --- a/hybridbackend/tensorflow/data/rebatch/buffer.h +++ b/hybridbackend/tensorflow/data/rebatch/buffer.h @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include #include #include #include @@ -29,10 +30,32 @@ namespace hybridbackend { struct RebatchBufferItem { public: - RebatchBufferItem(int64 batch_size, const std::vector& components) - : batch_size(batch_size), components(components) {} + RebatchBufferItem(int64 batch_size, const std::vector& start, + const std::vector& limit, +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + const std::vector& components, + const std::vector& zerocopied_string_buf_addr) +#else + const std::vector& components) +#endif + : batch_size(batch_size), + start(start), + limit(limit), +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + components(components), + zerocopied_string_buf_addr(zerocopied_string_buf_addr) { + } +#else + components(components) { + } +#endif int64 batch_size; + std::vector start; + std::vector limit; std::vector components; +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + std::vector zerocopied_string_buf_addr; +#endif }; class RebatchBuffer { @@ -54,23 +77,40 @@ class RebatchBuffer { Status Take(Allocator* alloc, std::vector* output_tensors, const int64 num_rows); + Status FastPath(Allocator* alloc, const std::vector& input_tensors, + std::vector* output_tensors); + + Status CheckZeroCopiedString(const std::vector& input_tensors); + private: Status TakeDense(Allocator* alloc, std::vector* output_tensors, - std::vector* residual_tensors, const int64 num_rows, - const int64 remained_rows, const int64 rank, - const int64 col); + std::vector* residual_tensors, +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + std::vector* residual_zerocopied_string_buf_addr, +#endif + const int64 num_rows, const int64 remained_rows, + const int64 rank, const int64 col); Status TakeSparse(Allocator* alloc, std::vector* output_tensors, - std::vector* residual_tensors, const int64 num_rows, - const int64 remained_rows, const int64 rank, - const int64 col); + std::vector* residual_tensors, +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + std::vector* residual_zerocopied_string_buf_addr, +#endif + const int64 num_rows, const int64 
remained_rows, + const int64 rank, const int64 col); const DataTypeVector& output_dtypes_; const std::vector& output_shapes_; const std::vector field_ranks_; int64 size_; - std::deque items_; + std::vector> items_; + std::shared_ptr takers_; + std::vector field_cols_; + std::vector has_zerocopied_string_; +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + std::vector zerocopied_string_buf_addr_; +#endif }; } // namespace hybridbackend diff --git a/hybridbackend/tensorflow/data/rebatch/rebatch_buffer.cc b/hybridbackend/tensorflow/data/rebatch/rebatch_buffer.cc index 7f4698ce..b60cadee 100644 --- a/hybridbackend/tensorflow/data/rebatch/rebatch_buffer.cc +++ b/hybridbackend/tensorflow/data/rebatch/rebatch_buffer.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include +#include #include #include @@ -21,6 +22,7 @@ limitations under the License. #include #include "hybridbackend/common/env.h" +#include "hybridbackend/tensorflow/common/arrow.h" #include "hybridbackend/tensorflow/common/eigen.h" #include "hybridbackend/tensorflow/data/rebatch/buffer.h" @@ -133,15 +135,75 @@ RebatchBuffer::RebatchBuffer( : output_dtypes_(output_dtypes), output_shapes_(output_shapes), field_ranks_(field_ranks), - size_(0) {} + size_(0) { + const int kArrowNumThreads = + ::hybridbackend::EnvVarGetInt("ARROW_NUM_THREADS", 16); + takers_.reset(new thread::ThreadPool( + Env::Default(), ThreadOptions(), "rebatch_buffer_takers", + kArrowNumThreads, true /* low_latency_hint */)); + int32 col = 0; + for (int64 rank : field_ranks_) { + field_cols_.emplace_back(col); + col += (rank + 1); + } + + has_zerocopied_string_.reserve(output_dtypes_.size()); +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + zerocopied_string_buf_addr_.reserve(output_dtypes_.size()); +#endif +} + +Status RebatchBuffer::CheckZeroCopiedString( + const std::vector& input_tensors) { + has_zerocopied_string_.resize(input_tensors.size()); +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + zerocopied_string_buf_addr_.resize(input_tensors.size()); +#endif + auto work = [&](int64 begin, int64 end) { + for (int64 i = begin; i < end; ++i) { + if (input_tensors[i].dtype() != DT_STRING) { + has_zerocopied_string_[i] = false; + } else { + ::tensorflow::TensorDescription tensor_description; + input_tensors[i].FillDescription(&tensor_description); + if (tensor_description.has_allocation_description() && + tensor_description.allocation_description().allocator_name() == + "ZerocopyArrowStringTensorBuffer") { + has_zerocopied_string_[i] = true; +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + zerocopied_string_buf_addr_[i] = + tensor_description.allocation_description().ptr(); +#endif + } else { + has_zerocopied_string_[i] = false; + } + } + } + }; + const int64 cost_per_unit = 20; + takers_->ParallelFor(input_tensors.size(), cost_per_unit, work); + return Status::OK(); +} Status RebatchBuffer::Put(const std::vector& input_tensors, const int64 num_rows) { if (TF_PREDICT_FALSE(num_rows == 0)) { return Status::OK(); } - - items_.emplace_back(num_rows, input_tensors); + std::vector start(output_dtypes_.size(), 0); + std::vector limit(output_dtypes_.size(), num_rows); + for (int i = 0; i < input_tensors.size(); ++i) { + if (has_zerocopied_string_[i]) { + limit[i] = input_tensors[i].NumElements(); + } + } + items_.emplace_back(absl::make_unique( +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + num_rows, start, limit, std::move(input_tensors), + zerocopied_string_buf_addr_)); +#else + num_rows, start, limit, 
std::move(input_tensors))); +#endif size_ += num_rows; return Status::OK(); } @@ -153,32 +215,56 @@ Status RebatchBuffer::PutSlice(const std::vector& input_tensors, } std::vector sliced_input_tensors(output_dtypes_.size()); - int64 col = 0; - for (int64 rank : field_ranks_) { - int64 start = row_start; - int64 limit = row_limit; - if (rank != 0) { - for (size_t split_idx = 1; split_idx < rank + 1; ++split_idx) { - auto split_slice = - input_tensors[col + split_idx].Slice(start, (limit + 1)); - const int64 slice_limit = (limit - start); - start = split_slice.unaligned_flat()(0); - limit = split_slice.unaligned_flat()(slice_limit); - sliced_input_tensors[col + split_idx] = std::move(split_slice); + std::vector start_pos(output_dtypes_.size()); + std::vector limit_pos(output_dtypes_.size()); + + auto work = [&](int64 begin, int64 end) { + for (int64 i = begin; i < end; ++i) { + int rank = field_ranks_[i]; + int col = field_cols_[i]; + int64 start = row_start; + int64 limit = row_limit; + if (rank != 0) { + for (size_t split_idx = 1; split_idx < rank + 1; ++split_idx) { + auto split_slice = + input_tensors[col + split_idx].Slice(start, (limit + 1)); + const int64 slice_limit = (limit - start); + start = split_slice.unaligned_flat()(0); + limit = split_slice.unaligned_flat()(slice_limit); + sliced_input_tensors[col + split_idx] = std::move(split_slice); + start_pos[col + split_idx] = start; + limit_pos[col + split_idx] = limit; + } + } + const auto input_shape = input_tensors[col].shape(); + int64 input_base_elems = 1; + if (TF_PREDICT_FALSE(input_shape.dims() > 1)) { + input_base_elems = input_shape.num_elements() / input_shape.dim_size(0); + } + if (!has_zerocopied_string_[col]) { + sliced_input_tensors[col] = input_tensors[col].Slice( + start / input_base_elems, limit / input_base_elems); + start_pos[col] = start / input_base_elems; + limit_pos[col] = limit / input_base_elems; + } else { + sliced_input_tensors[col] = input_tensors[col]; + start_pos[col] = start; + limit_pos[col] = limit; } } - const auto input_shape = input_tensors[col].shape(); - int64 input_base_elems = 1; - if (TF_PREDICT_FALSE(input_shape.dims() > 1)) { - input_base_elems = input_shape.num_elements() / input_shape.dim_size(0); - } - sliced_input_tensors[col] = input_tensors[col].Slice( - start / input_base_elems, limit / input_base_elems); - col += (rank + 1); - } + return Status::OK(); + }; + const int64 cost_per_unit = 20; + takers_->ParallelFor(field_ranks_.size(), cost_per_unit, work); const int64 num_rows = row_limit - row_start; - items_.emplace_back(num_rows, std::move(sliced_input_tensors)); + items_.emplace_back(absl::make_unique( +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + num_rows, start_pos, limit_pos, std::move(sliced_input_tensors), + zerocopied_string_buf_addr_)); +#else + num_rows, start_pos, limit_pos, std::move(sliced_input_tensors))); +#endif size_ += num_rows; return Status::OK(); } @@ -212,11 +298,11 @@ Status RebatchBuffer::Take(Allocator* alloc, int64 remained_rows = 0; int64 num_dirty_rows = 0; for (int64 row = 0; num_dirty_rows < items_.size(); ++num_dirty_rows) { - if (row + items_[num_dirty_rows].batch_size > num_rows) { + if (row + items_[num_dirty_rows]->batch_size > num_rows) { remained_rows = (num_rows - row); break; } - row += items_[num_dirty_rows].batch_size; + row += items_[num_dirty_rows]->batch_size; } const size_t num_components = output_dtypes_.size(); @@ -224,26 +310,46 @@ Status RebatchBuffer::Take(Allocator* alloc, output_tensors->resize(num_components); std::vector 
residual_tensors; residual_tensors.resize(num_components); - int64 col = 0; - for (int64 rank : field_ranks_) { - if (rank == 0) { - TF_RETURN_IF_ERROR(TakeDense(alloc, output_tensors, &residual_tensors, - num_rows, remained_rows, rank, col)); - } else { - TF_RETURN_IF_ERROR(TakeSparse(alloc, output_tensors, &residual_tensors, - num_rows, remained_rows, rank, col)); +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + std::vector residual_zerocopied_string_buf_addr; + residual_zerocopied_string_buf_addr.resize(num_components); +#endif + + auto work = [&](int64 begin, int64 end) { + for (int64 i = begin; i < end; ++i) { + if (field_ranks_[i] == 0) { + TF_RETURN_IF_ERROR(TakeDense(alloc, output_tensors, &residual_tensors, +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + &residual_zerocopied_string_buf_addr, +#endif + num_rows, remained_rows, field_ranks_[i], + field_cols_[i])); + } else { + TF_RETURN_IF_ERROR(TakeSparse(alloc, output_tensors, &residual_tensors, +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + &residual_zerocopied_string_buf_addr, +#endif + num_rows, remained_rows, field_ranks_[i], + field_cols_[i])); + } } - col += (rank + 1); - } + return Status::OK(); + }; + const int64 cost_per_unit = 200; + takers_->ParallelFor(field_ranks_.size(), cost_per_unit, work); + items_.erase(items_.begin(), items_.begin() + num_dirty_rows); - for (int64 idx = 0; idx < num_dirty_rows; ++idx) { - items_.pop_front(); - } if (remained_rows > 0) { - int64 residual_rows = items_.front().batch_size - remained_rows; - items_.pop_front(); + int64 residual_rows = items_.front()->batch_size - remained_rows; if (residual_rows > 0) { - items_.emplace_front(residual_rows, std::move(residual_tensors)); + items_.front()->components = residual_tensors; + items_.front()->batch_size = residual_rows; +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + items_.front()->zerocopied_string_buf_addr = + residual_zerocopied_string_buf_addr; +#endif + } else { + items_.erase(items_.begin()); } } @@ -251,11 +357,53 @@ Status RebatchBuffer::Take(Allocator* alloc, return Status::OK(); } -Status RebatchBuffer::TakeDense(Allocator* alloc, - std::vector* output_tensors, - std::vector* residual_tensors, - const int64 num_rows, const int64 remained_rows, - const int64 rank, const int64 col) { +Status RebatchBuffer::FastPath(Allocator* alloc, + const std::vector& input_tensors, + std::vector* output_tensors) { + auto work = [&](int64 begin, int64 end) { + for (int i = begin; i < end; ++i) { + if (!has_zerocopied_string_[i]) { + output_tensors->at(i) = input_tensors[i]; + } else { + output_tensors->at(i) = + Tensor(alloc, input_tensors[i].dtype(), input_tensors[i].shape()); + if (!output_tensors->at(i).IsInitialized()) { + return errors::ResourceExhausted( + "Failed to allocate memory for output component ", i); + } + auto output_tensor_ptr = output_tensors->at(i).vec(); +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + auto input_string_buf = reinterpret_cast( + zerocopied_string_buf_addr_[i]); +#else + auto input_string_buf = + reinterpret_cast(input_tensors[i].data()); +#endif + for (int j = 0; j < input_tensors[i].NumElements(); ++j) { + int string_size; + auto string_data = input_string_buf->GetValue(j, &string_size); + output_tensor_ptr(j).assign( + reinterpret_cast(string_data), string_size); + } + } + } + return Status::OK(); + }; + + const int64 cost_per_unit = 200; + takers_->ParallelFor(input_tensors.size(), cost_per_unit, work); + + return Status::OK(); +} + +Status RebatchBuffer::TakeDense( + Allocator* alloc, std::vector* 
output_tensors, + std::vector* residual_tensors, +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + std::vector* residual_zerocopied_string_buf_addr, +#endif + const int64 num_rows, const int64 remained_rows, const int64 rank, + const int64 col) { // Create output PartialTensorShape output_pshape(output_shapes_[col]); output_pshape.set_dim(0, num_rows); @@ -269,47 +417,98 @@ Status RebatchBuffer::TakeDense(Allocator* alloc, // Populate output for (int64 idx = 0, row = 0; idx < items_.size(); ++idx) { - auto& item = items_[idx].components[col]; - if (row + items_[idx].batch_size > num_rows) { + auto& item = items_[idx]->components[col]; + if (row + items_[idx]->batch_size > num_rows) { if (remained_rows == 0) { break; } - auto sliced_input = item.Slice(0, remained_rows); - TF_RETURN_IF_ERROR(CopyToSlicesFromTensor(&(output_tensors->at(col)), row, - std::move(sliced_input), true)); - (*residual_tensors)[col] = - item.Slice(remained_rows, items_[idx].batch_size); + + if (!has_zerocopied_string_[col]) { + auto sliced_input = item.Slice(0, remained_rows); + TF_RETURN_IF_ERROR(CopyToSlicesFromTensor( + &(output_tensors->at(col)), row, std::move(sliced_input), true)); + (*residual_tensors)[col] = + item.Slice(remained_rows, items_[idx]->batch_size); + items_[idx]->start[col] = remained_rows; + } else { + int64 start = items_[idx]->start[col]; + int64 limit = start + remained_rows; + auto output_tensor_ptr = output_tensors->at(col).vec(); +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + auto item_string_buf_ptr = reinterpret_cast( + items_[idx]->zerocopied_string_buf_addr[col]); +#else + auto item_string_buf_ptr = + reinterpret_cast(item.data()); +#endif + for (int i = start; i < limit; ++i) { + int64 output_idx = row + i - start; + int string_size; + auto string_data = item_string_buf_ptr->GetValue(i, &string_size); + output_tensor_ptr(output_idx) + .assign(reinterpret_cast(string_data), string_size); + } + (*residual_tensors)[col] = item; +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + (*residual_zerocopied_string_buf_addr)[col] = + items_[idx]->zerocopied_string_buf_addr[col]; +#endif + items_[idx]->start[col] = limit; + } break; } - TF_RETURN_IF_ERROR( - CopyToSlicesFromTensor(&(output_tensors->at(col)), row, item, true)); - row += items_[idx].batch_size; + if (!has_zerocopied_string_[col]) { + TF_RETURN_IF_ERROR( + CopyToSlicesFromTensor(&(output_tensors->at(col)), row, item, true)); + } else { + // populating string type of output tensors + int64 item_start_pos = items_[idx]->start[col]; + int64 item_limit_pos = items_[idx]->limit[col]; + auto output_tensor_ptr = output_tensors->at(col).vec(); +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + auto item_string_buf_ptr = reinterpret_cast( + items_[idx]->zerocopied_string_buf_addr[col]); +#else + auto item_string_buf_ptr = + reinterpret_cast(item.data()); +#endif + for (int i = item_start_pos; i < item_limit_pos; ++i) { + int64 output_idx = row + i - item_start_pos; + int string_size; + auto string_data = item_string_buf_ptr->GetValue(i, &string_size); + output_tensor_ptr(output_idx) + .assign(reinterpret_cast(string_data), string_size); + } + } + row += items_[idx]->batch_size; } return Status::OK(); } -Status RebatchBuffer::TakeSparse(Allocator* alloc, - std::vector* output_tensors, - std::vector* residual_tensors, - const int64 num_rows, - const int64 remained_rows, const int64 rank, - const int64 col) { +Status RebatchBuffer::TakeSparse( + Allocator* alloc, std::vector* output_tensors, + std::vector* residual_tensors, +#if 
HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + std::vector* residual_zerocopied_string_buf_addr, +#endif + const int64 num_rows, const int64 remained_rows, const int64 rank, + const int64 col) { // Create and populate output splits int64 remained_dim0_size = 1 + remained_rows; for (size_t split_idx = 1; split_idx < rank + 1; ++split_idx) { int64 next_remained_dim0_size = 0; int64 dim0_size = 0; for (int64 idx = 0, row = 0; idx < items_.size(); ++idx) { - if (row + items_[idx].batch_size > num_rows) { + if (row + items_[idx]->batch_size > num_rows) { next_remained_dim0_size = - items_[idx].components[col + split_idx].unaligned_flat()( + items_[idx]->components[col + split_idx].unaligned_flat()( remained_dim0_size - 1); dim0_size += (remained_dim0_size - 1); break; } - dim0_size += (items_[idx].components[col + split_idx].dim_size(0) - 1); - row += items_[idx].batch_size; + dim0_size += (items_[idx]->components[col + split_idx].dim_size(0) - 1); + row += items_[idx]->batch_size; } PartialTensorShape split_pshape(output_shapes_[col + split_idx]); @@ -326,11 +525,11 @@ Status RebatchBuffer::TakeSparse(Allocator* alloc, (*output_tensors)[col + split_idx].unaligned_flat()(0) = 0; int64 dim0_index = 0; for (int64 idx = 0, row = 0; idx < items_.size(); ++idx) { - auto& split = items_[idx].components[col + split_idx]; + auto& split = items_[idx]->components[col + split_idx]; int32 output_last = output_tensors->at(col + split_idx) .unaligned_flat()(dim0_index); int32 input_first = split.unaligned_flat()(0); - if (row + items_[idx].batch_size > num_rows) { + if (row + items_[idx]->batch_size > num_rows) { if (remained_rows == 0) { break; } @@ -360,7 +559,7 @@ Status RebatchBuffer::TakeSparse(Allocator* alloc, sliced_output_split.unaligned_flat().constant(output_last - input_first); dim0_index += sliced_input_split.dim_size(0); - row += items_[idx].batch_size; + row += items_[idx]->batch_size; } remained_dim0_size = next_remained_dim0_size; @@ -369,11 +568,17 @@ Status RebatchBuffer::TakeSparse(Allocator* alloc, // Create and populate ouput values int64 values_dim0_size = 0; for (int64 idx = 0, row = 0; idx < items_.size(); ++idx) { - if (row + items_[idx].batch_size > num_rows) { + int64 item_start_pos = items_[idx]->start[col]; + int64 item_limit_pos = items_[idx]->limit[col]; + if (row + items_[idx]->batch_size > num_rows) { break; } - values_dim0_size += items_[idx].components[col].dim_size(0); - row += items_[idx].batch_size; + if (!has_zerocopied_string_[col]) { + values_dim0_size += items_[idx]->components[col].dim_size(0); + } else { + values_dim0_size += (item_limit_pos - item_start_pos); + } + row += items_[idx]->batch_size; } PartialTensorShape base_pshape(output_shapes_[col]); base_pshape.set_dim(0, 1); @@ -392,8 +597,11 @@ Status RebatchBuffer::TakeSparse(Allocator* alloc, int64 dim0_index = 0; for (int64 idx = 0, row = 0; idx < items_.size(); ++idx) { - auto& values = items_[idx].components[col]; - if (row + items_[idx].batch_size > num_rows) { + auto& values = items_[idx]->components[col]; + int64 item_start_pos = items_[idx]->start[col]; + int64 item_limit_pos = items_[idx]->limit[col]; + + if (row + items_[idx]->batch_size > num_rows) { if (remained_rows == 0) { break; } @@ -403,20 +611,69 @@ Status RebatchBuffer::TakeSparse(Allocator* alloc, values_base_elems = values_shape.num_elements() / values_shape.dim_size(0); } - auto sliced_values = - values.Slice(0, remained_dim0_size / values_base_elems); - TF_RETURN_IF_ERROR( - CopyToSlicesFromTensor(&(output_tensors->at(col)), dim0_index, - 
std::move(sliced_values), true)); - (*residual_tensors)[col] = values.Slice( - remained_dim0_size / values_base_elems, values.dim_size(0)); + if (!has_zerocopied_string_[col]) { + auto sliced_values = + values.Slice(0, remained_dim0_size / values_base_elems); + TF_RETURN_IF_ERROR( + CopyToSlicesFromTensor(&(output_tensors->at(col)), dim0_index, + std::move(sliced_values), true)); + (*residual_tensors)[col] = values.Slice( + remained_dim0_size / values_base_elems, values.dim_size(0)); + } else { + int64 start = items_[idx]->start[col]; + int64 limit = start + remained_dim0_size; + auto output_tensor_ptr = output_tensors->at(col).vec(); +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + auto item_string_buf_ptr = reinterpret_cast( + items_[idx]->zerocopied_string_buf_addr[col]); +#else + auto item_string_buf_ptr = + reinterpret_cast(values.data()); +#endif + for (int i = start; i < limit; ++i) { + int64 output_idx = dim0_index + i - start; + int string_size; + auto string_data = item_string_buf_ptr->GetValue(i, &string_size); + output_tensor_ptr(output_idx) + .assign(reinterpret_cast(string_data), string_size); + } + (*residual_tensors)[col] = values; +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + (*residual_zerocopied_string_buf_addr)[col] = + items_[idx]->zerocopied_string_buf_addr[col]; +#endif + items_[idx]->start[col] = limit; + } break; } - TF_RETURN_IF_ERROR(CopyToSlicesFromTensor(&(output_tensors->at(col)), - dim0_index, values, true)); - dim0_index += values.dim_size(0); - row += items_[idx].batch_size; + if (!has_zerocopied_string_[col]) { + TF_RETURN_IF_ERROR(CopyToSlicesFromTensor(&(output_tensors->at(col)), + dim0_index, values, true)); + } else { + auto output_tensor_ptr = output_tensors->at(col).vec(); +#if HYBRIDBACKEND_TENSORFLOW_DISTRO == 1015 + auto item_string_buf_ptr = reinterpret_cast( + items_[idx]->zerocopied_string_buf_addr[col]); +#else + auto item_string_buf_ptr = + reinterpret_cast(values.data()); +#endif + for (int i = item_start_pos; i < item_limit_pos; ++i) { + int64 output_idx = dim0_index + i - item_start_pos; + int string_size; + auto string_data = item_string_buf_ptr->GetValue(i, &string_size); + output_tensor_ptr(output_idx) + .assign(reinterpret_cast(string_data), string_size); + } + } + + if (!has_zerocopied_string_[col]) { + dim0_index += values.dim_size(0); + } else { + dim0_index += (item_limit_pos - item_start_pos); + } + row += items_[idx]->batch_size; } return Status::OK(); diff --git a/hybridbackend/tensorflow/data/rebatch/rebatch_dataset_v2.cc b/hybridbackend/tensorflow/data/rebatch/rebatch_dataset_v2.cc index 75732dda..97be8f0e 100644 --- a/hybridbackend/tensorflow/data/rebatch/rebatch_dataset_v2.cc +++ b/hybridbackend/tensorflow/data/rebatch/rebatch_dataset_v2.cc @@ -32,6 +32,7 @@ limitations under the License. 
#include #include "hybridbackend/common/env.h" +#include "hybridbackend/tensorflow/common/arrow.h" #include "hybridbackend/tensorflow/common/dataset.h" #include "hybridbackend/tensorflow/common/eigen.h" #include "hybridbackend/tensorflow/data/rebatch/buffer.h" @@ -333,13 +334,16 @@ class RebatchTabularDatasetV2Op::Dataset::Iterator } } + buffer_.CheckZeroCopiedString(std::move(input_tensors)); // Fast path for same batch size static const bool kRebatchFaspathDisabled = ::hybridbackend::EnvVarGetBool(kRebatchDatasetFastPathEnv, false); if (TF_PREDICT_TRUE(!kRebatchFaspathDisabled)) { if (dataset()->shuffle_buffer_size_ < 1 && buffer_.size() == 0 && dataset()->batch_size_ == input_batch_size) { - *output_tensors = std::move(input_tensors); + output_tensors->clear(); + output_tensors->resize(input_tensors.size()); + buffer_.FastPath(alloc, input_tensors, output_tensors); return Status::OK(); } } @@ -357,7 +361,7 @@ class RebatchTabularDatasetV2Op::Dataset::Iterator } } input_impl_.reset(); - if (buffer_.size() > dataset()->batch_size_) { + if (buffer_.size() >= dataset()->batch_size_) { *end_of_sequence = false; if (dataset()->shuffle_buffer_size_ > 0) { TF_RETURN_IF_ERROR(buffer_.Shuffle(generator_, dataset()->batch_size_)); diff --git a/hybridbackend/tensorflow/data/tests/parquet_dataset_string_test.py b/hybridbackend/tensorflow/data/tests/parquet_dataset_string_test.py index 7c62e75c..83eb531a 100644 --- a/hybridbackend/tensorflow/data/tests/parquet_dataset_string_test.py +++ b/hybridbackend/tensorflow/data/tests/parquet_dataset_string_test.py @@ -67,6 +67,7 @@ def test_read(self): ds = hb.data.ParquetDataset( [self._filename], batch_size=batch_size) + ds = ds.apply(hb.data.rebatch(batch_size)) ds = ds.prefetch(4) batch = tf.data.make_one_shot_iterator(ds).get_next() @@ -98,6 +99,7 @@ def test_to_sparse(self): batch_size = 32 with tf.Graph().as_default() as graph: ds = hb.data.ParquetDataset(self._filename, batch_size=batch_size) + ds = ds.apply(hb.data.rebatch(batch_size)) ds = ds.map(hb.data.DataFrame.parse) ds = ds.prefetch(4) batch = tf.data.make_one_shot_iterator(ds).get_next() @@ -126,36 +128,6 @@ def test_to_sparse(self): len(set(list(zip(*actual.indices))[0])) + 1, len(expected.nested_row_splits[0])) - def test_unbatch_and_to_sparse(self): - with tf.Graph().as_default() as graph: - ds = hb.data.Dataset.from_parquet(self._filename) - ds = ds.prefetch(4) - batch = tf.data.make_one_shot_iterator(ds).get_next() - - c = self._df['col0'] - with tf.Session(graph=graph) as sess: - for i in xrange(3): - result = sess.run(batch) - start_row = i - end_row = i + 1 - expected_items = c[start_row:end_row].to_numpy().tolist() - expected_values = [] - expected_splits = [0] - for item in expected_items: - expected_values.extend(item) - expected_splits.append(expected_splits[-1] + len(item)) - expected = hb.data.DataFrame.Value( - np.array(expected_values), - [np.array(expected_splits, dtype=np.int32)]) - actual = result['col0'] - expected_values = np.array( - list(map(str.encode, expected.values)), - dtype=object) - np.testing.assert_equal(actual.values, expected_values) - np.testing.assert_equal( - len(list(zip(*actual.indices))[0]), - expected.nested_row_splits[0][1]) - if __name__ == '__main__': hbtest.main(f'{__file__}.xml')