From b960aaa0024c26e0d4b862755f92b7956ccac356 Mon Sep 17 00:00:00 2001
From: Mark Bunday
Date: Thu, 1 Sep 2022 10:32:21 -0400
Subject: [PATCH 01/12] Bump ubuntu image version from 16.04 to 18.04

---
 docker/1.0-1/base/Dockerfile.cpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/1.0-1/base/Dockerfile.cpu b/docker/1.0-1/base/Dockerfile.cpu
index 9cd923ac..7fb32b36 100644
--- a/docker/1.0-1/base/Dockerfile.cpu
+++ b/docker/1.0-1/base/Dockerfile.cpu
@@ -1,4 +1,4 @@
-FROM ubuntu:16.04
+FROM ubuntu:18.04
 
 # Install python and other runtime dependencies
 RUN apt-get update && \

From b185039cbe56ef650f5cb5ac6c6cafc994912a53 Mon Sep 17 00:00:00 2001
From: Mark Bunday
Date: Thu, 1 Sep 2022 10:36:56 -0400
Subject: [PATCH 02/12] Add ci/buildspec.yml

---
 ci/buildspec.yml | 61 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 ci/buildspec.yml

diff --git a/ci/buildspec.yml b/ci/buildspec.yml
new file mode 100644
index 00000000..92393ca8
--- /dev/null
+++ b/ci/buildspec.yml
@@ -0,0 +1,61 @@
+version: 0.2
+
+phases:
+  install:
+    runtime-versions:
+      python: 3.6
+      docker: 19
+  pre_build:
+    commands:
+      - echo Pre-build started on `date`
+      - echo Installing dependencies...
+      - curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+      - bash Miniconda3-latest-Linux-x86_64.sh -bfp /miniconda3
+      - export PATH=/miniconda3/bin:${PATH}
+      - conda install python=3.6
+      - conda update -y conda
+      - python3 -m pip install pip==20.1 # The new pip dependency resolver in 20.2+ can't resolve 1.0-1 and 0.90 dependencies
+      - python3 -m pip install .[test]
+  build:
+    commands:
+      - echo Build started on `date`
+      - echo Docker login...
+      - docker login -u $dockerhub_username -p $dockerhub_password
+      - echo Building the Docker image...
+      - docker build -t xgboost-container-base:$FRAMEWORK_VERSION-cpu-py3 -f docker/$FRAMEWORK_VERSION/base/Dockerfile.cpu .
+      - python3 setup.py bdist_wheel --universal
+      - docker build -t preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 -f docker/$FRAMEWORK_VERSION/final/Dockerfile.cpu .
+      - echo Running tox...
+      - printf "FROM preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3\nADD . /app\nWORKDIR /app\nRUN python3 -m pip install .[test]" > Dockerfile.test
+      - docker build -t test-xgboost-container -f Dockerfile.test .
+      - docker run --rm -t test-xgboost-container sh -c 'pytest --cov=sagemaker_xgboost_container --cov-fail-under=60 test/unit'
+      - docker run --rm -t test-xgboost-container sh -c 'flake8 setup.py src test'
+      - echo Running container tests...
+      - pytest test/integration/local --docker-base-name preprod-xgboost-container --tag $FRAMEWORK_VERSION-cpu-py3 --py-version 3 --framework-version $FRAMEWORK_VERSION
+      - docker tag preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION-cpu-py3
+      - docker tag preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION
+  post_build:
+    commands:
+      - echo Build completed on `date`
+      - |
+        case $CODEBUILD_WEBHOOK_EVENT in
+        PULL_REQUEST_MERGED)
+          echo Logging in to Amazon ECR...
+          $(aws ecr get-login --no-include-email --region $AWS_DEFAULT_REGION)
+          echo Pushing the Docker image...
+          docker push $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION-cpu-py3 | grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"
+          docker push $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION | grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"
+          ;;
+        PULL_REQUEST_CREATED | PULL_REQUEST_UPDATED | PULL_REQUEST_REOPENED)
+          echo Logging in to Amazon ECR...
+          $(aws ecr get-login --no-include-email --region $AWS_DEFAULT_REGION)
+          echo Pushing the Docker image...
+          # pushes test tag for manual verification, requires cleanup in ECR every once in a while though
+          TEST_TAG=$SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:${FRAMEWORK_VERSION}-cpu-py3-test
+          docker tag preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 ${TEST_TAG}
+          docker push ${TEST_TAG} | grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"
+          ;;
+        *)
+          echo Undefined behavior for webhook event type $CODEBUILD_WEBHOOK_EVENT
+          ;;
+        esac
\ No newline at end of file

From 349abdf8020816c559702209428ef143984372e5 Mon Sep 17 00:00:00 2001
From: Mark Bunday
Date: Thu, 1 Sep 2022 10:44:11 -0400
Subject: [PATCH 03/12] Upgrade Python from 3.6 to 3.8

---
 ci/buildspec.yml                  |   4 +-
 docker/1.0-1/base/Dockerfile.cpu  | 142 +++++++++++++++++++++++-------
 docker/1.0-1/final/Dockerfile.cpu |  22 +++--
 3 files changed, 127 insertions(+), 41 deletions(-)

diff --git a/ci/buildspec.yml b/ci/buildspec.yml
index 92393ca8..a7d9afab 100644
--- a/ci/buildspec.yml
+++ b/ci/buildspec.yml
@@ -3,7 +3,7 @@ version: 0.2
 phases:
   install:
     runtime-versions:
-      python: 3.6
+      python: 3.8
       docker: 19
   pre_build:
     commands:
@@ -12,7 +12,7 @@ phases:
       - curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
       - bash Miniconda3-latest-Linux-x86_64.sh -bfp /miniconda3
       - export PATH=/miniconda3/bin:${PATH}
-      - conda install python=3.6
+      - conda install python=3.8
       - conda update -y conda
       - python3 -m pip install pip==20.1 # The new pip dependency resolver in 20.2+ can't resolve 1.0-1 and 0.90 dependencies
      - python3 -m pip install .[test]
diff --git a/docker/1.0-1/base/Dockerfile.cpu b/docker/1.0-1/base/Dockerfile.cpu
index 7fb32b36..924494ab 100644
--- a/docker/1.0-1/base/Dockerfile.cpu
+++ b/docker/1.0-1/base/Dockerfile.cpu
@@ -1,35 +1,21 @@
-FROM ubuntu:18.04
+ARG UBUNTU_VERSION=18.04
+ARG CUDA_VERSION=10.2
+ARG IMAGE_DIGEST=218afa9c2002be9c4629406c07ae4daaf72a3d65eb3c5a5614d9d7110840a46e
 
-# Install python and other runtime dependencies
-RUN apt-get update && \
-    apt-get -y install \
-    build-essential \
-    libatlas-dev \
-    git \
-    wget \
-    curl \
-    nginx \
-    jq
-
-RUN apt-get update
-RUN apt-get clean
+FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${UBUNTU_VERSION}@sha256:${IMAGE_DIGEST}
 
-RUN apt-get -y install openjdk-8-jdk-headless
+ARG MINICONDA_VERSION=4.9.2
+ARG CONDA_PY_VERSION=38
+ARG CONDA_CHECKSUM="b4e46fcc8029e2cfa731b788f25b1d36"
+ARG CONDA_PKG_VERSION=4.10.1
+ARG PYTHON_VERSION=3.8.13
+ARG PYARROW_VERSION=1.0
+ARG MLIO_VERSION=0.7.0
+ARG XGBOOST_VERSION=1.0
 
-# Install mlio
-RUN echo 'installing miniconda' && \
-    curl -LO https://repo.anaconda.com/miniconda/Miniconda3-py38_4.8.3-Linux-x86_64.sh && \
-    echo "d63adf39f2c220950a063e0529d4ff74 Miniconda3-py38_4.8.3-Linux-x86_64.sh" | md5sum -c - && \
-    bash Miniconda3-py38_4.8.3-Linux-x86_64.sh -bfp /miniconda3 && \
-    rm Miniconda3-py38_4.8.3-Linux-x86_64.sh
-
-ENV PATH=/miniconda3/bin:${PATH}
-
-RUN conda install -c conda-forge python=3.6.13 && \
-    conda update -y conda && \
-    conda install pip=20.1 && \
-    conda install -c conda-forge pyarrow=0.14.1 && \
-    conda install -c mlio -c conda-forge mlio-py=0.1
+ENV DEBIAN_FRONTEND=noninteractive
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
 
 # Python won’t try to write .pyc or .pyo files on the import of source modules
 # Force stdin, stdout and stderr to be totally unbuffered. Good for logging
@@ -37,5 +23,101 @@ ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 ENV PYTHONIOENCODING='utf-8'
 
+RUN rm /etc/apt/sources.list.d/cuda.list && \
+    rm /etc/apt/sources.list.d/nvidia-ml.list && \
+    apt-key del 7fa2af80 && \
+    apt-get update && apt-get install -y --no-install-recommends wget && \
+    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
+    dpkg -i cuda-keyring_1.0-1_all.deb && \
+    apt-get update && \
+    apt-get -y upgrade && \
+    apt-get -y install --no-install-recommends \
+        build-essential \
+        curl \
+        git \
+        jq \
+        libatlas-base-dev \
+        nginx \
+        openjdk-8-jdk-headless \
+        unzip \
+        wget \
+    && \
+    # MLIO build dependencies
+    # Official Ubuntu APT repositories do not contain an up-to-date version of CMake required to build MLIO.
+    # Kitware contains the latest version of CMake.
+    apt-get -y install --no-install-recommends \
+        apt-transport-https \
+        ca-certificates \
+        gnupg \
+        software-properties-common \
+    && \
+    wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | \
+        gpg --dearmor - | \
+        tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null && \
+    echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ bionic main' | tee /etc/apt/sources.list.d/kitware.list >/dev/null && \
+    apt-get update && \
+    rm /usr/share/keyrings/kitware-archive-keyring.gpg && \
+    apt-get install -y --no-install-recommends \
+        autoconf \
+        automake \
+        build-essential \
+        cmake=3.18.4-0kitware1 \
+        cmake-data=3.18.4-0kitware1 \
+        doxygen \
+        kitware-archive-keyring \
+        libcurl4-openssl-dev \
+        libssl-dev \
+        libtool \
+        ninja-build \
+        python3-dev \
+        python3-distutils \
+        python3-pip \
+        zlib1g-dev \
+    && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install conda
+RUN cd /tmp && \
+    curl -L --output /tmp/Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-py${CONDA_PY_VERSION}_${MINICONDA_VERSION}-Linux-x86_64.sh && \
+    echo "${CONDA_CHECKSUM} /tmp/Miniconda3.sh" | md5sum -c - && \
+    bash /tmp/Miniconda3.sh -bfp /miniconda3 && \
+    rm /tmp/Miniconda3.sh
+
+ENV PATH=/miniconda3/bin:${PATH}
+
+# Install MLIO with Apache Arrow integration
+# We could install mlio-py from conda, but it comes with extra components, such as the image reader, that increase
+# the image size and hence training time. We build from source to minimize the image size.
+RUN echo "conda ${CONDA_PKG_VERSION}" >> /miniconda3/conda-meta/pinned && \
+    # Conda configuration, see https://conda.io/projects/conda/en/latest/configuration.html
+    conda config --system --set auto_update_conda false && \
+    conda config --system --set show_channel_urls true && \
+    echo "python ${PYTHON_VERSION}.*" >> /miniconda3/conda-meta/pinned && \
+    conda install -c conda-forge python=${PYTHON_VERSION} && \
+    conda install conda=${CONDA_PKG_VERSION} && \
+    conda update -y conda && \
+    conda install -c conda-forge pyarrow=${PYARROW_VERSION} && \
+    cd /tmp && \
+    git clone --branch v${MLIO_VERSION} https://github.com/awslabs/ml-io.git mlio && \
+    cd mlio && \
+    build-tools/build-dependency build/third-party all && \
+    mkdir -p build/release && \
+    cd build/release && \
+    cmake -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH="$(pwd)/../third-party" ../.. && \
+    cmake --build . && \
+    cmake --build . --target install && \
+    cmake -DMLIO_INCLUDE_PYTHON_EXTENSION=ON -DPYTHON_EXECUTABLE="/miniconda3/bin/python3" \
+        -DMLIO_INCLUDE_ARROW_INTEGRATION=ON ../.. && \
+    cmake --build . --target mlio-py && \
+    cmake --build . --target mlio-arrow && \
+    cd ../../src/mlio-py && \
+    python3 setup.py bdist_wheel && \
+    python3 -m pip install typing && \
+    python3 -m pip install --upgrade pip && \
+    python3 -m pip install dist/*.whl && \
+    cp -r /tmp/mlio/build/third-party/lib/intel64/gcc4.7/* /usr/local/lib/ && \
+    ldconfig && \
+    rm -rf /tmp/mlio
 
 # Install latest version of XGBoost
-RUN python3 -m pip install --no-cache -I xgboost==1.0
+RUN python3 -m pip install --no-cache -I xgboost==${XGBOOST_VERSION}
\ No newline at end of file
diff --git a/docker/1.0-1/final/Dockerfile.cpu b/docker/1.0-1/final/Dockerfile.cpu
index ea471596..2b9732ff 100644
--- a/docker/1.0-1/final/Dockerfile.cpu
+++ b/docker/1.0-1/final/Dockerfile.cpu
@@ -1,5 +1,9 @@
-FROM xgboost-container-base:1.0-1-cpu-py3
-ENV SAGEMAKER_XGBOOST_VERSION 1.0-1
+ARG SAGEMAKER_XGBOOST_VERSION=1.0-1
+ARG PYTHON_VERSION=3.8
+
+FROM xgboost-container-base:${SAGEMAKER_XGBOOST_VERSION}-cpu-py3
+
+ARG SAGEMAKER_XGBOOST_VERSION
 
 ########################
 # Install dependencies #
@@ -11,9 +15,9 @@ RUN python3 -m pip install -r /requirements.txt && rm /requirements.txt
 # Copy wheel to container #
 ###########################
 COPY dist/sagemaker_xgboost_container-2.0-py2.py3-none-any.whl /sagemaker_xgboost_container-1.0-py2.py3-none-any.whl
-# https://github.com/googleapis/google-cloud-python/issues/6647
-RUN rm -rf /miniconda3/lib/python3.6/site-packages/numpy-1.19.5.dist-info && \
+RUN rm -rf /miniconda3/lib/python3.8/site-packages/numpy-1.21.2.dist-info && \
     python3 -m pip install --no-cache /sagemaker_xgboost_container-1.0-py2.py3-none-any.whl && \
+    python3 -m pip uninstall -y typing && \
     rm /sagemaker_xgboost_container-1.0-py2.py3-none-any.whl
 
 ##############
@@ -21,10 +25,10 @@ RUN rm -rf /miniconda3/lib/python3.6/site-packages/numpy-1.19.5.dist-info && \
 ##############
 # TODO: remove after making contributions back to xgboost for tracker.py
 COPY src/sagemaker_xgboost_container/dmlc_patch/tracker.py \
-    /miniconda3/lib/python3.6/site-packages/xgboost/dmlc-core/tracker/dmlc_tracker/tracker.py
+    /miniconda3/lib/python${PYTHON_VERSION}/site-packages/xgboost/dmlc-core/tracker/dmlc_tracker/tracker.py
 
 # Include DMLC python code in PYTHONPATH to use RabitTracker
-ENV PYTHONPATH=$PYTHONPATH:/miniconda3/lib/python3.6/site-packages/xgboost/dmlc-core/tracker
+ENV PYTHONPATH=$PYTHONPATH:/miniconda3/lib/python${PYTHON_VERSION}/site-packages/xgboost/dmlc-core/tracker
 
 #######
 # MMS #
 #######
@@ -34,12 +38,12 @@ RUN useradd -m model-server
 RUN mkdir -p /home/model-server/tmp && chown -R model-server /home/model-server
 
 # Copy MMS configs
-COPY docker/$SAGEMAKER_XGBOOST_VERSION/resources/mms/config.properties.tmp /home/model-server
+COPY docker/${SAGEMAKER_XGBOOST_VERSION}/resources/mms/config.properties.tmp /home/model-server
 ENV XGBOOST_MMS_CONFIG=/home/model-server/config.properties
 
 # Copy execution parameters endpoint plugin for MMS
 RUN mkdir -p /tmp/plugins
-COPY docker/$SAGEMAKER_XGBOOST_VERSION/resources/mms/endpoints-1.0.jar /tmp/plugins
+COPY docker/${SAGEMAKER_XGBOOST_VERSION}/resources/mms/endpoints-1.0.jar /tmp/plugins
 RUN chmod +x /tmp/plugins/endpoints-1.0.jar
 
 # Create directory for models
@@ -67,4 +71,4 @@ ENV SAGEMAKER_SERVING_MODULE sagemaker_xgboost_container.serving:main
 
 EXPOSE 8080
 ENV TEMP=/home/model-server/tmp
-LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
+LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
\ No newline at end of file
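
The base image above now builds mlio-py from source against Arrow. A minimal smoke-test sketch — not part of the patch series — for verifying the stack inside a locally built image; the tag mirrors ci/buildspec.yml, and the script only touches modules the Dockerfile installs:

```python
# smoke_test.py -- sketch; run inside the built base image, e.g.
#   docker run --rm xgboost-container-base:1.0-1-cpu-py3 python3 smoke_test.py
import sys

import mlio            # built from source with Arrow integration above
import pyarrow         # pinned via ARG PYARROW_VERSION
import xgboost as xgb  # pinned via ARG XGBOOST_VERSION

assert sys.version_info[:2] == (3, 8), "expected the pinned Python 3.8"
print("pyarrow:", pyarrow.__version__)
print("xgboost:", xgb.__version__)

# The reader API that later patches (09 and 12) depend on should be present:
for name in ("DataReaderParams", "RecordIOProtobufReader", "SageMakerPipe", "DenseTensor"):
    assert hasattr(mlio, name), f"mlio 0.7 API missing: {name}"
print("mlio 0.7 reader API available")
```
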
From 1c97ddd346ea2a98cf7f0979daa18827bf450edb Mon Sep 17 00:00:00 2001
From: Mark Bunday
Date: Thu, 1 Sep 2022 10:46:13 -0400
Subject: [PATCH 04/12] Upgrade requirements and tox.ini

---
 requirements.txt      | 38 ++++++++++++++++++++++----------------
 test-requirements.txt |  1 +
 tox.ini               | 13 ++++++++-----
 3 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 9529a1da..0053e600 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,21 +1,27 @@
 Flask==1.1.1 # sagemaker-containers requires flask 1.1.1
 PyYAML==5.4.1
-boto3==1.10.14
-botocore==1.13.14
-gunicorn<20.0.0
-cryptography==3.4.6
-matplotlib==3.3.2
-multi-model-server==1.1.1
+Pillow==9.1.0
+boto3==1.17.52
+botocore==1.20.52
+cryptography==35.0.0
+gunicorn==19.10.0
+itsdangerous==2.0.1
+matplotlib==3.4.1
+multi-model-server==1.1.2
 numpy==1.19.2
-pandas==1.1.3
+pandas==1.2.4
+protobuf==3.20.1
 psutil==5.6.7 # sagemaker-containers requires psutil 5.6.7
-python-dateutil==2.8.0
-requests<2.21
+python-dateutil==2.8.1
+requests==2.25.1
 retrying==1.3.3
-sagemaker-containers>=2.8.3,<2.9
-sagemaker-inference==1.2.0
-scikit-learn==0.23.2
-scipy==1.2.2
-smdebug==0.4.13
-urllib3==1.25.9
-wheel
+sagemaker-containers==2.8.6.post2
+sagemaker-inference==1.5.5
+scikit-learn==0.24.1
+scipy==1.6.2
+smdebug==1.0.10
+urllib3==1.26.5
+wheel==0.36.2
+jinja2==2.11.3
+MarkupSafe==1.1.1
+Werkzeug==0.15.6
diff --git a/test-requirements.txt b/test-requirements.txt
index 2a0248cb..0108ed40 100644
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -1,3 +1,4 @@
+Flask==1.1.1 # sagemaker-containers requires flask 1.1.1
 coverage
 docker-compose
 flake8
diff --git a/tox.ini b/tox.ini
index 70c3b119..6e7f4a66 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = {py36}-xgboost{1.0},flake8
+envlist = {py38}-xgboost{1.0},flake8
 
 [flake8]
 max-line-length = 120
@@ -11,19 +11,22 @@
 deps =
     xgboost0.82: xgboost==0.82
     xgboost0.90: xgboost==0.90
     xgboost1.0: xgboost==1.0
+    xgboost1.2: xgboost==1.2
+    xgboost1.3: xgboost==1.3.3
+    xgboost1.5: xgboost==1.5.2
     xgboostlatest: xgboost
     -r{toxinidir}/requirements.txt
     -r{toxinidir}/test-requirements.txt
 conda_deps=
-    pyarrow=0.14.1
-    mlio-py=0.1
+    pyarrow==1.0.1
+    tbb==2020.2
+    mlio-py==0.7.0
 conda_channels=
     conda-forge
     mlio
 commands = pytest --cov=sagemaker_xgboost_container --cov-fail-under=60 test/unit # increase minimum bar over time (75%+)
-install_command = python3 -m pip install {opts} {packages} --use-deprecated=legacy-resolver
 
 [testenv:flake8]
 deps = flake8
-commands = flake8 setup.py src test
+commands = flake8 setup.py src test
\ No newline at end of file
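
Because pip 20.1's legacy resolver (pinned in ci/buildspec.yml) does not cross-check these pins for consistency, a post-install sanity check can catch drift early. A sketch of a hypothetical helper — not part of the patch — that spot-checks a few of the pins above:

```python
# check_pins.py -- hypothetical sanity check for the pinned runtime stack;
# pip 20.1's legacy resolver does not validate pin consistency on its own.
import pkg_resources

SPOT_CHECKS = {
    "boto3": "1.17.52",
    "pandas": "1.2.4",
    "scikit-learn": "0.24.1",
    "sagemaker-containers": "2.8.6.post2",
}

for name, expected in SPOT_CHECKS.items():
    installed = pkg_resources.get_distribution(name).version
    print(f"{name}: installed={installed} expected={expected}")
    assert installed == expected, f"{name} drifted from requirements.txt"

# require() additionally raises VersionConflict if transitive pins disagree.
pkg_resources.require([f"{n}=={v}" for n, v in SPOT_CHECKS.items()])
```
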
From 241bfee6cbcbdefdb20cde9571a9022ce7326ab0 Mon Sep 17 00:00:00 2001
From: Mark Bunday
Date: Thu, 1 Sep 2022 10:53:20 -0400
Subject: [PATCH 05/12] Correct conda version

---
 docker/1.0-1/base/Dockerfile.cpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/1.0-1/base/Dockerfile.cpu b/docker/1.0-1/base/Dockerfile.cpu
index 924494ab..fc609259 100644
--- a/docker/1.0-1/base/Dockerfile.cpu
+++ b/docker/1.0-1/base/Dockerfile.cpu
@@ -5,7 +5,7 @@ ARG IMAGE_DIGEST=218afa9c2002be9c4629406c07ae4daaf72a3d65eb3c5a5614d9d7110840a46e
 FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${UBUNTU_VERSION}@sha256:${IMAGE_DIGEST}
 
 ARG MINICONDA_VERSION=4.9.2
-ARG CONDA_PY_VERSION=38
+ARG CONDA_PY_VERSION=39
 ARG CONDA_CHECKSUM="b4e46fcc8029e2cfa731b788f25b1d36"
 ARG CONDA_PKG_VERSION=4.10.1
 ARG PYTHON_VERSION=3.8.13

From 78fce897b9135f05df7271f84e15fbc34c78b39b Mon Sep 17 00:00:00 2001
From: Mark Bunday
Date: Thu, 1 Sep 2022 11:25:30 -0400
Subject: [PATCH 06/12] Revert ML-IO version

---
 docker/1.0-1/base/Dockerfile.cpu | 2 +-
 tox.ini                          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/1.0-1/base/Dockerfile.cpu b/docker/1.0-1/base/Dockerfile.cpu
index fc609259..bdac1e3f 100644
--- a/docker/1.0-1/base/Dockerfile.cpu
+++ b/docker/1.0-1/base/Dockerfile.cpu
@@ -10,7 +10,7 @@ ARG CONDA_CHECKSUM="b4e46fcc8029e2cfa731b788f25b1d36"
 ARG CONDA_PKG_VERSION=4.10.1
 ARG PYTHON_VERSION=3.8.13
 ARG PYARROW_VERSION=1.0
-ARG MLIO_VERSION=0.7.0
+ARG MLIO_VERSION=0.1.0
 ARG XGBOOST_VERSION=1.0
 
 ENV DEBIAN_FRONTEND=noninteractive
diff --git a/tox.ini b/tox.ini
index 6e7f4a66..f5062ba3 100644
--- a/tox.ini
+++ b/tox.ini
@@ -20,7 +20,7 @@ deps =
 conda_deps=
     pyarrow==1.0.1
     tbb==2020.2
-    mlio-py==0.7.0
+    mlio-py==0.1.0
 conda_channels=
     conda-forge
     mlio

From 3377be1e6f51cf0b53538928e51f5fbf7ef78f62 Mon Sep 17 00:00:00 2001
From: Mark Bunday
Date: Thu, 1 Sep 2022 11:36:34 -0400
Subject: [PATCH 07/12] Revert "Revert ML-IO version"

This reverts commit 78fce897b9135f05df7271f84e15fbc34c78b39b.
---
 docker/1.0-1/base/Dockerfile.cpu | 2 +-
 tox.ini                          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/1.0-1/base/Dockerfile.cpu b/docker/1.0-1/base/Dockerfile.cpu
index bdac1e3f..fc609259 100644
--- a/docker/1.0-1/base/Dockerfile.cpu
+++ b/docker/1.0-1/base/Dockerfile.cpu
@@ -10,7 +10,7 @@ ARG CONDA_CHECKSUM="b4e46fcc8029e2cfa731b788f25b1d36"
 ARG CONDA_PKG_VERSION=4.10.1
 ARG PYTHON_VERSION=3.8.13
 ARG PYARROW_VERSION=1.0
-ARG MLIO_VERSION=0.1.0
+ARG MLIO_VERSION=0.7.0
 ARG XGBOOST_VERSION=1.0
 
 ENV DEBIAN_FRONTEND=noninteractive
diff --git a/tox.ini b/tox.ini
index f5062ba3..6e7f4a66 100644
--- a/tox.ini
+++ b/tox.ini
@@ -20,7 +20,7 @@ deps =
 conda_deps=
     pyarrow==1.0.1
     tbb==2020.2
-    mlio-py==0.1.0
+    mlio-py==0.7.0
 conda_channels=
     conda-forge
     mlio

From c6dba111c946903aa0e0ca68480098e138fa77fd Mon Sep 17 00:00:00 2001
From: Mark Bunday
Date: Thu, 1 Sep 2022 11:39:49 -0400
Subject: [PATCH 08/12] Update unit tests for mlio-0.7.0

---
 test/unit/test_data_utils.py | 88 ++++++++++++++++++++++++++++++----
 test/unit/test_encoder.py    | 10 +++-
 2 files changed, 86 insertions(+), 12 deletions(-)

diff --git a/test/unit/test_data_utils.py b/test/unit/test_data_utils.py
index 837b7601..34d63ac9 100644
--- a/test/unit/test_data_utils.py
+++ b/test/unit/test_data_utils.py
@@ -14,11 +14,13 @@
 import unittest
 import os
 from pathlib import Path
+import pandas as pd
 import shutil
 import signal
 import subprocess
 import sys
 import time
+from mock import patch
 
 from sagemaker_algorithm_toolkit import exceptions as exc
 from sagemaker_xgboost_container import data_utils
@@ -82,23 +84,47 @@ def _check_dmatrix(self, reader, path, num_col, num_row, *args):
         self.assertEqual(num_col, single_node_dmatrix.num_col())
         self.assertEqual(num_row, single_node_dmatrix.num_row())
 
-        no_weight_test_features = ["f{}".format(idx) for idx in range(single_node_dmatrix.num_col())]
+    def _check_piped_dmatrix(self, file_path, pipe_dir, reader, num_col, num_row, *args):
+        python_exe = sys.executable
+        pipe_cmd = '{}/sagemaker_pipe.py train {} {}'.format(self.utils_path, file_path, pipe_dir)
 
-        self.assertEqual(no_weight_test_features, single_node_dmatrix.feature_names)
+        proc = subprocess.Popen([python_exe] + pipe_cmd.split(" "))
 
-    def _check_piped_dmatrix(self, file_path, pipe_path, pipe_dir, reader, num_col, num_row, *args):
+        try:
+            time.sleep(1)
+            pipe_path = os.path.join(pipe_dir, 'train')
+            self._check_dmatrix(reader, pipe_path, num_col, num_row, *args)
+        finally:
+            os.kill(proc.pid, signal.SIGTERM)
+            shutil.rmtree(pipe_dir)
+
+    def _check_piped_dmatrix2(self, file_path, pipe_dir, reader, num_col, num_row, *args):
         python_exe = sys.executable
         pipe_cmd = '{}/sagemaker_pipe.py train {} {}'.format(self.utils_path, file_path, pipe_dir)
+        pipe_cmd2 = '{}/sagemaker_pipe.py validation {} {}'.format(self.utils_path, file_path, pipe_dir)
 
         proc = subprocess.Popen([python_exe] + pipe_cmd.split(" "))
+        proc2 = subprocess.Popen([python_exe] + pipe_cmd2.split(" "))
 
         try:
             time.sleep(1)
-            self._check_dmatrix(reader, pipe_path, num_col, num_row, *args)
+            pipes_path = [os.path.join(pipe_dir, 'train'), os.path.join(pipe_dir, 'validation')]
+            self._check_dmatrix(reader, pipes_path, num_col, 2*num_row, *args)
         finally:
             os.kill(proc.pid, signal.SIGTERM)
+            os.kill(proc2.pid, signal.SIGTERM)
             shutil.rmtree(pipe_dir)
 
+    def test_get_dmatrix(self):
+        current_path = Path(os.path.abspath(__file__))
+        data_path = os.path.join(str(current_path.parent.parent), 'resources', 'abalone', 'data')
+        file_path = [os.path.join(data_path, path) for path in ['train', 'validation']]
+
+        dmatrix = data_utils.get_dmatrix(file_path, 'libsvm', 0, False)
+
+        self.assertEqual(9, dmatrix.num_col())
+        self.assertEqual(3548, dmatrix.num_row())
+
     def test_parse_csv_dmatrix(self):
         csv_file_paths_and_weight = [('train.csv', 0), ('train.csv.weights', 1), ('csv_files', 0)]
 
@@ -115,10 +141,20 @@ def test_parse_csv_dmatrix_pipe(self):
             with self.subTest(file_path=file_path, csv_weight=csv_weight):
                 csv_path = os.path.join(self.data_path, 'csv', file_path)
                 pipe_dir = os.path.join(self.data_path, 'csv', 'pipe_path', file_path)
-                pipe_path = os.path.join(pipe_dir, 'train')
                 reader = data_utils.get_csv_dmatrix
                 is_pipe = True
-                self._check_piped_dmatrix(csv_path, pipe_path, pipe_dir, reader, 5, 5, csv_weight, is_pipe)
+                self._check_piped_dmatrix(csv_path, pipe_dir, reader, 5, 5, csv_weight, is_pipe)
+
+    def test_parse_csv_dmatrix_pipe2(self):
+        csv_file_paths_and_weight = [('csv_files', 0), ('weighted_csv_files', 1)]
+
+        for file_path, csv_weight in csv_file_paths_and_weight:
+            with self.subTest(file_path=file_path, csv_weight=csv_weight):
+                csv_path = os.path.join(self.data_path, 'csv', file_path)
+                pipe_dir = os.path.join(self.data_path, 'csv', 'pipe_path2', file_path)
+                reader = data_utils.get_csv_dmatrix
+                is_pipe = True
+                self._check_piped_dmatrix2(csv_path, pipe_dir, reader, 5, 5, csv_weight, is_pipe)
 
     def test_parse_libsvm_dmatrix(self):
         libsvm_file_paths = ['train.libsvm', 'train.libsvm.weights', 'libsvm_files']
@@ -145,10 +181,9 @@ def test_parse_parquet_dmatrix_pipe(self):
             with self.subTest(file_path=file_path):
                 pq_path = os.path.join(self.data_path, 'parquet', file_path)
                 pipe_dir = os.path.join(self.data_path, 'parquet', 'pipe_path')
-                pipe_path = os.path.join(pipe_dir, 'train')
                 reader = data_utils.get_parquet_dmatrix
                 is_pipe = True
-                self._check_piped_dmatrix(pq_path, pipe_path, pipe_dir, reader, 5, 5, is_pipe)
+                self._check_piped_dmatrix(pq_path, pipe_dir, reader, 5, 5, is_pipe)
 
     def test_parse_protobuf_dmatrix(self):
         pb_file_paths = ['train.pb', 'pb_files']
@@ -166,10 +201,9 @@ def test_parse_protobuf_dmatrix_pipe(self):
             with self.subTest(file_path=file_path):
                 pb_path = os.path.join(self.data_path, 'recordio_protobuf', file_path)
                 pipe_dir = os.path.join(self.data_path, 'recordio_protobuf', 'pipe_path')
-                pipe_path = os.path.join(pipe_dir, 'train')
                 reader = data_utils.get_recordio_protobuf_dmatrix
                 is_pipe = True
-                self._check_piped_dmatrix(pb_path, pipe_path, pipe_dir, reader, 5, 5, is_pipe)
+                self._check_piped_dmatrix(pb_path, pipe_dir, reader, 5, 5, is_pipe)
 
     def test_parse_sparse_protobuf_dmatrix(self):
         pb_file_paths = ['sparse', 'sparse_edge_cases']
@@ -189,3 +223,37 @@ def test_parse_protobuf_dmatrix_single_feature_label(self):
             pb_path = os.path.join(self.data_path, 'recordio_protobuf', file_path)
             reader = data_utils.get_recordio_protobuf_dmatrix
             self._check_dmatrix(reader, pb_path, 1, 1)
+
+    @patch("logging.warning")
+    def test_check_data_redundancy_positive(self, mock_log_warning):
+        current_path = Path(os.path.abspath(__file__))
+        data_path = os.path.join(str(current_path.parent.parent), 'resources', 'abalone', 'data')
+        file_path = os.path.join(data_path, "train")
+        data_utils.check_data_redundancy(file_path, file_path)
+        mock_log_warning.assert_called()
+
+    @patch("logging.warning")
+    def test_check_data_redundancy_negative(self, mock_log_warning):
+        current_path = Path(os.path.abspath(__file__))
+        data_path = os.path.join(str(current_path.parent.parent), 'resources', 'abalone', 'data')
+        file_path = [os.path.join(data_path, path) for path in ['train', 'validation']]
+        data_utils.check_data_redundancy(file_path[0], file_path[1])
+        mock_log_warning.assert_not_called()
+
+    def test_check_data_redundancy_does_not_throw_exception_file(self):
+        current_path = Path(os.path.abspath(__file__))
+        data_path = os.path.join(str(current_path.parent.parent), 'resources', 'abalone', 'data')
+        file_path = os.path.join(data_path, "train")
+        try:
+            data_utils.check_data_redundancy(file_path, file_path)
+        except Exception as e:
+            assert False, f"check_data_redundancy raised an exception {e} for file mode"
+
+    def test_check_data_redundancy_throws_exception_pipe(self):
+        pb_file_paths = ['pb_files']
+        with self.assertRaises(Exception):
+            data_utils.check_data_redundancy(pb_file_paths[0], pb_file_paths[1])
+
+    def test_pyarrow_to_parquet_conversion_does_not_throw_exception(self):
+        df = pd.DataFrame({'x': [1, 2]})
+        df.to_parquet('test.parquet', engine='pyarrow')
\ No newline at end of file
diff --git a/test/unit/test_encoder.py b/test/unit/test_encoder.py
index 7848ea6e..bb5ebd1a 100644
--- a/test/unit/test_encoder.py
+++ b/test/unit/test_encoder.py
@@ -24,7 +24,7 @@
 from sagemaker_xgboost_container import encoder
 
 
-@pytest.mark.parametrize('target', ('42,6,9', '42.0,6.0,9.0', '42\n6\n9\n'))
+@pytest.mark.parametrize('target', ('42,6,9', '42.0,6.0,9.0', '42\n6\n9\n', b'42,6,9', b'42.0,6.0,9.0', b'42\n6\n9\n'))
 def test_csv_to_dmatrix(target):
     actual = encoder.csv_to_dmatrix(target)
     assert type(actual) is xgb.DMatrix
@@ -97,6 +97,12 @@ def test_decode(content_type):
     decoder.assert_called_once_with(42)
 
 
+@pytest.mark.parametrize('content_type', ['text/csv; charset=UTF-8'])
+def test_decode_with_complex_csv_content_type(content_type):
+    dmatrix_result = encoder.decode("42.0,6.0,9.0\n42.0,6.0,9.0", content_type)
+    assert type(dmatrix_result) is xgb.DMatrix
+
+
 def test_encoder_jsonlines_from_json():
     json_response = json.dumps({'predictions': [{"predicted_label": 1, "probabilities": [0.4, 0.6]},
                                                 {"predicted_label": 0, "probabilities": [0.9, 0.1]}]})
@@ -110,4 +116,4 @@
 def test_encoder_jsonlines_from_json_error():
     bad_json_response = json.dumps({'predictions': [], 'metadata': []})
     with pytest.raises(ValueError):
-        encoder.json_to_jsonlines(bad_json_response)
+        encoder.json_to_jsonlines(bad_json_response)
\ No newline at end of file

From 2b8cd74d91f33a6c6b262d97308fef86fab074ca Mon Sep 17 00:00:00 2001
From: Mark Bunday
Date: Thu, 1 Sep 2022 12:12:23 -0400
Subject: [PATCH 09/12] Update encoder.py

---
 src/sagemaker_xgboost_container/encoder.py | 27 +++++++++++-----------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/sagemaker_xgboost_container/encoder.py b/src/sagemaker_xgboost_container/encoder.py
index 814bbac1..500b5612 100644
--- a/src/sagemaker_xgboost_container/encoder.py
+++ b/src/sagemaker_xgboost_container/encoder.py
@@ -35,21 +35,23 @@ def _clean_csv_string(csv_string, delimiter):
     return ['nan' if x == '' else x for x in csv_string.split(delimiter)]
 
 
-def csv_to_dmatrix(string_like, dtype=None):  # type: (str) -> xgb.DMatrix
+def csv_to_dmatrix(input: Union[str, bytes], dtype=None) -> xgb.DMatrix:
     """Convert a CSV object to a DMatrix object.
     Args:
-        string_like (str): CSV string. Assumes the string has been stripped of leading or trailing newline chars.
+        input (str/bytes): CSV string or UTF-8-encoded bytes object.
+            Assumes the string has been stripped of leading or trailing newline chars.
         dtype (dtype, optional): Data type of the resulting array. If None, the dtypes will be determined by the
             contents of each column, individually. This argument can only be used to 'upcast' the array. For
            downcasting, use the .astype(t) method.
     Returns:
         (xgb.DMatrix): XGBoost DataMatrix
     """
-    sniff_delimiter = csv.Sniffer().sniff(string_like.split('\n')[0][:512]).delimiter
+    csv_string = input.decode() if isinstance(input, bytes) else input
+    sniff_delimiter = csv.Sniffer().sniff(csv_string.split('\n')[0][:512]).delimiter
     delimiter = ',' if sniff_delimiter.isalnum() else sniff_delimiter
     logging.info("Determined delimiter of CSV input is \'{}\'".format(delimiter))
-    np_payload = np.array(list(map(lambda x: _clean_csv_string(x, delimiter), string_like.split('\n')))).astype(dtype)
+    np_payload = np.array(list(map(lambda x: _clean_csv_string(x, delimiter), csv_string.split('\n')))).astype(dtype)
     return xgb.DMatrix(np_payload)
 
@@ -83,21 +85,18 @@ def recordio_protobuf_to_dmatrix(string_like):  # type: (bytes) -> xgb.DMatrix
     """
     buf = bytes(string_like)
     dataset = [mlio.InMemoryStore(buf)]
-    reader = mlio.RecordIOProtobufReader(dataset=dataset, batch_size=100)
+    reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=100)
+    reader = mlio.RecordIOProtobufReader(reader_params)
 
-    if type(reader.peek_example()['values']) is mlio.core.DenseTensor:
-        to_matrix = as_numpy
-        vstack = np.vstack
-    else:
-        to_matrix = to_coo_matrix
-        vstack = scipy_vstack
+    is_dense_tensor = type(reader.peek_example()['values']) is mlio.DenseTensor
 
     examples = []
     for example in reader:
-        tmp = to_matrix(example['values'])  # Ignore labels if present
-        examples.append(tmp)
+        # Ignore labels if present
+        values = as_numpy(example['values']) if is_dense_tensor else to_coo_matrix(example['values'])
+        examples.append(values)
 
-    data = vstack(examples)
+    data = np.vstack(examples) if is_dense_tensor else scipy_vstack(examples).tocsr()
     dmatrix = xgb.DMatrix(data)
     return dmatrix

From eee1e6336dac801aaae9d98d8929acd698f1391e Mon Sep 17 00:00:00 2001
From: Mark Bunday
Date: Thu, 1 Sep 2022 12:58:47 -0400
Subject: [PATCH 10/12] Import Union from typing

---
 src/sagemaker_xgboost_container/encoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sagemaker_xgboost_container/encoder.py b/src/sagemaker_xgboost_container/encoder.py
index 500b5612..f0f15f67 100644
--- a/src/sagemaker_xgboost_container/encoder.py
+++ b/src/sagemaker_xgboost_container/encoder.py
@@ -18,7 +18,7 @@
 import logging
 import os
 import tempfile
-from typing import Iterable
+from typing import Iterable, Union
 
 import mlio
 from mlio.integ.numpy import as_numpy
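
For reference, a minimal usage sketch of the updated encoder (it mirrors the str/bytes pairs the unit tests exercise, and assumes the package and its mlio dependency are installed):

```python
import xgboost as xgb

from sagemaker_xgboost_container import encoder

# After patch 09 the same CSV payload may arrive as str or as UTF-8 bytes,
# e.g. straight off the wire at the inference endpoint.
for payload in ("42,6,9\n42.0,6.0,9.0", b"42,6,9\n42.0,6.0,9.0"):
    dmatrix = encoder.csv_to_dmatrix(payload)
    assert isinstance(dmatrix, xgb.DMatrix)
    print(f"{type(payload).__name__}: {dmatrix.num_row()} rows x {dmatrix.num_col()} cols")
```
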
From 917d015f84c08ac8b28308be9bbbee6f88b14c5d Mon Sep 17 00:00:00 2001
From: Mark Bunday
Date: Thu, 1 Sep 2022 13:43:31 -0400
Subject: [PATCH 11/12] Revert "Update unit tests for mlio-0.7.0"

This reverts commit c6dba111c946903aa0e0ca68480098e138fa77fd.
---
 test/unit/test_data_utils.py | 88 ++++--------------------------------
 test/unit/test_encoder.py    | 10 +---
 2 files changed, 12 insertions(+), 86 deletions(-)

diff --git a/test/unit/test_data_utils.py b/test/unit/test_data_utils.py
index 34d63ac9..837b7601 100644
--- a/test/unit/test_data_utils.py
+++ b/test/unit/test_data_utils.py
@@ -14,13 +14,11 @@
 import unittest
 import os
 from pathlib import Path
-import pandas as pd
 import shutil
 import signal
 import subprocess
 import sys
 import time
-from mock import patch
 
 from sagemaker_algorithm_toolkit import exceptions as exc
 from sagemaker_xgboost_container import data_utils
@@ -84,47 +82,23 @@ def _check_dmatrix(self, reader, path, num_col, num_row, *args):
         self.assertEqual(num_col, single_node_dmatrix.num_col())
         self.assertEqual(num_row, single_node_dmatrix.num_row())
 
-    def _check_piped_dmatrix(self, file_path, pipe_dir, reader, num_col, num_row, *args):
-        python_exe = sys.executable
-        pipe_cmd = '{}/sagemaker_pipe.py train {} {}'.format(self.utils_path, file_path, pipe_dir)
+        no_weight_test_features = ["f{}".format(idx) for idx in range(single_node_dmatrix.num_col())]
 
-        proc = subprocess.Popen([python_exe] + pipe_cmd.split(" "))
+        self.assertEqual(no_weight_test_features, single_node_dmatrix.feature_names)
 
-        try:
-            time.sleep(1)
-            pipe_path = os.path.join(pipe_dir, 'train')
-            self._check_dmatrix(reader, pipe_path, num_col, num_row, *args)
-        finally:
-            os.kill(proc.pid, signal.SIGTERM)
-            shutil.rmtree(pipe_dir)
-
-    def _check_piped_dmatrix2(self, file_path, pipe_dir, reader, num_col, num_row, *args):
+    def _check_piped_dmatrix(self, file_path, pipe_path, pipe_dir, reader, num_col, num_row, *args):
         python_exe = sys.executable
         pipe_cmd = '{}/sagemaker_pipe.py train {} {}'.format(self.utils_path, file_path, pipe_dir)
-        pipe_cmd2 = '{}/sagemaker_pipe.py validation {} {}'.format(self.utils_path, file_path, pipe_dir)
 
         proc = subprocess.Popen([python_exe] + pipe_cmd.split(" "))
-        proc2 = subprocess.Popen([python_exe] + pipe_cmd2.split(" "))
 
         try:
             time.sleep(1)
-            pipes_path = [os.path.join(pipe_dir, 'train'), os.path.join(pipe_dir, 'validation')]
-            self._check_dmatrix(reader, pipes_path, num_col, 2*num_row, *args)
+            self._check_dmatrix(reader, pipe_path, num_col, num_row, *args)
         finally:
             os.kill(proc.pid, signal.SIGTERM)
-            os.kill(proc2.pid, signal.SIGTERM)
             shutil.rmtree(pipe_dir)
 
-    def test_get_dmatrix(self):
-        current_path = Path(os.path.abspath(__file__))
-        data_path = os.path.join(str(current_path.parent.parent), 'resources', 'abalone', 'data')
-        file_path = [os.path.join(data_path, path) for path in ['train', 'validation']]
-
-        dmatrix = data_utils.get_dmatrix(file_path, 'libsvm', 0, False)
-
-        self.assertEqual(9, dmatrix.num_col())
-        self.assertEqual(3548, dmatrix.num_row())
-
     def test_parse_csv_dmatrix(self):
         csv_file_paths_and_weight = [('train.csv', 0), ('train.csv.weights', 1), ('csv_files', 0)]
 
@@ -141,20 +115,10 @@ def test_parse_csv_dmatrix_pipe(self):
             with self.subTest(file_path=file_path, csv_weight=csv_weight):
                 csv_path = os.path.join(self.data_path, 'csv', file_path)
                 pipe_dir = os.path.join(self.data_path, 'csv', 'pipe_path', file_path)
+                pipe_path = os.path.join(pipe_dir, 'train')
                 reader = data_utils.get_csv_dmatrix
                 is_pipe = True
-                self._check_piped_dmatrix(csv_path, pipe_dir, reader, 5, 5, csv_weight, is_pipe)
-
-    def test_parse_csv_dmatrix_pipe2(self):
-        csv_file_paths_and_weight = [('csv_files', 0), ('weighted_csv_files', 1)]
-
-        for file_path, csv_weight in csv_file_paths_and_weight:
-            with self.subTest(file_path=file_path, csv_weight=csv_weight):
-                csv_path = os.path.join(self.data_path, 'csv', file_path)
-                pipe_dir = os.path.join(self.data_path, 'csv', 'pipe_path2', file_path)
-                reader = data_utils.get_csv_dmatrix
-                is_pipe = True
-                self._check_piped_dmatrix2(csv_path, pipe_dir, reader, 5, 5, csv_weight, is_pipe)
+                self._check_piped_dmatrix(csv_path, pipe_path, pipe_dir, reader, 5, 5, csv_weight, is_pipe)
 
     def test_parse_libsvm_dmatrix(self):
         libsvm_file_paths = ['train.libsvm', 'train.libsvm.weights', 'libsvm_files']
@@ -181,9 +145,10 @@ def test_parse_parquet_dmatrix_pipe(self):
             with self.subTest(file_path=file_path):
                 pq_path = os.path.join(self.data_path, 'parquet', file_path)
                 pipe_dir = os.path.join(self.data_path, 'parquet', 'pipe_path')
+                pipe_path = os.path.join(pipe_dir, 'train')
                 reader = data_utils.get_parquet_dmatrix
                 is_pipe = True
-                self._check_piped_dmatrix(pq_path, pipe_dir, reader, 5, 5, is_pipe)
+                self._check_piped_dmatrix(pq_path, pipe_path, pipe_dir, reader, 5, 5, is_pipe)
 
     def test_parse_protobuf_dmatrix(self):
         pb_file_paths = ['train.pb', 'pb_files']
@@ -201,9 +166,10 @@ def test_parse_protobuf_dmatrix_pipe(self):
             with self.subTest(file_path=file_path):
                 pb_path = os.path.join(self.data_path, 'recordio_protobuf', file_path)
                 pipe_dir = os.path.join(self.data_path, 'recordio_protobuf', 'pipe_path')
+                pipe_path = os.path.join(pipe_dir, 'train')
                 reader = data_utils.get_recordio_protobuf_dmatrix
                 is_pipe = True
-                self._check_piped_dmatrix(pb_path, pipe_dir, reader, 5, 5, is_pipe)
+                self._check_piped_dmatrix(pb_path, pipe_path, pipe_dir, reader, 5, 5, is_pipe)
 
     def test_parse_sparse_protobuf_dmatrix(self):
         pb_file_paths = ['sparse', 'sparse_edge_cases']
@@ -223,37 +189,3 @@ def test_parse_protobuf_dmatrix_single_feature_label(self):
             pb_path = os.path.join(self.data_path, 'recordio_protobuf', file_path)
             reader = data_utils.get_recordio_protobuf_dmatrix
             self._check_dmatrix(reader, pb_path, 1, 1)
-
-    @patch("logging.warning")
-    def test_check_data_redundancy_positive(self, mock_log_warning):
-        current_path = Path(os.path.abspath(__file__))
-        data_path = os.path.join(str(current_path.parent.parent), 'resources', 'abalone', 'data')
-        file_path = os.path.join(data_path, "train")
-        data_utils.check_data_redundancy(file_path, file_path)
-        mock_log_warning.assert_called()
-
-    @patch("logging.warning")
-    def test_check_data_redundancy_negative(self, mock_log_warning):
-        current_path = Path(os.path.abspath(__file__))
-        data_path = os.path.join(str(current_path.parent.parent), 'resources', 'abalone', 'data')
-        file_path = [os.path.join(data_path, path) for path in ['train', 'validation']]
-        data_utils.check_data_redundancy(file_path[0], file_path[1])
-        mock_log_warning.assert_not_called()
-
-    def test_check_data_redundancy_does_not_throw_exception_file(self):
-        current_path = Path(os.path.abspath(__file__))
-        data_path = os.path.join(str(current_path.parent.parent), 'resources', 'abalone', 'data')
-        file_path = os.path.join(data_path, "train")
-        try:
-            data_utils.check_data_redundancy(file_path, file_path)
-        except Exception as e:
-            assert False, f"check_data_redundancy raised an exception {e} for file mode"
-
-    def test_check_data_redundancy_throws_exception_pipe(self):
-        pb_file_paths = ['pb_files']
-        with self.assertRaises(Exception):
-            data_utils.check_data_redundancy(pb_file_paths[0], pb_file_paths[1])
-
-    def test_pyarrow_to_parquet_conversion_does_not_throw_exception(self):
-        df = pd.DataFrame({'x': [1, 2]})
-        df.to_parquet('test.parquet', engine='pyarrow')
\ No newline at end of file
diff --git a/test/unit/test_encoder.py b/test/unit/test_encoder.py
index bb5ebd1a..7848ea6e 100644
--- a/test/unit/test_encoder.py
+++ b/test/unit/test_encoder.py
@@ -24,7 +24,7 @@
 from sagemaker_xgboost_container import encoder
 
 
-@pytest.mark.parametrize('target', ('42,6,9', '42.0,6.0,9.0', '42\n6\n9\n', b'42,6,9', b'42.0,6.0,9.0', b'42\n6\n9\n'))
+@pytest.mark.parametrize('target', ('42,6,9', '42.0,6.0,9.0', '42\n6\n9\n'))
 def test_csv_to_dmatrix(target):
     actual = encoder.csv_to_dmatrix(target)
     assert type(actual) is xgb.DMatrix
@@ -97,12 +97,6 @@ def test_decode(content_type):
     decoder.assert_called_once_with(42)
 
 
-@pytest.mark.parametrize('content_type', ['text/csv; charset=UTF-8'])
-def test_decode_with_complex_csv_content_type(content_type):
-    dmatrix_result = encoder.decode("42.0,6.0,9.0\n42.0,6.0,9.0", content_type)
-    assert type(dmatrix_result) is xgb.DMatrix
-
-
 def test_encoder_jsonlines_from_json():
     json_response = json.dumps({'predictions': [{"predicted_label": 1, "probabilities": [0.4, 0.6]},
                                                 {"predicted_label": 0, "probabilities": [0.9, 0.1]}]})
@@ -116,4 +110,4 @@
 def test_encoder_jsonlines_from_json_error():
     bad_json_response = json.dumps({'predictions': [], 'metadata': []})
     with pytest.raises(ValueError):
-        encoder.json_to_jsonlines(bad_json_response)
\ No newline at end of file
+        encoder.json_to_jsonlines(bad_json_response)

From 783cb4ae9ad7f030b4dc108e4ef2451d3023160f Mon Sep 17 00:00:00 2001
From: Mark Bunday
Date: Thu, 1 Sep 2022 13:43:52 -0400
Subject: [PATCH 12/12] Update get_recordio_protobuf_dmatrix for mlio-0.7.0

---
 src/sagemaker_xgboost_container/data_utils.py | 22 ++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/src/sagemaker_xgboost_container/data_utils.py b/src/sagemaker_xgboost_container/data_utils.py
index d17efdae..f42344ee 100644
--- a/src/sagemaker_xgboost_container/data_utils.py
+++ b/src/sagemaker_xgboost_container/data_utils.py
@@ -442,40 +442,34 @@ def get_parquet_dmatrix(path, is_pipe=False):
 
 
 def get_recordio_protobuf_dmatrix(path, is_pipe=False):
     """Get Data Matrix from recordio-protobuf data.
-
     :param path: Path where recordio-protobuf formatted training data resides, either directory, file, or SageMaker pipe
     :param is_pipe: Boolean to indicate if data is being read in pipe mode
     :return: xgb.DMatrix or None
     """
     try:
         if is_pipe:
-            dataset = [mlio.SageMakerPipe(path)]
-            reader = mlio.RecordIOProtobufReader(dataset=dataset,
-                                                 batch_size=BATCH_SIZE)
+            pipes_path = path if isinstance(path, list) else [path]
+            dataset = [mlio.SageMakerPipe(pipe_path) for pipe_path in pipes_path]
         else:
             dataset = mlio.list_files(path)
-            reader = mlio.RecordIOProtobufReader(dataset=dataset,
-                                                 batch_size=BATCH_SIZE)
+
+        reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=BATCH_SIZE)
+        reader = mlio.RecordIOProtobufReader(reader_params)
 
         if reader.peek_example() is not None:
             # recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
-            if type(reader.peek_example()['values']) is mlio.core.DenseTensor:
-                to_matrix = as_numpy
-                vstack = np.vstack
-            else:
-                to_matrix = to_coo_matrix
-                vstack = scipy_vstack
+            is_dense_tensor = type(reader.peek_example()['values']) is mlio.DenseTensor
 
             all_features = []
             all_labels = []
             for example in reader:
-                features = to_matrix(example['values'])
+                features = as_numpy(example['values']) if is_dense_tensor else to_coo_matrix(example['values'])
                 all_features.append(features)
 
                 labels = as_numpy(example['label_values'])
                 all_labels.append(labels)
 
-            all_features = vstack(all_features)
+            all_features = np.vstack(all_features) if is_dense_tensor else scipy_vstack(all_features).tocsr()
 
             all_labels = np.concatenate(all_labels, axis=None)
             dmatrix = xgb.DMatrix(all_features, label=all_labels)
             return dmatrix
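
Taken together, patches 09 and 12 move both readers onto the same mlio 0.7 calling convention. A condensed sketch of that convention (file mode, dense tensors only; the input path is a placeholder):

```python
import mlio
import numpy as np
import xgboost as xgb
from mlio.integ.numpy import as_numpy

BATCH_SIZE = 100  # data_utils.py defines its own BATCH_SIZE constant

# mlio 0.7: reader options live in DataReaderParams, and the dense tensor
# type is exposed as mlio.DenseTensor (formerly mlio.core.DenseTensor).
dataset = mlio.list_files("/opt/ml/input/data/train")  # placeholder path
reader = mlio.RecordIOProtobufReader(mlio.DataReaderParams(dataset=dataset, batch_size=BATCH_SIZE))

if reader.peek_example() is not None:
    features, labels = [], []
    for example in reader:
        features.append(as_numpy(example["values"]))   # sparse data would use to_coo_matrix instead
        labels.append(as_numpy(example["label_values"]))
    dmatrix = xgb.DMatrix(np.vstack(features), label=np.concatenate(labels, axis=None))
    print(dmatrix.num_row(), "rows loaded")
```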