Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bump ubuntu image version from 16.04 to 18.04 #304

Draft
wants to merge 12 commits into
base: 1.0-1
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions ci/buildspec.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
version: 0.2

phases:
  install:
    runtime-versions:
      # Quoted so YAML does not read the version as a float
      # (e.g. a future "3.10" would otherwise parse as 3.1).
      python: "3.8"
      docker: 19
  pre_build:
    commands:
      - echo Pre-build started on `date`
      - echo Installing dependencies...
      # Fetch Miniconda over HTTPS from the official Anaconda host;
      # http://repo.continuum.io is the legacy host and serves plain HTTP.
      - curl -LO https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
      - bash Miniconda3-latest-Linux-x86_64.sh -bfp /miniconda3
      - export PATH=/miniconda3/bin:${PATH}
      - conda install python=3.8
      - conda update -y conda
      # The new pip dependency resolver in 20.2+ can't resolve 1.0-1 and 0.90 dependencies
      - python3 -m pip install pip==20.1
      - python3 -m pip install .[test]
  build:
    commands:
      - echo Build started on `date`
      - echo Docker login...
      # --password-stdin keeps the credential out of `ps` output and build logs;
      # `docker login -p` itself warns that the flag is insecure.
      - echo "$dockerhub_password" | docker login -u "$dockerhub_username" --password-stdin
      - echo Building the Docker image...
      - docker build -t xgboost-container-base:$FRAMEWORK_VERSION-cpu-py3 -f docker/$FRAMEWORK_VERSION/base/Dockerfile.cpu .
      - python3 setup.py bdist_wheel --universal
      - docker build -t preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 -f docker/$FRAMEWORK_VERSION/final/Dockerfile.cpu .
      - echo Running tox...
      # Build a throwaway test image on top of the preprod image so unit tests
      # and flake8 run against the exact container contents.
      - printf "FROM preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3\nADD . /app\nWORKDIR /app\nRUN python3 -m pip install .[test]" > Dockerfile.test
      - docker build -t test-xgboost-container -f Dockerfile.test .
      - docker run --rm -t test-xgboost-container sh -c 'pytest --cov=sagemaker_xgboost_container --cov-fail-under=60 test/unit'
      - docker run --rm -t test-xgboost-container sh -c 'flake8 setup.py src test'
      - echo Running container tests...
      - pytest test/integration/local --docker-base-name preprod-xgboost-container --tag $FRAMEWORK_VERSION-cpu-py3 --py-version 3 --framework-version $FRAMEWORK_VERSION
      - docker tag preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION-cpu-py3
      - docker tag preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION
  post_build:
    commands:
      - echo Build completed on `date`
      # Which tags get pushed depends on the webhook event that triggered the build.
      - |
        case $CODEBUILD_WEBHOOK_EVENT in
          PULL_REQUEST_MERGED)
            echo Logging in to Amazon ECR...
            $(aws ecr get-login --no-include-email --region $AWS_DEFAULT_REGION)
            echo Pushing the Docker image...
            # grep -v keeps the account-qualified registry URL out of the build log.
            docker push $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION-cpu-py3 | grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"
            docker push $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION | grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"
            ;;
          PULL_REQUEST_CREATED | PULL_REQUEST_UPDATED | PULL_REQUEST_REOPENED)
            echo Logging in to Amazon ECR...
            $(aws ecr get-login --no-include-email --region $AWS_DEFAULT_REGION)
            echo Pushing the Docker image...
            # Pushes a -test tag for manual verification; requires cleanup in ECR every once in a while though.
            TEST_TAG=$SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:${FRAMEWORK_VERSION}-cpu-py3-test
            docker tag preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 ${TEST_TAG}
            docker push ${TEST_TAG} | grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"
            ;;
          *)
            echo Undefined behavior for webhook event type $CODEBUILD_WEBHOOK_EVENT
            ;;
        esac
142 changes: 112 additions & 30 deletions docker/1.0-1/base/Dockerfile.cpu
Original file line number Diff line number Diff line change
@@ -1,41 +1,123 @@
FROM ubuntu:16.04
ARG UBUNTU_VERSION=18.04
ARG CUDA_VERSION=10.2
ARG IMAGE_DIGEST=218afa9c2002be9c4629406c07ae4daaf72a3d65eb3c5a5614d9d7110840a46e

# Install python and other runtime dependencies
RUN apt-get update && \
apt-get -y install \
build-essential \
libatlas-dev \
git \
wget \
curl \
nginx \
jq

RUN apt-get update
RUN apt-get clean
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${UBUNTU_VERSION}@sha256:${IMAGE_DIGEST}

RUN apt-get -y install openjdk-8-jdk-headless
ARG MINICONDA_VERSION=4.9.2
ARG CONDA_PY_VERSION=39
ARG CONDA_CHECKSUM="b4e46fcc8029e2cfa731b788f25b1d36"
ARG CONDA_PKG_VERSION=4.10.1
ARG PYTHON_VERSION=3.8.13
ARG PYARROW_VERSION=1.0
ARG MLIO_VERSION=0.7.0
ARG XGBOOST_VERSION=1.0

# Install mlio
RUN echo 'installing miniconda' && \
curl -LO https://repo.anaconda.com/miniconda/Miniconda3-py38_4.8.3-Linux-x86_64.sh && \
echo "d63adf39f2c220950a063e0529d4ff74 Miniconda3-py38_4.8.3-Linux-x86_64.sh" | md5sum -c - && \
bash Miniconda3-py38_4.8.3-Linux-x86_64.sh -bfp /miniconda3 && \
rm Miniconda3-py38_4.8.3-Linux-x86_64.sh

ENV PATH=/miniconda3/bin:${PATH}

RUN conda install -c conda-forge python=3.6.13 && \
conda update -y conda && \
conda install pip=20.1 && \
conda install -c conda-forge pyarrow=0.14.1 && \
conda install -c mlio -c conda-forge mlio-py=0.1
ENV DEBIAN_FRONTEND=noninteractive
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8

# Python won’t try to write .pyc or .pyo files on the import of source modules
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PYTHONIOENCODING='utf-8'

RUN rm /etc/apt/sources.list.d/cuda.list && \
rm /etc/apt/sources.list.d/nvidia-ml.list && \
apt-key del 7fa2af80 && \
apt-get update && apt-get install -y --no-install-recommends wget && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
dpkg -i cuda-keyring_1.0-1_all.deb && \
apt-get update && \
apt-get -y upgrade && \
apt-get -y install --no-install-recommends \
build-essential \
curl \
git \
jq \
libatlas-base-dev \
nginx \
openjdk-8-jdk-headless \
unzip \
wget \
&& \
# MLIO build dependencies
# Official Ubuntu APT repositories do not contain an up-to-date version of CMake required to build MLIO.
# Kitware contains the latest version of CMake.
apt-get -y install --no-install-recommends \
apt-transport-https \
ca-certificates \
gnupg \
software-properties-common \
&& \
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | \
gpg --dearmor - | \
tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null && \
echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ bionic main' | tee /etc/apt/sources.list.d/kitware.list >/dev/null && \
apt-get update && \
rm /usr/share/keyrings/kitware-archive-keyring.gpg && \
apt-get install -y --no-install-recommends \
autoconf \
automake \
build-essential \
cmake=3.18.4-0kitware1 \
cmake-data=3.18.4-0kitware1 \
doxygen \
kitware-archive-keyring \
libcurl4-openssl-dev \
libssl-dev \
libtool \
ninja-build \
python3-dev \
python3-distutils \
python3-pip \
zlib1g-dev \
&& \
rm -rf /var/lib/apt/lists/*

# Install conda
RUN cd /tmp && \
curl -L --output /tmp/Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-py${CONDA_PY_VERSION}_${MINICONDA_VERSION}-Linux-x86_64.sh && \
echo "${CONDA_CHECKSUM} /tmp/Miniconda3.sh" | md5sum -c - && \
bash /tmp/Miniconda3.sh -bfp /miniconda3 && \
rm /tmp/Miniconda3.sh

ENV PATH=/miniconda3/bin:${PATH}

# Install MLIO with Apache Arrow integration
# We could install mlio-py from conda, but it comes with extra support such as image reader that increases image size
# which increases training time. We build from source to minimize the image size.
RUN echo "conda ${CONDA_PKG_VERSION}" >> /miniconda3/conda-meta/pinned && \
# Conda configuration see https://conda.io/projects/conda/en/latest/configuration.html
conda config --system --set auto_update_conda false && \
conda config --system --set show_channel_urls true && \
echo "python ${PYTHON_VERSION}.*" >> /miniconda3/conda-meta/pinned && \
conda install -c conda-forge python=${PYTHON_VERSION} && \
conda install conda=${CONDA_PKG_VERSION} && \
conda update -y conda && \
conda install -c conda-forge pyarrow=${PYARROW_VERSION} && \
cd /tmp && \
git clone --branch v${MLIO_VERSION} https://github.com/awslabs/ml-io.git mlio && \
cd mlio && \
build-tools/build-dependency build/third-party all && \
mkdir -p build/release && \
cd build/release && \
cmake -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH="$(pwd)/../third-party" ../.. && \
cmake --build . && \
cmake --build . --target install && \
cmake -DMLIO_INCLUDE_PYTHON_EXTENSION=ON -DPYTHON_EXECUTABLE="/miniconda3/bin/python3" \
-DMLIO_INCLUDE_ARROW_INTEGRATION=ON ../.. && \
cmake --build . --target mlio-py && \
cmake --build . --target mlio-arrow && \
cd ../../src/mlio-py && \
python3 setup.py bdist_wheel && \
python3 -m pip install typing && \
python3 -m pip install --upgrade pip && \
python3 -m pip install dist/*.whl && \
cp -r /tmp/mlio/build/third-party/lib/intel64/gcc4.7/* /usr/local/lib/ && \
ldconfig && \
rm -rf /tmp/mlio

# Install latest version of XGBoost
RUN python3 -m pip install --no-cache -I xgboost==1.0
RUN python3 -m pip install --no-cache -I xgboost==${XGBOOST_VERSION}
22 changes: 13 additions & 9 deletions docker/1.0-1/final/Dockerfile.cpu
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
FROM xgboost-container-base:1.0-1-cpu-py3
ENV SAGEMAKER_XGBOOST_VERSION 1.0-1
ARG SAGEMAKER_XGBOOST_VERSION=1.0-1
ARG PYTHON_VERSION=3.8

FROM xgboost-container-base:${SAGEMAKER_XGBOOST_VERSION}-cpu-py3

ARG SAGEMAKER_XGBOOST_VERSION

########################
# Install dependencies #
Expand All @@ -11,20 +15,20 @@ RUN python3 -m pip install -r /requirements.txt && rm /requirements.txt
# Copy wheel to container #
###########################
COPY dist/sagemaker_xgboost_container-2.0-py2.py3-none-any.whl /sagemaker_xgboost_container-1.0-py2.py3-none-any.whl
# https://github.com/googleapis/google-cloud-python/issues/6647
RUN rm -rf /miniconda3/lib/python3.6/site-packages/numpy-1.19.5.dist-info && \
RUN rm -rf /miniconda3/lib/python3.8/site-packages/numpy-1.21.2.dist-info && \
python3 -m pip install --no-cache /sagemaker_xgboost_container-1.0-py2.py3-none-any.whl && \
python3 -m pip uninstall -y typing && \
rm /sagemaker_xgboost_container-1.0-py2.py3-none-any.whl

##############
# DMLC PATCH #
##############
# TODO: remove after making contributions back to xgboost for tracker.py
COPY src/sagemaker_xgboost_container/dmlc_patch/tracker.py \
/miniconda3/lib/python3.6/site-packages/xgboost/dmlc-core/tracker/dmlc_tracker/tracker.py
/miniconda3/lib/python${PYTHON_VERSION}/site-packages/xgboost/dmlc-core/tracker/dmlc_tracker/tracker.py

# Include DMLC python code in PYTHONPATH to use RabitTracker
ENV PYTHONPATH=$PYTHONPATH:/miniconda3/lib/python3.6/site-packages/xgboost/dmlc-core/tracker
ENV PYTHONPATH=$PYTHONPATH:/miniconda3/lib/python${PYTHON_VERSION}/site-packages/xgboost/dmlc-core/tracker

#######
# MMS #
Expand All @@ -34,12 +38,12 @@ RUN useradd -m model-server
RUN mkdir -p /home/model-server/tmp && chown -R model-server /home/model-server

# Copy MMS configs
COPY docker/$SAGEMAKER_XGBOOST_VERSION/resources/mms/config.properties.tmp /home/model-server
COPY docker/${SAGEMAKER_XGBOOST_VERSION}/resources/mms/config.properties.tmp /home/model-server
ENV XGBOOST_MMS_CONFIG=/home/model-server/config.properties

# Copy execution parameters endpoint plugin for MMS
RUN mkdir -p /tmp/plugins
COPY docker/$SAGEMAKER_XGBOOST_VERSION/resources/mms/endpoints-1.0.jar /tmp/plugins
COPY docker/${SAGEMAKER_XGBOOST_VERSION}/resources/mms/endpoints-1.0.jar /tmp/plugins
RUN chmod +x /tmp/plugins/endpoints-1.0.jar

# Create directory for models
Expand Down Expand Up @@ -67,4 +71,4 @@ ENV SAGEMAKER_SERVING_MODULE sagemaker_xgboost_container.serving:main

EXPOSE 8080
ENV TEMP=/home/model-server/tmp
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
38 changes: 22 additions & 16 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,21 +1,27 @@
Flask==1.1.1 # sagemaker-containers requires flask 1.1.1
PyYAML==5.4.1
boto3==1.10.14
botocore==1.13.14
gunicorn<20.0.0
cryptography==3.4.6
matplotlib==3.3.2
multi-model-server==1.1.1
Pillow==9.1.0
boto3==1.17.52
botocore==1.20.52
cryptography==35.0.0
gunicorn==19.10.0
itsdangerous==2.0.1
matplotlib==3.4.1
multi-model-server==1.1.2
numpy==1.19.2
pandas==1.1.3
pandas==1.2.4
protobuf==3.20.1
psutil==5.6.7 # sagemaker-containers requires psutil 5.6.7
python-dateutil==2.8.0
requests<2.21
python-dateutil==2.8.1
requests==2.25.1
retrying==1.3.3
sagemaker-containers>=2.8.3,<2.9
sagemaker-inference==1.2.0
scikit-learn==0.23.2
scipy==1.2.2
smdebug==0.4.13
urllib3==1.25.9
wheel
sagemaker-containers==2.8.6.post2
sagemaker-inference==1.5.5
scikit-learn==0.24.1
scipy==1.6.2
smdebug==1.0.10
urllib3==1.26.5
wheel==0.36.2
jinja2==2.11.3
MarkupSafe==1.1.1
Werkzeug==0.15.6
22 changes: 8 additions & 14 deletions src/sagemaker_xgboost_container/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,40 +442,34 @@ def get_parquet_dmatrix(path, is_pipe=False):

def get_recordio_protobuf_dmatrix(path, is_pipe=False):
"""Get Data Matrix from recordio-protobuf data.

:param path: Path where recordio-protobuf formatted training data resides, either directory, file, or SageMaker pipe
:param is_pipe: Boolean to indicate if data is being read in pipe mode
:return: xgb.DMatrix or None
"""
try:
if is_pipe:
dataset = [mlio.SageMakerPipe(path)]
reader = mlio.RecordIOProtobufReader(dataset=dataset,
batch_size=BATCH_SIZE)
pipes_path = path if isinstance(path, list) else [path]
dataset = [mlio.SageMakerPipe(pipe_path) for pipe_path in pipes_path]
else:
dataset = mlio.list_files(path)
reader = mlio.RecordIOProtobufReader(dataset=dataset,
batch_size=BATCH_SIZE)

reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=BATCH_SIZE)
reader = mlio.RecordIOProtobufReader(reader_params)

if reader.peek_example() is not None:
# recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
if type(reader.peek_example()['values']) is mlio.core.DenseTensor:
to_matrix = as_numpy
vstack = np.vstack
else:
to_matrix = to_coo_matrix
vstack = scipy_vstack
is_dense_tensor = type(reader.peek_example()['values']) is mlio.DenseTensor

all_features = []
all_labels = []
for example in reader:
features = to_matrix(example['values'])
features = as_numpy(example['values']) if is_dense_tensor else to_coo_matrix(example['values'])
all_features.append(features)

labels = as_numpy(example['label_values'])
all_labels.append(labels)

all_features = vstack(all_features)
all_features = np.vstack(all_features) if is_dense_tensor else scipy_vstack(all_features).tocsr()
all_labels = np.concatenate(all_labels, axis=None)
dmatrix = xgb.DMatrix(all_features, label=all_labels)
return dmatrix
Expand Down
Loading