From 1d60071280e1dba8f164ceec70b0aa520eacfdb9 Mon Sep 17 00:00:00 2001
From: Nikhil Raverkar
Date: Fri, 9 Sep 2022 13:48:03 -0400
Subject: [PATCH] Updated MLIO version and added buildspec

---
 ci/buildspec.yml                              | 60 +++++++++++++++++++
 docker/1.0-1/base/Dockerfile.cpu              | 11 ++--
 requirements.txt                              | 26 ++++----
 src/sagemaker_xgboost_container/data_utils.py | 21 ++++---
 src/sagemaker_xgboost_container/encoder.py    |  5 +-
 test-requirements.txt                         |  1 +
 tox.ini                                       |  7 ++-
 7 files changed, 101 insertions(+), 30 deletions(-)
 create mode 100644 ci/buildspec.yml

diff --git a/ci/buildspec.yml b/ci/buildspec.yml
new file mode 100644
index 00000000..3226777f
--- /dev/null
+++ b/ci/buildspec.yml
@@ -0,0 +1,60 @@
+version: 0.2
+
+phases:
+  install:
+    runtime-versions:
+      python: 3.8
+      docker: 19
+  pre_build:
+    commands:
+      - echo Pre-build started on `date`
+      - echo Installing dependencies...
+      - curl -LO https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+      - bash Miniconda3-latest-Linux-x86_64.sh -bfp /miniconda3
+      - export PATH=/miniconda3/bin:${PATH}
+      - conda install python=3.8
+      - conda update -y conda
+      - python3 -m pip install pip==20.1
+      - python3 -m pip install .[test]
+  build:
+    commands:
+      - echo Build started on `date`
+      - echo Docker login...
+      - docker login -u $dockerhub_username -p $dockerhub_password
+      - echo Building the Docker image...
+      - docker build -t xgboost-container-base:$FRAMEWORK_VERSION-cpu-py3 -f docker/$FRAMEWORK_VERSION/base/Dockerfile.cpu .
+      - python3 setup.py bdist_wheel --universal
+      - docker build -t preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 -f docker/$FRAMEWORK_VERSION/final/Dockerfile.cpu .
+      - printf "FROM preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3\nADD . /app\nWORKDIR /app\nRUN python3 -m pip install .[test]" > Dockerfile.test
+      - docker build -t test-xgboost-container -f Dockerfile.test .
+      - echo Running tox...
+      - docker run --rm -t test-xgboost-container sh -c 'tox -e ALL'
+      - echo Running container tests...
+      - pytest test/integration/local --docker-base-name preprod-xgboost-container --tag $FRAMEWORK_VERSION-cpu-py3 --py-version 3 --framework-version $FRAMEWORK_VERSION
+      - docker tag preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION-cpu-py3
+      - docker tag preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION
+  post_build:
+    commands:
+      - echo Build completed on `date`
+      - |
+        case $CODEBUILD_WEBHOOK_EVENT in
+          PULL_REQUEST_MERGED)
+            echo Logging in to Amazon ECR...
+            $(aws ecr get-login --no-include-email --region $AWS_DEFAULT_REGION)
+            echo Pushing the Docker image...
+            docker push $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION-cpu-py3 | grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"
+            docker push $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION | grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"
+            ;;
+          PULL_REQUEST_CREATED | PULL_REQUEST_UPDATED | PULL_REQUEST_REOPENED)
+            echo Logging in to Amazon ECR...
+            $(aws ecr get-login --no-include-email --region $AWS_DEFAULT_REGION)
+            echo Pushing the Docker image...
+            # pushes a test tag for manual verification; requires occasional cleanup in ECR
+            TEST_TAG=$SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:${FRAMEWORK_VERSION}-cpu-py3-test
+            docker tag preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 ${TEST_TAG}
+            docker push ${TEST_TAG} | grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"
+            ;;
+          *)
+            echo Undefined behavior for webhook event type $CODEBUILD_WEBHOOK_EVENT
+            ;;
+        esac
\ No newline at end of file
diff --git a/docker/1.0-1/base/Dockerfile.cpu b/docker/1.0-1/base/Dockerfile.cpu
index 9cd923ac..5c452cb5 100644
--- a/docker/1.0-1/base/Dockerfile.cpu
+++ b/docker/1.0-1/base/Dockerfile.cpu
@@ -1,5 +1,8 @@
 FROM ubuntu:16.04
 
+ARG PYARROW_VERSION=0.16.0
+ARG MLIO_VERSION=0.6.0
+ARG PYTHON_VERSION=3.7.10
 # Install python and other runtime dependencies
 RUN apt-get update && \
     apt-get -y install \
@@ -25,11 +28,11 @@ RUN echo 'installing miniconda' && \
 
 ENV PATH=/miniconda3/bin:${PATH}
 
-RUN conda install -c conda-forge python=3.6.13 && \
+RUN conda install -c conda-forge python=${PYTHON_VERSION} && \
     conda update -y conda && \
-    conda install pip=20.1 && \
-    conda install -c conda-forge pyarrow=0.14.1 && \
-    conda install -c mlio -c conda-forge mlio-py=0.1
+    python3 -m pip install --upgrade pip && \
+    conda install -c conda-forge pyarrow=${PYARROW_VERSION} && \
+    conda install -c mlio -c conda-forge mlio-py=${MLIO_VERSION}
 
 # Python won’t try to write .pyc or .pyo files on the import of source modules
 # Force stdin, stdout and stderr to be totally unbuffered. Good for logging
diff --git a/requirements.txt b/requirements.txt
index 9529a1da..ebb205ad 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,21 +1,27 @@
 Flask==1.1.1 # sagemaker-containers requires flask 1.1.1
 PyYAML==5.4.1
-boto3==1.10.14
-botocore==1.13.14
-gunicorn<20.0.0
-cryptography==3.4.6
+Pillow==9.1.0
+boto3==1.17.52
+botocore==1.20.52
+cryptography==35.0.0
+gunicorn==19.10.0
 matplotlib==3.3.2
 multi-model-server==1.1.1
 numpy==1.19.2
 pandas==1.1.3
+protobuf==3.20.1
 psutil==5.6.7 # sagemaker-containers requires psutil 5.6.7
 python-dateutil==2.8.0
-requests<2.21
+requests==2.25.1
 retrying==1.3.3
-sagemaker-containers>=2.8.3,<2.9
+sagemaker-containers==2.8.6.post2
 sagemaker-inference==1.2.0
 scikit-learn==0.23.2
-scipy==1.2.2
-smdebug==0.4.13
-urllib3==1.25.9
-wheel
+scipy==1.5.3
+smdebug==1.0.10
+urllib3==1.26.5
+wheel==0.35.1
+jinja2==3.0.3
+itsdangerous==2.0.1
+MarkupSafe==2.1.1
+Werkzeug==0.15.6
\ No newline at end of file
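The data_utils.py and encoder.py hunks below migrate from the mlio 0.1 reader API, where CsvReader and RecordIOProtobufReader took dataset/batch_size keyword arguments, to mlio 0.6, where a reader is built from a DataReaderParams object (plus a CsvParams object for CSV options). A minimal sketch of the new CSV construction, assuming mlio-py 0.6 and a hypothetical pipe path and batch size:

    import mlio

    BATCH_SIZE = 4000  # stand-in for the container's batch-size constant
    dataset = [mlio.SageMakerPipe("/opt/ml/input/data/train")]  # hypothetical channel path

    # mlio 0.6 moves reader options into parameter objects.
    reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=BATCH_SIZE)
    csv_params = mlio.CsvParams(header_row_index=None)  # input has no header row
    reader = mlio.CsvReader(reader_params, csv_params)

    # peek_example() returns None when the channel delivered no data.
    if reader.peek_example() is not None:
        for example in reader:
            pass  # each example holds one batch of parsed columns
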
diff --git a/src/sagemaker_xgboost_container/data_utils.py b/src/sagemaker_xgboost_container/data_utils.py
index d17efdae..f9ff24dc 100644
--- a/src/sagemaker_xgboost_container/data_utils.py
+++ b/src/sagemaker_xgboost_container/data_utils.py
@@ -299,10 +299,11 @@ def _get_csv_dmatrix_pipe_mode(pipe_path, csv_weights):
     :return: xgb.DMatrix or None
     """
     try:
-        dataset = [mlio.SageMakerPipe(pipe_path)]
-        reader = mlio.CsvReader(dataset=dataset,
-                                batch_size=BATCH_SIZE,
-                                header_row_index=None)
+        pipes_path = pipe_path if isinstance(pipe_path, list) else [pipe_path]
+        dataset = [mlio.SageMakerPipe(path) for path in pipes_path]
+        reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=BATCH_SIZE)
+        csv_params = mlio.CsvParams(header_row_index=None)
+        reader = mlio.CsvReader(reader_params, csv_params)
 
         # Check if data is present in reader
         if reader.peek_example() is not None:
@@ -449,17 +450,15 @@ def get_recordio_protobuf_dmatrix(path, is_pipe=False):
     """
     try:
         if is_pipe:
-            dataset = [mlio.SageMakerPipe(path)]
-            reader = mlio.RecordIOProtobufReader(dataset=dataset,
-                                                 batch_size=BATCH_SIZE)
+            pipes_path = path if isinstance(path, list) else [path]
+            dataset = [mlio.SageMakerPipe(pipe_path) for pipe_path in pipes_path]
         else:
             dataset = mlio.list_files(path)
-            reader = mlio.RecordIOProtobufReader(dataset=dataset,
-                                                 batch_size=BATCH_SIZE)
-
+        reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=BATCH_SIZE)
+        reader = mlio.RecordIOProtobufReader(reader_params)
         if reader.peek_example() is not None:
             # recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
-            if type(reader.peek_example()['values']) is mlio.core.DenseTensor:
+            if type(reader.peek_example()['values']) is mlio.DenseTensor:
                 to_matrix = as_numpy
                 vstack = np.vstack
             else:
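The recordio-protobuf path above makes the same parameter-object switch, and the dense-tensor check moves from mlio.core.DenseTensor to the top-level mlio.DenseTensor. A sketch of the dense-versus-sparse branch as the hunk applies it; the as_numpy converter is imported in data_utils.py, while to_coo_matrix and the final DMatrix assembly are assumptions here:

    import mlio
    import numpy as np
    import scipy.sparse
    import xgboost as xgb
    from mlio.integ.numpy import as_numpy
    from mlio.integ.scipy import to_coo_matrix  # assumed sparse converter

    reader_params = mlio.DataReaderParams(dataset=mlio.list_files("/tmp/data"),  # hypothetical path
                                          batch_size=4000)
    reader = mlio.RecordIOProtobufReader(reader_params)

    if reader.peek_example() is not None:
        # Dense tensors convert through numpy, sparse ones through scipy.
        if type(reader.peek_example()['values']) is mlio.DenseTensor:
            to_matrix, vstack = as_numpy, np.vstack
        else:
            to_matrix, vstack = to_coo_matrix, scipy.sparse.vstack
        batches = [to_matrix(example['values']) for example in reader]
        dmatrix = xgb.DMatrix(vstack(batches))
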
diff --git a/src/sagemaker_xgboost_container/encoder.py b/src/sagemaker_xgboost_container/encoder.py
index 814bbac1..1160e5a2 100644
--- a/src/sagemaker_xgboost_container/encoder.py
+++ b/src/sagemaker_xgboost_container/encoder.py
@@ -83,9 +83,10 @@ def recordio_protobuf_to_dmatrix(string_like):  # type: (bytes) -> xgb.DMatrix
     """
     buf = bytes(string_like)
     dataset = [mlio.InMemoryStore(buf)]
-    reader = mlio.RecordIOProtobufReader(dataset=dataset, batch_size=100)
+    reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=100)
+    reader = mlio.RecordIOProtobufReader(reader_params)
 
-    if type(reader.peek_example()['values']) is mlio.core.DenseTensor:
+    if type(reader.peek_example()['values']) is mlio.DenseTensor:
         to_matrix = as_numpy
         vstack = np.vstack
     else:
diff --git a/test-requirements.txt b/test-requirements.txt
index 2a0248cb..73bd33f2 100644
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -1,3 +1,4 @@
+Flask==1.1.1
 coverage
 docker-compose
 flake8
diff --git a/tox.ini b/tox.ini
index 70c3b119..a7f880b1 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = {py36}-xgboost{1.0},flake8
+envlist = {py37}-xgboost{1.0},flake8
 
 [flake8]
 max-line-length = 120
@@ -15,8 +15,9 @@ deps =
     -r{toxinidir}/requirements.txt
     -r{toxinidir}/test-requirements.txt
 conda_deps=
-    pyarrow=0.14.1
-    mlio-py=0.1
+    pyarrow==0.16.0
+    mlio-py==0.6.0
+    tbb==2020.2
 conda_channels=
     conda-forge
     mlio
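Beyond the API migration, both pipe-mode readers now accept either a single SageMaker pipe path or a list of them, normalizing to a list before constructing the dataset. A minimal sketch of that normalization, using a hypothetical helper name:

    import mlio

    def make_pipe_dataset(path):
        # Accept one pipe path or a list of them, as the data_utils.py
        # hunks above now do, and build an mlio dataset from it.
        pipe_paths = path if isinstance(path, list) else [path]
        return [mlio.SageMakerPipe(p) for p in pipe_paths]

    # Either form works:
    dataset = make_pipe_dataset("/opt/ml/input/data/train")
    dataset = make_pipe_dataset(["/opt/ml/input/data/train_0", "/opt/ml/input/data/train_1"])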