Skip to content

Commit

Permalink
Updated MLIO version and added buildspec
Browse files Browse the repository at this point in the history
  • Loading branch information
Nikhil Raverkar committed Sep 12, 2022
1 parent 4a77d1d commit 1d60071
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 30 deletions.
60 changes: 60 additions & 0 deletions ci/buildspec.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# AWS CodeBuild buildspec for the XGBoost container.
#
# Builds the base and final CPU images, runs the tox suite inside a throwaway
# test image, runs the local integration tests, and pushes the result to ECR
# depending on the webhook event that triggered the build.
#
# Variables read from the build environment (none are defined in this file):
#   FRAMEWORK_VERSION, SM_ALPHA, dockerhub_username, dockerhub_password,
#   AWS_DEFAULT_REGION, CODEBUILD_WEBHOOK_EVENT
version: 0.2

env:
  # bash is required for `set -o pipefail` in the post_build script.
  shell: bash

phases:
  install:
    runtime-versions:
      python: 3.8
      docker: 19
  pre_build:
    commands:
      - echo Pre-build started on `date`
      - echo Installing dependencies...
      # Fetch the installer over HTTPS: it is executed right after download.
      - curl -LO https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
      - bash Miniconda3-latest-Linux-x86_64.sh -bfp /miniconda3
      - export PATH=/miniconda3/bin:${PATH}
      # -y keeps the install non-interactive; there is nobody to answer a prompt.
      - conda install -y python=3.8
      - conda update -y conda
      - python3 -m pip install pip==20.1
      - python3 -m pip install .[test]
  build:
    commands:
      - echo Build started on `date`
      - echo Docker login...
      # Pass the password on stdin so it never appears in the process list.
      - printf '%s' "$dockerhub_password" | docker login -u "$dockerhub_username" --password-stdin
      - echo Building the Docker image...
      - docker build -t xgboost-container-base:$FRAMEWORK_VERSION-cpu-py3 -f docker/$FRAMEWORK_VERSION/base/Dockerfile.cpu .
      # The wheel is built between the two image builds.
      # NOTE(review): presumably the final Dockerfile copies it from dist/ - confirm.
      - python3 setup.py bdist_wheel --universal
      - docker build -t preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 -f docker/$FRAMEWORK_VERSION/final/Dockerfile.cpu .
      # Layer the repository and its test extras on top of the preprod image.
      - printf "FROM preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3\nADD . /app\nWORKDIR /app\nRUN python3 -m pip install .[test]" > Dockerfile.test
      - docker build -t test-xgboost-container -f Dockerfile.test .
      - echo Running tox...
      - docker run --rm -t test-xgboost-container sh -c 'tox -e ALL'
      - echo Running container tests...
      - pytest test/integration/local --docker-base-name preprod-xgboost-container --tag $FRAMEWORK_VERSION-cpu-py3 --py-version 3 --framework-version $FRAMEWORK_VERSION
      # Tag for ECR only after every test above has passed.
      - docker tag preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION-cpu-py3
      - docker tag preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION
  post_build:
    commands:
      - echo Build completed on `date`
      # NOTE(review): `aws ecr get-login` exists only in AWS CLI v1. If the build
      # image ships CLI v2, switch to
      # `aws ecr get-login-password | docker login --username AWS --password-stdin <registry>`.
      # NOTE(review): CodeBuild runs post_build even when the build phase failed;
      # consider guarding the pushes with CODEBUILD_BUILD_SUCCEEDING.
      - |
        (
          # Run in a subshell so the strict options do not leak into the build
          # shell. pipefail makes a failed `docker push` fail the pipeline even
          # though grep (used to keep the registry host out of the log) is the
          # last command in it; -e stops at the first failing step.
          set -eo pipefail
          case $CODEBUILD_WEBHOOK_EVENT in
            PULL_REQUEST_MERGED)
              echo Logging in to Amazon ECR...
              $(aws ecr get-login --no-include-email --region $AWS_DEFAULT_REGION)
              echo Pushing the Docker image...
              docker push $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION-cpu-py3 | grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"
              docker push $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION | grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"
              ;;
            PULL_REQUEST_CREATED | PULL_REQUEST_UPDATED | PULL_REQUEST_REOPENED)
              echo Logging in to Amazon ECR...
              $(aws ecr get-login --no-include-email --region $AWS_DEFAULT_REGION)
              echo Pushing the Docker image...
              # pushes test tag for manual verification, requires cleanup in ECR every once in a while though
              TEST_TAG=$SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:${FRAMEWORK_VERSION}-cpu-py3-test
              docker tag preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 ${TEST_TAG}
              docker push ${TEST_TAG} | grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"
              ;;
            *)
              echo Undefined behavior for webhook event type $CODEBUILD_WEBHOOK_EVENT
              ;;
          esac
        )
11 changes: 7 additions & 4 deletions docker/1.0-1/base/Dockerfile.cpu
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
FROM ubuntu:16.04

ARG PYARROW_VERSION=0.16.0
ARG MLIO_VERSION=0.6.0
ARG PYTHON_VERSION=3.7.10
# Install python and other runtime dependencies
RUN apt-get update && \
apt-get -y install \
Expand All @@ -25,11 +28,11 @@ RUN echo 'installing miniconda' && \

ENV PATH=/miniconda3/bin:${PATH}

RUN conda install -c conda-forge python=3.6.13 && \
RUN conda install -c conda-forge python=${PYTHON_VERSION} && \
conda update -y conda && \
conda install pip=20.1 && \
conda install -c conda-forge pyarrow=0.14.1 && \
conda install -c mlio -c conda-forge mlio-py=0.1
python3 -m pip install --upgrade pip && \
conda install -c conda-forge pyarrow=${PYARROW_VERSION} && \
conda install -c mlio -c conda-forge mlio-py=${MLIO_VERSION}

# Python won’t try to write .pyc or .pyo files on the import of source modules
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
Expand Down
26 changes: 16 additions & 10 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,21 +1,27 @@
Flask==1.1.1 # sagemaker-containers requires flask 1.1.1
PyYAML==5.4.1
boto3==1.10.14
botocore==1.13.14
gunicorn<20.0.0
cryptography==3.4.6
Pillow==9.1.0
boto3==1.17.52
botocore==1.20.52
cryptography==35.0.0
gunicorn==19.10.0
matplotlib==3.3.2
multi-model-server==1.1.1
numpy==1.19.2
pandas==1.1.3
protobuf==3.20.1
psutil==5.6.7 # sagemaker-containers requires psutil 5.6.7
python-dateutil==2.8.0
requests<2.21
requests==2.25.1
retrying==1.3.3
sagemaker-containers>=2.8.3,<2.9
sagemaker-containers==2.8.6.post2
sagemaker-inference==1.2.0
scikit-learn==0.23.2
scipy==1.2.2
smdebug==0.4.13
urllib3==1.25.9
wheel
scipy==1.5.3
smdebug==1.0.10
urllib3==1.26.5
wheel==0.35.1
jinja2==3.0.3
itsdangerous==2.0.1
MarkupSafe==2.1.1
Werkzeug==0.15.6
21 changes: 10 additions & 11 deletions src/sagemaker_xgboost_container/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,10 +299,11 @@ def _get_csv_dmatrix_pipe_mode(pipe_path, csv_weights):
:return: xgb.DMatrix or None
"""
try:
dataset = [mlio.SageMakerPipe(pipe_path)]
reader = mlio.CsvReader(dataset=dataset,
batch_size=BATCH_SIZE,
header_row_index=None)
pipes_path = pipe_path if isinstance(pipe_path, list) else [pipe_path]
dataset = [mlio.SageMakerPipe(path) for path in pipes_path]
reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=BATCH_SIZE)
csv_params = mlio.CsvParams(header_row_index=None)
reader = mlio.CsvReader(reader_params, csv_params)

# Check if data is present in reader
if reader.peek_example() is not None:
Expand Down Expand Up @@ -449,17 +450,15 @@ def get_recordio_protobuf_dmatrix(path, is_pipe=False):
"""
try:
if is_pipe:
dataset = [mlio.SageMakerPipe(path)]
reader = mlio.RecordIOProtobufReader(dataset=dataset,
batch_size=BATCH_SIZE)
pipes_path = path if isinstance(path, list) else [path]
dataset = [mlio.SageMakerPipe(pipe_path) for pipe_path in pipes_path]
else:
dataset = mlio.list_files(path)
reader = mlio.RecordIOProtobufReader(dataset=dataset,
batch_size=BATCH_SIZE)

reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=BATCH_SIZE)
reader = mlio.RecordIOProtobufReader(reader_params)
if reader.peek_example() is not None:
# recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
if type(reader.peek_example()['values']) is mlio.core.DenseTensor:
if type(reader.peek_example()['values']) is mlio.DenseTensor:
to_matrix = as_numpy
vstack = np.vstack
else:
Expand Down
5 changes: 3 additions & 2 deletions src/sagemaker_xgboost_container/encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,10 @@ def recordio_protobuf_to_dmatrix(string_like): # type: (bytes) -> xgb.DMatrix
"""
buf = bytes(string_like)
dataset = [mlio.InMemoryStore(buf)]
reader = mlio.RecordIOProtobufReader(dataset=dataset, batch_size=100)
reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=100)
reader = mlio.RecordIOProtobufReader(reader_params)

if type(reader.peek_example()['values']) is mlio.core.DenseTensor:
if type(reader.peek_example()['values']) is mlio.DenseTensor:
to_matrix = as_numpy
vstack = np.vstack
else:
Expand Down
1 change: 1 addition & 0 deletions test-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
Flask==1.1.1
coverage
docker-compose
flake8
Expand Down
7 changes: 4 additions & 3 deletions tox.ini
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[tox]
envlist = {py36}-xgboost{1.0},flake8
envlist = {py37}-xgboost{1.0},flake8

[flake8]
max-line-length = 120
Expand All @@ -15,8 +15,9 @@ deps =
-r{toxinidir}/requirements.txt
-r{toxinidir}/test-requirements.txt
conda_deps=
pyarrow=0.14.1
mlio-py=0.1
pyarrow==0.16.0
mlio-py==0.6.0
tbb==2020.2
conda_channels=
conda-forge
mlio
Expand Down

0 comments on commit 1d60071

Please sign in to comment.