Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated MLIO version and added buildspec #311

Draft
wants to merge 1 commit into
base: 1.0-1
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions ci/buildspec.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
version: 0.2

# AWS CodeBuild spec for the XGBoost container:
#   pre_build  - installs Miniconda and the package's test dependencies on the build host
#   build      - builds the base/final images, runs tox inside a test image, runs the
#                local integration tests, and tags the final image for ECR
#   post_build - pushes to ECR; which tags are pushed depends on the webhook event
#
# Variables read but not defined here (expected from the CodeBuild project):
#   FRAMEWORK_VERSION, SM_ALPHA, dockerhub_username, dockerhub_password,
#   CODEBUILD_WEBHOOK_EVENT.

phases:
  install:
    runtime-versions:
      python: 3.8
      docker: 19
  pre_build:
    commands:
      - echo Pre-build started on `date`
      - echo Installing dependencies...
      # Fetch the installer over HTTPS from the official host; -f makes curl fail on an
      # HTTP error instead of saving an error page that would then be executed by bash.
      - curl -fLO https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
      - bash Miniconda3-latest-Linux-x86_64.sh -bfp /miniconda3
      - export PATH=/miniconda3/bin:${PATH}
      # -y: never wait for an interactive confirmation in the unattended build.
      - conda install -y python=3.8
      - conda update -y conda
      - python3 -m pip install pip==20.1
      - python3 -m pip install .[test]
  build:
    commands:
      - echo Build started on `date`
      - echo Docker login...
      # Feed the password on stdin so it does not appear in the process argument list.
      - printf '%s' "$dockerhub_password" | docker login -u "$dockerhub_username" --password-stdin
      - echo Building the Docker image...
      - docker build -t xgboost-container-base:$FRAMEWORK_VERSION-cpu-py3 -f docker/$FRAMEWORK_VERSION/base/Dockerfile.cpu .
      - python3 setup.py bdist_wheel --universal
      - docker build -t preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 -f docker/$FRAMEWORK_VERSION/final/Dockerfile.cpu .
      # Throwaway image layering the repo and its test extras on top of the final image.
      - printf "FROM preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3\nADD . /app\nWORKDIR /app\nRUN python3 -m pip install .[test]" > Dockerfile.test
      - docker build -t test-xgboost-container -f Dockerfile.test .
      - echo Running tox...
      - docker run --rm -t test-xgboost-container sh -c 'tox -e ALL'
      - echo Running container tests...
      - pytest test/integration/local --docker-base-name preprod-xgboost-container --tag $FRAMEWORK_VERSION-cpu-py3 --py-version 3 --framework-version $FRAMEWORK_VERSION
      - docker tag preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION-cpu-py3
      - docker tag preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION
  post_build:
    commands:
      - echo Build completed on `date`
      - |
        # ECR login uses `get-login-password` (the older `get-login` is deprecated and
        # absent from AWS CLI v2) and targets us-west-2 explicitly, because every image
        # tagged/pushed below lives on the us-west-2 registry host.
        # NOTE(review): each `docker push ... | grep -v` pipeline reports grep's exit
        # status, so a failed push is not detected here - consider enabling pipefail
        # if the build shell supports it.
        case $CODEBUILD_WEBHOOK_EVENT in
          PULL_REQUEST_MERGED)
            echo Logging in to Amazon ECR...
            aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com
            echo Pushing the Docker image...
            docker push $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION-cpu-py3 | grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"
            docker push $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:$FRAMEWORK_VERSION | grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"
            ;;
          PULL_REQUEST_CREATED | PULL_REQUEST_UPDATED | PULL_REQUEST_REOPENED)
            echo Logging in to Amazon ECR...
            aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin $SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com
            echo Pushing the Docker image...
            # pushes test tag for manual verification, requires cleanup in ECR every once in a while though
            TEST_TAG=$SM_ALPHA.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:${FRAMEWORK_VERSION}-cpu-py3-test
            docker tag preprod-xgboost-container:$FRAMEWORK_VERSION-cpu-py3 ${TEST_TAG}
            docker push ${TEST_TAG} | grep -v -E "[0-9]{12}.dkr.ecr.\S+.amazonaws.com"
            ;;
          *)
            echo Undefined behavior for webhook event type $CODEBUILD_WEBHOOK_EVENT
            ;;
        esac
11 changes: 7 additions & 4 deletions docker/1.0-1/base/Dockerfile.cpu
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
FROM ubuntu:16.04

ARG PYARROW_VERSION=0.16.0
ARG MLIO_VERSION=0.6.0
ARG PYTHON_VERSION=3.7.10
# Install python and other runtime dependencies
RUN apt-get update && \
apt-get -y install \
Expand All @@ -25,11 +28,11 @@ RUN echo 'installing miniconda' && \

ENV PATH=/miniconda3/bin:${PATH}

RUN conda install -c conda-forge python=3.6.13 && \
RUN conda install -c conda-forge python=${PYTHON_VERSION} && \
conda update -y conda && \
conda install pip=20.1 && \
conda install -c conda-forge pyarrow=0.14.1 && \
conda install -c mlio -c conda-forge mlio-py=0.1
python3 -m pip install --upgrade pip && \
conda install -c conda-forge pyarrow=${PYARROW_VERSION} && \
conda install -c mlio -c conda-forge mlio-py=${MLIO_VERSION}

# Python won’t try to write .pyc or .pyo files on the import of source modules
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
Expand Down
26 changes: 16 additions & 10 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,21 +1,27 @@
Flask==1.1.1 # sagemaker-containers requires flask 1.1.1
PyYAML==5.4.1
boto3==1.10.14
botocore==1.13.14
gunicorn<20.0.0
cryptography==3.4.6
Pillow==9.1.0
boto3==1.17.52
botocore==1.20.52
cryptography==35.0.0
gunicorn==19.10.0
matplotlib==3.3.2
multi-model-server==1.1.1
numpy==1.19.2
pandas==1.1.3
protobuf==3.20.1
psutil==5.6.7 # sagemaker-containers requires psutil 5.6.7
python-dateutil==2.8.0
requests<2.21
requests==2.25.1
retrying==1.3.3
sagemaker-containers>=2.8.3,<2.9
sagemaker-containers==2.8.6.post2
sagemaker-inference==1.2.0
scikit-learn==0.23.2
scipy==1.2.2
smdebug==0.4.13
urllib3==1.25.9
wheel
scipy==1.5.3
smdebug==1.0.10
urllib3==1.26.5
wheel==0.35.1
jinja2==3.0.3
itsdangerous==2.0.1
MarkupSafe==2.1.1
Werkzeug==0.15.6
21 changes: 10 additions & 11 deletions src/sagemaker_xgboost_container/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,10 +299,11 @@ def _get_csv_dmatrix_pipe_mode(pipe_path, csv_weights):
:return: xgb.DMatrix or None
"""
try:
dataset = [mlio.SageMakerPipe(pipe_path)]
reader = mlio.CsvReader(dataset=dataset,
batch_size=BATCH_SIZE,
header_row_index=None)
pipes_path = pipe_path if isinstance(pipe_path, list) else [pipe_path]
dataset = [mlio.SageMakerPipe(path) for path in pipes_path]
reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=BATCH_SIZE)
csv_params = mlio.CsvParams(header_row_index=None)
reader = mlio.CsvReader(reader_params, csv_params)

# Check if data is present in reader
if reader.peek_example() is not None:
Expand Down Expand Up @@ -449,17 +450,15 @@ def get_recordio_protobuf_dmatrix(path, is_pipe=False):
"""
try:
if is_pipe:
dataset = [mlio.SageMakerPipe(path)]
reader = mlio.RecordIOProtobufReader(dataset=dataset,
batch_size=BATCH_SIZE)
pipes_path = path if isinstance(path, list) else [path]
dataset = [mlio.SageMakerPipe(pipe_path) for pipe_path in pipes_path]
else:
dataset = mlio.list_files(path)
reader = mlio.RecordIOProtobufReader(dataset=dataset,
batch_size=BATCH_SIZE)

reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=BATCH_SIZE)
reader = mlio.RecordIOProtobufReader(reader_params)
if reader.peek_example() is not None:
# recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
if type(reader.peek_example()['values']) is mlio.core.DenseTensor:
if type(reader.peek_example()['values']) is mlio.DenseTensor:
to_matrix = as_numpy
vstack = np.vstack
else:
Expand Down
5 changes: 3 additions & 2 deletions src/sagemaker_xgboost_container/encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,10 @@ def recordio_protobuf_to_dmatrix(string_like): # type: (bytes) -> xgb.DMatrix
"""
buf = bytes(string_like)
dataset = [mlio.InMemoryStore(buf)]
reader = mlio.RecordIOProtobufReader(dataset=dataset, batch_size=100)
reader_params = mlio.DataReaderParams(dataset=dataset, batch_size=100)
reader = mlio.RecordIOProtobufReader(reader_params)

if type(reader.peek_example()['values']) is mlio.core.DenseTensor:
if type(reader.peek_example()['values']) is mlio.DenseTensor:
to_matrix = as_numpy
vstack = np.vstack
else:
Expand Down
1 change: 1 addition & 0 deletions test-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
Flask==1.1.1
coverage
docker-compose
flake8
Expand Down
7 changes: 4 additions & 3 deletions tox.ini
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[tox]
envlist = {py36}-xgboost{1.0},flake8
envlist = {py37}-xgboost{1.0},flake8

[flake8]
max-line-length = 120
Expand All @@ -15,8 +15,9 @@ deps =
-r{toxinidir}/requirements.txt
-r{toxinidir}/test-requirements.txt
conda_deps=
pyarrow=0.14.1
mlio-py=0.1
pyarrow==0.16.0
mlio-py==0.6.0
tbb==2020.2
conda_channels=
conda-forge
mlio
Expand Down