Commit fd0ec0b

navinsoni and navinns authored
Upgrade Spark and PySpark version to 3.3.0 (#146)
Co-authored-by: Navin Soni <[email protected]>
1 parent 0dfda76 commit fd0ec0b

18 files changed, +128 −99 lines

buildspec-deploy.yml

Lines changed: 2 additions & 0 deletions

@@ -18,6 +18,8 @@ phases:

   build:
     commands:
+      - export SBT_OPTS="-Xms1024M -Xmx4G -Xss2M -XX:MaxMetaspaceSize=2G"
+
       # ignore reuse error to allow retry of this build stage
       # when sonatype step has transient error
       - publish-pypi-package --ignore-reuse-error $CODEBUILD_SRC_DIR_ARTIFACT_1/sagemaker-pyspark-sdk/dist/sagemaker_pyspark-*.tar.gz

buildspec-release.yml

Lines changed: 4 additions & 6 deletions

@@ -18,23 +18,21 @@ phases:

   build:
     commands:
+      - export SBT_OPTS="-Xms1024M -Xmx4G -Xss2M -XX:MaxMetaspaceSize=2G"
+
       # prepare the release (update versions, changelog etc.)
       - git-release --prepare

       # spark unit tests and package (no coverage)
       - cd $CODEBUILD_SRC_DIR/sagemaker-spark-sdk
-      - AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_SESSION_TOKEN=
-        AWS_CONTAINER_CREDENTIALS_RELATIVE_URI=
-        sbt -Dsbt.log.noformat=true clean test package
+      - sbt -Dsbt.log.noformat=true clean test package

       # pyspark linters, package and doc build tests
       - cd $CODEBUILD_SRC_DIR/sagemaker-pyspark-sdk
       - tox -e flake8,twine,sphinx

       # pyspark unit tests (no coverage)
-      - AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_SESSION_TOKEN=
-        AWS_CONTAINER_CREDENTIALS_RELATIVE_URI= IGNORE_COVERAGE=-
-        tox -e py27,py36 -- tests/
+      - tox -e py37 -- tests/

       # todo consider adding subset of integration tests

buildspec.yml

Lines changed: 11 additions & 11 deletions

@@ -10,7 +10,7 @@ phases:
       - export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/bin

       # install sbt launcher
-      - curl -LO https://github.com/sbt/sbt/releases/download/v1.1.6/sbt-1.1.6.tgz
+      - curl -LO https://github.com/sbt/sbt/releases/download/v1.7.1/sbt-1.7.1.tgz
       - tar -xf sbt-*.tgz
       - export PATH=$CODEBUILD_SRC_DIR/sbt/bin/:$PATH
       - cd $CODEBUILD_SRC_DIR/sagemaker-spark-sdk

@@ -26,13 +26,13 @@ phases:

   build:
     commands:
+      - export SBT_OPTS="-Xms1024M -Xmx4G -Xss2M -XX:MaxMetaspaceSize=2G"
+
       # build spark sdk first, since pyspark package depends on it (even linters)

       # spark unit tests
       - cd $CODEBUILD_SRC_DIR/sagemaker-spark-sdk
-      - AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_SESSION_TOKEN=
-        AWS_CONTAINER_CREDENTIALS_RELATIVE_URI=
-        sbt -Dsbt.log.noformat=true clean coverage test coverageReport
+      - sbt -Dsbt.log.noformat=true clean coverage test coverageReport

       # rebuild without coverage instrumentation
       - cd $CODEBUILD_SRC_DIR/sagemaker-spark-sdk

@@ -41,16 +41,16 @@ phases:
       # pyspark linters and unit tests
       - cd $CODEBUILD_SRC_DIR/sagemaker-pyspark-sdk
       - tox -e flake8,twine,sphinx
-      - AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_SESSION_TOKEN=
-        AWS_CONTAINER_CREDENTIALS_RELATIVE_URI=
-        tox -e py36,stats -- tests/
+      - tox -e py37,stats -- tests/

       # spark integration tests
       - cd $CODEBUILD_SRC_DIR/integration-tests/sagemaker-spark-sdk
-      - test_cmd="sbt -Dsbt.log.noformat=true it:test"
-      - execute-command-if-has-matching-changes "$test_cmd" "src/" "test/" "build.sbt" "buildspec.yml"
+      - sbt -Dsbt.log.noformat=true it:test
+      # - test_cmd="sbt -Dsbt.log.noformat=true it:test"
+      # - execute-command-if-has-matching-changes "$test_cmd" "src/" "test/" "build.sbt" "buildspec.yml"

       # pyspark integration tests
       - cd $CODEBUILD_SRC_DIR/sagemaker-pyspark-sdk
-      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- $CODEBUILD_SRC_DIR/integration-tests/sagemaker-pyspark-sdk/tests/ -n 10 --boxed --reruns 2"
-      - execute-command-if-has-matching-changes "$test_cmd" "src/" "tests/" "setup.*" "requirements.txt" "tox.ini" "buildspec.yml"
+      - IGNORE_COVERAGE=- tox -e py37 -- $CODEBUILD_SRC_DIR/integration-tests/sagemaker-pyspark-sdk/tests/ -n 10 --boxed --reruns 2
+      # - test_cmd="IGNORE_COVERAGE=- tox -e py37 -- $CODEBUILD_SRC_DIR/integration-tests/sagemaker-pyspark-sdk/tests/ -n 10 --boxed --reruns 2"
+      # - execute-command-if-has-matching-changes "$test_cmd" "src/" "tests/" "setup.*" "requirements.txt" "tox.ini" "buildspec.yml"

sagemaker-pyspark-sdk/setup.py

Lines changed: 25 additions & 23 deletions

@@ -36,17 +36,19 @@ def read_version():
         print("Could not create dir {0}".format(TEMP_PATH), file=sys.stderr)
         exit(1)

-    p = subprocess.Popen("sbt printClasspath".split(),
-                         stdout=subprocess.PIPE,
-                         stderr=subprocess.PIPE,
-                         cwd="../sagemaker-spark-sdk/")
+    p = subprocess.Popen(
+        "sbt printClasspath".split(),
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        cwd="../sagemaker-spark-sdk/",
+    )

     output, errors = p.communicate()

     classpath = []
     # Java Libraries to include.
-    java_libraries = ['aws', 'sagemaker', 'hadoop', 'htrace']
-    for line in output.decode('utf-8').splitlines():
+    java_libraries = ["aws", "sagemaker", "hadoop", "htrace"]
+    for line in output.decode("utf-8").splitlines():
         path = str(line.strip())
         if path.endswith(".jar") and os.path.exists(path):
             jar = os.path.basename(path).lower()

@@ -65,8 +67,10 @@ def read_version():

 else:
     if not os.path.exists(JARS_TARGET):
-        print("You need to be in the sagemaker-pyspark-sdk root folder to package",
-              file=sys.stderr)
+        print(
+            "You need to be in the sagemaker-pyspark-sdk root folder to package",
+            file=sys.stderr,
+        )
         exit(-1)

 setup(

@@ -76,32 +80,30 @@ def read_version():
     author="Amazon Web Services",
     url="https://github.com/aws/sagemaker-spark",
     license="Apache License 2.0",
+    python_requires=">= 3.7",
     zip_safe=False,
-
-    packages=["sagemaker_pyspark",
-              "sagemaker_pyspark.algorithms",
-              "sagemaker_pyspark.transformation",
-              "sagemaker_pyspark.transformation.deserializers",
-              "sagemaker_pyspark.transformation.serializers",
-              "sagemaker_pyspark.jars",
-              "sagemaker_pyspark.licenses"],
-
+    packages=[
+        "sagemaker_pyspark",
+        "sagemaker_pyspark.algorithms",
+        "sagemaker_pyspark.transformation",
+        "sagemaker_pyspark.transformation.deserializers",
+        "sagemaker_pyspark.transformation.serializers",
+        "sagemaker_pyspark.jars",
+        "sagemaker_pyspark.licenses",
+    ],
     package_dir={
         "sagemaker_pyspark": "src/sagemaker_pyspark",
         "sagemaker_pyspark.jars": "deps/jars",
-        "sagemaker_pyspark.licenses": "licenses"
+        "sagemaker_pyspark.licenses": "licenses",
     },
     include_package_data=True,
-
     package_data={
         "sagemaker_pyspark.jars": ["*.jar"],
-        "sagemaker_pyspark.licenses": ["*.txt"]
+        "sagemaker_pyspark.licenses": ["*.txt"],
     },
-
     scripts=["bin/sagemakerpyspark-jars", "bin/sagemakerpyspark-emr-jars"],
-
     install_requires=[
-        "pyspark==2.4.0",
+        "pyspark==3.3.0",
         "numpy",
     ],
 )
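
With the dependency pin moved from pyspark 2.4.0 to 3.3.0, the wheel now bundles jars built for Spark 3.3. Below is a minimal, hedged sketch of how the bundled jars are typically put on a session's classpath; it assumes the package's classpath_jars() helper and a working local Spark 3.3 installation, and is not part of this commit:

```python
# Minimal sketch: place the jars shipped in sagemaker_pyspark.jars on the
# driver and executor classpath before creating a session.
import sagemaker_pyspark
from pyspark.sql import SparkSession

classpath = ":".join(sagemaker_pyspark.classpath_jars())

spark = (
    SparkSession.builder
    .config("spark.driver.extraClassPath", classpath)
    .config("spark.executor.extraClassPath", classpath)
    .getOrCreate()
)
print(spark.version)  # expected to report a 3.3.x build
```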

sagemaker-pyspark-sdk/src/sagemaker_pyspark/algorithms/XGBoostSageMakerEstimator.py

Lines changed: 1 addition & 1 deletion

@@ -380,7 +380,7 @@ def __init__(self,
         if uid is None:
             uid = Identifiable._randomUID()

-        kwargs = locals()
+        kwargs = locals().copy()
         del kwargs['self']
         super(XGBoostSageMakerEstimator, self).__init__(**kwargs)
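
The one-line fix copies the snapshot returned by locals() before pruning it, so del kwargs['self'] only modifies the copy rather than the frame's local-variable mapping, whose mutation behaviour is implementation-defined. A small runnable sketch of the same snapshot-and-prune pattern, using hypothetical argument names rather than the estimator's real signature:

```python
def build_kwargs_demo(training_instance_type, training_instance_count=1, uid=None):
    # Snapshot the call arguments, then drop entries that should not be
    # forwarded; .copy() decouples the dict from the live locals() mapping.
    kwargs = locals().copy()
    del kwargs["uid"]
    return kwargs


print(build_kwargs_demo("ml.m5.xlarge"))
# {'training_instance_type': 'ml.m5.xlarge', 'training_instance_count': 1}
```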

sagemaker-pyspark-sdk/tests/namepolicy_test.py

Lines changed: 10 additions & 10 deletions

@@ -28,29 +28,29 @@ def with_spark_context():
 def test_CustomNamePolicyFactory():
     policy_factory = CustomNamePolicyFactory("jobName", "modelname", "epconfig", "ep")
     java_obj = policy_factory._to_java()
-    assert(isinstance(java_obj, JavaObject))
-    assert(java_obj.getClass().getSimpleName() == "CustomNamePolicyFactory")
+    assert (isinstance(java_obj, JavaObject))
+    assert (java_obj.getClass().getSimpleName() == "CustomNamePolicyFactory")
     policy_name = java_obj.createNamePolicy().getClass().getSimpleName()
-    assert(policy_name == "CustomNamePolicy")
+    assert (policy_name == "CustomNamePolicy")


 def test_CustomNamePolicyWithTimeStampSuffixFactory():
     policy_factory = CustomNamePolicyWithTimeStampSuffixFactory("jobName", "modelname",
                                                                 "epconfig", "ep")
     java_obj = policy_factory._to_java()
-    assert(isinstance(java_obj, JavaObject))
+    assert (isinstance(java_obj, JavaObject))
     assert (java_obj.getClass().getSimpleName() == "CustomNamePolicyWithTimeStampSuffixFactory")
     policy_name = java_obj.createNamePolicy().getClass().getSimpleName()
-    assert(policy_name == "CustomNamePolicyWithTimeStampSuffix")
+    assert (policy_name == "CustomNamePolicyWithTimeStampSuffix")


 def test_CustomNamePolicyWithTimeStampSuffix():
     name_policy = CustomNamePolicyWithTimeStampSuffix("jobName", "modelname", "epconfig", "ep")
-    assert(isinstance(name_policy._to_java(), JavaObject))
-    assert(name_policy._call_java("trainingJobName") != "jobName")
-    assert(name_policy._call_java("modelName") != "modelname")
-    assert(name_policy._call_java("endpointConfigName") != "epconfig")
-    assert(name_policy._call_java("endpointName") != "ep")
+    assert (isinstance(name_policy._to_java(), JavaObject))
+    assert (name_policy._call_java("trainingJobName") != "jobName")
+    assert (name_policy._call_java("modelName") != "modelname")
+    assert (name_policy._call_java("endpointConfigName") != "epconfig")
+    assert (name_policy._call_java("endpointName") != "ep")

     assert (name_policy._call_java("trainingJobName").startswith("jobName"))
     assert (name_policy._call_java("modelName").startswith("modelname"))

sagemaker-pyspark-sdk/tox.ini

Lines changed: 3 additions & 3 deletions

@@ -1,5 +1,5 @@
 [tox]
-envlist = flake8,twine,sphinx,py36,stats
+envlist = flake8,twine,sphinx,py37,stats
 skip_missing_interpreters = False

 [testenv]

@@ -38,8 +38,8 @@ basepython = python3
 deps =
     twine>=1.12.0
 commands =
-    python setup.py sdist
-    twine check dist/*.tar.gz
+    - python setup.py sdist
+    - twine check dist/*.tar.gz

 [testenv:flake8]
 basepython=python3
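
Two things change here: the unit-test env is renamed from py36 to py37, matching the python_requires=">= 3.7" floor added in setup.py, and the leading "- " on the twine env's commands tells tox to tolerate a non-zero exit status from those commands. As a trivial, hedged illustration (not part of the package), the same interpreter floor expressed as a runtime check:

```python
import sys

# The tox envlist (py37) and setup.py's python_requires encode the same
# minimum interpreter; this is just that constraint as an explicit check.
if sys.version_info < (3, 7):
    raise RuntimeError("this build of sagemaker_pyspark requires Python 3.7 or newer")
print("Python {}.{} is supported".format(*sys.version_info[:2]))
```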

sagemaker-spark-sdk/build.sbt

Lines changed: 15 additions & 10 deletions

@@ -14,11 +14,11 @@ scmInfo := Some(
 )
 licenses := Seq("Apache License, Version 2.0" -> url("https://aws.amazon.com/apache2.0"))

-scalaVersion := "2.11.7"
+scalaVersion := "2.12.16"

 // to change the version of spark add -DSPARK_VERSION=2.x.x when running sbt
 // for example: "sbt -DSPARK_VERSION=2.1.1 clean compile test doc package"
-val sparkVersion = System.getProperty("SPARK_VERSION", "2.4.0")
+val sparkVersion = System.getProperty("SPARK_VERSION", "3.3.0")

 lazy val SageMakerSpark = (project in file("."))

@@ -29,16 +29,18 @@ version := {
 }

 libraryDependencies ++= Seq(
-  "org.apache.hadoop" % "hadoop-aws" % "2.8.1",
-  "com.amazonaws" % "aws-java-sdk-s3" % "1.11.835",
-  "com.amazonaws" % "aws-java-sdk-sts" % "1.11.835",
-  "com.amazonaws" % "aws-java-sdk-sagemaker" % "1.11.835",
-  "com.amazonaws" % "aws-java-sdk-sagemakerruntime" % "1.11.835",
+  "org.apache.hadoop" % "hadoop-aws" % "3.3.1",
+  "com.amazonaws" % "aws-java-sdk-s3" % "1.12.262",
+  "com.amazonaws" % "aws-java-sdk-sts" % "1.12.262",
+  "com.amazonaws" % "aws-java-sdk-sagemaker" % "1.12.262",
+  "com.amazonaws" % "aws-java-sdk-sagemakerruntime" % "1.12.262",
   "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
   "org.apache.spark" %% "spark-mllib" % sparkVersion % "provided",
   "org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
-  "org.scalatest" %% "scalatest" % "3.0.4" % "test",
-  "org.mockito" % "mockito-all" % "1.10.19" % "test"
+  "org.scoverage" %% "scalac-scoverage-plugin" % "1.4.2" % "provided",
+  "org.scalatest" %% "scalatest" % "3.0.9" % "test",
+  "org.scala-sbt" %% "compiler-bridge" % "1.7.1" % "test",
+  "org.mockito" % "mockito-all" % "2.0.2-beta" % "test"
 )

 // add a task to print the classpath. Also use the packaged JAR instead

@@ -48,8 +50,11 @@ lazy val printClasspath = taskKey[Unit]("Dump classpath")
 printClasspath := (fullClasspath in Runtime value) foreach { e => println(e.data) }

 // set coverage threshold
-coverageMinimum := 90
 coverageFailOnMinimum := true
+coverageMinimumStmtTotal := 90
+coverageMinimumBranchTotal := 90
+coverageMinimumStmtPerPackage := 83
+coverageMinimumBranchPerPackage := 75

 // make scalastyle gate the build
 (compile in Compile) := {

sagemaker-spark-sdk/project/build.properties

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-sbt.version=1.2.1
+sbt.version=1.7.1

sagemaker-spark-sdk/project/plugins.sbt

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
 addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.0")
 addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.8.1")
-addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.0")
+addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.0.0")
