diff --git a/.gitignore b/.gitignore
index c58d83b..0f86cc0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,9 @@ project/plugins/project/
 # Scala-IDE specific
 .scala_dependencies
 .worksheet
+
+# emacs stuff
+\#*\#
+\.\#*
+*~
+sbt/*launch*.jar
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..520a5bf
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,35 @@
+language: scala
+sudo: false
+cache:
+  directories:
+  - $HOME/.ivy2
+  - $HOME/spark
+  - $HOME/.cache/pip
+  - $HOME/.sbt/launchers
+scala:
+  - 2.11.6
+jdk:
+  - oraclejdk8
+apt:
+  sources:
+  - ubuntu-toolchain-r-test
+  packages:
+  - gfortran
+  - gcc
+  - binutils
+  - python-pip
+r_packages:
+  - Imap
+before_install:
+  - pip install --user codecov unittest2 nose pep8 pylint pandas numpy --download-cache $HOME/.pip-cache
+script:
+  - "export SPARK_CONF_DIR=./log4j/"
+  - sbt clean coverage compile test
+  - "[ -d spark ] || mkdir spark && cd spark && wget http://d3kbcqa49mib13.cloudfront.net/spark-1.6.1-bin-hadoop2.6.tgz && cd .."
+  - "tar -xf ./spark/spark-1.6.1-bin-hadoop2.6.tgz"
+  - "export SPARK_HOME=`pwd`/spark-1.6.1-bin-hadoop2.6"
+  - "export PYTHONPATH=$SPARK_HOME/python:`ls -1 $SPARK_HOME/python/lib/py4j-*-src.zip`:$PYTHONPATH"
+  - "nosetests --with-doctest --doctest-options=+ELLIPSIS --logging-level=INFO --detailed-errors --verbosity=2 --with-coverage --cover-html-dir=./htmlcov"
+after_success:
+# Upload coverage results
+  - codecov
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index 8f71f43..80f405b 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,3 +1,6 @@
+Individual components under resources are available under their own licenses.
+ * MySQL connector is GPL
+The source code in this repo is available under the Apache License
 Apache License
 Version 2.0, January 2004
 http://www.apache.org/licenses/
diff --git a/README.md b/README.md
index a7f4184..551928f 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,10 @@
 # high-performance-spark-examples
 Examples for High Performance Spark
+
+# Building
+
+Most of the examples can be built with sbt; the C and Fortran components depend on gcc, g77, and cmake.
+
+# Tests
+
+The full test suite depends on having the C and Fortran components built as well as a local R installation available.
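As a rough local equivalent of the CI configuration above, a build-and-test session might look like the sketch below. It only mirrors what `.travis.yml` does and assumes a JDK 8, gcc/gfortran, cmake, and the Python `nose` tooling are installed; the Spark path is illustrative.

```bash
# Build and run the Scala/Java tests with the bundled launcher script (or a system sbt)
./sbt/sbt clean compile test

# The Python doctests additionally need a local Spark 1.6.1 distribution on the path
export SPARK_HOME=$HOME/spark-1.6.1-bin-hadoop2.6
export PYTHONPATH="$SPARK_HOME/python:$(ls -1 $SPARK_HOME/python/lib/py4j-*-src.zip):$PYTHONPATH"
nosetests --with-doctest --doctest-options=+ELLIPSIS
```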
diff --git a/build.sbt b/build.sbt new file mode 100644 index 0000000..a9e9098 --- /dev/null +++ b/build.sbt @@ -0,0 +1,90 @@ +organization := "com.highperformancespark" + +name := "examples" + +publishMavenStyle := true + +version := "0.0.1" + +scalaVersion := "2.11.6" + +crossScalaVersions := Seq("2.11.6") + +javacOptions ++= Seq("-source", "1.8", "-target", "1.8") + +sparkVersion := "1.6.1" + +//tag::sparkComponents[] +// TODO(Holden): re-add hive-thriftserver post Spark 2.0 +sparkComponents ++= Seq("core", "streaming", "mllib") +//end::sparkComponents[] +//tag::addSQLHiveComponent[] +sparkComponents ++= Seq("sql", "hive") +//end::addSQLHiveComponent[] + + +parallelExecution in Test := false + +fork := true + +javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:MaxPermSize=2048M", "-XX:+CMSClassUnloadingEnabled") + +// additional libraries +libraryDependencies ++= Seq( + "org.scalatest" %% "scalatest" % "2.2.1", + "org.scalacheck" %% "scalacheck" % "1.12.4", + "junit" % "junit" % "4.10", + // Temporary hack until Spark 2.0 + "org.apache.spark" % "spark-hive-thriftserver_2.10" % "1.6.1" % "provided" intransitive(), + //tag::sparkCSV[] + "com.databricks" % "spark-csv_2.10" % "1.3.0", + //end::sparkCSV[] + "com.holdenkarau" % "spark-testing-base_2.11" % "1.6.1_0.3.3", + "org.eclipse.jetty" % "jetty-util" % "9.3.2.v20150730", + "org.codehaus.jackson" % "jackson-mapper-asl" % "1.8.8", + "com.novocode" % "junit-interface" % "0.10" % "test->default") + + +scalacOptions ++= Seq("-deprecation", "-unchecked") + +pomIncludeRepository := { x => false } + +resolvers ++= Seq( + "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", + "Spray Repository" at "http://repo.spray.cc/", + "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", + "Akka Repository" at "http://repo.akka.io/releases/", + "Twitter4J Repository" at "http://twitter4j.org/maven2/", + "Apache HBase" at "https://repository.apache.org/content/repositories/releases", + "Twitter Maven Repo" at "http://maven.twttr.com/", + "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", + "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/", + "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", + "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/", + "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", + Resolver.sonatypeRepo("public"), + Resolver.bintrayRepo("jodersky", "sbt-jni-macros"), + "jodersky" at "https://dl.bintray.com/jodersky/maven/" +) + +licenses := Seq("Apache License 2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html")) + +mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => + { + case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard + case m if m.startsWith("META-INF") => MergeStrategy.discard + case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first + case PathList("org", "apache", xs @ _*) => MergeStrategy.first + case PathList("org", "jboss", xs @ _*) => MergeStrategy.first + case "log4j.properties" => MergeStrategy.discard + case "about.html" => MergeStrategy.rename + case "reference.conf" => MergeStrategy.concat + case _ => MergeStrategy.first + } +} + +// JNI + +enablePlugins(JniNative) + +sourceDirectory in nativeCompile := sourceDirectory.value diff --git a/conf/log4j.properties b/conf/log4j.properties new file mode 100644 index 0000000..e90a817 --- /dev/null +++ b/conf/log4j.properties @@ -0,0 
+1,40 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +log4j.rootCategory=ERROR, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Set the default spark-shell log level to WARN. When running the spark-shell, the +# log level for this class is used to overwrite the root logger's log level, so that +# the user can have different defaults for the shell and regular Spark apps. +log4j.logger.org.apache.spark.repl.Main=ERROR + +# Settings to quiet third party logs that are too verbose +log4j.logger.org.spark-project.jetty=ERROR +log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR +log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO +log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO +log4j.logger.org.apache.parquet=ERROR +log4j.logger.parquet=ERROR + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL +log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR diff --git a/high_performance_pyspark/SQLLineage.py b/high_performance_pyspark/SQLLineage.py new file mode 100644 index 0000000..c9d77a2 --- /dev/null +++ b/high_performance_pyspark/SQLLineage.py @@ -0,0 +1,38 @@ +""" +>>> from pyspark.context import SparkContext +>>> from pyspark.sql import SQLContext, Row, DataFrame +>>> sc = SparkContext('local', 'test') +... +>>> sc.setLogLevel("ERROR") +>>> sqlCtx = SQLContext(sc) +... +>>> rdd = sc.parallelize(range(1, 100)).map(lambda x: Row(i = x)) +>>> df = rdd.toDF() +>>> df2 = cutLineage(df) +>>> df.head() == df2.head() +True +>>> df.schema == df2.schema +True +""" + +from pyspark.sql import DataFrame + +#tag::cutLineage[] +def cutLineage(df): + """ + Cut the lineage of a DataFrame - used for iterative algorithms + + .. Note: This uses internal members and may break between versions + """ + jRDD = df._jdf.toJavaRDD() + jSchema = df._jdf.schema() + jRDD.cache() + sqlCtx = df.sql_ctx + try: + javaSqlCtx = sqlCtx._jsqlContext + except: + javaSqlCtx = sqlCtx._ssql_ctx + newJavaDF = javaSqlCtx.createDataFrame(jRDD, jSchema) + newDF = DataFrame(newJavaDF, sqlCtx) + return newDF +#end::cutLineage[] diff --git a/high_performance_pyspark/__init__.py b/high_performance_pyspark/__init__.py new file mode 100644 index 0000000..7741593 --- /dev/null +++ b/high_performance_pyspark/__init__.py @@ -0,0 +1,25 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +""" +Python version of selected examples from High Performance Spark +""" + +import os +import sys + diff --git a/high_performance_pyspark/simple_perf_test.py b/high_performance_pyspark/simple_perf_test.py new file mode 100644 index 0000000..e5d6cbb --- /dev/null +++ b/high_performance_pyspark/simple_perf_test.py @@ -0,0 +1,91 @@ +# When running this example make sure to include the built Scala jar : +# $SPARK_HOME/bin/pyspark --jars ./target/examples-0.0.1.jar --driver-class-path ./target/examples-0.0.1.jar +# This example illustrates how to interface Scala and Python code, but caution +# should be taken as it depends on many private members that may change in +# future releases of Spark. + +from pyspark.sql.types import * +from pyspark.sql import DataFrame +import timeit +import time + +def generate_scale_data(sqlCtx, rows, numCols): + """ + Generate scale data for the performance test. + + This also illustrates calling custom Scala code from the driver. + + .. Note: This depends on many internal methods and may break between versions. + """ + sc = sqlCtx._sc + # Get the SQL Context, 2.0 and pre-2.0 syntax + try: + javaSqlCtx = sqlCtx._jsqlContext + except: + javaSqlCtx = sqlCtx._ssql_ctx + jsc = sc._jsc + scalasc = jsc.sc() + gateway = sc._gateway + # Call a java method that gives us back an RDD of JVM Rows (Int, Double) + # While Python RDDs are wrapped Java RDDs (even of Rows) the contents are different, so we + # can't directly wrap this. + # This returns a Java RDD of Rows - normally it would better to + # return a DataFrame directly, but for illustration we will work with an RDD + # of Rows. + java_rdd = gateway.jvm.com.highperformancespark.examples.tools.GenerateScalingData. \ + generateMiniScaleRows(scalasc, rows, numCols) + # Schemas are serialized to JSON and sent back and forth + # Construct a Python Schema and turn it into a Java Schema + schema = StructType([StructField("zip", IntegerType()), StructField("fuzzyness", DoubleType())]) + jschema = javaSqlCtx.parseDataType(schema.json()) + # Convert the Java RDD to Java DataFrame + java_dataframe = javaSqlCtx.createDataFrame(java_rdd, jschema) + # Wrap the Java DataFrame into a Python DataFrame + python_dataframe = DataFrame(java_dataframe, sqlCtx) + # Convert the Python DataFrame into an RDD + pairRDD = python_dataframe.rdd.map(lambda row: (row[0], row[1])) + return (python_dataframe, pairRDD) + +def runOnDF(df): + result = df.groupBy("zip").avg("fuzzyness").count() + return result + +def runOnRDD(rdd): + result = rdd.map(lambda (x, y): (x, (y, 1))). \ + reduceByKey(lambda x, y: (x[0] + y [0], x[1] + y[1])). 
\ + count() + return result + +def groupOnRDD(rdd): + return rdd.groupByKey().mapValues(lambda v: sum(v) / float(len(v))).count() + +def run(sc, sqlCtx, scalingFactor, size): + (input_df, input_rdd) = generate_scale_data(sqlCtx, scalingFactor, size) + input_rdd.cache().count() + rddTimeings = timeit.repeat(stmt=lambda: runOnRDD(input_rdd), repeat=10, number=1, timer=time.time, setup='gc.enable()') + groupTimeings = timeit.repeat(stmt=lambda: groupOnRDD(input_rdd), repeat=10, number=1, timer=time.time, setup='gc.enable()') + input_df.cache().count() + dfTimeings = timeit.repeat(stmt=lambda: runOnDF(input_df), repeat=10, number=1, timer=time.time, setup='gc.enable()') + print "RDD:" + print rddTimeings + print "group:" + print groupTimeings + print "df:" + print dfTimeings + print "yay" + +if __name__ == "__main__": + + """ + Usage: simple_perf_test scalingFactor size + """ + import sys + from pyspark import SparkContext + from pyspark.sql import SQLContext + scalingFactor = int(sys.argv[1]) + size = int(sys.argv[2]) + sc = SparkContext(appName="SimplePythonPerf") + sqlCtx = SQLContext(sc) + run(sc, sqlCtx, scalingFactor, size) + + sc.stop() diff --git a/project/plugins.sbt b/project/plugins.sbt new file mode 100644 index 0000000..253c5a6 --- /dev/null +++ b/project/plugins.sbt @@ -0,0 +1,21 @@ +addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.6.0") + +resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/" + +resolvers += "sonatype-snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/" + + +resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven" + +// Temporary hack for bintray being sad + +resolvers += Resolver.bintrayRepo("jodersky", "sbt-jni-macros") +resolvers += "jodersky" at "https://dl.bintray.com/jodersky/maven/" + +addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.2") + +//addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0") + +addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.3.3") + +addSbtPlugin("ch.jodersky" % "sbt-jni" % "1.0.0-RC3") diff --git a/resources/mysql-connector-java-5.1.38.jar b/resources/mysql-connector-java-5.1.38.jar new file mode 100644 index 0000000..be09493 Binary files /dev/null and b/resources/mysql-connector-java-5.1.38.jar differ diff --git a/resources/rawpanda.json b/resources/rawpanda.json new file mode 100644 index 0000000..1d9940d --- /dev/null +++ b/resources/rawpanda.json @@ -0,0 +1,2 @@ +{"name":"mission","pandas":[{"id":1,"zip":"94110","pt":"giant", "happy":true, + "attributes":[0.4,0.5]}]} diff --git a/sbt/sbt b/sbt/sbt new file mode 100755 index 0000000..aac1085 --- /dev/null +++ b/sbt/sbt @@ -0,0 +1,52 @@ +#!/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# This script launches sbt for this project. If present it uses the system +# version of sbt. If there is no system version of sbt it attempts to download +# sbt locally. +SBT_VERSION=0.13.9 +URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar +URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar +JAR=sbt/sbt-launch-${SBT_VERSION}.jar + +# Download sbt launch jar if it hasn't been downloaded yet +if [ ! -f ${JAR} ]; then + # Download + printf "Attempting to fetch sbt\n" + set -x + JAR_DL=${JAR}.part + if hash wget 2>/dev/null; then + (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} + elif hash axel 2>/dev/null; then + (axel ${URL1} -o ${JAR_DL} || axel ${URL2} -o ${JAR_DL}) && mv ${JAR_DL} ${JAR} + else + printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" + exit -1 + fi +fi +if [ ! -f ${JAR} ]; then + # We failed to download + printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n" + exit -1 +fi +printf "Launching sbt from ${JAR}\n" +java \ + -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \ + -jar ${JAR} \ + "$@" diff --git a/shell-scripts/launch-with-mysql-jdbc b/shell-scripts/launch-with-mysql-jdbc new file mode 100644 index 0000000..90ac352 --- /dev/null +++ b/shell-scripts/launch-with-mysql-jdbc @@ -0,0 +1,5 @@ +ASSEMBLY_JAR=./target/scala-2.10/examples_2.10.jar +CLASS="com.highperformancespark.dataframe.mysqlload" +#tag:[submit] +spark-submit --jars ./resources/mysql-connector-java-5.1.38.jar $ASSEMBLY_JAR $CLASS +#end:[submit] \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..4d3442b --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,64 @@ +################################################################ +# A minimal CMake file that is compatible with sbt-jni # +# # +# All settings required by sbt-jni have been marked so, please # +# add/modify/remove settings to build your specific library. # +################################################################ + +cmake_minimum_required(VERSION 2.6) + +# Define project and related variables +# +project (high-performance-spark) + +# Set versions and library name +# (required by sbt-jni) please use semantic versioning +# +set (VERSION_MAJOR 0) +set (VERSION_MINOR 0) +set (VERSION_PATCH 0) +# (required by sbt-jni) major version will always be appended to library name +set (LIB_NAME ${CMAKE_PROJECT_NAME}${VERSION_MAJOR}) + +# Command-line options +# +# (set by sbt-jni) +set (LIB_INSTALL_DIR lib CACHE PATH "Path in which to install libraries (equivalent to Autoconf --libdir).") +# (set by sbt-jni) +set (LIB_ENABLE_MINOR_VERSIONS ON CACHE BOOLEAN "Build libraries with minor and patch versions appended.") + +# Setup JNI +find_package(JNI REQUIRED) +if (JNI_FOUND) + message (STATUS "JNI include directories: ${JNI_INCLUDE_DIRS}") +endif() + +# Include directories +include_directories(.) +include_directories(./main/c) +include_directories(include) +include_directories(${JNI_INCLUDE_DIRS}) + +# Setup main shared library +file(GLOB LIB_SRC + "*.c" + "*.cpp" + "./main/c/*.c" + "./main/c/*.cpp" +) +add_library(${LIB_NAME} SHARED ${LIB_SRC}) + +# By default, in a regular build, minor and patch versions are added to the generated files. 
+# When built through sbt-jni however, LIB_ENABLE_MINOR_VERSIONS is deactivated and only a
+# major-versioned library file is built.
+if (LIB_ENABLE_MINOR_VERSIONS)
+  set_target_properties(
+    ${LIB_NAME}
+    PROPERTIES
+    VERSION 0.${VERSION_MINOR}.${VERSION_PATCH} # major version always 0, it is included in library name
+    SOVERSION 0
+  )
+endif()
+
+# Installation targets
+install(TARGETS ${LIB_NAME} LIBRARY DESTINATION ${LIB_INSTALL_DIR})
diff --git a/src/main/c/include/com_highperformancespark_examples_ffi_SumJNI.h b/src/main/c/include/com_highperformancespark_examples_ffi_SumJNI.h
new file mode 100644
index 0000000..75be264
--- /dev/null
+++ b/src/main/c/include/com_highperformancespark_examples_ffi_SumJNI.h
@@ -0,0 +1,21 @@
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class com_highperformancespark_examples_ffi_SumJNI */
+
+#ifndef _Included_com_highperformancespark_examples_ffi_SumJNI
+#define _Included_com_highperformancespark_examples_ffi_SumJNI
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Class:     com_highperformancespark_examples_ffi_SumJNI
+ * Method:    sum
+ * Signature: ([I)I
+ */
+JNIEXPORT jint JNICALL Java_com_highperformancespark_examples_ffi_SumJNI_sum
+  (JNIEnv *, jobject, jintArray);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/src/main/c/sum.c b/src/main/c/sum.c
new file mode 100644
index 0000000..f571aad
--- /dev/null
+++ b/src/main/c/sum.c
@@ -0,0 +1,9 @@
+#include "sum.h"
+
+int sum(int input[], int num_elem) {
+  int c, ret = 0;
+  for (c = 0; c < num_elem; c++) {
+    ret += input[c];
+  }
+  return ret;
+}
diff --git a/src/main/c/sum.h b/src/main/c/sum.h
new file mode 100644
index 0000000..d04be96
--- /dev/null
+++ b/src/main/c/sum.h
@@ -0,0 +1,6 @@
+#ifndef _SUM_H
+#define _SUM_H
+
+int sum(int input[], int num_elem);
+
+#endif /* _SUM_H */
diff --git a/src/main/c/sum_wrapper.c b/src/main/c/sum_wrapper.c
new file mode 100644
index 0000000..a499d3e
--- /dev/null
+++ b/src/main/c/sum_wrapper.c
@@ -0,0 +1,19 @@
+#include "sum.h"
+#include "include/com_highperformancespark_examples_ffi_SumJNI.h"
+#include <jni.h>
+#include <stdint.h>
+
+/*
+ * Class:     com_highperformancespark_examples_ffi_SumJNI
+ * Method:    sum
+ * Signature: ([I)I
+ */
+JNIEXPORT jint JNICALL Java_com_highperformancespark_examples_ffi_SumJNI_sum
+(JNIEnv *env, jobject obj, jintArray ja) {
+  jsize size = (*env)->GetArrayLength(env, ja);
+  jint *a = (*env)->GetIntArrayElements(env, ja, 0);
+  jint result = sum(a, size);
+  // Release the element buffer without copying back (the array is not modified)
+  (*env)->ReleaseIntArrayElements(env, ja, a, JNI_ABORT);
+  return result;
+}
diff --git a/src/main/java/com/highperformancespark/examples/JavaInterop.java b/src/main/java/com/highperformancespark/examples/JavaInterop.java
new file mode 100644
index 0000000..3b37093
--- /dev/null
+++ b/src/main/java/com/highperformancespark/examples/JavaInterop.java
@@ -0,0 +1,36 @@
+package com.highperformancespark.examples;
+
+import scala.reflect.*;
+import scala.Tuple2;
+
+import org.apache.spark.rdd.RDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import static org.apache.spark.sql.functions.*;
+
+public class JavaInterop {
+
+  //tag::realClassTag[]
+  public static JavaPairRDD<String, Long> wrapPairRDD(RDD<Tuple2<String, Long>> rdd) {
+    // Construct the class tags
+    ClassTag<String> strCt = ClassTag$.MODULE$.apply(String.class);
+    ClassTag<Long> longCt = ClassTag$.MODULE$.apply(scala.Long.class);
+    return new JavaPairRDD<String, Long>(rdd, strCt, longCt);
+  }
+  //end::realClassTag[]
+
+  //tag::fakeClassTag[]
+  public static JavaPairRDD<String, Long> wrapPairRDDFakeCt(RDD<Tuple2<String, Long>> rdd) {
+    //
Construct the class tags by casting AnyRef - this would be more commonly done with + // generic or templated code where we can't explicitly construct the correct class tag + // as using fake class tags may result in degraded performance. + ClassTag fake = ClassTag$.MODULE$.AnyRef(); + return new JavaPairRDD(rdd, fake, fake); + } + //end::fakeClassTag[] +} diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java b/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java new file mode 100644 index 0000000..bc93163 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java @@ -0,0 +1,210 @@ +package com.highperformancespark.examples.dataframe; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Column; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.expressions.Window; +import org.apache.spark.sql.expressions.WindowSpec; +import org.apache.spark.sql.hive.HiveContext; + +import java.util.HashMap; +import java.util.Map; + +import static org.apache.spark.sql.functions.*; + +public class JavaHappyPandas { + + /** + * Creates SQLContext with an existing SparkContext. + */ + public static SQLContext sqlContext(JavaSparkContext jsc) { + SQLContext sqlContext = new SQLContext(jsc); + return sqlContext; + } + + /** + * Creates HiveContext with an existing SparkContext. + */ + public static HiveContext hiveContext(JavaSparkContext jsc) { + HiveContext hiveContext = new HiveContext(jsc); + return hiveContext; + } + + /** + * Illustrate loading some JSON data. + */ + public static DataFrame loadDataSimple(JavaSparkContext jsc, SQLContext sqlContext, String path) { + DataFrame df1 = sqlContext.read().json(path); + + DataFrame df2 = sqlContext.read().format("json").option("samplingRatio", "1.0").load(path); + + JavaRDD jsonRDD = jsc.textFile(path); + DataFrame df3 = sqlContext.read().json(jsonRDD); + + return df1; + } + + public static DataFrame jsonLoadFromRDD(SQLContext sqlContext, JavaRDD input) { + JavaRDD rdd = input.filter(e -> e.contains("panda")); + DataFrame df = sqlContext.read().json(rdd); + return df; + } + + // Here will be some examples on PandaInfo DataFrame + + /** + * Gets the percentage of happy pandas per place. + * + * @param pandaInfo the input DataFrame + * @return Returns DataFrame of (place, percentage of happy pandas) + */ + public static DataFrame happyPandasPercentage(DataFrame pandaInfo) { + DataFrame happyPercentage = pandaInfo.select(pandaInfo.col("place"), + (pandaInfo.col("happyPandas").divide(pandaInfo.col("totalPandas"))).as("percentHappy")); + return happyPercentage; + } + + /** + * Encodes pandaType to Integer values instead of String values. + * + * @param pandaInfo the input DataFrame + * @return Returns a DataFrame of pandaId and integer value for pandaType. + */ + public static DataFrame encodePandaType(DataFrame pandaInfo) { + DataFrame encodedDF = pandaInfo.select(pandaInfo.col("id"), + when(pandaInfo.col("pt").equalTo("giant"), 0). + when(pandaInfo.col("pt").equalTo("red"), 1). + otherwise(2).as("encodedType")); + + return encodedDF; + } + + /** + * Gets places with happy pandas more than minHappinessBound. + */ + public static DataFrame minHappyPandas(DataFrame pandaInfo, int minHappyPandas) { + return pandaInfo.filter(pandaInfo.col("happyPandas").geq(minHappyPandas)); + } + + /** + * Find pandas that are sad. 
+ */ + public static DataFrame sadPandas(DataFrame pandaInfo) { + return pandaInfo.filter(pandaInfo.col("happy").notEqual(true)); + } + + /** + * Find pandas that are happy and fuzzier than squishy. + */ + public static DataFrame happyFuzzyPandas(DataFrame pandaInfo) { + DataFrame df = pandaInfo.filter( + pandaInfo.col("happy").and(pandaInfo.col("attributes").apply(0)).gt(pandaInfo.col("attributes").apply(1)) + ); + + return df; + } + + /** + * Gets places that contains happy pandas more than unhappy pandas. + */ + public static DataFrame happyPandasPlaces(DataFrame pandaInfo) { + return pandaInfo.filter(pandaInfo.col("happyPandas").geq(pandaInfo.col("totalPandas").divide(2))); + } + + /** + * Remove duplicate pandas by id. + */ + public static DataFrame removeDuplicates(DataFrame pandas) { + DataFrame df = pandas.dropDuplicates(new String[]{"id"}); + return df; + } + + public static DataFrame describePandas(DataFrame pandas) { + return pandas.describe(); + } + + public static DataFrame maxPandaSizePerZip(DataFrame pandas) { + return pandas.groupBy(pandas.col("zip")).max("pandaSize"); + } + + public static DataFrame minMaxPandaSizePerZip(DataFrame pandas) { + return pandas.groupBy(pandas.col("zip")).agg(min("pandaSize"), max("pandaSize")); + } + + public static DataFrame minPandaSizeMaxAgePerZip(DataFrame pandas) { + Map map = new HashMap<>(); + map.put("pandaSize", "min"); + map.put("age", "max"); + + DataFrame df = pandas.groupBy(pandas.col("zip")).agg(map); + return df; + } + + public static DataFrame minMeanSizePerZip(DataFrame pandas) { + return pandas.groupBy(pandas.col("zip")).agg(min(pandas.col("pandaSize")), mean(pandas.col("pandaSize"))); + } + + public static DataFrame simpleSqlExample(DataFrame pandas) { + SQLContext sqlContext = pandas.sqlContext(); + pandas.registerTempTable("pandas"); + + DataFrame miniPandas = sqlContext.sql("SELECT * FROM pandas WHERE pandaSize < 12"); + return miniPandas; + } + + /** + * Orders pandas by size ascending and by age descending. + * Pandas will be sorted by "size" first and if two pandas + * have the same "size" will be sorted by "age". 
+ */ + public static DataFrame orderPandas(DataFrame pandas) { + return pandas.orderBy(pandas.col("pandaSize").asc(), pandas.col("age").desc()); + } + + public static DataFrame computeRelativePandaSizes(DataFrame pandas) { + //tag::relativePandaSizesWindow[] + WindowSpec windowSpec = Window + .orderBy(pandas.col("age")) + .partitionBy(pandas.col("zip")) + .rowsBetween(-10, 10); // can use rangeBetween for range instead + //end::relativePandaSizesWindow[] + + //tag::relativePandaSizesQuery[] + Column pandaRelativeSizeCol = pandas.col("pandaSize").minus(avg(pandas.col("pandaSize")).over(windowSpec)); + + return pandas.select(pandas.col("name"), pandas.col("zip"), pandas.col("pandaSize"), + pandas.col("age"), pandaRelativeSizeCol.as("panda_relative_size")); + //end::relativePandaSizesQuery[] + } + + public static void joins(DataFrame df1, DataFrame df2) { + //tag::innerJoin[] + // Inner join implicit + df1.join(df2, df1.col("name").equalTo(df2.col("name"))); + // Inner join explicit + df1.join(df2, df1.col("name").equalTo(df2.col("name")), "inner"); + //end::innerJoin[] + + //tag::leftouterJoin[] + // Left outer join explicit + df1.join(df2, df1.col("name").equalTo(df2.col("name")), "left_outer"); + //end::leftouterJoin[] + + //tag::rightouterJoin[] + // Right outer join explicit + df1.join(df2, df1.col("name").equalTo(df2.col("name")), "right_outer"); + //end::rightouterJoin[] + + //tag::leftsemiJoin[] + // Left semi join explicit + df1.join(df2, df1.col("name").equalTo(df2.col("name")), "leftsemi"); + //end::leftsemiJoin[] + } + + public static DataFrame selfJoin(DataFrame df) { + return (df.as("a")).join(df.as("b")).where("a.name = b.name"); + } + +} diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java b/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java new file mode 100644 index 0000000..9d36dd8 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java @@ -0,0 +1,140 @@ +package com.highperformancespark.examples.dataframe; + +import com.highperformancespark.examples.objects.JavaPandaPlace; +import com.highperformancespark.examples.objects.JavaRawPanda; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.*; +import org.apache.spark.sql.types.*; + +import java.util.List; +import java.util.Properties; +import java.util.stream.Collectors; + +public class JavaLoadSave { + private SQLContext sqlContext; + + public JavaLoadSave(SQLContext sqlContext) { + this.sqlContext = sqlContext; + } + + //tag::createFromRDD[] + public DataFrame createFromJavaBean(JavaRDD input) { + // Create DataFrame using Java Bean + DataFrame df1 = sqlContext.createDataFrame(input, JavaPandaPlace.class); + + // Create DataFrame using JavaRDD + JavaRDD rowRDD = input.map(pm -> RowFactory.create(pm.getName(), + pm.getPandas().stream() + .map(pi -> RowFactory.create(pi.getId(), pi.getZip(), pi.isHappy(), pi.getAttributes())) + .collect(Collectors.toList()))); + + ArrayType pandasType = DataTypes.createArrayType(new StructType( + new StructField[]{ + new StructField("id", DataTypes.LongType, true, Metadata.empty()), + new StructField("zip", DataTypes.StringType, true, Metadata.empty()), + new StructField("happy", DataTypes.BooleanType, true, Metadata.empty()), + new StructField("attributes", DataTypes.createArrayType(DataTypes.FloatType), true, Metadata.empty()) + } + )); + + StructType schema = new StructType(new StructField[]{ + new StructField("name", DataTypes.StringType, true, Metadata.empty()), + new 
StructField("pandas", pandasType, true, Metadata.empty())
+    });
+
+    DataFrame df2 = sqlContext.createDataFrame(rowRDD, schema);
+    return df2;
+  }
+  //end::createFromRDD[]
+
+  //tag::createFromLocal[]
+  public DataFrame createFromLocal(List<JavaPandaPlace> input) {
+    return sqlContext.createDataFrame(input, JavaPandaPlace.class);
+  }
+  //end::createFromLocal[]
+
+  //tag::collectResults[]
+  public Row[] collectDF(DataFrame df) {
+    return df.collect();
+  }
+  //end::collectResults[]
+
+  //tag::toRDD[]
+  public JavaRDD<JavaRawPanda> toRDD(DataFrame input) {
+    JavaRDD<JavaRawPanda> rdd = input.javaRDD().map(row -> new JavaRawPanda(row.getLong(0), row.getString(1),
+      row.getString(2), row.getBoolean(3), row.getList(4)));
+    return rdd;
+  }
+  //end::toRDD[]
+
+  //tag::partitionedOutput[]
+  public void writeOutByZip(DataFrame input) {
+    input.write().partitionBy("zipcode").format("json").save("output/");
+  }
+  //end::partitionedOutput[]
+
+  //tag::saveAppend[]
+  public void writeAppend(DataFrame input) {
+    input.write().mode(SaveMode.Append).save("output/");
+  }
+  //end::saveAppend[]
+
+  public DataFrame createJDBC() {
+    //tag::createJDBC[]
+    DataFrame df1 = sqlContext.read().jdbc("jdbc:dialect:serverName;user=user;password=pass",
+      "table", new Properties());
+
+    DataFrame df2 = sqlContext.read().format("jdbc")
+      .option("url", "jdbc:dialect:serverName")
+      .option("dbtable", "table").load();
+
+    return df2;
+    //end::createJDBC[]
+  }
+
+  public void writeJDBC(DataFrame df) {
+    //tag::writeJDBC[]
+    df.write().jdbc("jdbc:dialect:serverName;user=user;password=pass",
+      "table", new Properties());
+
+    df.write().format("jdbc")
+      .option("url", "jdbc:dialect:serverName")
+      .option("user", "user")
+      .option("password", "pass")
+      .option("dbtable", "table").save();
+    //end::writeJDBC[]
+  }
+
+  //tag::loadParquet[]
+  public DataFrame loadParquet(String path) {
+    // Configure Spark to read binary data as string, note: must be configured on SQLContext
+    sqlContext.setConf("spark.sql.parquet.binaryAsString", "true");
+
+    // Load parquet data using merge schema (configured through option)
+    DataFrame df = sqlContext.read()
+      .option("mergeSchema", "true")
+      .format("parquet")
+      .load(path);
+
+    return df;
+  }
+  //end::loadParquet[]
+
+  //tag::writeParquet[]
+  public void writeParquet(DataFrame df, String path) {
+    df.write().format("parquet").save(path);
+  }
+  //end::writeParquet[]
+
+  //tag::loadHiveTable[]
+  public DataFrame loadHiveTable() {
+    return sqlContext.read().table("pandas");
+  }
+  //end::loadHiveTable[]
+
+  //tag::saveManagedTable[]
+  public void saveManagedTable(DataFrame df) {
+    df.write().saveAsTable("pandas");
+  }
+  //end::saveManagedTable[]
+}
diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java b/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java
new file mode 100644
index 0000000..dd23616
--- /dev/null
+++ b/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java
@@ -0,0 +1,76 @@
+package com.highperformancespark.examples.dataframe;
+
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.expressions.MutableAggregationBuffer;
+import org.apache.spark.sql.expressions.UserDefinedAggregateFunction;
+import org.apache.spark.sql.types.*;
+
+public class JavaUDFs {
+
+  public static void setupUDFs(SQLContext sqlContext) {
+    //tag::basicUDF[]
+    sqlContext.udf().register("strlen", (String s) -> s.length(), DataTypes.IntegerType);
+    //end::basicUDF[]
+  }
+
+  public static void setupUDAFs(SQLContext sqlContext) {
+
+    class Avg extends
UserDefinedAggregateFunction { + + @Override + public StructType inputSchema() { + StructType inputSchema = + new StructType(new StructField[]{new StructField("value", DataTypes.DoubleType, true, Metadata.empty())}); + return inputSchema; + } + + @Override + public StructType bufferSchema() { + StructType bufferSchema = + new StructType(new StructField[]{ + new StructField("count", DataTypes.LongType, true, Metadata.empty()), + new StructField("sum", DataTypes.DoubleType, true, Metadata.empty()) + }); + + return bufferSchema; + } + + @Override + public DataType dataType() { + return DataTypes.DoubleType; + } + + @Override + public boolean deterministic() { + return true; + } + + @Override + public void initialize(MutableAggregationBuffer buffer) { + buffer.update(0, 0L); + buffer.update(1, 0.0); + } + + @Override + public void update(MutableAggregationBuffer buffer, Row input) { + buffer.update(0, buffer.getLong(0) + 1); + buffer.update(1, buffer.getDouble(1) + input.getDouble(0)); + } + + @Override + public void merge(MutableAggregationBuffer buffer1, Row buffer2) { + buffer1.update(0, buffer1.getLong(0) + buffer2.getLong(0)); + buffer1.update(1, buffer1.getDouble(1) + buffer2.getDouble(1)); + } + + @Override + public Object evaluate(Row buffer) { + return buffer.getDouble(1) / buffer.getLong(0); + } + } + + Avg average = new Avg(); + sqlContext.udf().register("ourAvg", average); + } +} diff --git a/src/main/java/com/highperformancespark/examples/goldilocks/JavaGoldiLocksFirstTry.java b/src/main/java/com/highperformancespark/examples/goldilocks/JavaGoldiLocksFirstTry.java new file mode 100644 index 0000000..d8ffcc9 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/goldilocks/JavaGoldiLocksFirstTry.java @@ -0,0 +1,264 @@ +package com.highperformancespark.examples.goldilocks; + +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.Function2; +import org.apache.spark.api.java.function.PairFlatMapFunction; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.storage.StorageLevel; +import scala.Tuple2; + +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +public class JavaGoldiLocksFirstTry { + + /** + * Find nth target rank for every column. 
+ * + * For example: + * + * dataframe: + * (0.0, 4.5, 7.7, 5.0) + * (1.0, 5.5, 6.7, 6.0) + * (2.0, 5.5, 1.5, 7.0) + * (3.0, 5.5, 0.5, 7.0) + * (4.0, 5.5, 0.5, 8.0) + * + * targetRanks: + * 1, 3 + * + * The output will be: + * 0 -> (0.0, 2.0) + * 1 -> (4.5, 5.5) + * 2 -> (7.7, 1.5) + * 3 -> (5.0, 7.0) + * + * @param dataframe dataframe of doubles + * @param targetRanks the required ranks for every column + * + * @return map of (column index, list of target ranks) + */ + public static Map> findRankStatistics(DataFrame dataframe, List targetRanks) { + JavaPairRDD valueColumnPairs = getValueColumnPairs(dataframe); + + JavaPairRDD sortedValueColumnPairs = valueColumnPairs.sortByKey(); + sortedValueColumnPairs.persist(StorageLevel.MEMORY_AND_DISK()); + + int numOfColumns = dataframe.schema().length(); + List>> partitionColumnsFreq = + getColumnsFreqPerPartition(sortedValueColumnPairs, numOfColumns); + + List>>> ranksLocations = + getRanksLocationsWithinEachPart(targetRanks, partitionColumnsFreq, numOfColumns); + + JavaPairRDD targetRanksValues = findTargetRanksIteratively(sortedValueColumnPairs, ranksLocations); + + return targetRanksValues.groupByKey().collectAsMap(); + } + + /** + * Step 1. Map the rows to pairs of (value, column Index). + * + * For example: + * + * dataFrame: + * 1.5, 1.25, 2.0 + * 5.25, 2.5, 1.5 + * + * The output RDD will be: + * (1.5, 0) (1.25, 1) (2.0, 2) (5.25, 0) (2.5, 1) (1.5, 2) + * + * @param dataframe dateframe of doubles + * + * @return RDD of pairs (value, column Index) + */ + private static JavaPairRDD getValueColumnPairs(DataFrame dataframe) { + JavaPairRDD value_ColIndex = + dataframe.javaRDD().flatMapToPair((PairFlatMapFunction) row -> { + List rowList = (List) (Object) toList(row.toSeq()); + List> list = zipWithIndex(rowList); + return list; + }); + + return value_ColIndex; + } + + /** + * Step 2. Find the number of elements for each column in each partition. 
+ * + * For Example: + * + * sortedValueColumnPairs: + * Partition 1: (1.5, 0) (1.25, 1) (2.0, 2) (5.25, 0) + * Partition 2: (7.5, 1) (9.5, 2) + * + * numOfColumns: 3 + * + * The output will be: + * [(0, [2, 1, 1]), (1, [0, 1, 1])] + * + * @param sortedValueColumnPairs - sorted RDD of (value, column Index) pairs + * @param numOfColumns the number of columns + * + * @return Array that contains (partition index, number of elements from every column on this partition) + */ + private static List>> getColumnsFreqPerPartition(JavaPairRDD sortedValueColumnPairs, int numOfColumns) { + List>> columsFreqPerPartition = + sortedValueColumnPairs.mapPartitionsWithIndex((partitionIndex, valueColumnPairs) -> { + Long[] freq = new Long[numOfColumns]; + Arrays.fill(freq, 0L); + + while(valueColumnPairs.hasNext()) { + int colIndex = valueColumnPairs.next()._2; + freq[colIndex] = freq[colIndex] + 1; + } + + List freqList = Arrays.asList(freq); + List>> partitionList = Arrays.asList(new Tuple2<>(partitionIndex, freqList)); + return partitionList.iterator(); + }, false).collect(); + + return columsFreqPerPartition; + } + + /** + * Step 3: For each Partition determine the index of the elements that are desired rank statistics + * + * For Example: + * targetRanks: 5 + * partitionColumnsFreq: [(0, [2, 3]), (1, [4, 1]), (2, [5, 2])] + * numOfColumns: 2 + * + * The output will be: + * [(0, []), (1, [(colIdx=0, rankLocation=3)]), (2, [(colIndex=1, rankLocation=1)])] + * + * @param partitionColumnsFreq Array of (partition index, columns frequencies per this partition) + * + * @return Array that contains (partition index, relevantIndexList where relevantIndexList(i) = the index + * of an element on this partition that matches one of the target ranks) + */ + private static List>>> getRanksLocationsWithinEachPart(List targetRanks, + List>> partitionColumnsFreq, int numOfColumns) { + + long[] runningTotal = new long[numOfColumns]; + + List>>> ranksLocations = + partitionColumnsFreq + .stream() + .sorted((o1, o2) -> o1._1.compareTo(o2._1)) + .map(partitionIndex_columnsFreq -> { + int partitionIndex = partitionIndex_columnsFreq._1; + List columnsFreq = partitionIndex_columnsFreq._2; + + List> relevantIndexList = new ArrayList<>(); + + zipWithIndex(columnsFreq).stream().forEach(colCount_colIndex -> { + long colCount = colCount_colIndex._1; + int colIndex = colCount_colIndex._2; + + long runningTotalCol = runningTotal[colIndex]; + Stream ranksHere = + targetRanks.stream().filter(rank -> runningTotalCol < rank && runningTotalCol + colCount >= rank); + + // for each of the rank statistics present add this column index and the index it will be at + // on this partition (the rank - the running total) + relevantIndexList.addAll( + ranksHere.map(rank -> new Tuple2<>(colIndex, rank - runningTotalCol)).collect(Collectors.toList())); + + runningTotal[colIndex] += colCount; + }); + + + return new Tuple2<>(partitionIndex, relevantIndexList); + }).collect(Collectors.toList()); + + return ranksLocations; + } + + /** + * Finds rank statistics elements using ranksLocations. 
+ * + * @param sortedValueColumnPairs - sorted RDD of (value, colIndex) pairs + * @param ranksLocations Array of (partition Index, list of (column index, rank index of this column at this partition)) + * + * @return returns RDD of the target ranks (column index, value) + */ + private static JavaPairRDD findTargetRanksIteratively(JavaPairRDD sortedValueColumnPairs, + List>>> ranksLocations) { + + JavaRDD> targetRanks = sortedValueColumnPairs.mapPartitionsWithIndex( + (partitionIndex, valueColumnPairs) -> { + List> targetsInThisPart = ranksLocations.get(partitionIndex)._2; + List> result = new ArrayList<>(); + + if (!targetsInThisPart.isEmpty()) { + Map> columnsRelativeIndex = groupByKey(targetsInThisPart); + Set columnsInThisPart = columnsRelativeIndex.keySet(); + + Map runningTotals = toMap(columnsInThisPart); + + // filter this iterator, so that it contains only those (value, columnIndex) that are the ranks statistics on this partition + // I.e. Keep track of the number of elements we have seen for each columnIndex using the + // running total hashMap. Keep those pairs for which value is the nth element for that columnIndex that appears on this partition + // and the map contains (columnIndex, n). + + while (valueColumnPairs.hasNext()) { + Tuple2 value_colIndex = valueColumnPairs.next(); + double value = value_colIndex._1; + int colIndex = value_colIndex._2; + + if (columnsInThisPart.contains(colIndex)) { + long total = runningTotals.get(colIndex) + 1L; + runningTotals.put(colIndex, total); + if (columnsRelativeIndex.get(colIndex).contains(total)) { + result.add(value_colIndex.swap()); + } + } + } + } + + return result.iterator(); + }, false); + + return targetRanks.mapToPair((PairFunction, Integer, Double>) t -> t); + } + + private static Map toMap(Set set) { + Map map = new HashMap<>(); + for (int k: set) + map.put(k, 0L); + + return map; + } + + private static Map> groupByKey(List> list) { + Map> map = new HashMap<>(); + for (int i = 0; i < list.size(); i++) { + Tuple2 curr = list.get(i); + if (!map.containsKey(curr._1)) + map.put(curr._1, new ArrayList<>()); + + map.get(curr._1).add(curr._2); + } + + return map; + } + + private static List toList(scala.collection.Seq seq) { + return scala.collection.JavaConversions.seqAsJavaList(seq); + } + + private static List> zipWithIndex(List list) { + List> indexedList = new ArrayList<>(); + for (int i = 0; i < list.size(); i++) + indexedList.add(new Tuple2<>(list.get(i), i)); + + return indexedList; + } + +} + diff --git a/src/main/java/com/highperformancespark/examples/goldilocks/JavaGoldiLocksGroupByKey.java b/src/main/java/com/highperformancespark/examples/goldilocks/JavaGoldiLocksGroupByKey.java new file mode 100644 index 0000000..f5f72ce --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/goldilocks/JavaGoldiLocksGroupByKey.java @@ -0,0 +1,32 @@ +package com.highperformancespark.examples.goldilocks; + +import org.apache.spark.api.java.JavaPairRDD; + +import java.util.*; +import java.util.stream.Collectors; + +public class JavaGoldiLocksGroupByKey { + //tag::groupByKey[] + public Map> findRankStatistics( + JavaPairRDD pairRDD, List ranks) { + + Map> element_ranks = pairRDD.groupByKey().mapValues(iter -> { + List values = new ArrayList<>(); + Iterator iterator = iter.iterator(); + while (iterator.hasNext()) + values.add(iterator.next()); + Collections.sort(values); + + List result = + ranks.stream() + .map(n -> values.get(new Long(n).intValue())) + .collect(Collectors.toList()); + + return result; + }).collectAsMap(); + + 
return element_ranks; + } + //end::groupByKey[] + +} diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java b/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java new file mode 100644 index 0000000..e3f5325 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java @@ -0,0 +1,29 @@ +package com.highperformancespark.examples.objects; + +import java.io.Serializable; + +public class JavaCoffeeShop implements Serializable { + private String zip; + private String name; + + public JavaCoffeeShop(String zip, String name) { + this.zip = zip; + this.name = name; + } + + public String getZip() { + return zip; + } + + public void setZip(String zip) { + this.zip = zip; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } +} \ No newline at end of file diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaGoldiLocksRow.java b/src/main/java/com/highperformancespark/examples/objects/JavaGoldiLocksRow.java new file mode 100644 index 0000000..82cafe9 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/objects/JavaGoldiLocksRow.java @@ -0,0 +1,49 @@ +package com.highperformancespark.examples.objects; + +import java.io.Serializable; + +public class JavaGoldiLocksRow implements Serializable { + private double a; + private double b; + private double c; + private double d; + + public JavaGoldiLocksRow(double a, double b, double c, double d) { + this.a = a; + this.b = b; + this.c = c; + this.d = d; + } + + public double getA() { + return a; + } + + public void setA(double a) { + this.a = a; + } + + public double getB() { + return b; + } + + public void setB(double b) { + this.b = b; + } + + public double getC() { + return c; + } + + public void setC(double c) { + this.c = c; + } + + public double getD() { + return d; + } + + public void setD(double d) { + this.d = d; + } +} \ No newline at end of file diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java b/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java new file mode 100644 index 0000000..c2b7847 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java @@ -0,0 +1,56 @@ +package com.highperformancespark.examples.objects; + +import java.io.Serializable; + +public class JavaPandaInfo implements Serializable { + private String place; + private String pandaType; + private int happyPandas; + private int totalPandas; + + /** + * @param place name of place + * @param pandaType type of pandas in this place + * @param happyPandas number of happy pandas in this place + * @param totalPandas total number of pandas in this place + */ + public JavaPandaInfo(String place, String pandaType, int happyPandas, int totalPandas) { + this.place = place; + this.pandaType = pandaType; + this.happyPandas = happyPandas; + this.totalPandas = totalPandas; + } + + public String getPlace() { + return place; + } + + public void setPlace(String place) { + this.place = place; + } + + public String getPandaType() { + return pandaType; + } + + public void setPandaType(String pandaType) { + this.pandaType = pandaType; + } + + public int getHappyPandas() { + return happyPandas; + } + + public void setHappyPandas(int happyPandas) { + this.happyPandas = happyPandas; + } + + public int getTotalPandas() { + return totalPandas; + } + + public void setTotalPandas(int totalPandas) { + this.totalPandas = totalPandas; + } 
+ +} diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java b/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java new file mode 100644 index 0000000..dc33d9c --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java @@ -0,0 +1,34 @@ +package com.highperformancespark.examples.objects; + +import java.io.Serializable; +import java.util.List; + +public class JavaPandaPlace implements Serializable { + private String name; + private List pandas; + + /** + * @param name place name + * @param pandas pandas in that place + */ + public JavaPandaPlace(String name, List pandas) { + this.name = name; + this.pandas = pandas; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public List getPandas() { + return pandas; + } + + public void setPandas(List pandas) { + this.pandas = pandas; + } +} \ No newline at end of file diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java b/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java new file mode 100644 index 0000000..f73e93f --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java @@ -0,0 +1,56 @@ +package com.highperformancespark.examples.objects; + +import java.io.Serializable; + +public class JavaPandas implements Serializable { + private String name; + private String zip; + private int pandaSize; + private int age; + + /** + * @param name name of panda + * @param zip zip code + * @param pandaSize size of panda in KG + * @param age age of panda + */ + public JavaPandas(String name, String zip, int pandaSize, int age) { + this.name = name; + this.zip = zip; + this.pandaSize = pandaSize; + this.age = age; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public String getZip() { + return zip; + } + + public void setZip(String zip) { + this.zip = zip; + } + + public int getPandaSize() { + return pandaSize; + } + + public void setPandaSize(int pandaSize) { + this.pandaSize = pandaSize; + } + + public int getAge() { + return age; + } + + public void setAge(int age) { + this.age = age; + } + +} diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java b/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java new file mode 100644 index 0000000..7d2be17 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java @@ -0,0 +1,67 @@ +package com.highperformancespark.examples.objects; + +import java.io.Serializable; +import java.util.List; + +public class JavaRawPanda implements Serializable { + private long id; + private String zip; + private String pt; + private boolean happy; + private List attributes; + + /** + * @param id panda id + * @param zip zip code of panda residence + * @param pt Type of panda as a string + * @param happy if panda is happy + * @param attributes array of panada attributes + */ + public JavaRawPanda(long id, String zip, String pt, boolean happy, List attributes) { + this.attributes = attributes; + this.id = id; + this.zip = zip; + this.pt = pt; + this.happy = happy; + } + + public long getId() { + return id; + } + + public void setId(long id) { + this.id = id; + } + + public String getZip() { + return zip; + } + + public void setZip(String zip) { + this.zip = zip; + } + + public String getPt() { + return pt; + } + + public void setPt(String pt) { + 
this.pt = pt; + } + + public boolean isHappy() { + return happy; + } + + public void setHappy(boolean happy) { + this.happy = happy; + } + + public List getAttributes() { + return attributes; + } + + public void setAttributes(List attributes) { + this.attributes = attributes; + } +} \ No newline at end of file diff --git a/src/main/scala/com/high-performance-spark-examples/GoldiLocks/GoldiLocksFirstTry.scala b/src/main/scala/com/high-performance-spark-examples/GoldiLocks/GoldiLocksFirstTry.scala new file mode 100644 index 0000000..3d9eb94 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/GoldiLocks/GoldiLocksFirstTry.scala @@ -0,0 +1,211 @@ +package com.highperformancespark.examples.goldilocks + +import scala.collection.{Map, mutable} +import scala.collection.mutable.{ArrayBuffer, MutableList} + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame +import org.apache.spark.storage.StorageLevel + +object GoldiLocksGroupByKey { + //tag::groupByKey[] + def findRankStatistics( + pairRDD: RDD[(Int, Double)], + ranks: List[Long]): Map[Int, List[Double]] = { + pairRDD.groupByKey().mapValues(iter => { + val ar = iter.toArray.sorted + ranks.map(n => ar(n.toInt)) + }).collectAsMap() + } + //end::groupByKey[] +} + +//tag::firstTry[] +object GoldiLocksFirstTry { + + /** + * Find nth target rank for every column. + * + * For example: + * + * dataframe: + * (0.0, 4.5, 7.7, 5.0) + * (1.0, 5.5, 6.7, 6.0) + * (2.0, 5.5, 1.5, 7.0) + * (3.0, 5.5, 0.5, 7.0) + * (4.0, 5.5, 0.5, 8.0) + * + * targetRanks: + * 1, 3 + * + * The output will be: + * 0 -> (0.0, 2.0) + * 1 -> (4.5, 5.5) + * 2 -> (7.7, 1.5) + * 3 -> (5.0, 7.0) + * + * @param dataframe dataframe of doubles + * @param targetRanks the required ranks for every column + * @return map of (column index, list of target ranks) + */ + def findRankStatistics(dataframe: DataFrame, targetRanks: List[Long]): + Map[Int, Iterable[Double]] = { + + val valueColumnPairs: RDD[(Double, Int)] = getValueColumnPairs(dataframe) + val sortedValueColumnPairs = valueColumnPairs.sortByKey() + sortedValueColumnPairs.persist(StorageLevel.MEMORY_AND_DISK) + + val numOfColumns = dataframe.schema.length + val partitionColumnsFreq = getColumnsFreqPerPartition(sortedValueColumnPairs, numOfColumns) + val ranksLocations = getRanksLocationsWithinEachPart(targetRanks, partitionColumnsFreq, numOfColumns) + + val targetRanksValues = findTargetRanksIteratively(sortedValueColumnPairs, ranksLocations) + targetRanksValues.groupByKey().collectAsMap() + } + + /** + * Step 1. Map the rows to pairs of (value, column Index). + * + * For example: + * + * dataFrame: + * 1.5, 1.25, 2.0 + * 5.25, 2.5, 1.5 + * + * The output RDD will be: + * (1.5, 0) (1.25, 1) (2.0, 2) (5.25, 0) (2.5, 1) (1.5, 2) + * + * @param dataframe dateframe of doubles + * + * @return RDD of pairs (value, column Index) + */ + private def getValueColumnPairs(dataframe : DataFrame): RDD[(Double, Int)] = { + dataframe.flatMap(row => row.toSeq.zipWithIndex.map{ case (v, index) => + (v.toString.toDouble, index)}) + } + + /** + * Step 2. Find the number of elements for each column in each partition. 
+ * + * For Example: + * + * sortedValueColumnPairs: + * Partition 1: (1.5, 0) (1.25, 1) (2.0, 2) (5.25, 0) + * Partition 2: (7.5, 1) (9.5, 2) + * + * numOfColumns: 3 + * + * The output will be: + * [(0, [2, 1, 1]), (1, [0, 1, 1])] + * + * @param sortedValueColumnPairs - sorted RDD of (value, column Index) pairs + * @param numOfColumns the number of columns + * + * @return Array that contains (partition index, number of elements from every column on this partition) + */ + private def getColumnsFreqPerPartition(sortedValueColumnPairs: RDD[(Double, Int)], numOfColumns : Int): + Array[(Int, Array[Long])] = { + + val zero = Array.fill[Long](numOfColumns)(0) + + def aggregateColumnFrequencies (partitionIndex : Int, valueColumnPairs : Iterator[(Double, Int)]) = { + val columnsFreq : Array[Long] = valueColumnPairs.aggregate(zero)( + (a : Array[Long], v : (Double ,Int)) => { + val (value, colIndex) = v + a(colIndex) = a(colIndex) + 1L + a + }, + (a : Array[Long], b : Array[Long]) => { + a.zip(b).map{ case(aVal, bVal) => aVal + bVal} + }) + + Iterator((partitionIndex, columnsFreq)) + } + + sortedValueColumnPairs.mapPartitionsWithIndex(aggregateColumnFrequencies).collect() + } + + /** + * Step 3: For each Partition determine the index of the elements that are desired rank statistics + * + * For Example: + * targetRanks: 5 + * partitionColumnsFreq: [(0, [2, 3]), (1, [4, 1]), (2, [5, 2])] + * numOfColumns: 2 + * + * The output will be: + * [(0, []), (1, [(colIdx=0, rankLocation=3)]), (2, [(colIndex=1, rankLocation=1)])] + * + * @param partitionColumnsFreq Array of (partition index, columns frequencies per this partition) + * + * @return Array that contains (partition index, relevantIndexList where relevantIndexList(i) = the index + * of an element on this partition that matches one of the target ranks) + */ + private def getRanksLocationsWithinEachPart(targetRanks : List[Long], + partitionColumnsFreq : Array[(Int, Array[Long])], + numOfColumns : Int) : Array[(Int, List[(Int, Long)])] = { + + val runningTotal = Array.fill[Long](numOfColumns)(0) + + partitionColumnsFreq.sortBy(_._1).map { case (partitionIndex, columnsFreq) => + val relevantIndexList = new MutableList[(Int, Long)]() + + columnsFreq.zipWithIndex.foreach{ case (colCount, colIndex) => + val runningTotalCol = runningTotal(colIndex) + val ranksHere: List[Long] = targetRanks.filter(rank => + runningTotalCol < rank && runningTotalCol + colCount >= rank) + + // for each of the rank statistics present add this column index and the index it will be at + // on this partition (the rank - the running total) + relevantIndexList ++= ranksHere.map(rank => (colIndex, rank - runningTotalCol)) + + runningTotal(colIndex) += colCount + } + + (partitionIndex, relevantIndexList.toList) + } + } + + /** + * Finds rank statistics elements using ranksLocations. 
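+ *
+ * For example, if ranksLocations contains (1, List((0, 3))), then on partition 1 this
+ * keeps the 3rd value seen for column 0 and emits it as (0, value).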
+ * + * @param sortedValueColumnPairs - sorted RDD of (value, colIndex) pairs + * @param ranksLocations Array of (partition Index, list of (column index, rank index of this column at this partition)) + * + * @return returns RDD of the target ranks (column index, value) + */ + private def findTargetRanksIteratively(sortedValueColumnPairs : RDD[(Double, Int)], + ranksLocations : Array[(Int, List[(Int, Long)])]): RDD[(Int, Double)] = { + + sortedValueColumnPairs.mapPartitionsWithIndex((partitionIndex : Int, valueColumnPairs : Iterator[(Double, Int)]) => { + val targetsInThisPart: List[(Int, Long)] = ranksLocations(partitionIndex)._2 + if (targetsInThisPart.nonEmpty) { + val columnsRelativeIndex: Map[Int, List[Long]] = targetsInThisPart.groupBy(_._1).mapValues(_.map(_._2)) + val columnsInThisPart = targetsInThisPart.map(_._1) + + val runningTotals : mutable.HashMap[Int, Long]= new mutable.HashMap() + runningTotals ++= columnsInThisPart.map(columnIndex => (columnIndex, 0L)).toMap + + // filter this iterator, so that it contains only those (value, columnIndex) that are the ranks statistics on this partition + // I.e. Keep track of the number of elements we have seen for each columnIndex using the + // running total hashMap. Keep those pairs for which value is the nth element for that columnIndex that appears on this partition + // and the map contains (columnIndex, n). + valueColumnPairs.filter{ + case(value, colIndex) => + //rely on lazy evaluation. If we have already seen this column index, then evalute this + // block in which we increment the running totals and return if this element's count appears in the map. + lazy val thisPairIsTheRankStatistic: Boolean = { + val total = runningTotals(colIndex) + 1L + runningTotals.update(colIndex, total) + columnsRelativeIndex(colIndex).contains(total) + } + (runningTotals contains colIndex) && thisPairIsTheRankStatistic + }.map(_.swap) + } + else { + Iterator.empty + } + }) + } +} +//end::firstTry[] diff --git a/src/main/scala/com/high-performance-spark-examples/GoldiLocks/GoldiLocksWithHashMap.scala b/src/main/scala/com/high-performance-spark-examples/GoldiLocks/GoldiLocksWithHashMap.scala new file mode 100644 index 0000000..fc37f31 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/GoldiLocks/GoldiLocksWithHashMap.scala @@ -0,0 +1,326 @@ +package com.highperformancespark.examples.goldilocks + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame +import org.apache.spark.storage.StorageLevel + +import scala.Predef +import scala.collection.{mutable, Map} +import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.MutableList + +//tag::hashMap[] +object GoldiLocksWithHashMap { + + /** + * Find nth target rank for every column. 
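+ *
+ * A hypothetical call site (assuming an existing DataFrame `df` of double columns;
+ * the name is illustrative) could look like:
+ * {{{
+ * val ranks = GoldiLocksWithHashMap.findRankStatistics(df, targetRanks = List(2L, 4L))
+ * }}}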
+ * + * For example: + * + * dataframe: + * (0.0, 4.5, 7.7, 5.0) + * (1.0, 5.5, 6.7, 6.0) + * (2.0, 5.5, 1.5, 7.0) + * (3.0, 5.5, 0.5, 7.0) + * (4.0, 5.5, 0.5, 8.0) + * + * targetRanks: + * 1, 3 + * + * The output will be: + * 0 -> (0.0, 2.0) + * 1 -> (4.5, 5.5) + * 2 -> (7.7, 1.5) + * 3 -> (5.0, 7.0) + * + * @param dataFrame dataframe of doubles + * @param targetRanks the required ranks for every column + * + * @return map of (column index, list of target ranks) + */ + def findRankStatistics(dataFrame: DataFrame, targetRanks: List[Long]): + Map[Int, Iterable[Double]] = { + + val aggregatedValueColumnPairs: RDD[((Double, Int), Long)] = getAggregatedValueColumnPairs(dataFrame) + val sortedAggregatedValueColumnPairs = aggregatedValueColumnPairs.sortByKey() + sortedAggregatedValueColumnPairs.persist(StorageLevel.MEMORY_AND_DISK) + + val numOfColumns = dataFrame.schema.length + val partitionColumnsFreq = getColumnsFreqPerPartition(sortedAggregatedValueColumnPairs, numOfColumns) + val ranksLocations = getRanksLocationsWithinEachPart(targetRanks, partitionColumnsFreq, numOfColumns) + + val targetRanksValues = findTargetRanksIteratively(sortedAggregatedValueColumnPairs, ranksLocations) + targetRanksValues.groupByKey().collectAsMap() + } + + /** + * Step 1. Map the rows to pairs of ((value, colIndex), count) where count is the number of times + * that value and that pair appear on this partition + * + * For example: + * + * dataFrame: + * 1.5, 1.25, 2.0 + * 1.5, 2.5, 2.0 + * + * The output RDD will be: + * ((1.5, 0), 2) ((1.25, 1), 1) ((2.5, 1), 1) ((2.0, 2), 2) + * + * @param dataFrame of double columns to compute the rank statistics for + * + * @return returns RDD of ((value, column index), count) + */ + def getAggregatedValueColumnPairs(dataFrame : DataFrame) : RDD[((Double, Int), Long)] = { + val aggregatedValueColumnRDD = dataFrame.rdd.mapPartitions(rows => { + val valueColumnMap = new mutable.HashMap[(Double, Int), Long]() + rows.foreach(row => { + row.toSeq.zipWithIndex.foreach{ case (value, columnIndex) => { + val key = (value.toString.toDouble, columnIndex) + val count = valueColumnMap.getOrElseUpdate(key, 0) + valueColumnMap.update(key, count + 1) + }} + }) + + valueColumnMap.toIterator + }) + + aggregatedValueColumnRDD + } + + /** + * Step 2. Find the number of elements for each column in each partition. 
+ * + * For Example: + * + * sortedValueColumnPairs: + * Partition 1: ((1.5, 0), 2) ((2.0, 0), 1) + * Partition 2: ((4.0, 0), 3) ((3.0, 1), 1) + * + * numOfColumns: 3 + * + * The output will be: + * [(0, [3, 0]), (1, [3, 1])] + * + * @param sortedAggregatedValueColumnPairs - sortedAggregatedValueColumnPairs RDD of ((value, column index), count) + * @param numOfColumns the number of columns + * + * @return Array that contains (partition index, number of elements from every column on this partition) + */ + private def getColumnsFreqPerPartition(sortedAggregatedValueColumnPairs: RDD[((Double, Int), Long)], + numOfColumns : Int): Array[(Int, Array[Long])] = { + + val zero = Array.fill[Long](numOfColumns)(0) + def aggregateColumnFrequencies(partitionIndex : Int, pairs : Iterator[((Double, Int), Long)]) = { + val columnsFreq : Array[Long] = pairs.aggregate(zero)( + (a : Array[Long], v : ((Double,Int), Long)) => { + val ((value, colIndex), count) = v + a(colIndex) = a(colIndex) + count + a}, + (a : Array[Long], b : Array[Long]) => { + a.zip(b).map{ case(aVal, bVal) => aVal + bVal} + }) + + Iterator((partitionIndex, columnsFreq)) + } + + sortedAggregatedValueColumnPairs.mapPartitionsWithIndex(aggregateColumnFrequencies).collect() + } + + /** + * Step 3: For each Partition determine the index of the elements that are desired rank statistics + * + * For Example: + * targetRanks: 5 + * partitionColumnsFreq: [(0, [2, 3]), (1, [4, 1]), (2, [5, 2])] + * numOfColumns: 2 + * + * The output will be: + * [(0, []), (1, [(0, 3)]), (2, [(1, 1)])] + * + * @param partitionColumnsFreq Array of (partition index, columns frequencies per this partition) + * + * @return Array that contains (partition index, relevantIndexList where relevantIndexList(i) = the index + * of an element on this partition that matches one of the target ranks) + */ + private def getRanksLocationsWithinEachPart(targetRanks : List[Long], + partitionColumnsFreq : Array[(Int, Array[Long])], + numOfColumns : Int) : Array[(Int, List[(Int, Long)])] = { + + val runningTotal = Array.fill[Long](numOfColumns)(0) + + partitionColumnsFreq.sortBy(_._1).map { case (partitionIndex, columnsFreq)=> { + val relevantIndexList = new MutableList[(Int, Long)]() + + columnsFreq.zipWithIndex.foreach{ case (colCount, colIndex) => { + val runningTotalCol = runningTotal(colIndex) + + val ranksHere: List[Long] = targetRanks.filter(rank => + (runningTotalCol < rank && runningTotalCol + colCount >= rank)) + relevantIndexList ++= ranksHere.map(rank => (colIndex, rank - runningTotalCol)) + + runningTotal(colIndex) += colCount + }} + + (partitionIndex, relevantIndexList.toList) + }} + } + + /** + * Finds rank statistics elements using ranksLocations. 
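+ *
+ * mapPartitionsWithIndex is used so that each partition can look up, by its index, the
+ * (column index, rank-on-partition) targets that the previous step located inside it,
+ * and scan only for those.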
+ * + * @param sortedAggregatedValueColumnPairs - sorted RDD of (value, colIndex) pairs + * @param ranksLocations Array of (partition Index, list of (column index, rank index of this column at this partition)) + * + * @return returns RDD of the target ranks (column index, value) + */ + //tag::mapPartitionsExample[] + private def findTargetRanksIteratively(sortedAggregatedValueColumnPairs : RDD[((Double, Int), Long)], + ranksLocations : Array[(Int, List[(Int, Long)])] + ): RDD[(Int, Double)] = { + + sortedAggregatedValueColumnPairs.mapPartitionsWithIndex((partitionIndex : Int, + aggregatedValueColumnPairs : Iterator[((Double, Int), Long)]) => { + + val targetsInThisPart: List[(Int, Long)] = ranksLocations(partitionIndex)._2 + if (!targetsInThisPart.isEmpty) { + FindTargetsSubRoutine.asIteratorToIteratorTransformation(aggregatedValueColumnPairs, + targetsInThisPart) + } + else Iterator.empty + }) + } + //end::mapPartitionsExample[] + /** + * We will want to use this in some chapter where we talk about check pointing + * @param valPairs + * @param colIndexList + * @param targetRanks + * @param storageLevel + * @param checkPoint + * @param directory + * @return + */ + def findQuantilesWithCustomStorage(valPairs: RDD[((Double, Int), Long)], + colIndexList: List[Int], + targetRanks: List[Long], + storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK, + checkPoint : Boolean, directory : String = "") = { + + val n = colIndexList.last+1 + val sorted = valPairs.sortByKey() + if (storageLevel != StorageLevel.NONE) + sorted.persist(storageLevel) + + if (checkPoint) { + sorted.sparkContext.setCheckpointDir(directory) + sorted.checkpoint() + } + + val partitionColumnsFreq = getColumnsFreqPerPartition(sorted, n) + val ranksLocations = getRanksLocationsWithinEachPart(targetRanks, partitionColumnsFreq, n) + val targetRanksValues = findTargetRanksIteratively(sorted, ranksLocations) + targetRanksValues.groupByKey().collectAsMap() + } +} +//end::hashMap[] + + +object FindTargetsSubRoutine extends Serializable { + + //tag::notIter[] + /** + * This sub routine returns an Iterator of (columnIndex, value) that correspond to one of the + desired rank statistics on this partition. + + Because in the original iterator, the pairs are distinct + and include the count, one row of the original iterator could map to multiple elements in the output. + I.e. if we were looking for the 2nd and 3rd element in column index 4 on this partition. And the head + of this partition is ((3249.0, 4), 23) (i.e. the element 3249.0 in the 4 th column appears 23 times), + then we would output (4, 3249.0) twice in the final iterator. Once because 3249.0 is the 2nd element and + once because it is the third element on that partition for that column index + and we are looking for both the second and third element. + + * @param valueColumnPairsIter - passed in from the mapPartitions function. An iterator of the sorted + * ((value, columnIndex), count) tupples. + * @param targetsInThisPart - (columnIndex, index-on-partition pairs). In the above example this would + * include (4, 2) and (4,3) since we desire the 2nd element for column + * index 4 on this partition and the 3rd element. 
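+ *
+ * Because the matches are accumulated in an ArrayBuffer, every matching
+ * (columnIndex, value) pair for this partition is held in memory before the iterator
+ * is returned; the iterator-to-iterator version below stays lazy and avoids that.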
+ * @return All of the rank statistics that live in this partition as an iterator of (columnIndex, value pairs) + */ + def withArrayBuffer(valueColumnPairsIter : Iterator[((Double, Int), Long)], + targetsInThisPart: List[(Int, Long)] ): Iterator[(Int, Double)] = { + + val columnsRelativeIndex: Predef.Map[Int, List[Long]] = targetsInThisPart.groupBy(_._1).mapValues(_.map(_._2)) + + //the column indices of the pairs that are desired rank statistics that live in this partition. + val columnsInThisPart: List[Int] = targetsInThisPart.map(_._1).distinct + + //a HashMap with the running totals of each column index. As we loop through the iterator + //we will update the hashmap as we see elements of each column index. + val runningTotals : mutable.HashMap[Int, Long]= new mutable.HashMap() + runningTotals ++= columnsInThisPart.map(columnIndex => (columnIndex, 0L)).toMap + + //we use an array buffer to build the resulting iterator + val result: ArrayBuffer[(Int, Double)] = new scala.collection.mutable.ArrayBuffer() + + valueColumnPairsIter.foreach { + case ((value, colIndex), count) => + + if (columnsInThisPart contains colIndex) { + + val total = runningTotals(colIndex) + //the ranks that are contains by this element of the input iterator. + //get by filtering the + val ranksPresent = columnsRelativeIndex(colIndex) + .filter(index => (index <= count + total) && (index > total)) + + ranksPresent.foreach(r => result += ((colIndex, value))) + + //update the running totals. + runningTotals.update(colIndex, total + count) + } + } + //convert + result.toIterator + } + //end::notIter[] + + //tag::iterToIter[] + /** + * Same function as above but rather than building the result from an array buffer we use + * a flatMap on the iterator to get the resulting iterator. + */ + def asIteratorToIteratorTransformation(valueColumnPairsIter : Iterator[((Double, Int), Long)], + targetsInThisPart: List[(Int, Long)] ): Iterator[(Int, Double)] = { + + val columnsRelativeIndex = targetsInThisPart.groupBy(_._1).mapValues(_.map(_._2)) + val columnsInThisPart = targetsInThisPart.map(_._1).distinct + + val runningTotals : mutable.HashMap[Int, Long]= new mutable.HashMap() + runningTotals ++= columnsInThisPart.map(columnIndex => (columnIndex, 0L)).toMap + + //filter out the pairs that don't have a column index that is in this part + val pairsWithRanksInThisPart = valueColumnPairsIter.filter{ + case (((value, colIndex), count)) => + columnsInThisPart contains colIndex + } + + //map the valueColumn pairs to a list of (colIndex, value) pairs that correspond to one of the + //desired rank statistics on this partition. 
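+ //flatMap over an iterator is lazy, so no intermediate collection is materialized here:
+ //each incoming ((value, colIndex), count) is expanded into zero or more (colIndex, value)
+ //pairs only as the downstream consumer pulls from the returned iterator.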
+ pairsWithRanksInThisPart.flatMap{ + + case (((value, colIndex), count)) => + + val total = runningTotals(colIndex) + val ranksPresent: List[Long] = columnsRelativeIndex(colIndex) + .filter(index => (index <= count + total) && (index > total)) + + val nextElems: Iterator[(Int, Double)] = ranksPresent.map(r => (colIndex, value)).toIterator + + //update the running totals + runningTotals.update(colIndex, total + count) + nextElems + } + } + //end::iterToIter[] +} \ No newline at end of file diff --git a/src/main/scala/com/high-performance-spark-examples/GoldiLocks/RDDJoinExamples.scala b/src/main/scala/com/high-performance-spark-examples/GoldiLocks/RDDJoinExamples.scala new file mode 100644 index 0000000..9578ea5 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/GoldiLocks/RDDJoinExamples.scala @@ -0,0 +1,96 @@ +package com.highperformancespark.examples.goldilocks + +import org.apache.spark.HashPartitioner +import org.apache.spark.rdd.RDD + +object RDDJoinExamples { + + /* For Example, suppose we have one RDD with some data in the form (Panda id, score) + and another RDD with (Panda id, address), and we want to send each Panda some mail + with her best score. We could join the RDDs on ID and then compute the best score + for each address. Like this: + + 'ToDo: Insert Example' + + However, this is slower than first reducing the score data, so that the + //first dataset contains only one row for each Panda with her best score and then + //joining that data with the address data. + + 'ToDO: Insert an example of this' */ + //tag::joinScoresWithAddress[] + def joinScoresWithAddress1( scoreRDD : RDD[(Long, Double)], + addressRDD : RDD[(Long, String )]) : RDD[(Long, (Double, String))]= { + val joinedRDD = scoreRDD.join(addressRDD) + joinedRDD.reduceByKey( (x, y) => if(x._1 > y._1) x else y ) + } + //end::joinScoresWithAddress[] + + //tag::leftOuterJoinScoresWithAddress[] + def outerJoinScoresWithAddress( scoreRDD : RDD[(Long, Double)], + addressRDD : RDD[(Long, String )]) : RDD[(Long, (Double, Option[String]))]= { + val joinedRDD = scoreRDD.leftOuterJoin(addressRDD) + joinedRDD.reduceByKey( (x, y) => if(x._1 > y._1) x else y ) + } + //end::leftOuterJoinScoresWithAddress[] + + //tag::joinScoresWithAddressFast[] + def joinScoresWithAddress2( scoreRDD : RDD[(Long, Double)], + addressRDD : RDD[(Long, String )]) : RDD[(Long, (Double, String))]= { + //stuff + val bestScoreData = scoreRDD.reduceByKey((x, y) => if(x > y) x else y) + bestScoreData.join(addressRDD) + + } + //end::joinScoresWithAddressFast[] +/* + We could make the example in the previous section even faster, + by using the partitioner for the address data as an argument for + the reduce by key step. + 'ToDO: Insert the code to show this here' */ + //tag::joinScoresWithAddress3[] + def joinScoresWithAddress3( scoreRDD : RDD[(Long, Double)], + addressRDD : RDD[(Long, String )]) : RDD[(Long, (Double, String))]= { + //if addressRDD has a known partitioner we should use that, + //otherwise it has a default hash parttioner, which we can reconstrut by getting the umber of + // partitions. 
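+ //Reusing the address data's partitioner for the reduceByKey below means the reduced
+ //score data comes out already partitioned the way the join wants it, so the join that
+ //follows does not have to shuffle the reduced data a second time.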
+ val addressDataPartitioner = addressRDD.partitioner match { + case (Some(p)) => p + case (None) => new HashPartitioner(addressRDD.partitions.length) + } + val bestScoreData = scoreRDD.reduceByKey(addressDataPartitioner, (x, y) => if(x > y) x else y) + bestScoreData.join(addressRDD) + } + //end::joinScoresWithAddress3[] + + def debugString( scoreRDD : RDD[(Long, Double)], + addressRDD : RDD[(Long, String )]) = { + //tag::debugString[] + scoreRDD.join(addressRDD).toDebugString + //end::debugString[] + } + + /* + * Suppose we had two datasets of information about each panda, + * one with the scores, and one with there favorite foods. + * We could use cogroup to associate each Pandas id with an iterator + * of their scores and another iterator of their favorite foods. + */ + + + def coGroupExample( scoreRDD : RDD[(Long, Double)], foodRDD : RDD[(Long, String )], + addressRDD : RDD[(Long, String )]) = { + //tag::coGroupExample1[] + val cogroupedRDD: RDD[(Long, (Iterable[Double], Iterable[String]))] = scoreRDD.cogroup(foodRDD) + //end::coGroupExample1[] + + /* + * For example, if we needed to join the panda score data with both address + * and favorite foods, it would be better to use co group than two + * join operations. + */ + + //tag::coGroupExample2[] + val addressScoreFood = addressRDD.cogroup(scoreRDD, foodRDD) + //end::coGroupExample2[] + } + } diff --git a/src/main/scala/com/high-performance-spark-examples/GoldiLocks/SecondarySort.scala b/src/main/scala/com/high-performance-spark-examples/GoldiLocks/SecondarySort.scala new file mode 100644 index 0000000..d7d92a0 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/GoldiLocks/SecondarySort.scala @@ -0,0 +1,102 @@ +package com.highperformancespark.examples.goldilocks + +import scala.collection.mutable.ArrayBuffer +import scala.reflect.ClassTag + +import org.apache.spark.{HashPartitioner, Partitioner} +import org.apache.spark.rdd.RDD + +object SecondarySort { + + //tag::sortByTwoKeys[] + def sortByTwoKeys[K : Ordering : ClassTag , S, V : ClassTag](pairRDD : RDD[((K, S), V)], partitions : Int ) = { + val colValuePartitioner = new PrimaryKeyPartitioner[K, S](partitions) + implicit val ordering: Ordering[(K, S)] = Ordering.by(_._1) + val sortedWithinParts = pairRDD.repartitionAndSortWithinPartitions( + colValuePartitioner) + sortedWithinParts + } + //end::sortByTwoKeys[] + + //tag::sortAndGroup[] + def groupByKeyAndSortBySecondaryKey[K : Ordering : ClassTag, S, V : ClassTag](pairRDD : RDD[((K, S), V)], partitions : Int ) = { + val colValuePartitioner = new PrimaryKeyPartitioner[Double, Int](partitions) + implicit val ordering: Ordering[(K, S)] = Ordering.by(_._1) + val sortedWithinParts = pairRDD.repartitionAndSortWithinPartitions( + colValuePartitioner) + sortedWithinParts.mapPartitions( iter => groupSorted[K, S, V](iter) ) + } + + def groupSorted[K,S,V]( + it: Iterator[((K, S), V)]): Iterator[(K, List[(S, V)])] = { + val res = List[(K, ArrayBuffer[(S, V)])]() + it.foldLeft(res)((list, next) => list match { + case Nil => + val ((firstKey, secondKey), value) = next + List((firstKey, ArrayBuffer((secondKey, value)))) + + case head :: rest => + val (curKey, valueBuf) = head + val ((firstKey, secondKey), value) = next + if (!firstKey.equals(curKey) ) { + (firstKey, ArrayBuffer((secondKey, value))) :: list + } else { + valueBuf.append((secondKey, value)) + list + } + + }).map { case (key, buf) => (key, buf.toList) }.iterator + } + //end::sortAndGroup[] + +} + +//tag::primaryKeyPartitioner[] +class PrimaryKeyPartitioner[K, 
S](partitions: Int) extends Partitioner { + /** + * We create a hash partitioner and use it with the first set of keys. + */ + val delegatePartitioner = new HashPartitioner(partitions) + + override def numPartitions = delegatePartitioner.numPartitions + + /** + * Partition according to the hash value of the first key + */ + override def getPartition(key: Any): Int = { + val k = key.asInstanceOf[(K, S)] + delegatePartitioner.getPartition(k._1) + } +} +//end::primaryKeyPartitioner[] + +object CoPartitioningLessons { + + def coLocated(a : RDD[(Int, String)], b : RDD[(Int, String)], + partitionerX : Partitioner, partitionerY :Partitioner): Unit = { + + //tag::coLocated + val rddA = a.partitionBy(partitionerX) + rddA.cache() + val rddB = b.partitionBy(partitionerY) + rddB.cache() + val rddC = a.cogroup(b) + rddC.count() + //end::coLocated[] + } + + def notCoLocated(a : RDD[(Int, String)], b : RDD[(Int, String )], + partitionerX : Partitioner, partitionerY :Partitioner): Unit = { + + //tag::notCoLocated + val rddA = a.partitionBy(partitionerX) + rddA.cache() + val rddB = b.partitionBy(partitionerY) + rddB.cache() + val rddC = a.cogroup(b) + rddA.count() + rddB.count() + rddC.count() + //end::notCoLocated[] + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala new file mode 100644 index 0000000..f91728d --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala @@ -0,0 +1,302 @@ +/** + * Happy Panda Example for DataFrames. Computes the % of happy pandas. Very contrived. + */ +package com.highperformancespark.examples.dataframe + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +//tag::sparkSQLImports[] +import org.apache.spark.sql.{DataFrame, SQLContext, Row} +import org.apache.spark.sql.catalyst.expressions.aggregate._ +import org.apache.spark.sql.expressions._ +import org.apache.spark.sql.functions._ +//end::sparkSQLImports[] + +//tag::sparkHiveImports[] +// Additional imports for using HiveContext +import org.apache.spark.sql.hive._ +import org.apache.spark.sql.hive.thriftserver._ +//end::sparkHiveImports[] + +object HappyPandas { + /** + * Creates SQLContext with an existing SparkContext. + */ + def sqlContext(sc: SparkContext): SQLContext = { + //tag::createSQLContext[] + val sqlContext = new SQLContext(sc) + // Import the implicits, unlike in core Spark the implicits are defined on the context + import sqlContext.implicits._ + //end::createSQLContext[] + sqlContext + } + + /** + * Creates HiveContext with an existing SparkContext. + */ + def hiveContext(sc: SparkContext): HiveContext = { + //tag::createHiveContext[] + val hiveContext = new HiveContext(sc) + // Import the implicits, unlike in core Spark the implicits are defined on the context + import hiveContext.implicits._ + //end::createHiveContext[] + hiveContext + } + + /** + * Illustrate loading some JSON data. 
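+ * The JSON reader infers the schema by sampling the input; the samplingRatio option
+ * used below controls how much of the input is read for inference (1.0 scans all of it).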
+ */ + def loadDataSimple(sc: SparkContext, sqlCtx: SQLContext, path: String): DataFrame = { + //tag::loadPandaJSONSimple[] + val df1 = sqlCtx.read.json(path) + //end::loadPandaJSONSimple[] + //tag::loadPandaJSONComplex[] + val df2 = sqlCtx.read.format("json").option("samplingRatio", "1.0").load(path) + //end::loadPandaJSONComplex[] + val jsonRDD = sc.textFile(path) + //tag::loadPandaJsonRDD[] + val df3 = sqlCtx.read.json(jsonRDD) + //end::loadPandaJSONRDD[] + df1 + } + + def jsonLoadFromRDD(sqlCtx: SQLContext, input: RDD[String]): DataFrame = { + //tag::loadPandaJSONRDD[] + val rdd: RDD[String] = input.filter(_.contains("panda")) + val df = sqlCtx.read.json(rdd) + //end::loadPandaJSONRDD[] + df + } + + // Here will be some examples on PandaInfo DataFrame + + /** + * @param place name of place + * @param pandaType type of pandas in this place + * @param happyPandas number of happy pandas in this place + * @param totalPandas total number of pandas in this place + */ + case class PandaInfo(place: String, pandaType: String, happyPandas: Integer, totalPandas: Integer) + + /** + * Gets the percentage of happy pandas per place. + * + * @param pandaInfo the input DataFrame + * @return Returns DataFrame of (place, percentage of happy pandas) + */ + def happyPandasPercentage(pandaInfo: DataFrame): DataFrame = { + pandaInfo.select(pandaInfo("place"), (pandaInfo("happyPandas") / pandaInfo("totalPandas")).as("percentHappy")) + } + + //tag::encodePandaType[] + /** + * Encodes pandaType to Integer values instead of String values. + * + * @param pandaInfo the input DataFrame + * @return Returns a DataFrame of pandaId and integer value for pandaType. + */ + def encodePandaType(pandaInfo: DataFrame): DataFrame = { + pandaInfo.select(pandaInfo("id"), + (when(pandaInfo("pt") === "giant", 0). + when(pandaInfo("pt") === "red", 1). + otherwise(2)).as("encodedType") + ) + } + //end::encodePandaType[] + + /** + * Gets places with happy pandas more than minHappinessBound. + */ + def minHappyPandas(pandaInfo: DataFrame, minHappyPandas: Int): DataFrame = { + pandaInfo.filter(pandaInfo("happyPandas") >= minHappyPandas) + } + + /** + * Extra the panda info from panda places and compute the squisheness of the panda + */ + def squishPandaFromPace(pandaPlace: DataFrame): DataFrame = { + //tag::selectExplode[] + val pandaInfo = pandaPlace.explode(pandaPlace("pandas")){ + case Row(pandas: Seq[Row]) => + pandas.map{ + case Row(id: Long, zip: String, pt: String, happy: Boolean, attrs: Seq[Double]) => + RawPanda(id, zip, pt, happy, attrs.toArray) + }} + pandaInfo.select( + (pandaInfo("attributes")(0) / pandaInfo("attributes")(1)) + .as("squishyness")) + //end::selectExplode[] + } + + /** + * Find pandas that are sad + */ + def sadPandas(pandaInfo: DataFrame): DataFrame = { + //tag::simpleFilter[] + pandaInfo.filter(pandaInfo("happy") !== true) + //end::simpleFilter[] + } + + /** + * Find pandas that are happy and fuzzier than squishy. + */ + def happyFuzzyPandas(pandaInfo: DataFrame): DataFrame = { + //tag::complexFilter[] + pandaInfo.filter( + pandaInfo("happy").and(pandaInfo("attributes")(0) > pandaInfo("attributes")(1)) + ) + //end::complexFilter[] + } + + /** + * Gets places that contains happy pandas more than unhappy pandas. + */ + def happyPandasPlaces(pandaInfo: DataFrame): DataFrame = { + pandaInfo.filter(pandaInfo("happyPandas") >= pandaInfo("totalPandas") / 2) + } + + + /** + * Remove duplicate pandas by id. 
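+ * Unlike distinct(), which compares entire rows, dropDuplicates with a column list
+ * keeps one arbitrary row per distinct id.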
+ */ + def removeDuplicates(pandas: DataFrame): DataFrame = { + //tag::dropDuplicatePandaIds[] + pandas.dropDuplicates(List("id")) + //end::dropDuplicatePandaIds[] + } + + /** + * @param name name of panda + * @param zip zip code + * @param pandaSize size of panda in KG + * @param age age of panda + */ + case class Pandas(name: String, zip: String, pandaSize: Integer, age: Integer) + + def describePandas(pandas: DataFrame): DataFrame = { + //tag::pandaSizeRangeVarDescribe[] + pandas.describe() + //end::pandaSizeRangeVarDescribe[] + } + + //tag::maxPandaSizePerZip[] + def maxPandaSizePerZip(pandas: DataFrame): DataFrame = { + pandas.groupBy(pandas("zip")).max("pandaSize") + } + //end::maxPandaSizePerZip[] + + //tag::minMaxPandasSizePerZip[] + def minMaxPandaSizePerZip(pandas: DataFrame): DataFrame = { + pandas.groupBy(pandas("zip")).agg(min("pandaSize"), max("pandaSize")) + } + //end::minMaxPandasSizePerZip[] + + def minPandaSizeMaxAgePerZip(pandas: DataFrame): DataFrame = { + // this query can be written in two methods + + // 1 + pandas.groupBy(pandas("zip")).agg(("pandaSize", "min"), ("age", "max")) + + // 2 + pandas.groupBy(pandas("zip")).agg(Map("pandaSize" -> "min", "age" -> "max")) + } + + //tag::complexAggPerZip[] + def minMeanSizePerZip(pandas: DataFrame): DataFrame = { + // Compute the min and mean + pandas.groupBy(pandas("zip")).agg(min(pandas("pandaSize")), mean(pandas("pandaSize"))) + } + //end::complexAggPerZip[] + + def simpleSqlExample(pandas: DataFrame): DataFrame = { + val sqlCtx = pandas.sqlContext + //tag::pandasSQLQuery[] + pandas.registerTempTable("pandas") + val miniPandas = sqlCtx.sql("SELECT * FROM pandas WHERE pandaSize < 12") + //end::pandasSQLQuery[] + miniPandas + } + + def startJDBCServer(sqlContext: HiveContext): Unit = { + //tag::startJDBC[] + sqlContext.setConf("hive.server2.thrift.port", "9090") + HiveThriftServer2.startWithContext(sqlContext) + //end::startJDBC[] + } + + /** + * Orders pandas by size ascending and by age descending. + * Pandas will be sorted by "size" first and if two pandas have the same "size" + * will be sorted by "age". 
+ */ + def orderPandas(pandas: DataFrame): DataFrame = { + //tag::simpleSort[] + pandas.orderBy(pandas("pandaSize").asc, pandas("age").desc) + //end::simpleSort[] + } + + def computeRelativePandaSizes(pandas: DataFrame): DataFrame = { + //tag::relativePandaSizesWindow[] + val windowSpec = Window + .orderBy(pandas("age")) + .partitionBy(pandas("zip")) + .rowsBetween(start = -10, end = 10) // can use rangeBetween for range instead + //end::relativePandaSizesWindow[] + + //tag::relativePandaSizesQuery[] + val pandaRelativeSizeCol = pandas("pandaSize") - + avg(pandas("pandaSize")).over(windowSpec) + + pandas.select(pandas("name"), pandas("zip"), pandas("pandaSize"), pandas("age"), + pandaRelativeSizeCol.as("panda_relative_size")) + //end::relativePandaSizesQuery[] + } + + // Join DataFrames of Pandas and Sizes with + def joins(df1: DataFrame, df2: DataFrame): Unit = { + + //tag::innerJoin[] + // Inner join implicit + df1.join(df2, df1("name") === df2("name")) + // Inner join explicit + df1.join(df2, df1("name") === df2("name"), "inner") + //end::innerJoin[] + + //tag::leftouterJoin[] + // Left outer join explicit + df1.join(df2, df1("name") === df2("name"), "left_outer") + //end::leftouterJoin[] + + //tag::rightouterJoin[] + // Right outer join explicit + df1.join(df2, df1("name") === df2("name"), "right_outer") + //end::rightouterJoin[] + + //tag::leftsemiJoin[] + // Left semi join explicit + df1.join(df2, df1("name") === df2("name"), "leftsemi") + //end::leftsemiJoin[] + } + + /** + * Cut the lineage of a DataFrame which has too long a query plan. + */ + def cutLineage(df: DataFrame): DataFrame = { + val sqlCtx = df.sqlContext + //tag::cutLineage[] + val rdd = df.rdd + rdd.cache() + sqlCtx.createDataFrame(rdd, df.schema) + //end::cutLineage[] + } + + // Self join + def selfJoin(df: DataFrame): DataFrame = { + val sqlCtx = df.sqlContext + import sqlCtx.implicits._ + //tag::selfJoin[] + val joined = df.as("a").join(df.as("b")).where($"a.name" === $"b.name") + //end::selfJoin[] + joined + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala new file mode 100644 index 0000000..2c865f8 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala @@ -0,0 +1,127 @@ +/** + * Load and save data to/from DataFrames + */ +package com.highperformancespark.examples.dataframe + +import java.util.Properties + +import org.apache.spark.rdd._ +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ + +case class LoadSave(sqlContext: SQLContext) { + import sqlContext.implicits._ + //tag::createFromRDD[] + def createFromCaseClassRDD(input: RDD[PandaPlace]) = { + // Create DataFrame explicitly using sqlContext and schema inference + val df1 = sqlContext.createDataFrame(input) + + // Create DataFrame using sqlContext implicits and schema inference + val df2 = input.toDF() + + // Create a Row RDD from our RDD of case classes + val rowRDD = input.map(pm => Row(pm.name, + pm.pandas.map(pi => Row(pi.id, pi.zip, pi.happy, pi.attributes)))) + + val pandasType = ArrayType(StructType(List( + StructField("id", LongType, true), + StructField("zip", StringType, true), + StructField("happy", BooleanType, true), + StructField("attributes", ArrayType(FloatType), true)))) + + // Create DataFrame explicitly with specified schema + val schema = StructType(List(StructField("name", StringType, true), + StructField("pandas", pandasType))) + + val df3 = 
sqlContext.createDataFrame(rowRDD, schema) + } + //end::createFromRDD[] + + //tag::createFromLocal[] + def createFromLocal(input: Seq[PandaPlace]) = { + sqlContext.createDataFrame(input) + } + //end::createFromLocal[] + + //tag::collectResults[] + def collectDF(df: DataFrame) = { + val result: Array[Row] = df.collect() + result + } + //end::collectResults[] + + //tag::toRDD[] + def toRDD(input: DataFrame): RDD[RawPanda] = { + val rdd: RDD[Row] = input.rdd + rdd.map(row => RawPanda(row.getAs[Long](0), row.getAs[String](1), + row.getAs[String](2), row.getAs[Boolean](3), row.getAs[Array[Double]](4))) + } + //end::toRDD[] + + //tag::partitionedOutput[] + def writeOutByZip(input: DataFrame): Unit = { + input.write.partitionBy("zipcode").format("json").save("output/") + } + //end::partitionedOutput[] + + //tag::saveAppend[] + def writeAppend(input: DataFrame): Unit = { + input.write.mode(SaveMode.Append).save("output/") + } + //end::saveAppend[] + + def createJDBC() = { + //tag::createJDBC[] + sqlContext.read.jdbc("jdbc:dialect:serverName;user=user;password=pass", + "table", new Properties) + + sqlContext.read.format("jdbc") + .option("url", "jdbc:dialect:serverName") + .option("dbtable", "table").load() + //end::createJDBC[] + } + + def writeJDBC(df: DataFrame) = { + //tag::writeJDBC[] + df.write.jdbc("jdbc:dialect:serverName;user=user;password=pass", + "table", new Properties) + + df.write.format("jdbc") + .option("url", "jdbc:dialect:serverName") + .option("user", "user") + .option("password", "pass") + .option("dbtable", "table").save() + //end::writeJDBC[] + } + + //tag::loadParquet[] + def loadParquet(path: String): DataFrame = { + // Configure Spark to read binary data as string, note: must be configured on SQLContext + sqlContext.setConf("spark.sql.parquet.binaryAsString", "true") + + // Load parquet data using merge schema (configured through option) + sqlContext.read + .option("mergeSchema", "true") + .format("parquet") + .load(path) + } + //end::loadParquet[] + + //tag::writeParquet[] + def writeParquet(df: DataFrame, path: String) = { + df.write.format("parquet").save(path) + } + //end::writeParquet[] + + //tag::loadHiveTable[] + def loadHiveTable(): DataFrame = { + sqlContext.read.table("pandas") + } + //end::loadHiveTable[] + + //tag::saveManagedTable[] + def saveManagedTable(df: DataFrame): Unit = { + df.write.saveAsTable("pandas") + } + //end::saveManagedTable[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala new file mode 100644 index 0000000..83e4d86 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala @@ -0,0 +1,140 @@ +/** + * A sample mixing relational & functional transformations with Datasets. + */ +package com.highperformancespark.examples.dataframe + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.expressions.aggregate._ +import org.apache.spark.sql.expressions._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +// Additional imports for using HiveContext +import org.apache.spark.sql.hive._ +import org.apache.spark.sql.hive.thriftserver._ + +class MixedDataset(sqlCtx: SQLContext) { + import sqlCtx.implicits._ + + /** + * A sample function on a Dataset of RawPandas. 
+ * This is contrived, since our reduction could also be done with SQL aggregates, but + * we can see the flexibility of being able to specify arbitrary Scala code. + */ + def happyPandaSums(ds: Dataset[RawPanda]): Double = { + ds.toDF().filter($"happy" === true).as[RawPanda]. + select($"attributes"(0).as[Double]). + reduce((x, y) => x + y) + } + + /** + * A sample function on a Dataset of RawPandas. + * Use the first attribute to deterimine if a panda is squishy. + */ + //tag::basicSelect[] + def squishyPandas(ds: Dataset[RawPanda]): Dataset[(Long, Boolean)] = { + ds.select($"id".as[Long], ($"attributes"(0) > 0.5).as[Boolean]) + } + //end::basicSelect[] + + /** + * Union happy and sad pandas + */ + //tag::basicUnion[] + def unionPandas(happyPandas: Dataset[RawPanda], sadPandas: Dataset[RawPanda]) = { + happyPandas.union(sadPandas) + } + //end::basicUnion[] + + /** + * Functional map + Dataset, sums the positive attributes for the pandas + */ + //tag::functionalQuery[] + def funMap(ds: Dataset[RawPanda]): Dataset[Double] = { + ds.map{rp => rp.attributes.filter(_ > 0).sum} + } + //end::functionalQuery[] + + //tag::maxPandaSizePerZip[] + def maxPandaSizePerZip(ds: Dataset[RawPanda]): Dataset[(String, Double)] = { + ds.groupBy($"zip").keyAs[String].agg(max("attributes(2)").as[Double]) + } + //end::maxPandaSizePerZip[] + + //tag::maxPandaSizePerZipScala[] + def maxPandaSizePerZipScala(ds: Dataset[RawPanda]): Dataset[(String, Double)] = { + ds.groupBy($"zip").keyAs[String].mapGroups{ case (g, iter) => + (g, iter.map(_.attributes(2)).reduceLeft(Math.max(_, _))) + } + } + //end::maxPandaSizePerZipScala[] + + /** + * Illustrate how we make typed queries, using some of the float properties to produce boolean + * values. + */ + def typedQueryExample(ds: Dataset[RawPanda]): Dataset[Double] = { + ds.select($"attributes"(0).as[Double]) + } + + /** + * Illustrate Dataset joins + */ + def joinSample(pandas: Dataset[RawPanda], coffeeShops: Dataset[CoffeeShop]): + Dataset[(RawPanda, CoffeeShop)] = { + //tag::joinWith[] + val result: Dataset[(RawPanda, CoffeeShop)] = pandas.joinWith(coffeeShops, + $"zip" === $"zip") + //end::joinWith[] + result + } + + /** + * Illustrate a self join to compare pandas in the same zip code + */ + def selfJoin(pandas: Dataset[RawPanda]): + Dataset[(RawPanda, RawPanda)] = { + //tag::selfJoin[] + val result: Dataset[(RawPanda, RawPanda)] = pandas.joinWith(pandas, + $"zip" === $"zip") + //end::selfJoin[] + result + } + + //tag::fromRDD[] + /** + * Illustrate converting an RDD to DS + */ + def fromRDD(rdd: RDD[RawPanda]): Dataset[RawPanda] = { + rdd.toDS + } + + //end::fromRDD[] + + //tag::toRDDDF[] + /** + * Illustrate converting a Dataset to an RDD + */ + def toRDD(ds: Dataset[RawPanda]): RDD[RawPanda] = { + ds.rdd + } + + /** + * Illustrate converting a Dataset to a DataFrame + */ + def toDF(ds: Dataset[RawPanda]): DataFrame = { + ds.toDF() + } + //end::toRDDDF[] + + /** + * Illustrate DataFrame to Dataset. Its important to note that if the schema does not match what + * is expected by the Dataset this fails fast. 
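+ * For example, calling fromDF on a DataFrame that is missing one of RawPanda's columns
+ * fails during analysis of as[RawPanda] (with an unresolved-column error) rather than
+ * waiting for the first action.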
+ */ + //tag::DataFrameAsDataset[] + def fromDF(df: DataFrame): Dataset[RawPanda] = { + df.as[RawPanda] + } + //end::DataFrameAsDataset[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back b/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back new file mode 100644 index 0000000..cdae7c1 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back @@ -0,0 +1,67 @@ +/** + * A sample mixing relational & functional transformations with Datasets. + */ +package com.highperformancespark.examples.dataframe + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.expressions.aggregate._ +import org.apache.spark.sql.expressions._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +// Additional imports for using HiveContext +import org.apache.spark.sql.hive._ +import org.apache.spark.sql.hive.thriftserver._ + +class MixedDataset(sqlCtx: SQLContext) { + import sqlCtx.implicits._ + + /** + * A sample function on a Dataset of RawPandas. + * This is contrived, since our reduction could also be done with SQL aggregates, but + * we can see the flexibility of being able to specify arbitrary Scala code. + */ + def happyPandaSums(ds: Dataset[RawPanda]): Double = { + ds.toDF().filter($"happy" === true).as[RawPanda]. + select($"attributes"(0).as[Double]). + reduce((x, y) => x + y) + } + + /** + * Functional map + Dataset, sums the positive attributes for the pandas + */ + def funMap(ds: Dataset[RawPanda]): Dataset[Double] = { + ds.map{rp => rp.attributes.filter(_ > 0).sum} + } + + /** + * Illustrate how we make typed queries, using some of the float properties to produce boolean + * values. + */ + def typedQueryExample(ds: Dataset[RawPanda]): Dataset[Double] = { + ds.select($"attributes"(0).as[Double]) + } + + /** + * Illustrate converting a Dataset to an RDD + */ + def toRDD(ds: Dataset[RawPanda]): RDD[RawPanda] = { + ds.rdd + } + + /** + * Illustrate converting a Dataset to a DataFrame + */ + def toDF(ds: Dataset[RawPanda]): DataFrame = { + ds.toDF() + } + + /** + * Illustrate DataFrame to Dataset. Its important to note that if the schema does not match what + * is expected by the Dataset this fails fast. 
+ */ + def fromDF(df: DataFrame): Dataset[RawPanda] = { + df.as[RawPanda] + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala new file mode 100644 index 0000000..d118130 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala @@ -0,0 +1,17 @@ +package com.highperformancespark.examples.dataframe +/** + * @param id panda id + * @param zip zip code of panda residence + * @param pt Type of panda as a string + * @param happy if panda is happy + * @param attributes array of panada attributes + */ +case class RawPanda(id: Long, zip: String, pt: String, happy: Boolean, attributes: Array[Double]) + +/** + * @param name place name + * @param pandas pandas in that place + */ +case class PandaPlace(name: String, pandas: Array[RawPanda]) + +case class CoffeeShop(zip: String, name: String) diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala new file mode 100644 index 0000000..a25a97f --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala @@ -0,0 +1,29 @@ +/** + * Using plain-old-sql + */ +package com.highperformancespark.examples.dataframe + +import org.apache.spark.sql._ + +case class RegularSQL(sqlContext: SQLContext) { + + //tag::queryTable[] + def querySQL(): DataFrame = { + sqlContext.sql("SELECT * FROM pandas WHERE size > 0") + } + //end::queryTable[] + + // TODO: Holden: include a parquet example file and point this to that. + //tag::queryRawFile[] + def queryRawFile(): DataFrame = { + sqlContext.sql("SELECT * FROM parquet.`path_to_parquet_file`") + } + //end::queryRawFile[] + + //tag::registerTable[] + def registerTable(df: DataFrame): Unit = { + df.registerTempTable("pandas") + df.saveAsTable("perm_pandas") + } + //end::registerTable[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala new file mode 100644 index 0000000..56d4beb --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala @@ -0,0 +1,58 @@ +/** + * Example UDFs + */ +package com.highperformancespark.examples.dataframe + +import org.apache.spark.sql._ +import org.apache.spark.sql.expressions._ +import org.apache.spark.sql.types._ + +object UDFs { + //tag::setupUDFs[] + def setupUDFs(sqlCtx: SQLContext) = { + sqlCtx.udf.register("strLen", (s: String) => s.length()) + } + //end::setupUDFs[] + + //tag::setupUDAFs[] + def setupUDAFs(sqlCtx: SQLContext) = { + class Avg extends UserDefinedAggregateFunction { + // Input type + def inputSchema: org.apache.spark.sql.types.StructType = + StructType(StructField("value", DoubleType) :: Nil) + + def bufferSchema: StructType = StructType( + StructField("count", LongType) :: + StructField("sum", DoubleType) :: Nil + ) + + // Return type + def dataType: DataType = DoubleType + + def deterministic: Boolean = true + + def initialize(buffer: MutableAggregationBuffer): Unit = { + buffer(0) = 0L + buffer(1) = 0.0 + } + + def update(buffer: MutableAggregationBuffer,input: Row): Unit = { + buffer(0) = buffer.getAs[Long](0) + 1 + buffer(1) = buffer.getAs[Double](1) + input.getAs[Double](0) + } + + def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { + buffer1(0) = buffer1.getAs[Long](0) + buffer2.getAs[Long](0) + buffer1(1) = 
buffer1.getAs[Double](1) + buffer2.getAs[Double](1) + } + + def evaluate(buffer: Row): Any = { + buffer.getDouble(1) / buffer.getLong(0) + } + } + // Optionally register + val avg = new Avg + sqlCtx.udf.register("ourAvg", avg) + } + //end::setupUDAFs[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala b/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala new file mode 100644 index 0000000..198518d --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala @@ -0,0 +1,9 @@ +package com.highperformancespark.examples.ffi + +import org.apache.spark.rdd.RDD + +object NativeExample { + def jniSum(input: RDD[(String, Array[Int])]): RDD[(String, Int)] = { + input.mapValues(values => new SumJNI().sum(values)) + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala b/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala new file mode 100644 index 0000000..7a83aa4 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala @@ -0,0 +1,8 @@ +package com.highperformancespark.examples.ffi + +object StandAlone { + def main(args: Array[String]) { + System.loadLibrary("highPerformanceSpark0") + println(new SumJNI().sum(Array(1,2,3))) + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala b/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala new file mode 100644 index 0000000..de848bb --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala @@ -0,0 +1,8 @@ +package com.highperformancespark.examples.ffi + +import ch.jodersky.jni.nativeLoader + +@nativeLoader("high-performance-spark0") +class SumJNI { + @native def sum(n: Array[Int]): Int +} diff --git a/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala b/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala new file mode 100644 index 0000000..9629451 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.highperformancespark.examples.perf + +import com.highperformancespark.examples.dataframe.RawPanda +import com.highperformancespark.examples.tools._ + +import org.apache.spark.rdd._ +import org.apache.spark.{SparkContext, SparkConf} +import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.hive.HiveContext +import org.apache.spark.sql.types._ + +/** + * A simple performance test to compare a simple sort between DataFrame, and RDD + */ +object SimplePerfTest { + def main(args: Array[String]) = { + val sparkConf = new SparkConf().setAppName("simple-perf-test") + val sc = new SparkContext(sparkConf) + val sqlCtx = new HiveContext(sc) + val scalingFactor = if (args.length > 0) args(0).toLong else 100L + val size = if (args.length > 1) args(1).toInt else 50 + run(sc, sqlCtx, scalingFactor, size) + } + + def run(sc: SparkContext, sqlCtx: HiveContext, scalingFactor: Long, size: Int) = { + import sqlCtx.implicits._ + val inputRDD = GenerateScalingData.generateFullGoldilocks(sc, scalingFactor, size) + val pairRDD = inputRDD.map(p => (p.zip.toInt, p.attributes(0))) + pairRDD.cache() + pairRDD.count() + val rddTimeings = 1.to(10).map(x => time(testOnRDD(pairRDD))) + val groupTimeings = 1.to(10).map(x => time(groupOnRDD(pairRDD))) + val df = inputRDD.toDF() + val inputDataFrame = df.select(df("zip").cast(IntegerType), df("attributes")(0).as("fuzzyness").cast(DoubleType)) + inputDataFrame.cache() + inputDataFrame.count() + val dataFrameTimeings = 1.to(10).map(x => time(testOnDataFrame(inputDataFrame))) + println(rddTimeings.map(_._2).mkString(",")) + println(groupTimeings.map(_._2).mkString(",")) + println(dataFrameTimeings.map(_._2).mkString(",")) + } + + def testOnRDD(rdd: RDD[(Int, Double)]) = { + rdd.map{case (x, y) => (x, (y, 1))}.reduceByKey{case (x, y) => (x._1 + y._1, x._2 + y._2)}.count() + } + + def groupOnRDD(rdd: RDD[(Int, Double)]) = { + rdd.groupByKey().mapValues{v => + v.aggregate((0.0, 0))({case (x, y) => (x._1 + y, x._2 + 1)}, + {case (x, y) => (x._1 + y._1, x._2 + y._2)})}.count() + } + + def testOnDataFrame(df: DataFrame) = { + df.groupBy("zip").avg("fuzzyness").count() + } + + def time[R](block: => R): (R, Long) = { + val t0 = System.nanoTime() + val result = block // call-by-name + val t1 = System.nanoTime() + println(s"Time ${t1 - t0}ns") + (result, t1 - t0) + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala b/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala new file mode 100644 index 0000000..7fb4177 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala @@ -0,0 +1,21 @@ +package com.highperformancespark.example.tokenize + +import org.apache.spark.rdd.RDD + +object SampleTokenize { + //tag::DIFFICULT[] + def difficultTokenizeRDD(input: RDD[String]) = { + input.flatMap(_.split(" ")) + } + //end::DIFFICULT[] + + //tag::EASY[] + def tokenizeRDD(input: RDD[String]) = { + input.flatMap(tokenize) + } + + protected[tokenize] def tokenize(input: String) = { + input.split(" ") + } + //end::EASY[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala b/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala new file mode 100644 index 0000000..0fbe944 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala @@ -0,0 +1,19 @@ +package com.highperformancespark.examples.tools + +import 
scala.collection.immutable.HashSet + +import com.highperformancespark.examples.dataframe.RawPanda + +import org.apache.spark._ +import org.apache.spark.rdd.RDD + +object FilterInvalidPandas { + + def filterInvalidPandas(sc: SparkContext, invalidPandas: List[Long], input: RDD[RawPanda]) = { + //tag::broadcast[] + val invalid = HashSet() ++ invalidPandas + val invalidBroadcast = sc.broadcast(invalid) + input.filter{panda => !invalidBroadcast.value.contains(panda.id)} + //end::broadcast[] + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala b/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala new file mode 100644 index 0000000..66d01d4 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala @@ -0,0 +1,65 @@ +package com.highperformancespark.examples.tools + +import com.highperformancespark.examples.dataframe.RawPanda + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.Row +import org.apache.spark.mllib.random.RandomRDDs +import org.apache.spark.mllib.linalg.Vector + +// TODO: Add tests for this +object GenerateScalingData { + /** + * Generate a Goldilocks data set. We expect the zip code to follow an exponential + * distribution and the data its self to be normal + * @param rows number of rows in the RDD (approximate) + * @param size number of value elements + */ + def generateFullGoldilocks(sc: SparkContext, rows: Long, numCols: Int): RDD[RawPanda] = { + val zipRDD = RandomRDDs.exponentialRDD(sc, mean = 1000, size = rows).map(_.toInt.toString) + val valuesRDD = RandomRDDs.normalVectorRDD(sc, numRows = rows, numCols = numCols).repartition(zipRDD.partitions.size) + val keyRDD = sc.parallelize(1L.to(rows), zipRDD.getNumPartitions) + keyRDD.zipPartitions(zipRDD, valuesRDD){ + (i1, i2, i3) => + new Iterator[(Long, String, Vector)] { + def hasNext: Boolean = (i1.hasNext, i2.hasNext, i3.hasNext) match { + case (true, true, true) => true + case (false, false, false) => false + case _ => false // Note: this is unsafe (we throw away data when one of our partitions has run out). + } + def next(): (Long, String, Vector) = (i1.next(), i2.next(), i3.next()) + } + }.map{case (k, z, v) => + RawPanda(k, z, "giant", v(0) > 0.5, v.toArray)} + } + + /** + * Transform it down to just the data used for the benchmark + */ + def generateMiniScale(sc: SparkContext, rows: Long, numCols: Int): RDD[(Int, Double)] = { + generateFullGoldilocks(sc, rows, numCols).map(p => (p.zip.toInt, p.attributes(0))) + } + + /** + * Transform it down to just the data used for the benchmark + */ + def generateMiniScaleRows(sc: SparkContext, rows: Long, numCols: Int): RDD[Row] = { + generateMiniScale(sc, rows, numCols).map{case (zip, fuzzy) => Row(zip, fuzzy)} + } + + // tag::MAGIC_PANDA[] + /** + * Generate a Goldilocks data set all with the same id. + * We expect the zip code to follow an exponential + * distribution and the data its self to be normal. + * Simplified to avoid a 3-way zip. 
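+ * Every generated RawPanda shares the id 1; the zip code, happiness flag, and
+ * attribute vector still vary per row.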
+ */ + def generateGoldilocks(sc: SparkContext, rows: Long, numCols: Int): RDD[RawPanda] = { + val zipRDD = RandomRDDs.exponentialRDD(sc, mean = 1000, size = rows).map(_.toInt.toString) + val valuesRDD = RandomRDDs.normalVectorRDD(sc, numRows = rows, numCols = numCols) + zipRDD.zip(valuesRDD).map{case (z, v) => + RawPanda(1, z, "giant", v(0) > 0.5, v.toArray)} + } + // end::MAGIC_PANDA[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala b/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala new file mode 100644 index 0000000..18934f8 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala @@ -0,0 +1,54 @@ +import scala.util.Random +import scala.reflect.{ClassTag} + +import org.apache.spark.rdd.RDD + +/** + * Sample our production data to be able to use it for tests + */ +object SampleData { + /** + * Sample the input down to roughly 10% for usage in tests + */ + def sampleInput[T](rdd: RDD[T]): RDD[T] = { + // tag::randomSampleInput[] + rdd.sample(withReplacement=false, fraction=0.1) + // end::randomSampleInput[] + } + + /** + * Construct a stratified sample + */ + def stratifiedSample(rdd: RDD[(String, Array[Double])]): RDD[(String, Array[Double])] = { + // tag::stratifiedSample[] + // 5% of the red pandas, and 50% of the giant pandas + val stratas = Map("red" -> 0.05, "giant" -> 0.50) + rdd.sampleByKey(withReplacement=false, fractions = stratas) + // end::stratifiedSample[] + } + + /** + * Custom random sample that creates a new RNG for every element. This is the slow baseline to compare against the mapPartitions version below. + */ + def slowSampleInput[T: ClassTag](rdd: RDD[T]): RDD[T] = { + rdd.flatMap{x => val r = new Random() + if (r.nextInt(10) == 0) { + Some(x) + } else { + None + }} + } + + /** + * Custom random sample with RNG. This is intended as an example of how to save setup overhead. + */ + def customSampleInput[T: ClassTag](rdd: RDD[T]): RDD[T] = { + //tag::mapPartitions[] + rdd.mapPartitions{itr => + // Only create one RNG per partition + val r = new Random() + itr.filter(x => r.nextInt(10) == 0) + } + //end::mapPartitions[] + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala new file mode 100644 index 0000000..a781ecd --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala @@ -0,0 +1,46 @@ +/** + * Accumulator examples. Computes the total and the maximum fuzzyness of the pandas. Very contrived.
+ */ +package com.highperformancespark.examples.transformations + +import com.highperformancespark.examples.dataframe.RawPanda + +import org.apache.spark._ +import org.apache.spark.rdd._ + +object Accumulators { + /** + * Compute the total fuzzyness with an accumulator while generating an id and zip pair for sorting + */ + //tag::sumFuzzyAcc[] + def computeTotalFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): (RDD[(String, Long)], Double) = { + val acc = sc.accumulator(0.0) // Create an accumulator with the initial value of 0.0 + val transformed = rdd.map{x => acc += x.attributes(0); (x.zip, x.id)} + // accumulator still has zero value + transformed.count() // force evaluation + // Note: This example is dangerous since the transformation may be evaluated multiple times + (transformed, acc.value) + } + //end::sumFuzzyAcc[] + + /** + * Compute the max fuzzyness with an accumulator while generating an id and zip pair for sorting + */ + //tag::maxFuzzyAcc[] + def computeMaxFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): (RDD[(String, Long)], Double) = { + object MaxDoubleParam extends AccumulatorParam[Double] { + override def zero(initValue: Double) = initValue + override def addInPlace(r1: Double, r2: Double): Double = { + Math.max(r1, r2) + } + } + // Create an accumulator with the initial value of Double.MinValue + val acc = sc.accumulator(Double.MinValue)(MaxDoubleParam) + val transformed = rdd.map{x => acc += x.attributes(0); (x.zip, x.id)} + // accumulator still has Double.MinValue + transformed.count() // force evaluation + // Note: This example is dangerous since the transformation may be evaluated multiple times + (transformed, acc.value) + } + //end::maxFuzzyAcc[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala b/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala new file mode 100644 index 0000000..d341cb8 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala @@ -0,0 +1,39 @@ + +package com.highperformancespark.examples.transformations + +import org.apache.spark.rdd.RDD + + +object NarrowAndWide { + + //toDO: Probably should write some sort of test for this. + //this is used in chapter 4 for the stage diagram + def sillySparkProgram(rdd1 : RDD[Int]) = { + + //tag::narrowWide[] + + //Narrow dependency. Map the rdd to tuples of (x, 1) + val rdd2 = rdd1.map((_, 1)) + //wide dependency groupByKey + val rdd3 = rdd2.groupByKey() + //end::narrowWide[] + + rdd3 + } + //this is used in chapter two for the stage diagram. + + //tag::stageDiagram[] + def simpleSparkProgram(rdd : RDD[Double]): Long ={ + //stage1 + rdd.filter(_< 1000.0) + .map(x => (x , x) ) + //stage2 + .groupByKey() + .map{ case(value, groups) => (groups.sum, value)} + //stage 3 + .sortByKey() + .count() + } + //end::stageDiagram[] + +} diff --git a/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala b/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala new file mode 100644 index 0000000..c89653a --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala @@ -0,0 +1,44 @@ +package com.highperformancespark.examples.wordcount + +/** + * What sort of big data book would this be if we didn't mention wordcount? 
+ */ +import org.apache.spark.rdd._ + +object WordCount { + // bad idea: uses group by key + def badIdea(rdd: RDD[String]): RDD[(String, Int)] = { + val words = rdd.flatMap(_.split(" ")) + val wordPairs = words.map((_, 1)) + val grouped = wordPairs.groupByKey() + val wordCounts = grouped.mapValues(_.sum) + wordCounts + } + + // good idea: doesn't use group by key + //tag::simpleWordCount[] + def simpleWordCount(rdd: RDD[String]): RDD[(String, Int)] = { + val words = rdd.flatMap(_.split(" ")) + val wordPairs = words.map((_, 1)) + val wordCounts = wordPairs.reduceByKey(_ + _) + wordCounts + } + //end::simpleWordCount + + /** + * Come up with word counts but filter out the illegal tokens and stop words + */ + //tag::wordCountStopwords[] + def withStopWordsFiltered(rdd : RDD[String], illegalTokens : Array[Char], + stopWords : Set[String]): RDD[(String, Int)] = { + val seperators = illegalTokens ++ Array[Char](' ') + val tokens: RDD[String] = rdd.flatMap(_.split(seperators). + map(_.trim.toLowerCase)) + val words = tokens.filter(token => + !stopWords.contains(token) && (token.length > 0) ) + val wordPairs = words.map((_, 1)) + val wordCounts = wordPairs.reduceByKey(_ + _) + wordCounts + } + //end::wordCountStopwords[] +} diff --git a/src/test/java/com/highperformancespark/examples/JavaInteropTest.java b/src/test/java/com/highperformancespark/examples/JavaInteropTest.java new file mode 100644 index 0000000..66318f7 --- /dev/null +++ b/src/test/java/com/highperformancespark/examples/JavaInteropTest.java @@ -0,0 +1,43 @@ +package com.highperformancespark.examples; + +import com.holdenkarau.spark.testing.SharedJavaSparkContext; + +import scala.Tuple2; + +import org.apache.spark.rdd.RDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaSparkContext; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.*; + +import org.junit.Test; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class JavaInteropTest extends SharedJavaSparkContext { + + @Test + public void wrapPairRDDTest() { + JavaInteropTestHelper helper = new JavaInteropTestHelper(sc()); + JavaInterop ji = new JavaInterop(); + RDD> rdd = helper.generateMiniPairRDD(); + JavaPairRDD prdd = ji.wrapPairRDD(rdd); + List> expected = Arrays.asList(new Tuple2("panda", 12L)); + assertEquals(expected, prdd.collect()); + } + + @Test + public void wrapPairRDDFakeCtTest() { + JavaInteropTestHelper helper = new JavaInteropTestHelper(sc()); + JavaInterop ji = new JavaInterop(); + RDD> rdd = helper.generateMiniPairRDD(); + JavaPairRDD prdd = ji.wrapPairRDDFakeCt(rdd); + List> expected = Arrays.asList(new Tuple2("panda", 12L)); + assertEquals(expected, prdd.collect()); + } +} diff --git a/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java b/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java new file mode 100644 index 0000000..b0d4bdc --- /dev/null +++ b/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java @@ -0,0 +1,151 @@ +package com.highperformancespark.examples.dataframe; + +import com.highperformancespark.examples.objects.JavaPandaInfo; +import com.highperformancespark.examples.objects.JavaPandas; +import com.highperformancespark.examples.objects.JavaRawPanda; +import com.holdenkarau.spark.testing.JavaDataFrameSuiteBase; +import org.apache.spark.sql.DataFrame; +import 
org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.*; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.*; + +public class JavaHappyPandasTest extends JavaDataFrameSuiteBase { + String toronto = "toronto"; + String sandiego = "san diego"; + String virginia = "virginia"; + + List pandaInfoList = Arrays.asList( + new JavaPandaInfo(toronto, "giant", 1, 2), + new JavaPandaInfo(sandiego, "red", 2, 3), + new JavaPandaInfo(virginia, "black", 1, 10) + ); + + List rawPandaList = Arrays.asList( + new JavaRawPanda(10L, "94110", "giant", true, Arrays.asList(1.0, 0.9)), + new JavaRawPanda(11L, "94110", "red", true, Arrays.asList(1.0, 0.9))); + + List pandasList = Arrays.asList( + new JavaPandas("bata", "10010", 10, 2), + new JavaPandas("wiza", "10010", 20, 4), + new JavaPandas("dabdob", "11000", 8, 2), + new JavaPandas("hanafy", "11000", 15, 7), + new JavaPandas("hamdi", "11111", 20, 10) + ); + + @Test + public void simpleSelfJoinTest() { + DataFrame inputDF = sqlContext().createDataFrame(pandasList, JavaPandas.class); + DataFrame result = JavaHappyPandas.selfJoin(inputDF).select("a.name", "b.name"); + List resultList = result.collectAsList(); + + resultList.stream().forEach(row -> assertEquals(row.getString(0), row.getString(1))); + } + + @Test + public void verifyhappyPandasPercentage() { + List expectedList = Arrays.asList(RowFactory.create(toronto, 0.5), + RowFactory.create(sandiego, 2 / 3.0), RowFactory.create(virginia, 1/10.0)); + DataFrame expectedDF = sqlContext().createDataFrame( + expectedList, new StructType( + new StructField[]{ + new StructField("place", DataTypes.StringType, true, Metadata.empty()), + new StructField("percentHappy", DataTypes.DoubleType, true, Metadata.empty()) + })); + + DataFrame inputDF = sqlContext().createDataFrame(pandaInfoList, JavaPandaInfo.class); + DataFrame resultDF = JavaHappyPandas.happyPandasPercentage(inputDF); + + assertDataFrameApproximateEquals(expectedDF, resultDF, 1E-5); + } + + @Test + public void encodePandaType() { + DataFrame inputDF = sqlContext().createDataFrame(rawPandaList, JavaRawPanda.class); + DataFrame resultDF = JavaHappyPandas.encodePandaType(inputDF); + + List expectedRows = Arrays.asList(RowFactory.create(10L, 0), RowFactory.create(11L, 1)); + DataFrame expectedDF = sqlContext().createDataFrame(expectedRows, new StructType(new StructField[]{ + new StructField("id", DataTypes.LongType, false, Metadata.empty()), + new StructField("encodedType", DataTypes.IntegerType, false, Metadata.empty()) + })); + + assertDataFrameEquals(expectedDF, resultDF); + } + + @Test + public void happyPandasPlaces() { + DataFrame inputDF = sqlContext().createDataFrame(pandaInfoList, JavaPandaInfo.class); + DataFrame resultDF = JavaHappyPandas.happyPandasPlaces(inputDF); + + List expectedRows = Arrays.asList( + new JavaPandaInfo(toronto, "giant", 1, 2), + new JavaPandaInfo(sandiego, "red", 2, 3)); + DataFrame expectedDF = sqlContext().createDataFrame(expectedRows, JavaPandaInfo.class); + + assertDataFrameEquals(expectedDF, resultDF); + } + + @Test + public void maxPandaSizePerZip() { + DataFrame inputDF = sqlContext().createDataFrame(pandasList, JavaPandas.class); + DataFrame resultDF = JavaHappyPandas.maxPandaSizePerZip(inputDF); + + List expectedRows = Arrays.asList( + RowFactory.create(pandasList.get(1).getZip(), pandasList.get(1).getPandaSize()), + RowFactory.create(pandasList.get(3).getZip(), 
pandasList.get(3).getPandaSize()), + RowFactory.create(pandasList.get(4).getZip(), pandasList.get(4).getPandaSize()) + ); + DataFrame expectedDF = sqlContext().createDataFrame(expectedRows, + new StructType( + new StructField[]{ + new StructField("zip", DataTypes.StringType, true, Metadata.empty()), + new StructField("max(pandaSize)", DataTypes.IntegerType, true, Metadata.empty()) + } + )); + + assertDataFrameEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip")); + } + + @Test + public void complexAggPerZip() { + DataFrame inputDF = sqlContext().createDataFrame(pandasList, JavaPandas.class); + DataFrame resultDF = JavaHappyPandas.minMeanSizePerZip(inputDF); + + List expectedRows = Arrays.asList( + RowFactory.create(pandasList.get(1).getZip(), pandasList.get(0).getPandaSize(), 15.0), + RowFactory.create(pandasList.get(3).getZip(), pandasList.get(2).getPandaSize(), 11.5), + RowFactory.create(pandasList.get(4).getZip(), pandasList.get(4).getPandaSize(), 20.0)); + + DataFrame expectedDF = sqlContext().createDataFrame(expectedRows, + new StructType( + new StructField[]{ + new StructField("zip", DataTypes.StringType, true, Metadata.empty()), + new StructField("min(pandaSize)", DataTypes.IntegerType, true, Metadata.empty()), + new StructField("avg(pandaSize)", DataTypes.DoubleType, true, Metadata.empty()) + } + )); + + assertDataFrameApproximateEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip"), 1E-5); + } + + @Test + public void simpleSQLExample() { + DataFrame inputDF = sqlContext().createDataFrame(pandasList, JavaPandas.class); + DataFrame resultDF = JavaHappyPandas.simpleSqlExample(inputDF); + + List expectedList = Arrays.asList( + pandasList.get(0), pandasList.get(2) + ); + DataFrame expectedDF = sqlContext().createDataFrame(expectedList, JavaPandas.class); + + assertDataFrameEquals(expectedDF, resultDF); + } + +} \ No newline at end of file diff --git a/src/test/java/com/highperformancespark/examples/goldilocks/JavaQuantileOnlyArtisanalTest.java b/src/test/java/com/highperformancespark/examples/goldilocks/JavaQuantileOnlyArtisanalTest.java new file mode 100644 index 0000000..98110d0 --- /dev/null +++ b/src/test/java/com/highperformancespark/examples/goldilocks/JavaQuantileOnlyArtisanalTest.java @@ -0,0 +1,42 @@ +package com.highperformancespark.examples.goldilocks; + +import com.google.common.collect.Sets; +import com.highperformancespark.examples.objects.JavaGoldiLocksRow; +import com.holdenkarau.spark.testing.SharedJavaSparkContext; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.SQLContext; +import org.junit.Test; + +import java.util.*; + +import static junit.framework.Assert.assertEquals; + +public class JavaQuantileOnlyArtisanalTest extends SharedJavaSparkContext { + + private List inputList = Arrays.asList( + new JavaGoldiLocksRow(0.0, 4.5, 7.7, 5.0), + new JavaGoldiLocksRow(1.0, 5.5, 6.7, 6.0), + new JavaGoldiLocksRow(2.0, 5.5, 1.5, 7.0), + new JavaGoldiLocksRow(3.0, 5.5, 0.5, 7.0), + new JavaGoldiLocksRow(4.0, 5.5, 0.5, 8.0)); + + @Test + public void goldiLocksFirstTry() { + SQLContext sqlContext = new SQLContext(jsc()); + DataFrame input = sqlContext.createDataFrame(inputList, JavaGoldiLocksRow.class); + Map> secondAndThird = JavaGoldiLocksFirstTry.findRankStatistics(input, Arrays.asList(2L, 3L)); + + Map> expectedResult = new HashMap<>(); + expectedResult.put(0, new HashSet<>(Arrays.asList(1.0, 2.0))); + expectedResult.put(1, new HashSet<>(Arrays.asList(5.5, 5.5))); + expectedResult.put(2, new HashSet<>(Arrays.asList(0.5, 1.5))); + 
expectedResult.put(3, new HashSet<>(Arrays.asList(6.0, 7.0))); + + for (Map.Entry> entry: secondAndThird.entrySet()) { + Set resultSet = Sets.newHashSet(entry.getValue()); + Set expectedSet = expectedResult.get(entry.getKey()); + + assertEquals(expectedSet, resultSet); + } + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/GoldiLocks/EvaluationTests.scala b/src/test/scala/com/high-performance-spark-examples/GoldiLocks/EvaluationTests.scala new file mode 100644 index 0000000..c635184 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/GoldiLocks/EvaluationTests.scala @@ -0,0 +1,94 @@ +package com.highperformancespark.examples.goldilocks + +import com.holdenkarau.spark.testing.SharedSparkContext +import org.apache.spark.rdd.RDD +import org.scalatest.FunSuite + +class EvaluationTests extends FunSuite with SharedSparkContext { + val doubleList = Array(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0) + val keyValuePairs = Array(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0).zipWithIndex + val path = "target/testResults" + test("MapValues preserves Partitioning "){ + + val data: RDD[(Double, Int )] = sc.parallelize(keyValuePairs) + // tag::MapValues[] + val sortedData = data.sortByKey() + val mapValues: RDD[(Double, String)] = sortedData.mapValues(_.toString) + assert(mapValues.partitioner.isDefined, "Using Map Values preserves partitioning") + + val map = sortedData.map( pair => (pair._1, pair._2.toString)) + assert(!map.partitioner.isDefined, "Using map does not preserve partitioning") + // end::MapValues[] + } + + test( "Subtract Behavior "){ + // tag::Subtract[] + val a = Array(1, 2, 3 ,4 ,4 ,4 ,4 ) + val b = Array(3, 4 ) + val rddA = sc.parallelize(a) + val rddB = sc.parallelize(b) + val rddC = rddA.subtract(rddB) + assert(rddC.count() < rddA.count() - rddB.count()) + // end::Subtract[] + } + + test( "Intersection Behavior "){ + // tag::Intersect[] + val a = Array(1, 2, 3 ,4 ,4 ,4 ,4 ) + val b = Array(3, 4 ) + val rddA = sc.parallelize(a) + val rddB = sc.parallelize(b) + val intersection = rddA.intersection(rddB) + val subtraction = rddA.subtract(rddB) + val union = intersection.union(subtraction) + assert(!rddA.collect().sorted.sameElements(union.collect().sorted)) + // end::Intersect[] + } + + test("Itereative Computations "){ + def RMSE(rdd : RDD[(Int, Int )]) = { + val n = rdd.count() + math.sqrt(rdd.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / n) + } + + val validationSet = sc.parallelize(keyValuePairs) + + // tag::iterativeComp[] + val testSet: Array[RDD[(Double, Int)]] = Array(validationSet.mapValues(_ + 1), validationSet.mapValues(_ + 2), validationSet) + validationSet.persist() //persist since we are using this RDD several times + val errors = testSet.map( rdd => { + RMSE(rdd.join(validationSet).values) + }) + // end::iterativeComp[] + + //the one where we didn't change anything should have the lowest root mean squared error + assert(errors.min == errors(2)) + + } + + test( "Two actions without caching ") { + val rddA: RDD[(Double, Int)] = sc.parallelize(keyValuePairs) + + // tag::TwoActions[] + val sorted = rddA.sortByKey() + val count = sorted.count() + val sample: Long = count / 10 + sorted.take(sample.toInt) + // end::TwoActions[] + } + + test( "Two actions with caching "){ + val rddA: RDD[(Double, Int)] = sc.parallelize(keyValuePairs) + // tag::TwoActionsCache[] + val sorted = rddA.sortByKey() + val count = sorted.count() + val sample: Long = count / 10 + rddA.persist() + sorted.take(sample.toInt) + // end::TwoActionsCache[] + } + + + +} + 
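The two "Two actions" tests that close EvaluationTests.scala are the motivating case for explicit caching: without it, the second action may redo work from the first. A minimal, self-contained sketch of the intended pattern (object and value names here are illustrative and not taken from this diff):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

object TwoActionsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("two-actions-sketch").setMaster("local[2]"))
    val pairs = sc.parallelize(1 to 1000).map(x => (x.toDouble, x))
    // Persist the RDD that both actions share, not the raw input,
    // so the sortByKey stage is only evaluated once.
    val sorted = pairs.sortByKey().persist(StorageLevel.MEMORY_ONLY)
    val count = sorted.count()                    // first action fills the cache
    val sample = sorted.take((count / 10).toInt)  // second action reads cached partitions
    println(sample.mkString(","))
    sc.stop()
  }
}

Persisting sorted rather than the unsorted input is the detail that keeps the second action from recomputing the shuffle.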
diff --git a/src/test/scala/com/high-performance-spark-examples/GoldiLocks/QuantileOnlyArtisanalTest.scala b/src/test/scala/com/high-performance-spark-examples/GoldiLocks/QuantileOnlyArtisanalTest.scala new file mode 100644 index 0000000..5126ce4 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/GoldiLocks/QuantileOnlyArtisanalTest.scala @@ -0,0 +1,97 @@ +package com.highperformancespark.examples.goldilocks + +import com.holdenkarau.spark.testing.SharedSparkContext +import org.apache.spark.sql.SQLContext +import org.scalatest.FunSuite + +// tag::MAGIC_PANDA[] +class QuantileOnlyArtisanalTest extends FunSuite with SharedSparkContext { + + val inputList = List( + GoldiLocksRow(0.0, 4.5, 7.7, 5.0), + GoldiLocksRow(1.0, 5.5, 6.7, 6.0), + GoldiLocksRow(2.0, 5.5, 1.5, 7.0), + GoldiLocksRow(3.0, 5.5, 0.5, 7.0), + GoldiLocksRow(4.0, 5.5, 0.5, 8.0) + ) + + test("Goldilocks first try ") { + val sqlContext = new SQLContext(sc) + val input = sqlContext.createDataFrame(inputList) + val secondAndThird = GoldiLocksFirstTry.findRankStatistics(input, targetRanks = List(2L, 3L)) + val expectedResult = Map[Int, Set[Double]]( + 0 -> Set(1.0, 2.0), + 1 -> Set(5.5, 5.5), + 2 -> Set(0.5, 1.5), + 3 -> Set(6.0, 7.0)) + secondAndThird.foreach(x => println( x._1 +"," + x._2.mkString(" "))) + assert(expectedResult.forall{case ((index, expectedRanks)) => + secondAndThird.get(index).get.toSet.equals(expectedRanks)}) + } + + //tests the edge case in which one partition does not contain any of the elements in one column + test("Goldilocks first try multiplePartitions") { + import org.scalatest.PrivateMethodTester._ + val testData = sc.parallelize(List(1.0, 2.0, 3.0, 4.0).map(x => (x, x)), 3) + val mapPartitions = testData.mapPartitionsWithIndex { + case (index, iter) => + val key = if (index == 1) 1 else 0 + iter.map(x => (x._1, key)) + } + + val getColumnsFreqPerPartition = PrivateMethod[ Array[(Int, Array[Long])]]('getColumnsFreqPerPartition) + val totals = GoldiLocksFirstTry invokePrivate getColumnsFreqPerPartition(mapPartitions, 2) + + totals.foreach(x => println(x._1 + " : " + x._2.mkString(" "))) + val getRanksLocationsWithinEachPart = + PrivateMethod[Array[(Int, List[(Int, Long)])]]('getRanksLocationsWithinEachPart) + + val locations = GoldiLocksFirstTry invokePrivate getRanksLocationsWithinEachPart(List(1L), totals, 2) + locations.foreach(x => println(x._1 + " : " + x._2.mkString(" "))) + + //assert that there is nothing in the column with index 1 on the second partition + assert(totals(1)._2(0) == 0 ) + + val firstPartition = locations(0)._2 + //assertFirstPartitionOnlyContains a target rank for the for columnIndex 0, at index 1 + assert(firstPartition.toSet.equals(Set((0,1))) ) + + //assertSecondPartition only contains rank for columnIndex 1, at index 1 + val secondPartition = locations(1)._2 + assert(secondPartition.toSet.equals(Set((1,1))) ) + + //assert ThirdPartition contains no locations + val thirdPartition = locations(2)._2 + assert(thirdPartition.toSet.equals(Set())) + assert(locations.length == 3) + } + + test("GoldiLocks With Hashmap ") { + val sqlContext = new SQLContext(sc) + val input = sqlContext.createDataFrame(inputList) + val secondAndThird = GoldiLocksWithHashMap.findRankStatistics(input, targetRanks = List(2L, 3L)) + val expectedResult = Map[Int, Set[Double]]( + 0 -> Set(1.0, 2.0), + 1 -> Set(5.5, 5.5), + 2 -> Set(0.5, 1.5), + 3 -> Set(6.0, 7.0)) + secondAndThird.foreach(x => println( x._1 +"," + x._2.mkString(" "))) + assert(expectedResult.forall{case ((index, 
expectedRanks)) => + secondAndThird.get(index).get.toSet.equals(expectedRanks)}) + } + + test("Secondary Sort"){ + val data = sc.parallelize(Range.apply(0, 10)).flatMap( i => List(20.0, 30.0 , 40.0 ).map(x => ((x, i), 1L ))) + val r = SecondarySort.groupByKeyAndSortBySecondaryKey(data, 3) + r.collect().foreach( v => println( v)) + val rSorted = r.collect().sortWith( + lt = (a, b) => a._1.toDouble > b._1.toDouble ) + assert(r.collect().zipWithIndex.forall{ + case (((key, list), index )) => rSorted(index)._1.equals(key) + }) + } + +} +// end::MAGIC_PANDA[] + +case class GoldiLocksRow(pandaId: Double, softness: Double, fuzziness: Double, size: Double) \ No newline at end of file diff --git a/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala b/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala new file mode 100644 index 0000000..c6d64fe --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala @@ -0,0 +1,291 @@ +/** + * Happy Panda Example for DataFrames. Computes the % of happy pandas. Very contrived. + */ +package com.highperformancespark.examples.dataframe + +import com.highperformancespark.examples.dataframe.HappyPandas.{PandaInfo, Pandas} +import com.holdenkarau.spark.testing._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import org.scalatest.Matchers._ + +import scala.collection.mutable +import scala.util.Random + +class HappyPandasTest extends DataFrameSuiteBase { + val toronto = "toronto" + val sandiego = "san diego" + val virginia = "virginia" + val pandaInfoList = List( + PandaInfo(toronto, "giant", 1, 2), + PandaInfo(sandiego, "red", 2, 3), + PandaInfo(virginia, "black", 1, 10)) + + val rawPandaList = List( + RawPanda(10L, "94110", "giant", true, Array(1.0, 0.9)), + RawPanda(11L, "94110", "red", true, Array(1.0, 0.9))) + + val pandasList = List(Pandas("bata", "10010", 10, 2), + Pandas("wiza", "10010", 20, 4), + Pandas("dabdob", "11000", 8, 2), + Pandas("hanafy", "11000", 15, 7), + Pandas("hamdi", "11111", 20, 10)) + + val pandaPlaces = List(PandaPlace("toronto", rawPandaList.toArray)) + + test("simple self join test") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(pandasList) + val result = HappyPandas.selfJoin(inputDF).select($"a.name", $"b.name") + val rez = result.collect() + rez.foreach{x => assert(x(0) == x(1))} + } + + test("simple explode test") { + val inputDF = sqlContext.createDataFrame(pandaPlaces) + val pandaInfo = sqlContext.createDataFrame(rawPandaList) + val expectedDf = pandaInfo.select((pandaInfo("attributes")(0) / pandaInfo("attributes")(1)).as("squishyness")) + val result = HappyPandas.squishPandaFromPace(inputDF) + + assertDataFrameApproximateEquals(expectedDf, result, 1E-5) + } + + //tag::approxEqualDataFrames[] + + test("verify simple happy pandas Percentage") { + val expectedList = List(Row(toronto, 0.5), Row(sandiego, 2/3.0), Row(virginia, 1/10.0)) + val expectedDf = createDF(expectedList, ("place", StringType), + ("percentHappy", DoubleType)) + + val inputDF = sqlContext.createDataFrame(pandaInfoList) + val resultDF = HappyPandas.happyPandasPercentage(inputDF) + + assertDataFrameApproximateEquals(expectedDf, resultDF, 1E-5) + } + //end::approxEqualDataFrames[] + + test("verify approx by hand") { + val inputDF = sqlContext.createDataFrame(pandaInfoList) + val resultDF = HappyPandas.happyPandasPercentage(inputDF) + val resultRows = resultDF.collect() + + 
val expectedRows = List(Row(toronto, 0.5), Row(sandiego, 2/3.0), Row(virginia, 1/10.0)) + + //tag::approxEqualRow[] + assert(expectedRows.length === resultRows.length) + expectedRows.zip(resultRows).foreach{case (r1, r2) => + assert(r1(0) === r2(0)) + assert(r1.getDouble(1) === (r2.getDouble(1) +- 0.001)) + } + //end::approxEqualRow[] + } + + test("test encode Panda type") { + val inputDF = sqlContext.createDataFrame(rawPandaList) + val resultDF = HappyPandas.encodePandaType(inputDF) + + val expectedRows = List(Row(10L, 0), Row(11L, 1)) + val expectedDF = createDF3(expectedRows, ("id", LongType, false), + ("encodedType", IntegerType, false)) + + assertDataFrameEquals(expectedDF, resultDF) + } + + //tag::exactEqualDataFrames[] + test("verify exact equality") { + // test minHappyPandas + val inputDF = sqlContext.createDataFrame(pandaInfoList) + val result = HappyPandas.minHappyPandas(inputDF, 2) + val resultRows = result.collect() + + val expectedRows = List(Row(sandiego, "red", 2, 3)) + assert(expectedRows === resultRows) + } + //end::exactEqualDataFrames[] + + test("test happyPandasPlaces") { + val inputDF = sqlContext.createDataFrame(pandaInfoList) + val resultDF = HappyPandas.happyPandasPlaces(inputDF) + + val expectedRows = List(PandaInfo(toronto, "giant", 1, 2), + PandaInfo(sandiego, "red", 2, 3)) + val expectedDF = sqlContext.createDataFrame(expectedRows) + + assertDataFrameEquals(expectedDF, resultDF) + } + + test("test maxPandaSizePerZip") { + val inputDF = sqlContext.createDataFrame(pandasList) + val resultDF = HappyPandas.maxPandaSizePerZip(inputDF) + + val expectedRows = List(Row(pandasList(1).zip, pandasList(1).pandaSize), + Row(pandasList(3).zip, pandasList(3).pandaSize), + Row(pandasList(4).zip, pandasList(4).pandaSize)) + val expectedDF = createDF(expectedRows, ("zip", StringType), + ("max(pandaSize)", IntegerType)) + + assertDataFrameEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip")) + } + + test("test minMaxPandaSizePerZip"){ + val inputDF = sqlContext.createDataFrame(pandasList) + val resultDF = HappyPandas.minMaxPandaSizePerZip(inputDF) + + val expectedRows = List( + Row(pandasList(1).zip, pandasList(0).pandaSize, pandasList(1).pandaSize), + Row(pandasList(3).zip, pandasList(2).pandaSize, pandasList(3).pandaSize), + Row(pandasList(4).zip, pandasList(4).pandaSize, pandasList(4).pandaSize)) + + val expectedDF = createDF(expectedRows, ("zip", StringType), + ("min(pandaSize)", IntegerType), + ("max(pandaSize)", IntegerType)) + + assertDataFrameEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip")) + } + + test("test minPandaSizeMaxAgePerZip") { + val inputDF = sqlContext.createDataFrame(pandasList) + val resultDF = HappyPandas.minPandaSizeMaxAgePerZip(inputDF) + + val expectedRows = List( + Row(pandasList(1).zip, pandasList(0).pandaSize, pandasList(1).age), + Row(pandasList(3).zip, pandasList(2).pandaSize, pandasList(3).age), + Row(pandasList(4).zip, pandasList(4).pandaSize, pandasList(4).age)) + + val expectedDF = createDF(expectedRows, ("zip", StringType), + ("min(pandaSize)", IntegerType), + ("max(age)", IntegerType)) + + assertDataFrameEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip")) + } + + test("test complexAggPerZip") { + val inputDF = sqlContext.createDataFrame(pandasList) + val resultDF = HappyPandas.minMeanSizePerZip(inputDF) + + val expectedRows = List( + Row(pandasList(1).zip, pandasList(0).pandaSize, 15.0), + Row(pandasList(3).zip, pandasList(2).pandaSize, 11.5), + Row(pandasList(4).zip, pandasList(4).pandaSize, 20.0)) + + val expectedDF = 
createDF(expectedRows, ("zip", StringType), + ("min(pandaSize)", IntegerType), + ("avg(pandaSize)", DoubleType)) + + assertDataFrameApproximateEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip"), 1e-5) + } + + + test("test Simple SQL example") { + val inputDF = sqlContext.createDataFrame(pandasList) + val resultDF = HappyPandas.simpleSqlExample(inputDF) + + val expectedRows = List(pandasList(0), pandasList(2)) + val expectedDF = sqlContext.createDataFrame(expectedRows) + + assertDataFrameEquals(expectedDF, resultDF) + } + + test("test Order Pandas") { + val inputDF = sqlContext.createDataFrame(pandasList) + val resultDF = HappyPandas.orderPandas(inputDF) + + val expectedRows = List(pandasList(2), pandasList(0), pandasList(3), + pandasList(4), pandasList(1)) + val expectedDF = sqlContext.createDataFrame(expectedRows) + + assertDataFrameEquals(expectedDF, resultDF) + } + + + test("test computeRelativePandaSizes") { + val inputPandaList = loadPandaStuffies() + val inputDF = sqlContext.createDataFrame(inputPandaList) + + val resultDF = HappyPandas.computeRelativePandaSizes(inputDF) + + val expectedDF = getExpectedPandasRelativeSize(inputPandaList, -10, 10) + + assertDataFrameApproximateEquals(expectedDF.orderBy("name"), resultDF.orderBy("name"), 1e-5) + } + + private def getExpectedPandasRelativeSize(pandaList: List[Pandas], start: Int, end: Int):DataFrame = { + + val expectedRows = + pandaList + .groupBy(_.zip) + .map(zipPandas => (zipPandas._1, zipPandas._2.sortBy(_.age))) + .flatMap(zipPandas => { + val pandas = zipPandas._2 + val length = pandas.size - 1 + val result = new mutable.MutableList[Row] + + for (i <- 0 to length) { + var totalSum = 0 + val startOffset = math.max(0, i + start) + val endOffset = math.min(length, i + end) + + for (j <- startOffset to endOffset) + totalSum += pandas(j).pandaSize + + val count = endOffset - startOffset + 1 + val average = totalSum.toDouble / count + + val panda = pandas(i) + result += Row(panda.name, panda.zip, panda.pandaSize, panda.age, panda.pandaSize - average) + } + + result + }).toList + + val expectedDF = createDF(expectedRows, ("name", StringType), + ("zip", StringType), + ("pandaSize", IntegerType), + ("age", IntegerType), + ("panda_relative_size", DoubleType)) + + expectedDF + } + + private def loadPandaStuffies(): List[Pandas] = { + val zipCount = 3 + val maxPandasPerZip = 15 + val maxPandaAge = 50 + val maxPandaSize = 500 + val random = new Random() + + val pandas = + (1 to zipCount) + .flatMap(zipId => { + val pandasCount = 1 + random.nextInt(maxPandasPerZip) + val zipName = s"zip($zipId)" + + (1 to pandasCount).map(pandaId => { + val name = s"panda($pandaId)($zipId)" + val size = 1 + random.nextInt(maxPandaSize) + val age = 1 + random.nextInt(maxPandaAge) + + Pandas(name, zipName, size, age) + } + ) + + }) + + pandas.toList + } + + + private def createDF(list: List[Row], fields: (String, DataType)*) = + sqlContext.createDataFrame(sc.parallelize(list), structType2(fields)) + + private def structType2(fields: Seq[(String, DataType)]) = + StructType(fields.map(f => StructField(f._1, f._2)).toList) + + + private def createDF3(list: List[Row], fields: (String, DataType, Boolean)*) = + sqlContext.createDataFrame(sc.parallelize(list), structType3(fields)) + + private def structType3(fields: Seq[(String, DataType, Boolean)]) = + StructType(fields.map(f => StructField(f._1, f._2, f._3)).toList) +} diff --git a/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala 
b/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala new file mode 100644 index 0000000..3a50e47 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala @@ -0,0 +1,39 @@ +/** + * Test our simple JNI + */ +package com.highperformancespark.examples.ffi + +import com.holdenkarau.spark.testing._ +import org.scalacheck.{Arbitrary, Gen} +import org.scalacheck.Prop.forAll +import org.scalatest.FunSuite +import org.scalatest.prop.Checkers +import org.scalatest.Matchers._ + +class NativeExampleSuite extends FunSuite with SharedSparkContext with Checkers { + test("local sum") { + //def magic2() { + val input = Array(1, 2, 3) + val sumMagic = new SumJNI() + val result = sumMagic.sum(input) + val expected = 6 + result === expected + } + + test("super simple test") { + val input = sc.parallelize(List(("hi", Array(1, 2, 3)))) + val result = NativeExample.jniSum(input).collect() + val expected = List(("hi", 6)) + result === expected + } + + test("native call should find sum correctly") { + val property = forAll(RDDGenerator.genRDD[(String, Array[Int])](sc)(Arbitrary.arbitrary[(String, Array[Int])])) { + rdd => + val expected = rdd.mapValues(_.sum) + val result = NativeExample.jniSum(rdd) + RDDComparisons.compareWithOrder(expected, result).isEmpty + } + check(property) + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala new file mode 100644 index 0000000..ba90abe --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala @@ -0,0 +1,24 @@ +/** + * Happy Panda Example for DataFrames. Computes the % of happy pandas. Very contrived. + */ +package com.highperformancespark.examples.transformations + +import com.highperformancespark.examples.dataframe.RawPanda + +import com.holdenkarau.spark.testing._ + +import org.scalatest.FunSuite + +class AccumulatorsTest extends FunSuite with SharedSparkContext { + test("accumulator max should function") { + val input = sc.parallelize(1.to(100)).map(x => RawPanda(1L, "1", "red", true, Array(x.toDouble))) + val (_, max) = Accumulators.computeMaxFuzzyNess(sc, input) + assert(max === 100.0) + } + + test("accumulator sum should function") { + val input = sc.parallelize(1.to(100)).map(x => RawPanda(1L, "1", "red", true, Array(x.toDouble))) + val (_, sum) = Accumulators.computeTotalFuzzyNess(sc, input) + assert(sum === 5050.0) + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala b/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala new file mode 100644 index 0000000..6d8edb9 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala @@ -0,0 +1,24 @@ +package com.highperformancespark.examples.wordcount + + +import com.holdenkarau.spark.testing.SharedSparkContext +import org.scalatest.FunSuite + +class WordCountTest extends FunSuite with SharedSparkContext { + test("word count with Stop Words Removed"){ + val wordRDD = sc.parallelize(Seq( + "How happy was the panda? 
You ask.", + "Panda is the most happy panda in all the #$!?ing land!")) + + val stopWords: Set[String] = Set("a", "the", "in", "was", "there", "she", "he") + val illegalTokens: Array[Char] = "#$%?!.".toCharArray + + val wordCounts = WordCount.withStopWordsFiltered(wordRDD, illegalTokens, stopWords) + val wordCountsAsMap = wordCounts.collectAsMap() + assert(!wordCountsAsMap.contains("the")) + assert(!wordCountsAsMap.contains("?")) + assert(!wordCountsAsMap.contains("#$!?ing")) + assert(wordCountsAsMap.contains("ing")) + assert(wordCountsAsMap.get("panda").get.equals(3)) + } +} diff --git a/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala b/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala new file mode 100644 index 0000000..4d983a6 --- /dev/null +++ b/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala @@ -0,0 +1,11 @@ +package com.highperformancespark.examples + + +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD + +class JavaInteropTestHelper(sc: SparkContext) { + def generateMiniPairRDD(): RDD[(String, Long)] = { + sc.parallelize(List(("panda", 12L))) + } +}
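JavaInteropTestHelper builds the pair RDD on the Scala side so that the Java tests above only exercise the wrapping logic. The JavaInterop class itself is not part of this diff, but a wrapper in that spirit can be written against public Spark APIs alone; the object and method below are assumptions for illustration, not the repository's implementation:

import scala.reflect.ClassTag

import org.apache.spark.api.java.JavaPairRDD
import org.apache.spark.rdd.RDD

object PairRDDWrapSketch {
  // The ClassTags are the piece a Java caller cannot easily supply, which is
  // why keeping this conversion on the Scala side keeps the Java tests short.
  def wrapPairRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): JavaPairRDD[K, V] =
    JavaPairRDD.fromRDD(rdd)
}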