diff --git a/.gitignore b/.gitignore
index c58d83b..0f86cc0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,9 @@ project/plugins/project/
 # Scala-IDE specific
 .scala_dependencies
 .worksheet
+
+# emacs stuff
+\#*\#
+\.\#*
+*~
+sbt/*launch*.jar
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..520a5bf
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,35 @@
+language: scala
+sudo: false
+cache:
+  directories:
+  - $HOME/.ivy2
+  - $HOME/spark
+  - $HOME/.cache/pip
+  - $HOME/.sbt/launchers
+scala:
+  - 2.11.6
+jdk:
+  - oraclejdk8
+apt:
+  sources:
+  - ubuntu-toolchain-r-test
+  packages:
+  - gfortran
+  - gcc
+  - binutils
+  - python-pip
+r_packages:
+  - Imap
+before_install:
+  - pip install --user codecov unittest2 nose pep8 pylint pandas numpy --download-cache $HOME/.pip-cache
+script:
+  - "export SPARK_CONF_DIR=./log4j/"
+  - sbt clean coverage compile test
+  - "[ -d spark ] || mkdir spark && cd spark && wget http://d3kbcqa49mib13.cloudfront.net/spark-1.6.1-bin-hadoop2.6.tgz && cd .."
+  - "tar -xf ./spark/spark-1.6.1-bin-hadoop2.6.tgz"
+  - "export SPARK_HOME=`pwd`/spark-1.6.1-bin-hadoop2.6"
+  - "export PYTHONPATH=$SPARK_HOME/python:`ls -1 $SPARK_HOME/python/lib/py4j-*-src.zip`:$PYTHONPATH"
+  - "nosetests --with-doctest --doctest-options=+ELLIPSIS --logging-level=INFO --detailed-errors --verbosity=2 --with-coverage --cover-html-dir=./htmlcov"
+after_success:
+# Upload coverage results
+  - codecov
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index 8f71f43..80f405b 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,3 +1,6 @@
+Individual components under resources are available under their own licenses.
+ * MySQL connector is GPL
+The source code in this repo is available under the Apache License
 Apache License
 Version 2.0, January 2004
 http://www.apache.org/licenses/
diff --git a/README.md b/README.md
index a7f4184..551928f 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,10 @@
 # high-performance-spark-examples
 Examples for High Performance Spark
+
+# Building
+
+Most of the examples can be built with sbt; the C and Fortran components depend on gcc, g77, and cmake.
+
+# Tests
+
+The full test suite depends on having the C and Fortran components built as well as a local R installation available.
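As a rough local equivalent of the CI configuration above, a build-and-test session might look like the sketch below. It only mirrors what `.travis.yml` does and assumes a JDK 8, gcc/gfortran, cmake, and the Python `nose` tooling are installed; the Spark path is illustrative.

```bash
# Build and run the Scala/Java tests with the bundled launcher script (or a system sbt)
./sbt/sbt clean compile test

# The Python doctests additionally need a local Spark 1.6.1 distribution on the path
export SPARK_HOME=$HOME/spark-1.6.1-bin-hadoop2.6
export PYTHONPATH="$SPARK_HOME/python:$(ls -1 $SPARK_HOME/python/lib/py4j-*-src.zip):$PYTHONPATH"
nosetests --with-doctest --doctest-options=+ELLIPSIS
```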
diff --git a/build.sbt b/build.sbt new file mode 100644 index 0000000..a9e9098 --- /dev/null +++ b/build.sbt @@ -0,0 +1,90 @@ +organization := "com.highperformancespark" + +name := "examples" + +publishMavenStyle := true + +version := "0.0.1" + +scalaVersion := "2.11.6" + +crossScalaVersions := Seq("2.11.6") + +javacOptions ++= Seq("-source", "1.8", "-target", "1.8") + +sparkVersion := "1.6.1" + +//tag::sparkComponents[] +// TODO(Holden): re-add hive-thriftserver post Spark 2.0 +sparkComponents ++= Seq("core", "streaming", "mllib") +//end::sparkComponents[] +//tag::addSQLHiveComponent[] +sparkComponents ++= Seq("sql", "hive") +//end::addSQLHiveComponent[] + + +parallelExecution in Test := false + +fork := true + +javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:MaxPermSize=2048M", "-XX:+CMSClassUnloadingEnabled") + +// additional libraries +libraryDependencies ++= Seq( + "org.scalatest" %% "scalatest" % "2.2.1", + "org.scalacheck" %% "scalacheck" % "1.12.4", + "junit" % "junit" % "4.10", + // Temporary hack until Spark 2.0 + "org.apache.spark" % "spark-hive-thriftserver_2.10" % "1.6.1" % "provided" intransitive(), + //tag::sparkCSV[] + "com.databricks" % "spark-csv_2.10" % "1.3.0", + //end::sparkCSV[] + "com.holdenkarau" % "spark-testing-base_2.11" % "1.6.1_0.3.3", + "org.eclipse.jetty" % "jetty-util" % "9.3.2.v20150730", + "org.codehaus.jackson" % "jackson-mapper-asl" % "1.8.8", + "com.novocode" % "junit-interface" % "0.10" % "test->default") + + +scalacOptions ++= Seq("-deprecation", "-unchecked") + +pomIncludeRepository := { x => false } + +resolvers ++= Seq( + "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", + "Spray Repository" at "http://repo.spray.cc/", + "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", + "Akka Repository" at "http://repo.akka.io/releases/", + "Twitter4J Repository" at "http://twitter4j.org/maven2/", + "Apache HBase" at "https://repository.apache.org/content/repositories/releases", + "Twitter Maven Repo" at "http://maven.twttr.com/", + "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", + "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/", + "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", + "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/", + "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", + Resolver.sonatypeRepo("public"), + Resolver.bintrayRepo("jodersky", "sbt-jni-macros"), + "jodersky" at "https://dl.bintray.com/jodersky/maven/" +) + +licenses := Seq("Apache License 2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html")) + +mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => + { + case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard + case m if m.startsWith("META-INF") => MergeStrategy.discard + case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first + case PathList("org", "apache", xs @ _*) => MergeStrategy.first + case PathList("org", "jboss", xs @ _*) => MergeStrategy.first + case "log4j.properties" => MergeStrategy.discard + case "about.html" => MergeStrategy.rename + case "reference.conf" => MergeStrategy.concat + case _ => MergeStrategy.first + } +} + +// JNI + +enablePlugins(JniNative) + +sourceDirectory in nativeCompile := sourceDirectory.value diff --git a/conf/log4j.properties b/conf/log4j.properties new file mode 100644 index 0000000..e90a817 --- /dev/null +++ b/conf/log4j.properties @@ -0,0 
+1,40 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +log4j.rootCategory=ERROR, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Set the default spark-shell log level to WARN. When running the spark-shell, the +# log level for this class is used to overwrite the root logger's log level, so that +# the user can have different defaults for the shell and regular Spark apps. +log4j.logger.org.apache.spark.repl.Main=ERROR + +# Settings to quiet third party logs that are too verbose +log4j.logger.org.spark-project.jetty=ERROR +log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR +log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO +log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO +log4j.logger.org.apache.parquet=ERROR +log4j.logger.parquet=ERROR + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL +log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR diff --git a/high_performance_pyspark/SQLLineage.py b/high_performance_pyspark/SQLLineage.py new file mode 100644 index 0000000..c9d77a2 --- /dev/null +++ b/high_performance_pyspark/SQLLineage.py @@ -0,0 +1,38 @@ +""" +>>> from pyspark.context import SparkContext +>>> from pyspark.sql import SQLContext, Row, DataFrame +>>> sc = SparkContext('local', 'test') +... +>>> sc.setLogLevel("ERROR") +>>> sqlCtx = SQLContext(sc) +... +>>> rdd = sc.parallelize(range(1, 100)).map(lambda x: Row(i = x)) +>>> df = rdd.toDF() +>>> df2 = cutLineage(df) +>>> df.head() == df2.head() +True +>>> df.schema == df2.schema +True +""" + +from pyspark.sql import DataFrame + +#tag::cutLineage[] +def cutLineage(df): + """ + Cut the lineage of a DataFrame - used for iterative algorithms + + .. Note: This uses internal members and may break between versions + """ + jRDD = df._jdf.toJavaRDD() + jSchema = df._jdf.schema() + jRDD.cache() + sqlCtx = df.sql_ctx + try: + javaSqlCtx = sqlCtx._jsqlContext + except: + javaSqlCtx = sqlCtx._ssql_ctx + newJavaDF = javaSqlCtx.createDataFrame(jRDD, jSchema) + newDF = DataFrame(newJavaDF, sqlCtx) + return newDF +#end::cutLineage[] diff --git a/high_performance_pyspark/__init__.py b/high_performance_pyspark/__init__.py new file mode 100644 index 0000000..7741593 --- /dev/null +++ b/high_performance_pyspark/__init__.py @@ -0,0 +1,25 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +""" +Python version of selected examples from High Performance Spark +""" + +import os +import sys + diff --git a/high_performance_pyspark/simple_perf_test.py b/high_performance_pyspark/simple_perf_test.py new file mode 100644 index 0000000..e5d6cbb --- /dev/null +++ b/high_performance_pyspark/simple_perf_test.py @@ -0,0 +1,91 @@ +# When running this example make sure to include the built Scala jar : +# $SPARK_HOME/bin/pyspark --jars ./target/examples-0.0.1.jar --driver-class-path ./target/examples-0.0.1.jar +# This example illustrates how to interface Scala and Python code, but caution +# should be taken as it depends on many private members that may change in +# future releases of Spark. + +from pyspark.sql.types import * +from pyspark.sql import DataFrame +import timeit +import time + +def generate_scale_data(sqlCtx, rows, numCols): + """ + Generate scale data for the performance test. + + This also illustrates calling custom Scala code from the driver. + + .. Note: This depends on many internal methods and may break between versions. + """ + sc = sqlCtx._sc + # Get the SQL Context, 2.0 and pre-2.0 syntax + try: + javaSqlCtx = sqlCtx._jsqlContext + except: + javaSqlCtx = sqlCtx._ssql_ctx + jsc = sc._jsc + scalasc = jsc.sc() + gateway = sc._gateway + # Call a java method that gives us back an RDD of JVM Rows (Int, Double) + # While Python RDDs are wrapped Java RDDs (even of Rows) the contents are different, so we + # can't directly wrap this. + # This returns a Java RDD of Rows - normally it would better to + # return a DataFrame directly, but for illustration we will work with an RDD + # of Rows. + java_rdd = gateway.jvm.com.highperformancespark.examples.tools.GenerateScalingData. \ + generateMiniScaleRows(scalasc, rows, numCols) + # Schemas are serialized to JSON and sent back and forth + # Construct a Python Schema and turn it into a Java Schema + schema = StructType([StructField("zip", IntegerType()), StructField("fuzzyness", DoubleType())]) + jschema = javaSqlCtx.parseDataType(schema.json()) + # Convert the Java RDD to Java DataFrame + java_dataframe = javaSqlCtx.createDataFrame(java_rdd, jschema) + # Wrap the Java DataFrame into a Python DataFrame + python_dataframe = DataFrame(java_dataframe, sqlCtx) + # Convert the Python DataFrame into an RDD + pairRDD = python_dataframe.rdd.map(lambda row: (row[0], row[1])) + return (python_dataframe, pairRDD) + +def runOnDF(df): + result = df.groupBy("zip").avg("fuzzyness").count() + return result + +def runOnRDD(rdd): + result = rdd.map(lambda (x, y): (x, (y, 1))). \ + reduceByKey(lambda x, y: (x[0] + y [0], x[1] + y[1])). 
\ + count() + return result + +def groupOnRDD(rdd): + return rdd.groupByKey().mapValues(lambda v: sum(v) / float(len(v))).count() + +def run(sc, sqlCtx, scalingFactor, size): + (input_df, input_rdd) = generate_scale_data(sqlCtx, scalingFactor, size) + input_rdd.cache().count() + rddTimeings = timeit.repeat(stmt=lambda: runOnRDD(input_rdd), repeat=10, number=1, timer=time.time, setup='gc.enable()') + groupTimeings = timeit.repeat(stmt=lambda: groupOnRDD(input_rdd), repeat=10, number=1, timer=time.time, setup='gc.enable()') + input_df.cache().count() + dfTimeings = timeit.repeat(stmt=lambda: runOnDF(input_df), repeat=10, number=1, timer=time.time, setup='gc.enable()') + print "RDD:" + print rddTimeings + print "group:" + print groupTimeings + print "df:" + print dfTimeings + print "yay" + +if __name__ == "__main__": + + """ + Usage: simple_perf_test scalingFactor size + """ + import sys + from pyspark import SparkContext + from pyspark.sql import SQLContext + scalingFactor = int(sys.argv[1]) + size = int(sys.argv[2]) + sc = SparkContext(appName="SimplePythonPerf") + sqlCtx = SQLContext(sc) + run(sc, sqlCtx, scalingFactor, size) + + sc.stop() diff --git a/project/plugins.sbt b/project/plugins.sbt new file mode 100644 index 0000000..253c5a6 --- /dev/null +++ b/project/plugins.sbt @@ -0,0 +1,21 @@ +addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.6.0") + +resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/" + +resolvers += "sonatype-snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/" + + +resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven" + +// Temporary hack for bintray being sad + +resolvers += Resolver.bintrayRepo("jodersky", "sbt-jni-macros") +resolvers += "jodersky" at "https://dl.bintray.com/jodersky/maven/" + +addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.2") + +//addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0") + +addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.3.3") + +addSbtPlugin("ch.jodersky" % "sbt-jni" % "1.0.0-RC3") diff --git a/resources/mysql-connector-java-5.1.38.jar b/resources/mysql-connector-java-5.1.38.jar new file mode 100644 index 0000000..be09493 Binary files /dev/null and b/resources/mysql-connector-java-5.1.38.jar differ diff --git a/resources/rawpanda.json b/resources/rawpanda.json new file mode 100644 index 0000000..1d9940d --- /dev/null +++ b/resources/rawpanda.json @@ -0,0 +1,2 @@ +{"name":"mission","pandas":[{"id":1,"zip":"94110","pt":"giant", "happy":true, + "attributes":[0.4,0.5]}]} diff --git a/sbt/sbt b/sbt/sbt new file mode 100755 index 0000000..aac1085 --- /dev/null +++ b/sbt/sbt @@ -0,0 +1,52 @@ +#!/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# This script launches sbt for this project. If present it uses the system +# version of sbt. If there is no system version of sbt it attempts to download +# sbt locally. +SBT_VERSION=0.13.9 +URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar +URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar +JAR=sbt/sbt-launch-${SBT_VERSION}.jar + +# Download sbt launch jar if it hasn't been downloaded yet +if [ ! -f ${JAR} ]; then + # Download + printf "Attempting to fetch sbt\n" + set -x + JAR_DL=${JAR}.part + if hash wget 2>/dev/null; then + (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} + elif hash axel 2>/dev/null; then + (axel ${URL1} -o ${JAR_DL} || axel ${URL2} -o ${JAR_DL}) && mv ${JAR_DL} ${JAR} + else + printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" + exit -1 + fi +fi +if [ ! -f ${JAR} ]; then + # We failed to download + printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n" + exit -1 +fi +printf "Launching sbt from ${JAR}\n" +java \ + -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \ + -jar ${JAR} \ + "$@" diff --git a/shell-scripts/launch-with-mysql-jdbc b/shell-scripts/launch-with-mysql-jdbc new file mode 100644 index 0000000..90ac352 --- /dev/null +++ b/shell-scripts/launch-with-mysql-jdbc @@ -0,0 +1,5 @@ +ASSEMBLY_JAR=./target/scala-2.10/examples_2.10.jar +CLASS="com.highperformancespark.dataframe.mysqlload" +#tag:[submit] +spark-submit --jars ./resources/mysql-connector-java-5.1.38.jar $ASSEMBLY_JAR $CLASS +#end:[submit] \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..4d3442b --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,64 @@ +################################################################ +# A minimal CMake file that is compatible with sbt-jni # +# # +# All settings required by sbt-jni have been marked so, please # +# add/modify/remove settings to build your specific library. # +################################################################ + +cmake_minimum_required(VERSION 2.6) + +# Define project and related variables +# +project (high-performance-spark) + +# Set versions and library name +# (required by sbt-jni) please use semantic versioning +# +set (VERSION_MAJOR 0) +set (VERSION_MINOR 0) +set (VERSION_PATCH 0) +# (required by sbt-jni) major version will always be appended to library name +set (LIB_NAME ${CMAKE_PROJECT_NAME}${VERSION_MAJOR}) + +# Command-line options +# +# (set by sbt-jni) +set (LIB_INSTALL_DIR lib CACHE PATH "Path in which to install libraries (equivalent to Autoconf --libdir).") +# (set by sbt-jni) +set (LIB_ENABLE_MINOR_VERSIONS ON CACHE BOOLEAN "Build libraries with minor and patch versions appended.") + +# Setup JNI +find_package(JNI REQUIRED) +if (JNI_FOUND) + message (STATUS "JNI include directories: ${JNI_INCLUDE_DIRS}") +endif() + +# Include directories +include_directories(.) +include_directories(./main/c) +include_directories(include) +include_directories(${JNI_INCLUDE_DIRS}) + +# Setup main shared library +file(GLOB LIB_SRC + "*.c" + "*.cpp" + "./main/c/*.c" + "./main/c/*.cpp" +) +add_library(${LIB_NAME} SHARED ${LIB_SRC}) + +# By default, in a regular build, minor and patch versions are added to the generated files. 
+# When built through sbt-jni however, LIB_ENABLE_MINOR_VERSIONS is deactivated and only a
+# major-versioned library file is built.
+if (LIB_ENABLE_MINOR_VERSIONS)
+  set_target_properties(
+    ${LIB_NAME}
+    PROPERTIES
+    VERSION 0.${VERSION_MINOR}.${VERSION_PATCH} # major version always 0, it is included in library name
+    SOVERSION 0
+  )
+endif()
+
+# Installation targets
+install(TARGETS ${LIB_NAME} LIBRARY DESTINATION ${LIB_INSTALL_DIR})
diff --git a/src/main/c/include/com_highperformancespark_examples_ffi_SumJNI.h b/src/main/c/include/com_highperformancespark_examples_ffi_SumJNI.h
new file mode 100644
index 0000000..75be264
--- /dev/null
+++ b/src/main/c/include/com_highperformancespark_examples_ffi_SumJNI.h
@@ -0,0 +1,21 @@
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class com_highperformancespark_examples_ffi_SumJNI */
+
+#ifndef _Included_com_highperformancespark_examples_ffi_SumJNI
+#define _Included_com_highperformancespark_examples_ffi_SumJNI
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Class:     com_highperformancespark_examples_ffi_SumJNI
+ * Method:    sum
+ * Signature: ([I)I
+ */
+JNIEXPORT jint JNICALL Java_com_highperformancespark_examples_ffi_SumJNI_sum
+  (JNIEnv *, jobject, jintArray);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/src/main/c/sum.c b/src/main/c/sum.c
new file mode 100644
index 0000000..f571aad
--- /dev/null
+++ b/src/main/c/sum.c
@@ -0,0 +1,9 @@
+#include "sum.h"
+
+int sum(int input[], int num_elem) {
+  int c, ret = 0;
+  for (c = 0; c < num_elem; c++) {
+    ret += input[c];
+  }
+  return ret;
+}
diff --git a/src/main/c/sum.h b/src/main/c/sum.h
new file mode 100644
index 0000000..d04be96
--- /dev/null
+++ b/src/main/c/sum.h
@@ -0,0 +1,6 @@
+#ifndef _SUM_H
+#define _SUM_H
+
+int sum(int input[], int num_elem);
+
+#endif /* _SUM_H */
diff --git a/src/main/c/sum_wrapper.c b/src/main/c/sum_wrapper.c
new file mode 100644
index 0000000..a499d3e
--- /dev/null
+++ b/src/main/c/sum_wrapper.c
@@ -0,0 +1,19 @@
+#include "sum.h"
+#include "include/com_highperformancespark_examples_ffi_SumJNI.h"
+#include <jni.h>
+#include <stdint.h>
+
+/*
+ * Class:     com_highperformancespark_examples_ffi_SumJNI
+ * Method:    sum
+ * Signature: ([I)I
+ */
+JNIEXPORT jint JNICALL Java_com_highperformancespark_examples_ffi_SumJNI_sum
+(JNIEnv *env, jobject obj, jintArray ja) {
+  jsize size = (*env)->GetArrayLength(env, ja);
+  jint *a = (*env)->GetIntArrayElements(env, ja, 0);
+  jint result = sum(a, size);
+  // Release the element buffer without copying back (the array is not modified)
+  (*env)->ReleaseIntArrayElements(env, ja, a, JNI_ABORT);
+  return result;
+}
diff --git a/src/main/java/com/highperformancespark/examples/JavaInterop.java b/src/main/java/com/highperformancespark/examples/JavaInterop.java
new file mode 100644
index 0000000..3b37093
--- /dev/null
+++ b/src/main/java/com/highperformancespark/examples/JavaInterop.java
@@ -0,0 +1,36 @@
+package com.highperformancespark.examples;
+
+import scala.reflect.*;
+import scala.Tuple2;
+
+import org.apache.spark.rdd.RDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import static org.apache.spark.sql.functions.*;
+
+public class JavaInterop {
+
+  //tag::realClassTag[]
+  public static JavaPairRDD<String, Long> wrapPairRDD(RDD<Tuple2<String, Long>> rdd) {
+    // Construct the class tags
+    ClassTag<String> strCt = ClassTag$.MODULE$.apply(String.class);
+    ClassTag<Long> longCt = ClassTag$.MODULE$.apply(scala.Long.class);
+    return new JavaPairRDD<String, Long>(rdd, strCt, longCt);
+  }
+  //end::realClassTag[]
+
+  //tag::fakeClassTag[]
+  public static JavaPairRDD<String, Long> wrapPairRDDFakeCt(RDD<Tuple2<String, Long>> rdd) {
+    //
Construct the class tags by casting AnyRef - this would be more commonly done with + // generic or templated code where we can't explicitly construct the correct class tag + // as using fake class tags may result in degraded performance. + ClassTag fake = ClassTag$.MODULE$.AnyRef(); + return new JavaPairRDD(rdd, fake, fake); + } + //end::fakeClassTag[] +} diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java b/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java new file mode 100644 index 0000000..bc93163 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java @@ -0,0 +1,210 @@ +package com.highperformancespark.examples.dataframe; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Column; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.expressions.Window; +import org.apache.spark.sql.expressions.WindowSpec; +import org.apache.spark.sql.hive.HiveContext; + +import java.util.HashMap; +import java.util.Map; + +import static org.apache.spark.sql.functions.*; + +public class JavaHappyPandas { + + /** + * Creates SQLContext with an existing SparkContext. + */ + public static SQLContext sqlContext(JavaSparkContext jsc) { + SQLContext sqlContext = new SQLContext(jsc); + return sqlContext; + } + + /** + * Creates HiveContext with an existing SparkContext. + */ + public static HiveContext hiveContext(JavaSparkContext jsc) { + HiveContext hiveContext = new HiveContext(jsc); + return hiveContext; + } + + /** + * Illustrate loading some JSON data. + */ + public static DataFrame loadDataSimple(JavaSparkContext jsc, SQLContext sqlContext, String path) { + DataFrame df1 = sqlContext.read().json(path); + + DataFrame df2 = sqlContext.read().format("json").option("samplingRatio", "1.0").load(path); + + JavaRDD jsonRDD = jsc.textFile(path); + DataFrame df3 = sqlContext.read().json(jsonRDD); + + return df1; + } + + public static DataFrame jsonLoadFromRDD(SQLContext sqlContext, JavaRDD input) { + JavaRDD rdd = input.filter(e -> e.contains("panda")); + DataFrame df = sqlContext.read().json(rdd); + return df; + } + + // Here will be some examples on PandaInfo DataFrame + + /** + * Gets the percentage of happy pandas per place. + * + * @param pandaInfo the input DataFrame + * @return Returns DataFrame of (place, percentage of happy pandas) + */ + public static DataFrame happyPandasPercentage(DataFrame pandaInfo) { + DataFrame happyPercentage = pandaInfo.select(pandaInfo.col("place"), + (pandaInfo.col("happyPandas").divide(pandaInfo.col("totalPandas"))).as("percentHappy")); + return happyPercentage; + } + + /** + * Encodes pandaType to Integer values instead of String values. + * + * @param pandaInfo the input DataFrame + * @return Returns a DataFrame of pandaId and integer value for pandaType. + */ + public static DataFrame encodePandaType(DataFrame pandaInfo) { + DataFrame encodedDF = pandaInfo.select(pandaInfo.col("id"), + when(pandaInfo.col("pt").equalTo("giant"), 0). + when(pandaInfo.col("pt").equalTo("red"), 1). + otherwise(2).as("encodedType")); + + return encodedDF; + } + + /** + * Gets places with happy pandas more than minHappinessBound. + */ + public static DataFrame minHappyPandas(DataFrame pandaInfo, int minHappyPandas) { + return pandaInfo.filter(pandaInfo.col("happyPandas").geq(minHappyPandas)); + } + + /** + * Find pandas that are sad. 
+ */ + public static DataFrame sadPandas(DataFrame pandaInfo) { + return pandaInfo.filter(pandaInfo.col("happy").notEqual(true)); + } + + /** + * Find pandas that are happy and fuzzier than squishy. + */ + public static DataFrame happyFuzzyPandas(DataFrame pandaInfo) { + DataFrame df = pandaInfo.filter( + pandaInfo.col("happy").and(pandaInfo.col("attributes").apply(0)).gt(pandaInfo.col("attributes").apply(1)) + ); + + return df; + } + + /** + * Gets places that contains happy pandas more than unhappy pandas. + */ + public static DataFrame happyPandasPlaces(DataFrame pandaInfo) { + return pandaInfo.filter(pandaInfo.col("happyPandas").geq(pandaInfo.col("totalPandas").divide(2))); + } + + /** + * Remove duplicate pandas by id. + */ + public static DataFrame removeDuplicates(DataFrame pandas) { + DataFrame df = pandas.dropDuplicates(new String[]{"id"}); + return df; + } + + public static DataFrame describePandas(DataFrame pandas) { + return pandas.describe(); + } + + public static DataFrame maxPandaSizePerZip(DataFrame pandas) { + return pandas.groupBy(pandas.col("zip")).max("pandaSize"); + } + + public static DataFrame minMaxPandaSizePerZip(DataFrame pandas) { + return pandas.groupBy(pandas.col("zip")).agg(min("pandaSize"), max("pandaSize")); + } + + public static DataFrame minPandaSizeMaxAgePerZip(DataFrame pandas) { + Map map = new HashMap<>(); + map.put("pandaSize", "min"); + map.put("age", "max"); + + DataFrame df = pandas.groupBy(pandas.col("zip")).agg(map); + return df; + } + + public static DataFrame minMeanSizePerZip(DataFrame pandas) { + return pandas.groupBy(pandas.col("zip")).agg(min(pandas.col("pandaSize")), mean(pandas.col("pandaSize"))); + } + + public static DataFrame simpleSqlExample(DataFrame pandas) { + SQLContext sqlContext = pandas.sqlContext(); + pandas.registerTempTable("pandas"); + + DataFrame miniPandas = sqlContext.sql("SELECT * FROM pandas WHERE pandaSize < 12"); + return miniPandas; + } + + /** + * Orders pandas by size ascending and by age descending. + * Pandas will be sorted by "size" first and if two pandas + * have the same "size" will be sorted by "age". 
+ */ + public static DataFrame orderPandas(DataFrame pandas) { + return pandas.orderBy(pandas.col("pandaSize").asc(), pandas.col("age").desc()); + } + + public static DataFrame computeRelativePandaSizes(DataFrame pandas) { + //tag::relativePandaSizesWindow[] + WindowSpec windowSpec = Window + .orderBy(pandas.col("age")) + .partitionBy(pandas.col("zip")) + .rowsBetween(-10, 10); // can use rangeBetween for range instead + //end::relativePandaSizesWindow[] + + //tag::relativePandaSizesQuery[] + Column pandaRelativeSizeCol = pandas.col("pandaSize").minus(avg(pandas.col("pandaSize")).over(windowSpec)); + + return pandas.select(pandas.col("name"), pandas.col("zip"), pandas.col("pandaSize"), + pandas.col("age"), pandaRelativeSizeCol.as("panda_relative_size")); + //end::relativePandaSizesQuery[] + } + + public static void joins(DataFrame df1, DataFrame df2) { + //tag::innerJoin[] + // Inner join implicit + df1.join(df2, df1.col("name").equalTo(df2.col("name"))); + // Inner join explicit + df1.join(df2, df1.col("name").equalTo(df2.col("name")), "inner"); + //end::innerJoin[] + + //tag::leftouterJoin[] + // Left outer join explicit + df1.join(df2, df1.col("name").equalTo(df2.col("name")), "left_outer"); + //end::leftouterJoin[] + + //tag::rightouterJoin[] + // Right outer join explicit + df1.join(df2, df1.col("name").equalTo(df2.col("name")), "right_outer"); + //end::rightouterJoin[] + + //tag::leftsemiJoin[] + // Left semi join explicit + df1.join(df2, df1.col("name").equalTo(df2.col("name")), "leftsemi"); + //end::leftsemiJoin[] + } + + public static DataFrame selfJoin(DataFrame df) { + return (df.as("a")).join(df.as("b")).where("a.name = b.name"); + } + +} diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java b/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java new file mode 100644 index 0000000..9d36dd8 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java @@ -0,0 +1,140 @@ +package com.highperformancespark.examples.dataframe; + +import com.highperformancespark.examples.objects.JavaPandaPlace; +import com.highperformancespark.examples.objects.JavaRawPanda; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.*; +import org.apache.spark.sql.types.*; + +import java.util.List; +import java.util.Properties; +import java.util.stream.Collectors; + +public class JavaLoadSave { + private SQLContext sqlContext; + + public JavaLoadSave(SQLContext sqlContext) { + this.sqlContext = sqlContext; + } + + //tag::createFromRDD[] + public DataFrame createFromJavaBean(JavaRDD input) { + // Create DataFrame using Java Bean + DataFrame df1 = sqlContext.createDataFrame(input, JavaPandaPlace.class); + + // Create DataFrame using JavaRDD + JavaRDD rowRDD = input.map(pm -> RowFactory.create(pm.getName(), + pm.getPandas().stream() + .map(pi -> RowFactory.create(pi.getId(), pi.getZip(), pi.isHappy(), pi.getAttributes())) + .collect(Collectors.toList()))); + + ArrayType pandasType = DataTypes.createArrayType(new StructType( + new StructField[]{ + new StructField("id", DataTypes.LongType, true, Metadata.empty()), + new StructField("zip", DataTypes.StringType, true, Metadata.empty()), + new StructField("happy", DataTypes.BooleanType, true, Metadata.empty()), + new StructField("attributes", DataTypes.createArrayType(DataTypes.FloatType), true, Metadata.empty()) + } + )); + + StructType schema = new StructType(new StructField[]{ + new StructField("name", DataTypes.StringType, true, Metadata.empty()), + new 
StructField("pandas", pandasType, true, Metadata.empty())
+    });
+
+    DataFrame df2 = sqlContext.createDataFrame(rowRDD, schema);
+    return df2;
+  }
+  //end::createFromRDD[]
+
+  //tag::createFromLocal[]
+  public DataFrame createFromLocal(List<JavaPandaPlace> input) {
+    return sqlContext.createDataFrame(input, JavaPandaPlace.class);
+  }
+  //end::createFromLocal[]
+
+  //tag::collectResults[]
+  public Row[] collectDF(DataFrame df) {
+    return df.collect();
+  }
+  //end::collectResults[]
+
+  //tag::toRDD[]
+  public JavaRDD<JavaRawPanda> toRDD(DataFrame input) {
+    JavaRDD<JavaRawPanda> rdd = input.javaRDD().map(row -> new JavaRawPanda(row.getLong(0), row.getString(1),
+      row.getString(2), row.getBoolean(3), row.getList(4)));
+    return rdd;
+  }
+  //end::toRDD[]
+
+  //tag::partitionedOutput[]
+  public void writeOutByZip(DataFrame input) {
+    input.write().partitionBy("zipcode").format("json").save("output/");
+  }
+  //end::partitionedOutput[]
+
+  //tag::saveAppend[]
+  public void writeAppend(DataFrame input) {
+    input.write().mode(SaveMode.Append).save("output/");
+  }
+  //end::saveAppend[]
+
+  public DataFrame createJDBC() {
+    //tag::createJDBC[]
+    DataFrame df1 = sqlContext.read().jdbc("jdbc:dialect:serverName;user=user;password=pass",
+      "table", new Properties());
+
+    DataFrame df2 = sqlContext.read().format("jdbc")
+      .option("url", "jdbc:dialect:serverName")
+      .option("dbtable", "table").load();
+
+    return df2;
+    //end::createJDBC[]
+  }
+
+  public void writeJDBC(DataFrame df) {
+    //tag::writeJDBC[]
+    df.write().jdbc("jdbc:dialect:serverName;user=user;password=pass",
+      "table", new Properties());
+
+    df.write().format("jdbc")
+      .option("url", "jdbc:dialect:serverName")
+      .option("user", "user")
+      .option("password", "pass")
+      .option("dbtable", "table").save();
+    //end::writeJDBC[]
+  }
+
+  //tag::loadParquet[]
+  public DataFrame loadParquet(String path) {
+    // Configure Spark to read binary data as string, note: must be configured on SQLContext
+    sqlContext.setConf("spark.sql.parquet.binaryAsString", "true");
+
+    // Load parquet data using merge schema (configured through option)
+    DataFrame df = sqlContext.read()
+      .option("mergeSchema", "true")
+      .format("parquet")
+      .load(path);
+
+    return df;
+  }
+  //end::loadParquet[]
+
+  //tag::writeParquet[]
+  public void writeParquet(DataFrame df, String path) {
+    df.write().format("parquet").save(path);
+  }
+  //end::writeParquet[]
+
+  //tag::loadHiveTable[]
+  public DataFrame loadHiveTable() {
+    return sqlContext.read().table("pandas");
+  }
+  //end::loadHiveTable[]
+
+  //tag::saveManagedTable[]
+  public void saveManagedTable(DataFrame df) {
+    df.write().saveAsTable("pandas");
+  }
+  //end::saveManagedTable[]
+}
diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java b/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java
new file mode 100644
index 0000000..dd23616
--- /dev/null
+++ b/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java
@@ -0,0 +1,76 @@
+package com.highperformancespark.examples.dataframe;
+
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.expressions.MutableAggregationBuffer;
+import org.apache.spark.sql.expressions.UserDefinedAggregateFunction;
+import org.apache.spark.sql.types.*;
+
+public class JavaUDFs {
+
+  public static void setupUDFs(SQLContext sqlContext) {
+    //tag::basicUDF[]
+    sqlContext.udf().register("strlen", (String s) -> s.length(), DataTypes.IntegerType);
+    //end::basicUDF[]
+  }
+
+  public static void setupUDAFs(SQLContext sqlContext) {
+
+    class Avg extends
UserDefinedAggregateFunction { + + @Override + public StructType inputSchema() { + StructType inputSchema = + new StructType(new StructField[]{new StructField("value", DataTypes.DoubleType, true, Metadata.empty())}); + return inputSchema; + } + + @Override + public StructType bufferSchema() { + StructType bufferSchema = + new StructType(new StructField[]{ + new StructField("count", DataTypes.LongType, true, Metadata.empty()), + new StructField("sum", DataTypes.DoubleType, true, Metadata.empty()) + }); + + return bufferSchema; + } + + @Override + public DataType dataType() { + return DataTypes.DoubleType; + } + + @Override + public boolean deterministic() { + return true; + } + + @Override + public void initialize(MutableAggregationBuffer buffer) { + buffer.update(0, 0L); + buffer.update(1, 0.0); + } + + @Override + public void update(MutableAggregationBuffer buffer, Row input) { + buffer.update(0, buffer.getLong(0) + 1); + buffer.update(1, buffer.getDouble(1) + input.getDouble(0)); + } + + @Override + public void merge(MutableAggregationBuffer buffer1, Row buffer2) { + buffer1.update(0, buffer1.getLong(0) + buffer2.getLong(0)); + buffer1.update(1, buffer1.getDouble(1) + buffer2.getDouble(1)); + } + + @Override + public Object evaluate(Row buffer) { + return buffer.getDouble(1) / buffer.getLong(0); + } + } + + Avg average = new Avg(); + sqlContext.udf().register("ourAvg", average); + } +} diff --git a/src/main/java/com/highperformancespark/examples/goldilocks/JavaGoldiLocksFirstTry.java b/src/main/java/com/highperformancespark/examples/goldilocks/JavaGoldiLocksFirstTry.java new file mode 100644 index 0000000..d8ffcc9 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/goldilocks/JavaGoldiLocksFirstTry.java @@ -0,0 +1,264 @@ +package com.highperformancespark.examples.goldilocks; + +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.Function2; +import org.apache.spark.api.java.function.PairFlatMapFunction; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.storage.StorageLevel; +import scala.Tuple2; + +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +public class JavaGoldiLocksFirstTry { + + /** + * Find nth target rank for every column. 
+ * + * For example: + * + * dataframe: + * (0.0, 4.5, 7.7, 5.0) + * (1.0, 5.5, 6.7, 6.0) + * (2.0, 5.5, 1.5, 7.0) + * (3.0, 5.5, 0.5, 7.0) + * (4.0, 5.5, 0.5, 8.0) + * + * targetRanks: + * 1, 3 + * + * The output will be: + * 0 -> (0.0, 2.0) + * 1 -> (4.5, 5.5) + * 2 -> (7.7, 1.5) + * 3 -> (5.0, 7.0) + * + * @param dataframe dataframe of doubles + * @param targetRanks the required ranks for every column + * + * @return map of (column index, list of target ranks) + */ + public static Map> findRankStatistics(DataFrame dataframe, List targetRanks) { + JavaPairRDD valueColumnPairs = getValueColumnPairs(dataframe); + + JavaPairRDD sortedValueColumnPairs = valueColumnPairs.sortByKey(); + sortedValueColumnPairs.persist(StorageLevel.MEMORY_AND_DISK()); + + int numOfColumns = dataframe.schema().length(); + List>> partitionColumnsFreq = + getColumnsFreqPerPartition(sortedValueColumnPairs, numOfColumns); + + List>>> ranksLocations = + getRanksLocationsWithinEachPart(targetRanks, partitionColumnsFreq, numOfColumns); + + JavaPairRDD targetRanksValues = findTargetRanksIteratively(sortedValueColumnPairs, ranksLocations); + + return targetRanksValues.groupByKey().collectAsMap(); + } + + /** + * Step 1. Map the rows to pairs of (value, column Index). + * + * For example: + * + * dataFrame: + * 1.5, 1.25, 2.0 + * 5.25, 2.5, 1.5 + * + * The output RDD will be: + * (1.5, 0) (1.25, 1) (2.0, 2) (5.25, 0) (2.5, 1) (1.5, 2) + * + * @param dataframe dateframe of doubles + * + * @return RDD of pairs (value, column Index) + */ + private static JavaPairRDD getValueColumnPairs(DataFrame dataframe) { + JavaPairRDD value_ColIndex = + dataframe.javaRDD().flatMapToPair((PairFlatMapFunction) row -> { + List rowList = (List) (Object) toList(row.toSeq()); + List> list = zipWithIndex(rowList); + return list; + }); + + return value_ColIndex; + } + + /** + * Step 2. Find the number of elements for each column in each partition. 
+ * + * For Example: + * + * sortedValueColumnPairs: + * Partition 1: (1.5, 0) (1.25, 1) (2.0, 2) (5.25, 0) + * Partition 2: (7.5, 1) (9.5, 2) + * + * numOfColumns: 3 + * + * The output will be: + * [(0, [2, 1, 1]), (1, [0, 1, 1])] + * + * @param sortedValueColumnPairs - sorted RDD of (value, column Index) pairs + * @param numOfColumns the number of columns + * + * @return Array that contains (partition index, number of elements from every column on this partition) + */ + private static List>> getColumnsFreqPerPartition(JavaPairRDD sortedValueColumnPairs, int numOfColumns) { + List>> columsFreqPerPartition = + sortedValueColumnPairs.mapPartitionsWithIndex((partitionIndex, valueColumnPairs) -> { + Long[] freq = new Long[numOfColumns]; + Arrays.fill(freq, 0L); + + while(valueColumnPairs.hasNext()) { + int colIndex = valueColumnPairs.next()._2; + freq[colIndex] = freq[colIndex] + 1; + } + + List freqList = Arrays.asList(freq); + List>> partitionList = Arrays.asList(new Tuple2<>(partitionIndex, freqList)); + return partitionList.iterator(); + }, false).collect(); + + return columsFreqPerPartition; + } + + /** + * Step 3: For each Partition determine the index of the elements that are desired rank statistics + * + * For Example: + * targetRanks: 5 + * partitionColumnsFreq: [(0, [2, 3]), (1, [4, 1]), (2, [5, 2])] + * numOfColumns: 2 + * + * The output will be: + * [(0, []), (1, [(colIdx=0, rankLocation=3)]), (2, [(colIndex=1, rankLocation=1)])] + * + * @param partitionColumnsFreq Array of (partition index, columns frequencies per this partition) + * + * @return Array that contains (partition index, relevantIndexList where relevantIndexList(i) = the index + * of an element on this partition that matches one of the target ranks) + */ + private static List>>> getRanksLocationsWithinEachPart(List targetRanks, + List>> partitionColumnsFreq, int numOfColumns) { + + long[] runningTotal = new long[numOfColumns]; + + List>>> ranksLocations = + partitionColumnsFreq + .stream() + .sorted((o1, o2) -> o1._1.compareTo(o2._1)) + .map(partitionIndex_columnsFreq -> { + int partitionIndex = partitionIndex_columnsFreq._1; + List columnsFreq = partitionIndex_columnsFreq._2; + + List> relevantIndexList = new ArrayList<>(); + + zipWithIndex(columnsFreq).stream().forEach(colCount_colIndex -> { + long colCount = colCount_colIndex._1; + int colIndex = colCount_colIndex._2; + + long runningTotalCol = runningTotal[colIndex]; + Stream ranksHere = + targetRanks.stream().filter(rank -> runningTotalCol < rank && runningTotalCol + colCount >= rank); + + // for each of the rank statistics present add this column index and the index it will be at + // on this partition (the rank - the running total) + relevantIndexList.addAll( + ranksHere.map(rank -> new Tuple2<>(colIndex, rank - runningTotalCol)).collect(Collectors.toList())); + + runningTotal[colIndex] += colCount; + }); + + + return new Tuple2<>(partitionIndex, relevantIndexList); + }).collect(Collectors.toList()); + + return ranksLocations; + } + + /** + * Finds rank statistics elements using ranksLocations. 
+ * + * @param sortedValueColumnPairs - sorted RDD of (value, colIndex) pairs + * @param ranksLocations Array of (partition Index, list of (column index, rank index of this column at this partition)) + * + * @return returns RDD of the target ranks (column index, value) + */ + private static JavaPairRDD findTargetRanksIteratively(JavaPairRDD sortedValueColumnPairs, + List>>> ranksLocations) { + + JavaRDD> targetRanks = sortedValueColumnPairs.mapPartitionsWithIndex( + (partitionIndex, valueColumnPairs) -> { + List> targetsInThisPart = ranksLocations.get(partitionIndex)._2; + List> result = new ArrayList<>(); + + if (!targetsInThisPart.isEmpty()) { + Map> columnsRelativeIndex = groupByKey(targetsInThisPart); + Set columnsInThisPart = columnsRelativeIndex.keySet(); + + Map runningTotals = toMap(columnsInThisPart); + + // filter this iterator, so that it contains only those (value, columnIndex) that are the ranks statistics on this partition + // I.e. Keep track of the number of elements we have seen for each columnIndex using the + // running total hashMap. Keep those pairs for which value is the nth element for that columnIndex that appears on this partition + // and the map contains (columnIndex, n). + + while (valueColumnPairs.hasNext()) { + Tuple2 value_colIndex = valueColumnPairs.next(); + double value = value_colIndex._1; + int colIndex = value_colIndex._2; + + if (columnsInThisPart.contains(colIndex)) { + long total = runningTotals.get(colIndex) + 1L; + runningTotals.put(colIndex, total); + if (columnsRelativeIndex.get(colIndex).contains(total)) { + result.add(value_colIndex.swap()); + } + } + } + } + + return result.iterator(); + }, false); + + return targetRanks.mapToPair((PairFunction, Integer, Double>) t -> t); + } + + private static Map toMap(Set set) { + Map map = new HashMap<>(); + for (int k: set) + map.put(k, 0L); + + return map; + } + + private static Map> groupByKey(List> list) { + Map> map = new HashMap<>(); + for (int i = 0; i < list.size(); i++) { + Tuple2 curr = list.get(i); + if (!map.containsKey(curr._1)) + map.put(curr._1, new ArrayList<>()); + + map.get(curr._1).add(curr._2); + } + + return map; + } + + private static List toList(scala.collection.Seq seq) { + return scala.collection.JavaConversions.seqAsJavaList(seq); + } + + private static List> zipWithIndex(List list) { + List> indexedList = new ArrayList<>(); + for (int i = 0; i < list.size(); i++) + indexedList.add(new Tuple2<>(list.get(i), i)); + + return indexedList; + } + +} + diff --git a/src/main/java/com/highperformancespark/examples/goldilocks/JavaGoldiLocksGroupByKey.java b/src/main/java/com/highperformancespark/examples/goldilocks/JavaGoldiLocksGroupByKey.java new file mode 100644 index 0000000..f5f72ce --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/goldilocks/JavaGoldiLocksGroupByKey.java @@ -0,0 +1,32 @@ +package com.highperformancespark.examples.goldilocks; + +import org.apache.spark.api.java.JavaPairRDD; + +import java.util.*; +import java.util.stream.Collectors; + +public class JavaGoldiLocksGroupByKey { + //tag::groupByKey[] + public Map> findRankStatistics( + JavaPairRDD pairRDD, List ranks) { + + Map> element_ranks = pairRDD.groupByKey().mapValues(iter -> { + List values = new ArrayList<>(); + Iterator iterator = iter.iterator(); + while (iterator.hasNext()) + values.add(iterator.next()); + Collections.sort(values); + + List result = + ranks.stream() + .map(n -> values.get(new Long(n).intValue())) + .collect(Collectors.toList()); + + return result; + }).collectAsMap(); + + 
return element_ranks; + } + //end::groupByKey[] + +} diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java b/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java new file mode 100644 index 0000000..e3f5325 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java @@ -0,0 +1,29 @@ +package com.highperformancespark.examples.objects; + +import java.io.Serializable; + +public class JavaCoffeeShop implements Serializable { + private String zip; + private String name; + + public JavaCoffeeShop(String zip, String name) { + this.zip = zip; + this.name = name; + } + + public String getZip() { + return zip; + } + + public void setZip(String zip) { + this.zip = zip; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } +} \ No newline at end of file diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaGoldiLocksRow.java b/src/main/java/com/highperformancespark/examples/objects/JavaGoldiLocksRow.java new file mode 100644 index 0000000..82cafe9 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/objects/JavaGoldiLocksRow.java @@ -0,0 +1,49 @@ +package com.highperformancespark.examples.objects; + +import java.io.Serializable; + +public class JavaGoldiLocksRow implements Serializable { + private double a; + private double b; + private double c; + private double d; + + public JavaGoldiLocksRow(double a, double b, double c, double d) { + this.a = a; + this.b = b; + this.c = c; + this.d = d; + } + + public double getA() { + return a; + } + + public void setA(double a) { + this.a = a; + } + + public double getB() { + return b; + } + + public void setB(double b) { + this.b = b; + } + + public double getC() { + return c; + } + + public void setC(double c) { + this.c = c; + } + + public double getD() { + return d; + } + + public void setD(double d) { + this.d = d; + } +} \ No newline at end of file diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java b/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java new file mode 100644 index 0000000..c2b7847 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java @@ -0,0 +1,56 @@ +package com.highperformancespark.examples.objects; + +import java.io.Serializable; + +public class JavaPandaInfo implements Serializable { + private String place; + private String pandaType; + private int happyPandas; + private int totalPandas; + + /** + * @param place name of place + * @param pandaType type of pandas in this place + * @param happyPandas number of happy pandas in this place + * @param totalPandas total number of pandas in this place + */ + public JavaPandaInfo(String place, String pandaType, int happyPandas, int totalPandas) { + this.place = place; + this.pandaType = pandaType; + this.happyPandas = happyPandas; + this.totalPandas = totalPandas; + } + + public String getPlace() { + return place; + } + + public void setPlace(String place) { + this.place = place; + } + + public String getPandaType() { + return pandaType; + } + + public void setPandaType(String pandaType) { + this.pandaType = pandaType; + } + + public int getHappyPandas() { + return happyPandas; + } + + public void setHappyPandas(int happyPandas) { + this.happyPandas = happyPandas; + } + + public int getTotalPandas() { + return totalPandas; + } + + public void setTotalPandas(int totalPandas) { + this.totalPandas = totalPandas; + } 
+ +} diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java b/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java new file mode 100644 index 0000000..dc33d9c --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java @@ -0,0 +1,34 @@ +package com.highperformancespark.examples.objects; + +import java.io.Serializable; +import java.util.List; + +public class JavaPandaPlace implements Serializable { + private String name; + private List pandas; + + /** + * @param name place name + * @param pandas pandas in that place + */ + public JavaPandaPlace(String name, List pandas) { + this.name = name; + this.pandas = pandas; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public List getPandas() { + return pandas; + } + + public void setPandas(List pandas) { + this.pandas = pandas; + } +} \ No newline at end of file diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java b/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java new file mode 100644 index 0000000..f73e93f --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java @@ -0,0 +1,56 @@ +package com.highperformancespark.examples.objects; + +import java.io.Serializable; + +public class JavaPandas implements Serializable { + private String name; + private String zip; + private int pandaSize; + private int age; + + /** + * @param name name of panda + * @param zip zip code + * @param pandaSize size of panda in KG + * @param age age of panda + */ + public JavaPandas(String name, String zip, int pandaSize, int age) { + this.name = name; + this.zip = zip; + this.pandaSize = pandaSize; + this.age = age; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public String getZip() { + return zip; + } + + public void setZip(String zip) { + this.zip = zip; + } + + public int getPandaSize() { + return pandaSize; + } + + public void setPandaSize(int pandaSize) { + this.pandaSize = pandaSize; + } + + public int getAge() { + return age; + } + + public void setAge(int age) { + this.age = age; + } + +} diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java b/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java new file mode 100644 index 0000000..7d2be17 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java @@ -0,0 +1,67 @@ +package com.highperformancespark.examples.objects; + +import java.io.Serializable; +import java.util.List; + +public class JavaRawPanda implements Serializable { + private long id; + private String zip; + private String pt; + private boolean happy; + private List attributes; + + /** + * @param id panda id + * @param zip zip code of panda residence + * @param pt Type of panda as a string + * @param happy if panda is happy + * @param attributes array of panada attributes + */ + public JavaRawPanda(long id, String zip, String pt, boolean happy, List attributes) { + this.attributes = attributes; + this.id = id; + this.zip = zip; + this.pt = pt; + this.happy = happy; + } + + public long getId() { + return id; + } + + public void setId(long id) { + this.id = id; + } + + public String getZip() { + return zip; + } + + public void setZip(String zip) { + this.zip = zip; + } + + public String getPt() { + return pt; + } + + public void setPt(String pt) { + 
this.pt = pt; + } + + public boolean isHappy() { + return happy; + } + + public void setHappy(boolean happy) { + this.happy = happy; + } + + public List getAttributes() { + return attributes; + } + + public void setAttributes(List attributes) { + this.attributes = attributes; + } +} \ No newline at end of file diff --git a/src/main/scala/com/high-performance-spark-examples/GoldiLocks/GoldiLocksFirstTry.scala b/src/main/scala/com/high-performance-spark-examples/GoldiLocks/GoldiLocksFirstTry.scala new file mode 100644 index 0000000..3d9eb94 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/GoldiLocks/GoldiLocksFirstTry.scala @@ -0,0 +1,211 @@ +package com.highperformancespark.examples.goldilocks + +import scala.collection.{Map, mutable} +import scala.collection.mutable.{ArrayBuffer, MutableList} + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame +import org.apache.spark.storage.StorageLevel + +object GoldiLocksGroupByKey { + //tag::groupByKey[] + def findRankStatistics( + pairRDD: RDD[(Int, Double)], + ranks: List[Long]): Map[Int, List[Double]] = { + pairRDD.groupByKey().mapValues(iter => { + val ar = iter.toArray.sorted + ranks.map(n => ar(n.toInt)) + }).collectAsMap() + } + //end::groupByKey[] +} + +//tag::firstTry[] +object GoldiLocksFirstTry { + + /** + * Find nth target rank for every column. + * + * For example: + * + * dataframe: + * (0.0, 4.5, 7.7, 5.0) + * (1.0, 5.5, 6.7, 6.0) + * (2.0, 5.5, 1.5, 7.0) + * (3.0, 5.5, 0.5, 7.0) + * (4.0, 5.5, 0.5, 8.0) + * + * targetRanks: + * 1, 3 + * + * The output will be: + * 0 -> (0.0, 2.0) + * 1 -> (4.5, 5.5) + * 2 -> (7.7, 1.5) + * 3 -> (5.0, 7.0) + * + * @param dataframe dataframe of doubles + * @param targetRanks the required ranks for every column + * @return map of (column index, list of target ranks) + */ + def findRankStatistics(dataframe: DataFrame, targetRanks: List[Long]): + Map[Int, Iterable[Double]] = { + + val valueColumnPairs: RDD[(Double, Int)] = getValueColumnPairs(dataframe) + val sortedValueColumnPairs = valueColumnPairs.sortByKey() + sortedValueColumnPairs.persist(StorageLevel.MEMORY_AND_DISK) + + val numOfColumns = dataframe.schema.length + val partitionColumnsFreq = getColumnsFreqPerPartition(sortedValueColumnPairs, numOfColumns) + val ranksLocations = getRanksLocationsWithinEachPart(targetRanks, partitionColumnsFreq, numOfColumns) + + val targetRanksValues = findTargetRanksIteratively(sortedValueColumnPairs, ranksLocations) + targetRanksValues.groupByKey().collectAsMap() + } + + /** + * Step 1. Map the rows to pairs of (value, column Index). + * + * For example: + * + * dataFrame: + * 1.5, 1.25, 2.0 + * 5.25, 2.5, 1.5 + * + * The output RDD will be: + * (1.5, 0) (1.25, 1) (2.0, 2) (5.25, 0) (2.5, 1) (1.5, 2) + * + * @param dataframe dateframe of doubles + * + * @return RDD of pairs (value, column Index) + */ + private def getValueColumnPairs(dataframe : DataFrame): RDD[(Double, Int)] = { + dataframe.flatMap(row => row.toSeq.zipWithIndex.map{ case (v, index) => + (v.toString.toDouble, index)}) + } + + /** + * Step 2. Find the number of elements for each column in each partition. 
+ * + * For Example: + * + * sortedValueColumnPairs: + * Partition 1: (1.5, 0) (1.25, 1) (2.0, 2) (5.25, 0) + * Partition 2: (7.5, 1) (9.5, 2) + * + * numOfColumns: 3 + * + * The output will be: + * [(0, [2, 1, 1]), (1, [0, 1, 1])] + * + * @param sortedValueColumnPairs - sorted RDD of (value, column Index) pairs + * @param numOfColumns the number of columns + * + * @return Array that contains (partition index, number of elements from every column on this partition) + */ + private def getColumnsFreqPerPartition(sortedValueColumnPairs: RDD[(Double, Int)], numOfColumns : Int): + Array[(Int, Array[Long])] = { + + val zero = Array.fill[Long](numOfColumns)(0) + + def aggregateColumnFrequencies (partitionIndex : Int, valueColumnPairs : Iterator[(Double, Int)]) = { + val columnsFreq : Array[Long] = valueColumnPairs.aggregate(zero)( + (a : Array[Long], v : (Double ,Int)) => { + val (value, colIndex) = v + a(colIndex) = a(colIndex) + 1L + a + }, + (a : Array[Long], b : Array[Long]) => { + a.zip(b).map{ case(aVal, bVal) => aVal + bVal} + }) + + Iterator((partitionIndex, columnsFreq)) + } + + sortedValueColumnPairs.mapPartitionsWithIndex(aggregateColumnFrequencies).collect() + } + + /** + * Step 3: For each Partition determine the index of the elements that are desired rank statistics + * + * For Example: + * targetRanks: 5 + * partitionColumnsFreq: [(0, [2, 3]), (1, [4, 1]), (2, [5, 2])] + * numOfColumns: 2 + * + * The output will be: + * [(0, []), (1, [(colIdx=0, rankLocation=3)]), (2, [(colIndex=1, rankLocation=1)])] + * + * @param partitionColumnsFreq Array of (partition index, columns frequencies per this partition) + * + * @return Array that contains (partition index, relevantIndexList where relevantIndexList(i) = the index + * of an element on this partition that matches one of the target ranks) + */ + private def getRanksLocationsWithinEachPart(targetRanks : List[Long], + partitionColumnsFreq : Array[(Int, Array[Long])], + numOfColumns : Int) : Array[(Int, List[(Int, Long)])] = { + + val runningTotal = Array.fill[Long](numOfColumns)(0) + + partitionColumnsFreq.sortBy(_._1).map { case (partitionIndex, columnsFreq) => + val relevantIndexList = new MutableList[(Int, Long)]() + + columnsFreq.zipWithIndex.foreach{ case (colCount, colIndex) => + val runningTotalCol = runningTotal(colIndex) + val ranksHere: List[Long] = targetRanks.filter(rank => + runningTotalCol < rank && runningTotalCol + colCount >= rank) + + // for each of the rank statistics present add this column index and the index it will be at + // on this partition (the rank - the running total) + relevantIndexList ++= ranksHere.map(rank => (colIndex, rank - runningTotalCol)) + + runningTotal(colIndex) += colCount + } + + (partitionIndex, relevantIndexList.toList) + } + } + + /** + * Finds rank statistics elements using ranksLocations. 
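+ *
+ * For example, if ranksLocations contains (1, List((0, 3))), then on partition 1 this
+ * keeps the 3rd value seen for column 0 and emits it as (0, value).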
+ * + * @param sortedValueColumnPairs - sorted RDD of (value, colIndex) pairs + * @param ranksLocations Array of (partition Index, list of (column index, rank index of this column at this partition)) + * + * @return returns RDD of the target ranks (column index, value) + */ + private def findTargetRanksIteratively(sortedValueColumnPairs : RDD[(Double, Int)], + ranksLocations : Array[(Int, List[(Int, Long)])]): RDD[(Int, Double)] = { + + sortedValueColumnPairs.mapPartitionsWithIndex((partitionIndex : Int, valueColumnPairs : Iterator[(Double, Int)]) => { + val targetsInThisPart: List[(Int, Long)] = ranksLocations(partitionIndex)._2 + if (targetsInThisPart.nonEmpty) { + val columnsRelativeIndex: Map[Int, List[Long]] = targetsInThisPart.groupBy(_._1).mapValues(_.map(_._2)) + val columnsInThisPart = targetsInThisPart.map(_._1) + + val runningTotals : mutable.HashMap[Int, Long]= new mutable.HashMap() + runningTotals ++= columnsInThisPart.map(columnIndex => (columnIndex, 0L)).toMap + + // filter this iterator, so that it contains only those (value, columnIndex) that are the ranks statistics on this partition + // I.e. Keep track of the number of elements we have seen for each columnIndex using the + // running total hashMap. Keep those pairs for which value is the nth element for that columnIndex that appears on this partition + // and the map contains (columnIndex, n). + valueColumnPairs.filter{ + case(value, colIndex) => + //rely on lazy evaluation. If we have already seen this column index, then evalute this + // block in which we increment the running totals and return if this element's count appears in the map. + lazy val thisPairIsTheRankStatistic: Boolean = { + val total = runningTotals(colIndex) + 1L + runningTotals.update(colIndex, total) + columnsRelativeIndex(colIndex).contains(total) + } + (runningTotals contains colIndex) && thisPairIsTheRankStatistic + }.map(_.swap) + } + else { + Iterator.empty + } + }) + } +} +//end::firstTry[] diff --git a/src/main/scala/com/high-performance-spark-examples/GoldiLocks/GoldiLocksWithHashMap.scala b/src/main/scala/com/high-performance-spark-examples/GoldiLocks/GoldiLocksWithHashMap.scala new file mode 100644 index 0000000..fc37f31 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/GoldiLocks/GoldiLocksWithHashMap.scala @@ -0,0 +1,326 @@ +package com.highperformancespark.examples.goldilocks + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame +import org.apache.spark.storage.StorageLevel + +import scala.Predef +import scala.collection.{mutable, Map} +import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.MutableList + +//tag::hashMap[] +object GoldiLocksWithHashMap { + + /** + * Find nth target rank for every column. 
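+ *
+ * A hypothetical call site (assuming an existing DataFrame `df` of double columns;
+ * the name is illustrative) could look like:
+ * {{{
+ * val ranks = GoldiLocksWithHashMap.findRankStatistics(df, targetRanks = List(2L, 4L))
+ * }}}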
+ * + * For example: + * + * dataframe: + * (0.0, 4.5, 7.7, 5.0) + * (1.0, 5.5, 6.7, 6.0) + * (2.0, 5.5, 1.5, 7.0) + * (3.0, 5.5, 0.5, 7.0) + * (4.0, 5.5, 0.5, 8.0) + * + * targetRanks: + * 1, 3 + * + * The output will be: + * 0 -> (0.0, 2.0) + * 1 -> (4.5, 5.5) + * 2 -> (7.7, 1.5) + * 3 -> (5.0, 7.0) + * + * @param dataFrame dataframe of doubles + * @param targetRanks the required ranks for every column + * + * @return map of (column index, list of target ranks) + */ + def findRankStatistics(dataFrame: DataFrame, targetRanks: List[Long]): + Map[Int, Iterable[Double]] = { + + val aggregatedValueColumnPairs: RDD[((Double, Int), Long)] = getAggregatedValueColumnPairs(dataFrame) + val sortedAggregatedValueColumnPairs = aggregatedValueColumnPairs.sortByKey() + sortedAggregatedValueColumnPairs.persist(StorageLevel.MEMORY_AND_DISK) + + val numOfColumns = dataFrame.schema.length + val partitionColumnsFreq = getColumnsFreqPerPartition(sortedAggregatedValueColumnPairs, numOfColumns) + val ranksLocations = getRanksLocationsWithinEachPart(targetRanks, partitionColumnsFreq, numOfColumns) + + val targetRanksValues = findTargetRanksIteratively(sortedAggregatedValueColumnPairs, ranksLocations) + targetRanksValues.groupByKey().collectAsMap() + } + + /** + * Step 1. Map the rows to pairs of ((value, colIndex), count) where count is the number of times + * that value and that pair appear on this partition + * + * For example: + * + * dataFrame: + * 1.5, 1.25, 2.0 + * 1.5, 2.5, 2.0 + * + * The output RDD will be: + * ((1.5, 0), 2) ((1.25, 1), 1) ((2.5, 1), 1) ((2.0, 2), 2) + * + * @param dataFrame of double columns to compute the rank statistics for + * + * @return returns RDD of ((value, column index), count) + */ + def getAggregatedValueColumnPairs(dataFrame : DataFrame) : RDD[((Double, Int), Long)] = { + val aggregatedValueColumnRDD = dataFrame.rdd.mapPartitions(rows => { + val valueColumnMap = new mutable.HashMap[(Double, Int), Long]() + rows.foreach(row => { + row.toSeq.zipWithIndex.foreach{ case (value, columnIndex) => { + val key = (value.toString.toDouble, columnIndex) + val count = valueColumnMap.getOrElseUpdate(key, 0) + valueColumnMap.update(key, count + 1) + }} + }) + + valueColumnMap.toIterator + }) + + aggregatedValueColumnRDD + } + + /** + * Step 2. Find the number of elements for each column in each partition. 
+ * + * For Example: + * + * sortedValueColumnPairs: + * Partition 1: ((1.5, 0), 2) ((2.0, 0), 1) + * Partition 2: ((4.0, 0), 3) ((3.0, 1), 1) + * + * numOfColumns: 3 + * + * The output will be: + * [(0, [3, 0]), (1, [3, 1])] + * + * @param sortedAggregatedValueColumnPairs - sortedAggregatedValueColumnPairs RDD of ((value, column index), count) + * @param numOfColumns the number of columns + * + * @return Array that contains (partition index, number of elements from every column on this partition) + */ + private def getColumnsFreqPerPartition(sortedAggregatedValueColumnPairs: RDD[((Double, Int), Long)], + numOfColumns : Int): Array[(Int, Array[Long])] = { + + val zero = Array.fill[Long](numOfColumns)(0) + def aggregateColumnFrequencies(partitionIndex : Int, pairs : Iterator[((Double, Int), Long)]) = { + val columnsFreq : Array[Long] = pairs.aggregate(zero)( + (a : Array[Long], v : ((Double,Int), Long)) => { + val ((value, colIndex), count) = v + a(colIndex) = a(colIndex) + count + a}, + (a : Array[Long], b : Array[Long]) => { + a.zip(b).map{ case(aVal, bVal) => aVal + bVal} + }) + + Iterator((partitionIndex, columnsFreq)) + } + + sortedAggregatedValueColumnPairs.mapPartitionsWithIndex(aggregateColumnFrequencies).collect() + } + + /** + * Step 3: For each Partition determine the index of the elements that are desired rank statistics + * + * For Example: + * targetRanks: 5 + * partitionColumnsFreq: [(0, [2, 3]), (1, [4, 1]), (2, [5, 2])] + * numOfColumns: 2 + * + * The output will be: + * [(0, []), (1, [(0, 3)]), (2, [(1, 1)])] + * + * @param partitionColumnsFreq Array of (partition index, columns frequencies per this partition) + * + * @return Array that contains (partition index, relevantIndexList where relevantIndexList(i) = the index + * of an element on this partition that matches one of the target ranks) + */ + private def getRanksLocationsWithinEachPart(targetRanks : List[Long], + partitionColumnsFreq : Array[(Int, Array[Long])], + numOfColumns : Int) : Array[(Int, List[(Int, Long)])] = { + + val runningTotal = Array.fill[Long](numOfColumns)(0) + + partitionColumnsFreq.sortBy(_._1).map { case (partitionIndex, columnsFreq)=> { + val relevantIndexList = new MutableList[(Int, Long)]() + + columnsFreq.zipWithIndex.foreach{ case (colCount, colIndex) => { + val runningTotalCol = runningTotal(colIndex) + + val ranksHere: List[Long] = targetRanks.filter(rank => + (runningTotalCol < rank && runningTotalCol + colCount >= rank)) + relevantIndexList ++= ranksHere.map(rank => (colIndex, rank - runningTotalCol)) + + runningTotal(colIndex) += colCount + }} + + (partitionIndex, relevantIndexList.toList) + }} + } + + /** + * Finds rank statistics elements using ranksLocations. 
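+ *
+ * mapPartitionsWithIndex is used so that each partition can look up, by its index, the
+ * (column index, rank-on-partition) targets that the previous step located inside it,
+ * and scan only for those.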
+ * + * @param sortedAggregatedValueColumnPairs - sorted RDD of (value, colIndex) pairs + * @param ranksLocations Array of (partition Index, list of (column index, rank index of this column at this partition)) + * + * @return returns RDD of the target ranks (column index, value) + */ + //tag::mapPartitionsExample[] + private def findTargetRanksIteratively(sortedAggregatedValueColumnPairs : RDD[((Double, Int), Long)], + ranksLocations : Array[(Int, List[(Int, Long)])] + ): RDD[(Int, Double)] = { + + sortedAggregatedValueColumnPairs.mapPartitionsWithIndex((partitionIndex : Int, + aggregatedValueColumnPairs : Iterator[((Double, Int), Long)]) => { + + val targetsInThisPart: List[(Int, Long)] = ranksLocations(partitionIndex)._2 + if (!targetsInThisPart.isEmpty) { + FindTargetsSubRoutine.asIteratorToIteratorTransformation(aggregatedValueColumnPairs, + targetsInThisPart) + } + else Iterator.empty + }) + } + //end::mapPartitionsExample[] + /** + * We will want to use this in some chapter where we talk about check pointing + * @param valPairs + * @param colIndexList + * @param targetRanks + * @param storageLevel + * @param checkPoint + * @param directory + * @return + */ + def findQuantilesWithCustomStorage(valPairs: RDD[((Double, Int), Long)], + colIndexList: List[Int], + targetRanks: List[Long], + storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK, + checkPoint : Boolean, directory : String = "") = { + + val n = colIndexList.last+1 + val sorted = valPairs.sortByKey() + if (storageLevel != StorageLevel.NONE) + sorted.persist(storageLevel) + + if (checkPoint) { + sorted.sparkContext.setCheckpointDir(directory) + sorted.checkpoint() + } + + val partitionColumnsFreq = getColumnsFreqPerPartition(sorted, n) + val ranksLocations = getRanksLocationsWithinEachPart(targetRanks, partitionColumnsFreq, n) + val targetRanksValues = findTargetRanksIteratively(sorted, ranksLocations) + targetRanksValues.groupByKey().collectAsMap() + } +} +//end::hashMap[] + + +object FindTargetsSubRoutine extends Serializable { + + //tag::notIter[] + /** + * This sub routine returns an Iterator of (columnIndex, value) that correspond to one of the + desired rank statistics on this partition. + + Because in the original iterator, the pairs are distinct + and include the count, one row of the original iterator could map to multiple elements in the output. + I.e. if we were looking for the 2nd and 3rd element in column index 4 on this partition. And the head + of this partition is ((3249.0, 4), 23) (i.e. the element 3249.0 in the 4 th column appears 23 times), + then we would output (4, 3249.0) twice in the final iterator. Once because 3249.0 is the 2nd element and + once because it is the third element on that partition for that column index + and we are looking for both the second and third element. + + * @param valueColumnPairsIter - passed in from the mapPartitions function. An iterator of the sorted + * ((value, columnIndex), count) tupples. + * @param targetsInThisPart - (columnIndex, index-on-partition pairs). In the above example this would + * include (4, 2) and (4,3) since we desire the 2nd element for column + * index 4 on this partition and the 3rd element. 
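+ *
+ * Because the matches are accumulated in an ArrayBuffer, every matching
+ * (columnIndex, value) pair for this partition is held in memory before the iterator
+ * is returned; the iterator-to-iterator version below stays lazy and avoids that.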
+ * @return All of the rank statistics that live in this partition as an iterator of (columnIndex, value pairs) + */ + def withArrayBuffer(valueColumnPairsIter : Iterator[((Double, Int), Long)], + targetsInThisPart: List[(Int, Long)] ): Iterator[(Int, Double)] = { + + val columnsRelativeIndex: Predef.Map[Int, List[Long]] = targetsInThisPart.groupBy(_._1).mapValues(_.map(_._2)) + + //the column indices of the pairs that are desired rank statistics that live in this partition. + val columnsInThisPart: List[Int] = targetsInThisPart.map(_._1).distinct + + //a HashMap with the running totals of each column index. As we loop through the iterator + //we will update the hashmap as we see elements of each column index. + val runningTotals : mutable.HashMap[Int, Long]= new mutable.HashMap() + runningTotals ++= columnsInThisPart.map(columnIndex => (columnIndex, 0L)).toMap + + //we use an array buffer to build the resulting iterator + val result: ArrayBuffer[(Int, Double)] = new scala.collection.mutable.ArrayBuffer() + + valueColumnPairsIter.foreach { + case ((value, colIndex), count) => + + if (columnsInThisPart contains colIndex) { + + val total = runningTotals(colIndex) + //the ranks that are contains by this element of the input iterator. + //get by filtering the + val ranksPresent = columnsRelativeIndex(colIndex) + .filter(index => (index <= count + total) && (index > total)) + + ranksPresent.foreach(r => result += ((colIndex, value))) + + //update the running totals. + runningTotals.update(colIndex, total + count) + } + } + //convert + result.toIterator + } + //end::notIter[] + + //tag::iterToIter[] + /** + * Same function as above but rather than building the result from an array buffer we use + * a flatMap on the iterator to get the resulting iterator. + */ + def asIteratorToIteratorTransformation(valueColumnPairsIter : Iterator[((Double, Int), Long)], + targetsInThisPart: List[(Int, Long)] ): Iterator[(Int, Double)] = { + + val columnsRelativeIndex = targetsInThisPart.groupBy(_._1).mapValues(_.map(_._2)) + val columnsInThisPart = targetsInThisPart.map(_._1).distinct + + val runningTotals : mutable.HashMap[Int, Long]= new mutable.HashMap() + runningTotals ++= columnsInThisPart.map(columnIndex => (columnIndex, 0L)).toMap + + //filter out the pairs that don't have a column index that is in this part + val pairsWithRanksInThisPart = valueColumnPairsIter.filter{ + case (((value, colIndex), count)) => + columnsInThisPart contains colIndex + } + + //map the valueColumn pairs to a list of (colIndex, value) pairs that correspond to one of the + //desired rank statistics on this partition. 
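+ //flatMap over an iterator is lazy, so no intermediate collection is materialized here:
+ //each incoming ((value, colIndex), count) is expanded into zero or more (colIndex, value)
+ //pairs only as the downstream consumer pulls from the returned iterator.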
+ pairsWithRanksInThisPart.flatMap{ + + case (((value, colIndex), count)) => + + val total = runningTotals(colIndex) + val ranksPresent: List[Long] = columnsRelativeIndex(colIndex) + .filter(index => (index <= count + total) && (index > total)) + + val nextElems: Iterator[(Int, Double)] = ranksPresent.map(r => (colIndex, value)).toIterator + + //update the running totals + runningTotals.update(colIndex, total + count) + nextElems + } + } + //end::iterToIter[] +} \ No newline at end of file diff --git a/src/main/scala/com/high-performance-spark-examples/GoldiLocks/RDDJoinExamples.scala b/src/main/scala/com/high-performance-spark-examples/GoldiLocks/RDDJoinExamples.scala new file mode 100644 index 0000000..9578ea5 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/GoldiLocks/RDDJoinExamples.scala @@ -0,0 +1,96 @@ +package com.highperformancespark.examples.goldilocks + +import org.apache.spark.HashPartitioner +import org.apache.spark.rdd.RDD + +object RDDJoinExamples { + + /* For Example, suppose we have one RDD with some data in the form (Panda id, score) + and another RDD with (Panda id, address), and we want to send each Panda some mail + with her best score. We could join the RDDs on ID and then compute the best score + for each address. Like this: + + 'ToDo: Insert Example' + + However, this is slower than first reducing the score data, so that the + //first dataset contains only one row for each Panda with her best score and then + //joining that data with the address data. + + 'ToDO: Insert an example of this' */ + //tag::joinScoresWithAddress[] + def joinScoresWithAddress1( scoreRDD : RDD[(Long, Double)], + addressRDD : RDD[(Long, String )]) : RDD[(Long, (Double, String))]= { + val joinedRDD = scoreRDD.join(addressRDD) + joinedRDD.reduceByKey( (x, y) => if(x._1 > y._1) x else y ) + } + //end::joinScoresWithAddress[] + + //tag::leftOuterJoinScoresWithAddress[] + def outerJoinScoresWithAddress( scoreRDD : RDD[(Long, Double)], + addressRDD : RDD[(Long, String )]) : RDD[(Long, (Double, Option[String]))]= { + val joinedRDD = scoreRDD.leftOuterJoin(addressRDD) + joinedRDD.reduceByKey( (x, y) => if(x._1 > y._1) x else y ) + } + //end::leftOuterJoinScoresWithAddress[] + + //tag::joinScoresWithAddressFast[] + def joinScoresWithAddress2( scoreRDD : RDD[(Long, Double)], + addressRDD : RDD[(Long, String )]) : RDD[(Long, (Double, String))]= { + //stuff + val bestScoreData = scoreRDD.reduceByKey((x, y) => if(x > y) x else y) + bestScoreData.join(addressRDD) + + } + //end::joinScoresWithAddressFast[] +/* + We could make the example in the previous section even faster, + by using the partitioner for the address data as an argument for + the reduce by key step. + 'ToDO: Insert the code to show this here' */ + //tag::joinScoresWithAddress3[] + def joinScoresWithAddress3( scoreRDD : RDD[(Long, Double)], + addressRDD : RDD[(Long, String )]) : RDD[(Long, (Double, String))]= { + //if addressRDD has a known partitioner we should use that, + //otherwise it has a default hash parttioner, which we can reconstrut by getting the umber of + // partitions. 
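+ //Reusing the address data's partitioner for the reduceByKey below means the reduced
+ //score data comes out already partitioned the way the join wants it, so the join that
+ //follows does not have to shuffle the reduced data a second time.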
+ val addressDataPartitioner = addressRDD.partitioner match { + case (Some(p)) => p + case (None) => new HashPartitioner(addressRDD.partitions.length) + } + val bestScoreData = scoreRDD.reduceByKey(addressDataPartitioner, (x, y) => if(x > y) x else y) + bestScoreData.join(addressRDD) + } + //end::joinScoresWithAddress3[] + + def debugString( scoreRDD : RDD[(Long, Double)], + addressRDD : RDD[(Long, String )]) = { + //tag::debugString[] + scoreRDD.join(addressRDD).toDebugString + //end::debugString[] + } + + /* + * Suppose we had two datasets of information about each panda, + * one with the scores, and one with there favorite foods. + * We could use cogroup to associate each Pandas id with an iterator + * of their scores and another iterator of their favorite foods. + */ + + + def coGroupExample( scoreRDD : RDD[(Long, Double)], foodRDD : RDD[(Long, String )], + addressRDD : RDD[(Long, String )]) = { + //tag::coGroupExample1[] + val cogroupedRDD: RDD[(Long, (Iterable[Double], Iterable[String]))] = scoreRDD.cogroup(foodRDD) + //end::coGroupExample1[] + + /* + * For example, if we needed to join the panda score data with both address + * and favorite foods, it would be better to use co group than two + * join operations. + */ + + //tag::coGroupExample2[] + val addressScoreFood = addressRDD.cogroup(scoreRDD, foodRDD) + //end::coGroupExample2[] + } + } diff --git a/src/main/scala/com/high-performance-spark-examples/GoldiLocks/SecondarySort.scala b/src/main/scala/com/high-performance-spark-examples/GoldiLocks/SecondarySort.scala new file mode 100644 index 0000000..d7d92a0 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/GoldiLocks/SecondarySort.scala @@ -0,0 +1,102 @@ +package com.highperformancespark.examples.goldilocks + +import scala.collection.mutable.ArrayBuffer +import scala.reflect.ClassTag + +import org.apache.spark.{HashPartitioner, Partitioner} +import org.apache.spark.rdd.RDD + +object SecondarySort { + + //tag::sortByTwoKeys[] + def sortByTwoKeys[K : Ordering : ClassTag , S, V : ClassTag](pairRDD : RDD[((K, S), V)], partitions : Int ) = { + val colValuePartitioner = new PrimaryKeyPartitioner[K, S](partitions) + implicit val ordering: Ordering[(K, S)] = Ordering.by(_._1) + val sortedWithinParts = pairRDD.repartitionAndSortWithinPartitions( + colValuePartitioner) + sortedWithinParts + } + //end::sortByTwoKeys[] + + //tag::sortAndGroup[] + def groupByKeyAndSortBySecondaryKey[K : Ordering : ClassTag, S, V : ClassTag](pairRDD : RDD[((K, S), V)], partitions : Int ) = { + val colValuePartitioner = new PrimaryKeyPartitioner[Double, Int](partitions) + implicit val ordering: Ordering[(K, S)] = Ordering.by(_._1) + val sortedWithinParts = pairRDD.repartitionAndSortWithinPartitions( + colValuePartitioner) + sortedWithinParts.mapPartitions( iter => groupSorted[K, S, V](iter) ) + } + + def groupSorted[K,S,V]( + it: Iterator[((K, S), V)]): Iterator[(K, List[(S, V)])] = { + val res = List[(K, ArrayBuffer[(S, V)])]() + it.foldLeft(res)((list, next) => list match { + case Nil => + val ((firstKey, secondKey), value) = next + List((firstKey, ArrayBuffer((secondKey, value)))) + + case head :: rest => + val (curKey, valueBuf) = head + val ((firstKey, secondKey), value) = next + if (!firstKey.equals(curKey) ) { + (firstKey, ArrayBuffer((secondKey, value))) :: list + } else { + valueBuf.append((secondKey, value)) + list + } + + }).map { case (key, buf) => (key, buf.toList) }.iterator + } + //end::sortAndGroup[] + +} + +//tag::primaryKeyPartitioner[] +class PrimaryKeyPartitioner[K, 
S](partitions: Int) extends Partitioner { + /** + * We create a hash partitioner and use it with the first set of keys. + */ + val delegatePartitioner = new HashPartitioner(partitions) + + override def numPartitions = delegatePartitioner.numPartitions + + /** + * Partition according to the hash value of the first key + */ + override def getPartition(key: Any): Int = { + val k = key.asInstanceOf[(K, S)] + delegatePartitioner.getPartition(k._1) + } +} +//end::primaryKeyPartitioner[] + +object CoPartitioningLessons { + + def coLocated(a : RDD[(Int, String)], b : RDD[(Int, String)], + partitionerX : Partitioner, partitionerY :Partitioner): Unit = { + + //tag::coLocated + val rddA = a.partitionBy(partitionerX) + rddA.cache() + val rddB = b.partitionBy(partitionerY) + rddB.cache() + val rddC = a.cogroup(b) + rddC.count() + //end::coLocated[] + } + + def notCoLocated(a : RDD[(Int, String)], b : RDD[(Int, String )], + partitionerX : Partitioner, partitionerY :Partitioner): Unit = { + + //tag::notCoLocated + val rddA = a.partitionBy(partitionerX) + rddA.cache() + val rddB = b.partitionBy(partitionerY) + rddB.cache() + val rddC = a.cogroup(b) + rddA.count() + rddB.count() + rddC.count() + //end::notCoLocated[] + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala new file mode 100644 index 0000000..f91728d --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala @@ -0,0 +1,302 @@ +/** + * Happy Panda Example for DataFrames. Computes the % of happy pandas. Very contrived. + */ +package com.highperformancespark.examples.dataframe + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +//tag::sparkSQLImports[] +import org.apache.spark.sql.{DataFrame, SQLContext, Row} +import org.apache.spark.sql.catalyst.expressions.aggregate._ +import org.apache.spark.sql.expressions._ +import org.apache.spark.sql.functions._ +//end::sparkSQLImports[] + +//tag::sparkHiveImports[] +// Additional imports for using HiveContext +import org.apache.spark.sql.hive._ +import org.apache.spark.sql.hive.thriftserver._ +//end::sparkHiveImports[] + +object HappyPandas { + /** + * Creates SQLContext with an existing SparkContext. + */ + def sqlContext(sc: SparkContext): SQLContext = { + //tag::createSQLContext[] + val sqlContext = new SQLContext(sc) + // Import the implicits, unlike in core Spark the implicits are defined on the context + import sqlContext.implicits._ + //end::createSQLContext[] + sqlContext + } + + /** + * Creates HiveContext with an existing SparkContext. + */ + def hiveContext(sc: SparkContext): HiveContext = { + //tag::createHiveContext[] + val hiveContext = new HiveContext(sc) + // Import the implicits, unlike in core Spark the implicits are defined on the context + import hiveContext.implicits._ + //end::createHiveContext[] + hiveContext + } + + /** + * Illustrate loading some JSON data. 
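+ * The JSON reader infers the schema by sampling the input; the samplingRatio option
+ * used below controls how much of the input is read for inference (1.0 scans all of it).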
+ */ + def loadDataSimple(sc: SparkContext, sqlCtx: SQLContext, path: String): DataFrame = { + //tag::loadPandaJSONSimple[] + val df1 = sqlCtx.read.json(path) + //end::loadPandaJSONSimple[] + //tag::loadPandaJSONComplex[] + val df2 = sqlCtx.read.format("json").option("samplingRatio", "1.0").load(path) + //end::loadPandaJSONComplex[] + val jsonRDD = sc.textFile(path) + //tag::loadPandaJsonRDD[] + val df3 = sqlCtx.read.json(jsonRDD) + //end::loadPandaJSONRDD[] + df1 + } + + def jsonLoadFromRDD(sqlCtx: SQLContext, input: RDD[String]): DataFrame = { + //tag::loadPandaJSONRDD[] + val rdd: RDD[String] = input.filter(_.contains("panda")) + val df = sqlCtx.read.json(rdd) + //end::loadPandaJSONRDD[] + df + } + + // Here will be some examples on PandaInfo DataFrame + + /** + * @param place name of place + * @param pandaType type of pandas in this place + * @param happyPandas number of happy pandas in this place + * @param totalPandas total number of pandas in this place + */ + case class PandaInfo(place: String, pandaType: String, happyPandas: Integer, totalPandas: Integer) + + /** + * Gets the percentage of happy pandas per place. + * + * @param pandaInfo the input DataFrame + * @return Returns DataFrame of (place, percentage of happy pandas) + */ + def happyPandasPercentage(pandaInfo: DataFrame): DataFrame = { + pandaInfo.select(pandaInfo("place"), (pandaInfo("happyPandas") / pandaInfo("totalPandas")).as("percentHappy")) + } + + //tag::encodePandaType[] + /** + * Encodes pandaType to Integer values instead of String values. + * + * @param pandaInfo the input DataFrame + * @return Returns a DataFrame of pandaId and integer value for pandaType. + */ + def encodePandaType(pandaInfo: DataFrame): DataFrame = { + pandaInfo.select(pandaInfo("id"), + (when(pandaInfo("pt") === "giant", 0). + when(pandaInfo("pt") === "red", 1). + otherwise(2)).as("encodedType") + ) + } + //end::encodePandaType[] + + /** + * Gets places with happy pandas more than minHappinessBound. + */ + def minHappyPandas(pandaInfo: DataFrame, minHappyPandas: Int): DataFrame = { + pandaInfo.filter(pandaInfo("happyPandas") >= minHappyPandas) + } + + /** + * Extra the panda info from panda places and compute the squisheness of the panda + */ + def squishPandaFromPace(pandaPlace: DataFrame): DataFrame = { + //tag::selectExplode[] + val pandaInfo = pandaPlace.explode(pandaPlace("pandas")){ + case Row(pandas: Seq[Row]) => + pandas.map{ + case Row(id: Long, zip: String, pt: String, happy: Boolean, attrs: Seq[Double]) => + RawPanda(id, zip, pt, happy, attrs.toArray) + }} + pandaInfo.select( + (pandaInfo("attributes")(0) / pandaInfo("attributes")(1)) + .as("squishyness")) + //end::selectExplode[] + } + + /** + * Find pandas that are sad + */ + def sadPandas(pandaInfo: DataFrame): DataFrame = { + //tag::simpleFilter[] + pandaInfo.filter(pandaInfo("happy") !== true) + //end::simpleFilter[] + } + + /** + * Find pandas that are happy and fuzzier than squishy. + */ + def happyFuzzyPandas(pandaInfo: DataFrame): DataFrame = { + //tag::complexFilter[] + pandaInfo.filter( + pandaInfo("happy").and(pandaInfo("attributes")(0) > pandaInfo("attributes")(1)) + ) + //end::complexFilter[] + } + + /** + * Gets places that contains happy pandas more than unhappy pandas. + */ + def happyPandasPlaces(pandaInfo: DataFrame): DataFrame = { + pandaInfo.filter(pandaInfo("happyPandas") >= pandaInfo("totalPandas") / 2) + } + + + /** + * Remove duplicate pandas by id. 
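+ * Unlike distinct(), which compares entire rows, dropDuplicates with a column list
+ * keeps one arbitrary row per distinct id.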
+ */ + def removeDuplicates(pandas: DataFrame): DataFrame = { + //tag::dropDuplicatePandaIds[] + pandas.dropDuplicates(List("id")) + //end::dropDuplicatePandaIds[] + } + + /** + * @param name name of panda + * @param zip zip code + * @param pandaSize size of panda in KG + * @param age age of panda + */ + case class Pandas(name: String, zip: String, pandaSize: Integer, age: Integer) + + def describePandas(pandas: DataFrame): DataFrame = { + //tag::pandaSizeRangeVarDescribe[] + pandas.describe() + //end::pandaSizeRangeVarDescribe[] + } + + //tag::maxPandaSizePerZip[] + def maxPandaSizePerZip(pandas: DataFrame): DataFrame = { + pandas.groupBy(pandas("zip")).max("pandaSize") + } + //end::maxPandaSizePerZip[] + + //tag::minMaxPandasSizePerZip[] + def minMaxPandaSizePerZip(pandas: DataFrame): DataFrame = { + pandas.groupBy(pandas("zip")).agg(min("pandaSize"), max("pandaSize")) + } + //end::minMaxPandasSizePerZip[] + + def minPandaSizeMaxAgePerZip(pandas: DataFrame): DataFrame = { + // this query can be written in two methods + + // 1 + pandas.groupBy(pandas("zip")).agg(("pandaSize", "min"), ("age", "max")) + + // 2 + pandas.groupBy(pandas("zip")).agg(Map("pandaSize" -> "min", "age" -> "max")) + } + + //tag::complexAggPerZip[] + def minMeanSizePerZip(pandas: DataFrame): DataFrame = { + // Compute the min and mean + pandas.groupBy(pandas("zip")).agg(min(pandas("pandaSize")), mean(pandas("pandaSize"))) + } + //end::complexAggPerZip[] + + def simpleSqlExample(pandas: DataFrame): DataFrame = { + val sqlCtx = pandas.sqlContext + //tag::pandasSQLQuery[] + pandas.registerTempTable("pandas") + val miniPandas = sqlCtx.sql("SELECT * FROM pandas WHERE pandaSize < 12") + //end::pandasSQLQuery[] + miniPandas + } + + def startJDBCServer(sqlContext: HiveContext): Unit = { + //tag::startJDBC[] + sqlContext.setConf("hive.server2.thrift.port", "9090") + HiveThriftServer2.startWithContext(sqlContext) + //end::startJDBC[] + } + + /** + * Orders pandas by size ascending and by age descending. + * Pandas will be sorted by "size" first and if two pandas have the same "size" + * will be sorted by "age". 
+ */ + def orderPandas(pandas: DataFrame): DataFrame = { + //tag::simpleSort[] + pandas.orderBy(pandas("pandaSize").asc, pandas("age").desc) + //end::simpleSort[] + } + + def computeRelativePandaSizes(pandas: DataFrame): DataFrame = { + //tag::relativePandaSizesWindow[] + val windowSpec = Window + .orderBy(pandas("age")) + .partitionBy(pandas("zip")) + .rowsBetween(start = -10, end = 10) // can use rangeBetween for range instead + //end::relativePandaSizesWindow[] + + //tag::relativePandaSizesQuery[] + val pandaRelativeSizeCol = pandas("pandaSize") - + avg(pandas("pandaSize")).over(windowSpec) + + pandas.select(pandas("name"), pandas("zip"), pandas("pandaSize"), pandas("age"), + pandaRelativeSizeCol.as("panda_relative_size")) + //end::relativePandaSizesQuery[] + } + + // Join DataFrames of Pandas and Sizes with + def joins(df1: DataFrame, df2: DataFrame): Unit = { + + //tag::innerJoin[] + // Inner join implicit + df1.join(df2, df1("name") === df2("name")) + // Inner join explicit + df1.join(df2, df1("name") === df2("name"), "inner") + //end::innerJoin[] + + //tag::leftouterJoin[] + // Left outer join explicit + df1.join(df2, df1("name") === df2("name"), "left_outer") + //end::leftouterJoin[] + + //tag::rightouterJoin[] + // Right outer join explicit + df1.join(df2, df1("name") === df2("name"), "right_outer") + //end::rightouterJoin[] + + //tag::leftsemiJoin[] + // Left semi join explicit + df1.join(df2, df1("name") === df2("name"), "leftsemi") + //end::leftsemiJoin[] + } + + /** + * Cut the lineage of a DataFrame which has too long a query plan. + */ + def cutLineage(df: DataFrame): DataFrame = { + val sqlCtx = df.sqlContext + //tag::cutLineage[] + val rdd = df.rdd + rdd.cache() + sqlCtx.createDataFrame(rdd, df.schema) + //end::cutLineage[] + } + + // Self join + def selfJoin(df: DataFrame): DataFrame = { + val sqlCtx = df.sqlContext + import sqlCtx.implicits._ + //tag::selfJoin[] + val joined = df.as("a").join(df.as("b")).where($"a.name" === $"b.name") + //end::selfJoin[] + joined + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala new file mode 100644 index 0000000..2c865f8 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala @@ -0,0 +1,127 @@ +/** + * Load and save data to/from DataFrames + */ +package com.highperformancespark.examples.dataframe + +import java.util.Properties + +import org.apache.spark.rdd._ +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ + +case class LoadSave(sqlContext: SQLContext) { + import sqlContext.implicits._ + //tag::createFromRDD[] + def createFromCaseClassRDD(input: RDD[PandaPlace]) = { + // Create DataFrame explicitly using sqlContext and schema inference + val df1 = sqlContext.createDataFrame(input) + + // Create DataFrame using sqlContext implicits and schema inference + val df2 = input.toDF() + + // Create a Row RDD from our RDD of case classes + val rowRDD = input.map(pm => Row(pm.name, + pm.pandas.map(pi => Row(pi.id, pi.zip, pi.happy, pi.attributes)))) + + val pandasType = ArrayType(StructType(List( + StructField("id", LongType, true), + StructField("zip", StringType, true), + StructField("happy", BooleanType, true), + StructField("attributes", ArrayType(FloatType), true)))) + + // Create DataFrame explicitly with specified schema + val schema = StructType(List(StructField("name", StringType, true), + StructField("pandas", pandasType))) + + val df3 = 
sqlContext.createDataFrame(rowRDD, schema) + } + //end::createFromRDD[] + + //tag::createFromLocal[] + def createFromLocal(input: Seq[PandaPlace]) = { + sqlContext.createDataFrame(input) + } + //end::createFromLocal[] + + //tag::collectResults[] + def collectDF(df: DataFrame) = { + val result: Array[Row] = df.collect() + result + } + //end::collectResults[] + + //tag::toRDD[] + def toRDD(input: DataFrame): RDD[RawPanda] = { + val rdd: RDD[Row] = input.rdd + rdd.map(row => RawPanda(row.getAs[Long](0), row.getAs[String](1), + row.getAs[String](2), row.getAs[Boolean](3), row.getAs[Array[Double]](4))) + } + //end::toRDD[] + + //tag::partitionedOutput[] + def writeOutByZip(input: DataFrame): Unit = { + input.write.partitionBy("zipcode").format("json").save("output/") + } + //end::partitionedOutput[] + + //tag::saveAppend[] + def writeAppend(input: DataFrame): Unit = { + input.write.mode(SaveMode.Append).save("output/") + } + //end::saveAppend[] + + def createJDBC() = { + //tag::createJDBC[] + sqlContext.read.jdbc("jdbc:dialect:serverName;user=user;password=pass", + "table", new Properties) + + sqlContext.read.format("jdbc") + .option("url", "jdbc:dialect:serverName") + .option("dbtable", "table").load() + //end::createJDBC[] + } + + def writeJDBC(df: DataFrame) = { + //tag::writeJDBC[] + df.write.jdbc("jdbc:dialect:serverName;user=user;password=pass", + "table", new Properties) + + df.write.format("jdbc") + .option("url", "jdbc:dialect:serverName") + .option("user", "user") + .option("password", "pass") + .option("dbtable", "table").save() + //end::writeJDBC[] + } + + //tag::loadParquet[] + def loadParquet(path: String): DataFrame = { + // Configure Spark to read binary data as string, note: must be configured on SQLContext + sqlContext.setConf("spark.sql.parquet.binaryAsString", "true") + + // Load parquet data using merge schema (configured through option) + sqlContext.read + .option("mergeSchema", "true") + .format("parquet") + .load(path) + } + //end::loadParquet[] + + //tag::writeParquet[] + def writeParquet(df: DataFrame, path: String) = { + df.write.format("parquet").save(path) + } + //end::writeParquet[] + + //tag::loadHiveTable[] + def loadHiveTable(): DataFrame = { + sqlContext.read.table("pandas") + } + //end::loadHiveTable[] + + //tag::saveManagedTable[] + def saveManagedTable(df: DataFrame): Unit = { + df.write.saveAsTable("pandas") + } + //end::saveManagedTable[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala new file mode 100644 index 0000000..83e4d86 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala @@ -0,0 +1,140 @@ +/** + * A sample mixing relational & functional transformations with Datasets. + */ +package com.highperformancespark.examples.dataframe + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.expressions.aggregate._ +import org.apache.spark.sql.expressions._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +// Additional imports for using HiveContext +import org.apache.spark.sql.hive._ +import org.apache.spark.sql.hive.thriftserver._ + +class MixedDataset(sqlCtx: SQLContext) { + import sqlCtx.implicits._ + + /** + * A sample function on a Dataset of RawPandas. 
+ * This is contrived, since our reduction could also be done with SQL aggregates, but + * we can see the flexibility of being able to specify arbitrary Scala code. + */ + def happyPandaSums(ds: Dataset[RawPanda]): Double = { + ds.toDF().filter($"happy" === true).as[RawPanda]. + select($"attributes"(0).as[Double]). + reduce((x, y) => x + y) + } + + /** + * A sample function on a Dataset of RawPandas. + * Use the first attribute to deterimine if a panda is squishy. + */ + //tag::basicSelect[] + def squishyPandas(ds: Dataset[RawPanda]): Dataset[(Long, Boolean)] = { + ds.select($"id".as[Long], ($"attributes"(0) > 0.5).as[Boolean]) + } + //end::basicSelect[] + + /** + * Union happy and sad pandas + */ + //tag::basicUnion[] + def unionPandas(happyPandas: Dataset[RawPanda], sadPandas: Dataset[RawPanda]) = { + happyPandas.union(sadPandas) + } + //end::basicUnion[] + + /** + * Functional map + Dataset, sums the positive attributes for the pandas + */ + //tag::functionalQuery[] + def funMap(ds: Dataset[RawPanda]): Dataset[Double] = { + ds.map{rp => rp.attributes.filter(_ > 0).sum} + } + //end::functionalQuery[] + + //tag::maxPandaSizePerZip[] + def maxPandaSizePerZip(ds: Dataset[RawPanda]): Dataset[(String, Double)] = { + ds.groupBy($"zip").keyAs[String].agg(max("attributes(2)").as[Double]) + } + //end::maxPandaSizePerZip[] + + //tag::maxPandaSizePerZipScala[] + def maxPandaSizePerZipScala(ds: Dataset[RawPanda]): Dataset[(String, Double)] = { + ds.groupBy($"zip").keyAs[String].mapGroups{ case (g, iter) => + (g, iter.map(_.attributes(2)).reduceLeft(Math.max(_, _))) + } + } + //end::maxPandaSizePerZipScala[] + + /** + * Illustrate how we make typed queries, using some of the float properties to produce boolean + * values. + */ + def typedQueryExample(ds: Dataset[RawPanda]): Dataset[Double] = { + ds.select($"attributes"(0).as[Double]) + } + + /** + * Illustrate Dataset joins + */ + def joinSample(pandas: Dataset[RawPanda], coffeeShops: Dataset[CoffeeShop]): + Dataset[(RawPanda, CoffeeShop)] = { + //tag::joinWith[] + val result: Dataset[(RawPanda, CoffeeShop)] = pandas.joinWith(coffeeShops, + $"zip" === $"zip") + //end::joinWith[] + result + } + + /** + * Illustrate a self join to compare pandas in the same zip code + */ + def selfJoin(pandas: Dataset[RawPanda]): + Dataset[(RawPanda, RawPanda)] = { + //tag::selfJoin[] + val result: Dataset[(RawPanda, RawPanda)] = pandas.joinWith(pandas, + $"zip" === $"zip") + //end::selfJoin[] + result + } + + //tag::fromRDD[] + /** + * Illustrate converting an RDD to DS + */ + def fromRDD(rdd: RDD[RawPanda]): Dataset[RawPanda] = { + rdd.toDS + } + + //end::fromRDD[] + + //tag::toRDDDF[] + /** + * Illustrate converting a Dataset to an RDD + */ + def toRDD(ds: Dataset[RawPanda]): RDD[RawPanda] = { + ds.rdd + } + + /** + * Illustrate converting a Dataset to a DataFrame + */ + def toDF(ds: Dataset[RawPanda]): DataFrame = { + ds.toDF() + } + //end::toRDDDF[] + + /** + * Illustrate DataFrame to Dataset. Its important to note that if the schema does not match what + * is expected by the Dataset this fails fast. 
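+ * For example, calling fromDF on a DataFrame that is missing one of RawPanda's columns
+ * fails during analysis of as[RawPanda] (with an unresolved-column error) rather than
+ * waiting for the first action.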
+ */ + //tag::DataFrameAsDataset[] + def fromDF(df: DataFrame): Dataset[RawPanda] = { + df.as[RawPanda] + } + //end::DataFrameAsDataset[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back b/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back new file mode 100644 index 0000000..cdae7c1 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back @@ -0,0 +1,67 @@ +/** + * A sample mixing relational & functional transformations with Datasets. + */ +package com.highperformancespark.examples.dataframe + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.expressions.aggregate._ +import org.apache.spark.sql.expressions._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +// Additional imports for using HiveContext +import org.apache.spark.sql.hive._ +import org.apache.spark.sql.hive.thriftserver._ + +class MixedDataset(sqlCtx: SQLContext) { + import sqlCtx.implicits._ + + /** + * A sample function on a Dataset of RawPandas. + * This is contrived, since our reduction could also be done with SQL aggregates, but + * we can see the flexibility of being able to specify arbitrary Scala code. + */ + def happyPandaSums(ds: Dataset[RawPanda]): Double = { + ds.toDF().filter($"happy" === true).as[RawPanda]. + select($"attributes"(0).as[Double]). + reduce((x, y) => x + y) + } + + /** + * Functional map + Dataset, sums the positive attributes for the pandas + */ + def funMap(ds: Dataset[RawPanda]): Dataset[Double] = { + ds.map{rp => rp.attributes.filter(_ > 0).sum} + } + + /** + * Illustrate how we make typed queries, using some of the float properties to produce boolean + * values. + */ + def typedQueryExample(ds: Dataset[RawPanda]): Dataset[Double] = { + ds.select($"attributes"(0).as[Double]) + } + + /** + * Illustrate converting a Dataset to an RDD + */ + def toRDD(ds: Dataset[RawPanda]): RDD[RawPanda] = { + ds.rdd + } + + /** + * Illustrate converting a Dataset to a DataFrame + */ + def toDF(ds: Dataset[RawPanda]): DataFrame = { + ds.toDF() + } + + /** + * Illustrate DataFrame to Dataset. Its important to note that if the schema does not match what + * is expected by the Dataset this fails fast. 
+ */ + def fromDF(df: DataFrame): Dataset[RawPanda] = { + df.as[RawPanda] + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala new file mode 100644 index 0000000..d118130 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala @@ -0,0 +1,17 @@ +package com.highperformancespark.examples.dataframe +/** + * @param id panda id + * @param zip zip code of panda residence + * @param pt Type of panda as a string + * @param happy if panda is happy + * @param attributes array of panada attributes + */ +case class RawPanda(id: Long, zip: String, pt: String, happy: Boolean, attributes: Array[Double]) + +/** + * @param name place name + * @param pandas pandas in that place + */ +case class PandaPlace(name: String, pandas: Array[RawPanda]) + +case class CoffeeShop(zip: String, name: String) diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala new file mode 100644 index 0000000..a25a97f --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala @@ -0,0 +1,29 @@ +/** + * Using plain-old-sql + */ +package com.highperformancespark.examples.dataframe + +import org.apache.spark.sql._ + +case class RegularSQL(sqlContext: SQLContext) { + + //tag::queryTable[] + def querySQL(): DataFrame = { + sqlContext.sql("SELECT * FROM pandas WHERE size > 0") + } + //end::queryTable[] + + // TODO: Holden: include a parquet example file and point this to that. + //tag::queryRawFile[] + def queryRawFile(): DataFrame = { + sqlContext.sql("SELECT * FROM parquet.`path_to_parquet_file`") + } + //end::queryRawFile[] + + //tag::registerTable[] + def registerTable(df: DataFrame): Unit = { + df.registerTempTable("pandas") + df.saveAsTable("perm_pandas") + } + //end::registerTable[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala new file mode 100644 index 0000000..56d4beb --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala @@ -0,0 +1,58 @@ +/** + * Example UDFs + */ +package com.highperformancespark.examples.dataframe + +import org.apache.spark.sql._ +import org.apache.spark.sql.expressions._ +import org.apache.spark.sql.types._ + +object UDFs { + //tag::setupUDFs[] + def setupUDFs(sqlCtx: SQLContext) = { + sqlCtx.udf.register("strLen", (s: String) => s.length()) + } + //end::setupUDFs[] + + //tag::setupUDAFs[] + def setupUDAFs(sqlCtx: SQLContext) = { + class Avg extends UserDefinedAggregateFunction { + // Input type + def inputSchema: org.apache.spark.sql.types.StructType = + StructType(StructField("value", DoubleType) :: Nil) + + def bufferSchema: StructType = StructType( + StructField("count", LongType) :: + StructField("sum", DoubleType) :: Nil + ) + + // Return type + def dataType: DataType = DoubleType + + def deterministic: Boolean = true + + def initialize(buffer: MutableAggregationBuffer): Unit = { + buffer(0) = 0L + buffer(1) = 0.0 + } + + def update(buffer: MutableAggregationBuffer,input: Row): Unit = { + buffer(0) = buffer.getAs[Long](0) + 1 + buffer(1) = buffer.getAs[Double](1) + input.getAs[Double](0) + } + + def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { + buffer1(0) = buffer1.getAs[Long](0) + buffer2.getAs[Long](0) + buffer1(1) = 
buffer1.getAs[Double](1) + buffer2.getAs[Double](1) + } + + def evaluate(buffer: Row): Any = { + buffer.getDouble(1) / buffer.getLong(0) + } + } + // Optionally register + val avg = new Avg + sqlCtx.udf.register("ourAvg", avg) + } + //end::setupUDAFs[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala b/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala new file mode 100644 index 0000000..198518d --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala @@ -0,0 +1,9 @@ +package com.highperformancespark.examples.ffi + +import org.apache.spark.rdd.RDD + +object NativeExample { + def jniSum(input: RDD[(String, Array[Int])]): RDD[(String, Int)] = { + input.mapValues(values => new SumJNI().sum(values)) + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala b/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala new file mode 100644 index 0000000..7a83aa4 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala @@ -0,0 +1,8 @@ +package com.highperformancespark.examples.ffi + +object StandAlone { + def main(args: Array[String]) { + System.loadLibrary("highPerformanceSpark0") + println(new SumJNI().sum(Array(1,2,3))) + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala b/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala new file mode 100644 index 0000000..de848bb --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala @@ -0,0 +1,8 @@ +package com.highperformancespark.examples.ffi + +import ch.jodersky.jni.nativeLoader + +@nativeLoader("high-performance-spark0") +class SumJNI { + @native def sum(n: Array[Int]): Int +} diff --git a/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala b/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala new file mode 100644 index 0000000..9629451 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.highperformancespark.examples.perf + +import com.highperformancespark.examples.dataframe.RawPanda +import com.highperformancespark.examples.tools._ + +import org.apache.spark.rdd._ +import org.apache.spark.{SparkContext, SparkConf} +import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.hive.HiveContext +import org.apache.spark.sql.types._ + +/** + * A simple performance test to compare a simple sort between DataFrame, and RDD + */ +object SimplePerfTest { + def main(args: Array[String]) = { + val sparkConf = new SparkConf().setAppName("simple-perf-test") + val sc = new SparkContext(sparkConf) + val sqlCtx = new HiveContext(sc) + val scalingFactor = if (args.length > 0) args(0).toLong else 100L + val size = if (args.length > 1) args(1).toInt else 50 + run(sc, sqlCtx, scalingFactor, size) + } + + def run(sc: SparkContext, sqlCtx: HiveContext, scalingFactor: Long, size: Int) = { + import sqlCtx.implicits._ + val inputRDD = GenerateScalingData.generateFullGoldilocks(sc, scalingFactor, size) + val pairRDD = inputRDD.map(p => (p.zip.toInt, p.attributes(0))) + pairRDD.cache() + pairRDD.count() + val rddTimeings = 1.to(10).map(x => time(testOnRDD(pairRDD))) + val groupTimeings = 1.to(10).map(x => time(groupOnRDD(pairRDD))) + val df = inputRDD.toDF() + val inputDataFrame = df.select(df("zip").cast(IntegerType), df("attributes")(0).as("fuzzyness").cast(DoubleType)) + inputDataFrame.cache() + inputDataFrame.count() + val dataFrameTimeings = 1.to(10).map(x => time(testOnDataFrame(inputDataFrame))) + println(rddTimeings.map(_._2).mkString(",")) + println(groupTimeings.map(_._2).mkString(",")) + println(dataFrameTimeings.map(_._2).mkString(",")) + } + + def testOnRDD(rdd: RDD[(Int, Double)]) = { + rdd.map{case (x, y) => (x, (y, 1))}.reduceByKey{case (x, y) => (x._1 + y._1, x._2 + y._2)}.count() + } + + def groupOnRDD(rdd: RDD[(Int, Double)]) = { + rdd.groupByKey().mapValues{v => + v.aggregate((0.0, 0))({case (x, y) => (x._1 + y, x._2 + 1)}, + {case (x, y) => (x._1 + y._1, x._2 + y._2)})}.count() + } + + def testOnDataFrame(df: DataFrame) = { + df.groupBy("zip").avg("fuzzyness").count() + } + + def time[R](block: => R): (R, Long) = { + val t0 = System.nanoTime() + val result = block // call-by-name + val t1 = System.nanoTime() + println(s"Time ${t1 - t0}ns") + (result, t1 - t0) + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala b/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala new file mode 100644 index 0000000..7fb4177 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala @@ -0,0 +1,21 @@ +package com.highperformancespark.example.tokenize + +import org.apache.spark.rdd.RDD + +object SampleTokenize { + //tag::DIFFICULT[] + def difficultTokenizeRDD(input: RDD[String]) = { + input.flatMap(_.split(" ")) + } + //end::DIFFICULT[] + + //tag::EASY[] + def tokenizeRDD(input: RDD[String]) = { + input.flatMap(tokenize) + } + + protected[tokenize] def tokenize(input: String) = { + input.split(" ") + } + //end::EASY[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala b/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala new file mode 100644 index 0000000..0fbe944 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala @@ -0,0 +1,19 @@ +package com.highperformancespark.examples.tools + +import 
scala.collection.immutable.HashSet + +import com.highperformancespark.examples.dataframe.RawPanda + +import org.apache.spark._ +import org.apache.spark.rdd.RDD + +object FilterInvalidPandas { + + def filterInvalidPandas(sc: SparkContext, invalidPandas: List[Long], input: RDD[RawPanda]) = { + //tag::broadcast[] + val invalid = HashSet() ++ invalidPandas + val invalidBroadcast = sc.broadcast(invalid) + input.filter{panda => !invalidBroadcast.value.contains(panda.id)} + //end::broadcast[] + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala b/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala new file mode 100644 index 0000000..66d01d4 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala @@ -0,0 +1,65 @@ +package com.highperformancespark.examples.tools + +import com.highperformancespark.examples.dataframe.RawPanda + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.Row +import org.apache.spark.mllib.random.RandomRDDs +import org.apache.spark.mllib.linalg.Vector + +// TODO: Add tests for this +object GenerateScalingData { + /** + * Generate a Goldilocks data set. We expect the zip code to follow an exponential + * distribution and the data its self to be normal + * @param rows number of rows in the RDD (approximate) + * @param size number of value elements + */ + def generateFullGoldilocks(sc: SparkContext, rows: Long, numCols: Int): RDD[RawPanda] = { + val zipRDD = RandomRDDs.exponentialRDD(sc, mean = 1000, size = rows).map(_.toInt.toString) + val valuesRDD = RandomRDDs.normalVectorRDD(sc, numRows = rows, numCols = numCols).repartition(zipRDD.partitions.size) + val keyRDD = sc.parallelize(1L.to(rows), zipRDD.getNumPartitions) + keyRDD.zipPartitions(zipRDD, valuesRDD){ + (i1, i2, i3) => + new Iterator[(Long, String, Vector)] { + def hasNext: Boolean = (i1.hasNext, i2.hasNext, i3.hasNext) match { + case (true, true, true) => true + case (false, false, false) => false + case _ => false // Note: this is unsafe (we throw away data when one of our partitions has run out). + } + def next(): (Long, String, Vector) = (i1.next(), i2.next(), i3.next()) + } + }.map{case (k, z, v) => + RawPanda(k, z, "giant", v(0) > 0.5, v.toArray)} + } + + /** + * Transform it down to just the data used for the benchmark + */ + def generateMiniScale(sc: SparkContext, rows: Long, numCols: Int): RDD[(Int, Double)] = { + generateFullGoldilocks(sc, rows, numCols).map(p => (p.zip.toInt, p.attributes(0))) + } + + /** + * Transform it down to just the data used for the benchmark + */ + def generateMiniScaleRows(sc: SparkContext, rows: Long, numCols: Int): RDD[Row] = { + generateMiniScale(sc, rows, numCols).map{case (zip, fuzzy) => Row(zip, fuzzy)} + } + + // tag::MAGIC_PANDA[] + /** + * Generate a Goldilocks data set all with the same id. + * We expect the zip code to follow an exponential + * distribution and the data its self to be normal. + * Simplified to avoid a 3-way zip. 
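+ * Every generated RawPanda shares the id 1; the zip code, happiness flag, and
+ * attribute vector still vary per row.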
+ */ + def generateGoldilocks(sc: SparkContext, rows: Long, numCols: Int): RDD[RawPanda] = { + val zipRDD = RandomRDDs.exponentialRDD(sc, mean = 1000, size = rows).map(_.toInt.toString) + val valuesRDD = RandomRDDs.normalVectorRDD(sc, numRows = rows, numCols = numCols) + zipRDD.zip(valuesRDD).map{case (z, v) => + RawPanda(1, z, "giant", v(0) > 0.5, v.toArray)} + } + // end::MAGIC_PANDA[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala b/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala new file mode 100644 index 0000000..18934f8 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala @@ -0,0 +1,54 @@ +import scala.util.Random +import scala.reflect.{ClassTag} + +import org.apache.spark.rdd.RDD + +/** + * Sample our production data to be able to use it for tests + */ +object SampleData { + /** + * Sample the input down to roughly 10% for usage in tests + */ + def sampleInput[T](rdd: RDD[T]): RDD[T] = { + // tag::randomSampleInput[] + rdd.sample(withReplacement=false, fraction=0.1) + // end::randomSampleInput[] + } + + /** + * Construct a stratified sample + */ + def stratifiedSample(rdd: RDD[(String, Array[Double])]): RDD[(String, Array[Double])] = { + // tag::stratifiedSample[] + // 5% of the red pandas, and 50% of the giant pandas + val stratas = Map("red" -> 0.05, "giant" -> 0.50) + rdd.sampleByKey(withReplacement=false, fractions = stratas) + // end::stratifiedSample[] + } + + /** + * Custom random sample that creates a new RNG for every element. This is the slow baseline to compare against the mapPartitions version below. + */ + def slowSampleInput[T: ClassTag](rdd: RDD[T]): RDD[T] = { + rdd.flatMap{x => val r = new Random() + if (r.nextInt(10) == 0) { + Some(x) + } else { + None + }} + } + + /** + * Custom random sample with RNG. This is intended as an example of how to save setup overhead. + */ + def customSampleInput[T: ClassTag](rdd: RDD[T]): RDD[T] = { + //tag::mapPartitions[] + rdd.mapPartitions{itr => + // Only create one RNG per partition + val r = new Random() + itr.filter(x => r.nextInt(10) == 0) + } + //end::mapPartitions[] + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala new file mode 100644 index 0000000..a781ecd --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala @@ -0,0 +1,46 @@ +/** + * Accumulator examples. Computes the total and the maximum fuzzyness of the pandas. Very contrived.
+ */ +package com.highperformancespark.examples.transformations + +import com.highperformancespark.examples.dataframe.RawPanda + +import org.apache.spark._ +import org.apache.spark.rdd._ + +object Accumulators { + /** + * Compute the total fuzzyness with an accumulator while generating an id and zip pair for sorting + */ + //tag::sumFuzzyAcc[] + def computeTotalFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): (RDD[(String, Long)], Double) = { + val acc = sc.accumulator(0.0) // Create an accumulator with the initial value of 0.0 + val transformed = rdd.map{x => acc += x.attributes(0); (x.zip, x.id)} + // accumulator still has zero value + transformed.count() // force evaluation + // Note: This example is dangerous since the transformation may be evaluated multiple times + (transformed, acc.value) + } + //end::sumFuzzyAcc[] + + /** + * Compute the max fuzzyness with an accumulator while generating an id and zip pair for sorting + */ + //tag::maxFuzzyAcc[] + def computeMaxFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): (RDD[(String, Long)], Double) = { + object MaxDoubleParam extends AccumulatorParam[Double] { + override def zero(initValue: Double) = initValue + override def addInPlace(r1: Double, r2: Double): Double = { + Math.max(r1, r2) + } + } + // Create an accumulator with the initial value of Double.MinValue + val acc = sc.accumulator(Double.MinValue)(MaxDoubleParam) + val transformed = rdd.map{x => acc += x.attributes(0); (x.zip, x.id)} + // accumulator still has Double.MinValue + transformed.count() // force evaluation + // Note: This example is dangerous since the transformation may be evaluated multiple times + (transformed, acc.value) + } + //end::maxFuzzyAcc[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala b/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala new file mode 100644 index 0000000..d341cb8 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala @@ -0,0 +1,39 @@ + +package com.highperformancespark.examples.transformations + +import org.apache.spark.rdd.RDD + + +object NarrowAndWide { + + //toDO: Probably should write some sort of test for this. + //this is used in chapter 4 for the stage diagram + def sillySparkProgram(rdd1 : RDD[Int]) = { + + //tag::narrowWide[] + + //Narrow dependency. Map the rdd to tuples of (x, 1) + val rdd2 = rdd1.map((_, 1)) + //wide dependency groupByKey + val rdd3 = rdd2.groupByKey() + //end::narrowWide[] + + rdd3 + } + //this is used in chapter two for the stage diagram. + + //tag::stageDiagram[] + def simpleSparkProgram(rdd : RDD[Double]): Long ={ + //stage1 + rdd.filter(_< 1000.0) + .map(x => (x , x) ) + //stage2 + .groupByKey() + .map{ case(value, groups) => (groups.sum, value)} + //stage 3 + .sortByKey() + .count() + } + //end::stageDiagram[] + +} diff --git a/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala b/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala new file mode 100644 index 0000000..c89653a --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala @@ -0,0 +1,44 @@ +package com.highperformancespark.examples.wordcount + +/** + * What sort of big data book would this be if we didn't mention wordcount? 
+ */ +import org.apache.spark.rdd._ + +object WordCount { + // bad idea: uses group by key + def badIdea(rdd: RDD[String]): RDD[(String, Int)] = { + val words = rdd.flatMap(_.split(" ")) + val wordPairs = words.map((_, 1)) + val grouped = wordPairs.groupByKey() + val wordCounts = grouped.mapValues(_.sum) + wordCounts + } + + // good idea: doesn't use group by key + //tag::simpleWordCount[] + def simpleWordCount(rdd: RDD[String]): RDD[(String, Int)] = { + val words = rdd.flatMap(_.split(" ")) + val wordPairs = words.map((_, 1)) + val wordCounts = wordPairs.reduceByKey(_ + _) + wordCounts + } + //end::simpleWordCount + + /** + * Come up with word counts but filter out the illegal tokens and stop words + */ + //tag::wordCountStopwords[] + def withStopWordsFiltered(rdd : RDD[String], illegalTokens : Array[Char], + stopWords : Set[String]): RDD[(String, Int)] = { + val seperators = illegalTokens ++ Array[Char](' ') + val tokens: RDD[String] = rdd.flatMap(_.split(seperators). + map(_.trim.toLowerCase)) + val words = tokens.filter(token => + !stopWords.contains(token) && (token.length > 0) ) + val wordPairs = words.map((_, 1)) + val wordCounts = wordPairs.reduceByKey(_ + _) + wordCounts + } + //end::wordCountStopwords[] +} diff --git a/src/test/java/com/highperformancespark/examples/JavaInteropTest.java b/src/test/java/com/highperformancespark/examples/JavaInteropTest.java new file mode 100644 index 0000000..66318f7 --- /dev/null +++ b/src/test/java/com/highperformancespark/examples/JavaInteropTest.java @@ -0,0 +1,43 @@ +package com.highperformancespark.examples; + +import com.holdenkarau.spark.testing.SharedJavaSparkContext; + +import scala.Tuple2; + +import org.apache.spark.rdd.RDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaSparkContext; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.*; + +import org.junit.Test; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class JavaInteropTest extends SharedJavaSparkContext { + + @Test + public void wrapPairRDDTest() { + JavaInteropTestHelper helper = new JavaInteropTestHelper(sc()); + JavaInterop ji = new JavaInterop(); + RDD> rdd = helper.generateMiniPairRDD(); + JavaPairRDD prdd = ji.wrapPairRDD(rdd); + List> expected = Arrays.asList(new Tuple2("panda", 12L)); + assertEquals(expected, prdd.collect()); + } + + @Test + public void wrapPairRDDFakeCtTest() { + JavaInteropTestHelper helper = new JavaInteropTestHelper(sc()); + JavaInterop ji = new JavaInterop(); + RDD> rdd = helper.generateMiniPairRDD(); + JavaPairRDD prdd = ji.wrapPairRDDFakeCt(rdd); + List> expected = Arrays.asList(new Tuple2("panda", 12L)); + assertEquals(expected, prdd.collect()); + } +} diff --git a/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java b/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java new file mode 100644 index 0000000..b0d4bdc --- /dev/null +++ b/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java @@ -0,0 +1,151 @@ +package com.highperformancespark.examples.dataframe; + +import com.highperformancespark.examples.objects.JavaPandaInfo; +import com.highperformancespark.examples.objects.JavaPandas; +import com.highperformancespark.examples.objects.JavaRawPanda; +import com.holdenkarau.spark.testing.JavaDataFrameSuiteBase; +import org.apache.spark.sql.DataFrame; +import 
org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.*; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.*; + +public class JavaHappyPandasTest extends JavaDataFrameSuiteBase { + String toronto = "toronto"; + String sandiego = "san diego"; + String virginia = "virginia"; + + List pandaInfoList = Arrays.asList( + new JavaPandaInfo(toronto, "giant", 1, 2), + new JavaPandaInfo(sandiego, "red", 2, 3), + new JavaPandaInfo(virginia, "black", 1, 10) + ); + + List rawPandaList = Arrays.asList( + new JavaRawPanda(10L, "94110", "giant", true, Arrays.asList(1.0, 0.9)), + new JavaRawPanda(11L, "94110", "red", true, Arrays.asList(1.0, 0.9))); + + List pandasList = Arrays.asList( + new JavaPandas("bata", "10010", 10, 2), + new JavaPandas("wiza", "10010", 20, 4), + new JavaPandas("dabdob", "11000", 8, 2), + new JavaPandas("hanafy", "11000", 15, 7), + new JavaPandas("hamdi", "11111", 20, 10) + ); + + @Test + public void simpleSelfJoinTest() { + DataFrame inputDF = sqlContext().createDataFrame(pandasList, JavaPandas.class); + DataFrame result = JavaHappyPandas.selfJoin(inputDF).select("a.name", "b.name"); + List resultList = result.collectAsList(); + + resultList.stream().forEach(row -> assertEquals(row.getString(0), row.getString(1))); + } + + @Test + public void verifyhappyPandasPercentage() { + List expectedList = Arrays.asList(RowFactory.create(toronto, 0.5), + RowFactory.create(sandiego, 2 / 3.0), RowFactory.create(virginia, 1/10.0)); + DataFrame expectedDF = sqlContext().createDataFrame( + expectedList, new StructType( + new StructField[]{ + new StructField("place", DataTypes.StringType, true, Metadata.empty()), + new StructField("percentHappy", DataTypes.DoubleType, true, Metadata.empty()) + })); + + DataFrame inputDF = sqlContext().createDataFrame(pandaInfoList, JavaPandaInfo.class); + DataFrame resultDF = JavaHappyPandas.happyPandasPercentage(inputDF); + + assertDataFrameApproximateEquals(expectedDF, resultDF, 1E-5); + } + + @Test + public void encodePandaType() { + DataFrame inputDF = sqlContext().createDataFrame(rawPandaList, JavaRawPanda.class); + DataFrame resultDF = JavaHappyPandas.encodePandaType(inputDF); + + List expectedRows = Arrays.asList(RowFactory.create(10L, 0), RowFactory.create(11L, 1)); + DataFrame expectedDF = sqlContext().createDataFrame(expectedRows, new StructType(new StructField[]{ + new StructField("id", DataTypes.LongType, false, Metadata.empty()), + new StructField("encodedType", DataTypes.IntegerType, false, Metadata.empty()) + })); + + assertDataFrameEquals(expectedDF, resultDF); + } + + @Test + public void happyPandasPlaces() { + DataFrame inputDF = sqlContext().createDataFrame(pandaInfoList, JavaPandaInfo.class); + DataFrame resultDF = JavaHappyPandas.happyPandasPlaces(inputDF); + + List expectedRows = Arrays.asList( + new JavaPandaInfo(toronto, "giant", 1, 2), + new JavaPandaInfo(sandiego, "red", 2, 3)); + DataFrame expectedDF = sqlContext().createDataFrame(expectedRows, JavaPandaInfo.class); + + assertDataFrameEquals(expectedDF, resultDF); + } + + @Test + public void maxPandaSizePerZip() { + DataFrame inputDF = sqlContext().createDataFrame(pandasList, JavaPandas.class); + DataFrame resultDF = JavaHappyPandas.maxPandaSizePerZip(inputDF); + + List expectedRows = Arrays.asList( + RowFactory.create(pandasList.get(1).getZip(), pandasList.get(1).getPandaSize()), + RowFactory.create(pandasList.get(3).getZip(), 
pandasList.get(3).getPandaSize()), + RowFactory.create(pandasList.get(4).getZip(), pandasList.get(4).getPandaSize()) + ); + DataFrame expectedDF = sqlContext().createDataFrame(expectedRows, + new StructType( + new StructField[]{ + new StructField("zip", DataTypes.StringType, true, Metadata.empty()), + new StructField("max(pandaSize)", DataTypes.IntegerType, true, Metadata.empty()) + } + )); + + assertDataFrameEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip")); + } + + @Test + public void complexAggPerZip() { + DataFrame inputDF = sqlContext().createDataFrame(pandasList, JavaPandas.class); + DataFrame resultDF = JavaHappyPandas.minMeanSizePerZip(inputDF); + + List expectedRows = Arrays.asList( + RowFactory.create(pandasList.get(1).getZip(), pandasList.get(0).getPandaSize(), 15.0), + RowFactory.create(pandasList.get(3).getZip(), pandasList.get(2).getPandaSize(), 11.5), + RowFactory.create(pandasList.get(4).getZip(), pandasList.get(4).getPandaSize(), 20.0)); + + DataFrame expectedDF = sqlContext().createDataFrame(expectedRows, + new StructType( + new StructField[]{ + new StructField("zip", DataTypes.StringType, true, Metadata.empty()), + new StructField("min(pandaSize)", DataTypes.IntegerType, true, Metadata.empty()), + new StructField("avg(pandaSize)", DataTypes.DoubleType, true, Metadata.empty()) + } + )); + + assertDataFrameApproximateEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip"), 1E-5); + } + + @Test + public void simpleSQLExample() { + DataFrame inputDF = sqlContext().createDataFrame(pandasList, JavaPandas.class); + DataFrame resultDF = JavaHappyPandas.simpleSqlExample(inputDF); + + List expectedList = Arrays.asList( + pandasList.get(0), pandasList.get(2) + ); + DataFrame expectedDF = sqlContext().createDataFrame(expectedList, JavaPandas.class); + + assertDataFrameEquals(expectedDF, resultDF); + } + +} \ No newline at end of file diff --git a/src/test/java/com/highperformancespark/examples/goldilocks/JavaQuantileOnlyArtisanalTest.java b/src/test/java/com/highperformancespark/examples/goldilocks/JavaQuantileOnlyArtisanalTest.java new file mode 100644 index 0000000..98110d0 --- /dev/null +++ b/src/test/java/com/highperformancespark/examples/goldilocks/JavaQuantileOnlyArtisanalTest.java @@ -0,0 +1,42 @@ +package com.highperformancespark.examples.goldilocks; + +import com.google.common.collect.Sets; +import com.highperformancespark.examples.objects.JavaGoldiLocksRow; +import com.holdenkarau.spark.testing.SharedJavaSparkContext; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.SQLContext; +import org.junit.Test; + +import java.util.*; + +import static junit.framework.Assert.assertEquals; + +public class JavaQuantileOnlyArtisanalTest extends SharedJavaSparkContext { + + private List inputList = Arrays.asList( + new JavaGoldiLocksRow(0.0, 4.5, 7.7, 5.0), + new JavaGoldiLocksRow(1.0, 5.5, 6.7, 6.0), + new JavaGoldiLocksRow(2.0, 5.5, 1.5, 7.0), + new JavaGoldiLocksRow(3.0, 5.5, 0.5, 7.0), + new JavaGoldiLocksRow(4.0, 5.5, 0.5, 8.0)); + + @Test + public void goldiLocksFirstTry() { + SQLContext sqlContext = new SQLContext(jsc()); + DataFrame input = sqlContext.createDataFrame(inputList, JavaGoldiLocksRow.class); + Map> secondAndThird = JavaGoldiLocksFirstTry.findRankStatistics(input, Arrays.asList(2L, 3L)); + + Map> expectedResult = new HashMap<>(); + expectedResult.put(0, new HashSet<>(Arrays.asList(1.0, 2.0))); + expectedResult.put(1, new HashSet<>(Arrays.asList(5.5, 5.5))); + expectedResult.put(2, new HashSet<>(Arrays.asList(0.5, 1.5))); + 
expectedResult.put(3, new HashSet<>(Arrays.asList(6.0, 7.0))); + + for (Map.Entry> entry: secondAndThird.entrySet()) { + Set resultSet = Sets.newHashSet(entry.getValue()); + Set expectedSet = expectedResult.get(entry.getKey()); + + assertEquals(expectedSet, resultSet); + } + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/GoldiLocks/EvaluationTests.scala b/src/test/scala/com/high-performance-spark-examples/GoldiLocks/EvaluationTests.scala new file mode 100644 index 0000000..c635184 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/GoldiLocks/EvaluationTests.scala @@ -0,0 +1,94 @@ +package com.highperformancespark.examples.goldilocks + +import com.holdenkarau.spark.testing.SharedSparkContext +import org.apache.spark.rdd.RDD +import org.scalatest.FunSuite + +class EvaluationTests extends FunSuite with SharedSparkContext { + val doubleList = Array(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0) + val keyValuePairs = Array(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0).zipWithIndex + val path = "target/testResults" + test("MapValues preserves Partitioning "){ + + val data: RDD[(Double, Int )] = sc.parallelize(keyValuePairs) + // tag::MapValues[] + val sortedData = data.sortByKey() + val mapValues: RDD[(Double, String)] = sortedData.mapValues(_.toString) + assert(mapValues.partitioner.isDefined, "Using Map Values preserves partitioning") + + val map = sortedData.map( pair => (pair._1, pair._2.toString)) + assert(!map.partitioner.isDefined, "Using map does not preserve partitioning") + // end::MapValues[] + } + + test( "Subtract Behavior "){ + // tag::Subtract[] + val a = Array(1, 2, 3 ,4 ,4 ,4 ,4 ) + val b = Array(3, 4 ) + val rddA = sc.parallelize(a) + val rddB = sc.parallelize(b) + val rddC = rddA.subtract(rddB) + assert(rddC.count() < rddA.count() - rddB.count()) + // end::Subtract[] + } + + test( "Intersection Behavior "){ + // tag::Intersect[] + val a = Array(1, 2, 3 ,4 ,4 ,4 ,4 ) + val b = Array(3, 4 ) + val rddA = sc.parallelize(a) + val rddB = sc.parallelize(b) + val intersection = rddA.intersection(rddB) + val subtraction = rddA.subtract(rddB) + val union = intersection.union(subtraction) + assert(!rddA.collect().sorted.sameElements(union.collect().sorted)) + // end::Intersect[] + } + + test("Itereative Computations "){ + def RMSE(rdd : RDD[(Int, Int )]) = { + val n = rdd.count() + math.sqrt(rdd.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / n) + } + + val validationSet = sc.parallelize(keyValuePairs) + + // tag::iterativeComp[] + val testSet: Array[RDD[(Double, Int)]] = Array(validationSet.mapValues(_ + 1), validationSet.mapValues(_ + 2), validationSet) + validationSet.persist() //persist since we are using this RDD several times + val errors = testSet.map( rdd => { + RMSE(rdd.join(validationSet).values) + }) + // end::iterativeComp[] + + //the one where we didn't change anything should have the lowest root mean squared error + assert(errors.min == errors(2)) + + } + + test( "Two actions without caching ") { + val rddA: RDD[(Double, Int)] = sc.parallelize(keyValuePairs) + + // tag::TwoActions[] + val sorted = rddA.sortByKey() + val count = sorted.count() + val sample: Long = count / 10 + sorted.take(sample.toInt) + // end::TwoActions[] + } + + test( "Two actions with caching "){ + val rddA: RDD[(Double, Int)] = sc.parallelize(keyValuePairs) + // tag::TwoActionsCache[] + val sorted = rddA.sortByKey() + val count = sorted.count() + val sample: Long = count / 10 + rddA.persist() + sorted.take(sample.toInt) + // end::TwoActionsCache[] + } + + + +} + 
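The two "Two actions" tests that close EvaluationTests.scala are the motivating case for explicit caching: without it, the second action may redo work from the first. A minimal, self-contained sketch of the intended pattern (object and value names here are illustrative and not taken from this diff):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

object TwoActionsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("two-actions-sketch").setMaster("local[2]"))
    val pairs = sc.parallelize(1 to 1000).map(x => (x.toDouble, x))
    // Persist the RDD that both actions share, not the raw input,
    // so the sortByKey stage is only evaluated once.
    val sorted = pairs.sortByKey().persist(StorageLevel.MEMORY_ONLY)
    val count = sorted.count()                    // first action fills the cache
    val sample = sorted.take((count / 10).toInt)  // second action reads cached partitions
    println(sample.mkString(","))
    sc.stop()
  }
}

Persisting sorted rather than the unsorted input is the detail that keeps the second action from recomputing the shuffle.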
diff --git a/src/test/scala/com/high-performance-spark-examples/GoldiLocks/QuantileOnlyArtisanalTest.scala b/src/test/scala/com/high-performance-spark-examples/GoldiLocks/QuantileOnlyArtisanalTest.scala new file mode 100644 index 0000000..5126ce4 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/GoldiLocks/QuantileOnlyArtisanalTest.scala @@ -0,0 +1,97 @@ +package com.highperformancespark.examples.goldilocks + +import com.holdenkarau.spark.testing.SharedSparkContext +import org.apache.spark.sql.SQLContext +import org.scalatest.FunSuite + +// tag::MAGIC_PANDA[] +class QuantileOnlyArtisanalTest extends FunSuite with SharedSparkContext { + + val inputList = List( + GoldiLocksRow(0.0, 4.5, 7.7, 5.0), + GoldiLocksRow(1.0, 5.5, 6.7, 6.0), + GoldiLocksRow(2.0, 5.5, 1.5, 7.0), + GoldiLocksRow(3.0, 5.5, 0.5, 7.0), + GoldiLocksRow(4.0, 5.5, 0.5, 8.0) + ) + + test("Goldilocks first try ") { + val sqlContext = new SQLContext(sc) + val input = sqlContext.createDataFrame(inputList) + val secondAndThird = GoldiLocksFirstTry.findRankStatistics(input, targetRanks = List(2L, 3L)) + val expectedResult = Map[Int, Set[Double]]( + 0 -> Set(1.0, 2.0), + 1 -> Set(5.5, 5.5), + 2 -> Set(0.5, 1.5), + 3 -> Set(6.0, 7.0)) + secondAndThird.foreach(x => println( x._1 +"," + x._2.mkString(" "))) + assert(expectedResult.forall{case ((index, expectedRanks)) => + secondAndThird.get(index).get.toSet.equals(expectedRanks)}) + } + + //tests the edge case in which one partition does not contain any of the elements in one column + test("Goldilocks first try multiplePartitions") { + import org.scalatest.PrivateMethodTester._ + val testData = sc.parallelize(List(1.0, 2.0, 3.0, 4.0).map(x => (x, x)), 3) + val mapPartitions = testData.mapPartitionsWithIndex { + case (index, iter) => + val key = if (index == 1) 1 else 0 + iter.map(x => (x._1, key)) + } + + val getColumnsFreqPerPartition = PrivateMethod[ Array[(Int, Array[Long])]]('getColumnsFreqPerPartition) + val totals = GoldiLocksFirstTry invokePrivate getColumnsFreqPerPartition(mapPartitions, 2) + + totals.foreach(x => println(x._1 + " : " + x._2.mkString(" "))) + val getRanksLocationsWithinEachPart = + PrivateMethod[Array[(Int, List[(Int, Long)])]]('getRanksLocationsWithinEachPart) + + val locations = GoldiLocksFirstTry invokePrivate getRanksLocationsWithinEachPart(List(1L), totals, 2) + locations.foreach(x => println(x._1 + " : " + x._2.mkString(" "))) + + //assert that there is nothing in the column with index 1 on the second partition + assert(totals(1)._2(0) == 0 ) + + val firstPartition = locations(0)._2 + //assertFirstPartitionOnlyContains a target rank for the for columnIndex 0, at index 1 + assert(firstPartition.toSet.equals(Set((0,1))) ) + + //assertSecondPartition only contains rank for columnIndex 1, at index 1 + val secondPartition = locations(1)._2 + assert(secondPartition.toSet.equals(Set((1,1))) ) + + //assert ThirdPartition contains no locations + val thirdPartition = locations(2)._2 + assert(thirdPartition.toSet.equals(Set())) + assert(locations.length == 3) + } + + test("GoldiLocks With Hashmap ") { + val sqlContext = new SQLContext(sc) + val input = sqlContext.createDataFrame(inputList) + val secondAndThird = GoldiLocksWithHashMap.findRankStatistics(input, targetRanks = List(2L, 3L)) + val expectedResult = Map[Int, Set[Double]]( + 0 -> Set(1.0, 2.0), + 1 -> Set(5.5, 5.5), + 2 -> Set(0.5, 1.5), + 3 -> Set(6.0, 7.0)) + secondAndThird.foreach(x => println( x._1 +"," + x._2.mkString(" "))) + assert(expectedResult.forall{case ((index, 
expectedRanks)) => + secondAndThird.get(index).get.toSet.equals(expectedRanks)}) + } + + test("Secondary Sort"){ + val data = sc.parallelize(Range.apply(0, 10)).flatMap( i => List(20.0, 30.0 , 40.0 ).map(x => ((x, i), 1L ))) + val r = SecondarySort.groupByKeyAndSortBySecondaryKey(data, 3) + r.collect().foreach( v => println( v)) + val rSorted = r.collect().sortWith( + lt = (a, b) => a._1.toDouble > b._1.toDouble ) + assert(r.collect().zipWithIndex.forall{ + case (((key, list), index )) => rSorted(index)._1.equals(key) + }) + } + +} +// end::MAGIC_PANDA[] + +case class GoldiLocksRow(pandaId: Double, softness: Double, fuzziness: Double, size: Double) \ No newline at end of file diff --git a/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala b/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala new file mode 100644 index 0000000..c6d64fe --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala @@ -0,0 +1,291 @@ +/** + * Happy Panda Example for DataFrames. Computes the % of happy pandas. Very contrived. + */ +package com.highperformancespark.examples.dataframe + +import com.highperformancespark.examples.dataframe.HappyPandas.{PandaInfo, Pandas} +import com.holdenkarau.spark.testing._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import org.scalatest.Matchers._ + +import scala.collection.mutable +import scala.util.Random + +class HappyPandasTest extends DataFrameSuiteBase { + val toronto = "toronto" + val sandiego = "san diego" + val virginia = "virginia" + val pandaInfoList = List( + PandaInfo(toronto, "giant", 1, 2), + PandaInfo(sandiego, "red", 2, 3), + PandaInfo(virginia, "black", 1, 10)) + + val rawPandaList = List( + RawPanda(10L, "94110", "giant", true, Array(1.0, 0.9)), + RawPanda(11L, "94110", "red", true, Array(1.0, 0.9))) + + val pandasList = List(Pandas("bata", "10010", 10, 2), + Pandas("wiza", "10010", 20, 4), + Pandas("dabdob", "11000", 8, 2), + Pandas("hanafy", "11000", 15, 7), + Pandas("hamdi", "11111", 20, 10)) + + val pandaPlaces = List(PandaPlace("toronto", rawPandaList.toArray)) + + test("simple self join test") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(pandasList) + val result = HappyPandas.selfJoin(inputDF).select($"a.name", $"b.name") + val rez = result.collect() + rez.foreach{x => assert(x(0) == x(1))} + } + + test("simple explode test") { + val inputDF = sqlContext.createDataFrame(pandaPlaces) + val pandaInfo = sqlContext.createDataFrame(rawPandaList) + val expectedDf = pandaInfo.select((pandaInfo("attributes")(0) / pandaInfo("attributes")(1)).as("squishyness")) + val result = HappyPandas.squishPandaFromPace(inputDF) + + assertDataFrameApproximateEquals(expectedDf, result, 1E-5) + } + + //tag::approxEqualDataFrames[] + + test("verify simple happy pandas Percentage") { + val expectedList = List(Row(toronto, 0.5), Row(sandiego, 2/3.0), Row(virginia, 1/10.0)) + val expectedDf = createDF(expectedList, ("place", StringType), + ("percentHappy", DoubleType)) + + val inputDF = sqlContext.createDataFrame(pandaInfoList) + val resultDF = HappyPandas.happyPandasPercentage(inputDF) + + assertDataFrameApproximateEquals(expectedDf, resultDF, 1E-5) + } + //end::approxEqualDataFrames[] + + test("verify approx by hand") { + val inputDF = sqlContext.createDataFrame(pandaInfoList) + val resultDF = HappyPandas.happyPandasPercentage(inputDF) + val resultRows = resultDF.collect() + + 
val expectedRows = List(Row(toronto, 0.5), Row(sandiego, 2/3.0), Row(virginia, 1/10.0)) + + //tag::approxEqualRow[] + assert(expectedRows.length === resultRows.length) + expectedRows.zip(resultRows).foreach{case (r1, r2) => + assert(r1(0) === r2(0)) + assert(r1.getDouble(1) === (r2.getDouble(1) +- 0.001)) + } + //end::approxEqualRow[] + } + + test("test encode Panda type") { + val inputDF = sqlContext.createDataFrame(rawPandaList) + val resultDF = HappyPandas.encodePandaType(inputDF) + + val expectedRows = List(Row(10L, 0), Row(11L, 1)) + val expectedDF = createDF3(expectedRows, ("id", LongType, false), + ("encodedType", IntegerType, false)) + + assertDataFrameEquals(expectedDF, resultDF) + } + + //tag::exactEqualDataFrames[] + test("verify exact equality") { + // test minHappyPandas + val inputDF = sqlContext.createDataFrame(pandaInfoList) + val result = HappyPandas.minHappyPandas(inputDF, 2) + val resultRows = result.collect() + + val expectedRows = List(Row(sandiego, "red", 2, 3)) + assert(expectedRows === resultRows) + } + //end::exactEqualDataFrames[] + + test("test happyPandasPlaces") { + val inputDF = sqlContext.createDataFrame(pandaInfoList) + val resultDF = HappyPandas.happyPandasPlaces(inputDF) + + val expectedRows = List(PandaInfo(toronto, "giant", 1, 2), + PandaInfo(sandiego, "red", 2, 3)) + val expectedDF = sqlContext.createDataFrame(expectedRows) + + assertDataFrameEquals(expectedDF, resultDF) + } + + test("test maxPandaSizePerZip") { + val inputDF = sqlContext.createDataFrame(pandasList) + val resultDF = HappyPandas.maxPandaSizePerZip(inputDF) + + val expectedRows = List(Row(pandasList(1).zip, pandasList(1).pandaSize), + Row(pandasList(3).zip, pandasList(3).pandaSize), + Row(pandasList(4).zip, pandasList(4).pandaSize)) + val expectedDF = createDF(expectedRows, ("zip", StringType), + ("max(pandaSize)", IntegerType)) + + assertDataFrameEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip")) + } + + test("test minMaxPandaSizePerZip"){ + val inputDF = sqlContext.createDataFrame(pandasList) + val resultDF = HappyPandas.minMaxPandaSizePerZip(inputDF) + + val expectedRows = List( + Row(pandasList(1).zip, pandasList(0).pandaSize, pandasList(1).pandaSize), + Row(pandasList(3).zip, pandasList(2).pandaSize, pandasList(3).pandaSize), + Row(pandasList(4).zip, pandasList(4).pandaSize, pandasList(4).pandaSize)) + + val expectedDF = createDF(expectedRows, ("zip", StringType), + ("min(pandaSize)", IntegerType), + ("max(pandaSize)", IntegerType)) + + assertDataFrameEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip")) + } + + test("test minPandaSizeMaxAgePerZip") { + val inputDF = sqlContext.createDataFrame(pandasList) + val resultDF = HappyPandas.minPandaSizeMaxAgePerZip(inputDF) + + val expectedRows = List( + Row(pandasList(1).zip, pandasList(0).pandaSize, pandasList(1).age), + Row(pandasList(3).zip, pandasList(2).pandaSize, pandasList(3).age), + Row(pandasList(4).zip, pandasList(4).pandaSize, pandasList(4).age)) + + val expectedDF = createDF(expectedRows, ("zip", StringType), + ("min(pandaSize)", IntegerType), + ("max(age)", IntegerType)) + + assertDataFrameEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip")) + } + + test("test complexAggPerZip") { + val inputDF = sqlContext.createDataFrame(pandasList) + val resultDF = HappyPandas.minMeanSizePerZip(inputDF) + + val expectedRows = List( + Row(pandasList(1).zip, pandasList(0).pandaSize, 15.0), + Row(pandasList(3).zip, pandasList(2).pandaSize, 11.5), + Row(pandasList(4).zip, pandasList(4).pandaSize, 20.0)) + + val expectedDF = 
createDF(expectedRows, ("zip", StringType), + ("min(pandaSize)", IntegerType), + ("avg(pandaSize)", DoubleType)) + + assertDataFrameApproximateEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip"), 1e-5) + } + + + test("test Simple SQL example") { + val inputDF = sqlContext.createDataFrame(pandasList) + val resultDF = HappyPandas.simpleSqlExample(inputDF) + + val expectedRows = List(pandasList(0), pandasList(2)) + val expectedDF = sqlContext.createDataFrame(expectedRows) + + assertDataFrameEquals(expectedDF, resultDF) + } + + test("test Order Pandas") { + val inputDF = sqlContext.createDataFrame(pandasList) + val resultDF = HappyPandas.orderPandas(inputDF) + + val expectedRows = List(pandasList(2), pandasList(0), pandasList(3), + pandasList(4), pandasList(1)) + val expectedDF = sqlContext.createDataFrame(expectedRows) + + assertDataFrameEquals(expectedDF, resultDF) + } + + + test("test computeRelativePandaSizes") { + val inputPandaList = loadPandaStuffies() + val inputDF = sqlContext.createDataFrame(inputPandaList) + + val resultDF = HappyPandas.computeRelativePandaSizes(inputDF) + + val expectedDF = getExpectedPandasRelativeSize(inputPandaList, -10, 10) + + assertDataFrameApproximateEquals(expectedDF.orderBy("name"), resultDF.orderBy("name"), 1e-5) + } + + private def getExpectedPandasRelativeSize(pandaList: List[Pandas], start: Int, end: Int):DataFrame = { + + val expectedRows = + pandaList + .groupBy(_.zip) + .map(zipPandas => (zipPandas._1, zipPandas._2.sortBy(_.age))) + .flatMap(zipPandas => { + val pandas = zipPandas._2 + val length = pandas.size - 1 + val result = new mutable.MutableList[Row] + + for (i <- 0 to length) { + var totalSum = 0 + val startOffset = math.max(0, i + start) + val endOffset = math.min(length, i + end) + + for (j <- startOffset to endOffset) + totalSum += pandas(j).pandaSize + + val count = endOffset - startOffset + 1 + val average = totalSum.toDouble / count + + val panda = pandas(i) + result += Row(panda.name, panda.zip, panda.pandaSize, panda.age, panda.pandaSize - average) + } + + result + }).toList + + val expectedDF = createDF(expectedRows, ("name", StringType), + ("zip", StringType), + ("pandaSize", IntegerType), + ("age", IntegerType), + ("panda_relative_size", DoubleType)) + + expectedDF + } + + private def loadPandaStuffies(): List[Pandas] = { + val zipCount = 3 + val maxPandasPerZip = 15 + val maxPandaAge = 50 + val maxPandaSize = 500 + val random = new Random() + + val pandas = + (1 to zipCount) + .flatMap(zipId => { + val pandasCount = 1 + random.nextInt(maxPandasPerZip) + val zipName = s"zip($zipId)" + + (1 to pandasCount).map(pandaId => { + val name = s"panda($pandaId)($zipId)" + val size = 1 + random.nextInt(maxPandaSize) + val age = 1 + random.nextInt(maxPandaAge) + + Pandas(name, zipName, size, age) + } + ) + + }) + + pandas.toList + } + + + private def createDF(list: List[Row], fields: (String, DataType)*) = + sqlContext.createDataFrame(sc.parallelize(list), structType2(fields)) + + private def structType2(fields: Seq[(String, DataType)]) = + StructType(fields.map(f => StructField(f._1, f._2)).toList) + + + private def createDF3(list: List[Row], fields: (String, DataType, Boolean)*) = + sqlContext.createDataFrame(sc.parallelize(list), structType3(fields)) + + private def structType3(fields: Seq[(String, DataType, Boolean)]) = + StructType(fields.map(f => StructField(f._1, f._2, f._3)).toList) +} diff --git a/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala 
b/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala new file mode 100644 index 0000000..3a50e47 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala @@ -0,0 +1,39 @@ +/** + * Test our simple JNI + */ +package com.highperformancespark.examples.ffi + +import com.holdenkarau.spark.testing._ +import org.scalacheck.{Arbitrary, Gen} +import org.scalacheck.Prop.forAll +import org.scalatest.FunSuite +import org.scalatest.prop.Checkers +import org.scalatest.Matchers._ + +class NativeExampleSuite extends FunSuite with SharedSparkContext with Checkers { + test("local sum") { + //def magic2() { + val input = Array(1, 2, 3) + val sumMagic = new SumJNI() + val result = sumMagic.sum(input) + val expected = 6 + result === expected + } + + test("super simple test") { + val input = sc.parallelize(List(("hi", Array(1, 2, 3)))) + val result = NativeExample.jniSum(input).collect() + val expected = List(("hi", 6)) + result === expected + } + + test("native call should find sum correctly") { + val property = forAll(RDDGenerator.genRDD[(String, Array[Int])](sc)(Arbitrary.arbitrary[(String, Array[Int])])) { + rdd => + val expected = rdd.mapValues(_.sum) + val result = NativeExample.jniSum(rdd) + RDDComparisons.compareWithOrder(expected, result).isEmpty + } + check(property) + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala new file mode 100644 index 0000000..ba90abe --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala @@ -0,0 +1,24 @@ +/** + * Happy Panda Example for DataFrames. Computes the % of happy pandas. Very contrived. + */ +package com.highperformancespark.examples.transformations + +import com.highperformancespark.examples.dataframe.RawPanda + +import com.holdenkarau.spark.testing._ + +import org.scalatest.FunSuite + +class AccumulatorsTest extends FunSuite with SharedSparkContext { + test("accumulator max should function") { + val input = sc.parallelize(1.to(100)).map(x => RawPanda(1L, "1", "red", true, Array(x.toDouble))) + val (_, max) = Accumulators.computeMaxFuzzyNess(sc, input) + assert(max === 100.0) + } + + test("accumulator sum should function") { + val input = sc.parallelize(1.to(100)).map(x => RawPanda(1L, "1", "red", true, Array(x.toDouble))) + val (_, sum) = Accumulators.computeTotalFuzzyNess(sc, input) + assert(sum === 5050.0) + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala b/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala new file mode 100644 index 0000000..6d8edb9 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala @@ -0,0 +1,24 @@ +package com.highperformancespark.examples.wordcount + + +import com.holdenkarau.spark.testing.SharedSparkContext +import org.scalatest.FunSuite + +class WordCountTest extends FunSuite with SharedSparkContext { + test("word count with Stop Words Removed"){ + val wordRDD = sc.parallelize(Seq( + "How happy was the panda? 
You ask.", + "Panda is the most happy panda in all the #$!?ing land!")) + + val stopWords: Set[String] = Set("a", "the", "in", "was", "there", "she", "he") + val illegalTokens: Array[Char] = "#$%?!.".toCharArray + + val wordCounts = WordCount.withStopWordsFiltered(wordRDD, illegalTokens, stopWords) + val wordCountsAsMap = wordCounts.collectAsMap() + assert(!wordCountsAsMap.contains("the")) + assert(!wordCountsAsMap.contains("?")) + assert(!wordCountsAsMap.contains("#$!?ing")) + assert(wordCountsAsMap.contains("ing")) + assert(wordCountsAsMap.get("panda").get.equals(3)) + } +} diff --git a/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala b/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala new file mode 100644 index 0000000..4d983a6 --- /dev/null +++ b/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala @@ -0,0 +1,11 @@ +package com.highperformancespark.examples + + +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD + +class JavaInteropTestHelper(sc: SparkContext) { + def generateMiniPairRDD(): RDD[(String, Long)] = { + sc.parallelize(List(("panda", 12L))) + } +}
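JavaInteropTestHelper builds the pair RDD on the Scala side so that the Java tests above only exercise the wrapping logic. The JavaInterop class itself is not part of this diff, but a wrapper in that spirit can be written against public Spark APIs alone; the object and method below are assumptions for illustration, not the repository's implementation:

import scala.reflect.ClassTag

import org.apache.spark.api.java.JavaPairRDD
import org.apache.spark.rdd.RDD

object PairRDDWrapSketch {
  // The ClassTags are the piece a Java caller cannot easily supply, which is
  // why keeping this conversion on the Scala side keeps the Java tests short.
  def wrapPairRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): JavaPairRDD[K, V] =
    JavaPairRDD.fromRDD(rdd)
}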