Port GoldiLocksFirstTry to Java #54

Closed
Changes from all commits (238 commits)
b5d84c2
first tries.....
Oct 8, 2015
3e02f19
Would be good if we had a build system
holdenk Oct 8, 2015
991e8ed
Move GoldiLocks example
holdenk Oct 8, 2015
810e7c7
fix sbt build and remove some blank lines
holdenk Oct 8, 2015
3a63a68
Add a travis config file
holdenk Oct 8, 2015
7d77086
Yay it compiles
holdenk Oct 8, 2015
6b07f48
Update packages and mllib
holdenk Oct 8, 2015
326390b
Add a bit of a note about what the params are since the coffee wasn't…
holdenk Oct 8, 2015
2d3b951
Add an Artisanal (non spark-testing-base) test
holdenk Oct 8, 2015
1f7f78a
re-indent
holdenk Oct 8, 2015
93865a6
update tag format
holdenk Oct 8, 2015
e678b1e
use _ for tag
holdenk Oct 8, 2015
1971052
update tag to ::
holdenk Oct 10, 2015
d33c271
add two kind of simple examples
holdenk Oct 11, 2015
ab0f11b
stop the spark context
holdenk Oct 11, 2015
fe14bc8
fix stuff
holdenk Oct 11, 2015
ae8fdce
Add a quick function for generating some test like data
holdenk Oct 11, 2015
b150cc3
Add tags
holdenk Oct 11, 2015
1b1874e
Add some []s to the tags
holdenk Oct 13, 2015
ccf4c9d
refactoring Quantile to be a static class
Oct 15, 2015
1056ae9
"fixing the build with refactor"
Oct 16, 2015
923fb33
Add createHiveContext and createSqlContext
holdenk Oct 18, 2015
504ba6f
add a tag around the components
holdenk Oct 18, 2015
8fe754f
correct name is examples
holdenk Oct 18, 2015
4fa1643
Add a few more spark sql examples
holdenk Oct 21, 2015
32653b7
More SQL examples
holdenk Oct 25, 2015
9eb0e4f
Add panda encoding example, fix second min happy panda filter to make…
holdenk Oct 26, 2015
3732712
build fix
holdenk Oct 26, 2015
e81c0db
Show a quick example of specifying the schema
holdenk Oct 26, 2015
e2231de
enable fork
holdenk Oct 27, 2015
f8910d1
Add verify approx by hand example
holdenk Oct 27, 2015
927d81e
We don't use sudo anyways, disable it and turn on the cache for ci
holdenk Oct 27, 2015
a093fc2
Add a SampleData tool
holdenk Oct 30, 2015
f1e6a5e
Start adding more on happy pandas magic
holdenk Nov 2, 2015
53e385b
Joins start
holdenk Nov 2, 2015
3a3b064
refactoring goldilocks and adding comments
Oct 29, 2015
c6bd667
first implementation of secondary sort with abstract types
Oct 29, 2015
be9e11d
secondary sort tests
Oct 29, 2015
341c97a
creating real unit test for secondary sort
Oct 29, 2015
424d019
adding tags to secondary sort
Nov 9, 2015
9fe50e4
fixing dantaFrame
Nov 9, 2015
575cd92
Add ze tags
holdenk Nov 9, 2015
a4eaf78
Switch our include for spark testing base to new version, stop depend…
holdenk Nov 9, 2015
7107096
Add a SQL dataframe query example
holdenk Nov 9, 2015
4bb76d5
Add []'s to tags in secondary sort
holdenk Nov 9, 2015
a62b84e
fixing tags
Nov 9, 2015
984bbdc
Fix build error
holdenk Nov 10, 2015
f6ff65f
Add a simple wordcount example
holdenk Nov 10, 2015
d714333
adding final set of examples and fixing a few problems w SecondarySor…
Nov 22, 2015
8c52af1
Add UDF & UDAF example
holdenk Nov 29, 2015
3ecc265
More json loading examples
holdenk Dec 9, 2015
fe97ea8
Switch to non deprecated API and add package for test
holdenk Dec 9, 2015
9ffec08
Add some loadsave and querying sql examples
holdenk Dec 13, 2015
5b96227
Load/save parquet
holdenk Dec 19, 2015
0b0fc3a
sometime sbt builds make me sad
holdenk Dec 19, 2015
444d8e8
Switch PandaMagic to PandaInfo/PandaPlace
holdenk Dec 21, 2015
fb2eeb4
whoops some old references
holdenk Dec 21, 2015
2bc36e0
Add pandaInfo test cases
Dec 24, 2015
e6cd103
Add pandas test cases
Dec 28, 2015
8c5071f
Tag imports
holdenk Dec 29, 2015
64b5f02
Create configurable test case for computeRelativePandaSizes
Dec 31, 2015
356d784
Reduce computeRelativePandaSizes test case size
Dec 31, 2015
e4969d8
Merge pull request #5 from mahmoudhanafy/master
holdenk Dec 31, 2015
f4363e9
Add documentation and examples to GoldiLocksFirstTry
mahmoudhanafy Jan 4, 2016
68de0e9
Merge pull request #7 from mahmoudhanafy/add-documentation-tests
rachelwarren Jan 4, 2016
b9a2413
adding one test
Jan 4, 2016
f59fd59
Merge pull request #8 from high-performance-spark/add-test-case
rachelwarren Jan 6, 2016
7c54c22
Add documentation and examples to GoldiLocksWithHashMap
mahmoudhanafy Jan 6, 2016
48fd980
Merge pull request #9 from mahmoudhanafy/add-documentation-tests
holdenk Jan 7, 2016
633f68f
update the tags in the join example so we can split them up in the sq…
holdenk Jan 7, 2016
bd08e45
Add a shell script to launch a class with mysql jdbc connector (TODO:…
holdenk Jan 11, 2016
ca171e1
Note that mysql connector is GPLd
holdenk Jan 11, 2016
69f9eee
Update exclusions
holdenk Jan 11, 2016
24dc5f4
Upgrade to 1.6.0
holdenk Jan 12, 2016
04dec46
Change generate scaling data a bit
holdenk Jan 12, 2016
ca843d1
Start adding a simple perf
holdenk Jan 12, 2016
97b709f
Add some more bits for a simple perf test
holdenk Jan 12, 2016
e9b6d7f
Support making assembly
holdenk Jan 14, 2016
9984ce9
Fix simple perf test and fix generating data to have same length of p…
holdenk Jan 14, 2016
832ec57
Fix up random number generator and add a simple perf test example
holdenk Jan 15, 2016
e272d36
minor refactor
holdenk Jan 23, 2016
fd9571a
Merge pull request #16 from holdenk/upgrade-to-1.6
holdenk Jan 23, 2016
486d492
Merge pull request #17 from holdenk/benchmark
holdenk Jan 23, 2016
106fed2
Start adding mixed dataset example
holdenk Jan 15, 2016
93152b1
Fill in some stuff
holdenk Jan 23, 2016
f32b8b6
Merge pull request #18 from holdenk/initial-dataset
holdenk Jan 23, 2016
dbc2507
Add tags
holdenk Jan 23, 2016
d087624
Add an example of purely relational queries and one of a functional q…
holdenk Jan 25, 2016
a02608e
Adding test of evaluation for chapter 4
Jan 25, 2016
200e27d
change evaluation tests to use shared spark context
Jan 25, 2016
f62d8f7
Add a basic union example
holdenk Jan 26, 2016
1602708
Merge in master
holdenk Jan 26, 2016
cedc0cb
Include spark-csv in the old fashion maven coordinates way
holdenk Jan 26, 2016
3f7c92c
s/fuzzy/squishy/ (attribute 0 is squishy in the text)
holdenk Jan 26, 2016
8ca4a75
break up the SQLHiveComponent
holdenk Jan 26, 2016
2928c93
Use RawPanda accross places a bit more
holdenk Jan 26, 2016
9e68349
add a raw panda json
holdenk Jan 26, 2016
80ebcbb
adding caching example
Jan 26, 2016
78618ec
Update the schema a bit more (include the panda type in the schema so…
holdenk Jan 27, 2016
75489d1
make nicer simple examples
holdenk Jan 27, 2016
2ec56be
Update test now that the the encoder works on raw pandas
holdenk Jan 29, 2016
e939fb3
drop duplicate example
holdenk Jan 29, 2016
e40c890
Merge pull request #20 from holdenk/sql-chapter-updates
holdenk Jan 29, 2016
32dd5be
formatting
Jan 31, 2016
7b59eb1
fix biuld
Jan 31, 2016
fc1f08b
Merge pull request #19 from high-performance-spark/chapter4
rachelwarren Jan 31, 2016
11b172b
change simple perf test to compute the average
holdenk Feb 1, 2016
6c155fb
Add a sample for chapter 4 to show saving setup overhead (Bad sample …
holdenk Feb 1, 2016
7b10eb0
Merge pull request #21 from holdenk/custom-sample-itr
holdenk Feb 1, 2016
bee09df
Merge pull request #22 from holdenk/change-benchmark-to-average
holdenk Feb 2, 2016
110567e
some more tests
Jan 31, 2016
c6083c5
with tags this time
Feb 2, 2016
ed6ff81
Merge pull request #23 from high-performance-spark/chapter4
holdenk Feb 2, 2016
b5325b4
benchmarking works better when you compare the results :p
holdenk Feb 2, 2016
e931747
Add grouped timings
holdenk Feb 2, 2016
ee789a2
do pair of rdds
holdenk Feb 2, 2016
8afc12c
switch to count
holdenk Feb 2, 2016
a2f6a23
Resolve code inspection issues
mahmoudhanafy Feb 1, 2016
2ef9bea
Add accumulator example
holdenk Feb 3, 2016
784e482
move to the correct package
holdenk Feb 3, 2016
efa0c9b
Add a test
holdenk Feb 3, 2016
0bd06da
Merge pull request #26 from holdenk/add-accumulators-example
holdenk Feb 5, 2016
d94cffe
Revert pandas schema to original one
mahmoudhanafy Feb 5, 2016
5281e5a
match description in text
holdenk Feb 11, 2016
0bfb4c8
Add a select explode example
holdenk Feb 11, 2016
856fb61
Keep full list of imports
mahmoudhanafy Feb 11, 2016
db51dac
Merge pull request #24 from mahmoudhanafy/code-inspection
holdenk Feb 11, 2016
0c5d361
Merge branch 'master' of github.com:high-performance-spark/high-perfo…
holdenk Feb 11, 2016
0d71330
Class was renamed in merge
holdenk Feb 11, 2016
da471b4
Simplify mapPartitions example and add tag
holdenk Feb 11, 2016
2bc9e39
Merge pull request #32 from holdenk/add-custom-sample-map-partitions
holdenk Feb 11, 2016
79dcb3f
Merge branch 'master' into add-custom-sample-map-partitions-r3
holdenk Feb 11, 2016
bd48249
Add a simple broadcast example
holdenk Feb 11, 2016
ae6267f
Add the tag
holdenk Feb 11, 2016
83fbe8e
Merge pull request #33 from holdenk/add-custom-sample-map-partitions-r3
holdenk Feb 11, 2016
9a5ba38
Merge pull request #34 from holdenk/broadcast-example
holdenk Feb 11, 2016
25a92d6
[WIP] placeholder for adding iter to iter
Feb 6, 2016
dd96701
adding the iterator to iterator panda
Feb 11, 2016
3ee3dec
fix indent
Feb 11, 2016
0d3b63e
fix last comment
Feb 11, 2016
c6ec145
refactoring first try to use iter to iter
Feb 11, 2016
0317de1
Merge pull request #30 from high-performance-spark/chapter4
rachelwarren Feb 11, 2016
bbf51e1
code examples to go with diagrams
Feb 16, 2016
4233556
spacing
Feb 16, 2016
a1e2b94
Merge pull request #35 from high-performance-spark/chapter4
holdenk Feb 16, 2016
066ef72
Capitalize the string in comments
holdenk Feb 22, 2016
53d6a73
Show a coffee shop join
holdenk Feb 28, 2016
d59e730
Add max panda sizes on datasets in two ways
holdenk Mar 1, 2016
bfcd99d
kill extra blank line
holdenk Mar 1, 2016
1236126
Merge pull request #36 from holdenk/add-dataset-max-by-zip
holdenk Mar 1, 2016
84b12ad
Merge branch 'master' of github.com:holdenk/high-performance-spark-ex…
holdenk Mar 4, 2016
cd681c2
Merge branch 'master' of github.com:high-performance-spark/high-perfo…
holdenk Mar 4, 2016
bd2d985
Add a sample of describe
holdenk Mar 4, 2016
bcf2a1c
Merge pull request #37 from holdenk/master
holdenk Mar 4, 2016
4142543
Add a self join example
holdenk Mar 4, 2016
dc35316
fix tag
holdenk Mar 4, 2016
d4183a7
Merge pull request #38 from holdenk/dataset-self-join
holdenk Mar 4, 2016
8cb222f
oops swap char in tag
holdenk Mar 4, 2016
92cd502
Merge pull request #39 from holdenk/dataset-self-join
holdenk Mar 4, 2016
affa2d2
add word count thing with stop words
Mar 4, 2016
ea7c994
Add self join example
holdenk Mar 5, 2016
a70d33d
Merge pull request #41 from holdenk/df-self-join
holdenk Mar 5, 2016
6cbb262
add tag to original
Mar 5, 2016
fc8988d
Merge pull request #40 from high-performance-spark/wordCountStuff
rachelwarren Mar 5, 2016
81520ee
Show a load example
holdenk Mar 6, 2016
4e74d2f
Merge pull request #42 from holdenk/jsonrdd-load
holdenk Mar 6, 2016
e3ebd0e
Fix indentation add leftouterjoin example
holdenk Mar 7, 2016
f475af5
Merge pull request #43 from holdenk/outer-rdd-joins
holdenk Mar 7, 2016
ca7981e
Cast to double for fuzzyness
holdenk Mar 8, 2016
6f5e613
Add an append example
holdenk Mar 8, 2016
841fec5
change murh to squishyness
holdenk Mar 8, 2016
177348b
specify squishyness
holdenk Mar 8, 2016
e106926
Comment typo
holdenk Mar 9, 2016
9e01fc7
Long line break
holdenk Mar 17, 2016
63e69ab
long line break
holdenk Mar 17, 2016
52a7176
typo
holdenk Mar 19, 2016
ba15f69
Add a really basic note to the README
holdenk Apr 28, 2016
f2b8897
Bump scala version to 2.11.6 andd gfortran, binutils, and R toolcahin…
holdenk Apr 28, 2016
cbab776
Add Imap package
holdenk Apr 28, 2016
a5a8091
Add the two sum files
holdenk Apr 28, 2016
5666b45
Add JniNative plugin from Jakob Odersky
holdenk Apr 28, 2016
60baf28
Go back to 2.10.4 for hive thriftserver JAR :(
holdenk Apr 29, 2016
710299c
Start adding sum wrapper example
holdenk May 1, 2016
79162cd
Update JNI build using Jakob's library (thnx\!)
holdenk May 2, 2016
92b7c3b
Plumb through the call
holdenk May 2, 2016
1a50144
Add fromRDD example for datasets
holdenk May 9, 2016
e1ebf78
More JNI stuff - but jodersky is updating his JNI wrapper so lets chi…
holdenk May 10, 2016
77b53a2
Add more test magic
holdenk May 17, 2016
fabc1ad
Start adding a PySpark example
holdenk May 17, 2016
cc6230a
Add tags
holdenk May 17, 2016
7081534
Add more packages and cache them
holdenk May 17, 2016
d7fd878
urgh pip install pandas is sad pandas
holdenk May 17, 2016
ed53801
Cache sbt launchers
holdenk May 17, 2016
4b065ac
Remove trailing slash
holdenk May 17, 2016
37527b5
fix fetch
holdenk May 17, 2016
97ac4e7
Wait crap I'm an idiot
holdenk May 17, 2016
a143793
pre 2.0 support
holdenk May 17, 2016
53dc82a
Add log4j.properties file
holdenk May 17, 2016
ba4e0d4
Somewhat hacky update to Scala 2.11 (will be better once 2.0 comes ou…
holdenk May 17, 2016
0c94c7d
Fix sample scala code
holdenk May 17, 2016
8938e28
Update to new base makefile using nativeInit CMake
holdenk May 17, 2016
229af61
Update to new Arbitraryt generator
holdenk May 17, 2016
1bf8353
Merge pull request #45 from holdenk/ch3-show-df-rt-magics
holdenk May 17, 2016
fc3332c
Use native loader magics
holdenk May 17, 2016
2abc06f
Upgrade to latest test library and enable property check
holdenk May 17, 2016
d7b66ca
Merge pull request #46 from holdenk/going-beyond-scala-branch
holdenk May 18, 2016
6a85e15
Add cut lineage Scala example
holdenk May 18, 2016
6da3ec7
Merge pull request #50 from holdenk/ch3-df-rt-scala
holdenk May 18, 2016
7ad749d
Add a simple version of the Python perf test
holdenk May 18, 2016
f7bc3e5
Fix a bunch of the timing code
holdenk May 18, 2016
c3195d9
Enable GC for timing
holdenk May 19, 2016
90c1b66
test functions were being picked up by nose
holdenk May 19, 2016
329bc3d
Merge pull request #51 from holdenk/ch3-benchmark-in-python
holdenk May 19, 2016
193a6f7
try and add coverage
holdenk May 19, 2016
09b3d85
Merge branch 'master' of github.com:high-performance-spark/high-perfo…
holdenk May 19, 2016
6fcfcf3
Upgrade java version to 8 to use lambda expressions
mahmoudhanafy May 16, 2016
da2acab
Port HappyPandas to Java
mahmoudhanafy May 16, 2016
d276733
Create separate package for java beans
mahmoudhanafy May 18, 2016
c40b69d
Remove squishPandaFromPace example
mahmoudhanafy May 18, 2016
1e58afd
Add Java8 to .travis.yml
mahmoudhanafy May 19, 2016
e792bb3
Remove startJDBCServer example
May 19, 2016
a252514
Convert java methods to static
May 20, 2016
b887283
Add simple test cases for JavaHappyPandas
May 20, 2016
6ce7baf
Implements Serializable to all Java Beans
May 21, 2016
d7de259
Port LoadSave to Java
May 21, 2016
8a395d4
Merge pull request #49 from mahmoudhanafy/port-HappyPandas-to-Java
holdenk May 23, 2016
f338714
Port UDFs to Java
mahmoudhanafy May 23, 2016
6e4ea73
Fix evaluate average UDF
May 23, 2016
98cdc48
Merge pull request #52 from mahmoudhanafy/port-UDF-to-java
holdenk May 23, 2016
0b0ea92
Add tag for basicUDF to JavaUDFs.java
holdenk May 23, 2016
5662b36
Start adding Java interop example. TODO test and make sure the non-ca…
holdenk May 24, 2016
6bd9e65
Add some tests for the JavaInterop component
holdenk May 25, 2016
2393c59
Add tags so we can include it
holdenk May 25, 2016
d9aebfd
Merge pull request #53 from holdenk/java-interop
holdenk May 25, 2016
9fd5d78
Port GoldiLocksGroupByKey to Java
May 21, 2016
abd4365
Port GoldiLocksFirstTry to Java
May 22, 2016
512c616
Use SharedSparkContext at QuantileOnlyArtisanalTest
mahmoudhanafy May 25, 2016
40b8f69
Add test case for JavaGoldiLocksFirstTry
mahmoudhanafy May 25, 2016
6 changes: 6 additions & 0 deletions .gitignore
@@ -15,3 +15,9 @@ project/plugins/project/
# Scala-IDE specific
.scala_dependencies
.worksheet

# emacs stuff
\#*\#
\.\#*
*~
sbt/*launch*.jar
38 changes: 38 additions & 0 deletions .travis.yml
@@ -0,0 +1,38 @@
language: scala
sudo: false
cache:
  directories:
    - $HOME/.ivy2
    - $HOME/spark
    - $HOME/.cache/pip
    - $HOME/.sbt/launchers
scala:
  - 2.11.6
jdk:
  - oraclejdk8
addons:
  apt:
    sources:
      - ubuntu-toolchain-r-test
    packages:
      - pandas
      - numpy
      - gfortran
      - gcc
      - binutils
      - python-pip
r_packages:
  - Imap
before_install:
  - pip install --user codecov unittest2 nose pep8 pylint --download-cache $HOME/.pip-cache
script:
  - "export SPARK_CONF_DIR=./log4j/"
  - sbt clean coverage compile test
  - "[ -d spark ] || (mkdir spark && cd spark && wget http://d3kbcqa49mib13.cloudfront.net/spark-1.6.1-bin-hadoop2.6.tgz)"
  - "tar -xf ./spark/spark-1.6.1-bin-hadoop2.6.tgz"
  - "export SPARK_HOME=`pwd`/spark-1.6.1-bin-hadoop2.6"
  - "export PYTHONPATH=$SPARK_HOME/python:`ls -1 $SPARK_HOME/python/lib/py4j-*-src.zip`:$PYTHONPATH"
  - "nosetests --with-doctest --doctest-options=+ELLIPSIS --logging-level=INFO --detailed-errors --verbosity=2 --with-coverage --cover-html-dir=./htmlcov"
after_success:
  # For now no coverage report
  - codecov
3 changes: 3 additions & 0 deletions LICENSE
@@ -1,3 +1,6 @@
Individual components under resources are available under their own licenses.
* The MySQL connector is GPL-licensed
The source code in this repo is available under the Apache License
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
8 changes: 8 additions & 0 deletions README.md
@@ -1,2 +1,10 @@
# high-performance-spark-examples
Examples for High Performance Spark

# Building

Most of the examples can be built with sbt; the C and Fortran components depend on gcc, g77, and cmake.

# Tests

The full test suite depends on having the C and Fortran components built, as well as a local R installation.
90 changes: 90 additions & 0 deletions build.sbt
@@ -0,0 +1,90 @@
organization := "com.highperformancespark"

name := "examples"

publishMavenStyle := true

version := "0.0.1"

scalaVersion := "2.11.6"

crossScalaVersions := Seq("2.11.6")

javacOptions ++= Seq("-source", "1.8", "-target", "1.8")

sparkVersion := "1.6.1"

//tag::sparkComponents[]
// TODO(Holden): re-add hive-thriftserver post Spark 2.0
sparkComponents ++= Seq("core", "streaming", "mllib")
//end::sparkComponents[]
//tag::addSQLHiveComponent[]
sparkComponents ++= Seq("sql", "hive")
//end::addSQLHiveComponent[]


parallelExecution in Test := false

fork := true

javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:MaxPermSize=2048M", "-XX:+CMSClassUnloadingEnabled")

// additional libraries
libraryDependencies ++= Seq(
  "org.scalatest" %% "scalatest" % "2.2.1",
  "org.scalacheck" %% "scalacheck" % "1.12.4",
  "junit" % "junit" % "4.10",
  // Temporary hack until Spark 2.0
  "org.apache.spark" % "spark-hive-thriftserver_2.10" % "1.6.1" % "provided" intransitive(),
  //tag::sparkCSV[]
  "com.databricks" % "spark-csv_2.10" % "1.3.0",
  //end::sparkCSV[]
  "com.holdenkarau" % "spark-testing-base_2.11" % "1.6.1_0.3.3",
  "org.eclipse.jetty" % "jetty-util" % "9.3.2.v20150730",
  "org.codehaus.jackson" % "jackson-mapper-asl" % "1.8.8",
  "com.novocode" % "junit-interface" % "0.10" % "test->default")


scalacOptions ++= Seq("-deprecation", "-unchecked")

pomIncludeRepository := { x => false }

resolvers ++= Seq(
  "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/",
  "Spray Repository" at "http://repo.spray.cc/",
  "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/",
  "Akka Repository" at "http://repo.akka.io/releases/",
  "Twitter4J Repository" at "http://twitter4j.org/maven2/",
  "Apache HBase" at "https://repository.apache.org/content/repositories/releases",
  "Twitter Maven Repo" at "http://maven.twttr.com/",
  "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools",
  "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/",
  "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/",
  "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/",
  "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven",
  Resolver.sonatypeRepo("public"),
  Resolver.bintrayRepo("jodersky", "sbt-jni-macros"),
  "jodersky" at "https://dl.bintray.com/jodersky/maven/"
)

licenses := Seq("Apache License 2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html"))

mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) =>
  {
    case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard
    case m if m.startsWith("META-INF") => MergeStrategy.discard
    case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first
    case PathList("org", "apache", xs @ _*) => MergeStrategy.first
    case PathList("org", "jboss", xs @ _*) => MergeStrategy.first
    case "log4j.properties" => MergeStrategy.discard
    case "about.html" => MergeStrategy.rename
    case "reference.conf" => MergeStrategy.concat
    case _ => MergeStrategy.first
  }
}

// JNI

enablePlugins(JniNative)

sourceDirectory in nativeCompile := sourceDirectory.value
40 changes: 40 additions & 0 deletions conf/log4j.properties
@@ -0,0 +1,40 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Set everything to be logged to the console
log4j.rootCategory=ERROR, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Set the default spark-shell log level to ERROR. When running the spark-shell, the
# log level for this class is used to overwrite the root logger's log level, so that
# the user can have different defaults for the shell and regular Spark apps.
log4j.logger.org.apache.spark.repl.Main=ERROR

# Settings to quiet third party logs that are too verbose
log4j.logger.org.spark-project.jetty=ERROR
log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
log4j.logger.org.apache.parquet=ERROR
log4j.logger.parquet=ERROR

# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
38 changes: 38 additions & 0 deletions high_performance_pyspark/SQLLineage.py
@@ -0,0 +1,38 @@
"""
>>> from pyspark.context import SparkContext
>>> from pyspark.sql import SQLContext, Row, DataFrame
>>> sc = SparkContext('local', 'test')
...
>>> sc.setLogLevel("ERROR")
>>> sqlCtx = SQLContext(sc)
...
>>> rdd = sc.parallelize(range(1, 100)).map(lambda x: Row(i = x))
>>> df = rdd.toDF()
>>> df2 = cutLineage(df)
>>> df.head() == df2.head()
True
>>> df.schema == df2.schema
True
"""

from pyspark.sql import DataFrame

#tag::cutLineage[]
def cutLineage(df):
    """
    Cut the lineage of a DataFrame - used for iterative algorithms.

    .. Note: This uses internal members and may break between versions.
    """
    jRDD = df._jdf.toJavaRDD()
    jSchema = df._jdf.schema()
    jRDD.cache()
    sqlCtx = df.sql_ctx
    try:
        # Spark 2.0 name for the Java SQLContext
        javaSqlCtx = sqlCtx._jsqlContext
    except AttributeError:
        # Pre-2.0 name
        javaSqlCtx = sqlCtx._ssql_ctx
    newJavaDF = javaSqlCtx.createDataFrame(jRDD, jSchema)
    newDF = DataFrame(newJavaDF, sqlCtx)
    return newDF
#end::cutLineage[]
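
A usage sketch (not part of this diff): cutLineage is meant for iterative algorithms where the query plan grows on every pass. The loop below is illustrative only, assuming an existing SQLContext named sqlCtx; the column names and cut interval are arbitrary:

# Illustrative only: truncate the growing plan every few iterations
df = sqlCtx.range(0, 100)
for i in range(20):
    df = df.withColumn("i", df["id"] + i)  # each pass extends the lineage
    if i % 5 == 4:
        df = cutLineage(df)  # swap in a cached RDD scan with the same schema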
25 changes: 25 additions & 0 deletions high_performance_pyspark/__init__.py
@@ -0,0 +1,25 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


"""
Python version of selected examples from High Performance Spark
"""

import os
import sys

91 changes: 91 additions & 0 deletions high_performance_pyspark/simple_perf_test.py
@@ -0,0 +1,91 @@
# When running this example, make sure to include the built Scala jar:
# $SPARK_HOME/bin/pyspark --jars ./target/examples-0.0.1.jar --driver-class-path ./target/examples-0.0.1.jar
# This example illustrates how to interface Scala and Python code, but caution
# should be taken as it depends on many private members that may change in
# future releases of Spark.

from pyspark.sql.types import *
from pyspark.sql import DataFrame
import timeit
import time

def generate_scale_data(sqlCtx, rows, numCols):
    """
    Generate scale data for the performance test.

    This also illustrates calling custom Scala code from the driver.

    .. Note: This depends on many internal methods and may break between versions.
    """
    sc = sqlCtx._sc
    # Get the Java SQL Context, 2.0 and pre-2.0 syntax
    try:
        javaSqlCtx = sqlCtx._jsqlContext
    except AttributeError:
        javaSqlCtx = sqlCtx._ssql_ctx
    jsc = sc._jsc
    scalasc = jsc.sc()
    gateway = sc._gateway
    # Call a Java method that gives us back an RDD of JVM Rows (Int, Double).
    # While Python RDDs are wrapped Java RDDs (even of Rows), the contents are
    # different, so we can't directly wrap this.
    # This returns a Java RDD of Rows - normally it would be better to return
    # a DataFrame directly, but for illustration we will work with an RDD of
    # Rows.
    java_rdd = gateway.jvm.com.highperformancespark.examples.tools.GenerateScalingData. \
        generateMiniScaleRows(scalasc, rows, numCols)
    # Schemas are serialized to JSON and sent back and forth.
    # Construct a Python schema and turn it into a Java schema.
    schema = StructType([StructField("zip", IntegerType()),
                         StructField("fuzzyness", DoubleType())])
    jschema = javaSqlCtx.parseDataType(schema.json())
    # Convert the Java RDD to a Java DataFrame
    java_dataframe = javaSqlCtx.createDataFrame(java_rdd, jschema)
    # Wrap the Java DataFrame into a Python DataFrame
    python_dataframe = DataFrame(java_dataframe, sqlCtx)
    # Convert the Python DataFrame into an RDD of (zip, fuzzyness) pairs
    pairRDD = python_dataframe.rdd.map(lambda row: (row[0], row[1]))
    return (python_dataframe, pairRDD)

def runOnDF(df):
    result = df.groupBy("zip").avg("fuzzyness").count()
    return result

def runOnRDD(rdd):
    result = rdd.map(lambda (x, y): (x, (y, 1))). \
        reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])). \
        count()
    return result

def groupOnRDD(rdd):
    return rdd.groupByKey().mapValues(lambda v: sum(v) / float(len(v))).count()

def run(sc, sqlCtx, scalingFactor, size):
    (input_df, input_rdd) = generate_scale_data(sqlCtx, scalingFactor, size)
    input_rdd.cache().count()
    rddTimings = timeit.repeat(stmt=lambda: runOnRDD(input_rdd), repeat=10,
                               number=1, timer=time.time, setup='gc.enable()')
    groupTimings = timeit.repeat(stmt=lambda: groupOnRDD(input_rdd), repeat=10,
                                 number=1, timer=time.time, setup='gc.enable()')
    input_df.cache().count()
    dfTimings = timeit.repeat(stmt=lambda: runOnDF(input_df), repeat=10,
                              number=1, timer=time.time, setup='gc.enable()')
    print "RDD:"
    print rddTimings
    print "group:"
    print groupTimings
    print "df:"
    print dfTimings

if __name__ == "__main__":
    """
    Usage: simple_perf_test scalingFactor size
    """
    import sys
    from pyspark import SparkContext
    from pyspark.sql import SQLContext
    scalingFactor = int(sys.argv[1])
    size = int(sys.argv[2])
    sc = SparkContext(appName="SimplePythonPerf")
    sqlCtx = SQLContext(sc)
    run(sc, sqlCtx, scalingFactor, size)

    sc.stop()
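
A small follow-up sketch (not part of this PR): timeit.repeat returns one wall-clock time per run, so the raw lists printed above can be reduced to a best/mean summary. The summarize helper below is hypothetical, written in the file's Python 2 style:

# Hypothetical helper: summarize a list of per-run wall-clock times
def summarize(name, timings):
    print "%s: min=%.3fs mean=%.3fs" % (
        name, min(timings), sum(timings) / len(timings))

# e.g. summarize("RDD", rddTimings) once run() has collected the timings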
21 changes: 21 additions & 0 deletions project/plugins.sbt
@@ -0,0 +1,21 @@
addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.6.0")

resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/"

resolvers += "sonatype-snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/"


resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven"

// Temporary hack for bintray being sad

resolvers += Resolver.bintrayRepo("jodersky", "sbt-jni-macros")
resolvers += "jodersky" at "https://dl.bintray.com/jodersky/maven/"

addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.2")

//addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0")

addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.3.3")

addSbtPlugin("ch.jodersky" % "sbt-jni" % "1.0.0-RC3")
Binary file added resources/mysql-connector-java-5.1.38.jar
2 changes: 2 additions & 0 deletions resources/rawpanda.json
@@ -0,0 +1,2 @@
{"name":"mission","pandas":[{"id":1,"zip":"94110","pt":"giant", "happy":true,
"attributes":[0.4,0.5]}]}