From b887283a1f97d0f90b41c5b7ebb342c882b469ed Mon Sep 17 00:00:00 2001 From: Mahmoud Hanafy Date: Sat, 21 May 2016 00:46:50 +0200 Subject: [PATCH] Add simple test cases for JavaHappyPandas --- .../examples/dataframe/JavaHappyPandas.java | 4 +- .../dataframe/JavaHappyPandasTest.java | 151 ++++++++++++++++++ .../dataframe/HappyPandasTest.scala | 8 +- 3 files changed, 157 insertions(+), 6 deletions(-) create mode 100644 src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java b/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java index 09b5e2f..bc93163 100644 --- a/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java +++ b/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java @@ -62,7 +62,7 @@ public static DataFrame jsonLoadFromRDD(SQLContext sqlContext, JavaRDD i */ public static DataFrame happyPandasPercentage(DataFrame pandaInfo) { DataFrame happyPercentage = pandaInfo.select(pandaInfo.col("place"), - pandaInfo.col("happyPandas").divide(pandaInfo.col("totalPandas")).as("percentHappy")); + (pandaInfo.col("happyPandas").divide(pandaInfo.col("totalPandas"))).as("percentHappy")); return happyPercentage; } @@ -204,7 +204,7 @@ public static void joins(DataFrame df1, DataFrame df2) { } public static DataFrame selfJoin(DataFrame df) { - return df.as("a").join(df.as("b")).where(df.col("name").equalTo(df.col("name"))); + return (df.as("a")).join(df.as("b")).where("a.name = b.name"); } } diff --git a/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java b/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java new file mode 100644 index 0000000..b0d4bdc --- /dev/null +++ b/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java @@ -0,0 +1,151 @@ +package com.highperformancespark.examples.dataframe; + +import com.highperformancespark.examples.objects.JavaPandaInfo; +import com.highperformancespark.examples.objects.JavaPandas; +import com.highperformancespark.examples.objects.JavaRawPanda; +import com.holdenkarau.spark.testing.JavaDataFrameSuiteBase; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.*; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.*; + +public class JavaHappyPandasTest extends JavaDataFrameSuiteBase { + String toronto = "toronto"; + String sandiego = "san diego"; + String virginia = "virginia"; + + List pandaInfoList = Arrays.asList( + new JavaPandaInfo(toronto, "giant", 1, 2), + new JavaPandaInfo(sandiego, "red", 2, 3), + new JavaPandaInfo(virginia, "black", 1, 10) + ); + + List rawPandaList = Arrays.asList( + new JavaRawPanda(10L, "94110", "giant", true, Arrays.asList(1.0, 0.9)), + new JavaRawPanda(11L, "94110", "red", true, Arrays.asList(1.0, 0.9))); + + List pandasList = Arrays.asList( + new JavaPandas("bata", "10010", 10, 2), + new JavaPandas("wiza", "10010", 20, 4), + new JavaPandas("dabdob", "11000", 8, 2), + new JavaPandas("hanafy", "11000", 15, 7), + new JavaPandas("hamdi", "11111", 20, 10) + ); + + @Test + public void simpleSelfJoinTest() { + DataFrame inputDF = sqlContext().createDataFrame(pandasList, JavaPandas.class); + DataFrame result = JavaHappyPandas.selfJoin(inputDF).select("a.name", "b.name"); + List resultList = result.collectAsList(); + + resultList.stream().forEach(row -> assertEquals(row.getString(0), row.getString(1))); + } + + @Test + public void verifyhappyPandasPercentage() { + List expectedList = Arrays.asList(RowFactory.create(toronto, 0.5), + RowFactory.create(sandiego, 2 / 3.0), RowFactory.create(virginia, 1/10.0)); + DataFrame expectedDF = sqlContext().createDataFrame( + expectedList, new StructType( + new StructField[]{ + new StructField("place", DataTypes.StringType, true, Metadata.empty()), + new StructField("percentHappy", DataTypes.DoubleType, true, Metadata.empty()) + })); + + DataFrame inputDF = sqlContext().createDataFrame(pandaInfoList, JavaPandaInfo.class); + DataFrame resultDF = JavaHappyPandas.happyPandasPercentage(inputDF); + + assertDataFrameApproximateEquals(expectedDF, resultDF, 1E-5); + } + + @Test + public void encodePandaType() { + DataFrame inputDF = sqlContext().createDataFrame(rawPandaList, JavaRawPanda.class); + DataFrame resultDF = JavaHappyPandas.encodePandaType(inputDF); + + List expectedRows = Arrays.asList(RowFactory.create(10L, 0), RowFactory.create(11L, 1)); + DataFrame expectedDF = sqlContext().createDataFrame(expectedRows, new StructType(new StructField[]{ + new StructField("id", DataTypes.LongType, false, Metadata.empty()), + new StructField("encodedType", DataTypes.IntegerType, false, Metadata.empty()) + })); + + assertDataFrameEquals(expectedDF, resultDF); + } + + @Test + public void happyPandasPlaces() { + DataFrame inputDF = sqlContext().createDataFrame(pandaInfoList, JavaPandaInfo.class); + DataFrame resultDF = JavaHappyPandas.happyPandasPlaces(inputDF); + + List expectedRows = Arrays.asList( + new JavaPandaInfo(toronto, "giant", 1, 2), + new JavaPandaInfo(sandiego, "red", 2, 3)); + DataFrame expectedDF = sqlContext().createDataFrame(expectedRows, JavaPandaInfo.class); + + assertDataFrameEquals(expectedDF, resultDF); + } + + @Test + public void maxPandaSizePerZip() { + DataFrame inputDF = sqlContext().createDataFrame(pandasList, JavaPandas.class); + DataFrame resultDF = JavaHappyPandas.maxPandaSizePerZip(inputDF); + + List expectedRows = Arrays.asList( + RowFactory.create(pandasList.get(1).getZip(), pandasList.get(1).getPandaSize()), + RowFactory.create(pandasList.get(3).getZip(), pandasList.get(3).getPandaSize()), + RowFactory.create(pandasList.get(4).getZip(), pandasList.get(4).getPandaSize()) + ); + DataFrame expectedDF = sqlContext().createDataFrame(expectedRows, + new StructType( + new StructField[]{ + new StructField("zip", DataTypes.StringType, true, Metadata.empty()), + new StructField("max(pandaSize)", DataTypes.IntegerType, true, Metadata.empty()) + } + )); + + assertDataFrameEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip")); + } + + @Test + public void complexAggPerZip() { + DataFrame inputDF = sqlContext().createDataFrame(pandasList, JavaPandas.class); + DataFrame resultDF = JavaHappyPandas.minMeanSizePerZip(inputDF); + + List expectedRows = Arrays.asList( + RowFactory.create(pandasList.get(1).getZip(), pandasList.get(0).getPandaSize(), 15.0), + RowFactory.create(pandasList.get(3).getZip(), pandasList.get(2).getPandaSize(), 11.5), + RowFactory.create(pandasList.get(4).getZip(), pandasList.get(4).getPandaSize(), 20.0)); + + DataFrame expectedDF = sqlContext().createDataFrame(expectedRows, + new StructType( + new StructField[]{ + new StructField("zip", DataTypes.StringType, true, Metadata.empty()), + new StructField("min(pandaSize)", DataTypes.IntegerType, true, Metadata.empty()), + new StructField("avg(pandaSize)", DataTypes.DoubleType, true, Metadata.empty()) + } + )); + + assertDataFrameApproximateEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip"), 1E-5); + } + + @Test + public void simpleSQLExample() { + DataFrame inputDF = sqlContext().createDataFrame(pandasList, JavaPandas.class); + DataFrame resultDF = JavaHappyPandas.simpleSqlExample(inputDF); + + List expectedList = Arrays.asList( + pandasList.get(0), pandasList.get(2) + ); + DataFrame expectedDF = sqlContext().createDataFrame(expectedList, JavaPandas.class); + + assertDataFrameEquals(expectedDF, resultDF); + } + +} \ No newline at end of file diff --git a/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala b/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala index f46a2f3..c6d64fe 100644 --- a/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala +++ b/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala @@ -54,14 +54,14 @@ class HappyPandasTest extends DataFrameSuiteBase { //tag::approxEqualDataFrames[] test("verify simple happy pandas Percentage") { - val expectedResult = List(Row(toronto, 0.5), Row(sandiego, 2/3.0), Row(virginia, 1/10.0)) - val expectedDf = createDF(expectedResult, ("place", StringType), + val expectedList = List(Row(toronto, 0.5), Row(sandiego, 2/3.0), Row(virginia, 1/10.0)) + val expectedDf = createDF(expectedList, ("place", StringType), ("percentHappy", DoubleType)) val inputDF = sqlContext.createDataFrame(pandaInfoList) - val result = HappyPandas.happyPandasPercentage(inputDF) + val resultDF = HappyPandas.happyPandasPercentage(inputDF) - assertDataFrameApproximateEquals(expectedDf, result, 1E-5) + assertDataFrameApproximateEquals(expectedDf, resultDF, 1E-5) } //end::approxEqualDataFrames[]