Skip to content

Commit

Permalink
Fixes holdenk#220 : Add dataframe comparison without order
Browse files Browse the repository at this point in the history
  • Loading branch information
smadarasmi committed Feb 25, 2018
1 parent 0ebf978 commit 154ba1e
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,16 @@ trait DataFrameSuiteBaseLike extends SparkContextProvider
}
}

/**
 * Asserts that two [[DataFrame]]s contain the same rows, ignoring row order.
 *
 * Rows are compared as a multiset: each DataFrame is grouped by all of its
 * columns and the occurrences of every distinct row are counted, so a row that
 * appears a different number of times on each side is reported as a mismatch.
 * A plain `except` is a set difference that drops duplicates and would
 * wrongly treat e.g. [a, a, b] and [a, b] as equal.
 *
 * Columns are looked up by name for the grouping, so column names are expected
 * to be unique within each DataFrame.
 *
 * @param expected the DataFrame holding the expected rows
 * @param result the DataFrame under test
 */
def assertDataFrameNoOrderEquals(expected: DataFrame, result: DataFrame): Unit = {
  // One row per distinct input row, with its multiplicity appended as a trailing
  // count column. Names are back-quoted so that a column literally called "a.b"
  // is not parsed as field `b` nested inside column `a`.
  def rowCounts(df: DataFrame): DataFrame =
    df.groupBy(df.columns.map(name => df.col(s"`$name`")): _*).count()

  val expectedCounts = rowCounts(expected)
  val resultCounts = rowCounts(result)

  // Any surviving row (shown together with its count) exists on one side only,
  // or occurs a different number of times on the other side.
  assertEmpty(expectedCounts.except(resultCounts).rdd.take(maxUnequalRowsToShow))
  assertEmpty(resultCounts.except(expectedCounts).rdd.take(maxUnequalRowsToShow))
}

/**
* Zip RDDs with precise indexes. This is used so we can join two DataFrames'
* Rows together regardless of whether the source is different but still compare
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ class SampleDataFrameTest extends FunSuite with DataFrameSuiteBase {
assertDataFrameEquals(input, input)
}

test("dataframe should be equal with different order of rows") {
  import sqlContext.implicits._
  // Same rows, built once in the original order and once back to front.
  val forward = sc.parallelize(inputList).toDF
  val backward = sc.parallelize(inputList.reverse).toDF
  assertDataFrameNoOrderEquals(forward, backward)
}

test("unequal dataframes should not be equal") {
import sqlContext.implicits._
val input = sc.parallelize(inputList).toDF
Expand All @@ -46,6 +53,18 @@ class SampleDataFrameTest extends FunSuite with DataFrameSuiteBase {
}
}

test("unequal dataframe with different order should not equal") {
  import sqlContext.implicits._
  val allRows = sc.parallelize(inputList).toDF
  val firstRowOnly = sc.parallelize(List(inputList.head)).toDF
  // The comparison has to fail in both argument orders
  // (presumably inputList holds more than one row — verify against the fixture).
  Seq((allRows, firstRowOnly), (firstRowOnly, allRows)).foreach { case (left, right) =>
    intercept[org.scalatest.exceptions.TestFailedException] {
      assertDataFrameNoOrderEquals(left, right)
    }
  }
}

test("dataframe approx expected") {
import sqlContext.implicits._
val input = sc.parallelize(inputList).toDF
Expand Down

0 comments on commit 154ba1e

Please sign in to comment.