-
-
Notifications
You must be signed in to change notification settings - Fork 355
DataFrameGenerator
`DataframeGenerator` provides an easy way to generate arbitrary DataFrames, so that you can check any property against them.
If you are not familiar with ScalaCheck, I suggest you read about it first to understand the concepts of properties and generators.
You can generate DataFrames using the `arbitraryDataFrame` method. Just provide the schema you want, and every generated DataFrame will have that schema.
Example:
/**
 * Example: property-check that every DataFrame produced by
 * `DataframeGenerator.arbitraryDataFrame` carries the requested schema.
 */
class DataFrameCheck extends FunSuite with SharedSparkContext with Checkers {
  test("assert dataframes generated correctly") {
    // Flat schema: a string column and an integer column.
    val fields = List(
      StructField("name", StringType),
      StructField("age", IntegerType))
    val schema = StructType(fields)

    val sqlContext = new SQLContext(sc)
    val arbitraryFrames = DataframeGenerator.arbitraryDataFrame(sqlContext, schema)

    // Each generated DataFrame must match the schema and be countable.
    val schemaIsPreserved =
      forAll(arbitraryFrames.arbitrary) { df =>
        df.schema === schema && df.count >= 0
      }

    check(schemaIsPreserved)
  }
}
You can create schemas with complex types such as `ArrayType`, `StructType`, or `MapType`.
Example:
/**
 * Example: generating DataFrames whose schema contains nested complex types
 * (here an array of structs, where each struct itself holds an array of floats).
 */
class DataFrameCheck extends FunSuite with SharedSparkContext with Checkers {
  test("test Array Type generator"){
    // A string column plus a "pandas" column holding an array of structs.
    val schema = StructType(List(StructField("name", StringType, true),
      StructField("pandas", ArrayType(StructType(List(
        StructField("id", LongType, true),
        StructField("zip", StringType, true),
        StructField("happy", BooleanType, true),
        StructField("attributes", ArrayType(FloatType), true)))))))
    val sqlContext = new SQLContext(sc)
    val dataframeGen: Arbitrary[DataFrame] = DataframeGenerator.arbitraryDataFrame(sqlContext, schema)
    val property =
      forAll(dataframeGen.arbitrary) {
        dataframe => dataframe.schema === schema &&
          // Map over the underlying RDD rather than the DataFrame itself:
          // on Spark 2.x+ Dataset.map needs an implicit Encoder, which does not
          // exist for the untyped Seq returned by getSeq. Going through .rdd
          // behaves the same on Spark 1.x and compiles on later versions too.
          dataframe.select("pandas.attributes").rdd.map(_.getSeq(0)).count() >= 0
      }
    check(property)
  }
}
Sometimes you want to choose custom generators for specific columns. You can do this by using the `arbitraryDataFrameWithCustomFields` method and providing the required generators.
Example:
/**
 * Example: overriding the generators of individual top-level columns with
 * `DataframeGenerator.arbitraryDataFrameWithCustomFields`.
 */
class DataFrameCheck extends FunSuite with SharedSparkContext with Checkers {
  test("test multiple columns generators") {
    val schema = StructType(List(
      StructField("name", StringType),
      StructField("age", IntegerType)))
    val sqlContext = new SQLContext(sc)

    // Custom generators: "name" is one of two fixed values, "age" is drawn from 10..100.
    val customName = new Column("name", Gen.oneOf("Holden", "Hanafy"))
    val customAge = new Column("age", Gen.choose(10, 100))

    val arbitraryFrames =
      DataframeGenerator.arbitraryDataFrameWithCustomFields(sqlContext, schema)(customName, customAge)

    // Selects rows that violate either custom generator; none should exist.
    val violations = "(name != 'Holden' AND name != 'Hanafy') OR (age > 100 OR age < 10)"

    val generatorsAreRespected =
      forAll(arbitraryFrames.arbitrary) { df =>
        df.schema === schema && df.filter(violations).count() == 0
      }

    check(generatorsAreRespected)
  }
}
In other cases, you may want to supply custom generators for nested columns. You can do this by using the `arbitraryDataFrameWithCustomFields` method and providing the required generators nested inside a `ColumnList`.
Example:
/**
 * Example: supplying custom generators for nested struct fields by nesting
 * `Column` definitions inside `ColumnList`s that mirror the schema.
 */
class DataFrameCheck extends FunSuite with SharedSparkContext with Checkers {
  test("test multi-level column generators") {
    // Build the schema inside-out: address -> user -> top-level row.
    val addressType = StructType(List(
      StructField("street", StringType),
      StructField("zip_code", IntegerType)
    ))
    val userType = StructType(List(
      StructField("name", StringType),
      StructField("age", IntegerType),
      StructField("address", addressType)
    ))
    val schema = StructType(List(StructField("user", userType)))

    val sqlContext = new SQLContext(sc)

    // Only zip_code gets a custom generator inside address; no generator is supplied for street.
    val addressColumns = new ColumnList("address", Seq(
      new Column("zip_code", Gen.choose(100, 200))
    ))
    val userColumns = new ColumnList("user", Seq(
      // name should be one of Holden or Hanafy
      new Column("name", Gen.oneOf("Holden", "Hanafy")),
      new Column("age", Gen.choose(10, 100)),
      addressColumns
    ))

    val arbitraryFrames =
      DataframeGenerator.arbitraryDataFrameWithCustomFields(sqlContext, schema)(userColumns)

    // Selects rows that violate any of the custom generators; none should exist.
    val violations = """
      |(user.name != 'Holden' AND user.name != 'Hanafy') OR
      |(user.age > 100 OR user.age < 10) OR
      |(user.address.zip_code > 200 OR user.address.zip_code < 100)""".stripMargin

    val generatorsAreRespected =
      forAll(arbitraryFrames.arbitrary) { df =>
        df.schema === schema && df.filter(violations).count() == 0
      }

    check(generatorsAreRespected)
  }
}