-
-
Notifications
You must be signed in to change notification settings - Fork 355
DataFrameGenerator
Mahmoud Hanafy edited this page Apr 22, 2016
·
9 revisions
DataframeGenerator
provides an easy way to generate arbitrary DataFrames, to be able to check any property.
If you are not familiar with ScalaCheck, I suggest you read about it first to understand the concepts of properties and generators.
You can generate DataFrames using the method `arbitraryDataFrame`. Just provide the schema you want, and all DataFrames will be generated with this schema.
Example:
/**
 * Example: checks that every DataFrame produced by
 * `DataframeGenerator.arbitraryDataFrame` carries the schema that was requested.
 */
class DataFrameCheck extends FunSuite with SharedSparkContext with Checkers {
  test("assert dataframes generated correctly") {
    // Build the schema field by field so each column is easy to spot.
    val nameField = StructField("name", StringType)
    val ageField = StructField("age", IntegerType)
    val schema = StructType(List(nameField, ageField))

    val generator = DataframeGenerator.arbitraryDataFrame(new SQLContext(sc), schema)

    // A count is never negative, so the second clause mainly exercises the
    // generated DataFrame (presumably to force its evaluation).
    check(forAll(generator.arbitrary) { df =>
      df.schema === schema && df.count >= 0
    })
  }
}
You can create a schema with complex types like `ArrayType`, `StructType` or `MapType`.
Example:
/**
 * Example: generates DataFrames whose schema contains nested complex types
 * (an array of structs, one of whose fields is itself an array) and checks
 * that the generated data can be read back.
 */
class DataFrameCheck extends FunSuite with SharedSparkContext with Checkers {
  test("test Array Type generator") {
    // Element type of the "pandas" array column.
    val pandaType = StructType(List(
      StructField("id", LongType, nullable = true),
      StructField("zip", StringType, nullable = true),
      StructField("happy", BooleanType, nullable = true),
      StructField("attributes", ArrayType(FloatType), nullable = true)))

    val schema = StructType(List(
      StructField("name", StringType, nullable = true),
      StructField("pandas", ArrayType(pandaType))))

    val sqlContext = new SQLContext(sc)
    val dataframeGen: Arbitrary[DataFrame] =
      DataframeGenerator.arbitraryDataFrame(sqlContext, schema)

    val property =
      forAll(dataframeGen.arbitrary) { dataframe =>
        // Map over the underlying RDD explicitly: on Spark 1.x this is what
        // DataFrame.map did, while on Spark 2.x+ a direct Dataset.map would
        // require an implicit Encoder and fail to compile.
        dataframe.schema === schema &&
          dataframe.select("pandas.attributes").rdd.map(_.getSeq(0)).count() >= 0
      }

    check(property)
  }
}
Sometimes you want to choose custom generators for specific columns. You can do this by using the method `arbitraryDataFrameWithCustomFields` and providing the required generators.
Example:
/**
 * Example: supplies custom ScalaCheck generators for individual columns via
 * `DataframeGenerator.arbitraryDataFrameWithCustomFields` and checks that no
 * generated row falls outside the values those generators allow.
 */
class DataFrameCheck extends FunSuite with SharedSparkContext with Checkers {
  test("test multiple columns generators") {
    val schema = StructType(List(
      StructField("name", StringType),
      StructField("age", IntegerType)))
    val sqlContext = new SQLContext(sc)

    // "name" must be one of the two listed values.
    val nameGenerator = new ColumnGenerator("name", Gen.oneOf("Holden", "Hanafy"))
    // "age" is drawn from Gen.choose(10, 100).
    val ageGenerator = new ColumnGenerator("age", Gen.choose(10, 100))

    val dataframeGen =
      DataframeGenerator.arbitraryDataFrameWithCustomFields(sqlContext, schema)(
        nameGenerator, ageGenerator)

    // Selects any row that violates one of the custom generators.
    val violations =
      "(name != 'Holden' AND name != 'Hanafy') OR (age > 100 OR age < 10)"

    check(forAll(dataframeGen.arbitrary) { df =>
      df.schema === schema && df.filter(violations).count() == 0
    })
  }
}