Skip to content
Robin Bramley edited this page Oct 24, 2019 · 9 revisions

DataframeGenerator provides an easy way to generate arbitrary DataFrames so that you can check properties against them. If you are not familiar with ScalaCheck, I suggest you read about it first to understand the concepts of properties and generators.

You can generate DataFrames using the arbitraryDataFrame method. Just provide the schema you want, and every generated DataFrame will have that schema.

Example:

/**
 * Property-based check of `DataframeGenerator.arbitraryDataFrame` for a flat,
 * two-column schema.
 */
class DataFrameCheck extends FunSuite with SharedSparkContext with Checkers {
  test("assert dataframes generated correctly") {
    // Flat schema that every generated DataFrame is expected to carry.
    val nameField = StructField("name", StringType)
    val ageField = StructField("age", IntegerType)
    val expectedSchema = StructType(List(nameField, ageField))

    val sqlCtx = new SQLContext(sc)
    val arbitraryFrames = DataframeGenerator.arbitraryDataFrame(sqlCtx, expectedSchema)

    // The schema comparison comes first; count is only invoked when it succeeds.
    val schemaIsPreserved = forAll(arbitraryFrames.arbitrary) { df =>
      val schemaMatches = df.schema === expectedSchema
      schemaMatches && df.count >= 0
    }

    check(schemaIsPreserved)
  }
}

You can also create schemas with complex types such as ArrayType, StructType or MapType.

Example:

/**
 * Property-based check of `DataframeGenerator.arbitraryDataFrame` for a schema
 * with nested complex types: an array of structs that itself contains an array.
 */
class DataFrameCheck extends FunSuite with SharedSparkContext with Checkers {

  test("test Array Type generator"){
    val schema = StructType(List(StructField("name", StringType, nullable = true),
      StructField("pandas", ArrayType(StructType(List(
        StructField("id", LongType, nullable = true),
        StructField("zip", StringType, nullable = true),
        StructField("happy", BooleanType, nullable = true),
        StructField("attributes", ArrayType(FloatType), nullable = true)))))))

    val sqlContext = new SQLContext(sc)
    val dataframeGen: Arbitrary[DataFrame] = DataframeGenerator.arbitraryDataFrame(sqlContext, schema)
    val property =
      forAll(dataframeGen.arbitrary) {
        // Map over the underlying RDD rather than the DataFrame itself:
        // on Spark 2.x+ Dataset.map requires an implicit Encoder, and none is
        // available for the untyped Seq returned by getSeq. Going through .rdd
        // works on Spark 1.x as well.
        dataframe => dataframe.schema === schema &&
          dataframe.select("pandas.attributes").rdd.map(_.getSeq(0)).count() >= 0
      }

    check(property)
  }
}

Sometimes you want to use custom generators for specific columns. You can do this by using the arbitraryDataFrameWithCustomFields method and providing the required generators.

Example:

/**
 * Property-based check of `DataframeGenerator.arbitraryDataFrameWithCustomFields`
 * with a custom generator supplied for each top-level column.
 */
class DataFrameCheck extends FunSuite with SharedSparkContext with Checkers {

  test("test multiple columns generators") {
    val expectedSchema = StructType(List(
      StructField("name", StringType),
      StructField("age", IntegerType)))
    val sqlCtx = new SQLContext(sc)

    // Custom per-column generators: name is drawn from a fixed pair of values,
    // age comes from Gen.choose(10, 100).
    val nameColumn = new Column("name", Gen.oneOf("Holden", "Hanafy"))
    val ageColumn = new Column("age", Gen.choose(10, 100))
    val arbitraryFrames =
      DataframeGenerator.arbitraryDataFrameWithCustomFields(sqlCtx, expectedSchema)(
        nameColumn, ageColumn)

    // Selects rows that violate the custom generators; none are expected.
    val violationFilter = "(name != 'Holden' AND name != 'Hanafy') OR (age > 100 OR age < 10)"

    val customFieldsRespected = forAll(arbitraryFrames.arbitrary) { df =>
      df.schema === expectedSchema && df.filter(violationFilter).count() == 0
    }

    check(customFieldsRespected)
  }
}

In other cases, you may want to supply custom generators for nested columns. You can do this by using the arbitraryDataFrameWithCustomFields method and nesting the required generators inside a ColumnList.

Example:

/**
 * Property-based check of `DataframeGenerator.arbitraryDataFrameWithCustomFields`
 * with custom generators supplied for nested columns via `ColumnList`.
 */
class DataFrameCheck extends FunSuite with SharedSparkContext with Checkers {

  test("test multi-level column generators") {
    // The schema is assembled inside-out: address struct, user struct, then root.
    val addressType = StructType(List(
      StructField("street", StringType),
      StructField("zip_code", IntegerType)))
    val userType = StructType(List(
      StructField("name", StringType),
      StructField("age", IntegerType),
      StructField("address", addressType)))
    val expectedSchema = StructType(List(StructField("user", userType)))

    val sqlCtx = new SQLContext(sc)

    // Within address, only zip_code is given a custom generator here.
    val addressColumns = new ColumnList("address", Seq(
      new Column("zip_code", Gen.choose(100, 200))))
    val userColumns = new ColumnList("user", Seq(
      // name should be one of Holden or Hanafy
      new Column("name", Gen.oneOf("Holden", "Hanafy")),
      new Column("age", Gen.choose(10, 100)),
      addressColumns))

    val arbitraryFrames =
      DataframeGenerator.arbitraryDataFrameWithCustomFields(sqlCtx, expectedSchema)(userColumns)

    // Selects rows that violate any of the custom generators; none are expected.
    val violationFilter = """
                 |(user.name != 'Holden' AND user.name != 'Hanafy') OR
                 |(user.age > 100 OR user.age < 10) OR
                 |(user.address.zip_code > 200 OR user.address.zip_code < 100)""".stripMargin

    val nestedFieldsRespected = forAll(arbitraryFrames.arbitrary) { df =>
      df.schema === expectedSchema && df.filter(violationFilter).count() == 0
    }

    check(nestedFieldsRespected)
  }
}
Clone this wiki locally