diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/stats/StatisticsCollection.scala b/spark/src/main/scala/org/apache/spark/sql/delta/stats/StatisticsCollection.scala
index 51a8b136e0..4f5d7601fe 100644
--- a/spark/src/main/scala/org/apache/spark/sql/delta/stats/StatisticsCollection.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/delta/stats/StatisticsCollection.scala
@@ -459,16 +459,22 @@ object StatisticsCollection extends DeltaCommand {
    * @param name The name of the data skipping column for validating data type.
    * @param dataType The data type of the data skipping column.
    * @param columnPaths The column paths of all valid fields.
+   * @param insideStruct Tracks if we are inside a struct, in which case all types are valid
    */
   private def validateDataSkippingType(
       name: String,
       dataType: DataType,
-      columnPaths: ArrayBuffer[String]): Unit = dataType match {
+      columnPaths: ArrayBuffer[String],
+      insideStruct: Boolean = false): Unit = dataType match {
     case s: StructType =>
       s.foreach { field =>
-        validateDataSkippingType(name + "." + field.name, field.dataType, columnPaths)
+        validateDataSkippingType(name + "." + field.name, field.dataType, columnPaths,
+          insideStruct = true)
       }
     case SkippingEligibleDataType(_) => columnPaths.append(name)
+    case d if insideStruct =>
+      logWarning(s"Data skipping is not supported for column $name of type $dataType")
+      columnPaths.append(name)
     case _ =>
       throw new DeltaIllegalArgumentException(
         errorClass = "DELTA_COLUMN_DATA_SKIPPING_NOT_SUPPORTED_TYPE",
diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/stats/DataSkippingDeltaTests.scala b/spark/src/test/scala/org/apache/spark/sql/delta/stats/DataSkippingDeltaTests.scala
index 88a4e3265c..6ec60f2b30 100644
--- a/spark/src/test/scala/org/apache/spark/sql/delta/stats/DataSkippingDeltaTests.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/delta/stats/DataSkippingDeltaTests.scala
@@ -599,6 +599,33 @@ trait DataSkippingDeltaTestsBase extends DeltaExcludedBySparkVersionTestMixinShims
     deltaStatsColNamesOpt = Some("b.c")
   )
 
+  testSkipping(
+    "indexed column names - naming a nested column allows nested complex types",
+    """{
+      "a": {
+        "b": [1, 2, 3],
+        "c": [4, 5, 6],
+        "d": 7,
+        "e": 8,
+        "f": {
+          "g": 9
+        }
+      },
+      "i": 10
+    }""".replace("\n", ""),
+    hits = Seq(
+      "i < 0",
+      "a.d > 6",
+      "a.f.g < 10"
+    ),
+    misses = Seq(
+      "a.d < 0",
+      "a.e < 0",
+      "a.f.g < 0"
+    ),
+    deltaStatsColNamesOpt = Some("a")
+  )
+
   testSkipping(
     "indexed column names - index only a subset of leaf columns",
     """{
diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/stats/StatsCollectionSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/stats/StatsCollectionSuite.scala
index 1bf169b90d..8021f03d45 100644
--- a/spark/src/test/scala/org/apache/spark/sql/delta/stats/StatsCollectionSuite.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/delta/stats/StatsCollectionSuite.scala
@@ -499,14 +499,13 @@ class StatsCollectionSuite
     ("BINARY", "BinaryType"),
     ("BOOLEAN", "BooleanType"),
     ("ARRAY<TINYINT>", "ArrayType(ByteType,true)"),
-    ("MAP<DATE, INT>", "MapType(DateType,IntegerType,true)"),
-    ("STRUCT<c60:INT, c61:ARRAY<INT>>", "ArrayType(IntegerType,true)")
+    ("MAP<DATE, INT>", "MapType(DateType,IntegerType,true)")
   ).foreach { case (invalidType, typename) =>
     val tableName1 = "delta_table_1"
     val tableName2 = "delta_table_2"
     test(s"Delta statistic column: invalid data type $invalidType") {
       withTable(tableName1, tableName2) {
-        val columnName = if (typename.equals("ArrayType(IntegerType,true)")) "c2.c61" else "c2"
+        val columnName = "c2"
         val exceptOne = intercept[DeltaIllegalArgumentException] {
           sql(
             s"create table $tableName1 (c1 long, c2 $invalidType) using delta " +
@@ -530,7 +529,7 @@ class StatsCollectionSuite
 
     test(s"Delta statistic column: invalid data type $invalidType in nested column") {
       withTable(tableName1, tableName2) {
-        val columnName = if (typename == "ArrayType(IntegerType,true)") "c2.c21.c61" else "c2.c21"
+        val columnName = "c2.c21"
         val exceptOne = intercept[DeltaIllegalArgumentException] {
           sql(
             s"create table $tableName1 (c1 long, c2 STRUCT<c20:INT, c21:$invalidType>) " +
@@ -541,16 +540,6 @@ class StatsCollectionSuite
           exceptOne.getErrorClass == "DELTA_COLUMN_DATA_SKIPPING_NOT_SUPPORTED_TYPE" &&
           exceptOne.getMessageParametersArray.toSeq == Seq(columnName, typename)
         )
-        val exceptTwo = intercept[DeltaIllegalArgumentException] {
-          sql(
-            s"create table $tableName1 (c1 long, c2 STRUCT<c20:INT, c21:$invalidType>) " +
-            s"using delta TBLPROPERTIES('delta.dataSkippingStatsColumns' = 'c2')"
-          )
-        }
-        assert(
-          exceptTwo.getErrorClass == "DELTA_COLUMN_DATA_SKIPPING_NOT_SUPPORTED_TYPE" &&
-          exceptTwo.getMessageParametersArray.toSeq == Seq(columnName, typename)
-        )
         sql(s"create table $tableName2 (c1 long, c2 STRUCT<c20:INT, c21:$invalidType>) using delta")
         val exceptThree = interceptWithUnwrapping[DeltaIllegalArgumentException] {
           sql(
@@ -561,13 +550,6 @@ class StatsCollectionSuite
           exceptThree.getErrorClass == "DELTA_COLUMN_DATA_SKIPPING_NOT_SUPPORTED_TYPE" &&
           exceptThree.getMessageParametersArray.toSeq == Seq(columnName, typename)
         )
-        val exceptFour = interceptWithUnwrapping[DeltaIllegalArgumentException] {
-          sql(s"ALTER TABLE $tableName2 SET TBLPROPERTIES('delta.dataSkippingStatsColumns'='c2')")
-        }
-        assert(
-          exceptFour.getErrorClass == "DELTA_COLUMN_DATA_SKIPPING_NOT_SUPPORTED_TYPE" &&
-          exceptFour.getMessageParametersArray.toSeq == Seq(columnName, typename)
-        )
       }
     }
   }
@@ -608,7 +590,8 @@ class StatsCollectionSuite
 
   Seq(
     "BIGINT", "DATE", "DECIMAL(3, 2)", "DOUBLE", "FLOAT", "INT", "SMALLINT", "STRING",
-    "TIMESTAMP", "TIMESTAMP_NTZ", "TINYINT"
+    "TIMESTAMP", "TIMESTAMP_NTZ", "TINYINT", "STRUCT<c60:INT, c61:STRING>",
+    "STRUCT<c60:INT, c61:ARRAY<INT>>"
   ).foreach { validType =>
     val tableName1 = "delta_table_1"
     val tableName2 = "delta_table_2"
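
For reference, below is a minimal, self-contained sketch of the validation behavior this patch introduces. It is not the Delta implementation: `isEligible` merely approximates the `SkippingEligibleDataType` extractor used above, the warning/exception plumbing is simplified, and only the Spark `org.apache.spark.sql.types` classpath is assumed.

```scala
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.sql.types._

object DataSkippingValidationSketch {
  // Assumption: stands in for Delta's SkippingEligibleDataType extractor.
  // Min/max stats are collected only for these leaf types.
  private def isEligible(dt: DataType): Boolean = dt match {
    case _: NumericType => true
    case StringType | DateType | TimestampType => true
    case _ => false
  }

  // Mirrors the patched validateDataSkippingType: recursing into a struct
  // passes insideStruct = true, so an ineligible nested field is logged and
  // tolerated, while a directly named ineligible column still fails.
  def validate(
      name: String,
      dataType: DataType,
      columnPaths: ArrayBuffer[String],
      insideStruct: Boolean = false): Unit = dataType match {
    case s: StructType =>
      s.foreach { field =>
        validate(name + "." + field.name, field.dataType, columnPaths, insideStruct = true)
      }
    case dt if isEligible(dt) => columnPaths.append(name)
    case dt if insideStruct =>
      println(s"warning: data skipping is not supported for column $name of type $dt")
      columnPaths.append(name)
    case dt =>
      throw new IllegalArgumentException(
        s"data skipping is not supported for column $name of type $dt")
  }

  def main(args: Array[String]): Unit = {
    // Shape of the schema from the new DataSkippingDeltaTests case: a struct
    // with eligible leaves (d, f.g) and an ineligible array leaf (b).
    val a = StructType(Seq(
      StructField("b", ArrayType(IntegerType)),
      StructField("d", IntegerType),
      StructField("f", StructType(Seq(StructField("g", IntegerType))))))

    val paths = ArrayBuffer.empty[String]
    validate("a", a, paths)         // warns on a.b but keeps validating
    println(paths.mkString(", "))   // a.b, a.d, a.f.g

    // validate("c2", MapType(DateType, IntegerType), paths)  // still throws
  }
}
```

This matches the two test changes: naming the struct `a` in `delta.dataSkippingStatsColumns` now succeeds even though `a.b` is an array, while naming a top-level `MAP` or `ARRAY` column directly still raises `DELTA_COLUMN_DATA_SKIPPING_NOT_SUPPORTED_TYPE`.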