diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/stats/StatisticsCollection.scala b/spark/src/main/scala/org/apache/spark/sql/delta/stats/StatisticsCollection.scala
index 51a8b136e09..a37ec480cf1 100644
--- a/spark/src/main/scala/org/apache/spark/sql/delta/stats/StatisticsCollection.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/delta/stats/StatisticsCollection.scala
@@ -454,21 +454,34 @@ object StatisticsCollection extends DeltaCommand {
   }
 
   /**
-   * This method validates that the data type of a data skipping column supports data skipping
-   * based on file statistics.
+   * This method validates that the data type of a data skipping column provided in
+   * [[DeltaConfigs.DATA_SKIPPING_STATS_COLUMNS]] supports data skipping based on file statistics.
+   * If a struct column is specified, all its children are considered valid. This helps users
+   * who have complex nested types and wish to collect stats on all supported nested columns
+   * without specifying each field individually. At stats collection time, unsupported types will
+   * simply be skipped, so it is safe to allow those through.
+   *
    * @param name The name of the data skipping column for validating data type.
    * @param dataType The data type of the data skipping column.
    * @param columnPaths The column paths of all valid fields.
+   * @param insideStruct Tracks if the field is inside a user-specified struct. Don't throw an
+   *                     error on ineligible skipping types inside structs as the user didn't
+   *                     specify them directly. Simply log a warning to let the user know
+   *                     statistics won't be collected on that nested field.
    */
   private def validateDataSkippingType(
       name: String,
       dataType: DataType,
-      columnPaths: ArrayBuffer[String]): Unit = dataType match {
+      columnPaths: ArrayBuffer[String],
+      insideStruct: Boolean = false): Unit = dataType match {
     case s: StructType =>
       s.foreach { field =>
-        validateDataSkippingType(name + "." + field.name, field.dataType, columnPaths)
+        validateDataSkippingType(name + "." + field.name, field.dataType, columnPaths,
+          insideStruct = true)
       }
     case SkippingEligibleDataType(_) => columnPaths.append(name)
+    case _ if insideStruct =>
+      logWarning(s"Data skipping is not supported for column $name of type $dataType")
+      columnPaths.append(name)
     case _ =>
       throw new DeltaIllegalArgumentException(
         errorClass = "DELTA_COLUMN_DATA_SKIPPING_NOT_SUPPORTED_TYPE",
diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/stats/DataSkippingDeltaTests.scala b/spark/src/test/scala/org/apache/spark/sql/delta/stats/DataSkippingDeltaTests.scala
index 88a4e3265ce..6ec60f2b303 100644
--- a/spark/src/test/scala/org/apache/spark/sql/delta/stats/DataSkippingDeltaTests.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/delta/stats/DataSkippingDeltaTests.scala
@@ -599,6 +599,33 @@ trait DataSkippingDeltaTestsBase extends DeltaExcludedBySparkVersionTestMixinShi
     deltaStatsColNamesOpt = Some("b.c")
   )
 
+  testSkipping(
+    "indexed column names - naming a nested column allows nested complex types",
+    """{
+      "a": {
+        "b": [1, 2, 3],
+        "c": [4, 5, 6],
+        "d": 7,
+        "e": 8,
+        "f": {
+          "g": 9
+        }
+      },
+      "i": 10
+    }""".replace("\n", ""),
+    hits = Seq(
+      "i < 0",
+      "a.d > 6",
+      "a.f.g < 10"
+    ),
+    misses = Seq(
+      "a.d < 0",
+      "a.e < 0",
+      "a.f.g < 0"
+    ),
+    deltaStatsColNamesOpt = Some("a")
+  )
+
   testSkipping(
     "indexed column names - index only a subset of leaf columns",
     """{
diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/stats/StatsCollectionSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/stats/StatsCollectionSuite.scala
index 1bf169b90d6..8021f03d455 100644
--- a/spark/src/test/scala/org/apache/spark/sql/delta/stats/StatsCollectionSuite.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/delta/stats/StatsCollectionSuite.scala
@@ -499,14 +499,13 @@ class StatsCollectionSuite
     ("BINARY", "BinaryType"),
     ("BOOLEAN", "BooleanType"),
     ("ARRAY<TINYINT>", "ArrayType(ByteType,true)"),
-    ("MAP<DATE, INT>", "MapType(DateType,IntegerType,true)"),
-    ("STRUCT<c60: INT, c61: ARRAY<INT>>", "ArrayType(IntegerType,true)")
+    ("MAP<DATE, INT>", "MapType(DateType,IntegerType,true)")
   ).foreach { case (invalidType, typename) =>
     val tableName1 = "delta_table_1"
     val tableName2 = "delta_table_2"
     test(s"Delta statistic column: invalid data type $invalidType") {
       withTable(tableName1, tableName2) {
-        val columnName = if (typename.equals("ArrayType(IntegerType,true)")) "c2.c61" else "c2"
+        val columnName = "c2"
         val exceptOne = intercept[DeltaIllegalArgumentException] {
           sql(
             s"create table $tableName1 (c1 long, c2 $invalidType) using delta " +
@@ -530,27 +529,17 @@ class StatsCollectionSuite
 
     test(s"Delta statistic column: invalid data type $invalidType in nested column") {
       withTable(tableName1, tableName2) {
-        val columnName = if (typename == "ArrayType(IntegerType,true)") "c2.c21.c61" else "c2.c21"
+        val columnName = "c2.c21"
         val exceptOne = intercept[DeltaIllegalArgumentException] {
           sql(
             s"create table $tableName1 (c1 long, c2 STRUCT<c20: INT, c21: $invalidType>) " +
               s"using delta TBLPROPERTIES('delta.dataSkippingStatsColumns' = 'c2.c21')"
           )
         }
         assert(
           exceptOne.getErrorClass == "DELTA_COLUMN_DATA_SKIPPING_NOT_SUPPORTED_TYPE" &&
           exceptOne.getMessageParametersArray.toSeq == Seq(columnName, typename)
         )
-        val exceptTwo = intercept[DeltaIllegalArgumentException] {
-          sql(
-            s"create table $tableName1 (c1 long, c2 STRUCT<c20: INT, c21: $invalidType>) " +
-              s"using delta TBLPROPERTIES('delta.dataSkippingStatsColumns' = 'c2')"
-          )
-        }
-        assert(
-          exceptTwo.getErrorClass == "DELTA_COLUMN_DATA_SKIPPING_NOT_SUPPORTED_TYPE" &&
-          exceptTwo.getMessageParametersArray.toSeq == Seq(columnName, typename)
-        )
         sql(s"create table $tableName2 (c1 long, c2 STRUCT<c20: INT, c21: $invalidType>) using delta")
         val exceptThree = interceptWithUnwrapping[DeltaIllegalArgumentException] {
           sql(
@@ -561,13 +550,6 @@ class StatsCollectionSuite
           exceptThree.getErrorClass == "DELTA_COLUMN_DATA_SKIPPING_NOT_SUPPORTED_TYPE" &&
           exceptThree.getMessageParametersArray.toSeq == Seq(columnName, typename)
         )
-        val exceptFour = interceptWithUnwrapping[DeltaIllegalArgumentException] {
-          sql(s"ALTER TABLE $tableName2 SET TBLPROPERTIES('delta.dataSkippingStatsColumns'='c2')")
-        }
-        assert(
-          exceptFour.getErrorClass == "DELTA_COLUMN_DATA_SKIPPING_NOT_SUPPORTED_TYPE" &&
-          exceptFour.getMessageParametersArray.toSeq == Seq(columnName, typename)
-        )
       }
     }
   }
@@ -608,7 +590,8 @@ class StatsCollectionSuite
 
   Seq(
     "BIGINT", "DATE", "DECIMAL(3, 2)", "DOUBLE", "FLOAT", "INT", "SMALLINT", "STRING",
-    "TIMESTAMP", "TIMESTAMP_NTZ", "TINYINT"
+    "TIMESTAMP", "TIMESTAMP_NTZ", "TINYINT", "STRUCT<c60: INT>",
+    "STRUCT<c60: INT, c61: ARRAY<INT>>"
  ).foreach { validType =>
    val tableName1 = "delta_table_1"
    val tableName2 = "delta_table_2"