@@ -2785,52 +2785,31 @@ object QueryPlanSerde extends Logging with CometExprShim {
2785
2785
* Check if the datatypes of shuffle input are supported. This is used for Columnar shuffle
2786
2786
* which supports struct/array.
2787
2787
*/
2788
- def supportPartitioningTypes (
2789
- inputs : Seq [Attribute ],
2790
- partitioning : Partitioning ): (Boolean , String ) = {
2791
- def supportedDataType (dt : DataType ): Boolean = dt match {
2792
- case _ : ByteType | _ : ShortType | _ : IntegerType | _ : LongType | _ : FloatType |
2793
- _ : DoubleType | _ : StringType | _ : BinaryType | _ : TimestampType | _ : DecimalType |
2794
- _ : DateType | _ : BooleanType =>
2795
- true
2796
- case StructType (fields) =>
2797
- fields.forall(f => supportedDataType(f.dataType)) &&
2798
- // Java Arrow stream reader cannot work on duplicate field name
2799
- fields.map(f => f.name).distinct.length == fields.length
2800
- case ArrayType (ArrayType (_, _), _) => false // TODO: nested array is not supported
2801
- case ArrayType (MapType (_, _, _), _) => false // TODO: map array element is not supported
2802
- case ArrayType (elementType, _) =>
2803
- supportedDataType(elementType)
2804
- case MapType (MapType (_, _, _), _, _) => false // TODO: nested map is not supported
2805
- case MapType (_, MapType (_, _, _), _) => false
2806
- case MapType (StructType (_), _, _) => false // TODO: struct map key/value is not supported
2807
- case MapType (_, StructType (_), _) => false
2808
- case MapType (ArrayType (_, _), _, _) => false // TODO: array map key/value is not supported
2809
- case MapType (_, ArrayType (_, _), _) => false
2810
- case MapType (keyType, valueType, _) =>
2811
- supportedDataType(keyType) && supportedDataType(valueType)
2812
- case _ =>
2813
- false
2814
- }
2815
-
2788
+ def columnarShuffleSupported (s : ShuffleExchangeExec ): (Boolean , String ) = {
2789
+ val inputs = s.child.output
2790
+ val partitioning = s.outputPartitioning
2816
2791
var msg = " "
2817
2792
val supported = partitioning match {
2818
2793
case HashPartitioning (expressions, _) =>
2794
+ // columnar shuffle supports the same data types (including complex types) both for
2795
+ // partition keys and for other columns
2819
2796
val supported =
2820
2797
expressions.map(QueryPlanSerde .exprToProto(_, inputs)).forall(_.isDefined) &&
2821
- expressions.forall(e => supportedDataType (e.dataType)) &&
2822
- inputs.forall(attr => supportedDataType (attr.dataType))
2798
+ expressions.forall(e => supportedShuffleDataType (e.dataType)) &&
2799
+ inputs.forall(attr => supportedShuffleDataType (attr.dataType))
2823
2800
if (! supported) {
2824
2801
msg = s " unsupported Spark partitioning expressions: $expressions"
2825
2802
}
2826
2803
supported
2827
- case SinglePartition => inputs.forall(attr => supportedDataType(attr.dataType))
2828
- case RoundRobinPartitioning (_) => inputs.forall(attr => supportedDataType(attr.dataType))
2804
+ case SinglePartition =>
2805
+ inputs.forall(attr => supportedShuffleDataType(attr.dataType))
2806
+ case RoundRobinPartitioning (_) =>
2807
+ inputs.forall(attr => supportedShuffleDataType(attr.dataType))
2829
2808
case RangePartitioning (orderings, _) =>
2830
2809
val supported =
2831
2810
orderings.map(QueryPlanSerde .exprToProto(_, inputs)).forall(_.isDefined) &&
2832
- orderings.forall(e => supportedDataType (e.dataType)) &&
2833
- inputs.forall(attr => supportedDataType (attr.dataType))
2811
+ orderings.forall(e => supportedShuffleDataType (e.dataType)) &&
2812
+ inputs.forall(attr => supportedShuffleDataType (attr.dataType))
2834
2813
if (! supported) {
2835
2814
msg = s " unsupported Spark partitioning expressions: $orderings"
2836
2815
}
@@ -2849,33 +2828,42 @@ object QueryPlanSerde extends Logging with CometExprShim {
2849
2828
}
2850
2829
2851
2830
/**
2852
- * Whether the given Spark partitioning is supported by Comet.
2831
+ * Whether the given Spark partitioning is supported by Comet native shuffle.
2853
2832
*/
2854
- def supportPartitioning (
2855
- inputs : Seq [Attribute ],
2856
- partitioning : Partitioning ): (Boolean , String ) = {
2857
- def supportedDataType (dt : DataType ): Boolean = dt match {
2858
- case _ : ByteType | _ : ShortType | _ : IntegerType | _ : LongType | _ : FloatType |
2859
- _ : DoubleType | _ : StringType | _ : BinaryType | _ : TimestampType | _ : DecimalType |
2860
- _ : DateType | _ : BooleanType =>
2833
+ def nativeShuffleSupported (s : ShuffleExchangeExec ): (Boolean , String ) = {
2834
+
2835
/**
 * Decide whether a data type may be used as a hash-partition key in native shuffle.
 *
 * The hash partition key controls how rows are collocated for operations such as
 * `groupByKey`, `reduceByKey` or `join`. Only the non-nested types listed below are
 * accepted; every other type (including struct, array and map) is rejected.
 *
 * @param dt
 *   the data type of a partitioning expression
 * @return
 *   true if `dt` is one of the accepted key types, false otherwise
 */
def supportedPartitionKeyDataType(dt: DataType): Boolean = dt match {
  // boolean
  case _: BooleanType => true
  // integral numbers
  case _: ByteType | _: ShortType | _: IntegerType | _: LongType => true
  // floating point and decimal numbers
  case _: FloatType | _: DoubleType | _: DecimalType => true
  // character and byte sequences
  case _: StringType | _: BinaryType => true
  // dates and timestamps (with and without time zone)
  case _: DateType | _: TimestampType | _: TimestampNTZType => true
  // anything else is not accepted as a partition key
  case _ => false
}
2866
2849
2850
+ val inputs = s.child.output
2851
+ val partitioning = s.outputPartitioning
2867
2852
var msg = " "
2868
2853
val supported = partitioning match {
2869
2854
case HashPartitioning (expressions, _) =>
2855
+ // native shuffle currently does not support complex types as partition keys
2856
+ // due to lack of hashing support for those types
2870
2857
val supported =
2871
2858
expressions.map(QueryPlanSerde .exprToProto(_, inputs)).forall(_.isDefined) &&
2872
- expressions.forall(e => supportedDataType (e.dataType)) &&
2873
- inputs.forall(attr => supportedDataType (attr.dataType))
2859
+ expressions.forall(e => supportedPartitionKeyDataType (e.dataType)) &&
2860
+ inputs.forall(attr => supportedShuffleDataType (attr.dataType))
2874
2861
if (! supported) {
2875
2862
msg = s " unsupported Spark partitioning expressions: $expressions"
2876
2863
}
2877
2864
supported
2878
- case SinglePartition => inputs.forall(attr => supportedDataType(attr.dataType))
2865
+ case SinglePartition =>
2866
+ inputs.forall(attr => supportedShuffleDataType(attr.dataType))
2879
2867
case _ =>
2880
2868
msg = s " unsupported Spark partitioning: ${partitioning.getClass.getName}"
2881
2869
false
@@ -2889,6 +2877,34 @@ object QueryPlanSerde extends Logging with CometExprShim {
2889
2877
}
2890
2878
}
2891
2879
2880
/**
 * Determine which data types are supported in a shuffle.
 *
 * Supported types are:
 *   - the non-nested types boolean, byte, short, integer, long, float, double, string,
 *     binary, timestamp, timestamp without time zone, decimal and date;
 *   - structs whose fields are all supported and whose field names are unique;
 *   - arrays whose element type is supported and is neither an array nor a map;
 *   - maps whose key and value types are both non-nested supported types.
 *
 * @param dt
 *   the data type to check
 * @return
 *   true if `dt` is supported, false otherwise
 */
def supportedShuffleDataType(dt: DataType): Boolean = {
  // Non-nested types that are always supported.
  def isSupportedLeaf(t: DataType): Boolean = t match {
    case _: BooleanType => true
    case _: ByteType | _: ShortType | _: IntegerType | _: LongType => true
    case _: FloatType | _: DoubleType | _: DecimalType => true
    case _: StringType | _: BinaryType => true
    case _: DateType | _: TimestampType | _: TimestampNTZType => true
    case _ => false
  }

  dt match {
    case struct: StructType =>
      val members = struct.fields
      members.forall(member => supportedShuffleDataType(member.dataType)) &&
      // Java Arrow stream reader cannot work on duplicate field name
      members.map(member => member.name).distinct.length == members.length
    case ArrayType(elementType, _) =>
      elementType match {
        // TODO: nested array is not supported
        case _: ArrayType => false
        // TODO: map array element is not supported
        case _: MapType => false
        case other => supportedShuffleDataType(other)
      }
    case MapType(keyType, valueType, _) =>
      // TODO: nested map is not supported
      // TODO: struct map key/value is not supported
      // TODO: array map key/value is not supported
      // With map, struct and array ruled out, only the leaf types remain for keys and values.
      isSupportedLeaf(keyType) && isSupportedLeaf(valueType)
    case other =>
      isSupportedLeaf(other)
  }
}
2907
+
2892
2908
// Utility method. Adds explain info if the result of calling exprToProto is None
2893
2909
def optExprWithInfo (
2894
2910
optExpr : Option [Expr ],
@@ -2920,7 +2936,8 @@ object QueryPlanSerde extends Logging with CometExprShim {
2920
2936
val canSort = sortOrder.head.dataType match {
2921
2937
case _ : BooleanType => true
2922
2938
case _ : ByteType | _ : ShortType | _ : IntegerType | _ : LongType | _ : FloatType |
2923
- _ : DoubleType | _ : TimestampType | _ : TimestampType | _ : DecimalType | _ : DateType =>
2939
+ _ : DoubleType | _ : TimestampType | _ : TimestampNTZType | _ : DecimalType |
2940
+ _ : DateType =>
2924
2941
true
2925
2942
case _ : BinaryType | _ : StringType => true
2926
2943
case ArrayType (elementType, _) => canRank(elementType)
0 commit comments