[Feature][Spark] Support SeaTunnel Time Type.
CheneyYin committed Aug 18, 2023
1 parent 51c0d70 commit b51384f
Showing 17 changed files with 163 additions and 64 deletions.
1 change: 1 addition & 0 deletions release-note.md
@@ -144,6 +144,7 @@
- [Core] [API] Add copy method to Catalog codes (#4414)
- [Core] [API] Add options check before create source and sink and transform in FactoryUtil (#4424)
- [Core] [Shade] Add guava shade module (#4358)
- [Core] [Spark] Support SeaTunnel Time Type (#5188)

### Connector-V2

@@ -34,6 +34,7 @@

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.StructType;

import com.google.common.collect.Lists;

@@ -92,6 +93,12 @@ public List<Dataset<Row>> execute(List<Dataset<Row>> upstreamDataStreams)
SeaTunnelSink<?, ?, ?, ?> seaTunnelSink = plugins.get(i);
Dataset<Row> dataset =
fromSourceTable(sinkConfig, sparkRuntimeEnvironment).orElse(input);
StructType inputSchema =
sourceSchema(sinkConfig, sparkRuntimeEnvironment)
.orElseGet(
() -> {
return dataset.schema();
});
int parallelism;
if (sinkConfig.hasPath(CommonOptions.PARALLELISM.key())) {
parallelism = sinkConfig.getInt(CommonOptions.PARALLELISM.key());
@@ -105,8 +112,7 @@ public List<Dataset<Row>> execute(List<Dataset<Row>> upstreamDataStreams)
}
dataset.sparkSession().read().option(CommonOptions.PARALLELISM.key(), parallelism);
// TODO modify checkpoint location
seaTunnelSink.setTypeInfo(
(SeaTunnelRowType) TypeConverterUtils.convert(dataset.schema()));
seaTunnelSink.setTypeInfo((SeaTunnelRowType) TypeConverterUtils.convert(inputSchema));
if (SupportDataSaveMode.class.isAssignableFrom(seaTunnelSink.getClass())) {
SupportDataSaveMode saveModeSink = (SupportDataSaveMode) seaTunnelSink;
DataSaveMode dataSaveMode = saveModeSink.getUserConfigSaveMode();
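Note on the sink-side change above (the same edit appears again in a second sink processor just below): the sink no longer derives its SeaTunnel row type from dataset.schema() alone. It first looks for the StructType staged under its source_table_name and only falls back to the Dataset's own schema. A condensed sketch of that resolution, reusing the identifiers from the diff:

StructType inputSchema =
        sourceSchema(sinkConfig, sparkRuntimeEnvironment) // StructType staged by the upstream plugin, if any
                .orElseGet(dataset::schema);              // fallback: the schema Spark itself reports
seaTunnelSink.setTypeInfo((SeaTunnelRowType) TypeConverterUtils.convert(inputSchema));

The staged schema carries type information Spark cannot express on its own; Spark 2/3 have no TIME column type, which is presumably why converting dataset.schema() back could not recover SeaTunnel Time fields.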
@@ -35,6 +35,7 @@
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.types.StructType;

import com.google.common.collect.Lists;

@@ -93,6 +94,12 @@ public List<Dataset<Row>> execute(List<Dataset<Row>> upstreamDataStreams)
SeaTunnelSink<?, ?, ?, ?> seaTunnelSink = plugins.get(i);
Dataset<Row> dataset =
fromSourceTable(sinkConfig, sparkRuntimeEnvironment).orElse(input);
StructType inputSchema =
sourceSchema(sinkConfig, sparkRuntimeEnvironment)
.orElseGet(
() -> {
return dataset.schema();
});
int parallelism;
if (sinkConfig.hasPath(CommonOptions.PARALLELISM.key())) {
parallelism = sinkConfig.getInt(CommonOptions.PARALLELISM.key());
@@ -106,8 +113,7 @@ public List<Dataset<Row>> execute(List<Dataset<Row>> upstreamDataStreams)
}
dataset.sparkSession().read().option(CommonOptions.PARALLELISM.key(), parallelism);
// TODO modify checkpoint location
seaTunnelSink.setTypeInfo(
(SeaTunnelRowType) TypeConverterUtils.convert(dataset.schema()));
seaTunnelSink.setTypeInfo((SeaTunnelRowType) TypeConverterUtils.convert(inputSchema));
if (SupportDataSaveMode.class.isAssignableFrom(seaTunnelSink.getClass())) {
SupportDataSaveMode saveModeSink = (SupportDataSaveMode) seaTunnelSink;
DataSaveMode dataSaveMode = saveModeSink.getUserConfigSaveMode();
@@ -68,6 +68,7 @@ public List<Dataset<Row>> execute(List<Dataset<Row>> upstreamDataStreams) {
CommonOptions.PARALLELISM.key(),
CommonOptions.PARALLELISM.defaultValue());
}
StructType schema = (StructType) TypeConverterUtils.convert(source.getProducedType());
Dataset<Row> dataset =
sparkRuntimeEnvironment
.getSparkSession()
@@ -77,12 +78,13 @@ public List<Dataset<Row>> execute(List<Dataset<Row>> upstreamDataStreams) {
.option(
Constants.SOURCE_SERIALIZATION,
SerializationUtils.objectToString(source))
.schema(
(StructType)
TypeConverterUtils.convert(source.getProducedType()))
.schema(schema)
.load();
sources.add(dataset);
registerInputTempView(pluginConfigs.get(i), dataset);

Config config = pluginConfigs.get(i);
stageSchema(config, schema);
registerInputTempView(config, dataset);
}
return sources;
}
@@ -24,6 +24,7 @@

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.StructType;

import java.util.List;
import java.util.Optional;
@@ -72,6 +73,26 @@ protected Optional<Dataset<Row>> fromSourceTable(
return Optional.of(sparkRuntimeEnvironment.getSparkSession().read().table(sourceTableName));
}

protected Optional<StructType> sourceSchema(
Config pluginConfig, SparkRuntimeEnvironment sparkRuntimeEnvironment) {
if (!pluginConfig.hasPath(SOURCE_TABLE_NAME)) {
return Optional.empty();
}
String sourceTableName = pluginConfig.getString(SOURCE_TABLE_NAME);
return this.sparkRuntimeEnvironment.schema(sourceTableName);
}

protected void stageSchema(Config pluginConfig, StructType schema) {
if (pluginConfig.hasPath(RESULT_TABLE_NAME)) {
String tableName = pluginConfig.getString(RESULT_TABLE_NAME);
stageSchema(tableName, schema);
}
}

protected void stageSchema(String tableName, StructType schema) {
this.sparkRuntimeEnvironment.stageSchema(tableName, schema);
}

private void registerTempView(String tableName, Dataset<Row> ds) {
ds.createOrReplaceTempView(tableName);
}
@@ -27,13 +27,17 @@

import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.streaming.Seconds;
import org.apache.spark.streaming.StreamingContext;

import lombok.extern.slf4j.Slf4j;

import java.net.URL;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

@Slf4j
public class SparkRuntimeEnvironment implements RuntimeEnvironment {
@@ -55,9 +59,12 @@ public class SparkRuntimeEnvironment implements RuntimeEnvironment {

private String jobName = Constants.LOGO;

private Map<String, StructType> schemas;

private SparkRuntimeEnvironment(Config config) {
this.setEnableHive(checkIsContainHive(config));
this.initialize(config);
this.schemas = new HashMap<>();
}

public void setEnableHive(boolean enableHive) {
@@ -176,4 +183,13 @@ public static SparkRuntimeEnvironment getInstance(Config config) {
}
return INSTANCE;
}

public Optional<StructType> schema(String tblName) {
StructType schema = this.schemas.get(tblName);
return schema == null ? Optional.empty() : Optional.of(schema);
}

public void stageSchema(String tblName, StructType schema) {
this.schemas.put(tblName, schema);
}
}
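The additions above amount to a per-job registry mapping a table name to the StructType its producer declared. A self-contained sketch of the register/lookup round trip (a standalone illustration written for this note, not code from the commit; only spark-sql is needed on the classpath):

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

// Stand-in for the schema registry added to SparkRuntimeEnvironment: a source or
// transform stages the StructType it produces under its result_table_name, and the
// next plugin resolves it by source_table_name instead of trusting Spark's inference.
public class SchemaRegistrySketch {
    private final Map<String, StructType> schemas = new HashMap<>();

    public void stageSchema(String tblName, StructType schema) {
        schemas.put(tblName, schema);
    }

    public Optional<StructType> schema(String tblName) {
        return Optional.ofNullable(schemas.get(tblName));
    }

    public static void main(String[] args) {
        SchemaRegistrySketch env = new SchemaRegistrySketch();
        StructType fake =
                new StructType()
                        .add("name", DataTypes.StringType)
                        .add("age", DataTypes.IntegerType);
        env.stageSchema("fake1", fake);           // producer side (result_table_name)
        StructType resolved =
                env.schema("fake1")               // consumer side (source_table_name)
                        .orElseThrow(IllegalStateException::new);
        System.out.println(resolved.treeString());
    }
}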
@@ -105,7 +105,7 @@ public List<Dataset<Row>> execute(List<Dataset<Row>> upstreamDataStreams)
Config pluginConfig = pluginConfigs.get(i);
Dataset<Row> stream =
fromSourceTable(pluginConfig, sparkRuntimeEnvironment).orElse(input);
input = sparkTransform(transform, stream);
input = sparkTransform(pluginConfig, transform, stream);
registerInputTempView(pluginConfig, input);
result.add(input);
} catch (Exception e) {
@@ -119,33 +119,45 @@ public List<Dataset<Row>> execute(List<Dataset<Row>> upstreamDataStreams)
return result;
}

private Dataset<Row> sparkTransform(SeaTunnelTransform transform, Dataset<Row> stream)
private Dataset<Row> sparkTransform(
Config pluginConfig, SeaTunnelTransform transform, Dataset<Row> stream)
throws IOException {
SeaTunnelDataType<?> seaTunnelDataType = TypeConverterUtils.convert(stream.schema());
StructType inputSchema =
sourceSchema(pluginConfig, sparkRuntimeEnvironment)
.orElseGet(
() -> {
return stream.schema();
});

SeaTunnelDataType<?> seaTunnelDataType = TypeConverterUtils.convert(inputSchema);
transform.setTypeInfo(seaTunnelDataType);
StructType structType =
StructType outputSchema =
(StructType) TypeConverterUtils.convert(transform.getProducedType());
SeaTunnelRowConverter inputRowConverter = new SeaTunnelRowConverter(seaTunnelDataType);
SeaTunnelRowConverter outputRowConverter =
new SeaTunnelRowConverter(transform.getProducedType());
ExpressionEncoder<Row> encoder = RowEncoder.apply(structType);
return stream.mapPartitions(
(MapPartitionsFunction<Row, Row>)
(Iterator<Row> rowIterator) -> {
TransformIterator iterator =
new TransformIterator(
rowIterator,
transform,
structType,
inputRowConverter,
outputRowConverter);
return iterator;
},
encoder)
.filter(
(Row row) -> {
return row != null;
});
ExpressionEncoder<Row> encoder = RowEncoder.apply(outputSchema);
Dataset<Row> result =
stream.mapPartitions(
(MapPartitionsFunction<Row, Row>)
(Iterator<Row> rowIterator) -> {
TransformIterator iterator =
new TransformIterator(
rowIterator,
transform,
outputSchema,
inputRowConverter,
outputRowConverter);
return iterator;
},
encoder)
.filter(
(Row row) -> {
return row != null;
});

stageSchema(pluginConfig, outputSchema);
return result;
}

private static class TransformIterator implements Iterator<Row>, Serializable {
@@ -35,6 +35,7 @@ source {
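Two details in the reworked sparkTransform above are easy to miss: the Row encoder handed to mapPartitions is built from outputSchema (derived from transform.getProducedType()), not from the input schema, and the trailing filter discards null rows coming out of the iterator. A compressed restatement of the same logic with those points annotated (identifiers taken from the diff; RowEncoder.apply is the Spark 2/3 API already used here):

SeaTunnelDataType<?> inputType = TypeConverterUtils.convert(inputSchema);
transform.setTypeInfo(inputType);
StructType outputSchema = (StructType) TypeConverterUtils.convert(transform.getProducedType());

ExpressionEncoder<Row> encoder = RowEncoder.apply(outputSchema); // must describe the *output* rows
Dataset<Row> result =
        stream.mapPartitions(
                        (MapPartitionsFunction<Row, Row>)
                                rowIterator ->
                                        new TransformIterator(
                                                rowIterator,
                                                transform,
                                                outputSchema,
                                                new SeaTunnelRowConverter(inputType),
                                                new SeaTunnelRowConverter(transform.getProducedType())),
                        encoder)
                .filter((Row row) -> row != null); // drop null rows emitted by the iterator

stageSchema(pluginConfig, outputSchema); // publish the output schema for the next plugin in the pipeline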
fields {
name = "string"
age = "int"
c_time = "time"
}
}
}
@@ -46,7 +47,7 @@
Filter {
source_table_name = "fake"
result_table_name = "fake1"
fields = ["name", "age"]
fields = ["name", "age", "c_time"]
}
}

@@ -97,10 +98,17 @@ sink {
rule_value = 2147483647
}
]
}, {
field_name = c_time
field_type = time
field_value = [
{
rule_type = NOT_NULL
}
]
}
]
}

]
}
}
# If you would like to get more information about how to configure seatunnel and see full list of sink plugins,
# please go to https://seatunnel.apache.org/docs/connector-v2/sink/Assert
@@ -33,9 +33,11 @@

import java.math.BigDecimal;
import java.sql.Date;
import java.sql.Time;
import java.sql.Timestamp;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.LocalTime;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -95,6 +97,7 @@ public class JdbcMysqlIT extends AbstractJdbcIT {
+ " `c_longtext` longtext,\n"
+ " `c_date` date DEFAULT NULL,\n"
+ " `c_datetime` datetime DEFAULT NULL,\n"
+ " `c_time` time DEFAULT NULL,\n"
+ " `c_timestamp` timestamp NULL DEFAULT NULL,\n"
+ " `c_tinyblob` tinyblob,\n"
+ " `c_mediumblob` mediumblob,\n"
@@ -187,6 +190,7 @@ Pair<String[], List<SeaTunnelRow>> initTestData() {
"c_longtext",
"c_date",
"c_datetime",
"c_time",
"c_timestamp",
"c_tinyblob",
"c_mediumblob",
@@ -244,6 +248,7 @@ Pair<String[], List<SeaTunnelRow>> initTestData() {
String.format("f1_%s", i),
Date.valueOf(LocalDate.now()),
Timestamp.valueOf(LocalDateTime.now()),
Time.valueOf(LocalTime.now()),
new Timestamp(System.currentTimeMillis()),
"test".getBytes(),
"test".getBytes(),
@@ -46,8 +46,8 @@ sink {
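The MySQL IT above now seeds the new c_time column via Time.valueOf(LocalTime.now()). A side note for anyone extending the test data: java.sql.Time only carries hour, minute, and second, so the fractional part of LocalTime.now() is truncated on the way in. A tiny standalone illustration (not part of the commit):

import java.sql.Time;
import java.time.LocalTime;

public class TimePrecisionSketch {
    public static void main(String[] args) {
        LocalTime now = LocalTime.now();         // e.g. 10:15:30.123456789
        Time sqlTime = Time.valueOf(now);        // stored as 10:15:30
        LocalTime back = sqlTime.toLocalTime();  // 10:15:30 -- the nanoseconds are gone
        System.out.println(now + " -> " + back);
    }
}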
c_mediumint, c_mediumint_unsigned, c_int, c_integer, c_bigint, c_bigint_unsigned,
c_decimal, c_decimal_unsigned, c_float, c_float_unsigned, c_double, c_double_unsigned,
c_char, c_tinytext, c_mediumtext, c_text, c_varchar, c_json, c_longtext, c_date,
c_datetime, c_timestamp, c_tinyblob, c_mediumblob, c_blob, c_longblob, c_varbinary,
c_datetime, c_time, c_timestamp, c_tinyblob, c_mediumblob, c_blob, c_longblob, c_varbinary,
c_binary, c_year, c_int_unsigned, c_integer_unsigned,c_bigint_30,c_decimal_unsigned_30,c_decimal_30)
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);"""
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);"""
}
}
}
@@ -49,8 +49,8 @@ sink {
c_mediumint, c_mediumint_unsigned, c_int, c_integer, c_bigint, c_bigint_unsigned,
c_decimal, c_decimal_unsigned, c_float, c_float_unsigned, c_double, c_double_unsigned,
c_char, c_tinytext, c_mediumtext, c_text, c_varchar, c_json, c_longtext, c_date,
c_datetime, c_timestamp, c_tinyblob, c_mediumblob, c_blob, c_longblob, c_varbinary,
c_datetime, c_time, c_timestamp, c_tinyblob, c_mediumblob, c_blob, c_longblob, c_varbinary,
c_binary, c_year, c_int_unsigned, c_integer_unsigned,c_bigint_30,c_decimal_unsigned_30,c_decimal_30)
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);"""
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);"""
}
}
}
@@ -50,8 +50,8 @@ sink {
c_mediumint, c_mediumint_unsigned, c_int, c_integer, c_bigint, c_bigint_unsigned,
c_decimal, c_decimal_unsigned, c_float, c_float_unsigned, c_double, c_double_unsigned,
c_char, c_tinytext, c_mediumtext, c_text, c_varchar, c_json, c_longtext, c_date,
c_datetime, c_timestamp, c_tinyblob, c_mediumblob, c_blob, c_longblob, c_varbinary,
c_datetime, c_time, c_timestamp, c_tinyblob, c_mediumblob, c_blob, c_longblob, c_varbinary,
c_binary, c_year, c_int_unsigned, c_integer_unsigned,c_bigint_30,c_decimal_unsigned_30,c_decimal_30)
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);"""
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);"""
}
}
}
@@ -46,9 +46,9 @@ sink {
c_mediumint, c_mediumint_unsigned, c_int, c_integer, c_bigint, c_bigint_unsigned,
c_decimal, c_decimal_unsigned, c_float, c_float_unsigned, c_double, c_double_unsigned,
c_char, c_tinytext, c_mediumtext, c_text, c_varchar, c_json, c_longtext, c_date,
c_datetime, c_timestamp, c_tinyblob, c_mediumblob, c_blob, c_longblob, c_varbinary,
c_datetime, c_time, c_timestamp, c_tinyblob, c_mediumblob, c_blob, c_longblob, c_varbinary,
c_binary, c_year, c_int_unsigned, c_integer_unsigned)
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);"""
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);"""

# Non-root users need to grant XA_RECOVER_ADMIN permission on is_exactly_once = "true"
is_exactly_once = "true"
@@ -57,4 +57,4 @@
max_commit_attempts = 3
transaction_timeout_sec = 86400
}
}
}