diff --git a/docs/en/connector-v2/sink/Iceberg.md b/docs/en/connector-v2/sink/Iceberg.md new file mode 100644 index 000000000000..0ca271a60279 --- /dev/null +++ b/docs/en/connector-v2/sink/Iceberg.md @@ -0,0 +1,184 @@ +# Apache Iceberg + +> Apache Iceberg sink connector + +## Support Iceberg Version + +- 1.4.2 + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Description + +Sink connector for Apache Iceberg. It supports CDC mode, automatic table creation, and table schema evolution. + +## Supported DataSource Info + +| Datasource | Dependent | Maven | +|------------|-----------|---------------------------------------------------------------------------| +| Iceberg | hive-exec | [Download](https://mvnrepository.com/artifact/org.apache.hive/hive-exec) | +| Iceberg | libfb303 | [Download](https://mvnrepository.com/artifact/org.apache.thrift/libfb303) | + +## Database Dependency + +> In order to be compatible with different versions of Hadoop and Hive, the scope of hive-exec in the project pom file is provided. If you use the Flink engine, you may first need to add the following Jar packages to the /lib directory; if you use the Spark engine integrated with Hadoop, you do not need to add them. + +``` +hive-exec-xxx.jar +libfb303-xxx.jar +``` + +> Some versions of the hive-exec package do not include libfb303-xxx.jar, so you may also need to import that Jar package manually. + +## Data Type Mapping + +| SeaTunnel Data type | Iceberg Data type | +|---------------------|-------------------| +| BOOLEAN | BOOLEAN | +| INT | INTEGER | +| BIGINT | LONG | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| DATE | DATE | +| TIME | TIME | +| TIMESTAMP | TIMESTAMP | +| STRING | STRING | +| BYTES | FIXED
BINARY | +| DECIMAL | DECIMAL | +| ROW | STRUCT | +| ARRAY | LIST | +| MAP | MAP | + +## Sink Options + +| Name | Type | Required | Default | Description | +|----------------------------------------|---------|----------|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| catalog_name | string | yes | default | User-specified catalog name. default is `default` | +| namespace | string | yes | default | The iceberg database name in the backend catalog. default is `default` | +| table | string | yes | - | The iceberg table name in the backend catalog. | +| iceberg.catalog.config | map | yes | - | Specify the properties for initializing the Iceberg catalog; the available properties are listed in 'https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/CatalogProperties.java' | +| hadoop.config | map | no | - | Properties passed through to the Hadoop configuration | +| iceberg.hadoop-conf-path | string | no | - | The specified loading paths for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files. | +| case_sensitive | boolean | no | false | If data columns were selected via schema [config], controls whether the match to the schema will be done with case sensitivity. | +| iceberg.table.write-props | map | no | - | Properties passed through to Iceberg writer initialization; these take precedence. Settings such as 'write.format.default' and 'write.target-file-size-bytes', along with the other available parameters, are listed in 'https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/TableProperties.java'. | +| iceberg.table.auto-create-enabled | boolean | no | false | Set to true to automatically create the destination table, false otherwise | +| iceberg.table.auto-create-props | map | no | - | Configuration applied by Iceberg during automatic table creation. 
| +| iceberg.table.schema-evolution-enabled | boolean | no | false | Setting to true enables Iceberg tables to support schema evolution during the synchronization process | +| iceberg.table.primary-keys | string | no | - | Default comma-separated list of columns that identify a row in tables (primary key) | +| iceberg.table.partition-keys | string | no | - | Default comma-separated list of partition fields to use when creating tables | +| iceberg.table.upsert-mode-enabled | boolean | no | false | Set to `true` to enable upsert mode, default is `false` | + +## Task Example + +### Simple: + +```hocon +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + MySQL-CDC { + result_table_name = "customers_mysql_cdc_iceberg" + server-id = 5652 + username = "st_user" + password = "seatunnel" + table-names = ["mysql_cdc.mysql_cdc_e2e_source_table"] + base-url = "jdbc:mysql://mysql_cdc_e2e:3306/mysql_cdc" + } +} + +transform { +} + +sink { + Iceberg { + catalog_name="seatunnel_test" + iceberg.catalog.config={ + "type"="hadoop" + "warehouse"="file:///tmp/seatunnel/iceberg/hadoop-sink/" + } + namespace="seatunnel_namespace" + table="iceberg_sink_table" + iceberg.table.write-props={ + write.format.default="parquet" + write.target-file-size-bytes=536870912 + } + iceberg.table.auto-create-enabled="true" + iceberg.table.primary-keys="id" + iceberg.table.partition-keys="f_datetime" + iceberg.table.upsert-mode-enabled=true + iceberg.table.schema-evolution-enabled=true + case_sensitive=true + } +} +``` + +### Hive Catalog: + +```hocon +sink { + Iceberg { + catalog_name="seatunnel_test" + iceberg.catalog.config={ + type = "hive" + uri = "thrift://localhost:9083" + warehouse = "hdfs://your_cluster//tmp/seatunnel/iceberg/" + } + namespace="seatunnel_namespace" + table="iceberg_sink_table" + iceberg.table.write-props={ + write.format.default="parquet" + write.target-file-size-bytes=536870912 + } + iceberg.table.auto-create-enabled="true" + iceberg.table.primary-keys="id" + iceberg.table.partition-keys="f_datetime" + iceberg.table.upsert-mode-enabled=true + iceberg.table.schema-evolution-enabled=true + case_sensitive=true + } +} +``` + +### Hadoop catalog: + +```hocon +sink { + Iceberg { + catalog_name="seatunnel_test" + iceberg.catalog.config={ + type = "hadoop" + warehouse = "hdfs://your_cluster/tmp/seatunnel/iceberg/" + } + namespace="seatunnel_namespace" + table="iceberg_sink_table" + iceberg.table.write-props={ + write.format.default="parquet" + write.target-file-size-bytes=536870912 + } + iceberg.table.auto-create-enabled="true" + iceberg.table.primary-keys="id" + iceberg.table.partition-keys="f_datetime" + iceberg.table.upsert-mode-enabled=true + iceberg.table.schema-evolution-enabled=true + case_sensitive=true + } +} + +``` + +## Changelog + +### 2.3.4-SNAPSHOT 2024-01-18 + +- Add Iceberg Sink Connector + +### next version + diff --git a/docs/en/connector-v2/source/Iceberg.md b/docs/en/connector-v2/source/Iceberg.md index a01833f066bb..8fb296467a88 100644 --- a/docs/en/connector-v2/source/Iceberg.md +++ b/docs/en/connector-v2/source/Iceberg.md @@ -4,7 +4,7 @@ ## Support Iceberg Version -- 0.14.0 +- 1.4.2 ## Support Those Engines @@ -34,18 +34,16 @@ Source connector for Apache Iceberg. It can support batch and stream mode. 
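Which of the two modes a job runs in is controlled by the job-level `job.mode` setting rather than by a connector option — a minimal sketch, assuming the standard SeaTunnel `env` block used in the full examples further below:

```hocon
env {
  # BATCH scans the table's current snapshot once and finishes;
  # STREAMING starts from a snapshot and keeps picking up newly committed snapshots.
  job.mode = "STREAMING"
}
```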
## Supported DataSource Info -| Datasource | Dependent | Maven | -|------------|---------------------|---------------------------------------------------------------------------| -| Iceberg | flink-shaded-hadoop | [Download](https://mvnrepository.com/search?q=flink-shaded-hadoop-) | -| Iceberg | hive-exec | [Download](https://mvnrepository.com/artifact/org.apache.hive/hive-exec) | -| Iceberg | libfb303 | [Download](https://mvnrepository.com/artifact/org.apache.thrift/libfb303) | +| Datasource | Dependent | Maven | +|------------|-----------|---------------------------------------------------------------------------| +| Iceberg | hive-exec | [Download](https://mvnrepository.com/artifact/org.apache.hive/hive-exec) | +| Iceberg | libfb303 | [Download](https://mvnrepository.com/artifact/org.apache.thrift/libfb303) | ## Database Dependency -> In order to be compatible with different versions of Hadoop and Hive, the scope of hive-exec and flink-shaded-hadoop-2 in the project pom file are provided, so if you use the Flink engine, first you may need to add the following Jar packages to /lib directory, if you are using the Spark engine and integrated with Hadoop, then you do not need to add the following Jar packages. +> In order to be compatible with different versions of Hadoop and Hive, the scope of hive-exec in the project pom file is provided. If you use the Flink engine, you may first need to add the following Jar packages to the /lib directory; if you use the Spark engine integrated with Hadoop, you do not need to add them. ``` -flink-shaded-hadoop-x-xxx.jar hive-exec-xxx.jar libfb303-xxx.jar ``` @@ -76,11 +74,11 @@ libfb303-xxx.jar | Name | Type | Required | Default | Description | |--------------------------|---------|----------|----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | catalog_name | string | yes | - | User-specified catalog name. | -| catalog_type | string | yes | - | The optional values are: hive(The hive metastore catalog),hadoop(The hadoop catalog) | -| uri | string | no | - | The Hive metastore’s thrift URI. | -| warehouse | string | yes | - | The location to store metadata files and data files. | | namespace | string | yes | - | The iceberg database name in the backend catalog. | | table | string | yes | - | The iceberg table name in the backend catalog. | +| iceberg.catalog.config | map | yes | - | Specify the properties for initializing the Iceberg catalog; the available properties are listed in 'https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/CatalogProperties.java' | +| hadoop.config | map | no | - | Properties passed through to the Hadoop configuration | +| iceberg.hadoop-conf-path | string | no | - | The specified loading paths for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files. | | schema | config | no | - | Use projection to select data columns and columns order. 
| case_sensitive | boolean | no | false | If data columns were selected via schema [config], controls whether the match to the schema will be done with case sensitivity. | | start_snapshot_timestamp | long | no | - | Instructs this scan to look for changes starting from the most recent snapshot for the table as of the timestamp.
timestamp – the timestamp in millis since the Unix epoch | @@ -123,8 +121,10 @@ source { } } catalog_name = "seatunnel" - catalog_type = "hadoop" - warehouse = "file:///tmp/seatunnel/iceberg/hadoop/" + iceberg.catalog.config={ + type = "hadoop" + warehouse = "file:///tmp/seatunnel/iceberg/hadoop/" + } namespace = "database1" table = "source" result_table_name = "iceberg" @@ -147,9 +147,13 @@ sink { source { Iceberg { catalog_name = "seatunnel" + iceberg.catalog.config={ + type = "hive" + uri = "thrift://localhost:9083" + warehouse = "hdfs://your_cluster//tmp/seatunnel/iceberg/" + } catalog_type = "hive" - uri = "thrift://localhost:9083" - warehouse = "hdfs://your_cluster//tmp/seatunnel/iceberg/" + namespace = "your_iceberg_database" table = "your_iceberg_table" } @@ -162,8 +166,10 @@ source { source { Iceberg { catalog_name = "seatunnel" - catalog_type = "hadoop" - warehouse = "hdfs://your_cluster/tmp/seatunnel/iceberg/" + iceberg.catalog.config={ + type = "hadoop" + warehouse = "hdfs://your_cluster/tmp/seatunnel/iceberg/" + } namespace = "your_iceberg_database" table = "your_iceberg_table" diff --git a/plugin-mapping.properties b/plugin-mapping.properties index c5ccae20a222..9e3769550556 100644 --- a/plugin-mapping.properties +++ b/plugin-mapping.properties @@ -72,6 +72,7 @@ seatunnel.sink.Sentry = connector-sentry seatunnel.source.MongoDB = connector-mongodb seatunnel.sink.MongoDB = connector-mongodb seatunnel.source.Iceberg = connector-iceberg +seatunnel.sink.Iceberg = connector-iceberg seatunnel.source.InfluxDB = connector-influxdb seatunnel.source.S3File = connector-file-s3 seatunnel.sink.S3File = connector-file-s3 diff --git a/pom.xml b/pom.xml index 274fe05c7eba..3c9865273fb1 100644 --- a/pom.xml +++ b/pom.xml @@ -70,7 +70,6 @@ 1.2.17 1.2.3 1.2 - 0.13.1 1.13.6 1.15.3 2.4.0 @@ -87,6 +86,7 @@ 2.13.3 1.18.24 1.20 + 1.11.1 false false false @@ -105,7 +105,6 @@ 2.11.0 4.4 3.3.0 - 1.20 1.8.0 provided provided diff --git a/release-note.md b/release-note.md index 831018d27329..ac84ca90aca5 100644 --- a/release-note.md +++ b/release-note.md @@ -187,6 +187,7 @@ - [Transform-V2] Add support CatalogTable for FilterFieldTransform (#4422) - [Transform-V2] Add catalog support for SQL Transform plugin (#4819) - [Connector-V2] [Assert] Support check the precision and scale of Decimal type (#6110) +- [Connector-V2] [Iceberg] Support iceberg sink #6198 ### Zeta(ST-Engine) diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/pom.xml b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/pom.xml index a4f082a3eae2..d9947ec324cc 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/pom.xml +++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/pom.xml @@ -69,8 +69,18 @@ org.glassfish.jersey.core * + + + com.github.luben + zstd-jni + + + com.github.luben + zstd-jni + 1.5.5-5 + @@ -84,6 +94,10 @@ io.debezium debezium-embedded + + com.github.luben + zstd-jni + com.zaxxer HikariCP diff --git a/seatunnel-connectors-v2/connector-iceberg/pom.xml b/seatunnel-connectors-v2/connector-iceberg/pom.xml index d1689e8ef9e2..309900b09c77 100644 --- a/seatunnel-connectors-v2/connector-iceberg/pom.xml +++ b/seatunnel-connectors-v2/connector-iceberg/pom.xml @@ -30,12 +30,23 @@ SeaTunnel : Connectors V2 : Iceberg - 0.14.0 - 1.12.3 + 1.4.2 + 1.13.1 1.11.3 2.3.9 + connector-iceberg + + + + com.github.luben + zstd-jni + 1.5.5-5 + + + + org.apache.seatunnel @@ -48,6 +59,13 @@ iceberg-core ${iceberg.version} + + + org.apache.iceberg + iceberg-common + ${iceberg.version} + + 
org.apache.iceberg iceberg-api @@ -119,13 +137,15 @@ - org.apache.flink - flink-shaded-hadoop-2 + org.apache.seatunnel + seatunnel-hadoop3-3.1.4-uber + ${project.version} + optional provided org.apache.avro - * + avro @@ -186,10 +206,50 @@ junit junit - ${junit4.version} + 4.13.2 test + + + + org.apache.maven.plugins + maven-shade-plugin + + + + shade + + package + + + + org.apache.avro + + ${seatunnel.shade.package}.${connector.name}.org.apache.avro + + + org.apache.orc + ${seatunnel.shade.package}.${connector.name}.org.apache.orc + + + org.apache.parquet + + ${seatunnel.shade.package}.${connector.name}.org.apache.parquet + + + shaded.parquet + + ${seatunnel.shade.package}.${connector.name}.shaded.parquet + + + + + + + + + diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/IcebergCatalogFactory.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/IcebergCatalogFactory.java index 83a406897657..554099ef7b8c 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/IcebergCatalogFactory.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/IcebergCatalogFactory.java @@ -17,72 +17,108 @@ package org.apache.seatunnel.connectors.seatunnel.iceberg; -import org.apache.seatunnel.common.exception.CommonErrorCodeDeprecated; -import org.apache.seatunnel.connectors.seatunnel.iceberg.config.IcebergCatalogType; -import org.apache.seatunnel.connectors.seatunnel.iceberg.exception.IcebergConnectorException; +import org.apache.seatunnel.shade.com.google.common.collect.ImmutableList; + +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.CommonConfig; -import org.apache.hadoop.conf.Configuration; -import org.apache.iceberg.CatalogProperties; import org.apache.iceberg.CatalogUtil; import org.apache.iceberg.catalog.Catalog; -import org.apache.iceberg.hadoop.HadoopCatalog; -import org.apache.iceberg.hadoop.SerializableConfiguration; -import org.apache.iceberg.hive.HiveCatalog; +import org.apache.iceberg.common.DynClasses; +import org.apache.iceberg.common.DynMethods; -import lombok.NonNull; +import lombok.extern.slf4j.Slf4j; +import java.io.IOException; import java.io.Serializable; -import java.util.HashMap; -import java.util.Map; +import java.lang.reflect.InvocationTargetException; +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.List; +@Slf4j public class IcebergCatalogFactory implements Serializable { private static final long serialVersionUID = -6003040601422350869L; + private static final List HADOOP_CONF_FILES = + ImmutableList.of("core-site.xml", "hdfs-site.xml", "hive-site.xml"); + private CommonConfig config; - private final String catalogName; - private final IcebergCatalogType catalogType; - private final String warehouse; - private final String uri; + public IcebergCatalogFactory(CommonConfig config) { + this.config = config; + } - public IcebergCatalogFactory( - @NonNull String catalogName, - @NonNull IcebergCatalogType catalogType, - @NonNull String warehouse, - String uri) { - this.catalogName = catalogName; - this.catalogType = catalogType; - this.warehouse = warehouse; - this.uri = uri; + public Catalog loadCatalog() { + // When using the seatunel engine, set the current class loader to prevent loading failures + 
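+ // The catalog implementation and its Hadoop/file-system dependencies are resolved + // reflectively at runtime; engines that isolate each connector in its own classloader + // (such as SeaTunnel Zeta) would otherwise fail that lookup, so the thread context + // classloader is pointed at this connector's classloader before the catalog is built.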
Thread.currentThread().setContextClassLoader(IcebergCatalogFactory.class.getClassLoader()); + return CatalogUtil.buildIcebergCatalog( + config.getCatalogName(), config.getCatalogProps(), loadHadoopConfig(config)); } - public Catalog create() { - Configuration conf = new Configuration(); - SerializableConfiguration serializableConf = new SerializableConfiguration(conf); - Map properties = new HashMap<>(); - properties.put(CatalogProperties.WAREHOUSE_LOCATION, warehouse); + /** + * Loading Hadoop configuration through reflection + * + * @param config + * @return + */ + private Object loadHadoopConfig(CommonConfig config) { + Class configClass = + DynClasses.builder() + .impl("org.apache.hadoop.hdfs.HdfsConfiguration") + .orNull() + .build(); + if (configClass == null) { + configClass = + DynClasses.builder() + .impl("org.apache.hadoop.conf.Configuration") + .orNull() + .build(); + } - switch (catalogType) { - case HADOOP: - return hadoop(catalogName, serializableConf, properties); - case HIVE: - properties.put(CatalogProperties.URI, uri); - return hive(catalogName, serializableConf, properties); - default: - throw new IcebergConnectorException( - CommonErrorCodeDeprecated.UNSUPPORTED_OPERATION, - String.format("Unsupported catalogType: %s", catalogType)); + if (configClass == null) { + log.info("Hadoop not found on classpath, not creating Hadoop config"); + return null; } - } - private static Catalog hadoop( - String catalogName, SerializableConfiguration conf, Map properties) { - return CatalogUtil.loadCatalog( - HadoopCatalog.class.getName(), catalogName, properties, conf.get()); - } + try { + Object result = configClass.getDeclaredConstructor().newInstance(); + DynMethods.BoundMethod addResourceMethod = + DynMethods.builder("addResource").impl(configClass, URL.class).build(result); + DynMethods.BoundMethod setMethod = + DynMethods.builder("set") + .impl(configClass, String.class, String.class) + .build(result); - private static Catalog hive( - String catalogName, SerializableConfiguration conf, Map properties) { - return CatalogUtil.loadCatalog( - HiveCatalog.class.getName(), catalogName, properties, conf.get()); + // load any config files in the specified config directory + String hadoopConfPath = config.getHadoopConfPath(); + if (hadoopConfPath != null) { + HADOOP_CONF_FILES.forEach( + confFile -> { + Path path = Paths.get(hadoopConfPath, confFile); + if (Files.exists(path)) { + try { + addResourceMethod.invoke(path.toUri().toURL()); + } catch (IOException e) { + log.warn( + "Error adding Hadoop resource {}, resource was not added", + path, + e); + } + } + }); + } + config.getHadoopProps().forEach(setMethod::invoke); + log.info("Hadoop config initialized: {}", configClass.getName()); + return result; + } catch (InstantiationException + | IllegalAccessException + | NoSuchMethodException + | InvocationTargetException e) { + log.warn( + "Hadoop found on classpath but could not create config, proceeding without config", + e); + } + return null; } } diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/IcebergTableLoader.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/IcebergTableLoader.java index 554dd0bd7599..591c2450d0fa 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/IcebergTableLoader.java +++ 
b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/IcebergTableLoader.java @@ -17,7 +17,7 @@ package org.apache.seatunnel.connectors.seatunnel.iceberg; -import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.CommonConfig; import org.apache.iceberg.CachingCatalog; import org.apache.iceberg.Table; @@ -31,16 +31,13 @@ import java.io.IOException; import java.io.Serializable; -import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; - public class IcebergTableLoader implements Closeable, Serializable { private static final long serialVersionUID = 9061073826700804273L; private final IcebergCatalogFactory icebergCatalogFactory; private final String tableIdentifierStr; - - private Catalog catalog; + private transient Catalog catalog; public IcebergTableLoader( @NonNull IcebergCatalogFactory icebergCatalogFactory, @@ -49,14 +46,24 @@ public IcebergTableLoader( this.tableIdentifierStr = tableIdentifier.toString(); } - public void open() { - catalog = CachingCatalog.wrap(icebergCatalogFactory.create()); + public Catalog getCatalog() { + return catalog; + } + + public TableIdentifier getTableIdentifier() { + return TableIdentifier.parse(tableIdentifierStr); + } + + public IcebergTableLoader open() { + catalog = CachingCatalog.wrap(icebergCatalogFactory.loadCatalog()); + return this; } public Table loadTable() { TableIdentifier tableIdentifier = TableIdentifier.parse(tableIdentifierStr); - checkArgument( - catalog.tableExists(tableIdentifier), "Illegal source table: " + tableIdentifier); + if (catalog == null) { + open(); + } return catalog.loadTable(tableIdentifier); } @@ -67,16 +74,10 @@ public void close() throws IOException { } } - public static IcebergTableLoader create(SourceConfig sourceConfig) { - IcebergCatalogFactory catalogFactory = - new IcebergCatalogFactory( - sourceConfig.getCatalogName(), - sourceConfig.getCatalogType(), - sourceConfig.getWarehouse(), - sourceConfig.getUri()); + public static IcebergTableLoader create(CommonConfig config) { + IcebergCatalogFactory catalogFactory = new IcebergCatalogFactory(config); return new IcebergTableLoader( catalogFactory, - TableIdentifier.of( - Namespace.of(sourceConfig.getNamespace()), sourceConfig.getTable())); + TableIdentifier.of(Namespace.of(config.getNamespace()), config.getTable())); } } diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/CommonConfig.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/CommonConfig.java index 2f893da092bf..a7503e6e3062 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/CommonConfig.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/CommonConfig.java @@ -17,19 +17,18 @@ package org.apache.seatunnel.connectors.seatunnel.iceberg.config; -import org.apache.seatunnel.shade.com.typesafe.config.Config; - import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.Options; +import org.apache.seatunnel.api.configuration.ReadonlyConfig; +import org.apache.seatunnel.common.config.ConfigRuntimeException; import lombok.Getter; import lombok.ToString; import java.io.Serializable; +import java.util.HashMap; +import 
java.util.Map; -import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.IcebergCatalogType.HADOOP; -import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.IcebergCatalogType.HIVE; -import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkNotNull; @Getter @@ -40,19 +39,13 @@ public class CommonConfig implements Serializable { public static final Option KEY_CATALOG_NAME = Options.key("catalog_name") .stringType() - .noDefaultValue() + .defaultValue("default") .withDescription(" the iceberg catalog name"); - public static final Option KEY_CATALOG_TYPE = - Options.key("catalog_type") - .enumType(IcebergCatalogType.class) - .noDefaultValue() - .withDescription(" the iceberg catalog type"); - public static final Option KEY_NAMESPACE = Options.key("namespace") .stringType() - .noDefaultValue() + .defaultValue("default") .withDescription(" the iceberg namespace"); public static final Option KEY_TABLE = @@ -61,17 +54,25 @@ public class CommonConfig implements Serializable { .noDefaultValue() .withDescription(" the iceberg table"); - public static final Option KEY_URI = - Options.key("uri") - .stringType() + public static final Option> CATALOG_PROPS = + Options.key("iceberg.catalog.config") + .mapType() .noDefaultValue() - .withDescription(" the iceberg server uri"); + .withDescription( + "Specify the properties for initializing the Iceberg catalog, which can be referenced in this file:'https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/CatalogProperties.java'"); - public static final Option KEY_WAREHOUSE = - Options.key("warehouse") + public static final Option> HADOOP_PROPS = + Options.key("hadoop.config") + .mapType() + .defaultValue(new HashMap<>()) + .withDescription("Properties passed through to the Hadoop configuration"); + + public static final Option HADOOP_CONF_PATH_PROP = + Options.key("iceberg.hadoop-conf-path") .stringType() - .noDefaultValue() - .withDescription(" the iceberg warehouse"); + .defaultValue(null) + .withDescription( + "The specified loading paths for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files."); public static final Option KEY_CASE_SENSITIVE = Options.key("case_sensitive") @@ -80,35 +81,39 @@ public class CommonConfig implements Serializable { .withDescription(" the iceberg case_sensitive"); private String catalogName; - private IcebergCatalogType catalogType; - private String uri; - private String warehouse; private String namespace; private String table; private boolean caseSensitive; - public CommonConfig(Config pluginConfig) { - String catalogType = checkArgumentNotNull(pluginConfig.getString(KEY_CATALOG_TYPE.key())); - checkArgument( - HADOOP.getType().equals(catalogType) || HIVE.getType().equals(catalogType), - "Illegal catalogType: " + catalogType); - - this.catalogType = IcebergCatalogType.valueOf(catalogType.toUpperCase()); - this.catalogName = checkArgumentNotNull(pluginConfig.getString(KEY_CATALOG_NAME.key())); - if (pluginConfig.hasPath(KEY_URI.key())) { - this.uri = checkArgumentNotNull(pluginConfig.getString(KEY_URI.key())); - } - this.warehouse = checkArgumentNotNull(pluginConfig.getString(KEY_WAREHOUSE.key())); - this.namespace = checkArgumentNotNull(pluginConfig.getString(KEY_NAMESPACE.key())); - this.table = checkArgumentNotNull(pluginConfig.getString(KEY_TABLE.key())); - - if (pluginConfig.hasPath(KEY_CASE_SENSITIVE.key())) { - this.caseSensitive = 
pluginConfig.getBoolean(KEY_CASE_SENSITIVE.key()); + private Map catalogProps; + private Map hadoopProps; + private String hadoopConfPath; + + public CommonConfig(ReadonlyConfig pluginConfig) { + this.catalogName = checkArgumentNotNull(pluginConfig.get(KEY_CATALOG_NAME)); + this.namespace = pluginConfig.get(KEY_NAMESPACE); + this.table = pluginConfig.get(KEY_TABLE); + this.catalogProps = pluginConfig.get(CATALOG_PROPS); + this.hadoopProps = pluginConfig.get(HADOOP_PROPS); + this.hadoopConfPath = pluginConfig.get(HADOOP_CONF_PATH_PROP); + if (pluginConfig.toConfig().hasPath(KEY_CASE_SENSITIVE.key())) { + this.caseSensitive = pluginConfig.get(KEY_CASE_SENSITIVE); } + validate(); } protected T checkArgumentNotNull(T argument) { checkNotNull(argument); return argument; } + + private void validate() { + checkState(!catalogProps.isEmpty(), "Must specify iceberg catalog config"); + } + + private void checkState(boolean condition, String msg) { + if (!condition) { + throw new ConfigRuntimeException(msg); + } + } } diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/SinkConfig.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/SinkConfig.java new file mode 100644 index 000000000000..deff7db2e180 --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/SinkConfig.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.config; + +import org.apache.seatunnel.shade.com.google.common.annotations.VisibleForTesting; +import org.apache.seatunnel.shade.com.google.common.collect.ImmutableList; +import org.apache.seatunnel.shade.com.google.common.collect.Maps; + +import org.apache.seatunnel.api.configuration.Option; +import org.apache.seatunnel.api.configuration.Options; +import org.apache.seatunnel.api.configuration.ReadonlyConfig; + +import lombok.Getter; +import lombok.ToString; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static java.util.stream.Collectors.toList; + +@Getter +@ToString +public class SinkConfig extends CommonConfig { + + public static final int SCHEMA_UPDATE_RETRIES = 2; // 3 total attempts + public static final int CREATE_TABLE_RETRIES = 2; // 3 total attempts + + private static final String ID_COLUMNS = "id-columns"; + private static final String PARTITION_BY = "partition-by"; + + public static final Option> TABLE_PROPS = + Options.key("iceberg.table.config") + .mapType() + .defaultValue(new HashMap<>()) + .withDescription("Iceberg table configs"); + + public static final Option> WRITE_PROPS = + Options.key("iceberg.table.write-props") + .mapType() + .defaultValue(new HashMap<>()) + .withDescription( + "Properties passed through to Iceberg writer initialization, these take precedence, such as 'write.format.default', 'write.target-file-size-bytes', and other settings, can be found with specific parameters at 'https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/TableProperties.java'."); + + public static final Option TABLES_AUTO_CREATE_ENABLED = + Options.key("iceberg.table.auto-create-enabled") + .booleanType() + .defaultValue(false) + .withDescription( + "Set to true to automatically create destination table, false otherwise"); + + public static final Option> AUTO_CREATE_PROPS = + Options.key("iceberg.table.auto-create-props") + .mapType() + .defaultValue(new HashMap<>()) + .withDescription( + "Configuration specified by Iceberg during automatic table creation."); + + public static final Option TABLE_SCHEMA_EVOLUTION_ENABLED_PROP = + Options.key("iceberg.table.schema-evolution-enabled") + .booleanType() + .defaultValue(false) + .withDescription( + "Setting to true enables Iceberg tables to support schema evolution during the synchronization process"); + + public static final Option TABLE_PRIMARY_KEYS = + Options.key("iceberg.table.primary-keys") + .stringType() + .noDefaultValue() + .withDescription( + "Default comma-separated list of columns that identify a row in tables (primary key)"); + + public static final Option TABLE_DEFAULT_PARTITION_KEYS = + Options.key("iceberg.table.partition-keys") + .stringType() + .noDefaultValue() + .withDescription( + "Default comma-separated list of partition fields to use when creating tables."); + + public static final Option TABLE_UPSERT_MODE_ENABLED_PROP = + Options.key("iceberg.table.upsert-mode-enabled") + .booleanType() + .defaultValue(false) + .withDescription("Set to `true` to enable upsert mode, default is `false`"); + + @VisibleForTesting private static final String COMMA_NO_PARENS_REGEX = ",(?![^()]*+\\))"; + + private final ReadonlyConfig readonlyConfig; + private Map autoCreateProps; + private Map writeProps; + private Map tableConfigs = Maps.newHashMap(); + private String primaryKeys; + private String partitionKeys; + + private boolean upsertModeEnabled; + private boolean 
autoCreateEnabled; + private boolean tableSchemaEvolutionEnabled; + + public SinkConfig(ReadonlyConfig readonlyConfig) { + super(readonlyConfig); + this.readonlyConfig = readonlyConfig; + this.autoCreateProps = readonlyConfig.get(AUTO_CREATE_PROPS); + this.writeProps = readonlyConfig.get(WRITE_PROPS); + this.primaryKeys = readonlyConfig.get(TABLE_PRIMARY_KEYS); + this.partitionKeys = readonlyConfig.get(TABLE_DEFAULT_PARTITION_KEYS); + this.upsertModeEnabled = readonlyConfig.get(TABLE_UPSERT_MODE_ENABLED_PROP); + this.autoCreateEnabled = readonlyConfig.get(TABLES_AUTO_CREATE_ENABLED); + this.tableSchemaEvolutionEnabled = readonlyConfig.get(TABLE_SCHEMA_EVOLUTION_ENABLED_PROP); + } + + public TableSinkConfig tableConfig(String tableName) { + return tableConfigs.computeIfAbsent( + tableName, + notUsed -> { + Map tableConfig = readonlyConfig.get(TABLE_PROPS); + // define id columns + String idColumnsStr = tableConfig.getOrDefault(ID_COLUMNS, primaryKeys); + List idColumns = stringToList(idColumnsStr, ","); + // define partition columns + String partitionByStr = tableConfig.getOrDefault(PARTITION_BY, partitionKeys); + List partitionBy = stringToList(partitionByStr, COMMA_NO_PARENS_REGEX); + return new TableSinkConfig(idColumns, partitionBy); + }); + } + + @VisibleForTesting + static List stringToList(String value, String regex) { + if (value == null || value.isEmpty()) { + return ImmutableList.of(); + } + return Arrays.stream(value.split(regex)).map(String::trim).collect(toList()); + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/SourceConfig.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/SourceConfig.java index fc9394d07794..56d0a4509df9 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/SourceConfig.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/SourceConfig.java @@ -23,6 +23,7 @@ import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.Options; +import org.apache.seatunnel.api.configuration.ReadonlyConfig; import org.apache.seatunnel.connectors.seatunnel.iceberg.source.enumerator.scan.IcebergStreamScanStrategy; import org.apache.iceberg.expressions.Expression; @@ -86,8 +87,9 @@ public class SourceConfig extends CommonConfig { private Integer splitLookback; private Long splitOpenFileCost; - public SourceConfig(Config pluginConfig) { - super(pluginConfig); + public SourceConfig(ReadonlyConfig readonlyConfig) { + super(readonlyConfig); + Config pluginConfig = readonlyConfig.toConfig(); if (pluginConfig.hasPath(KEY_START_SNAPSHOT_TIMESTAMP.key())) { this.startSnapshotTimestamp = pluginConfig.getLong(KEY_START_SNAPSHOT_TIMESTAMP.key()); } @@ -110,7 +112,7 @@ public SourceConfig(Config pluginConfig) { } } - public static SourceConfig loadConfig(Config pluginConfig) { + public static SourceConfig loadConfig(ReadonlyConfig pluginConfig) { return new SourceConfig(pluginConfig); } } diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/TableSinkConfig.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/TableSinkConfig.java new file mode 100644 index 000000000000..6b0a26361159 --- /dev/null +++ 
b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/TableSinkConfig.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.config; + +import java.util.List; + +public class TableSinkConfig { + private final List idColumns; + private final List partitionBy; + + public TableSinkConfig(List idColumns, List partitionBy) { + this.idColumns = idColumns; + this.partitionBy = partitionBy; + } + + public List idColumns() { + return idColumns; + } + + public List partitionBy() { + return partitionBy; + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/IcebergSink.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/IcebergSink.java new file mode 100644 index 000000000000..b0322799194c --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/IcebergSink.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink; + +import org.apache.seatunnel.api.configuration.ReadonlyConfig; +import org.apache.seatunnel.api.serialization.DefaultSerializer; +import org.apache.seatunnel.api.serialization.Serializer; +import org.apache.seatunnel.api.sink.SeaTunnelSink; +import org.apache.seatunnel.api.sink.SinkAggregatedCommitter; +import org.apache.seatunnel.api.sink.SinkWriter; +import org.apache.seatunnel.api.table.type.SeaTunnelRow; +import org.apache.seatunnel.api.table.type.SeaTunnelRowType; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SinkConfig; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.commit.IcebergAggregatedCommitInfo; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.commit.IcebergAggregatedCommitter; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.commit.IcebergCommitInfo; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.state.IcebergSinkState; + +import com.google.auto.service.AutoService; + +import java.io.IOException; +import java.util.List; +import java.util.Optional; + +@AutoService(SeaTunnelSink.class) +public class IcebergSink + implements SeaTunnelSink< + SeaTunnelRow, IcebergSinkState, IcebergCommitInfo, IcebergAggregatedCommitInfo> { + private static String PLUGIN_NAME = "Iceberg"; + private SinkConfig config; + private SeaTunnelRowType seaTunnelRowType; + + public IcebergSink(ReadonlyConfig pluginConfig, SeaTunnelRowType rowType) { + this.config = new SinkConfig(pluginConfig); + this.seaTunnelRowType = rowType; + } + + @Override + public String getPluginName() { + return PLUGIN_NAME; + } + + @Override + public void setTypeInfo(SeaTunnelRowType seaTunnelRowType) { + this.seaTunnelRowType = seaTunnelRowType; + } + + @Override + public SinkWriter createWriter( + SinkWriter.Context context) throws IOException { + return IcebergSinkWriter.of(config, seaTunnelRowType); + } + + @Override + public SinkWriter restoreWriter( + SinkWriter.Context context, List states) throws IOException { + return IcebergSinkWriter.of(config, seaTunnelRowType, states); + } + + @Override + public Optional> + createAggregatedCommitter() throws IOException { + return Optional.of(new IcebergAggregatedCommitter(config)); + } + + @Override + public Optional> getAggregatedCommitInfoSerializer() { + return Optional.of(new DefaultSerializer<>()); + } + + @Override + public Optional> getCommitInfoSerializer() { + return Optional.of(new DefaultSerializer<>()); + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/IcebergSinkFactory.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/IcebergSinkFactory.java new file mode 100644 index 000000000000..389dff0b15a5 --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/IcebergSinkFactory.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink; + +import org.apache.seatunnel.api.configuration.util.OptionRule; +import org.apache.seatunnel.api.table.connector.TableSink; +import org.apache.seatunnel.api.table.factory.Factory; +import org.apache.seatunnel.api.table.factory.TableSinkFactory; +import org.apache.seatunnel.api.table.factory.TableSinkFactoryContext; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.CommonConfig; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SinkConfig; + +import com.google.auto.service.AutoService; + +@AutoService(Factory.class) +public class IcebergSinkFactory implements TableSinkFactory { + + @Override + public String factoryIdentifier() { + return "Iceberg"; + } + + @Override + public OptionRule optionRule() { + return OptionRule.builder() + .required( + CommonConfig.KEY_CATALOG_NAME, + SinkConfig.KEY_NAMESPACE, + SinkConfig.KEY_TABLE, + SinkConfig.CATALOG_PROPS) + .optional( + SinkConfig.TABLE_PROPS, + SinkConfig.HADOOP_PROPS, + SinkConfig.WRITE_PROPS, + SinkConfig.TABLES_AUTO_CREATE_ENABLED, + SinkConfig.AUTO_CREATE_PROPS, + SinkConfig.TABLE_PRIMARY_KEYS, + SinkConfig.TABLE_DEFAULT_PARTITION_KEYS, + SinkConfig.TABLE_UPSERT_MODE_ENABLED_PROP, + SinkConfig.TABLE_SCHEMA_EVOLUTION_ENABLED_PROP) + .build(); + } + + @Override + public TableSink createSink(TableSinkFactoryContext context) { + return () -> + new IcebergSink( + context.getOptions(), + context.getCatalogTable().getTableSchema().toPhysicalRowDataType()); + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/IcebergSinkWriter.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/IcebergSinkWriter.java new file mode 100644 index 000000000000..68b868d143af --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/IcebergSinkWriter.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink; + +import org.apache.seatunnel.api.sink.SinkWriter; +import org.apache.seatunnel.api.table.event.SchemaChangeEvent; +import org.apache.seatunnel.api.table.event.handler.DataTypeChangeEventDispatcher; +import org.apache.seatunnel.api.table.event.handler.DataTypeChangeEventHandler; +import org.apache.seatunnel.api.table.type.SeaTunnelRow; +import org.apache.seatunnel.api.table.type.SeaTunnelRowType; +import org.apache.seatunnel.connectors.seatunnel.iceberg.IcebergTableLoader; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SinkConfig; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.commit.IcebergCommitInfo; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.commit.IcebergFilesCommitter; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.state.IcebergSinkState; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.IcebergWriterFactory; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.RecordWriter; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.WriteResult; + +import org.apache.commons.lang3.StringUtils; + +import com.google.common.collect.Lists; +import lombok.extern.slf4j.Slf4j; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.Optional; +import java.util.UUID; + +/** Iceberg sink writer */ +@Slf4j +public class IcebergSinkWriter + implements SinkWriter { + private SeaTunnelRowType rowType; + private SinkConfig config; + private IcebergTableLoader icebergTableLoader; + private IcebergWriterFactory icebergWriterFactory; + private RecordWriter writer; + private IcebergFilesCommitter filesCommitter; + private List results = Lists.newArrayList(); + private String commitUser = UUID.randomUUID().toString(); + private long checkpointId; + + private final DataTypeChangeEventHandler dataTypeChangeEventHandler; + + public IcebergSinkWriter( + IcebergTableLoader icebergTableLoader, + SinkConfig config, + SeaTunnelRowType seaTunnelRowType, + List states) { + this.config = config; + this.icebergTableLoader = icebergTableLoader; + this.rowType = seaTunnelRowType; + this.filesCommitter = IcebergFilesCommitter.of(config, icebergTableLoader); + this.dataTypeChangeEventHandler = new DataTypeChangeEventDispatcher(); + createIcebergWriterFactory(); + if (Objects.nonNull(states) && !states.isEmpty()) { + this.commitUser = states.get(0).getCommitUser(); + this.checkpointId = states.get(0).getCheckpointId(); + preCommit(states); + } + } + + private void preCommit(List states) { + states.forEach( + icebergSinkState -> { + filesCommitter.doCommit(icebergSinkState.getWriteResults()); + }); + } + + private void createIcebergWriterFactory() { + this.icebergWriterFactory = new IcebergWriterFactory(icebergTableLoader, config); + this.writer = this.icebergWriterFactory.createWriter(rowType); + } + + public static IcebergSinkWriter of(SinkConfig config, SeaTunnelRowType seaTunnelRowType) { + return of(config, seaTunnelRowType, null); + } + + public static IcebergSinkWriter of( + SinkConfig config, SeaTunnelRowType seaTunnelRowType, List states) { + IcebergTableLoader icebergTableLoader = IcebergTableLoader.create(config).open(); + return new IcebergSinkWriter(icebergTableLoader, config, seaTunnelRowType, states); + } + + @Override + public void write(SeaTunnelRow element) throws IOException { + writer.write(element, rowType); + } + + @Override + public 
Optional prepareCommit() throws IOException { + List writeResults = writer.complete(); + IcebergCommitInfo icebergCommitInfo = new IcebergCommitInfo(writeResults); + this.results.addAll(writeResults); + return Optional.of(icebergCommitInfo); + } + + @Override + public void applySchemaChange(SchemaChangeEvent event) throws IOException { + log.info("changed rowType before: {}", fieldsInfo(rowType)); + this.rowType = dataTypeChangeEventHandler.reset(rowType).apply(event); + log.info("changed rowType after: {}", fieldsInfo(rowType)); + } + + @Override + public List snapshotState(long checkpointId) throws IOException { + IcebergSinkState icebergSinkState = new IcebergSinkState(results, commitUser, checkpointId); + results.clear(); + return Collections.singletonList(icebergSinkState); + } + + @Override + public void abortPrepare() {} + + @Override + public void close() throws IOException {} + + private String fieldsInfo(SeaTunnelRowType seaTunnelRowType) { + String[] fieldsInfo = new String[seaTunnelRowType.getTotalFields()]; + for (int i = 0; i < seaTunnelRowType.getTotalFields(); i++) { + fieldsInfo[i] = + String.format( + "%s<%s>", + seaTunnelRowType.getFieldName(i), seaTunnelRowType.getFieldType(i)); + } + return StringUtils.join(fieldsInfo, ", "); + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/commit/IcebergAggregatedCommitInfo.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/commit/IcebergAggregatedCommitInfo.java new file mode 100644 index 000000000000..5ba75d3d3604 --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/commit/IcebergAggregatedCommitInfo.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.commit; + +import lombok.AllArgsConstructor; +import lombok.Data; + +import java.io.Serializable; +import java.util.List; + +@Data +@AllArgsConstructor +public class IcebergAggregatedCommitInfo implements Serializable { + List commitInfos; +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/commit/IcebergAggregatedCommitter.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/commit/IcebergAggregatedCommitter.java new file mode 100644 index 000000000000..a23e6a6b1e84 --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/commit/IcebergAggregatedCommitter.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.commit; + +import org.apache.seatunnel.api.sink.SinkAggregatedCommitter; +import org.apache.seatunnel.connectors.seatunnel.iceberg.IcebergTableLoader; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SinkConfig; + +import lombok.extern.slf4j.Slf4j; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; + +/** Iceberg aggregated committer */ +@Slf4j +public class IcebergAggregatedCommitter + implements SinkAggregatedCommitter { + + private final IcebergFilesCommitter filesCommitter; + + public IcebergAggregatedCommitter(SinkConfig config) { + IcebergTableLoader tableLoader = IcebergTableLoader.create(config).open(); + this.filesCommitter = IcebergFilesCommitter.of(config, tableLoader); + } + + @Override + public List commit( + List aggregatedCommitInfo) throws IOException { + for (IcebergAggregatedCommitInfo commitInfo : aggregatedCommitInfo) { + commitFiles(commitInfo.commitInfos); + } + return Collections.emptyList(); + } + + private void commitFiles(List commitInfos) { + for (IcebergCommitInfo icebergCommitInfo : commitInfos) { + if (icebergCommitInfo.getResults() == null) { + continue; + } + filesCommitter.doCommit(icebergCommitInfo.getResults()); + } + } + + @Override + public IcebergAggregatedCommitInfo combine(List commitInfos) { + return new IcebergAggregatedCommitInfo(commitInfos); + } + + @Override + public void abort(List aggregatedCommitInfo) throws Exception {} + + @Override + public void close() throws IOException {} +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/commit/IcebergCommitInfo.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/commit/IcebergCommitInfo.java new 
file mode 100644 index 000000000000..1b4affc0b9e6 --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/commit/IcebergCommitInfo.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.commit; + +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.WriteResult; + +import lombok.AllArgsConstructor; +import lombok.Data; + +import java.io.Serializable; +import java.util.List; + +@Data +@AllArgsConstructor +public class IcebergCommitInfo implements Serializable { + private List results; +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/commit/IcebergFilesCommitter.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/commit/IcebergFilesCommitter.java new file mode 100644 index 000000000000..07363d69e1a0 --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/commit/IcebergFilesCommitter.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.commit; + +import org.apache.seatunnel.connectors.seatunnel.iceberg.IcebergTableLoader; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SinkConfig; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.WriteResult; + +import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.RowDelta; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; + +import lombok.extern.slf4j.Slf4j; + +import java.io.Serializable; +import java.util.List; + +import static java.util.stream.Collectors.toList; + +@Slf4j +public class IcebergFilesCommitter implements Serializable { + private IcebergTableLoader icebergTableLoader; + private boolean caseSensitive; + + private IcebergFilesCommitter(SinkConfig config, IcebergTableLoader icebergTableLoader) { + this.icebergTableLoader = icebergTableLoader; + this.caseSensitive = config.isCaseSensitive(); + } + + public static IcebergFilesCommitter of( + SinkConfig config, IcebergTableLoader icebergTableLoader) { + return new IcebergFilesCommitter(config, icebergTableLoader); + } + + public void doCommit(List results) { + TableIdentifier tableIdentifier = icebergTableLoader.getTableIdentifier(); + Table table = icebergTableLoader.loadTable(); + log.info("do commit table : " + table.toString()); + commit(tableIdentifier, table, results); + } + + private void commit(TableIdentifier tableIdentifier, Table table, List results) { + List dataFiles = + results.stream() + .filter(payload -> payload.getDataFiles() != null) + .flatMap(payload -> payload.getDataFiles().stream()) + .filter(dataFile -> dataFile.recordCount() > 0) + .collect(toList()); + + List deleteFiles = + results.stream() + .filter(payload -> payload.getDeleteFiles() != null) + .flatMap(payload -> payload.getDeleteFiles().stream()) + .filter(deleteFile -> deleteFile.recordCount() > 0) + .collect(toList()); + + if (dataFiles.isEmpty() && deleteFiles.isEmpty()) { + log.info(String.format("Nothing to commit to table %s, skipping", tableIdentifier)); + } else { + if (deleteFiles.isEmpty()) { + AppendFiles append = table.newAppend(); + dataFiles.forEach(append::appendFile); + append.commit(); + } else { + RowDelta delta = table.newRowDelta(); + delta.caseSensitive(caseSensitive); + dataFiles.forEach(delta::addRows); + deleteFiles.forEach(delta::addDeletes); + delta.commit(); + } + } + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/schema/ISchemaChange.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/schema/ISchemaChange.java new file mode 100644 index 000000000000..047b726ac0e4 --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/schema/ISchemaChange.java @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
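Usage of the committer above is deliberately small: it only turns a batch of `WriteResult`s into a single Iceberg snapshot, choosing a fast append when no delete files are present and a row delta otherwise. A sketch, assuming `sinkConfig` and `writeResults` come from the surrounding job:

```java
import org.apache.seatunnel.connectors.seatunnel.iceberg.IcebergTableLoader;
import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SinkConfig;
import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.commit.IcebergFilesCommitter;
import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.WriteResult;

import java.util.List;

class CommitSketch {
    static void commitOnce(SinkConfig sinkConfig, List<WriteResult> writeResults) {
        IcebergTableLoader tableLoader = IcebergTableLoader.create(sinkConfig).open();
        IcebergFilesCommitter committer = IcebergFilesCommitter.of(sinkConfig, tableLoader);
        // Empty results are skipped; data-only results use newAppend(),
        // results containing deletes publish both file sets via newRowDelta().
        committer.doCommit(writeResults);
    }
}
```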
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.schema; + +public interface ISchemaChange {} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/schema/SchemaAddColumn.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/schema/SchemaAddColumn.java new file mode 100644 index 000000000000..c36b5bcf22c7 --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/schema/SchemaAddColumn.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.schema; + +import org.apache.iceberg.types.Type; + +public class SchemaAddColumn implements ISchemaChange { + private final String parentName; + private final String name; + private final Type type; + + public SchemaAddColumn(String parentName, String name, Type type) { + this.parentName = parentName; + this.name = name; + this.type = type; + } + + public String parentName() { + return parentName; + } + + public String name() { + return name; + } + + public String key() { + return parentName == null ? name : parentName + "." + name; + } + + public Type type() { + return type; + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/schema/SchemaChangeType.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/schema/SchemaChangeType.java new file mode 100644 index 000000000000..23fc6dbd8e2a --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/schema/SchemaChangeType.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
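`key()` gives the `SchemaChangeWrapper` below a stable map key, so repeated requests to add the same column collapse into one change, with nested additions qualified by their parent struct:

```java
import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.schema.SchemaAddColumn;

import org.apache.iceberg.types.Types;

class AddColumnKeySketch {
    static void demo() {
        SchemaAddColumn topLevel = new SchemaAddColumn(null, "age", Types.IntegerType.get());
        SchemaAddColumn nested = new SchemaAddColumn("address", "zip", Types.StringType.get());
        assert "age".equals(topLevel.key());
        assert "address.zip".equals(nested.key()); // parent-qualified, no struct collisions
    }
}
```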
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.schema; + +import org.apache.iceberg.types.Type; + +public class SchemaChangeType implements ISchemaChange { + private final String name; + private final Type.PrimitiveType type; + + public SchemaChangeType(String name, Type.PrimitiveType type) { + this.name = name; + this.type = type; + } + + public String name() { + return name; + } + + public Type.PrimitiveType type() { + return type; + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/schema/SchemaChangeWrapper.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/schema/SchemaChangeWrapper.java new file mode 100644 index 000000000000..f980cb3b88ce --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/schema/SchemaChangeWrapper.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.schema; + +import org.apache.seatunnel.shade.com.google.common.collect.Maps; + +import org.apache.iceberg.types.Type; + +import java.util.Collection; +import java.util.Map; + +public class SchemaChangeWrapper { + private final Map addColumns = Maps.newHashMap(); + private final Map changeTypes = Maps.newHashMap(); + + public Collection addColumns() { + return addColumns.values(); + } + + public Collection changeTypes() { + return changeTypes.values(); + } + + public void addColumn(String parentName, String name, Type type) { + SchemaAddColumn addCol = new SchemaAddColumn(parentName, name, type); + addColumns.put(addCol.key(), addCol); + } + + public boolean empty() { + return addColumns.isEmpty() && changeTypes.isEmpty(); + } + + public void changeType(String name, Type.PrimitiveType type) { + changeTypes.put(name, new SchemaChangeType(name, type)); + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/state/IcebergSinkState.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/state/IcebergSinkState.java new file mode 100644 index 000000000000..1cd69130eb78 --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/state/IcebergSinkState.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.state; + +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.WriteResult; + +import lombok.AllArgsConstructor; +import lombok.Data; + +import java.io.Serializable; +import java.util.List; + +@Data +@AllArgsConstructor +public class IcebergSinkState implements Serializable { + private static final long serialVersionUID = 1L; + private List writeResults; + private String commitUser; + private long checkpointId; +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/BaseDeltaTaskWriter.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/BaseDeltaTaskWriter.java new file mode 100644 index 000000000000..c79e8d705c38 --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/BaseDeltaTaskWriter.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
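The wrapper is a plain accumulator: the row converter records missing columns and type widenings as it converts, and the writer applies them in one batch. A sketch of that flow, assuming `table` is an already-loaded Iceberg table:

```java
import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.schema.SchemaChangeWrapper;
import org.apache.seatunnel.connectors.seatunnel.iceberg.utils.SchemaUtils;

import org.apache.iceberg.Table;
import org.apache.iceberg.types.Types;

class SchemaEvolutionSketch {
    static void evolve(Table table) {
        SchemaChangeWrapper wrapper = new SchemaChangeWrapper();
        wrapper.addColumn(null, "email", Types.StringType.get()); // new top-level column
        wrapper.changeType("score", Types.DoubleType.get());      // widening, e.g. float -> double
        if (!wrapper.empty()) {
            // Applies with retries; changes already made by another writer are filtered out.
            SchemaUtils.applySchemaUpdates(table, wrapper);
        }
    }
}
```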
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer; + +import org.apache.seatunnel.shade.com.google.common.collect.Sets; + +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.data.InternalRecordWrapper; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.io.BaseTaskWriter; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFileFactory; +import org.apache.iceberg.types.TypeUtil; + +import java.io.IOException; +import java.util.Set; + +abstract class BaseDeltaTaskWriter extends BaseTaskWriter { + + private final Schema schema; + private final Schema deleteSchema; + + private final InternalRecordWrapper wrapper; + private final InternalRecordWrapper keyWrapper; + private final RecordProjection keyProjection; + + private final boolean upsert; + + BaseDeltaTaskWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + Set identifierFieldIds, + boolean upsert) { + super(spec, format, appenderFactory, fileFactory, io, targetFileSize); + this.schema = schema; + this.deleteSchema = TypeUtil.select(schema, Sets.newHashSet(identifierFieldIds)); + this.wrapper = new InternalRecordWrapper(schema.asStruct()); + this.keyWrapper = new InternalRecordWrapper(deleteSchema.asStruct()); + this.keyProjection = RecordProjection.create(schema, deleteSchema); + this.upsert = upsert; + } + + abstract RowDataDeltaWriter route(IcebergRecord row); + + InternalRecordWrapper wrapper() { + return wrapper; + } + + @Override + public void write(Record record) throws IOException { + + if (!(record instanceof IcebergRecord)) { + throw new RuntimeException(); + } + IcebergRecord row = (IcebergRecord) record; + RowDataDeltaWriter writer = route(row); + switch (row.getRowKind()) { + case INSERT: + case UPDATE_AFTER: + if (upsert) { + writer.deleteKey(keyProjection.wrap(row)); + } + writer.write(row); + break; + case UPDATE_BEFORE: + if (upsert) { + break; + } + writer.delete(row); + break; + case DELETE: + if (upsert) { + writer.deleteKey(keyProjection.wrap(row)); + } else { + writer.delete(row); + } + break; + + default: + throw new UnsupportedOperationException("Unknown row kind: " + row.getRowKind()); + } + } + + class RowDataDeltaWriter extends BaseEqualityDeltaWriter { + RowDataDeltaWriter(PartitionKey partition) { + super(partition, schema, deleteSchema); + } + + @Override + protected StructLike asStructLike(Record data) { + return wrapper.wrap(data); + } + + @Override + protected StructLike asStructLikeKey(Record data) { + return keyWrapper.wrap(data); + } + } +} diff --git 
a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/IcebergRecord.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/IcebergRecord.java new file mode 100644 index 000000000000..163bf118c768 --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/IcebergRecord.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer; + +import org.apache.seatunnel.api.table.type.RowKind; + +import org.apache.iceberg.data.Record; +import org.apache.iceberg.types.Types; + +import java.util.Map; + +public class IcebergRecord implements Record { + + private final Record delegate; + private final RowKind rowKind; + + public IcebergRecord(Record delegate, RowKind rowKind) { + this.delegate = delegate; + this.rowKind = rowKind; + } + + public RowKind getRowKind() { + return rowKind; + } + + @Override + public Types.StructType struct() { + return delegate.struct(); + } + + @Override + public Object getField(String name) { + return delegate.getField(name); + } + + @Override + public void setField(String name, Object value) { + delegate.setField(name, value); + } + + @Override + public Object get(int pos) { + return delegate.get(pos); + } + + @Override + public Record copy() { + return new IcebergRecord(delegate.copy(), rowKind); + } + + @Override + public Record copy(Map overwriteValues) { + return new IcebergRecord(delegate.copy(overwriteValues), rowKind); + } + + @Override + public int size() { + return delegate.size(); + } + + @Override + public T get(int pos, Class javaClass) { + return delegate.get(pos, javaClass); + } + + @Override + public void set(int pos, T value) { + delegate.set(pos, value); + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/IcebergRecordWriter.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/IcebergRecordWriter.java new file mode 100644 index 000000000000..fcf442debe7c --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/IcebergRecordWriter.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
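`IcebergRecord` simply decorates an Iceberg `Record` with the SeaTunnel `RowKind` that `BaseDeltaTaskWriter.write(...)` switches on. An illustrative construction, with assumed column names:

```java
import org.apache.seatunnel.api.table.type.RowKind;
import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.IcebergRecord;

import org.apache.iceberg.Schema;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.Record;

class RowKindSketch {
    static IcebergRecord upsertRow(Schema schema) {
        Record plain = GenericRecord.create(schema);
        plain.setField("id", 42L);       // assumes a LONG "id" column
        plain.setField("name", "alice"); // assumes a STRING "name" column
        // UPDATE_AFTER triggers delete-key-then-write in upsert mode.
        return new IcebergRecord(plain, RowKind.UPDATE_AFTER);
    }
}
```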
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer; + +import org.apache.seatunnel.shade.com.google.common.collect.Lists; + +import org.apache.seatunnel.api.table.type.SeaTunnelRow; +import org.apache.seatunnel.api.table.type.SeaTunnelRowType; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SinkConfig; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.schema.SchemaChangeWrapper; +import org.apache.seatunnel.connectors.seatunnel.iceberg.utils.SchemaUtils; +import org.apache.seatunnel.connectors.seatunnel.iceberg.utils.SeaTunnelRowConverter; +import org.apache.seatunnel.connectors.seatunnel.iceberg.utils.Utilities; + +import org.apache.iceberg.Table; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.io.TaskWriter; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Arrays; +import java.util.List; + +public class IcebergRecordWriter implements RecordWriter { + private final Table table; + private final SinkConfig config; + private final List<WriteResult> writerResults; + private TaskWriter<Record> writer; + private SeaTunnelRowConverter recordConverter; + + public IcebergRecordWriter(Table table, SinkConfig config) { + this.config = config; + this.table = table; + this.writerResults = Lists.newArrayList(); + this.recordConverter = new SeaTunnelRowConverter(table, config); + this.writer = createTaskWriter(); + } + + private TaskWriter<Record> createTaskWriter() { + return Utilities.createTableWriter(table, config); + } + + @Override + public void write(SeaTunnelRow seaTunnelRow, SeaTunnelRowType rowType) { + SchemaChangeWrapper updates = new SchemaChangeWrapper(); + Record record = recordConverter.convert(seaTunnelRow, rowType, updates); + if (!updates.empty()) { + record = applySchemaUpdate(seaTunnelRow, rowType, updates); + } + IcebergRecord icebergRecord = new IcebergRecord(record, seaTunnelRow.getRowKind()); + try { + this.writer.write(icebergRecord); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + /** + * Apply pending schema updates and re-convert the row against the refreshed table schema. + * + * @param seaTunnelRow the row whose conversion detected schema drift + * @param rowType the evolved SeaTunnel row type + * @param updates the collected column additions and type changes + * @return the record converted with the new table schema + */ + private Record applySchemaUpdate( + SeaTunnelRow seaTunnelRow, SeaTunnelRowType rowType, SchemaChangeWrapper updates) { + // complete the current file + flush(); + // apply the schema updates, this will refresh the table + SchemaUtils.applySchemaUpdates(table, updates); + // initialize a new writer with the new schema + resetWriter(); + // convert the row again, this time using the new table schema + return recordConverter.convert(seaTunnelRow, rowType); + } + + @Override + public List<WriteResult> complete() { + flush(); + List<WriteResult> result = Lists.newArrayList(writerResults); + writerResults.clear(); + resetWriter(); + return result; + } + + /** Reset record writer */ + private void resetWriter() { + this.writer = createTaskWriter(); + this.recordConverter = new SeaTunnelRowConverter(table, config); + } + + private
void flush() { + if (writer == null) { + return; + } + org.apache.iceberg.io.WriteResult writeResult; + try { + writeResult = writer.complete(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + writerResults.add( + new WriteResult( + Arrays.asList(writeResult.dataFiles()), + Arrays.asList(writeResult.deleteFiles()), + table.spec().partitionType())); + writer = null; + } + + @Override + public void close() {} +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/IcebergWriterFactory.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/IcebergWriterFactory.java new file mode 100644 index 000000000000..0f92a8159629 --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/IcebergWriterFactory.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
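End to end, the record writer gives the sink a flush-and-restart loop around schema drift: convert, detect changes, flush the old-schema file, evolve the table, re-convert, keep writing. A condensed usage sketch with assumed inputs:

```java
import org.apache.seatunnel.api.table.type.SeaTunnelRow;
import org.apache.seatunnel.api.table.type.SeaTunnelRowType;
import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SinkConfig;
import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.IcebergRecordWriter;
import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.RecordWriter;
import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.WriteResult;

import org.apache.iceberg.Table;

import java.util.List;

class WriterLifecycleSketch {
    static List<WriteResult> writeBatch(
            Table table, SinkConfig config, List<SeaTunnelRow> rows, SeaTunnelRowType rowType) {
        RecordWriter writer = new IcebergRecordWriter(table, config);
        for (SeaTunnelRow row : rows) {
            writer.write(row, rowType); // may flush and evolve the table mid-stream
        }
        return writer.complete(); // data/delete files handed to the committer
    }
}
```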
+ */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer; + +import org.apache.seatunnel.shade.com.google.common.annotations.VisibleForTesting; + +import org.apache.seatunnel.api.table.type.SeaTunnelRowType; +import org.apache.seatunnel.connectors.seatunnel.iceberg.IcebergTableLoader; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SinkConfig; +import org.apache.seatunnel.connectors.seatunnel.iceberg.utils.SchemaUtils; + +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.exceptions.NoSuchTableException; +import org.apache.iceberg.types.Types.StructType; +import org.apache.iceberg.util.Tasks; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import lombok.extern.slf4j.Slf4j; + +import java.util.List; +import java.util.concurrent.atomic.AtomicReference; + +@Slf4j +public class IcebergWriterFactory { + private static final Logger LOG = LoggerFactory.getLogger(IcebergWriterFactory.class); + private final IcebergTableLoader tableLoader; + private final SinkConfig config; + + public IcebergWriterFactory(IcebergTableLoader tableLoader, SinkConfig config) { + this.tableLoader = tableLoader; + this.config = config; + } + + public RecordWriter createWriter(SeaTunnelRowType rowType) { + Table table; + try { + table = tableLoader.loadTable(); + } catch (NoSuchTableException exception) { + if (config.isAutoCreateEnabled()) { + // Enabled auto create table + table = autoCreateTable(rowType); + } else { + throw exception; + } + } + return new IcebergRecordWriter(table, config); + } + + @VisibleForTesting + public Table autoCreateTable(SeaTunnelRowType rowType) { + // Generate struct type + StructType structType = SchemaUtils.toIcebergType(rowType).asStructType(); + org.apache.iceberg.Schema schema = new org.apache.iceberg.Schema(structType.fields()); + List partitionBy = config.tableConfig(config.getTable()).partitionBy(); + PartitionSpec spec; + try { + spec = SchemaUtils.createPartitionSpec(schema, partitionBy); + } catch (Exception e) { + LOG.error( + "Unable to create partition spec {}, table {} will be unpartitioned", + partitionBy, + tableLoader.getTableIdentifier(), + e); + spec = PartitionSpec.unpartitioned(); + } + PartitionSpec partitionSpec = spec; + AtomicReference result = new AtomicReference<>(); + Tasks.range(1) + .retry(SinkConfig.CREATE_TABLE_RETRIES) + .run( + notUsed -> { + log.info( + "starting auto create table: " + + tableLoader.getTableIdentifier()); + Catalog catalog = tableLoader.getCatalog(); + Table table = + catalog.createTable( + tableLoader.getTableIdentifier(), + schema, + partitionSpec, + config.getAutoCreateProps()); + result.set(table); + log.info("auto create table end: " + tableLoader.getTableIdentifier()); + }); + return result.get(); + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/PartitionedAppendWriter.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/PartitionedAppendWriter.java new file mode 100644 index 000000000000..a7dbcd5c290a --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/PartitionedAppendWriter.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
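The `partitionBy` entries the factory passes to `SchemaUtils.createPartitionSpec` accept either bare column names (identity partitioning) or transform expressions, parsed later in this patch by `SchemaUtils`; when parsing fails, the factory falls back to an unpartitioned table as shown above. Illustrative values:

```java
import org.apache.seatunnel.connectors.seatunnel.iceberg.utils.SchemaUtils;

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

import java.util.Arrays;

class PartitionSpecSketch {
    static PartitionSpec example() {
        Schema schema =
                new Schema(
                        Types.NestedField.required(1, "user_id", Types.LongType.get()),
                        Types.NestedField.required(2, "event_time", Types.TimestampType.withZone()),
                        Types.NestedField.optional(3, "region", Types.StringType.get()));
        // "days(event_time)" -> daily partitions, "bucket(user_id, 16)" -> 16 hash buckets,
        // bare "region" -> identity partition.
        return SchemaUtils.createPartitionSpec(
                schema, Arrays.asList("days(event_time)", "bucket(user_id, 16)", "region"));
    }
}
```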
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer; + +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.InternalRecordWrapper; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFileFactory; +import org.apache.iceberg.io.PartitionedFanoutWriter; + +public class PartitionedAppendWriter extends PartitionedFanoutWriter { + + private final PartitionKey partitionKey; + private final InternalRecordWrapper wrapper; + + public PartitionedAppendWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema) { + super(spec, format, appenderFactory, fileFactory, io, targetFileSize); + this.partitionKey = new PartitionKey(spec, schema); + this.wrapper = new InternalRecordWrapper(schema.asStruct()); + } + + @Override + protected PartitionKey partition(Record row) { + partitionKey.partition(wrapper.wrap(row)); + return partitionKey; + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/PartitionedDeltaWriter.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/PartitionedDeltaWriter.java new file mode 100644 index 000000000000..df90b3bbd4c1 --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/PartitionedDeltaWriter.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer; + +import org.apache.seatunnel.shade.com.google.common.collect.Maps; + +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionKey; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFileFactory; +import org.apache.iceberg.util.Tasks; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Map; +import java.util.Set; + +public class PartitionedDeltaWriter extends BaseDeltaTaskWriter { + private final PartitionKey partitionKey; + + private final Map writers = Maps.newHashMap(); + + public PartitionedDeltaWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + Set identifierFieldIds, + boolean upsertMode) { + super( + spec, + format, + appenderFactory, + fileFactory, + io, + targetFileSize, + schema, + identifierFieldIds, + upsertMode); + this.partitionKey = new PartitionKey(spec, schema); + } + + @Override + RowDataDeltaWriter route(IcebergRecord row) { + partitionKey.partition(wrapper().wrap(row)); + + RowDataDeltaWriter writer = writers.get(partitionKey); + if (writer == null) { + // NOTICE: we need to copy a new partition key here, in case of messing up the keys in + // writers. + PartitionKey copiedKey = partitionKey.copy(); + writer = new RowDataDeltaWriter(copiedKey); + writers.put(copiedKey, writer); + } + + return writer; + } + + @Override + public void close() { + try { + Tasks.foreach(writers.values()) + .throwFailureWhenFinished() + .noRetry() + .run(RowDataDeltaWriter::close, IOException.class); + + writers.clear(); + } catch (IOException e) { + throw new UncheckedIOException("Failed to close equality delta writer", e); + } + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/RecordProjection.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/RecordProjection.java new file mode 100644 index 000000000000..f2c68aed7103 --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/RecordProjection.java @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer; + +import org.apache.seatunnel.shade.com.google.common.base.Preconditions; + +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.types.Types.ListType; +import org.apache.iceberg.types.Types.MapType; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.types.Types.StructType; + +import java.util.List; +import java.util.Map; + +/** + * This is modified from {@link org.apache.iceberg.util.StructProjection} to support record types. + */ +public class RecordProjection implements Record { + + public static RecordProjection create(Schema dataSchema, Schema projectedSchema) { + return new RecordProjection(dataSchema.asStruct(), projectedSchema.asStruct()); + } + + private final StructType type; + private final int[] positionMap; + private final RecordProjection[] nestedProjections; + private IcebergRecord record; + + private RecordProjection(StructType structType, StructType projection) { + this(structType, projection, false); + } + + @SuppressWarnings("checkstyle:CyclomaticComplexity") + private RecordProjection(StructType structType, StructType projection, boolean allowMissing) { + this.type = projection; + this.positionMap = new int[projection.fields().size()]; + this.nestedProjections = new RecordProjection[projection.fields().size()]; + + // set up the projection positions and any nested projections that are needed + List dataFields = structType.fields(); + for (int pos = 0; pos < positionMap.length; pos += 1) { + NestedField projectedField = projection.fields().get(pos); + + boolean found = false; + for (int i = 0; !found && i < dataFields.size(); i += 1) { + NestedField dataField = dataFields.get(i); + if (projectedField.fieldId() == dataField.fieldId()) { + found = true; + positionMap[pos] = i; + switch (projectedField.type().typeId()) { + case STRUCT: + nestedProjections[pos] = + new RecordProjection( + dataField.type().asStructType(), + projectedField.type().asStructType()); + break; + case MAP: + MapType projectedMap = projectedField.type().asMapType(); + MapType originalMap = dataField.type().asMapType(); + + boolean keyProjectable = + !projectedMap.keyType().isNestedType() + || projectedMap.keyType().equals(originalMap.keyType()); + boolean valueProjectable = + !projectedMap.valueType().isNestedType() + || projectedMap + .valueType() + .equals(originalMap.valueType()); + Preconditions.checkArgument( + keyProjectable && valueProjectable, + "Cannot project a partial map key or value struct. Trying to project %s out of %s", + projectedField, + dataField); + + nestedProjections[pos] = null; + break; + case LIST: + ListType projectedList = projectedField.type().asListType(); + ListType originalList = dataField.type().asListType(); + + boolean elementProjectable = + !projectedList.elementType().isNestedType() + || projectedList + .elementType() + .equals(originalList.elementType()); + Preconditions.checkArgument( + elementProjectable, + "Cannot project a partial list element struct. 
Trying to project %s out of %s", + projectedField, + dataField); + + nestedProjections[pos] = null; + break; + default: + nestedProjections[pos] = null; + } + } + } + + if (!found && projectedField.isOptional() && allowMissing) { + positionMap[pos] = -1; + nestedProjections[pos] = null; + } else if (!found) { + throw new IllegalArgumentException( + String.format("Cannot find field %s in %s", projectedField, structType)); + } + } + } + + public RecordProjection wrap(IcebergRecord newRecord) { + this.record = newRecord; + return this; + } + + @Override + public int size() { + return type.fields().size(); + } + + @Override + public T get(int pos, Class javaClass) { + // struct can be null if wrap is not called first before the get call + // or if a null struct is wrapped. + if (record == null) { + return null; + } + + int recordPos = positionMap[pos]; + if (nestedProjections[pos] != null) { + IcebergRecord nestedStruct = record.get(recordPos, IcebergRecord.class); + if (nestedStruct == null) { + return null; + } + return javaClass.cast(nestedProjections[pos].wrap(nestedStruct)); + } + + if (recordPos != -1) { + return record.get(recordPos, javaClass); + } else { + return null; + } + } + + @Override + public void set(int pos, T value) { + throw new UnsupportedOperationException(); + } + + @Override + public StructType struct() { + return type; + } + + @Override + public Object getField(String name) { + throw new UnsupportedOperationException(); + } + + @Override + public void setField(String name, Object value) { + throw new UnsupportedOperationException(); + } + + @Override + public Object get(int pos) { + return get(pos, Object.class); + } + + @Override + public Record copy() { + throw new UnsupportedOperationException(); + } + + @Override + public Record copy(Map overwriteValues) { + throw new UnsupportedOperationException(); + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/RecordWriter.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/RecordWriter.java new file mode 100644 index 000000000000..85e2cb147c1f --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/RecordWriter.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
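In this connector, the projection's job is to strip a full row down to its identifier fields before an equality delete is written, mirroring `keyProjection` in `BaseDeltaTaskWriter`. A sketch, assuming `identifierFieldIds` holds the table's key field ids:

```java
import org.apache.seatunnel.shade.com.google.common.collect.Sets;

import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.RecordProjection;

import org.apache.iceberg.Schema;
import org.apache.iceberg.types.TypeUtil;

import java.util.Set;

class KeyProjectionSketch {
    static RecordProjection keyProjection(Schema tableSchema, Set<Integer> identifierFieldIds) {
        Schema deleteSchema = TypeUtil.select(tableSchema, Sets.newHashSet(identifierFieldIds));
        RecordProjection projection = RecordProjection.create(tableSchema, deleteSchema);
        // Usage: writer.deleteKey(projection.wrap(row)). Note wrap() mutates and returns
        // this same instance, so each wrapped view must be consumed before the next wrap.
        return projection;
    }
}
```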
+ */ +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer; + +import org.apache.seatunnel.shade.com.google.common.collect.ImmutableList; + +import org.apache.seatunnel.api.table.type.SeaTunnelRow; +import org.apache.seatunnel.api.table.type.SeaTunnelRowType; + +import java.util.List; + +public interface RecordWriter extends Cloneable { + + void write(SeaTunnelRow seaTunnelRow, SeaTunnelRowType rowType); + + default List<WriteResult> complete() { + return ImmutableList.of(); + } + + default void close() {} +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/UnpartitionedDeltaWriter.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/UnpartitionedDeltaWriter.java new file mode 100644 index 000000000000..3b3700ae2bd8 --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/UnpartitionedDeltaWriter.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer; + +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.OutputFileFactory; + +import java.io.IOException; +import java.util.Set; + +public class UnpartitionedDeltaWriter extends BaseDeltaTaskWriter { + private final RowDataDeltaWriter writer; + + public UnpartitionedDeltaWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + Set identifierFieldIds, + boolean upsertMode) { + super( + spec, + format, + appenderFactory, + fileFactory, + io, + targetFileSize, + schema, + identifierFieldIds, + upsertMode); + this.writer = new RowDataDeltaWriter(null); + } + + @Override + RowDataDeltaWriter route(IcebergRecord row) { + return writer; + } + + @Override + public void close() throws IOException { + writer.close(); + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/WriteResult.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/WriteResult.java new file mode 100644 index 000000000000..c695eef1e22e --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/sink/writer/WriteResult.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
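Between them, `PartitionedAppendWriter`, `PartitionedDeltaWriter`, and `UnpartitionedDeltaWriter` cover the append-only and CDC paths for partitioned and unpartitioned tables; the patch selects among them in `Utilities.createTableWriter` (referenced from `IcebergRecordWriter`, not shown in this hunk). The sketch below reconstructs a plausible delta-writer construction from Iceberg's generic data API; the file format, file size, and task ids are assumptions, not the patch's actual defaults:

```java
import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.PartitionedDeltaWriter;
import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.UnpartitionedDeltaWriter;

import org.apache.iceberg.FileFormat;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.data.GenericAppenderFactory;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.io.BaseTaskWriter;
import org.apache.iceberg.io.FileAppenderFactory;
import org.apache.iceberg.io.OutputFileFactory;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.util.ArrayUtil;

import java.util.ArrayList;
import java.util.Set;

class TaskWriterSketch {
    static BaseTaskWriter<Record> deltaWriter(Table table, boolean upsert) {
        Schema schema = table.schema();
        PartitionSpec spec = table.spec();
        Set<Integer> equalityIds = schema.identifierFieldIds();
        FileFormat format = FileFormat.PARQUET; // assumed write format
        // Delete files carry only the identifier columns, matching deleteSchema
        // in BaseDeltaTaskWriter.
        FileAppenderFactory<Record> appenders =
                new GenericAppenderFactory(
                        schema,
                        spec,
                        ArrayUtil.toIntArray(new ArrayList<>(equalityIds)),
                        TypeUtil.select(schema, equalityIds),
                        null);
        OutputFileFactory files =
                OutputFileFactory.builderFor(table, /* partitionId */ 1, /* taskId */ 1)
                        .format(format)
                        .build();
        long targetFileSize = 128L * 1024 * 1024; // assumed target file size
        return spec.isUnpartitioned()
                ? new UnpartitionedDeltaWriter(spec, format, appenders, files, table.io(),
                        targetFileSize, schema, equalityIds, upsert)
                : new PartitionedDeltaWriter(spec, format, appenders, files, table.io(),
                        targetFileSize, schema, equalityIds, upsert);
    }
}
```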
+ */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer; + +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.types.Types.StructType; + +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + +import java.io.Serializable; +import java.util.List; + +/** Write result */ +@Data +@NoArgsConstructor +@AllArgsConstructor +public class WriteResult implements Serializable { + private List dataFiles; + private List deleteFiles; + private StructType partitionStruct; +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSource.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSource.java index 9be5303fb820..44f22ad9bafb 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSource.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSource.java @@ -20,7 +20,7 @@ import org.apache.seatunnel.shade.com.typesafe.config.Config; import org.apache.seatunnel.api.common.JobContext; -import org.apache.seatunnel.api.common.PrepareFailException; +import org.apache.seatunnel.api.configuration.ReadonlyConfig; import org.apache.seatunnel.api.source.Boundedness; import org.apache.seatunnel.api.source.SeaTunnelSource; import org.apache.seatunnel.api.source.SourceReader; @@ -71,17 +71,16 @@ public class IcebergSource private SeaTunnelRowType seaTunnelRowType; private JobContext jobContext; - @Override - public String getPluginName() { - return "Iceberg"; + public IcebergSource(ReadonlyConfig config) { + this.sourceConfig = SourceConfig.loadConfig(config); + this.tableSchema = loadIcebergSchema(sourceConfig); + this.seaTunnelRowType = loadSeaTunnelRowType(tableSchema, config.toConfig()); + this.projectedSchema = tableSchema.select(seaTunnelRowType.getFieldNames()); } @Override - public void prepare(Config pluginConfig) throws PrepareFailException { - this.sourceConfig = SourceConfig.loadConfig(pluginConfig); - this.tableSchema = loadIcebergSchema(sourceConfig); - this.seaTunnelRowType = loadSeaTunnelRowType(tableSchema, pluginConfig); - this.projectedSchema = tableSchema.select(seaTunnelRowType.getFieldNames()); + public String getPluginName() { + return "Iceberg"; } @SneakyThrows diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSourceFactory.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSourceFactory.java index 1318ab8a9d72..a166cfb9cf41 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSourceFactory.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSourceFactory.java @@ -19,20 +19,21 @@ import org.apache.seatunnel.api.configuration.util.OptionRule; import org.apache.seatunnel.api.source.SeaTunnelSource; +import org.apache.seatunnel.api.source.SourceSplit; import org.apache.seatunnel.api.table.catalog.schema.TableSchemaOptions; +import org.apache.seatunnel.api.table.connector.TableSource; import org.apache.seatunnel.api.table.factory.Factory; import 
org.apache.seatunnel.api.table.factory.TableSourceFactory; +import org.apache.seatunnel.api.table.factory.TableSourceFactoryContext; import com.google.auto.service.AutoService; +import java.io.Serializable; + import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.CommonConfig.KEY_CASE_SENSITIVE; import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.CommonConfig.KEY_CATALOG_NAME; -import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.CommonConfig.KEY_CATALOG_TYPE; import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.CommonConfig.KEY_NAMESPACE; import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.CommonConfig.KEY_TABLE; -import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.CommonConfig.KEY_URI; -import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.CommonConfig.KEY_WAREHOUSE; -import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.IcebergCatalogType.HIVE; import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig.KEY_END_SNAPSHOT_ID; import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig.KEY_START_SNAPSHOT_ID; import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig.KEY_START_SNAPSHOT_TIMESTAMP; @@ -51,9 +52,7 @@ public String factoryIdentifier() { @Override public OptionRule optionRule() { return OptionRule.builder() - .required( - KEY_CATALOG_NAME, KEY_CATALOG_TYPE, KEY_WAREHOUSE, KEY_NAMESPACE, KEY_TABLE) - .conditional(KEY_CATALOG_TYPE, HIVE, KEY_URI) + .required(KEY_CATALOG_NAME, KEY_NAMESPACE, KEY_TABLE) .optional( TableSchemaOptions.SCHEMA, KEY_CASE_SENSITIVE, @@ -66,6 +65,12 @@ public OptionRule optionRule() { .build(); } + @Override + public + TableSource createSource(TableSourceFactoryContext context) { + return () -> (SeaTunnelSource) new IcebergSource(context.getOptions()); + } + @Override public Class getSourceClass() { return IcebergSource.class; diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/reader/IcebergFileScanTaskReader.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/reader/IcebergFileScanTaskReader.java index c1d2138ee17d..e178dc481ed8 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/reader/IcebergFileScanTaskReader.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/reader/IcebergFileScanTaskReader.java @@ -32,7 +32,6 @@ import org.apache.iceberg.data.Record; import org.apache.iceberg.data.avro.DataReader; import org.apache.iceberg.data.orc.GenericOrcReader; -import org.apache.iceberg.data.parquet.GenericParquetReaders; import org.apache.iceberg.expressions.Evaluator; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; @@ -52,6 +51,8 @@ import java.io.Closeable; import java.util.Map; +import static org.apache.iceberg.data.parquet.GenericParquetReaders.buildReader; + @Builder public class IcebergFileScanTaskReader implements Closeable { @@ -130,8 +131,7 @@ private CloseableIterable openFile(FileScanTask task, Schema fileProject .project(fileProjection) .createReaderFunc( fileSchema -> - GenericParquetReaders.buildReader( - fileProjection, fileSchema, partition)) + buildReader(fileProjection, fileSchema, partition)) 
.split(task.start(), task.length()) .filter(task.residual()); if (reuseContainers) { diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/utils/SchemaConverter.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/utils/SchemaConverter.java new file mode 100644 index 000000000000..92f2e4521c37 --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/utils/SchemaConverter.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.utils; + +import org.apache.seatunnel.api.table.type.ArrayType; +import org.apache.seatunnel.api.table.type.SeaTunnelDataType; +import org.apache.seatunnel.api.table.type.SeaTunnelRowType; + +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; + +import java.util.ArrayList; +import java.util.List; + +public class SchemaConverter { + private int fieldId = 1; + + private SchemaConverter() {} + + public static SchemaConverter of() { + return new SchemaConverter(); + } + + public Type toIcebergType(SeaTunnelDataType rowType) { + switch (rowType.getSqlType()) { + case BOOLEAN: + return Types.BooleanType.get(); + case BYTES: + return Types.BinaryType.get(); + case SMALLINT: + case TINYINT: + case INT: + return Types.IntegerType.get(); + case BIGINT: + return Types.LongType.get(); + case FLOAT: + return Types.FloatType.get(); + case DOUBLE: + return Types.DoubleType.get(); + case ARRAY: + ArrayType arrayType = (ArrayType) rowType; + Type elementType = toIcebergType(arrayType.getElementType()); + return Types.ListType.ofOptional(nextId(), elementType); + case MAP: + org.apache.seatunnel.api.table.type.MapType mapType = + (org.apache.seatunnel.api.table.type.MapType) rowType; + Type keyType = toIcebergType(mapType.getKeyType()); + Type valueType = toIcebergType(mapType.getValueType()); + return Types.MapType.ofOptional(nextId(), nextId(), keyType, valueType); + case ROW: + SeaTunnelRowType seaTunnelRowType = (SeaTunnelRowType) rowType; + List structFields = new ArrayList<>(); + for (int i = 0; i < seaTunnelRowType.getFieldNames().length; i++) { + String field = seaTunnelRowType.getFieldName(i); + SeaTunnelDataType dataType = seaTunnelRowType.getFieldType(i); + structFields.add( + Types.NestedField.of(nextId(), true, field, toIcebergType(dataType))); + } + return Types.StructType.of(structFields); + case DATE: + return Types.DateType.get(); + case TIME: + return Types.TimeType.get(); + case TIMESTAMP: + return Types.TimestampType.withZone(); + case STRING: + default: + return Types.StringType.get(); + } + } + + private int nextId() 
{ + return fieldId++; + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/utils/SchemaUtils.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/utils/SchemaUtils.java new file mode 100644 index 000000000000..c96206b8004f --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/utils/SchemaUtils.java @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.utils; + +import org.apache.seatunnel.api.table.type.SeaTunnelDataType; +import org.apache.seatunnel.api.table.type.SqlType; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SinkConfig; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.schema.SchemaAddColumn; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.schema.SchemaChangeType; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.schema.SchemaChangeWrapper; + +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Table; +import org.apache.iceberg.UpdateSchema; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.Pair; +import org.apache.iceberg.util.Tasks; + +import lombok.extern.slf4j.Slf4j; + +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static java.util.stream.Collectors.toList; + +@Slf4j +public class SchemaUtils { + private static final Pattern TRANSFORM_REGEX = Pattern.compile("(\\w+)\\((.+)\\)"); + + private SchemaUtils() {} + + public static Type.PrimitiveType needsDataTypeUpdate( + Type currentIcebergType, SeaTunnelDataType dataType) { + if (currentIcebergType.typeId() == Type.TypeID.FLOAT + && dataType.getSqlType() == SqlType.DOUBLE) { + return Types.DoubleType.get(); + } + if (currentIcebergType.typeId() == Type.TypeID.INTEGER + && dataType.getSqlType() == SqlType.BIGINT) { + return Types.LongType.get(); + } + return null; + } + + public static void applySchemaUpdates(Table table, SchemaChangeWrapper wrapper) { + if (wrapper == null || wrapper.empty()) { + // no updates to apply + return; + } + Tasks.range(1) + .retry(SinkConfig.SCHEMA_UPDATE_RETRIES) + .run(notUsed -> commitSchemaUpdates(table, wrapper)); + } + + /** + * Commit table schema updates + * + * @param table + * @param wrapper + */ + private static void commitSchemaUpdates(Table table, SchemaChangeWrapper wrapper) { + // get the latest schema in case another process updated it + table.refresh(); + // filter out columns that have already been added + List addColumns = + wrapper.addColumns().stream() + 
.filter(addCol -> !columnExists(table.schema(), addCol)) + .collect(toList()); + + // filter out columns that have the updated type + List updateTypes = + wrapper.changeTypes().stream() + .filter(updateType -> !typeMatches(table.schema(), updateType)) + .collect(toList()); + + if (addColumns.isEmpty() && updateTypes.isEmpty()) { + // no updates to apply + log.info("Schema for table {} already up-to-date", table.name()); + return; + } + + // apply the updates + UpdateSchema updateSchema = table.updateSchema(); + addColumns.forEach( + update -> + updateSchema.addColumn(update.parentName(), update.name(), update.type())); + updateTypes.forEach(update -> updateSchema.updateColumn(update.name(), update.type())); + updateSchema.commit(); + log.info("Schema for table {} updated with new columns", table.name()); + } + + private static boolean columnExists(org.apache.iceberg.Schema schema, SchemaAddColumn update) { + Types.StructType struct = + update.parentName() == null + ? schema.asStruct() + : schema.findType(update.parentName()).asStructType(); + return struct.field(update.name()) != null; + } + + private static boolean typeMatches(org.apache.iceberg.Schema schema, SchemaChangeType update) { + return schema.findType(update.name()).typeId() == update.type().typeId(); + } + + public static Type toIcebergType(SeaTunnelDataType rowType) { + return SchemaConverter.of().toIcebergType(rowType); + } + + public static PartitionSpec createPartitionSpec( + org.apache.iceberg.Schema schema, List partitionBy) { + if (partitionBy.isEmpty()) { + return PartitionSpec.unpartitioned(); + } + PartitionSpec.Builder specBuilder = PartitionSpec.builderFor(schema); + partitionBy.forEach( + partitionField -> { + Matcher matcher = TRANSFORM_REGEX.matcher(partitionField); + if (matcher.matches()) { + String transform = matcher.group(1); + switch (transform) { + case "year": + case "years": + specBuilder.year(matcher.group(2)); + break; + case "month": + case "months": + specBuilder.month(matcher.group(2)); + break; + case "day": + case "days": + specBuilder.day(matcher.group(2)); + break; + case "hour": + case "hours": + specBuilder.hour(matcher.group(2)); + break; + case "bucket": + { + Pair args = transformArgPair(matcher.group(2)); + specBuilder.bucket(args.first(), args.second()); + break; + } + case "truncate": + { + Pair args = transformArgPair(matcher.group(2)); + specBuilder.truncate(args.first(), args.second()); + break; + } + default: + throw new UnsupportedOperationException( + "Unsupported transform: " + transform); + } + } else { + specBuilder.identity(partitionField); + } + }); + return specBuilder.build(); + } + + private static Pair transformArgPair(String argsStr) { + String[] parts = argsStr.split(","); + if (parts.length != 2) { + throw new IllegalArgumentException( + "Invalid argument " + argsStr + ", should have 2 parts"); + } + return Pair.of(parts[0].trim(), Integer.parseInt(parts[1].trim())); + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/utils/SeaTunnelRowConverter.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/utils/SeaTunnelRowConverter.java new file mode 100644 index 000000000000..a9fb4ef5ae47 --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/utils/SeaTunnelRowConverter.java @@ -0,0 +1,534 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.utils; + +import org.apache.seatunnel.shade.com.google.common.base.Preconditions; +import org.apache.seatunnel.shade.com.google.common.collect.Maps; + +import org.apache.seatunnel.api.table.type.MapType; +import org.apache.seatunnel.api.table.type.SeaTunnelDataType; +import org.apache.seatunnel.api.table.type.SeaTunnelRow; +import org.apache.seatunnel.api.table.type.SeaTunnelRowType; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SinkConfig; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.schema.SchemaChangeWrapper; + +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.mapping.MappedField; +import org.apache.iceberg.mapping.NameMapping; +import org.apache.iceberg.mapping.NameMappingParser; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.DateTimeUtil; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.nio.ByteBuffer; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.OffsetDateTime; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.format.DateTimeParseException; +import java.time.temporal.Temporal; +import java.util.Arrays; +import java.util.Base64; +import java.util.Date; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.UUID; + +import static java.util.stream.Collectors.toList; + +public class SeaTunnelRowConverter { + private static final ObjectMapper MAPPER = new ObjectMapper(); + + private static final DateTimeFormatter OFFSET_TS_FMT = + new DateTimeFormatterBuilder() + .append(DateTimeFormatter.ISO_LOCAL_DATE_TIME) + .appendOffset("+HHmm", "Z") + .toFormatter(); + + private final Schema tableSchema; + private final NameMapping nameMapping; + private final SinkConfig config; + private final Map> structNameMap = Maps.newHashMap(); + + public SeaTunnelRowConverter(Table table, SinkConfig config) { + this.tableSchema = table.schema(); + this.nameMapping = createNameMapping(table); + this.config = config; + } + + private NameMapping createNameMapping(Table table) { + String nameMappingString = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); + return nameMappingString != null ? 
NameMappingParser.fromJson(nameMappingString) : null; + } + + public Record convert(Object row, SeaTunnelDataType rowType) { + return convertStructValue(row, rowType, tableSchema.asStruct(), -1, null); + } + + public Record convert(Object row, SeaTunnelDataType rowType, SchemaChangeWrapper wrapper) { + return convertStructValue(row, rowType, tableSchema.asStruct(), -1, wrapper); + } +
+ protected GenericRecord convertStructValue( + Object value, + SeaTunnelDataType fromType, + Types.StructType schema, + int parentFieldId, + SchemaChangeWrapper wrapper) { + switch (fromType.getSqlType()) { + case ROW: + return convertToStruct( + (SeaTunnelRow) value, + (SeaTunnelRowType) fromType, + schema, + parentFieldId, + wrapper); + case MAP: + return convertToStruct( + (Map) value, + (MapType) fromType, + schema, + parentFieldId, + wrapper); + default: + throw new IllegalArgumentException( + "Cannot convert to struct: " + fromType.getSqlType().name()); + } + } +
+ /** + * Convert a MAP value into an Iceberg struct record. + * + * @param row the source map + * @param fromType the SeaTunnel MAP type of the source value + * @param schema the target Iceberg struct type + * @param structFieldId the field id of the enclosing struct, or -1 at the top level + * @return the converted Iceberg record + */ + private GenericRecord convertToStruct( + Map row, + MapType fromType, + Types.StructType schema, + int structFieldId, + SchemaChangeWrapper wrapper) { + GenericRecord result = GenericRecord.create(schema); + row.forEach( + (recordFieldNameObj, recordFieldValue) -> { + String recordFieldName = recordFieldNameObj.toString(); + Types.NestedField tableField = + lookupStructField(recordFieldName, schema, structFieldId); + result.setField( + tableField.name(), + convertValue( + recordFieldValue, + fromType.getValueType(), + tableField.type(), + tableField.fieldId(), + wrapper)); + }); + return result; + } +
+ /** + * Convert a SeaTunnelRow into an Iceberg struct record. + * + * @param row the source row + * @param fromType the SeaTunnel row type of the source row + * @param schema the target Iceberg struct type + * @param structFieldId the field id of the enclosing struct, or -1 at the top level + * @return the converted Iceberg record + */ + private GenericRecord convertToStruct( + SeaTunnelRow row, + SeaTunnelRowType fromType, + Types.StructType schema, + int structFieldId, + SchemaChangeWrapper wrapper) { + GenericRecord result = GenericRecord.create(schema); + String[] fieldNames = fromType.getFieldNames(); + for (int i = 0; i < fieldNames.length; i++) { + String recordField = fieldNames[i]; + SeaTunnelDataType dataType = fromType.getFieldType(i); + Types.NestedField tableField = lookupStructField(recordField, schema, structFieldId); + if (Objects.isNull(tableField)) { + if (config.isTableSchemaEvolutionEnabled() && Objects.nonNull(wrapper)) { + // add the column if schema evolution is on + String parentFieldName = + structFieldId < 0 ? 
null : tableSchema.findColumnName(structFieldId); + Type type = SchemaUtils.toIcebergType(dataType); + wrapper.addColumn(parentFieldName, recordField, type); + } + } else { + boolean hasSchemaUpdates = false; + if (Objects.nonNull(wrapper)) { + // update the type if needed and schema evolution is on + Type.PrimitiveType evolveDataType = + SchemaUtils.needsDataTypeUpdate(tableField.type(), dataType); + if (evolveDataType != null) { + String fieldName = tableSchema.findColumnName(tableField.fieldId()); + wrapper.changeType(fieldName, evolveDataType); + hasSchemaUpdates = true; + } + } + if (!hasSchemaUpdates) { + result.setField( + tableField.name(), + convertValue( + row.getField(i), + fromType.getFieldType(i), + tableField.type(), + tableField.fieldId(), + wrapper)); + } + } + } + return result; + } + + public Object convertValue( + Object value, + SeaTunnelDataType fromType, + Type type, + int fieldId, + SchemaChangeWrapper wrapper) { + if (value == null) { + return null; + } + switch (type.typeId()) { + case STRUCT: + return convertStructValue(value, fromType, type.asStructType(), fieldId, wrapper); + case LIST: + return convertListValue(value, fromType, type.asListType(), wrapper); + case MAP: + return convertMapValue(value, fromType, type.asMapType(), wrapper); + case INTEGER: + return convertInt(value); + case LONG: + return convertLong(value); + case FLOAT: + return convertFloat(value); + case DOUBLE: + return convertDouble(value); + case DECIMAL: + return convertDecimal(value, (Types.DecimalType) type); + case BOOLEAN: + return convertBoolean(value); + case STRING: + return convertString(value); + case UUID: + return convertUUID(value); + case BINARY: + case FIXED: + return convertBase64Binary(value); + case DATE: + return convertDateValue(value); + case TIME: + return convertTimeValue(value); + case TIMESTAMP: + return convertTimestampValue(value, (Types.TimestampType) type); + } + throw new UnsupportedOperationException("Unsupported type: " + type.typeId()); + } + + private Types.NestedField lookupStructField( + String fieldName, Types.StructType schema, int structFieldId) { + if (nameMapping == null) { + return config.isCaseSensitive() + ? 
schema.field(fieldName) + : schema.caseInsensitiveField(fieldName); + } + + return structNameMap + .computeIfAbsent(structFieldId, notUsed -> createStructNameMap(schema)) + .get(fieldName); + } +
+ private Map createStructNameMap(Types.StructType schema) { + Map map = Maps.newHashMap(); + schema.fields() + .forEach( + col -> { + MappedField mappedField = nameMapping.find(col.fieldId()); + if (mappedField != null && !mappedField.names().isEmpty()) { + mappedField.names().forEach(name -> map.put(name, col)); + } else { + map.put(col.name(), col); + } + }); + return map; + } +
+ protected List convertListValue( + Object value, + SeaTunnelDataType fromType, + Types.ListType type, + SchemaChangeWrapper wrapper) { + Preconditions.checkArgument(value.getClass().isArray()); + Object[] list = (Object[]) value; + return Arrays.stream(list) + .map( + element -> { + int fieldId = type.fields().get(0).fieldId(); + return convertValue( + element, fromType, type.elementType(), fieldId, wrapper); + }) + .collect(toList()); + } +
+ protected Map convertMapValue( + Object value, + SeaTunnelDataType fromType, + Types.MapType type, + SchemaChangeWrapper wrapper) { + Preconditions.checkArgument(value instanceof Map); + Map map = (Map) value; + Map result = Maps.newHashMap(); + map.forEach( + (k, v) -> { + int keyFieldId = type.fields().get(0).fieldId(); + int valueFieldId = type.fields().get(1).fieldId(); + result.put( + convertValue(k, fromType, type.keyType(), keyFieldId, wrapper), + convertValue(v, fromType, type.valueType(), valueFieldId, wrapper)); + }); + return result; + } +
+ protected int convertInt(Object value) { + if (value instanceof Number) { + return ((Number) value).intValue(); + } else if (value instanceof String) { + return Integer.parseInt((String) value); + } + throw new IllegalArgumentException("Cannot convert to int: " + value.getClass().getName()); + } +
+ protected long convertLong(Object value) { + if (value instanceof Number) { + return ((Number) value).longValue(); + } else if (value instanceof String) { + return Long.parseLong((String) value); + } + throw new IllegalArgumentException("Cannot convert to long: " + value.getClass().getName()); + } +
+ protected float convertFloat(Object value) { + if (value instanceof Number) { + return ((Number) value).floatValue(); + } else if (value instanceof String) { + return Float.parseFloat((String) value); + } + throw new IllegalArgumentException( + "Cannot convert to float: " + value.getClass().getName()); + } +
+ protected double convertDouble(Object value) { + if (value instanceof Number) { + return ((Number) value).doubleValue(); + } else if (value instanceof String) { + return Double.parseDouble((String) value); + } + throw new IllegalArgumentException( + "Cannot convert to double: " + value.getClass().getName()); + } +
+ protected BigDecimal convertDecimal(Object value, Types.DecimalType type) { + BigDecimal bigDecimal; + if (value instanceof BigDecimal) { + bigDecimal = (BigDecimal) value; + } else if (value instanceof Number) { + Number num = (Number) value; + Double dbl = num.doubleValue(); + if (dbl.equals(Math.floor(dbl))) { + bigDecimal = BigDecimal.valueOf(num.longValue()); + } else { + bigDecimal = BigDecimal.valueOf(dbl); + } + } else if (value instanceof String) { + bigDecimal = new BigDecimal((String) value); + } else { + throw new IllegalArgumentException( + "Cannot convert to BigDecimal: " + value.getClass().getName()); + } + return bigDecimal.setScale(type.scale(), RoundingMode.HALF_UP); + } + + protected boolean 
convertBoolean(Object value) { + if (value instanceof Boolean) { + return (boolean) value; + } else if (value instanceof String) { + return Boolean.parseBoolean((String) value); + } + throw new IllegalArgumentException( + "Cannot convert to boolean: " + value.getClass().getName()); + } + + protected String convertString(Object value) { + try { + if (value instanceof String) { + return (String) value; + } else if (value instanceof Number || value instanceof Boolean) { + return value.toString(); + } else if (value instanceof Map || value instanceof List) { + return MAPPER.writeValueAsString(value); + } else { + return MAPPER.writeValueAsString(value); + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + protected UUID convertUUID(Object value) { + if (value instanceof String) { + return UUID.fromString((String) value); + } else if (value instanceof UUID) { + return (UUID) value; + } + throw new IllegalArgumentException("Cannot convert to UUID: " + value.getClass().getName()); + } + + protected ByteBuffer convertBase64Binary(Object value) { + if (value instanceof String) { + return ByteBuffer.wrap(Base64.getDecoder().decode((String) value)); + } else if (value instanceof byte[]) { + return ByteBuffer.wrap((byte[]) value); + } else if (value instanceof ByteBuffer) { + return (ByteBuffer) value; + } + throw new IllegalArgumentException( + "Cannot convert to binary: " + value.getClass().getName()); + } + + protected LocalDate convertDateValue(Object value) { + if (value instanceof Number) { + int days = ((Number) value).intValue(); + return DateTimeUtil.dateFromDays(days); + } else if (value instanceof String) { + return LocalDate.parse((String) value); + } else if (value instanceof LocalDate) { + return (LocalDate) value; + } else if (value instanceof Date) { + int days = (int) (((Date) value).getTime() / 1000 / 60 / 60 / 24); + return DateTimeUtil.dateFromDays(days); + } + throw new RuntimeException("Cannot convert date: " + value); + } + + protected LocalTime convertTimeValue(Object value) { + if (value instanceof Number) { + long millis = ((Number) value).longValue(); + return DateTimeUtil.timeFromMicros(millis * 1000); + } else if (value instanceof String) { + return LocalTime.parse((String) value); + } else if (value instanceof LocalTime) { + return (LocalTime) value; + } else if (value instanceof Date) { + long millis = ((Date) value).getTime(); + return DateTimeUtil.timeFromMicros(millis * 1000); + } + throw new RuntimeException("Cannot convert time: " + value); + } + + protected Temporal convertTimestampValue(Object value, Types.TimestampType type) { + if (type.shouldAdjustToUTC()) { + return convertOffsetDateTime(value); + } + return convertLocalDateTime(value); + } + + private OffsetDateTime convertOffsetDateTime(Object value) { + if (value instanceof Number) { + long millis = ((Number) value).longValue(); + return DateTimeUtil.timestamptzFromMicros(millis * 1000); + } else if (value instanceof String) { + return parseOffsetDateTime((String) value); + } else if (value instanceof OffsetDateTime) { + return (OffsetDateTime) value; + } else if (value instanceof LocalDateTime) { + return ((LocalDateTime) value).atOffset(ZoneOffset.UTC); + } else if (value instanceof Date) { + return DateTimeUtil.timestamptzFromMicros(((Date) value).getTime() * 1000); + } + throw new RuntimeException( + "Cannot convert timestamptz: " + value + ", type: " + value.getClass()); + } + + private OffsetDateTime parseOffsetDateTime(String str) { + String tsStr = 
ensureTimestampFormat(str); + try { + return OFFSET_TS_FMT.parse(tsStr, OffsetDateTime::from); + } catch (DateTimeParseException e) { + return LocalDateTime.parse(tsStr, DateTimeFormatter.ISO_LOCAL_DATE_TIME) + .atOffset(ZoneOffset.UTC); + } + } + + private LocalDateTime convertLocalDateTime(Object value) { + if (value instanceof Number) { + long millis = ((Number) value).longValue(); + return DateTimeUtil.timestampFromMicros(millis * 1000); + } else if (value instanceof String) { + return parseLocalDateTime((String) value); + } else if (value instanceof LocalDateTime) { + return (LocalDateTime) value; + } else if (value instanceof OffsetDateTime) { + return ((OffsetDateTime) value).toLocalDateTime(); + } else if (value instanceof Date) { + return DateTimeUtil.timestampFromMicros(((Date) value).getTime() * 1000); + } + throw new RuntimeException( + "Cannot convert timestamp: " + value + ", type: " + value.getClass()); + } + + private LocalDateTime parseLocalDateTime(String str) { + String tsStr = ensureTimestampFormat(str); + try { + return LocalDateTime.parse(tsStr, DateTimeFormatter.ISO_LOCAL_DATE_TIME); + } catch (DateTimeParseException e) { + return OFFSET_TS_FMT.parse(tsStr, OffsetDateTime::from).toLocalDateTime(); + } + } + + private String ensureTimestampFormat(String str) { + String result = str; + if (result.charAt(10) == ' ') { + result = result.substring(0, 10) + 'T' + result.substring(11); + } + if (result.length() > 22 && result.charAt(19) == '+' && result.charAt(22) == ':') { + result = result.substring(0, 19) + result.substring(19).replace(":", ""); + } + return result; + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/utils/Utilities.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/utils/Utilities.java new file mode 100644 index 000000000000..d5bf98fc528a --- /dev/null +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/utils/Utilities.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.seatunnel.connectors.seatunnel.iceberg.utils; + +import org.apache.seatunnel.shade.com.google.common.collect.Maps; +import org.apache.seatunnel.shade.com.google.common.collect.Sets; +import org.apache.seatunnel.shade.com.google.common.primitives.Ints; + +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SinkConfig; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.PartitionedAppendWriter; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.PartitionedDeltaWriter; +import org.apache.seatunnel.connectors.seatunnel.iceberg.sink.writer.UnpartitionedDeltaWriter; + +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Table; +import org.apache.iceberg.data.GenericAppenderFactory; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.io.FileAppenderFactory; +import org.apache.iceberg.io.OutputFileFactory; +import org.apache.iceberg.io.TaskWriter; +import org.apache.iceberg.io.UnpartitionedWriter; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.PropertyUtil; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; + +import static java.util.stream.Collectors.toSet; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; +import static org.apache.iceberg.TableProperties.WRITE_TARGET_FILE_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT; + +public class Utilities { + private static final Logger LOG = LoggerFactory.getLogger(Utilities.class.getName()); + + public static TaskWriter createTableWriter(Table table, SinkConfig config) { + Map tableProps = Maps.newHashMap(table.properties()); + tableProps.putAll(config.getWriteProps()); + + String formatStr = + tableProps.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); + FileFormat format = FileFormat.valueOf(formatStr.toUpperCase()); + + long targetFileSize = + PropertyUtil.propertyAsLong( + tableProps, + WRITE_TARGET_FILE_SIZE_BYTES, + WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT); + + Set identifierFieldIds = table.schema().identifierFieldIds(); + + // override the identifier fields if the config is set + List idCols = config.tableConfig(config.getTable()).idColumns(); + if (!idCols.isEmpty()) { + identifierFieldIds = + idCols.stream() + .map(colName -> table.schema().findField(colName).fieldId()) + .collect(toSet()); + } + + FileAppenderFactory appenderFactory; + if (identifierFieldIds == null || identifierFieldIds.isEmpty()) { + appenderFactory = + new GenericAppenderFactory(table.schema(), table.spec(), null, null, null) + .setAll(tableProps); + } else { + appenderFactory = + new GenericAppenderFactory( + table.schema(), + table.spec(), + Ints.toArray(identifierFieldIds), + TypeUtil.select( + table.schema(), Sets.newHashSet(identifierFieldIds)), + null) + .setAll(tableProps); + } + + // (partition ID + task ID + operation ID) must be unique + OutputFileFactory fileFactory = + OutputFileFactory.builderFor(table, 1, System.currentTimeMillis()) + .defaultSpec(table.spec()) + .operationId(UUID.randomUUID().toString()) + .format(format) + .build(); + + TaskWriter writer; + if (table.spec().isUnpartitioned()) { + if (identifierFieldIds.isEmpty() && !config.isUpsertModeEnabled()) { + // No delta writer + writer = + new UnpartitionedWriter<>( + table.spec(), + format, + 
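// the writer rolls to a new data file once the configured target file size is reached +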
appenderFactory, + fileFactory, + table.io(), + targetFileSize); + } else { + // Delta writer + writer = + new UnpartitionedDeltaWriter( + table.spec(), + format, + appenderFactory, + fileFactory, + table.io(), + targetFileSize, + table.schema(), + identifierFieldIds, + config.isUpsertModeEnabled()); + } + } else { + if (identifierFieldIds.isEmpty() && !config.isUpsertModeEnabled()) { + // No delta writer + writer = + new PartitionedAppendWriter( + table.spec(), + format, + appenderFactory, + fileFactory, + table.io(), + targetFileSize, + table.schema()); + } else { + // Delta writer + writer = + new PartitionedDeltaWriter( + table.spec(), + format, + appenderFactory, + fileFactory, + table.io(), + targetFileSize, + table.schema(), + identifierFieldIds, + config.isUpsertModeEnabled()); + } + } + return writer; + } +} diff --git a/seatunnel-connectors-v2/connector-iceberg/src/test/java/org/apache/seatunnel/connectors/seatunnel/iceberg/TestIcebergMetastore.java b/seatunnel-connectors-v2/connector-iceberg/src/test/java/org/apache/seatunnel/connectors/seatunnel/iceberg/TestIcebergMetastore.java index 7280199a3832..02c4c114bcfe 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/test/java/org/apache/seatunnel/connectors/seatunnel/iceberg/TestIcebergMetastore.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/test/java/org/apache/seatunnel/connectors/seatunnel/iceberg/TestIcebergMetastore.java @@ -17,6 +17,10 @@ package org.apache.seatunnel.connectors.seatunnel.iceberg; +import org.apache.seatunnel.api.configuration.ReadonlyConfig; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.CommonConfig; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SinkConfig; + import org.apache.hadoop.hive.conf.HiveConf; import org.apache.iceberg.catalog.Namespace; import org.apache.iceberg.hive.HiveCatalog; @@ -29,6 +33,8 @@ import org.junit.jupiter.api.Test; import java.io.File; +import java.util.HashMap; +import java.util.Map; import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.IcebergCatalogType.HIVE; @@ -50,11 +56,19 @@ public void testUseHiveMetastore() { String warehousePath = "/tmp/seatunnel/iceberg/hive/"; new File(warehousePath).mkdirs(); + Map configs = new HashMap<>(); + Map catalogProps = new HashMap<>(); + catalogProps.put("type", HIVE.getType()); + catalogProps.put("warehouse", "file://" + warehousePath); + catalogProps.put("uri", METASTORE_URI); + + configs.put(CommonConfig.KEY_CATALOG_NAME.key(), "seatunnel"); + configs.put(CommonConfig.CATALOG_PROPS.key(), catalogProps); + HiveCatalog catalog = (HiveCatalog) - new IcebergCatalogFactory( - "seatunnel", HIVE, "file://" + warehousePath, METASTORE_URI) - .create(); + new IcebergCatalogFactory(new SinkConfig(ReadonlyConfig.fromMap(configs))) + .loadCatalog(); catalog.createNamespace(Namespace.of("test_database")); Assertions.assertTrue(catalog.namespaceExists(Namespace.of("test_database"))); } diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/pom.xml b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/pom.xml index 50754d6d46b3..11c147432d03 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/pom.xml +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/pom.xml @@ -25,7 +25,24 @@ connector-iceberg-e2e SeaTunnel : E2E : Connector V2 : Iceberg + + + + org.apache.seatunnel + connector-jdbc + ${project.version} + pom + import + + + + + + org.apache.seatunnel + connector-fake + ${project.version} + org.apache.seatunnel 
connector-iceberg @@ -51,6 +68,43 @@ optional test + + + + + org.apache.seatunnel + connector-jdbc + ${project.version} + test + + + + org.apache.seatunnel + connector-cdc-mysql + ${project.version} + test + + + + org.apache.seatunnel + connector-cdc-mysql + ${project.version} + test-jar + test + + + + org.testcontainers + mysql + ${testcontainer.version} + + + + mysql + mysql-connector-java + test + + diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/IcebergSinkCDCIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/IcebergSinkCDCIT.java new file mode 100644 index 000000000000..d4efd489d0ea --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/IcebergSinkCDCIT.java @@ -0,0 +1,391 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.e2e.connector.iceberg; + +import org.apache.seatunnel.api.configuration.ReadonlyConfig; +import org.apache.seatunnel.common.utils.FileUtils; +import org.apache.seatunnel.connectors.seatunnel.cdc.mysql.testutils.MySqlContainer; +import org.apache.seatunnel.connectors.seatunnel.cdc.mysql.testutils.MySqlVersion; +import org.apache.seatunnel.connectors.seatunnel.cdc.mysql.testutils.UniqueDatabase; +import org.apache.seatunnel.connectors.seatunnel.iceberg.IcebergTableLoader; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.CommonConfig; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig; +import org.apache.seatunnel.e2e.common.TestResource; +import org.apache.seatunnel.e2e.common.TestSuiteBase; +import org.apache.seatunnel.e2e.common.container.ContainerExtendedFactory; +import org.apache.seatunnel.e2e.common.container.EngineType; +import org.apache.seatunnel.e2e.common.container.TestContainer; +import org.apache.seatunnel.e2e.common.junit.DisabledOnContainer; +import org.apache.seatunnel.e2e.common.junit.TestContainerExtension; + +import org.apache.iceberg.Table; +import org.apache.iceberg.data.IcebergGenerics; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.io.CloseableIterable; + +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.condition.DisabledOnOs; +import org.junit.jupiter.api.condition.OS; +import org.testcontainers.containers.Container; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.output.Slf4jLogConsumer; +import org.testcontainers.lifecycle.Startables; +import 
org.testcontainers.utility.DockerLoggerFactory; + +import lombok.extern.slf4j.Slf4j; + +import java.io.IOException; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.TimeUnit; +import java.util.stream.Stream; + +import static java.lang.Thread.sleep; +import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.IcebergCatalogType.HADOOP; +import static org.awaitility.Awaitility.given; + +@Slf4j +@DisabledOnContainer( + value = {}, + type = {EngineType.SPARK}, + disabledReason = "Currently SPARK do not support cdc") +@DisabledOnOs(OS.WINDOWS) +public class IcebergSinkCDCIT extends TestSuiteBase implements TestResource { + + private static final String CATALOG_DIR = "/tmp/seatunnel/iceberg/hadoop-cdc-sink/"; + + private static final String NAMESPACE = "seatunnel_namespace"; + + // mysql + private static final String MYSQL_HOST = "mysql_cdc_e2e"; + private static final String MYSQL_USER_NAME = "st_user"; + private static final String MYSQL_USER_PASSWORD = "seatunnel"; + private static final String MYSQL_DATABASE = "mysql_cdc"; + private static final MySqlContainer MYSQL_CONTAINER = createMySqlContainer(MySqlVersion.V8_0); + + private final UniqueDatabase inventoryDatabase = + new UniqueDatabase( + MYSQL_CONTAINER, MYSQL_DATABASE, "mysqluser", "mysqlpw", MYSQL_DATABASE); + + private static MySqlContainer createMySqlContainer(MySqlVersion version) { + return new MySqlContainer(version) + .withConfigurationOverride("mysql/server-gtids/my.cnf") + .withSetupSQL("mysql/setup.sql") + .withNetwork(NETWORK) + .withNetworkAliases(MYSQL_HOST) + .withDatabaseName(MYSQL_DATABASE) + .withUsername(MYSQL_USER_NAME) + .withPassword(MYSQL_USER_PASSWORD) + .withLogConsumer( + new Slf4jLogConsumer(DockerLoggerFactory.getLogger("mysql-mysql-image"))); + } + + private String driverUrl() { + return "https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/8.0.32/mysql-connector-j-8.0.32.jar"; + } + + private String zstdUrl() { + return "https://repo1.maven.org/maven2/com/github/luben/zstd-jni/1.5.5-5/zstd-jni-1.5.5-5.jar"; + } + + @TestContainerExtension + protected final ContainerExtendedFactory extendedFactory = + container -> { + container.execInContainer("sh", "-c", "mkdir -p " + CATALOG_DIR); + container.execInContainer("sh", "-c", "chmod -R 777 " + CATALOG_DIR); + Container.ExecResult extraCommandsZSTD = + container.execInContainer( + "sh", + "-c", + "mkdir -p /tmp/seatunnel/plugins/Iceberg/lib && cd /tmp/seatunnel/plugins/Iceberg/lib && wget " + + zstdUrl()); + Assertions.assertEquals( + 0, extraCommandsZSTD.getExitCode(), extraCommandsZSTD.getStderr()); + Container.ExecResult extraCommands = + container.execInContainer( + "sh", + "-c", + "mkdir -p /tmp/seatunnel/plugins/MySQL-CDC/lib && cd /tmp/seatunnel/plugins/MySQL-CDC/lib && wget " + + driverUrl()); + Assertions.assertEquals(0, extraCommands.getExitCode(), extraCommands.getStderr()); + }; + + private final String NAMESPACE_TAR = NAMESPACE + ".tar.gz"; + protected final ContainerExtendedFactory containerExtendedFactory = + new ContainerExtendedFactory() { + @Override + public void extend(GenericContainer container) + throws IOException, InterruptedException { + FileUtils.createNewDir(CATALOG_DIR); + container.execInContainer( + "sh", + "-c", + "cd " + + CATALOG_DIR + + " && tar -czvf " + + NAMESPACE_TAR + + " " + + 
NAMESPACE); + container.copyFileFromContainer( + CATALOG_DIR + NAMESPACE_TAR, CATALOG_DIR + NAMESPACE_TAR); + extractFiles(); + } +
+ private void extractFiles() { + ProcessBuilder processBuilder = new ProcessBuilder(); + processBuilder.command( + "sh", "-c", "cd " + CATALOG_DIR + " && tar -zxvf " + NAMESPACE_TAR); + try { + Process process = processBuilder.start(); + // wait for the command to finish + int exitCode = process.waitFor(); + if (exitCode == 0) { + log.info("Extracted files successfully."); + } else { + log.error("Extract files failed with exit code " + exitCode); + } + } catch (IOException | InterruptedException e) { + e.printStackTrace(); + } + } + }; +
+ private static final String SOURCE_TABLE = "mysql_cdc_e2e_source_table"; +
+ @BeforeAll + @Override + public void startUp() throws Exception { + log.info("Starting MySQL containers..."); + Startables.deepStart(Stream.of(MYSQL_CONTAINER)).join(); + log.info("MySQL containers are started"); + inventoryDatabase.createAndInitialize(); + log.info("MySQL DDL execution is complete"); + } +
+ @TestTemplate + public void testMysqlCdcCheckDataE2e(TestContainer container) + throws IOException, InterruptedException { + // Clear the source table so repeated runs do not affect each other + clearTable(MYSQL_DATABASE, SOURCE_TABLE); + CompletableFuture.supplyAsync( + () -> { + try { + container.executeJob("/iceberg/mysql_cdc_to_iceberg.conf"); + } catch (Exception e) { + log.error("Commit task exception: " + e.getMessage()); + throw new RuntimeException(e); + } + return null; + }); + insertAndCheckData(container); + upsertAndCheckData(container); + } +
+ private void upsertAndCheckData(TestContainer container) + throws InterruptedException, IOException { + upsertDeleteSourceTable(MYSQL_DATABASE, SOURCE_TABLE); + // Wait 30s for the source to capture the data + sleep(30000); + + // stream stage + given().ignoreExceptions() + .await() + .atMost(120000, TimeUnit.MILLISECONDS) + .untilAsserted( + () -> { + // copy iceberg to local + container.executeExtraCommands(containerExtendedFactory); + List records = loadIcebergTable(); + Assertions.assertEquals(4, records.size()); + for (Record record : records) { + Integer id = (Integer) record.getField("id"); + Long f_bigint = (Long) record.getField("f_bigint"); + if (id == 3) { + Assertions.assertEquals(10000, f_bigint); + } + } + }); + } +
+ private void insertAndCheckData(TestContainer container) + throws InterruptedException, IOException { + // Init table data + initSourceTableData(MYSQL_DATABASE, SOURCE_TABLE); + // Wait 30s for the source to capture the data + sleep(30000); + + // stream stage + given().ignoreExceptions() + .await() + .atMost(60000, TimeUnit.MILLISECONDS) + .untilAsserted( + () -> { + // copy iceberg to local + container.executeExtraCommands(containerExtendedFactory); + Assertions.assertEquals(3, loadIcebergTable().size()); + }); + } +
+ private List loadIcebergTable() { + List results = new ArrayList<>(); + Map configs = new HashMap<>(); + Map catalogProps = new HashMap<>(); + catalogProps.put("type", HADOOP.getType()); + catalogProps.put("warehouse", "file://" + CATALOG_DIR); + configs.put(CommonConfig.KEY_CATALOG_NAME.key(), "seatunnel_test"); + configs.put(CommonConfig.KEY_NAMESPACE.key(), "seatunnel_namespace"); + configs.put(CommonConfig.KEY_TABLE.key(), "iceberg_sink_table"); + configs.put(CommonConfig.CATALOG_PROPS.key(), catalogProps); + IcebergTableLoader tableLoader = + IcebergTableLoader.create(new SourceConfig(ReadonlyConfig.fromMap(configs))); + tableLoader.open(); + try { + 
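// read everything back through Iceberg's generic reader: scan the table's current snapshot and collect each row +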
Table table = tableLoader.loadTable(); + try (CloseableIterable records = IcebergGenerics.read(table).build()) { + for (Record record : records) { + results.add(record); + } + } catch (IOException e) { + e.printStackTrace(); + } + } catch (Exception ex) { + ex.printStackTrace(); + } + return results; + } + + private void clearTable(String database, String tableName) { + executeSql("truncate table " + database + "." + tableName); + } + + // Execute SQL + private void executeSql(String sql) { + try (Connection connection = getJdbcConnection()) { + connection.createStatement().execute(sql); + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + private Connection getJdbcConnection() throws SQLException { + return DriverManager.getConnection( + MYSQL_CONTAINER.getJdbcUrl(), + MYSQL_CONTAINER.getUsername(), + MYSQL_CONTAINER.getPassword()); + } + + @AfterAll + @Override + public void tearDown() throws Exception { + // close Container + if (MYSQL_CONTAINER != null) { + MYSQL_CONTAINER.close(); + } + } + + private void initSourceTableData(String database, String tableName) { + executeSql( + "INSERT INTO " + + database + + "." + + tableName + + " ( id, f_binary, f_blob, f_long_varbinary, f_longblob, f_tinyblob, f_varbinary, f_smallint,\n" + + " f_smallint_unsigned, f_mediumint, f_mediumint_unsigned, f_int, f_int_unsigned, f_integer,\n" + + " f_integer_unsigned, f_bigint, f_bigint_unsigned, f_numeric, f_decimal, f_float, f_double,\n" + + " f_double_precision, f_longtext, f_mediumtext, f_text, f_tinytext, f_varchar, f_date, f_datetime,\n" + + " f_timestamp, f_bit1, f_bit64, f_char, f_enum, f_mediumblob, f_long_varchar, f_real, f_time,\n" + + " f_tinyint, f_tinyint_unsigned, f_json, f_year )\n" + + "VALUES ( 1, 0x61626374000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,\n" + + " 0x68656C6C6F, 0x18000000789C0BC9C82C5600A244859CFCBC7485B2C4A2A4CCBCC4A24A00697308D4, NULL,\n" + + " 0x74696E79626C6F62, 0x48656C6C6F20776F726C64, 12345, 54321, 123456, 654321, 1234567, 7654321, 1234567, 7654321,\n" + + " 123456789, 987654321, 123, 789, 12.34, 56.78, 90.12, 'This is a long text field', 'This is a medium text field',\n" + + " 'This is a text field', 'This is a tiny text field', 'This is a varchar field', '2022-04-27', '2022-04-27 14:30:00',\n" + + " '2023-04-27 11:08:40', 1, b'0101010101010101010101010101010101010101010101010101010101010101', 'C', 'enum2',\n" + + " 0x1B000000789C0BC9C82C5600A24485DCD494CCD25C85A49CFC2485B4CCD49C140083FF099A, 'This is a long varchar field',\n" + + " 12.345, '14:30:00', -128, 255, '{ \"key\": \"value\" }', 2022 ),\n" + + " ( 2, 0x61626374000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,\n" + + " 0x68656C6C6F, 0x18000000789C0BC9C82C5600A244859CFCBC7485B2C4A2A4CCBCC4A24A00697308D4, NULL, 0x74696E79626C6F62,\n" + + " 0x48656C6C6F20776F726C64, 12345, 54321, 123456, 654321, 1234567, 7654321, 1234567, 7654321, 123456789, 987654321,\n" + + " 123, 789, 12.34, 56.78, 90.12, 'This is a long text field', 'This is a medium text field', 'This is a text field',\n" + + " 'This is a tiny text field', 'This is a varchar field', '2022-04-27', '2022-04-27 14:30:00', '2023-04-27 11:08:40',\n" + + " 1, b'0101010101010101010101010101010101010101010101010101010101010101', 'C', 'enum2',\n" + + " 0x1B000000789C0BC9C82C5600A24485DCD494CCD25C85A49CFC2485B4CCD49C140083FF099A, 'This is a long varchar field',\n" + + " 112.345, '14:30:00', -128, 22, 
'{ \"key\": \"value\" }', 2013 ),\n" + + " ( 3, 0x61626374000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,\n" + + " 0x68656C6C6F, 0x18000000789C0BC9C82C5600A244859CFCBC7485B2C4A2A4CCBCC4A24A00697308D4, NULL, 0x74696E79626C6F62,\n" + + " 0x48656C6C6F20776F726C64, 12345, 54321, 123456, 654321, 1234567, 7654321, 1234567, 7654321, 123456789, 987654321, 123,\n" + + " 789, 12.34, 56.78, 90.12, 'This is a long text field', 'This is a medium text field', 'This is a text field',\n" + + " 'This is a tiny text field', 'This is a varchar field', '2022-04-27', '2022-04-27 14:30:00', '2023-04-27 11:08:40',\n" + + " 1, b'0101010101010101010101010101010101010101010101010101010101010101', 'C', 'enum2',\n" + + " 0x1B000000789C0BC9C82C5600A24485DCD494CCD25C85A49CFC2485B4CCD49C140083FF099A, 'This is a long varchar field', 112.345,\n" + + " '14:30:00', -128, 22, '{ \"key\": \"value\" }', 2021 )"); + } + + private void upsertDeleteSourceTable(String database, String tableName) { + executeSql( + "INSERT INTO " + + database + + "." + + tableName + + " ( id, f_binary, f_blob, f_long_varbinary, f_longblob, f_tinyblob, f_varbinary, f_smallint,\n" + + " f_smallint_unsigned, f_mediumint, f_mediumint_unsigned, f_int, f_int_unsigned, f_integer,\n" + + " f_integer_unsigned, f_bigint, f_bigint_unsigned, f_numeric, f_decimal, f_float, f_double,\n" + + " f_double_precision, f_longtext, f_mediumtext, f_text, f_tinytext, f_varchar, f_date, f_datetime,\n" + + " f_timestamp, f_bit1, f_bit64, f_char, f_enum, f_mediumblob, f_long_varchar, f_real, f_time,\n" + + " f_tinyint, f_tinyint_unsigned, f_json, f_year )\n" + + "VALUES ( 5, 0x61626374000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,\n" + + " 0x68656C6C6F, 0x18000000789C0BC9C82C5600A244859CFCBC7485B2C4A2A4CCBCC4A24A00697308D4, NULL,\n" + + " 0x74696E79626C6F62, 0x48656C6C6F20776F726C64, 12345, 54321, 123456, 654321, 1234567, 7654321, 1234567, 7654321,\n" + + " 123456789, 987654321, 123, 789, 12.34, 56.78, 90.12, 'This is a long text field', 'This is a medium text field',\n" + + " 'This is a text field', 'This is a tiny text field', 'This is a varchar field', '2022-04-27', '2022-04-27 14:30:00',\n" + + " '2023-04-27 11:08:40', 1, b'0101010101010101010101010101010101010101010101010101010101010101', 'C', 'enum2',\n" + + " 0x1B000000789C0BC9C82C5600A24485DCD494CCD25C85A49CFC2485B4CCD49C140083FF099A, 'This is a long varchar field',\n" + + " 12.345, '14:30:00', -128, 255, '{ \"key\": \"value\" }', 1992 )"); + executeSql( + "INSERT INTO " + + database + + "." 
+ + tableName + + " ( id, f_binary, f_blob, f_long_varbinary, f_longblob, f_tinyblob, f_varbinary, f_smallint,\n" + + " f_smallint_unsigned, f_mediumint, f_mediumint_unsigned, f_int, f_int_unsigned, f_integer,\n" + + " f_integer_unsigned, f_bigint, f_bigint_unsigned, f_numeric, f_decimal, f_float, f_double,\n" + + " f_double_precision, f_longtext, f_mediumtext, f_text, f_tinytext, f_varchar, f_date, f_datetime,\n" + + " f_timestamp, f_bit1, f_bit64, f_char, f_enum, f_mediumblob, f_long_varchar, f_real, f_time,\n" + + " f_tinyint, f_tinyint_unsigned, f_json, f_year )\n" + + "VALUES ( 6, 0x61626374000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,\n" + + " 0x68656C6C6F, 0x18000000789C0BC9C82C5600A244859CFCBC7485B2C4A2A4CCBCC4A24A00697308D4, NULL,\n" + + " 0x74696E79626C6F62, 0x48656C6C6F20776F726C64, 12345, 54321, 123456, 654321, 1234567, 7654321, 1234567, 7654321,\n" + + " 123456789, 987654321, 123, 789, 12.34, 56.78, 90.12, 'This is a long text field', 'This is a medium text field',\n" + + " 'This is a text field', 'This is a tiny text field', 'This is a varchar field', '2022-04-27', '2022-04-27 14:30:00',\n" + + " '2023-04-27 11:08:40', 1, b'0101010101010101010101010101010101010101010101010101010101010101', 'C', 'enum2',\n" + + " 0x1B000000789C0BC9C82C5600A24485DCD494CCD25C85A49CFC2485B4CCD49C140083FF099A, 'This is a long varchar field',\n" + + " 12.345, '14:30:00', -128, 255, '{ \"key\": \"value\" }', 1999 )"); + executeSql("DELETE FROM " + database + "." + tableName + " where id = 2"); + + executeSql("UPDATE " + database + "." + tableName + " SET f_bigint = 10000 where id = 3"); + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/IcebergSinkIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/IcebergSinkIT.java new file mode 100644 index 000000000000..20c1b02914e5 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/IcebergSinkIT.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.seatunnel.e2e.connector.iceberg; + +import org.apache.seatunnel.api.configuration.ReadonlyConfig; +import org.apache.seatunnel.common.utils.FileUtils; +import org.apache.seatunnel.connectors.seatunnel.iceberg.IcebergTableLoader; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.CommonConfig; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig; +import org.apache.seatunnel.e2e.common.TestSuiteBase; +import org.apache.seatunnel.e2e.common.container.ContainerExtendedFactory; +import org.apache.seatunnel.e2e.common.container.TestContainer; +import org.apache.seatunnel.e2e.common.container.TestContainerId; +import org.apache.seatunnel.e2e.common.junit.DisabledOnContainer; +import org.apache.seatunnel.e2e.common.junit.TestContainerExtension; +
+import org.apache.iceberg.Table; +import org.apache.iceberg.data.IcebergGenerics; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.io.CloseableIterable; +
+import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.condition.DisabledOnOs; +import org.junit.jupiter.api.condition.OS; +import org.testcontainers.containers.Container; +import org.testcontainers.containers.GenericContainer; + +import lombok.extern.slf4j.Slf4j; +
+import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.IcebergCatalogType.HADOOP; +import static org.awaitility.Awaitility.given; +
+@Slf4j +@DisabledOnContainer( + value = {TestContainerId.SPARK_2_4}, + type = {}, + disabledReason = "") +@DisabledOnOs(OS.WINDOWS) +public class IcebergSinkIT extends TestSuiteBase { + + private static final String CATALOG_DIR = "/tmp/seatunnel/iceberg/hadoop-sink/"; + + private static final String NAMESPACE = "seatunnel_namespace"; +
+ private String zstdUrl() { + return "https://repo1.maven.org/maven2/com/github/luben/zstd-jni/1.5.5-5/zstd-jni-1.5.5-5.jar"; + } +
+ @TestContainerExtension + protected final ContainerExtendedFactory extendedFactory = + container -> { + container.execInContainer("sh", "-c", "mkdir -p " + CATALOG_DIR); + container.execInContainer("sh", "-c", "chmod -R 777 " + CATALOG_DIR); + container.execInContainer( + "sh", + "-c", + "mkdir -p /tmp/seatunnel/plugins/Iceberg/lib && cd /tmp/seatunnel/plugins/Iceberg/lib && wget " + + zstdUrl()); + }; +
+ private final String NAMESPACE_TAR = NAMESPACE + ".tar.gz"; + protected final ContainerExtendedFactory containerExtendedFactory = + new ContainerExtendedFactory() { + @Override + public void extend(GenericContainer container) + throws IOException, InterruptedException { + FileUtils.createNewDir(CATALOG_DIR); + container.execInContainer( + "sh", + "-c", + "cd " + + CATALOG_DIR + + " && tar -czvf " + + NAMESPACE_TAR + + " " + + NAMESPACE); + container.copyFileFromContainer( + CATALOG_DIR + NAMESPACE_TAR, CATALOG_DIR + NAMESPACE_TAR); + extractFiles(); + } +
+ private void extractFiles() { + ProcessBuilder processBuilder = new ProcessBuilder(); + processBuilder.command( + "sh", "-c", "cd " + CATALOG_DIR + " && tar -zxvf " + NAMESPACE_TAR); + try { + Process process = processBuilder.start(); + // wait for the command to finish + int exitCode = process.waitFor(); + if (exitCode == 0) { + log.info("Extracted files successfully."); + } else { + log.error("Extract files failed with exit code " + exitCode); + } + } catch (IOException | 
InterruptedException e) { + e.printStackTrace(); + } + } + }; + + @TestTemplate + public void testInsertAndCheckDataE2e(TestContainer container) + throws IOException, InterruptedException { + Container.ExecResult textWriteResult = + container.executeJob("/iceberg/fake_to_iceberg.conf"); + Assertions.assertEquals(0, textWriteResult.getExitCode()); + // stream stage + given().ignoreExceptions() + .await() + .atMost(60000, TimeUnit.MILLISECONDS) + .untilAsserted( + () -> { + // copy iceberg to local + container.executeExtraCommands(containerExtendedFactory); + Assertions.assertEquals(100, loadIcebergTable().size()); + }); + } + + private List loadIcebergTable() { + List results = new ArrayList<>(); + Map configs = new HashMap<>(); + Map catalogProps = new HashMap<>(); + catalogProps.put("type", HADOOP.getType()); + catalogProps.put("warehouse", "file://" + CATALOG_DIR); + configs.put(CommonConfig.KEY_CATALOG_NAME.key(), "seatunnel_test"); + configs.put(CommonConfig.KEY_NAMESPACE.key(), "seatunnel_namespace"); + configs.put(CommonConfig.KEY_TABLE.key(), "iceberg_sink_table"); + configs.put(CommonConfig.CATALOG_PROPS.key(), catalogProps); + IcebergTableLoader tableLoader = + IcebergTableLoader.create(new SourceConfig(ReadonlyConfig.fromMap(configs))); + tableLoader.open(); + try { + Table table = tableLoader.loadTable(); + try (CloseableIterable records = IcebergGenerics.read(table).build()) { + for (Record record : records) { + results.add(record); + } + } catch (IOException e) { + e.printStackTrace(); + } + } catch (Exception ex) { + ex.printStackTrace(); + } + return results; + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/IcebergSourceIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/IcebergSourceIT.java index 434a1a866847..131d74519ae1 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/IcebergSourceIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/IcebergSourceIT.java @@ -17,8 +17,11 @@ package org.apache.seatunnel.e2e.connector.iceberg; +import org.apache.seatunnel.api.configuration.ReadonlyConfig; import org.apache.seatunnel.connectors.seatunnel.iceberg.IcebergCatalogFactory; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.CommonConfig; import org.apache.seatunnel.connectors.seatunnel.iceberg.config.IcebergCatalogType; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig; import org.apache.seatunnel.e2e.common.TestResource; import org.apache.seatunnel.e2e.common.TestSuiteBase; import org.apache.seatunnel.e2e.common.container.ContainerExtendedFactory; @@ -63,7 +66,9 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.IcebergCatalogType.HADOOP; @@ -133,7 +138,19 @@ public void testIcebergSource(TestContainer container) } private void initializeIcebergTable() { - CATALOG = new IcebergCatalogFactory(CATALOG_NAME, CATALOG_TYPE, WAREHOUSE, null).create(); + + Map configs = new HashMap<>(); + // build catalog props + Map catalogProps = new HashMap<>(); + catalogProps.put("type", CATALOG_TYPE.getType()); + catalogProps.put("warehouse", WAREHOUSE); 
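+ // a HADOOP catalog needs only "type" and "warehouse"; a HIVE catalog would additionally require the metastore "uri"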
+ + configs.put(CommonConfig.KEY_CATALOG_NAME.key(), CATALOG_NAME); + configs.put(CommonConfig.CATALOG_PROPS.key(), catalogProps); + + CATALOG = + new IcebergCatalogFactory(new SourceConfig(ReadonlyConfig.fromMap(configs))) + .loadCatalog(); if (!CATALOG.tableExists(TABLE)) { CATALOG.createTable(TABLE, SCHEMA); } diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/ddl/inventory.sql b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/ddl/inventory.sql new file mode 100644 index 000000000000..9e9fff3f8f4c --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/ddl/inventory.sql @@ -0,0 +1,95 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +-- ---------------------------------------------------------------------------------------------------------------- +-- DATABASE: inventory +-- ---------------------------------------------------------------------------------------------------------------- + +-- Create and populate our products using a single insert with many rows +CREATE TABLE products ( + id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, + name VARCHAR(255) NOT NULL DEFAULT 'SeaTunnel', + description VARCHAR(512), + weight FLOAT +); +ALTER TABLE products AUTO_INCREMENT = 101; + +INSERT INTO products +VALUES (default,"scooter","Small 2-wheel scooter",3.14), + (default,"car battery","12V car battery",8.1), + (default,"12-pack drill bits","12-pack of drill bits with sizes ranging from #40 to #3",0.8), + (default,"hammer","12oz carpenter's hammer",0.75), + (default,"hammer","14oz carpenter's hammer",0.875), + (default,"hammer","16oz carpenter's hammer",1.0), + (default,"rocks","box of assorted rocks",5.3), + (default,"jacket","water resistant black wind breaker",0.1), + (default,"spare tire","24 inch spare tire",22.2); + +-- Create and populate the products on hand using multiple inserts +CREATE TABLE products_on_hand ( + product_id INTEGER NOT NULL PRIMARY KEY, + quantity INTEGER NOT NULL, + FOREIGN KEY (product_id) REFERENCES products(id) +); + +INSERT INTO products_on_hand VALUES (101,3); +INSERT INTO products_on_hand VALUES (102,8); +INSERT INTO products_on_hand VALUES (103,18); +INSERT INTO products_on_hand VALUES (104,4); +INSERT INTO products_on_hand VALUES (105,5); +INSERT INTO products_on_hand VALUES (106,0); +INSERT INTO products_on_hand VALUES (107,44); +INSERT INTO products_on_hand VALUES (108,2); +INSERT INTO products_on_hand VALUES (109,5); + +-- Create some customers ...
+CREATE TABLE customers ( + id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, + first_name VARCHAR(255) NOT NULL, + last_name VARCHAR(255) NOT NULL, + email VARCHAR(255) NOT NULL UNIQUE KEY +) AUTO_INCREMENT=1001; + + +INSERT INTO customers +VALUES (default,"Sally","Thomas","sally.thomas@acme.com"), + (default,"George","Bailey","gbailey@foobar.com"), + (default,"Edward","Walker","ed@walker.com"), + (default,"Anne","Kretchmar","annek@noanswer.org"); + +-- Create some very simple orders +CREATE TABLE orders ( + order_number INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, + order_date DATE NOT NULL, + purchaser INTEGER NOT NULL, + quantity INTEGER NOT NULL, + product_id INTEGER NOT NULL, + FOREIGN KEY order_customer (purchaser) REFERENCES customers(id), + FOREIGN KEY ordered_product (product_id) REFERENCES products(id) +) AUTO_INCREMENT = 10001; + +INSERT INTO orders +VALUES (default, '2016-01-16', 1001, 1, 102), + (default, '2016-01-17', 1002, 2, 105), + (default, '2016-02-18', 1004, 3, 109), + (default, '2016-02-19', 1002, 2, 106), + (default, '2016-02-21', 1003, 1, 107); + +CREATE TABLE category ( + id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, + category_name VARCHAR(255) +); \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/ddl/mysql_cdc.sql b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/ddl/mysql_cdc.sql new file mode 100644 index 000000000000..63dbc16bb6ab --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/ddl/mysql_cdc.sql @@ -0,0 +1,186 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License.
+-- + +-- ---------------------------------------------------------------------------------------------------------------- +-- DATABASE: inventory +-- ---------------------------------------------------------------------------------------------------------------- +CREATE DATABASE IF NOT EXISTS `mysql_cdc`; + +use mysql_cdc; +-- Create a mysql data source table +CREATE TABLE mysql_cdc_e2e_source_table +( + `id` int NOT NULL AUTO_INCREMENT, + `f_binary` binary(64) DEFAULT NULL, + `f_blob` blob, + `f_long_varbinary` mediumblob, + `f_longblob` longblob, + `f_tinyblob` tinyblob, + `f_varbinary` varbinary(100) DEFAULT NULL, + `f_smallint` smallint DEFAULT NULL, + `f_smallint_unsigned` smallint unsigned DEFAULT NULL, + `f_mediumint` mediumint DEFAULT NULL, + `f_mediumint_unsigned` mediumint unsigned DEFAULT NULL, + `f_int` int DEFAULT NULL, + `f_int_unsigned` int unsigned DEFAULT NULL, + `f_integer` int DEFAULT NULL, + `f_integer_unsigned` int unsigned DEFAULT NULL, + `f_bigint` bigint DEFAULT NULL, + `f_bigint_unsigned` bigint unsigned DEFAULT NULL, + `f_numeric` decimal(10, 0) DEFAULT NULL, + `f_decimal` decimal(10, 0) DEFAULT NULL, + `f_float` float DEFAULT NULL, + `f_double` double DEFAULT NULL, + `f_double_precision` double DEFAULT NULL, + `f_longtext` longtext, + `f_mediumtext` mediumtext, + `f_text` text, + `f_tinytext` tinytext, + `f_varchar` varchar(100) DEFAULT NULL, + `f_date` date DEFAULT NULL, + `f_datetime` datetime DEFAULT NULL, + `f_timestamp` timestamp NULL DEFAULT NULL, + `f_bit1` bit(1) DEFAULT NULL, + `f_bit64` bit(64) DEFAULT NULL, + `f_char` char(1) DEFAULT NULL, + `f_enum` enum ('enum1','enum2','enum3') DEFAULT NULL, + `f_mediumblob` mediumblob, + `f_long_varchar` mediumtext, + `f_real` double DEFAULT NULL, + `f_time` time DEFAULT NULL, + `f_tinyint` tinyint DEFAULT NULL, + `f_tinyint_unsigned` tinyint unsigned DEFAULT NULL, + `f_json` json DEFAULT NULL, + `f_year` year DEFAULT NULL, + PRIMARY KEY (`id`) +) ENGINE = InnoDB + AUTO_INCREMENT = 2 + DEFAULT CHARSET = utf8mb4 + COLLATE = utf8mb4_0900_ai_ci; + +CREATE TABLE mysql_cdc_e2e_source_table_no_primary_key +( + `id` int NOT NULL, + `f_binary` binary(64) DEFAULT NULL, + `f_blob` blob, + `f_long_varbinary` mediumblob, + `f_longblob` longblob, + `f_tinyblob` tinyblob, + `f_varbinary` varbinary(100) DEFAULT NULL, + `f_smallint` smallint DEFAULT NULL, + `f_smallint_unsigned` smallint unsigned DEFAULT NULL, + `f_mediumint` mediumint DEFAULT NULL, + `f_mediumint_unsigned` mediumint unsigned DEFAULT NULL, + `f_int` int DEFAULT NULL, + `f_int_unsigned` int unsigned DEFAULT NULL, + `f_integer` int DEFAULT NULL, + `f_integer_unsigned` int unsigned DEFAULT NULL, + `f_bigint` bigint DEFAULT NULL, + `f_bigint_unsigned` bigint unsigned DEFAULT NULL, + `f_numeric` decimal(10, 0) DEFAULT NULL, + `f_decimal` decimal(10, 0) DEFAULT NULL, + `f_float` float DEFAULT NULL, + `f_double` double DEFAULT NULL, + `f_double_precision` double DEFAULT NULL, + `f_longtext` longtext, + `f_mediumtext` mediumtext, + `f_text` text, + `f_tinytext` tinytext, + `f_varchar` varchar(100) DEFAULT NULL, + `f_date` date DEFAULT NULL, + `f_datetime` datetime DEFAULT NULL, + `f_timestamp` timestamp NULL DEFAULT NULL, + `f_bit1` bit(1) DEFAULT NULL, + `f_bit64` bit(64) DEFAULT NULL, + `f_char` char(1) DEFAULT NULL, + `f_enum` enum ('enum1','enum2','enum3') DEFAULT NULL, + `f_mediumblob` mediumblob, + `f_long_varchar` mediumtext, + `f_real` double DEFAULT NULL, + `f_time` time DEFAULT NULL, + `f_tinyint` tinyint DEFAULT NULL, + `f_tinyint_unsigned` tinyint unsigned 
DEFAULT NULL, + `f_json` json DEFAULT NULL, + `f_year` year DEFAULT NULL +) ENGINE = InnoDB + DEFAULT CHARSET = utf8mb4 + COLLATE = utf8mb4_0900_ai_ci; + +truncate table mysql_cdc_e2e_source_table; +truncate table mysql_cdc_e2e_source_table_no_primary_key; + +INSERT INTO mysql_cdc_e2e_source_table ( id, f_binary, f_blob, f_long_varbinary, f_longblob, f_tinyblob, f_varbinary, f_smallint, + f_smallint_unsigned, f_mediumint, f_mediumint_unsigned, f_int, f_int_unsigned, f_integer, + f_integer_unsigned, f_bigint, f_bigint_unsigned, f_numeric, f_decimal, f_float, f_double, + f_double_precision, f_longtext, f_mediumtext, f_text, f_tinytext, f_varchar, f_date, f_datetime, + f_timestamp, f_bit1, f_bit64, f_char, f_enum, f_mediumblob, f_long_varchar, f_real, f_time, + f_tinyint, f_tinyint_unsigned, f_json, f_year ) +VALUES ( 1, 0x61626374000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, + 0x68656C6C6F, 0x18000000789C0BC9C82C5600A244859CFCBC7485B2C4A2A4CCBCC4A24A00697308D4, NULL, + 0x74696E79626C6F62, 0x48656C6C6F20776F726C64, 12345, 54321, 123456, 654321, 1234567, 7654321, 1234567, 7654321, + 123456789, 987654321, 123, 789, 12.34, 56.78, 90.12, 'This is a long text field', 'This is a medium text field', + 'This is a text field', 'This is a tiny text field', 'This is a varchar field', '2022-04-27', '2022-04-27 14:30:00', + '2023-04-27 11:08:40', 1, b'0101010101010101010101010101010101010101010101010101010101010101', 'C', 'enum2', + 0x1B000000789C0BC9C82C5600A24485DCD494CCD25C85A49CFC2485B4CCD49C140083FF099A, 'This is a long varchar field', + 12.345, '14:30:00', -128, 255, '{ "key": "value" }', 2022 ), + ( 2, 0x61626374000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, + 0x68656C6C6F, 0x18000000789C0BC9C82C5600A244859CFCBC7485B2C4A2A4CCBCC4A24A00697308D4, NULL, 0x74696E79626C6F62, + 0x48656C6C6F20776F726C64, 12345, 54321, 123456, 654321, 1234567, 7654321, 1234567, 7654321, 123456789, 987654321, + 123, 789, 12.34, 56.78, 90.12, 'This is a long text field', 'This is a medium text field', 'This is a text field', + 'This is a tiny text field', 'This is a varchar field', '2022-04-27', '2022-04-27 14:30:00', '2023-04-27 11:08:40', + 1, b'0101010101010101010101010101010101010101010101010101010101010101', 'C', 'enum2', + 0x1B000000789C0BC9C82C5600A24485DCD494CCD25C85A49CFC2485B4CCD49C140083FF099A, 'This is a long varchar field', + 112.345, '14:30:00', -128, 22, '{ "key": "value" }', 2013 ), + ( 3, 0x61626374000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, + 0x68656C6C6F, 0x18000000789C0BC9C82C5600A244859CFCBC7485B2C4A2A4CCBCC4A24A00697308D4, NULL, 0x74696E79626C6F62, + 0x48656C6C6F20776F726C64, 12345, 54321, 123456, 654321, 1234567, 7654321, 1234567, 7654321, 123456789, 987654321, 123, + 789, 12.34, 56.78, 90.12, 'This is a long text field', 'This is a medium text field', 'This is a text field', + 'This is a tiny text field', 'This is a varchar field', '2022-04-27', '2022-04-27 14:30:00', '2023-04-27 11:08:40', + 1, b'0101010101010101010101010101010101010101010101010101010101010101', 'C', 'enum2', + 0x1B000000789C0BC9C82C5600A24485DCD494CCD25C85A49CFC2485B4CCD49C140083FF099A, 'This is a long varchar field', 112.345, + '14:30:00', -128, 22, '{ "key": "value" }', 2021 ); + +INSERT INTO mysql_cdc_e2e_source_table_no_primary_key ( id, f_binary, f_blob, f_long_varbinary, f_longblob, f_tinyblob, 
f_varbinary, f_smallint, + f_smallint_unsigned, f_mediumint, f_mediumint_unsigned, f_int, f_int_unsigned, f_integer, + f_integer_unsigned, f_bigint, f_bigint_unsigned, f_numeric, f_decimal, f_float, f_double, + f_double_precision, f_longtext, f_mediumtext, f_text, f_tinytext, f_varchar, f_date, f_datetime, + f_timestamp, f_bit1, f_bit64, f_char, f_enum, f_mediumblob, f_long_varchar, f_real, f_time, + f_tinyint, f_tinyint_unsigned, f_json, f_year ) +VALUES ( 1, 0x61626374000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, + 0x68656C6C6F, 0x18000000789C0BC9C82C5600A244859CFCBC7485B2C4A2A4CCBCC4A24A00697308D4, NULL, + 0x74696E79626C6F62, 0x48656C6C6F20776F726C64, 12345, 54321, 123456, 654321, 1234567, 7654321, 1234567, 7654321, + 123456789, 987654321, 123, 789, 12.34, 56.78, 90.12, 'This is a long text field', 'This is a medium text field', + 'This is a text field', 'This is a tiny text field', 'This is a varchar field', '2022-04-27', '2022-04-27 14:30:00', + '2023-04-27 11:08:40', 1, b'0101010101010101010101010101010101010101010101010101010101010101', 'C', 'enum2', + 0x1B000000789C0BC9C82C5600A24485DCD494CCD25C85A49CFC2485B4CCD49C140083FF099A, 'This is a long varchar field', + 12.345, '14:30:00', -128, 255, '{ "key": "value" }', 2022 ), + ( 2, 0x61626374000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, + 0x68656C6C6F, 0x18000000789C0BC9C82C5600A244859CFCBC7485B2C4A2A4CCBCC4A24A00697308D4, NULL, 0x74696E79626C6F62, + 0x48656C6C6F20776F726C64, 12345, 54321, 123456, 654321, 1234567, 7654321, 1234567, 7654321, 123456789, 987654321, + 123, 789, 12.34, 56.78, 90.12, 'This is a long text field', 'This is a medium text field', 'This is a text field', + 'This is a tiny text field', 'This is a varchar field', '2022-04-27', '2022-04-27 14:30:00', '2023-04-27 11:08:40', + 1, b'0101010101010101010101010101010101010101010101010101010101010101', 'C', 'enum2', + 0x1B000000789C0BC9C82C5600A24485DCD494CCD25C85A49CFC2485B4CCD49C140083FF099A, 'This is a long varchar field', + 112.345, '14:30:00', -128, 22, '{ "key": "value" }', 2013 ), + ( 3, 0x61626374000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, + 0x68656C6C6F, 0x18000000789C0BC9C82C5600A244859CFCBC7485B2C4A2A4CCBCC4A24A00697308D4, NULL, 0x74696E79626C6F62, + 0x48656C6C6F20776F726C64, 12345, 54321, 123456, 654321, 1234567, 7654321, 1234567, 7654321, 123456789, 987654321, 123, + 789, 12.34, 56.78, 90.12, 'This is a long text field', 'This is a medium text field', 'This is a text field', + 'This is a tiny text field', 'This is a varchar field', '2022-04-27', '2022-04-27 14:30:00', '2023-04-27 11:08:40', + 1, b'0101010101010101010101010101010101010101010101010101010101010101', 'C', 'enum2', + 0x1B000000789C0BC9C82C5600A24485DCD494CCD25C85A49CFC2485B4CCD49C140083FF099A, 'This is a long varchar field', 112.345, + '14:30:00', -128, 22, '{ "key": "value" }', 2021 ); \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/iceberg/fake_to_iceberg.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/iceberg/fake_to_iceberg.conf new file mode 100644 index 000000000000..a32b50d6173e --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/iceberg/fake_to_iceberg.conf @@ -0,0 +1,75 @@ +# +# Licensed to the Apache 
Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +env { + parallelism = 1 + job.mode = "BATCH" + + # You can set spark configuration here + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local +} + +source { + FakeSource { + row.num = 100 + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + result_table_name = "fake" + } +} + +transform { +} + +sink { + Iceberg { + catalog_name="seatunnel_test" + iceberg.catalog.config={ + "type"="hadoop" + "warehouse"="file:///tmp/seatunnel/iceberg/hadoop-sink/" + } + namespace="seatunnel_namespace" + table="iceberg_sink_table" + iceberg.table.write-props={ + write.format.default="parquet" + write.target-file-size-bytes=10 + } + iceberg.table.auto-create-enabled=true + iceberg.table.partition-keys="c_timestamp" + case_sensitive=true + } +} \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/iceberg/iceberg_source.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/iceberg/iceberg_source.conf index 969e85e1232b..d6e29483acb9 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/iceberg/iceberg_source.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/iceberg/iceberg_source.conf @@ -45,8 +45,10 @@ source { } } catalog_name = "seatunnel" - catalog_type = "hadoop" - warehouse = "file:///tmp/seatunnel/iceberg/hadoop/" + iceberg.catalog.config={ + "type"="hadoop" + "warehouse"="file:///tmp/seatunnel/iceberg/hadoop/" + } namespace = "database1" table = "source" result_table_name = "iceberg" diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/iceberg/mysql_cdc_to_iceberg.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/iceberg/mysql_cdc_to_iceberg.conf new file mode 100644 index 000000000000..df4f7558403d --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/iceberg/mysql_cdc_to_iceberg.conf @@ -0,0 +1,65 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +###### +###### This config file is a demonstration of streaming processing in seatunnel config +###### + +env { + parallelism = 1 + job.mode = "STREAMING" + checkpoint.interval = 5000 +} + +source { + MySQL-CDC { + result_table_name="customer_result_table" + catalog { + factory = Mysql + } + database-names=["mysql_cdc"] + table-names = ["mysql_cdc.mysql_cdc_e2e_source_table"] + format=DEFAULT + username = "st_user" + password = "seatunnel" + base-url = "jdbc:mysql://mysql_cdc_e2e:3306/mysql_cdc" + } +} + +transform { +} + +sink { + Iceberg { + catalog_name="seatunnel_test" + iceberg.catalog.config={ + "type"="hadoop" + "warehouse"="file:///tmp/seatunnel/iceberg/hadoop-cdc-sink/" + } + namespace="seatunnel_namespace" + table="iceberg_sink_table" + iceberg.table.write-props={ + write.format.default="parquet" + write.target-file-size-bytes=10 + } + iceberg.table.auto-create-enabled=true + iceberg.table.primary-keys="id" + iceberg.table.partition-keys="f_datetime" + iceberg.table.upsert-mode-enabled=true + iceberg.table.schema-evolution-enabled=true + case_sensitive=true + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/mysql/server-gtids/my.cnf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/mysql/server-gtids/my.cnf new file mode 100644 index 000000000000..a390897885d0 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/mysql/server-gtids/my.cnf @@ -0,0 +1,65 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# For advice on how to change settings please see +# http://dev.mysql.com/doc/refman/5.7/en/server-configuration-defaults.html + +[mysqld] +# +# Remove leading # and set to the amount of RAM for the most important data +# cache in MySQL. Start at 70% of total RAM for dedicated server, else 10%. +# innodb_buffer_pool_size = 128M +# +# Remove leading # to turn on a very important data integrity option: logging +# changes to the binary log between backups. +# log_bin +# +# Remove leading # to set options mainly useful for reporting servers. +# The server defaults are faster for transactions and fast SELECTs. +# Adjust sizes as needed, experiment to find the optimal values. 
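+# (The commented-out buffer sizes below are stock MySQL tuning examples; for +# the CDC-to-Iceberg tests in this module only the binlog and GTID settings at +# the bottom of this file -- server-id, log_bin, binlog_format=row, gtid_mode -- +# actually matter.)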
+# join_buffer_size = 128M +# sort_buffer_size = 2M +# read_rnd_buffer_size = 2M +skip-host-cache +skip-name-resolve +#datadir=/var/lib/mysql +#socket=/var/lib/mysql/mysql.sock +secure-file-priv=/var/lib/mysql +user=mysql + +# Disabling symbolic-links is recommended to prevent assorted security risks +symbolic-links=0 + +#log-error=/var/log/mysqld.log +#pid-file=/var/run/mysqld/mysqld.pid + +# ---------------------------------------------- +# Enable the binlog for replication & CDC +# ---------------------------------------------- + +# Enable binary replication log and set the prefix, expiration, and log format. +# The prefix is arbitrary, expiration can be short for integration tests but would +# be longer on a production system. Row-level info is required for ingest to work. +# Server ID is required, but this will vary on production systems +server-id = 223344 +log_bin = mysql-bin +expire_logs_days = 1 +binlog_format = row + +# enable gtid mode +gtid_mode = on +enforce_gtid_consistency = on \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/mysql/setup.sql b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/mysql/setup.sql new file mode 100644 index 000000000000..aa4534e0ad54 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-e2e/src/test/resources/mysql/setup.sql @@ -0,0 +1,27 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +-- In production you would almost certainly limit the replication user to the follower (replica) machine, +-- to prevent other clients accessing the log from other machines. For example, 'replicator'@'follower.acme.com'.
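+-- A host-restricted grant of that shape would look like the following +-- (illustrative only; not executed here): +-- GRANT REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'replicator'@'follower.acme.com';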
+-- However, in this database we'll grant 2 users different privileges: +-- +-- 1) 'st_user' - all privileges required by the snapshot reader AND binlog reader (used for testing) +-- 2) 'mysqluser' - all privileges +-- +GRANT SELECT, RELOAD, SHOW DATABASES, REPLICATION SLAVE, REPLICATION CLIENT, DROP, LOCK TABLES ON *.* TO 'st_user'@'%'; +CREATE USER 'mysqluser' IDENTIFIED BY 'mysqlpw'; +GRANT ALL PRIVILEGES ON *.* TO 'mysqluser'@'%'; diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-hadoop3-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/hadoop3/IcebergSourceIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-hadoop3-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/hadoop3/IcebergSourceIT.java index f63510e3f3e5..fec452c45323 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-hadoop3-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/hadoop3/IcebergSourceIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-hadoop3-e2e/src/test/java/org/apache/seatunnel/e2e/connector/iceberg/hadoop3/IcebergSourceIT.java @@ -17,8 +17,11 @@ package org.apache.seatunnel.e2e.connector.iceberg.hadoop3; +import org.apache.seatunnel.api.configuration.ReadonlyConfig; import org.apache.seatunnel.connectors.seatunnel.iceberg.IcebergCatalogFactory; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.CommonConfig; import org.apache.seatunnel.connectors.seatunnel.iceberg.config.IcebergCatalogType; +import org.apache.seatunnel.connectors.seatunnel.iceberg.config.SourceConfig; import org.apache.seatunnel.e2e.common.TestResource; import org.apache.seatunnel.e2e.common.TestSuiteBase; import org.apache.seatunnel.e2e.common.container.ContainerExtendedFactory; @@ -63,7 +66,9 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.IcebergCatalogType.HADOOP; @@ -133,7 +138,19 @@ public void testIcebergSource(TestContainer container) } private void initializeIcebergTable() { - CATALOG = new IcebergCatalogFactory(CATALOG_NAME, CATALOG_TYPE, WAREHOUSE, null).create(); + Map<String, Object> configs = new HashMap<>(); + + // add catalog properties + Map<String, String> catalogProps = new HashMap<>(); + catalogProps.put("type", CATALOG_TYPE.getType()); + catalogProps.put("warehouse", WAREHOUSE); + + configs.put(CommonConfig.KEY_CATALOG_NAME.key(), CATALOG_NAME); + + configs.put(CommonConfig.CATALOG_PROPS.key(), catalogProps); + + ReadonlyConfig readonlyConfig = ReadonlyConfig.fromMap(configs); + CATALOG = new IcebergCatalogFactory(new SourceConfig(readonlyConfig)).loadCatalog(); if (!CATALOG.tableExists(TABLE)) { CATALOG.createTable(TABLE, SCHEMA); } diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-hadoop3-e2e/src/test/resources/iceberg/iceberg_source.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-hadoop3-e2e/src/test/resources/iceberg/iceberg_source.conf index 97637f9a0480..c7cd57699ee9 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-hadoop3-e2e/src/test/resources/iceberg/iceberg_source.conf +++
b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-iceberg-hadoop3-e2e/src/test/resources/iceberg/iceberg_source.conf @@ -49,8 +49,10 @@ source { } } catalog_name = "seatunnel" - catalog_type = "hadoop" - warehouse = "file:///tmp/seatunnel/iceberg/hadoop3/" + iceberg.catalog.config={ + "type"="hadoop" + "warehouse"="file:///tmp/seatunnel/iceberg/hadoop3/" + } namespace = "database1" table = "source" result_table_name = "iceberg" diff --git a/seatunnel-shade/seatunnel-hadoop3-3.1.4-uber/pom.xml b/seatunnel-shade/seatunnel-hadoop3-3.1.4-uber/pom.xml index b80f07ea91df..00b55265c495 100644 --- a/seatunnel-shade/seatunnel-hadoop3-3.1.4-uber/pom.xml +++ b/seatunnel-shade/seatunnel-hadoop3-3.1.4-uber/pom.xml @@ -47,7 +47,6 @@ hadoop-client ${hadoop3.version} -
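Note on the `iceberg_source.conf` hunks above: both e2e modules migrate from two flat source options to a single catalog-properties map. A minimal before/after sketch (values taken from the hadoop3 config; treat the exact key names as defined by the connector's CommonConfig options):

```
# before
catalog_type = "hadoop"
warehouse = "file:///tmp/seatunnel/iceberg/hadoop3/"

# after
iceberg.catalog.config={
    "type"="hadoop"
    "warehouse"="file:///tmp/seatunnel/iceberg/hadoop3/"
}
```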