[spark] Support migrate database procedure for spark
xuzifu666 committed Sep 19, 2024
1 parent 2c45ac0 commit 86796c7
Showing 5 changed files with 276 additions and 18 deletions.
12 changes: 12 additions & 0 deletions docs/content/spark/procedures.md
@@ -130,6 +130,18 @@ This section introduces all available spark procedures about paimon.
CALL sys.rollback(table => 'default.T', version => 10)
</td>
</tr>
<tr>
<td>migrate_database</td>
<td>
Migrate all hive tables in the specified database to paimon tables. Arguments:
<li>source_type: the type of the origin tables to be migrated, such as hive. Cannot be empty.</li>
<li>database: name of the origin database to be migrated. Cannot be empty.</li>
<li>options: the table options of the target paimon tables, as a comma-separated 'key=value' string.</li>
<li>options_map: a map of additional key-value table options; entries here override duplicates from options.</li>
<li>parallelism: the parallelism of the migrate process. Defaults to the number of available processor cores.</li>
</td>
<td>CALL sys.migrate_database(source_type => 'hive', database => 'default', options => 'file.format=parquet', options_map => map('k1','v1'), parallelism => 6)</td>
</tr>
<tr>
<td>migrate_table</td>
<td>
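For a quick feel of the new procedure, a minimal end-to-end run could look like the Scala sketch below. It assumes a Spark session whose session catalog is a Paimon catalog backed by a Hive Metastore; the database and table names are illustrative.

// Create and populate an ordinary Hive table (illustrative names).
spark.sql("CREATE TABLE default.orders (id INT, amount DOUBLE) USING parquet")
spark.sql("INSERT INTO default.orders VALUES (1, 9.5), (2, 3.2)")

// Migrate every Hive table in the 'default' database to Paimon in one call.
spark.sql(
  "CALL sys.migrate_database(" +
    "source_type => 'hive', " +
    "database => 'default', " +
    "options => 'file.format=parquet', " +
    "parallelism => 4)")

// The tables keep their names but now resolve to Paimon tables.
spark.sql("SELECT * FROM default.orders ORDER BY id").show()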
org/apache/paimon/spark/SparkProcedures.java
@@ -18,24 +18,7 @@

package org.apache.paimon.spark;

import org.apache.paimon.spark.procedure.CompactProcedure;
import org.apache.paimon.spark.procedure.CreateBranchProcedure;
import org.apache.paimon.spark.procedure.CreateTagFromTimestampProcedure;
import org.apache.paimon.spark.procedure.CreateTagProcedure;
import org.apache.paimon.spark.procedure.DeleteBranchProcedure;
import org.apache.paimon.spark.procedure.DeleteTagProcedure;
import org.apache.paimon.spark.procedure.ExpirePartitionsProcedure;
import org.apache.paimon.spark.procedure.ExpireSnapshotsProcedure;
import org.apache.paimon.spark.procedure.FastForwardProcedure;
import org.apache.paimon.spark.procedure.MarkPartitionDoneProcedure;
import org.apache.paimon.spark.procedure.MigrateDatabaseProcedure;
import org.apache.paimon.spark.procedure.MigrateFileProcedure;
import org.apache.paimon.spark.procedure.MigrateTableProcedure;
import org.apache.paimon.spark.procedure.Procedure;
import org.apache.paimon.spark.procedure.ProcedureBuilder;
import org.apache.paimon.spark.procedure.RemoveOrphanFilesProcedure;
import org.apache.paimon.spark.procedure.RepairProcedure;
import org.apache.paimon.spark.procedure.ResetConsumerProcedure;
import org.apache.paimon.spark.procedure.RollbackProcedure;

import org.apache.paimon.shade.guava30.com.google.common.collect.ImmutableMap;

@@ -66,6 +49,7 @@ private static Map<String, Supplier<ProcedureBuilder>> initProcedureBuilders() {
procedureBuilders.put("create_branch", CreateBranchProcedure::builder);
procedureBuilders.put("delete_branch", DeleteBranchProcedure::builder);
procedureBuilders.put("compact", CompactProcedure::builder);
procedureBuilders.put("migrate_database", MigrateDatabaseProcedure::builder);
procedureBuilders.put("migrate_table", MigrateTableProcedure::builder);
procedureBuilders.put("migrate_file", MigrateFileProcedure::builder);
procedureBuilders.put("remove_orphan_files", RemoveOrphanFilesProcedure::builder);
org/apache/paimon/spark/procedure/MigrateDatabaseProcedure.java
@@ -0,0 +1,136 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.paimon.spark.procedure;

import org.apache.paimon.catalog.Catalog;
import org.apache.paimon.migrate.Migrator;
import org.apache.paimon.spark.catalog.WithPaimonCatalog;
import org.apache.paimon.spark.utils.TableMigrationUtils;
import org.apache.paimon.utils.ParameterUtils;

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.util.MapData;
import org.apache.spark.sql.connector.catalog.TableCatalog;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static org.apache.spark.sql.types.DataTypes.IntegerType;
import static org.apache.spark.sql.types.DataTypes.StringType;

/**
* Migrate database procedure. Usage:
*
* <pre><code>
* CALL sys.migrate_database(source_type => 'hive', database => 'db01', options => 'x1=y1,x2=y2')
* </code></pre>
*/
public class MigrateDatabaseProcedure extends BaseProcedure {

private static final ProcedureParameter[] PARAMETERS =
new ProcedureParameter[] {
ProcedureParameter.required("source_type", StringType),
ProcedureParameter.required("database", StringType),
ProcedureParameter.optional("options", StringType),
ProcedureParameter.optional(
"options_map", DataTypes.createMapType(StringType, StringType)),
ProcedureParameter.optional("parallelism", IntegerType),
};

private static final StructType OUTPUT_TYPE =
new StructType(
new StructField[] {
new StructField("result", DataTypes.BooleanType, true, Metadata.empty())
});

protected MigrateDatabaseProcedure(TableCatalog tableCatalog) {
super(tableCatalog);
}

@Override
public ProcedureParameter[] parameters() {
return PARAMETERS;
}

@Override
public StructType outputType() {
return OUTPUT_TYPE;
}

@Override
public InternalRow[] call(InternalRow args) {
String format = args.getString(0);
String database = args.getString(1);
String properties = args.isNullAt(2) ? null : args.getString(2);
MapData mapData = args.isNullAt(3) ? null : args.getMap(3);
Map<String, String> optionMap = mapDataToHashMap(mapData);
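// Fall back to the machine's available core count when parallelism is not given.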
int parallelism =
args.isNullAt(4) ? Runtime.getRuntime().availableProcessors() : args.getInt(4);

Catalog paimonCatalog = ((WithPaimonCatalog) tableCatalog()).paimonCatalog();

Map<String, String> options = ParameterUtils.parseCommaSeparatedKeyValues(properties);
options.putAll(optionMap);

try {
List<Migrator> migrators =
TableMigrationUtils.getImporters(
format, paimonCatalog, database, parallelism, options);

for (Migrator migrator : migrators) {
migrator.executeMigrate();
migrator.renameTable(false);
}
} catch (Exception e) {
throw new RuntimeException("Call migrate_database error: " + e.getMessage(), e);
}

return new InternalRow[] {newInternalRow(true)};
}

public static Map<String, String> mapDataToHashMap(MapData mapData) {
HashMap<String, String> map = new HashMap<>();
if (mapData != null) {
for (int index = 0; index < mapData.numElements(); index++) {
map.put(
mapData.keyArray().getUTF8String(index).toString(),
mapData.valueArray().getUTF8String(index).toString());
}
}
return map;
}

public static ProcedureBuilder builder() {
return new BaseProcedure.Builder<MigrateDatabaseProcedure>() {
@Override
public MigrateDatabaseProcedure doBuild() {
return new MigrateDatabaseProcedure(tableCatalog());
}
};
}

@Override
public String description() {
return "MigrateDatabaseProcedure";
}
}
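Note the merge rule in call() above: the comma-separated options string is parsed first, then the options_map entries are overlaid via putAll, so for a key present in both sources the options_map value wins. A hedged Scala sketch of that rule (the real parsing is done by ParameterUtils.parseCommaSeparatedKeyValues, whose edge-case behavior may differ):

// Parse 'k1=v1,k2=v2', then overlay the map; map entries win on duplicates.
def mergeOptions(options: String, optionsMap: Map[String, String]): Map[String, String] = {
  val parsed = options.split(",")
    .map(_.trim)
    .filter(_.nonEmpty)
    .map { kv =>
      val Array(k, v) = kv.split("=", 2) // split on the first '=' only
      k.trim -> v.trim
    }
    .toMap
  parsed ++ optionsMap
}

// mergeOptions("file.format=parquet,bucket=4", Map("bucket" -> "8"))
// == Map("file.format" -> "parquet", "bucket" -> "8")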
org/apache/paimon/spark/utils/TableMigrationUtils.java
@@ -24,6 +24,7 @@
import org.apache.paimon.hive.migrate.HiveMigrator;
import org.apache.paimon.migrate.Migrator;

import java.util.List;
import java.util.Map;

/** Migration util to choose importer according to connector. */
@@ -58,4 +59,25 @@ public static Migrator getImporter(
throw new UnsupportedOperationException("Unsupported connector " + connector);
}
}

public static List<Migrator> getImporters(
String connector,
Catalog catalog,
String sourceDatabase,
Integer parallelism,
Map<String, String> options) {
switch (connector) {
case "hive":
if (catalog instanceof CachingCatalog) {
catalog = ((CachingCatalog) catalog).wrapped();
}
if (!(catalog instanceof HiveCatalog)) {
throw new IllegalArgumentException("Only Hive Catalog is supported");
}
return HiveMigrator.databaseMigrators(
(HiveCatalog) catalog, sourceDatabase, options, parallelism);
default:
throw new UnsupportedOperationException("Unsupported connector " + connector);
}
}
}
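For orientation, this is roughly how the procedure drives the migrators returned above; a sketch only, assuming a Paimon catalog instance named catalog (an org.apache.paimon.catalog.Catalog) is in scope and eliding error handling:

import scala.collection.JavaConverters._
import org.apache.paimon.spark.utils.TableMigrationUtils

val migrators = TableMigrationUtils
  .getImporters("hive", catalog, "default", 4, Map("file.format" -> "parquet").asJava)
  .asScala

migrators.foreach { migrator =>
  migrator.executeMigrate() // migrate the table's files into a Paimon table
  migrator.renameTable(false) // swap the migrated table into place
}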
org/apache/paimon/spark/procedure/MigrateDatabaseProcedureTest.scala
@@ -0,0 +1,104 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.paimon.spark.procedure

import org.apache.paimon.spark.PaimonHiveTestBase

import org.apache.spark.sql.Row

class MigrateDatabaseProcedureTest extends PaimonHiveTestBase {

Seq("parquet", "orc", "avro").foreach(
format => {
test(s"Paimon migrate database procedure: migrate $format non-partitioned database") {
withTable("hive_tbl", "hive_tbl1") {
// create hive tables
spark.sql(s"""
|CREATE TABLE hive_tbl (id STRING, name STRING, pt STRING)
|USING $format
|""".stripMargin)

spark.sql(s"""
|CREATE TABLE hive_tbl1 (id STRING, name STRING, pt STRING)
|USING $format
|""".stripMargin)

var rows0 = spark.sql("SHOW CREATE TABLE hive_tbl").collect()
assert(!rows0.apply(0).toString().contains("USING paimon"))

rows0 = spark.sql("SHOW CREATE TABLE hive_tbl1").collect()
assert(!rows0.apply(0).toString().contains("USING paimon"))

spark.sql(s"INSERT INTO hive_tbl VALUES ('1', 'a', 'p1'), ('2', 'b', 'p2')")

spark.sql(
s"CALL sys.migrate_database(source_type => 'hive', database => '$hiveDbName', options => 'file.format=$format')")

checkAnswer(
spark.sql(s"SELECT * FROM hive_tbl ORDER BY id"),
Row("1", "a", "p1") :: Row("2", "b", "p2") :: Nil)

var rows1 = spark.sql("SHOW CREATE TABLE hive_tbl").collect()
assert(rows1.apply(0).toString().contains("USING paimon"))

rows1 = spark.sql("SHOW CREATE TABLE hive_tbl1").collect()
assert(rows1.apply(0).toString().contains("USING paimon"))

}
}
})

Seq("parquet", "orc", "avro").foreach(
format => {
test(
s"Paimon migrate database procedure: migrate $format database with parallelism set") {
withTable("hive_tbl_01", "hive_tbl_02") {
// create hive tables
spark.sql(s"""
|CREATE TABLE hive_tbl_01 (id STRING, name STRING, pt STRING)
|USING $format
|""".stripMargin)

spark.sql(s"""
|CREATE TABLE hive_tbl_02 (id STRING, name STRING, pt STRING)
|USING $format
|""".stripMargin)

var rows0 = spark.sql("SHOW CREATE TABLE hive_tbl_01").collect()
assert(!rows0.apply(0).toString().contains("USING paimon"))

rows0 = spark.sql("SHOW CREATE TABLE hive_tbl_02").collect()
assert(!rows0.apply(0).toString().contains("USING paimon"))

spark.sql(s"INSERT INTO hive_tbl_01 VALUES ('1', 'a', 'p1'), ('2', 'b', 'p2')")

spark.sql(
s"CALL sys.migrate_database(source_type => 'hive', database => '$hiveDbName', options => 'file.format=$format', parallelism => 6)")

checkAnswer(
spark.sql(s"SELECT * FROM hive_tbl_01 ORDER BY id"),
Row("1", "a", "p1") :: Row("2", "b", "p2") :: Nil)

var rows1 = spark.sql("SHOW CREATE TABLE hive_tbl_01").collect()
assert(rows1.apply(0).toString().contains("USING paimon"))

rows1 = spark.sql("SHOW CREATE TABLE hive_tbl_02").collect()
assert(rows1.apply(0).toString().contains("USING paimon"))
}
}
})
}
