[spark] Support migrate database procedure for spark
xuzifu666 committed Sep 19, 2024
1 parent 2c45ac0 commit 86796c7
Showing 5 changed files with 276 additions and 18 deletions.
12 changes: 12 additions & 0 deletions docs/content/spark/procedures.md
@@ -130,6 +130,18 @@ This section introduces all available spark procedures about paimon.
CALL sys.rollback(table => 'default.T', version => 10)
</td>
</tr>
<tr>
<td>migrate_database</td>
<td>
Migrate all hive tables in the specified database to paimon tables. Arguments:
<li>source_type: the type of the origin tables to be migrated, such as hive. Cannot be empty.</li>
<li>database: name of the origin database to be migrated. Cannot be empty.</li>
<li>options: the table options of the target paimon tables, as a comma-separated 'key=value' string.</li>
<li>options_map: a map of additional key-value table options; entries here override duplicates from options.</li>
<li>parallelism: the parallelism of the migrate process. Defaults to the number of available processor cores.</li>
</td>
<td>CALL sys.migrate_database(source_type => 'hive', database => 'default', options => 'file.format=parquet', options_map => map('k1','v1'), parallelism => 6)</td>
</tr>
<tr>
<td>migrate_table</td>
<td>
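For a quick feel of the new procedure, a minimal end-to-end run could look like the Scala sketch below. It assumes a Spark session whose session catalog is a Paimon catalog backed by a Hive Metastore; the database and table names are illustrative.

// Create and populate an ordinary Hive table (illustrative names).
spark.sql("CREATE TABLE default.orders (id INT, amount DOUBLE) USING parquet")
spark.sql("INSERT INTO default.orders VALUES (1, 9.5), (2, 3.2)")

// Migrate every Hive table in the 'default' database to Paimon in one call.
spark.sql(
  "CALL sys.migrate_database(" +
    "source_type => 'hive', " +
    "database => 'default', " +
    "options => 'file.format=parquet', " +
    "parallelism => 4)")

// The tables keep their names but now resolve to Paimon tables.
spark.sql("SELECT * FROM default.orders ORDER BY id").show()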
org/apache/paimon/spark/SparkProcedures.java
@@ -18,24 +18,7 @@

package org.apache.paimon.spark;

import org.apache.paimon.spark.procedure.CompactProcedure;
import org.apache.paimon.spark.procedure.CreateBranchProcedure;
import org.apache.paimon.spark.procedure.CreateTagFromTimestampProcedure;
import org.apache.paimon.spark.procedure.CreateTagProcedure;
import org.apache.paimon.spark.procedure.DeleteBranchProcedure;
import org.apache.paimon.spark.procedure.DeleteTagProcedure;
import org.apache.paimon.spark.procedure.ExpirePartitionsProcedure;
import org.apache.paimon.spark.procedure.ExpireSnapshotsProcedure;
import org.apache.paimon.spark.procedure.FastForwardProcedure;
import org.apache.paimon.spark.procedure.MarkPartitionDoneProcedure;
import org.apache.paimon.spark.procedure.MigrateDatabaseProcedure;
import org.apache.paimon.spark.procedure.MigrateFileProcedure;
import org.apache.paimon.spark.procedure.MigrateTableProcedure;
import org.apache.paimon.spark.procedure.Procedure;
import org.apache.paimon.spark.procedure.ProcedureBuilder;
import org.apache.paimon.spark.procedure.RemoveOrphanFilesProcedure;
import org.apache.paimon.spark.procedure.RepairProcedure;
import org.apache.paimon.spark.procedure.ResetConsumerProcedure;
import org.apache.paimon.spark.procedure.RollbackProcedure;

import org.apache.paimon.shade.guava30.com.google.common.collect.ImmutableMap;

@@ -66,6 +49,7 @@ private static Map<String, Supplier<ProcedureBuilder>> initProcedureBuilders() {
procedureBuilders.put("create_branch", CreateBranchProcedure::builder);
procedureBuilders.put("delete_branch", DeleteBranchProcedure::builder);
procedureBuilders.put("compact", CompactProcedure::builder);
procedureBuilders.put("migrate_database", MigrateDatabaseProcedure::builder);
procedureBuilders.put("migrate_table", MigrateTableProcedure::builder);
procedureBuilders.put("migrate_file", MigrateFileProcedure::builder);
procedureBuilders.put("remove_orphan_files", RemoveOrphanFilesProcedure::builder);
org/apache/paimon/spark/procedure/MigrateDatabaseProcedure.java
@@ -0,0 +1,136 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.paimon.spark.procedure;

import org.apache.paimon.catalog.Catalog;
import org.apache.paimon.migrate.Migrator;
import org.apache.paimon.spark.catalog.WithPaimonCatalog;
import org.apache.paimon.spark.utils.TableMigrationUtils;
import org.apache.paimon.utils.ParameterUtils;

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.util.MapData;
import org.apache.spark.sql.connector.catalog.TableCatalog;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static org.apache.spark.sql.types.DataTypes.IntegerType;
import static org.apache.spark.sql.types.DataTypes.StringType;

/**
* Migrate database procedure. Usage:
*
* <pre><code>
* CALL sys.migrate_database(source_type => 'hive', database => 'db01', options => 'x1=y1,x2=y2')
* </code></pre>
*/
public class MigrateDatabaseProcedure extends BaseProcedure {

private static final ProcedureParameter[] PARAMETERS =
new ProcedureParameter[] {
ProcedureParameter.required("source_type", StringType),
ProcedureParameter.required("database", StringType),
ProcedureParameter.optional("options", StringType),
ProcedureParameter.optional(
"options_map", DataTypes.createMapType(StringType, StringType)),
ProcedureParameter.optional("parallelism", IntegerType),
};

private static final StructType OUTPUT_TYPE =
new StructType(
new StructField[] {
new StructField("result", DataTypes.BooleanType, true, Metadata.empty())
});

protected MigrateDatabaseProcedure(TableCatalog tableCatalog) {
super(tableCatalog);
}

@Override
public ProcedureParameter[] parameters() {
return PARAMETERS;
}

@Override
public StructType outputType() {
return OUTPUT_TYPE;
}

@Override
public InternalRow[] call(InternalRow args) {
String format = args.getString(0);
String database = args.getString(1);
String properties = args.isNullAt(2) ? null : args.getString(2);
MapData mapData = args.isNullAt(3) ? null : args.getMap(3);
Map<String, String> optionMap = mapDataToHashMap(mapData);
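// Fall back to the machine's available core count when parallelism is not given.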
int parallelism =
args.isNullAt(4) ? Runtime.getRuntime().availableProcessors() : args.getInt(4);

Catalog paimonCatalog = ((WithPaimonCatalog) tableCatalog()).paimonCatalog();

Map<String, String> options = ParameterUtils.parseCommaSeparatedKeyValues(properties);
options.putAll(optionMap);

try {
List<Migrator> migrators =
TableMigrationUtils.getImporters(
format, paimonCatalog, database, parallelism, options);

for (Migrator migrator : migrators) {
migrator.executeMigrate();
migrator.renameTable(false);
}
} catch (Exception e) {
throw new RuntimeException("Call migrate_database error: " + e.getMessage(), e);
}

return new InternalRow[] {newInternalRow(true)};
}

public static Map<String, String> mapDataToHashMap(MapData mapData) {
HashMap<String, String> map = new HashMap<>();
if (mapData != null) {
for (int index = 0; index < mapData.numElements(); index++) {
map.put(
mapData.keyArray().getUTF8String(index).toString(),
mapData.valueArray().getUTF8String(index).toString());
}
}
return map;
}

public static ProcedureBuilder builder() {
return new BaseProcedure.Builder<MigrateDatabaseProcedure>() {
@Override
public MigrateDatabaseProcedure doBuild() {
return new MigrateDatabaseProcedure(tableCatalog());
}
};
}

@Override
public String description() {
return "MigrateDatabaseProcedure";
}
}
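Note the merge rule in call() above: the comma-separated options string is parsed first, then the options_map entries are overlaid via putAll, so for a key present in both sources the options_map value wins. A hedged Scala sketch of that rule (the real parsing is done by ParameterUtils.parseCommaSeparatedKeyValues, whose edge-case behavior may differ):

// Parse 'k1=v1,k2=v2', then overlay the map; map entries win on duplicates.
def mergeOptions(options: String, optionsMap: Map[String, String]): Map[String, String] = {
  val parsed = options.split(",")
    .map(_.trim)
    .filter(_.nonEmpty)
    .map { kv =>
      val Array(k, v) = kv.split("=", 2) // split on the first '=' only
      k.trim -> v.trim
    }
    .toMap
  parsed ++ optionsMap
}

// mergeOptions("file.format=parquet,bucket=4", Map("bucket" -> "8"))
// == Map("file.format" -> "parquet", "bucket" -> "8")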
org/apache/paimon/spark/utils/TableMigrationUtils.java
@@ -24,6 +24,7 @@
import org.apache.paimon.hive.migrate.HiveMigrator;
import org.apache.paimon.migrate.Migrator;

import java.util.List;
import java.util.Map;

/** Migration util to choose importer according to connector. */
@@ -58,4 +59,25 @@ public static Migrator getImporter(
throw new UnsupportedOperationException("Unsupported connector " + connector);
}
}

public static List<Migrator> getImporters(
String connector,
Catalog catalog,
String sourceDatabase,
Integer parallelism,
Map<String, String> options) {
switch (connector) {
case "hive":
if (catalog instanceof CachingCatalog) {
catalog = ((CachingCatalog) catalog).wrapped();
}
if (!(catalog instanceof HiveCatalog)) {
throw new IllegalArgumentException("Only Hive Catalog is supported");
}
return HiveMigrator.databaseMigrators(
(HiveCatalog) catalog, sourceDatabase, options, parallelism);
default:
throw new UnsupportedOperationException("Unsupported connector " + connector);
}
}
}
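For orientation, this is roughly how the procedure drives the migrators returned above; a sketch only, assuming a Paimon catalog instance named catalog (an org.apache.paimon.catalog.Catalog) is in scope and eliding error handling:

import scala.collection.JavaConverters._
import org.apache.paimon.spark.utils.TableMigrationUtils

val migrators = TableMigrationUtils
  .getImporters("hive", catalog, "default", 4, Map("file.format" -> "parquet").asJava)
  .asScala

migrators.foreach { migrator =>
  migrator.executeMigrate() // migrate the table's files into a Paimon table
  migrator.renameTable(false) // swap the migrated table into place
}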
org/apache/paimon/spark/procedure/MigrateDatabaseProcedureTest.scala
@@ -0,0 +1,104 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.paimon.spark.procedure

import org.apache.paimon.spark.PaimonHiveTestBase

import org.apache.spark.sql.Row

class MigrateDatabaseProcedureTest extends PaimonHiveTestBase {

Seq("parquet", "orc", "avro").foreach(
format => {
test(s"Paimon migrate database procedure: migrate $format non-partitioned database") {
withTable("hive_tbl", "hive_tbl1") {
// create hive tables
spark.sql(s"""
|CREATE TABLE hive_tbl (id STRING, name STRING, pt STRING)
|USING $format
|""".stripMargin)

spark.sql(s"""
|CREATE TABLE hive_tbl1 (id STRING, name STRING, pt STRING)
|USING $format
|""".stripMargin)

var rows0 = spark.sql("SHOW CREATE TABLE hive_tbl").collect()
assert(!rows0.apply(0).toString().contains("USING paimon"))

rows0 = spark.sql("SHOW CREATE TABLE hive_tbl1").collect()
assert(!rows0.apply(0).toString().contains("USING paimon"))

spark.sql(s"INSERT INTO hive_tbl VALUES ('1', 'a', 'p1'), ('2', 'b', 'p2')")

spark.sql(
s"CALL sys.migrate_database(source_type => 'hive', database => '$hiveDbName', options => 'file.format=$format')")

checkAnswer(
spark.sql(s"SELECT * FROM hive_tbl ORDER BY id"),
Row("1", "a", "p1") :: Row("2", "b", "p2") :: Nil)

var rows1 = spark.sql("SHOW CREATE TABLE hive_tbl").collect()
assert(rows1.apply(0).toString().contains("USING paimon"))

rows1 = spark.sql("SHOW CREATE TABLE hive_tbl1").collect()
assert(rows1.apply(0).toString().contains("USING paimon"))

}
}
})

Seq("parquet", "orc", "avro").foreach(
format => {
test(
s"Paimon migrate database procedure: migrate $format database with parallelism set") {
withTable("hive_tbl_01", "hive_tbl_02") {
// create hive tables
spark.sql(s"""
|CREATE TABLE hive_tbl_01 (id STRING, name STRING, pt STRING)
|USING $format
|""".stripMargin)

spark.sql(s"""
|CREATE TABLE hive_tbl_02 (id STRING, name STRING, pt STRING)
|USING $format
|""".stripMargin)

var rows0 = spark.sql("SHOW CREATE TABLE hive_tbl_01").collect()
assert(!rows0.apply(0).toString().contains("USING paimon"))

rows0 = spark.sql("SHOW CREATE TABLE hive_tbl_02").collect()
assert(!rows0.apply(0).toString().contains("USING paimon"))

spark.sql(s"INSERT INTO hive_tbl_01 VALUES ('1', 'a', 'p1'), ('2', 'b', 'p2')")

spark.sql(
s"CALL sys.migrate_database(source_type => 'hive', database => '$hiveDbName', options => 'file.format=$format', parallelism => 6)")

checkAnswer(
spark.sql(s"SELECT * FROM hive_tbl_01 ORDER BY id"),
Row("1", "a", "p1") :: Row("2", "b", "p2") :: Nil)

var rows1 = spark.sql("SHOW CREATE TABLE hive_tbl_01").collect()
assert(rows1.apply(0).toString().contains("USING paimon"))

rows1 = spark.sql("SHOW CREATE TABLE hive_tbl_02").collect()
assert(rows1.apply(0).toString().contains("USING paimon"))
}
}
})
}
