Support writing with functions in distribute/partition expressions #253

Open · wants to merge 20 commits into base: main

Changes from all commits · 20 commits
fcdf0f0
Spark 3.4: Support distribute by any predefined transform
Yxang May 17, 2023
e52b714
Spark 3.4: add udf: years, days, hours, murmurHash2 and murmurHash3. …
Yxang May 18, 2023
ff243b5
Spark 3.4: Fixup sharding key needs to be mod by cluster weight on lo…
Yxang May 19, 2023
a1d4dce
Scala 2.13: Fix Spark 3.4 compile issue
Yxang May 19, 2023
5ddb98f
Spark 3.4: Optimize sharding key handling when shuffle and sort
Yxang May 22, 2023
000638e
Spark 3.4: Optimize sharding key handling when shuffle and sort, appr…
Yxang May 22, 2023
59f3bed
Spark 3.4: Support variable length arguments for murmurHash (up to 5 …
Yxang May 23, 2023
af14b3a
Spark 3.4: add CityHash64
Yxang May 24, 2023
22f191a
Spark 3.4: Optimize sharding key handling when shuffle and sort, appr…
Yxang May 26, 2023
ea5ed0e
Spark 3.4 UDF: Amend input type, Make clickhouse function nullable, b…
Yxang May 26, 2023
a8bdcbf
Spark 3.4: Optimize sharding key handling when shuffle and sort, amen…
Yxang May 30, 2023
3dcdd81
Spark 3.4: Change ExprUtils to implicit
Yxang Jun 2, 2023
386ddb0
Spark 3.4 UDF: clickhouse code reference using tag from commit hash
Yxang Jun 25, 2023
286c21f
Spark 3.4 UDF: support varargs for Hash UDFs
Yxang Jun 26, 2023
e5809f7
Spark 3.4: refactor implicit into normal arg in ExprUtils
Yxang Jun 27, 2023
5ae4f3d
Spark 3.4: Cast type when calling projection, support recursive resolve
Yxang Jun 27, 2023
088bf3d
Spark 3.4 UDF: change pmod to mod because positiveModulo does not exi…
Yxang Jul 14, 2023
85a025f
Docs: add comment for modulo UDF
Yxang Jul 14, 2023
4e201d6
Spark 3.4: Adapt to hash function under clickhouse-core
Yxang Jul 25, 2023
085b3ad
fix style
Yxang Jul 26, 2023
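Taken together, these commits let the connector honor a Distributed table's sharding expression (e.g. cityHash64(value)) when repartitioning and sorting before a write. A minimal usage sketch, modeled on the ClusterShardByTransformSuite further down in this diff; the cluster, database, and table names are illustrative, and runClickHouseSQL is the helper from that test suite:

// Sketch only: assumes a ClickHouse cluster named "single_replica" and a Spark session with the
// ClickHouse catalog configured; convertLocal makes Spark write directly to the local tables.
spark.conf.set("spark.clickhouse.write.distributed.convertLocal", "true")

spark.sql(
  """CREATE TABLE db.tbl_local (
    |  create_time TIMESTAMP NOT NULL,
    |  value       STRING    NOT NULL
    |) USING ClickHouse
    |TBLPROPERTIES (cluster = 'single_replica', engine = 'MergeTree()', order_by = 'create_time')
    |""".stripMargin)

// Distributed table sharded by a ClickHouse function, created directly in ClickHouse.
runClickHouseSQL(
  """CREATE TABLE db.tbl ON CLUSTER single_replica AS db.tbl_local
    |ENGINE = Distributed(single_replica, 'db', 'tbl_local', cityHash64(value))
    |""".stripMargin)

// The write path now repartitions/sorts by the Spark-side equivalent, clickhouse_cityHash64(value).
spark.sql("INSERT INTO db.tbl VALUES (timestamp'2021-01-01 10:10:10', '1')")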
@@ -97,4 +97,6 @@ case class ClusterSpec(
override def toString: String = s"cluster: $name, shards: [${shards.mkString(", ")}]"

@JsonIgnore @transient override lazy val nodes: Array[NodeSpec] = shards.sorted.flatMap(_.nodes)

def totalWeight: Int = shards.map(_.weight).sum
}
@@ -0,0 +1,109 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.clickhouse.cluster

import org.apache.spark.sql.clickhouse.TestUtils.om
import xenon.clickhouse.func.{
ClickHouseXxHash64Shard,
CompositeFunctionRegistry,
DynamicFunctionRegistry,
StaticFunctionRegistry
}

import java.lang.{Long => JLong}

class ClickHouseClusterHashUDFSuite extends SparkClickHouseClusterTest {
// used only to query function name mappings
val dummyRegistry: CompositeFunctionRegistry = {
val dynamicFunctionRegistry = new DynamicFunctionRegistry
val xxHash64ShardFunc = new ClickHouseXxHash64Shard(Seq.empty)
dynamicFunctionRegistry.register("ck_xx_hash64_shard", xxHash64ShardFunc) // for compatible
dynamicFunctionRegistry.register("clickhouse_shard_xxHash64", xxHash64ShardFunc)
new CompositeFunctionRegistry(Array(StaticFunctionRegistry, dynamicFunctionRegistry))
}

def runTest(funcSparkName: String, funcCkName: String, stringVal: String): Unit = {
val sparkResult = spark.sql(
s"""SELECT
| $funcSparkName($stringVal) AS hash_value
|""".stripMargin
).collect
assert(sparkResult.length == 1)
val sparkHashVal = sparkResult.head.getAs[Long]("hash_value")

val clickhouseResultJsonStr = runClickHouseSQL(
s"""SELECT
| $funcCkName($stringVal) AS hash_value
|""".stripMargin
).head.getString(0)
val clickhouseResultJson = om.readTree(clickhouseResultJsonStr)
val clickhouseHashVal = JLong.parseUnsignedLong(clickhouseResultJson.get("hash_value").asText)
assert(
sparkHashVal == clickhouseHashVal,
s"ck_function: $funcCkName, spark_function: $funcSparkName, args: ($stringVal)"
)
}

Seq(
"clickhouse_xxHash64",
"clickhouse_murmurHash3_64",
"clickhouse_murmurHash3_32",
"clickhouse_murmurHash2_64",
"clickhouse_murmurHash2_32",
"clickhouse_cityHash64"
).foreach { funcSparkName =>
val funcCkName = dummyRegistry.getFuncMappingBySpark(funcSparkName)
test(s"UDF $funcSparkName") {
Seq(
"spark-clickhouse-connector",
"Apache Spark",
"ClickHouse",
"Yandex",
"热爱",
"在传统的行式数据库系统中,数据按如下顺序存储:",
"🇨🇳"
).foreach { rawStringVal =>
val stringVal = s"\'$rawStringVal\'"
runTest(funcSparkName, funcCkName, stringVal)
}
}
}

Seq(
"clickhouse_murmurHash3_64",
"clickhouse_murmurHash3_32",
"clickhouse_murmurHash2_64",
"clickhouse_murmurHash2_32",
"clickhouse_cityHash64"
).foreach { funcSparkName =>
val funcCkName = dummyRegistry.getFuncMappingBySpark(funcSparkName)
test(s"UDF $funcSparkName multiple args") {
val strings = Seq(
"\'spark-clickhouse-connector\'",
"\'Apache Spark\'",
"\'ClickHouse\'",
"\'Yandex\'",
"\'热爱\'",
"\'在传统的行式数据库系统中,数据按如下顺序存储:\'",
"\'🇨🇳\'"
)
val test_5 = strings.combinations(5)
test_5.foreach { seq =>
val stringVal = seq.mkString(", ")
runTest(funcSparkName, funcCkName, stringVal)
}
}
}
}

This file was deleted.

@@ -0,0 +1,117 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.clickhouse.cluster

import org.apache.spark.SparkConf
import org.apache.spark.sql.Row

class ClusterShardByTransformSuite extends SparkClickHouseClusterTest {
override protected def sparkConf: SparkConf = {
val _conf = super.sparkConf
.set("spark.clickhouse.write.distributed.convertLocal", "true")
_conf
}

def runTest(func_name: String, func_args: Array[String]): Unit = {
val func_expr = s"$func_name(${func_args.mkString(",")})"
val cluster = "single_replica"
val db = s"db_${func_name}_shard_transform"
val tbl_dist = s"tbl_${func_name}_shard"
val tbl_local = s"${tbl_dist}_local"

try {
runClickHouseSQL(s"CREATE DATABASE IF NOT EXISTS $db ON CLUSTER $cluster")

spark.sql(
s"""CREATE TABLE $db.$tbl_local (
| create_time TIMESTAMP NOT NULL,
| create_date DATE NOT NULL,
| value STRING NOT NULL
|) USING ClickHouse
|TBLPROPERTIES (
| cluster = '$cluster',
| engine = 'MergeTree()',
| order_by = 'create_time'
|)
|""".stripMargin
)

runClickHouseSQL(
s"""CREATE TABLE $db.$tbl_dist ON CLUSTER $cluster
|AS $db.$tbl_local
|ENGINE = Distributed($cluster, '$db', '$tbl_local', $func_expr)
|""".stripMargin
)
spark.sql(
s"""INSERT INTO `$db`.`$tbl_dist`
|VALUES
| (timestamp'2021-01-01 10:10:10', date'2021-01-01', '1'),
| (timestamp'2022-02-02 11:10:10', date'2022-02-02', '2'),
| (timestamp'2023-03-03 12:10:10', date'2023-03-03', '3'),
| (timestamp'2024-04-04 13:10:10', date'2024-04-04', '4')
| AS tab(create_time, create_date, value)
|""".stripMargin
)
// check that data is indeed written
checkAnswer(
spark.table(s"$db.$tbl_dist").select("value").orderBy("create_time"),
Seq(Row("1"), Row("2"), Row("3"), Row("4"))
)

// check that rows written via Spark land on the same server as ClickHouse's native sharding would place them
runClickHouseSQL(
s"""INSERT INTO `$db`.`$tbl_dist`
|VALUES
| (timestamp'2021-01-01 10:10:10', date'2021-01-01', '1'),
| (timestamp'2022-02-02 11:10:10', date'2022-02-02', '2'),
| (timestamp'2023-03-03 12:10:10', date'2023-03-03', '3'),
| (timestamp'2024-04-04 13:10:10', date'2024-04-04', '4')
|""".stripMargin
)
checkAnswer(
spark.table(s"$db.$tbl_local")
.groupBy("value").count().filter("count != 2"),
Seq.empty
)

} finally {
runClickHouseSQL(s"DROP TABLE IF EXISTS $db.$tbl_dist ON CLUSTER $cluster")
runClickHouseSQL(s"DROP TABLE IF EXISTS $db.$tbl_local ON CLUSTER $cluster")
runClickHouseSQL(s"DROP DATABASE IF EXISTS $db ON CLUSTER $cluster")
}
}

Seq(
// wait for SPARK-44180 to be fixed, then add implicit cast test cases
("toYear", Array("create_date")),
// ("toYear", Array("create_time")),
("toYYYYMM", Array("create_date")),
// ("toYYYYMM", Array("create_time")),
("toYYYYMMDD", Array("create_date")),
// ("toYYYYMMDD", Array("create_time")),
("toHour", Array("create_time")),
("xxHash64", Array("value")),
("murmurHash2_64", Array("value")),
("murmurHash2_32", Array("value")),
("murmurHash3_64", Array("value")),
("murmurHash3_32", Array("value")),
("cityHash64", Array("value")),
("modulo", Array("toYYYYMM(create_date)", "10"))
).foreach {
case (func_name: String, func_args: Array[String]) =>
test(s"shard by $func_name(${func_args.mkString(",")})")(runTest(func_name, func_args))
}

}
@@ -78,12 +78,8 @@ class WriteDistributionAndOrderingSuite extends SparkClickHouseSingleTest {
WRITE_REPARTITION_BY_PARTITION.key -> repartitionByPartition.toString,
WRITE_LOCAL_SORT_BY_KEY.key -> localSortByKey.toString
) {
if (!ignoreUnsupportedTransform && repartitionByPartition) {
intercept[AnalysisException](write())
} else {
write()
check()
}
write()
check()
}

Seq(true, false).foreach { ignoreUnsupportedTransform =>
@@ -15,106 +15,176 @@
package org.apache.spark.sql.clickhouse

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.SQLConfHelper
import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression}
import org.apache.spark.sql.catalyst.analysis.NoSuchFunctionException
import org.apache.spark.sql.catalyst.{expressions, SQLConfHelper}
import org.apache.spark.sql.catalyst.expressions.{
BoundReference,
Cast,
Expression,
TransformExpression,
V2ExpressionUtils
}
import org.apache.spark.sql.clickhouse.ClickHouseSQLConf.IGNORE_UNSUPPORTED_TRANSFORM
import org.apache.spark.sql.connector.catalog.Identifier
import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction}
import org.apache.spark.sql.connector.expressions.Expressions._
import org.apache.spark.sql.connector.expressions.{Expression => V2Expression, _}
import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType}
import org.apache.spark.sql.types.{StructField, StructType}
import xenon.clickhouse.exception.CHClientException
import xenon.clickhouse.expr._
import xenon.clickhouse.func.FunctionRegistry
import xenon.clickhouse.spec.ClusterSpec

import scala.annotation.tailrec
import scala.util.{Failure, Success, Try}

object ExprUtils extends SQLConfHelper {
object ExprUtils extends SQLConfHelper with Serializable {

def toSparkPartitions(partitionKey: Option[List[Expr]]): Array[Transform] =
partitionKey.seq.flatten.flatten(toSparkTransformOpt).toArray
def toSparkPartitions(
partitionKey: Option[List[Expr]],
functionRegistry: FunctionRegistry
): Array[Transform] =
partitionKey.seq.flatten.flatten(toSparkTransformOpt(_, functionRegistry)).toArray

def toSparkSplits(shardingKey: Option[Expr], partitionKey: Option[List[Expr]]): Array[Transform] =
(shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt).toArray
def toSparkSplits(
shardingKey: Option[Expr],
partitionKey: Option[List[Expr]],
functionRegistry: FunctionRegistry
): Array[Transform] =
(shardingKey.seq ++ partitionKey.seq.flatten).flatten(toSparkTransformOpt(_, functionRegistry)).toArray

def toSparkSortOrders(
shardingKeyIgnoreRand: Option[Expr],
partitionKey: Option[List[Expr]],
sortingKey: Option[List[OrderExpr]]
sortingKey: Option[List[OrderExpr]],
cluster: Option[ClusterSpec],
functionRegistry: FunctionRegistry
): Array[SortOrder] =
toSparkSplits(shardingKeyIgnoreRand, partitionKey).map(Expressions.sort(_, SortDirection.ASCENDING)) ++:
toSparkSplits(
shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight)),
partitionKey,
functionRegistry
).map(Expressions.sort(_, SortDirection.ASCENDING)) ++:
sortingKey.seq.flatten.flatten { case OrderExpr(expr, asc, nullFirst) =>
val direction = if (asc) SortDirection.ASCENDING else SortDirection.DESCENDING
val nullOrder = if (nullFirst) NullOrdering.NULLS_FIRST else NullOrdering.NULLS_LAST
toSparkTransformOpt(expr).map(trans => Expressions.sort(trans, direction, nullOrder))
toSparkTransformOpt(expr, functionRegistry).map(trans =>
Expressions.sort(trans, direction, nullOrder)
)
}.toArray

@tailrec
def toCatalyst(v2Expr: V2Expression, fields: Array[StructField]): Expression =
private def loadV2FunctionOpt(
name: String,
args: Seq[Expression],
functionRegistry: FunctionRegistry
): Option[BoundFunction] = {
def loadFunction(ident: Identifier): UnboundFunction =
functionRegistry.load(ident.name).getOrElse(throw new NoSuchFunctionException(ident))
val inputType = StructType(args.zipWithIndex.map {
case (exp, pos) => StructField(s"_$pos", exp.dataType, exp.nullable)
})
try {
val unbound = loadFunction(Identifier.of(Array.empty, name))
Some(unbound.bind(inputType))
} catch {
case e: NoSuchFunctionException =>
throw e
case _: UnsupportedOperationException if conf.getConf(IGNORE_UNSUPPORTED_TRANSFORM) =>
None
case e: UnsupportedOperationException =>
throw new AnalysisException(e.getMessage, cause = Some(e))
}
}

def resolveTransformCatalyst(
catalystExpr: Expression,
timeZoneId: Option[String] = None
): Expression = catalystExpr match {
case TransformExpression(function: ScalarFunction[_], args, _) =>
val resolvedArgs: Seq[Expression] = args.map(resolveTransformCatalyst(_, timeZoneId))
val castedArgs: Seq[Expression] = resolvedArgs.zip(function.inputTypes()).map {
case (arg, expectedType) if !arg.dataType.sameType(expectedType) => Cast(arg, expectedType, timeZoneId)
case (arg, _) => arg
}
V2ExpressionUtils.resolveScalarFunction(function, castedArgs)
case other => other
}

def toCatalyst(
v2Expr: V2Expression,
fields: Array[StructField],
functionRegistry: FunctionRegistry
): Expression =
v2Expr match {
case IdentityTransform(ref) => toCatalyst(ref, fields)
case IdentityTransform(ref) => toCatalyst(ref, fields, functionRegistry)
case ref: NamedReference if ref.fieldNames.length == 1 =>
val (field, ordinal) = fields
.zipWithIndex
.find { case (field, _) => field.name == ref.fieldNames.head }
.getOrElse(throw CHClientException(s"Invalid field reference: $ref"))
BoundReference(ordinal, field.dataType, field.nullable)
case t: Transform =>
val catalystArgs = t.arguments().map(toCatalyst(_, fields, functionRegistry))
loadV2FunctionOpt(t.name(), catalystArgs, functionRegistry)
.map(bound => TransformExpression(bound, catalystArgs)).getOrElse {
throw CHClientException(s"Unsupported expression: $v2Expr")
}
case literal: LiteralValue[Any] => expressions.Literal(literal.value)
case _ => throw CHClientException(
s"Unsupported V2 expression: $v2Expr, SPARK-33779: Spark 3.3 only support IdentityTransform"
s"Unsupported expression: $v2Expr"
)
}

def toSparkTransformOpt(expr: Expr): Option[Transform] = Try(toSparkTransform(expr)) match {
case Success(t) => Some(t)
case Failure(_) if conf.getConf(IGNORE_UNSUPPORTED_TRANSFORM) => None
case Failure(rethrow) => throw new AnalysisException(rethrow.getMessage, cause = Some(rethrow))
}

// Some functions of ClickHouse which match Spark pre-defined Transforms
//
// toYear, YEAR - Converts a date or date with time to a UInt16 (AD)
// toYYYYMM - Converts a date or date with time to a UInt32 (YYYY*100 + MM)
// toYYYYMMDD - Converts a date or date with time to a UInt32 (YYYY*10000 + MM*100 + DD)
// toHour, HOUR - Converts a date with time to a UInt8 (0-23)
def toSparkTransformOpt(expr: Expr, functionRegistry: FunctionRegistry): Option[Transform] =
Try(toSparkExpression(expr, functionRegistry)) match {
// needed because Spark `Table`'s `partitioning` field must be `Transform`s
case Success(t: Transform) => Some(t)
case Success(_) => None
case Failure(_) if conf.getConf(IGNORE_UNSUPPORTED_TRANSFORM) => None
case Failure(rethrow) => throw new AnalysisException(rethrow.getMessage, cause = Some(rethrow))
}

def toSparkTransform(expr: Expr): Transform = expr match {
case FieldRef(col) => identity(col)
case FuncExpr("toYear", List(FieldRef(col))) => years(col)
case FuncExpr("YEAR", List(FieldRef(col))) => years(col)
case FuncExpr("toYYYYMM", List(FieldRef(col))) => months(col)
case FuncExpr("toYYYYMMDD", List(FieldRef(col))) => days(col)
case FuncExpr("toHour", List(FieldRef(col))) => hours(col)
case FuncExpr("HOUR", List(FieldRef(col))) => hours(col)
// TODO support arbitrary functions
// case FuncExpr("xxHash64", List(FieldRef(col))) => apply("ck_xx_hash64", column(col))
case FuncExpr("rand", Nil) => apply("rand")
case FuncExpr("toYYYYMMDD", List(FuncExpr("toDate", List(FieldRef(col))))) => identity(col)
case unsupported => throw CHClientException(s"Unsupported ClickHouse expression: $unsupported")
}
def toSparkExpression(expr: Expr, functionRegistry: FunctionRegistry): V2Expression =
expr match {
case FieldRef(col) => identity(col)
case StringLiteral(value) => literal(value)
case FuncExpr("rand", Nil) => apply("rand")
case FuncExpr("toYYYYMMDD", List(FuncExpr("toDate", List(FieldRef(col))))) => identity(col)
case FuncExpr(funName, args) if functionRegistry.getFuncMappingByCk.contains(funName) =>
apply(functionRegistry.getFuncMappingByCk(funName), args.map(toSparkExpression(_, functionRegistry)): _*)
case unsupported => throw CHClientException(s"Unsupported ClickHouse expression: $unsupported")
}

def toClickHouse(transform: Transform): Expr = transform match {
case YearsTransform(FieldReference(Seq(col))) => FuncExpr("toYear", List(FieldRef(col)))
case MonthsTransform(FieldReference(Seq(col))) => FuncExpr("toYYYYMM", List(FieldRef(col)))
case DaysTransform(FieldReference(Seq(col))) => FuncExpr("toYYYYMMDD", List(FieldRef(col)))
case HoursTransform(FieldReference(Seq(col))) => FuncExpr("toHour", List(FieldRef(col)))
def toClickHouse(
transform: Transform,
functionRegistry: FunctionRegistry
): Expr = transform match {
case IdentityTransform(fieldRefs) => FieldRef(fieldRefs.describe)
case ApplyTransform(name, args) => FuncExpr(name, args.map(arg => SQLExpr(arg.describe())).toList)
case ApplyTransform(name, args) if functionRegistry.getFuncMappingBySpark.contains(name) =>
FuncExpr(functionRegistry.getFuncMappingBySpark(name), args.map(arg => SQLExpr(arg.describe())).toList)
case bucket: BucketTransform => throw CHClientException(s"Bucket transform not support yet: $bucket")
case other: Transform => throw CHClientException(s"Unsupported transform: $other")
}

def inferTransformSchema(
primarySchema: StructType,
secondarySchema: StructType,
transform: Transform
transform: Transform,
functionRegistry: FunctionRegistry
): StructField = transform match {
case years: YearsTransform => StructField(years.toString, IntegerType)
case months: MonthsTransform => StructField(months.toString, IntegerType)
case days: DaysTransform => StructField(days.toString, IntegerType)
case hours: HoursTransform => StructField(hours.toString, IntegerType)
case IdentityTransform(FieldReference(Seq(col))) => primarySchema.find(_.name == col)
.orElse(secondarySchema.find(_.name == col))
.getOrElse(throw CHClientException(s"Invalid partition column: $col"))
case ckXxhHash64 @ ApplyTransform("ck_xx_hash64", _) => StructField(ckXxhHash64.toString, LongType)
case t @ ApplyTransform(transformName, _) if functionRegistry.load(transformName).isDefined =>
val resType =
functionRegistry.load(transformName).getOrElse(throw new NoSuchFunctionException(transformName)) match {
case f: ScalarFunction[_] => f.resultType()
case other => throw CHClientException(s"Unsupported function: $other")
}
StructField(t.toString, resType)
case bucket: BucketTransform => throw CHClientException(s"Bucket transform not support yet: $bucket")
case other: Transform => throw CHClientException(s"Unsupported transform: $other")
}

def toSplitWithModulo(shardingKey: Expr, weight: Int): FuncExpr =
FuncExpr("modulo", List(shardingKey, StringLiteral(weight.toString)))
}
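For orientation, a hedged sketch of how the new registry-aware conversion is expected to behave, assuming the StaticFunctionRegistry shown further down in this diff:

// Sketch (not part of the diff): mapping a ClickHouse sharding expression to a Spark V2 transform.
import org.apache.spark.sql.clickhouse.ExprUtils
import xenon.clickhouse.expr.{FieldRef, FuncExpr}
import xenon.clickhouse.func.StaticFunctionRegistry

val ckShardingKey = FuncExpr("cityHash64", List(FieldRef("value"))) // ClickHouse: cityHash64(value)
val transform = ExprUtils.toSparkTransformOpt(ckShardingKey, StaticFunctionRegistry)
// Expected: Some(ApplyTransform) for the registered "clickhouse_cityHash64" function, since
// StaticFunctionRegistry.getFuncMappingByCk maps "cityHash64" -> "clickhouse_cityHash64".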
@@ -26,7 +26,7 @@ import xenon.clickhouse.Constants._
import xenon.clickhouse.client.NodeClient
import xenon.clickhouse.exception.CHClientException
import xenon.clickhouse.exception.ClickHouseErrCode._
import xenon.clickhouse.func.{FunctionRegistry, _}
import xenon.clickhouse.func.{ClickHouseXxHash64Shard, FunctionRegistry, _}
import xenon.clickhouse.spec._

import java.time.ZoneId
@@ -91,6 +91,7 @@ class ClickHouseCatalog extends TableCatalog

log.info(s"Detect ${clusterSpecs.size} ClickHouse clusters: ${clusterSpecs.map(_.name).mkString(",")}")
log.info(s"ClickHouse clusters' detail: $clusterSpecs")
log.info(s"functionRegistry: ${this.functionRegistry.list.mkString(",")}")
}

override def name(): String = catalogName
@@ -141,7 +142,8 @@ class ClickHouseCatalog extends TableCatalog
tableClusterSpec,
_tz,
tableSpec,
tableEngineSpec
tableEngineSpec,
functionRegistry
)
}

@@ -206,7 +208,7 @@ class ClickHouseCatalog extends TableCatalog

val partitionsClause = partitions match {
case transforms if transforms.nonEmpty =>
transforms.map(ExprUtils.toClickHouse(_).sql).mkString("PARTITION BY (", ", ", ")")
transforms.map(ExprUtils.toClickHouse(_, functionRegistry).sql).mkString("PARTITION BY (", ", ", ")")
case _ => ""
}

@@ -297,7 +299,7 @@ class ClickHouseCatalog extends TableCatalog
}
tableOpt match {
case None => false
case Some(ClickHouseTable(_, cluster, _, tableSpec, _)) =>
case Some(ClickHouseTable(_, cluster, _, tableSpec, _, _)) =>
val (db, tbl) = (tableSpec.database, tableSpec.name)
val isAtomic = loadNamespaceMetadata(Array(db)).get("engine").equalsIgnoreCase("atomic")
val syncClause = if (isAtomic) "SYNC" else ""
@@ -14,16 +14,12 @@

package xenon.clickhouse

import java.lang.{Integer => JInt, Long => JLong}
import java.time.{LocalDate, ZoneId}
import java.util
import scala.collection.JavaConverters._
import org.apache.spark.sql.catalyst.{InternalRow, SQLConfHelper}
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.clickhouse.{ExprUtils, ReadOptions, WriteOptions}
import org.apache.spark.sql.catalyst.{InternalRow, SQLConfHelper}
import org.apache.spark.sql.clickhouse.ClickHouseSQLConf.{READ_DISTRIBUTED_CONVERT_LOCAL, USE_NULLABLE_QUERY_SCHEMA}
import org.apache.spark.sql.connector.catalog._
import org.apache.spark.sql.clickhouse.{ExprUtils, ReadOptions, WriteOptions}
import org.apache.spark.sql.connector.catalog.TableCapability._
import org.apache.spark.sql.connector.catalog._
import org.apache.spark.sql.connector.expressions.Transform
import org.apache.spark.sql.connector.read.ScanBuilder
import org.apache.spark.sql.connector.write.LogicalWriteInfo
@@ -34,16 +30,23 @@ import org.apache.spark.unsafe.types.UTF8String
import xenon.clickhouse.Utils._
import xenon.clickhouse.client.NodeClient
import xenon.clickhouse.expr.{Expr, OrderExpr}
import xenon.clickhouse.func.FunctionRegistry
import xenon.clickhouse.read.{ClickHouseMetadataColumn, ClickHouseScanBuilder, ScanJobDescription}
import xenon.clickhouse.spec._
import xenon.clickhouse.write.{ClickHouseWriteBuilder, WriteJobDescription}

import java.lang.{Integer => JInt, Long => JLong}
import java.time.{LocalDate, ZoneId}
import java.util
import scala.collection.JavaConverters._

case class ClickHouseTable(
node: NodeSpec,
cluster: Option[ClusterSpec],
implicit val tz: ZoneId,
spec: TableSpec,
engineSpec: TableEngineSpec
engineSpec: TableEngineSpec,
functionRegistry: FunctionRegistry
) extends Table
with SupportsRead
with SupportsWrite
@@ -130,10 +133,12 @@ case class ClickHouseTable(
private lazy val metadataSchema: StructType =
StructType(metadataColumns.map(_.asInstanceOf[ClickHouseMetadataColumn].toStructField))

override lazy val partitioning: Array[Transform] = ExprUtils.toSparkPartitions(partitionKey)
override lazy val partitioning: Array[Transform] = ExprUtils.toSparkPartitions(partitionKey, functionRegistry)

override lazy val partitionSchema: StructType = StructType(
partitioning.map(partTransform => ExprUtils.inferTransformSchema(schema, metadataSchema, partTransform))
partitioning.map(partTransform =>
ExprUtils.inferTransformSchema(schema, metadataSchema, partTransform, functionRegistry)
)
)

override lazy val properties: util.Map[String, String] = spec.toJavaMap
@@ -170,7 +175,8 @@ case class ClickHouseTable(
shardingKey = shardingKey,
partitionKey = partitionKey,
sortingKey = sortingKey,
writeOptions = new WriteOptions(info.options.asCaseSensitiveMap())
writeOptions = new WriteOptions(info.options.asCaseSensitiveMap()),
functionRegistry = functionRegistry
)

new ClickHouseWriteBuilder(writeJob)
@@ -0,0 +1,26 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package xenon.clickhouse.func

import xenon.clickhouse.hash

object CityHash64 extends MultiStringArgsHash {
// https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L694

override protected def funcName: String = "clickhouse_cityHash64"
override val ckFuncNames: Array[String] = Array("cityHash64")

override def applyHash(input: Array[Any]): Long = hash.CityHash64(input)
}
@@ -0,0 +1,52 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package xenon.clickhouse.func

import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction}
import org.apache.spark.sql.types._

import java.time.LocalDate
import java.time.format.DateTimeFormatter

object Days extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction {

override def name: String = "clickhouse_days"

override def canonicalName: String = s"clickhouse.$name"

override def toString: String = name

override val ckFuncNames: Array[String] = Array("toYYYYMMDD")

override def description: String = s"$name: (date: Date) => shard_num: int"

override def bind(inputType: StructType): BoundFunction = inputType.fields match {
case Array(StructField(_, DateType, _, _)) => this
// case Array(StructField(_, TimestampType, _, _)) | Array(StructField(_, TimestampNTZType, _, _)) => this
case _ => throw new UnsupportedOperationException(s"Expect 1 DATE argument. $description")
}

override def inputTypes: Array[DataType] = Array(DateType)

override def resultType: DataType = IntegerType

override def isResultNullable: Boolean = false

def invoke(days: Int): Int = {
val date = LocalDate.ofEpochDay(days)
val formatter = DateTimeFormatter.ofPattern("yyyyMMdd")
date.format(formatter).toInt
}
}
@@ -18,30 +18,62 @@ import org.apache.spark.sql.connector.catalog.functions.UnboundFunction

import scala.collection.mutable

trait FunctionRegistry {
trait FunctionRegistry extends Serializable {

def list: Array[String]

def load(name: String): Option[UnboundFunction]

def getFuncMappingBySpark: Map[String, String]

def getFuncMappingByCk: Map[String, String]
}

trait ClickhouseEquivFunction {
val ckFuncNames: Array[String]
}

class CompositeFunctionRegistry(registries: Array[FunctionRegistry]) extends FunctionRegistry {

override def list: Array[String] = registries.flatMap(_.list)

override def load(name: String): Option[UnboundFunction] = registries.flatMap(_.load(name)).headOption

override def getFuncMappingBySpark: Map[String, String] = registries.flatMap(_.getFuncMappingBySpark).toMap

override def getFuncMappingByCk: Map[String, String] = registries.flatMap(_.getFuncMappingByCk).toMap
}

object StaticFunctionRegistry extends FunctionRegistry {

private val functions = Map[String, UnboundFunction](
"ck_xx_hash64" -> ClickHouseXxHash64, // for compatible
"clickhouse_xxHash64" -> ClickHouseXxHash64
"clickhouse_xxHash64" -> ClickHouseXxHash64,
"clickhouse_murmurHash2_32" -> MurmurHash2_32,
"clickhouse_murmurHash2_64" -> MurmurHash2_64,
"clickhouse_murmurHash3_32" -> MurmurHash3_32,
"clickhouse_murmurHash3_64" -> MurmurHash3_64,
"clickhouse_cityHash64" -> CityHash64,
"clickhouse_years" -> Years,
"clickhouse_months" -> Months,
"clickhouse_days" -> Days,
"clickhouse_hours" -> Hours,
"sharding_mod" -> Mod
)

override def list: Array[String] = functions.keys.toArray

override def load(name: String): Option[UnboundFunction] = functions.get(name)

override val getFuncMappingBySpark: Map[String, String] =
functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) =>
v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((k, _))
}

override val getFuncMappingByCk: Map[String, String] =
functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).flatMap { case (k, v) =>
v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((_, k))
}
}

class DynamicFunctionRegistry extends FunctionRegistry {
@@ -56,4 +88,14 @@ class DynamicFunctionRegistry extends FunctionRegistry {
override def list: Array[String] = functions.keys.toArray

override def load(name: String): Option[UnboundFunction] = functions.get(name)

override def getFuncMappingBySpark: Map[String, String] =
functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).toMap.flatMap { case (k, v) =>
v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((k, _))
}

override def getFuncMappingByCk: Map[String, String] =
functions.filter(_._2.isInstanceOf[ClickhouseEquivFunction]).toMap.flatMap { case (k, v) =>
v.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.map((_, k))
}
}
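A quick sketch of the two lookup directions these mappings expose; the values follow directly from each function object's declared ckFuncNames:

// Sketch: Spark-name -> ClickHouse-name and ClickHouse-name -> Spark-name lookups.
import xenon.clickhouse.func.StaticFunctionRegistry

StaticFunctionRegistry.getFuncMappingBySpark("clickhouse_cityHash64") // "cityHash64"
StaticFunctionRegistry.getFuncMappingByCk("toYYYYMM")                 // "clickhouse_months"
StaticFunctionRegistry.getFuncMappingByCk("YEAR")                     // "clickhouse_years" (synonyms are mapped too)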
@@ -0,0 +1,51 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package xenon.clickhouse.func

import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction}
import org.apache.spark.sql.types._

import java.sql.Timestamp
import java.text.SimpleDateFormat

object Hours extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction {

override def name: String = "clickhouse_hours"

override def canonicalName: String = s"clickhouse.$name"

override def toString: String = name

override val ckFuncNames: Array[String] = Array("toHour", "HOUR")

override def description: String = s"$name: (time: timestamp) => shard_num: int"

override def bind(inputType: StructType): BoundFunction = inputType.fields match {
case Array(StructField(_, TimestampType, _, _)) | Array(StructField(_, TimestampNTZType, _, _)) => this
case _ => throw new UnsupportedOperationException(s"Expect 1 TIMESTAMP argument. $description")
}

override def inputTypes: Array[DataType] = Array(TimestampType)

override def resultType: DataType = IntegerType

override def isResultNullable: Boolean = false

def invoke(time: Long): Int = {
val ts = new Timestamp(time / 1000)
val formatter: SimpleDateFormat = new SimpleDateFormat("HH") // 24-hour clock (0-23), matching ClickHouse toHour
formatter.format(ts).toInt
}
}
@@ -0,0 +1,63 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package xenon.clickhouse.func

import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction}
import org.apache.spark.sql.types._

object Mod extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction {

override def name: String = "sharding_mod"

override def canonicalName: String = s"clickhouse.$name"

override def toString: String = name

// `remainder` is not a ClickHouse function, but `modulo` is parsed to `remainder` in the connector,
// so `remainder` is added as a synonym.
override val ckFuncNames: Array[String] = Array("modulo", "remainder")

override def description: String = s"$name: (a: long, b: long) => mod: long"

override def bind(inputType: StructType): BoundFunction = inputType.fields match {
case Array(a, b) if
(a match {
case StructField(_, LongType, _, _) => true
case StructField(_, IntegerType, _, _) => true
case StructField(_, ShortType, _, _) => true
case StructField(_, ByteType, _, _) => true
case StructField(_, StringType, _, _) => true
case _ => false
}) &&
(b match {
case StructField(_, LongType, _, _) => true
case StructField(_, IntegerType, _, _) => true
case StructField(_, ShortType, _, _) => true
case StructField(_, ByteType, _, _) => true
case StructField(_, StringType, _, _) => true
case _ => false
}) =>
this
case _ => throw new UnsupportedOperationException(s"Expect 2 integer arguments. $description")
}

override def inputTypes: Array[DataType] = Array(LongType, LongType)

override def resultType: DataType = LongType

override def isResultNullable: Boolean = false

def invoke(a: Long, b: Long): Long = a % b
}
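As the commit history notes, pmod was replaced with plain mod here; a quick sketch of the resulting sign semantics (assuming ClickHouse's modulo follows C-style truncation, i.e. the result takes the dividend's sign, as Scala's % does):

// Sketch: sharding_mod keeps the dividend's sign, unlike Spark-style pmod.
import xenon.clickhouse.func.Mod

Mod.invoke(7L, 5L)  // 2
Mod.invoke(-7L, 5L) // -2 (pmod would give 3)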
@@ -0,0 +1,52 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package xenon.clickhouse.func

import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction}
import org.apache.spark.sql.types._

import java.time.LocalDate
import java.time.format.DateTimeFormatter

object Months extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction {

override def name: String = "clickhouse_months"

override def canonicalName: String = s"clickhouse.$name"

override def toString: String = name

override val ckFuncNames: Array[String] = Array("toYYYYMM")

override def description: String = s"$name: (date: Date) => shard_num: int"

override def bind(inputType: StructType): BoundFunction = inputType.fields match {
case Array(StructField(_, DateType, _, _)) => this
// case Array(StructField(_, TimestampType, _, _)) | Array(StructField(_, TimestampNTZType, _, _)) => this
case _ => throw new UnsupportedOperationException(s"Expect 1 DATE argument. $description")
}

override def inputTypes: Array[DataType] = Array(DateType)

override def resultType: DataType = IntegerType

override def isResultNullable: Boolean = false

def invoke(days: Int): Int = {
val date = LocalDate.ofEpochDay(days)
val formatter = DateTimeFormatter.ofPattern("yyyyMM")
date.format(formatter).toInt
}
}
@@ -0,0 +1,59 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package xenon.clickhouse.func

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

abstract class MultiStringArgsHash extends UnboundFunction with ClickhouseEquivFunction {

def applyHash(input: Array[Any]): Long

protected def funcName: String

override val ckFuncNames: Array[String]

override def description: String = s"$name: (value: string, ...) => hash_value: long"

private def isExpectedType(dt: DataType): Boolean =
dt.isInstanceOf[StringType]

final override def name: String = funcName

final override def bind(inputType: StructType): BoundFunction = {
val inputDataTypes = inputType.fields.map(_.dataType)
if (inputDataTypes.forall(isExpectedType)) {
// a new ScalarFunction instance is needed for each bind,
// because the number of arguments is not known in advance
new ScalarFunction[Long] {
override def inputTypes(): Array[DataType] = inputDataTypes
override def name: String = funcName
override def canonicalName: String = s"clickhouse.$name"
override def resultType: DataType = LongType
override def toString: String = name
override def produceResult(input: InternalRow): Long = {
val inputStrings: Array[Any] =
input.toSeq(Seq.fill(input.numFields)(StringType)).asInstanceOf[Seq[UTF8String]].toArray
.map(_.getBytes)
applyHash(inputStrings)
}
}
} else throw new UnsupportedOperationException(s"Expect multiple STRING argument. $description")

}

}
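A sketch of how the per-bind ScalarFunction is expected to be used, assuming the CityHash64 object defined earlier in this diff; the argument strings are arbitrary, and the resulting value comes from the hash implementation in clickhouse-core:

// Sketch: binding one of the variadic hash UDFs to a two-string schema and hashing a row.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.connector.catalog.functions.ScalarFunction
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.unsafe.types.UTF8String
import xenon.clickhouse.func.CityHash64

val bound = CityHash64
  .bind(StructType(Seq(StructField("_0", StringType), StructField("_1", StringType))))
  .asInstanceOf[ScalarFunction[Long]]

val row = InternalRow(UTF8String.fromString("Apache Spark"), UTF8String.fromString("ClickHouse"))
val hashed: Long = bound.produceResult(row) // should equal ClickHouse cityHash64('Apache Spark', 'ClickHouse')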
@@ -0,0 +1,36 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package xenon.clickhouse.func

import xenon.clickhouse.hash
import xenon.clickhouse.hash.HashUtils

object MurmurHash2_64 extends MultiStringArgsHash {
// https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L460

override protected def funcName: String = "clickhouse_murmurHash2_64"
override val ckFuncNames: Array[String] = Array("murmurHash2_64")

override def applyHash(input: Array[Any]): Long = hash.Murmurhash2_64(input)
}

object MurmurHash2_32 extends MultiStringArgsHash {
// https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L519

override protected def funcName: String = "clickhouse_murmurHash2_32"
override val ckFuncNames: Array[String] = Array("murmurHash2_32")

override def applyHash(input: Array[Any]): Long = HashUtils.toUInt32(hash.Murmurhash2_32(input))
}
@@ -0,0 +1,36 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package xenon.clickhouse.func

import xenon.clickhouse.hash
import xenon.clickhouse.hash.HashUtils

object MurmurHash3_64 extends MultiStringArgsHash {
// https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L543

override protected def funcName: String = "clickhouse_murmurHash3_64"
override val ckFuncNames: Array[String] = Array("murmurHash3_64")

override def applyHash(input: Array[Any]): Long = hash.Murmurhash3_64(input)
}

object MurmurHash3_32 extends MultiStringArgsHash {
// https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L519

override protected def funcName: String = "clickhouse_murmurHash3_32"
override val ckFuncNames: Array[String] = Array("murmurHash3_32")

override def applyHash(input: Array[Any]): Long = HashUtils.toUInt32(hash.Murmurhash3_32(input))
}
@@ -0,0 +1,52 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package xenon.clickhouse.func

object Util {
def intHash64Impl(x: Long): Long =
// https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L140
intHash64(x ^ 0x4cf2d2baae6da887L)

def intHash64(l: Long): Long = {
// https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Common/HashTable/Hash.h#L26
var x = l
x ^= x >>> 33;
x *= 0xff51afd7ed558ccdL;
x ^= x >>> 33;
x *= 0xc4ceb9fe1a85ec53L;
x ^= x >>> 33;
x
}

def int32Impl(x: Long): Int =
// https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Functions/FunctionsHashing.h#L133
intHash32(x, 0x75d9543de018bf45L)

def intHash32(l: Long, salt: Long): Int = {
// https://github.com/ClickHouse/ClickHouse/blob/v23.5.3.24-stable/src/Common/HashTable/Hash.h#L502
var x = l

x ^= salt;
x = (~x) + (x << 18)
x = x ^ ((x >>> 31) | (x << 33))
x = x * 21
x = x ^ ((x >>> 11) | (x << 53))
x = x + (x << 6)
x = x ^ ((x >>> 22) | (x << 42))
x.toInt
}

def toUInt32Range(v: Long): Long = if (v < 0) v + (1L << 32) else v
}
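These helpers are ports of the ClickHouse hashing primitives linked in the comments; a tiny worked example of the unsigned-range conversion:

// Worked example of Util.toUInt32Range, which maps a negative signed Long back into UInt32 range.
import xenon.clickhouse.func.Util

Util.toUInt32Range(-1L)     // 4294967295L, i.e. (1L << 32) - 1
Util.toUInt32Range(123456L) // 123456L (non-negative values pass through unchanged)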
@@ -26,12 +26,16 @@ import xenon.clickhouse.spec.{ClusterSpec, ShardUtils}
* select xxHash64(concat(project_id, toString(seq)))
* }}}
*/
object ClickHouseXxHash64 extends UnboundFunction with ScalarFunction[Long] {
object ClickHouseXxHash64 extends UnboundFunction with ScalarFunction[Long] with ClickhouseEquivFunction {

override def name: String = "clickhouse_xxHash64"

override def canonicalName: String = s"clickhouse.$name"

override def toString: String = name

override val ckFuncNames: Array[String] = Array("xxHash64")

override def description: String = s"$name: (value: string) => hash_value: long"

override def bind(inputType: StructType): BoundFunction = inputType.fields match {
@@ -45,6 +49,7 @@ object ClickHouseXxHash64 extends UnboundFunction with ScalarFunction[Long] {

override def isResultNullable: Boolean = false

// ignore UInt64 vs Int64
def invoke(value: UTF8String): Long = XxHash64Function.hash(value, StringType, 0L)
}

@@ -0,0 +1,52 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package xenon.clickhouse.func

import org.apache.spark.sql.connector.catalog.functions.{BoundFunction, ScalarFunction, UnboundFunction}
import org.apache.spark.sql.types._

import java.time.LocalDate
import java.time.format.DateTimeFormatter

object Years extends UnboundFunction with ScalarFunction[Int] with ClickhouseEquivFunction {

override def name: String = "clickhouse_years"

override def canonicalName: String = s"clickhouse.$name"

override def toString: String = name

override val ckFuncNames: Array[String] = Array("toYear", "YEAR")

override def description: String = s"$name: (date: Date) => shard_num: int"

override def bind(inputType: StructType): BoundFunction = inputType.fields match {
case Array(StructField(_, DateType, _, _)) => this
// case Array(StructField(_, TimestampType, _, _)) | Array(StructField(_, TimestampNTZType, _, _)) => this
case _ => throw new UnsupportedOperationException(s"Expect 1 DATE argument. $description")
}

override def inputTypes: Array[DataType] = Array(DateType)

override def resultType: DataType = IntegerType

override def isResultNullable: Boolean = false

def invoke(days: Int): Int = {
val date = LocalDate.ofEpochDay(days)
val formatter = DateTimeFormatter.ofPattern("yyyy")
date.format(formatter).toInt
}
}
@@ -17,7 +17,7 @@ package xenon.clickhouse.write
import com.clickhouse.client.ClickHouseProtocol
import com.clickhouse.data.ClickHouseCompression
import org.apache.commons.io.IOUtils
import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, SafeProjection}
import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, SafeProjection, TransformExpression}
import org.apache.spark.sql.catalyst.{expressions, InternalRow}
import org.apache.spark.sql.clickhouse.ExprUtils
import org.apache.spark.sql.connector.metric.CustomTaskMetric
@@ -56,7 +56,7 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription)
protected lazy val shardExpr: Option[Expression] = writeJob.sparkShardExpr match {
case None => None
case Some(v2Expr) =>
val catalystExpr = ExprUtils.toCatalyst(v2Expr, writeJob.dataSetSchema.fields)
val catalystExpr = ExprUtils.toCatalyst(v2Expr, writeJob.dataSetSchema.fields, writeJob.functionRegistry)
catalystExpr match {
case BoundReference(_, dataType, _)
if dataType.isInstanceOf[ByteType] // list all integral types here because we can not access `IntegralType`
@@ -66,6 +66,11 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription)
Some(catalystExpr)
case BoundReference(_, dataType, _) =>
throw CHClientException(s"Invalid data type of sharding field: $dataType")
case TransformExpression(function, _, _) =>
function.resultType() match {
case ByteType | ShortType | IntegerType | LongType => Some(catalystExpr)
case _ => throw CHClientException(s"Invalid data type of sharding field: ${function.resultType()}")
}
case unsupported: Expression =>
log.warn(s"Unsupported expression of sharding field: $unsupported")
None
@@ -74,7 +79,21 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription)

protected lazy val shardProjection: Option[expressions.Projection] = shardExpr
.filter(_ => writeJob.writeOptions.convertDistributedToLocal)
.map(expr => SafeProjection.create(Seq(expr)))
.flatMap {
case expr: BoundReference =>
Some(SafeProjection.create(Seq(expr)))
case expr @ TransformExpression(function, _, _) =>
// result type must be integer class
function.resultType() match {
case ByteType => classOf[Byte]
case ShortType => classOf[Short]
case IntegerType => classOf[Int]
case LongType => classOf[Long]
case _ => throw CHClientException(s"Invalid return data type for function ${function.name()}," +
s"sharding field: ${function.resultType()}")
}
Some(SafeProjection.create(Seq(ExprUtils.resolveTransformCatalyst(expr, Some(writeJob.tz.getId)))))
}

// put the node select strategy in executor side because we need to calculate shard and don't know the records
// util DataWriter#write(InternalRow) invoked.
@@ -107,6 +126,15 @@ abstract class ClickHouseWriter(writeJob: WriteJobDescription)
case _ => None
}
shardValue.map(value => ShardUtils.calcShard(writeJob.cluster.get, value).num)
case (Some(TransformExpression(function, _, _)), Some(projection)) =>
val shardValue = function.resultType() match {
case ByteType => Some(projection(record).getByte(0).toLong)
case ShortType => Some(projection(record).getShort(0).toLong)
case IntegerType => Some(projection(record).getInt(0).toLong)
case LongType => Some(projection(record).getLong(0))
case _ => None
}
shardValue.map(value => ShardUtils.calcShard(writeJob.cluster.get, value).num)
case _ => None
}

@@ -15,11 +15,11 @@
package xenon.clickhouse.write

import java.time.ZoneId

import org.apache.spark.sql.clickhouse.{ExprUtils, WriteOptions}
import org.apache.spark.sql.connector.expressions.{Expression, SortOrder, Transform}
import org.apache.spark.sql.types.StructType
import xenon.clickhouse.expr.{Expr, FuncExpr, OrderExpr}
import xenon.clickhouse.func.FunctionRegistry
import xenon.clickhouse.spec._

case class WriteJobDescription(
@@ -37,7 +37,8 @@ case class WriteJobDescription(
shardingKey: Option[Expr],
partitionKey: Option[List[Expr]],
sortingKey: Option[List[OrderExpr]],
writeOptions: WriteOptions
writeOptions: WriteOptions,
functionRegistry: FunctionRegistry
) {

def targetDatabase(convert2Local: Boolean): String = tableEngineSpec match {
@@ -56,20 +57,34 @@ case class WriteJobDescription(
}

def sparkShardExpr: Option[Expression] = shardingKeyIgnoreRand match {
case Some(expr) => ExprUtils.toSparkTransformOpt(expr)
case Some(expr) => ExprUtils.toSparkTransformOpt(expr, functionRegistry)
case _ => None
}

def sparkSplits: Array[Transform] =
// Take the sharding key modulo (total weight * constant). Note that this key will be further hashed by Spark.
// Reasons for doing this:
// - Enlarging the modulo range avoids hash collisions when the number of shards is small, which mitigates the
// data skew those collisions would cause.
// - Data from one shard is still distributed to only a subset of executors. If we did not apply the modulo here
// (and instead applied it only during sorting in `toSparkSortOrders`), data belonging to shard 1 would be sorted
// to the front in every task, putting instant high pressure on shard 1 when the stage starts.
if (writeOptions.repartitionByPartition) {
ExprUtils.toSparkSplits(shardingKeyIgnoreRand, partitionKey)
ExprUtils.toSparkSplits(
shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 5)),
partitionKey,
functionRegistry
)
} else {
ExprUtils.toSparkSplits(shardingKeyIgnoreRand, None)
ExprUtils.toSparkSplits(
shardingKeyIgnoreRand.map(k => ExprUtils.toSplitWithModulo(k, cluster.get.totalWeight * 5)),
None,
functionRegistry
)
}

def sparkSortOrders: Array[SortOrder] = {
val _partitionKey = if (writeOptions.localSortByPartition) partitionKey else None
val _sortingKey = if (writeOptions.localSortByKey) sortingKey else None
ExprUtils.toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey)
ExprUtils.toSparkSortOrders(shardingKeyIgnoreRand, _partitionKey, _sortingKey, cluster, functionRegistry)
}
}
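A sketch of the split key described by the comment above, assuming a cluster whose shard weights sum to 3 (an illustrative value):

// Sketch only: the modulo split built for repartitioning, given total weight 3 and the constant 5.
import org.apache.spark.sql.clickhouse.ExprUtils
import xenon.clickhouse.expr.{FieldRef, FuncExpr, StringLiteral}

val shardingKey = FuncExpr("cityHash64", List(FieldRef("value")))
val split = ExprUtils.toSplitWithModulo(shardingKey, 3 * 5)
// split == FuncExpr("modulo", List(shardingKey, StringLiteral("15"))),
// so Spark repartitions by modulo(cityHash64(value), 15) and then hashes that key once more itself.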
@@ -0,0 +1,65 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.clickhouse

import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.scalatest.funsuite.AnyFunSuite
import xenon.clickhouse.ClickHouseHelper
import xenon.clickhouse.func.{
ClickHouseXxHash64,
ClickhouseEquivFunction,
CompositeFunctionRegistry,
DynamicFunctionRegistry,
StaticFunctionRegistry
}

import scala.collection.JavaConverters._

class FunctionRegistrySuite extends AnyFunSuite {

val staticFunctionRegistry: StaticFunctionRegistry.type = StaticFunctionRegistry
val dynamicFunctionRegistry = new DynamicFunctionRegistry
dynamicFunctionRegistry.register("ck_xx_hash64", ClickHouseXxHash64)
dynamicFunctionRegistry.register("clickhouse_xxHash64", ClickHouseXxHash64)

test("check StaticFunctionRegistry mappings") {
assert(staticFunctionRegistry.getFuncMappingBySpark.forall { case (k, v) =>
staticFunctionRegistry.load(k).get.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.contains(v)
})
assert(staticFunctionRegistry.getFuncMappingByCk.forall { case (k, v) =>
staticFunctionRegistry.load(v).get.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.contains(k)
})
}

test("check DynamicFunctionRegistry mappings") {
assert(dynamicFunctionRegistry.getFuncMappingBySpark.forall { case (k, v) =>
dynamicFunctionRegistry.load(k).get.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.contains(v)
})
assert(dynamicFunctionRegistry.getFuncMappingByCk.forall { case (k, v) =>
dynamicFunctionRegistry.load(v).get.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.contains(k)
})
}

test("check CompositeFunctionRegistry mappings") {
val compositeFunctionRegistry =
new CompositeFunctionRegistry(Array(staticFunctionRegistry, dynamicFunctionRegistry))
assert(compositeFunctionRegistry.getFuncMappingBySpark.forall { case (k, v) =>
compositeFunctionRegistry.load(k).get.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.contains(v)
})
assert(compositeFunctionRegistry.getFuncMappingByCk.forall { case (k, v) =>
compositeFunctionRegistry.load(v).get.asInstanceOf[ClickhouseEquivFunction].ckFuncNames.contains(k)
})
}
}