Skip to content

Commit

Permalink
[KERNEL] Extended StringType to have CollationIdentifier (#3627)
Browse files Browse the repository at this point in the history
## Description
Extended StringType to have attribute collationIdentifier.

## How was this patch tested?
Tests added to `CollationIdentifierSuite` and `StringTypeSuite`.

## Does this PR introduce _any_ user-facing changes?
Yes. Previously, the only way to use StringType was through the
StringType.STRING singleton; now users can also create StringType
instances with arbitrary CollationIdentifier values.
  • Loading branch information
ilicmarkodb committed Sep 24, 2024
1 parent 2514222 commit a8cc4b4
Show file tree
Hide file tree
Showing 4 changed files with 305 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
/*
* Copyright (2024) The Delta Lake Project Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.delta.kernel.types;

import static io.delta.kernel.internal.util.Preconditions.checkArgument;

import io.delta.kernel.annotation.Evolving;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;

/**
* Identifies collation for string type. <a
* href="https://github.com/delta-io/delta/blob/master/protocol_rfcs/collated-string-type.md#collation-identifiers">
* Collation identifiers</a>
*
* @since 3.3.0
*/
@Evolving
public class CollationIdentifier {

  /** Collation provider, stored upper-cased. */
  private final String provider;

  /** Collation name, stored upper-cased. */
  private final String name;

  /** Optional collation version, stored upper-cased when present. */
  private final Optional<String> version;

  /** Creates an identifier without a version. */
  private CollationIdentifier(String provider, String collationName) {
    this(provider, collationName, Optional.empty());
  }

  /**
   * Creates an identifier, normalizing every component to upper case so that comparisons are
   * case-insensitive with respect to the original input.
   */
  private CollationIdentifier(String provider, String collationName, Optional<String> version) {
    Objects.requireNonNull(provider, "Collation provider cannot be null.");
    Objects.requireNonNull(collationName, "Collation name cannot be null.");
    Objects.requireNonNull(version, "Collation version cannot be null.");

    // Locale.ROOT keeps the normalization independent of the JVM default locale; with a Turkish
    // default locale, for example, "icu" would otherwise be upper-cased to "İCU".
    this.provider = provider.toUpperCase(Locale.ROOT);
    this.name = collationName.toUpperCase(Locale.ROOT);
    this.version = version.map(v -> v.toUpperCase(Locale.ROOT));
  }

  /** @return collation provider. */
  public String getProvider() {
    return provider;
  }

  /** @return collation name. */
  public String getName() {
    return name;
  }

  /** @return collation version. */
  public Optional<String> getVersion() {
    return version;
  }

  /**
   * Parses a collation identifier. Everything after the second dot is treated as the version, so
   * versions may themselves contain dots (for example {@code ICU.sr_Cyrl_SRB.75.1}).
   *
   * @param identifier collation identifier in string form of <br>
   *     {@code PROVIDER.COLLATION_NAME[.COLLATION_VERSION]}.
   * @return appropriate collation identifier object
   * @throws NullPointerException if {@code identifier} is null
   * @throws IllegalArgumentException if {@code identifier} has no dot, or if its provider, name or
   *     version segment is empty
   */
  public static CollationIdentifier fromString(String identifier) {
    Objects.requireNonNull(identifier, "Collation identifier cannot be null.");

    // A positive limit keeps trailing empty segments, so inputs such as "SPARK." are seen as
    // {"SPARK", ""} and rejected below instead of failing with an index error.
    String[] parts = identifier.split("\\.", 3);
    boolean wellFormed = parts.length >= 2;
    for (String part : parts) {
      wellFormed = wellFormed && !part.isEmpty();
    }
    checkArgument(wellFormed, String.format("Invalid collation identifier: %s", identifier));

    if (parts.length == 2) {
      return new CollationIdentifier(parts[0], parts[1]);
    }
    return new CollationIdentifier(parts[0], parts[1], Optional.of(parts[2]));
  }

  /** Collation identifiers are identical when the provider, name, and version are the same. */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (!(o instanceof CollationIdentifier)) {
      return false;
    }

    CollationIdentifier other = (CollationIdentifier) o;
    return this.provider.equals(other.provider)
        && this.name.equals(other.name)
        && this.version.equals(other.version);
  }

  /** @return hash code computed from the same fields that {@link #equals(Object)} compares. */
  @Override
  public int hashCode() {
    return Objects.hash(provider, name, version);
  }

  /** @return collation identifier in form of {@code PROVIDER.COLLATION_NAME}. */
  public String toStringWithoutVersion() {
    return String.format("%s.%s", provider, name);
  }

  /** @return collation identifier in form of {@code PROVIDER.COLLATION_NAME[.COLLATION_VERSION]} */
  @Override
  public String toString() {
    if (version.isPresent()) {
      return String.format("%s.%s.%s", provider, name, version.get());
    }
    return String.format("%s.%s", provider, name);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,42 @@
*/
@Evolving
public class StringType extends BasePrimitiveType {
public static final StringType STRING = new StringType();
public static final StringType STRING =
new StringType(CollationIdentifier.fromString("SPARK.UTF8_BINARY"));

private StringType() {
private final CollationIdentifier collationIdentifier;

/**
 * Creates a string type that uses the given collation.
 *
 * @param collationIdentifier An identifier representing the collation to be used for string
 *     comparison and sorting. This determines how strings will be ordered and compared in query
 *     operations.
 * @throws NullPointerException if {@code collationIdentifier} is null
 */
public StringType(CollationIdentifier collationIdentifier) {
  super("string");
  // Fail at construction time: equals() dereferences this field unconditionally, so a null
  // identifier would otherwise only surface later as an unexplained NullPointerException.
  if (collationIdentifier == null) {
    throw new NullPointerException("Collation identifier cannot be null.");
  }
  this.collationIdentifier = collationIdentifier;
}

/**
 * Creates a string type from the textual form of a collation identifier.
 *
 * @param collationName collation identifier that this StringType will use, in form of {@code
 *     PROVIDER.COLLATION_NAME[.VERSION]}; it is parsed with {@link
 *     CollationIdentifier#fromString(String)}
 */
public StringType(String collationName) {
  this(CollationIdentifier.fromString(collationName));
}

/**
 * Returns the collation this string type was created with.
 *
 * @return StringType's collation identifier
 */
public CollationIdentifier getCollationIdentifier() {
  return this.collationIdentifier;
}

/** Two string types are equal when their collation identifiers are equal. */
@Override
public boolean equals(Object o) {
  if (o instanceof StringType) {
    StringType candidate = (StringType) o;
    return this.collationIdentifier.equals(candidate.collationIdentifier);
  }
  return false;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/*
* Copyright (2024) The Delta Lake Project Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.delta.kernel.types

import org.scalatest.funsuite.AnyFunSuite

import java.util.Optional

/** Tests parsing and string rendering of [[CollationIdentifier]]. */
class CollationIdentifierSuite extends AnyFunSuite {
  val PROVIDER_SPARK = "SPARK"
  val PROVIDER_ICU = "ICU"
  val DEFAULT_COLLATION_NAME = "UTF8_BINARY"
  val DEFAULT_COLLATION_IDENTIFIER = CollationIdentifier.fromString("SPARK.UTF8_BINARY")

  test("check fromString with valid string") {
    // Testcase: (input, expected provider, expected name, expected version).
    // Expectations are spelled out per component; comparing fromString(x) only with
    // fromString(x) would pass even if parsing were wrong.
    Seq(
      (
        s"$PROVIDER_SPARK.$DEFAULT_COLLATION_NAME",
        PROVIDER_SPARK,
        DEFAULT_COLLATION_NAME,
        Optional.empty[String]()
      ),
      (
        s"$PROVIDER_ICU.sr_Cyrl_SRB",
        PROVIDER_ICU,
        "SR_CYRL_SRB",
        Optional.empty[String]()
      ),
      (
        s"$PROVIDER_ICU.sr_Cyrl_SRB.75.1",
        PROVIDER_ICU,
        "SR_CYRL_SRB",
        Optional.of("75.1")
      )
    ).foreach {
      case (stringIdentifier, provider, name, version) =>
        val parsed = CollationIdentifier.fromString(stringIdentifier)
        assert(parsed.getProvider == provider)
        assert(parsed.getName == name)
        assert(parsed.getVersion == version)
        // Parsing the same input twice must yield equal identifiers.
        assert(parsed.equals(CollationIdentifier.fromString(stringIdentifier)))
    }
  }

  test("check fromString with invalid string") {
    // Neither input contains a dot, so neither can be split into provider and name.
    Seq(
      PROVIDER_SPARK,
      s"${PROVIDER_SPARK}_sr_Cyrl_SRB"
    ).foreach {
      stringIdentifier =>
        val e = intercept[IllegalArgumentException] {
          CollationIdentifier.fromString(stringIdentifier)
        }
        assert(e.getMessage == String.format("Invalid collation identifier: %s", stringIdentifier))
    }
  }

  test("check toStringWithoutVersion") {
    // Testcase: (identifier, expected rendering without the version part)
    Seq(
      (
        DEFAULT_COLLATION_IDENTIFIER,
        s"$PROVIDER_SPARK.$DEFAULT_COLLATION_NAME"
      ),
      (
        CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB"),
        s"$PROVIDER_ICU.SR_CYRL_SRB"
      ),
      (
        CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB.75.1"),
        s"$PROVIDER_ICU.SR_CYRL_SRB"
      )
    ).foreach {
      case (collationIdentifier, toStringWithoutVersion) =>
        assert(collationIdentifier.toStringWithoutVersion == toStringWithoutVersion)
    }
  }

  test("check toString") {
    // Testcase: (identifier, expected full rendering including the version when present)
    Seq(
      (
        DEFAULT_COLLATION_IDENTIFIER,
        s"$PROVIDER_SPARK.$DEFAULT_COLLATION_NAME"
      ),
      (
        CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB"),
        s"$PROVIDER_ICU.SR_CYRL_SRB"
      ),
      (
        CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB.75.1"),
        s"$PROVIDER_ICU.SR_CYRL_SRB.75.1"
      )
    ).foreach {
      case (collationIdentifier, toString) =>
        assert(collationIdentifier.toString == toString)
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright (2024) The Delta Lake Project Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.delta.kernel.types

import org.scalatest.funsuite.AnyFunSuite

/** Tests equality of [[StringType]] instances created with different collations. */
class StringTypeSuite extends AnyFunSuite {
  test("check equals") {
    // Each case is (left, right, whether left.equals(right) is expected to hold).
    val cases = Seq(
      (StringType.STRING, StringType.STRING, true),
      (StringType.STRING, new StringType("sPark.UTF8_bINary"), true),
      (StringType.STRING, new StringType("SPARK.UTF8_LCASE"), false),
      (new StringType("ICU.UNICODE"), new StringType("SPARK.UTF8_LCASE"), false),
      (new StringType("ICU.UNICODE"), new StringType("ICU.UNICODE_CI"), false),
      (new StringType("ICU.UNICODE_CI"), new StringType("icU.uniCODe_Ci"), true)
    )

    for ((left, right, expected) <- cases) {
      assert(left.equals(right) == expected)
    }
  }
}

0 comments on commit a8cc4b4

Please sign in to comment.