Skip to content

Commit 6fd1924

Browse files
authored
util/collate: implement utf8mb4_0900_ai_ci collation (pingcap#45650)
close pingcap#37566
1 parent d426bcd commit 6fd1924

36 files changed

+64000
-115
lines changed

.gitattributes

+3
Original file line number | Diff line number | Diff line change
@@ -3,3 +3,6 @@
33

44
# Declare files that will always have LF line endings on checkout.
55
*.y text eol=lf
6+
7+
util/collate/unicode_0*_ci.go linguist-generated=true
8+
util/collate/ucadata/unicode_*_data.go linguist-generated=true

LICENSES/Unicode-DFS-2016-LICENSE

+39
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,39 @@
1+
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
2+
3+
COPYRIGHT AND PERMISSION NOTICE
4+
5+
Copyright © 1991-2023 Unicode, Inc.
6+
7+
NOTICE TO USER: Carefully read the following legal agreement. BY
8+
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
9+
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
10+
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
11+
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
12+
13+
Permission is hereby granted, free of charge, to any person obtaining a
14+
copy of data files and any associated documentation (the "Data Files") or
15+
software and any associated documentation (the "Software") to deal in the
16+
Data Files or Software without restriction, including without limitation
17+
the rights to use, copy, modify, merge, publish, distribute, and/or sell
18+
copies of the Data Files or Software, and to permit persons to whom the
19+
Data Files or Software are furnished to do so, provided that either (a)
20+
this copyright and permission notice appear with all copies of the Data
21+
Files or Software, or (b) this copyright and permission notice appear in
22+
associated Documentation.
23+
24+
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
25+
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
27+
THIRD PARTY RIGHTS.
28+
29+
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
30+
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
31+
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
32+
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
33+
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
34+
FILES OR SOFTWARE.
35+
36+
Except as contained in this notice, the name of a copyright holder shall
37+
not be used in advertising or otherwise to promote the sale, use or other
38+
dealings in these Data Files or Software without prior written
39+
authorization of the copyright holder.

build/nogo_config.json

+5
Original file line number | Diff line number | Diff line change
@@ -1238,20 +1238,23 @@
12381238
"exclude_files": {
12391239
"parser/parser.go": "parser/parser.go code",
12401240
".*_test.go": "ignore test code",
1241+
".*_generated\\.go$": "ignore generated code",
12411242
"external/": "no need to vet third party code"
12421243
}
12431244
},
12441245
"deferrecover": {
12451246
"exclude_files": {
12461247
"parser/parser.go": "parser/parser.go code",
12471248
".*_test.go": "ignore test code",
1249+
".*_generated\\.go$": "ignore generated code",
12481250
"external/": "no need to vet third party code"
12491251
}
12501252
},
12511253
"QF1002": {
12521254
"exclude_files": {
12531255
"parser/parser.go": "parser/parser.go code",
12541256
".*_test.go": "ignore test code",
1257+
".*_generated\\.go$": "ignore generated code",
12551258
"external/": "no need to vet third party code"
12561259
}
12571260
},
@@ -1260,6 +1263,7 @@
12601263
"parser/parser.go": "parser/parser.go code",
12611264
".*_test.go": "ignore test code",
12621265
"external/": "no need to vet third party code",
1266+
".*_generated\\.go$": "ignore generated code",
12631267
"/cgo/": "no need to vet third party code for cgo"
12641268
}
12651269
},
@@ -1268,6 +1272,7 @@
12681272
"parser/parser.go": "parser/parser.go code",
12691273
".*_test.go": "ignore test code",
12701274
"external/": "no need to vet third party code",
1275+
".*_generated\\.go$": "ignore generated code",
12711276
"/cgo/": "no need to vet third party code for cgo"
12721277
}
12731278
}

cmd/explaintest/r/collation_misc_enabled.result

+2
Original file line number | Diff line number | Diff line change
@@ -104,6 +104,7 @@ latin1 47 1
104104
utf8 83 1
105105
utf8 33 1
106106
utf8 192 1
107+
utf8mb4 255 1
107108
utf8mb4 46 1
108109
utf8mb4 45 1
109110
utf8mb4 224 1
@@ -128,6 +129,7 @@ latin1_bin latin1 47 Yes Yes 1
128129
utf8_bin utf8 83 Yes Yes 1
129130
utf8_general_ci utf8 33 Yes 1
130131
utf8_unicode_ci utf8 192 Yes 1
132+
utf8mb4_0900_ai_ci utf8mb4 255 Yes 1
131133
utf8mb4_bin utf8mb4 46 Yes Yes 1
132134
utf8mb4_general_ci utf8mb4 45 Yes 1
133135
utf8mb4_unicode_ci utf8mb4 224 Yes 1

ddl/table_modify_test.go

+2
Original file line number | Diff line number | Diff line change
@@ -84,6 +84,7 @@ func TestCreateTable(t *testing.T) {
8484
tk.MustGetErrCode("create table t_enum (a enum('abc','Abc')) charset=utf8 collate=utf8_general_ci;", errno.ErrDuplicatedValueInType)
8585
tk.MustGetErrCode("create table t_enum (a enum('e','E')) charset=utf8 collate=utf8_unicode_ci;", errno.ErrDuplicatedValueInType)
8686
tk.MustGetErrCode("create table t_enum (a enum('ss','ß')) charset=utf8 collate=utf8_unicode_ci;", errno.ErrDuplicatedValueInType)
87+
tk.MustGetErrCode("create table t_enum (a enum('æ','ae')) charset=utf8mb4 collate=utf8mb4_0900_ai_ci;", errno.ErrDuplicatedValueInType)
8788
// test for set column
8889
tk.MustGetErrCode("create table t_enum (a set('e','e'));", errno.ErrDuplicatedValueInType)
8990
tk.MustGetErrCode("create table t_enum (a set('e','E')) charset=utf8 collate=utf8_general_ci;", errno.ErrDuplicatedValueInType)
@@ -92,6 +93,7 @@ func TestCreateTable(t *testing.T) {
9293
tk.MustGetErrCode("create table t_enum (a set('e','E')) charset=utf8 collate=utf8_unicode_ci;", errno.ErrDuplicatedValueInType)
9394
tk.MustGetErrCode("create table t_enum (a set('ss','ß')) charset=utf8 collate=utf8_unicode_ci;", errno.ErrDuplicatedValueInType)
9495
tk.MustGetErrMsg("create table t_enum (a enum('ss','ß')) charset=utf8 collate=utf8_unicode_ci;", "[types:1291]Column 'a' has duplicated value 'ß' in ENUM")
96+
tk.MustGetErrCode("create table t_enum (a set('æ','ae')) charset=utf8mb4 collate=utf8mb4_0900_ai_ci;", errno.ErrDuplicatedValueInType)
9597

9698
// test for table option "union" not supported
9799
tk.MustExec("use test")

executor/test/seqtest/seq_executor_test.go

+1
Original file line number | Diff line number | Diff line change
@@ -1202,6 +1202,7 @@ func TestShowForNewCollations(t *testing.T) {
12021202
"utf8_bin utf8 83 Yes Yes 1",
12031203
"utf8_general_ci utf8 33 Yes 1",
12041204
"utf8_unicode_ci utf8 192 Yes 1",
1205+
"utf8mb4_0900_ai_ci utf8mb4 255 Yes 1",
12051206
"utf8mb4_bin utf8mb4 46 Yes Yes 1",
12061207
"utf8mb4_general_ci utf8mb4 45 Yes 1",
12071208
"utf8mb4_unicode_ci utf8mb4 224 Yes 1",

expression/builtin_like_test.go

+42-27
Original file line number | Diff line number | Diff line change
@@ -100,37 +100,40 @@ func TestRegexp(t *testing.T) {
100100
func TestCILike(t *testing.T) {
101101
ctx := createContext(t)
102102
tests := []struct {
103-
input string
104-
pattern string
105-
generalMatch int
106-
unicodeMatch int
103+
input string
104+
pattern string
105+
generalMatch int
106+
unicodeMatch int
107+
unicode0900Match int
107108
}{
108-
{"a", "", 0, 0},
109-
{"a", "a", 1, 1},
110-
{"a", "á", 1, 1},
111-
{"a", "b", 0, 0},
112-
{"aA", "Aa", 1, 1},
113-
{"áAb", `Aa%`, 1, 1},
114-
{"áAb", `%ab%`, 1, 1},
115-
{"áAb", `%ab`, 1, 1},
116-
{"ÀAb", "aA_", 1, 1},
117-
{"áééá", "a_%a", 1, 1},
118-
{"áééá", "a%_a", 1, 1},
119-
{"áéá", "a_%a", 1, 1},
120-
{"áéá", "a%_a", 1, 1},
121-
{"áá", "a_%a", 0, 0},
122-
{"áá", "a%_a", 0, 0},
123-
{"áééáííí", "a_%a%", 1, 1},
109+
{"a", "", 0, 0, 0},
110+
{"a", "a", 1, 1, 1},
111+
{"a", "á", 1, 1, 1},
112+
{"a", "b", 0, 0, 0},
113+
{"aA", "Aa", 1, 1, 1},
114+
{"áAb", `Aa%`, 1, 1, 1},
115+
{"áAb", `%ab%`, 1, 1, 1},
116+
{"áAb", `%ab`, 1, 1, 1},
117+
{"ÀAb", "aA_", 1, 1, 1},
118+
{"áééá", "a_%a", 1, 1, 1},
119+
{"áééá", "a%_a", 1, 1, 1},
120+
{"áéá", "a_%a", 1, 1, 1},
121+
{"áéá", "a%_a", 1, 1, 1},
122+
{"áá", "a_%a", 0, 0, 0},
123+
{"áá", "a%_a", 0, 0, 0},
124+
{"áééáííí", "a_%a%", 1, 1, 1},
125+
{"数汉据字库", "数%据_库", 1, 1, 1},
124126

125127
// performs matching on a per-character basis
126128
// https://dev.mysql.com/doc/refman/5.7/en/string-comparison-functions.html#operator_like
127-
{"ß", "s%", 1, 0},
128-
{"ß", "%s", 1, 0},
129-
{"ß", "ss", 0, 0},
130-
{"ß", "s", 1, 0},
131-
{"ss", "%ß%", 1, 0},
132-
{"ß", "_", 1, 1},
133-
{"ß", "__", 0, 0},
129+
{"ß", "s%", 1, 0, 0},
130+
{"ß", "%s", 1, 0, 0},
131+
{"ß", "ss", 0, 0, 0},
132+
{"ß", "s", 1, 0, 0},
133+
{"ss", "%ß%", 1, 0, 0},
134+
{"ß", "_", 1, 1, 1},
135+
{"ß", "__", 0, 0, 0},
136+
{"Ⱕ", "ⱕ", 0, 0, 1},
134137
}
135138
for _, tt := range tests {
136139
comment := fmt.Sprintf(`for input = "%s", pattern = "%s"`, tt.input, tt.pattern)
@@ -155,4 +158,16 @@ func TestCILike(t *testing.T) {
155158
require.NoError(t, err, comment)
156159
testutil.DatumEqual(t, types.NewDatum(tt.unicodeMatch), r, comment)
157160
}
161+
162+
for _, tt := range tests {
163+
comment := fmt.Sprintf(`for input = "%s", pattern = "%s"`, tt.input, tt.pattern)
164+
fc := funcs[ast.Like]
165+
inputs := datumsToConstants(types.MakeDatums(tt.input, tt.pattern, 0))
166+
f, err := fc.getFunction(ctx, inputs)
167+
require.NoError(t, err, comment)
168+
f.setCollator(collate.GetCollator("utf8mb4_0900_ai_ci"))
169+
r, err := evalBuiltinFunc(f, chunk.Row{})
170+
require.NoError(t, err, comment)
171+
testutil.DatumEqual(t, types.NewDatum(tt.unicode0900Match), r, comment)
172+
}
158173
}

expression/builtin_string_test.go

+19-1
Original file line number | Diff line number | Diff line change
@@ -2708,7 +2708,7 @@ func TestCIWeightString(t *testing.T) {
27082708
}
27092709
res, err := result.ToString()
27102710
require.NoError(t, err)
2711-
require.Equal(t, test.expect, res)
2711+
require.Equal(t, test.expect, res, "test case: '%s' '%s' %d", test.str, test.padding, test.length)
27122712
}
27132713
}
27142714

@@ -2746,6 +2746,24 @@ func TestCIWeightString(t *testing.T) {
27462746
{"中", "BINARY", 5, "中\x00\x00"},
27472747
}
27482748

2749+
unicode0900Tests := []weightStringTest{
2750+
{"aAÁàãăâ", "NONE", 0, "\x1cG\x1cG\x1cG\x1cG\x1cG\x1cG\x1cG"},
2751+
{"中", "NONE", 0, "\xfb\x40\xce\x2d"},
2752+
{"a", "CHAR", 5, "\x1c\x47\x02\x09\x02\x09\x02\x09\x02\x09"},
2753+
{"a ", "CHAR", 5, "\x1c\x47\x02\x09\x02\x09\x02\x09\x02\x09"},
2754+
{"中", "CHAR", 5, "\xfb\x40\xce\x2d\x02\x09\x02\x09\x02\x09\x02\x09"},
2755+
{"中 ", "CHAR", 5, "\xfb\x40\xce\x2d\x02\x09\x02\x09\x02\x09\x02\x09"},
2756+
{"a", "BINARY", 1, "a"},
2757+
{"ab", "BINARY", 1, "a"},
2758+
{"a", "BINARY", 5, "a\x00\x00\x00\x00"},
2759+
{"a ", "BINARY", 5, "a \x00\x00\x00"},
2760+
{"中", "BINARY", 1, "\xe4"},
2761+
{"中", "BINARY", 2, "\xe4\xb8"},
2762+
{"中", "BINARY", 3, "中"},
2763+
{"中", "BINARY", 5, "中\x00\x00"},
2764+
}
2765+
27492766
checkResult("utf8mb4_general_ci", generalTests)
27502767
checkResult("utf8mb4_unicode_ci", unicodeTests)
2768+
checkResult("utf8mb4_0900_ai_ci", unicode0900Tests)
27512769
}

expression/collation_test.go

+9
Original file line number | Diff line number | Diff line change
@@ -723,6 +723,15 @@ func TestCompareString(t *testing.T) {
723723
require.NotEqual(t, 0, types.CompareString("ß", "s", "utf8_unicode_ci"))
724724
require.Equal(t, 0, types.CompareString("ß", "ss", "utf8_unicode_ci"))
725725

726+
require.Equal(t, 0, types.CompareString("a", "A", "utf8mb4_0900_ai_ci"))
727+
require.Equal(t, 0, types.CompareString("À", "A", "utf8mb4_0900_ai_ci"))
728+
require.NotEqual(t, 0, types.CompareString("😜", "😃", "utf8mb4_0900_ai_ci"))
729+
require.NotEqual(t, 0, types.CompareString("a ", "a  ", "utf8mb4_0900_ai_ci"))
730+
require.NotEqual(t, 0, types.CompareString("ß", "s", "utf8mb4_0900_ai_ci"))
731+
require.Equal(t, 0, types.CompareString("ß", "ss", "utf8mb4_0900_ai_ci"))
732+
require.NotEqual(t, 0, types.CompareString("\U000FFFFE", "\U000FFFFF", "utf8mb4_0900_ai_ci"))
733+
require.Equal(t, 0, types.CompareString("æ", "ae", "utf8mb4_0900_ai_ci"))
734+
726735
require.NotEqual(t, 0, types.CompareString("a", "A", "binary"))
727736
require.NotEqual(t, 0, types.CompareString("À", "A", "binary"))
728737
require.NotEqual(t, 0, types.CompareString("😜", "😃", "binary"))

expression/test/collation/BUILD.bazel

+9
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,9 @@
1+
load("@io_bazel_rules_go//go:def.bzl", "go_test")
2+
3+
go_test(
4+
name = "collation_test",
5+
timeout = "short",
6+
srcs = ["uca_test.go"],
7+
flaky = True,
8+
deps = ["//testkit"],
9+
)

expression/test/collation/uca_test.go

+47
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,47 @@
1+
// Copyright 2023 PingCAP, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package collation
16+
17+
import (
18+
"testing"
19+
20+
"github.com/pingcap/tidb/testkit"
21+
)
22+
23+
func TestUTF8MB40900AICIOrder(t *testing.T) {
24+
store := testkit.CreateMockStore(t)
25+
26+
tk := testkit.NewTestKit(t, store)
27+
tk.MustExec("USE test;")
28+
tk.MustExec("create table t (id int primary key auto_increment, str VARCHAR(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci)")
29+
tk.MustExec("insert into t(str) values ('カ'), ('カ'), ('abc'), ('abuFFFEc'), ('abⓒ'), ('𝒶bc'), ('𝕒bc'), ('ガ'), ('が'), ('abç'), ('äbc'), ('ヵ'), ('か'), ('Abc'), ('abC'), ('File-3'), ('file-12'), ('filé-110'), ('🍣'), ('🍺')")
30+
tk.MustQuery("select min(id) from t group by str order by str").Check(testkit.Rows(
31+
"19", "20", "3", "4", "18", "17", "16", "1"))
32+
}
33+
34+
func TestUTF8MB40900AICIStrFunc(t *testing.T) {
35+
store := testkit.CreateMockStore(t)
36+
37+
tk := testkit.NewTestKit(t, store)
38+
tk.MustExec("USE test;")
39+
// test locate
40+
tk.MustQuery("select LOCATE('bar' collate utf8mb4_0900_ai_ci, 'FOOBAR' collate utf8mb4_0900_ai_ci)").Check(
41+
testkit.Rows("4"),
42+
)
43+
// test regexp
44+
tk.MustQuery("select 'FOOBAR' collate utf8mb4_0900_ai_ci REGEXP 'foo.*' collate utf8mb4_0900_ai_ci").Check(
45+
testkit.Rows("1"),
46+
)
47+
}

server/tests/tidb_serial_test.go

+1-1
Original file line number | Diff line number | Diff line change
@@ -477,7 +477,7 @@ func TestDefaultCharacterAndCollation(t *testing.T) {
477477
variable string
478478
except string
479479
}{
480-
{"collation_connection", "utf8mb4_bin"},
480+
{"collation_connection", "utf8mb4_0900_ai_ci"},
481481
{"character_set_connection", "utf8mb4"},
482482
{"character_set_client", "utf8mb4"},
483483
}

util/collate/BUILD.bazel

+5-2
Original file line number | Diff line number | Diff line change
@@ -11,15 +11,18 @@ go_library(
1111
"gbk_chinese_ci_data.go",
1212
"general_ci.go",
1313
"pinyin_tidb_as_cs.go",
14-
"unicode_ci.go",
15-
"unicode_ci_data.go",
14+
"unicode_0400_ci_generated.go",
15+
"unicode_0400_ci_impl.go",
16+
"unicode_0900_ai_ci_generated.go",
17+
"unicode_0900_ai_ci_impl.go",
1618
],
1719
importpath = "github.com/pingcap/tidb/util/collate",
1820
visibility = ["//visibility:public"],
1921
deps = [
2022
"//parser/charset",
2123
"//parser/mysql",
2224
"//parser/terror",
25+
"//util/collate/ucadata",
2326
"//util/dbterror",
2427
"//util/hack",
2528
"//util/logutil",

util/collate/collate.go

+6-1
Original file line number | Diff line number | Diff line change
@@ -325,7 +325,8 @@ func runeLen(b byte) int {
325325
// IsCICollation returns if the collation is case-insensitive
326326
func IsCICollation(collate string) bool {
327327
return collate == "utf8_general_ci" || collate == "utf8mb4_general_ci" ||
328-
collate == "utf8_unicode_ci" || collate == "utf8mb4_unicode_ci" || collate == "gbk_chinese_ci"
328+
collate == "utf8_unicode_ci" || collate == "utf8mb4_unicode_ci" || collate == "gbk_chinese_ci" ||
329+
collate == "utf8mb4_0900_ai_ci"
329330
}
330331

331332
// ConvertAndGetBinCollation converts collator to binary collator
@@ -339,6 +340,8 @@ func ConvertAndGetBinCollation(collate string) Collator {
339340
return GetCollator("utf8mb4_bin")
340341
case "utf8mb4_unicode_ci":
341342
return GetCollator("utf8mb4_bin")
343+
case "utf8mb4_0900_ai_ci":
344+
return GetCollator("utf8mb4_bin")
342345
case "gbk_chinese_ci":
343346
return GetCollator("gbk_bin")
344347
}
@@ -407,6 +410,8 @@ func init() {
407410
newCollatorIDMap[CollationName2ID("utf8_general_ci")] = &generalCICollator{}
408411
newCollatorMap["utf8mb4_unicode_ci"] = &unicodeCICollator{}
409412
newCollatorIDMap[CollationName2ID("utf8mb4_unicode_ci")] = &unicodeCICollator{}
413+
newCollatorMap["utf8mb4_0900_ai_ci"] = &unicode0900AICICollator{}
414+
newCollatorIDMap[CollationName2ID("utf8mb4_0900_ai_ci")] = &unicode0900AICICollator{}
410415
newCollatorMap["utf8_unicode_ci"] = &unicodeCICollator{}
411416
newCollatorIDMap[CollationName2ID("utf8_unicode_ci")] = &unicodeCICollator{}
412417
newCollatorMap["utf8mb4_zh_pinyin_tidb_as_cs"] = &zhPinyinTiDBASCSCollator{}

0 commit comments

Comments
 (0)