From 83ae537508c8948c3fda173e28a353b344cfb098 Mon Sep 17 00:00:00 2001
From: James Ring
Date: Tue, 9 Jun 2020 09:23:20 -0700
Subject: [PATCH] eliminate runtime binary search in simpleFold

This implements the approach taken by @mykeul in
https://github.com/google/re2j/pull/101. Instead of a dense array of
(codepoint, case-folded codepoint) mappings, CASE_ORBIT becomes a sparse
array whose indices represent the key in this map.

The previous pull request was written before UnicodeTablesGenerator
existed.

The direct table index is guarded with r >= 0 so that a negative
(invalid) rune falls through to the code below the table lookup, as it
did with the binary search, instead of throwing
ArrayIndexOutOfBoundsException.
---
 java/com/google/re2j/Unicode.java             |  14 +-
 java/com/google/re2j/UnicodeTables.java       | 135 +++++++++---------
 .../google/re2j/UnicodeTablesGenerator.java   |  20 ++-
 3 files changed, 84 insertions(+), 85 deletions(-)

diff --git a/java/com/google/re2j/Unicode.java b/java/com/google/re2j/Unicode.java
index e4a3a375..ca2f9144 100644
--- a/java/com/google/re2j/Unicode.java
+++ b/java/com/google/re2j/Unicode.java
@@ -106,18 +106,8 @@ static boolean isPrint(int r) {
   //
   static int simpleFold(int r) {
     // Consult caseOrbit table for special cases.
-    int lo = 0;
-    int hi = UnicodeTables.CASE_ORBIT.length;
-    while (lo < hi) {
-      int m = lo + (hi - lo) / 2;
-      if (UnicodeTables.CASE_ORBIT[m][0] < r) {
-        lo = m + 1;
-      } else {
-        hi = m;
-      }
-    }
-    if (lo < UnicodeTables.CASE_ORBIT.length && UnicodeTables.CASE_ORBIT[lo][0] == r) {
-      return UnicodeTables.CASE_ORBIT[lo][1];
+    if (r >= 0 && r < UnicodeTables.CASE_ORBIT.length && UnicodeTables.CASE_ORBIT[r] != 0) {
+      return UnicodeTables.CASE_ORBIT[r];
     }
 
     // No folding specified. This is a one- or two-element
diff --git a/java/com/google/re2j/UnicodeTables.java b/java/com/google/re2j/UnicodeTables.java
index e568e941..5ee46caf 100644
--- a/java/com/google/re2j/UnicodeTables.java
+++ b/java/com/google/re2j/UnicodeTables.java
@@ -1,4 +1,4 @@
-// Generated at 2020-06-02T04:01:21.503708Z by Java 11.0.7 using Unicode version 6.0.0.0.
+// Generated at 2020-06-09T16:20:06.352Z by Java 1.8.0_181 using Unicode version 6.0.0.0.
 // Do not change this file, your edits will be lost. Instead change UnicodeTablesGenerator.java.
 package com.google.re2j;
 
@@ -7,71 +7,7 @@
 import java.util.Map;
 
 final class UnicodeTables {
-  static final int[][] CASE_ORBIT = {
-  {0x004B, 0x006B},
-  {0x0053, 0x0073},
-  {0x006B, 0x212A},
-  {0x0073, 0x017F},
-  {0x00B5, 0x039C},
-  {0x00C5, 0x00E5},
-  {0x00DF, 0x1E9E},
-  {0x00E5, 0x212B},
-  {0x0130, 0x0130},
-  {0x0131, 0x0131},
-  {0x017F, 0x0053},
-  {0x01C4, 0x01C5},
-  {0x01C5, 0x01C6},
-  {0x01C6, 0x01C4},
-  {0x01C7, 0x01C8},
-  {0x01C8, 0x01C9},
-  {0x01C9, 0x01C7},
-  {0x01CA, 0x01CB},
-  {0x01CB, 0x01CC},
-  {0x01CC, 0x01CA},
-  {0x01F1, 0x01F2},
-  {0x01F2, 0x01F3},
-  {0x01F3, 0x01F1},
-  {0x0345, 0x0399},
-  {0x0392, 0x03B2},
-  {0x0395, 0x03B5},
-  {0x0398, 0x03B8},
-  {0x0399, 0x03B9},
-  {0x039A, 0x03BA},
-  {0x039C, 0x03BC},
-  {0x03A0, 0x03C0},
-  {0x03A1, 0x03C1},
-  {0x03A3, 0x03C2},
-  {0x03A6, 0x03C6},
-  {0x03A9, 0x03C9},
-  {0x03B2, 0x03D0},
-  {0x03B5, 0x03F5},
-  {0x03B8, 0x03D1},
-  {0x03B9, 0x1FBE},
-  {0x03BA, 0x03F0},
-  {0x03BC, 0x00B5},
-  {0x03C0, 0x03D6},
-  {0x03C1, 0x03F1},
-  {0x03C2, 0x03C3},
-  {0x03C3, 0x03A3},
-  {0x03C6, 0x03D5},
-  {0x03C9, 0x2126},
-  {0x03D0, 0x0392},
-  {0x03D1, 0x03F4},
-  {0x03D5, 0x03A6},
-  {0x03D6, 0x03A0},
-  {0x03F0, 0x039A},
-  {0x03F1, 0x03A1},
-  {0x03F4, 0x0398},
-  {0x03F5, 0x0395},
-  {0x1E60, 0x1E61},
-  {0x1E61, 0x1E9B},
-  {0x1E9B, 0x1E60},
-  {0x1E9E, 0x00DF},
-  {0x1FBE, 0x0345},
-  {0x2126, 0x03A9},
-  {0x212A, 0x004B},
-  {0x212B, 0x00C5},
-  };
+  static final char[] CASE_ORBIT;
 
   static final int[][] Lu = make_Lu();
 
@@ -361,6 +297,73 @@ final class UnicodeTables {
 
   static final Map<String, int[][]> FOLD_CATEGORIES = FoldCategory();
 
+  static {
+    CASE_ORBIT = new char[8492];
+    CASE_ORBIT[0x4b] = 0x6b;
+    CASE_ORBIT[0x53] = 0x73;
+    CASE_ORBIT[0x6b] = 0x212a;
+    CASE_ORBIT[0x73] = 0x17f;
+    CASE_ORBIT[0xb5] = 0x39c;
+    CASE_ORBIT[0xc5] = 0xe5;
+    CASE_ORBIT[0xdf] = 0x1e9e;
+    CASE_ORBIT[0xe5] = 0x212b;
+    CASE_ORBIT[0x130] = 0x130;
+    CASE_ORBIT[0x131] = 0x131;
+    CASE_ORBIT[0x17f] = 0x53;
+    CASE_ORBIT[0x1c4] = 0x1c5;
+    CASE_ORBIT[0x1c5] = 0x1c6;
+    CASE_ORBIT[0x1c6] = 0x1c4;
+    CASE_ORBIT[0x1c7] = 0x1c8;
+    CASE_ORBIT[0x1c8] = 0x1c9;
+    CASE_ORBIT[0x1c9] = 0x1c7;
+    CASE_ORBIT[0x1ca] = 0x1cb;
+    CASE_ORBIT[0x1cb] = 0x1cc;
+    CASE_ORBIT[0x1cc] = 0x1ca;
+    CASE_ORBIT[0x1f1] = 0x1f2;
+    CASE_ORBIT[0x1f2] = 0x1f3;
+    CASE_ORBIT[0x1f3] = 0x1f1;
+    CASE_ORBIT[0x345] = 0x399;
+    CASE_ORBIT[0x392] = 0x3b2;
+    CASE_ORBIT[0x395] = 0x3b5;
+    CASE_ORBIT[0x398] = 0x3b8;
+    CASE_ORBIT[0x399] = 0x3b9;
+    CASE_ORBIT[0x39a] = 0x3ba;
+    CASE_ORBIT[0x39c] = 0x3bc;
+    CASE_ORBIT[0x3a0] = 0x3c0;
+    CASE_ORBIT[0x3a1] = 0x3c1;
+    CASE_ORBIT[0x3a3] = 0x3c2;
+    CASE_ORBIT[0x3a6] = 0x3c6;
+    CASE_ORBIT[0x3a9] = 0x3c9;
+    CASE_ORBIT[0x3b2] = 0x3d0;
+    CASE_ORBIT[0x3b5] = 0x3f5;
+    CASE_ORBIT[0x3b8] = 0x3d1;
+    CASE_ORBIT[0x3b9] = 0x1fbe;
+    CASE_ORBIT[0x3ba] = 0x3f0;
+    CASE_ORBIT[0x3bc] = 0xb5;
+    CASE_ORBIT[0x3c0] = 0x3d6;
+    CASE_ORBIT[0x3c1] = 0x3f1;
+    CASE_ORBIT[0x3c2] = 0x3c3;
+    CASE_ORBIT[0x3c3] = 0x3a3;
+    CASE_ORBIT[0x3c6] = 0x3d5;
+    CASE_ORBIT[0x3c9] = 0x2126;
+    CASE_ORBIT[0x3d0] = 0x392;
+    CASE_ORBIT[0x3d1] = 0x3f4;
+    CASE_ORBIT[0x3d5] = 0x3a6;
+    CASE_ORBIT[0x3d6] = 0x3a0;
+    CASE_ORBIT[0x3f0] = 0x39a;
+    CASE_ORBIT[0x3f1] = 0x3a1;
+    CASE_ORBIT[0x3f4] = 0x398;
+    CASE_ORBIT[0x3f5] = 0x395;
+    CASE_ORBIT[0x1e60] = 0x1e61;
+    CASE_ORBIT[0x1e61] = 0x1e9b;
+    CASE_ORBIT[0x1e9b] = 0x1e60;
+    CASE_ORBIT[0x1e9e] = 0xdf;
+    CASE_ORBIT[0x1fbe] = 0x345;
+    CASE_ORBIT[0x2126] = 0x3a9;
+    CASE_ORBIT[0x212a] = 0x4b;
+    CASE_ORBIT[0x212b] = 0xc5;
+  }
+
   private UnicodeTables() {}
 
   private static int[][] make_Lu() {
diff --git a/unicode/src/main/java/com/google/re2j/UnicodeTablesGenerator.java b/unicode/src/main/java/com/google/re2j/UnicodeTablesGenerator.java
index 0efc8d55..e3be9506 100644
--- a/unicode/src/main/java/com/google/re2j/UnicodeTablesGenerator.java
+++ b/unicode/src/main/java/com/google/re2j/UnicodeTablesGenerator.java
@@ -11,6 +11,7 @@
 import com.ibm.icu.lang.UCharacterEnums;
 import com.ibm.icu.lang.UProperty;
 import com.ibm.icu.lang.UScript;
+import com.squareup.javapoet.CodeBlock;
 import com.squareup.javapoet.FieldSpec;
 import com.squareup.javapoet.JavaFile;
 import com.squareup.javapoet.MethodSpec;
@@ -44,7 +45,7 @@ public class UnicodeTablesGenerator {
   private final TypeSpec.Builder unicodeTables =
       TypeSpec.classBuilder("UnicodeTables").addModifiers(Modifier.FINAL);
 
-  private final Map<Integer, Integer> sortedOrbits = generateCaseFoldOrbits();
+  private final SortedMap<Integer, Integer> sortedOrbits = generateCaseFoldOrbits();
 
   public static void main(String[] args) {
     new UnicodeTablesGenerator();
@@ -99,17 +100,22 @@ public UnicodeTablesGenerator() {
       scriptRange.add(i);
     }
 
-    // Emit code fold orbits
+    // Emit code fold orbits. In order to avoid a binary search at runtime, this code emits a sparse
+    // array of codepoint to the next codepoint in a case folding orbit, e.g.
+    // k -> K -> K (Kelvin) -> k.
     {
       FieldSpec.Builder caseOrbitField =
-          FieldSpec.builder(int[][].class, "CASE_ORBIT", Modifier.STATIC, Modifier.FINAL);
-      StringBuilder initializer = new StringBuilder("{\n");
+          FieldSpec.builder(char[].class, "CASE_ORBIT", Modifier.STATIC, Modifier.FINAL);
+      CodeBlock.Builder staticInitBlock = CodeBlock.builder();
+      staticInitBlock.addStatement("CASE_ORBIT = new char[$L]", sortedOrbits.lastKey() + 1);
       for (Map.Entry<Integer, Integer> entry : sortedOrbits.entrySet()) {
-        initializer.append(String.format("{0x%04X, 0x%04X},\n", entry.getKey(), entry.getValue()));
+        staticInitBlock.addStatement(
+            "CASE_ORBIT[0x$L] = 0x$L",
+            Integer.toHexString(entry.getKey()),
+            Integer.toHexString(entry.getValue()));
       }
-      initializer.append("}");
-      caseOrbitField.initializer(initializer.toString());
       unicodeTables.addField(caseOrbitField.build());
+      unicodeTables.addStaticBlock(staticInitBlock.build());
     }
 
     // Emit range maps (e.g. Lu -> ranges of lowercase symbols).