Skip to content

Commit

Permalink
eliminate runtime binary search in simpleFold
Browse files Browse the repository at this point in the history
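This implements the approach taken by @mykeul in #101. Instead of a
dense array of (codepoint, case-folded codepoint) pairs that has to be
binary-searched, CASE_ORBIT becomes a sparse array indexed by codepoint:
the element at index r is the next codepoint in r's case-folding orbit,
or zero when r has no entry.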

The previous pull request was written before UnicodeTablesGenerator
existed.
  • Loading branch information
sjamesr committed Jun 9, 2020
1 parent 0a7c5df commit 83ae537
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 85 deletions.
14 changes: 2 additions & 12 deletions java/com/google/re2j/Unicode.java
Original file line number Diff line number Diff line change
Expand Up @@ -106,18 +106,8 @@ static boolean isPrint(int r) {
//
static int simpleFold(int r) {
// Consult caseOrbit table for special cases.
int lo = 0;
int hi = UnicodeTables.CASE_ORBIT.length;
while (lo < hi) {
int m = lo + (hi - lo) / 2;
if (UnicodeTables.CASE_ORBIT[m][0] < r) {
lo = m + 1;
} else {
hi = m;
}
}
if (lo < UnicodeTables.CASE_ORBIT.length && UnicodeTables.CASE_ORBIT[lo][0] == r) {
return UnicodeTables.CASE_ORBIT[lo][1];
if (r < UnicodeTables.CASE_ORBIT.length && UnicodeTables.CASE_ORBIT[r] != 0) {
return UnicodeTables.CASE_ORBIT[r];
}

// No folding specified. This is a one- or two-element
Expand Down
135 changes: 69 additions & 66 deletions java/com/google/re2j/UnicodeTables.java
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Generated at 2020-06-02T04:01:21.503708Z by Java 11.0.7 using Unicode version 6.0.0.0.
// Generated at 2020-06-09T16:20:06.352Z by Java 1.8.0_181 using Unicode version 6.0.0.0.
// Do not change this file, your edits will be lost. Instead change UnicodeTablesGenerator.java.
package com.google.re2j;

Expand All @@ -7,71 +7,7 @@
import java.util.Map;

final class UnicodeTables {
static final int[][] CASE_ORBIT = {
{0x004B, 0x006B},
{0x0053, 0x0073},
{0x006B, 0x212A},
{0x0073, 0x017F},
{0x00B5, 0x039C},
{0x00C5, 0x00E5},
{0x00DF, 0x1E9E},
{0x00E5, 0x212B},
{0x0130, 0x0130},
{0x0131, 0x0131},
{0x017F, 0x0053},
{0x01C4, 0x01C5},
{0x01C5, 0x01C6},
{0x01C6, 0x01C4},
{0x01C7, 0x01C8},
{0x01C8, 0x01C9},
{0x01C9, 0x01C7},
{0x01CA, 0x01CB},
{0x01CB, 0x01CC},
{0x01CC, 0x01CA},
{0x01F1, 0x01F2},
{0x01F2, 0x01F3},
{0x01F3, 0x01F1},
{0x0345, 0x0399},
{0x0392, 0x03B2},
{0x0395, 0x03B5},
{0x0398, 0x03B8},
{0x0399, 0x03B9},
{0x039A, 0x03BA},
{0x039C, 0x03BC},
{0x03A0, 0x03C0},
{0x03A1, 0x03C1},
{0x03A3, 0x03C2},
{0x03A6, 0x03C6},
{0x03A9, 0x03C9},
{0x03B2, 0x03D0},
{0x03B5, 0x03F5},
{0x03B8, 0x03D1},
{0x03B9, 0x1FBE},
{0x03BA, 0x03F0},
{0x03BC, 0x00B5},
{0x03C0, 0x03D6},
{0x03C1, 0x03F1},
{0x03C2, 0x03C3},
{0x03C3, 0x03A3},
{0x03C6, 0x03D5},
{0x03C9, 0x2126},
{0x03D0, 0x0392},
{0x03D1, 0x03F4},
{0x03D5, 0x03A6},
{0x03D6, 0x03A0},
{0x03F0, 0x039A},
{0x03F1, 0x03A1},
{0x03F4, 0x0398},
{0x03F5, 0x0395},
{0x1E60, 0x1E61},
{0x1E61, 0x1E9B},
{0x1E9B, 0x1E60},
{0x1E9E, 0x00DF},
{0x1FBE, 0x0345},
{0x2126, 0x03A9},
{0x212A, 0x004B},
{0x212B, 0x00C5},
};
static final char[] CASE_ORBIT;

static final int[][] Lu = make_Lu();

Expand Down Expand Up @@ -361,6 +297,73 @@ final class UnicodeTables {

static final Map<String, int[][]> FOLD_CATEGORIES = FoldCategory();

// Populates CASE_ORBIT, a sparse lookup table indexed by code point. Each assigned element holds
// the code point that follows the index in its case-folding orbit. Elements that are not assigned
// here keep the default char value of 0; the simpleFold lookup only uses an entry when it is
// non-zero, so 0 means "no special case for this code point".
static {
// 8492 == 0x212b + 1: just large enough to index the highest key assigned below.
CASE_ORBIT = new char[8492];
// The entries for K, k and U+212A form a cycle: 0x4b -> 0x6b -> 0x212a -> 0x4b.
CASE_ORBIT[0x4b] = 0x6b;
CASE_ORBIT[0x53] = 0x73;
CASE_ORBIT[0x6b] = 0x212a;
CASE_ORBIT[0x73] = 0x17f;
CASE_ORBIT[0xb5] = 0x39c;
CASE_ORBIT[0xc5] = 0xe5;
CASE_ORBIT[0xdf] = 0x1e9e;
CASE_ORBIT[0xe5] = 0x212b;
// 0x130 and 0x131 each map to themselves.
CASE_ORBIT[0x130] = 0x130;
CASE_ORBIT[0x131] = 0x131;
CASE_ORBIT[0x17f] = 0x53;
// Three-element cycles: each entry points at the next member and the last wraps to the first.
CASE_ORBIT[0x1c4] = 0x1c5;
CASE_ORBIT[0x1c5] = 0x1c6;
CASE_ORBIT[0x1c6] = 0x1c4;
CASE_ORBIT[0x1c7] = 0x1c8;
CASE_ORBIT[0x1c8] = 0x1c9;
CASE_ORBIT[0x1c9] = 0x1c7;
CASE_ORBIT[0x1ca] = 0x1cb;
CASE_ORBIT[0x1cb] = 0x1cc;
CASE_ORBIT[0x1cc] = 0x1ca;
CASE_ORBIT[0x1f1] = 0x1f2;
CASE_ORBIT[0x1f2] = 0x1f3;
CASE_ORBIT[0x1f3] = 0x1f1;
CASE_ORBIT[0x345] = 0x399;
CASE_ORBIT[0x392] = 0x3b2;
CASE_ORBIT[0x395] = 0x3b5;
CASE_ORBIT[0x398] = 0x3b8;
CASE_ORBIT[0x399] = 0x3b9;
CASE_ORBIT[0x39a] = 0x3ba;
CASE_ORBIT[0x39c] = 0x3bc;
CASE_ORBIT[0x3a0] = 0x3c0;
CASE_ORBIT[0x3a1] = 0x3c1;
CASE_ORBIT[0x3a3] = 0x3c2;
CASE_ORBIT[0x3a6] = 0x3c6;
CASE_ORBIT[0x3a9] = 0x3c9;
CASE_ORBIT[0x3b2] = 0x3d0;
CASE_ORBIT[0x3b5] = 0x3f5;
CASE_ORBIT[0x3b8] = 0x3d1;
CASE_ORBIT[0x3b9] = 0x1fbe;
CASE_ORBIT[0x3ba] = 0x3f0;
CASE_ORBIT[0x3bc] = 0xb5;
CASE_ORBIT[0x3c0] = 0x3d6;
CASE_ORBIT[0x3c1] = 0x3f1;
CASE_ORBIT[0x3c2] = 0x3c3;
CASE_ORBIT[0x3c3] = 0x3a3;
CASE_ORBIT[0x3c6] = 0x3d5;
CASE_ORBIT[0x3c9] = 0x2126;
CASE_ORBIT[0x3d0] = 0x392;
CASE_ORBIT[0x3d1] = 0x3f4;
CASE_ORBIT[0x3d5] = 0x3a6;
CASE_ORBIT[0x3d6] = 0x3a0;
CASE_ORBIT[0x3f0] = 0x39a;
CASE_ORBIT[0x3f1] = 0x3a1;
CASE_ORBIT[0x3f4] = 0x398;
CASE_ORBIT[0x3f5] = 0x395;
CASE_ORBIT[0x1e60] = 0x1e61;
CASE_ORBIT[0x1e61] = 0x1e9b;
CASE_ORBIT[0x1e9b] = 0x1e60;
CASE_ORBIT[0x1e9e] = 0xdf;
CASE_ORBIT[0x1fbe] = 0x345;
CASE_ORBIT[0x2126] = 0x3a9;
CASE_ORBIT[0x212a] = 0x4b;
CASE_ORBIT[0x212b] = 0xc5;
}

private UnicodeTables() {}

private static int[][] make_Lu() {
Expand Down
20 changes: 13 additions & 7 deletions unicode/src/main/java/com/google/re2j/UnicodeTablesGenerator.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import com.ibm.icu.lang.UCharacterEnums;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.squareup.javapoet.CodeBlock;
import com.squareup.javapoet.FieldSpec;
import com.squareup.javapoet.JavaFile;
import com.squareup.javapoet.MethodSpec;
Expand Down Expand Up @@ -44,7 +45,7 @@ public class UnicodeTablesGenerator {
private final TypeSpec.Builder unicodeTables =
TypeSpec.classBuilder("UnicodeTables").addModifiers(Modifier.FINAL);

private final Map<Integer, Integer> sortedOrbits = generateCaseFoldOrbits();
private final SortedMap<Integer, Integer> sortedOrbits = generateCaseFoldOrbits();

public static void main(String[] args) {
new UnicodeTablesGenerator();
Expand Down Expand Up @@ -99,17 +100,22 @@ public UnicodeTablesGenerator() {
scriptRange.add(i);
}

// Emit code fold orbits
// Emit case fold orbits. In order to avoid a binary search at runtime, this code emits a sparse
// array mapping each codepoint to the next codepoint in its case folding orbit, e.g.
// K -> k -> U+212A (Kelvin sign) -> K.
{
FieldSpec.Builder caseOrbitField =
FieldSpec.builder(int[][].class, "CASE_ORBIT", Modifier.STATIC, Modifier.FINAL);
StringBuilder initializer = new StringBuilder("{\n");
FieldSpec.builder(char[].class, "CASE_ORBIT", Modifier.STATIC, Modifier.FINAL);
CodeBlock.Builder staticInitBlock = CodeBlock.builder();
staticInitBlock.addStatement("CASE_ORBIT = new char[$L]", sortedOrbits.lastKey() + 1);
for (Map.Entry<Integer, Integer> entry : sortedOrbits.entrySet()) {
initializer.append(String.format("{0x%04X, 0x%04X},\n", entry.getKey(), entry.getValue()));
staticInitBlock.addStatement(
"CASE_ORBIT[0x$L] = 0x$L",
Integer.toHexString(entry.getKey()),
Integer.toHexString(entry.getValue()));
}
initializer.append("}");
caseOrbitField.initializer(initializer.toString());
unicodeTables.addField(caseOrbitField.build());
unicodeTables.addStaticBlock(staticInitBlock.build());
}

// Emit range maps (e.g. Lu -> ranges of lowercase symbols).
Expand Down

0 comments on commit 83ae537

Please sign in to comment.