Skip to content

Commit 8459b0b

Browse files
authored
Generate old monkeys (#979)
* An extra file * divergent syntaxes * spots
1 parent f930b35 commit 8459b0b

File tree

2 files changed

+122
-59
lines changed

2 files changed

+122
-59
lines changed

unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -60,6 +60,7 @@ public abstract class GenerateBreakTest implements UCD_Types {
6060
Normalizer nfd;
6161
Normalizer nfkd;
6262

63+
Segmenter segmenter;
6364
UnicodeMap<String> partition;
6465
UnicodeProperty prop;
6566

@@ -322,6 +323,7 @@ public void run() throws IOException {
322323

323324
boolean forCLDR = seg.target == Segmenter.Target.FOR_CLDR;
324325
String path = "UCD/" + ucd.getVersion() + '/' + (forCLDR ? "cldr/" : "auxiliary/");
326+
String extraPath = "UCD/" + ucd.getVersion() + "/extra/";
325327
String outFilename = fileName + "BreakTest";
326328
if (forCLDR) {
327329
outFilename = outFilename + "-cldr";
@@ -477,6 +479,37 @@ value, new ParsePosition(0), IUP.getXSymbolTable()))) {
477479
fc.close();
478480

479481
generateTest(false, path, outFilename, propertyName);
482+
generateCppOldMonkeys(extraPath, outFilename);
483+
}
484+
485+
private void generateCppOldMonkeys(String path, String outFilename) throws IOException {
486+
final UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader(path, outFilename + ".cpp");
487+
final PrintWriter out = fc.out;
488+
out.println();
489+
out.println("####### Instructions ##################################");
490+
out.println("# Copy the following lines into rbbitst.cpp in ICU4C, #");
491+
out.println(
492+
"# in the constructor of RBBIMeowMonkey, replacing the #"
493+
.replace("Meow", outFilename.substring(0, 4).replace("Graph", "Char")));
494+
out.println("# existing block of generated code. #");
495+
out.println("#######################################################");
496+
out.println();
497+
out.println(" // --- NOLI ME TANGERE ---");
498+
out.println(" // Generated by GenerateBreakTest.java in the Unicode tools.");
499+
for (Segmenter.Builder.NamedRefinedSet part : segmenter.getPartitionDefinition()) {
500+
out.println(
501+
" partition.emplace_back(\""
502+
+ part.getName()
503+
+ "\", UnicodeSet(uR\"("
504+
+ part.getDefinition()
505+
+ ")\", status));");
506+
}
507+
out.println();
508+
for (Segmenter.SegmentationRule rule : segmenter.getRules()) {
509+
out.println(" rules.push_back(" + rule.toCppOldMonkeyString() + ");");
510+
}
511+
out.println(" // --- End of generated code. ---");
512+
fc.close();
480513
}
481514

482515
private void generateTest(
@@ -1091,6 +1124,7 @@ public XGenerateBreakTest(
10911124
}
10921125
variables = segBuilder.getVariables();
10931126
collectingRules = false;
1127+
segmenter = seg;
10941128
partition = seg.getSamples();
10951129
fileName = filename;
10961130
propertyName = (filename.equals("Grapheme") ? "Grapheme_Cluster" : fileName) + "_Break";

unicodetools/src/main/java/org/unicode/tools/Segmenter.java

Lines changed: 88 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,6 @@
1717
import com.ibm.icu.text.UTF16;
1818
import com.ibm.icu.text.UnicodeSet;
1919
import com.ibm.icu.text.UnicodeSet.SpanCondition;
20-
import com.ibm.icu.text.UnicodeSet.XSymbolTable;
2120
import com.ibm.icu.text.UnicodeSetIterator;
2221
import com.ibm.icu.util.ULocale;
2322
import java.text.ParsePosition;
@@ -36,7 +35,9 @@
3635
import java.util.stream.Collectors;
3736
import org.unicode.cldr.draft.FileUtilities;
3837
import org.unicode.cldr.util.TransliteratorUtilities;
38+
import org.unicode.props.IndexUnicodeProperties;
3939
import org.unicode.props.UnicodeProperty;
40+
import org.unicode.tools.Segmenter.Builder.NamedRefinedSet;
4041
import org.unicode.tools.Segmenter.SegmentationRule.Breaks;
4142

4243
/** Ordered list of rules, with variables resolved before building. Use Builder to make. */
@@ -68,6 +69,7 @@ public enum Target {
6869
public final Target target;
6970

7071
private UnicodeMap<String> samples = new UnicodeMap<String>();
72+
private List<NamedRefinedSet> partitionDefinition = new ArrayList<>();
7173

7274
private Segmenter(Target target) {
7375
this.target = target;
@@ -279,13 +281,16 @@ public abstract Breaks applyAt(
279281
public String toString() {
280282
return toString(false);
281283
}
284+
285+
/**
 * Returns this rule rendered as a fragment of C++ source.
 *
 * <p>The implementations visible in this file produce a {@code std::make_unique<...>(...)}
 * expression whose arguments are C++ raw string literals; the generator wraps the result in a
 * {@code rules.push_back(...)} statement.
 *
 * @return the C++ expression that constructs the equivalent rule
 */
public abstract String toCppOldMonkeyString();
282286
}
283287

284288
/** A « treat as » rule. */
285289
public static class RemapRule extends SegmentationRule {
286290

287291
public RemapRule(String leftHandSide, String replacement, String line) {
288-
pattern = Pattern.compile(leftHandSide, REGEX_FLAGS);
292+
patternDefinition = leftHandSide;
293+
pattern = Pattern.compile(Builder.expandUnicodeSets(leftHandSide), REGEX_FLAGS);
289294
this.replacement = replacement;
290295
name = line;
291296
}
@@ -352,6 +357,7 @@ public void apply(
352357
remap.accept(result);
353358
}
354359

360+
private String patternDefinition;
355361
private Pattern pattern;
356362
private String replacement;
357363
private String name;
@@ -373,6 +379,17 @@ public Breaks applyAt(
373379
protected String toString(boolean showResolved) {
374380
return name;
375381
}
382+
383+
@Override
384+
public String toCppOldMonkeyString() {
385+
return "std::make_unique<RemapRule>(uR\"("
386+
+ name
387+
+ ")\", uR\"("
388+
+ patternDefinition.replaceAll("&", "&&").replaceAll("-", "--")
389+
+ ")\", uR\"("
390+
+ replacement
391+
+ ")\")";
392+
}
376393
}
377394

378395
/** A rule that determines the status of an offset. */
@@ -384,6 +401,10 @@ public static class RegexRule extends SegmentationRule {
384401
* @param line
385402
*/
386403
public RegexRule(String before, Breaks result, String after, String line) {
404+
beforeDefinition = before;
405+
afterDefinition = after;
406+
before = Builder.expandUnicodeSets(before);
407+
after = Builder.expandUnicodeSets(after);
387408
breaks = result;
388409
before = ".*(" + before + ")";
389410
String parsing = null;
@@ -453,12 +474,27 @@ public String toString(boolean showResolved) {
453474
return result;
454475
}
455476

477+
@Override
478+
public String toCppOldMonkeyString() {
479+
return "std::make_unique<RegexRule>(uR\"("
480+
+ name
481+
+ ")\", uR\"("
482+
+ beforeDefinition.replaceAll("&", "&&").replaceAll("-", "--")
483+
+ ")\", u'"
484+
+ (breaks == Breaks.BREAK ? '÷' : '×')
485+
+ "', uR\"("
486+
+ afterDefinition.replaceAll("&", "&&").replaceAll("-", "--")
487+
+ ")\")";
488+
}
489+
456490
// ============== Internals ================
457491
// We cannot use a single regex of the form "(?<= before) after" because
458492
// (RI RI)* RI × RI would require unbounded lookbehind.
459493
private Pattern before;
460494
private Pattern after;
461495
private String name;
496+
private String beforeDefinition;
497+
private String afterDefinition;
462498

463499
private String resolved;
464500
private Breaks breaks;
@@ -474,31 +510,36 @@ public String toString(boolean showResolved) {
474510
public static class Builder {
475511
private final UnicodeProperty.Factory propFactory;
476512
private final Target target;
477-
private XSymbolTable symbolTable;
478513
private List<String> rawVariables = new ArrayList<String>();
479514
private Map<Double, String> xmlRules = new TreeMap<Double, String>();
480515
private Map<Double, String> htmlRules = new TreeMap<Double, String>();
481516
private List<String> lastComments = new ArrayList<String>();
482517

483518
class NamedSet {
484-
NamedSet(String name, UnicodeSet set) {
519+
NamedSet(String name, String definition, UnicodeSet set) {
485520
this.name = name;
521+
this.definition = definition;
486522
this.set = set;
487523
}
488524

489525
String name;
526+
String definition;
490527
UnicodeSet set;
491528
}
492529

493-
class NamedRefinedSet {
530+
public class NamedRefinedSet {
494531
public NamedRefinedSet clone() {
495532
NamedRefinedSet result = new NamedRefinedSet();
496533
for (var term : intersectionTerms) {
497-
result.intersectionTerms.add(new NamedSet(term.name, term.set.cloneAsThawed()));
534+
result.intersectionTerms.add(
535+
new NamedSet(term.name, term.definition, term.set.cloneAsThawed()));
498536
}
499537
for (var subtrahend : subtrahends) {
500538
result.subtrahends.add(
501-
new NamedSet(subtrahend.name, subtrahend.set.cloneAsThawed()));
539+
new NamedSet(
540+
subtrahend.name,
541+
subtrahend.definition,
542+
subtrahend.set.cloneAsThawed()));
502543
}
503544
result.set = this.set.cloneAsThawed();
504545
return result;
@@ -547,6 +588,19 @@ public String getName() {
547588
.collect(Collectors.joining());
548589
}
549590

591+
public String getDefinition() {
592+
return intersectionTerms.isEmpty()
593+
? "[^[]]"
594+
: "["
595+
+ intersectionTerms.stream()
596+
.map((s) -> s.definition)
597+
.collect(Collectors.joining("&"))
598+
+ subtrahends.stream()
599+
.map((s) -> "-" + s.definition)
600+
.collect(Collectors.joining())
601+
+ "]";
602+
}
603+
550604
private UnicodeSet getIntersection() {
551605
UnicodeSet result = UnicodeSet.ALL_CODE_POINTS.cloneAsThawed();
552606
for (var term : intersectionTerms) {
@@ -565,54 +619,11 @@ private UnicodeSet getIntersection() {
565619
/**
 * Creates a builder and pre-populates the HTML rule table with the three built-in rules
 * (start of text, end of text, and the final "break anywhere" rule).
 *
 * @param factory source of the Unicode properties
 * @param target the target the resulting segmenter is built for
 */
public Builder(UnicodeProperty.Factory factory, Target target) {
    propFactory = factory;
    this.target = target;
    // Double.valueOf replaces the deprecated boxing constructor new Double(...); the resulting
    // keys are equal, so the ordering of the TreeMap is unaffected.
    htmlRules.put(Double.valueOf(BREAK_SOT), "sot \u00F7");
    htmlRules.put(Double.valueOf(BREAK_EOT), "\u00F7 eot");
    htmlRules.put(Double.valueOf(BREAK_ANY), "\u00F7 Any");
}
573626

574-
// copied to make independent of ICU4J internals
575-
private class MyXSymbolTable extends UnicodeSet.XSymbolTable {
576-
public boolean applyPropertyAlias(
577-
String propertyName, String propertyValue, UnicodeSet result) {
578-
UnicodeProperty prop = propFactory.getProperty(propertyName);
579-
if (prop == null) {
580-
if (propertyValue.isEmpty()) {
581-
prop = propFactory.getProperty("Script");
582-
result.clear();
583-
UnicodeSet x = prop.getSet(propertyName, result);
584-
if (!x.isEmpty()) {
585-
return true;
586-
}
587-
}
588-
// If we cannot handle the property name, then we need to really fail.
589-
// If we were to just print something and return false, then the UnicodeSet code
590-
// would just evaluate this itself, and may succeed but give wrong results.
591-
// For example, as long as we require "gc=Cn" and don't handle "Cn" here,
592-
// falling back to built-in ICU data means that we get gc=Cn ranges from ICU
593-
// rather than from the current Unicode beta.
594-
throw new IllegalArgumentException(
595-
"Segmenter.MyXSymbolTable: Unknown property " + propertyName);
596-
}
597-
// Binary properties:
598-
// \p{Extended_Pictographic} is equivalent with \p{Extended_Pictographic=Yes}
599-
if (propertyValue.isEmpty() && prop.isType(UnicodeProperty.BINARY_MASK)) {
600-
propertyValue = "Yes";
601-
}
602-
result.clear();
603-
UnicodeSet x = prop.getSet(propertyValue, result);
604-
if (x.isEmpty()) {
605-
// didn't find anything
606-
System.out.println(
607-
"Segmenter.MyXSymbolTable: !Empty! "
608-
+ propertyName
609-
+ "="
610-
+ propertyValue);
611-
}
612-
return true; // mark that we handled it even if there are no results.
613-
}
614-
}
615-
616627
public String toString(String testName, String indent) {
617628

618629
StringBuffer result = new StringBuffer();
@@ -728,10 +739,15 @@ Builder addVariable(String name, String value) {
728739
+ TransliteratorUtilities.toXML.transliterate(value)
729740
+ "</variable>");
730741
value = replaceVariables(value, variables);
742+
;
731743
if (!name.endsWith("_")) {
732744
try {
733745
parsePosition.setIndex(0);
734-
UnicodeSet valueSet = new UnicodeSet(value, parsePosition, symbolTable);
746+
UnicodeSet valueSet =
747+
new UnicodeSet(
748+
value,
749+
parsePosition,
750+
IndexUnicodeProperties.make().getXSymbolTable());
735751
if (parsePosition.getIndex() != value.length()) {
736752
if (SHOW_SAMPLES)
737753
System.out.println(
@@ -748,7 +764,7 @@ Builder addVariable(String name, String value) {
748764
} else {
749765
String name2 = name;
750766
if (name2.startsWith("$")) name2 = name2.substring(1);
751-
refinePartition(new NamedSet(name2, valueSet));
767+
refinePartition(new NamedSet(name2, value, valueSet));
752768
if (SHOW_SAMPLES) {
753769
System.out.println("Samples for: " + name + " = " + value);
754770
System.out.println("\t" + valueSet);
@@ -827,8 +843,7 @@ Builder addRemapRule(Double order, String before, String after, String line) {
827843
+ " </rule>");
828844
rules.put(
829845
order,
830-
new Segmenter.RemapRule(
831-
replaceVariables(before, expandedVariables), after, line));
846+
new Segmenter.RemapRule(replaceVariables(before, variables), after, line));
832847
return this;
833848
}
834849

@@ -889,9 +904,9 @@ Builder addRegexRule(
889904
rules.put(
890905
order,
891906
new Segmenter.RegexRule(
892-
replaceVariables(before, expandedVariables),
907+
replaceVariables(before, variables),
893908
breaks,
894-
replaceVariables(after, expandedVariables),
909+
replaceVariables(after, variables),
895910
line));
896911
return this;
897912
}
@@ -906,6 +921,7 @@ public Segmenter make() {
906921
for (Double key : rules.keySet()) {
907922
result.add(key.doubleValue(), rules.get(key));
908923
}
924+
result.partitionDefinition = partition;
909925
for (var part : partition) {
910926
if (part.getName() == null) {
911927
throw new IllegalArgumentException("Unclassified characters: " + part.getSet());
@@ -952,14 +968,19 @@ private static String replaceVariables(String input, Map<String, String> variabl
952968
}
953969

954970
/** Replaces Unicode Sets with literals. */
955-
public String expandUnicodeSets(String input) {
971+
public static String expandUnicodeSets(String input) {
956972
String result = input;
973+
var parsePosition = new ParsePosition(0);
957974
// replace properties
958975
// TODO really dumb parse for now, fix later
959976
for (int i = 0; i < result.length(); ++i) {
960977
if (UnicodeSet.resemblesPattern(result, i)) {
961978
parsePosition.setIndex(i);
962-
UnicodeSet temp = new UnicodeSet(result, parsePosition, symbolTable);
979+
UnicodeSet temp =
980+
new UnicodeSet(
981+
result,
982+
parsePosition,
983+
IndexUnicodeProperties.make().getXSymbolTable());
963984
String insert = getInsertablePattern(temp);
964985
result =
965986
result.substring(0, i)
@@ -981,7 +1002,7 @@ public String expandUnicodeSets(String input) {
9811002
* @param temp
9821003
* @return
9831004
*/
984-
private String getInsertablePattern(UnicodeSet temp) {
1005+
private static String getInsertablePattern(UnicodeSet temp) {
9851006
temp.complement().complement();
9861007
if (DEBUG_REDUCE_SET_SIZE != null) {
9871008
UnicodeSet temp2 = new UnicodeSet(temp);
@@ -1053,6 +1074,14 @@ public List<String> getRules() {
10531074
}
10541075
}
10551076

1077+
public List<NamedRefinedSet> getPartitionDefinition() {
1078+
return partitionDefinition;
1079+
}
1080+
1081+
public List<SegmentationRule> getRules() {
1082+
return rules;
1083+
}
1084+
10561085
// ============== Internals ================
10571086

10581087
private List<SegmentationRule> rules = new ArrayList<SegmentationRule>(1);

0 commit comments

Comments
 (0)