17
17
import com .ibm .icu .text .UTF16 ;
18
18
import com .ibm .icu .text .UnicodeSet ;
19
19
import com .ibm .icu .text .UnicodeSet .SpanCondition ;
20
- import com .ibm .icu .text .UnicodeSet .XSymbolTable ;
21
20
import com .ibm .icu .text .UnicodeSetIterator ;
22
21
import com .ibm .icu .util .ULocale ;
23
22
import java .text .ParsePosition ;
36
35
import java .util .stream .Collectors ;
37
36
import org .unicode .cldr .draft .FileUtilities ;
38
37
import org .unicode .cldr .util .TransliteratorUtilities ;
38
+ import org .unicode .props .IndexUnicodeProperties ;
39
39
import org .unicode .props .UnicodeProperty ;
40
+ import org .unicode .tools .Segmenter .Builder .NamedRefinedSet ;
40
41
import org .unicode .tools .Segmenter .SegmentationRule .Breaks ;
41
42
42
43
/** Ordered list of rules, with variables resolved before building. Use Builder to make. */
@@ -68,6 +69,7 @@ public enum Target {
68
69
public final Target target ;
69
70
70
71
private UnicodeMap <String > samples = new UnicodeMap <String >();
72
+ private List <NamedRefinedSet > partitionDefinition = new ArrayList <>();
71
73
72
74
private Segmenter (Target target ) {
73
75
this .target = target ;
@@ -279,13 +281,16 @@ public abstract Breaks applyAt(
279
281
public String toString () {
280
282
return toString (false );
281
283
}
284
+
285
/**
 * Returns this rule rendered as C++ source text.
 *
 * <p>The implementations visible in this file return a {@code std::make_unique<...>(...)}
 * expression whose string arguments are written as C++ raw UTF-16 string literals.
 * NOTE(review): presumably consumed by a C++ ("old monkey") test generator; confirm against
 * the callers.
 *
 * @return the C++ expression constructing the equivalent rule
 */
+ public abstract String toCppOldMonkeyString ();
282
286
}
283
287
284
288
/** A « treat as » rule. */
285
289
public static class RemapRule extends SegmentationRule {
286
290
287
291
public RemapRule (String leftHandSide , String replacement , String line ) {
288
- pattern = Pattern .compile (leftHandSide , REGEX_FLAGS );
292
+ patternDefinition = leftHandSide ;
293
+ pattern = Pattern .compile (Builder .expandUnicodeSets (leftHandSide ), REGEX_FLAGS );
289
294
this .replacement = replacement ;
290
295
name = line ;
291
296
}
@@ -352,6 +357,7 @@ public void apply(
352
357
remap .accept (result );
353
358
}
354
359
360
+ private String patternDefinition ;
355
361
private Pattern pattern ;
356
362
private String replacement ;
357
363
private String name ;
@@ -373,6 +379,17 @@ public Breaks applyAt(
373
379
protected String toString (boolean showResolved ) {
374
380
return name ;
375
381
}
382
+
383
+ @ Override
384
+ public String toCppOldMonkeyString () {
385
+ return "std::make_unique<RemapRule>(uR\" ("
386
+ + name
387
+ + ")\" , uR\" ("
388
+ + patternDefinition .replaceAll ("&" , "&&" ).replaceAll ("-" , "--" )
389
+ + ")\" , uR\" ("
390
+ + replacement
391
+ + ")\" )" ;
392
+ }
376
393
}
377
394
378
395
/** A rule that determines the status of an offset. */
@@ -384,6 +401,10 @@ public static class RegexRule extends SegmentationRule {
384
401
* @param line
385
402
*/
386
403
public RegexRule (String before , Breaks result , String after , String line ) {
404
+ beforeDefinition = before ;
405
+ afterDefinition = after ;
406
+ before = Builder .expandUnicodeSets (before );
407
+ after = Builder .expandUnicodeSets (after );
387
408
breaks = result ;
388
409
before = ".*(" + before + ")" ;
389
410
String parsing = null ;
@@ -453,12 +474,27 @@ public String toString(boolean showResolved) {
453
474
return result ;
454
475
}
455
476
477
+ @ Override
478
+ public String toCppOldMonkeyString () {
479
+ return "std::make_unique<RegexRule>(uR\" ("
480
+ + name
481
+ + ")\" , uR\" ("
482
+ + beforeDefinition .replaceAll ("&" , "&&" ).replaceAll ("-" , "--" )
483
+ + ")\" , u'"
484
+ + (breaks == Breaks .BREAK ? '÷' : '×' )
485
+ + "', uR\" ("
486
+ + afterDefinition .replaceAll ("&" , "&&" ).replaceAll ("-" , "--" )
487
+ + ")\" )" ;
488
+ }
489
+
456
490
// ============== Internals ================
457
491
// We cannot use a single regex of the form "(?<= before) after" because
458
492
// (RI RI)* RI × RI would require unbounded lookbehind.
459
493
private Pattern before ;
460
494
private Pattern after ;
461
495
private String name ;
496
+ private String beforeDefinition ;
497
+ private String afterDefinition ;
462
498
463
499
private String resolved ;
464
500
private Breaks breaks ;
@@ -474,31 +510,36 @@ public String toString(boolean showResolved) {
474
510
public static class Builder {
475
511
private final UnicodeProperty .Factory propFactory ;
476
512
private final Target target ;
477
- private XSymbolTable symbolTable ;
478
513
private List <String > rawVariables = new ArrayList <String >();
479
514
private Map <Double , String > xmlRules = new TreeMap <Double , String >();
480
515
private Map <Double , String > htmlRules = new TreeMap <Double , String >();
481
516
private List <String > lastComments = new ArrayList <String >();
482
517
483
518
class NamedSet {
484
- NamedSet (String name , UnicodeSet set ) {
519
+ NamedSet (String name , String definition , UnicodeSet set ) {
485
520
this .name = name ;
521
+ this .definition = definition ;
486
522
this .set = set ;
487
523
}
488
524
489
525
String name ;
526
+ String definition ;
490
527
UnicodeSet set ;
491
528
}
492
529
493
- class NamedRefinedSet {
530
+ public class NamedRefinedSet {
494
531
public NamedRefinedSet clone () {
495
532
NamedRefinedSet result = new NamedRefinedSet ();
496
533
for (var term : intersectionTerms ) {
497
- result .intersectionTerms .add (new NamedSet (term .name , term .set .cloneAsThawed ()));
534
+ result .intersectionTerms .add (
535
+ new NamedSet (term .name , term .definition , term .set .cloneAsThawed ()));
498
536
}
499
537
for (var subtrahend : subtrahends ) {
500
538
result .subtrahends .add (
501
- new NamedSet (subtrahend .name , subtrahend .set .cloneAsThawed ()));
539
+ new NamedSet (
540
+ subtrahend .name ,
541
+ subtrahend .definition ,
542
+ subtrahend .set .cloneAsThawed ()));
502
543
}
503
544
result .set = this .set .cloneAsThawed ();
504
545
return result ;
@@ -547,6 +588,19 @@ public String getName() {
547
588
.collect (Collectors .joining ());
548
589
}
549
590
591
+ public String getDefinition () {
592
+ return intersectionTerms .isEmpty ()
593
+ ? "[^[]]"
594
+ : "["
595
+ + intersectionTerms .stream ()
596
+ .map ((s ) -> s .definition )
597
+ .collect (Collectors .joining ("&" ))
598
+ + subtrahends .stream ()
599
+ .map ((s ) -> "-" + s .definition )
600
+ .collect (Collectors .joining ())
601
+ + "]" ;
602
+ }
603
+
550
604
private UnicodeSet getIntersection () {
551
605
UnicodeSet result = UnicodeSet .ALL_CODE_POINTS .cloneAsThawed ();
552
606
for (var term : intersectionTerms ) {
@@ -565,54 +619,11 @@ private UnicodeSet getIntersection() {
565
619
/**
 * Creates a builder for the given target.
 *
 * <p>Besides storing its arguments, the constructor seeds the HTML rule table with the three
 * entries keyed by {@code BREAK_SOT}, {@code BREAK_EOT} and {@code BREAK_ANY}.
 *
 * @param factory property factory stored for later use by this builder
 * @param target segmentation target stored for later use by this builder
 */
public Builder(UnicodeProperty.Factory factory, Target target) {
    propFactory = factory;
    this.target = target;
    // Double.valueOf replaces the deprecated Double constructor; the resulting keys are equal.
    htmlRules.put(Double.valueOf(BREAK_SOT), "sot \u00F7 ");
    htmlRules.put(Double.valueOf(BREAK_EOT), "\u00F7 eot");
    htmlRules.put(Double.valueOf(BREAK_ANY), "\u00F7 Any");
}
573
626
574
- // copied to make independent of ICU4J internals
575
- private class MyXSymbolTable extends UnicodeSet .XSymbolTable {
576
- public boolean applyPropertyAlias (
577
- String propertyName , String propertyValue , UnicodeSet result ) {
578
- UnicodeProperty prop = propFactory .getProperty (propertyName );
579
- if (prop == null ) {
580
- if (propertyValue .isEmpty ()) {
581
- prop = propFactory .getProperty ("Script" );
582
- result .clear ();
583
- UnicodeSet x = prop .getSet (propertyName , result );
584
- if (!x .isEmpty ()) {
585
- return true ;
586
- }
587
- }
588
- // If we cannot handle the property name, then we need to really fail.
589
- // If we were to just print something and return false, then the UnicodeSet code
590
- // would just evaluate this itself, and may succeed but give wrong results.
591
- // For example, as long as we require "gc=Cn" and don't handle "Cn" here,
592
- // falling back to built-in ICU data means that we get gc=Cn ranges from ICU
593
- // rather than from the current Unicode beta.
594
- throw new IllegalArgumentException (
595
- "Segmenter.MyXSymbolTable: Unknown property " + propertyName );
596
- }
597
- // Binary properties:
598
- // \p{Extended_Pictographic} is equivalent with \p{Extended_Pictographic=Yes}
599
- if (propertyValue .isEmpty () && prop .isType (UnicodeProperty .BINARY_MASK )) {
600
- propertyValue = "Yes" ;
601
- }
602
- result .clear ();
603
- UnicodeSet x = prop .getSet (propertyValue , result );
604
- if (x .isEmpty ()) {
605
- // didn't find anything
606
- System .out .println (
607
- "Segmenter.MyXSymbolTable: !Empty! "
608
- + propertyName
609
- + "="
610
- + propertyValue );
611
- }
612
- return true ; // mark that we handled it even if there are no results.
613
- }
614
- }
615
-
616
627
public String toString (String testName , String indent ) {
617
628
618
629
StringBuffer result = new StringBuffer ();
@@ -728,10 +739,15 @@ Builder addVariable(String name, String value) {
728
739
+ TransliteratorUtilities .toXML .transliterate (value )
729
740
+ "</variable>" );
730
741
value = replaceVariables (value , variables );
742
+ ;
731
743
if (!name .endsWith ("_" )) {
732
744
try {
733
745
parsePosition .setIndex (0 );
734
- UnicodeSet valueSet = new UnicodeSet (value , parsePosition , symbolTable );
746
+ UnicodeSet valueSet =
747
+ new UnicodeSet (
748
+ value ,
749
+ parsePosition ,
750
+ IndexUnicodeProperties .make ().getXSymbolTable ());
735
751
if (parsePosition .getIndex () != value .length ()) {
736
752
if (SHOW_SAMPLES )
737
753
System .out .println (
@@ -748,7 +764,7 @@ Builder addVariable(String name, String value) {
748
764
} else {
749
765
String name2 = name ;
750
766
if (name2 .startsWith ("$" )) name2 = name2 .substring (1 );
751
- refinePartition (new NamedSet (name2 , valueSet ));
767
+ refinePartition (new NamedSet (name2 , value , valueSet ));
752
768
if (SHOW_SAMPLES ) {
753
769
System .out .println ("Samples for: " + name + " = " + value );
754
770
System .out .println ("\t " + valueSet );
@@ -827,8 +843,7 @@ Builder addRemapRule(Double order, String before, String after, String line) {
827
843
+ " </rule>" );
828
844
rules .put (
829
845
order ,
830
- new Segmenter .RemapRule (
831
- replaceVariables (before , expandedVariables ), after , line ));
846
+ new Segmenter .RemapRule (replaceVariables (before , variables ), after , line ));
832
847
return this ;
833
848
}
834
849
@@ -889,9 +904,9 @@ Builder addRegexRule(
889
904
rules .put (
890
905
order ,
891
906
new Segmenter .RegexRule (
892
- replaceVariables (before , expandedVariables ),
907
+ replaceVariables (before , variables ),
893
908
breaks ,
894
- replaceVariables (after , expandedVariables ),
909
+ replaceVariables (after , variables ),
895
910
line ));
896
911
return this ;
897
912
}
@@ -906,6 +921,7 @@ public Segmenter make() {
906
921
for (Double key : rules .keySet ()) {
907
922
result .add (key .doubleValue (), rules .get (key ));
908
923
}
924
+ result .partitionDefinition = partition ;
909
925
for (var part : partition ) {
910
926
if (part .getName () == null ) {
911
927
throw new IllegalArgumentException ("Unclassified characters: " + part .getSet ());
@@ -952,14 +968,19 @@ private static String replaceVariables(String input, Map<String, String> variabl
952
968
}
953
969
954
970
/** Replaces Unicode Sets with literals. */
955
- public String expandUnicodeSets (String input ) {
971
+ public static String expandUnicodeSets (String input ) {
956
972
String result = input ;
973
+ var parsePosition = new ParsePosition (0 );
957
974
// replace properties
958
975
// TODO really dumb parse for now, fix later
959
976
for (int i = 0 ; i < result .length (); ++i ) {
960
977
if (UnicodeSet .resemblesPattern (result , i )) {
961
978
parsePosition .setIndex (i );
962
- UnicodeSet temp = new UnicodeSet (result , parsePosition , symbolTable );
979
+ UnicodeSet temp =
980
+ new UnicodeSet (
981
+ result ,
982
+ parsePosition ,
983
+ IndexUnicodeProperties .make ().getXSymbolTable ());
963
984
String insert = getInsertablePattern (temp );
964
985
result =
965
986
result .substring (0 , i )
@@ -981,7 +1002,7 @@ public String expandUnicodeSets(String input) {
981
1002
* @param temp
982
1003
* @return
983
1004
*/
984
- private String getInsertablePattern (UnicodeSet temp ) {
1005
+ private static String getInsertablePattern (UnicodeSet temp ) {
985
1006
temp .complement ().complement ();
986
1007
if (DEBUG_REDUCE_SET_SIZE != null ) {
987
1008
UnicodeSet temp2 = new UnicodeSet (temp );
@@ -1053,6 +1074,14 @@ public List<String> getRules() {
1053
1074
}
1054
1075
}
1055
1076
1077
+ public List <NamedRefinedSet > getPartitionDefinition () {
1078
+ return partitionDefinition ;
1079
+ }
1080
+
1081
+ public List <SegmentationRule > getRules () {
1082
+ return rules ;
1083
+ }
1084
+
1056
1085
// ============== Internals ================
1057
1086
1058
1087
private List <SegmentationRule > rules = new ArrayList <SegmentationRule >(1 );
0 commit comments