diff --git a/charclass_invlists.inc b/charclass_invlists.inc index 50dfd4b65bf6..2e66955eb114 100644 --- a/charclass_invlists.inc +++ b/charclass_invlists.inc @@ -436055,7 +436055,7 @@ static const U8 WB_table[23][23] = { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables + * 4b2ad6e7689bea5acec1b52fa813a60fdac125a5cc6901cc02be3093b1697894 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index a10ec339bdb1..a2f20a0bef5f 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -871,6 +871,15 @@ push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend' push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133' if $v_version ge v6.2.0; +# These properties of Egyptian hieroglyphs are not handled by Perl. Their +# intended audience is only specialist Egyptologists. +push @tables_that_may_be_empty, qw(kEH_Cat kEH_Desc kEH_HG kEH_IFAO + kEH_JSesh + kEH_NoMirror kEH_NoMirror=Yes + kEH_NoMirror=No + kEH_NoRotate kEH_NoRotate=Yes) + if $v_version ge v16.0.0; + # The lists below are hashes, so the key is the item in the list, and the # value is the reason why it is in the list. This makes generation of # documentation easier. 
@@ -3618,6 +3627,11 @@ sub trace { return main::trace(@_); } return $standard_form{$addr} = main::standardize($value); } + sub count($self) { # How many code points this range spans + my $addr = pack('J', refaddr($self)); + return 1 + ($end{$addr} - $start{$addr}); + } + sub dump($self, $indent) { # Human, not machine readable. For machine readable, comment out this # entire routine and let the standard one take effect. @@ -4802,7 +4816,7 @@ sub trace { return main::trace(@_); } my $count = 0; foreach my $range (@{$ranges{$addr}}) { - $count += $range->end - $range->start + 1; + $count += $range->count; } return $count; } @@ -6019,7 +6033,7 @@ END # points" my $count = ($set->type != 0) ? 1 - : $set->end - $set->start + 1; + : $set->count; $widths{$this_width} += $count; $total += $count; $max_map_width = $this_width @@ -8811,7 +8825,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } # filesystem to distinguish between, this is used to manually give short # names for the directory name immediately under $match_tables that the # match tables for this property should be placed in. - main::set_access('match_subdir', \%match_subdir, 'r'); + main::set_access('match_subdir', \%match_subdir, 'r', 's'); my %has_dependency; # A boolean that gives whether some table somewhere is defined as the @@ -9986,7 +10000,6 @@ sub process_PropertyAliases($file) { for my $i (2 .. @data - 1) { $this->add_alias($data[$i]); } - } my $scf = property_ref("Simple_Case_Folding"); @@ -10049,10 +10062,13 @@ sub finish_property_setup($file) { # file directly (it was documented in 5.12 and 5.14 as being thusly # usable), keep it from being adjusted. (range_size_1 is # used to force the traditional format.) 
- if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) { - $nfkc_cf->set_to_output_map($EXTERNAL_MAP); - $nfkc_cf->set_range_size_1(1); + foreach my $property (qw(NFKC_Casefold NFKC_Simple_Casefold)) { + if (defined (my $cf = property_ref($property))) { + $cf->set_to_output_map($EXTERNAL_MAP); + $cf->set_range_size_1(1); + } } + if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) { $bmg->set_to_output_map($EXTERNAL_MAP); $bmg->set_range_size_1(1); @@ -10060,6 +10076,14 @@ sub finish_property_setup($file) { property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED); + # These two properties have no short names and the file names for them + # clash in DOS 8.3. Work around this by giving their match tables + # shorter, distinct subdirectory names. + my $IDCMStart = property_ref("ID_Compat_Math_Start"); + $IDCMStart->set_match_subdir("IDCMStart") if defined $IDCMStart; + my $IDCMCont = property_ref("ID_Compat_Math_Continue"); + $IDCMCont->set_match_subdir("IDCMContinue") if defined $IDCMCont; + # The rest of this sub is for properties that need the Multi_Default class # to create objects for defaults. As of v15.0, this is no longer needed. @@ -13375,6 +13399,30 @@ sub filter_early_version_name_alias_line { return; } +sub setup_Unikemet { + + # These are provisional properties, so aren't in PropAliases.txt + my %properties = ( + Core => $ENUM, + FVal => $STRING, + Func => $STRING, + UniK => $ENUM, + ); + for my $property (sort keys %properties) { # sorted: repeatable order + Property->new("kEH_$property", + Default_Map => "", + Type => $STRING, # NOTE: types in %properties unused + ); + } +} + +sub filter_Unikemet_line { + $_ =~ s/;/,/g; # mktables can't accept semi-colons + $_ =~ s/\t/; /g; # tab delimiters become '; ' + $_ =~ s/ ^ U\+ //x; # drop the leading 'U+' + return; +} + sub filter_all_caps_script_names { # Some early Unicode releases had the script names in all CAPS. 
This @@ -13736,6 +13784,10 @@ END next if $range->start == 0x1D7CE; # This whole range was added in 3.1 next if $range->end == 0x19DA && $v_version eq v5.2.0; next if $range->end - $range->start < 9 && $v_version le 4.0.0; + + # Two consecutive runs of 10 digits each were added in 16.0 + next if $range->start == 0x116D0 && $range->end == 0x116E3; + Carp::my_carp("Range $range unexpectedly doesn't contain 10" . " decimal digits. Code in regcomp.c assumes it does," . " and will have to be fixed. Proceeding anyway."); @@ -15179,11 +15231,11 @@ END # Perl tailors the WordBreak property so that \b{wb} doesn't split # adjacent spaces into separate words. Unicode 11.0 moved in that - # direction, but left TAB, FIGURE SPACE (U+2007), and (ironically) NO - # BREAK SPACE as breaking, so we retained the original Perl customization. - # To do this, in the Perl copy of WB, simply replace the mappings of - # horizontal space characters that otherwise would map to the default or - # the 11.0 'WSegSpace' to instead map to our tailoring. + # direction, but left TAB, FIGURE SPACE (U+2007), and (ironically) + # NO-BREAK SPACE as breaking, so we retained the original Perl + # customization. To do this, in the Perl copy of WB, simply replace the + # mappings of horizontal space characters that otherwise would map to the + # default or the 11.0 'WSegSpace' to instead map to our tailoring. 
my $perl_wb = property_ref('_Perl_WB'); my $default = $perl_wb->default_map; for my $range ($Blank->ranges) { @@ -19778,13 +19830,21 @@ my @input_file_objects = ( ), Input_file->new('IdStatus.txt', v13.0.0, Pre_Handler => \&setup_IdStatus, + Has_Missings_Defaults => $IGNORED, Property => 'Identifier_Status', + + # Part of UTS 39, so must be downloaded separately from + # unicode.org UCD => 0, ), Input_file->new('IdType.txt', v13.0.0, Pre_Handler => \&setup_IdType, + Has_Missings_Defaults => $IGNORED, Each_Line_Handler => \&filter_IdType_line, Property => 'Identifier_Type', + + # Part of UTS 39, so must be downloaded separately from + # unicode.org UCD => 0, ), Input_file->new('confusables.txt', v15.0.0, @@ -19799,6 +19859,17 @@ my @input_file_objects = ( Skip => $Unused_Skip, UCD => 0, ), + Input_file->new('Unikemet.txt', v16.0.0, + Pre_Handler => \&setup_Unikemet, + Each_Line_Handler => \&filter_Unikemet_line, + UCD => 0, + ), + Input_file->new('DoNotEmit.txt', v16.0.0, + # Advice about characters that are unwise to create; it + # defines no properties, though we could create some. + Skip => $Unused_Skip, + UCD => 0, + ), ); # End of all the preliminaries. 
diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index fe9034ecda6c..749a15729956 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1331,7 +1331,7 @@ # 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt # 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt # 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt -# 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables +# 4b2ad6e7689bea5acec1b52fa813a60fdac125a5cc6901cc02be3093b1697894 lib/unicore/mktables # 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index 0b7b686598ea..d43c6d2b27dd 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -3850,7 +3850,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables + * 4b2ad6e7689bea5acec1b52fa813a60fdac125a5cc6901cc02be3093b1697894 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 2a984ef37eb37e718ed25d472988745196816f9147d6ee6822a8efeafda340e5 regen/regcharclass.pl diff --git a/regexp_constants.h b/regexp_constants.h index 38e727c60c90..5875a9f3d091 100644 --- a/regexp_constants.h +++ 
b/regexp_constants.h @@ -78,7 +78,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables + * 4b2ad6e7689bea5acec1b52fa813a60fdac125a5cc6901cc02be3093b1697894 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index e013651e107a..07fcd5d09023 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7756,7 +7756,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables + * 4b2ad6e7689bea5acec1b52fa813a60fdac125a5cc6901cc02be3093b1697894 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl