From 3569950c81b690fc21bf43968a40d6824550e9b4 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 17 Mar 2025 12:15:28 -0600 Subject: [PATCH 1/8] mktables: White-space, comment only Add comments, and rewrap comment lines to fit 80 columns --- charclass_invlists.inc | 2 +- lib/unicore/mktables | 17 +++++++++++------ lib/unicore/uni_keywords.pl | 2 +- regcharclass.h | 2 +- regexp_constants.h | 2 +- uni_keywords.h | 2 +- 6 files changed, 16 insertions(+), 11 deletions(-) diff --git a/charclass_invlists.inc b/charclass_invlists.inc index 50dfd4b65bf6..a3fcef5845a6 100644 --- a/charclass_invlists.inc +++ b/charclass_invlists.inc @@ -436055,7 +436055,7 @@ static const U8 WB_table[23][23] = { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables + * c261eebd8c457d06f3eaf523128819562ebf48dd277dc76b55b3c2956f5e99e8 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index a10ec339bdb1..854d01e1f1d6 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -9986,7 +9986,6 @@ sub process_PropertyAliases($file) { for my $i (2 .. @data - 1) { $this->add_alias($data[$i]); } - } my $scf = property_ref("Simple_Case_Folding"); @@ -15179,11 +15178,11 @@ END # Perl tailors the WordBreak property so that \b{wb} doesn't split # adjacent spaces into separate words. 
Unicode 11.0 moved in that - # direction, but left TAB, FIGURE SPACE (U+2007), and (ironically) NO - # BREAK SPACE as breaking, so we retained the original Perl customization. - # To do this, in the Perl copy of WB, simply replace the mappings of - # horizontal space characters that otherwise would map to the default or - # the 11.0 'WSegSpace' to instead map to our tailoring. + # direction, but left TAB, FIGURE SPACE (U+2007), and (ironically) + # NO_BREAK SPACE as breaking, so we retained the original Perl + # customization. To do this, in the Perl copy of WB, simply replace the + # mappings of horizontal space characters that otherwise would map to the + # default or the 11.0 'WSegSpace' to instead map to our tailoring. my $perl_wb = property_ref('_Perl_WB'); my $default = $perl_wb->default_map; for my $range ($Blank->ranges) { @@ -19779,12 +19778,18 @@ my @input_file_objects = ( Input_file->new('IdStatus.txt', v13.0.0, Pre_Handler => \&setup_IdStatus, Property => 'Identifier_Status', + + # Part of UTS 39, so must be downloaded separately from + # unicode.org UCD => 0, ), Input_file->new('IdType.txt', v13.0.0, Pre_Handler => \&setup_IdType, Each_Line_Handler => \&filter_IdType_line, Property => 'Identifier_Type', + + # Part of UTS 39, so must be downloaded separately from + # unicode.org UCD => 0, ), Input_file->new('confusables.txt', v15.0.0, diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index fe9034ecda6c..f85d7c99e5d3 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1331,7 +1331,7 @@ # 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt # 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt # 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt -# 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables +# 
c261eebd8c457d06f3eaf523128819562ebf48dd277dc76b55b3c2956f5e99e8 lib/unicore/mktables # 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index 0b7b686598ea..6a1af51b857b 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -3850,7 +3850,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables + * c261eebd8c457d06f3eaf523128819562ebf48dd277dc76b55b3c2956f5e99e8 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 2a984ef37eb37e718ed25d472988745196816f9147d6ee6822a8efeafda340e5 regen/regcharclass.pl diff --git a/regexp_constants.h b/regexp_constants.h index 38e727c60c90..cff4b03fab0c 100644 --- a/regexp_constants.h +++ b/regexp_constants.h @@ -78,7 +78,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables + * c261eebd8c457d06f3eaf523128819562ebf48dd277dc76b55b3c2956f5e99e8 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 
lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index e013651e107a..f31b4eae3814 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7756,7 +7756,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 8968a5ee00063fa31ff15474b95ccb6b9c228e3ebad6c20bc77f4225187e2023 lib/unicore/mktables + * c261eebd8c457d06f3eaf523128819562ebf48dd277dc76b55b3c2956f5e99e8 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl From 7f96e1277176b5b4726694d97af5e9260a0b7870 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 18 Mar 2025 12:48:39 -0600 Subject: [PATCH 2/8] mktables: Handle new property NFKC_Simple_Casefold Unicode 15.1 introduces this new property, which needs the same special handling as plain NFKC_Casefold does. 
--- charclass_invlists.inc | 2 +- lib/unicore/mktables | 9 ++++++--- lib/unicore/uni_keywords.pl | 2 +- regcharclass.h | 2 +- regexp_constants.h | 2 +- uni_keywords.h | 2 +- 6 files changed, 11 insertions(+), 8 deletions(-) diff --git a/charclass_invlists.inc b/charclass_invlists.inc index a3fcef5845a6..75d408038824 100644 --- a/charclass_invlists.inc +++ b/charclass_invlists.inc @@ -436055,7 +436055,7 @@ static const U8 WB_table[23][23] = { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * c261eebd8c457d06f3eaf523128819562ebf48dd277dc76b55b3c2956f5e99e8 lib/unicore/mktables + * 215df5b50b8b30403ceb597a33ffddd7da0aa096813b5e90e1c8851c6e61fc20 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 854d01e1f1d6..2ef527a624dc 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -10048,10 +10048,13 @@ sub finish_property_setup($file) { # file directly (it was documented in 5.12 and 5.14 as being thusly # usable), keep it from being adjusted. (range_size_1 is # used to force the traditional format.) 
- if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) { - $nfkc_cf->set_to_output_map($EXTERNAL_MAP); - $nfkc_cf->set_range_size_1(1); + foreach my $property (qw(NFKC_Casefold NFKC_Simple_Casefold)) { + if (defined (my $cf = property_ref($property))) { + $cf->set_to_output_map($EXTERNAL_MAP); + $cf->set_range_size_1(1); + } } + if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) { $bmg->set_to_output_map($EXTERNAL_MAP); $bmg->set_range_size_1(1); diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index f85d7c99e5d3..2b1dc74773cf 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1331,7 +1331,7 @@ # 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt # 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt # 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt -# c261eebd8c457d06f3eaf523128819562ebf48dd277dc76b55b3c2956f5e99e8 lib/unicore/mktables +# 215df5b50b8b30403ceb597a33ffddd7da0aa096813b5e90e1c8851c6e61fc20 lib/unicore/mktables # 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index 6a1af51b857b..06d57f9bed10 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -3850,7 +3850,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * c261eebd8c457d06f3eaf523128819562ebf48dd277dc76b55b3c2956f5e99e8 lib/unicore/mktables + * 
215df5b50b8b30403ceb597a33ffddd7da0aa096813b5e90e1c8851c6e61fc20 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 2a984ef37eb37e718ed25d472988745196816f9147d6ee6822a8efeafda340e5 regen/regcharclass.pl diff --git a/regexp_constants.h b/regexp_constants.h index cff4b03fab0c..369e81603925 100644 --- a/regexp_constants.h +++ b/regexp_constants.h @@ -78,7 +78,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * c261eebd8c457d06f3eaf523128819562ebf48dd277dc76b55b3c2956f5e99e8 lib/unicore/mktables + * 215df5b50b8b30403ceb597a33ffddd7da0aa096813b5e90e1c8851c6e61fc20 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index f31b4eae3814..f08945252dc5 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7756,7 +7756,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * c261eebd8c457d06f3eaf523128819562ebf48dd277dc76b55b3c2956f5e99e8 lib/unicore/mktables + * 215df5b50b8b30403ceb597a33ffddd7da0aa096813b5e90e1c8851c6e61fc20 lib/unicore/mktables * 
55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl From 20c6a055d6d796fac75d1923b99e1b3ab35542d7 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 18 Mar 2025 12:54:47 -0600 Subject: [PATCH 3/8] mktables: Ignore missings entries in two files These files are changed in 15.1 to have @missings lines, whereas they didn't before. This leads to some warning messages, so turn off looking at them, as we do for a number of other files. --- charclass_invlists.inc | 2 +- lib/unicore/mktables | 2 ++ lib/unicore/uni_keywords.pl | 2 +- regcharclass.h | 2 +- regexp_constants.h | 2 +- uni_keywords.h | 2 +- 6 files changed, 7 insertions(+), 5 deletions(-) diff --git a/charclass_invlists.inc b/charclass_invlists.inc index 75d408038824..f2311e8b08e2 100644 --- a/charclass_invlists.inc +++ b/charclass_invlists.inc @@ -436055,7 +436055,7 @@ static const U8 WB_table[23][23] = { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 215df5b50b8b30403ceb597a33ffddd7da0aa096813b5e90e1c8851c6e61fc20 lib/unicore/mktables + * 609be4fb13d99bc8a003a01a6e6d07bfb65c6240900eafa220dd4761a4a9cbb2 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 2ef527a624dc..fffffbf02603 100644 --- a/lib/unicore/mktables +++ 
b/lib/unicore/mktables @@ -19780,6 +19780,7 @@ my @input_file_objects = ( ), Input_file->new('IdStatus.txt', v13.0.0, Pre_Handler => \&setup_IdStatus, + Has_Missings_Defaults => $IGNORED, Property => 'Identifier_Status', # Part of UTS 39, so must be downloaded separately from @@ -19788,6 +19789,7 @@ my @input_file_objects = ( ), Input_file->new('IdType.txt', v13.0.0, Pre_Handler => \&setup_IdType, + Has_Missings_Defaults => $IGNORED, Each_Line_Handler => \&filter_IdType_line, Property => 'Identifier_Type', diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index 2b1dc74773cf..4a108a04c75a 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1331,7 +1331,7 @@ # 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt # 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt # 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt -# 215df5b50b8b30403ceb597a33ffddd7da0aa096813b5e90e1c8851c6e61fc20 lib/unicore/mktables +# 609be4fb13d99bc8a003a01a6e6d07bfb65c6240900eafa220dd4761a4a9cbb2 lib/unicore/mktables # 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index 06d57f9bed10..1e98903553ed 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -3850,7 +3850,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 
215df5b50b8b30403ceb597a33ffddd7da0aa096813b5e90e1c8851c6e61fc20 lib/unicore/mktables + * 609be4fb13d99bc8a003a01a6e6d07bfb65c6240900eafa220dd4761a4a9cbb2 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 2a984ef37eb37e718ed25d472988745196816f9147d6ee6822a8efeafda340e5 regen/regcharclass.pl diff --git a/regexp_constants.h b/regexp_constants.h index 369e81603925..1edaa22c3f11 100644 --- a/regexp_constants.h +++ b/regexp_constants.h @@ -78,7 +78,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 215df5b50b8b30403ceb597a33ffddd7da0aa096813b5e90e1c8851c6e61fc20 lib/unicore/mktables + * 609be4fb13d99bc8a003a01a6e6d07bfb65c6240900eafa220dd4761a4a9cbb2 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index f08945252dc5..68cbf235b48a 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7756,7 +7756,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 215df5b50b8b30403ceb597a33ffddd7da0aa096813b5e90e1c8851c6e61fc20 lib/unicore/mktables + * 
609be4fb13d99bc8a003a01a6e6d07bfb65c6240900eafa220dd4761a4a9cbb2 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl From 11ab4307e0d7cfb506e49049b8826c0a7d06950b Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 18 Mar 2025 13:30:06 -0600 Subject: [PATCH 4/8] mktables: Handle Unicode 16.0 DoNotEmit.txt We handle it by ignoring this file, new to Unicode 16.0. It consists of lists of characters that, to put it less delicately than Unicode would like, they regret creating. But there are no rules associated with them. It would be nice to have a \p{DoNotEmit} property so that applications could handle situations where this occurs. But I'm fearful that if we did something like this, that Unicode would later come up with something that had the same intention but would be subtly or unsubtly different. That has happened before, to our detriment. So I think we should wait to see what they do do, in future releases. 
--- charclass_invlists.inc | 2 +- lib/unicore/mktables | 6 ++++++ lib/unicore/uni_keywords.pl | 2 +- regcharclass.h | 2 +- regexp_constants.h | 2 +- uni_keywords.h | 2 +- 6 files changed, 11 insertions(+), 5 deletions(-) diff --git a/charclass_invlists.inc b/charclass_invlists.inc index f2311e8b08e2..d8242a72a2ef 100644 --- a/charclass_invlists.inc +++ b/charclass_invlists.inc @@ -436055,7 +436055,7 @@ static const U8 WB_table[23][23] = { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 609be4fb13d99bc8a003a01a6e6d07bfb65c6240900eafa220dd4761a4a9cbb2 lib/unicore/mktables + * d13b7019817a693e50c3b233b25efd73390334c226f700650628734388c34209 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index fffffbf02603..56c349a6dcb3 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -19809,6 +19809,12 @@ my @input_file_objects = ( Skip => $Unused_Skip, UCD => 0, ), + Input_file->new('DoNotEmit.txt', v16.0.0, + # Advice about characters that are unwise to create; not + # any properties, though we could create some. + Skip => $Unused_Skip, + UCD => 0, + ), ); # End of all the preliminaries. 
diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index 4a108a04c75a..92fddc24d66f 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1331,7 +1331,7 @@ # 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt # 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt # 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt -# 609be4fb13d99bc8a003a01a6e6d07bfb65c6240900eafa220dd4761a4a9cbb2 lib/unicore/mktables +# d13b7019817a693e50c3b233b25efd73390334c226f700650628734388c34209 lib/unicore/mktables # 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index 1e98903553ed..c594512e4b46 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -3850,7 +3850,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 609be4fb13d99bc8a003a01a6e6d07bfb65c6240900eafa220dd4761a4a9cbb2 lib/unicore/mktables + * d13b7019817a693e50c3b233b25efd73390334c226f700650628734388c34209 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 2a984ef37eb37e718ed25d472988745196816f9147d6ee6822a8efeafda340e5 regen/regcharclass.pl diff --git a/regexp_constants.h b/regexp_constants.h index 1edaa22c3f11..00e0e3c6e3c1 100644 --- a/regexp_constants.h +++ 
b/regexp_constants.h @@ -78,7 +78,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 609be4fb13d99bc8a003a01a6e6d07bfb65c6240900eafa220dd4761a4a9cbb2 lib/unicore/mktables + * d13b7019817a693e50c3b233b25efd73390334c226f700650628734388c34209 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index 68cbf235b48a..b51288b160f7 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7756,7 +7756,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 609be4fb13d99bc8a003a01a6e6d07bfb65c6240900eafa220dd4761a4a9cbb2 lib/unicore/mktables + * d13b7019817a693e50c3b233b25efd73390334c226f700650628734388c34209 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl From cef04591b1baac7cab2a357aecc5591c5e75ecd8 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 18 Mar 2025 13:37:43 -0600 Subject: [PATCH 5/8] mktables: Handle Unicode 16.0 Unikemet.txt file This 
includes several new properties, some of which are considered "provisional" by Unicode, which means they can be heavily revised or withdrawn. These properties are designed for use by scholars of hieroglyphics. --- charclass_invlists.inc | 2 +- lib/unicore/mktables | 38 +++++++++++++++++++++++++++++++++++++ lib/unicore/uni_keywords.pl | 2 +- regcharclass.h | 2 +- regexp_constants.h | 2 +- uni_keywords.h | 2 +- 6 files changed, 43 insertions(+), 5 deletions(-) diff --git a/charclass_invlists.inc b/charclass_invlists.inc index d8242a72a2ef..28b4b881625c 100644 --- a/charclass_invlists.inc +++ b/charclass_invlists.inc @@ -436055,7 +436055,7 @@ static const U8 WB_table[23][23] = { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * d13b7019817a693e50c3b233b25efd73390334c226f700650628734388c34209 lib/unicore/mktables + * 122e104ecbf3408f4d18a5f4e4b9ce12737f421d0b31581835459e38d9c93ca6 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 56c349a6dcb3..c2a5c4b75ce8 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -871,6 +871,15 @@ push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend' push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133' if $v_version ge v6.2.0; +# These properties of Egyptian hieroglyphs are not handled by Perl. 
Their +# intended audience is only specialist Egyptologists +push @tables_that_may_be_empty, qw(kEH_Cat kEH_Desc kEH_HG kEH_IFAO + kEH_JSesh + kEH_NoMirror kEH_NoMirror=Yes + kEH_NoMirror=No + kEH_NoRotate kEH_NoRotate=Yes) + if $v_version ge v16.0.0; + # The lists below are hashes, so the key is the item in the list, and the # value is the reason why it is in the list. This makes generation of # documentation easier. @@ -13377,6 +13386,30 @@ sub filter_early_version_name_alias_line { return; } +sub setup_Unikemet{ + + # These are provisional properties, so aren't in PropAliases.txt + my %properties = ( + Core => $ENUM, + FVal => $STRING, + Func => $STRING, + UniK => $ENUM, + ); + for my $property (keys %properties) { + Property->new("kEH_$property", + Default_Map => "", + Type => $STRING,, + ); + } +} + +sub filter_Unikemet_line { + $_ =~ s/;/,/g; # mktables can't accept semi-colons + $_ =~ s/\t/; /g; + $_ =~ s/ ^ U\+ //x; + return; +} + sub filter_all_caps_script_names { # Some early Unicode releases had the script names in all CAPS. This @@ -19809,6 +19842,11 @@ my @input_file_objects = ( Skip => $Unused_Skip, UCD => 0, ), + Input_file->new('Unikemet.txt', v16.0.0, + Pre_Handler => \&setup_Unikemet, + Each_Line_Handler => \&filter_Unikemet_line, + UCD => 0, + ), Input_file->new('DoNotEmit.txt', v16.0.0, # Advice about characters that are unwise to create; not # any properties, though we could create some. 
diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index 92fddc24d66f..4ef6a3b122a5 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1331,7 +1331,7 @@ # 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt # 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt # 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt -# d13b7019817a693e50c3b233b25efd73390334c226f700650628734388c34209 lib/unicore/mktables +# 122e104ecbf3408f4d18a5f4e4b9ce12737f421d0b31581835459e38d9c93ca6 lib/unicore/mktables # 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index c594512e4b46..7a180141830f 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -3850,7 +3850,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * d13b7019817a693e50c3b233b25efd73390334c226f700650628734388c34209 lib/unicore/mktables + * 122e104ecbf3408f4d18a5f4e4b9ce12737f421d0b31581835459e38d9c93ca6 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 2a984ef37eb37e718ed25d472988745196816f9147d6ee6822a8efeafda340e5 regen/regcharclass.pl diff --git a/regexp_constants.h b/regexp_constants.h index 00e0e3c6e3c1..1eefa51dc4e0 100644 --- a/regexp_constants.h +++ 
b/regexp_constants.h @@ -78,7 +78,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * d13b7019817a693e50c3b233b25efd73390334c226f700650628734388c34209 lib/unicore/mktables + * 122e104ecbf3408f4d18a5f4e4b9ce12737f421d0b31581835459e38d9c93ca6 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index b51288b160f7..e09a9d700e86 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7756,7 +7756,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * d13b7019817a693e50c3b233b25efd73390334c226f700650628734388c34209 lib/unicore/mktables + * 122e104ecbf3408f4d18a5f4e4b9ce12737f421d0b31581835459e38d9c93ca6 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl From a1b81941b950527c750fd84ac6ddcb07ac19b6b0 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 18 Mar 2025 13:50:26 -0600 Subject: [PATCH 6/8] mktables: Support new Unicode 16.0 properties 
ID_Compat_Math_foo These new properties are automatically handled, but there is a problem. They have no short form names. Files are written for them based on their names, and those files are not distinguishable on a DOS 8.3 file system. The solution here is to manually override the automatically generated file names with distinguishable ones. --- charclass_invlists.inc | 2 +- lib/unicore/mktables | 10 +++++++++- lib/unicore/uni_keywords.pl | 2 +- regcharclass.h | 2 +- regexp_constants.h | 2 +- uni_keywords.h | 2 +- 6 files changed, 14 insertions(+), 6 deletions(-) diff --git a/charclass_invlists.inc b/charclass_invlists.inc index 28b4b881625c..85dc88c2fef4 100644 --- a/charclass_invlists.inc +++ b/charclass_invlists.inc @@ -436055,7 +436055,7 @@ static const U8 WB_table[23][23] = { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 122e104ecbf3408f4d18a5f4e4b9ce12737f421d0b31581835459e38d9c93ca6 lib/unicore/mktables + * 5b296d0f4540ce1853589060d595799065c01361bcb5077f8e2cfabdefd18a61 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index c2a5c4b75ce8..5eefe2304555 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -8820,7 +8820,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } # filesystem to distinguish between, this is used to manually give short # names for the directory name immediately under $match_tables that the # match tables for this property should be placed in. 
- main::set_access('match_subdir', \%match_subdir, 'r'); + main::set_access('match_subdir', \%match_subdir, 'r', 's'); my %has_dependency; # A boolean that gives whether some table somewhere is defined as the @@ -10071,6 +10071,14 @@ sub finish_property_setup($file) { property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED); + # These two properties have no short names and the file names for them + # clash in DOS 8.3. Work around this by creating shorter file names that + # work + my $IDCMStart = property_ref("ID_Compat_Math_Start"); + $IDCMStart->set_match_subdir("IDCMStart") if defined $IDCMStart; + my $IDCMCont= property_ref("ID_Compat_Math_Continue"); + $IDCMCont->set_match_subdir("IDCMContinue") if defined $IDCMCont; + # The rest of this sub is for properties that need the Multi_Default class # to create objects for defaults. As of v15.0, this is no longer needed. diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index 4ef6a3b122a5..d63c0b051c81 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1331,7 +1331,7 @@ # 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt # 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt # 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt -# 122e104ecbf3408f4d18a5f4e4b9ce12737f421d0b31581835459e38d9c93ca6 lib/unicore/mktables +# 5b296d0f4540ce1853589060d595799065c01361bcb5077f8e2cfabdefd18a61 lib/unicore/mktables # 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index 7a180141830f..d8ffbb5072f4 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -3850,7 
+3850,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 122e104ecbf3408f4d18a5f4e4b9ce12737f421d0b31581835459e38d9c93ca6 lib/unicore/mktables + * 5b296d0f4540ce1853589060d595799065c01361bcb5077f8e2cfabdefd18a61 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 2a984ef37eb37e718ed25d472988745196816f9147d6ee6822a8efeafda340e5 regen/regcharclass.pl diff --git a/regexp_constants.h b/regexp_constants.h index 1eefa51dc4e0..3f0d4a50ad2d 100644 --- a/regexp_constants.h +++ b/regexp_constants.h @@ -78,7 +78,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 122e104ecbf3408f4d18a5f4e4b9ce12737f421d0b31581835459e38d9c93ca6 lib/unicore/mktables + * 5b296d0f4540ce1853589060d595799065c01361bcb5077f8e2cfabdefd18a61 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index e09a9d700e86..d35d499e520b 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7756,7 +7756,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 
lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 122e104ecbf3408f4d18a5f4e4b9ce12737f421d0b31581835459e38d9c93ca6 lib/unicore/mktables + * 5b296d0f4540ce1853589060d595799065c01361bcb5077f8e2cfabdefd18a61 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl From a19886d2dec04f53dd28c2bc6f7c0d2d48cdb55a Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 18 Mar 2025 14:20:44 -0600 Subject: [PATCH 7/8] mktables: Handle Unicode 16.0 new \d ranges mktables does a lot of sanity checks on the data it gets fed. One of those is to make sure any \d group of code points is 10 long. This verifies that Unicode has given us enough code points to form 0-9. It assumes that if it got this much right, that their numeric values are also 0-9. This check has uncovered issues with the Unicode Standard in the past. Nowadays, they've cleaned up their act, and it's been many releases since there have been problems. But our checks remain, and I think they should. What happened in Unicode 16.0 is that a range of \d characters was added that contains two consecutive groups of 0-9 values. The check could be changed to verify that the count is divisible by 10, but checking for this particular range is a bit safer.
--- charclass_invlists.inc | 2 +- lib/unicore/mktables | 4 ++++ lib/unicore/uni_keywords.pl | 2 +- regcharclass.h | 2 +- regexp_constants.h | 2 +- uni_keywords.h | 2 +- 6 files changed, 9 insertions(+), 5 deletions(-) diff --git a/charclass_invlists.inc b/charclass_invlists.inc index 85dc88c2fef4..56c9eb714509 100644 --- a/charclass_invlists.inc +++ b/charclass_invlists.inc @@ -436055,7 +436055,7 @@ static const U8 WB_table[23][23] = { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 5b296d0f4540ce1853589060d595799065c01361bcb5077f8e2cfabdefd18a61 lib/unicore/mktables + * c1557a0885bf627ece862b3a80ee1bd24449b656e01159d4c6753c3a1ed54335 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 5eefe2304555..7e31673ab6e1 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -13779,6 +13779,10 @@ END next if $range->start == 0x1D7CE; # This whole range was added in 3.1 next if $range->end == 0x19DA && $v_version eq v5.2.0; next if $range->end - $range->start < 9 && $v_version le 4.0.0; + + # 2 sequential series of 10 each were added in 16.0 + next if $range->start == 0x116D0 && $range->end == 0x116E3; + Carp::my_carp("Range $range unexpectedly doesn't contain 10" . " decimal digits. Code in regcomp.c assumes it does," . " and will have to be fixed. 
Proceeding anyway."); diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index d63c0b051c81..8782eb913183 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1331,7 +1331,7 @@ # 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt # 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt # 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt -# 5b296d0f4540ce1853589060d595799065c01361bcb5077f8e2cfabdefd18a61 lib/unicore/mktables +# c1557a0885bf627ece862b3a80ee1bd24449b656e01159d4c6753c3a1ed54335 lib/unicore/mktables # 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index d8ffbb5072f4..35c8f670b874 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -3850,7 +3850,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 5b296d0f4540ce1853589060d595799065c01361bcb5077f8e2cfabdefd18a61 lib/unicore/mktables + * c1557a0885bf627ece862b3a80ee1bd24449b656e01159d4c6753c3a1ed54335 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 2a984ef37eb37e718ed25d472988745196816f9147d6ee6822a8efeafda340e5 regen/regcharclass.pl diff --git a/regexp_constants.h b/regexp_constants.h index 3f0d4a50ad2d..1d4f27ed6999 100644 --- 
a/regexp_constants.h +++ b/regexp_constants.h @@ -78,7 +78,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 5b296d0f4540ce1853589060d595799065c01361bcb5077f8e2cfabdefd18a61 lib/unicore/mktables + * c1557a0885bf627ece862b3a80ee1bd24449b656e01159d4c6753c3a1ed54335 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index d35d499e520b..835b260cf5d5 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7756,7 +7756,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * 5b296d0f4540ce1853589060d595799065c01361bcb5077f8e2cfabdefd18a61 lib/unicore/mktables + * c1557a0885bf627ece862b3a80ee1bd24449b656e01159d4c6753c3a1ed54335 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl From 32ee519c8e46cb54c45ab4f8924a09557be26ef8 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 2 Apr 2025 10:35:58 -0600 Subject: [PATCH 8/8] mktables: Add count() method to 
Range class There is already this method for lists of Ranges, so this is just so callers don't need to know which they are operating on. --- charclass_invlists.inc | 2 +- lib/unicore/mktables | 9 +++++++-- lib/unicore/uni_keywords.pl | 2 +- regcharclass.h | 2 +- regexp_constants.h | 2 +- uni_keywords.h | 2 +- 6 files changed, 12 insertions(+), 7 deletions(-) diff --git a/charclass_invlists.inc b/charclass_invlists.inc index 56c9eb714509..2e66955eb114 100644 --- a/charclass_invlists.inc +++ b/charclass_invlists.inc @@ -436055,7 +436055,7 @@ static const U8 WB_table[23][23] = { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * c1557a0885bf627ece862b3a80ee1bd24449b656e01159d4c6753c3a1ed54335 lib/unicore/mktables + * 4b2ad6e7689bea5acec1b52fa813a60fdac125a5cc6901cc02be3093b1697894 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 7e31673ab6e1..a2f20a0bef5f 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -3627,6 +3627,11 @@ sub trace { return main::trace(@_); } return $standard_form{$addr} = main::standardize($value); } + sub count($self) { + my $addr = pack 'J', refaddr $self; + return $end{$addr} - $start{$addr} + 1; + } + sub dump($self, $indent) { # Human, not machine readable. For machine readable, comment out this # entire routine and let the standard one take effect.
@@ -4811,7 +4816,7 @@ sub trace { return main::trace(@_); } my $count = 0; foreach my $range (@{$ranges{$addr}}) { - $count += $range->end - $range->start + 1; + $count += $range->count; } return $count; } @@ -6028,7 +6033,7 @@ END # points" my $count = ($set->type != 0) ? 1 - : $set->end - $set->start + 1; + : $set->count; $widths{$this_width} += $count; $total += $count; $max_map_width = $this_width diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index 8782eb913183..749a15729956 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1331,7 +1331,7 @@ # 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt # 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt # 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt -# c1557a0885bf627ece862b3a80ee1bd24449b656e01159d4c6753c3a1ed54335 lib/unicore/mktables +# 4b2ad6e7689bea5acec1b52fa813a60fdac125a5cc6901cc02be3093b1697894 lib/unicore/mktables # 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version # 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl # c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index 35c8f670b874..d43c6d2b27dd 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -3850,7 +3850,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * c1557a0885bf627ece862b3a80ee1bd24449b656e01159d4c6753c3a1ed54335 lib/unicore/mktables + * 4b2ad6e7689bea5acec1b52fa813a60fdac125a5cc6901cc02be3093b1697894 
lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * 2a984ef37eb37e718ed25d472988745196816f9147d6ee6822a8efeafda340e5 regen/regcharclass.pl diff --git a/regexp_constants.h b/regexp_constants.h index 1d4f27ed6999..5875a9f3d091 100644 --- a/regexp_constants.h +++ b/regexp_constants.h @@ -78,7 +78,7 @@ * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * c1557a0885bf627ece862b3a80ee1bd24449b656e01159d4c6753c3a1ed54335 lib/unicore/mktables + * 4b2ad6e7689bea5acec1b52fa813a60fdac125a5cc6901cc02be3093b1697894 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index 835b260cf5d5..07fcd5d09023 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7756,7 +7756,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) { * 3f4f32ed2a577344a508114527e721d7a8b633d32f38945d47fe0c743650c585 lib/unicore/extracted/DLineBreak.txt * 710abf2d581ac9c57f244c0834f9d9969d9781e0396adccd330eaae658ac7d6b lib/unicore/extracted/DNumType.txt * 6bd30f385f3baf3ab5d5308c111a81de87bea5f494ba0ba69e8ab45263b8c34d lib/unicore/extracted/DNumValues.txt - * c1557a0885bf627ece862b3a80ee1bd24449b656e01159d4c6753c3a1ed54335 lib/unicore/mktables + * 4b2ad6e7689bea5acec1b52fa813a60fdac125a5cc6901cc02be3093b1697894 lib/unicore/mktables * 55d90fdc3f902e5c0b16b3378f9eaa36e970a1c09723c33de7d47d0370044012 
lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl * c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl