Skip to content

Commit 1402478

Browse files
authored
Improve unicode table handling (bellard#286)
- Document table and index formats
- Add size statistics
- Fix UBSAN issue in `get_le24()`

Fixes bellard#285
1 parent 3b45d15 commit 1402478

File tree

3 files changed

+234
-79
lines changed

3 files changed

+234
-79
lines changed

libunicode-table.h

+116-45
Original file line numberDiff line numberDiff line change
@@ -189,9 +189,13 @@ static const uint8_t unicode_prop_Cased1_table[196] = {
189189
};
190190

191191
static const uint8_t unicode_prop_Cased1_index[21] = {
192-
0xb9, 0x02, 0xe0, 0xc0, 0x1d, 0x20, 0xe5, 0x2c,
193-
0x20, 0xb1, 0x07, 0x21, 0xc1, 0xd6, 0x21, 0x4a,
194-
0xf1, 0x01, 0x8a, 0xf1, 0x01,
192+
0xb9, 0x02, 0xe0, // 002B9 at 39
193+
0xc0, 0x1d, 0x20, // 01DC0 at 65
194+
0xe5, 0x2c, 0x20, // 02CE5 at 97
195+
0xb1, 0x07, 0x21, // 107B1 at 129
196+
0xc1, 0xd6, 0x21, // 1D6C1 at 161
197+
0x4a, 0xf1, 0x01, // 1F14A at 192
198+
0x8a, 0xf1, 0x01, // 1F18A at 224 (upper bound)
195199
};
196200

197201
static const uint8_t unicode_prop_Case_Ignorable_table[737] = {
@@ -291,15 +295,29 @@ static const uint8_t unicode_prop_Case_Ignorable_table[737] = {
291295
};
292296

293297
static const uint8_t unicode_prop_Case_Ignorable_index[69] = {
294-
0xbe, 0x05, 0x00, 0xfe, 0x07, 0x00, 0x52, 0x0a,
295-
0xa0, 0xc1, 0x0b, 0x00, 0x82, 0x0d, 0x00, 0x3f,
296-
0x10, 0x80, 0xd4, 0x17, 0x40, 0xcf, 0x1a, 0x20,
297-
0xf5, 0x1c, 0x00, 0x80, 0x20, 0x00, 0x16, 0xa0,
298-
0x00, 0xc6, 0xa8, 0x00, 0xc2, 0xaa, 0x60, 0x56,
299-
0xfe, 0x20, 0xb1, 0x07, 0x01, 0x75, 0x10, 0x01,
300-
0xeb, 0x12, 0x21, 0x41, 0x16, 0x01, 0x5c, 0x1a,
301-
0x01, 0x43, 0x1f, 0x01, 0x2e, 0xcf, 0x41, 0x25,
302-
0xe0, 0x01, 0xf0, 0x01, 0x0e,
298+
0xbe, 0x05, 0x00, // 005BE at 32
299+
0xfe, 0x07, 0x00, // 007FE at 64
300+
0x52, 0x0a, 0xa0, // 00A52 at 101
301+
0xc1, 0x0b, 0x00, // 00BC1 at 128
302+
0x82, 0x0d, 0x00, // 00D82 at 160
303+
0x3f, 0x10, 0x80, // 0103F at 196
304+
0xd4, 0x17, 0x40, // 017D4 at 226
305+
0xcf, 0x1a, 0x20, // 01ACF at 257
306+
0xf5, 0x1c, 0x00, // 01CF5 at 288
307+
0x80, 0x20, 0x00, // 02080 at 320
308+
0x16, 0xa0, 0x00, // 0A016 at 352
309+
0xc6, 0xa8, 0x00, // 0A8C6 at 384
310+
0xc2, 0xaa, 0x60, // 0AAC2 at 419
311+
0x56, 0xfe, 0x20, // 0FE56 at 449
312+
0xb1, 0x07, 0x01, // 107B1 at 480
313+
0x75, 0x10, 0x01, // 11075 at 512
314+
0xeb, 0x12, 0x21, // 112EB at 545
315+
0x41, 0x16, 0x01, // 11641 at 576
316+
0x5c, 0x1a, 0x01, // 11A5C at 608
317+
0x43, 0x1f, 0x01, // 11F43 at 640
318+
0x2e, 0xcf, 0x41, // 1CF2E at 674
319+
0x25, 0xe0, 0x01, // 1E025 at 704
320+
0xf0, 0x01, 0x0e, // E01F0 at 736 (upper bound)
303321
};
304322

305323
static const uint8_t unicode_prop_ID_Start_table[1100] = {
@@ -444,20 +462,41 @@ static const uint8_t unicode_prop_ID_Start_table[1100] = {
444462
};
445463

446464
static const uint8_t unicode_prop_ID_Start_index[105] = {
447-
0xf6, 0x03, 0x20, 0xa6, 0x07, 0x00, 0xa9, 0x09,
448-
0x20, 0xb1, 0x0a, 0x00, 0xba, 0x0b, 0x20, 0x3b,
449-
0x0d, 0x20, 0xc7, 0x0e, 0x20, 0x49, 0x12, 0x00,
450-
0x9b, 0x16, 0x00, 0xac, 0x19, 0x00, 0xc0, 0x1d,
451-
0x80, 0x80, 0x20, 0x20, 0x70, 0x2d, 0x00, 0x00,
452-
0x32, 0x00, 0xda, 0xa7, 0x00, 0x4c, 0xaa, 0x20,
453-
0xc7, 0xd7, 0x20, 0xfc, 0xfd, 0x20, 0x9d, 0x02,
454-
0x21, 0x96, 0x05, 0x01, 0xf3, 0x08, 0x01, 0xb3,
455-
0x0c, 0x21, 0x73, 0x11, 0x61, 0x34, 0x13, 0x01,
456-
0x1b, 0x17, 0x21, 0x8a, 0x1a, 0x01, 0x34, 0x1f,
457-
0x21, 0xbf, 0x6a, 0x01, 0x23, 0xb1, 0xa1, 0xad,
458-
0xd4, 0x01, 0x6f, 0xd7, 0x01, 0xff, 0xe7, 0x61,
459-
0x5e, 0xee, 0x01, 0xe1, 0xeb, 0x22, 0xb0, 0x23,
460-
0x03,
465+
0xf6, 0x03, 0x20, // 003F6 at 33
466+
0xa6, 0x07, 0x00, // 007A6 at 64
467+
0xa9, 0x09, 0x20, // 009A9 at 97
468+
0xb1, 0x0a, 0x00, // 00AB1 at 128
469+
0xba, 0x0b, 0x20, // 00BBA at 161
470+
0x3b, 0x0d, 0x20, // 00D3B at 193
471+
0xc7, 0x0e, 0x20, // 00EC7 at 225
472+
0x49, 0x12, 0x00, // 01249 at 256
473+
0x9b, 0x16, 0x00, // 0169B at 288
474+
0xac, 0x19, 0x00, // 019AC at 320
475+
0xc0, 0x1d, 0x80, // 01DC0 at 356
476+
0x80, 0x20, 0x20, // 02080 at 385
477+
0x70, 0x2d, 0x00, // 02D70 at 416
478+
0x00, 0x32, 0x00, // 03200 at 448
479+
0xda, 0xa7, 0x00, // 0A7DA at 480
480+
0x4c, 0xaa, 0x20, // 0AA4C at 513
481+
0xc7, 0xd7, 0x20, // 0D7C7 at 545
482+
0xfc, 0xfd, 0x20, // 0FDFC at 577
483+
0x9d, 0x02, 0x21, // 1029D at 609
484+
0x96, 0x05, 0x01, // 10596 at 640
485+
0xf3, 0x08, 0x01, // 108F3 at 672
486+
0xb3, 0x0c, 0x21, // 10CB3 at 705
487+
0x73, 0x11, 0x61, // 11173 at 739
488+
0x34, 0x13, 0x01, // 11334 at 768
489+
0x1b, 0x17, 0x21, // 1171B at 801
490+
0x8a, 0x1a, 0x01, // 11A8A at 832
491+
0x34, 0x1f, 0x21, // 11F34 at 865
492+
0xbf, 0x6a, 0x01, // 16ABF at 896
493+
0x23, 0xb1, 0xa1, // 1B123 at 933
494+
0xad, 0xd4, 0x01, // 1D4AD at 960
495+
0x6f, 0xd7, 0x01, // 1D76F at 992
496+
0xff, 0xe7, 0x61, // 1E7FF at 1027
497+
0x5e, 0xee, 0x01, // 1EE5E at 1056
498+
0xe1, 0xeb, 0x22, // 2EBE1 at 1089
499+
0xb0, 0x23, 0x03, // 323B0 at 1120 (upper bound)
461500
};
462501

463502
static const uint8_t unicode_prop_ID_Continue1_table[660] = {
@@ -547,14 +586,27 @@ static const uint8_t unicode_prop_ID_Continue1_table[660] = {
547586
};
548587

549588
static const uint8_t unicode_prop_ID_Continue1_index[63] = {
550-
0xfa, 0x06, 0x00, 0x70, 0x09, 0x00, 0xf0, 0x0a,
551-
0x40, 0x57, 0x0c, 0x00, 0xf0, 0x0d, 0x60, 0xc7,
552-
0x0f, 0x20, 0xea, 0x17, 0x40, 0x05, 0x1b, 0x00,
553-
0x41, 0x20, 0x00, 0x0c, 0xa8, 0x80, 0x37, 0xaa,
554-
0x20, 0x50, 0xfe, 0x20, 0x3a, 0x0d, 0x21, 0x74,
555-
0x11, 0x01, 0x5a, 0x14, 0x21, 0x44, 0x19, 0x81,
556-
0x5a, 0x1d, 0xa1, 0xf5, 0x6a, 0x21, 0x45, 0xd2,
557-
0x41, 0xaf, 0xe2, 0x21, 0xf0, 0x01, 0x0e,
589+
0xfa, 0x06, 0x00, // 006FA at 32
590+
0x70, 0x09, 0x00, // 00970 at 64
591+
0xf0, 0x0a, 0x40, // 00AF0 at 98
592+
0x57, 0x0c, 0x00, // 00C57 at 128
593+
0xf0, 0x0d, 0x60, // 00DF0 at 163
594+
0xc7, 0x0f, 0x20, // 00FC7 at 193
595+
0xea, 0x17, 0x40, // 017EA at 226
596+
0x05, 0x1b, 0x00, // 01B05 at 256
597+
0x41, 0x20, 0x00, // 02041 at 288
598+
0x0c, 0xa8, 0x80, // 0A80C at 324
599+
0x37, 0xaa, 0x20, // 0AA37 at 353
600+
0x50, 0xfe, 0x20, // 0FE50 at 385
601+
0x3a, 0x0d, 0x21, // 10D3A at 417
602+
0x74, 0x11, 0x01, // 11174 at 448
603+
0x5a, 0x14, 0x21, // 1145A at 481
604+
0x44, 0x19, 0x81, // 11944 at 516
605+
0x5a, 0x1d, 0xa1, // 11D5A at 549
606+
0xf5, 0x6a, 0x21, // 16AF5 at 577
607+
0x45, 0xd2, 0x41, // 1D245 at 610
608+
0xaf, 0xe2, 0x21, // 1E2AF at 641
609+
0xf0, 0x01, 0x0e, // E01F0 at 672 (upper bound)
558610
};
559611

560612
#ifdef CONFIG_ALL_UNICODE
@@ -676,17 +728,35 @@ static const uint8_t unicode_cc_table[899] = {
676728
};
677729

678730
static const uint8_t unicode_cc_index[87] = {
679-
0x4d, 0x03, 0x00, 0x97, 0x05, 0x20, 0xc6, 0x05,
680-
0x00, 0xe7, 0x06, 0x00, 0x45, 0x07, 0x00, 0x9c,
681-
0x08, 0x00, 0x4d, 0x09, 0x00, 0x3c, 0x0b, 0x00,
682-
0x3d, 0x0d, 0x00, 0x36, 0x0f, 0x00, 0x38, 0x10,
683-
0x20, 0x3a, 0x19, 0x00, 0xcb, 0x1a, 0x20, 0xd3,
684-
0x1c, 0x00, 0xcf, 0x1d, 0x00, 0xe2, 0x20, 0x00,
685-
0x2e, 0x30, 0x20, 0x2b, 0xa9, 0x20, 0xed, 0xab,
686-
0x00, 0x39, 0x0a, 0x01, 0x51, 0x0f, 0x01, 0x73,
687-
0x11, 0x01, 0x75, 0x13, 0x01, 0x2b, 0x17, 0x21,
688-
0x3f, 0x1c, 0x21, 0x9e, 0xbc, 0x21, 0x08, 0xe0,
689-
0x01, 0x44, 0xe9, 0x01, 0x4b, 0xe9, 0x01,
731+
0x4d, 0x03, 0x00, // 0034D at 32
732+
0x97, 0x05, 0x20, // 00597 at 65
733+
0xc6, 0x05, 0x00, // 005C6 at 96
734+
0xe7, 0x06, 0x00, // 006E7 at 128
735+
0x45, 0x07, 0x00, // 00745 at 160
736+
0x9c, 0x08, 0x00, // 0089C at 192
737+
0x4d, 0x09, 0x00, // 0094D at 224
738+
0x3c, 0x0b, 0x00, // 00B3C at 256
739+
0x3d, 0x0d, 0x00, // 00D3D at 288
740+
0x36, 0x0f, 0x00, // 00F36 at 320
741+
0x38, 0x10, 0x20, // 01038 at 353
742+
0x3a, 0x19, 0x00, // 0193A at 384
743+
0xcb, 0x1a, 0x20, // 01ACB at 417
744+
0xd3, 0x1c, 0x00, // 01CD3 at 448
745+
0xcf, 0x1d, 0x00, // 01DCF at 480
746+
0xe2, 0x20, 0x00, // 020E2 at 512
747+
0x2e, 0x30, 0x20, // 0302E at 545
748+
0x2b, 0xa9, 0x20, // 0A92B at 577
749+
0xed, 0xab, 0x00, // 0ABED at 608
750+
0x39, 0x0a, 0x01, // 10A39 at 640
751+
0x51, 0x0f, 0x01, // 10F51 at 672
752+
0x73, 0x11, 0x01, // 11173 at 704
753+
0x75, 0x13, 0x01, // 11375 at 736
754+
0x2b, 0x17, 0x21, // 1172B at 769
755+
0x3f, 0x1c, 0x21, // 11C3F at 801
756+
0x9e, 0xbc, 0x21, // 1BC9E at 833
757+
0x08, 0xe0, 0x01, // 1E008 at 864
758+
0x44, 0xe9, 0x01, // 1E944 at 896
759+
0x4b, 0xe9, 0x01, // 1E94B at 928 (upper bound)
690760
};
691761

692762
static const uint32_t unicode_decomp_table1[699] = {
@@ -4484,3 +4554,4 @@ static const uint16_t unicode_prop_len_table[] = {
44844554
};
44854555

44864556
#endif /* CONFIG_ALL_UNICODE */
4557+
/* 62 tables / 32261 bytes, 5 index / 345 bytes */

libunicode.c

+32-4
Original file line numberDiff line numberDiff line change
@@ -262,11 +262,7 @@ int lre_canonicalize(uint32_t c, BOOL is_unicode)
262262

263263
static uint32_t get_le24(const uint8_t *ptr)
264264
{
265-
#if defined(__x86__) || defined(__x86_64__)
266-
return *(uint16_t *)ptr | (ptr[2] << 16);
267-
#else
268265
return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16);
269-
#endif
270266
}
271267

272268
#define UNICODE_INDEX_BLOCK_LEN 32
@@ -317,6 +313,14 @@ static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
317313
return FALSE; /* outside the table */
318314
p = table + pos;
319315
bit = 0;
316+
/* Compressed run length encoding:
317+
00..3F: 2 packed lengths: 3-bit + 3-bit
318+
40..5F: 5-bits plus extra byte for length
319+
60..7F: 5-bits plus 2 extra bytes for length
320+
80..FF: 7-bit length
321+
lengths must be incremented to get character count
322+
Ranges alternate between false and true return value.
323+
*/
320324
for(;;) {
321325
b = *p++;
322326
if (b < 64) {
@@ -833,6 +837,13 @@ static int unicode_get_cc(uint32_t c)
833837
if (pos < 0)
834838
return 0;
835839
p = unicode_cc_table + pos;
840+
/* Compressed run length encoding:
841+
- 2 high order bits are combining class type
842+
- 0:0, 1:230, 2:extra byte linear progression, 3:extra byte
843+
- 00..2F: range length (add 1)
844+
- 30..37: 3-bit range-length + 1 extra byte
845+
- 38..3F: 3-bit range-length + 2 extra bytes
846+
*/
836847
for(;;) {
837848
b = *p++;
838849
type = b >> 6;
@@ -1185,6 +1196,15 @@ static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
11851196
p = unicode_gc_table;
11861197
p_end = unicode_gc_table + countof(unicode_gc_table);
11871198
c = 0;
1199+
/* Compressed range encoding:
1200+
initial byte:
1201+
bits 0..4: category number (special case 31)
1202+
bits 5..7: range length (add 1)
1203+
special case bits 5..7 == 7: read an extra byte
1204+
- 00..7F: range length (add 7 + 1)
1205+
- 80..BF: 6-bits plus extra byte for range length (add 7 + 128)
1206+
- C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 + 16384)
1207+
*/
11881208
while (p < p_end) {
11891209
b = *p++;
11901210
n = b >> 5;
@@ -1238,6 +1258,14 @@ static int unicode_prop1(CharRange *cr, int prop_idx)
12381258
p_end = p + unicode_prop_len_table[prop_idx];
12391259
c = 0;
12401260
bit = 0;
1261+
/* Compressed range encoding:
1262+
00..3F: 2 packed lengths: 3-bit + 3-bit
1263+
40..5F: 5-bits plus extra byte for length
1264+
60..7F: 5-bits plus 2 extra bytes for length
1265+
80..FF: 7-bit length
1266+
lengths must be incremented to get character count
1267+
Ranges alternate between false and true return value.
1268+
*/
12411269
while (p < p_end) {
12421270
c0 = c;
12431271
b = *p++;

0 commit comments

Comments
 (0)