Skip to content

Commit 0e89fc9

Browse files
committed
properties: add "ambiwidth" property for ambiguous East Asian Width
Some characters have their width defined as "Ambiguous" in UAX#11. These are typically rendered as single-width by modern monospace fonts, and utf8proc correctly returns charwidth==1 for these. However, some applications might need to support older CJK fonts, where characters that were two-byte in legacy encodings were rendered as double-width. An example of this is the 'ambiwidth' option of vim and neovim, which supports rendering in terminals using such width rules. Add an 'ambiwidth' property to utf8proc_property_t for such characters.
1 parent 5568eff commit 0e89fc9

File tree

4 files changed

+12915
-12862
lines changed

4 files changed

+12915
-12862
lines changed

data/data_generator.jl

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,7 @@ function read_east_asian_widths(filename)
190190
for (rng,widthcode) in read_hex_ranges(filename)
191191
w = widthcode == "W" || widthcode == "F" ? 2 : # wide or full
192192
widthcode == "Na"|| widthcode == "H" ? 1 : # narrow or half-width
193+
widthcode == "A" ? -1 : # ambiguous width
193194
nothing
194195
if !isnothing(w)
195196
set_all!(ea_widths, rng, w)
@@ -221,7 +222,7 @@ let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
221222
# Widths from UAX #11: East Asian Width
222223
eaw = get(ea_widths, code, nothing)
223224
if !isnothing(eaw)
224-
width = eaw
225+
width = eaw < 0 ? 1 : eaw
225226
end
226227

227228
# A few exceptional cases, found by manual comparison to other wcwidth
@@ -242,6 +243,9 @@ let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
242243

243244
return width
244245
end
246+
global function is_ambiwidth(code)
247+
return get(ea_widths, code, 0) < 0
248+
end
245249
end
246250

247251
#-------------------------------------------------------------------------------
@@ -394,6 +398,7 @@ function char_table_properties!(sequences, char)
394398
control_boundary = char.category in ("Zl", "Zp", "Cc", "Cf") &&
395399
!(char.code in (0x200C, 0x200D)),
396400
charwidth = derive_char_width(code, char.category),
401+
ambiwidth = is_ambiwidth(code),
397402
boundclass = get_grapheme_boundclass(code),
398403
indic_conjunct_break = get_indic_conjunct_break(code),
399404
)
@@ -479,7 +484,7 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
479484

480485
print(io, """
481486
static const utf8proc_property_t utf8proc_properties[] = {
482-
{0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
487+
{0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
483488
""")
484489
for prop in deduplicated_props
485490
print(io, " {",
@@ -498,6 +503,7 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
498503
prop.ignorable, ", ",
499504
prop.control_boundary, ", ",
500505
prop.charwidth, ", ",
506+
prop.ambiwidth, ", ",
501507
"0, ", # bitfield padding
502508
c_enum_name("BOUNDCLASS", prop.boundclass), ", ",
503509
c_enum_name("INDIC_CONJUNCT_BREAK", prop.indic_conjunct_break),

test/charwidth.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ static int my_isprint(int c) {
1414
(cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO);
1515
}
1616

17+
static bool is_ambiwidth(int c) {
18+
return utf8proc_get_property(c)->ambiwidth;
19+
}
20+
1721
int main(int argc, char **argv)
1822
{
1923
int c, error = 0, updates = 0;
@@ -25,6 +29,7 @@ int main(int argc, char **argv)
2529
for (c = 0; c <= 0x110000; ++c) {
2630
int cat = utf8proc_get_property(c)->category;
2731
int w = utf8proc_charwidth(c);
32+
int ambiwidth = is_ambiwidth(c);
2833
if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) && w > 0) {
2934
fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
3035
error += 1;
@@ -42,6 +47,10 @@ int main(int argc, char **argv)
4247
isprint(c) ? "printable" : "non-printable", c);
4348
error += 1;
4449
}
50+
if (c <= 127 && is_ambiwidth(c)) {
51+
fprintf(stderr, "ambiwith set for ASCII %x\n", c);
52+
error += 1;
53+
}
4554
if (!my_isprint(c) && w > 0) {
4655
fprintf(stderr, "non-printing %x had width %d\n", c, w);
4756
error += 1;
@@ -50,11 +59,20 @@ int main(int argc, char **argv)
5059
fprintf(stderr, "unexpected width %d for unassigned char %x\n", w, c);
5160
error += 1;
5261
}
62+
if (ambiwidth && w >= 2) {
63+
fprintf(stderr, "char %x is both doublewidth and ambiwidth\n", c);
64+
error += 1;
65+
}
5366
}
5467
check(!error, "utf8proc_charwidth FAILED %d tests.", error);
5568

5669
check(utf8proc_charwidth(0x00ad) == 1, "incorrect width for U+00AD (soft hyphen)");
70+
check(is_ambiwidth(0x00ad) , "incorrect ambiwidth for U+00AD (soft hyphen)");
5771
check(utf8proc_charwidth(0xe000) == 1, "incorrect width for U+e000 (PUA)");
72+
check(is_ambiwidth(0xe000), "incorrect ambiwidth for U+e000 (PUA)");
73+
74+
check(is_ambiwidth(0x00A1), "incorrect ambiwidth for U+00A1 (inverted exclamation mark)");
75+
check(!is_ambiwidth(0x00A2), "incorrect ambiwidth for U+00A2 (cent sign)");
5876

5977
/* print some other information by comparing with system wcwidth */
6078
printf("Mismatches with system wcwidth (not necessarily errors):\n");

utf8proc.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,9 @@ typedef struct utf8proc_property_struct {
268268
unsigned control_boundary:1;
269269
/** The width of the codepoint. */
270270
unsigned charwidth:2;
271-
unsigned pad:2;
271+
/** Set if the codepoint has East Asian Width class A ("Ambiguous") in UAX #11. */
272+
unsigned ambiwidth:1;
273+
unsigned pad:1;
272274
/**
273275
* Boundclass.
274276
* @see utf8proc_boundclass_t.

0 commit comments

Comments
 (0)