From 5d64115bbc594ec3e29862b1d225c5300df9606d Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 6 Aug 2024 11:37:36 -0400 Subject: [PATCH 1/8] optimized textwidth(::Char) for ASCII --- base/strings/unicode.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index 42a4106d0f52f..39a73b8db415e 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -240,6 +240,9 @@ end ############################################################################ +# precomputed table of ASCII text widths +const _textwidth_ascii = [ccall(:utf8proc_charwidth, Cint, (UInt32,), c) for c = 0:127] + ## character column width function ## """ textwidth(c) @@ -256,6 +259,8 @@ julia> textwidth('⛵') ``` """ function textwidth(c::AbstractChar) + b = bswap(reinterpret(UInt32, c)) # from isascii(c) + b < 0x80 && @inbounds return Int(_textwidth_ascii[b%Int+1]) # ASCII fast path ismalformed(c) && return 1 Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c)) end From 16b269ff97ec4f39f09fbf7cdd8a3a5c086f7aae Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 6 Aug 2024 11:47:15 -0400 Subject: [PATCH 2/8] fix AbstractChar vs Char --- base/strings/unicode.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index 39a73b8db415e..65fa704ce1cb8 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -259,6 +259,13 @@ julia> textwidth('⛵') ``` """ function textwidth(c::AbstractChar) + ismalformed(c) && return 1 + i = codepoint(c) + i < 0x80 && @inbounds return Int(_textwidth_ascii[i%Int+1]) # ASCII fast path + Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), i)) +end + +function textwidth(c::Char) b = bswap(reinterpret(UInt32, c)) # from isascii(c) b < 0x80 && @inbounds return Int(_textwidth_ascii[b%Int+1]) # ASCII fast path ismalformed(c) && return 1 From 68d372c8e062817294a310ef21f04589adb0906a Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 6 Aug 2024 11:56:08 -0400 Subject: [PATCH 3/8] more textwidth tests --- test/strings/util.jl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/strings/util.jl b/test/strings/util.jl index 59638dc3b9ca6..35f2e384c6205 100644 --- a/test/strings/util.jl +++ b/test/strings/util.jl @@ -2,6 +2,14 @@ SubStr(s) = SubString("abc$(s)de", firstindex(s) + 3, lastindex(s) + 3) +@testset "textwidth" begin + for (c, w) in [('x', 1), ('α', 1), ('🍕', 2), ('\0', 0), ('\u0302', 0), ('\xc0', 1)] + @test textwidth(c) == w + @test textwidth(c^3) == w*3 + @test w == @invoke textwidth(c::AbstractChar) + end +end + @testset "padding (lpad and rpad)" begin @test lpad("foo", 2) == "foo" @test rpad("foo", 2) == "foo" From 516bfd4e476fbfd30aed814f5df93de8f46f4675 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 6 Aug 2024 12:01:00 -0400 Subject: [PATCH 4/8] tuple table is faster --- base/strings/unicode.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index 65fa704ce1cb8..2d619076f7f13 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -241,7 +241,7 @@ end ############################################################################ # precomputed table of ASCII text widths -const _textwidth_ascii = [ccall(:utf8proc_charwidth, Cint, (UInt32,), c) for c = 0:127] +const _textwidth_ascii = ntuple(i -> ccall(:utf8proc_charwidth, Cint, (UInt32,), i-1), 128) ## character column width function ## """ From c67ebbe417e3aebc9d87ebb01fd50c50487ad0fa Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 6 Aug 2024 12:11:26 -0400 Subject: [PATCH 5/8] replace lookup table with check --- base/strings/unicode.jl | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index 2d619076f7f13..a36a668a3dee1 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -240,9 +240,6 @@ end ############################################################################ -# precomputed table of ASCII text widths -const _textwidth_ascii = ntuple(i -> ccall(:utf8proc_charwidth, Cint, (UInt32,), i-1), 128) - ## character column width function ## """ textwidth(c) @@ -261,13 +258,13 @@ julia> textwidth('⛵') function textwidth(c::AbstractChar) ismalformed(c) && return 1 i = codepoint(c) - i < 0x80 && @inbounds return Int(_textwidth_ascii[i%Int+1]) # ASCII fast path + i < 0x7f && return Int(i >= 0x20); # ASCII fast path Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), i)) end function textwidth(c::Char) b = bswap(reinterpret(UInt32, c)) # from isascii(c) - b < 0x80 && @inbounds return Int(_textwidth_ascii[b%Int+1]) # ASCII fast path + b < 0x7f && return Int(b >= 0x20); # ASCII fast path ismalformed(c) && return 1 Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c)) end From dcad1b0b80fddfdcc6f53060fec7040a830f68c8 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 6 Aug 2024 12:14:11 -0400 Subject: [PATCH 6/8] more tests --- test/strings/util.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/strings/util.jl b/test/strings/util.jl index 35f2e384c6205..e5db9bd03ae8e 100644 --- a/test/strings/util.jl +++ b/test/strings/util.jl @@ -8,6 +8,12 @@ SubStr(s) = SubString("abc$(s)de", firstindex(s) + 3, lastindex(s) + 3) @test textwidth(c^3) == w*3 @test w == @invoke textwidth(c::AbstractChar) end + for i in 0x00:0x7f # test all ASCII chars (which have fast path) + w = Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), i)) + c = Char(i) + @test textwidth(c) == w + @test w == @invoke textwidth(c::AbstractChar) + end end @testset "padding (lpad and rpad)" begin From a5cdf2569cde4a08310be7a816246d54f8a94854 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 6 Aug 2024 21:21:50 -0400 Subject: [PATCH 7/8] Update base/strings/unicode.jl Co-authored-by: Jeff Bezanson --- base/strings/unicode.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index a36a668a3dee1..33bae6b3445b9 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -258,7 +258,7 @@ julia> textwidth('⛵') function textwidth(c::AbstractChar) ismalformed(c) && return 1 i = codepoint(c) - i < 0x7f && return Int(i >= 0x20); # ASCII fast path + i < 0x7f && return Int(i >= 0x20) # ASCII fast path Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), i)) end From 024e83dbc7a07f748699e5605a4fcaed1b9eb6da Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 6 Aug 2024 21:21:57 -0400 Subject: [PATCH 8/8] Update base/strings/unicode.jl Co-authored-by: Jeff Bezanson --- base/strings/unicode.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index 33bae6b3445b9..3c6710025077c 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -264,7 +264,7 @@ end function textwidth(c::Char) b = bswap(reinterpret(UInt32, c)) # from isascii(c) - b < 0x7f && return Int(b >= 0x20); # ASCII fast path + b < 0x7f && return Int(b >= 0x20) # ASCII fast path ismalformed(c) && return 1 Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c)) end