From 0b6cf370b50a4f7c4eb84a1a6b8350e28916af0f Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Wed, 17 Jul 2024 11:15:52 -0400 Subject: [PATCH 1/7] make codepoint(c) work for overlong chars --- NEWS.md | 2 ++ base/char.jl | 57 +++++++++++++++++++++++++++-------------- doc/src/base/strings.md | 3 +++ test/char.jl | 13 ++++++++-- 4 files changed, 54 insertions(+), 21 deletions(-) diff --git a/NEWS.md b/NEWS.md index a29fe2d43ab39..33c479f61b218 100644 --- a/NEWS.md +++ b/NEWS.md @@ -99,6 +99,8 @@ Standard library changes ------------------------ * `gcdx(0, 0)` now returns `(0, 0, 0)` instead of `(0, 1, 0)` ([#40989]). +* `codepoint(c)` now succeeds for overlong encodings. `Base.ismalformed`, `Base.isoverlong`, and + `Base.show_invalid` are now `public` and documented (but not exported) ([#xxxxx]). #### StyledStrings diff --git a/base/char.jl b/base/char.jl index bc68a672ce0ca..0481c1972354f 100644 --- a/base/char.jl +++ b/base/char.jl @@ -76,9 +76,18 @@ end Return the Unicode codepoint (an unsigned integer) corresponding to the character `c` (or throw an exception if `c` does not represent -a valid character). For `Char`, this is a `UInt32` value, but +a well-formed character). For `Char`, this is a `UInt32` value, but `AbstractChar` types that represent only a subset of Unicode may return a different-sized integer (e.g. `UInt8`). + +Should succeed for any non-malformed character, i.e. when +[`Base.ismalformed(c)`](@ref) returns `false`. This includes +invalid Unicode characters (such as unpaired surrogates) +and overlong encodings. + +!!! compat "Julia 1.12" + Prior to Julia 1.12, `codepoint(c)` fails for overlong encodings (when + [`Base.isoverlong(c)`](@ref) is `true`), and `Base.decode_overlong(c)` was needed. """ function codepoint end @@ -112,24 +121,35 @@ end # not to support malformed or overlong encodings. """ - ismalformed(c::AbstractChar) -> Bool + Base.ismalformed(c::AbstractChar) -> Bool Return `true` if `c` represents malformed (non-Unicode) data according to the encoding used by `c`. Defaults to `false` for non-`Char` types. -See also [`show_invalid`](@ref). +Any *non*-malformed `c` can be mapped to an integer codepoint +by [`codepoint(c)`](@ref); this includes codepoints that are +not valid Unicode characters ([`isvalid(c)`](@ref) is `false`). +For example, well-formed characters can include invalid Unicode +codepoints like `'\\U110000'`, unpaired surrogates such as `'\\ud800'`, +and can also include overlong encodings ([`Base.isoverlong`](@ref)). +Malformed data, in contrast, cannot be decoded to a codepoint +(`codepoint` will throw an exception). + +See also [`Base.show_invalid`](@ref). """ ismalformed(c::AbstractChar) = false +public ismalformed """ - isoverlong(c::AbstractChar) -> Bool + Base.isoverlong(c::AbstractChar) -> Bool Return `true` if `c` represents an overlong UTF-8 sequence. Defaults to `false` for non-`Char` types. -See also [`decode_overlong`](@ref) and [`show_invalid`](@ref). +See also [`Base.show_invalid`](@ref). """ isoverlong(c::AbstractChar) = false +public isoverlong @constprop :aggressive function UInt32(c::Char) # TODO: use optimized inline LLVM @@ -138,7 +158,7 @@ isoverlong(c::AbstractChar) = false l1 = leading_ones(u) t0 = trailing_zeros(u) & 56 (l1 == 1) | (8l1 + t0 > 32) | - ((((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) | is_overlong_enc(u)) && + (((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) && throw_invalid_char(c) u &= 0xffffffff >> l1 u >>= t0 @@ -150,20 +170,18 @@ end decode_overlong(c::AbstractChar) -> Integer When [`isoverlong(c)`](@ref) is `true`, `decode_overlong(c)` returns -the Unicode codepoint value of `c`. `AbstractChar` implementations -that support overlong encodings should implement `Base.decode_overlong`. +the Unicode codepoint value of `c`. Deprecated in favor of +`codepoint(c)`. + +!!! compat "Julia 1.12" + In Julia 1.12 or later, `decode_overlong(c)` simply calls + `codepoint(c)`, which should now work for overlong encodings. + `AbstractChar` implementations that support overlong encodings + should implement `Base.decode_overlong` on older releases. """ function decode_overlong end -@constprop :aggressive function decode_overlong(c::Char) - u = bitcast(UInt32, c) - l1 = leading_ones(u) - t0 = trailing_zeros(u) & 56 - u &= 0xffffffff >> l1 - u >>= t0 - ((u & 0x0000007f) >> 0) | ((u & 0x00007f00) >> 2) | - ((u & 0x007f0000) >> 4) | ((u & 0x7f000000) >> 6) -end +@constprop :aggressive decode_overlong(c::AbstractChar) = codepoint(c) @constprop :aggressive function Char(u::UInt32) u < 0x80 && return bitcast(Char, u << 24) @@ -277,7 +295,7 @@ function show_invalid(io::IO, c::Char) end """ - show_invalid(io::IO, c::AbstractChar) + Base.show_invalid(io::IO, c::AbstractChar) Called by `show(io, c)` when [`isoverlong(c)`](@ref) or [`ismalformed(c)`](@ref) return `true`. Subclasses @@ -285,6 +303,7 @@ of `AbstractChar` should define `Base.show_invalid` methods if they support storing invalid character data. """ show_invalid +public show_invalid # show c to io, assuming UTF-8 encoded output function show(io::IO, c::AbstractChar) @@ -330,7 +349,7 @@ function show(io::IO, ::MIME"text/plain", c::T) where {T<:AbstractChar} print(io, ": ") if isoverlong(c) print(io, "[overlong] ") - u = decode_overlong(c) + u = decode_overlong(c) # backwards compat Julia < 1.12 c = T(u) else u = codepoint(c) diff --git a/doc/src/base/strings.md b/doc/src/base/strings.md index ef470be6b55cc..d1e142be0529b 100644 --- a/doc/src/base/strings.md +++ b/doc/src/base/strings.md @@ -39,6 +39,9 @@ Base.Docs.@text_str Base.isvalid(::Any) Base.isvalid(::Any, ::Any) Base.isvalid(::AbstractString, ::Integer) +Base.ismalformed +Base.isoverlong +Base.show_invalid Base.match Base.eachmatch Base.RegexMatch diff --git a/test/char.jl b/test/char.jl index 5da92121b1630..3928d3c45016a 100644 --- a/test/char.jl +++ b/test/char.jl @@ -244,8 +244,8 @@ end @testset "overlong codes" begin function test_overlong(c::Char, n::Integer, rep::String) - if isvalid(c) - @test Int(c) == n + if !Base.ismalformed(c) + @test Int(c) == n == codepoint(c) else @test_throws Base.InvalidCharError UInt32(c) end @@ -352,6 +352,15 @@ end "'\\xc0': Malformed UTF-8 (category Ma: Malformed, bad data)" end +@testset "overlong, non-malformed chars" begin + c = ['\xc0\xa0', '\xf0\x8e\x80\x80'] + @test all(Base.isoverlong, c) + @test !any(Base.ismalformed, c) + @test repr("text/plain", c[1]) == "'\\xc0\\xa0': [overlong] ASCII/Unicode U+0020 (category Zs: Separator, space)" + @test codepoint.(c) == [0x20, 0xE000] + @test isuppercase(c[1]) == isuppercase(c[2]) == false # issue #54343 +end + @testset "More fallback tests" begin @test length(ASCIIChar('x')) == 1 @test firstindex(ASCIIChar('x')) == 1 From 28f59ddd2122dc9a1ef961db6bb30254d479e2c5 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Wed, 17 Jul 2024 11:21:15 -0400 Subject: [PATCH 2/7] add PR # to NEWS --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 33c479f61b218..fe76665dc8453 100644 --- a/NEWS.md +++ b/NEWS.md @@ -100,7 +100,7 @@ Standard library changes * `gcdx(0, 0)` now returns `(0, 0, 0)` instead of `(0, 1, 0)` ([#40989]). * `codepoint(c)` now succeeds for overlong encodings. `Base.ismalformed`, `Base.isoverlong`, and - `Base.show_invalid` are now `public` and documented (but not exported) ([#xxxxx]). + `Base.show_invalid` are now `public` and documented (but not exported) ([#55152]). #### StyledStrings From 1cb029177fec1f51f6a63ed93c64e5fa4f5504b9 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 30 Jul 2024 19:10:38 -0400 Subject: [PATCH 3/7] clarify AbstractChar docs --- base/char.jl | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/base/char.jl b/base/char.jl index 0481c1972354f..7d5ae1f705119 100644 --- a/base/char.jl +++ b/base/char.jl @@ -2,18 +2,22 @@ """ The `AbstractChar` type is the supertype of all character implementations -in Julia. A character represents a Unicode code point, and can be converted -to an integer via the [`codepoint`](@ref) function in order to obtain the -numerical value of the code point, or constructed from the same integer. -These numerical values determine how characters are compared with `<` and `==`, -for example. New `T <: AbstractChar` types should define a `codepoint(::T)` +in Julia. A character normally represents a Unicode codepoint (and can +also encapsulate other information from an encoded byte sequence as described below), +and characters can be converted to integer codepoint values via the [`codepoint`](@ref) +function, or can be constructed from the same integer. At least for valid, +properly encoded Unicode characters, these numerical codepoint values +determine how characters are compared with `<` and `==`, for example. +New `T <: AbstractChar` types should define a `codepoint(::T)` method and a `T(::UInt32)` constructor, at minimum. A given `AbstractChar` subtype may be capable of representing only a subset of Unicode, in which case conversion from an unsupported `UInt32` value may throw an error. Conversely, the built-in [`Char`](@ref) type represents a *superset* of Unicode (in order to losslessly encode invalid byte streams), -in which case conversion of a non-Unicode value *to* `UInt32` throws an error. +in which case conversion of a non-Unicode value *to* `UInt32` throws an error +(see [`Base.ismalformed`](@ref)), and on the other hand a `Char` can also represent +a nonstandard "overlong" encoding ([`Base.isoverlong`](@ref)) of a codepoint. The [`isvalid`](@ref) function can be used to check which codepoints are representable in a given `AbstractChar` type. From bb40574b00a9f8198ae8da90f195c4385b7232d9 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Wed, 1 Jan 2025 10:06:05 -0500 Subject: [PATCH 4/7] Update char.jl: rm trailing whitespace --- base/char.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/char.jl b/base/char.jl index 7d5ae1f705119..62e3a1522bf23 100644 --- a/base/char.jl +++ b/base/char.jl @@ -7,7 +7,7 @@ also encapsulate other information from an encoded byte sequence as described be and characters can be converted to integer codepoint values via the [`codepoint`](@ref) function, or can be constructed from the same integer. At least for valid, properly encoded Unicode characters, these numerical codepoint values -determine how characters are compared with `<` and `==`, for example. +determine how characters are compared with `<` and `==`, for example. New `T <: AbstractChar` types should define a `codepoint(::T)` method and a `T(::UInt32)` constructor, at minimum. From f778faba2e70f09469a0f5b759f6ec36d4e85a61 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Thu, 2 Jan 2025 14:41:23 -0500 Subject: [PATCH 5/7] move public decls to public.jl --- base/char.jl | 3 --- base/public.jl | 5 +++++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/base/char.jl b/base/char.jl index 2e1845247ed6d..985b44834d4ac 100644 --- a/base/char.jl +++ b/base/char.jl @@ -142,7 +142,6 @@ Malformed data, in contrast, cannot be decoded to a codepoint See also [`Base.show_invalid`](@ref). """ ismalformed(c::AbstractChar) = false -public ismalformed """ Base.isoverlong(c::AbstractChar) -> Bool @@ -153,7 +152,6 @@ to `false` for non-`Char` types. See also [`Base.show_invalid`](@ref). """ isoverlong(c::AbstractChar) = false -public isoverlong @constprop :aggressive function UInt32(c::Char) # TODO: use optimized inline LLVM @@ -308,7 +306,6 @@ of `AbstractChar` should define `Base.show_invalid` methods if they support storing invalid character data. """ show_invalid -public show_invalid # show c to io, assuming UTF-8 encoded output function show(io::IO, c::AbstractChar) diff --git a/base/public.jl b/base/public.jl index 8777a454c920a..84e6846ad7156 100644 --- a/base/public.jl +++ b/base/public.jl @@ -96,6 +96,11 @@ public # Strings escape_raw_string, +# Chars + ismalformed, + isoverlong, + show_invalid, + # IO # types BufferStream, From b8a06bdfaccfb9acaea991c359206502604ece3a Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Thu, 2 Jan 2025 15:46:31 -0500 Subject: [PATCH 6/7] Update base/char.jl --- base/char.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/char.jl b/base/char.jl index 985b44834d4ac..95aff77f86f01 100644 --- a/base/char.jl +++ b/base/char.jl @@ -127,7 +127,7 @@ end """ Base.ismalformed(c::AbstractChar) -> Bool -Return `true` if `c` represents malformed (non-Unicode) data according to the +Return `true` if `c` represents malformed (non-codepoint / mis-encoded) data according to the encoding used by `c`. Defaults to `false` for non-`Char` types. Any *non*-malformed `c` can be mapped to an integer codepoint From c717418d5d2c01bb0c0c993be33e36d3c29fc665 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Thu, 2 Jan 2025 15:49:15 -0500 Subject: [PATCH 7/7] not well-formed -> malformed --- base/char.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/base/char.jl b/base/char.jl index 95aff77f86f01..8062ec1c64bd2 100644 --- a/base/char.jl +++ b/base/char.jl @@ -79,8 +79,8 @@ end codepoint(c::AbstractChar) -> Integer Return the Unicode codepoint (an unsigned integer) corresponding -to the character `c` (or throw an exception if `c` does not represent -a well-formed character). For `Char`, this is a `UInt32` value, but +to the character `c` (or throw an exception if `c` represents +a malformed character). For `Char`, this is a `UInt32` value, but `AbstractChar` types that represent only a subset of Unicode may return a different-sized integer (e.g. `UInt8`).