From 0b6cf370b50a4f7c4eb84a1a6b8350e28916af0f Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Wed, 17 Jul 2024 11:15:52 -0400
Subject: [PATCH 1/7] make codepoint(c) work for overlong chars

---
 NEWS.md                 |  2 ++
 base/char.jl            | 57 +++++++++++++++++++++++++++--------------
 doc/src/base/strings.md |  3 +++
 test/char.jl            | 13 ++++++++--
 4 files changed, 54 insertions(+), 21 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index a29fe2d43ab39..33c479f61b218 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -99,6 +99,8 @@ Standard library changes
 ------------------------
 
 * `gcdx(0, 0)` now returns `(0, 0, 0)` instead of `(0, 1, 0)` ([#40989]).
+* `codepoint(c)` now succeeds for overlong encodings.  `Base.ismalformed`, `Base.isoverlong`, and
+  `Base.show_invalid` are now `public` and documented (but not exported) ([#xxxxx]).
 
 #### StyledStrings
 
diff --git a/base/char.jl b/base/char.jl
index bc68a672ce0ca..0481c1972354f 100644
--- a/base/char.jl
+++ b/base/char.jl
@@ -76,9 +76,18 @@ end
 
 Return the Unicode codepoint (an unsigned integer) corresponding
 to the character `c` (or throw an exception if `c` does not represent
-a valid character). For `Char`, this is a `UInt32` value, but
+a well-formed character). For `Char`, this is a `UInt32` value, but
 `AbstractChar` types that represent only a subset of Unicode may
 return a different-sized integer (e.g. `UInt8`).
+
+`codepoint(c)` should succeed for any non-malformed character, i.e. when
+[`Base.ismalformed(c)`](@ref) returns `false`. This includes
+invalid Unicode characters (such as unpaired surrogates)
+and overlong encodings.
+
+!!! compat "Julia 1.12"
+    Prior to Julia 1.12, `codepoint(c)` threw an error for overlong encodings (when
+    [`Base.isoverlong(c)`](@ref) is `true`), and `Base.decode_overlong(c)` was needed instead.
 """
 function codepoint end
 
@@ -112,24 +121,35 @@ end
 #           not to support malformed or overlong encodings.
 
 """
-    ismalformed(c::AbstractChar) -> Bool
+    Base.ismalformed(c::AbstractChar) -> Bool
 
 Return `true` if `c` represents malformed (non-Unicode) data according to the
 encoding used by `c`. Defaults to `false` for non-`Char` types.
 
-See also [`show_invalid`](@ref).
+Any *non*-malformed `c` can be mapped to an integer codepoint
+by [`codepoint(c)`](@ref); this includes codepoints that are
+not valid Unicode characters ([`isvalid(c)`](@ref) is `false`).
+For example, well-formed characters include invalid Unicode
+codepoints like `'\\U110000'`, unpaired surrogates such as `'\\ud800'`,
+and overlong encodings (see [`Base.isoverlong`](@ref)).
+Malformed data, in contrast, cannot be decoded to a codepoint
+(`codepoint` will throw an exception).
+
+See also [`Base.show_invalid`](@ref).
 """
 ismalformed(c::AbstractChar) = false
+public ismalformed
 
 """
-    isoverlong(c::AbstractChar) -> Bool
+    Base.isoverlong(c::AbstractChar) -> Bool
 
 Return `true` if `c` represents an overlong UTF-8 sequence. Defaults
 to `false` for non-`Char` types.
 
-See also [`decode_overlong`](@ref) and [`show_invalid`](@ref).
+See also [`Base.show_invalid`](@ref).
 """
 isoverlong(c::AbstractChar) = false
+public isoverlong
 
 @constprop :aggressive function UInt32(c::Char)
     # TODO: use optimized inline LLVM
@@ -138,7 +158,7 @@ isoverlong(c::AbstractChar) = false
     l1 = leading_ones(u)
     t0 = trailing_zeros(u) & 56
     (l1 == 1) | (8l1 + t0 > 32) |
-    ((((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) | is_overlong_enc(u)) &&
+    (((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) &&
         throw_invalid_char(c)
     u &= 0xffffffff >> l1
     u >>= t0
@@ -150,20 +170,18 @@ end
     decode_overlong(c::AbstractChar) -> Integer
 
 When [`isoverlong(c)`](@ref) is `true`, `decode_overlong(c)` returns
-the Unicode codepoint value of `c`. `AbstractChar` implementations
-that support overlong encodings should implement `Base.decode_overlong`.
+the Unicode codepoint value of `c`.   Deprecated in favor of
+`codepoint(c)`.
+
+!!! compat "Julia 1.12"
+    In Julia 1.12 or later, `decode_overlong(c)` simply calls
+    `codepoint(c)`, which should now work for overlong encodings.
+    To support older Julia releases, `AbstractChar` implementations that
+    allow overlong encodings should also implement `Base.decode_overlong`.
 """
 function decode_overlong end
 
-@constprop :aggressive function decode_overlong(c::Char)
-    u = bitcast(UInt32, c)
-    l1 = leading_ones(u)
-    t0 = trailing_zeros(u) & 56
-    u &= 0xffffffff >> l1
-    u >>= t0
-    ((u & 0x0000007f) >> 0) | ((u & 0x00007f00) >> 2) |
-    ((u & 0x007f0000) >> 4) | ((u & 0x7f000000) >> 6)
-end
+@constprop :aggressive decode_overlong(c::AbstractChar) = codepoint(c)
 
 @constprop :aggressive function Char(u::UInt32)
     u < 0x80 && return bitcast(Char, u << 24)
@@ -277,7 +295,7 @@ function show_invalid(io::IO, c::Char)
 end
 
 """
-    show_invalid(io::IO, c::AbstractChar)
+    Base.show_invalid(io::IO, c::AbstractChar)
 
 Called by `show(io, c)` when [`isoverlong(c)`](@ref) or
 [`ismalformed(c)`](@ref) return `true`.   Subclasses
@@ -285,6 +303,7 @@ of `AbstractChar` should define `Base.show_invalid` methods
 if they support storing invalid character data.
 """
 show_invalid
+public show_invalid
 
 # show c to io, assuming UTF-8 encoded output
 function show(io::IO, c::AbstractChar)
@@ -330,7 +349,7 @@ function show(io::IO, ::MIME"text/plain", c::T) where {T<:AbstractChar}
         print(io, ": ")
         if isoverlong(c)
             print(io, "[overlong] ")
-            u = decode_overlong(c)
+            u = decode_overlong(c) # backwards compat Julia < 1.12
             c = T(u)
         else
             u = codepoint(c)
diff --git a/doc/src/base/strings.md b/doc/src/base/strings.md
index ef470be6b55cc..d1e142be0529b 100644
--- a/doc/src/base/strings.md
+++ b/doc/src/base/strings.md
@@ -39,6 +39,9 @@ Base.Docs.@text_str
 Base.isvalid(::Any)
 Base.isvalid(::Any, ::Any)
 Base.isvalid(::AbstractString, ::Integer)
+Base.ismalformed
+Base.isoverlong
+Base.show_invalid
 Base.match
 Base.eachmatch
 Base.RegexMatch
diff --git a/test/char.jl b/test/char.jl
index 5da92121b1630..3928d3c45016a 100644
--- a/test/char.jl
+++ b/test/char.jl
@@ -244,8 +244,8 @@ end
 
 @testset "overlong codes" begin
     function test_overlong(c::Char, n::Integer, rep::String)
-        if isvalid(c)
-            @test Int(c) == n
+        if !Base.ismalformed(c)
+            @test Int(c) == n == codepoint(c)
         else
             @test_throws Base.InvalidCharError UInt32(c)
         end
@@ -352,6 +352,15 @@ end
         "'\\xc0': Malformed UTF-8 (category Ma: Malformed, bad data)"
 end
 
+@testset "overlong, non-malformed chars" begin
+    c = ['\xc0\xa0', '\xf0\x8e\x80\x80']
+    @test all(Base.isoverlong, c)
+    @test !any(Base.ismalformed, c)
+    @test repr("text/plain", c[1]) == "'\\xc0\\xa0': [overlong] ASCII/Unicode U+0020 (category Zs: Separator, space)"
+    @test codepoint.(c) == [0x20, 0xE000]
+    @test isuppercase(c[1]) == isuppercase(c[2]) == false # issue #54343
+end
+
 @testset "More fallback tests" begin
     @test length(ASCIIChar('x')) == 1
     @test firstindex(ASCIIChar('x')) == 1

From 28f59ddd2122dc9a1ef961db6bb30254d479e2c5 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Wed, 17 Jul 2024 11:21:15 -0400
Subject: [PATCH 2/7] add PR # to NEWS

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 33c479f61b218..fe76665dc8453 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -100,7 +100,7 @@ Standard library changes
 
 * `gcdx(0, 0)` now returns `(0, 0, 0)` instead of `(0, 1, 0)` ([#40989]).
 * `codepoint(c)` now succeeds for overlong encodings.  `Base.ismalformed`, `Base.isoverlong`, and
-  `Base.show_invalid` are now `public` and documented (but not exported) ([#xxxxx]).
+  `Base.show_invalid` are now `public` and documented (but not exported) ([#55152]).
 
 #### StyledStrings
 

From 1cb029177fec1f51f6a63ed93c64e5fa4f5504b9 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Tue, 30 Jul 2024 19:10:38 -0400
Subject: [PATCH 3/7] clarify AbstractChar docs

---
 base/char.jl | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/base/char.jl b/base/char.jl
index 0481c1972354f..7d5ae1f705119 100644
--- a/base/char.jl
+++ b/base/char.jl
@@ -2,18 +2,22 @@
 
 """
 The `AbstractChar` type is the supertype of all character implementations
-in Julia. A character represents a Unicode code point, and can be converted
-to an integer via the [`codepoint`](@ref) function in order to obtain the
-numerical value of the code point, or constructed from the same integer.
-These numerical values determine how characters are compared with `<` and `==`,
-for example.  New `T <: AbstractChar` types should define a `codepoint(::T)`
+in Julia. A character normally represents a Unicode codepoint (and can
+also encapsulate other information from an encoded byte sequence as described below),
+and characters can be converted to integer codepoint values via the [`codepoint`](@ref)
+function, or can be constructed from the same integer.  At least for valid,
+properly encoded Unicode characters, these numerical codepoint values
+determine how characters are compared with `<` and `==`, for example. 
+New `T <: AbstractChar` types should define a `codepoint(::T)`
 method and a `T(::UInt32)` constructor, at minimum.
 
 A given `AbstractChar` subtype may be capable of representing only a subset
 of Unicode, in which case conversion from an unsupported `UInt32` value
 may throw an error. Conversely, the built-in [`Char`](@ref) type represents
 a *superset* of Unicode (in order to losslessly encode invalid byte streams),
-in which case conversion of a non-Unicode value *to* `UInt32` throws an error.
+in which case conversion of a non-Unicode value *to* `UInt32` throws an error
+(see [`Base.ismalformed`](@ref)). A `Char` can also represent a nonstandard
+"overlong" encoding of a codepoint (see [`Base.isoverlong`](@ref)).
 The [`isvalid`](@ref) function can be used to check which codepoints are
 representable in a given `AbstractChar` type.
 

From bb40574b00a9f8198ae8da90f195c4385b7232d9 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Wed, 1 Jan 2025 10:06:05 -0500
Subject: [PATCH 4/7] Update char.jl: rm trailing whitespace

---
 base/char.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/char.jl b/base/char.jl
index 7d5ae1f705119..62e3a1522bf23 100644
--- a/base/char.jl
+++ b/base/char.jl
@@ -7,7 +7,7 @@ also encapsulate other information from an encoded byte sequence as described be
 and characters can be converted to integer codepoint values via the [`codepoint`](@ref)
 function, or can be constructed from the same integer.  At least for valid,
 properly encoded Unicode characters, these numerical codepoint values
-determine how characters are compared with `<` and `==`, for example. 
+determine how characters are compared with `<` and `==`, for example.
 New `T <: AbstractChar` types should define a `codepoint(::T)`
 method and a `T(::UInt32)` constructor, at minimum.
 

From f778faba2e70f09469a0f5b759f6ec36d4e85a61 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Thu, 2 Jan 2025 14:41:23 -0500
Subject: [PATCH 5/7] move public decls to public.jl

---
 base/char.jl   | 3 ---
 base/public.jl | 5 +++++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/base/char.jl b/base/char.jl
index 2e1845247ed6d..985b44834d4ac 100644
--- a/base/char.jl
+++ b/base/char.jl
@@ -142,7 +142,6 @@ Malformed data, in contrast, cannot be decoded to a codepoint
 See also [`Base.show_invalid`](@ref).
 """
 ismalformed(c::AbstractChar) = false
-public ismalformed
 
 """
     Base.isoverlong(c::AbstractChar) -> Bool
@@ -153,7 +152,6 @@ to `false` for non-`Char` types.
 See also [`Base.show_invalid`](@ref).
 """
 isoverlong(c::AbstractChar) = false
-public isoverlong
 
 @constprop :aggressive function UInt32(c::Char)
     # TODO: use optimized inline LLVM
@@ -308,7 +306,6 @@ of `AbstractChar` should define `Base.show_invalid` methods
 if they support storing invalid character data.
 """
 show_invalid
-public show_invalid
 
 # show c to io, assuming UTF-8 encoded output
 function show(io::IO, c::AbstractChar)
diff --git a/base/public.jl b/base/public.jl
index 8777a454c920a..84e6846ad7156 100644
--- a/base/public.jl
+++ b/base/public.jl
@@ -96,6 +96,11 @@ public
 # Strings
     escape_raw_string,
 
+# Chars
+    ismalformed,
+    isoverlong,
+    show_invalid,
+
 # IO
     # types
     BufferStream,

From b8a06bdfaccfb9acaea991c359206502604ece3a Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Thu, 2 Jan 2025 15:46:31 -0500
Subject: [PATCH 6/7] Update base/char.jl

---
 base/char.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/char.jl b/base/char.jl
index 985b44834d4ac..95aff77f86f01 100644
--- a/base/char.jl
+++ b/base/char.jl
@@ -127,7 +127,7 @@ end
 """
     Base.ismalformed(c::AbstractChar) -> Bool
 
-Return `true` if `c` represents malformed (non-Unicode) data according to the
+Return `true` if `c` represents malformed (non-codepoint / mis-encoded) data according to the
 encoding used by `c`. Defaults to `false` for non-`Char` types.
 
 Any *non*-malformed `c` can be mapped to an integer codepoint

From c717418d5d2c01bb0c0c993be33e36d3c29fc665 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Thu, 2 Jan 2025 15:49:15 -0500
Subject: [PATCH 7/7] not well-formed -> malformed

---
 base/char.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/base/char.jl b/base/char.jl
index 95aff77f86f01..8062ec1c64bd2 100644
--- a/base/char.jl
+++ b/base/char.jl
@@ -79,8 +79,8 @@ end
     codepoint(c::AbstractChar) -> Integer
 
 Return the Unicode codepoint (an unsigned integer) corresponding
-to the character `c` (or throw an exception if `c` does not represent
-a well-formed character). For `Char`, this is a `UInt32` value, but
+to the character `c` (or throw an exception if `c` represents
+a malformed character). For `Char`, this is a `UInt32` value, but
 `AbstractChar` types that represent only a subset of Unicode may
 return a different-sized integer (e.g. `UInt8`).