Skip to content

Commit f9f11a3

Browse files
authored
Move docstrings to Unicode stdlib module (#25902)
Functions need to be defined in Unicode module, as reexporting Base.Unicode functions makes them appear with the latter prefix in the manual. Also fix examples by fully qualifying Unicode.normalize().
1 parent 131813f commit f9f11a3

File tree

3 files changed

+92
-81
lines changed

3 files changed

+92
-81
lines changed

base/strings/substring.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ pointer(x::SubString{String}, i::Integer) = pointer(x.string) + x.offset + (i-1)
107107
Reverses a string. Technically, this function reverses the codepoints in a string and its
108108
main utility is for reversed-order string processing, especially for reversed
109109
regular-expression searches. See also [`reverseind`](@ref) to convert indices in `s` to
110-
indices in `reverse(s)` and vice-versa, and [`Unicode.graphemes`](@ref Base.Unicode.graphemes) to
110+
indices in `reverse(s)` and vice-versa, and `graphemes` from module `Unicode` to
111111
operate on user-visible "characters" (graphemes) rather than codepoints.
112112
See also [`Iterators.reverse`](@ref) for
113113
reverse-order iteration without making a copy. Custom string types must implement the

base/strings/unicode.jl

Lines changed: 3 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ end
150150

151151
utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(String(s), flags)
152152

153+
# Documented in Unicode module
153154
function normalize(
154155
s::AbstractString;
155156
stable::Bool=false,
@@ -190,55 +191,6 @@ function normalize(
190191
utf8proc_map(s, flags)
191192
end
192193

193-
"""
194-
Unicode.normalize(s::AbstractString, normalform::Symbol)
195-
196-
Normalize the string `s` according to one of the four "normal forms" of the Unicode
197-
standard: `normalform` can be `:NFC`, `:NFD`, `:NFKC`, or `:NFKD`. Normal forms C
198-
(canonical composition) and D (canonical decomposition) convert different visually identical
199-
representations of the same abstract string into a single canonical form, with form C being
200-
more compact. Normal forms KC and KD additionally canonicalize "compatibility equivalents":
201-
they convert characters that are abstractly similar but visually distinct into a single
202-
canonical choice (e.g. they expand ligatures into the individual characters), with form KC
203-
being more compact.
204-
205-
Alternatively, finer control and additional transformations may be obtained by calling
206-
`Unicode.normalize(s; keywords...)`, where any number of the following boolean keyword
207-
options (which all default to `false` except for `compose`) are specified:
208-
209-
* `compose=false`: do not perform canonical composition
210-
* `decompose=true`: do canonical decomposition instead of canonical composition
211-
(`compose=true` is ignored if present)
212-
* `compat=true`: compatibility equivalents are canonicalized
213-
* `casefold=true`: perform Unicode case folding, e.g. for case-insensitive string comparison
214-
* `newline2lf=true`, `newline2ls=true`, or `newline2ps=true`: convert various newline
215-
sequences (LF, CRLF, CR, NEL) into a linefeed (LF), line-separation (LS), or
216-
paragraph-separation (PS) character, respectively
217-
* `stripmark=true`: strip diacritical marks (e.g. accents)
218-
* `stripignore=true`: strip Unicode's "default ignorable" characters (e.g. the soft hyphen
219-
or the left-to-right marker)
220-
* `stripcc=true`: strip control characters; horizontal tabs and form feeds are converted to
221-
spaces; newlines are also converted to spaces unless a newline-conversion flag was
222-
specified
223-
* `rejectna=true`: throw an error if unassigned code points are found
224-
* `stable=true`: enforce Unicode Versioning Stability
225-
226-
For example, NFKC corresponds to the options `compose=true, compat=true, stable=true`.
227-
228-
# Examples
229-
```jldoctest
230-
julia> using Unicode
231-
232-
julia> "μ" == normalize("µ", compat=true) #LHS: Unicode U+03bc, RHS: Unicode U+00b5
233-
true
234-
235-
julia> normalize("JuLiA", casefold=true)
236-
"julia"
237-
238-
julia> normalize("JúLiA", stripmark=true)
239-
"JuLiA"
240-
```
241-
"""
242194
function normalize(s::AbstractString, nf::Symbol)
243195
utf8proc_map(s, nf == :NFC ? (UTF8PROC_STABLE | UTF8PROC_COMPOSE) :
244196
nf == :NFD ? (UTF8PROC_STABLE | UTF8PROC_DECOMPOSE) :
@@ -311,22 +263,6 @@ end
311263

312264
category_string(c) = category_strings[category_code(c)+1]
313265

314-
"""
315-
Unicode.isassigned(c) -> Bool
316-
317-
Returns `true` if the given char or integer is an assigned Unicode code point.
318-
319-
# Examples
320-
```jldoctest
321-
julia> using Unicode
322-
323-
julia> Unicode.isassigned(101)
324-
true
325-
326-
julia> Unicode.isassigned('\\x01')
327-
true
328-
```
329-
"""
330266
isassigned(c) = UTF8PROC_CATEGORY_CN < category_code(c) <= UTF8PROC_CATEGORY_CO
331267

332268
## libc character class predicates ##
@@ -378,11 +314,7 @@ function isupper(c::Char)
378314
cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LT
379315
end
380316

381-
"""
382-
iscased(c::Char) -> Bool
383-
384-
Tests whether a character is cased, i.e. is lower-, upper- or title-cased.
385-
"""
317+
# Documented in Unicode module
386318
function iscased(c::Char)
387319
cat = category_code(c)
388320
return cat == UTF8PROC_CATEGORY_LU ||
@@ -696,14 +628,7 @@ struct GraphemeIterator{S<:AbstractString}
696628
s::S # original string (for generation of SubStrings)
697629
end
698630

699-
"""
700-
graphemes(s::AbstractString) -> GraphemeIterator
701-
702-
Returns an iterator over substrings of `s` that correspond to the extended graphemes in the
703-
string, as defined by Unicode UAX #29. (Roughly, these are what users would perceive as
704-
single characters, even though they may contain more than one codepoint; for example a
705-
letter combined with an accent mark is a single grapheme.)
706-
"""
631+
# Documented in Unicode module
707632
graphemes(s::AbstractString) = GraphemeIterator{typeof(s)}(s)
708633

709634
eltype(::Type{GraphemeIterator{S}}) where {S} = SubString{S}

stdlib/Unicode/src/Unicode.jl

Lines changed: 88 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,96 @@ __precompile__(true)
44

55
module Unicode
66

7-
using Base.Unicode: normalize, graphemes, isassigned, iscased
8-
97
export graphemes
108

9+
"""
10+
Unicode.normalize(s::AbstractString, normalform::Symbol)
11+
12+
Normalize the string `s` according to one of the four "normal forms" of the Unicode
13+
standard: `normalform` can be `:NFC`, `:NFD`, `:NFKC`, or `:NFKD`. Normal forms C
14+
(canonical composition) and D (canonical decomposition) convert different visually identical
15+
representations of the same abstract string into a single canonical form, with form C being
16+
more compact. Normal forms KC and KD additionally canonicalize "compatibility equivalents":
17+
they convert characters that are abstractly similar but visually distinct into a single
18+
canonical choice (e.g. they expand ligatures into the individual characters), with form KC
19+
being more compact.
20+
21+
Alternatively, finer control and additional transformations may be obtained by calling
22+
`Unicode.normalize(s; keywords...)`, where any number of the following boolean keyword
23+
options (which all default to `false` except for `compose`) are specified:
24+
25+
* `compose=false`: do not perform canonical composition
26+
* `decompose=true`: do canonical decomposition instead of canonical composition
27+
(`compose=true` is ignored if present)
28+
* `compat=true`: compatibility equivalents are canonicalized
29+
* `casefold=true`: perform Unicode case folding, e.g. for case-insensitive string comparison
30+
* `newline2lf=true`, `newline2ls=true`, or `newline2ps=true`: convert various newline
31+
sequences (LF, CRLF, CR, NEL) into a linefeed (LF), line-separation (LS), or
32+
paragraph-separation (PS) character, respectively
33+
* `stripmark=true`: strip diacritical marks (e.g. accents)
34+
* `stripignore=true`: strip Unicode's "default ignorable" characters (e.g. the soft hyphen
35+
or the left-to-right marker)
36+
* `stripcc=true`: strip control characters; horizontal tabs and form feeds are converted to
37+
spaces; newlines are also converted to spaces unless a newline-conversion flag was
38+
specified
39+
* `rejectna=true`: throw an error if unassigned code points are found
40+
* `stable=true`: enforce Unicode Versioning Stability
41+
42+
For example, NFKC corresponds to the options `compose=true, compat=true, stable=true`.
43+
44+
# Examples
45+
```jldoctest
46+
julia> using Unicode
47+
48+
julia> "μ" == Unicode.normalize("µ", compat=true) #LHS: Unicode U+03bc, RHS: Unicode U+00b5
49+
true
50+
51+
julia> Unicode.normalize("JuLiA", casefold=true)
52+
"julia"
53+
54+
julia> Unicode.normalize("JúLiA", stripmark=true)
55+
"JuLiA"
56+
```
57+
"""
58+
function normalize end
59+
normalize(s::AbstractString, nf::Symbol) = Base.Unicode.normalize(s, nf)
60+
normalize(s::AbstractString; kwargs...) = Base.Unicode.normalize(s; kwargs...)
61+
62+
"""
63+
Unicode.isassigned(c) -> Bool
64+
65+
Returns `true` if the given char or integer is an assigned Unicode code point.
66+
67+
# Examples
68+
```jldoctest
69+
julia> using Unicode
70+
71+
julia> Unicode.isassigned(101)
72+
true
73+
74+
julia> Unicode.isassigned('\\x01')
75+
true
76+
```
77+
"""
78+
isassigned(c) = Base.Unicode.isassigned(c)
79+
80+
"""
81+
iscased(c::Char) -> Bool
82+
83+
Tests whether a character is cased, i.e. is lower-, upper- or title-cased.
84+
"""
85+
iscased(c::Char) = Base.Unicode.iscased(c)
86+
87+
"""
88+
graphemes(s::AbstractString) -> GraphemeIterator
89+
90+
Returns an iterator over substrings of `s` that correspond to the extended graphemes in the
91+
string, as defined by Unicode UAX #29. (Roughly, these are what users would perceive as
92+
single characters, even though they may contain more than one codepoint; for example a
93+
letter combined with an accent mark is a single grapheme.)
94+
"""
95+
graphemes(s::AbstractString) = Base.Unicode.GraphemeIterator{typeof(s)}(s)
96+
1197
# BEGIN 0.7 deprecations
1298

1399
@deprecate is_assigned_char(c::Char) Unicode.isassigned(c)

0 commit comments

Comments
 (0)