diff --git a/Makefile b/Makefile index 806f441..ab03f74 100644 --- a/Makefile +++ b/Makefile @@ -73,7 +73,7 @@ manifest: MANIFEST.new # real targets -data/utf8proc_data.c.new: libutf8proc.$(SHLIB_EXT) data/data_generator.rb data/charwidths.jl +data/utf8proc_data.c.new: libutf8proc.$(SHLIB_EXT) data/data_generator.jl $(MAKE) -C data utf8proc_data.c.new utf8proc.o: utf8proc.h utf8proc.c utf8proc_data.c diff --git a/data/Makefile b/data/Makefile index 1c5830e..484c44b 100644 --- a/data/Makefile +++ b/data/Makefile @@ -1,11 +1,10 @@ # Unicode data generation rules. Except for the test data files, most # users will not use these Makefile rules, which are primarily to re-generate # unicode_data.c when we get a new Unicode version or charwidth data; they -# require ruby and julia to be installed. +# require julia to be installed. # programs CURL=curl -RUBY=ruby PERL=perl MAKE=make JULIA=julia @@ -15,11 +14,11 @@ CURLFLAGS = --retry 5 --location .DELETE_ON_ERROR: -utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt emoji-data.txt - $(RUBY) data_generator.rb < UnicodeData.txt > $@ +RAWDATA = UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt EastAsianWidth.txt emoji-data.txt -CharWidths.txt: charwidths.jl EastAsianWidth.txt - $(JULIA) charwidths.jl > $@ +utf8proc_data.c.new: data_generator.jl $(RAWDATA) + $(JULIA) --project=. -e 'using Pkg; Pkg.instantiate()' + $(JULIA) --project=. 
data_generator.jl > $@ # Unicode data version (must also update utf8proc_unicode_version function) UNICODE_VERSION=15.1.0 @@ -52,12 +51,12 @@ emoji-data.txt: $(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://unicode.org/Public/$(UNICODE_VERSION)/ucd/emoji/emoji-data.txt Uppercase.txt: DerivedCoreProperties.txt - $(RUBY) -e 'puts File.read("DerivedCoreProperties.txt")[/# Derived Property: Uppercase.*?# Total code points:/m]' > $@ + $(JULIA) -e 'print(match(r"# Derived Property: Uppercase.*?# Total code points:"s, read("DerivedCoreProperties.txt", String)).match)' > $@ Lowercase.txt: DerivedCoreProperties.txt - $(RUBY) -e 'puts File.read("DerivedCoreProperties.txt")[/# Derived Property: Lowercase.*?# Total code points:/m]' > $@ + $(JULIA) -e 'print(match(r"# Derived Property: Lowercase.*?# Total code points:"s, read("DerivedCoreProperties.txt", String)).match)' > $@ clean: - rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt emoji-data.txt + rm -f $(RAWDATA) NormalizationTest.txt GraphemeBreakTest.txt rm -f Uppercase.txt Lowercase.txt rm -f utf8proc_data.c.new diff --git a/data/Manifest.toml b/data/Manifest.toml new file mode 100644 index 0000000..d397411 --- /dev/null +++ b/data/Manifest.toml @@ -0,0 +1,69 @@ +# This file is machine-generated - editing it directly is not advised + +julia_version = "1.9.3" +manifest_format = "2.0" +project_hash = "bc0740aa2247b17bd49ba693fb87f41bbbddead6" + +[[deps.Adapt]] +deps = ["LinearAlgebra", "Requires"] +git-tree-sha1 = "cde29ddf7e5726c9fb511f340244ea3481267608" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "3.7.2" + + [deps.Adapt.extensions] + AdaptStaticArraysExt = "StaticArrays" + + [deps.Adapt.weakdeps] + StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" + +[[deps.Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[deps.CompilerSupportLibraries_jll]] +deps = 
["Artifacts", "Libdl"] +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "1.0.5+0" + +[[deps.Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[deps.LinearAlgebra]] +deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[deps.OffsetArrays]] +deps = ["Adapt"] +git-tree-sha1 = "2ac17d29c523ce1cd38e27785a7d23024853a4bb" +uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" +version = "1.12.10" + +[[deps.OpenBLAS_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.21+4" + +[[deps.Random]] +deps = ["SHA", "Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[deps.Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.3.0" + +[[deps.SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" + +[[deps.Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[deps.UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[deps.libblastrampoline_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" +version = "5.8.0+0" diff --git a/data/Project.toml b/data/Project.toml new file mode 100644 index 0000000..af0961e --- /dev/null +++ b/data/Project.toml @@ -0,0 +1,2 @@ +[deps] +OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" diff --git a/data/charwidths.jl b/data/charwidths.jl deleted file mode 100644 index 1346217..0000000 --- a/data/charwidths.jl +++ /dev/null @@ -1,169 +0,0 @@ -# Following work by @jiahao, we compute character widths using a combination of -# * character category -# * UAX 11: East Asian Width -# * a few exceptions as needed -# Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734 -# -# We used to also use data from GNU Unifont, but that has proven unreliable -# and unlikely to match widths assumed by 
terminals. -# -# Requires Julia (obviously) and FontForge. - -############################################################################# -CharWidths = Dict{Int,Int}() - -############################################################################# -# Use ../libutf8proc for category codes, rather than the one in Julia, -# to minimize bootstrapping complexity when a new version of Unicode comes out. -catcode(c) = ccall((:utf8proc_category,"../libutf8proc"), Cint, (Int32,), c) - -# utf8proc category constants (must match h) -const UTF8PROC_CATEGORY_CN = 0 -const UTF8PROC_CATEGORY_LU = 1 -const UTF8PROC_CATEGORY_LL = 2 -const UTF8PROC_CATEGORY_LT = 3 -const UTF8PROC_CATEGORY_LM = 4 -const UTF8PROC_CATEGORY_LO = 5 -const UTF8PROC_CATEGORY_MN = 6 -const UTF8PROC_CATEGORY_MC = 7 -const UTF8PROC_CATEGORY_ME = 8 -const UTF8PROC_CATEGORY_ND = 9 -const UTF8PROC_CATEGORY_NL = 10 -const UTF8PROC_CATEGORY_NO = 11 -const UTF8PROC_CATEGORY_PC = 12 -const UTF8PROC_CATEGORY_PD = 13 -const UTF8PROC_CATEGORY_PS = 14 -const UTF8PROC_CATEGORY_PE = 15 -const UTF8PROC_CATEGORY_PI = 16 -const UTF8PROC_CATEGORY_PF = 17 -const UTF8PROC_CATEGORY_PO = 18 -const UTF8PROC_CATEGORY_SM = 19 -const UTF8PROC_CATEGORY_SC = 20 -const UTF8PROC_CATEGORY_SK = 21 -const UTF8PROC_CATEGORY_SO = 22 -const UTF8PROC_CATEGORY_ZS = 23 -const UTF8PROC_CATEGORY_ZL = 24 -const UTF8PROC_CATEGORY_ZP = 25 -const UTF8PROC_CATEGORY_CC = 26 -const UTF8PROC_CATEGORY_CF = 27 -const UTF8PROC_CATEGORY_CS = 28 -const UTF8PROC_CATEGORY_CO = 29 - -############################################################################# -# Use a default width of 1 for all character categories that are -# letter/symbol/number-like, as well as for unassigned/private-use chars. -# This can be overridden by UAX 11 -# below, but provides a useful nonzero fallback for new codepoints when -# a new Unicode version has been released but Unifont hasn't been updated yet. 
- -zerowidth = Set{Int}() # categories that may contain zero-width chars -push!(zerowidth, UTF8PROC_CATEGORY_MN) -push!(zerowidth, UTF8PROC_CATEGORY_MC) -push!(zerowidth, UTF8PROC_CATEGORY_ME) -# push!(zerowidth, UTF8PROC_CATEGORY_SK) # see issue #167 -push!(zerowidth, UTF8PROC_CATEGORY_ZL) -push!(zerowidth, UTF8PROC_CATEGORY_ZP) -push!(zerowidth, UTF8PROC_CATEGORY_CC) -push!(zerowidth, UTF8PROC_CATEGORY_CF) -push!(zerowidth, UTF8PROC_CATEGORY_CS) -for c in 0x0000:0x110000 - if catcode(c) ∉ zerowidth - CharWidths[c] = 1 - end -end - -############################################################################# -# Widths from UAX #11: East Asian Width -# .. these take precedence for all codepoints -# listed explicitly as wide/full/narrow/half-width - -for line in readlines(open("EastAsianWidth.txt")) - #Strip comments - (isempty(line) || line[1] == '#') && continue - precomment = split(line, '#')[1] - #Parse code point range and width code - tokens = split(precomment, ';') - length(tokens) >= 2 || continue - charrange = tokens[1] - width = strip(tokens[2]) - #Parse code point range into Julia UnitRange - rangetokens = split(charrange, "..") - charstart = parse(UInt32, "0x"*rangetokens[1]) - charend = parse(UInt32, "0x"*rangetokens[length(rangetokens)>1 ? 2 : 1]) - - #Assign widths - for c in charstart:charend - if width=="W" || width=="F" # wide or full - CharWidths[c]=2 - elseif width=="Na"|| width=="H" - CharWidths[c]=1 - end - end -end - -############################################################################# -# A few exceptions to the above cases, found by manual comparison -# to other wcwidth functions and similar checks. - -for c in keys(CharWidths) - cat = catcode(c) - - # make sure format control character (category Cf) have width 0 - # (some of these, like U+0601, can have a width in some cases - # but normally act like prepended combining marks. 
U+fff9 etc - # are also odd, but have zero width in typical terminal contexts) - if cat==UTF8PROC_CATEGORY_CF - CharWidths[c]=0 - end - - # Unifont has nonzero width for a number of non-spacing combining - # characters, e.g. (in 7.0.06): f84,17b4,17b5,180b,180d,2d7f, and - # the variation selectors - if cat==UTF8PROC_CATEGORY_MN - CharWidths[c]=0 - end - - # We also assign width of one to unassigned and private-use - # codepoints (Unifont includes ConScript Unicode Registry PUA fonts, - # but since these are nonstandard it seems questionable to use Unifont metrics; - # if they are printed as the replacement character U+FFFD they will have width 1). - if cat==UTF8PROC_CATEGORY_CO || cat==UTF8PROC_CATEGORY_CN - CharWidths[c]=1 - end - - # for some reason, Unifont has width-2 glyphs for ASCII control chars - if cat==UTF8PROC_CATEGORY_CC - CharWidths[c]=0 - end -end - -#Soft hyphen is typically printed as a hyphen (-) in terminals. -CharWidths[0x00ad]=1 - -#By definition, should have zero width (on the same line) -#0x002028 '
' category: Zl name: LINE SEPARATOR/ -#0x002029 '
' category: Zp name: PARAGRAPH SEPARATOR/ -CharWidths[0x2028]=0 -CharWidths[0x2029]=0 - -############################################################################# -# Output (to a file or pipe) for processing by data_generator.rb, -# encoded as a sequence of intervals. - -firstc = 0x000000 -lastv = 0 -uhex(c) = uppercase(string(c,base=16,pad=4)) -for c in 0x0000:0x110000 - global firstc, lastv - v = get(CharWidths, c, 0) - if v != lastv || c == 0x110000 - v < 4 || error("invalid charwidth $v for $c") - if firstc+1 < c - println(uhex(firstc), "..", uhex(c-1), "; ", lastv) - else - println(uhex(firstc), "; ", lastv) - end - firstc = c - lastv = v - end -end diff --git a/data/data_generator.jl b/data/data_generator.jl new file mode 100644 index 0000000..deecb3b --- /dev/null +++ b/data/data_generator.jl @@ -0,0 +1,559 @@ +using OffsetArrays: Origin + +parsehex(str) = parse(UInt32, str, base=16) + +function parse_hex_range(line) + m = match(r"^([0-9A-F]+)(\.\.([0-9A-F]+))? +; +([^#]+)", line) + if isnothing(m) + return nothing + end + i = parsehex(m[1]) + j = !isnothing(m[3]) ? 
parsehex(m[3]) : i + desc = rstrip(m[4]) + return (i:j, desc) +end + +function read_hex_ranges(filename) + [r for r in parse_hex_range.(readlines(filename)) if !isnothing(r)] +end + +function collect_codepoints(range_desc, description) + list = UInt32[] + for (r,d) in range_desc + if d == description + append!(list, r) + end + end + list +end + +function set_all!(d, keys, value) + for k in keys + d[k] = value + end +end + +#------------------------------------------------------------------------------- + +derived_core_properties = read_hex_ranges("DerivedCoreProperties.txt") + +ignorable = Set(collect_codepoints(derived_core_properties, "Default_Ignorable_Code_Point")) +uppercase = Set(collect_codepoints(derived_core_properties, "Uppercase")) +lowercase = Set(collect_codepoints(derived_core_properties, "Lowercase")) + + +#------------------------------------------------------------------------------- +function derive_indic_conjunct_break(derived_core_properties) + props = Dict{UInt32, String}() + set_all!(props, collect_codepoints(derived_core_properties, "InCB; Linker"), "LINKER") + set_all!(props, collect_codepoints(derived_core_properties, "InCB; Consonant"), "CONSONANT") + set_all!(props, collect_codepoints(derived_core_properties, "InCB; Extend"), "EXTEND") + props +end + +let indic_conjunct_break = derive_indic_conjunct_break(derived_core_properties) + global function get_indic_conjunct_break(code) + get(indic_conjunct_break, code, "NONE") + end +end + +#------------------------------------------------------------------------------- +function read_grapheme_boundclasses(grapheme_break_filename, emoji_data_filename) + grapheme_boundclass = Dict{UInt32, String}() + for (r,desc) in read_hex_ranges(grapheme_break_filename) + set_all!(grapheme_boundclass, r, Base.uppercase(desc)) + end + for (r,desc) in read_hex_ranges(emoji_data_filename) + if desc == "Extended_Pictographic" + set_all!(grapheme_boundclass, r, "EXTENDED_PICTOGRAPHIC") + elseif desc == 
"Emoji_Modifier" + set_all!(grapheme_boundclass, r, "EXTEND") + end + end + return grapheme_boundclass +end + +let grapheme_boundclasses = read_grapheme_boundclasses("GraphemeBreakProperty.txt", "emoji-data.txt") + global function get_grapheme_boundclass(code) + get(grapheme_boundclasses, code, "OTHER") + end +end + +#------------------------------------------------------------------------------- +function read_composition_exclusions(pattern) + section = match(pattern, read("CompositionExclusions.txt",String)).match + es = UInt32[] + for line in split(section, '\n') + m = match(r"^([0-9A-F]+) +#"i, line) + if !isnothing(m) + push!(es, parsehex(m[1])) + end + end + es +end + +exclusions = Set(read_composition_exclusions(r"# \(1\) Script Specifics.*?# Total code points:"s)) +excl_version = Set(read_composition_exclusions(r"# \(2\) Post Composition Version precomposed characters.*?# Total code points:"s)) + +# FIXME: Replicate a bug in the ruby code +push!(exclusions, 0) +push!(excl_version, 0) + +#------------------------------------------------------------------------------- +function read_case_folding(filename) + case_folding = Dict{UInt32,Vector{UInt32}}() + for line in readlines(filename) + m = match(r"^([0-9A-F]+); [CF]; ([0-9A-F ]+);"i, line) + !isnothing(m) || continue + case_folding[parsehex(m[1])] = parsehex.(split(m[2])) + end + case_folding +end + +let case_folding = read_case_folding("CaseFolding.txt") + global function get_case_folding(code) + get(case_folding, code, nothing) + end +end + +#------------------------------------------------------------------------------- +# Utilities for reading per-char properties from UnicodeData.txt +function split_unicode_data_line(line) + m = match(r""" + ([0-9A-F]+); # code + ([^;]+); # name + ([A-Z]+); # general category + ([0-9]+); # canonical combining class + ([A-Z]+); # bidi class + (<([A-Z]*)>)? 
# decomposition type + ((\ ?[0-9A-F]+)*); # decomposition mapping + ([0-9]*); # decimal digit + ([0-9]*); # digit + ([^;]*); # numeric + ([YN]*); # bidi mirrored + ([^;]*); # unicode 1.0 name + ([^;]*); # iso comment + ([0-9A-F]*); # simple uppercase mapping + ([0-9A-F]*); # simple lowercase mapping + ([0-9A-F]*)$ # simple titlecase mapping + """ix, line) + @assert !isnothing(m) + code = parse(UInt32, m[1], base=16) + (code = code, + name = m[2], + category = m[3], + combining_class = parse(Int, m[4]), + bidi_class = m[5], + decomp_type = m[7], + decomp_mapping = m[8] == "" ? nothing : parsehex.(split(m[8])), + bidi_mirrored = m[13] == "Y", + # issue #130: use nonstandard uppercase ß -> ẞ + # issue #195: if character is uppercase but has no lowercase mapping, + # then make lowercase mapping = itself (vice versa for lowercase) + uppercase_mapping = m[16] != "" ? parsehex(m[16]) : + code == 0x000000df ? 0x00001e9e : + m[17] == "" && code in lowercase ? code : + nothing, + lowercase_mapping = m[17] != "" ? parsehex(m[17]) : + m[16] == "" && code in uppercase ? code : + nothing, + titlecase_mapping = m[18] != "" ? parsehex(m[18]) : + code == 0x000000df ? 
0x00001e9e : + nothing, + ) +end + +function read_unicode_data(filename) + raw_char_props = split_unicode_data_line.(readlines(filename)) + char_props = Origin(0)(Vector{eltype(raw_char_props)}()) + @assert issorted(raw_char_props, by=c->c.code) + raw_char_props = Iterators.Stateful(raw_char_props) + while !isempty(raw_char_props) + c = popfirst!(raw_char_props) + if occursin(", First>", c.name) + nc = popfirst!(raw_char_props) + @assert occursin(", Last>", nc.name) + name = replace(c.name, ", First"=>"") + for i in c.code:nc.code + push!(char_props, (; c..., name=name, code=i)) + end + else + push!(char_props, c) + end + end + return char_props +end + +char_props = read_unicode_data("UnicodeData.txt") +char_hash = Dict(c.code=>c for c in char_props) + +#------------------------------------------------------------------------------- +# Read character widths from UAX #11: East Asian Width +function read_east_asian_widths(filename) + ea_widths = Dict{UInt32,Int}() + for (rng,widthcode) in read_hex_ranges(filename) + w = widthcode == "W" || widthcode == "F" ? 2 : # wide or full + widthcode == "Na"|| widthcode == "H" ? 1 : # narrow or half-width + nothing + if !isnothing(w) + set_all!(ea_widths, rng, w) + end + end + return ea_widths +end + +let ea_widths = read_east_asian_widths("EastAsianWidth.txt") + # Following work by @jiahao, we compute character widths using a combination of + # * character category + # * UAX 11: East Asian Width + # * a few exceptions as needed + # Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734 + global function derive_char_width(code, category) + # Use a default width of 1 for all character categories that are + # letter/symbol/number-like, as well as for unassigned/private-use chars. + # This provides a useful nonzero fallback for new codepoints when a new + # Unicode version has been released. 
+ width = 1 + + # Various zero-width categories + # + # "Sk" not included in zero width - see issue #167 + if category in ("Mn", "Mc", "Me", "Zl", "Zp", "Cc", "Cf", "Cs") + width = 0 + end + + # Widths from UAX #11: East Asian Width + eaw = get(ea_widths, code, nothing) + if !isnothing(eaw) + width = eaw + end + + # A few exceptional cases, found by manual comparison to other wcwidth + # functions and similar checks. + if category == "Mn" + width = 0 + end + + if code == 0x00ad + # Soft hyphen is typically printed as a hyphen (-) in terminals. + width = 1 + elseif code == 0x2028 || code == 0x2029 + #By definition, should have zero width (on the same line) + #0x002028 '
' category: Zl name: LINE SEPARATOR/ + #0x002029 '
' category: Zp name: PARAGRAPH SEPARATOR/ + width = 0 + end + + return width + end +end + +#------------------------------------------------------------------------------- +# Construct data tables which will drive libutf8proc +# +# These tables are "compressed" with an ad-hoc compression scheme (largely some +# simple deduplication and indexing) which can easily and efficiently be +# decompressed on the C side at runtime. + +# Inverse decomposition mapping tables for combining two characters into a single one. +comb1st_indices = Dict{UInt32,Int}() +comb1st_indices_sorted_keys = Origin(0)(UInt32[]) +comb2nd_indices = Dict{UInt32,Int}() +comb2nd_indices_sorted_keys = Origin(0)(UInt32[]) +comb2nd_indices_nonbasic = Set{UInt32}() +comb_array = Origin(0)(Vector{Dict{Int,UInt32}}()) +for char in char_props + if isnothing(char.decomp_type) && !isnothing(char.decomp_mapping) && + length(char.decomp_mapping) == 2 && !isnothing(char_hash[char.decomp_mapping[1]]) && + char_hash[char.decomp_mapping[1]].combining_class == 0 && + char.code ∉ exclusions + dm0 = char.decomp_mapping[1] + dm1 = char.decomp_mapping[2] + if !haskey(comb1st_indices, dm0) + comb1st_indices[dm0] = length(comb1st_indices) + push!(comb1st_indices_sorted_keys, dm0) + push!(comb_array, Dict{Int,UInt32}()) + @assert length(comb1st_indices) == length(comb_array) + end + if !haskey(comb2nd_indices, dm1) + push!(comb2nd_indices_sorted_keys, dm1) + comb2nd_indices[dm1] = length(comb2nd_indices) + end + @assert !haskey(comb_array[comb1st_indices[dm0]], comb2nd_indices[dm1]) + comb_array[comb1st_indices[dm0]][comb2nd_indices[dm1]] = char.code + if char.code > 0xFFFF + push!(comb2nd_indices_nonbasic, dm1) + end + end +end + +comb_indices = Dict{UInt32,Int}() +comb1st_indices_lastoffsets = Origin(0)(zeros(Int, length(comb1st_indices))) +comb1st_indices_firstoffsets = Origin(0)(zeros(Int, length(comb1st_indices))) +let + cumoffset = 0 + for dm0 in comb1st_indices_sorted_keys + index = comb1st_indices[dm0] + first = 
nothing + last = nothing + offset = 0 + for b in eachindex(comb2nd_indices_sorted_keys) + dm1 = comb2nd_indices_sorted_keys[b] + if haskey(comb_array[index], b) + if isnothing(first) + first = offset + end + last = offset + if dm1 in comb2nd_indices_nonbasic + last += 1 + end + end + offset += 1 + if dm1 in comb2nd_indices_nonbasic + offset += 1 + end + end + comb1st_indices_firstoffsets[index] = first + comb1st_indices_lastoffsets[index] = last + @assert !haskey(comb_indices, dm0) + comb_indices[dm0] = cumoffset + cumoffset += last - first + 1 + 2 + end + + offset = 0 + for dm1 in comb2nd_indices_sorted_keys + @assert !haskey(comb_indices, dm1) + comb_indices[dm1] = 0x8000 | (comb2nd_indices[dm1] + offset) + @assert comb2nd_indices[dm1] + offset <= 0x4000 + if dm1 in comb2nd_indices_nonbasic + comb_indices[dm1] |= 0x4000 + offset += 1 + end + end +end + +utf16_encode(utf32_seq) = transcode(UInt16, transcode(String, utf32_seq)) + +# Utility for packing all UTF-16 encoded sequences into one big array +struct UTF16Sequences + storage::Vector{UInt16} + indices::Dict{Vector{UInt16},Int} +end +UTF16Sequences() = UTF16Sequences(UInt16[], Dict{Vector{UInt16},Int}()) + +""" +Return "sequence code" (seqindex in the C code) for a sequence: a UInt16 where +* The 14 low bits are the index into the `sequences.storage` array where the + sequence resides +* The two top bits are the length of the sequence, or if equal to 3, the first + entry of the sequence itself contains the length. +""" +function encode_sequence!(sequences::UTF16Sequences, utf32_seq::Vector) + if length(utf32_seq) == 0 + return typemax(UInt16) + end + # lencode contains the length of the UTF-32 sequence after decoding + # No sequence has len 0, so we encode len 1 as 0, len 2 as 1. + # We have only 2 bits for the length, though, so longer sequences are + # encoded in the sequence data itself. 
+ seq_lencode = length(utf32_seq) - 1 + utf16_seq = utf16_encode(utf32_seq) + idx = get!(sequences.indices, utf16_seq) do + i = length(sequences.storage) + utf16_seq_enc = seq_lencode < 3 ? utf16_seq : + pushfirst!(copy(utf16_seq), seq_lencode) + append!(sequences.storage, utf16_seq_enc) + i + end + @assert idx <= 0x3FFF + seq_code = idx | (min(seq_lencode, 3) << 14) + return seq_code +end + +function encode_sequence!(sequences::UTF16Sequences, code::Integer) + encode_sequence!(sequences, [code]) +end + +function encode_sequence!(sequences::UTF16Sequences, ::Nothing) + return typemax(UInt16) +end + +function char_table_properties!(sequences, char) + code = char.code + + return ( + category = char.category, + combining_class = char.combining_class, + bidi_class = char.bidi_class, + decomp_type = char.decomp_type, + decomp_seqindex = encode_sequence!(sequences, char.decomp_mapping), + casefold_seqindex = encode_sequence!(sequences, get_case_folding(code)), + uppercase_seqindex = encode_sequence!(sequences, char.uppercase_mapping), + lowercase_seqindex = encode_sequence!(sequences, char.lowercase_mapping), + titlecase_seqindex = encode_sequence!(sequences, char.titlecase_mapping), + comb_index = get(comb_indices, code, typemax(UInt16)), + bidi_mirrored = char.bidi_mirrored, + comp_exclusion = code in exclusions || code in excl_version, + ignorable = code in ignorable, + control_boundary = char.category in ("Zl", "Zp", "Cc", "Cf") && + # FIXME: Ruby bug compat - should be `code in (0x200C, 0x200D)` + !(char.category in (0x200C, 0x200D)), + charwidth = derive_char_width(code, char.category), + boundclass = get_grapheme_boundclass(code), + indic_conjunct_break = get_indic_conjunct_break(code), + ) +end + +# Many character properties are duplicates. 
Deduplicate them, constructing a +# per-character array of indices into the properties array +sequences = UTF16Sequences() + +# FIXME: Hack to force ordering compat with Ruby code +for c in char_props + encode_sequence!(sequences, c.decomp_mapping) + encode_sequence!(sequences, get_case_folding(c.code)) +end + +char_table_props = [char_table_properties!(sequences, cp) for cp in char_props] + +deduplicated_props = Origin(0)(Vector{eltype(char_table_props)}()) +char_property_indices = Origin(0)(zeros(Int, 0x00110000)) +let index_map = Dict{eltype(char_table_props),Int}() + for (char, table_props) in zip(char_props, char_table_props) + entry_idx = get!(index_map, table_props) do + idx = length(deduplicated_props) + push!(deduplicated_props, table_props) + idx + end + # Add 1 because unassigned codes occupy slot at index 0 + char_property_indices[char.code] = entry_idx + 1 + end +end + +# Now compress char_property_indices by breaking it into pages and +# deduplicating those (this works as compression because there are large +# contiguous ranges of code space with identical properties) +prop_page_indices = Int[] +prop_pages = Int[] +let + page_size = 0x100 + page_index_map = Dict{Vector{Int}, Int}() + for page in Iterators.partition(char_property_indices, page_size) + page_idx = get!(page_index_map, page) do + idx = length(prop_pages) + append!(prop_pages, page) + idx + end + push!(prop_page_indices, page_idx) + end +end + +#------------------------------------------------------------------------------- +function write_c_index_array(io, array, linelen) + print(io, "{\n ") + i = 0 + for x in array + i += 1 + if i == linelen + i = 0 + print(io, "\n ") + end + print(io, x, ", ") + end + print(io, "};\n\n") +end + +function c_enum_name(prefix, str) + if isnothing(str) + return "0" + else + return "UTF8PROC_$(prefix)_$(Base.uppercase(str))" + end +end + +function c_uint16(seqindex) + if seqindex == typemax(UInt16) + return "UINT16_MAX" + else + return string(seqindex) + 
end +end + +function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, deduplicated_props, + comb1st_indices_firstoffsets, comb1st_indices_lastoffsets, + comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic) + print(io, "static const utf8proc_uint16_t utf8proc_sequences[] = ") + write_c_index_array(io, sequences.storage, 8) + print(io, "static const utf8proc_uint16_t utf8proc_stage1table[] = ") + write_c_index_array(io, prop_page_indices, 8) + print(io, "static const utf8proc_uint16_t utf8proc_stage2table[] = ") + write_c_index_array(io, prop_pages, 8) + + print(io, """ + static const utf8proc_property_t utf8proc_properties[] = { + {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE}, + """) + for prop in deduplicated_props + print(io, " {", + c_enum_name("CATEGORY", prop.category), ", ", + prop.combining_class, ", ", + c_enum_name("BIDI_CLASS", prop.bidi_class), ", ", + c_enum_name("DECOMP_TYPE", prop.decomp_type), ", ", + c_uint16(prop.decomp_seqindex), ", ", + c_uint16(prop.casefold_seqindex), ", ", + c_uint16(prop.uppercase_seqindex), ", ", + c_uint16(prop.lowercase_seqindex), ", ", + c_uint16(prop.titlecase_seqindex), ", ", + c_uint16(prop.comb_index), ", ", + prop.bidi_mirrored, ", ", + prop.comp_exclusion, ", ", + prop.ignorable, ", ", + prop.control_boundary, ", ", + prop.charwidth, ", ", + "0, ", # bitfield padding + c_enum_name("BOUNDCLASS", prop.boundclass), ", ", + c_enum_name("INDIC_CONJUNCT_BREAK", prop.indic_conjunct_break), + "},\n" + ) + end + print(io, "};\n\n") + + print(io, "static const utf8proc_uint16_t utf8proc_combinations[] = {\n ") + i = 0 + for a in eachindex(comb1st_indices_firstoffsets) + offset = 0 + print(io, comb1st_indices_firstoffsets[a], ", ", comb1st_indices_lastoffsets[a], ", ") + for b in eachindex(comb2nd_indices_sorted_keys) + dm1 = comb2nd_indices_sorted_keys[b] + if 
offset > comb1st_indices_lastoffsets[a] + break + end + if offset >= comb1st_indices_firstoffsets[a] + i += 1 + if i == 8 + i = 0 + print(io, "\n ") + end + v = get(comb_array[a], b, 0) + if dm1 in comb2nd_indices_nonbasic + print(io, (v & 0xFFFF0000) >> 16, ", ") + end + print(io, v & 0xFFFF, ", ") + end + offset += 1 + if dm1 in comb2nd_indices_nonbasic + offset += 1 + end + end + print(io, "\n") + end + print(io, "};\n\n") +end + + +if !isinteractive() + print_c_data_tables(stdout, sequences, prop_page_indices, prop_pages, deduplicated_props, + comb1st_indices_firstoffsets, comb1st_indices_lastoffsets, + comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic) +end + diff --git a/data/data_generator.rb b/data/data_generator.rb deleted file mode 100644 index 91cc03d..0000000 --- a/data/data_generator.rb +++ /dev/null @@ -1,475 +0,0 @@ -#!/usr/bin/env ruby - -# This file was used to generate the 'unicode_data.c' file by parsing the -# Unicode data file 'UnicodeData.txt' of the Unicode Character Database. -# It is included for informational purposes only and not intended for -# production use. - - -# Copyright (c) 2018 Steven G. Johnson, Tony Kelman, Keno Fischer, -# Benito van der Zander, Michaël Meyer, and other contributors. -# Copyright (c) 2009 Public Software Group e. V., Berlin, Germany -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. 
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - - -# This file contains derived data from a modified version of the -# Unicode data files. The following license applies to that data: -# -# COPYRIGHT AND PERMISSION NOTICE -# -# Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed -# under the Terms of Use in http://www.unicode.org/copyright.html. -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of the Unicode data files and any associated documentation (the "Data -# Files") or Unicode software and any associated documentation (the -# "Software") to deal in the Data Files or Software without restriction, -# including without limitation the rights to use, copy, modify, merge, -# publish, distribute, and/or sell copies of the Data Files or Software, and -# to permit persons to whom the Data Files or Software are furnished to do -# so, provided that (a) the above copyright notice(s) and this permission -# notice appear with all copies of the Data Files or Software, (b) both the -# above copyright notice(s) and this permission notice appear in associated -# documentation, and (c) there is clear notice in each modified Data File or -# in the Software as well as in the documentation associated with the Data -# File(s) or Software that the data or software has been modified. 
-# -# THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY -# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF -# THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS -# INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR -# CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF -# USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER -# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR -# PERFORMANCE OF THE DATA FILES OR SOFTWARE. -# -# Except as contained in this notice, the name of a copyright holder shall -# not be used in advertising or otherwise to promote the sale, use or other -# dealings in these Data Files or Software without prior written -# authorization of the copyright holder. - - -$ignorable_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Derived Property: Default_Ignorable_Code_Point.*?# Total code points:/m] -$ignorable = [] -$ignorable_list.each_line do |entry| - if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/ - $1.hex.upto($2.hex) { |e2| $ignorable << e2 } - elsif entry =~ /^[0-9A-F]+/ - $ignorable << $&.hex - end -end - -$uppercase_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Derived Property: Uppercase.*?# Total code points:/m] -$uppercase = [] -$uppercase_list.each_line do |entry| - if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/ - $1.hex.upto($2.hex) { |e2| $uppercase << e2 } - elsif entry =~ /^[0-9A-F]+/ - $uppercase << $&.hex - end -end - -$lowercase_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Derived Property: Lowercase.*?# Total code points:/m] -$lowercase = [] -$lowercase_list.each_line do |entry| - if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/ - $1.hex.upto($2.hex) { |e2| $lowercase << e2 } - elsif entry =~ /^[0-9A-F]+/ - $lowercase << $&.hex - end -end - 
-$icb_linker_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Linker.*?# Total code points:/m] -$icb = Hash.new("UTF8PROC_INDIC_CONJUNCT_BREAK_NONE") -$icb_linker_list.each_line do |entry| - if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/ - $1.hex.upto($2.hex) { |e2| $icb[e2] = "UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER" } - elsif entry =~ /^[0-9A-F]+/ - $icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER" - end -end -$icb_consonant_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Consonant.*?# Total code points:/m] -$icb_consonant_list.each_line do |entry| - if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/ - $1.hex.upto($2.hex) { |e2| $icb[e2] = "UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT" } - elsif entry =~ /^[0-9A-F]+/ - $icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT" - end -end -$icb_extend_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Extend.*?# Total code points:/m] -$icb_extend_list.each_line do |entry| - if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/ - $1.hex.upto($2.hex) { |e2| $icb[e2] = "UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND" } - elsif entry =~ /^[0-9A-F]+/ - $icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND" - end -end - -$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt", :encoding => 'utf-8') -$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER") -$grapheme_boundclass_list.each_line do |entry| - if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([A-Za-z_]+)/ - $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_" + $3.upcase } - elsif entry =~ /^([0-9A-F]+)\s*;\s*([A-Za-z_]+)/ - $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_" + $2.upcase - end -end - -$emoji_data_list = File.read("emoji-data.txt", :encoding => 'utf-8') -$emoji_data_list.each_line do |entry| - if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Extended_Pictographic\W/ - $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = 
"UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC" } - elsif entry =~ /^([0-9A-F]+)\s*;\s*Extended_Pictographic\W/ - $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC" - elsif entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Modifier\W/ - $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTEND" } - elsif entry =~ /^([0-9A-F]+)\s*;\s*Emoji_Modifier\W/ - $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTEND" - end -end - -$charwidth_list = File.read("CharWidths.txt", :encoding => 'utf-8') -$charwidth = Hash.new(0) -$charwidth_list.each_line do |entry| - if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([0-9]+)/ - $1.hex.upto($2.hex) { |e2| $charwidth[e2] = $3.to_i } - elsif entry =~ /^([0-9A-F]+)\s*;\s*([0-9]+)/ - $charwidth[$1.hex] = $2.to_i - end -end - -$exclusions = File.read("CompositionExclusions.txt", :encoding => 'utf-8')[/# \(1\) Script Specifics.*?# Total code points:/m] -$exclusions = $exclusions.chomp.split("\n").collect { |e| e.hex } - -$excl_version = File.read("CompositionExclusions.txt", :encoding => 'utf-8')[/# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m] -$excl_version = $excl_version.chomp.split("\n").collect { |e| e.hex } - -$case_folding_string = File.read("CaseFolding.txt", :encoding => 'utf-8') -$case_folding = {} -$case_folding_string.chomp.split("\n").each do |line| - next unless line =~ /([0-9A-F]+); [CF]; ([0-9A-F ]+);/i - $case_folding[$1.hex] = $2.split(" ").collect { |e| e.hex } -end - -$int_array = [] -$int_array_indicies = {} - -def str2c(string, prefix) - return "0" if string.nil? 
- return "UTF8PROC_#{prefix}_#{string.upcase}" -end -def pushary(array) - idx = $int_array_indicies[array] - unless idx - $int_array_indicies[array] = $int_array.length - idx = $int_array.length - array.each { |entry| $int_array << entry } - end - return idx -end -def cpary2utf16encoded(array) - return array.flat_map { |cp| - if (cp <= 0xFFFF) - raise "utf-16 code: #{cp}" if cp & 0b1111100000000000 == 0b1101100000000000 - cp - else - temp = cp - 0x10000 - [(temp >> 10) | 0b1101100000000000, (temp & 0b0000001111111111) | 0b1101110000000000] - end - } -end -def cpary2c(array) - return "UINT16_MAX" if array.nil? || array.length == 0 - lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ... - array = cpary2utf16encoded(array) - if lencode >= 3 #we have only 2 bits for the length - array = [lencode] + array - lencode = 3 - end - idx = pushary(array) - raise "Array index out of bound" if idx > 0x3FFF - return "#{idx | (lencode << 14)}" -end -def singlecpmap(cp) - return "UINT16_MAX" if cp == nil - idx = pushary(cpary2utf16encoded([cp])) - raise "Array index out of bound" if idx > 0xFFFF - return "#{idx}" -end - -class UnicodeChar - attr_accessor :code, :name, :category, :combining_class, :bidi_class, - :decomp_type, :decomp_mapping, - :bidi_mirrored, - :uppercase_mapping, :lowercase_mapping, :titlecase_mapping, - #caches: - :c_entry_index, :c_decomp_mapping, :c_case_folding - def initialize(line) - raise "Could not parse input." unless line =~ /^ - ([0-9A-F]+); # code - ([^;]+); # name - ([A-Z]+); # general category - ([0-9]+); # canonical combining class - ([A-Z]+); # bidi class - (<([A-Z]*)>)? 
# decomposition type - ((\ ?[0-9A-F]+)*); # decompomposition mapping - ([0-9]*); # decimal digit - ([0-9]*); # digit - ([^;]*); # numeric - ([YN]*); # bidi mirrored - ([^;]*); # unicode 1.0 name - ([^;]*); # iso comment - ([0-9A-F]*); # simple uppercase mapping - ([0-9A-F]*); # simple lowercase mapping - ([0-9A-F]*)$/ix # simple titlecase mapping - @code = $1.hex - @name = $2 - @category = $3 - @combining_class = Integer($4) - @bidi_class = $5 - @decomp_type = $7 - @decomp_mapping = ($8=='') ? nil : - $8.split.collect { |element| element.hex } - @bidi_mirrored = ($13=='Y') ? true : false - # issue #130: use nonstandard uppercase ß -> ẞ - # issue #195: if character is uppercase but has no lowercase mapping, - # then make lowercase mapping = itself (vice versa for lowercase) - @uppercase_mapping = ($16=='') ? (code==0x00df ? 0x1e9e : ($17=='' && $lowercase.include?(code) ? code : nil)) : $16.hex - @lowercase_mapping = ($17=='') ? ($16=='' && $uppercase.include?(code) ? code : nil) : $17.hex - @titlecase_mapping = ($18=='') ? (code==0x00df ? 0x1e9e : nil) : $18.hex - end - def case_folding - $case_folding[code] - end - def c_entry(comb_indicies) - " " << - "{#{str2c category, 'CATEGORY'}, #{combining_class}, " << - "#{str2c bidi_class, 'BIDI_CLASS'}, " << - "#{str2c decomp_type, 'DECOMP_TYPE'}, " << - "#{c_decomp_mapping}, " << - "#{c_case_folding}, " << - "#{singlecpmap uppercase_mapping }, " << - "#{singlecpmap lowercase_mapping }, " << - "#{singlecpmap titlecase_mapping }, " << - "#{comb_indicies[code] ? 
comb_indicies[code]: 'UINT16_MAX'}, " << - "#{bidi_mirrored}, " << - "#{$exclusions.include?(code) or $excl_version.include?(code)}, " << - "#{$ignorable.include?(code)}, " << - "#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " << - "#{$charwidth[code]}, 0, " << - "#{$grapheme_boundclass[code]}, " << - "#{$icb[code]}},\n" - end -end - -chars = [] -char_hash = {} - -while gets - if $_ =~ /^([0-9A-F]+);<[^;>,]+, First>;/i - first = $1.hex - gets - char = UnicodeChar.new($_) - raise "No last character of sequence found." unless - $_ =~ /^([0-9A-F]+);<([^;>,]+), Last>;/i - last = $1.hex - name = "<#{$2}>" - for i in first..last - char_clone = char.clone - char_clone.code = i - char_clone.name = name - char_hash[char_clone.code] = char_clone - chars << char_clone - end - else - char = UnicodeChar.new($_) - char_hash[char.code] = char - chars << char - end -end - -comb1st_indicies = {} -comb2nd_indicies = {} -comb2nd_indicies_sorted_keys = [] -comb2nd_indicies_nonbasic = {} -comb_array = [] - -chars.each do |char| - if !char.nil? and char.decomp_type.nil? and char.decomp_mapping and - char.decomp_mapping.length == 2 and !char_hash[char.decomp_mapping[0]].nil? 
and - char_hash[char.decomp_mapping[0]].combining_class == 0 and - not $exclusions.include?(char.code) - - dm0 = char.decomp_mapping[0] - dm1 = char.decomp_mapping[1] - unless comb1st_indicies[dm0] - comb1st_indicies[dm0] = comb1st_indicies.keys.length - end - unless comb2nd_indicies[dm1] - comb2nd_indicies_sorted_keys << dm1 - comb2nd_indicies[dm1] = comb2nd_indicies.keys.length - end - comb_array[comb1st_indicies[dm0]] ||= [] - raise "Duplicate canonical mapping: #{char.code} #{dm0} #{dm1}" if comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]] - comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]] = char.code - - comb2nd_indicies_nonbasic[dm1] = true if char.code > 0xFFFF - end - char.c_decomp_mapping = cpary2c(char.decomp_mapping) - char.c_case_folding = cpary2c(char.case_folding) -end - -comb_indicies = {} -cumoffset = 0 -comb1st_indicies_lastoffsets = [] -comb1st_indicies_firstoffsets = [] -comb1st_indicies.each do |dm0, index| - first = nil - last = nil - offset = 0 - comb2nd_indicies_sorted_keys.each_with_index do |dm1, b| - if comb_array[index][b] - first = offset unless first - last = offset - last += 1 if comb2nd_indicies_nonbasic[dm1] - end - offset += 1 - offset += 1 if comb2nd_indicies_nonbasic[dm1] - end - comb1st_indicies_firstoffsets[index] = first - comb1st_indicies_lastoffsets[index] = last - raise "double index" if comb_indicies[dm0] - comb_indicies[dm0] = cumoffset - cumoffset += last - first + 1 + 2 -end - -offset = 0 -comb2nd_indicies_sorted_keys.each do |dm1| - raise "double index" if comb_indicies[dm1] - comb_indicies[dm1] = 0x8000 | (comb2nd_indicies[dm1] + offset) - raise "too large comb index" if comb2nd_indicies[dm1] + offset > 0x4000 - if comb2nd_indicies_nonbasic[dm1] - comb_indicies[dm1] = comb_indicies[dm1] | 0x4000 - offset += 1 - end -end - -properties_indicies = {} -properties = [] -chars.each do |char| - c_entry = char.c_entry(comb_indicies) - char.c_entry_index = properties_indicies[c_entry] - unless char.c_entry_index 
- properties_indicies[c_entry] = properties.length - char.c_entry_index = properties.length - properties << c_entry - end -end - -stage1 = [] -stage2 = [] -for code in 0...0x110000 - next unless code % 0x100 == 0 - stage2_entry = [] - for code2 in code...(code+0x100) - if char_hash[code2] - stage2_entry << (char_hash[code2].c_entry_index + 1) - else - stage2_entry << 0 - end - end - old_index = stage2.index(stage2_entry) - if old_index - stage1 << (old_index * 0x100) - else - stage1 << (stage2.length * 0x100) - stage2 << stage2_entry - end -end - -$stdout << "static const utf8proc_uint16_t utf8proc_sequences[] = {\n " -i = 0 -$int_array.each do |entry| - i += 1 - if i == 8 - i = 0 - $stdout << "\n " - end - $stdout << entry << ", " -end -$stdout << "};\n\n" - -$stdout << "static const utf8proc_uint16_t utf8proc_stage1table[] = {\n " -i = 0 -stage1.each do |entry| - i += 1 - if i == 8 - i = 0 - $stdout << "\n " - end - $stdout << entry << ", " -end -$stdout << "};\n\n" - -$stdout << "static const utf8proc_uint16_t utf8proc_stage2table[] = {\n " -i = 0 -stage2.flatten.each do |entry| - i += 1 - if i == 8 - i = 0 - $stdout << "\n " - end - $stdout << entry << ", " -end -$stdout << "};\n\n" - -$stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n" -$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},\n" -properties.each { |line| - $stdout << line -} -$stdout << "};\n\n" - - - -$stdout << "static const utf8proc_uint16_t utf8proc_combinations[] = {\n " -i = 0 -comb1st_indicies.keys.each_index do |a| - offset = 0 - $stdout << comb1st_indicies_firstoffsets[a] << ", " << comb1st_indicies_lastoffsets[a] << ", " - comb2nd_indicies_sorted_keys.each_with_index do |dm1, b| - break if offset > comb1st_indicies_lastoffsets[a] - if offset >= comb1st_indicies_firstoffsets[a] - i += 1 - if i == 8 - i = 0 - $stdout << "\n " - 
end - v = comb_array[a][b] ? comb_array[a][b] : 0 - $stdout << (( v & 0xFFFF0000 ) >> 16) << ", " if comb2nd_indicies_nonbasic[dm1] - $stdout << (v & 0xFFFF) << ", " - end - offset += 1 - offset += 1 if comb2nd_indicies_nonbasic[dm1] - end - $stdout << "\n" -end -$stdout << "};\n\n"