diff --git a/spec/std/regex_spec.cr b/spec/std/regex_spec.cr
index c38194bc8785..5935e85060fd 100644
--- a/spec/std/regex_spec.cr
+++ b/spec/std/regex_spec.cr
@@ -200,12 +200,16 @@ describe "Regex" do
       /foo/.matches?("foo", options: Regex::Options::ANCHORED).should be_true
     end
 
-    it "matches a large single line string" do
-      LibPCRE.config LibPCRE::CONFIG_JIT, out jit_enabled
-      pending! "PCRE JIT mode not available." unless 1 == jit_enabled
+    it "doesn't crash with a large single line string" do
+      {% if Regex::Engine.resolve.name == "Regex::PCRE" %}
+        LibPCRE.config LibPCRE::CONFIG_JIT, out jit_enabled
+        pending! "PCRE JIT mode not available." unless 1 == jit_enabled
+      {% end %}
 
       str = File.read(datapath("large_single_line_string.txt"))
-      str.matches?(/^(?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=)?$/).should be_false
+      str.matches?(/^(?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=)?$/)
+      # We don't care whether this actually matches or not, it's just to make
+      # sure the engine does not stack overflow with a large string.
     end
   end
 
@@ -422,6 +426,12 @@ describe "Regex" do
 
   it ".error?" do
     Regex.error?("(foo|bar)").should be_nil
-    Regex.error?("(foo|bar").should eq "missing ) at 8"
+    Regex.error?("(foo|bar").should eq(
+      if Regex::Engine.to_s == "Regex::PCRE2"
+        "missing closing parenthesis at 8"
+      else
+        "missing ) at 8"
+      end
+    )
   end
 end
diff --git a/src/regex/engine.cr b/src/regex/engine.cr
index ad69e5d034bf..766917f87dd2 100644
--- a/src/regex/engine.cr
+++ b/src/regex/engine.cr
@@ -1,4 +1,11 @@
-require "./pcre"
+{% if flag?(:use_pcre2) || (!flag?(:use_pcre) && !flag?(:win32) && `hash pkg-config 2> /dev/null && pkg-config --silence-errors --modversion libpcre2-8 || printf %s false` != "false") %}
+  require "./pcre2"
 
-# :nodoc:
-alias Regex::Engine = PCRE
+  # :nodoc:
+  alias Regex::Engine = PCRE2
+{% else %}
+  require "./pcre"
+
+  # :nodoc:
+  alias Regex::Engine = PCRE
+{% end %}
diff --git a/src/regex/lib_pcre2.cr b/src/regex/lib_pcre2.cr
new file mode 100644
index 000000000000..922c492b7e1a
--- /dev/null
+++ b/src/regex/lib_pcre2.cr
@@ -0,0 +1,94 @@
+# Bindings to the 8-bit code unit functions of the PCRE2 library.
+@[Link("pcre2-8")]
+lib LibPCRE2
+  alias Int = LibC::Int
+
+  # All bits set, mirroring PCRE2's `PCRE2_UNSET`.
+  UNSET = ~LibC::SizeT.new(0)
+
+  ANCHORED     = 0x80000000
+  NO_UTF_CHECK = 0x40000000
+  ENDANCHORED  = 0x20000000
+
+  ALLOW_EMPTY_CLASS   = 0x00000001
+  ALT_BSUX            = 0x00000002
+  AUTO_CALLOUT        = 0x00000004
+  CASELESS            = 0x00000008
+  DOLLAR_ENDONLY      = 0x00000010
+  DOTALL              = 0x00000020
+  DUPNAMES            = 0x00000040
+  EXTENDED            = 0x00000080
+  FIRSTLINE           = 0x00000100
+  MATCH_UNSET_BACKREF = 0x00000200
+  MULTILINE           = 0x00000400
+  NEVER_UCP           = 0x00000800
+  NEVER_UTF           = 0x00001000
+  NO_AUTO_CAPTURE     = 0x00002000
+  NO_AUTO_POSSESS     = 0x00004000
+  NO_DOTSTAR_ANCHOR   = 0x00008000
+  NO_START_OPTIMIZE   = 0x00010000
+  UCP                 = 0x00020000
+  UNGREEDY            = 0x00040000
+  UTF                 = 0x00080000
+  NEVER_BACKSLASH_C   = 0x00100000
+  ALT_CIRCUMFLEX      = 0x00200000
+  ALT_VERBNAMES       = 0x00400000
+  USE_OFFSET_LIMIT    = 0x00800000
+  EXTENDED_MORE       = 0x01000000
+  LITERAL             = 0x02000000
+  MATCH_INVALID_UTF   = 0x04000000
+
+  ERROR_NOMATCH = -1
+
+  INFO_ALLOPTIONS     =  0
+  INFO_ARGOPTIONS     =  1
+  INFO_BACKREFMAX     =  2
+  INFO_BSR            =  3
+  INFO_CAPTURECOUNT   =  4
+  INFO_FIRSTCODEUNIT  =  5
+  INFO_FIRSTCODETYPE  =  6
+  INFO_FIRSTBITMAP    =  7
+  INFO_HASCRORLF      =  8
+  INFO_JCHANGED       =  9
+  INFO_JITSIZE        = 10
+  INFO_LASTCODEUNIT   = 11
+  INFO_LASTCODETYPE   = 12
+  INFO_MATCHEMPTY     = 13
+  INFO_MATCHLIMIT     = 14
+  INFO_MAXLOOKBEHIND  = 15
+  INFO_MINLENGTH      = 16
+  INFO_NAMECOUNT      = 17
+  INFO_NAMEENTRYSIZE  = 18
+  INFO_NAMETABLE      = 19
+  INFO_NEWLINE        = 20
+  INFO_DEPTHLIMIT     = 21
+  INFO_RECURSIONLIMIT = 21 # Obsolete synonym
+  INFO_SIZE           = 22
+  INFO_HASBACKSLASHC  = 23
+  INFO_FRAMESIZE      = 24
+  INFO_HEAPLIMIT      = 25
+  INFO_EXTRAOPTIONS   = 26
+
+  type Code = Void*
+  type CompileContext = Void*
+  type MatchData = Void*
+
+  fun get_error_message = pcre2_get_error_message_8(errorcode : Int, buffer : UInt8*, bufflen : LibC::SizeT) : Int
+
+  # `errorcode` is a C `int*` and `erroroffset` a `PCRE2_SIZE*`, matching the
+  # prototype of `pcre2_compile`.
+  fun compile = pcre2_compile_8(pattern : UInt8*, length : LibC::SizeT, options : UInt32, errorcode : Int*, erroroffset : LibC::SizeT*, ccontext : CompileContext*) : Code*
+  fun code_free = pcre2_code_free_8(code : Code*) : Void
+
+  fun pattern_info = pcre2_pattern_info_8(code : Code*, what : UInt32, where : Void*) : Int
+
+  fun match = pcre2_match_8(code : Code*, subject : UInt8*, length : LibC::SizeT, startoffset : LibC::SizeT, options : UInt32, match_data : MatchData*, mcontext : Void*) : Int
+  fun match_data_create_from_pattern = pcre2_match_data_create_from_pattern_8(code : Code*, gcontext : Void*) : MatchData*
+  fun match_data_free = pcre2_match_data_free_8(match_data : MatchData*) : Void
+
+  # `first` and `last` are out-pointers (`PCRE2_SPTR*`) into the name table.
+  fun substring_nametable_scan = pcre2_substring_nametable_scan_8(code : Code*, name : UInt8*, first : UInt8**, last : UInt8**) : Int
+
+  fun get_ovector_pointer = pcre2_get_ovector_pointer_8(match_data : MatchData*) : LibC::SizeT*
+  fun get_ovector_count = pcre2_get_ovector_count_8(match_data : MatchData*) : UInt32
+end
diff --git a/src/regex/pcre2.cr b/src/regex/pcre2.cr
new file mode 100644
index 000000000000..3eea20280268
--- /dev/null
+++ b/src/regex/pcre2.cr
@@ -0,0 +1,208 @@
+require "./lib_pcre2"
+
+# :nodoc:
+#
+# Regex engine backed by the PCRE2 library, bound through `LibPCRE2`.
+module Regex::PCRE2
+  @re : LibPCRE2::Code*
+
+  # :nodoc:
+  def initialize(*, _source @source : String, _options @options)
+    @re = PCRE2.compile(source, pcre2_options(options) | LibPCRE2::UTF | LibPCRE2::NO_UTF_CHECK | LibPCRE2::DUPNAMES | LibPCRE2::UCP) do |error_message|
+      raise ArgumentError.new(error_message)
+    end
+  end
+
+  # Compiles *source* with the given PCRE2 option bits and returns the compiled
+  # code. On failure, yields a message of the form `"<error> at <offset>"`.
+  protected def self.compile(source, options)
+    if res = LibPCRE2.compile(source, source.bytesize, options, out errorcode, out erroroffset, nil)
+      res
+    else
+      message = String.new(256) do |buffer|
+        bytesize = LibPCRE2.get_error_message(errorcode, buffer, 256)
+        {bytesize, 0}
+      end
+      yield "#{message} at #{erroroffset}"
+    end
+  end
+
+  # Translates the set members of *options* into PCRE2 option bits.
+  private def pcre2_options(options)
+    flag = 0
+    options.each do |option|
+      flag |= case option
+              when .ignore_case?   then LibPCRE2::CASELESS
+              when .multiline?     then LibPCRE2::DOTALL | LibPCRE2::MULTILINE
+              when .extended?      then LibPCRE2::EXTENDED
+              when .anchored?      then LibPCRE2::ANCHORED
+              when .utf_8?         then LibPCRE2::UTF
+              when .no_utf8_check? then LibPCRE2::NO_UTF_CHECK
+              when .dupnames?      then LibPCRE2::DUPNAMES
+              when .ucp?           then LibPCRE2::UCP
+              else
+                raise "unreachable"
+              end
+    end
+    flag
+  end
+
+  # Frees the compiled pattern, except when running under the interpreter.
+  def finalize
+    {% unless flag?(:interpreted) %}
+      LibPCRE2.code_free @re
+    {% end %}
+  end
+
+  # Returns the compile error message for *source*, or `nil` if it compiles.
+  protected def self.error_impl(source)
+    code = PCRE2.compile(source, LibPCRE2::UTF | LibPCRE2::NO_UTF_CHECK | LibPCRE2::DUPNAMES | LibPCRE2::UCP) do |error_message|
+      return error_message
+    end
+
+    LibPCRE2.code_free code
+
+    nil
+  end
+
+  # Queries a `UInt32`-valued piece of information about the compiled pattern.
+  private def pattern_info(what)
+    value = uninitialized UInt32
+    pattern_info(what, pointerof(value))
+    value
+  end
+
+  # Stores the information selected by *what* at *where*; raises if PCRE2
+  # reports a non-zero status.
+  private def pattern_info(what, where)
+    ret = LibPCRE2.pattern_info(@re, what, where)
+    if ret != 0
+      raise "error pattern_info #{what}: #{ret}"
+    end
+  end
+
+  # Builds a map from capture group number to group name.
+  private def name_table_impl
+    lookup = Hash(Int32, String).new
+
+    each_capture_group do |capture_number, name_entry|
+      lookup[capture_number] = String.new(name_entry.to_unsafe + 2)
+    end
+
+    lookup
+  end
+
+  # :nodoc:
+  #
+  # Yields the group number and the raw name table entry of every named group.
+  def each_capture_group
+    name_table = uninitialized UInt8*
+    pattern_info(LibPCRE2::INFO_NAMETABLE, pointerof(name_table))
+
+    name_entry_size = pattern_info(LibPCRE2::INFO_NAMEENTRYSIZE)
+
+    name_count = pattern_info(LibPCRE2::INFO_NAMECOUNT)
+    name_count.times do
+      # The group number occupies the first two bytes, most significant first.
+      # Widen before shifting: `UInt8 << 8` would always evaluate to zero.
+      capture_number = (name_table[0].to_i32 << 8) | name_table[1]
+
+      yield capture_number, Slice.new(name_table, name_entry_size)
+
+      name_table += name_entry_size
+    end
+  end
+
+  # Returns the number of capture groups in the pattern.
+  private def capture_count_impl
+    pattern_info(LibPCRE2::INFO_CAPTURECOUNT).to_i32
+  end
+
+  # Matches *str* from *byte_index*; returns a `Regex::MatchData` or `nil`.
+  private def match_impl(str, byte_index, options)
+    match_data = match_data(str, byte_index, options) || return
+
+    # The ovector lives inside the match data block, so copy it out before
+    # that block is freed; otherwise `MatchData` would read freed memory.
+    ovector_count = LibPCRE2.get_ovector_count(match_data)
+    ovector = Pointer(LibC::SizeT).malloc(ovector_count * 2)
+    ovector.copy_from(LibPCRE2.get_ovector_pointer(match_data), ovector_count * 2)
+    LibPCRE2.match_data_free(match_data)
+
+    ::Regex::MatchData.new(self, @re, str, byte_index, ovector, ovector_count.to_i32 - 1)
+  end
+
+  # Returns whether *str* matches from *byte_index*.
+  private def matches_impl(str, byte_index, options)
+    if match_data = match_data(str, byte_index, options)
+      LibPCRE2.match_data_free(match_data)
+      true
+    else
+      false
+    end
+  end
+
+  # Runs the match and returns the match data block, which the caller must
+  # free, or `nil` when there is no match. Raises on any other PCRE2 error.
+  private def match_data(str, byte_index, options)
+    match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
+    match_count = LibPCRE2.match(@re, str, str.bytesize, byte_index, pcre2_options(options) | LibPCRE2::NO_UTF_CHECK, match_data, nil)
+
+    if match_count < 0
+      LibPCRE2.match_data_free(match_data)
+      case match_count
+      when LibPCRE2::ERROR_NOMATCH
+        return
+      else
+        raise "error pcre2_match: #{match_count}"
+      end
+    end
+
+    match_data
+  end
+
+  module MatchData
+    # :nodoc:
+    def initialize(@regex : Regex, @code : LibPCRE2::Code*, @string : String, @pos : Int32, @ovector : LibC::SizeT*, @group_size : Int32)
+    end
+
+    # Returns the byte range of group *n*; a negative *n* is offset by `size`
+    # first. Yields *n* instead when either offset is negative after `to_i32!`.
+    private def byte_range(n, &)
+      n += size if n < 0
+      range = Range.new(@ovector[n * 2].to_i32!, @ovector[n * 2 + 1].to_i32!, exclusive: true)
+      if range.begin < 0 || range.end < 0
+        yield n
+      else
+        range
+      end
+    end
+
+    # Returns the text captured by the group named *group_name*, preferring the
+    # matched group that starts last. Otherwise yields whether the name exists.
+    private def fetch_impl(group_name : String)
+      selected_range = nil
+      exists = false
+      @regex.each_capture_group do |number, name_entry|
+        # An entry is the 2-byte group number followed by the NUL-terminated
+        # name. Compare the whole name so that a mere prefix is not accepted
+        # and an over-long *group_name* cannot index past the entry.
+        name = name_entry[2, name_entry.size - 2]
+        name = name[0, name.index(0_u8) || name.size]
+        next unless name == group_name.to_slice
+
+        exists = true
+        range = byte_range(number) { nil }
+        if (range && selected_range && range.begin > selected_range.begin) || !selected_range
+          selected_range = range
+        end
+      end
+
+      if selected_range
+        @string.byte_slice(selected_range.begin, selected_range.end - selected_range.begin)
+      else
+        yield exists
+      end
+    end
+  end
+end