crystal-lang · straight-shoota · Dec 16, 2022 · Nov 26, 2022 · Dec 14, 2022 · Dec 14, 2022
diff --git a/spec/std/regex_spec.cr b/spec/std/regex_spec.cr
@@ -200,12 +200,16 @@ describe "Regex" do
       /foo/.matches?("foo", options: Regex::Options::ANCHORED).should be_true
     end
 
-    it "matches a large single line string" do
-      LibPCRE.config LibPCRE::CONFIG_JIT, out jit_enabled
-      pending! "PCRE JIT mode not available." unless 1 == jit_enabled
+    it "doesn't crash with a large single line string" do
+      # The JIT probe goes through LibPCRE, so it is only compiled in when the
+      # selected engine is Regex::PCRE; the example is marked pending when that
+      # engine reports no JIT support.
+      {% if Regex::Engine.resolve.name == "Regex::PCRE" %}
+        LibPCRE.config LibPCRE::CONFIG_JIT, out jit_enabled
+        pending! "PCRE JIT mode not available." unless 1 == jit_enabled
+      {% end %}
 
       str = File.read(datapath("large_single_line_string.txt"))
-      str.matches?(/^(?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=)?$/).should be_false
+      str.matches?(/^(?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=)?$/)
+      # The result is deliberately not asserted: the call only has to return,
+      # proving the engine does not overflow the stack on a large string.
     end
   end
 
@@ -422,6 +426,12 @@ describe "Regex" do
 
   it ".error?" do
     Regex.error?("(foo|bar)").should be_nil
-    Regex.error?("(foo|bar").should eq "missing ) at 8"
+    # The wording of this diagnostic differs between the two engines, so the
+    # expected text is picked according to the engine that was compiled in.
+    expected = if Regex::Engine.to_s == "Regex::PCRE2"
+                 "missing closing parenthesis at 8"
+               else
+                 "missing ) at 8"
+               end
+    Regex.error?("(foo|bar").should eq(expected)
   end
 end
diff --git a/src/regex/engine.cr b/src/regex/engine.cr
@@ -1,4 +1,11 @@
-require "./pcre"
+# Selects the regex engine at compile time.
+#
+# PCRE2 is chosen when the `use_pcre2` flag is set, or when neither `use_pcre`
+# nor `win32` is set and `pkg-config` is available and reports a version for
+# `libpcre2-8` (the shell snippet prints the literal `false` otherwise).
+# In every other case the legacy PCRE binding is used.
+{% if flag?(:use_pcre2) || (!flag?(:use_pcre) && !flag?(:win32) && `hash pkg-config 2> /dev/null && pkg-config --silence-errors --modversion libpcre2-8 || printf %s false` != "false") %}
+  require "./pcre2"
 
-# :nodoc:
-alias Regex::Engine = PCRE
+  # :nodoc:
+  alias Regex::Engine = PCRE2
+{% else %}
+  require "./pcre"
+
+  # :nodoc:
+  alias Regex::Engine = PCRE
+{% end %}
diff --git a/src/regex/lib_pcre2.cr b/src/regex/lib_pcre2.cr
@@ -0,0 +1,89 @@
+# Bindings to the 8-bit code unit build of the PCRE2 C library (`libpcre2-8`).
+# Every bound symbol carries the `_8` suffix of that build.
+@[Link("pcre2-8")]
+lib LibPCRE2
+  alias Int = LibC::Int
+
+  # All-ones `SizeT` value (mirrors `PCRE2_UNSET` from `pcre2.h`).
+  UNSET = ~LibC::SizeT.new(0)
+
+  # Option bits, passed through the `options` argument of `compile`/`match`.
+  # The values mirror the `PCRE2_*` option macros from `pcre2.h`.
+  ANCHORED     = 0x80000000
+  NO_UTF_CHECK = 0x40000000
+  ENDANCHORED  = 0x20000000
+
+  ALLOW_EMPTY_CLASS   = 0x00000001
+  ALT_BSUX            = 0x00000002
+  AUTO_CALLOUT        = 0x00000004
+  CASELESS            = 0x00000008
+  DOLLAR_ENDONLY      = 0x00000010
+  DOTALL              = 0x00000020
+  DUPNAMES            = 0x00000040
+  EXTENDED            = 0x00000080
+  FIRSTLINE           = 0x00000100
+  MATCH_UNSET_BACKREF = 0x00000200
+  MULTILINE           = 0x00000400
+  NEVER_UCP           = 0x00000800
+  NEVER_UTF           = 0x00001000
+  NO_AUTO_CAPTURE     = 0x00002000
+  NO_AUTO_POSSESS     = 0x00004000
+  NO_DOTSTAR_ANCHOR   = 0x00008000
+  NO_START_OPTIMIZE   = 0x00010000
+  UCP                 = 0x00020000
+  UNGREEDY            = 0x00040000
+  UTF                 = 0x00080000
+  NEVER_BACKSLASH_C   = 0x00100000
+  ALT_CIRCUMFLEX      = 0x00200000
+  ALT_VERBNAMES       = 0x00400000
+  USE_OFFSET_LIMIT    = 0x00800000
+  EXTENDED_MORE       = 0x01000000
+  LITERAL             = 0x02000000
+  MATCH_INVALID_UTF   = 0x04000000
+
+  # Negative return value of `match` meaning "the subject did not match".
+  ERROR_NOMATCH = -1
+
+  # Request codes for the `what` argument of `pattern_info`.
+  INFO_ALLOPTIONS     =  0
+  INFO_ARGOPTIONS     =  1
+  INFO_BACKREFMAX     =  2
+  INFO_BSR            =  3
+  INFO_CAPTURECOUNT   =  4
+  INFO_FIRSTCODEUNIT  =  5
+  INFO_FIRSTCODETYPE  =  6
+  INFO_FIRSTBITMAP    =  7
+  INFO_HASCRORLF      =  8
+  INFO_JCHANGED       =  9
+  INFO_JITSIZE        = 10
+  INFO_LASTCODEUNIT   = 11
+  INFO_LASTCODETYPE   = 12
+  INFO_MATCHEMPTY     = 13
+  INFO_MATCHLIMIT     = 14
+  INFO_MAXLOOKBEHIND  = 15
+  INFO_MINLENGTH      = 16
+  INFO_NAMECOUNT      = 17
+  INFO_NAMEENTRYSIZE  = 18
+  INFO_NAMETABLE      = 19
+  INFO_NEWLINE        = 20
+  INFO_DEPTHLIMIT     = 21
+  INFO_RECURSIONLIMIT = 21 # Obsolete synonym
+  INFO_SIZE           = 22
+  INFO_HASBACKSLASHC  = 23
+  INFO_FRAMESIZE      = 24
+  INFO_HEAPLIMIT      = 25
+  INFO_EXTRAOPTIONS   = 26
+
+  # Opaque handles; only ever used behind a pointer.
+  type Code = Void*
+  type CompileContext = Void*
+  type MatchData = Void*
+
+  fun get_error_message = pcre2_get_error_message_8(errorcode : Int, buffer : UInt8*, bufflen : LibC::SizeT) : Int
+
+  # NOTE: in the C API `errorcode` is an `int *` and `erroroffset` a
+  # `PCRE2_SIZE *`. Declaring them the other way round makes the library write
+  # a `size_t` into an `int`-sized slot.
+  fun compile = pcre2_compile_8(pattern : UInt8*, length : LibC::SizeT, options : UInt32, errorcode : Int*, erroroffset : LibC::SizeT*, ccontext : CompileContext*) : Code*
+  fun code_free = pcre2_code_free_8(code : Code*) : Void
+
+  fun pattern_info = pcre2_pattern_info_8(code : Code*, what : UInt32, where : Void*) : Int
+
+  fun match = pcre2_match_8(code : Code*, subject : UInt8*, length : LibC::SizeT, startoffset : LibC::SizeT, options : UInt32, match_data : MatchData*, mcontext : Void*) : Int
+  fun match_data_create_from_pattern = pcre2_match_data_create_from_pattern_8(code : Code*, gcontext : Void*) : MatchData*
+  fun match_data_free = pcre2_match_data_free_8(match_data : MatchData*) : Void
+
+  # `first` and `last` are out-parameters (`PCRE2_SPTR *` in C) that receive
+  # pointers into the name table, hence the double indirection.
+  fun substring_nametable_scan = pcre2_substring_nametable_scan_8(code : Code*, name : UInt8*, first : UInt8**, last : UInt8**) : Int
+
+  fun get_ovector_pointer = pcre2_get_ovector_pointer_8(match_data : MatchData*) : LibC::SizeT*
+  fun get_ovector_count = pcre2_get_ovector_count_8(match_data : MatchData*) : UInt32
+end
diff --git a/src/regex/pcre2.cr b/src/regex/pcre2.cr
@@ -0,0 +1,176 @@
+require "./lib_pcre2"
+
+# :nodoc:
+module Regex::PCRE2
+  @re : LibPCRE2::Code*
+
+  # :nodoc:
+  #
+  # Compiles *source* into `@re`. The caller's options are always combined
+  # with `UTF`, `NO_UTF_CHECK`, `DUPNAMES` and `UCP`.
+  #
+  # Raises `ArgumentError` carrying the compiler's message when the pattern
+  # does not compile.
+  def initialize(*, _source @source : String, _options @options)
+    compile_flags = pcre2_options(options) | LibPCRE2::UTF | LibPCRE2::NO_UTF_CHECK | LibPCRE2::DUPNAMES | LibPCRE2::UCP
+    @re = PCRE2.compile(source, compile_flags) { |error_message| raise ArgumentError.new(error_message) }
+  end
+
+  # Compiles *source* with the given PCRE2 *options* bit mask.
+  #
+  # Returns the compiled code pointer on success. On failure the block is
+  # yielded a string of the form `"<message> at <offset>"` and its value
+  # becomes the result.
+  protected def self.compile(source, options)
+    code = LibPCRE2.compile(source, source.bytesize, options, out errorcode, out erroroffset, nil)
+    return code if code
+
+    # Let PCRE2 render the error code into a fixed 256-byte buffer.
+    message = String.new(256) do |buffer|
+      {LibPCRE2.get_error_message(errorcode, buffer, 256), 0}
+    end
+    yield "#{message} at #{erroroffset}"
+  end
+
+  # Translates *options* into a `LibPCRE2` option bit mask by visiting each
+  # member yielded by `options.each` and OR-ing in its counterpart.
+  #
+  # Raises if a member has no mapping below.
+  private def pcre2_options(options)
+    bits = 0
+    options.each do |member|
+      mapped = if member.ignore_case?
+                 LibPCRE2::CASELESS
+               elsif member.multiline?
+                 # This member switches on both DOTALL and MULTILINE.
+                 LibPCRE2::DOTALL | LibPCRE2::MULTILINE
+               elsif member.extended?
+                 LibPCRE2::EXTENDED
+               elsif member.anchored?
+                 LibPCRE2::ANCHORED
+               elsif member.utf_8?
+                 LibPCRE2::UTF
+               elsif member.no_utf8_check?
+                 LibPCRE2::NO_UTF_CHECK
+               elsif member.dupnames?
+                 LibPCRE2::DUPNAMES
+               elsif member.ucp?
+                 LibPCRE2::UCP
+               else
+                 raise "unreachable"
+               end
+      bits |= mapped
+    end
+    bits
+  end
+
+  # Releases the compiled pattern held in `@re` through `LibPCRE2.code_free`.
+  # The call is compiled out when the `interpreted` flag is set.
+  def finalize
+    {% unless flag?(:interpreted) %}
+      LibPCRE2.code_free @re
+    {% end %}
+  end
+
+  # Checks whether *source* compiles with the default option set.
+  #
+  # Returns the compiler's error message when it does not, `nil` when it
+  # does. The throw-away compiled pattern is freed before returning.
+  protected def self.error_impl(source)
+    flags = LibPCRE2::UTF | LibPCRE2::NO_UTF_CHECK | LibPCRE2::DUPNAMES | LibPCRE2::UCP
+    compiled = PCRE2.compile(source, flags) { |error_message| return error_message }
+    LibPCRE2.code_free compiled
+    nil
+  end
+
+  # Queries a `UInt32`-sized piece of pattern information identified by
+  # *what* (one of the `LibPCRE2::INFO_*` codes) and returns it.
+  private def pattern_info(what)
+    # Filled in by the two-argument overload; no need to zero it first.
+    result = uninitialized UInt32
+    pattern_info(what, pointerof(result))
+    result
+  end
+
+  # Asks PCRE2 to store the information identified by *what* at *where*.
+  #
+  # Raises when `LibPCRE2.pattern_info` returns a non-zero status.
+  private def pattern_info(what, where)
+    ret = LibPCRE2.pattern_info(@re, what, where)
+    raise "error pattern_info #{what}: #{ret}" unless ret == 0
+  end
+
+  # Builds a mapping from capture group number to group name out of the
+  # entries yielded by `each_capture_group`.
+  private def name_table_impl
+    names = {} of Int32 => String
+    each_capture_group do |number, entry|
+      # The name is read as a NUL-terminated string starting two bytes into
+      # the entry.
+      names[number] = String.new(entry.to_unsafe + 2)
+    end
+    names
+  end
+
+  # :nodoc:
+  #
+  # Yields, for every entry of the pattern's name table, the capture group
+  # number (as `Int32`) and a `Slice` covering the raw entry.
+  #
+  # Each entry is `INFO_NAMEENTRYSIZE` bytes long; its first two bytes hold
+  # the group number, high byte first, and the name follows.
+  def each_capture_group
+    name_table = uninitialized UInt8*
+    pattern_info(LibPCRE2::INFO_NAMETABLE, pointerof(name_table))
+
+    name_entry_size = pattern_info(LibPCRE2::INFO_NAMEENTRYSIZE)
+
+    name_count = pattern_info(LibPCRE2::INFO_NAMECOUNT)
+    name_count.times do
+      # Widen before shifting: `UInt8 << 8` stays a `UInt8`, which would drop
+      # the high byte and hand callers a number that overflows in arithmetic.
+      capture_number = (name_table[0].to_i32 << 8) | name_table[1]
+
+      yield capture_number, Slice.new(name_table, name_entry_size)
+
+      name_table += name_entry_size
+    end
+  end
+
+  # Returns the value PCRE2 reports for `INFO_CAPTURECOUNT`, as an `Int32`.
+  private def capture_count_impl
+    count = pattern_info(LibPCRE2::INFO_CAPTURECOUNT)
+    count.to_i32
+  end
+
+  # Matches *str* starting at *byte_index* and wraps the result in a
+  # `::Regex::MatchData`. Returns `nil` when there is no match.
+  private def match_impl(str, byte_index, options)
+    match_data = match_data(str, byte_index, options) || return
+
+    ovector_count = LibPCRE2.get_ovector_count(match_data)
+    ovector_len = ovector_count.to_i32 * 2
+
+    # The ovector lives inside the native match data block and is released
+    # together with it, so take a GC-managed copy *before* freeing the block.
+    # Handing out the original pointer would leave `MatchData` reading freed
+    # memory.
+    ovector = Pointer(LibC::SizeT).malloc(ovector_len)
+    ovector.copy_from(LibPCRE2.get_ovector_pointer(match_data), ovector_len)
+    LibPCRE2.match_data_free(match_data)
+
+    ::Regex::MatchData.new(self, @re, str, byte_index, ovector, ovector_count.to_i32 - 1)
+  end
+
+  # Returns `true` when *str* matches from *byte_index* on, `false` otherwise.
+  # The native match data block is only needed for the yes/no answer and is
+  # freed straight away.
+  private def matches_impl(str, byte_index, options)
+    data = match_data(str, byte_index, options)
+    return false unless data
+
+    LibPCRE2.match_data_free(data)
+    true
+  end
+
+  # Runs the compiled pattern against *str* starting at *byte_index*.
+  #
+  # Returns the native match data block on success; the caller owns it and
+  # must release it with `LibPCRE2.match_data_free`. Returns `nil` when the
+  # subject does not match. Raises for any other negative result, naming the
+  # PCRE2 error code so the failure can be diagnosed.
+  private def match_data(str, byte_index, options)
+    match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
+    match_count = LibPCRE2.match(@re, str, str.bytesize, byte_index, pcre2_options(options) | LibPCRE2::NO_UTF_CHECK, match_data, nil)
+    return match_data if match_count >= 0
+
+    # Every failure path gives the block back before leaving.
+    LibPCRE2.match_data_free(match_data)
+    return if match_count == LibPCRE2::ERROR_NOMATCH
+
+    raise "error! PCRE2 match failed with code #{match_count}"
+  end
+
+  module MatchData
+    # :nodoc:
+    #
+    # *ovector* is typed as `LibC::SizeT*`, the type returned by
+    # `LibPCRE2.get_ovector_pointer`, so the signature also holds on targets
+    # where `size_t` is not 64 bits wide.
+    def initialize(@regex : Regex, @code : LibPCRE2::Code*, @string : String, @pos : Int32, @ovector : LibC::SizeT*, @group_size : Int32)
+    end
+
+    # Returns the exclusive byte range of group *n*; a negative *n* is first
+    # offset by `size`.
+    #
+    # When either bound is negative after the wrapping `to_i32!` conversion
+    # (presumably the all-ones marker PCRE2 leaves for a group that did not
+    # take part in the match), the block is yielded *n* and its value is
+    # returned instead.
+    private def byte_range(n, &)
+      n += size if n < 0
+      start = @ovector[n * 2].to_i32!
+      finish = @ovector[n * 2 + 1].to_i32!
+      return yield n if start < 0 || finish < 0
+
+      start...finish
+    end
+
+    # Returns the text captured by the group called *group_name*.
+    #
+    # Several groups may share a name; among those that took part in the
+    # match, the one starting furthest to the right wins. When no such group
+    # matched, the block is yielded `true` if the name exists in the pattern
+    # and `false` otherwise, and its value is returned.
+    private def fetch_impl(group_name : String)
+      selected_range = nil
+      exists = false
+      wanted = group_name.to_slice
+      @regex.each_capture_group do |number, name_entry|
+        # An entry is two bytes of group number followed by the NUL-terminated
+        # name. Require the terminator right after the compared bytes so that
+        # "foo" does not match a group called "foobar"; the `?` accessors keep
+        # an over-long *group_name* from raising `IndexError`.
+        next unless name_entry[2, wanted.size]? == wanted
+        next unless name_entry[2 + wanted.size]? == 0
+
+        exists = true
+        # Widen the group number so `byte_range` cannot overflow on `n * 2`.
+        range = byte_range(number.to_i32) { nil }
+        if !selected_range || (range && selected_range && range.begin > selected_range.begin)
+          selected_range = range
+        end
+      end
+
+      if selected_range
+        @string.byte_slice(selected_range.begin, selected_range.end - selected_range.begin)
+      else
+        yield exists
+      end
+    end
+  end
+end