Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Regex engine on PCRE2 #12840

Merged
merged 4 commits into from
Dec 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 15 additions & 5 deletions spec/std/regex_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -200,12 +200,16 @@ describe "Regex" do
/foo/.matches?("foo", options: Regex::Options::ANCHORED).should be_true
end

it "matches a large single line string" do
LibPCRE.config LibPCRE::CONFIG_JIT, out jit_enabled
pending! "PCRE JIT mode not available." unless 1 == jit_enabled
it "doesn't crash with a large single line string" do
{% if Regex::Engine.resolve.name == "Regex::PCRE" %}
LibPCRE.config LibPCRE::CONFIG_JIT, out jit_enabled
pending! "PCRE JIT mode not available." unless 1 == jit_enabled
{% end %}

str = File.read(datapath("large_single_line_string.txt"))
str.matches?(/^(?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=)?$/).should be_false
str.matches?(/^(?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=)?$/)
# We don't care whether this actually matches or not, it's just to make
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A small problem is that the name of the test is wrong. It is named "matches a large single line string", but the string was not expected to match. A better name could be "doesn't crash with a...".

# sure the engine does not stack overflow with a large string.
end
end

Expand Down Expand Up @@ -422,6 +426,12 @@ describe "Regex" do

it ".error?" do
Regex.error?("(foo|bar)").should be_nil
Regex.error?("(foo|bar").should eq "missing ) at 8"
Regex.error?("(foo|bar").should eq(
if Regex::Engine.to_s == "Regex::PCRE2"
"missing closing parenthesis at 8"
else
"missing ) at 8"
end
)
end
end
13 changes: 10 additions & 3 deletions src/regex/engine.cr
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
require "./pcre"
{% if flag?(:use_pcre2) || (!flag?(:use_pcre) && !flag?(:win32) && `hash pkg-config 2> /dev/null && pkg-config --silence-errors --modversion libpcre2-8 || printf %s false` != "false") %}
require "./pcre2"

# :nodoc:
alias Regex::Engine = PCRE
# :nodoc:
alias Regex::Engine = PCRE2
{% else %}
require "./pcre"

# :nodoc:
alias Regex::Engine = PCRE
{% end %}
89 changes: 89 additions & 0 deletions src/regex/lib_pcre2.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Bindings for the 8-bit code unit flavour of the PCRE2 library (`libpcre2-8`).
#
# Only the subset of constants and functions used by the regex engine is
# declared here. Function signatures mirror the C prototypes from `pcre2.h`.
@[Link("pcre2-8")]
lib LibPCRE2
  alias Int = LibC::Int

  # All bits set in a `size_t`.
  UNSET = ~LibC::SizeT.new(0)

  # Option bits accepted by both `compile` and `match`.
  ANCHORED     = 0x80000000
  NO_UTF_CHECK = 0x40000000
  ENDANCHORED  = 0x20000000

  # Option bits accepted by `compile`.
  ALLOW_EMPTY_CLASS   = 0x00000001
  ALT_BSUX            = 0x00000002
  AUTO_CALLOUT        = 0x00000004
  CASELESS            = 0x00000008
  DOLLAR_ENDONLY      = 0x00000010
  DOTALL              = 0x00000020
  DUPNAMES            = 0x00000040
  EXTENDED            = 0x00000080
  FIRSTLINE           = 0x00000100
  MATCH_UNSET_BACKREF = 0x00000200
  MULTILINE           = 0x00000400
  NEVER_UCP           = 0x00000800
  NEVER_UTF           = 0x00001000
  NO_AUTO_CAPTURE     = 0x00002000
  NO_AUTO_POSSESS     = 0x00004000
  NO_DOTSTAR_ANCHOR   = 0x00008000
  NO_START_OPTIMIZE   = 0x00010000
  UCP                 = 0x00020000
  UNGREEDY            = 0x00040000
  UTF                 = 0x00080000
  NEVER_BACKSLASH_C   = 0x00100000
  ALT_CIRCUMFLEX      = 0x00200000
  ALT_VERBNAMES       = 0x00400000
  USE_OFFSET_LIMIT    = 0x00800000
  EXTENDED_MORE       = 0x01000000
  LITERAL             = 0x02000000
  MATCH_INVALID_UTF   = 0x04000000

  # Return value of `match` when the subject does not match the pattern.
  ERROR_NOMATCH = -1

  # Request codes for the `what` argument of `pattern_info`.
  INFO_ALLOPTIONS     =  0
  INFO_ARGOPTIONS     =  1
  INFO_BACKREFMAX     =  2
  INFO_BSR            =  3
  INFO_CAPTURECOUNT   =  4
  INFO_FIRSTCODEUNIT  =  5
  INFO_FIRSTCODETYPE  =  6
  INFO_FIRSTBITMAP    =  7
  INFO_HASCRORLF      =  8
  INFO_JCHANGED       =  9
  INFO_JITSIZE        = 10
  INFO_LASTCODEUNIT   = 11
  INFO_LASTCODETYPE   = 12
  INFO_MATCHEMPTY     = 13
  INFO_MATCHLIMIT     = 14
  INFO_MAXLOOKBEHIND  = 15
  INFO_MINLENGTH      = 16
  INFO_NAMECOUNT      = 17
  INFO_NAMEENTRYSIZE  = 18
  INFO_NAMETABLE      = 19
  INFO_NEWLINE        = 20
  INFO_DEPTHLIMIT     = 21
  INFO_RECURSIONLIMIT = 21 # Obsolete synonym
  INFO_SIZE           = 22
  INFO_HASBACKSLASHC  = 23
  INFO_FRAMESIZE      = 24
  INFO_HEAPLIMIT      = 25
  INFO_EXTRAOPTIONS   = 26

  # Opaque handles; they are only ever passed around by pointer.
  type Code = Void*
  type CompileContext = Void*
  type MatchData = Void*

  fun get_error_message = pcre2_get_error_message_8(errorcode : Int, buffer : UInt8*, bufflen : LibC::SizeT) : Int

  # The C prototype is
  # `pcre2_compile(PCRE2_SPTR, PCRE2_SIZE, uint32_t, int *errorcode, PCRE2_SIZE *erroroffset, pcre2_compile_context *)`:
  # the error code is a plain `int` and the error offset is a `size_t`.
  fun compile = pcre2_compile_8(pattern : UInt8*, length : LibC::SizeT, options : UInt32, errorcode : Int*, erroroffset : LibC::SizeT*, ccontext : CompileContext*) : Code*
  fun code_free = pcre2_code_free_8(code : Code*) : Void

  fun pattern_info = pcre2_pattern_info_8(code : Code*, what : UInt32, where : Void*) : Int

  fun match = pcre2_match_8(code : Code*, subject : UInt8*, length : LibC::SizeT, startoffset : LibC::SizeT, options : UInt32, match_data : MatchData*, mcontext : Void*) : Int
  fun match_data_create_from_pattern = pcre2_match_data_create_from_pattern_8(code : Code*, gcontext : Void*) : MatchData*
  fun match_data_free = pcre2_match_data_free_8(match_data : MatchData*) : Void

  # `first` and `last` are output parameters of type `PCRE2_SPTR *`, i.e.
  # pointers to pointers into the name table.
  fun substring_nametable_scan = pcre2_substring_nametable_scan_8(code : Code*, name : UInt8*, first : UInt8**, last : UInt8**) : Int

  fun get_ovector_pointer = pcre2_get_ovector_pointer_8(match_data : MatchData*) : LibC::SizeT*
  fun get_ovector_count = pcre2_get_ovector_count_8(match_data : MatchData*) : UInt32
end
176 changes: 176 additions & 0 deletions src/regex/pcre2.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
require "./lib_pcre2"

# :nodoc:
module Regex::PCRE2
  # Regex engine implementation on top of `LibPCRE2`.
  #
  # `@re` holds the compiled pattern returned by `LibPCRE2.compile`; it is
  # released again in `#finalize`.
  @re : LibPCRE2::Code*

  # :nodoc:
  def initialize(*, _source @source : String, _options @options)
    @re = PCRE2.compile(source, pcre2_options(options) | LibPCRE2::UTF | LibPCRE2::NO_UTF_CHECK | LibPCRE2::DUPNAMES | LibPCRE2::UCP) do |error_message|
      raise ArgumentError.new(error_message)
    end
  end

  # Compiles *source* with the given PCRE2 *options*.
  #
  # Returns the compiled code on success. On failure, yields a message of the
  # form `"<pcre2 error text> at <offset>"` and returns the block's value.
  protected def self.compile(source, options)
    if res = LibPCRE2.compile(source, source.bytesize, options, out errorcode, out erroroffset, nil)
      res
    else
      message = get_error_message(errorcode)
      yield "#{message} at #{erroroffset}"
    end
  end

  # Returns the textual description PCRE2 provides for *errorcode*.
  protected def self.get_error_message(errorcode)
    String.new(256) do |buffer|
      bytesize = LibPCRE2.get_error_message(errorcode, buffer, 256)
      {bytesize, 0}
    end
  end

  # Translates `Regex::Options` flags into the corresponding PCRE2 option bits.
  private def pcre2_options(options)
    flag = 0
    options.each do |option|
      flag |= case option
              when .ignore_case?   then LibPCRE2::CASELESS
              when .multiline?     then LibPCRE2::DOTALL | LibPCRE2::MULTILINE
              when .extended?      then LibPCRE2::EXTENDED
              when .anchored?      then LibPCRE2::ANCHORED
              when .utf_8?         then LibPCRE2::UTF
              when .no_utf8_check? then LibPCRE2::NO_UTF_CHECK
              when .dupnames?      then LibPCRE2::DUPNAMES
              when .ucp?           then LibPCRE2::UCP
              else
                raise "unreachable"
              end
    end
    flag
  end

  # Releases the compiled pattern (skipped when running interpreted).
  def finalize
    {% unless flag?(:interpreted) %}
      LibPCRE2.code_free @re
    {% end %}
  end

  # Returns the compile error message for *source*, or `nil` if it compiles.
  protected def self.error_impl(source)
    code = PCRE2.compile(source, LibPCRE2::UTF | LibPCRE2::NO_UTF_CHECK | LibPCRE2::DUPNAMES | LibPCRE2::UCP) do |error_message|
      return error_message
    end

    # Compilation succeeded; the code was only needed for validation.
    LibPCRE2.code_free code

    nil
  end

  # Queries a `UInt32`-sized piece of pattern information.
  private def pattern_info(what)
    value = uninitialized UInt32
    pattern_info(what, pointerof(value))
    value
  end

  # Queries pattern information *what* into the memory at *where*.
  #
  # Raises if `LibPCRE2.pattern_info` reports a non-zero status.
  private def pattern_info(what, where)
    ret = LibPCRE2.pattern_info(@re, what, where)
    if ret != 0
      raise "error pattern_info #{what}: #{ret}"
    end
  end

  # Builds a lookup table from capture group number to capture group name.
  private def name_table_impl
    lookup = Hash(Int32, String).new

    each_capture_group do |capture_number, name_entry|
      # The name starts after the two bytes holding the group number and is
      # NUL-terminated.
      lookup[capture_number] = String.new(name_entry.to_unsafe + 2)
    end

    lookup
  end

  # :nodoc:
  #
  # Yields the capture number and the raw name table entry of every named
  # capture group. Each entry starts with the group number stored in two
  # bytes (most significant first), followed by the name.
  def each_capture_group
    name_table = uninitialized UInt8*
    pattern_info(LibPCRE2::INFO_NAMETABLE, pointerof(name_table))

    name_entry_size = pattern_info(LibPCRE2::INFO_NAMEENTRYSIZE)

    name_count = pattern_info(LibPCRE2::INFO_NAMECOUNT)
    name_count.times do
      # Widen before shifting: `UInt8 << 8` is always 0, which would drop the
      # high byte of group numbers above 255.
      capture_number = (name_table[0].to_i32 << 8) | name_table[1]

      yield capture_number, Slice.new(name_table, name_entry_size)

      name_table += name_entry_size
    end
  end

  private def capture_count_impl
    pattern_info(LibPCRE2::INFO_CAPTURECOUNT).to_i32
  end

  # Matches *str* starting at *byte_index* and returns a `Regex::MatchData`,
  # or `nil` if there is no match.
  private def match_impl(str, byte_index, options)
    match_data = match_data(str, byte_index, options) || return

    ovector_count = LibPCRE2.get_ovector_count(match_data)
    # The ovector is owned by the match data block. Copy it before the block
    # is freed, otherwise the resulting `MatchData` would read freed memory.
    ovector = Slice.new(LibPCRE2.get_ovector_pointer(match_data), ovector_count.to_i32 * 2).dup
    LibPCRE2.match_data_free(match_data)

    ::Regex::MatchData.new(self, @re, str, byte_index, ovector.to_unsafe, ovector_count.to_i32 - 1)
  end

  # Returns whether *str* matches starting at *byte_index*.
  private def matches_impl(str, byte_index, options)
    if match_data = match_data(str, byte_index, options)
      LibPCRE2.match_data_free(match_data)
      true
    else
      false
    end
  end

  # Runs the match and returns the PCRE2 match data block, or `nil` when the
  # subject does not match. The caller is responsible for freeing a returned
  # block with `LibPCRE2.match_data_free`.
  #
  # Raises for any error other than `LibPCRE2::ERROR_NOMATCH`.
  private def match_data(str, byte_index, options)
    match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
    match_count = LibPCRE2.match(@re, str, str.bytesize, byte_index, pcre2_options(options) | LibPCRE2::NO_UTF_CHECK, match_data, nil)

    if match_count < 0
      LibPCRE2.match_data_free(match_data)
      case match_count
      when LibPCRE2::ERROR_NOMATCH
        return
      else
        raise "Regex match error: #{PCRE2.get_error_message(match_count)}"
      end
    end

    match_data
  end

  module MatchData
    # :nodoc:
    #
    # *ovector* uses the element type returned by
    # `LibPCRE2.get_ovector_pointer` (`size_t`), whose width depends on the
    # target platform.
    def initialize(@regex : Regex, @code : LibPCRE2::Code*, @string : String, @pos : Int32, @ovector : LibC::SizeT*, @group_size : Int32)
    end

    # Returns the byte range of group *n* (negative *n* counts from the end).
    # Yields *n* when either offset is negative after truncation to `Int32`.
    private def byte_range(n, &)
      n += size if n < 0
      range = Range.new(@ovector[n * 2].to_i32!, @ovector[n * 2 + 1].to_i32!, exclusive: true)
      if range.begin < 0 || range.end < 0
        yield n
      else
        range
      end
    end

    # Returns the substring captured by the group named *group_name*.
    #
    # When several groups share the name, the one with a range that starts
    # last wins. If no group yields a range, the block is called with a flag
    # telling whether a group of that name exists at all.
    private def fetch_impl(group_name : String)
      selected_range = nil
      exists = false
      @regex.each_capture_group do |number, name_entry|
        # Require the NUL terminator right after the name so that "foo" does
        # not match an entry for "foobar". The nilable accessors keep an
        # over-long *group_name* from raising `IndexError`.
        if name_entry[2, group_name.bytesize]? == group_name.to_slice &&
           name_entry[2 + group_name.bytesize]? == 0
          exists = true
          range = byte_range(number) { nil }
          if (range && selected_range && range.begin > selected_range.begin) || !selected_range
            selected_range = range
          end
        end
      end

      if selected_range
        @string.byte_slice(selected_range.begin, selected_range.end - selected_range.begin)
      else
        yield exists
      end
    end
  end
end