Skip to content

Commit

Permalink
Add IOSource#match? method (#216)
Browse files Browse the repository at this point in the history
## Why?
`StringScanner#match?` is faster than `StringScanner#check`.

See: ruby/strscan#111

## Benchmark
```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.4/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.4 (2024-07-09 revision be1089c8ec) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom     18.819      19.362        32.846       34.708 i/s -     100.000 times in 5.313905s 5.164791s 3.044500s 2.881200s
                 sax     28.188      29.982        48.386       52.554 i/s -     100.000 times in 3.547597s 3.335304s 2.066732s 1.902809s
                pull     31.962      33.902        57.868       60.662 i/s -     100.000 times in 3.128689s 2.949690s 1.728071s 1.648467s
              stream     31.436      33.030        52.808       56.647 i/s -     100.000 times in 3.181095s 3.027574s 1.893635s 1.765304s

Comparison:
                              dom
         after(YJIT):        34.7 i/s
        before(YJIT):        32.8 i/s - 1.06x  slower
               after:        19.4 i/s - 1.79x  slower
              before:        18.8 i/s - 1.84x  slower

                              sax
         after(YJIT):        52.6 i/s
        before(YJIT):        48.4 i/s - 1.09x  slower
               after:        30.0 i/s - 1.75x  slower
              before:        28.2 i/s - 1.86x  slower

                             pull
         after(YJIT):        60.7 i/s
        before(YJIT):        57.9 i/s - 1.05x  slower
               after:        33.9 i/s - 1.79x  slower
              before:        32.0 i/s - 1.90x  slower

                           stream
         after(YJIT):        56.6 i/s
        before(YJIT):        52.8 i/s - 1.07x  slower
               after:        33.0 i/s - 1.72x  slower
              before:        31.4 i/s - 1.80x  slower

```

- YJIT=ON : 1.05x - 1.09x faster
- YJIT=OFF : 1.02x - 1.06x faster

---------

Co-authored-by: Sutou Kouhei <[email protected]>
  • Loading branch information
naitoh and kou authored Oct 27, 2024
1 parent 6a8c041 commit 8ef7502
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 42 deletions.
84 changes: 42 additions & 42 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -269,10 +269,10 @@ def pull_event
@source.ensure_buffer
if @document_status == nil
start_position = @source.position
if @source.match("<?", true)
if @source.match?("<?", true)
return process_instruction
elsif @source.match("<!", true)
if @source.match("--", true)
elsif @source.match?("<!", true)
if @source.match?("--", true)
md = @source.match(/(.*?)-->/um, true)
if md.nil?
raise REXML::ParseException.new("Unclosed comment", @source)
Expand All @@ -281,10 +281,10 @@ def pull_event
raise REXML::ParseException.new("Malformed comment", @source)
end
return [ :comment, md[1] ]
elsif @source.match("DOCTYPE", true)
elsif @source.match?("DOCTYPE", true)
base_error_message = "Malformed DOCTYPE"
unless @source.match(/\s+/um, true)
if @source.match(">")
unless @source.match?(/\s+/um, true)
if @source.match?(">")
message = "#{base_error_message}: name is missing"
else
message = "#{base_error_message}: invalid name"
Expand All @@ -293,10 +293,10 @@ def pull_event
raise REXML::ParseException.new(message, @source)
end
name = parse_name(base_error_message)
if @source.match(/\s*\[/um, true)
if @source.match?(/\s*\[/um, true)
id = [nil, nil, nil]
@document_status = :in_doctype
elsif @source.match(/\s*>/um, true)
elsif @source.match?(/\s*>/um, true)
id = [nil, nil, nil]
@document_status = :after_doctype
@source.ensure_buffer
Expand All @@ -308,9 +308,9 @@ def pull_event
# For backward compatibility
id[1], id[2] = id[2], nil
end
if @source.match(/\s*\[/um, true)
if @source.match?(/\s*\[/um, true)
@document_status = :in_doctype
elsif @source.match(/\s*>/um, true)
elsif @source.match?(/\s*>/um, true)
@document_status = :after_doctype
@source.ensure_buffer
else
Expand All @@ -320,7 +320,7 @@ def pull_event
end
args = [:start_doctype, name, *id]
if @document_status == :after_doctype
@source.match(/\s*/um, true)
@source.match?(/\s*/um, true)
@stack << [ :end_doctype ]
end
return args
Expand All @@ -331,14 +331,14 @@ def pull_event
end
end
if @document_status == :in_doctype
@source.match(/\s*/um, true) # skip spaces
@source.match?(/\s*/um, true) # skip spaces
start_position = @source.position
if @source.match("<!", true)
if @source.match("ELEMENT", true)
if @source.match?("<!", true)
if @source.match?("ELEMENT", true)
md = @source.match(/(.*?)>/um, true)
raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
return [ :elementdecl, "<!ELEMENT" + md[1] ]
elsif @source.match("ENTITY", true)
elsif @source.match?("ENTITY", true)
match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
unless match_data
raise REXML::ParseException.new("Malformed entity declaration", @source)
Expand Down Expand Up @@ -370,7 +370,7 @@ def pull_event
end
match << '%' if ref
return match
elsif @source.match("ATTLIST", true)
elsif @source.match?("ATTLIST", true)
md = @source.match(Private::ATTLISTDECL_END, true)
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
element = md[1]
Expand All @@ -390,10 +390,10 @@ def pull_event
end
end
return [ :attlistdecl, element, pairs, contents ]
elsif @source.match("NOTATION", true)
elsif @source.match?("NOTATION", true)
base_error_message = "Malformed notation declaration"
unless @source.match(/\s+/um, true)
if @source.match(">")
unless @source.match?(/\s+/um, true)
if @source.match?(">")
message = "#{base_error_message}: name is missing"
else
message = "#{base_error_message}: invalid name"
Expand All @@ -405,7 +405,7 @@ def pull_event
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: true)
unless @source.match(/\s*>/um, true)
unless @source.match?(/\s*>/um, true)
message = "#{base_error_message}: garbage before end >"
raise REXML::ParseException.new(message, @source)
end
Expand All @@ -419,7 +419,7 @@ def pull_event
end
elsif match = @source.match(/(%.*?;)\s*/um, true)
return [ :externalentity, match[1] ]
elsif @source.match(/\]\s*>/um, true)
elsif @source.match?(/\]\s*>/um, true)
@document_status = :after_doctype
return [ :end_doctype ]
end
Expand All @@ -428,16 +428,16 @@ def pull_event
end
end
if @document_status == :after_doctype
@source.match(/\s*/um, true)
@source.match?(/\s*/um, true)
end
begin
start_position = @source.position
if @source.match("<", true)
if @source.match?("<", true)
# :text's read_until may remain only "<" in buffer. In the
# case, buffer is empty here. So we need to fill buffer
# here explicitly.
@source.ensure_buffer
if @source.match("/", true)
if @source.match?("/", true)
@namespaces_restore_stack.pop
last_tag = @tags.pop
md = @source.match(Private::CLOSE_PATTERN, true)
Expand All @@ -452,7 +452,7 @@ def pull_event
raise REXML::ParseException.new(message, @source)
end
return [ :end_element, last_tag ]
elsif @source.match("!", true)
elsif @source.match?("!", true)
md = @source.match(/([^>]*>)/um)
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
raise REXML::ParseException.new("Malformed node", @source) unless md
Expand All @@ -470,7 +470,7 @@ def pull_event
end
raise REXML::ParseException.new( "Declarations can only occur "+
"in the doctype declaration.", @source)
elsif @source.match("?", true)
elsif @source.match?("?", true)
return process_instruction
else
# Get the next tag
Expand Down Expand Up @@ -651,7 +651,7 @@ def need_source_encoding_update?(xml_declaration_encoding)
def parse_name(base_error_message)
md = @source.match(Private::NAME_PATTERN, true)
unless md
if @source.match(/\S/um)
if @source.match?(/\S/um)
message = "#{base_error_message}: invalid name"
else
message = "#{base_error_message}: name is missing"
Expand Down Expand Up @@ -693,34 +693,34 @@ def parse_id_invalid_details(accept_external_id:,
accept_public_id:)
public = /\A\s*PUBLIC/um
system = /\A\s*SYSTEM/um
if (accept_external_id or accept_public_id) and @source.match(/#{public}/um)
if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um)
if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
return "public ID literal is missing"
end
unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um)
return "invalid public ID literal"
end
if accept_public_id
if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
return "system ID literal is missing"
end
unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
return "invalid system literal"
end
"garbage after system literal"
else
"garbage after public ID literal"
end
elsif accept_external_id and @source.match(/#{system}/um)
if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
elsif accept_external_id and @source.match?(/#{system}/um)
if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
return "system literal is missing"
end
unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um)
return "invalid system literal"
end
"garbage after system literal"
else
unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
return "invalid ID type"
end
"ID type is missing"
Expand All @@ -729,15 +729,15 @@ def parse_id_invalid_details(accept_external_id:,

def process_instruction
name = parse_name("Malformed XML: Invalid processing instruction node")
if @source.match(/\s+/um, true)
if @source.match?(/\s+/um, true)
match_data = @source.match(/(.*?)\?>/um, true)
unless match_data
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
end
content = match_data[1]
else
content = nil
unless @source.match("?>", true)
unless @source.match?("?>", true)
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
end
end
Expand Down Expand Up @@ -767,17 +767,17 @@ def parse_attributes(prefixes)
expanded_names = {}
closed = false
while true
if @source.match(">", true)
if @source.match?(">", true)
return attributes, closed
elsif @source.match("/>", true)
elsif @source.match?("/>", true)
closed = true
return attributes, closed
elsif match = @source.match(QNAME, true)
name = match[1]
prefix = match[2]
local_part = match[3]

unless @source.match(/\s*=\s*/um, true)
unless @source.match?(/\s*=\s*/um, true)
message = "Missing attribute equal: <#{name}>"
raise REXML::ParseException.new(message, @source)
end
Expand All @@ -793,7 +793,7 @@ def parse_attributes(prefixes)
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
raise REXML::ParseException.new(message, @source)
end
@source.match(/\s*/um, true)
@source.match?(/\s*/um, true)
if prefix == "xmlns"
if local_part == "xml"
if value != Private::XML_PREFIXED_NAMESPACE
Expand Down
35 changes: 35 additions & 0 deletions lib/rexml/source.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,16 @@ def scan(pattern)
pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
super(pattern)
end

def match?(pattern)
pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
super(pattern)
end

def skip(pattern)
pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
super(pattern)
end
end
end
using StringScannerCheckScanString
Expand Down Expand Up @@ -126,6 +136,14 @@ def match(pattern, cons=false)
end
end

def match?(pattern, cons=false)
if cons
!@scanner.skip(pattern).nil?
else
!@scanner.match?(pattern).nil?
end
end

def position
@scanner.pos
end
Expand Down Expand Up @@ -267,6 +285,23 @@ def match( pattern, cons=false )
md.nil? ? nil : @scanner
end

def match?( pattern, cons=false )
# To avoid performance issue, we need to increase bytes to read per scan
min_bytes = 1
while true
if cons
n_matched_bytes = @scanner.skip(pattern)
else
n_matched_bytes = @scanner.match?(pattern)
end
return true if n_matched_bytes
return false if pattern.is_a?(String)
return false if @source.nil?
return false unless read(nil, min_bytes)
min_bytes *= 2
end
end

def empty?
super and ( @source.nil? || @source.eof? )
end
Expand Down

0 comments on commit 8ef7502

Please sign in to comment.