From 2a7f8a4767bd3b2771187fc2e3e4f2f8b6296914 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 23 Jun 2024 21:08:03 +0900 Subject: [PATCH] Optimize BaseParser#unnormalize method ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.3/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 17.704 18.106 34.215 33.806 i/s - 100.000 times in 5.648398s 5.523110s 2.922698s 2.958036s sax 25.664 25.302 48.429 48.602 i/s - 100.000 times in 3.896488s 3.952289s 2.064859s 2.057537s pull 28.966 29.215 61.710 62.068 i/s - 100.000 times in 3.452275s 3.422901s 1.620480s 1.611129s stream 28.291 28.426 53.860 55.548 i/s - 100.000 times in 3.534716s 3.517884s 1.856667s 1.800247s Comparison: dom before(YJIT): 34.2 i/s after(YJIT): 33.8 i/s - 1.01x slower after: 18.1 i/s - 1.89x slower before: 17.7 i/s - 1.93x slower sax after(YJIT): 48.6 i/s before(YJIT): 48.4 i/s - 1.00x slower before: 25.7 i/s - 1.89x slower after: 25.3 i/s - 1.92x slower pull after(YJIT): 62.1 i/s before(YJIT): 61.7 i/s - 1.01x slower after: 29.2 i/s - 2.12x slower before: 29.0 i/s - 2.14x slower stream after(YJIT): 55.5 i/s before(YJIT): 53.9 i/s - 1.03x slower after: 28.4 i/s - 1.95x slower before: 28.3 i/s - 1.96x slower ``` - YJIT=ON : 1.00x - 1.03x faster - YJIT=OFF : 0.98x - 1.02x faster --- lib/rexml/parsers/baseparser.rb | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 2f068e0c..275372ee 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -132,6 +132,13 @@ module Private GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um + 
CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/ + CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ + DEFAULT_ENTITIES_PATTERNS = {} + default_entities = ['gt', 'lt', 'quot', 'apos', 'amp'] + default_entities.each do |term| + DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/ + end end private_constant :Private @@ -504,10 +511,10 @@ def normalize( input, entities=nil, entity_filter=nil ) # Unescapes all possible entities def unnormalize( string, entities=nil, filter=nil ) - rv = string.gsub( /\r\n?/, "\n" ) + rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" ) matches = rv.scan( REFERENCE_RE ) return rv if matches.size == 0 - rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) { + rv.gsub!( Private::CHARACTER_REFERENCES ) { m=$1 m = "0#{m}" if m[0] == ?x [Integer(m)].pack('U*') @@ -518,7 +525,7 @@ def unnormalize( string, entities=nil, filter=nil ) unless filter and filter.include?(entity_reference) entity_value = entity( entity_reference, entities ) if entity_value - re = /&#{entity_reference};/ + re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/ rv.gsub!( re, entity_value ) else er = DEFAULT_ENTITIES[entity_reference] @@ -526,7 +533,7 @@ def unnormalize( string, entities=nil, filter=nil ) end end end - rv.gsub!( /&amp;/, '&' ) + rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' ) end rv end