From e2546e6ecade16b04c9ee528e5be8509fe16c2d6 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 1 Aug 2024 11:23:43 +0900 Subject: [PATCH] parse pi: improve invalid case detection --- lib/rexml/parsers/baseparser.rb | 35 +++++++++++++---------- test/parse/test_processing_instruction.rb | 35 +++++++++++++++++++++-- 2 files changed, 53 insertions(+), 17 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index b5df6dbc..44dc6580 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -124,11 +124,10 @@ class BaseParser } module Private - INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um - NAME_PATTERN = /\s*#{NAME}/um + NAME_PATTERN = /#{NAME}/um GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um @@ -242,7 +241,7 @@ def pull_event if @document_status == nil start_position = @source.position if @source.match("/um, true) @@ -442,7 +441,7 @@ def pull_event raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) elsif @source.match("?", true) - return process_instruction(start_position) + return process_instruction else # Get the next tag md = @source.match(Private::TAG_PATTERN, true) @@ -588,14 +587,14 @@ def need_source_encoding_update?(xml_declaration_encoding) def parse_name(base_error_message) md = @source.match(Private::NAME_PATTERN, true) unless md - if @source.match(/\s*\S/um) + if @source.match(/\S/um) message = "#{base_error_message}: invalid name" else message = "#{base_error_message}: name is missing" end raise REXML::ParseException.new(message, @source) end - md[1] + md[0] end def parse_id(base_error_message, @@ -664,18 +663,24 @@ def parse_id_invalid_details(accept_external_id:, end end - def process_instruction(start_position) - match_data = @source.match(Private::INSTRUCTION_END, true) - unless match_data - message = "Invalid processing instruction node" - @source.position = start_position - raise REXML::ParseException.new(message, @source) + def process_instruction + name = parse_name("Malformed XML: Invalid processing instruction node") + if @source.match(/\s+/um, true) + match_data = @source.match(/(.*?)\?>/um, true) + unless match_data + raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) + end + content = match_data[1] + else + content = nil + unless @source.match("?>", true) + raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) + end end - if match_data[1] == "xml" + if name == "xml" if @document_status raise ParseException.new("Malformed XML: XML declaration is not at the start", @source) end - content = match_data[2] version = VERSION.match(content) version = version[1] unless version.nil? encoding = ENCODING.match(content) @@ -690,7 +695,7 @@ def process_instruction(start_position) standalone = standalone[1] unless standalone.nil? return [ :xmldecl, version, encoding, standalone ] end - [:processing_instruction, match_data[1], match_data[2]] + [:processing_instruction, name, content] end def parse_attributes(prefixes, curr_ns) diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index 49cf23a5..fba79cea 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -17,11 +17,37 @@ def test_no_name parse("") end assert_equal(<<-DETAIL.chomp, exception.to_s) -Invalid processing instruction node +Malformed XML: Invalid processing instruction node: invalid name Line: 1 Position: 4 Last 80 unconsumed characters: - +?> + DETAIL + end + + def test_unclosed_content + exception = assert_raise(REXML::ParseException) do + parse("") + assert_equal("con?tent", document.root.children.first.content) + end + def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n|