Skip to content

Commit

Permalink
fix: Extra content at the end of the document
Browse files Browse the repository at this point in the history
## Why?

XML with additional content after the root element is not well-formed: only comments, processing instructions, and whitespace (`Misc*`) may follow it.

https://www.w3.org/TR/2006/REC-xml11-20060816/#document

```
[1]   document   ::=   ( prolog element Misc* ) - ( Char* RestrictedChar Char* )
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc

```
[27]   	Misc	   ::=   	Comment | PI | S
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI

```
[16]   	PI	   ::=   	'<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget

```
[17]   	PITarget	   ::=   	Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
```
  • Loading branch information
naitoh committed Jul 4, 2024
1 parent c4fb89b commit 641d9d1
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 0 deletions.
13 changes: 13 additions & 0 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ def initialize( source )
self.stream = source
@listeners = []
@prefixes = Set.new
@root_tag = nil
end

def add_listener( listener )
Expand Down Expand Up @@ -460,15 +461,24 @@ def pull_event
@closed = tag
@nsstack.shift
else
if @tags.size.zero? && !@root_tag.nil?
raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
end
@tags.push( tag )
end
@root_tag ||= tag
return [ :start_element, tag, attributes ]
end
else
text = @source.read_until("<")
if text.chomp!("<")
@source.position -= "<".bytesize
end
if @tags.size.zero? && !@root_tag.nil?
if text.strip != ""
raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
end
end
return [ :text, text ]
end
rescue REXML::UndefinedNamespaceException
Expand Down Expand Up @@ -635,6 +645,9 @@ def process_instruction(start_position)
@source.position = start_position
raise REXML::ParseException.new(message, @source)
end
if @tags.size.zero? && !@root_tag.nil? && match_data[1] == "xml"
raise ParseException.new("Malformed XML: Extra XML declaration at the end of the document (got '<?#{match_data[1]}')", @source)
end
if @document_status.nil? and match_data[1] == "xml"
content = match_data[2]
version = VERSION.match(content)
Expand Down
75 changes: 75 additions & 0 deletions test/parser/test_base_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,80 @@ def test_large_xml
parser.position < xml.bytesize
end
end

# A second start tag after the root element has been closed must be
# rejected: draining the parser is expected to raise a ParseException whose
# text reports the extra tag together with the line and position.
def test_parse_exception_for_multiple_root_elements
  # The heredoc body is kept flush-left because `<<-` preserves leading
  # whitespace in the content (only the terminator may be indented).
  expected_detail = <<-DETAIL.chomp
Malformed XML: Extra tag at the end of the document (got '<b')
Line: 1
Position: 10
Last 80 unconsumed characters:
  DETAIL

  error = assert_raise(REXML::ParseException) do
    parser = REXML::Parsers::BaseParser.new('<a></a><b>')
    parser.pull while parser.has_next?
  end

  assert_equal(expected_detail, error.to_s)
end

# Non-whitespace character data after the closed root element must be
# rejected: draining the parser is expected to raise a ParseException whose
# text quotes the offending content together with the line and position.
def test_parse_exception_for_extra_content_at_the_end_of_the_document
  # The heredoc body is kept flush-left because `<<-` preserves leading
  # whitespace in the content (only the terminator may be indented).
  expected_detail = <<-DETAIL.chomp
Malformed XML: Extra content at the end of the document (got 'c')
Line: 1
Position: 8
Last 80 unconsumed characters:
  DETAIL

  error = assert_raise(REXML::ParseException) do
    parser = REXML::Parsers::BaseParser.new('<a></a>c')
    parser.pull while parser.has_next?
  end

  assert_equal(expected_detail, error.to_s)
end

# An XML declaration (`<?xml ... ?>`) after the closed root element must be
# rejected: draining the parser is expected to raise a ParseException whose
# text reports the extra declaration together with the line and position.
def test_parse_exception_for_extra_xml_declaration_at_the_end_of_the_document
  # The heredoc body is kept flush-left because `<<-` preserves leading
  # whitespace in the content (only the terminator may be indented).
  expected_detail = <<-DETAIL.chomp
Malformed XML: Extra XML declaration at the end of the document (got '<?xml')
Line: 1
Position: 29
Last 80 unconsumed characters:
  DETAIL

  error = assert_raise(REXML::ParseException) do
    parser = REXML::Parsers::BaseParser.new('<a></a><?xml version="1.0" ?>')
    parser.pull while parser.has_next?
  end

  assert_equal(expected_detail, error.to_s)
end

# A processing instruction whose target is not "xml" is allowed after the
# closed root element: the parser should run to completion and report it as
# a :processing_instruction event carrying the target name.
def test_extra_processing_instruction_node_at_the_end_of_the_document
  parser = REXML::Parsers::BaseParser.new('<a></a><?abc version="1.0" ?>')

  # Remember the most recent first payload seen for each event type.
  latest_by_type = {}
  while parser.has_next?
    type, payload = parser.pull
    latest_by_type[type] = payload
  end

  assert_equal("abc", latest_by_type[:processing_instruction])
end

# A comment is allowed after the closed root element: the parser should run
# to completion and report it as a :comment event carrying the comment text.
def test_extra_comments_at_the_end_of_the_document
  parser = REXML::Parsers::BaseParser.new('<a></a><!-- ok comment -->')

  # Remember the most recent first payload seen for each event type.
  latest_by_type = {}
  while parser.has_next?
    type, payload = parser.pull
    latest_by_type[type] = payload
  end

  assert_equal(" ok comment ", latest_by_type[:comment])
end
end
end

0 comments on commit 641d9d1

Please sign in to comment.