Skip to content

Commit

Permalink
fix(libxml): default Reader node encoding to UTF-8 (#3084)
Browse files Browse the repository at this point in the history
**What problem is this PR intended to solve?**

default Reader node encoding to UTF-8 when it's not specified either as
a method param or in the document

Fixes #2891


**Have you included adequate test coverage?**

Yes, I've added coverage.


**Does this change affect the behavior of either the C or the Java
implementations?**

Yes, this updates the C implementation but does not update the Java
implementation because Reader encoding is already wonky there in a few
edge cases.
  • Loading branch information
flavorjones authored Dec 29, 2023
2 parents c6847be + 26fcb96 commit bb114cf
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 10 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA

---

## v1.16.next / unreleased

### Fixed

* [CRuby] `XML::Reader` defaults the encoding to UTF-8 if it's not specified in either the document or as a method parameter. Previously non-ASCII characters were serialized as NCRs in this case. [#2891] (@flavorjones)


## v1.16.0 / 2023-12-27

### Notable Changes
Expand Down
28 changes: 24 additions & 4 deletions ext/nokogiri/xml_reader.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,14 @@ VALUE cNokogiriXmlReader;
static void
xml_reader_deallocate(void *data)
{
// free the document separately because we _may_ have triggered preservation by calling
// xmlTextReaderCurrentDoc during a read_more.
xmlTextReaderPtr reader = data;
xmlDocPtr doc = xmlTextReaderCurrentDoc(reader);
xmlFreeTextReader(reader);
if (doc) {
xmlFreeDoc(doc);
}
}

static const rb_data_type_t xml_reader_type = {
Expand Down Expand Up @@ -515,6 +521,7 @@ read_more(VALUE self)
xmlErrorConstPtr error;
VALUE error_list;
int ret;
xmlDocPtr c_document;

TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, reader);

Expand All @@ -524,6 +531,16 @@ read_more(VALUE self)
ret = xmlTextReaderRead(reader);
xmlSetStructuredErrorFunc(NULL, NULL);

c_document = xmlTextReaderCurrentDoc(reader);
if (c_document && c_document->encoding == NULL) {
VALUE constructor_encoding = rb_iv_get(self, "@encoding");
if (RTEST(constructor_encoding)) {
c_document->encoding = xmlStrdup(BAD_CAST StringValueCStr(constructor_encoding));
} else {
c_document->encoding = xmlStrdup(BAD_CAST "UTF-8");
}
}

if (ret == 1) { return self; }
if (ret == 0) { return Qnil; }

Expand Down Expand Up @@ -707,15 +724,18 @@ rb_xml_reader_encoding(VALUE rb_reader)
const char *parser_encoding;
VALUE constructor_encoding;

TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_reader_type, c_reader);
parser_encoding = (const char *)xmlTextReaderConstEncoding(c_reader);
if (parser_encoding) {
return NOKOGIRI_STR_NEW2(parser_encoding);
}

constructor_encoding = rb_iv_get(rb_reader, "@encoding");
if (RTEST(constructor_encoding)) {
return constructor_encoding;
}

TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_reader_type, c_reader);
parser_encoding = (const char *)xmlTextReaderConstEncoding(c_reader);
if (parser_encoding == NULL) { return Qnil; }
return NOKOGIRI_STR_NEW2(parser_encoding);
return Qnil;
}

void
Expand Down
115 changes: 109 additions & 6 deletions test/xml/test_reader_encoding.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,32 +15,135 @@ def setup
)
end

def test_libxml2_detects_internal_encoding_correctly
skip_unless_libxml2("This feature wasn't implemented for JRuby")
def test_detects_internal_encoding_correctly
skip_unless_libxml2("Internal encoding detection isn't implemented yet for JRuby")

reader = Nokogiri::XML::Reader(<<~XML)
<?xml version="1.0" encoding="ISO-8859-1"?>
<root attr="foo"><employee /></root>
<anotaci\xF3n>inspiraci\xF3n</anotaci\xF3n>
XML

assert_nil(reader.encoding)

reader.each do
assert_equal("ISO-8859-1", reader.encoding)
end
end

def test_libxml2_overrides_internal_encoding_when_specified
def test_reader_defaults_internal_encoding_to_utf8
skip_unless_libxml2("Internal encoding detection isn't implemented yet for JRuby")

reader = Nokogiri::XML::Reader(<<~XML)
<?xml version="1.0"?>
<root attr="foo"><employee /></root>
XML

assert_nil(reader.encoding)

reader.each do
assert_equal("UTF-8", reader.encoding)
end
end

def test_override_internal_encoding_when_specified
if Nokogiri.uses_libxml? && !Nokogiri::VersionInfo.instance.libxml2_has_iconv?
skip("iconv is not compiled into libxml2")
end

#
# note that libxml2 behavior around document encoding changed at least twice between 2.9
# and 2.12, so the testing here is superficial -- asserting on the reported encoding, but
# not asserting on the bytes in the document or the serialized nodes.
#
reader = Nokogiri::XML::Reader(<<~XML, nil, "UTF-8")
<?xml version="1.0" encoding="ISO-8859-1"?>
<root attr="foo"><employee /></root>
<foo>asdf</foo>
XML

assert_equal("UTF-8", reader.encoding)
reader.each do

reader.read

if Nokogiri.jruby? || Nokogiri.uses_libxml?(">= 2.12.0")
assert_equal("UTF-8", reader.encoding)
else
assert_equal("ISO-8859-1", reader.encoding)
end

reader = Nokogiri::XML::Reader(<<~XML, nil, "ISO-8859-1")
<?xml version="1.0" encoding="UTF-8"?>
<foo>asdf</foo>
XML

assert_equal("ISO-8859-1", reader.encoding)

reader.read

if Nokogiri.jruby? || Nokogiri.uses_libxml?(">= 2.12.0")
assert_equal("ISO-8859-1", reader.encoding)
else
assert_equal("UTF-8", reader.encoding)
end
end

def test_attribute_encoding_issue_2891_no_encoding_specified
if Nokogiri.uses_libxml? && !Nokogiri::VersionInfo.instance.libxml2_has_iconv?
skip("iconv is not compiled into libxml2")
end

# https://github.com/sparklemotion/nokogiri/issues/2891
reader = Nokogiri::XML::Reader(<<~XML)
<?xml version="1.0"?>
<anotación tipo="inspiración">INSPIRACIÓN</anotación>
XML

assert_nil(reader.encoding)

reader.read

assert_equal("UTF-8", reader.encoding) unless Nokogiri.jruby? # JRuby doesn't support encoding detection
assert_equal(
"<anotación tipo=\"inspiración\">INSPIRACIÓN</anotación>",
reader.outer_xml,
)
end

def test_attribute_encoding_issue_2891_correct_encoding_specified
if Nokogiri.uses_libxml? && !Nokogiri::VersionInfo.instance.libxml2_has_iconv?
skip("iconv is not compiled into libxml2")
end

# https://github.com/sparklemotion/nokogiri/issues/2891
reader = Nokogiri::XML::Reader(<<~XML, nil, "UTF-8")
<?xml version="1.0"?>
<anotación tipo="inspiración">INSPIRACIÓN</anotación>
XML

assert_equal("UTF-8", reader.encoding)

reader.read

assert_equal("UTF-8", reader.encoding)
assert_equal(
"<anotación tipo=\"inspiración\">INSPIRACIÓN</anotación>",
reader.outer_xml,
)
end

def test_attribute_encoding_issue_2891_correct_encoding_specified_non_utf8
xml = <<~XML
<?xml version="1.0"?>
<test>\u{82B1}\u{82F1}</test>
XML
reader = Nokogiri::XML::Reader(xml, nil, "Shift_JIS")

assert_equal("Shift_JIS", reader.encoding)

reader.read

assert_equal("Shift_JIS", reader.encoding)
end

def test_attribute_at
@reader.each do |node|
next unless (attribute = node.attribute_at(0))
Expand Down

0 comments on commit bb114cf

Please sign in to comment.