From 35c1857a7c431c895b22923be317e0738b72d0e3 Mon Sep 17 00:00:00 2001 From: Inductiveload Date: Wed, 28 Jul 2021 04:46:50 +0100 Subject: [PATCH] Avoid blindly re-encoding HTML files Previously, HTML files werei stripped of their XML Processing Instruction headers and re-encoded from UTF-8 to HTML-ENTITIIES to be fed into the DomDocument. This caused problems for documents with CDATA blocks that contained Unicode, as it's not correct to escape that as HTML entities in the general case. For example CSS or binary data doesn't use that escaping system. Instead, load it directly and then remove the PI nodes after the fact. Bug: https://phabricator.wikimedia.org/T271390 --- src/Util/Util.php | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/Util/Util.php b/src/Util/Util.php index 89302d47..7081803a 100644 --- a/src/Util/Util.php +++ b/src/Util/Util.php @@ -185,10 +185,26 @@ public static function extractErrorMessage( ?ResponseInterface $resp, RequestInt */ public static function buildDOMDocumentFromHtml( string $html ): DOMDocument { $document = new DOMDocument( '1.0', 'UTF-8' ); + libxml_use_internal_errors( true ); - $document->loadHTML( mb_convert_encoding( str_replace( '', '', $html ), 'HTML-ENTITIES', 'UTF-8' ) ); + // enforce a UTF-8 encoding declaration + $document->loadHTML( '' . $html ); libxml_clear_errors(); + $document->encoding = 'UTF-8'; + + // Dirty fix to strip out existing XML Processing Instruction nodes + // (we already have one from the creation of the DOMDocument) + // + // This is better than re-encoding from UTF-8 to HTML-ENTITIES, because + // that will escape things even in CDATA blocks (e.g. T271390) + // https://www.php.net/manual/en/domdocument.loadhtml.php#95251 + foreach ( $document->childNodes as $item ) { + if ( $item->nodeType === XML_PI_NODE ) { + $document->removeChild( $item ); + } + } + return $document; }