Avoid blindly re-encoding HTML files

Previously, HTML files werei stripped of their XML Processing Instruction headers and re-encoded from UTF-8 to HTML-ENTITIIES to be fed into the DomDocument. This caused problems for documents with CDATA blocks that contained Unicode, as it's not correct to escape that as HTML entities in the general case. For example CSS or binary data doesn't use that escaping system. Instead, load it directly and then remove the PI nodes after the fact. Bug: https://phabricator.wikimedia.org/T271390
wikimedia · Jul 28, 2021 · d3fed6a · d3fed6a
1 parent 92c17e0
commit d3fed6a
Showing 1 changed file with 15 additions and 1 deletion.
diff --git a/src/Util/Util.php b/src/Util/Util.php
@@ -185,9 +185,23 @@ public static function extractErrorMessage( ?ResponseInterface $resp, RequestInt
 	 */
 	public static function buildDOMDocumentFromHtml( string $html ): DOMDocument {
 		$document = new DOMDocument( '1.0', 'UTF-8' );
+
 		libxml_use_internal_errors( true );
-		$document->loadHTML( mb_convert_encoding( str_replace( '<?xml version="1.0" encoding="UTF-8" ?>', '', $html ), 'HTML-ENTITIES', 'UTF-8' ) );
+		$document->loadHTML( $html );
 		libxml_clear_errors();
+
+		// Dirty fix to strip out existing XML Processing Instruction nodes
+		// (we already have one from the creation of the DOMDocument)
+		//
+		// This is better than re-encoding from UTF-8 to HTML-ENTITIES, because
+		// that will escape things even in CDATA blocks (e.g. T271390)
+		// https://www.php.net/manual/en/domdocument.loadhtml.php#95251
+		foreach ( $document->childNodes as $item ) {
+		    if ( $item->nodeType === XML_PI_NODE ) {
+		        $document->removeChild( $item );
+		    }
+		}
+
 		$document->encoding = 'UTF-8';
 		return $document;
 	}