From 35c1857a7c431c895b22923be317e0738b72d0e3 Mon Sep 17 00:00:00 2001
From: Inductiveload <inductiveload@gmail.com>
Date: Wed, 28 Jul 2021 04:46:50 +0100
Subject: [PATCH] Avoid blindly re-encoding HTML files

Previously, HTML files werei stripped of their XML
Processing Instruction headers and re-encoded from
UTF-8 to HTML-ENTITIIES to be fed into the DomDocument.

This caused problems for documents with CDATA blocks that
contained Unicode, as it's not correct to escape that as
HTML entities in the general case. For example CSS or
binary data doesn't use that escaping system.

Instead, load it directly and then remove the PI nodes
after the fact.

Bug: https://phabricator.wikimedia.org/T271390
---
 src/Util/Util.php | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/Util/Util.php b/src/Util/Util.php
index 89302d47..7081803a 100644
--- a/src/Util/Util.php
+++ b/src/Util/Util.php
@@ -185,10 +185,26 @@ public static function extractErrorMessage( ?ResponseInterface $resp, RequestInt
 	 */
 	public static function buildDOMDocumentFromHtml( string $html ): DOMDocument {
 		$document = new DOMDocument( '1.0', 'UTF-8' );
+
 		libxml_use_internal_errors( true );
-		$document->loadHTML( mb_convert_encoding( str_replace( '<?xml version="1.0" encoding="UTF-8" ?>', '', $html ), 'HTML-ENTITIES', 'UTF-8' ) );
+		// enforce a UTF-8 encoding declaration
+		$document->loadHTML( '<?xml encoding="utf-8" ?>' . $html );
 		libxml_clear_errors();
+
 		$document->encoding = 'UTF-8';
+
+		// Dirty fix to strip out existing XML Processing Instruction nodes
+		// (we already have one from the creation of the DOMDocument)
+		//
+		// This is better than re-encoding from UTF-8 to HTML-ENTITIES, because
+		// that will escape things even in CDATA blocks (e.g. T271390)
+		// https://www.php.net/manual/en/domdocument.loadhtml.php#95251
+		foreach ( $document->childNodes as $item ) {
+			if ( $item->nodeType === XML_PI_NODE ) {
+				$document->removeChild( $item );
+			}
+		}
+
 		return $document;
 	}