diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index ee6209c69e0ae..289347840d002 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -247,6 +247,86 @@ * } * } * + * ## Tokens and finer-grained processing. + * + * >>> Stub documentation. + * + * It's also possible to scan through every lexical token in + * the HTML document using the `next_token()` function. This + * alternative form takes no argument and provides no built-in + * query syntax. + * + * Example: + * + * $title = '(untitled)'; + * $text_content = ''; + * while ( $processor->next_token() ) { + * switch ( $processor->get_node_name() ) { + * case '#text': + * $text .= $processor->get_node_text(); + * break; + * + * case 'HR': + * $text .= "\n"; + * break; + * + * case 'TITLE': + * $title = $processor->get_node_text(); + * break; + * } + * } + * return trim( "# {$title}\n\n{$text_content}\n" ); + * + * ### Tokens and _modifiable text_. + * + * #### Special "atomic" HTML elements. + * + * Not all HTML elements are able to contain other elements inside of them. + * For instance, the contents inside a TITLE element are plaintext (except + * that character references like & will be decoded). This means that + * if the string `` appears inside a TITLE element, then it's not an + * image tag, but rather it's text describing an image tag. Likewise, the + * contents of a SCRIPT or STYLE element are handled entirely separately in + * a browser than the contents of other elements because they represent a + * different language than HTML. + * + * For these elements the Tag Processor treats the entire sequence as one, + * from the opening tag, including its contents, through its closing tag. + * This means that the it's not possible to match the closing tag for a + * SCRIPT element unless it's unexpected; the Tag Processor already matched + * it when it found the opening tag. + * + * The inner contents of these elements are that element's _modifiable text_. + * + * The special elements are: + * - `SCRIPT` whose contents are treated as raw plaintext but supports a legacy + * style of including Javascript inside of HTML comments to avoid accidentally + * closing the SCRIPT from inside a Javascript string. E.g. `console.log( '' )`. + * - `TITLE` and `TEXTAREA` whose contents are treated as plaintext and then any + * character references are decoded. E.g. "1 &lt; 2 < 3" becomes "1 < 2 < 3". + * - `IFRAME`, `NOSCRIPT`, `NOEMBED`, `NOFRAME`, `STYLE` whose contents are treated as + * raw plaintext and left as-si. E.g. "1 &lt; 2 < 3" remains "1 &lt; 2 < 3". + * + * #### Other tokens with modifiable text. + * + * There are also non-elements which are atomic in nature and contain modifiable text. + * + * - `#text` nodes, whose entire token _is_ the modifiable text. + * - Comment nodes and nodes that became comments because of some syntax error. The + * text for these nodes is the portion of the comment inside of the syntax. E.g. for + * "<!-- comment -->" the text is " comment " (note that the spaces are part of it). + * - `CDATA` sections, whose text is the content inside of the section itself. E.g. for + * "<![CDATA[some content]]>" the text is "some content". + * - "Funky comments," which are a special case of invalid closing tags whose name is + * invalid. The text for these nodes is the text that a browser would transform into + * an HTML when parsing. E.g. for "</%post_author>" the text is "%post_author". + * + * And there are non-elements which are atomic in nature but have no modifiable text. + * - `DOCTYPE` nodes like "<DOCTYPE html>" which have no closing tag. + * - XML Processing instruction nodes like "<". + * - The empty end tag "<" which is ignored in the browser and DOM but exposed + * to the HTML API. + * * ## Design and limitations * * The Tag Processor is designed to linearly scan HTML documents and tokenize @@ -320,7 +400,8 @@ * @since 6.2.1 Fix: Support for various invalid comments; attribute updates are case-insensitive. * @since 6.3.2 Fix: Skip HTML-like content inside rawtext elements such as STYLE. * @since 6.5.0 Pauses processor when input ends in an incomplete syntax token. - * Introduces "special" elements which act like void elements, e.g. STYLE. + * Introduces "special" elements which act like void elements, e.g. TITLE, STYLE. + * Allows scanning through all tokens and processing modifiable text, where applicable. */ class WP_HTML_Tag_Processor { /** @@ -396,12 +477,18 @@ class WP_HTML_Tag_Processor { /** * Specifies mode of operation of the parser at any given time. * - * | State | Meaning | - * | --------------|----------------------------------------------------------------------| - * | *Ready* | The parser is ready to run. | - * | *Complete* | There is nothing left to parse. | - * | *Incomplete* | The HTML ended in the middle of a token; nothing more can be parsed. | - * | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. | + * | State | Meaning | + * | ----------------|----------------------------------------------------------------------| + * | *Ready* | The parser is ready to run. | + * | *Complete* | There is nothing left to parse. | + * | *Incomplete* | The HTML ended in the middle of a token; nothing more can be parsed. | + * | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. | + * | *Text node* | Found a #text node; this is plaintext and modifiable. | + * | *CDATA node* | Found a CDATA section; this is modifiable. | + * | *PI node* | Found a Processing Instruction; this is modifiable. | + * | *Comment* | Found a comment or bogus comment; this is modifiable. | + * | *Presumptuous* | Found an empty tag closer: ``. | + * | *Funky comment* | Found a tag closer with an invalid tag name; this is modifiable. | * * @since 6.5.0 * @@ -409,6 +496,13 @@ class WP_HTML_Tag_Processor { * @see WP_HTML_Tag_Processor::STATE_COMPLETE * @see WP_HTML_Tag_Processor::STATE_INCOMPLETE * @see WP_HTML_Tag_Processor::STATE_MATCHED_TAG + * @see WP_HTML_Tag_Processor::STATE_TEXT_NODE + * @see WP_HTML_Tag_Processor::STATE_CDATA_NODE + * @see WP_HTML_Tag_Processor::STATE_PI_NODE + * @see WP_HTML_Tag_Processor::STATE_COMMENT + * @see WP_HTML_Tag_Processor::STATE_DOCTYPE + * @see WP_HTML_Tag_Processor::STATE_PRESUMPTUOUS_TAG + * @see WP_HTML_Tag_Processor::STATE_FUNKY_COMMENT * * @var string */ @@ -490,6 +584,24 @@ class WP_HTML_Tag_Processor { */ private $tag_name_length; + /** + * Byte offset into input document where current modifiable text starts. + * + * @since 6.5.0 + * + * @var int + */ + private $text_starts_at; + + /** + * Byte length of modifiable text. + * + * @since 6.5.0 + * + * @var string + */ + private $text_length; + /** * Whether the current tag is an opening tag, e.g.
, or a closing tag, e.g.
. * @@ -705,8 +817,8 @@ public function next_tag( $query = null ) { * @return bool Whether a token was parsed. */ public function next_token() { - $this->get_updated_html(); $was_at = $this->bytes_already_parsed; + $this->get_updated_html(); // Don't proceed if there's nothing more to scan. if ( @@ -736,6 +848,19 @@ public function next_token() { return false; } + /* + * for legacy reasons the rest of this function handles tags and their + * attributes. if the processor has reached the end of the document + * or if it matched any other token then it should return here. + */ + if ( + self::STATE_INCOMPLETE !== $this->parser_state && + self::STATE_COMPLETE !== $this->parser_state && + self::STATE_MATCHED_TAG !== $this->parser_state + ) { + return true; + } + // Parse all of its attributes. while ( $this->parse_next_attribute() ) { continue; @@ -762,7 +887,7 @@ public function next_token() { } $this->parser_state = self::STATE_MATCHED_TAG; $this->token_length = $tag_ends_at - $this->token_starts_at; - $this->bytes_already_parsed = $tag_ends_at; + $this->bytes_already_parsed = $tag_ends_at + 1; /* * For non-DATA sections which might contain text that looks like HTML tags but @@ -771,8 +896,8 @@ public function next_token() { */ $t = $this->html[ $this->tag_name_starts_at ]; if ( - ! $this->is_closing_tag && - ( + $this->is_closing_tag || + ! ( 'i' === $t || 'I' === $t || 'n' === $t || 'N' === $t || 's' === $t || 'S' === $t || @@ -780,38 +905,69 @@ public function next_token() { 'x' === $t || 'X' === $t ) ) { - $tag_name = $this->get_tag(); + return true; + } - if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) { - $this->parser_state = self::STATE_INCOMPLETE; - $this->bytes_already_parsed = $was_at; + $tag_name = $this->get_tag(); + if ( + 'SCRIPT' !== $tag_name && + 'TEXTAREA' !== $tag_name && + 'TITLE' !== $tag_name && + 'IFRAME' !== $tag_name && + 'NOEMBED' !== $tag_name && + 'NOFRAMES' !== $tag_name && + 'STYLE' !== $tag_name && + 'XMP' !== $tag_name + ) { + return true; + } - return false; - } elseif ( - ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) && - ! $this->skip_rcdata( $tag_name ) - ) { - $this->parser_state = self::STATE_INCOMPLETE; - $this->bytes_already_parsed = $was_at; + // Preserve the opening tag pointers. + $tag_name_starts_at = $this->tag_name_starts_at; + $tag_name_length = $this->tag_name_length; + $tag_ends_at = $this->token_starts_at + $this->token_length; - return false; - } elseif ( - ( - 'IFRAME' === $tag_name || - 'NOEMBED' === $tag_name || - 'NOFRAMES' === $tag_name || - 'STYLE' === $tag_name || - 'XMP' === $tag_name - ) && - ! $this->skip_rawtext( $tag_name ) - ) { - $this->parser_state = self::STATE_INCOMPLETE; - $this->bytes_already_parsed = $was_at; + // Find the closing tag. + $found_closer = false; + switch ( $tag_name ) { + case 'SCRIPT': + $found_closer = $this->skip_script_data(); + break; - return false; - } + case 'TEXTAREA': + case 'TITLE': + $found_closer = $this->skip_rcdata( $tag_name ); + break; + + case 'IFRAME': + case 'NOEMBED': + case 'NOFRAMES': + case 'STYLE': + case 'XMP': + $found_closer = $this->skip_rawtext( $tag_name ); + break; } + if ( ! $found_closer ) { + $this->parser_state = self::STATE_INCOMPLETE; + $this->bytes_already_parsed = $was_at; + return false; + } + + /* + * The values here look like they reference the opening tag but they reference + * the closing that instead. This is why the opening tag values were stored + * above in a variable. It reads confusingly here, but that's because the + * functions that skip the contents have moved all the internal cursors past + * the inner content of the tag. + */ + $this->token_starts_at = $was_at; + $this->token_length = $this->bytes_already_parsed - $this->token_starts_at; + $this->text_starts_at = $tag_ends_at + 1; + $this->text_length = $this->tag_name_starts_at - $this->text_starts_at; + $this->tag_name_starts_at = $tag_name_starts_at; + $this->tag_name_length = $tag_name_length; + return true; } @@ -1007,7 +1163,7 @@ public function has_class( $wanted_class ) { */ public function set_bookmark( $name ) { // It only makes sense to set a bookmark if the parser has paused on a concrete token. - if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { + if ( self::STATE_INCOMPLETE === $this->parser_state ) { return false; } @@ -1082,15 +1238,15 @@ private function skip_rcdata( $tag_name ) { $at = $this->bytes_already_parsed; while ( false !== $at && $at < $doc_length ) { - $at = strpos( $this->html, 'html, 'tag_name_starts_at = $at; // Fail if there is no possible tag closer. if ( false === $at || ( $at + $tag_length ) >= $doc_length ) { return false; } - $closer_potentially_starts_at = $at; - $at += 2; + $at += 2; /* * Find a case-insensitive match to the tag name. @@ -1131,13 +1287,23 @@ private function skip_rcdata( $tag_name ) { while ( $this->parse_next_attribute() ) { continue; } + $at = $this->bytes_already_parsed; if ( $at >= strlen( $this->html ) ) { return false; } - if ( '>' === $html[ $at ] || '/' === $html[ $at ] ) { - $this->bytes_already_parsed = $closer_potentially_starts_at; + if ( '>' === $html[ $at ] ) { + $this->bytes_already_parsed = $at + 1; + return true; + } + + if ( $at + 1 >= strlen( $this->html ) ) { + return false; + } + + if ( '/' === $html[ $at ] && '>' === $html[ $at + 1 ] ) { + $this->bytes_already_parsed = $at + 2; return true; } } @@ -1259,6 +1425,7 @@ private function skip_script_data() { if ( $is_closing ) { $this->bytes_already_parsed = $closer_potentially_starts_at; + $this->tag_name_starts_at = $closer_potentially_starts_at; if ( $this->bytes_already_parsed >= $doc_length ) { return false; } @@ -1274,7 +1441,7 @@ private function skip_script_data() { } if ( '>' === $html[ $this->bytes_already_parsed ] ) { - $this->bytes_already_parsed = $closer_potentially_starts_at; + ++$this->bytes_already_parsed; return true; } } @@ -1303,17 +1470,34 @@ private function parse_next_tag() { $html = $this->html; $doc_length = strlen( $html ); - $at = $this->bytes_already_parsed; + $was_at = $this->bytes_already_parsed; + $at = $was_at; - while ( false !== $at && $at < $doc_length ) { + while ( false !== $at && $at <= $doc_length ) { $at = strpos( $html, '<', $at ); + if ( $at > $was_at ) { + $this->parser_state = self::STATE_TEXT_NODE; + $this->token_starts_at = $was_at; + $this->token_length = $at - $was_at; + $this->text_starts_at = $was_at; + $this->text_length = $this->token_length; + $this->bytes_already_parsed = $at; + return true; + } + /* * This does not imply an incomplete parse; it indicates that there * can be nothing left in the document other than a #text node. */ if ( false === $at ) { - return false; + $this->parser_state = self::STATE_TEXT_NODE; + $this->token_starts_at = $was_at; + $this->token_length = strlen( $html ) - $was_at; + $this->text_starts_at = $was_at; + $this->text_length = $this->token_length; + $this->bytes_already_parsed = strlen( $html ); + return true; } $this->token_starts_at = $at; @@ -1342,8 +1526,9 @@ private function parse_next_tag() { $tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 ); if ( $tag_name_prefix_length > 0 ) { ++$at; - $this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length ); + $this->parser_state = self::STATE_MATCHED_TAG; $this->tag_name_starts_at = $at; + $this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length ); $this->bytes_already_parsed = $at + $this->tag_name_length; return true; } @@ -1383,8 +1568,13 @@ private function parse_next_tag() { // Abruptly-closed empty comments are a sequence of dashes followed by `>`. $span_of_dashes = strspn( $html, '-', $closer_at ); if ( '>' === $html[ $closer_at + $span_of_dashes ] ) { - $at = $closer_at + $span_of_dashes + 1; - continue; + // @todo This could go wrong if the closer is shorter than `` because there's no inside. + $this->parser_state = self::STATE_COMMENT; + $this->token_length = $closer_at + $span_of_dashes + 1 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 4; + $this->text_length = max( 0, $closer_at - $this->text_starts_at ); + $this->bytes_already_parsed = $closer_at + $span_of_dashes + 1; + return true; } /* @@ -1403,13 +1593,25 @@ private function parse_next_tag() { } if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) { - $at = $closer_at + 3; - continue 2; + $this->parser_state = self::STATE_COMMENT; + $this->token_length = $closer_at + 3 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 4; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 3; + return true; } - if ( $closer_at + 3 < $doc_length && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) { - $at = $closer_at + 4; - continue 2; + if ( + $closer_at + 3 < $doc_length && + '!' === $html[ $closer_at + 2 ] && + '>' === $html[ $closer_at + 3 ] + ) { + $this->parser_state = self::STATE_COMMENT; + $this->token_length = $closer_at + 4 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 4; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 4; + return true; } } } @@ -1436,8 +1638,12 @@ private function parse_next_tag() { return false; } - $at = $closer_at + 3; - continue; + $this->parser_state = self::STATE_CDATA_NODE; + $this->token_length = $closer_at + 4 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 9; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 3; + return true; } /* @@ -1462,8 +1668,12 @@ private function parse_next_tag() { return false; } - $at = $closer_at + 1; - continue; + $this->parser_state = self::STATE_DOCTYPE; + $this->token_length = $closer_at + 1 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 9; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 1; + return true; } /* @@ -1477,8 +1687,6 @@ private function parse_next_tag() { return false; } - - continue; } /* @@ -1491,8 +1699,10 @@ private function parse_next_tag() { * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name */ if ( '>' === $html[ $at + 1 ] ) { - ++$at; - continue; + $this->parser_state = self::STATE_PRESUMPTUOUS_TAG; + $this->token_length = $at + 2 - $this->token_starts_at; + $this->bytes_already_parsed = $at + 2; + return true; } /* @@ -1507,8 +1717,12 @@ private function parse_next_tag() { return false; } - $at = $closer_at + 1; - continue; + $this->parser_state = self::STATE_DOCTYPE; + $this->token_length = $closer_at + 1 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 2; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 1; + return true; } /* @@ -1530,8 +1744,12 @@ private function parse_next_tag() { return false; } - $at = $closer_at + 1; - continue; + $this->parser_state = self::STATE_FUNKY_COMMENT; + $this->token_length = $closer_at + 1 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 2; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 1; + return true; } ++$at; @@ -1692,6 +1910,8 @@ private function after_tag() { $this->token_length = null; $this->tag_name_starts_at = null; $this->tag_name_length = null; + $this->text_starts_at = 0; + $this->text_length = 0; $this->is_closing_tag = null; $this->attributes = array(); $this->duplicate_attributes = null; @@ -2281,6 +2501,70 @@ public function is_tag_closer() { ); } + public function get_node_type() { + switch ( $this->parser_state ) { + case self::STATE_MATCHED_TAG: + return '#tag'; + + case self::STATE_DOCTYPE: + return '#doctype'; + + case self::STATE_PI_NODE: + return '#processing-instruction'; + + default: + return $this->get_node_name(); + } + } + + public function get_node_name() { + switch ( $this->parser_state ) { + case self::STATE_MATCHED_TAG: + return $this->get_tag(); + + case self::STATE_TEXT_NODE: + return '#text'; + + case self::STATE_CDATA_NODE: + return '#cdata-section'; + + case self::STATE_PI_NODE: + // @todo add the PI tag. + return '?'; + + case self::STATE_COMMENT: + return '#comment'; + + case self::STATE_DOCTYPE: + return 'html'; + + case self::STATE_PRESUMPTUOUS_TAG: + return '#presumptuous-tag'; + + case self::STATE_FUNKY_COMMENT: + return '#funky-comment'; + } + } + + public function get_node_text() { + $at = $this->text_starts_at; + $length = $this->text_length; + + if ( self::STATE_MATCHED_TAG === $this->parser_state ) { + switch ( $this->get_tag() ) { + case 'PRE': + case 'TEXTAREA': + if ( "\n" === $this->html[ $at ] ) { + ++$at; + --$length; + } + break; + } + } + + return substr( $this->html, $at, $length ); + } + /** * Updates or creates a new attribute on the currently matched tag with the passed value. * @@ -2797,4 +3081,12 @@ private function matches() { * @access private */ const STATE_MATCHED_TAG = 'STATE_MATCHED_TAG'; + + const STATE_TEXT_NODE = 'STATE_TEXT_NODE'; + const STATE_CDATA_NODE = 'STATE_CDATA_NODE'; + const STATE_PI_NODE = 'STATE_PI_NODE'; + const STATE_COMMENT = 'STATE_COMMENT'; + const STATE_DOCTYPE = 'STATE_DOCTYPE'; + const STATE_PRESUMPTUOUS_TAG = 'STATE_PRESUMPTUOUS_TAG'; + const STATE_FUNKY_COMMENT = 'STATE_WP_FUNKY'; } diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php index 8a681d2cb0042..355c03d941183 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php @@ -557,8 +557,10 @@ public function test_next_tag_should_stop_on_rcdata_and_script_tag_closers_when_ $p = new WP_HTML_Tag_Processor( '' ); $p->next_tag(); - $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Did not find the tag closer' ); - $this->assertTrue( $p->is_tag_closer(), 'Indicated a ' ); $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Did not find the tag closer when there was no tag opener' ); @@ -566,8 +568,10 @@ public function test_next_tag_should_stop_on_rcdata_and_script_tag_closers_when_ $p = new WP_HTML_Tag_Processor( '' ); $p->next_tag(); - $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Did not find the tag closer' ); - $this->assertTrue( $p->is_tag_closer(), 'Indicated a ' ); $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Did not find the tag closer when there was no tag opener' ); @@ -575,8 +579,10 @@ public function test_next_tag_should_stop_on_rcdata_and_script_tag_closers_when_ $p = new WP_HTML_Tag_Processor( 'abc' ); $p->next_tag(); - $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Did not find the tag closer' ); - $this->assertTrue( $p->is_tag_closer(), 'Indicated a tag opener is a tag closer' ); + $this->assertFalse( + $p->next_tag( array( 'tag_closers' => 'visit' ) ), + 'Should not have found closing TITLE when closing an opener.' + ); $p = new WP_HTML_Tag_Processor( 'abc' ); $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Did not find the tag closer when there was no tag opener' );