WP_HTML_Tag_Processor::parse_next_tag() │ private │ WP 6.2.0
Parses the next tag.
This will find and start parsing the next tag, including the opening <, the potential closer /, and the tag name. It does not parse the attributes or scan to the closing >; these are left for other methods.
Method of the class: WP_HTML_Tag_Processor{}
No Hooks.
Return
true|false. Whether a tag was found before the end of the document.
Usage
// private - for code of main (parent) class only
$result = $this->parse_next_tag();
Changelog
Since 6.2.0: Introduced.
Since 6.2.1: Support abruptly-closed comments, invalid-tag-closer-comments, and empty elements.
WP_HTML_Tag_Processor::parse_next_tag() code (WP 6.6.2)
/**
 * Parses the next tag.
 *
 * This will find and start parsing the next token, including the opening `<`,
 * the potential closer `/`, and the tag name. It does not parse the attributes
 * or scan to the closing `>` of a tag; these are left for other methods.
 *
 * Besides tags, this also recognizes text nodes, comments (including the
 * various bogus-comment forms), DOCTYPE declarations, presumptuous tags
 * (`</>`), and funky comments (`</` followed by a non-letter). On success the
 * matching parser state and token/text offsets are recorded on the instance.
 *
 * Whenever the input ends in the middle of a syntax token the parser state is
 * set to `STATE_INCOMPLETE_INPUT` before returning `false`.
 *
 * @since 6.2.0
 * @since 6.2.1 Support abruptly-closed comments, invalid-tag-closer-comments, and empty elements.
 *
 * @return bool Whether a token was found before the end of the document.
 */
private function parse_next_tag() {
	$this->after_tag();

	$html       = $this->html;
	$doc_length = strlen( $html );
	$was_at     = $this->bytes_already_parsed;
	$at         = $was_at;

	while ( false !== $at && $at < $doc_length ) {
		$at = strpos( $html, '<', $at );

		/*
		 * This does not imply an incomplete parse; it indicates that there
		 * can be nothing left in the document other than a #text node.
		 */
		if ( false === $at ) {
			$this->parser_state         = self::STATE_TEXT_NODE;
			$this->token_starts_at      = $was_at;
			$this->token_length         = $doc_length - $was_at;
			$this->text_starts_at       = $was_at;
			$this->text_length          = $this->token_length;
			$this->bytes_already_parsed = $doc_length;
			return true;
		}

		if ( $at > $was_at ) {
			/*
			 * A "<" normally starts a new HTML tag or syntax token, but in cases where the
			 * following character can't produce a valid token, the "<" is instead treated
			 * as plaintext and the parser should skip over it. This avoids a problem when
			 * following earlier practices of typing emoji with text, e.g. "<3". This
			 * should be a heart, not a tag. It's supposed to be rendered, not hidden.
			 *
			 * At this point the parser checks if this is one of those cases and if it is
			 * will continue searching for the next "<" in search of a token boundary.
			 *
			 * @see https://html.spec.whatwg.org/#tag-open-state
			 */
			if ( $doc_length > $at + 1 ) {
				$next_character  = $html[ $at + 1 ];
				$at_another_node = (
					'!' === $next_character ||
					'/' === $next_character ||
					'?' === $next_character ||
					( 'A' <= $next_character && $next_character <= 'Z' ) ||
					( 'a' <= $next_character && $next_character <= 'z' )
				);
				if ( ! $at_another_node ) {
					++$at;
					continue;
				}
			}

			// Everything between the previous position and this "<" is a text node.
			$this->parser_state         = self::STATE_TEXT_NODE;
			$this->token_starts_at      = $was_at;
			$this->token_length         = $at - $was_at;
			$this->text_starts_at       = $was_at;
			$this->text_length          = $this->token_length;
			$this->bytes_already_parsed = $at;
			return true;
		}

		$this->token_starts_at = $at;

		// For a closer, advance so that `$at` points at the "/" instead of the "<".
		if ( $at + 1 < $doc_length && '/' === $html[ $at + 1 ] ) {
			$this->is_closing_tag = true;
			++$at;
		} else {
			$this->is_closing_tag = false;
		}

		/*
		 * HTML tag names must start with [a-zA-Z] otherwise they are not tags.
		 * For example, "<3" is rendered as text, not a tag opener. If at least
		 * one letter follows the "<" then _it is_ a tag, but if the following
		 * character is anything else it _is not a tag_.
		 *
		 * It's not uncommon to find non-tags starting with `<` in an HTML
		 * document, so it's good for performance to make this pre-check before
		 * continuing to attempt to parse a tag name.
		 *
		 * Reference:
		 * * https://html.spec.whatwg.org/multipage/parsing.html#data-state
		 * * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
		 */
		$tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 );
		if ( $tag_name_prefix_length > 0 ) {
			++$at;
			$this->parser_state         = self::STATE_MATCHED_TAG;
			$this->tag_name_starts_at   = $at;
			$this->tag_name_length      = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length );
			$this->bytes_already_parsed = $at + $this->tag_name_length;
			return true;
		}

		/*
		 * Abort if no tag is found before the end of
		 * the document. There is nothing left to parse.
		 */
		if ( $at + 1 >= $doc_length ) {
			$this->parser_state = self::STATE_INCOMPLETE_INPUT;

			return false;
		}

		/*
		 * `<!` transitions to markup declaration open state
		 * https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
		 */
		if ( ! $this->is_closing_tag && '!' === $html[ $at + 1 ] ) {
			/*
			 * `<!--` transitions to a comment state – apply further comment rules.
			 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
			 */
			if (
				$doc_length > $at + 3 &&
				'-' === $html[ $at + 2 ] &&
				'-' === $html[ $at + 3 ]
			) {
				$closer_at = $at + 4;
				// If it's not possible to close the comment then there is nothing more to scan.
				if ( $doc_length <= $closer_at ) {
					$this->parser_state = self::STATE_INCOMPLETE_INPUT;

					return false;
				}

				/*
				 * Abruptly-closed empty comments are a sequence of dashes followed by `>`.
				 *
				 * The run of dashes may extend to the very end of the document, in
				 * which case there is no following byte to inspect; guard the read
				 * so that a truncated comment doesn't index past the end of the string.
				 */
				$span_of_dashes = strspn( $html, '-', $closer_at );
				$dashes_end_at  = $closer_at + $span_of_dashes;
				if ( $dashes_end_at < $doc_length && '>' === $html[ $dashes_end_at ] ) {
					/*
					 * @todo When implementing `set_modifiable_text()` ensure that updates to this token
					 *       don't break the syntax for short comments, e.g. `<!--->`. Unlike other comment
					 *       and bogus comment syntax, these leave no clear insertion point for text and
					 *       they need to be modified specially in order to contain text. E.g. to store
					 *       `?` as the modifiable text, the `<!--->` needs to become `<!--?-->`, which
					 *       involves inserting an additional `-` into the token after the modifiable text.
					 */
					$this->parser_state = self::STATE_COMMENT;
					$this->comment_type = self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT;
					$this->token_length = $dashes_end_at + 1 - $this->token_starts_at;

					// Only provide modifiable text if the token is long enough to contain it.
					if ( $span_of_dashes >= 2 ) {
						$this->comment_type   = self::COMMENT_AS_HTML_COMMENT;
						$this->text_starts_at = $this->token_starts_at + 4;
						$this->text_length    = $span_of_dashes - 2;
					}

					$this->bytes_already_parsed = $dashes_end_at + 1;
					return true;
				}

				/*
				 * Comments may be closed by either a --> or an invalid --!>.
				 * The first occurrence closes the comment.
				 *
				 * See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment
				 */
				--$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping.
				while ( ++$closer_at < $doc_length ) {
					$closer_at = strpos( $html, '--', $closer_at );
					if ( false === $closer_at ) {
						$this->parser_state = self::STATE_INCOMPLETE_INPUT;

						return false;
					}

					if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) {
						$this->parser_state         = self::STATE_COMMENT;
						$this->comment_type         = self::COMMENT_AS_HTML_COMMENT;
						$this->token_length         = $closer_at + 3 - $this->token_starts_at;
						$this->text_starts_at       = $this->token_starts_at + 4;
						$this->text_length          = $closer_at - $this->text_starts_at;
						$this->bytes_already_parsed = $closer_at + 3;
						return true;
					}

					if (
						$closer_at + 3 < $doc_length &&
						'!' === $html[ $closer_at + 2 ] &&
						'>' === $html[ $closer_at + 3 ]
					) {
						$this->parser_state         = self::STATE_COMMENT;
						$this->comment_type         = self::COMMENT_AS_HTML_COMMENT;
						$this->token_length         = $closer_at + 4 - $this->token_starts_at;
						$this->text_starts_at       = $this->token_starts_at + 4;
						$this->text_length          = $closer_at - $this->text_starts_at;
						$this->bytes_already_parsed = $closer_at + 4;
						return true;
					}
				}
			}

			/*
			 * `<!DOCTYPE` transitions to DOCTYPE state – skip to the nearest >
			 * These are ASCII-case-insensitive.
			 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
			 */
			if (
				$doc_length > $at + 8 &&
				( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) &&
				( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) &&
				( 'C' === $html[ $at + 4 ] || 'c' === $html[ $at + 4 ] ) &&
				( 'T' === $html[ $at + 5 ] || 't' === $html[ $at + 5 ] ) &&
				( 'Y' === $html[ $at + 6 ] || 'y' === $html[ $at + 6 ] ) &&
				( 'P' === $html[ $at + 7 ] || 'p' === $html[ $at + 7 ] ) &&
				( 'E' === $html[ $at + 8 ] || 'e' === $html[ $at + 8 ] )
			) {
				$closer_at = strpos( $html, '>', $at + 9 );
				if ( false === $closer_at ) {
					$this->parser_state = self::STATE_INCOMPLETE_INPUT;

					return false;
				}

				$this->parser_state         = self::STATE_DOCTYPE;
				$this->token_length         = $closer_at + 1 - $this->token_starts_at;
				$this->text_starts_at       = $this->token_starts_at + 9;
				$this->text_length          = $closer_at - $this->text_starts_at;
				$this->bytes_already_parsed = $closer_at + 1;
				return true;
			}

			/*
			 * Anything else here is an incorrectly-opened comment and transitions
			 * to the bogus comment state - skip to the nearest >. If no closer is
			 * found then the HTML was truncated inside the markup declaration.
			 */
			$closer_at = strpos( $html, '>', $at + 1 );
			if ( false === $closer_at ) {
				$this->parser_state = self::STATE_INCOMPLETE_INPUT;

				return false;
			}

			$this->parser_state         = self::STATE_COMMENT;
			$this->comment_type         = self::COMMENT_AS_INVALID_HTML;
			$this->token_length         = $closer_at + 1 - $this->token_starts_at;
			$this->text_starts_at       = $this->token_starts_at + 2;
			$this->text_length          = $closer_at - $this->text_starts_at;
			$this->bytes_already_parsed = $closer_at + 1;

			/*
			 * Identify nodes that would be CDATA if HTML had CDATA sections.
			 *
			 * This section must occur after identifying the bogus comment end
			 * because in an HTML parser it will span to the nearest `>`, even
			 * if there's no `]]>` as would be required in an XML document. It
			 * is therefore not possible to parse a CDATA section containing
			 * a `>` in the HTML syntax.
			 *
			 * Inside foreign elements there is a discrepancy between browsers
			 * and the specification on this.
			 *
			 * @todo Track whether the Tag Processor is inside a foreign element
			 *       and require the proper closing `]]>` in those cases.
			 */
			if (
				$this->token_length >= 10 &&
				'[' === $html[ $this->token_starts_at + 2 ] &&
				'C' === $html[ $this->token_starts_at + 3 ] &&
				'D' === $html[ $this->token_starts_at + 4 ] &&
				'A' === $html[ $this->token_starts_at + 5 ] &&
				'T' === $html[ $this->token_starts_at + 6 ] &&
				'A' === $html[ $this->token_starts_at + 7 ] &&
				'[' === $html[ $this->token_starts_at + 8 ] &&
				']' === $html[ $closer_at - 1 ] &&
				']' === $html[ $closer_at - 2 ]
			) {
				$this->parser_state    = self::STATE_COMMENT;
				$this->comment_type    = self::COMMENT_AS_CDATA_LOOKALIKE;
				$this->text_starts_at += 7;
				$this->text_length    -= 9;
			}

			return true;
		}

		/*
		 * </> is a missing end tag name, which is ignored.
		 *
		 * This was also known as the "presumptuous empty tag"
		 * in early discussions as it was proposed to close
		 * the nearest previous opening tag.
		 *
		 * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name
		 */
		if ( '>' === $html[ $at + 1 ] ) {
			// `<>` is interpreted as plaintext.
			if ( ! $this->is_closing_tag ) {
				++$at;
				continue;
			}

			$this->parser_state         = self::STATE_PRESUMPTUOUS_TAG;
			$this->token_length         = $at + 2 - $this->token_starts_at;
			$this->bytes_already_parsed = $at + 2;
			return true;
		}

		/*
		 * `<?` transitions to a bogus comment state – skip to the nearest >
		 * See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
		 */
		if ( ! $this->is_closing_tag && '?' === $html[ $at + 1 ] ) {
			$closer_at = strpos( $html, '>', $at + 2 );
			if ( false === $closer_at ) {
				$this->parser_state = self::STATE_INCOMPLETE_INPUT;

				return false;
			}

			$this->parser_state         = self::STATE_COMMENT;
			$this->comment_type         = self::COMMENT_AS_INVALID_HTML;
			$this->token_length         = $closer_at + 1 - $this->token_starts_at;
			$this->text_starts_at       = $this->token_starts_at + 2;
			$this->text_length          = $closer_at - $this->text_starts_at;
			$this->bytes_already_parsed = $closer_at + 1;

			/*
			 * Identify a Processing Instruction node were HTML to have them.
			 *
			 * This section must occur after identifying the bogus comment end
			 * because in an HTML parser it will span to the nearest `>`, even
			 * if there's no `?>` as would be required in an XML document. It
			 * is therefore not possible to parse a Processing Instruction node
			 * containing a `>` in the HTML syntax.
			 *
			 * XML allows for more target names, but this code only identifies
			 * those with ASCII-representable target names. This means that it
			 * may identify some Processing Instruction nodes as bogus comments,
			 * but it will not misinterpret the HTML structure. By limiting the
			 * identification to these target names the Tag Processor can avoid
			 * the need to start parsing UTF-8 sequences.
			 *
			 * > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
			 *                     [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
			 *                     [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
			 *                     [#x10000-#xEFFFF]
			 * > NameChar      ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
			 *
			 * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
			 */
			if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) {
				$comment_text     = substr( $html, $this->token_starts_at + 2, $this->token_length - 4 );
				$pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' );

				if ( 0 < $pi_target_length ) {
					$pi_target_length += strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length );

					$this->comment_type       = self::COMMENT_AS_PI_NODE_LOOKALIKE;
					$this->tag_name_starts_at = $this->token_starts_at + 2;
					$this->tag_name_length    = $pi_target_length;
					$this->text_starts_at    += $pi_target_length;
					$this->text_length       -= $pi_target_length + 1;
				}
			}

			return true;
		}

		/*
		 * If a non-alpha starts the tag name in a tag closer it's a comment.
		 * Find the first `>`, which closes the comment.
		 *
		 * This parser classifies these particular comments as special "funky comments"
		 * which are made available for further processing.
		 *
		 * See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name
		 */
		if ( $this->is_closing_tag ) {
			/*
			 * No chance of finding a closer: the document ends right after the
			 * first character following `</`. This is truncated input just like
			 * the missing-`>` case below, so it is reported the same way.
			 */
			if ( $at + 3 > $doc_length ) {
				$this->parser_state = self::STATE_INCOMPLETE_INPUT;

				return false;
			}

			$closer_at = strpos( $html, '>', $at + 2 );
			if ( false === $closer_at ) {
				$this->parser_state = self::STATE_INCOMPLETE_INPUT;

				return false;
			}

			$this->parser_state         = self::STATE_FUNKY_COMMENT;
			$this->token_length         = $closer_at + 1 - $this->token_starts_at;
			$this->text_starts_at       = $this->token_starts_at + 2;
			$this->text_length          = $closer_at - $this->text_starts_at;
			$this->bytes_already_parsed = $closer_at + 1;
			return true;
		}

		// This "<" cannot start any token; treat it as text and keep scanning.
		++$at;
	}

	return false;
}