WP_HTML_Tag_Processor::parse_next_tag() private WP 6.2.0

Parses the next tag.

This will find and start parsing the next tag, including the opening <, the potential closer /, and the tag name. It does not parse the attributes or scan to the closing >; these are left for other methods.

Method of the class: WP_HTML_Tag_Processor{}

No Hooks.

Return

true|false. Whether a token (a tag, or a non-tag token such as a text node, comment, or DOCTYPE) was found before the end of the document.

Usage

// private - for code of main (parent) class only
$result = $this->parse_next_tag();

Changelog

Since 6.2.0 Introduced.
Since 6.2.1 Support abruptly-closed comments, invalid-tag-closer-comments, and empty elements.

WP_HTML_Tag_Processor::parse_next_tag() code WP 6.6.2

/**
 * Parses the next tag.
 *
 * This will find and start parsing the next tag, including
 * the opening `<`, the potential closer `/`, and the tag
 * name. It does not parse the attributes or scan to the
 * closing `>`; these are left for other methods.
 *
 * Besides tags, this also recognizes the other syntax tokens which
 * may start at or before the next `<`: text nodes, HTML comments
 * (including abruptly-closed and incorrectly-closed ones), DOCTYPE
 * declarations, bogus comments (`<!...>` and `<?...>`), CDATA and
 * Processing Instruction lookalikes, the presumptuous tag `</>`,
 * and "funky comments" such as `</3>`.
 *
 * On success the method records the matched token in `parser_state`
 * and the token/text offsets, and advances `bytes_already_parsed`.
 * When the document ends in the middle of a token the parser state
 * is set to `STATE_INCOMPLETE_INPUT` and `false` is returned.
 *
 * @since 6.2.0
 * @since 6.2.1 Support abruptly-closed comments, invalid-tag-closer-comments, and empty elements.
 *
 * @return bool Whether a token was found before the end of the document.
 */
private function parse_next_tag() {
	// Runs before any scanning begins; presumably wraps up the previous token (defined elsewhere in the class).
	$this->after_tag();

	$html       = $this->html;
	$doc_length = strlen( $html );
	$was_at     = $this->bytes_already_parsed;
	$at         = $was_at;

	while ( false !== $at && $at < $doc_length ) {
		$at = strpos( $html, '<', $at );

		/*
		 * This does not imply an incomplete parse; it indicates that there
		 * can be nothing left in the document other than a #text node.
		 */
		if ( false === $at ) {
			$this->parser_state         = self::STATE_TEXT_NODE;
			$this->token_starts_at      = $was_at;
			$this->token_length         = $doc_length - $was_at;
			$this->text_starts_at       = $was_at;
			$this->text_length          = $this->token_length;
			$this->bytes_already_parsed = $doc_length;
			return true;
		}

		if ( $at > $was_at ) {
			/*
			 * A "<" normally starts a new HTML tag or syntax token, but in cases where the
			 * following character can't produce a valid token, the "<" is instead treated
			 * as plaintext and the parser should skip over it. This avoids a problem when
			 * following earlier practices of typing emoji with text, e.g. "<3". This
			 * should be a heart, not a tag. It's supposed to be rendered, not hidden.
			 *
			 * At this point the parser checks if this is one of those cases and if it is
			 * will continue searching for the next "<" in search of a token boundary.
			 *
			 * @see https://html.spec.whatwg.org/#tag-open-state
			 */
			if ( $doc_length > $at + 1 ) {
				$next_character  = $html[ $at + 1 ];
				$at_another_node = (
					'!' === $next_character ||
					'/' === $next_character ||
					'?' === $next_character ||
					( 'A' <= $next_character && $next_character <= 'Z' ) ||
					( 'a' <= $next_character && $next_character <= 'z' )
				);
				if ( ! $at_another_node ) {
					++$at;
					continue;
				}
			}

			// Everything between the starting point and this "<" is a text node.
			$this->parser_state         = self::STATE_TEXT_NODE;
			$this->token_starts_at      = $was_at;
			$this->token_length         = $at - $was_at;
			$this->text_starts_at       = $was_at;
			$this->text_length          = $this->token_length;
			$this->bytes_already_parsed = $at;
			return true;
		}

		$this->token_starts_at = $at;

		// For a closer, `$at` moves onto the `/` so that `$at + 1` is the first name character.
		if ( $at + 1 < $doc_length && '/' === $html[ $at + 1 ] ) {
			$this->is_closing_tag = true;
			++$at;
		} else {
			$this->is_closing_tag = false;
		}

		/*
		 * HTML tag names must start with [a-zA-Z] otherwise they are not tags.
		 * For example, "<3" is rendered as text, not a tag opener. If at least
		 * one letter follows the "<" then _it is_ a tag, but if the following
		 * character is anything else it _is not a tag_.
		 *
		 * It's not uncommon to find non-tags starting with `<` in an HTML
		 * document, so it's good for performance to make this pre-check before
		 * continuing to attempt to parse a tag name.
		 *
		 * Reference:
		 * * https://html.spec.whatwg.org/multipage/parsing.html#data-state
		 * * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
		 */
		$tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 );
		if ( $tag_name_prefix_length > 0 ) {
			++$at;
			$this->parser_state         = self::STATE_MATCHED_TAG;
			$this->tag_name_starts_at   = $at;
			$this->tag_name_length      = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length );
			$this->bytes_already_parsed = $at + $this->tag_name_length;
			return true;
		}

		/*
		 * Abort if no tag is found before the end of
		 * the document. There is nothing left to parse.
		 */
		if ( $at + 1 >= $doc_length ) {
			$this->parser_state = self::STATE_INCOMPLETE_INPUT;

			return false;
		}

		/*
		 * `<!` transitions to markup declaration open state
		 * https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
		 */
		if ( ! $this->is_closing_tag && '!' === $html[ $at + 1 ] ) {
			/*
			 * `<!--` transitions to a comment state – apply further comment rules.
			 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
			 */
			if (
				$doc_length > $at + 3 &&
				'-' === $html[ $at + 2 ] &&
				'-' === $html[ $at + 3 ]
			) {
				$closer_at = $at + 4;
				// If it's not possible to close the comment then there is nothing more to scan.
				if ( $doc_length <= $closer_at ) {
					$this->parser_state = self::STATE_INCOMPLETE_INPUT;

					return false;
				}

				/*
				 * Abruptly-closed empty comments are a sequence of dashes followed by `>`.
				 *
				 * The run of dashes may extend to the very end of the document, in which
				 * case there is no following byte to inspect. The bounds check avoids
				 * reading past the end of the string; the truncated comment is then
				 * reported as incomplete input by the closer search below.
				 */
				$span_of_dashes = strspn( $html, '-', $closer_at );
				if (
					$closer_at + $span_of_dashes < $doc_length &&
					'>' === $html[ $closer_at + $span_of_dashes ]
				) {
					/*
					 * @todo When implementing `set_modifiable_text()` ensure that updates to this token
					 *       don't break the syntax for short comments, e.g. `<!--->`. Unlike other comment
					 *       and bogus comment syntax, these leave no clear insertion point for text and
					 *       they need to be modified specially in order to contain text. E.g. to store
					 *       `?` as the modifiable text, the `<!--->` needs to become `<!--?-->`, which
					 *       involves inserting an additional `-` into the token after the modifiable text.
					 */
					$this->parser_state = self::STATE_COMMENT;
					$this->comment_type = self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT;
					$this->token_length = $closer_at + $span_of_dashes + 1 - $this->token_starts_at;

					// Only provide modifiable text if the token is long enough to contain it.
					if ( $span_of_dashes >= 2 ) {
						$this->comment_type   = self::COMMENT_AS_HTML_COMMENT;
						$this->text_starts_at = $this->token_starts_at + 4;
						$this->text_length    = $span_of_dashes - 2;
					}

					$this->bytes_already_parsed = $closer_at + $span_of_dashes + 1;
					return true;
				}

				/*
				 * Comments may be closed by either a --> or an invalid --!>.
				 * The first occurrence closes the comment.
				 *
				 * See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment
				 */
				--$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping.
				while ( ++$closer_at < $doc_length ) {
					$closer_at = strpos( $html, '--', $closer_at );
					if ( false === $closer_at ) {
						$this->parser_state = self::STATE_INCOMPLETE_INPUT;

						return false;
					}

					if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) {
						$this->parser_state         = self::STATE_COMMENT;
						$this->comment_type         = self::COMMENT_AS_HTML_COMMENT;
						$this->token_length         = $closer_at + 3 - $this->token_starts_at;
						$this->text_starts_at       = $this->token_starts_at + 4;
						$this->text_length          = $closer_at - $this->text_starts_at;
						$this->bytes_already_parsed = $closer_at + 3;
						return true;
					}

					if (
						$closer_at + 3 < $doc_length &&
						'!' === $html[ $closer_at + 2 ] &&
						'>' === $html[ $closer_at + 3 ]
					) {
						$this->parser_state         = self::STATE_COMMENT;
						$this->comment_type         = self::COMMENT_AS_HTML_COMMENT;
						$this->token_length         = $closer_at + 4 - $this->token_starts_at;
						$this->text_starts_at       = $this->token_starts_at + 4;
						$this->text_length          = $closer_at - $this->text_starts_at;
						$this->bytes_already_parsed = $closer_at + 4;
						return true;
					}
				}
			}

			/*
			 * `<!DOCTYPE` transitions to DOCTYPE state – skip to the nearest >
			 * These are ASCII-case-insensitive.
			 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
			 */
			if (
				$doc_length > $at + 8 &&
				( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) &&
				( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) &&
				( 'C' === $html[ $at + 4 ] || 'c' === $html[ $at + 4 ] ) &&
				( 'T' === $html[ $at + 5 ] || 't' === $html[ $at + 5 ] ) &&
				( 'Y' === $html[ $at + 6 ] || 'y' === $html[ $at + 6 ] ) &&
				( 'P' === $html[ $at + 7 ] || 'p' === $html[ $at + 7 ] ) &&
				( 'E' === $html[ $at + 8 ] || 'e' === $html[ $at + 8 ] )
			) {
				$closer_at = strpos( $html, '>', $at + 9 );
				if ( false === $closer_at ) {
					$this->parser_state = self::STATE_INCOMPLETE_INPUT;

					return false;
				}

				$this->parser_state         = self::STATE_DOCTYPE;
				$this->token_length         = $closer_at + 1 - $this->token_starts_at;
				$this->text_starts_at       = $this->token_starts_at + 9;
				$this->text_length          = $closer_at - $this->text_starts_at;
				$this->bytes_already_parsed = $closer_at + 1;
				return true;
			}

			/*
			 * Anything else here is an incorrectly-opened comment and transitions
			 * to the bogus comment state - skip to the nearest >. If no closer is
			 * found then the HTML was truncated inside the markup declaration.
			 */
			$closer_at = strpos( $html, '>', $at + 1 );
			if ( false === $closer_at ) {
				$this->parser_state = self::STATE_INCOMPLETE_INPUT;

				return false;
			}

			$this->parser_state         = self::STATE_COMMENT;
			$this->comment_type         = self::COMMENT_AS_INVALID_HTML;
			$this->token_length         = $closer_at + 1 - $this->token_starts_at;
			$this->text_starts_at       = $this->token_starts_at + 2;
			$this->text_length          = $closer_at - $this->text_starts_at;
			$this->bytes_already_parsed = $closer_at + 1;

			/*
			 * Identify nodes that would be CDATA if HTML had CDATA sections.
			 *
			 * This section must occur after identifying the bogus comment end
			 * because in an HTML parser it will span to the nearest `>`, even
			 * if there's no `]]>` as would be required in an XML document. It
			 * is therefore not possible to parse a CDATA section containing
			 * a `>` in the HTML syntax.
			 *
			 * Inside foreign elements there is a discrepancy between browsers
			 * and the specification on this.
			 *
			 * @todo Track whether the Tag Processor is inside a foreign element
			 *       and require the proper closing `]]>` in those cases.
			 */
			if (
				$this->token_length >= 10 &&
				'[' === $html[ $this->token_starts_at + 2 ] &&
				'C' === $html[ $this->token_starts_at + 3 ] &&
				'D' === $html[ $this->token_starts_at + 4 ] &&
				'A' === $html[ $this->token_starts_at + 5 ] &&
				'T' === $html[ $this->token_starts_at + 6 ] &&
				'A' === $html[ $this->token_starts_at + 7 ] &&
				'[' === $html[ $this->token_starts_at + 8 ] &&
				']' === $html[ $closer_at - 1 ] &&
				']' === $html[ $closer_at - 2 ]
			) {
				// Narrow the text span to exclude the leading `[CDATA[` and the trailing `]]`.
				$this->parser_state    = self::STATE_COMMENT;
				$this->comment_type    = self::COMMENT_AS_CDATA_LOOKALIKE;
				$this->text_starts_at += 7;
				$this->text_length    -= 9;
			}

			return true;
		}

		/*
		 * </> is a missing end tag name, which is ignored.
		 *
		 * This was also known as the "presumptuous empty tag"
		 * in early discussions as it was proposed to close
		 * the nearest previous opening tag.
		 *
		 * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name
		 */
		if ( '>' === $html[ $at + 1 ] ) {
			// `<>` is interpreted as plaintext.
			if ( ! $this->is_closing_tag ) {
				++$at;
				continue;
			}

			$this->parser_state         = self::STATE_PRESUMPTUOUS_TAG;
			$this->token_length         = $at + 2 - $this->token_starts_at;
			$this->bytes_already_parsed = $at + 2;
			return true;
		}

		/*
		 * `<?` transitions to a bogus comment state – skip to the nearest >
		 * See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
		 */
		if ( ! $this->is_closing_tag && '?' === $html[ $at + 1 ] ) {
			$closer_at = strpos( $html, '>', $at + 2 );
			if ( false === $closer_at ) {
				$this->parser_state = self::STATE_INCOMPLETE_INPUT;

				return false;
			}

			$this->parser_state         = self::STATE_COMMENT;
			$this->comment_type         = self::COMMENT_AS_INVALID_HTML;
			$this->token_length         = $closer_at + 1 - $this->token_starts_at;
			$this->text_starts_at       = $this->token_starts_at + 2;
			$this->text_length          = $closer_at - $this->text_starts_at;
			$this->bytes_already_parsed = $closer_at + 1;

			/*
			 * Identify a Processing Instruction node were HTML to have them.
			 *
			 * This section must occur after identifying the bogus comment end
			 * because in an HTML parser it will span to the nearest `>`, even
			 * if there's no `?>` as would be required in an XML document. It
			 * is therefore not possible to parse a Processing Instruction node
			 * containing a `>` in the HTML syntax.
			 *
			 * XML allows for more target names, but this code only identifies
			 * those with ASCII-representable target names. This means that it
			 * may identify some Processing Instruction nodes as bogus comments,
			 * but it will not misinterpret the HTML structure. By limiting the
			 * identification to these target names the Tag Processor can avoid
			 * the need to start parsing UTF-8 sequences.
			 *
			 * > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
			 *                     [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
			 *                     [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
			 *                     [#x10000-#xEFFFF]
			 * > NameChar      ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
			 *
			 * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
			 */
			if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) {
				$comment_text     = substr( $html, $this->token_starts_at + 2, $this->token_length - 4 );
				$pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' );

				if ( 0 < $pi_target_length ) {
					$pi_target_length += strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length );

					// The target name is exposed as the "tag name"; the text excludes it and the trailing `?`.
					$this->comment_type       = self::COMMENT_AS_PI_NODE_LOOKALIKE;
					$this->tag_name_starts_at = $this->token_starts_at + 2;
					$this->tag_name_length    = $pi_target_length;
					$this->text_starts_at    += $pi_target_length;
					$this->text_length       -= $pi_target_length + 1;
				}
			}

			return true;
		}

		/*
		 * If a non-alpha starts the tag name in a tag closer it's a comment.
		 * Find the first `>`, which closes the comment.
		 *
		 * This parser classifies these particular comments as special "funky comments"
		 * which are made available for further processing.
		 *
		 * See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name
		 */
		if ( $this->is_closing_tag ) {
			/*
			 * No chance of finding a closer: the document ends right after the
			 * first character following `</`. Report this as incomplete input,
			 * like every other truncated-token exit in this method.
			 */
			if ( $at + 3 > $doc_length ) {
				$this->parser_state = self::STATE_INCOMPLETE_INPUT;

				return false;
			}

			$closer_at = strpos( $html, '>', $at + 2 );
			if ( false === $closer_at ) {
				$this->parser_state = self::STATE_INCOMPLETE_INPUT;

				return false;
			}

			$this->parser_state         = self::STATE_FUNKY_COMMENT;
			$this->token_length         = $closer_at + 1 - $this->token_starts_at;
			$this->text_starts_at       = $this->token_starts_at + 2;
			$this->text_length          = $closer_at - $this->text_starts_at;
			$this->bytes_already_parsed = $closer_at + 1;
			return true;
		}

		// This "<" cannot start a token (e.g. "<3"): treat it as text and keep searching.
		++$at;
	}

	return false;
}