WP_HTML_Tag_Processor::base_class_next_token() — private — WP 6.5.0

Internal method which finds the next token in the HTML document.

This method is a private internal function which implements the logic for finding the next token in a document. It exists so that the parser can update its state without affecting the location of the cursor in the document and without triggering subclass methods for things like next_token(), e.g. when applying patches before searching for the next token.

Method of the class: WP_HTML_Tag_Processor{}

Internal function — this function is intended for use by WordPress core itself. Using it in your own code is not recommended.

No Hooks.

Return

true|false. Whether a token was parsed.

Usage

// private - for code of main (parent) class only
$result = $this->base_class_next_token(); // Returns bool.

Changelog

Since 6.5.0 Introduced.

WP_HTML_Tag_Processor::base_class_next_token() code WP 6.7.1

/**
 * Internal method which finds the next token in the HTML document.
 *
 * Performs the raw scan for the next token and updates the parser state and
 * cursor accordingly. When the input ends in the middle of a token the cursor
 * is put back to where this call started and the parser is flagged as having
 * incomplete input.
 *
 * @since 6.5.0
 *
 * @return bool Whether a token was parsed.
 */
private function base_class_next_token(): bool {
	// Remember where this call began so the cursor can be rewound on failure.
	$resume_at = $this->bytes_already_parsed;
	$this->after_tag();

	// A document that has finished, or that was cut short, has nothing more to scan.
	$halted_states = array( self::STATE_COMPLETE, self::STATE_INCOMPLETE_INPUT );
	if ( in_array( $this->parser_state, $halted_states, true ) ) {
		return false;
	}

	// Every pass determines its own state; drop whatever the previous pass left behind.
	$this->parser_state = self::STATE_READY;

	if ( strlen( $this->html ) <= $this->bytes_already_parsed ) {
		$this->parser_state = self::STATE_COMPLETE;
		return false;
	}

	// Look for the next token.
	if ( false === $this->parse_next_tag() ) {
		// On incomplete input, rewind the cursor to where this call began.
		if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) {
			$this->bytes_already_parsed = $resume_at;
		}

		return false;
	}

	/*
	 * For legacy reasons, everything below deals with tags and their
	 * attributes. Only the states listed here continue into that handling;
	 * any other matched token is reported immediately.
	 */
	$tag_handling_states = array(
		self::STATE_INCOMPLETE_INPUT,
		self::STATE_COMPLETE,
		self::STATE_MATCHED_TAG,
	);
	if ( ! in_array( $this->parser_state, $tag_handling_states, true ) ) {
		return true;
	}

	// Consume attributes until none remain.
	do {
		$has_more_attributes = $this->parse_next_attribute();
	} while ( $has_more_attributes );

	/*
	 * The tag has to be terminated by a ">" before the document ends. Running
	 * out of input while parsing attributes, reaching the end of the document,
	 * and failing to find a ">" are all treated the same way.
	 */
	$ran_out_of_input = (
		self::STATE_INCOMPLETE_INPUT === $this->parser_state ||
		$this->bytes_already_parsed >= strlen( $this->html )
	);

	$closer_at = $ran_out_of_input
		? false
		: strpos( $this->html, '>', $this->bytes_already_parsed );

	if ( false === $closer_at ) {
		// Only the state and the cursor are reset here; parsed attributes are left untouched.
		$this->parser_state         = self::STATE_INCOMPLETE_INPUT;
		$this->bytes_already_parsed = $resume_at;

		return false;
	}

	$this->parser_state         = self::STATE_MATCHED_TAG;
	$this->bytes_already_parsed = $closer_at + 1;
	$this->token_length         = $this->bytes_already_parsed - $this->token_starts_at;

	// Closing tags and tags outside the HTML namespace need no further work.
	if ( $this->is_closing_tag || 'html' !== $this->parsing_namespace ) {
		return true;
	}

	/*
	 * Only these tags need additional processing:
	 *
	 *     IFRAME, LISTING, NOEMBED, NOFRAMES, PRE,
	 *     SCRIPT, STYLE, TEXTAREA, TITLE, XMP
	 *
	 * Checking the first letter of the tag name rules out most other tags
	 * without having to allocate a string for the name.
	 */
	$first_letter_matches = 1 === strspn( $this->html, 'iIlLnNpPsStTxX', $this->tag_name_starts_at, 1 );
	if ( ! $first_letter_matches ) {
		return true;
	}

	$tag_name = $this->get_tag();

	/*
	 * Record where the content of a LISTING or PRE element begins so that a
	 * leading newline at that offset can be skipped.
	 *
	 * @see static::skip_newline_at
	 */
	if ( 'LISTING' === $tag_name || 'PRE' === $tag_name ) {
		$this->skip_newline_at = $this->bytes_already_parsed;
		return true;
	}

	/*
	 * The remaining special elements hold RCDATA or RAWTEXT rather than
	 * ordinary markup: they cannot contain other elements and their contents
	 * are plaintext. They are handled as a single token, with the closing tag
	 * consumed together with the opening tag.
	 *
	 * Finding the closing tag overwrites the tag pointers, so capture the
	 * opening tag's values now and restore them once the closer is found.
	 */
	$opener_name_starts_at       = $this->tag_name_starts_at;
	$opener_name_length          = $this->tag_name_length;
	$opener_ends_at              = $this->token_starts_at + $this->token_length;
	$opener_attributes           = $this->attributes;
	$opener_duplicate_attributes = $this->duplicate_attributes;

	/*
	 * NOSCRIPT is deliberately absent from the raw-text list: the Tag
	 * Processor behaves as if scripting is disabled, and therefore descends
	 * into NOSCRIPT elements like any other element.
	 */
	$rawtext_tags = array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'STYLE', 'XMP' );

	if ( 'SCRIPT' === $tag_name ) {
		$found_closer = $this->skip_script_data();
	} elseif ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) {
		$found_closer = $this->skip_rcdata( $tag_name );
	} elseif ( in_array( $tag_name, $rawtext_tags, true ) ) {
		$found_closer = $this->skip_rawtext( $tag_name );
	} else {
		// Any other tag is not treated as a whole here.
		return true;
	}

	if ( ! $found_closer ) {
		$this->parser_state         = self::STATE_INCOMPLETE_INPUT;
		$this->bytes_already_parsed = $resume_at;
		return false;
	}

	/*
	 * At this point the instance's tag pointers describe the closing tag,
	 * because the skip routines moved the internal cursors past the element's
	 * content. Read the closer's position before restoring the opener's values.
	 */
	$closer_name_starts_at = $this->tag_name_starts_at;

	// The token covers everything from where this call began to the current cursor.
	$this->token_starts_at = $resume_at;
	$this->token_length    = $this->bytes_already_parsed - $resume_at;

	// The text runs from the end of the opening tag up to the recorded closer position.
	$this->text_starts_at = $opener_ends_at;
	$this->text_length    = $closer_name_starts_at - $opener_ends_at;

	// Point the tag name and attributes back at the opening tag.
	$this->tag_name_starts_at   = $opener_name_starts_at;
	$this->tag_name_length      = $opener_name_length;
	$this->attributes           = $opener_attributes;
	$this->duplicate_attributes = $opener_duplicate_attributes;

	return true;
}