WP_HTML_Tag_Processor::skip_script_data() — private — WP 6.2.0

Skips contents of script tags.

Method of the class: WP_HTML_Tag_Processor{}

No Hooks.

Returns

true|false. Whether the script tag was closed before the end of the document.

Usage

// private - for code of main (parent) class only
$result = $this->skip_script_data(): bool;

Changelog

Since 6.2.0 Introduced.

WP_HTML_Tag_Processor::skip_script_data() code WP 7.0

/**
 * Skips contents of script tags.
 *
 * Starting at the current parse position, scans the document for the closing
 * tag that terminates the SCRIPT element while tracking the three script data
 * states: "unescaped", "escaped" (after "<!--"), and "double-escaped" (a
 * "<script" opener seen while escaped). A "</script" only terminates the
 * element from the unescaped or escaped states; inside a double-escaped
 * region it merely returns to the escaped state.
 *
 * On success `bytes_already_parsed` points just past the ">" of the closing
 * tag and `tag_name_starts_at` points at the "<" which opened the closer.
 *
 * @since 6.2.0
 *
 * @return bool Whether the script tag was closed before the end of the document.
 */
private function skip_script_data(): bool {
	$state      = 'unescaped';
	$html       = $this->html;
	$doc_length = strlen( $html );
	$at         = $this->bytes_already_parsed;

	while ( false !== $at && $at < $doc_length ) {
		// Only "-" and "<" can start a sequence of interest; jump to the next one.
		$at += strcspn( $html, '-<', $at );

		/*
		 * Optimization: Terminating a complete script element requires at least eight
		 * additional bytes in the document. Some checks below may cause local escaped
		 * state transitions when processing shorter strings, but those transitions are
		 * irrelevant if the script tag is incomplete and the function must return false.
		 *
		 * This may need updating if those transitions become significant or exported from
		 * this function in some way, such as when building safe methods to embed JavaScript
		 * or data inside a SCRIPT element.
		 *
		 *     $at may be here.
		 *        ↓
		 *     ...</script>
		 *         ╰──┬───╯
		 *     $at + 8 additional bytes are required for a non-false return value.
		 *
		 * This single check eliminates the need to check lengths for the shorter spans:
		 *
		 *           $at may be here.
		 *                  ↓
		 *     <script><!-- --></script>
		 *                   ├╯
		 *             $at + 2 additional characters does not require a length check.
		 *
		 * The transition from "escaped" to "unescaped" is not relevant if the document ends:
		 *
		 *           $at may be here.
		 *                  ↓
		 *     <script><!-- -->[[END-OF-DOCUMENT]]
		 *                   ╰──┬───╯
		 *             $at + 8 additional bytes is not satisfied, return false.
		 */
		if ( $at + 8 >= $doc_length ) {
			return false;
		}

		/*
		 * For all script states a "-->" transitions
		 * back into the normal unescaped script mode,
		 * even if that's the current state.
		 */
		if (
			'-' === $html[ $at ] &&
			'-' === $html[ $at + 1 ] &&
			'>' === $html[ $at + 2 ]
		) {
			$at   += 3;
			$state = 'unescaped';
			continue;
		}

		/*
		 * Everything of interest past here starts with "<".
		 * Check this character and advance position regardless.
		 */
		if ( '<' !== $html[ $at++ ] ) {
			continue;
		}

		/*
		 * "<!--" only transitions from _unescaped_ to _escaped_. This byte sequence is only
		 * significant in the _unescaped_ state and is ignored in any other state.
		 */
		if (
			'unescaped' === $state &&
			'!' === $html[ $at ] &&
			'-' === $html[ $at + 1 ] &&
			'-' === $html[ $at + 2 ]
		) {
			$at += 3;

			/*
			 * The parser is ready to enter the _escaped_ state, but may remain in the
			 * _unescaped_ state. This occurs when "<!--" is immediately followed by a
			 * sequence of 0 or more "-" followed by ">". This is similar to abruptly closed
			 * HTML comments like "<!-->" or "<!--->".
			 *
			 * Note that this check may advance the position significantly and requires a
			 * length check to prevent bad offsets on inputs like `<script><!---------`.
			 */
			$at += strspn( $html, '-', $at );
			if ( $at < $doc_length && '>' === $html[ $at ] ) {
				++$at;
				continue;
			}

			$state = 'escaped';
			continue;
		}

		if ( '/' === $html[ $at ] ) {
			// Remember where the "<" of this potential closer sits.
			$closer_potentially_starts_at = $at - 1;
			$is_closing                   = true;
			++$at;
		} else {
			$is_closing = false;
		}

		/*
		 * At this point the only remaining state-changes occur with the
		 * <script> and </script> tags; unless one of these appears next,
		 * proceed scanning to the next potential token in the text.
		 */
		if ( ! (
			( 's' === $html[ $at ] || 'S' === $html[ $at ] ) &&
			( 'c' === $html[ $at + 1 ] || 'C' === $html[ $at + 1 ] ) &&
			( 'r' === $html[ $at + 2 ] || 'R' === $html[ $at + 2 ] ) &&
			( 'i' === $html[ $at + 3 ] || 'I' === $html[ $at + 3 ] ) &&
			( 'p' === $html[ $at + 4 ] || 'P' === $html[ $at + 4 ] ) &&
			( 't' === $html[ $at + 5 ] || 'T' === $html[ $at + 5 ] )
		) ) {
			/*
			 * Do not advance past the byte at $at: it has not yet been examined
			 * as the start of a token. It may be a "<" opening a real tag, as in
			 * "<</script>", or a "-" beginning a "-->", as in "<-->". Skipping it
			 * would miss that token. The "<" (and optional "/") have already been
			 * consumed, so the scan still makes forward progress.
			 */
			continue;
		}

		/*
		 * Ensure that the script tag terminates to avoid matching on
		 * substrings of a non-match. For example, the sequence
		 * "<script123" should not end a script region even though
		 * "<script" is found within the text.
		 */
		$at += 6;
		$c   = $html[ $at ];
		if (
			/**
			 * These characters trigger state transitions of interest:
			 *
			 * - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state}
			 * - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state}
			 * - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state}
			 * - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state}
			 *
			 * The "\r" character is not present in the above references. However, "\r" must be
			 * treated the same as "\n". This is because the HTML Standard requires newline
			 * normalization during preprocessing which applies this replacement.
			 *
			 * - @see https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
			 * - @see https://infra.spec.whatwg.org/#normalize-newlines
			 */
			'>' !== $c &&
			' ' !== $c &&
			"\n" !== $c &&
			'/' !== $c &&
			"\t" !== $c &&
			"\f" !== $c &&
			"\r" !== $c
		) {
			continue;
		}

		// A "<script" opener inside an escaped region starts a double-escaped region.
		if ( 'escaped' === $state && ! $is_closing ) {
			$state = 'double-escaped';
			continue;
		}

		// A "</script" inside a double-escaped region only returns to the escaped state.
		if ( 'double-escaped' === $state && $is_closing ) {
			$state = 'escaped';
			continue;
		}

		if ( $is_closing ) {
			$this->bytes_already_parsed = $closer_potentially_starts_at;
			$this->tag_name_starts_at   = $closer_potentially_starts_at;
			if ( $this->bytes_already_parsed >= $doc_length ) {
				return false;
			}

			// Consume everything up to the end of the closing tag.
			while ( $this->parse_next_attribute() ) {
				continue;
			}

			if ( $this->bytes_already_parsed >= $doc_length ) {
				return false;
			}

			if ( '>' === $html[ $this->bytes_already_parsed ] ) {
				++$this->bytes_already_parsed;
				return true;
			}
		}

		++$at;
	}

	return false;
}