Automattic\WooCommerce\EmailEditor\Engine\Renderer

Html2Text::get_document() private static WC 1.0

Parse HTML into a DOMDocument

Method of the class: Html2Text{}

No Hooks.

Returns

\DOMDocument. The parsed document tree.

Usage

$result = Html2Text::get_document( $html, $options ): \DOMDocument;
$html(string) (required)
The input HTML.
$options(array) (required)
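Parsing options. Recognized keys: char_set (string) – an encoding name, a comma-separated list of candidate encodings, or "auto" (the default) to detect the encoding; only used on PHP 8.1+. ignore_errors (bool) – when truthy, libxml parse errors and warnings are suppressed while loading.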

Html2Text::get_document() code WC 10.8.1

/**
 * Parse HTML into a DOMDocument.
 *
 * @param string $html    The input HTML.
 * @param array  $options Parsing options. Recognized keys:
 *                        - 'char_set' (string): an encoding name, a comma-separated list of
 *                          candidate encodings, or 'auto' (default) to detect it. Only used on PHP 8.1+.
 *                        - 'ignore_errors' (bool): suppress libxml errors/warnings while loading.
 * @return \DOMDocument The parsed document tree; an empty document when the trimmed input is empty.
 * @throws Html2Text_Exception When the HTML could not be loaded.
 */
private static function get_document( string $html, array $options ): \DOMDocument {

	$doc = new \DOMDocument();

	$html = trim( $html );

	if ( '' === $html ) {
		// DOMDocument doesn't support empty value and throws an error.
		// Return empty document instead. The comparison is strict so that
		// falsy-looking content such as "0" is still parsed.
		return $doc;
	}

	if ( '<' !== $html[0] ) {
		// If HTML does not begin with a tag, we put a body tag around it.
		// If we do not do this, PHP will insert a paragraph tag around
		// the first block of text for some reason which can mess up
		// the newlines. See pre.html test for an example.
		$html = '<body>' . $html . '</body>';
	}

	$header = '';
	// Use char sets for modern versions of php.
	if ( PHP_MAJOR_VERSION * 10 + PHP_MINOR_VERSION >= 81 ) {
		// Use specified char_set, or auto detect if not set.
		$char_set = ! empty( $options['char_set'] ) && is_string( $options['char_set'] ) ? $options['char_set'] : 'auto';
		if ( 'auto' === $char_set ) {
			$detected = mb_detect_encoding( $html );
			$char_set = false !== $detected ? $detected : 'UTF-8';
		} elseif ( strpos( $char_set, ',' ) !== false ) {
			// A comma-separated value is a list of candidate encodings to detect from.
			$encoding_list = array_values(
				array_filter(
					array_map( 'trim', explode( ',', $char_set ) ),
					function ( $encoding ) {
						return ! empty( $encoding );
					}
				)
			);
			// Pass the candidates straight to mb_detect_encoding() rather than calling
			// mb_detect_order(), which would change the detection order for the whole request.
			// If the list is empty (e.g. the option was just ","), use the default detection
			// order instead of writing the raw option value into the encoding header.
			$detected = ! empty( $encoding_list ) ? mb_detect_encoding( $html, $encoding_list ) : mb_detect_encoding( $html );
			$char_set = false !== $detected ? $detected : 'UTF-8';
		}
		// Turn off error detection for Windows-1252 legacy html.
		if ( strpos( $char_set, '1252' ) !== false ) {
			$options['ignore_errors'] = true;
		}
		// Encoding hint prepended to the markup so loadHTML() decodes it with the chosen char set.
		$header = '<?xml version="1.0" encoding="' . $char_set . '">';
	}

	if ( ! empty( $options['ignore_errors'] ) ) {
		// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
		$doc->strictErrorChecking = false;
		// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
		$doc->recover = true;
		// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
		$doc->xmlStandalone  = true;
		// Collect libxml errors internally while loading, then restore the caller's setting.
		$old_internal_errors = libxml_use_internal_errors( true );
		$load_result         = $doc->loadHTML( $header . $html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET | LIBXML_PARSEHUGE );
		libxml_use_internal_errors( $old_internal_errors );
	} else {
		$load_result = $doc->loadHTML( $header . $html );
	}

	if ( ! $load_result ) {
		// Log truncated HTML content for debugging purposes (limit to 500 chars to prevent log bloat).
		$html_preview = strlen( $html ) > 500 ? substr( $html, 0, 500 ) . '...[truncated]' : $html;
		// ENT_SUBSTITUTE: the preview is cut by bytes and the content may not be UTF-8; without it
		// htmlspecialchars() returns an empty string for invalid sequences and the preview is lost.
		// phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log -- Security: Logging sensitive data separately from user-facing exception messages.
		error_log( 'Html2Text: Failed to load HTML content: ' . htmlspecialchars( $html_preview, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8' ) );
		// Throw a generic error message to avoid exposing sensitive data.
		throw new Html2Text_Exception( 'Could not load HTML - the content may be malformed.' );
	}

	return $doc;
}