Automattic\WooCommerce\EmailEditor\Engine\Renderer
Html2Text::get_document
Parse HTML into a DOMDocument
Method of the class: Html2Text{}
No Hooks.
Returns
\DOMDocument. The parsed document tree.
Usage
$result = Html2Text::get_document( $html, $options ): \DOMDocument;
- $html(string) (required)
- The input HTML.
- $options(array) (required)
- .
Html2Text::get_document() Html2Text::get document code WC 10.8.1
/**
 * Parse HTML into a DOMDocument.
 *
 * @param string $html    The input HTML.
 * @param array  $options Parsing options. Reads `char_set` (an encoding name, a
 *                        comma-separated list of candidate encodings, or 'auto';
 *                        applied on PHP 8.1+ only) and `ignore_errors` (when truthy,
 *                        load in recovery mode with libxml errors suppressed).
 * @return \DOMDocument The parsed document tree, or an empty document when the
 *                      trimmed input is the empty string.
 * @throws Html2Text_Exception If the HTML could not be loaded.
 */
private static function get_document( string $html, array $options ): \DOMDocument {
	$doc  = new \DOMDocument();
	$html = trim( $html );

	// Compare against the empty string explicitly: a loose `! $html` check would
	// also discard the perfectly valid input "0".
	if ( '' === $html ) {
		// DOMDocument doesn't support empty value and throws an error.
		// Return empty document instead.
		return $doc;
	}

	if ( '<' !== $html[0] ) {
		// If HTML does not begin with a tag, we put a body tag around it.
		// If we do not do this, PHP will insert a paragraph tag around
		// the first block of text for some reason which can mess up
		// the newlines. See pre.html test for an example.
		$html = '<body>' . $html . '</body>';
	}

	$header = '';

	// Use char sets for modern versions of php.
	if ( PHP_MAJOR_VERSION * 10 + PHP_MINOR_VERSION >= 81 ) {
		// Use specified char_set, or auto detect if not set.
		$char_set = ! empty( $options['char_set'] ) && is_string( $options['char_set'] ) ? $options['char_set'] : 'auto';

		if ( 'auto' === $char_set ) {
			$detected = mb_detect_encoding( $html );
			$char_set = false !== $detected ? $detected : 'UTF-8';
		} elseif ( strpos( $char_set, ',' ) !== false ) {
			// Split the comma-separated candidates, drop blanks and reindex so
			// the list has consecutive integer keys.
			$encoding_list = array_values(
				array_filter(
					array_map( 'trim', explode( ',', $char_set ) ),
					function ( $encoding ) {
						return ! empty( $encoding );
					}
				)
			);

			// Hand the candidates to mb_detect_encoding() directly instead of
			// calling mb_detect_order(), which would change the process-wide
			// detection order for every later caller. If nothing usable was
			// listed (e.g. a char_set of ","), pass null to use the default order.
			$detected = mb_detect_encoding( $html, empty( $encoding_list ) ? null : $encoding_list );
			$char_set = false !== $detected ? $detected : 'UTF-8';
		}

		// Turn off error detection for Windows-1252 legacy html.
		if ( strpos( $char_set, '1252' ) !== false ) {
			$options['ignore_errors'] = true;
		}

		// Prefix the markup with an encoding declaration for the HTML loader.
		$header = '<?xml version="1.0" encoding="' . $char_set . '">';
	}

	if ( ! empty( $options['ignore_errors'] ) ) {
		// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
		$doc->strictErrorChecking = false;
		// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
		$doc->recover = true;
		// phpcs:ignore WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
		$doc->xmlStandalone = true;

		// Switch libxml to internal error handling for the duration of the load,
		// then restore whatever setting the caller had.
		$old_internal_errors = libxml_use_internal_errors( true );
		$load_result         = $doc->loadHTML( $header . $html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET | LIBXML_PARSEHUGE );
		libxml_use_internal_errors( $old_internal_errors );
	} else {
		$load_result = $doc->loadHTML( $header . $html );
	}

	if ( ! $load_result ) {
		// Log truncated HTML content for debugging purposes (limit to 500 chars to prevent log bloat).
		$html_preview = strlen( $html ) > 500 ? substr( $html, 0, 500 ) . '...[truncated]' : $html;
		// ENT_SUBSTITUTE keeps the preview when the byte-wise cut above splits a
		// multibyte character or the content is not valid UTF-8; without it
		// htmlspecialchars() returns an empty string and the log entry is useless.
		// phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log -- Security: Logging sensitive data separately from user-facing exception messages.
		error_log( 'Html2Text: Failed to load HTML content: ' . htmlspecialchars( $html_preview, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8' ) );
		// Throw a generic error message to avoid exposing sensitive data.
		throw new Html2Text_Exception( 'Could not load HTML - the content may be malformed.' );
	}

	return $doc;
}