Automattic\WooCommerce\Internal\Utilities

URL{}WC 1.0

Provides an easy method of assessing URLs, including filepaths (which will be silently converted to a file:// URL if provided).

No Hooks.

Usage

$URL = new URL();
// use class methods

Methods

  1. public __construct( string $url )
  2. public __toString()
  3. public get_all_parent_urls()
  4. public get_parent_url( int $level = 1 )
  5. public get_path( string $path_override = null )
  6. public get_url( array $component_overrides = array() )
  7. public is_absolute()
  8. public is_relative()
  9. private preprocess()
  10. private process_path()

URL{} code WC 8.7.0

class URL {
	/**
	 * Components of the URL being assessed.
	 *
	 * The keys match those potentially returned by the parse_url() function, except
	 * that they are always defined and 'drive' (Windows drive letter) has been added.
	 *
	 * @var string|null[]
	 */
	private $components = array(
		'drive'    => null,
		'fragment' => null,
		'host'     => null,
		'pass'     => null,
		'path'     => null,
		'port'     => null,
		'query'    => null,
		'scheme'   => null,
		'user'     => null,
	);

	/**
	 * If the URL (or filepath) is absolute.
	 *
	 * @var bool
	 */
	private $is_absolute;

	/**
	 * If the URL (or filepath) represents a directory other than the root directory.
	 *
	 * This is useful at different points in the process, when deciding whether to re-apply
	 * a trailing slash at the end of processing or when we need to calculate how many
	 * directory traversals are needed to form a (grand-)parent URL.
	 *
	 * @var bool
	 */
	private $is_non_root_directory;

	/**
	 * The components of the URL's path.
	 *
	 * For instance, in the case of "file:///srv/www/wp.site" (noting that a file URL has
	 * no host component) this would contain:
	 *
	 *     [ "srv", "www", "wp.site" ]
	 *
	 * In the case of a non-file URL such as "https://example.com/foo/bar/baz" (noting the
	 * host is not part of the path) it would contain:
	 *
	 *    [ "foo", "bar", "baz" ]
	 *
	 * @var array
	 */
	private $path_parts = array();

	/**
	 * The URL.
	 *
	 * @var string
	 */
	private $url;

	/**
	 * Creates and processes the provided URL (or filepath).
	 *
	 * @throws URLException If the URL (or filepath) is seriously malformed.
	 *
	 * @param string $url The URL (or filepath).
	 */
	public function __construct( string $url ) {
		$this->url = $url;
		$this->preprocess();
		$this->process_path();
	}

	/**
	 * Makes all slashes forward slashes, converts filepaths to file:// URLs, and
	 * other processing to help with comprehension of filepaths.
	 *
	 * @throws URLException If the URL is seriously malformed.
	 */
	private function preprocess() {
		// For consistency, all slashes should be forward slashes.
		$this->url = str_replace( '\\', '/', $this->url );

		// Windows: capture the drive letter if provided.
		if ( preg_match( '#^(file://)?([a-z]):/(?!/).*#i', $this->url, $matches ) ) {
			$this->components['drive'] = $matches[2];
		}

		/*
		 * If there is no scheme, assume and prepend "file://". An exception is made for cases where the URL simply
		 * starts with exactly two forward slashes, which indicates 'any scheme' (most commonly, that is used when
		 * there is freedom to switch between 'http' and 'https').
		 */
		if ( ! preg_match( '#^[a-z]+://#i', $this->url ) && ! preg_match( '#^//(?!/)#', $this->url ) ) {
			$this->url = 'file://' . $this->url;
		}

		$parsed_components = wp_parse_url( $this->url );

		// If we received a really badly formed URL, let's go no further.
		if ( false === $parsed_components ) {
			throw new URLException(
				sprintf(
				/* translators: %s is the URL. */
					__( '%s is not a valid URL.', 'woocommerce' ),
					$this->url
				)
			);
		}

		$this->components = array_merge( $this->components, $parsed_components );

		// File URLs cannot have a host. However, the initial path segment *or* the Windows drive letter
		// (if present) may be incorrectly be interpreted as the host name.
		if ( 'file' === $this->components['scheme'] && ! empty( $this->components['host'] ) ) {
			// If we do not have a drive letter, then simply merge the host and the path together.
			if ( null === $this->components['drive'] ) {
				$this->components['path'] = $this->components['host'] . ( $this->components['path'] ?? '' );
			}

			// Restore the host to null in this situation.
			$this->components['host'] = null;
		}
	}

	/**
	 * Simplifies the path if possible, by resolving directory traversals to the extent possible
	 * without touching the filesystem.
	 */
	private function process_path() {
		$segments                    = explode( '/', $this->components['path'] );
		$this->is_absolute           = substr( $this->components['path'], 0, 1 ) === '/' || ! empty( $this->components['host'] );
		$this->is_non_root_directory = substr( $this->components['path'], -1, 1 ) === '/' && strlen( $this->components['path'] ) > 1;
		$resolve_traversals          = 'file' !== $this->components['scheme'] || $this->is_absolute;
		$retain_traversals           = false;

		// Clean the path.
		foreach ( $segments as $part ) {
			// Drop empty segments.
			if ( strlen( $part ) === 0 || '.' === $part ) {
				continue;
			}

			// Directory traversals created with percent-encoding syntax should also be detected.
			$is_traversal = str_ireplace( '%2e', '.', $part ) === '..';

			// Resolve directory traversals (if allowed: see further comment relating to this).
			if ( $resolve_traversals && $is_traversal ) {
				if ( count( $this->path_parts ) > 0 && ! $retain_traversals ) {
					$this->path_parts = array_slice( $this->path_parts, 0, count( $this->path_parts ) - 1 );
					continue;
				} elseif ( $this->is_absolute ) {
					continue;
				}
			}

			/*
			 * Consider allowing directory traversals to be resolved (ie, the process that converts 'foo/bar/../baz' to
			 * 'foo/baz').
			 *
			 * 1. For this decision point, we are only concerned with relative filepaths (in all other cases,
			 *    $resolve_traversals will already be true).
			 * 2. This is a 'one time' and unidirectional operation. We only wish to flip from false to true, and we
			 *    never wish to do this more than once.
			 * 3. We only flip the switch after we have examined all leading '..' traversal segments.
			 */
			if ( false === $resolve_traversals && '..' !== $part && 'file' === $this->components['scheme'] && ! $this->is_absolute ) {
				$resolve_traversals = true;
			}

			/*
			 * Set a flag indicating that traversals should be retained. This is done to ensure we don't prematurely
			 * discard traversals at the start of the path.
			 */
			$retain_traversals = $resolve_traversals && '..' === $part;

			// Retain this part of the path.
			$this->path_parts[] = $part;
		}

		// Protect against empty relative paths.
		if ( count( $this->path_parts ) === 0 && ! $this->is_absolute ) {
			$this->path_parts = array( '.' );
			$this->is_non_root_directory = true;
		}

		// Reform the path from the processed segments, appending a leading slash if it is absolute and restoring
		// the Windows drive letter if we have one.
		$this->components['path'] = ( $this->is_absolute ? '/' : '' ) . implode( '/', $this->path_parts ) . ( $this->is_non_root_directory ? '/' : '' );
	}

	/**
	 * Returns the processed URL as a string.
	 *
	 * @return string
	 */
	public function __toString(): string {
		return $this->get_url();
	}

	/**
	 * Returns all possible parent URLs for the current URL.
	 *
	 * @return string[]
	 */
	public function get_all_parent_urls(): array {
		$max_parent = count( $this->path_parts );
		$parents    = array();

		/*
		 * If we are looking at a relative path that begins with at least one traversal (example: "../../foo")
		 * then we should only return one parent URL (otherwise, we'd potentially have to return an infinite
		 * number of parent URLs since we can't know how far the tree extends).
		 */
		if ( $max_parent > 0 && ! $this->is_absolute && '..' === $this->path_parts[0] ) {
			$max_parent = 1;
		}

		for ( $level = 1; $level <= $max_parent; $level++ ) {
			$parents[] = $this->get_parent_url( $level );
		}

		return $parents;
	}

	/**
	 * Outputs the parent URL.
	 *
	 * For example, if $this->get_url() returns "https://example.com/foo/bar/baz" then
	 * this method will return "https://example.com/foo/bar/".
	 *
	 * When a grand-parent is needed, the optional $level parameter can be used. By default
	 * this is set to 1 (parent). 2 will yield the grand-parent, 3 will yield the great
	 * grand-parent, etc.
	 *
	 * If a level is specified that exceeds the number of path segments, this method will
	 * return false.
	 *
	 * @param int $level Used to indicate the level of parent.
	 *
	 * @return string|false
	 */
	public function get_parent_url( int $level = 1 ) {
		if ( $level < 1 ) {
			$level = 1;
		}

		$parts_count               = count( $this->path_parts );
		$parent_path_parts_to_keep = $parts_count - $level;

		/*
		 * With the exception of file URLs, we do not allow obtaining (grand-)parent directories that require
		 * us to describe them using directory traversals. For example, given "http://hostname/foo/bar/baz.png" we do
		 * not permit determining anything more than 2 levels up (we cannot go beyond "http://hostname/").
		 */
		if ( 'file' !== $this->components['scheme'] && $parent_path_parts_to_keep < 0 ) {
			return false;
		}

		// In the specific case of an absolute filepath describing the root directory, there can be no parent.
		if ( 'file' === $this->components['scheme'] && $this->is_absolute && empty( $this->path_parts ) ) {
			return false;
		}

		// Handle cases where the path starts with one or more 'dot segments'. Since the path has already been
		// processed, we can be confident that any such segments are at the start of the path.
		if ( $parts_count > 0 && ( '.' === $this->path_parts[0] || '..' === $this->path_parts[0] ) ) {
			// Determine the index of the last dot segment (ex: given the path '/../../foo' it would be 1).
			$single_dots   = array_keys( $this->path_parts, '.', true );
			$double_dots   = array_keys( $this->path_parts, '..', true );
			$max_dot_index = max( array_merge( $single_dots, $double_dots ) );

			// Prepend the required number of traversals and discard unnessary trailing segments.
			$last_traversal = $max_dot_index + ( $this->is_non_root_directory ? 1 : 0 );
			$parent_path    = str_repeat( '../', $level ) . join( '/', array_slice( $this->path_parts, 0, $last_traversal ) );
		} elseif ( $parent_path_parts_to_keep < 0 ) {
			// For relative filepaths only, we use traversals to describe the requested parent.
			$parent_path = untrailingslashit( str_repeat( '../', $parent_path_parts_to_keep * -1 ) );
		} else {
			// Otherwise, in a very simple case, we just remove existing parts.
			$parent_path = implode( '/', array_slice( $this->path_parts, 0, $parent_path_parts_to_keep ) );
		}

		if ( $this->is_relative() && '' === $parent_path ) {
			$parent_path = '.';
		}

		// Append a trailing slash, since a parent is always a directory. The only exception is the current working directory.
		$parent_path .= '/';

		// For absolute paths, apply a leading slash (does not apply if we have a root path).
		if ( $this->is_absolute && 0 !== strpos( $parent_path, '/' ) ) {
			$parent_path = '/' . $parent_path;
		}

		// Form the parent URL (ditching the query and fragment, if set).
		$parent_url = $this->get_url(
			array(
				'path'     => $parent_path,
				'query'    => null,
				'fragment' => null,
			)
		);

		// We process the parent URL through a fresh instance of this class, for consistency.
		return ( new self( $parent_url ) )->get_url();
	}

	/**
	 * Outputs the processed URL.
	 *
	 * Borrows from https://www.php.net/manual/en/function.parse-url.php#106731
	 *
	 * @param array $component_overrides If provided, these will override values set in $this->components.
	 *
	 * @return string
	 */
	public function get_url( array $component_overrides = array() ): string {
		$components = array_merge( $this->components, $component_overrides );

		$scheme = null !== $components['scheme'] ? $components['scheme'] . '://' : '//';
		$host   = null !== $components['host'] ? $components['host'] : '';
		$port   = null !== $components['port'] ? ':' . $components['port'] : '';
		$path   = $this->get_path( $components['path'] );

		// Special handling for hostless URLs (typically, filepaths) referencing the current working directory.
		if ( '' === $host && ( '' === $path || '.' === $path ) ) {
			$path = './';
		}

		$user      = null !== $components['user'] ? $components['user'] : '';
		$pass      = null !== $components['pass'] ? ':' . $components['pass'] : '';
		$user_pass = ( ! empty( $user ) || ! empty( $pass ) ) ? $user . $pass . '@' : '';

		$query    = null !== $components['query'] ? '?' . $components['query'] : '';
		$fragment = null !== $components['fragment'] ? '#' . $components['fragment'] : '';

		return $scheme . $user_pass . $host . $port . $path . $query . $fragment;
	}

	/**
	 * Outputs the path. Especially useful if it was a a regular filepath that was passed in originally.
	 *
	 * @param string $path_override If provided this will be used as the URL path. Does not impact drive letter.
	 *
	 * @return string
	 */
	public function get_path( string $path_override = null ): string {
		return ( $this->components['drive'] ? $this->components['drive'] . ':' : '' ) . ( $path_override ?? $this->components['path'] );
	}

	/**
	 * Indicates if the URL or filepath was absolute.
	 *
	 * @return bool True if absolute, else false.
	 */
	public function is_absolute(): bool {
		return $this->is_absolute;
	}

	/**
	 * Indicates if the URL or filepath was relative.
	 *
	 * @return bool True if relative, else false.
	 */
	public function is_relative(): bool {
		return ! $this->is_absolute;
	}
}