Seditio Source
Root |
./othercms/phpBB3/vendor/s9e/text-formatter/src/Parser/AttributeFilters/UrlFilter.php
<?php

/**
* @package   s9e\TextFormatter
* @copyright Copyright (c) 2010-2021 The s9e authors
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\TextFormatter\Parser\AttributeFilters;

use
s9e\TextFormatter\Parser\Logger;

/**
 * Attribute filter that validates a URL against a URL config and returns a normalized,
 * sanitized copy of it
 */
class UrlFilter
{
    /**
     * Filter a URL
     *
     * The value is trimmed, split into its components, validated against $urlConfig and, if
     * valid, reassembled into a normalized and sanitized string.
     *
     * @param  mixed  $attrValue Original URL
     * @param  array  $urlConfig URL config. Its 'allowedSchemes' regexp is read whenever the URL
     *                           has a scheme; the optional 'disallowedHosts' and
     *                           'restrictedHosts' regexps are read whenever it has a host
     * @param  Logger $logger    Parser's logger, or NULL to skip logging
     * @return mixed             Cleaned up URL if valid, FALSE otherwise
     */
    public static function filter($attrValue, array $urlConfig, Logger $logger = null)
    {
        /**
         * Trim the URL to conform with HTML5 then parse it
         * @link http://dev.w3.org/html5/spec/links.html#attr-hyperlink-href
         */
        $p = self::parseUrl(trim($attrValue));

        $error = self::validateUrl($urlConfig, $p);
        if (!empty($error))
        {
            if (isset($logger))
            {
                // Log the parsed components alongside the untrimmed original value
                $p['attrValue'] = $attrValue;
                $logger->err($error, $p);
            }

            return false;
        }

        return self::rebuildUrl($p);
    }

    /**
     * Parse a URL and return its components
     *
     * Similar to PHP's own parse_url() except that all parts are always returned. Missing parts
     * are returned as empty strings. The query and fragment parts keep their leading "?" and
     * "#" respectively.
     *
     * @param  string $url Original URL
     * @return array       Map with the keys scheme, user, pass, host, port, path, query, fragment
     */
    protected static function parseUrl($url)
    {
        $regexp = '(^(?:([a-z][-+.\\w]*):)?(?://(?:([^:/?#]*)(?::([^/?#]*)?)?@)?(?:(\\[[a-f\\d:]+\\]|[^:/?#]+)(?::(\\d*))?)?(?![^/?#]))?([^?#]*)(\\?[^#]*)?(#.*)?$)Di';

        // NOTE: this regexp always matches because of the last three captures
        preg_match($regexp, $url, $m);

        $parts  = [];
        $tokens = ['scheme', 'user', 'pass', 'host', 'port', 'path', 'query', 'fragment'];
        foreach ($tokens as $i => $name)
        {
            // Capture N+1 holds token N; trailing captures that did not participate are absent
            $parts[$name] = $m[$i + 1] ?? '';
        }

        /**
         * @link http://tools.ietf.org/html/rfc3986#section-3.1
         *
         * 'An implementation should accept uppercase letters as equivalent to lowercase in
         * scheme names (e.g., allow "HTTP" as well as "http") for the sake of robustness but
         * should only produce lowercase scheme names for consistency.'
         */
        $parts['scheme'] = strtolower($parts['scheme']);

        /**
         * Normalize the domain label separators and remove trailing dots
         * @link http://url.spec.whatwg.org/#domain-label-separators
         */
        $parts['host'] = rtrim(preg_replace("/\xE3\x80\x82|\xEF(?:\xBC\x8E|\xBD\xA1)/s", '.', $parts['host']), '.');

        // Test whether host has non-ASCII characters and punycode it if possible
        if (preg_match('#[^[:ascii:]]#', $parts['host']) && function_exists('idn_to_ascii'))
        {
            $variant = (defined('INTL_IDNA_VARIANT_UTS46')) ? INTL_IDNA_VARIANT_UTS46 : 0;

            // The result is stored as-is; the host is validated later by validateUrl()
            $parts['host'] = idn_to_ascii($parts['host'], 0, $variant);
        }

        return $parts;
    }

    /**
     * Rebuild a parsed URL
     *
     * Credentials are re-encoded, percent-encoded octets are uppercased and the path, query and
     * fragment are passed through sanitizeUrl().
     *
     * @param  array  $p Parsed URL, as returned by parseUrl()
     * @return string
     */
    protected static function rebuildUrl(array $p)
    {
        $url = '';
        if ($p['scheme'] !== '')
        {
            $url .= $p['scheme'] . ':';
        }
        if ($p['host'] !== '')
        {
            $url .= '//';

            // Add the credentials if applicable
            if ($p['user'] !== '')
            {
                // Reencode the credentials in case there are invalid chars in them, or suspicious
                // characters such as : or @ that could confuse a browser into connecting to the
                // wrong host (or at least, to a host that is different than the one we thought)
                $url .= rawurlencode(urldecode($p['user']));

                if ($p['pass'] !== '')
                {
                    $url .= ':' . rawurlencode(urldecode($p['pass']));
                }

                $url .= '@';
            }

            $url .= $p['host'];

            // Append the port number (note that as per the regexp it can only contain digits)
            if ($p['port'] !== '')
            {
                $url .= ':' . $p['port'];
            }
        }
        elseif ($p['scheme'] === 'file')
        {
            // Allow the file: scheme to not have a host and ensure it starts with slashes
            $url .= '//';
        }

        // Build the path, including the query and fragment parts
        $path = $p['path'] . $p['query'] . $p['fragment'];

        /**
         * "For consistency, URI producers and normalizers should use uppercase hexadecimal digits
         * for all percent- encodings."
         *
         * Only well-formed percent-encoded octets (a percent sign followed by exactly two hex
         * digits) are uppercased. A stray percent sign must not cause the literal characters
         * that follow it to change case, as paths are case-sensitive.
         *
         * @link http://tools.ietf.org/html/rfc3986#section-2.1
         */
        $path = preg_replace_callback(
            '/%[0-9a-f]{2}/i',
            static function ($m)
            {
                return strtoupper($m[0]);
            },
            $path
        );

        // Append the sanitized path to the URL
        $url .= self::sanitizeUrl($path);

        // Replace the first colon if there's no scheme and it could potentially be interpreted as
        // the scheme separator
        if (!$p['scheme'])
        {
            $url = preg_replace('#^([^/]*):#', '$1%3A', $url);
        }

        return $url;
    }

    /**
     * Sanitize a URL for safe use regardless of context
     *
     * This method URL-encodes some sensitive characters in case someone would want to use the URL in
     * some JavaScript thingy, or in CSS. We also encode characters that are not allowed in the path
     * of a URL as defined in RFC 3986 appendix A, including percent signs that are not immediately
     * followed by two hex digits.
     *
     * " and ' to prevent breaking out of quotes (JavaScript or otherwise)
     * ( and ) to prevent the use of functions in JavaScript (eval()) or CSS (expression())
     * < and > to prevent breaking out of <script>
     * \r and \n because they're illegal in JavaScript
     * [ and ] because the W3 validator rejects them and they "should" be escaped as per RFC 3986
     * Non-ASCII characters as per RFC 3986
     * Control codes and spaces, as per RFC 3986
     *
     * @link http://sla.ckers.org/forum/read.php?2,51478
     * @link http://timelessrepo.com/json-isnt-a-javascript-subset
     * @link http://www.ietf.org/rfc/rfc3986.txt
     * @link http://stackoverflow.com/a/1547922
     * @link http://tools.ietf.org/html/rfc3986#appendix-A
     *
     * @param  string $url Original URL
     * @return string      Sanitized URL
     */
    public static function sanitizeUrl($url)
    {
        return preg_replace_callback(
            '/%(?![0-9A-Fa-f]{2})|[^!#-&*-;=?-Z_a-z~]/',
            static function ($m)
            {
                return rawurlencode($m[0]);
            },
            $url
        );
    }

    /**
     * Validate a parsed URL
     *
     * @param  array      $urlConfig URL config
     * @param  array      $p         Parsed URL, as returned by parseUrl()
     * @return string|null           Error message if invalid, or NULL
     */
    protected static function validateUrl(array $urlConfig, array $p)
    {
        // A URL without a scheme is not tested against the list of allowed schemes
        if ($p['scheme'] !== '' && !preg_match($urlConfig['allowedSchemes'], $p['scheme']))
        {
            return 'URL scheme is not allowed';
        }

        if ($p['host'] !== '')
        {
            /**
             * Test whether the host is valid
             * @link http://tools.ietf.org/html/rfc1035#section-2.3.1
             * @link http://tools.ietf.org/html/rfc1123#section-2
             */
            $regexp = '/^(?!-)[-a-z0-9]{0,62}[a-z0-9](?:\\.(?!-)[-a-z0-9]{0,62}[a-z0-9])*$/i';
            if (!preg_match($regexp, $p['host']))
            {
                // If the host invalid, retest as an IPv4 and IPv6 address (IPv6 in brackets)
                if (!NetworkFilter::filterIpv4($p['host'])
                 && !NetworkFilter::filterIpv6(preg_replace('/^\\[(.*)\\]$/', '$1', $p['host'])))
                {
                    return 'URL host is invalid';
                }
            }

            // The host must not match disallowedHosts and, if set, must match restrictedHosts
            if ((isset($urlConfig['disallowedHosts']) && preg_match($urlConfig['disallowedHosts'], $p['host']))
             || (isset($urlConfig['restrictedHosts']) && !preg_match($urlConfig['restrictedHosts'], $p['host'])))
            {
                return 'URL host is not allowed';
            }
        }
        elseif (preg_match('(^(?:(?:f|ht)tps?)$)', $p['scheme']))
        {
            // http, https, ftp and ftps URLs require a host
            return 'Missing host';
        }

        return null;
    }
}