From 4d37bb1b62429dd35cfaa55e48ad158c565a27e8 Mon Sep 17 00:00:00 2001 From: Marien Fressinaud Date: Tue, 16 Jun 2020 15:37:41 +0200 Subject: [PATCH] tec: Initialize the SpiderBits lib to sanitize URLs --- autoload.php | 2 + lib/SpiderBits/autoload.php | 8 ++ lib/SpiderBits/src/Url.php | 194 +++++++++++++++++++++++++++++++ tests/lib/SpiderBits/UrlTest.php | 72 ++++++++++++ 4 files changed, 276 insertions(+) create mode 100644 lib/SpiderBits/autoload.php create mode 100644 lib/SpiderBits/src/Url.php create mode 100644 tests/lib/SpiderBits/UrlTest.php diff --git a/autoload.php b/autoload.php index a038eeb5..e132edb1 100644 --- a/autoload.php +++ b/autoload.php @@ -16,6 +16,8 @@ function ($class_name) { if (strpos($class_name, 'Minz') === 0) { include $lib_path . '/Minz/autoload.php'; + } elseif (strpos($class_name, 'SpiderBits') === 0) { + include $lib_path . '/SpiderBits/autoload.php'; } elseif (strpos($class_name, $app_namespace) === 0) { $class_name = substr($class_name, strlen($app_namespace) + 1); $class_path = str_replace('\\', '/', $class_name) . '.php'; diff --git a/lib/SpiderBits/autoload.php b/lib/SpiderBits/autoload.php new file mode 100644 index 00000000..c84bc5c7 --- /dev/null +++ b/lib/SpiderBits/autoload.php @@ -0,0 +1,8 @@ + + * @license http://www.gnu.org/licenses/agpl-3.0.en.html AGPL + */ +class Url +{ + /** + * Return the given URL as a sanitized string. It can be used to compute a + * canonical URL. + * + * Algorithm comes from https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization + * with a few adaptations. + * + * @param string $url + * + * @return string + */ + public static function sanitize($url) + { + // Remove unwanted characters + $cleaned_url = trim($url); + $cleaned_url = str_replace(["\t", "\r", "\n"], '', $cleaned_url); + + if (!$cleaned_url) { + return ''; + } + + // Parse components of the URL. Note we percent-decode later since + // "%23" would be replaced by hashes (#) and could lead to bad parsing.
+ $parsed_url = parse_url($cleaned_url); + if (!$parsed_url) { + return ''; + } + + // Then, we decode each part of the parsed URL. We want to decode as + // long as percent-encoded characters exist. + foreach ($parsed_url as $component => $value) { + while (preg_match('/%[0-9A-Fa-f]{2}/', $value) === 1) { + $value = urldecode($value); + } + + $parsed_url[$component] = $value; + } + + // Get the scheme (default is http) + if (isset($parsed_url['scheme'])) { + $scheme = $parsed_url['scheme']; + } else { + $scheme = 'http'; + } + + // Get the host. In some situations (e.g. scheme is omitted), the host + // is considered as a path by `parse_url()`. + if (isset($parsed_url['host'])) { + $host = $parsed_url['host']; + } elseif (isset($parsed_url['path'])) { + $host = trim($parsed_url['path'], '/'); + unset($parsed_url['path']); + } else { + $host = ''; + } + + // Clean the extra dots from the host + $host = trim($host, '.'); + $host = preg_replace('/\.{2,}/', '.', $host); + + // The host can be a valid integer IP address; we want to normalize it + // to 4 dot-separated decimal values + if (filter_var($host, FILTER_VALIDATE_INT) !== false) { + $host = long2ip($host); + } + + // idn_to_ascii transforms a Unicode hostname into an + // ASCII representation + // @see https://en.wikipedia.org/wiki/Punycode + // It also lowercases the string. + $host = idn_to_ascii($host, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46); + + // Get the path with ./ and ../ paths replaced. + if (isset($parsed_url['path'])) { + $path = self::normalizePath($parsed_url['path']); + } else { + $path = '/'; + } + + // We finally rebuild the sanitized URL. + $sanitized_url = $scheme . '://'; + $sanitized_url .= $host; + if (isset($parsed_url['port'])) { + $sanitized_url .= ':' . $parsed_url['port']; + } + $sanitized_url .= $path; + if (isset($parsed_url['query'])) { + $sanitized_url .= '?' .
$parsed_url['query']; + } elseif (strpos($url, '?') !== false) { + // If the initial URL had a `?` without query string, `parse_url()` + // doesn't return the query component. We want to keep the question + // mark though. + $sanitized_url .= '?'; + } + + // Re-percent-encode the URL. We don't want to use + // rawurlencode() directly since it would convert slashes (/), colons (:) and + // question marks (?) + $sanitized_url = self::percentEncode($sanitized_url); + + // The fragment must be added afterwards or the hash (#) could be + // converted. + if (isset($parsed_url['fragment'])) { + $sanitized_url .= '#' . self::percentEncode($parsed_url['fragment']); + } + + return $sanitized_url; + } + + /** + * Resolves references to ./, ../ and extra / characters from a path. + * + * It is similar to the realpath() function, but it doesn't require the + * file to exist. + * + * @see https://www.php.net/manual/function.realpath.php + * + * @param string $path + * + * @return string + */ + private static function normalizePath($path) + { + $realpath = array(); + + // We just simulate browsing the path, segment by segment. + $path_segments = explode('/', $path); + foreach ($path_segments as $path_segment) { + // . is the current folder, so we can ignore it. Same if the + // segment is empty, we don't want it. + if ($path_segment === '.' || strlen($path_segment) === 0) { + continue; + } + + if ($path_segment === '..') { + // .. is the parent folder, so we must go back to the parent + // level + array_pop($realpath); + } else { + $realpath[] = $path_segment; + } + } + + // Rebuild the path and make sure to keep the first and last slash if + // they existed in the original path. + $realpath = implode('/', $realpath); + if ($path[0] === '/') { + $realpath = '/' . $realpath; + } + if ($realpath !== '/' && $path[strlen($path) - 1] === '/') { + $realpath = $realpath . '/'; + } + + return $realpath; + } + + /** + * Percent-encode a URL.
+ * + * Contrary to urlencode() and rawurlencode(), this method only encodes + * bytes <= 32 or >= 127, '"', "#" and "%". This leaves for + * instance "/", ":" and "?" as they are. + * + * @see https://www.php.net/manual/function.rawurlencode.php + * @see https://en.wikipedia.org/wiki/ASCII + * + * @param string $url + * + * @return string + */ + private static function percentEncode($url) + { + $escaped_url = ''; + foreach (str_split($url) as $char) { + $ord = ord($char); + if ($ord > 32 && $ord < 127 && $char !== '"' && $char !== '#' && $char !== '%') { + $escaped_url .= $char; + } else { + $escaped_url .= rawurlencode($char); + } + } + return $escaped_url; + } +} diff --git a/tests/lib/SpiderBits/UrlTest.php b/tests/lib/SpiderBits/UrlTest.php new file mode 100644 index 00000000..107d615e --- /dev/null +++ b/tests/lib/SpiderBits/UrlTest.php @@ -0,0 +1,72 @@ +assertSame($expected, $sanitized_url); + } + + public function sanitizeProvider() + { + // This test suite comes from https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization + // Minor differences are indicated in comments + // phpcs:disable Generic.Files.LineLength.TooLong + return [ + ["", ''], + [" ", ''], + [" \n\t\r", ''], + ["http://host/%25%32%35", 'http://host/%25'], + ["http://host/%25%32%35%25%32%35", 'http://host/%25%25'], + ["http://host/%2525252525252525", 'http://host/%25'], + ["http://host/asdf%25%32%35asd", 'http://host/asdf%25asd'], + ["http://host/%%%25%32%35asd%%", 'http://host/%25%25%25asd%25%25'], + ["http://www.google.com/", 'http://www.google.com/'], + ["http://%31%36%38%2e%31%38%38%2e%39%39%2e%32%36/%2E%73%65%63%75%72%65/%77%77%77%2E%65%62%61%79%2E%63%6F%6D/", 'http://168.188.99.26/.secure/www.ebay.com/'], + ["http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/", 'http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/'], +
["http://host%23.com/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A22%252833%252944_55%252B", 'http://host%23.com/~a!b@c%23d$e%25f^00&11*22(33)44_55+'], + ["http://3279880203/blah", 'http://195.127.0.11/blah'], + ["http://www.google.com/blah/..", 'http://www.google.com/'], + ["www.google.com/", 'http://www.google.com/'], + ["www.google.com", 'http://www.google.com/'], + // We want to keep the fragment + ["http://www.evil.com/blah#frag", 'http://www.evil.com/blah#frag'], + ["http://www.GOOgle.com/", 'http://www.google.com/'], + ["http://www.google.com.../", 'http://www.google.com/'], + ["http://www.google.com/foo\tbar\rbaz\n2",'http://www.google.com/foobarbaz2'], + ["http://www.google.com/q?", 'http://www.google.com/q?'], + ["http://www.google.com/q?r?", 'http://www.google.com/q?r?'], + ["http://www.google.com/q?r?s", 'http://www.google.com/q?r?s'], + // We want to keep the fragment + ["http://evil.com/foo#bar#baz", 'http://evil.com/foo#bar%23baz'], + ["http://evil.com/foo;", 'http://evil.com/foo;'], + ["http://evil.com/foo?bar;", 'http://evil.com/foo?bar;'], + // idn_to_ascii cannot handle this url (which is invalid anyway) + // and return an empty host + ["http://\x01\x80.com/", 'http:///'], + ["http://notrailingslash.com", 'http://notrailingslash.com/'], + // We want to keep the port + ["http://www.gotaport.com:1234/", 'http://www.gotaport.com:1234/'], + [" http://www.google.com/ ", 'http://www.google.com/'], + ["http:// leadingspace.com/", 'http://%20leadingspace.com/'], + ["http://%20leadingspace.com/", 'http://%20leadingspace.com/'], + ["%20leadingspace.com/", 'http://%20leadingspace.com/'], + ["https://www.securesite.com/", 'https://www.securesite.com/'], + ["http://host.com/ab%23cd", 'http://host.com/ab%23cd'], + ["http://host.com//twoslashes?more//slashes", 'http://host.com/twoslashes?more//slashes'], + // More tests + ["https://domén-with-accent.com?query=with-àccent", 'https://xn--domn-with-accent-dqb.com/?query=with-%C3%A0ccent'], + 
["https://host.com?query=with-%C3%A0ccent", 'https://host.com/?query=with-%C3%A0ccent'], + ["http://evil.com/foo#bar/baz", 'http://evil.com/foo#bar/baz'], + ["http://evil.com/foo#bar%22baz", 'http://evil.com/foo#bar%22baz'], + ["http://evil.com/foo#🐘", 'http://evil.com/foo#%F0%9F%90%98'], + ]; + } +}