Skip to content

Commit

Permalink
tec: Initialize the SpiderBits lib to sanitize URLs
Browse files Browse the repository at this point in the history
  • Loading branch information
marienfressinaud committed Jun 17, 2020
1 parent bd3494b commit 4d37bb1
Show file tree
Hide file tree
Showing 4 changed files with 276 additions and 0 deletions.
2 changes: 2 additions & 0 deletions autoload.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ function ($class_name) {

if (strpos($class_name, 'Minz') === 0) {
include $lib_path . '/Minz/autoload.php';
} elseif (strpos($class_name, 'SpiderBits') === 0) {
include $lib_path . '/SpiderBits/autoload.php';
} elseif (strpos($class_name, $app_namespace) === 0) {
$class_name = substr($class_name, strlen($app_namespace) + 1);
$class_path = str_replace('\\', '/', $class_name) . '.php';
Expand Down
8 changes: 8 additions & 0 deletions lib/SpiderBits/autoload.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?php

/**
 * Autoloader for the SpiderBits library.
 *
 * A class named `SpiderBits\Foo\Bar` is looked up in `src/Foo/Bar.php`,
 * relative to this file. Classes outside of the `SpiderBits\` namespace are
 * ignored so other autoloaders can handle them.
 */
spl_autoload_register(function ($class_name) {
    // Match on the namespace separator too, otherwise a sibling namespace
    // such as `SpiderBitsFoo\Bar` would be wrongly handled here.
    $namespace_prefix = 'SpiderBits\\';
    if (strpos($class_name, $namespace_prefix) !== 0) {
        return;
    }

    $relative_class_name = substr($class_name, strlen($namespace_prefix));
    $class_path = __DIR__ . '/src/' . str_replace('\\', '/', $relative_class_name) . '.php';

    // Do not emit an include warning for unknown classes: checks such as
    // class_exists() must be able to fail silently and let the next
    // registered autoloader have a try.
    if (is_file($class_path)) {
        include $class_path;
    }
});
194 changes: 194 additions & 0 deletions lib/SpiderBits/src/Url.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
<?php

namespace SpiderBits;

/**
* @author Marien Fressinaud <[email protected]>
* @license http://www.gnu.org/licenses/agpl-3.0.en.html AGPL
*/
class Url
{
    /**
     * Return the given URL as a sanitized string. It allows to compute a
     * canonical URL.
     *
     * Algorithm comes from https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization
     * with few adaptations (e.g. the port and the fragment are kept).
     *
     * The steps are: strip whitespaces, parse the URL, percent-decode each
     * component until no escape sequence remains, normalize the scheme, the
     * host and the path, then rebuild and percent-encode the result.
     *
     * @param string $url
     *
     * @return string The sanitized URL, or an empty string if the given URL
     *                is empty or cannot be parsed.
     */
    public static function sanitize($url)
    {
        // Remove unwanted characters
        $cleaned_url = trim($url);
        $cleaned_url = str_replace(["\t", "\r", "\n"], '', $cleaned_url);

        // Compare with the empty string explicitly: "0" is falsy in PHP but
        // it is a valid (integer IP) URL.
        if ($cleaned_url === '') {
            return '';
        }

        // Parse components of the URL. Note we percent-decode later since
        // "%23" are replaced by hashes (#) and could lead to a bad parsing.
        // Both `false` (malformed URL) and an empty array (e.g. a lonely "?")
        // are rejected.
        $parsed_url = parse_url($cleaned_url);
        if (!$parsed_url) {
            return '';
        }

        // Then, we decode each part of the parsed URL. We want to decode as
        // long as percent-encoding characters exist. rawurldecode() is used
        // instead of urldecode() because the latter also converts literal
        // plus signs (+) to spaces.
        foreach ($parsed_url as $component => $value) {
            if (!is_string($value)) {
                // The port is returned as an integer, there is nothing to
                // decode.
                continue;
            }

            while (preg_match('/%[0-9A-Fa-f]{2}/', $value) === 1) {
                $value = rawurldecode($value);
            }

            $parsed_url[$component] = $value;
        }

        // Get the scheme (default is http). Schemes are case-insensitive, so
        // the canonical form is lowercased.
        if (isset($parsed_url['scheme'])) {
            $scheme = strtolower($parsed_url['scheme']);
        } else {
            $scheme = 'http';
        }

        // Get the host. In some situations (e.g. scheme is omitted), the host
        // is considered as a path by `parse_url()`. In this case, the host is
        // what stands before the first slash, and the rest is the real path.
        if (isset($parsed_url['host'])) {
            $host = $parsed_url['host'];
        } elseif (isset($parsed_url['path'])) {
            $host_and_path = ltrim($parsed_url['path'], '/');
            $slash_position = strpos($host_and_path, '/');
            if ($slash_position === false) {
                $host = $host_and_path;
                unset($parsed_url['path']);
            } else {
                $host = substr($host_and_path, 0, $slash_position);
                $parsed_url['path'] = substr($host_and_path, $slash_position);
            }
        } else {
            $host = '';
        }

        // Clean the extra dots from the host
        $host = trim($host, '.');
        $host = preg_replace('/\.{2,}/', '.', $host);

        // The host can be a valid integer ip address, we want to normalize it
        // to 4 dot-separated decimal values
        $integer_ip = filter_var($host, FILTER_VALIDATE_INT);
        if ($integer_ip !== false) {
            $host = (string) long2ip($integer_ip);
        }

        // idn_to_ascii allows to transform an unicode hostname to an
        // ASCII representation
        // @see https://en.wikipedia.org/wiki/Punycode
        // It also lowercases the string. It must not be called with an empty
        // host, and it returns false if the host cannot be converted: the
        // host is considered as empty then.
        if ($host !== '') {
            $ascii_host = idn_to_ascii($host, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46);
            $host = $ascii_host === false ? '' : $ascii_host;
        }

        // Get the path with ./ and ../ paths replaced.
        if (isset($parsed_url['path'])) {
            $path = self::normalizePath($parsed_url['path']);
        } else {
            $path = '/';
        }

        // We finally rebuild the sanitized URL.
        $sanitized_url = $scheme . '://';
        $sanitized_url .= $host;
        if (isset($parsed_url['port'])) {
            $sanitized_url .= ':' . $parsed_url['port'];
        }
        $sanitized_url .= $path;
        if (isset($parsed_url['query'])) {
            $sanitized_url .= '?' . $parsed_url['query'];
        } else {
            // If the initial URL had a `?` without query string, `parse_url()`
            // doesn't return the query component. We want to keep the question
            // mark though. A question mark placed after the first hash belongs
            // to the fragment and must not be considered.
            $url_without_fragment = explode('#', $cleaned_url, 2)[0];
            if (strpos($url_without_fragment, '?') !== false) {
                $sanitized_url .= '?';
            }
        }

        // Re-percent-encode the URL. We don't want to use directly
        // rawurlencode() since it will convert slashes (/), colons (:) and
        // question mark (?)
        $sanitized_url = self::percentEncode($sanitized_url);

        // The fragment must be added afterwards or the hash (#) could be
        // converted.
        if (isset($parsed_url['fragment'])) {
            $sanitized_url .= '#' . self::percentEncode($parsed_url['fragment']);
        }

        return $sanitized_url;
    }

    /**
     * Resolves references to ./, ../ and extra / characters from a path.
     *
     * It is similar to the realpath() function, but it doesn't require the
     * file to exist.
     *
     * @see https://www.php.net/manual/function.realpath.php
     *
     * @param string $path
     *
     * @return string The normalized path. Leading and trailing slashes are
     *                kept if they existed in the given path.
     */
    private static function normalizePath($path)
    {
        if ($path === '') {
            // Nothing to resolve, and it avoids to read out of the string
            // bounds below.
            return '';
        }

        $realpath = [];

        // We just simulate browsing the path, segment by segment.
        $path_segments = explode('/', $path);
        foreach ($path_segments as $path_segment) {
            // . is the current folder, so we can ignore it. Same if the
            // segment is empty, we don't want it.
            if ($path_segment === '.' || $path_segment === '') {
                continue;
            }

            if ($path_segment === '..') {
                // .. is the parent folder, so we must go back to the parent
                // level (it does nothing if we already are at the root).
                array_pop($realpath);
            } else {
                $realpath[] = $path_segment;
            }
        }

        // Rebuild the path and make sure to keep the first and last slash if
        // they existed in the original path.
        $realpath = implode('/', $realpath);
        if ($path[0] === '/') {
            $realpath = '/' . $realpath;
        }
        if ($realpath !== '/' && substr($path, -1) === '/') {
            $realpath = $realpath . '/';
        }

        return $realpath;
    }

    /**
     * Percent-encode a URL.
     *
     * Contrary to urlencode() and rawurlencode(), this method only encodes
     * ASCII characters <= 32, >= 127, '"', "#" and "%". This leaves for
     * instance "/", ":" and "?" as they are.
     *
     * The string is processed byte by byte, so each byte of a multibyte
     * character is encoded separately.
     *
     * @see https://www.php.net/manual/function.rawurlencode.php
     * @see https://en.wikipedia.org/wiki/ASCII
     *
     * @param string $url
     *
     * @return string
     */
    private static function percentEncode($url)
    {
        $escaped_url = '';
        foreach (str_split($url) as $char) {
            $ord = ord($char);
            if ($ord > 32 && $ord < 127 && $char !== '"' && $char !== '#' && $char !== '%') {
                $escaped_url .= $char;
            } else {
                $escaped_url .= rawurlencode($char);
            }
        }
        return $escaped_url;
    }
}
72 changes: 72 additions & 0 deletions tests/lib/SpiderBits/UrlTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
<?php

namespace SpiderBits;

/**
 * Tests of the Url::sanitize() method: each dataset of the provider is an
 * input URL associated to its expected sanitized form.
 */
class UrlTest extends \PHPUnit\Framework\TestCase
{
/**
* Check that sanitizing the input returns exactly the expected string
* (assertSame performs a strict comparison).
*
* @dataProvider sanitizeProvider
*/
public function testSanitize($input, $expected)
{
$sanitized_url = Url::sanitize($input);

$this->assertSame($expected, $sanitized_url);
}

/**
* Return the datasets for testSanitize.
*
* Each item is an array of two strings: the URL to sanitize, then the
* expected result.
*/
public function sanitizeProvider()
{
// This test suite comes from https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization
// Minor differences are indicated in comments
// phpcs:disable Generic.Files.LineLength.TooLong
return [
// Empty and whitespaces-only inputs give an empty string
["", ''],
[" ", ''],
[" \n\t\r", ''],
// Percent-encoded characters are decoded repeatedly, then encoded once
["http://host/%25%32%35", 'http://host/%25'],
["http://host/%25%32%35%25%32%35", 'http://host/%25%25'],
["http://host/%2525252525252525", 'http://host/%25'],
["http://host/asdf%25%32%35asd", 'http://host/asdf%25asd'],
["http://host/%%%25%32%35asd%%", 'http://host/%25%25%25asd%25%25'],
["http://www.google.com/", 'http://www.google.com/'],
["http://%31%36%38%2e%31%38%38%2e%39%39%2e%32%36/%2E%73%65%63%75%72%65/%77%77%77%2E%65%62%61%79%2E%63%6F%6D/", 'http://168.188.99.26/.secure/www.ebay.com/'],
["http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/", 'http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/'],
["http://host%23.com/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A22%252833%252944_55%252B", 'http://host%23.com/~a!b@c%23d$e%25f^00&11*22(33)44_55+'],
// An integer host is converted to a dotted IP address
["http://3279880203/blah", 'http://195.127.0.11/blah'],
["http://www.google.com/blah/..", 'http://www.google.com/'],
// The scheme defaults to http when it is omitted
["www.google.com/", 'http://www.google.com/'],
["www.google.com", 'http://www.google.com/'],
// We want to keep the fragment
["http://www.evil.com/blah#frag", 'http://www.evil.com/blah#frag'],
["http://www.GOOgle.com/", 'http://www.google.com/'],
["http://www.google.com.../", 'http://www.google.com/'],
["http://www.google.com/foo\tbar\rbaz\n2",'http://www.google.com/foobarbaz2'],
["http://www.google.com/q?", 'http://www.google.com/q?'],
["http://www.google.com/q?r?", 'http://www.google.com/q?r?'],
["http://www.google.com/q?r?s", 'http://www.google.com/q?r?s'],
// We want to keep the fragment; an additional hash inside the fragment
// is percent-encoded
["http://evil.com/foo#bar#baz", 'http://evil.com/foo#bar%23baz'],
["http://evil.com/foo;", 'http://evil.com/foo;'],
["http://evil.com/foo?bar;", 'http://evil.com/foo?bar;'],
// idn_to_ascii cannot handle this url (which is invalid anyway)
// and return an empty host
["http://\x01\x80.com/", 'http:///'],
["http://notrailingslash.com", 'http://notrailingslash.com/'],
// We want to keep the port
["http://www.gotaport.com:1234/", 'http://www.gotaport.com:1234/'],
[" http://www.google.com/ ", 'http://www.google.com/'],
["http:// leadingspace.com/", 'http://%20leadingspace.com/'],
["http://%20leadingspace.com/", 'http://%20leadingspace.com/'],
["%20leadingspace.com/", 'http://%20leadingspace.com/'],
["https://www.securesite.com/", 'https://www.securesite.com/'],
["http://host.com/ab%23cd", 'http://host.com/ab%23cd'],
["http://host.com//twoslashes?more//slashes", 'http://host.com/twoslashes?more//slashes'],
// More tests (not part of the Safe Browsing suite): internationalized
// host, non-ASCII query and special characters in the fragment
["https://domén-with-accent.com?query=with-àccent", 'https://xn--domn-with-accent-dqb.com/?query=with-%C3%A0ccent'],
["https://host.com?query=with-%C3%A0ccent", 'https://host.com/?query=with-%C3%A0ccent'],
["http://evil.com/foo#bar/baz", 'http://evil.com/foo#bar/baz'],
["http://evil.com/foo#bar%22baz", 'http://evil.com/foo#bar%22baz'],
["http://evil.com/foo#🐘", 'http://evil.com/foo#%F0%9F%90%98'],
];
}
}

0 comments on commit 4d37bb1

Please sign in to comment.