-
-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
tec: Initialize the SpiderBits lib to sanitize URLs
- Loading branch information
1 parent
bd3494b
commit 4d37bb1
Showing
4 changed files
with
276 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
<?php | ||
|
||
// Autoloader for the SpiderBits library: classes of the `SpiderBits\`
// namespace are loaded from the `src/` folder located next to this file, the
// namespace separators being mapped to directory separators.
spl_autoload_register(function ($class_name) {
    // Match the namespace *with* its trailing separator, so that unrelated
    // names which only start with the same letters (e.g. `SpiderBitsFoo\Bar`)
    // are left to the other autoloaders.
    $namespace_prefix = 'SpiderBits\\';
    $prefix_length = strlen($namespace_prefix);
    if (strncmp($class_name, $namespace_prefix, $prefix_length) !== 0) {
        return;
    }

    $relative_class_name = substr($class_name, $prefix_length);
    $class_file = __DIR__ . '/src/' . str_replace('\\', '/', $relative_class_name) . '.php';

    // Only include existing files: a missing class must not raise a warning,
    // so that class_exists() and the next registered autoloaders keep working.
    if (is_file($class_file)) {
        include $class_file;
    }
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,194 @@ | ||
<?php | ||
|
||
namespace SpiderBits; | ||
|
||
/**
 * Helpers to compute a canonical representation of URLs.
 *
 * @author Marien Fressinaud <[email protected]>
 * @license http://www.gnu.org/licenses/agpl-3.0.en.html AGPL
 */
class Url
{
    /**
     * Return the given URL as a sanitized string. It allows to compute a
     * canonical URL.
     *
     * Algorithm comes from https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization
     * with few adaptations (e.g. the port and the fragment are kept).
     *
     * Only the scheme, host, port, path, query and fragment are used to
     * rebuild the URL; the user and password components are not output.
     *
     * @param string $url
     *
     * @return string The sanitized URL, or an empty string if the given URL
     *                is empty or cannot be parsed.
     */
    public static function sanitize($url)
    {
        // Remove unwanted characters
        $cleaned_url = trim($url);
        $cleaned_url = str_replace(["\t", "\r", "\n"], '', $cleaned_url);

        // Compare with the empty string explicitly: a loose check would also
        // reject the string "0".
        if ($cleaned_url === '') {
            return '';
        }

        // Parse components of the URL. Note we percent-decode later since
        // "%23" are replaced by hashes (#) and could lead to a bad parsing.
        $parsed_url = parse_url($cleaned_url);
        if ($parsed_url === false || count($parsed_url) === 0) {
            return '';
        }

        // Then, we decode each part of the parsed URL. We want to decode as
        // long as percent-encoding characters exist. rawurldecode() is used
        // instead of urldecode() because the latter also turns plus signs
        // (+) into spaces, which would alter the URL.
        foreach ($parsed_url as $component => $value) {
            if (!is_string($value)) {
                // The port is an integer, there is nothing to decode.
                continue;
            }

            while (preg_match('/%[0-9A-Fa-f]{2}/', $value) === 1) {
                $value = rawurldecode($value);
            }

            $parsed_url[$component] = $value;
        }

        // Get the scheme (default is http)
        if (isset($parsed_url['scheme'])) {
            $scheme = $parsed_url['scheme'];
        } else {
            $scheme = 'http';
        }

        // Get the host. In some situations (e.g. scheme is omitted), the host
        // is considered as a path by `parse_url()`. In this case, the host is
        // what stands before the first slash, and the rest is the real path.
        if (isset($parsed_url['host'])) {
            $host = $parsed_url['host'];
        } elseif (isset($parsed_url['path'])) {
            $host_and_path = ltrim($parsed_url['path'], '/');
            $slash_position = strpos($host_and_path, '/');
            if ($slash_position === false) {
                $host = $host_and_path;
                unset($parsed_url['path']);
            } else {
                $host = substr($host_and_path, 0, $slash_position);
                $parsed_url['path'] = substr($host_and_path, $slash_position);
            }
        } else {
            $host = '';
        }

        // Clean the extra dots from the host
        $host = trim($host, '.');
        $host = preg_replace('/\.{2,}/', '.', $host);

        // The host can be a valid integer ip address, we want to normalize it
        // to 4 dot-separated decimal values
        $host_as_integer = filter_var($host, FILTER_VALIDATE_INT);
        if ($host_as_integer !== false) {
            $host = long2ip($host_as_integer);
        }

        // idn_to_ascii allows to transform an unicode hostname to an
        // ASCII representation
        // @see https://en.wikipedia.org/wiki/Punycode
        // It also lowercases the string. It returns false if the host cannot
        // be converted: the host is then considered as empty. An empty host
        // is not passed to the function since there is nothing to convert.
        if ($host !== '') {
            $ascii_host = idn_to_ascii($host, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46);
            $host = $ascii_host === false ? '' : $ascii_host;
        }

        // Get the path with ./ and ../ paths replaced.
        if (isset($parsed_url['path'])) {
            $path = self::normalizePath($parsed_url['path']);
        } else {
            $path = '/';
        }

        // We finally rebuild the sanitized URL.
        $sanitized_url = $scheme . '://';
        $sanitized_url .= $host;
        if (isset($parsed_url['port'])) {
            $sanitized_url .= ':' . $parsed_url['port'];
        }
        $sanitized_url .= $path;
        if (isset($parsed_url['query'])) {
            $sanitized_url .= '?' . $parsed_url['query'];
        } else {
            // If the initial URL had a `?` without query string, `parse_url()`
            // doesn't return the query component. We want to keep the question
            // mark though. We only look before the first hash (#): a question
            // mark which is part of the fragment doesn't start a query.
            $url_without_fragment = explode('#', $cleaned_url, 2)[0];
            if (strpos($url_without_fragment, '?') !== false) {
                $sanitized_url .= '?';
            }
        }

        // Re-percent-encode the URL. We don't want to use directly
        // rawurlencode() since it will convert slashes (/), colons (:) and
        // question mark (?)
        $sanitized_url = self::percentEncode($sanitized_url);

        // The fragment must be added afterwards or the hash (#) could be
        // converted.
        if (isset($parsed_url['fragment'])) {
            $sanitized_url .= '#' . self::percentEncode($parsed_url['fragment']);
        }

        return $sanitized_url;
    }

    /**
     * Resolves references to ./, ../ and extra / characters from a path.
     *
     * It is similar to the realpath() function, but it doesn't require the
     * file to exist.
     *
     * @see https://www.php.net/manual/function.realpath.php
     *
     * @param string $path
     *
     * @return string The normalized path. The leading and trailing slashes
     *                are kept if they existed in the given path.
     */
    private static function normalizePath($path)
    {
        // An empty path has no segment to resolve, and no first or last
        // character to look at.
        if ($path === '') {
            return '';
        }

        $realpath_segments = [];

        // We just simulate browsing the path, segment by segment.
        foreach (explode('/', $path) as $path_segment) {
            // . is the current folder, so we can ignore it. Same if the
            // segment is empty, we don't want it.
            if ($path_segment === '.' || $path_segment === '') {
                continue;
            }

            if ($path_segment === '..') {
                // .. is the parent folder, so we must go back to the parent
                // level (nothing happens if we already are at the root).
                array_pop($realpath_segments);
            } else {
                $realpath_segments[] = $path_segment;
            }
        }

        // Rebuild the path and make sure to keep the first and last slash if
        // they existed in the original path.
        $realpath = implode('/', $realpath_segments);
        if ($path[0] === '/') {
            $realpath = '/' . $realpath;
        }
        if ($realpath !== '/' && $path[strlen($path) - 1] === '/') {
            $realpath = $realpath . '/';
        }

        return $realpath;
    }

    /**
     * Percent-encode a URL.
     *
     * Contrary to urlencode() and rawurlencode(), this method only encodes
     * ASCII characters <= 32, >= 127, '"', "#" and "%". This leaves for
     * instance "/", ":" and "?" as they are.
     *
     * The string is processed byte per byte, so each byte of a multibyte
     * character is encoded separately.
     *
     * @see https://www.php.net/manual/function.rawurlencode.php
     * @see https://en.wikipedia.org/wiki/ASCII
     *
     * @param string $url
     *
     * @return string
     */
    private static function percentEncode($url)
    {
        $escaped_url = '';
        foreach (str_split($url) as $char) {
            $ord = ord($char);
            if ($ord > 32 && $ord < 127 && $char !== '"' && $char !== '#' && $char !== '%') {
                $escaped_url .= $char;
            } else {
                $escaped_url .= rawurlencode($char);
            }
        }
        return $escaped_url;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
<?php | ||
|
||
namespace SpiderBits; | ||
|
||
/**
 * Tests of the Url::sanitize() method, based on the canonicalization test
 * suite published by Google Safe Browsing.
 */
class UrlTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Check that sanitizing the input returns exactly the expected URL.
     *
     * @dataProvider sanitizeProvider
     *
     * @param string $input The URL to sanitize
     * @param string $expected The expected sanitized URL
     */
    public function testSanitize($input, $expected)
    {
        $sanitized_url = Url::sanitize($input);

        $this->assertSame($expected, $sanitized_url);
    }

    /**
     * Provide the [input, expected] pairs for testSanitize().
     *
     * The method is static because recent versions of PHPUnit require data
     * providers to be static (older versions accept static providers too).
     *
     * @return array[]
     */
    public static function sanitizeProvider()
    {
        // This test suite comes from https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization
        // Minor differences are indicated in comments
        // phpcs:disable Generic.Files.LineLength.TooLong
        return [
            ["", ''],
            [" ", ''],
            [" \n\t\r", ''],
            ["http://host/%25%32%35", 'http://host/%25'],
            ["http://host/%25%32%35%25%32%35", 'http://host/%25%25'],
            ["http://host/%2525252525252525", 'http://host/%25'],
            ["http://host/asdf%25%32%35asd", 'http://host/asdf%25asd'],
            ["http://host/%%%25%32%35asd%%", 'http://host/%25%25%25asd%25%25'],
            ["http://www.google.com/", 'http://www.google.com/'],
            ["http://%31%36%38%2e%31%38%38%2e%39%39%2e%32%36/%2E%73%65%63%75%72%65/%77%77%77%2E%65%62%61%79%2E%63%6F%6D/", 'http://168.188.99.26/.secure/www.ebay.com/'],
            ["http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/", 'http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/'],
            ["http://host%23.com/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A22%252833%252944_55%252B", 'http://host%23.com/~a!b@c%23d$e%25f^00&11*22(33)44_55+'],
            ["http://3279880203/blah", 'http://195.127.0.11/blah'],
            ["http://www.google.com/blah/..", 'http://www.google.com/'],
            ["www.google.com/", 'http://www.google.com/'],
            ["www.google.com", 'http://www.google.com/'],
            // We want to keep the fragment
            ["http://www.evil.com/blah#frag", 'http://www.evil.com/blah#frag'],
            ["http://www.GOOgle.com/", 'http://www.google.com/'],
            ["http://www.google.com.../", 'http://www.google.com/'],
            ["http://www.google.com/foo\tbar\rbaz\n2",'http://www.google.com/foobarbaz2'],
            ["http://www.google.com/q?", 'http://www.google.com/q?'],
            ["http://www.google.com/q?r?", 'http://www.google.com/q?r?'],
            ["http://www.google.com/q?r?s", 'http://www.google.com/q?r?s'],
            // We want to keep the fragment
            ["http://evil.com/foo#bar#baz", 'http://evil.com/foo#bar%23baz'],
            ["http://evil.com/foo;", 'http://evil.com/foo;'],
            ["http://evil.com/foo?bar;", 'http://evil.com/foo?bar;'],
            // idn_to_ascii cannot handle this url (which is invalid anyway)
            // and return an empty host
            ["http://\x01\x80.com/", 'http:///'],
            ["http://notrailingslash.com", 'http://notrailingslash.com/'],
            // We want to keep the port
            ["http://www.gotaport.com:1234/", 'http://www.gotaport.com:1234/'],
            [" http://www.google.com/ ", 'http://www.google.com/'],
            ["http:// leadingspace.com/", 'http://%20leadingspace.com/'],
            ["http://%20leadingspace.com/", 'http://%20leadingspace.com/'],
            ["%20leadingspace.com/", 'http://%20leadingspace.com/'],
            ["https://www.securesite.com/", 'https://www.securesite.com/'],
            ["http://host.com/ab%23cd", 'http://host.com/ab%23cd'],
            ["http://host.com//twoslashes?more//slashes", 'http://host.com/twoslashes?more//slashes'],
            // More tests
            ["https://domén-with-accent.com?query=with-àccent", 'https://xn--domn-with-accent-dqb.com/?query=with-%C3%A0ccent'],
            ["https://host.com?query=with-%C3%A0ccent", 'https://host.com/?query=with-%C3%A0ccent'],
            ["http://evil.com/foo#bar/baz", 'http://evil.com/foo#bar/baz'],
            ["http://evil.com/foo#bar%22baz", 'http://evil.com/foo#bar%22baz'],
            ["http://evil.com/foo#🐘", 'http://evil.com/foo#%F0%9F%90%98'],
        ];
    }
}