From 4d37bb1b62429dd35cfaa55e48ad158c565a27e8 Mon Sep 17 00:00:00 2001
From: Marien Fressinaud <dev@marienfressinaud.fr>
Date: Tue, 16 Jun 2020 15:37:41 +0200
Subject: [PATCH] tec: Initialize the SpiderBits lib to sanitize URLs

---
 autoload.php                     |   2 +
 lib/SpiderBits/autoload.php      |   8 ++
 lib/SpiderBits/src/Url.php       | 194 +++++++++++++++++++++++++++++++
 tests/lib/SpiderBits/UrlTest.php |  72 ++++++++++++
 4 files changed, 276 insertions(+)
 create mode 100644 lib/SpiderBits/autoload.php
 create mode 100644 lib/SpiderBits/src/Url.php
 create mode 100644 tests/lib/SpiderBits/UrlTest.php

diff --git a/autoload.php b/autoload.php
index a038eeb5..e132edb1 100644
--- a/autoload.php
+++ b/autoload.php
@@ -16,6 +16,8 @@ function ($class_name) {
 
         if (strpos($class_name, 'Minz') === 0) {
             include $lib_path . '/Minz/autoload.php';
+        } elseif (strpos($class_name, 'SpiderBits') === 0) {
+            include $lib_path . '/SpiderBits/autoload.php';
         } elseif (strpos($class_name, $app_namespace) === 0) {
             $class_name = substr($class_name, strlen($app_namespace) + 1);
             $class_path = str_replace('\\', '/', $class_name) . '.php';
diff --git a/lib/SpiderBits/autoload.php b/lib/SpiderBits/autoload.php
new file mode 100644
index 00000000..c84bc5c7
--- /dev/null
+++ b/lib/SpiderBits/autoload.php
@@ -0,0 +1,8 @@
+<?php
+
+// Autoload SpiderBits\* classes from src/; the backslash in the prefix skips look-alike namespaces.
+spl_autoload_register(function ($class_name) {
+    if (strpos($class_name, 'SpiderBits\\') === 0) {
+        include __DIR__ . '/src/' . str_replace('\\', '/', substr($class_name, strlen('SpiderBits\\'))) . '.php';
+    }
+});
diff --git a/lib/SpiderBits/src/Url.php b/lib/SpiderBits/src/Url.php
new file mode 100644
index 00000000..5c878037
--- /dev/null
+++ b/lib/SpiderBits/src/Url.php
@@ -0,0 +1,194 @@
+<?php
+
+namespace SpiderBits;
+
+/**
+ * @author  Marien Fressinaud <dev@marienfressinaud.fr>
+ * @license http://www.gnu.org/licenses/agpl-3.0.en.html AGPL
+ */
+class Url
+{
+    /**
+     * Return the given URL as a sanitized string, so that equivalent URLs
+     * share the same canonical form.
+     *
+     * Algorithm comes from https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization
+     * with few adaptations (e.g. the port and the fragment are kept). The user
+     * and password components, if any, are not part of the returned URL.
+     *
+     * @param string $url
+     *
+     * @return string An empty string if the URL is empty or cannot be parsed
+     */
+    public static function sanitize($url)
+    {
+        // Remove unwanted characters (surrounding spaces, tabs and line breaks)
+        $cleaned_url = trim($url);
+        $cleaned_url = str_replace(["\t", "\r", "\n"], '', $cleaned_url);
+
+        if ($cleaned_url === '') {
+            return '';
+        }
+
+        // Parse components of the URL. Note we percent-decode later since
+        // "%23" are replaced by hashes (#) and could lead to a bad parsing.
+        $parsed_url = parse_url($cleaned_url);
+        if (!$parsed_url) {
+            return '';
+        }
+
+        // Then, we decode each part of the parsed URL, as long as percent-encoded
+        // characters exist. rawurldecode() keeps literal plus signs (+) as they are.
+        foreach ($parsed_url as $component => $value) {
+            while (preg_match('/%[0-9A-Fa-f]{2}/', $value) === 1) {
+                $value = rawurldecode($value);
+            }
+
+            $parsed_url[$component] = $value;
+        }
+
+        // Get the scheme (default is http), lowercased since it is case-insensitive
+        $scheme = isset($parsed_url['scheme']) ? strtolower($parsed_url['scheme']) : 'http';
+
+        // Get the host. In some situations (e.g. scheme is omitted), the host
+        // is considered as a path by `parse_url()`: its first segment is then
+        // the host, and the remainder (if any) is the actual path.
+        if (isset($parsed_url['host'])) {
+            $host = $parsed_url['host'];
+        } elseif (isset($parsed_url['path'])) {
+            $segments = explode('/', ltrim($parsed_url['path'], '/'), 2);
+            $host = $segments[0];
+            if (isset($segments[1])) {
+                $parsed_url['path'] = '/' . $segments[1];
+            } else {
+                unset($parsed_url['path']);
+            }
+        } else {
+            $host = '';
+        }
+
+        // Clean the extra dots from the host
+        $host = trim($host, '.');
+        $host = preg_replace('/\.{2,}/', '.', $host);
+
+        // The host can be a valid integer ip address, we want to normalize it
+        // to 4 dot-separated decimal values. Integers which don't fit in 32
+        // bits are left untouched since long2ip() would silently wrap them.
+        $ip_as_int = filter_var($host, FILTER_VALIDATE_INT);
+        if ($ip_as_int !== false && $ip_as_int >= 0 && $ip_as_int <= 4294967295) {
+            $host = long2ip($ip_as_int);
+        }
+
+        // idn_to_ascii allows to transform an unicode hostname to an ASCII
+        // representation (@see https://en.wikipedia.org/wiki/Punycode). It also
+        // lowercases the string. It returns false on failure and recent PHP
+        // versions reject empty domains: the host is empty in both cases.
+        $host = $host === '' ? '' : (string) idn_to_ascii($host, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46);
+
+        // Get the path with ./ and ../ paths replaced (default is /).
+        $path = isset($parsed_url['path']) ? self::normalizePath($parsed_url['path']) : '/';
+
+        // We finally rebuild the sanitized URL.
+        $sanitized_url = $scheme . '://' . $host;
+        if (isset($parsed_url['port'])) {
+            $sanitized_url .= ':' . $parsed_url['port'];
+        }
+        $sanitized_url .= $path;
+        if (isset($parsed_url['query'])) {
+            $sanitized_url .= '?' . $parsed_url['query'];
+        } elseif (strpos(explode('#', $cleaned_url, 2)[0], '?') !== false) {
+            // If the initial URL had a `?` without query string, `parse_url()`
+            // doesn't return the query component. We want to keep the question
+            // mark though. A `?` located in the fragment must be ignored.
+            $sanitized_url .= '?';
+        }
+
+        // Re-percent-encode the URL. We don't want to use directly
+        // rawurlencode() since it will convert slashes (/), colons (:) and
+        // question mark (?)
+        $sanitized_url = self::percentEncode($sanitized_url);
+
+        // The fragment must be added afterwards or the hash (#) could be
+        // converted.
+        if (isset($parsed_url['fragment'])) {
+            $sanitized_url .= '#' . self::percentEncode($parsed_url['fragment']);
+        }
+
+        return $sanitized_url;
+    }
+
+    /**
+     * Resolves references to ./, ../ and extra / characters from a path.
+     *
+     * It is similar to the realpath() function, but it doesn't require the
+     * file to exist. Leading and trailing slashes of the given path are kept.
+     *
+     * @see https://www.php.net/manual/function.realpath.php
+     *
+     * @param string $path
+     *
+     * @return string
+     */
+    private static function normalizePath($path)
+    {
+        $realpath = [];
+
+        // We just simulate browsing the path, segment by segment.
+        $path_segments = explode('/', $path);
+        foreach ($path_segments as $path_segment) {
+            // . is the current folder, so we can ignore it. Same if the
+            // segment is empty (i.e. consecutive slashes), we don't want it.
+            if ($path_segment === '.' || $path_segment === '') {
+                continue;
+            }
+
+            if ($path_segment === '..') {
+                // .. is the parent folder, so we must go back to the parent
+                // level (it has no effect if we already are at the root).
+                array_pop($realpath);
+            } else {
+                $realpath[] = $path_segment;
+            }
+        }
+
+        // Rebuild the path and make sure to keep the first and last slash if
+        // they existed in the original path (these checks accept an empty path).
+        $realpath = implode('/', $realpath);
+        if (strpos($path, '/') === 0) {
+            $realpath = '/' . $realpath;
+        }
+        if ($realpath !== '/' && substr($path, -1) === '/') {
+            $realpath = $realpath . '/';
+        }
+
+        return $realpath;
+    }
+
+    /**
+     * Percent-encode a URL.
+     *
+     * Contrary to urlencode() and rawurlencode(), this method only encodes
+     * ASCII characters <= 32, >= 127, '"', "#" and "%". This leaves for
+     * instance "/", ":" and "?" as they are.
+     *
+     * @see https://www.php.net/manual/function.rawurlencode.php
+     * @see https://en.wikipedia.org/wiki/ASCII
+     *
+     * @param string $url
+     *
+     * @return string
+     */
+    private static function percentEncode($url)
+    {
+        $escaped_url = '';
+        foreach (str_split($url) as $char) {
+            $ord = ord($char);
+            if ($ord > 32 && $ord < 127 && $char !== '"' && $char !== '#' && $char !== '%') {
+                $escaped_url .= $char;
+            } else {
+                $escaped_url .= rawurlencode($char);
+            }
+        }
+        return $escaped_url;
+    }
+}
diff --git a/tests/lib/SpiderBits/UrlTest.php b/tests/lib/SpiderBits/UrlTest.php
new file mode 100644
index 00000000..107d615e
--- /dev/null
+++ b/tests/lib/SpiderBits/UrlTest.php
@@ -0,0 +1,72 @@
+<?php
+
+namespace SpiderBits;
+
+class UrlTest extends \PHPUnit\Framework\TestCase
+{
+    /**
+     * Check that each input URL is sanitized to the expected canonical URL.
+     *
+     * @dataProvider sanitizeProvider
+     */
+    public function testSanitize($input, $expected)
+    {
+        $this->assertSame($expected, Url::sanitize($input));
+    }
+
+    public static function sanitizeProvider()
+    {
+        // Each dataset is [input, expected]. This test suite comes from https://developers.google.com/safe-browsing/v4/urls-hashing#canonicalization
+        // Minor differences are indicated in comments
+        // phpcs:disable Generic.Files.LineLength.TooLong
+        return [
+            ["", ''],
+            ["   ", ''],
+            [" \n\t\r", ''],
+            ["http://host/%25%32%35", 'http://host/%25'],
+            ["http://host/%25%32%35%25%32%35", 'http://host/%25%25'],
+            ["http://host/%2525252525252525", 'http://host/%25'],
+            ["http://host/asdf%25%32%35asd", 'http://host/asdf%25asd'],
+            ["http://host/%%%25%32%35asd%%", 'http://host/%25%25%25asd%25%25'],
+            ["http://www.google.com/", 'http://www.google.com/'],
+            ["http://%31%36%38%2e%31%38%38%2e%39%39%2e%32%36/%2E%73%65%63%75%72%65/%77%77%77%2E%65%62%61%79%2E%63%6F%6D/", 'http://168.188.99.26/.secure/www.ebay.com/'],
+            ["http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/", 'http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/'],
+            ["http://host%23.com/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A22%252833%252944_55%252B", 'http://host%23.com/~a!b@c%23d$e%25f^00&11*22(33)44_55+'],
+            ["http://3279880203/blah", 'http://195.127.0.11/blah'],
+            ["http://www.google.com/blah/..", 'http://www.google.com/'],
+            ["www.google.com/", 'http://www.google.com/'],
+            ["www.google.com", 'http://www.google.com/'],
+            // We want to keep the fragment
+            ["http://www.evil.com/blah#frag", 'http://www.evil.com/blah#frag'],
+            ["http://www.GOOgle.com/", 'http://www.google.com/'],
+            ["http://www.google.com.../", 'http://www.google.com/'],
+            ["http://www.google.com/foo\tbar\rbaz\n2", 'http://www.google.com/foobarbaz2'],
+            ["http://www.google.com/q?", 'http://www.google.com/q?'],
+            ["http://www.google.com/q?r?", 'http://www.google.com/q?r?'],
+            ["http://www.google.com/q?r?s", 'http://www.google.com/q?r?s'],
+            // We want to keep the fragment
+            ["http://evil.com/foo#bar#baz", 'http://evil.com/foo#bar%23baz'],
+            ["http://evil.com/foo;", 'http://evil.com/foo;'],
+            ["http://evil.com/foo?bar;", 'http://evil.com/foo?bar;'],
+            // idn_to_ascii cannot handle this URL (which is invalid anyway),
+            // so the sanitized host is empty
+            ["http://\x01\x80.com/", 'http:///'],
+            ["http://notrailingslash.com", 'http://notrailingslash.com/'],
+            // We want to keep the port
+            ["http://www.gotaport.com:1234/", 'http://www.gotaport.com:1234/'],
+            ["  http://www.google.com/  ", 'http://www.google.com/'],
+            ["http:// leadingspace.com/", 'http://%20leadingspace.com/'],
+            ["http://%20leadingspace.com/", 'http://%20leadingspace.com/'],
+            ["%20leadingspace.com/", 'http://%20leadingspace.com/'],
+            ["https://www.securesite.com/", 'https://www.securesite.com/'],
+            ["http://host.com/ab%23cd", 'http://host.com/ab%23cd'],
+            ["http://host.com//twoslashes?more//slashes", 'http://host.com/twoslashes?more//slashes'],
+            // More tests
+            ["https://domén-with-accent.com?query=with-àccent", 'https://xn--domn-with-accent-dqb.com/?query=with-%C3%A0ccent'],
+            ["https://host.com?query=with-%C3%A0ccent", 'https://host.com/?query=with-%C3%A0ccent'],
+            ["http://evil.com/foo#bar/baz", 'http://evil.com/foo#bar/baz'],
+            ["http://evil.com/foo#bar%22baz", 'http://evil.com/foo#bar%22baz'],
+            ["http://evil.com/foo#🐘", 'http://evil.com/foo#%F0%9F%90%98'],
+        ];
+    }
+}