From 04ef182a93fe2ac65b9e039fc23f44c83924e1fe Mon Sep 17 00:00:00 2001 From: Tom Stein Date: Mon, 29 May 2023 15:53:27 +0200 Subject: [PATCH 1/2] add testcase to reproduce redirect behaviour --- test/mock_server/index.html | 4 ++++ .../subpages/redirect/redirect.html | 12 ++++++++++++ .../subpages/subsubpages/index.html | 13 +++++++++++++ test/test_integration.py | 19 ++++++++++++++++--- 4 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 test/mock_server/subpages/redirect/redirect.html create mode 100644 test/mock_server/subpages/subsubpages/index.html diff --git a/test/mock_server/index.html b/test/mock_server/index.html index 1dc59c3..77ca184 100644 --- a/test/mock_server/index.html +++ b/test/mock_server/index.html @@ -11,5 +11,9 @@

Index

To Page 1 To Page 4 To Subpage 1 + + To Subpage index + + To Subpage index / \ No newline at end of file diff --git a/test/mock_server/subpages/redirect/redirect.html b/test/mock_server/subpages/redirect/redirect.html new file mode 100644 index 0000000..247e184 --- /dev/null +++ b/test/mock_server/subpages/redirect/redirect.html @@ -0,0 +1,12 @@ + + + + + Redirect + + +

Redirect

+ To Page 3 + To Page 1 + + \ No newline at end of file diff --git a/test/mock_server/subpages/subsubpages/index.html b/test/mock_server/subpages/subsubpages/index.html new file mode 100644 index 0000000..5016ebd --- /dev/null +++ b/test/mock_server/subpages/subsubpages/index.html @@ -0,0 +1,13 @@ + + + + + Subpage index + + +

Subpage index

+ To Page 2 + To redirected page via absolute link + To redirected page via relative link + + \ No newline at end of file diff --git a/test/test_integration.py b/test/test_integration.py index 80407f6..64f5aea 100644 --- a/test/test_integration.py +++ b/test/test_integration.py @@ -25,11 +25,11 @@ def do_HEAD(self): self.send_response(HTTPStatus.METHOD_NOT_ALLOWED) self.end_headers() return - if not self.check_error(): + if not self.check_error() and not self.check_redirect(): super().do_HEAD() def do_GET(self): - if not self.check_error(): + if not self.check_error() and not self.check_redirect(): super().do_GET() def check_error(self): @@ -39,6 +39,15 @@ def check_error(self): return True return False + def check_redirect(self): + if self.path.endswith('/subpages'): + self.send_response(HTTPStatus.MOVED_PERMANENTLY) + new_url = self.path.replace('/subpages', '/subpages/') + self.send_header('Location', new_url) + self.end_headers() + return True + return False + def get_free_port(): s = socket.socket(socket.AF_INET, type=socket.SOCK_STREAM) @@ -121,7 +130,11 @@ def test_messagesLogged(self): f'200 - {self.url}/subpages/subpage1.html - ', f'200 - {self.url}/page2.html - ', f'200 - {self.url}/subpages/subpage2.html - ', - f'200 - {self.url}/index.html - ' + f'200 - {self.url}/index.html - ', + f'200 - {self.url}/subpages/redirect/redirect.html - ', + # duplicate because of fix for https://github.com/ScholliYT/Broken-Links-Crawler-Action/issues/39 + f'200 - {self.url}/subpages/subsubpages/ - ', + f'200 - {self.url}/subpages/subsubpages/ - ', ] actual_infos: List[str] = [] for call in info_mock.call_args_list: From a074e6a0c83f29560300752af773c7356041ceb4 Mon Sep 17 00:00:00 2001 From: Tom Stein Date: Mon, 29 May 2023 15:58:46 +0200 Subject: [PATCH 2/2] fix to update url target after following http redirects The url target is updated after following all http redirects to the final url. 
This fixes broken states where the crawler is not aware of its current location because of http redirects. This fix is not optimal as it causes duplicate requests to the same url. Additionally the path of the redirect is not traceable. --- deadseeker/responsefetcher.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/deadseeker/responsefetcher.py b/deadseeker/responsefetcher.py index 15f995a..b1676fa 100644 --- a/deadseeker/responsefetcher.py +++ b/deadseeker/responsefetcher.py @@ -68,6 +68,10 @@ async def _do_get( async with session.get(url) as response: timer.stop() resp.status = response.status + # Because of redirects the url might have changed. Update it here. + # This fixes https://github.com/ScholliYT/Broken-Links-Crawler-Action/issues/39 + urltarget.url = str(response.real_url) + if has_html(response) and is_onsite(urltarget): resp.html = await response.text()