From 04ef182a93fe2ac65b9e039fc23f44c83924e1fe Mon Sep 17 00:00:00 2001 From: Tom Stein Date: Mon, 29 May 2023 15:53:27 +0200 Subject: [PATCH 1/2] add testcase to reproduce redirect behaviour --- test/mock_server/index.html | 4 ++++ .../subpages/redirect/redirect.html | 12 ++++++++++++ .../subpages/subsubpages/index.html | 13 +++++++++++++ test/test_integration.py | 19 ++++++++++++++++--- 4 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 test/mock_server/subpages/redirect/redirect.html create mode 100644 test/mock_server/subpages/subsubpages/index.html diff --git a/test/mock_server/index.html b/test/mock_server/index.html index 1dc59c3..77ca184 100644 --- a/test/mock_server/index.html +++ b/test/mock_server/index.html @@ -11,5 +11,9 @@

Index

To Page 1 To Page 4 To Subpage 1 + + To Subpage index + + To Subpage index / \ No newline at end of file diff --git a/test/mock_server/subpages/redirect/redirect.html b/test/mock_server/subpages/redirect/redirect.html new file mode 100644 index 0000000..247e184 --- /dev/null +++ b/test/mock_server/subpages/redirect/redirect.html @@ -0,0 +1,12 @@ + + + + + Redirect + + +

Redirect

+ To Page 3 + To Page 1 + + \ No newline at end of file diff --git a/test/mock_server/subpages/subsubpages/index.html b/test/mock_server/subpages/subsubpages/index.html new file mode 100644 index 0000000..5016ebd --- /dev/null +++ b/test/mock_server/subpages/subsubpages/index.html @@ -0,0 +1,13 @@ + + + + + Subpage index + + +

Subpage index

+ To Page 2 + To redirected page via absolute link + To redirected page via relative link + + \ No newline at end of file diff --git a/test/test_integration.py b/test/test_integration.py index 80407f6..64f5aea 100644 --- a/test/test_integration.py +++ b/test/test_integration.py @@ -25,11 +25,11 @@ def do_HEAD(self): self.send_response(HTTPStatus.METHOD_NOT_ALLOWED) self.end_headers() return - if not self.check_error(): + if not self.check_error() and not self.check_redirect(): super().do_HEAD() def do_GET(self): - if not self.check_error(): + if not self.check_error() and not self.check_redirect(): super().do_GET() def check_error(self): @@ -39,6 +39,15 @@ def check_error(self): return True return False + def check_redirect(self): + if self.path.endswith('/subpages'): + self.send_response(HTTPStatus.MOVED_PERMANENTLY) + new_url = self.path.replace('/subpages', '/subpages/') + self.send_header('Location', new_url) + self.end_headers() + return True + return False + def get_free_port(): s = socket.socket(socket.AF_INET, type=socket.SOCK_STREAM) @@ -121,7 +130,11 @@ def test_messagesLogged(self): f'200 - {self.url}/subpages/subpage1.html - ', f'200 - {self.url}/page2.html - ', f'200 - {self.url}/subpages/subpage2.html - ', - f'200 - {self.url}/index.html - ' + f'200 - {self.url}/index.html - ', + f'200 - {self.url}/subpages/redirect/redirect.html - ', + # duplicate because of fix for https://github.com/ScholliYT/Broken-Links-Crawler-Action/issues/39 + f'200 - {self.url}/subpages/subsubpages/ - ', + f'200 - {self.url}/subpages/subsubpages/ - ', ] actual_infos: List[str] = [] for call in info_mock.call_args_list: From a074e6a0c83f29560300752af773c7356041ceb4 Mon Sep 17 00:00:00 2001 From: Tom Stein Date: Mon, 29 May 2023 15:58:46 +0200 Subject: [PATCH 2/2] fix to update url target after following http redirects The url target is updated after following all http redirects to the final url. 
This fixes broken states where the crawler is not aware of its current location because of http redirects. This fix is not optimal as it causes duplicate requests to the same url. Additionally the path of the redirect is not traceable. --- deadseeker/responsefetcher.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/deadseeker/responsefetcher.py b/deadseeker/responsefetcher.py index 15f995a..b1676fa 100644 --- a/deadseeker/responsefetcher.py +++ b/deadseeker/responsefetcher.py @@ -68,6 +68,10 @@ async def _do_get( async with session.get(url) as response: timer.stop() resp.status = response.status + # Because of redirects the url might have changed. Update it here. + # This fixes https://github.com/ScholliYT/Broken-Links-Crawler-Action/issues/39 + urltarget.url = str(response.real_url) + if has_html(response) and is_onsite(urltarget): resp.html = await response.text()