diff --git a/deadseeker/responsefetcher.py b/deadseeker/responsefetcher.py index 15f995a..b1676fa 100644 --- a/deadseeker/responsefetcher.py +++ b/deadseeker/responsefetcher.py @@ -68,6 +68,10 @@ async def _do_get( async with session.get(url) as response: timer.stop() resp.status = response.status + # Because of redirects the url might have changed. Update it here. + # This fixes https://github.com/ScholliYT/Broken-Links-Crawler-Action/issues/39 + urltarget.url = str(response.real_url) + if has_html(response) and is_onsite(urltarget): resp.html = await response.text() diff --git a/test/mock_server/index.html b/test/mock_server/index.html index 1dc59c3..77ca184 100644 --- a/test/mock_server/index.html +++ b/test/mock_server/index.html @@ -11,5 +11,9 @@

Index

To Page 1 To Page 4 To Subpage 1 + + To Subpage index + + To Subpage index / \ No newline at end of file diff --git a/test/mock_server/subpages/redirect/redirect.html b/test/mock_server/subpages/redirect/redirect.html new file mode 100644 index 0000000..247e184 --- /dev/null +++ b/test/mock_server/subpages/redirect/redirect.html @@ -0,0 +1,12 @@ + + + + + Redirect + + +

Redirect

+ To Page 3 + To Page 1 + + \ No newline at end of file diff --git a/test/mock_server/subpages/subsubpages/index.html b/test/mock_server/subpages/subsubpages/index.html new file mode 100644 index 0000000..5016ebd --- /dev/null +++ b/test/mock_server/subpages/subsubpages/index.html @@ -0,0 +1,13 @@ + + + + + Subpage index + + +

Subpage index

+ To Page 2 + To redirected page via absolute link + To redirected page via relative link + + \ No newline at end of file diff --git a/test/test_integration.py b/test/test_integration.py index 80407f6..64f5aea 100644 --- a/test/test_integration.py +++ b/test/test_integration.py @@ -25,11 +25,11 @@ def do_HEAD(self): self.send_response(HTTPStatus.METHOD_NOT_ALLOWED) self.end_headers() return - if not self.check_error(): + if not self.check_error() and not self.check_redirect(): super().do_HEAD() def do_GET(self): - if not self.check_error(): + if not self.check_error() and not self.check_redirect(): super().do_GET() def check_error(self): @@ -39,6 +39,15 @@ def check_error(self): return True return False + def check_redirect(self): + if self.path.endswith('/subpages'): + self.send_response(HTTPStatus.MOVED_PERMANENTLY) + new_url = self.path.replace('/subpages', '/subpages/') + self.send_header('Location', new_url) + self.end_headers() + return True + return False + def get_free_port(): s = socket.socket(socket.AF_INET, type=socket.SOCK_STREAM) @@ -121,7 +130,11 @@ def test_messagesLogged(self): f'200 - {self.url}/subpages/subpage1.html - ', f'200 - {self.url}/page2.html - ', f'200 - {self.url}/subpages/subpage2.html - ', - f'200 - {self.url}/index.html - ' + f'200 - {self.url}/index.html - ', + f'200 - {self.url}/subpages/redirect/redirect.html - ', + # duplicate because of fix for https://github.com/ScholliYT/Broken-Links-Crawler-Action/issues/39 + f'200 - {self.url}/subpages/subsubpages/ - ', + f'200 - {self.url}/subpages/subsubpages/ - ', ] actual_infos: List[str] = [] for call in info_mock.call_args_list: