From b2c4fadae0a8953ddde7ce36bcc71da116377cce Mon Sep 17 00:00:00 2001 From: Jake Howard Date: Thu, 25 Jul 2024 16:29:21 +0100 Subject: [PATCH 1/5] Use simpler error pages when HTML isn't needed This saves queries and processing time, if the HTML isn't ever going to be rendered, and simple text would be enough. --- tbx/core/tests/test_views.py | 25 ++++++++++++++++ tbx/core/utils/views.py | 56 ++++++++++++++++++++++++++++++++++-- tbx/urls.py | 12 ++------ 3 files changed, 81 insertions(+), 12 deletions(-) diff --git a/tbx/core/tests/test_views.py b/tbx/core/tests/test_views.py index 075e1e401..7ab258aa4 100644 --- a/tbx/core/tests/test_views.py +++ b/tbx/core/tests/test_views.py @@ -106,3 +106,28 @@ def test_setting_theme_on_one_site_sets_it_on_multiple_sites(self): # check theme is set on new site resp = self.client.get(f"http://{new_site.hostname}/") self.assertEqual(resp.context["MODE"], mode) + +class PageNotFoundTestCase(TestCase): + url = "/does-not-exist/" + + def test_accessible(self) -> None: + response = self.client.get(self.url) + self.assertEqual(response.status_code, 404) + self.assertIn("text/html", response.headers["content-type"]) + + def test_accept_html(self) -> None: + response = self.client.get(self.url, headers={"Accept": "text/html"}) + self.assertEqual(response.status_code, 404) + self.assertIn("text/html", response.headers["content-type"]) + + def test_simple_when_doesnt_accept_html(self) -> None: + response = self.client.get(self.url, headers={"Accept": "text/css"}) + self.assertEqual(response.status_code, 404) + self.assertIn("text/plain", response.headers["content-type"]) + + def test_simple_when_html_not_highest(self) -> None: + response = self.client.get( + self.url, headers={"Accept": "text/html;q=0.8,text/css"} + ) + self.assertEqual(response.status_code, 404) + self.assertIn("text/plain", response.headers["content-type"]) diff --git a/tbx/core/utils/views.py b/tbx/core/utils/views.py index cf53d4585..1612c7d0d 100644 --- a/tbx/core/utils/views.py +++ b/tbx/core/utils/views.py @@ -1,9 +1,59 @@ +from django.http import HttpResponseNotFound, HttpResponseServerError from django.views import defaults +from django.views.decorators.csrf import requires_csrf_token +from django.views.decorators.vary import vary_on_headers -def page_not_found(request, exception, template_name="patterns/pages/errors/404.html"): - return defaults.page_not_found(request, exception, template_name) +def get_quality(media_type): + return float(media_type.params.get("q", 1)) +def show_html_error_page(request): + accepted_types = sorted(request.accepted_types, key=get_quality, reverse=True) + + html_type = next( + ( + accepted_type + for accepted_type in accepted_types + if accepted_type.match("text/html") + ), + None, + ) + + # If HTML isn't accepted, don't serve it + if html_type is None: + return False + + max_quality = get_quality(accepted_types[0]) + + # If HTML isn't the highest quality, don't serve it + if get_quality(html_type) < max_quality: + return False + + return True + + +@vary_on_headers("Accept") +@requires_csrf_token +def page_not_found( + request, exception=None, template_name="patterns/pages/errors/404.html" +): + if show_html_error_page(request): + return defaults.page_not_found(request, exception, template_name) + + # Serve a simpler, cheaper 404 page if we don't need to + return HttpResponseNotFound( + "Page not found", content_type="text/plain; charset=utf-8" + ) + + +@vary_on_headers("Accept") +@requires_csrf_token def server_error(request, template_name="patterns/pages/errors/500.html"): - return defaults.server_error(request, template_name) + if show_html_error_page(request): + return defaults.server_error(request, template_name) + + # Serve a simpler, cheaper 500 page if we don't need to + return HttpResponseServerError( + "Server error", content_type="text/plain; charset=utf-8" + ) diff --git a/tbx/urls.py b/tbx/urls.py index 392327a49..e90bc4e0b 100644 --- a/tbx/urls.py +++ b/tbx/urls.py @@ -11,6 +11,7 @@ get_default_cache_control_method_decorator, ) from tbx.core.views import robots, switch_mode +from tbx.core.utils.views import page_not_found, server_error from wagtail import urls as wagtail_urls from wagtail.admin import urls as wagtailadmin_urls from wagtail.contrib.sitemaps.views import sitemap @@ -33,7 +34,6 @@ if settings.DEBUG: from django.conf.urls.static import static from django.contrib.staticfiles.urls import staticfiles_urlpatterns - from django.views.generic import TemplateView # Serve static and media files from development server urlpatterns += staticfiles_urlpatterns() @@ -42,14 +42,8 @@ # Add views for testing 404 and 500 templates urlpatterns += [ # Add views for testing 404 and 500 templates - path( - "test404/", - TemplateView.as_view(template_name="patterns/pages/errors/404.html"), - ), - path( - "test500/", - TemplateView.as_view(template_name="patterns/pages/errors/500.html"), - ), + path("test404/", page_not_found), + path("test500/", server_error), ] # Django Debug Toolbar From 8109908bc5c353ab40134cd6af974666c89c9286 Mon Sep 17 00:00:00 2001 From: Jake Howard Date: Thu, 25 Jul 2024 16:47:37 +0100 Subject: [PATCH 2/5] Cache 404s for 15 minutes This should reduce the impact on missing pages being crawled. --- tbx/core/utils/views.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tbx/core/utils/views.py b/tbx/core/utils/views.py index 1612c7d0d..f0be66de4 100644 --- a/tbx/core/utils/views.py +++ b/tbx/core/utils/views.py @@ -1,5 +1,6 @@ from django.http import HttpResponseNotFound, HttpResponseServerError from django.views import defaults +from django.views.decorators.cache import cache_control from django.views.decorators.csrf import requires_csrf_token from django.views.decorators.vary import vary_on_headers @@ -33,8 +34,9 @@ def show_html_error_page(request): return True -@vary_on_headers("Accept") @requires_csrf_token +@vary_on_headers("Accept") +@cache_control(max_age=900) # 15 minutes def page_not_found( request, exception=None, template_name="patterns/pages/errors/404.html" ): @@ -47,8 +49,8 @@ def page_not_found( ) -@vary_on_headers("Accept") @requires_csrf_token +@vary_on_headers("Accept") def server_error(request, template_name="patterns/pages/errors/500.html"): if show_html_error_page(request): return defaults.server_error(request, template_name) From 89ae35428ce789df67d6892ab968d79b97457056 Mon Sep 17 00:00:00 2001 From: Jake Howard Date: Thu, 25 Jul 2024 16:59:52 +0100 Subject: [PATCH 3/5] Test missing and wildcard Accept headers --- tbx/core/tests/test_views.py | 28 +++++++++++++++++++++++----- tbx/core/utils/views.py | 7 +++++++ 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/tbx/core/tests/test_views.py b/tbx/core/tests/test_views.py index 7ab258aa4..8df689d94 100644 --- a/tbx/core/tests/test_views.py +++ b/tbx/core/tests/test_views.py @@ -110,11 +110,6 @@ def test_setting_theme_on_one_site_sets_it_on_multiple_sites(self): class PageNotFoundTestCase(TestCase): url = "/does-not-exist/" - def test_accessible(self) -> None: - response = self.client.get(self.url) - self.assertEqual(response.status_code, 404) - self.assertIn("text/html", response.headers["content-type"]) - def test_accept_html(self) -> None: response = self.client.get(self.url, headers={"Accept": "text/html"}) self.assertEqual(response.status_code, 404) @@ -131,3 +126,26 @@ def test_simple_when_html_not_highest(self) -> None: ) self.assertEqual(response.status_code, 404) self.assertIn("text/plain", response.headers["content-type"]) + + def test_missing_accept_header(self) -> None: + response = self.client.get(self.url, headers={"Accept": ""}) + self.assertEqual(response.status_code, 404) + self.assertIn("text/plain", response.headers["content-type"]) + + def test_wildcard_accept_header(self) -> None: + response = self.client.get(self.url, headers={"Accept": "*/*"}) + self.assertEqual(response.status_code, 404) + self.assertIn("text/plain", response.headers["content-type"]) + + def test_browser_request(self) -> None: + """ + Test Accept header from Firefox 128 + """ + response = self.client.get( + self.url, + headers={ + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8" # noqa:E501 + }, + ) + self.assertEqual(response.status_code, 404) + self.assertIn("text/html", response.headers["content-type"]) diff --git a/tbx/core/utils/views.py b/tbx/core/utils/views.py index f0be66de4..82e2481bd 100644 --- a/tbx/core/utils/views.py +++ b/tbx/core/utils/views.py @@ -10,8 +10,15 @@ def get_quality(media_type): def show_html_error_page(request): + # If there is no `Accept` header, serve the simpler page + if not request.headers.get("Accept"): + return False + accepted_types = sorted(request.accepted_types, key=get_quality, reverse=True) + if len(accepted_types) == 1 and accepted_types[0].match("*/*"): + return False + html_type = next( ( accepted_type From 4cad762c462d3056f928b10ba679a5ff11adf6c9 Mon Sep 17 00:00:00 2001 From: Jake Howard Date: Thu, 25 Jul 2024 17:08:43 +0100 Subject: [PATCH 4/5] Flip comment --- tbx/core/utils/views.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tbx/core/utils/views.py b/tbx/core/utils/views.py index 82e2481bd..d86e3b958 100644 --- a/tbx/core/utils/views.py +++ b/tbx/core/utils/views.py @@ -50,7 +50,7 @@ def page_not_found( if show_html_error_page(request): return defaults.page_not_found(request, exception, template_name) - # Serve a simpler, cheaper 404 page if we don't need to + # Serve a simpler, cheaper 404 page if possible return HttpResponseNotFound( "Page not found", content_type="text/plain; charset=utf-8" ) @@ -62,7 +62,7 @@ def server_error(request, template_name="patterns/pages/errors/500.html"): if show_html_error_page(request): return defaults.server_error(request, template_name) - # Serve a simpler, cheaper 500 page if we don't need to + # Serve a simpler, cheaper 500 page if possible return HttpResponseServerError( "Server error", content_type="text/plain; charset=utf-8" ) From 489ae307437ab851b34cbf4cdfced2d11303feae Mon Sep 17 00:00:00 2001 From: Jake Howard Date: Mon, 16 Sep 2024 17:14:36 +0100 Subject: [PATCH 5/5] Allow Cloudflare to cache 404s --- cloudflare/workers.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cloudflare/workers.js b/cloudflare/workers.js index 740f3493e..1649c579d 100644 --- a/cloudflare/workers.js +++ b/cloudflare/workers.js @@ -74,9 +74,10 @@ const REPLACE_STRIPPED_QUERYSTRING_ON_REDIRECT_LOCATION = false; // Disabled by default, but highly recommended const STRIP_VALUELESS_QUERYSTRING_KEYS = false; -// Only these status codes should be considered cacheable +// Only these status codes should be considered cacheable. // (from https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4) -const CACHABLE_HTTP_STATUS_CODES = [200, 203, 206, 300, 301, 410]; +// 404 is added to reduce server load on spammed requests. +const CACHABLE_HTTP_STATUS_CODES = [200, 203, 206, 300, 301, 410, 404]; addEventListener('fetch', (event) => { event.respondWith(main(event));