Skip to content

Commit

Permalink
bookmarkmgr: Fix URLs with outdated subdomains
Browse files Browse the repository at this point in the history
  • Loading branch information
lubo committed Jan 15, 2025
1 parent 0de1513 commit 8cdb183
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 5 deletions.
52 changes: 48 additions & 4 deletions bookmarkmgr/bookmarkmgr/checks/link_status.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
from collections.abc import Awaitable, Callable
from collections.abc import Awaitable
from enum import IntEnum, unique
from functools import partial
from http import HTTPStatus
import itertools
import re
from typing import Any, Protocol
from urllib.parse import ParseResult, quote, urlparse

from tld import get_fld

from bookmarkmgr.cronet import RequestError, Response
from bookmarkmgr.scraper import Page

Expand All @@ -20,6 +24,12 @@
HTTPStatus.GONE.value,
}

# Lenient wrapper around ``tld.get_fld`` with both options pre-bound, so
# call sites only need to pass the host (or URL) string.
# NOTE(review): per the tld documentation, ``fail_silently=True`` should
# make the lookup return ``None`` instead of raising, and
# ``fix_protocol=True`` should allow inputs without a scheme (bare
# hostnames) -- confirm against the pinned tld version.
_get_fld_lax = partial(
get_fld,
fail_silently=True,
fix_protocol=True,
)


@unique
class LinkStatus(IntEnum):
Expand Down Expand Up @@ -78,21 +88,54 @@ async def check_link_status(
return link_status, error


def _fix_url_quoting(url: ParseResult) -> ParseResult:
def _fix_url_quoting(
    url: ParseResult,
    **_: Any,
) -> ParseResult:
    """Return *url* with its path percent-encoded via ``quote``.

    Only the path component is touched; every other component is carried
    over unchanged. Extra keyword arguments are accepted and ignored so
    that this fixer can be called the same way as the others (for
    example with ``redirect_url=...``).
    """
    quoted_path = quote(url.path)
    return url._replace(path=quoted_path)


def _fix_url_trailing_slash(url: ParseResult) -> ParseResult:
def _fix_url_subdomain(
    url: ParseResult,
    redirect_url: ParseResult,
) -> ParseResult:
    """Adopt the redirect target's netloc when only the subdomain changed.

    Args:
        url: The parsed URL being fixed.
        redirect_url: The parsed URL the original request redirected to.

    Returns:
        ``url`` with its ``netloc`` replaced by ``redirect_url.netloc``
        when both hosts differ but resolve to the same first-level
        domain; otherwise ``url`` unchanged.
    """
    url_host = url.hostname
    redirect_host = redirect_url.hostname

    if url_host is None or redirect_host is None or url_host == redirect_host:
        return url

    url_fld = _get_fld_lax(url_host)

    # The lax lookup is configured to fail silently, so it can yield None
    # for hosts without a registrable domain (IP addresses, "localhost",
    # intranet names). Two such unrelated hosts must not be treated as
    # the same site just because both lookups came back empty.
    if url_fld is None or url_fld != _get_fld_lax(redirect_host):
        return url

    return url._replace(netloc=redirect_url.netloc)


def _fix_url_trailing_slash(
    url: ParseResult,
    **_: Any,
) -> ParseResult:
    """Toggle the trailing slash on the path of *url*.

    A path that ends with ``/`` has all of its trailing slashes
    stripped; any other path (including an empty one) gets a single
    ``/`` appended. Extra keyword arguments are accepted and ignored so
    that this fixer can be called the same way as the others.
    """
    current_path = url.path

    if current_path.endswith("/"):
        toggled_path = current_path.rstrip("/")
    else:
        toggled_path = f"{current_path}/"

    return url._replace(path=toggled_path)


_URL_FIXERS: list[Callable[[ParseResult], ParseResult]] = [
class _FixerCallable(Protocol):
    """Structural type of a URL fixer.

    A fixer takes the parsed URL being repaired, plus the parsed
    redirect target as the keyword-only ``redirect_url`` argument, and
    returns a parsed URL.
    """

    def __call__(
        self,
        url: ParseResult,
        *,
        redirect_url: ParseResult,
    ) -> ParseResult:
        ...


# Registry of the URL fixers available to the fixing logic. Every entry
# matches ``_FixerCallable``: it is called with the parsed URL plus a
# ``redirect_url`` keyword argument and returns a parsed URL.
# NOTE(review): the consumer applies these in "fixer combinations" (only
# partly visible in this excerpt), so the order here may influence which
# fix is found first -- confirm before reordering.
_URL_FIXERS: list[_FixerCallable] = [
_fix_url_quoting,
_fix_url_trailing_slash,
_fix_url_subdomain,
]


Expand All @@ -113,6 +156,7 @@ def get_fixed_url(response: Response, url: str) -> None | str:
for fixer in fixer_combination:
fixed_parsed_url = fixer(
fixed_parsed_url,
redirect_url=parsed_redirect_url,
)

if fixed_parsed_url == parsed_redirect_url:
Expand Down
13 changes: 12 additions & 1 deletion bookmarkmgr/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions bookmarkmgr/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ cffi = "^1.17.1"
enlighten = "^1.13.0"
overrides = "^7.7.0"
python = "^3.12"
tld = "^0.13"
yarl = "^1.18.3"

[tool.poetry.group.dev.dependencies]
Expand Down

0 comments on commit 8cdb183

Please sign in to comment.