From 7c8eaec1e12bf52547bc0276de8ef7550afa5663 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Tue, 27 Aug 2024 18:29:39 +0100 Subject: [PATCH] Rewrite lyrics integration tests --- beetsplug/lyrics.py | 34 +---- test/plugins/test_lyrics.py | 275 ++++++++++-------------------------- test/rsrc/lyricstext.yaml | 55 -------- 3 files changed, 78 insertions(+), 286 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 7e19136ba3..d92cea3de9 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -22,10 +22,10 @@ import json import os.path import re -import struct import unicodedata import urllib import warnings +from html import unescape from typing import Any import requests @@ -111,27 +111,6 @@ # Utilities. -def unichar(i): - try: - return chr(i) - except ValueError: - return struct.pack("i", i).decode("utf-32") - - -def unescape(text): - """Resolve &#xxx; HTML entities (and some others).""" - if isinstance(text, bytes): - text = text.decode("utf-8", "ignore") - out = text.replace(" ", " ") - - def replchar(m): - num = m.group(1) - return unichar(int(num)) - - out = re.sub("&#(\\d+);", replchar, out) - return out - - def extract_text_between(html, start_marker, end_marker): try: _, html = html.split(start_marker, 1) @@ -659,6 +638,8 @@ def _scrape_strip_cruft(html, plain_text_out=False): html = BREAK_RE.sub("\n", html) #
eats up surrounding '\n'. html = re.sub(r"(?s)<(script).*?", "", html) # Strip script tags. html = re.sub("\u2005", " ", html) # replace unicode with regular space + html = re.sub("