From 7c8eaec1e12bf52547bc0276de8ef7550afa5663 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= <snejus@protonmail.com>
Date: Tue, 27 Aug 2024 18:29:39 +0100
Subject: [PATCH] Rewrite lyrics integration tests

---
 beetsplug/lyrics.py         |  34 +----
 test/plugins/test_lyrics.py | 275 ++++++++++--------------------------
 test/rsrc/lyricstext.yaml   |  55 --------
 3 files changed, 78 insertions(+), 286 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 7e19136ba3..d92cea3de9 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -22,10 +22,10 @@
 import json
 import os.path
 import re
-import struct
 import unicodedata
 import urllib
 import warnings
+from html import unescape
 from typing import Any
 
 import requests
@@ -111,27 +111,6 @@
 # Utilities.
 
 
-def unichar(i):
-    try:
-        return chr(i)
-    except ValueError:
-        return struct.pack("i", i).decode("utf-32")
-
-
-def unescape(text):
-    """Resolve &#xxx; HTML entities (and some others)."""
-    if isinstance(text, bytes):
-        text = text.decode("utf-8", "ignore")
-    out = text.replace("&nbsp;", " ")
-
-    def replchar(m):
-        num = m.group(1)
-        return unichar(int(num))
-
-    out = re.sub("&#(\\d+);", replchar, out)
-    return out
-
-
 def extract_text_between(html, start_marker, end_marker):
     try:
         _, html = html.split(start_marker, 1)
@@ -659,6 +638,8 @@ def _scrape_strip_cruft(html, plain_text_out=False):
     html = BREAK_RE.sub("\n", html)  # <br> eats up surrounding '\n'.
     html = re.sub(r"(?s)<(script).*?</\1>", "", html)  # Strip script tags.
     html = re.sub("\u2005", " ", html)  # replace unicode with regular space
+    html = re.sub(r"(?s)<aside\b.*?</aside>", "", html)  # remove Google Ads
+    html = re.sub(r"</?(em|strong)\b[^>]*>", "", html)  # remove bold / italics
 
     if plain_text_out:  # Strip remaining HTML tags
         html = COMMENT_RE.sub("", html)
@@ -674,10 +655,12 @@ def _scrape_merge_paragraphs(html):
     return re.sub(r"<div .*>\s*</div>", "\n", html)
 
 
-def scrape_lyrics_from_html(html):
+def scrape_lyrics_from_html(html: str | None) -> str | None:
     """Scrape lyrics from a URL. If no lyrics can be found, return None
     instead.
     """
+    if not html:
+        return None
 
     def is_text_notcode(text):
         if not text:
@@ -819,10 +802,7 @@ def fetch(self, artist, title, album=None, length=None):
                     url_link, url_title, title, artist
                 ):
                     continue
-                html = self.fetch_url(url_link)
-                if not html:
-                    continue
-                lyrics = scrape_lyrics_from_html(html)
+                lyrics = scrape_lyrics_from_html(self.fetch_url(url_link))
                 if not lyrics:
                     continue
 
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index d05642b84c..5f9cba02fa 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -31,16 +31,27 @@
 from beets.util import bytestring_path
 from beetsplug import lyrics
 
+PHRASE_BY_TITLE = {
+    "Lady Madonna": "friday night arrives without a suitcase",
+    "Jazz'n'blues": "as i check my balance i kiss the screen",
+    "Beets song": "via plugins, beets becomes a panacea",
+}
+
 log = logging.getLogger("beets.test_lyrics")
 raw_backend = lyrics.Backend({}, log)
 google = lyrics.Google(MagicMock(), log)
 genius = lyrics.Genius(MagicMock(), log)
 tekstowo = lyrics.Tekstowo(MagicMock(), log)
-lrclib = lyrics.LRCLib(MagicMock(), log)
 
 _p = pytest.param
 
 
+skip_ci = pytest.mark.skipif(
+    os.environ.get("GITHUB_ACTIONS") == "true",
+    reason="GitHub actions is on some form of Cloudflare blacklist",
+)
+
+
 class LyricsPluginTest(unittest.TestCase):
     def setUp(self):
         """Set up configuration."""
@@ -213,26 +224,6 @@ def __call__(self, url, filename=None):
         return content
 
 
-class LyricsAssertions:
-    """A mixin with lyrics-specific assertions."""
-
-    def assertLyricsContentOk(self, title, text, msg=""):  # noqa: N802
-        """Compare lyrics text to expected lyrics for given title."""
-        if not text:
-            return
-
-        keywords = set(LYRICS_TEXTS[google.slugify(title)].split())
-        words = {x.strip(".?, ()") for x in text.lower().split()}
-
-        if not keywords <= words:
-            details = (
-                f"{keywords!r} is not a subset of {words!r}."
-                f" Words only in expected set {keywords - words!r},"
-                f" Words only in result set {words - keywords!r}."
-            )
-            self.fail(f"{details} : {msg}")
-
-
 LYRICS_ROOT_DIR = os.path.join(_common.RSRC, b"lyrics")
 yaml_path = os.path.join(_common.RSRC, b"lyricstext.yaml")
 LYRICS_TEXTS = confuse.load_yaml(yaml_path)
@@ -247,132 +238,71 @@ def setUp(self):
             self.skipTest("Beautiful Soup 4 not available")
 
 
-class LyricsPluginSourcesTest(LyricsGoogleBaseTest, LyricsAssertions):
-    """Check that beets google custom search engine sources are correctly
-    scraped.
-    """
-
-    DEFAULT_SONG = dict(artist="The Beatles", title="Lady Madonna")
-
-    DEFAULT_SOURCES = [
-        # dict(artist=u'Santana', title=u'Black magic woman',
-        #      backend=lyrics.MusiXmatch),
-        dict(
-            DEFAULT_SONG,
-            backend=lyrics.Genius,
-            # GitHub actions is on some form of Cloudflare blacklist.
-            skip=os.environ.get("GITHUB_ACTIONS") == "true",
-        ),
-        dict(artist="Boy In Space", title="u n eye", backend=lyrics.Tekstowo),
-    ]
-
-    GOOGLE_SOURCES = [
-        dict(
-            DEFAULT_SONG,
-            url="http://www.absolutelyrics.com",
-            path="/lyrics/view/the_beatles/lady_madonna",
-        ),
-        dict(
-            DEFAULT_SONG,
-            url="http://www.azlyrics.com",
-            path="/lyrics/beatles/ladymadonna.html",
-            # AZLyrics returns a 403 on GitHub actions.
-            skip=os.environ.get("GITHUB_ACTIONS") == "true",
-        ),
-        dict(
-            DEFAULT_SONG,
-            url="http://www.chartlyrics.com",
-            path="/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx",
-        ),
-        # dict(DEFAULT_SONG,
-        #      url=u'http://www.elyricsworld.com',
-        #      path=u'/lady_madonna_lyrics_beatles.html'),
-        dict(
-            url="http://www.lacoccinelle.net",
-            artist="Jacques Brel",
-            title="Amsterdam",
-            path="/paroles-officielles/275679.html",
-        ),
-        dict(
-            DEFAULT_SONG, url="http://letras.mus.br/", path="the-beatles/275/"
-        ),
-        dict(
-            DEFAULT_SONG,
-            url="http://www.lyricsmania.com/",
-            path="lady_madonna_lyrics_the_beatles.html",
-        ),
-        dict(
-            DEFAULT_SONG,
-            url="http://www.lyricsmode.com",
-            path="/lyrics/b/beatles/lady_madonna.html",
-        ),
-        dict(
-            url="http://www.lyricsontop.com",
-            artist="Amy Winehouse",
-            title="Jazz'n'blues",
-            path="/amy-winehouse-songs/jazz-n-blues-lyrics.html",
-        ),
-        # dict(DEFAULT_SONG,
-        #      url='http://www.metrolyrics.com/',
-        #      path='lady-madonna-lyrics-beatles.html'),
-        # dict(url='http://www.musica.com/', path='letras.asp?letra=2738',
-        #      artist=u'Santana', title=u'Black magic woman'),
-        dict(
-            url="http://www.paroles.net/",
-            artist="Lilly Wood & the prick",
-            title="Hey it's ok",
-            path="lilly-wood-the-prick/paroles-hey-it-s-ok",
-        ),
-        dict(
-            DEFAULT_SONG,
-            url="http://www.songlyrics.com",
-            path="/the-beatles/lady-madonna-lyrics",
-        ),
-        dict(
-            DEFAULT_SONG,
-            url="http://www.sweetslyrics.com",
-            path="/761696.The%20Beatles%20-%20Lady%20Madonna.html",
-        ),
-    ]
+@pytest.mark.skipif(
+    os.environ.get("INTEGRATION_TEST") != "1",
+    reason="integration testing not enabled",
+)
+class TestSources:
+    @pytest.mark.parametrize(
+        "title, url",
+        [
+            *(
+                ("Lady Madonna", url)
+                for url in (
+                    "http://www.chartlyrics.com/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx",  # noqa: E501
+                    "http://www.absolutelyrics.com/lyrics/view/the_beatles/lady_madonna",  # noqa: E501
+                    "https://letras.mus.br/the-beatles/275/",
+                    "https://www.lyricsmania.com/lady_madonna_lyrics_the_beatles.html",
+                    "https://www.lyricsmode.com/lyrics/b/beatles/lady_madonna.html",
+                    "https://www.paroles.net/the-beatles/paroles-lady-madonna",
+                    "https://www.songlyrics.com/the-beatles/lady-madonna-lyrics",
+                    "https://www.sweetslyrics.com/761696.The%20Beatles%20-%20Lady%20Madonna.html",  # noqa: E501
+                    "http://www.musica.com/letras.asp?letra=59862",
+                    "https://www.lacoccinelle.net/259956-the-beatles-lady-madonna.html",
+                )
+            ),
+            pytest.param(
+                "Lady Madonna",
+                "https://www.azlyrics.com/lyrics/beatles/ladymadonna.html",
+                marks=skip_ci,
+            ),
+            (
+                "Jazz'n'blues",
+                "https://www.lyricsontop.com/amy-winehouse-songs/jazz-n-blues-lyrics.html",  # noqa: E501
+            ),
+        ],
+    )
+    def test_google_source(self, title, url):
+        """Test if lyrics present on websites registered in beets google custom
+        search engine are correctly scraped.
+        """
+        response = raw_backend.fetch_url(url)  # may be empty/None
+        result = (lyrics.scrape_lyrics_from_html(response) or "").lower()
 
-    def setUp(self):
-        LyricsGoogleBaseTest.setUp(self)
-        self.plugin = lyrics.LyricsPlugin()
+        assert google.is_lyrics(result)
+        assert PHRASE_BY_TITLE[title] in result
 
-    @unittest.skipUnless(
-        os.environ.get("INTEGRATION_TEST", "0") == "1",
-        "integration testing not enabled",
+    @pytest.mark.parametrize(
+        "backend",
+        [
+            pytest.param(lyrics.Genius, marks=skip_ci),
+            lyrics.Tekstowo,
+            lyrics.LRCLib,
+            # lyrics.MusiXmatch,
+        ],
     )
-    def test_backend_sources_ok(self):
-        """Test default backends with songs known to exist in respective
+    def test_backend_source(self, backend):
+        """Test default backends with a song known to exist in respective
         databases.
         """
-        # Don't test any sources marked as skipped.
-        sources = [s for s in self.DEFAULT_SOURCES if not s.get("skip", False)]
-        for s in sources:
-            with self.subTest(s["backend"].__name__):
-                backend = s["backend"](self.plugin.config, self.plugin._log)
-                res = backend.fetch(s["artist"], s["title"])
-                self.assertLyricsContentOk(s["title"], res)
-
-    @unittest.skipUnless(
-        os.environ.get("INTEGRATION_TEST", "0") == "1",
-        "integration testing not enabled",
-    )
-    def test_google_sources_ok(self):
-        """Test if lyrics present on websites registered in beets google custom
-        search engine are correctly scraped.
-        """
-        # Don't test any sources marked as skipped.
-        sources = [s for s in self.GOOGLE_SOURCES if not s.get("skip", False)]
-        for s in sources:
-            url = s["url"] + s["path"]
-            res = lyrics.scrape_lyrics_from_html(raw_backend.fetch_url(url))
-            assert google.is_lyrics(res), url
-            self.assertLyricsContentOk(s["title"], res, url)
+        plugin = lyrics.LyricsPlugin()
+        backend = backend(plugin.config, plugin._log)
+        title = "Lady Madonna"
+        res = backend.fetch("The Beatles", title)
+        assert PHRASE_BY_TITLE[title] in (res or "").lower()  # may be None
 
 
-class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest, LyricsAssertions):
+class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest):
     """Test scraping heuristics on a fake html page."""
 
     source = dict(
@@ -393,7 +323,7 @@ def test_mocked_source_ok(self):
         url = self.source["url"] + self.source["path"]
         res = lyrics.scrape_lyrics_from_html(raw_backend.fetch_url(url))
         assert google.is_lyrics(res), url
-        self.assertLyricsContentOk(self.source["title"], res, url)
+        assert PHRASE_BY_TITLE[self.source["title"]] in res.lower()
 
     @patch.object(lyrics.Backend, "fetch_url", MockFetchUrl())
     def test_is_page_candidate_exact_match(self):
@@ -644,37 +574,6 @@ def test_no_results(self):
         assert tekstowo.parse_search_results(mock(url)) is None
 
 
-class TekstowoIntegrationTest(TekstowoBaseTest, LyricsAssertions):
-    """Tests Tekstowo lyric source with real requests"""
-
-    def setUp(self):
-        """Set up configuration"""
-        TekstowoBaseTest.setUp(self)
-        self.plugin = lyrics.LyricsPlugin()
-        tekstowo.config = self.plugin.config
-
-    @unittest.skipUnless(
-        os.environ.get("INTEGRATION_TEST", "0") == "1",
-        "integration testing not enabled",
-    )
-    def test_normal(self):
-        """Ensure we can fetch a song's lyrics in the ordinary case"""
-        lyrics = tekstowo.fetch("Boy in Space", "u n eye")
-        self.assertLyricsContentOk("u n eye", lyrics)
-
-    @unittest.skipUnless(
-        os.environ.get("INTEGRATION_TEST", "0") == "1",
-        "integration testing not enabled",
-    )
-    def test_no_matching_results(self):
-        """Ensure we fetch nothing if there are search results
-        returned but no matches"""
-        # https://github.com/beetbox/beets/issues/4406
-        # expected return value None
-        lyrics = tekstowo.fetch("Kelly Bailey", "Black Mesa Inbound")
-        assert lyrics is None
-
-
 # test LRCLib backend
 
 
@@ -771,38 +670,6 @@ def test_error(
         assert re.search(expected_log_match, last_log, re.I)
 
 
-class LRCLibIntegrationTest(LyricsAssertions):
-    def setUp(self):
-        self.plugin = lyrics.LyricsPlugin()
-        lrclib.config = self.plugin.config
-
-    @unittest.skipUnless(
-        os.environ.get("INTEGRATION_TEST", "0") == "1",
-        "integration testing not enabled",
-    )
-    def test_track_with_lyrics(self):
-        lyrics = lrclib.fetch("Boy in Space", "u n eye", "Live EP", 160)
-        self.assertLyricsContentOk("u n eye", lyrics)
-
-    @unittest.skipUnless(
-        os.environ.get("INTEGRATION_TEST", "0") == "1",
-        "integration testing not enabled",
-    )
-    def test_instrumental_track(self):
-        lyrics = lrclib.fetch(
-            "Kelly Bailey", "Black Mesa Inbound", "Half Life 2 Soundtrack", 134
-        )
-        assert lyrics is None
-
-    @unittest.skipUnless(
-        os.environ.get("INTEGRATION_TEST", "0") == "1",
-        "integration testing not enabled",
-    )
-    def test_nonexistent_track(self):
-        lyrics = lrclib.fetch("blah", "blah", "blah", 999)
-        assert lyrics is None
-
-
 # test utilities
 
 
diff --git a/test/rsrc/lyricstext.yaml b/test/rsrc/lyricstext.yaml
index 4cec7802a0..83354dc3ca 100644
--- a/test/rsrc/lyricstext.yaml
+++ b/test/rsrc/lyricstext.yaml
@@ -1,62 +1,7 @@
 # Song used by LyricsGooglePluginMachineryTest
 
-Beets_song: |
-    beets is the media library management system for obsessive music geeks the purpose of 
-    beets is to get your music collection right once and for all it catalogs your collection 
-    automatically improving its metadata as it goes it then provides a bouquet of tools for 
-    manipulating and accessing your music here's an example of beets' brainy tag corrector doing its 
-    because beets is designed as a library it can do almost anything you can imagine for your 
-    music collection via plugins beets becomes a panacea
-
 missing_texts: |
     Lyricsmania staff is working hard for you to add $TITLE lyrics as soon
     as they'll be released by $ARTIST, check back soon!
     In case you have the lyrics to $TITLE and want to send them to us, fill out
     the following form.
-
-# Songs lyrics used to test the different sources present in the google custom search engine.
-# Text is randomized for copyright infringement reason.
-
-Amsterdam: |
-    coup corps coeur invitent mains comme trop morue le hantent mais la dames joli revenir aux 
-    mangent croquer pleine plantent rire de sortent pleins fortune d'amsterdam bruit ruisselants 
-    large poissons braguette leur putains blanches jusque pissent dans soleils dansent et port
-    bien vertu nez sur chaleur femmes rotant dorment marins boivent bu les que d'un qui je 
-    une cou hambourg plus ils dents ou tournent or berges d'ailleurs tout ciel haubans ce son lueurs
-    en lune ont mouchent leurs long frottant jusqu'en vous regard montrent langueurs chantent
-    tordent pleure donnent drames mornes des panse pour un sent encore referment nappes au meurent
-    geste quand puis alors frites grosses batave expire naissent reboivent oriflammes grave riant a 
-    enfin rance fier y bouffer s'entendre se mieux
-
-Lady_Madonna: |
-    feed his money tuesday manage didn't head feet see arrives at in madonna rest morning children 
-    wonder how make thursday your to sunday music papers come tie you has was is listen suitcase 
-    ends friday run that needed breast they child baby mending on lady learned a nun like did wednesday 
-    bed think without afternoon night meet the playing lying
-
-Jazz_n_blues: |
-    all shoes money through follow blow til father to his hit jazz kiss now cool bar cause 50 night
-    heading i'll says yeah cash forgot blues out what for ways away fingers waiting got ever bold 
-    screen sixty throw wait on about last compton days o pick love wall had within jeans jd next 
-    miss standing from it's two long fight extravagant tell today more buy shopping that didn't 
-    what's but russian up can parkway balance my and gone am it as at in check if bags when cross 
-    machine take you drinks coke june wrong coming fancy's i n' impatient so the main's spend 
-    that's
-
-Hey_it_s_ok: |
-    and forget be when please it against fighting mama cause ! again what said
-    things papa hey to much lovers way wet was too do drink and i who forgive
-    hey fourteen please know not wanted had myself ok friends bed times looked
-    swear act found the my mean
-
-Black_magic_woman: |
-    blind heart sticks just don't into back alone see need yes your out devil make that to black got
-    you might me woman turning spell stop baby with 'round a on stone messin' magic i of 
-    tricks up leave turn bad so pick she's my can't
-
-u_n_eye: |
-    let see cool bed for sometimes are place told in yeah or ride open hide blame knee your my borders
-    perfect i of laying lies they love the night all out saying fast things said that on face hit hell
-    no low not bullets bullet fly time maybe over is roof a it know now airplane where and tonight
-    brakes just waste we go an to you was going eye start need insane cross gotta mood life with
-    hurts too whoa me fight little every oh would thousand but high lay space do down private