From 47247f725005c5c94c036a43943270044a87eef8 Mon Sep 17 00:00:00 2001 From: Ritiek Malhotra Date: Wed, 8 Apr 2020 21:43:58 +0530 Subject: [PATCH] Add additional methods to fetch lyrics The following inputs can now be used to fetch lyrics: * artist and track names * search query * direct url --- spotdl/command_line/helper.py | 15 ++- spotdl/lyrics/lyric_base.py | 27 +++-- spotdl/lyrics/providers/genius.py | 84 +++++++++---- spotdl/lyrics/providers/lyricwikia_wrapper.py | 17 +-- spotdl/lyrics/providers/tests/test_genius.py | 113 +++++++++++++++--- .../tests/test_lyricwikia_wrapper.py | 11 +- spotdl/lyrics/tests/test_lyric_base.py | 19 ++- 7 files changed, 210 insertions(+), 76 deletions(-) diff --git a/spotdl/command_line/helper.py b/spotdl/command_line/helper.py index 01b0a1b..3d4eb4f 100644 --- a/spotdl/command_line/helper.py +++ b/spotdl/command_line/helper.py @@ -1,6 +1,8 @@ from spotdl.metadata.providers import ProviderSpotify from spotdl.metadata.providers import ProviderYouTube from spotdl.metadata.embedders import EmbedderDefault +from spotdl.lyrics.providers import LyricWikia +from spotdl.lyrics.providers import Genius from spotdl.track import Track @@ -10,7 +12,7 @@ import urllib.request import threading -def search_metadata(track): +def search_metadata(track, lyrics=True): youtube = ProviderYouTube() if spotdl.util.is_spotify(track): spotify = ProviderSpotify() @@ -34,8 +36,7 @@ def search_metadata(track): return metadata -def download_track(metadata, - dry_run=False, overwrite="prompt", output_ext="mp3", file_format="{artist} - {track-name}", log_fmt="{artist} - {track_name}"): +def download_track(metadata, arguments): # TODO: CONFIG.YML # Exit here if config.dry_run @@ -52,10 +53,16 @@ def download_track(metadata, track = Track(metadata, cache_albumart=True) track.download_while_re_encoding("test.mp3") + + # TODO: CONFIG.YML + # Skip metadata if config.no_metadata + track.apply_metadata("test.mp3") -def download_tracks_from_file(path): +def download_tracks_from_file(path, arguments): + # FIXME: Can we make this function cleaner? + # log.info( # "Checking and removing any duplicate tracks " # "in reading {}".format(path) diff --git a/spotdl/lyrics/lyric_base.py b/spotdl/lyrics/lyric_base.py index 6fbc4ab..fac97eb 100644 --- a/spotdl/lyrics/lyric_base.py +++ b/spotdl/lyrics/lyric_base.py @@ -12,16 +12,23 @@ class LyricBase(ABC): """ @abstractmethod - def __init__(self, artist, track): - """ - This method must set any protected attributes, - which may be modified from outside the class - if the need arises. - """ - pass - - @abstractmethod - def get_lyrics(self, linesep="\n", timeout=None): + def from_url(self, url, linesep="\n", timeout=None): + """ + This method must return the lyrics string for the + given track. + """ + pass + + @abstractmethod + def from_artist_and_track(self, artist, track, linesep="\n", timeout=None): + """ + This method must return the lyrics string for the + given track. + """ + pass + + @abstractmethod + def from_query(self, query, linesep="\n", timeout=None): """ This method must return the lyrics string for the given track. diff --git a/spotdl/lyrics/providers/genius.py b/spotdl/lyrics/providers/genius.py index 5c707d8..871f5c0 100644 --- a/spotdl/lyrics/providers/genius.py +++ b/spotdl/lyrics/providers/genius.py @@ -1,34 +1,38 @@ from bs4 import BeautifulSoup import urllib.request +import json from spotdl.lyrics.lyric_base import LyricBase from spotdl.lyrics.exceptions import LyricsNotFoundError BASE_URL = "https://genius.com" +BASE_SEARCH_URL = BASE_URL + "/api/search/multi?per_page=1&q=" +# FIXME: Make Genius a metadata provider instead of lyric provider +# Since, Genius parses additional metadata too (such as track +# name, artist name, albumart url). For example, fetch this URL: +# https://genius.com/api/search/multi?per_page=1&q=artist+trackname class Genius(LyricBase): - def __init__(self, artist, track): - self.artist = artist - self.track = track + def __init__(self): self.base_url = BASE_URL + self.base_search_url = BASE_SEARCH_URL - def _guess_lyric_url(self): + def guess_lyric_url_from_artist_and_track(self, artist, track): """ - Returns the possible lyric URL for the track available - on Genius. This may not always be a valid URL, but this - is apparently the best we can do at the moment? + Returns the possible lyric URL for the track available on + Genius. This may not always be a valid URL. """ - query = "/{} {} lyrics".format(self.artist, self.track) + query = "/{} {} lyrics".format(artist, track) query = query.replace(" ", "-") encoded_query = urllib.request.quote(query) lyric_url = self.base_url + encoded_query return lyric_url - def _fetch_page(self, url, timeout=None): + def _fetch_url_page(self, url, timeout=None): """ - Makes a GET request to the given URL and returns the - HTML content in the case of a valid response. + Makes a GET request to the given lyrics page URL and returns + the HTML content in the case of a valid response. """ request = urllib.request.Request(url) request.add_header("User-Agent", "urllib") @@ -36,17 +40,14 @@ class Genius(LyricBase): response = urllib.request.urlopen(request, timeout=timeout) except urllib.request.HTTPError: raise LyricsNotFoundError( - "Could not find lyrics for {} - {} at URL: {}".format( - self.artist, self.track, url - ) + "Could not find lyrics at URL: {}".format(url) ) else: return response.read() def _get_lyrics_text(self, html): """ - Extracts and returns the lyric content from the - provided HTML. + Extracts and returns the lyric content from the provided HTML. """ soup = BeautifulSoup(html, "html.parser") lyrics_paragraph = soup.find("p") @@ -57,11 +58,52 @@ class Genius(LyricBase): "The lyrics for this track are yet to be released." ) - def get_lyrics(self, linesep="\n", timeout=None): + def _fetch_search_page(self, url, timeout=None): """ - Returns the lyric string for the given artist and track. + Returns search results from a given URL in JSON. """ - url = self._guess_lyric_url() - html_page = self._fetch_page(url, timeout=timeout) - lyrics = self._get_lyrics_text(html_page) + request = urllib.request.Request(url) + request.add_header("User-Agent", "urllib") + response = urllib.request.urlopen(request, timeout=timeout) + metadata = json.loads(response.read()) + if len(metadata["response"]["sections"][0]["hits"]) == 0: + raise LyricsNotFoundError( + "Could not find any search results for URL: {}".format(url) + ) + return metadata + + def best_matching_lyric_url_from_query(self, query): + """ + Returns the best matching track's URL from a given query. + """ + encoded_query = query.replace(" ", "+") + search_url = self.base_search_url + encoded_query + metadata = self._fetch_search_page(search_url) + lyric_url = metadata["response"]["sections"][0]["hits"][0]["result"]["path"] + return self.base_url + lyric_url + + def from_query(self, query, linesep="\n", timeout=None): + """ + Returns the lyric string for the track best matching the + given query. + """ + lyric_url = self.best_matching_lyric_url_from_query(query) + return self.from_url(lyric_url, linesep, timeout=timeout) + + def from_artist_and_track(self, artist, track, linesep="\n", timeout=None): + """ + Returns the lyric string for the given artist and track + by making scraping search results and fetching the first + result. + """ + lyric_url = self.guess_lyric_url_from_artist_and_track(artist, track) + return self.from_url(lyric_url, linesep, timeout) + + def from_url(self, url, linesep="\n", timeout=None): + """ + Returns the lyric string for the given URL. + """ + lyric_html_page = self._fetch_url_page(url, timeout=timeout) + lyrics = self._get_lyrics_text(lyric_html_page) return lyrics.replace("\n", linesep) + diff --git a/spotdl/lyrics/providers/lyricwikia_wrapper.py b/spotdl/lyrics/providers/lyricwikia_wrapper.py index e5a26ac..4db3678 100644 --- a/spotdl/lyrics/providers/lyricwikia_wrapper.py +++ b/spotdl/lyrics/providers/lyricwikia_wrapper.py @@ -5,17 +5,20 @@ from spotdl.lyrics.exceptions import LyricsNotFoundError class LyricWikia(LyricBase): - def __init__(self, artist, track): - self.artist = artist - self.track = track + def from_query(self, query, linesep="\n", timeout=None): + raise NotImplementedError - def get_lyrics(self, linesep="\n", timeout=None): + def from_artist_and_track(self, artist, track, linesep="\n", timeout=None): """ Returns the lyric string for the given artist and track. """ try: - lyrics = lyricwikia.get_lyrics(self.artist, self.track, linesep, timeout) + lyrics = lyricwikia.get_lyrics(artist, track, linesep, timeout) except lyricwikia.LyricsNotFound as e: raise LyricsNotFoundError(e.args[0]) - else: - return lyrics + + return lyrics + + def from_url(self, url, linesep="\n", timeout=None): + raise NotImplementedError + diff --git a/spotdl/lyrics/providers/tests/test_genius.py b/spotdl/lyrics/providers/tests/test_genius.py index 44991e5..c9ac518 100644 --- a/spotdl/lyrics/providers/tests/test_genius.py +++ b/spotdl/lyrics/providers/tests/test_genius.py @@ -3,35 +3,114 @@ from spotdl.lyrics import exceptions from spotdl.lyrics.providers import Genius import urllib.request +import json import pytest - class TestGenius: def test_subclass(self): assert issubclass(Genius, LyricBase) + @pytest.fixture(scope="module") + def expect_lyrics_count(self): + # This is the number of characters in lyrics found + # for the track in `lyric_url` fixture below + return 1845 + + @pytest.fixture(scope="module") + def genius(self): + return Genius() + + def test_base_url(self, genius): + assert genius.base_url == "https://genius.com" + + @pytest.fixture(scope="module") + def artist(self): + return "selena gomez" + @pytest.fixture(scope="module") def track(self): - return Genius("artist", "track") + return "wolves" - def test_base_url(self, track): - assert track.base_url == "https://genius.com" + @pytest.fixture(scope="module") + def query(self, artist, track): + return "{} {}".format(artist, track) - def test_get_lyrics(self, track, monkeypatch): - def mocked_urlopen(url, timeout=None): - class DummyHTTPResponse: - def read(self): - return "

amazing lyrics!

" + @pytest.fixture(scope="module") + def guess_url(self, query): + return "https://genius.com/selena-gomez-wolves-lyrics" - return DummyHTTPResponse() + @pytest.fixture(scope="module") + def lyric_url(self): + return "https://genius.com/Selena-gomez-and-marshmello-wolves-lyrics" - monkeypatch.setattr("urllib.request.urlopen", mocked_urlopen) - assert track.get_lyrics() == "amazing lyrics!" + def test_guess_lyric_url_from_artist_and_track(self, genius, artist, track, guess_url): + url = genius.guess_lyric_url_from_artist_and_track(artist, track) + assert url == guess_url - def test_lyrics_not_found_error(self, track, monkeypatch): - def mocked_urlopen(url, timeout=None): + class MockHTTPResponse: + expect_lyrics = "" + + def __init__(self, request, timeout=None): + search_results_url = "https://genius.com/api/search/multi?per_page=1&q=selena+gomez+wolves" + if request._full_url == search_results_url: + read_method = lambda: json.dumps({ + "response": {"sections": [{"hits": [{"result": { + "path": "/Selena-gomez-and-marshmello-wolves-lyrics" + } }] }] } + }) + else: + read_method = lambda: "

" + self.expect_lyrics + "

" + + self.read = read_method + + @pytest.mark.network + def test_best_matching_lyric_url_from_query(self, genius, query, lyric_url): + url = genius.best_matching_lyric_url_from_query(query) + assert url == lyric_url + + def test_mock_best_matching_lyric_url_from_query(self, genius, query, lyric_url, monkeypatch): + monkeypatch.setattr("urllib.request.urlopen", self.MockHTTPResponse) + self.test_best_matching_lyric_url_from_query(genius, query, lyric_url) + + @pytest.mark.network + def test_from_url(self, genius, lyric_url, expect_lyrics_count): + lyrics = genius.from_url(lyric_url) + assert len(lyrics) == expect_lyrics_count + + def test_mock_from_url(self, genius, lyric_url, expect_lyrics_count, monkeypatch): + self.MockHTTPResponse.expect_lyrics = "a" * expect_lyrics_count + monkeypatch.setattr("urllib.request.urlopen", self.MockHTTPResponse) + self.test_from_url(genius, lyric_url, expect_lyrics_count) + + @pytest.mark.network + def test_from_artist_and_track(self, genius, artist, track, expect_lyrics_count): + lyrics = genius.from_artist_and_track(artist, track) + assert len(lyrics) == expect_lyrics_count + + def test_mock_from_artist_and_track(self, genius, artist, track, expect_lyrics_count, monkeypatch): + self.MockHTTPResponse.expect_lyrics = "a" * expect_lyrics_count + monkeypatch.setattr("urllib.request.urlopen", self.MockHTTPResponse) + self.test_from_artist_and_track(genius, artist, track, expect_lyrics_count) + + @pytest.mark.network + def test_from_query(self, genius, query, expect_lyrics_count): + lyrics = genius.from_query(query) + assert len(lyrics) == expect_lyrics_count + + def test_mock_from_query(self, genius, query, expect_lyrics_count, monkeypatch): + self.MockHTTPResponse.expect_lyrics = "a" * expect_lyrics_count + monkeypatch.setattr("urllib.request.urlopen", self.MockHTTPResponse) + self.test_from_query(genius, query, expect_lyrics_count) + + @pytest.mark.network + def test_lyrics_not_found_error(self, genius): + with pytest.raises(exceptions.LyricsNotFoundError): + genius.from_artist_and_track(self, "nonexistent_artist", "nonexistent_track") + + def test_mock_lyrics_not_found_error(self, genius, monkeypatch): + def mock_urlopen(url, timeout=None): raise urllib.request.HTTPError("", "", "", "", "") - monkeypatch.setattr("urllib.request.urlopen", mocked_urlopen) - with pytest.raises(exceptions.LyricsNotFoundError): - track.get_lyrics() + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) + self.test_lyrics_not_found_error(genius) + diff --git a/spotdl/lyrics/providers/tests/test_lyricwikia_wrapper.py b/spotdl/lyrics/providers/tests/test_lyricwikia_wrapper.py index 7cab7d5..6c83b2c 100644 --- a/spotdl/lyrics/providers/tests/test_lyricwikia_wrapper.py +++ b/spotdl/lyrics/providers/tests/test_lyricwikia_wrapper.py @@ -11,15 +11,16 @@ class TestLyricWikia: def test_subclass(self): assert issubclass(LyricWikia, LyricBase) - def test_get_lyrics(self, monkeypatch): + def test_from_artist_and_track(self, monkeypatch): # `LyricWikia` class uses the 3rd party method `lyricwikia.get_lyrics` # internally and there is no need to test a 3rd party library as they # have their own implementation of tests. monkeypatch.setattr( "lyricwikia.get_lyrics", lambda a, b, c, d: "awesome lyrics!" ) - track = LyricWikia("Lyricwikia", "Lyricwikia") - assert track.get_lyrics() == "awesome lyrics!" + artist, track = "selena gomez", "wolves" + lyrics = LyricWikia().from_artist_and_track(artist, track) + assert lyrics == "awesome lyrics!" def test_lyrics_not_found_error(self, monkeypatch): def lyricwikia_lyrics_not_found(msg): @@ -30,6 +31,6 @@ class TestLyricWikia: "lyricwikia.get_lyrics", lambda a, b, c, d: lyricwikia_lyrics_not_found("Nope, no lyrics."), ) - track = LyricWikia("Lyricwikia", "Lyricwikia") + artist, track = "nonexistent_artist", "nonexistent_track" with pytest.raises(exceptions.LyricsNotFoundError): - track.get_lyrics() + LyricWikia().from_artist_and_track(artist, track) diff --git a/spotdl/lyrics/tests/test_lyric_base.py b/spotdl/lyrics/tests/test_lyric_base.py index cfa9e86..a2c78b1 100644 --- a/spotdl/lyrics/tests/test_lyric_base.py +++ b/spotdl/lyrics/tests/test_lyric_base.py @@ -5,25 +5,20 @@ import pytest class TestAbstractBaseClass: def test_error_abstract_base_class_lyricbase(self): - artist = "awesome artist" - track = "amazing track" - with pytest.raises(TypeError): # This abstract base class must be inherited from # for instantiation - LyricBase(artist, track) - + LyricBase() def test_inherit_abstract_base_class_encoderbase(self): class LyricKid(LyricBase): - def __init__(self, artist, track): - super().__init__(artist, track) + def from_query(self, query): + raise NotImplementedError - def get_lyrics(self): + def from_artist_and_track(self, artist, track): pass + def from_url(self, url): + raise NotImplementedError - artist = "awesome artist" - track = "amazing track" - - LyricKid(artist, track) + LyricKid()