Add additional methods to fetch lyrics

The following inputs can now be used to fetch lyrics:
* artist and track names
* search query
* direct url
This commit is contained in:
Ritiek Malhotra
2020-04-08 21:43:58 +05:30
parent 51da0b7a29
commit 47247f7250
7 changed files with 210 additions and 76 deletions

View File

@@ -1,6 +1,8 @@
from spotdl.metadata.providers import ProviderSpotify
from spotdl.metadata.providers import ProviderYouTube
from spotdl.metadata.embedders import EmbedderDefault
from spotdl.lyrics.providers import LyricWikia
from spotdl.lyrics.providers import Genius
from spotdl.track import Track
@@ -10,7 +12,7 @@ import urllib.request
import threading
def search_metadata(track):
def search_metadata(track, lyrics=True):
youtube = ProviderYouTube()
if spotdl.util.is_spotify(track):
spotify = ProviderSpotify()
@@ -34,8 +36,7 @@ def search_metadata(track):
return metadata
def download_track(metadata,
dry_run=False, overwrite="prompt", output_ext="mp3", file_format="{artist} - {track-name}", log_fmt="{artist} - {track_name}"):
def download_track(metadata, arguments):
# TODO: CONFIG.YML
# Exit here if config.dry_run
@@ -52,10 +53,16 @@ def download_track(metadata,
track = Track(metadata, cache_albumart=True)
track.download_while_re_encoding("test.mp3")
# TODO: CONFIG.YML
# Skip metadata if config.no_metadata
track.apply_metadata("test.mp3")
def download_tracks_from_file(path):
def download_tracks_from_file(path, arguments):
# FIXME: Can we make this function cleaner?
# log.info(
# "Checking and removing any duplicate tracks "
# "in reading {}".format(path)

View File

@@ -12,16 +12,23 @@ class LyricBase(ABC):
"""
@abstractmethod
def __init__(self, artist, track):
"""
This method must set any protected attributes,
which may be modified from outside the class
if the need arises.
"""
pass
@abstractmethod
def get_lyrics(self, linesep="\n", timeout=None):
def from_url(self, url, linesep="\n", timeout=None):
"""
This method must return the lyrics string for the
given track.
"""
pass
@abstractmethod
def from_artist_and_track(self, artist, track, linesep="\n", timeout=None):
"""
This method must return the lyrics string for the
given track.
"""
pass
@abstractmethod
def from_query(self, query, linesep="\n", timeout=None):
"""
This method must return the lyrics string for the
given track.

View File

@@ -1,34 +1,38 @@
from bs4 import BeautifulSoup
import urllib.request
import json
from spotdl.lyrics.lyric_base import LyricBase
from spotdl.lyrics.exceptions import LyricsNotFoundError
BASE_URL = "https://genius.com"
BASE_SEARCH_URL = BASE_URL + "/api/search/multi?per_page=1&q="
# FIXME: Make Genius a metadata provider instead of a lyric provider,
# since Genius also exposes additional metadata (such as track
# name, artist name, album art URL). For example, fetch this URL:
# https://genius.com/api/search/multi?per_page=1&q=artist+trackname
class Genius(LyricBase):
def __init__(self, artist, track):
self.artist = artist
self.track = track
def __init__(self):
self.base_url = BASE_URL
self.base_search_url = BASE_SEARCH_URL
def _guess_lyric_url(self):
def guess_lyric_url_from_artist_and_track(self, artist, track):
"""
Returns the possible lyric URL for the track available
on Genius. This may not always be a valid URL, but this
is apparently the best we can do at the moment?
Returns the possible lyric URL for the track available on
Genius. This may not always be a valid URL.
"""
query = "/{} {} lyrics".format(self.artist, self.track)
query = "/{} {} lyrics".format(artist, track)
query = query.replace(" ", "-")
encoded_query = urllib.request.quote(query)
lyric_url = self.base_url + encoded_query
return lyric_url
def _fetch_page(self, url, timeout=None):
def _fetch_url_page(self, url, timeout=None):
"""
Makes a GET request to the given URL and returns the
HTML content in the case of a valid response.
Makes a GET request to the given lyrics page URL and returns
the HTML content in the case of a valid response.
"""
request = urllib.request.Request(url)
request.add_header("User-Agent", "urllib")
@@ -36,17 +40,14 @@ class Genius(LyricBase):
response = urllib.request.urlopen(request, timeout=timeout)
except urllib.request.HTTPError:
raise LyricsNotFoundError(
"Could not find lyrics for {} - {} at URL: {}".format(
self.artist, self.track, url
)
"Could not find lyrics at URL: {}".format(url)
)
else:
return response.read()
def _get_lyrics_text(self, html):
"""
Extracts and returns the lyric content from the
provided HTML.
Extracts and returns the lyric content from the provided HTML.
"""
soup = BeautifulSoup(html, "html.parser")
lyrics_paragraph = soup.find("p")
@@ -57,11 +58,52 @@ class Genius(LyricBase):
"The lyrics for this track are yet to be released."
)
def get_lyrics(self, linesep="\n", timeout=None):
def _fetch_search_page(self, url, timeout=None):
"""
Returns the lyric string for the given artist and track.
Returns search results from a given URL in JSON.
"""
url = self._guess_lyric_url()
html_page = self._fetch_page(url, timeout=timeout)
lyrics = self._get_lyrics_text(html_page)
request = urllib.request.Request(url)
request.add_header("User-Agent", "urllib")
response = urllib.request.urlopen(request, timeout=timeout)
metadata = json.loads(response.read())
if len(metadata["response"]["sections"][0]["hits"]) == 0:
raise LyricsNotFoundError(
"Could not find any search results for URL: {}".format(url)
)
return metadata
def best_matching_lyric_url_from_query(self, query, timeout=None):
    """
    Returns the best matching track's URL from a given query.

    The query is URL-encoded and appended to ``self.base_search_url``;
    the ``path`` of the first hit in the first result section is then
    joined onto ``self.base_url``.

    ``timeout`` is forwarded to the search request. Any exception
    raised by ``_fetch_search_page`` (such as ``LyricsNotFoundError``
    when the search returns no hits) propagates to the caller.
    """
    import urllib.parse

    # `quote_plus` turns spaces into "+" and also escapes characters
    # such as "&", "/" and "#" that would otherwise corrupt the query
    # string.
    encoded_query = urllib.parse.quote_plus(query)
    search_url = self.base_search_url + encoded_query
    metadata = self._fetch_search_page(search_url, timeout=timeout)
    first_hit = metadata["response"]["sections"][0]["hits"][0]
    return self.base_url + first_hit["result"]["path"]

def from_query(self, query, linesep="\n", timeout=None):
    """
    Returns the lyric string for the track best matching the
    given query.

    ``timeout`` applies to both the search request and the request
    for the lyric page itself.
    """
    lyric_url = self.best_matching_lyric_url_from_query(query, timeout=timeout)
    return self.from_url(lyric_url, linesep, timeout=timeout)
def from_artist_and_track(self, artist, track, linesep="\n", timeout=None):
    """
    Returns the lyric string for the given artist and track.

    The lyric page URL is guessed from the artist and track names
    via ``guess_lyric_url_from_artist_and_track``; no search request
    is made. The lyrics are then fetched with ``from_url``, so any
    exception it raises for the guessed URL propagates to the caller.
    """
    lyric_url = self.guess_lyric_url_from_artist_and_track(artist, track)
    return self.from_url(lyric_url, linesep, timeout=timeout)
def from_url(self, url, linesep="\n", timeout=None):
    """
    Fetches the lyric page at the given URL and returns the lyric
    string extracted from it, with every newline replaced by
    ``linesep``.
    """
    page = self._fetch_url_page(url, timeout=timeout)
    text = self._get_lyrics_text(page)
    return text.replace("\n", linesep)

View File

@@ -5,17 +5,20 @@ from spotdl.lyrics.exceptions import LyricsNotFoundError
class LyricWikia(LyricBase):
def __init__(self, artist, track):
self.artist = artist
self.track = track
def from_query(self, query, linesep="\n", timeout=None):
raise NotImplementedError
def get_lyrics(self, linesep="\n", timeout=None):
def from_artist_and_track(self, artist, track, linesep="\n", timeout=None):
"""
Returns the lyric string for the given artist and track.
"""
try:
lyrics = lyricwikia.get_lyrics(self.artist, self.track, linesep, timeout)
lyrics = lyricwikia.get_lyrics(artist, track, linesep, timeout)
except lyricwikia.LyricsNotFound as e:
raise LyricsNotFoundError(e.args[0])
else:
return lyrics
return lyrics
def from_url(self, url, linesep="\n", timeout=None):
raise NotImplementedError

View File

@@ -3,35 +3,114 @@ from spotdl.lyrics import exceptions
from spotdl.lyrics.providers import Genius
import urllib.request
import json
import pytest
class TestGenius:
def test_subclass(self):
assert issubclass(Genius, LyricBase)
@pytest.fixture(scope="module")
def expect_lyrics_count(self):
# This is the number of characters in lyrics found
# for the track in `lyric_url` fixture below
return 1845
@pytest.fixture(scope="module")
def genius(self):
return Genius()
def test_base_url(self, genius):
assert genius.base_url == "https://genius.com"
@pytest.fixture(scope="module")
def artist(self):
return "selena gomez"
@pytest.fixture(scope="module")
def track(self):
return Genius("artist", "track")
return "wolves"
def test_base_url(self, track):
assert track.base_url == "https://genius.com"
@pytest.fixture(scope="module")
def query(self, artist, track):
return "{} {}".format(artist, track)
def test_get_lyrics(self, track, monkeypatch):
def mocked_urlopen(url, timeout=None):
class DummyHTTPResponse:
def read(self):
return "<p>amazing lyrics!</p>"
@pytest.fixture(scope="module")
def guess_url(self, query):
return "https://genius.com/selena-gomez-wolves-lyrics"
return DummyHTTPResponse()
@pytest.fixture(scope="module")
def lyric_url(self):
return "https://genius.com/Selena-gomez-and-marshmello-wolves-lyrics"
monkeypatch.setattr("urllib.request.urlopen", mocked_urlopen)
assert track.get_lyrics() == "amazing lyrics!"
def test_guess_lyric_url_from_artist_and_track(self, genius, artist, track, guess_url):
url = genius.guess_lyric_url_from_artist_and_track(artist, track)
assert url == guess_url
def test_lyrics_not_found_error(self, track, monkeypatch):
def mocked_urlopen(url, timeout=None):
class MockHTTPResponse:
    """
    Stand-in for the response object returned by
    ``urllib.request.urlopen``; the tests monkeypatch this class in
    place of ``urlopen``, so it is constructed with the same
    ``(request, timeout)`` arguments.

    ``read()`` returns a canned JSON search result when the request
    targets the known search URL, and otherwise a minimal HTML page
    whose single paragraph holds ``expect_lyrics``.
    """

    # Lyrics served for any non-search URL. Tests overwrite this class
    # attribute before patching ``urlopen``; it is looked up when
    # ``read()`` is called, not when the response is constructed.
    expect_lyrics = ""

    def __init__(self, request, timeout=None):
        search_results_url = "https://genius.com/api/search/multi?per_page=1&q=selena+gomez+wolves"
        # Use the public `full_url` accessor rather than the private
        # `_full_url` attribute of `urllib.request.Request`.
        self._is_search_request = request.full_url == search_results_url

    def read(self):
        """Returns the canned response body as a string."""
        if self._is_search_request:
            return json.dumps({
                "response": {"sections": [{"hits": [{"result": {
                    "path": "/Selena-gomez-and-marshmello-wolves-lyrics"
                } }] }] }
            })
        return "<p>" + self.expect_lyrics + "</p>"
@pytest.mark.network
def test_best_matching_lyric_url_from_query(self, genius, query, lyric_url):
url = genius.best_matching_lyric_url_from_query(query)
assert url == lyric_url
def test_mock_best_matching_lyric_url_from_query(self, genius, query, lyric_url, monkeypatch):
monkeypatch.setattr("urllib.request.urlopen", self.MockHTTPResponse)
self.test_best_matching_lyric_url_from_query(genius, query, lyric_url)
@pytest.mark.network
def test_from_url(self, genius, lyric_url, expect_lyrics_count):
lyrics = genius.from_url(lyric_url)
assert len(lyrics) == expect_lyrics_count
def test_mock_from_url(self, genius, lyric_url, expect_lyrics_count, monkeypatch):
self.MockHTTPResponse.expect_lyrics = "a" * expect_lyrics_count
monkeypatch.setattr("urllib.request.urlopen", self.MockHTTPResponse)
self.test_from_url(genius, lyric_url, expect_lyrics_count)
@pytest.mark.network
def test_from_artist_and_track(self, genius, artist, track, expect_lyrics_count):
lyrics = genius.from_artist_and_track(artist, track)
assert len(lyrics) == expect_lyrics_count
def test_mock_from_artist_and_track(self, genius, artist, track, expect_lyrics_count, monkeypatch):
self.MockHTTPResponse.expect_lyrics = "a" * expect_lyrics_count
monkeypatch.setattr("urllib.request.urlopen", self.MockHTTPResponse)
self.test_from_artist_and_track(genius, artist, track, expect_lyrics_count)
@pytest.mark.network
def test_from_query(self, genius, query, expect_lyrics_count):
lyrics = genius.from_query(query)
assert len(lyrics) == expect_lyrics_count
def test_mock_from_query(self, genius, query, expect_lyrics_count, monkeypatch):
self.MockHTTPResponse.expect_lyrics = "a" * expect_lyrics_count
monkeypatch.setattr("urllib.request.urlopen", self.MockHTTPResponse)
self.test_from_query(genius, query, expect_lyrics_count)
@pytest.mark.network
def test_lyrics_not_found_error(self, genius):
    """
    Requesting lyrics for an artist/track pair that does not exist
    must raise ``LyricsNotFoundError``.
    """
    with pytest.raises(exceptions.LyricsNotFoundError):
        # `genius` is already a bound instance; passing `self` here
        # would shift every argument by one position (the test object
        # would become `artist`).
        genius.from_artist_and_track("nonexistent_artist", "nonexistent_track")
def test_mock_lyrics_not_found_error(self, genius, monkeypatch):
def mock_urlopen(url, timeout=None):
raise urllib.request.HTTPError("", "", "", "", "")
monkeypatch.setattr("urllib.request.urlopen", mocked_urlopen)
with pytest.raises(exceptions.LyricsNotFoundError):
track.get_lyrics()
monkeypatch.setattr("urllib.request.urlopen", mock_urlopen)
self.test_lyrics_not_found_error(genius)

View File

@@ -11,15 +11,16 @@ class TestLyricWikia:
def test_subclass(self):
assert issubclass(LyricWikia, LyricBase)
def test_get_lyrics(self, monkeypatch):
def test_from_artist_and_track(self, monkeypatch):
# `LyricWikia` class uses the 3rd party method `lyricwikia.get_lyrics`
# internally and there is no need to test a 3rd party library as they
# have their own implementation of tests.
monkeypatch.setattr(
"lyricwikia.get_lyrics", lambda a, b, c, d: "awesome lyrics!"
)
track = LyricWikia("Lyricwikia", "Lyricwikia")
assert track.get_lyrics() == "awesome lyrics!"
artist, track = "selena gomez", "wolves"
lyrics = LyricWikia().from_artist_and_track(artist, track)
assert lyrics == "awesome lyrics!"
def test_lyrics_not_found_error(self, monkeypatch):
def lyricwikia_lyrics_not_found(msg):
@@ -30,6 +31,6 @@ class TestLyricWikia:
"lyricwikia.get_lyrics",
lambda a, b, c, d: lyricwikia_lyrics_not_found("Nope, no lyrics."),
)
track = LyricWikia("Lyricwikia", "Lyricwikia")
artist, track = "nonexistent_artist", "nonexistent_track"
with pytest.raises(exceptions.LyricsNotFoundError):
track.get_lyrics()
LyricWikia().from_artist_and_track(artist, track)

View File

@@ -5,25 +5,20 @@ import pytest
class TestAbstractBaseClass:
def test_error_abstract_base_class_lyricbase(self):
artist = "awesome artist"
track = "amazing track"
with pytest.raises(TypeError):
# This abstract base class must be inherited from
# for instantiation
LyricBase(artist, track)
LyricBase()
def test_inherit_abstract_base_class_encoderbase(self):
class LyricKid(LyricBase):
def __init__(self, artist, track):
super().__init__(artist, track)
def from_query(self, query):
raise NotImplementedError
def get_lyrics(self):
def from_artist_and_track(self, artist, track):
pass
def from_url(self, url):
raise NotImplementedError
artist = "awesome artist"
track = "amazing track"
LyricKid(artist, track)
LyricKid()