Files
spotify-downloader/spotdl/lyrics/providers/genius.py
Ritiek Malhotra f24c026ae4 Improve Genius lyrics stability
Sometimes the first result in the Genius API response may not contain
the path to lyrics. In such cases, keep iterating on the next
result until the path is found.
2020-04-21 18:56:46 +05:30

125 lines
4.4 KiB
Python

from bs4 import BeautifulSoup
import urllib.request
import json
from spotdl.lyrics.lyric_base import LyricBase
from spotdl.lyrics.exceptions import LyricsNotFoundError
BASE_URL = "https://genius.com"
BASE_SEARCH_URL = BASE_URL + "/api/search/multi?per_page=1&q="
# FIXME: Make Genius a metadata provider instead of lyric provider
# Since, Genius parses additional metadata too (such as track
# name, artist name, albumart url). For example, fetch this URL:
# https://genius.com/api/search/multi?per_page=1&q=artist+trackname
class Genius(LyricBase):
def __init__(self):
self.base_url = BASE_URL
self.base_search_url = BASE_SEARCH_URL
def guess_lyric_url_from_artist_and_track(self, artist, track):
"""
Returns the possible lyric URL for the track available on
Genius. This may not always be a valid URL.
"""
query = "/{} {} lyrics".format(artist, track)
query = query.replace(" ", "-")
encoded_query = urllib.request.quote(query)
lyric_url = self.base_url + encoded_query
return lyric_url
def _fetch_url_page(self, url, timeout=None):
"""
Makes a GET request to the given lyrics page URL and returns
the HTML content in the case of a valid response.
"""
request = urllib.request.Request(url)
request.add_header("User-Agent", "urllib")
try:
response = urllib.request.urlopen(request, timeout=timeout)
except urllib.request.HTTPError:
raise LyricsNotFoundError(
"Could not find lyrics at URL: {}".format(url)
)
else:
return response.read()
def _get_lyrics_text(self, html):
"""
Extracts and returns the lyric content from the provided HTML.
"""
soup = BeautifulSoup(html, "html.parser")
lyrics_paragraph = soup.find("p")
if lyrics_paragraph:
return lyrics_paragraph.get_text()
else:
raise LyricsNotFoundError(
"The lyrics for this track are yet to be released."
)
def _fetch_search_page(self, url, timeout=None):
"""
Returns search results from a given URL in JSON.
"""
request = urllib.request.Request(url)
request.add_header("User-Agent", "urllib")
response = urllib.request.urlopen(request, timeout=timeout)
metadata = json.loads(response.read())
if len(metadata["response"]["sections"][0]["hits"]) == 0:
raise LyricsNotFoundError(
"Could not find any search results for URL: {}".format(url)
)
return metadata
def best_matching_lyric_url_from_query(self, query):
"""
Returns the best matching track's URL from a given query.
"""
encoded_query = urllib.request.quote(query.replace(" ", "+"))
search_url = self.base_search_url + encoded_query
metadata = self._fetch_search_page(search_url)
lyric_url = None
for section in metadata["response"]["sections"]:
result = section["hits"][0]["result"]
try:
lyric_url = result["path"]
break
except KeyError:
pass
if lyric_url is None:
raise LyricsNotFoundError(
"Could not find any valid lyric paths in the "
"API response for the query {}".format(query)
)
return self.base_url + lyric_url
def from_query(self, query, linesep="\n", timeout=None):
"""
Returns the lyric string for the track best matching the
given query.
"""
lyric_url = self.best_matching_lyric_url_from_query(query)
return self.from_url(lyric_url, linesep, timeout=timeout)
def from_artist_and_track(self, artist, track, linesep="\n", timeout=None):
"""
Returns the lyric string for the given artist and track
by making scraping search results and fetching the first
result.
"""
lyric_url = self.guess_lyric_url_from_artist_and_track(artist, track)
return self.from_url(lyric_url, linesep, timeout)
def from_url(self, url, linesep="\n", timeout=None):
"""
Returns the lyric string for the given URL.
"""
lyric_html_page = self._fetch_url_page(url, timeout=timeout)
lyrics = self._get_lyrics_text(lyric_html_page)
return lyrics.replace("\n", linesep)