spotify-downloader/spotdl/lyrics/providers/genius.py

from bs4 import BeautifulSoup
import urllib.request

from spotdl.lyrics.lyric_base import LyricBase
from spotdl.lyrics.exceptions import LyricsNotFoundError

# Root URL of the Genius website; lyric page paths are appended to it.
BASE_URL = "https://genius.com"


class Genius(LyricBase):
    """
    Lyric provider that scrapes lyrics from Genius pages.

    The page URL is derived from the artist and track names rather
    than looked up, so it is not guaranteed to point at a real page.
    """

    def __init__(self, artist, track):
        self.artist = artist
        self.track = track
        self.base_url = BASE_URL

    def _guess_lyric_url(self):
        """
        Returns the possible lyric URL for the track available
        on Genius. This may not always be a valid URL, but this
        is apparently the best we can do at the moment?
        """
        # Build a "/<artist>-<track>-lyrics" path: spaces become hyphens
        # first, then the remaining unsafe characters are percent-encoded.
        query = "/{} {} lyrics".format(self.artist, self.track)
        query = query.replace(" ", "-")
        encoded_query = urllib.request.quote(query)
        return self.base_url + encoded_query

    def _fetch_page(self, url, timeout=None):
        """
        Makes a GET request to the given URL and returns the
        raw response body in the case of a valid response.

        `timeout` is forwarded unchanged to `urllib.request.urlopen`.

        Raises LyricsNotFoundError when the request fails with an
        HTTPError; the original HTTPError is attached as the cause.
        Any other exception raised by `urlopen` is not caught here.
        """
        request = urllib.request.Request(url)
        request.add_header("User-Agent", "urllib")
        try:
            response = urllib.request.urlopen(request, timeout=timeout)
        except urllib.request.HTTPError as error:
            raise LyricsNotFoundError(
                "Could not find lyrics for {} - {} at URL: {}".format(
                    self.artist, self.track, url
                )
            ) from error

        # Release the underlying connection as soon as the body is read
        # instead of leaving it open until garbage collection.
        try:
            return response.read()
        finally:
            # Tolerate response objects that expose only `read()`.
            close = getattr(response, "close", None)
            if close is not None:
                close()

    def _get_lyrics_text(self, html):
        """
        Extracts and returns the lyric content from the
        provided HTML.

        The text of the first <p> element in the document is used.
        Raises LyricsNotFoundError when the document has no <p>
        element (or the first one is empty of children).
        """
        soup = BeautifulSoup(html, "html.parser")
        lyrics_paragraph = soup.find("p")
        if lyrics_paragraph:
            return lyrics_paragraph.get_text()
        else:
            raise LyricsNotFoundError(
                "The lyrics for this track are yet to be released."
            )

    def get_lyrics(self, linesep="\n", timeout=None):
        """
        Returns the lyric string for the given artist and track.

        Every "\\n" in the extracted text is replaced with `linesep`;
        `timeout` is passed on to the page request.
        """
        url = self._guess_lyric_url()
        html_page = self._fetch_page(url, timeout=timeout)
        lyrics = self._get_lyrics_text(html_page)
        return lyrics.replace("\n", linesep)