Refactor encoding

2026-02-15 13:59:55 +00:00 · 2020-03-16 18:12:52 +05:30
parent 937ed6ebcc
commit 5adb3d0a4d
11 changed files with 239 additions and 165 deletions
--- a/spotdl/downloaders/youtube.py
+++ b/spotdl/downloaders/youtube.py
@@ -0,0 +1,412 @@
+from bs4 import BeautifulSoup
+import urllib
+import pafy
+
+from slugify import slugify
+from logzero import logger as log
+import os
+
+from spotdl import spotify_tools
+from spotdl import internals
+from spotdl import const
+
+# Fix download speed throttle on short duration tracks
+# Read more on mps-youtube/pafy#199
+pafy.g.opener.addheaders.append(("Range", "bytes=0-"))
+
+# Implement unreleased methods on Pafy object
+# More info: https://github.com/mps-youtube/pafy/pull/211
+#
+# The release number is compared numerically: comparing the raw version
+# strings is lexicographic, so e.g. "0.5.10" would sort before "0.5.5" and
+# a newer pafy would be patched by mistake.
+_pafy_release = tuple(
+    int(part) for part in pafy.__version__.split(".") if part.isdigit()
+)
+if _pafy_release <= (0, 5, 5):
+    from spotdl import patcher
+
+    pafy_patcher = patcher.PatchPafy()
+    pafy_patcher.patch_getbestthumb()
+    pafy_patcher.patch_process_streams()
+    pafy_patcher.patch_insecure_streams()
+
+
+def set_api_key():
+    """ Hand pafy the YouTube API key, preferring the one given by the user. """
+    # Please respect this YouTube token :)
+    bundled_key = "AIzaSyC6cEeKlxtOPybk9sEe5ksFN5sB-7wzYp0"
+    # A key passed through the arguments wins over the bundled one.
+    pafy.set_api_key(const.args.youtube_api_key or bundled_key)
+
+
+def go_pafy(raw_song, meta_tags=None):
+    """
+    Parse track from YouTube.
+
+    `raw_song` is used directly when it is a YouTube URL; otherwise a
+    matching video URL is looked up first. Returns the pafy object for
+    the video, or None when no matching URL could be generated.
+    """
+    if not internals.is_youtube(raw_song):
+        # Not a YouTube link: search for the best matching video.
+        matched_url = generate_youtube_url(raw_song, meta_tags)
+        return pafy.new(matched_url) if matched_url else None
+
+    return pafy.new(raw_song)
+
+
+def match_video_and_metadata(track):
+    """
+    Get and match track data from YouTube and Spotify.
+
+    `track` may be a YouTube URL, a Spotify URL or a plain text query.
+
+    Returns a `(content, meta_tags)` tuple: `content` is whatever
+    `go_pafy` produced (None when no video was matched) and `meta_tags`
+    is the metadata dictionary, or None when metadata is disabled or
+    could not be obtained.
+    """
+    meta_tags = None
+
+    def fallback_metadata(meta_tags):
+        """ Build metadata from the YouTube video when Spotify gave none. """
+        if meta_tags is not None:
+            return meta_tags
+
+        if const.args.no_fallback_metadata:
+            log.warning("Fallback condition not met, shall not embed metadata")
+        elif content is None:
+            # No video was matched either, so there is nothing to read the
+            # fallback metadata from. Without this guard the lookup below
+            # fails with AttributeError on `None.title`.
+            log.debug("No YouTube video matched, cannot fall back on its metadata")
+        else:
+            log.info("Track not found on Spotify, falling back on YouTube metadata")
+            meta_tags = generate_metadata(content)
+        return meta_tags
+
+    if internals.is_youtube(track):
+        log.debug("Input song is a YouTube URL")
+        content = go_pafy(track, meta_tags=None)
+        # Search Spotify using the video title rather than the URL.
+        track = slugify(content.title).replace("-", " ")
+        if not const.args.no_metadata:
+            meta_tags = spotify_tools.generate_metadata(track)
+            meta_tags = fallback_metadata(meta_tags)
+
+    elif internals.is_spotify(track):
+        log.debug("Input song is a Spotify URL")
+        # Let it generate metadata, YouTube doesn't know Spotify slang
+        meta_tags = spotify_tools.generate_metadata(track)
+        content = go_pafy(track, meta_tags)
+        if const.args.no_metadata:
+            # The metadata was only needed to build the YouTube search.
+            meta_tags = None
+
+    else:
+        log.debug("Input song is plain text based")
+        if const.args.no_metadata:
+            content = go_pafy(track, meta_tags=None)
+        else:
+            meta_tags = spotify_tools.generate_metadata(track)
+            content = go_pafy(track, meta_tags=meta_tags)
+            meta_tags = fallback_metadata(meta_tags)
+
+    return content, meta_tags
+
+
+def generate_metadata(content):
+    """
+    Fetch a song's metadata from YouTube.
+
+    Builds a metadata dictionary from the attributes of the video object
+    `content`. Fields that the video object does not supply are set to
+    None, or to 1 for the disc and track counters.
+    """
+    thumbnail_url = content.getbestthumb()
+    unknown_album = {
+        "images": [{"url": thumbnail_url}],
+        "artists": [{"name": None}],
+        "name": None,
+    }
+
+    meta_tags = {
+        "spotify_metadata": False,
+        "name": content.title,
+        "artists": [{"name": content.author}],
+        "duration": content.length,
+        "external_urls": {"youtube": content.watchv_url},
+        "album": unknown_album,
+        "year": None,
+        "release_date": None,
+        "type": "track",
+        "disc_number": 1,
+        "track_number": 1,
+        "total_tracks": 1,
+        "publisher": None,
+        "external_ids": {"isrc": None},
+        "lyrics": None,
+        "copyright": None,
+        "genre": None,
+    }
+
+    # Reading the publish date may fail, see
+    # https://github.com/ritiek/spotify-downloader/issues/671
+    # In that case both date fields simply stay None.
+    try:
+        year = content.published.split("-")[0]
+        meta_tags["year"] = year
+        release_date = content.published.split(" ")[0]
+        meta_tags["release_date"] = release_date
+    except pafy.util.GdataError:
+        pass
+
+    return meta_tags
+
+
+def get_youtube_title(content, number=None):
+    """
+    Get the YouTube video's title.
+
+    When a truthy `number` is given the title is prefixed with it, as
+    in "3. Title"; otherwise the bare title is returned.
+    """
+    video_title = content.title
+    return "{0}. {1}".format(number, video_title) if number else video_title
+
+
+def generate_m3u(track_file):
+    """
+    Write an M3U playlist of YouTube URLs for the tracks in `track_file`.
+
+    The playlist is created next to `track_file`, with its final
+    extension replaced by ".m3u". Each track is matched to a YouTube
+    video; tracks without a match are skipped with a warning.
+
+    Returns the list of matched YouTube URLs.
+    """
+    tracks = internals.get_unique_tracks(track_file)
+    # Strip only the final extension. Cutting at the first "." mangled
+    # paths such as "./list.txt" (-> ".m3u") or "my.list.txt" (-> "my.m3u").
+    target_file = "{}.m3u".format(os.path.splitext(track_file)[0])
+    total_tracks = len(tracks)
+    log.info("Generating {0} from {1} YouTube URLs".format(target_file, total_tracks))
+    with open(target_file, "w") as output_file:
+        output_file.write("#EXTM3U\n\n")
+
+    videos = []
+    for n, track in enumerate(tracks, 1):
+        content, _ = match_video_and_metadata(track)
+        if content is None:
+            log.warning("Skipping {}".format(track))
+            continue
+
+        log.info(
+            "Matched track {0}/{1} ({2})".format(n, total_tracks, content.watchv_url)
+        )
+        log.debug(track)
+        m3u_key = "#EXTINF:{duration},{title}\n{youtube_url}\n".format(
+            duration=internals.get_sec(content.duration),
+            title=content.title,
+            youtube_url=content.watchv_url,
+        )
+        log.debug(m3u_key)
+        # Append entry by entry so that already matched tracks are on disk
+        # even if a later lookup fails.
+        with open(target_file, "a") as output_file:
+            output_file.write(m3u_key)
+        videos.append(content.watchv_url)
+
+    return videos
+
+
+def download_song(file_name, content):
+    """
+    Download the audio file from YouTube.
+
+    The stream type is chosen from the extension of `file_name`; only
+    ".webm" and ".m4a" are handled. Returns True once the download has
+    been issued and False when no suitable audio stream is available.
+    """
+    extension = os.path.splitext(file_name)[1]
+    if extension not in (".webm", ".m4a"):
+        log.debug("No audio streams available for {} type".format(extension))
+        return False
+
+    # Drop the leading dot to get the stream type
+    link = content.getbestaudio(preftype=extension[1:])
+    if not link:
+        log.debug("No audio streams available")
+        return False
+
+    log.debug("Downloading from URL: " + link.url)
+    filepath = os.path.join(const.args.folder, file_name)
+    log.debug("Saving to: " + filepath)
+    link.download(filepath=filepath)
+    return True
+
+
+def generate_search_url(query):
+    """
+    Generate YouTube search URL for the given song.
+
+    `query` is percent-encoded and placed in the `q` parameter of the
+    YouTube results page URL, which is returned as a string.
+    """
+    # A bare `import urllib` does not load the `urllib.request` submodule;
+    # import it explicitly instead of relying on another module having
+    # imported it already.
+    import urllib.request
+
+    # urllib.request.quote() encodes string with special characters
+    quoted_query = urllib.request.quote(query)
+    # Special YouTube URL filter to search only for videos
+    return "https://www.youtube.com/results?sp=EgIQAQ%253D%253D&q={0}".format(
+        quoted_query
+    )
+
+
+def is_video(result):
+    """
+    Tell whether a scraped search result is a plain video.
+
+    `result` is a parsed search result node. Returns False for
+    channels, mixes/playlists and advertisements, True otherwise.
+    """
+    # Nodes without a class attribute are treated as having no classes
+    # rather than raising KeyError.
+    own_classes = result.attrs.get("class", [])
+    parent_classes = result.parent.attrs.get("class", [])
+
+    # ensure result is not a channel
+    is_channel = (
+        result.find("channel") is not None
+        or "yt-lockup-channel" in parent_classes
+        or "yt-lockup-channel" in own_classes
+    )
+
+    # ensure result is not a mix/playlist
+    is_playlist = "yt-lockup-playlist" in parent_classes
+
+    # ensure video result is not an advertisement
+    is_advertisement = result.find("googleads") is not None
+
+    return not (is_channel or is_playlist or is_advertisement)
+
+
+def generate_youtube_url(raw_song, meta_tags):
+    """
+    Find the YouTube URL that matches a song.
+
+    Uses the YouTube API when an API key was passed in the arguments
+    and scrapes the search results page otherwise.
+    """
+    url_fetch = GenerateYouTubeURL(raw_song, meta_tags)
+    search = url_fetch.api if const.args.youtube_api_key else url_fetch.scrape
+    return search()
+
+
+class GenerateYouTubeURL:
+    """
+    Search YouTube for the video that best matches a song.
+
+    The search query is the raw input when no metadata is given;
+    otherwise it is built from the metadata using
+    `const.args.search_format`.
+    """
+
+    def __init__(self, raw_song, meta_tags):
+        self.raw_song = raw_song
+        self.meta_tags = meta_tags
+
+        if meta_tags is None:
+            self.search_query = raw_song
+        else:
+            self.search_query = internals.format_string(
+                const.args.search_format, meta_tags, force_spaces=True
+            )
+
+    def _best_match(self, videos):
+        """
+        Select the best matching video from a list of videos.
+
+        `videos` is a list of dictionaries with the keys "link", "title",
+        "videotime" and "seconds". Returns the watch URL of the chosen
+        video, or None when the list is empty, the user skipped the song
+        in manual mode, or no video is close enough in duration.
+        """
+        if not videos:
+            log.error("No videos found on YouTube for a given search")
+            return None
+
+        if const.args.manual:
+            log.info(self.raw_song)
+            log.info("0. Skip downloading this song.\n")
+            # fetch all video links on first page on YouTube
+            for i, v in enumerate(videos):
+                log.info(
+                    u"{0}. {1} {2} {3}".format(
+                        i + 1,
+                        v["title"],
+                        v["videotime"],
+                        "http://youtube.com/watch?v=" + v["link"],
+                    )
+                )
+            # let user select the song to download
+            result = internals.input_link(videos)
+            if result is None:
+                return None
+        elif not self.meta_tags:
+            # if the metadata could not be acquired, take the first result
+            # from Youtube because the proper song length is unknown
+            result = videos[0]
+            log.debug(
+                "Since no metadata found on Spotify, going with the first result"
+            )
+        else:
+            # filter out videos that do not have a similar length to the Spotify song
+            duration_tolerance = 10
+            max_duration_tolerance = 20
+            possible_videos_by_duration = []
+
+            # start with a reasonable tolerance and widen it one second at a
+            # time until one of the Youtube results falls within it. The
+            # maximum tolerance itself is tried as well: the match is checked
+            # before giving up, so a video found at exactly
+            # max_duration_tolerance is no longer thrown away.
+            for tolerance in range(duration_tolerance, max_duration_tolerance + 1):
+                possible_videos_by_duration = [
+                    video
+                    for video in videos
+                    if abs(video["seconds"] - self.meta_tags["duration"])
+                    <= tolerance
+                ]
+                if possible_videos_by_duration:
+                    break
+            else:
+                log.error(
+                    "{0} by {1} was not found.".format(
+                        self.meta_tags["name"],
+                        self.meta_tags["artists"][0]["name"],
+                    )
+                )
+                return None
+
+            result = possible_videos_by_duration[0]
+
+        if result:
+            url = "http://youtube.com/watch?v={0}".format(result["link"])
+        else:
+            url = None
+
+        return url
+
+    def scrape(self, bestmatch=True, tries_remaining=5):
+        """
+        Search and scrape YouTube to return a list of matching videos.
+
+        With `bestmatch` set, the URL chosen by `_best_match` is returned
+        instead of the list. The whole scrape is retried whenever a result
+        lacks a duration; None is returned once `tries_remaining` hits 0.
+        """
+
+        # prevents an infinite loop but allows for a few retries
+        if tries_remaining == 0:
+            log.debug("No tries left. I quit.")
+            return
+
+        search_url = generate_search_url(self.search_query)
+        log.debug("Opening URL: {0}".format(search_url))
+
+        item = self._fetch_response(search_url).read()
+        items_parse = BeautifulSoup(item, "html.parser")
+
+        videos = []
+        for x in items_parse.find_all(
+            "div", {"class": "yt-lockup-dismissable yt-uix-tile"}
+        ):
+
+            if not is_video(x):
+                continue
+
+            y = x.find("div", class_="yt-lockup-content")
+            anchor = y.find("a")
+            # keep the last 11 characters of the href (presumably the video id)
+            link = anchor["href"][-11:]
+            title = anchor["title"]
+
+            try:
+                videotime = x.find("span", class_="video-time").get_text()
+            except AttributeError:
+                log.debug("Could not find video duration on YouTube, retrying..")
+                return self.scrape(
+                    bestmatch=bestmatch, tries_remaining=tries_remaining - 1
+                )
+
+            youtubedetails = {
+                "link": link,
+                "title": title,
+                "videotime": videotime,
+                "seconds": internals.get_sec(videotime),
+            }
+            videos.append(youtubedetails)
+
+        if bestmatch:
+            return self._best_match(videos)
+
+        return videos
+
+    def api(self, bestmatch=True):
+        """
+        Use YouTube API to search and return a list of matching videos.
+
+        With `bestmatch` set, the URL chosen by `_best_match` is returned
+        instead of the list.
+        """
+
+        query = {"part": "snippet", "maxResults": 50, "type": "video"}
+
+        if const.args.music_videos_only:
+            # NOTE(review): "10" is presumably the Music category id - confirm
+            query["videoCategoryId"] = "10"
+
+        if not self.meta_tags:
+            song = self.raw_song
+            query["q"] = song
+        else:
+            query["q"] = self.search_query
+        log.debug("query: {0}".format(query))
+
+        data = pafy.call_gdata("search", query)
+        # drop search hits that carry no video id
+        data["items"] = list(
+            filter(lambda x: x["id"].get("videoId") is not None, data["items"])
+        )
+        # second request: fetch the details (duration among them) of the hits
+        query_results = {
+            "part": "contentDetails,snippet,statistics",
+            "maxResults": 50,
+            "id": ",".join(i["id"]["videoId"] for i in data["items"]),
+        }
+        log.debug("query_results: {0}".format(query_results))
+
+        vdata = pafy.call_gdata("videos", query_results)
+
+        videos = []
+        for x in vdata["items"]:
+            duration_s = pafy.playlist.parseISO8591(x["contentDetails"]["duration"])
+            youtubedetails = {
+                "link": x["id"],
+                "title": x["snippet"]["title"],
+                "videotime": internals.videotime_from_seconds(duration_s),
+                "seconds": duration_s,
+            }
+            videos.append(youtubedetails)
+
+        if bestmatch:
+            return self._best_match(videos)
+
+        return videos
+
+    @staticmethod
+    def _fetch_response(url):
+        # XXX: This method exists only because it helps us indirectly
+        # monkey patch `urllib.request.open`, directly monkey patching
+        # `urllib.request.open` causes us to end up in an infinite recursion
+        # during the test since `urllib.request.open` would monkeypatch itself.
+
+        # A bare `import urllib` does not load the `urllib.request`
+        # submodule, so import it explicitly here.
+        import urllib.request
+
+        return urllib.request.urlopen(url)