mirror of
https://github.com/KevinMidboe/spotify-downloader.git
synced 2025-10-29 18:00:15 +00:00
Refactor encoding
This commit is contained in:
412
spotdl/downloaders/youtube.py
Normal file
412
spotdl/downloaders/youtube.py
Normal file
@@ -0,0 +1,412 @@
|
||||
from bs4 import BeautifulSoup
|
||||
import urllib
|
||||
import pafy
|
||||
|
||||
from slugify import slugify
|
||||
from logzero import logger as log
|
||||
import os
|
||||
|
||||
from spotdl import spotify_tools
|
||||
from spotdl import internals
|
||||
from spotdl import const
|
||||
|
||||
# Fix download speed throttle on short duration tracks
# Read more on mps-youtube/pafy#199
pafy.g.opener.addheaders.append(("Range", "bytes=0-"))

# Implement unreleased methods on Pafy object
# More info: https://github.com/mps-youtube/pafy/pull/211
#
# Compare the version numerically. A plain string comparison is
# lexicographic, so e.g. "0.10.0" <= "0.5.5" would be True and a newer
# pafy release would be patched by mistake. Non-numeric components
# (such as a "rc1" suffix) are ignored.
_pafy_version = tuple(
    int(part) for part in pafy.__version__.split(".") if part.isdigit()
)
if _pafy_version <= (0, 5, 5):
    from spotdl import patcher

    pafy_patcher = patcher.PatchPafy()
    pafy_patcher.patch_getbestthumb()
    pafy_patcher.patch_process_streams()
    pafy_patcher.patch_insecure_streams()
|
||||
|
||||
|
||||
def set_api_key():
    """ Hand a YouTube Data API key over to pafy.

    The key from ``const.args.youtube_api_key`` is used when it is truthy;
    otherwise the key bundled with this module is used.
    """
    # Please respect this YouTube token :)
    bundled_key = "AIzaSyC6cEeKlxtOPybk9sEe5ksFN5sB-7wzYp0"
    pafy.set_api_key(const.args.youtube_api_key or bundled_key)
|
||||
|
||||
|
||||
def go_pafy(raw_song, meta_tags=None):
    """ Resolve a song to a pafy object.

    A YouTube URL is passed to ``pafy.new`` directly. Anything else is first
    turned into a YouTube URL with ``generate_youtube_url``; when that yields
    nothing, ``None`` is returned instead of a pafy object.
    """
    if internals.is_youtube(raw_song):
        return pafy.new(raw_song)

    track_url = generate_youtube_url(raw_song, meta_tags)
    if not track_url:
        return None
    return pafy.new(track_url)
|
||||
|
||||
|
||||
def match_video_and_metadata(track):
    """ Get and match track data from YouTube and Spotify.

    ``track`` may be a YouTube URL, a Spotify URL or plain search text.

    Returns a ``(content, meta_tags)`` tuple. ``content`` is whatever
    ``go_pafy`` returned (``None`` when no video was matched) and
    ``meta_tags`` is the metadata dict, or ``None`` when metadata is
    disabled or unavailable.
    """
    # NOTE(review): block nesting was reconstructed from a listing that had
    # lost its indentation -- confirm the fallback calls sit inside the
    # "metadata enabled" branches as written here.
    meta_tags = None

    def fallback_metadata(meta_tags, content):
        """ Return ``meta_tags``, or YouTube-derived metadata if it is None. """
        fallback_metadata_info = (
            "Track not found on Spotify, falling back on YouTube metadata"
        )
        skip_fallback_metadata_warning = (
            "Fallback condition not met, shall not embed metadata"
        )
        if meta_tags is not None:
            return meta_tags
        if const.args.no_fallback_metadata:
            log.warning(skip_fallback_metadata_warning)
            return None
        if content is None:
            # No video was matched either, so there is nothing to build the
            # fallback metadata from (generate_metadata would fail on None).
            log.debug("No YouTube video matched, cannot fall back on YouTube metadata")
            return None
        log.info(fallback_metadata_info)
        return generate_metadata(content)

    if internals.is_youtube(track):
        log.debug("Input song is a YouTube URL")
        content = go_pafy(track, meta_tags=None)
        # Turn the video title into a plain-text query for the Spotify lookup
        track = slugify(content.title).replace("-", " ")
        if not const.args.no_metadata:
            meta_tags = spotify_tools.generate_metadata(track)
            meta_tags = fallback_metadata(meta_tags, content)

    elif internals.is_spotify(track):
        log.debug("Input song is a Spotify URL")
        # Let it generate metadata, YouTube doesn't know Spotify slang
        meta_tags = spotify_tools.generate_metadata(track)
        content = go_pafy(track, meta_tags)
        if const.args.no_metadata:
            meta_tags = None

    else:
        log.debug("Input song is plain text based")
        if const.args.no_metadata:
            content = go_pafy(track, meta_tags=None)
        else:
            meta_tags = spotify_tools.generate_metadata(track)
            content = go_pafy(track, meta_tags=meta_tags)
            meta_tags = fallback_metadata(meta_tags, content)

    return content, meta_tags
|
||||
|
||||
|
||||
def generate_metadata(content):
    """ Build a metadata dict for a song from its YouTube video.

    ``content`` supplies ``title``, ``author``, ``length``, ``watchv_url``,
    ``getbestthumb()`` and ``published``. Fields YouTube cannot provide are
    set to ``None`` and the track/disc counters default to 1.
    """
    meta_tags = dict(
        spotify_metadata=False,
        name=content.title,
        artists=[dict(name=content.author)],
        duration=content.length,
        external_urls=dict(youtube=content.watchv_url),
        album=dict(
            images=[dict(url=content.getbestthumb())],
            artists=[dict(name=None)],
            name=None,
        ),
        year=None,
        release_date=None,
        type="track",
        disc_number=1,
        track_number=1,
        total_tracks=1,
        publisher=None,
        external_ids=dict(isrc=None),
        lyrics=None,
        copyright=None,
        genre=None,
    )

    # Workaround for
    # https://github.com/ritiek/spotify-downloader/issues/671
    # Reading the publish date may raise; in that case both date fields
    # that have not been filled yet simply stay None.
    try:
        # text before the first "-" is the year
        meta_tags["year"] = content.published.partition("-")[0]
        # text before the first space is the date part
        meta_tags["release_date"] = content.published.partition(" ")[0]
    except pafy.util.GdataError:
        pass

    return meta_tags
|
||||
|
||||
|
||||
def get_youtube_title(content, number=None):
    """ Return the video's title, prefixed with "<number>. " when a truthy number is given. """
    video_title = content.title
    return "{0}. {1}".format(number, video_title) if number else video_title
|
||||
|
||||
|
||||
def generate_m3u(track_file):
    """ Write an M3U playlist of YouTube URLs for the tracks in a file.

    The playlist is written next to ``track_file`` with its extension
    replaced by ``.m3u``. Tracks for which no video is matched are skipped
    with a warning.

    Returns the list of YouTube watch URLs written to the playlist.
    """
    tracks = internals.get_unique_tracks(track_file)
    # splitext() strips only the final extension; splitting on the first "."
    # would mangle paths such as "./list.txt" or "my.list.txt".
    target_file = "{}.m3u".format(os.path.splitext(track_file)[0])
    total_tracks = len(tracks)
    log.info("Generating {0} from {1} YouTube URLs".format(target_file, total_tracks))

    videos = []
    with open(target_file, "w") as output_file:
        output_file.write("#EXTM3U\n\n")

        for n, track in enumerate(tracks, 1):
            content, _ = match_video_and_metadata(track)
            if content is None:
                log.warning("Skipping {}".format(track))
                continue

            log.info(
                "Matched track {0}/{1} ({2})".format(
                    n, total_tracks, content.watchv_url
                )
            )
            log.debug(track)
            m3u_key = "#EXTINF:{duration},{title}\n{youtube_url}\n".format(
                duration=internals.get_sec(content.duration),
                title=content.title,
                youtube_url=content.watchv_url,
            )
            log.debug(m3u_key)
            output_file.write(m3u_key)
            # Flush per entry so the playlist on disk grows as tracks are
            # matched, as it did when the file was re-opened for each track.
            output_file.flush()
            videos.append(content.watchv_url)

    return videos
|
||||
|
||||
|
||||
def download_song(file_name, content):
    """ Download the audio stream of a video into ``const.args.folder``.

    Only ``.webm`` and ``.m4a`` file names are supported; the extension
    selects the preferred stream type. Returns True once the download call
    has completed, False when the extension is unsupported or no audio
    stream is available.
    """
    extension = os.path.splitext(file_name)[1]
    if extension not in (".webm", ".m4a"):
        log.debug("No audio streams available for {} type".format(extension))
        return False

    # drop the leading "." to get the stream type name
    link = content.getbestaudio(preftype=extension[1:])
    if not link:
        log.debug("No audio streams available")
        return False

    log.debug("Downloading from URL: " + link.url)
    filepath = os.path.join(const.args.folder, file_name)
    log.debug("Saving to: " + filepath)
    link.download(filepath=filepath)
    return True
|
||||
|
||||
|
||||
def generate_search_url(query):
    """ Generate YouTube search URL for the given song.

    ``query`` is percent-encoded and appended to a results URL that carries
    YouTube's "videos only" filter. Returns the URL as a string.
    """
    # Import the function explicitly: a bare `import urllib` does not
    # guarantee that the `urllib.request` / `urllib.parse` submodules are
    # loaded. `urllib.request.quote` is this same function re-exported.
    from urllib.parse import quote

    # quote() encodes string with special characters
    quoted_query = quote(query)
    # Special YouTube URL filter to search only for videos
    url = "https://www.youtube.com/results?sp=EgIQAQ%253D%253D&q={0}".format(
        quoted_query
    )
    return url
|
||||
|
||||
|
||||
def is_video(result):
    """ Tell whether a scraped search result is a plain video.

    ``result`` is a parsed search-result element exposing ``find``,
    ``attrs`` and ``parent``. Returns False for channels, mixes/playlists
    and advertisements, True otherwise.
    """
    # Elements without a "class" attribute (or without a parent) are
    # treated as having no classes instead of raising KeyError /
    # AttributeError.
    parent = result.parent
    parent_classes = (parent.attrs.get("class") or []) if parent is not None else []
    own_classes = result.attrs.get("class") or []

    # ensure result is not a channel
    is_channel = (
        result.find("channel") is not None
        or "yt-lockup-channel" in parent_classes
        or "yt-lockup-channel" in own_classes
    )

    # ensure result is not a mix/playlist
    is_playlist = "yt-lockup-playlist" in parent_classes

    # ensure video result is not an advertisement
    is_advertisement = result.find("googleads") is not None

    return not (is_channel or is_playlist or is_advertisement)
|
||||
|
||||
|
||||
def generate_youtube_url(raw_song, meta_tags):
    """ Find a YouTube URL for a song.

    Uses the YouTube API search when ``const.args.youtube_api_key`` is
    truthy and scrapes the search results page otherwise. Returns whatever
    the chosen ``GenerateYouTubeURL`` method returns.
    """
    url_fetch = GenerateYouTubeURL(raw_song, meta_tags)
    search = url_fetch.api if const.args.youtube_api_key else url_fetch.scrape
    return search()
|
||||
|
||||
|
||||
class GenerateYouTubeURL:
    """ Search YouTube for a song and pick the video that matches it best.

    ``raw_song`` is the user's input and ``meta_tags`` the song's metadata
    dict (or ``None``). With metadata, the search query is built from
    ``const.args.search_format``; without it, ``raw_song`` is searched as is.
    """

    def __init__(self, raw_song, meta_tags):
        self.raw_song = raw_song
        self.meta_tags = meta_tags

        if meta_tags is None:
            self.search_query = raw_song
        else:
            self.search_query = internals.format_string(
                const.args.search_format, meta_tags, force_spaces=True
            )

    @staticmethod
    def _match_by_duration(videos, duration, tolerance=10, max_tolerance=20):
        """ Return the videos whose length is closest to ``duration``.

        Starts with ``tolerance`` seconds of allowed difference and widens it
        one second at a time up to and including ``max_tolerance``. The
        videos matching at the first tolerance that matches anything are
        returned in their original order; an empty list means no video was
        within ``max_tolerance`` seconds.
        """
        for allowed in range(tolerance, max_tolerance + 1):
            matches = [
                video
                for video in videos
                if abs(video["seconds"] - duration) <= allowed
            ]
            if matches:
                return matches
        return []

    def _best_match(self, videos):
        """ Select the best matching video from a list of videos.

        Returns the watch URL of the chosen video, or ``None`` when the list
        is empty, the user skips the song in manual mode, or no video has a
        length close enough to the song's duration.
        """
        if not videos:
            log.error("No videos found on YouTube for a given search")
            return None

        if const.args.manual:
            log.info(self.raw_song)
            log.info("0. Skip downloading this song.\n")
            # fetch all video links on first page on YouTube
            for i, v in enumerate(videos):
                log.info(
                    u"{0}. {1} {2} {3}".format(
                        i + 1,
                        v["title"],
                        v["videotime"],
                        "http://youtube.com/watch?v=" + v["link"],
                    )
                )
            # let user select the song to download
            result = internals.input_link(videos)
            if result is None:
                return None
        elif not self.meta_tags:
            # if the metadata could not be acquired, take the first result
            # from Youtube because the proper song length is unknown
            result = videos[0]
            log.debug(
                "Since no metadata found on Spotify, going with the first result"
            )
        else:
            # filter out videos that do not have a similar length to the
            # Spotify song. The previous inline loop bumped the tolerance
            # past the maximum before re-checking for matches, so videos
            # found exactly at the maximum tolerance were thrown away.
            possible_videos_by_duration = self._match_by_duration(
                videos, self.meta_tags["duration"]
            )
            if not possible_videos_by_duration:
                log.error(
                    "{0} by {1} was not found.".format(
                        self.meta_tags["name"],
                        self.meta_tags["artists"][0]["name"],
                    )
                )
                return None

            result = possible_videos_by_duration[0]

        if result:
            url = "http://youtube.com/watch?v={0}".format(result["link"])
        else:
            url = None

        return url

    def scrape(self, bestmatch=True, tries_remaining=5):
        """ Search and scrape YouTube to return a list of matching videos.

        With ``bestmatch`` true, the URL chosen by ``_best_match`` is
        returned instead of the list. The whole search is retried (up to
        ``tries_remaining`` times) when a result lacks a duration; ``None``
        is returned once no tries are left.
        """
        # prevents an infinite loop but allows for a few retries
        if tries_remaining == 0:
            log.debug("No tries left. I quit.")
            return

        search_url = generate_search_url(self.search_query)
        log.debug("Opening URL: {0}".format(search_url))

        item = self._fetch_response(search_url).read()
        items_parse = BeautifulSoup(item, "html.parser")

        videos = []
        for x in items_parse.find_all(
            "div", {"class": "yt-lockup-dismissable yt-uix-tile"}
        ):
            if not is_video(x):
                continue

            y = x.find("div", class_="yt-lockup-content")
            # keep only the trailing 11 characters of the href
            # (presumably the video id -- TODO confirm)
            link = y.find("a")["href"][-11:]
            title = y.find("a")["title"]

            try:
                videotime = x.find("span", class_="video-time").get_text()
            except AttributeError:
                log.debug("Could not find video duration on YouTube, retrying..")
                return self.scrape(
                    bestmatch=bestmatch, tries_remaining=tries_remaining - 1
                )

            youtubedetails = {
                "link": link,
                "title": title,
                "videotime": videotime,
                "seconds": internals.get_sec(videotime),
            }
            videos.append(youtubedetails)

        if bestmatch:
            return self._best_match(videos)

        return videos

    def api(self, bestmatch=True):
        """ Use YouTube API to search and return a list of matching videos.

        With ``bestmatch`` true, the URL chosen by ``_best_match`` is
        returned instead of the list.
        """
        query = {"part": "snippet", "maxResults": 50, "type": "video"}

        if const.args.music_videos_only:
            query["videoCategoryId"] = "10"

        if not self.meta_tags:
            query["q"] = self.raw_song
        else:
            query["q"] = self.search_query
        log.debug("query: {0}".format(query))

        data = pafy.call_gdata("search", query)
        # drop search hits that carry no video id
        data["items"] = [
            item for item in data["items"] if item["id"].get("videoId") is not None
        ]
        query_results = {
            "part": "contentDetails,snippet,statistics",
            "maxResults": 50,
            "id": ",".join(i["id"]["videoId"] for i in data["items"]),
        }
        log.debug("query_results: {0}".format(query_results))

        vdata = pafy.call_gdata("videos", query_results)

        videos = []
        for x in vdata["items"]:
            duration_s = pafy.playlist.parseISO8591(x["contentDetails"]["duration"])
            youtubedetails = {
                "link": x["id"],
                "title": x["snippet"]["title"],
                "videotime": internals.videotime_from_seconds(duration_s),
                "seconds": duration_s,
            }
            videos.append(youtubedetails)

        if bestmatch:
            return self._best_match(videos)

        return videos

    @staticmethod
    def _fetch_response(url):
        """ Open ``url`` and return the response object. """
        # XXX: This method exists only because it helps us indirectly
        # monkey patch `urllib.request.urlopen`, directly monkey patching
        # `urllib.request.urlopen` causes us to end up in an infinite
        # recursion during the test since `urllib.request.urlopen` would
        # monkeypatch itself.
        #
        # Import the submodule explicitly; a bare `import urllib` does not
        # guarantee that `urllib.request` is loaded.
        import urllib.request

        return urllib.request.urlopen(url)
|
||||
Reference in New Issue
Block a user