Files
spotify-downloader/spotdl/youtube_tools.py
Arryon Tijsma 32c2ace96c Fixes #272 by refactoring global log to conventional global... (#358)
* Fixes #272 by refactoring global log to conventional global as used by logzero

* Remove unnecessary logger from download_single, which was a test case

* Remove unnecessary logger from main(), which was a test case
2018-09-21 22:43:26 -07:00

240 lines
8.3 KiB
Python

from bs4 import BeautifulSoup
import urllib
import pafy
from logzero import logger as log
from spotdl import internals
from spotdl import const
import os
import pprint
# Fix download speed throttle on short duration tracks
# Read more on mps-youtube/pafy#199
pafy.g.opener.addheaders.append(('Range', 'bytes=0-'))
def set_api_key():
if const.args.youtube_api_key:
key = const.args.youtube_api_key
else:
# Please respect this YouTube token :)
key = 'AIzaSyC6cEeKlxtOPybk9sEe5ksFN5sB-7wzYp0'
pafy.set_api_key(key)
def go_pafy(raw_song, meta_tags=None):
""" Parse track from YouTube. """
if internals.is_youtube(raw_song):
track_info = pafy.new(raw_song)
else:
track_url = generate_youtube_url(raw_song, meta_tags)
if track_url:
track_info = pafy.new(track_url)
else:
track_info = None
return track_info
def get_youtube_title(content, number=None):
""" Get the YouTube video's title. """
title = content.title
if number:
return '{0}. {1}'.format(number, title)
else:
return title
def download_song(file_name, content):
""" Download the audio file from YouTube. """
_, extension = os.path.splitext(file_name)
if extension in ('.webm', '.m4a'):
link = content.getbestaudio(preftype=extension[1:])
else:
log.debug('No audio streams available for {} type'.format(extension))
return False
if link:
log.debug('Downloading from URL: ' + link.url)
filepath = os.path.join(const.args.folder, file_name)
log.debug('Saving to: ' + filepath)
link.download(filepath=filepath)
return True
else:
log.debug('No audio streams available')
return False
def generate_search_url(query):
""" Generate YouTube search URL for the given song. """
# urllib.request.quote() encodes string with special characters
quoted_query = urllib.request.quote(query)
# Special YouTube URL filter to search only for videos
url = 'https://www.youtube.com/results?sp=EgIQAQ%253D%253D&q={0}'.format(quoted_query)
return url
def is_video(result):
# ensure result is not a channel
not_video = result.find('channel') is not None or \
'yt-lockup-channel' in result.parent.attrs['class'] or \
'yt-lockup-channel' in result.attrs['class']
# ensure result is not a mix/playlist
not_video = not_video or \
'yt-lockup-playlist' in result.parent.attrs['class']
# ensure video result is not an advertisement
not_video = not_video or \
result.find('googleads') is not None
video = not not_video
return video
def generate_youtube_url(raw_song, meta_tags):
url_fetch = GenerateYouTubeURL(raw_song, meta_tags)
if const.args.youtube_api_key:
url = url_fetch.api()
else:
url = url_fetch.scrape()
return url
class GenerateYouTubeURL:
def __init__(self, raw_song, meta_tags):
self.raw_song = raw_song
self.meta_tags = meta_tags
if meta_tags is None:
self.search_query = raw_song
else:
self.search_query = internals.format_string(const.args.search_format,
meta_tags, force_spaces=True)
def _best_match(self, videos):
""" Select the best matching video from a list of videos. """
if const.args.manual:
log.info(self.raw_song)
log.info('0. Skip downloading this song.\n')
# fetch all video links on first page on YouTube
for i, v in enumerate(videos):
log.info(u'{0}. {1} {2} {3}'.format(i+1, v['title'], v['videotime'],
"http://youtube.com/watch?v="+v['link']))
# let user select the song to download
result = internals.input_link(videos)
if result is None:
return None
else:
if not self.meta_tags:
# if the metadata could not be acquired, take the first result
# from Youtube because the proper song length is unknown
result = videos[0]
log.debug('Since no metadata found on Spotify, going with the first result')
else:
# filter out videos that do not have a similar length to the Spotify song
duration_tolerance = 10
max_duration_tolerance = 20
possible_videos_by_duration = []
# start with a reasonable duration_tolerance, and increment duration_tolerance
# until one of the Youtube results falls within the correct duration or
# the duration_tolerance has reached the max_duration_tolerance
while len(possible_videos_by_duration) == 0:
possible_videos_by_duration = list(filter(lambda x: abs(x['seconds'] - self.meta_tags['duration']) <= duration_tolerance, videos))
duration_tolerance += 1
if duration_tolerance > max_duration_tolerance:
log.error("{0} by {1} was not found.\n".format(self.meta_tags['name'], self.meta_tags['artists'][0]['name']))
return None
result = possible_videos_by_duration[0]
if result:
url = "http://youtube.com/watch?v={0}".format(result['link'])
else:
url = None
return url
def scrape(self, bestmatch=True, tries_remaining=5):
""" Search and scrape YouTube to return a list of matching videos. """
# prevents an infinite loop but allows for a few retries
if tries_remaining == 0:
log.debug('No tries left. I quit.')
return
search_url = generate_search_url(self.search_query)
log.debug('Opening URL: {0}'.format(search_url))
item = urllib.request.urlopen(search_url).read()
items_parse = BeautifulSoup(item, "html.parser")
videos = []
for x in items_parse.find_all('div', {'class': 'yt-lockup-dismissable yt-uix-tile'}):
if not is_video(x):
continue
y = x.find('div', class_='yt-lockup-content')
link = y.find('a')['href'][-11:]
title = y.find('a')['title']
try:
videotime = x.find('span', class_="video-time").get_text()
except AttributeError:
log.debug('Could not find video duration on YouTube, retrying..')
return self.scrape(bestmatch=bestmatch, tries_remaining=tries_remaining-1)
youtubedetails = {'link': link, 'title': title, 'videotime': videotime,
'seconds': internals.get_sec(videotime)}
videos.append(youtubedetails)
if bestmatch:
return self._best_match(videos)
return videos
def api(self, bestmatch=True):
""" Use YouTube API to search and return a list of matching videos. """
query = { 'part' : 'snippet',
'maxResults' : 50,
'type' : 'video' }
if const.args.music_videos_only:
query['videoCategoryId'] = '10'
if not self.meta_tags:
song = self.raw_song
query['q'] = song
else:
query['q'] = self.search_query
log.debug('query: {0}'.format(query))
data = pafy.call_gdata('search', query)
data['items'] = list(filter(lambda x: x['id'].get('videoId') is not None,
data['items']))
query_results = {'part': 'contentDetails,snippet,statistics',
'maxResults': 50,
'id': ','.join(i['id']['videoId'] for i in data['items'])}
log.debug('query_results: {0}'.format(query_results))
vdata = pafy.call_gdata('videos', query_results)
videos = []
for x in vdata['items']:
duration_s = pafy.playlist.parseISO8591(x['contentDetails']['duration'])
youtubedetails = {'link': x['id'], 'title': x['snippet']['title'],
'videotime':internals.videotime_from_seconds(duration_s),
'seconds': duration_s}
videos.append(youtubedetails)
if bestmatch:
return self._best_match(videos)
return videos