Scrape YouTube by default and optionally use YouTube API to perform searches (#250)

* YouTube scraping

* Cleanup GenerateYouTubeURL class

* Some minor improvements

* Add test to fetch title with and without api key
This commit is contained in:
Ritiek Malhotra
2018-03-09 20:40:15 +05:30
committed by GitHub
parent 46f313777b
commit b968b5d206
7 changed files with 254 additions and 113 deletions

1
.gitignore vendored
View File

@@ -6,4 +6,5 @@ Music/
*.pyc
__pycache__/
.cache/
.pytest_cache/
.python-version

View File

@@ -141,7 +141,7 @@ optional arguments:
-ll {INFO,WARNING,ERROR,DEBUG}, --log-level {INFO,WARNING,ERROR,DEBUG}
set log verbosity (default: INFO)
-c CONFIG_FILE_PATH --config CONFIG_FILE_PATH
Replace with custom config.yml file (default: None)
Replace with custom config.yml file (default: None)
```
#### Download by Name
@@ -263,11 +263,16 @@ to override any default options.
Also note that config options are overridden by command-line arguments.
#### Specify the Custom Config File Path
If you want to use custom `.yml` configuration instead of the default one, you can use `-c`/`--config` option.
E.g. `$ python3 spotdl.py -s "adele hello" -c "/home/user/customConfig.yml"`
## Set YouTube API Key
By default this tool will scrape YouTube to fetch for matching video tracks.
However, you can optionally use YouTube API for faster response time.
To do this, [generate your API key](https://developers.google.com/youtube/registering_an_application)
and then set it in your `config.yml`.
## [Docker Image](https://hub.docker.com/r/ritiek/spotify-downloader/)
[![Docker automated build](https://img.shields.io/docker/automated/jrottenberg/ffmpeg.svg)](https://hub.docker.com/r/ritiek/spotify-downloader)
[![Docker pulls](https://img.shields.io/docker/pulls/ritiek/spotify-downloader.svg)](https://hub.docker.com/r/ritiek/spotify-downloader)

View File

@@ -23,6 +23,7 @@ default_conf = { 'spotify-downloader':
'music-videos-only' : False,
'no-spaces' : False,
'file-format' : '{artist} - {track_name}',
'youtube-api-key' : None,
'log-level' : 'INFO' }
}
@@ -40,7 +41,7 @@ def merge(default, config):
merged.update(config)
return merged
def get_config(config_file):
try:
with open(config_file, 'r') as ymlfile:
@@ -57,21 +58,22 @@ def override_config(config_file, parser, raw_args=None):
""" Override default dict with config dict passed as comamnd line argument. """
config_file = os.path.realpath(config_file)
config = merge(default_conf['spotify-downloader'], get_config(config_file))
parser.set_defaults(manual=config['manual'])
parser.set_defaults(no_metadata=config['no-metadata'])
parser.set_defaults(avconv=config['avconv'])
parser.set_defaults(folder=os.path.relpath(config['folder'], os.getcwd()))
parser.set_defaults(overwrite=config['overwrite'])
parser.set_defaults(input_ext=config['input-ext'])
parser.set_defaults(output_ext=config['output-ext'])
parser.set_defaults(download_only_metadata=config['download-only-metadata'])
parser.set_defaults(dry_run=config['dry-run'])
parser.set_defaults(file_format=config['file-format'])
parser.set_defaults(folder=os.path.relpath(config['folder'], os.getcwd()))
parser.set_defaults(input_ext=config['input-ext'])
parser.set_defaults(log_level=config['log-level'])
parser.set_defaults(manual=config['manual'])
parser.set_defaults(music_videos_only=config['music-videos-only'])
parser.set_defaults(no_metadata=config['no-metadata'])
parser.set_defaults(no_spaces=config['no-spaces'])
parser.set_defaults(output_ext=config['output-ext'])
parser.set_defaults(overwrite=config['overwrite'])
parser.set_defaults(file_format=config['file-format'])
parser.set_defaults(no_spaces=config['youtube-api-key'])
parser.set_defaults(log_level=config['log-level'])
return parser.parse_args(raw_args)
@@ -151,15 +153,18 @@ def get_arguments(raw_args=None, to_group=True, to_merge=True):
choices=_LOG_LEVELS_STR,
type=str.upper,
help='set log verbosity')
parser.add_argument(
'-yk', '--youtube-api-key', default=config['youtube-api-key'],
help=argparse.SUPPRESS)
parser.add_argument(
'-c', '--config', default=None,
help='Replace with custom config.yml file')
help='Replace with custom config.yml file')
parsed = parser.parse_args(raw_args)
if parsed.config is not None and to_merge:
parsed = override_config(parsed.config,parser)
parsed = override_config(parsed.config, parser)
parsed.log_level = log_leveller(parsed.log_level)
return parsed

View File

@@ -120,6 +120,19 @@ def videotime_from_seconds(time):
return '{0}:{1:02}:{2:02}'.format((time//60)//60, (time//60) % 60, time % 60)
def get_sec(time_str):
v = time_str.split(':', 3)
v.reverse()
sec = 0
if len(v) > 0: # seconds
sec += int(v[0])
if len(v) > 1: # minutes
sec += int(v[1]) * 60
if len(v) > 2: # hours
sec += int(v[2]) * 3600
return sec
def get_splits(url):
if '/' in url:
if url.endswith('/'):
@@ -127,4 +140,4 @@ def get_splits(url):
splits = url.split('/')
else:
splits = url.split(':')
return splits
return splits

View File

@@ -1,3 +1,5 @@
from bs4 import BeautifulSoup
import urllib
import pafy
from core import internals
@@ -7,13 +9,21 @@ import os
import pprint
log = const.log
# Please respect this YouTube token :)
pafy.set_api_key('AIzaSyAnItl3udec-Q1d5bkjKJGL-RgrKO_vU90')
# Fix download speed throttle on short duration tracks
# Read more on mps-youtube/pafy#199
pafy.g.opener.addheaders.append(('Range', 'bytes=0-'))
def set_api_key():
if const.args.youtube_api_key:
key = const.args.youtube_api_key
else:
# Please respect this YouTube token :)
key = 'AIzaSyAnItl3udec-Q1d5bkjKJGL-RgrKO_vU90'
pafy.set_api_key(key)
def go_pafy(raw_song, meta_tags=None):
""" Parse track from YouTube. """
if internals.is_youtube(raw_song):
@@ -58,92 +68,175 @@ def download_song(file_name, content):
return False
def generate_youtube_url(raw_song, meta_tags, tries_remaining=5):
""" Search for the song on YouTube and generate a URL to its video. """
# prevents an infinite loop but allows for a few retries
if tries_remaining == 0:
log.debug('No tries left. I quit.')
return
query = { 'part' : 'snippet',
'maxResults' : 50,
'type' : 'video' }
if const.args.music_videos_only:
query['videoCategoryId'] = '10'
if not meta_tags:
song = raw_song
query['q'] = song
else:
song = '{0} - {1}'.format(meta_tags['artists'][0]['name'],
meta_tags['name'])
query['q'] = song
log.debug('query: {0}'.format(query))
data = pafy.call_gdata('search', query)
data['items'] = list(filter(lambda x: x['id'].get('videoId') is not None,
data['items']))
query_results = {'part': 'contentDetails,snippet,statistics',
'maxResults': 50,
'id': ','.join(i['id']['videoId'] for i in data['items'])}
log.debug('query_results: {0}'.format(query_results))
vdata = pafy.call_gdata('videos', query_results)
videos = []
for x in vdata['items']:
duration_s = pafy.playlist.parseISO8591(x['contentDetails']['duration'])
youtubedetails = {'link': x['id'], 'title': x['snippet']['title'],
'videotime':internals.videotime_from_seconds(duration_s),
'seconds': duration_s}
videos.append(youtubedetails)
if not meta_tags:
break
if not videos:
return None
if const.args.manual:
log.info(song)
log.info('0. Skip downloading this song.\n')
# fetch all video links on first page on YouTube
for i, v in enumerate(videos):
log.info(u'{0}. {1} {2} {3}'.format(i+1, v['title'], v['videotime'],
"http://youtube.com/watch?v="+v['link']))
# let user select the song to download
result = internals.input_link(videos)
if not result:
return None
else:
if not meta_tags:
# if the metadata could not be acquired, take the first result
# from Youtube because the proper song length is unknown
result = videos[0]
log.debug('Since no metadata found on Spotify, going with the first result')
else:
# filter out videos that do not have a similar length to the Spotify song
duration_tolerance = 10
max_duration_tolerance = 20
possible_videos_by_duration = list()
'''
start with a reasonable duration_tolerance, and increment duration_tolerance
until one of the Youtube results falls within the correct duration or
the duration_tolerance has reached the max_duration_tolerance
'''
while len(possible_videos_by_duration) == 0:
possible_videos_by_duration = list(filter(lambda x: abs(x['seconds'] - meta_tags['duration']) <= duration_tolerance, videos))
duration_tolerance += 1
if duration_tolerance > max_duration_tolerance:
log.error("{0} by {1} was not found.\n".format(meta_tags['name'], meta_tags['artists'][0]['name']))
return None
result = possible_videos_by_duration[0]
if result:
url = "http://youtube.com/watch?v=" + result['link']
else:
url = None
def generate_search_url(song):
""" Generate YouTube search URL for the given song. """
# urllib.request.quote() encodes URL with special characters
song = urllib.request.quote(song)
# Special YouTube URL filter to search only for videos
url = 'https://www.youtube.com/results?sp=EgIQAQ%253D%253D&q={0}'.format(song)
return url
def is_video(result):
# ensure result is not a channel
not_video = result.find('channel') is not None or \
'yt-lockup-channel' in result.parent.attrs['class'] or \
'yt-lockup-channel' in result.attrs['class']
# ensure result is not a mix/playlist
not_video = not_video or \
'yt-lockup-playlist' in result.parent.attrs['class']
# ensure video result is not an advertisement
not_video = not_video or \
result.find('googleads') is not None
video = not not_video
return video
def generate_youtube_url(raw_song, meta_tags):
url_fetch = GenerateYouTubeURL(raw_song, meta_tags)
if const.args.youtube_api_key:
url = url_fetch.api()
else:
url = url_fetch.scrape()
return url
class GenerateYouTubeURL:
def __init__(self, raw_song, meta_tags):
self.raw_song = raw_song
self.meta_tags = meta_tags
def _best_match(self, videos):
""" Select the best matching video from a list of videos. """
if const.args.manual:
log.info(self.raw_song)
log.info('0. Skip downloading this song.\n')
# fetch all video links on first page on YouTube
for i, v in enumerate(videos):
log.info(u'{0}. {1} {2} {3}'.format(i+1, v['title'], v['videotime'],
"http://youtube.com/watch?v="+v['link']))
# let user select the song to download
result = internals.input_link(videos)
if result is None:
return None
else:
if not self.meta_tags:
# if the metadata could not be acquired, take the first result
# from Youtube because the proper song length is unknown
result = videos[0]
log.debug('Since no metadata found on Spotify, going with the first result')
else:
# filter out videos that do not have a similar length to the Spotify song
duration_tolerance = 10
max_duration_tolerance = 20
possible_videos_by_duration = list()
'''
start with a reasonable duration_tolerance, and increment duration_tolerance
until one of the Youtube results falls within the correct duration or
the duration_tolerance has reached the max_duration_tolerance
'''
while len(possible_videos_by_duration) == 0:
possible_videos_by_duration = list(filter(lambda x: abs(x['seconds'] - self.meta_tags['duration']) <= duration_tolerance, videos))
duration_tolerance += 1
if duration_tolerance > max_duration_tolerance:
log.error("{0} by {1} was not found.\n".format(self.meta_tags['name'], self.meta_tags['artists'][0]['name']))
return None
result = possible_videos_by_duration[0]
if result:
url = "http://youtube.com/watch?v=" + result['link']
else:
url = None
return url
def scrape(self, tries_remaining=5):
""" Search and scrape YouTube to return a list of matching videos. """
# prevents an infinite loop but allows for a few retries
if tries_remaining == 0:
log.debug('No tries left. I quit.')
return
if self.meta_tags is None:
song = self.raw_song
search_url = generate_search_url(song)
else:
song = internals.generate_songname(const.args.file_format,
self.meta_tags)
search_url = generate_search_url(song)
log.debug('Opening URL: {0}'.format(search_url))
item = urllib.request.urlopen(search_url).read()
items_parse = BeautifulSoup(item, "html.parser")
videos = []
for x in items_parse.find_all('div', {'class': 'yt-lockup-dismissable yt-uix-tile'}):
if not is_video(x):
continue
y = x.find('div', class_='yt-lockup-content')
link = y.find('a')['href'][-11:]
title = y.find('a')['title']
try:
videotime = x.find('span', class_="video-time").get_text()
except AttributeError:
log.debug('Could not find video duration on YouTube, retrying..')
return generate_youtube_url(self.raw_song, self.meta_tags, tries_remaining - 1)
youtubedetails = {'link': link, 'title': title, 'videotime': videotime,
'seconds': internals.get_sec(videotime)}
videos.append(youtubedetails)
if self.meta_tags is None:
break
return self._best_match(videos)
def api(self):
""" Use YouTube API to search and return a list of matching videos. """
query = { 'part' : 'snippet',
'maxResults' : 50,
'type' : 'video' }
if const.args.music_videos_only:
query['videoCategoryId'] = '10'
if not self.meta_tags:
song = self.raw_song
query['q'] = song
else:
song = '{0} - {1}'.format(self.meta_tags['artists'][0]['name'],
self.meta_tags['name'])
query['q'] = song
log.debug('query: {0}'.format(query))
data = pafy.call_gdata('search', query)
data['items'] = list(filter(lambda x: x['id'].get('videoId') is not None,
data['items']))
query_results = {'part': 'contentDetails,snippet,statistics',
'maxResults': 50,
'id': ','.join(i['id']['videoId'] for i in data['items'])}
log.debug('query_results: {0}'.format(query_results))
vdata = pafy.call_gdata('videos', query_results)
videos = []
for x in vdata['items']:
duration_s = pafy.playlist.parseISO8591(x['contentDetails']['duration'])
youtubedetails = {'link': x['id'], 'title': x['snippet']['title'],
'videotime':internals.videotime_from_seconds(duration_s),
'seconds': duration_s}
videos.append(youtubedetails)
if not self.meta_tags:
break
return self._best_match(videos)

View File

@@ -170,6 +170,7 @@ def download_single(raw_song, number=None):
if __name__ == '__main__':
const.args = handle.get_arguments()
internals.filter_path(const.args.folder)
youtube_tools.set_api_key()
const.log = const.logzero.setup_logger(formatter=const.formatter,
level=const.args.log_level)

View File

@@ -13,6 +13,22 @@ loader.load_defaults()
raw_song = "Tony's Videos VERY SHORT VIDEO 28.10.2016"
class TestYouTubeAPIKeys:
def test_custom(self):
expect_key = 'some_api_key'
const.args.youtube_api_key = expect_key
youtube_tools.set_api_key()
key = youtube_tools.pafy.g.api_key
assert key == expect_key
def test_default(self):
expect_key = 'AIzaSyAnItl3udec-Q1d5bkjKJGL-RgrKO_vU90'
const.args.youtube_api_key = None
youtube_tools.set_api_key()
key = youtube_tools.pafy.g.api_key
assert key == expect_key
def test_metadata():
expect_metadata = None
global metadata
@@ -22,10 +38,12 @@ def test_metadata():
class TestYouTubeURL:
def test_only_music_category(self):
expect_url = 'http://youtube.com/watch?v=5USR1Omo7f0'
# YouTube keeps changing its results
expect_urls = ('http://youtube.com/watch?v=qOOcy2-tmbk',
'http://youtube.com/watch?v=5USR1Omo7f0')
const.args.music_videos_only = True
url = youtube_tools.generate_youtube_url(raw_song, metadata)
assert url == expect_url
assert url in expect_urls
def test_all_categories(self):
expect_url = 'http://youtube.com/watch?v=qOOcy2-tmbk'
@@ -49,16 +67,21 @@ class TestYouTubeURL:
class TestYouTubeTitle:
def test_single_download(self):
def test_single_download_with_youtube_api(self):
global content
global title
expect_title = "Tony's Videos VERY SHORT VIDEO 28.10.2016"
key = 'AIzaSyAnItl3udec-Q1d5bkjKJGL-RgrKO_vU90'
const.args.youtube_api_key = key
youtube_tools.set_api_key()
content = youtube_tools.go_pafy(raw_song, metadata)
title = youtube_tools.get_youtube_title(content)
assert title == expect_title
def test_download_from_list(self):
def test_download_from_list_without_youtube_api(self):
expect_title = "1. Tony's Videos VERY SHORT VIDEO 28.10.2016"
const.args.youtube_api_key = None
youtube_tools.set_api_key()
content = youtube_tools.go_pafy(raw_song, metadata)
title = youtube_tools.get_youtube_title(content, 1)
assert title == expect_title