mirror of
				https://github.com/KevinMidboe/spotify-downloader.git
				synced 2025-10-29 18:00:15 +00:00 
			
		
		
		
	Scrape YouTube by default and optionally use YouTube API to perform searches (#250)
* YouTube scraping * Cleanup GenerateYouTubeURL class * Some minor improvements * Add test to fetch title with and without api key
This commit is contained in:
		
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -6,4 +6,5 @@ Music/ | ||||
| *.pyc | ||||
| __pycache__/ | ||||
| .cache/ | ||||
| .pytest_cache/ | ||||
| .python-version | ||||
|   | ||||
| @@ -263,11 +263,16 @@ to override any default options. | ||||
|  | ||||
| Also note that config options are overridden by command-line arguments. | ||||
|  | ||||
| #### Specify the Custom Config File Path | ||||
|  | ||||
| If you want to use a custom `.yml` configuration instead of the default one, you can use the `-c`/`--config` option. | ||||
| E.g. `$ python3 spotdl.py -s "adele hello" -c "/home/user/customConfig.yml"` | ||||
|  | ||||
| ## Set YouTube API Key | ||||
|  | ||||
| By default, this tool scrapes YouTube to find matching video tracks. | ||||
| However, you can optionally use the YouTube API for faster response times. | ||||
| To do this, [generate your API key](https://developers.google.com/youtube/registering_an_application) | ||||
| and then set it as `youtube-api-key` in your `config.yml`. | ||||
|  | ||||
| ## [Docker Image](https://hub.docker.com/r/ritiek/spotify-downloader/) | ||||
| [](https://hub.docker.com/r/ritiek/spotify-downloader) | ||||
| [](https://hub.docker.com/r/ritiek/spotify-downloader) | ||||
|   | ||||
| @@ -23,6 +23,7 @@ default_conf = { 'spotify-downloader': | ||||
|                    'music-videos-only'      : False, | ||||
|                    'no-spaces'              : False, | ||||
|                    'file-format'            : '{artist} - {track_name}', | ||||
|                    'youtube-api-key'        : None, | ||||
|                    'log-level'              : 'INFO' } | ||||
|                } | ||||
|  | ||||
| @@ -58,19 +59,20 @@ def override_config(config_file, parser, raw_args=None): | ||||
|     config_file = os.path.realpath(config_file) | ||||
|     config = merge(default_conf['spotify-downloader'], get_config(config_file)) | ||||
|  | ||||
|     parser.set_defaults(manual=config['manual']) | ||||
|     parser.set_defaults(no_metadata=config['no-metadata']) | ||||
|     parser.set_defaults(avconv=config['avconv']) | ||||
|     parser.set_defaults(folder=os.path.relpath(config['folder'], os.getcwd())) | ||||
|     parser.set_defaults(overwrite=config['overwrite']) | ||||
|     parser.set_defaults(input_ext=config['input-ext']) | ||||
|     parser.set_defaults(output_ext=config['output-ext']) | ||||
|     parser.set_defaults(download_only_metadata=config['download-only-metadata']) | ||||
|     parser.set_defaults(dry_run=config['dry-run']) | ||||
|     parser.set_defaults(file_format=config['file-format']) | ||||
|     parser.set_defaults(folder=os.path.relpath(config['folder'], os.getcwd())) | ||||
|     parser.set_defaults(input_ext=config['input-ext']) | ||||
|     parser.set_defaults(log_level=config['log-level']) | ||||
|     parser.set_defaults(manual=config['manual']) | ||||
|     parser.set_defaults(music_videos_only=config['music-videos-only']) | ||||
|     parser.set_defaults(no_metadata=config['no-metadata']) | ||||
|     parser.set_defaults(no_spaces=config['no-spaces']) | ||||
|     parser.set_defaults(output_ext=config['output-ext']) | ||||
|     parser.set_defaults(overwrite=config['overwrite']) | ||||
|     parser.set_defaults(file_format=config['file-format']) | ||||
|     parser.set_defaults(no_spaces=config['youtube-api-key']) | ||||
|     parser.set_defaults(log_level=config['log-level']) | ||||
|  | ||||
|     return parser.parse_args(raw_args) | ||||
|  | ||||
| @@ -151,6 +153,9 @@ def get_arguments(raw_args=None, to_group=True, to_merge=True): | ||||
|         choices=_LOG_LEVELS_STR, | ||||
|         type=str.upper, | ||||
|         help='set log verbosity') | ||||
|     parser.add_argument( | ||||
|         '-yk', '--youtube-api-key', default=config['youtube-api-key'], | ||||
|         help=argparse.SUPPRESS) | ||||
|     parser.add_argument( | ||||
|         '-c', '--config', default=None, | ||||
|         help='Replace with custom config.yml file') | ||||
|   | ||||
| @@ -120,6 +120,19 @@ def videotime_from_seconds(time): | ||||
|     return '{0}:{1:02}:{2:02}'.format((time//60)//60, (time//60) % 60, time % 60) | ||||
|  | ||||
|  | ||||
def get_sec(time_str):
    """ Convert a `[[H:]M:]S` timestamp such as '1:02:03' into seconds.

    The string is split on ':' and read from the right: the last field is
    seconds, the one before it minutes and the one before that hours.
    A fourth field, if present, is ignored.

    Raises ValueError when a field that is read is not an integer
    (this includes the empty string).
    """
    fields = time_str.split(':', 3)
    fields.reverse()
    # zip() stops at the shorter sequence, so missing minute/hour fields
    # contribute nothing and a fourth field is never converted
    multipliers = (1, 60, 3600)
    return sum(int(field) * factor for field, factor in zip(fields, multipliers))
|  | ||||
|  | ||||
| def get_splits(url): | ||||
|     if '/' in url: | ||||
|         if url.endswith('/'): | ||||
|   | ||||
| @@ -1,3 +1,5 @@ | ||||
| from bs4 import BeautifulSoup | ||||
| import urllib | ||||
| import pafy | ||||
|  | ||||
| from core import internals | ||||
| @@ -7,13 +9,21 @@ import os | ||||
| import pprint | ||||
|  | ||||
| log = const.log | ||||
| # Please respect this YouTube token :) | ||||
| pafy.set_api_key('AIzaSyAnItl3udec-Q1d5bkjKJGL-RgrKO_vU90') | ||||
|  | ||||
| # Fix download speed throttle on short duration tracks | ||||
| # Read more on mps-youtube/pafy#199 | ||||
| pafy.g.opener.addheaders.append(('Range', 'bytes=0-')) | ||||
|  | ||||
|  | ||||
def set_api_key():
    """ Hand the YouTube API key over to pafy.

    The key from `const.args.youtube_api_key` is used when it is set;
    otherwise the bundled default key is used.
    """
    # Please respect this YouTube token :)
    default_key = 'AIzaSyAnItl3udec-Q1d5bkjKJGL-RgrKO_vU90'
    pafy.set_api_key(const.args.youtube_api_key or default_key)
|  | ||||
|  | ||||
| def go_pafy(raw_song, meta_tags=None): | ||||
|     """ Parse track from YouTube. """ | ||||
|     if internals.is_youtube(raw_song): | ||||
| @@ -58,13 +68,141 @@ def download_song(file_name, content): | ||||
|         return False | ||||
|  | ||||
|  | ||||
| def generate_youtube_url(raw_song, meta_tags, tries_remaining=5): | ||||
|     """ Search for the song on YouTube and generate a URL to its video. """ | ||||
def generate_search_url(song):
    """ Generate YouTube search URL for the given song.

    The song name is percent-encoded and placed in the `q` parameter of
    a results URL that carries YouTube's "videos only" search filter.
    """
    # Imported here because the module only does `import urllib`, which
    # does not guarantee that the `urllib.parse` submodule is loaded.
    from urllib.parse import quote

    # quote() encodes special characters so they are safe inside a URL
    encoded_song = quote(song)
    # Special YouTube URL filter to search only for videos
    return 'https://www.youtube.com/results?sp=EgIQAQ%253D%253D&q={0}'.format(encoded_song)
|  | ||||
|  | ||||
def is_video(result):
    """ Check whether a scraped YouTube search result is a regular video.

    `result` is one search result element; it must provide `find()`,
    `attrs` and `parent` (presumably a BeautifulSoup tag - confirm
    against the caller). Channels, mixes/playlists and advertisements
    are rejected.

    Returns True for a video and False otherwise.
    """
    # An element without a `class` attribute has no such key in `attrs`,
    # so fall back to an empty list instead of raising KeyError.
    own_classes = result.attrs.get('class', [])
    parent = result.parent
    parent_classes = parent.attrs.get('class', []) if parent is not None else []

    # ensure result is not a channel
    if result.find('channel') is not None:
        return False
    if 'yt-lockup-channel' in parent_classes or 'yt-lockup-channel' in own_classes:
        return False

    # ensure result is not a mix/playlist
    if 'yt-lockup-playlist' in parent_classes:
        return False

    # ensure video result is not an advertisement
    return result.find('googleads') is None
|  | ||||
|  | ||||
def generate_youtube_url(raw_song, meta_tags):
    """ Search for the song on YouTube and generate a URL to its video.

    Uses the YouTube API when `const.args.youtube_api_key` is set and
    scrapes the search results page otherwise. Returns whatever the
    chosen GenerateYouTubeURL method returns.
    """
    url_fetch = GenerateYouTubeURL(raw_song, meta_tags)
    # pick the bound method first, then call it once
    search = url_fetch.api if const.args.youtube_api_key else url_fetch.scrape
    return search()
|  | ||||
|  | ||||
| class GenerateYouTubeURL: | ||||
|     def __init__(self, raw_song, meta_tags): | ||||
|         self.raw_song = raw_song | ||||
|         self.meta_tags = meta_tags | ||||
|  | ||||
|     def _best_match(self, videos): | ||||
|         """ Select the best matching video from a list of videos. """ | ||||
|         if const.args.manual: | ||||
|             log.info(self.raw_song) | ||||
|             log.info('0. Skip downloading this song.\n') | ||||
|             # fetch all video links on first page on YouTube | ||||
|             for i, v in enumerate(videos): | ||||
|                 log.info(u'{0}. {1} {2} {3}'.format(i+1, v['title'], v['videotime'], | ||||
|                       "http://youtube.com/watch?v="+v['link'])) | ||||
|             # let user select the song to download | ||||
|             result = internals.input_link(videos) | ||||
|             if result is None: | ||||
|                 return None | ||||
|         else: | ||||
|             if not self.meta_tags: | ||||
|                 # if the metadata could not be acquired, take the first result | ||||
|                 # from Youtube because the proper song length is unknown | ||||
|                 result = videos[0] | ||||
|                 log.debug('Since no metadata found on Spotify, going with the first result') | ||||
|             else: | ||||
|                 # filter out videos that do not have a similar length to the Spotify song | ||||
|                 duration_tolerance = 10 | ||||
|                 max_duration_tolerance = 20 | ||||
|                 possible_videos_by_duration = list() | ||||
|  | ||||
|                 ''' | ||||
|                 start with a reasonable duration_tolerance, and increment duration_tolerance | ||||
|                 until one of the Youtube results falls within the correct duration or | ||||
|                 the duration_tolerance has reached the max_duration_tolerance | ||||
|                 ''' | ||||
|                 while len(possible_videos_by_duration) == 0: | ||||
|                     possible_videos_by_duration = list(filter(lambda x: abs(x['seconds'] - self.meta_tags['duration']) <= duration_tolerance, videos)) | ||||
|                     duration_tolerance += 1 | ||||
|                     if duration_tolerance > max_duration_tolerance: | ||||
|                         log.error("{0} by {1} was not found.\n".format(self.meta_tags['name'], self.meta_tags['artists'][0]['name'])) | ||||
|                         return None | ||||
|  | ||||
|                 result = possible_videos_by_duration[0] | ||||
|  | ||||
|         if result: | ||||
|             url = "http://youtube.com/watch?v=" + result['link'] | ||||
|         else: | ||||
|             url = None | ||||
|  | ||||
|         return url | ||||
|  | ||||
|     def scrape(self, tries_remaining=5): | ||||
|         """ Search and scrape YouTube to return a list of matching videos. """ | ||||
|  | ||||
|         # prevents an infinite loop but allows for a few retries | ||||
|         if tries_remaining == 0: | ||||
|             log.debug('No tries left. I quit.') | ||||
|             return | ||||
|  | ||||
|         if self.meta_tags is None: | ||||
|             song = self.raw_song | ||||
|             search_url = generate_search_url(song) | ||||
|         else: | ||||
|             song = internals.generate_songname(const.args.file_format, | ||||
|                                                self.meta_tags) | ||||
|             search_url = generate_search_url(song) | ||||
|         log.debug('Opening URL: {0}'.format(search_url)) | ||||
|  | ||||
|         item = urllib.request.urlopen(search_url).read() | ||||
|         items_parse = BeautifulSoup(item, "html.parser") | ||||
|  | ||||
|         videos = [] | ||||
|         for x in items_parse.find_all('div', {'class': 'yt-lockup-dismissable yt-uix-tile'}): | ||||
|  | ||||
|             if not is_video(x): | ||||
|                 continue | ||||
|  | ||||
|             y = x.find('div', class_='yt-lockup-content') | ||||
|             link = y.find('a')['href'][-11:] | ||||
|             title = y.find('a')['title'] | ||||
|  | ||||
|             try: | ||||
|                 videotime = x.find('span', class_="video-time").get_text() | ||||
|             except AttributeError: | ||||
|                 log.debug('Could not find video duration on YouTube, retrying..') | ||||
|                 return generate_youtube_url(self.raw_song, self.meta_tags, tries_remaining - 1) | ||||
|  | ||||
|             youtubedetails = {'link': link, 'title': title, 'videotime': videotime, | ||||
|                               'seconds': internals.get_sec(videotime)} | ||||
|             videos.append(youtubedetails) | ||||
|             if self.meta_tags is None: | ||||
|                 break | ||||
|  | ||||
|         return self._best_match(videos) | ||||
|  | ||||
|  | ||||
|     def api(self): | ||||
|         """ Use YouTube API to search and return a list of matching videos. """ | ||||
|  | ||||
|         query = { 'part'       : 'snippet', | ||||
|                   'maxResults' :  50, | ||||
|                   'type'       : 'video' } | ||||
| @@ -72,12 +210,12 @@ def generate_youtube_url(raw_song, meta_tags, tries_remaining=5): | ||||
|         if const.args.music_videos_only: | ||||
|             query['videoCategoryId'] = '10' | ||||
|  | ||||
|     if not meta_tags: | ||||
|         song = raw_song | ||||
|         if not self.meta_tags: | ||||
|             song = self.raw_song | ||||
|             query['q'] = song | ||||
|         else: | ||||
|         song = '{0} - {1}'.format(meta_tags['artists'][0]['name'], | ||||
|                                   meta_tags['name']) | ||||
|             song = '{0} - {1}'.format(self.meta_tags['artists'][0]['name'], | ||||
|                                       self.meta_tags['name']) | ||||
|             query['q'] = song | ||||
|         log.debug('query: {0}'.format(query)) | ||||
|  | ||||
| @@ -98,52 +236,7 @@ def generate_youtube_url(raw_song, meta_tags, tries_remaining=5): | ||||
|                               'videotime':internals.videotime_from_seconds(duration_s), | ||||
|                               'seconds': duration_s} | ||||
|             videos.append(youtubedetails) | ||||
|         if not meta_tags: | ||||
|             if not self.meta_tags: | ||||
|                 break | ||||
|  | ||||
|     if not videos: | ||||
|         return None | ||||
|  | ||||
|     if const.args.manual: | ||||
|         log.info(song) | ||||
|         log.info('0. Skip downloading this song.\n') | ||||
|         # fetch all video links on first page on YouTube | ||||
|         for i, v in enumerate(videos): | ||||
|             log.info(u'{0}. {1} {2} {3}'.format(i+1, v['title'], v['videotime'], | ||||
|                   "http://youtube.com/watch?v="+v['link'])) | ||||
|         # let user select the song to download | ||||
|         result = internals.input_link(videos) | ||||
|         if not result: | ||||
|             return None | ||||
|     else: | ||||
|         if not meta_tags: | ||||
|             # if the metadata could not be acquired, take the first result | ||||
|             # from Youtube because the proper song length is unknown | ||||
|             result = videos[0] | ||||
|             log.debug('Since no metadata found on Spotify, going with the first result') | ||||
|         else: | ||||
|             # filter out videos that do not have a similar length to the Spotify song | ||||
|             duration_tolerance = 10 | ||||
|             max_duration_tolerance = 20 | ||||
|             possible_videos_by_duration = list() | ||||
|  | ||||
|             ''' | ||||
|             start with a reasonable duration_tolerance, and increment duration_tolerance | ||||
|             until one of the Youtube results falls within the correct duration or | ||||
|             the duration_tolerance has reached the max_duration_tolerance | ||||
|             ''' | ||||
|             while len(possible_videos_by_duration) == 0: | ||||
|                 possible_videos_by_duration = list(filter(lambda x: abs(x['seconds'] - meta_tags['duration']) <= duration_tolerance, videos)) | ||||
|                 duration_tolerance += 1 | ||||
|                 if duration_tolerance > max_duration_tolerance: | ||||
|                     log.error("{0} by {1} was not found.\n".format(meta_tags['name'], meta_tags['artists'][0]['name'])) | ||||
|                     return None | ||||
|  | ||||
|             result = possible_videos_by_duration[0] | ||||
|  | ||||
|     if result: | ||||
|         url = "http://youtube.com/watch?v=" + result['link'] | ||||
|     else: | ||||
|         url = None | ||||
|  | ||||
|     return url | ||||
|         return self._best_match(videos) | ||||
|   | ||||
| @@ -170,6 +170,7 @@ def download_single(raw_song, number=None): | ||||
| if __name__ == '__main__': | ||||
|     const.args = handle.get_arguments() | ||||
|     internals.filter_path(const.args.folder) | ||||
|     youtube_tools.set_api_key() | ||||
|  | ||||
|     const.log = const.logzero.setup_logger(formatter=const.formatter, | ||||
|                                       level=const.args.log_level) | ||||
|   | ||||
| @@ -13,6 +13,22 @@ loader.load_defaults() | ||||
| raw_song = "Tony's Videos VERY SHORT VIDEO 28.10.2016" | ||||
|  | ||||
|  | ||||
class TestYouTubeAPIKeys:
    """ Check which API key set_api_key() hands over to pafy. """

    def test_custom(self):
        # a key supplied through the arguments must be the one pafy gets
        expect_key = 'some_api_key'
        const.args.youtube_api_key = expect_key
        youtube_tools.set_api_key()
        key = youtube_tools.pafy.g.api_key
        assert key == expect_key

    def test_default(self):
        # with no key configured the bundled default key is used
        expect_key = 'AIzaSyAnItl3udec-Q1d5bkjKJGL-RgrKO_vU90'
        const.args.youtube_api_key = None
        youtube_tools.set_api_key()
        key = youtube_tools.pafy.g.api_key
        assert key == expect_key
|  | ||||
|  | ||||
| def test_metadata(): | ||||
|     expect_metadata = None | ||||
|     global metadata | ||||
| @@ -22,10 +38,12 @@ def test_metadata(): | ||||
|  | ||||
| class TestYouTubeURL: | ||||
|     def test_only_music_category(self): | ||||
|         expect_url = 'http://youtube.com/watch?v=5USR1Omo7f0' | ||||
|         # YouTube keeps changing its results | ||||
|         expect_urls = ('http://youtube.com/watch?v=qOOcy2-tmbk', | ||||
|                        'http://youtube.com/watch?v=5USR1Omo7f0') | ||||
|         const.args.music_videos_only = True | ||||
|         url = youtube_tools.generate_youtube_url(raw_song, metadata) | ||||
|         assert url == expect_url | ||||
|         assert url in expect_urls | ||||
|  | ||||
|     def test_all_categories(self): | ||||
|         expect_url = 'http://youtube.com/watch?v=qOOcy2-tmbk' | ||||
| @@ -49,16 +67,21 @@ class TestYouTubeURL: | ||||
|  | ||||
|  | ||||
| class TestYouTubeTitle: | ||||
|     def test_single_download(self): | ||||
|     def test_single_download_with_youtube_api(self): | ||||
|         global content | ||||
|         global title | ||||
|         expect_title = "Tony's Videos VERY SHORT VIDEO 28.10.2016" | ||||
|         key = 'AIzaSyAnItl3udec-Q1d5bkjKJGL-RgrKO_vU90' | ||||
|         const.args.youtube_api_key = key | ||||
|         youtube_tools.set_api_key() | ||||
|         content = youtube_tools.go_pafy(raw_song, metadata) | ||||
|         title = youtube_tools.get_youtube_title(content) | ||||
|         assert title == expect_title | ||||
|  | ||||
|     def test_download_from_list(self): | ||||
|     def test_download_from_list_without_youtube_api(self): | ||||
|         expect_title = "1. Tony's Videos VERY SHORT VIDEO 28.10.2016" | ||||
|         const.args.youtube_api_key = None | ||||
|         youtube_tools.set_api_key() | ||||
|         content = youtube_tools.go_pafy(raw_song, metadata) | ||||
|         title = youtube_tools.get_youtube_title(content, 1) | ||||
|         assert title == expect_title | ||||
|   | ||||
		Reference in New Issue
	
	Block a user