# spotify-downloader/core/youtube_tools.py

import os
import pprint
import urllib
import urllib.parse
import urllib.request

import pafy
from bs4 import BeautifulSoup

from core import const
from core import internals

# Module-wide alias for the logger object exposed by core.const
log = const.log

# Fix download speed throttle on short duration tracks by appending a
# 'Range: bytes=0-' header to pafy's shared opener (runs once, at import time).
# Read more on mps-youtube/pafy#199
pafy.g.opener.addheaders.append(('Range', 'bytes=0-'))


def set_api_key():
    """ Hand pafy the YouTube API key to use.

    The key given on the command line (const.args.youtube_api_key) wins;
    when it is not set, the bundled fallback key is used instead.
    """
    # Please respect this YouTube token :)
    fallback_key = 'AIzaSyC6cEeKlxtOPybk9sEe5ksFN5sB-7wzYp0'
    pafy.set_api_key(const.args.youtube_api_key or fallback_key)


def go_pafy(raw_song, meta_tags=None):
    """ Build a pafy object for the given song.

    raw_song is used directly when internals.is_youtube() recognises it;
    otherwise a YouTube URL is looked up via generate_youtube_url().
    Returns None when no URL could be determined.
    """
    # Already a YouTube reference: no search needed
    if internals.is_youtube(raw_song):
        return pafy.new(raw_song)

    track_url = generate_youtube_url(raw_song, meta_tags)
    if not track_url:
        return None

    return pafy.new(track_url)


def get_youtube_title(content, number=None):
    """ Return the video's title, prefixed with '<number>. ' when a truthy number is given. """
    video_title = content.title
    return '{0}. {1}'.format(number, video_title) if number else video_title


def download_song(file_name, content):
    """ Download the audio stream matching file_name's extension.

    Only '.webm' and '.m4a' are supported. The file is saved as file_name
    inside const.args.folder. Returns True when a download was performed,
    False when no suitable audio stream exists.
    """
    extension = os.path.splitext(file_name)[1]
    if extension not in ('.webm', '.m4a'):
        log.debug('No audio streams available for {} type'.format(extension))
        return False

    # pafy expects the type without the leading dot
    link = content.getbestaudio(preftype=extension[1:])
    if not link:
        log.debug('No audio streams available')
        return False

    log.debug('Downloading from URL: ' + link.url)
    filepath = os.path.join(const.args.folder, file_name)
    log.debug('Saving to: ' + filepath)
    link.download(filepath=filepath)
    return True


def generate_search_url(query):
    """ Generate YouTube search URL for the given song.

    query is percent-encoded (spaces, '&', '=', non-ASCII characters, ...)
    and appended to a results URL carrying YouTube's "videos only" filter.
    Returns the URL as a string.
    """
    # urllib.parse.quote() is the documented home of quote(); going through
    # urllib.request only worked when that submodule happened to be loaded
    quoted_query = urllib.parse.quote(query)
    # Special YouTube URL filter to search only for videos
    url = 'https://www.youtube.com/results?sp=EgIQAQ%253D%253D&q={0}'.format(quoted_query)
    return url


def is_video(result):
    """ Tell whether a scraped search result is a plain video.

    Returns False for channels, mixes/playlists and advertisements,
    True otherwise.
    """
    # channels are flagged by a 'channel' element or a channel lockup class
    if result.find('channel') is not None:
        return False

    parent_classes = result.parent.attrs['class']
    if 'yt-lockup-channel' in parent_classes:
        return False
    if 'yt-lockup-channel' in result.attrs['class']:
        return False

    # mixes/playlists carry a playlist lockup class on the parent
    if 'yt-lockup-playlist' in parent_classes:
        return False

    # advertisements contain a 'googleads' element
    return result.find('googleads') is None


def generate_youtube_url(raw_song, meta_tags):
    """ Look up the YouTube URL for a song.

    Uses the YouTube API when const.args.youtube_api_key is set and falls
    back to scraping the search page otherwise.
    """
    fetcher = GenerateYouTubeURL(raw_song, meta_tags)
    search = fetcher.api if const.args.youtube_api_key else fetcher.scrape
    return search()


class GenerateYouTubeURL:
    """ Search YouTube for a song and pick the best matching video.

    raw_song is the user-supplied song string and meta_tags the song's
    metadata dict (or None when unavailable). The search query is raw_song
    when meta_tags is None; otherwise it is built from
    const.args.search_format via internals.format_string().
    """

    def __init__(self, raw_song, meta_tags):
        self.raw_song = raw_song
        self.meta_tags = meta_tags

        if meta_tags is None:
            self.search_query = raw_song
        else:
            self.search_query = internals.format_string(const.args.search_format,
                                                        meta_tags, force_spaces=True)

    @staticmethod
    def _match_by_duration(videos, duration, tolerance=10, max_tolerance=20):
        """ Return the first video whose length is close enough to duration.

        The allowed difference (in seconds) starts at tolerance and is widened
        one second at a time up to and including max_tolerance. At each step
        the first video (in list order) within the allowed difference wins.
        Returns None when no video qualifies.
        """
        for allowed in range(tolerance, max_tolerance + 1):
            for video in videos:
                if abs(video['seconds'] - duration) <= allowed:
                    return video
        return None

    def _best_match(self, videos):
        """ Select the best matching video from a list of videos.

        videos is a list of dicts with 'link', 'title', 'videotime' and
        'seconds' keys. Returns the watch URL of the chosen video, or None
        when the user skips the song or nothing suitable is found.
        """
        if const.args.manual:
            log.info(self.raw_song)
            log.info('0. Skip downloading this song.\n')
            # fetch all video links on first page on YouTube
            for i, v in enumerate(videos):
                log.info(u'{0}. {1} {2} {3}'.format(
                    i+1, v['title'], v['videotime'],
                    "http://youtube.com/watch?v="+v['link']))
            # let user select the song to download
            result = internals.input_link(videos)
        elif not self.meta_tags:
            if not videos:
                # nothing to choose from; avoid IndexError on videos[0]
                log.debug('No videos found on YouTube for: {0}'.format(self.raw_song))
                return None
            # if the metadata could not be acquired, take the first result
            # from Youtube because the proper song length is unknown
            result = videos[0]
            log.debug('Since no metadata found on Spotify, going with the first result')
        else:
            # keep only videos that have a similar length to the Spotify song
            result = self._match_by_duration(videos, self.meta_tags['duration'])
            if result is None:
                log.error("{0} by {1} was not found.\n".format(
                    self.meta_tags['name'], self.meta_tags['artists'][0]['name']))
                return None

        if not result:
            return None

        return "http://youtube.com/watch?v={0}".format(result['link'])

    def scrape(self, bestmatch=True, tries_remaining=5):
        """ Search and scrape YouTube to return a list of matching videos.

        With bestmatch=True (default) the URL chosen by _best_match() is
        returned instead of the list. When a result has no duration the whole
        search is retried; None is returned once tries_remaining is used up.
        """

        # prevents an infinite loop but allows for a few retries
        if tries_remaining <= 0:
            log.debug('No tries left. I quit.')
            return None

        search_url = generate_search_url(self.search_query)
        log.debug('Opening URL: {0}'.format(search_url))

        # close the HTTP response as soon as the page has been read
        with urllib.request.urlopen(search_url) as response:
            item = response.read()
        items_parse = BeautifulSoup(item, "html.parser")

        videos = []
        for x in items_parse.find_all('div', {'class': 'yt-lockup-dismissable yt-uix-tile'}):

            if not is_video(x):
                continue

            anchor = x.find('div', class_='yt-lockup-content').find('a')
            # the last 11 characters of the href are taken as the video id
            link = anchor['href'][-11:]
            title = anchor['title']

            try:
                videotime = x.find('span', class_="video-time").get_text()
            except AttributeError:
                log.debug('Could not find video duration on YouTube, retrying..')
                return self.scrape(bestmatch=bestmatch, tries_remaining=tries_remaining-1)

            youtubedetails = {'link': link, 'title': title, 'videotime': videotime,
                              'seconds': internals.get_sec(videotime)}
            videos.append(youtubedetails)

        if bestmatch:
            return self._best_match(videos)

        return videos

    def api(self, bestmatch=True):
        """ Use YouTube API to search and return a list of matching videos.

        With bestmatch=True (default) the URL chosen by _best_match() is
        returned instead of the list.
        """

        query = {'part': 'snippet',
                 'maxResults': 50,
                 'type': 'video'}

        if const.args.music_videos_only:
            query['videoCategoryId'] = '10'

        if not self.meta_tags:
            query['q'] = self.raw_song
        else:
            query['q'] = self.search_query
        log.debug('query: {0}'.format(query))

        data = pafy.call_gdata('search', query)
        # keep only search hits that actually carry a video id
        video_ids = [item['id']['videoId'] for item in data['items']
                     if item['id'].get('videoId') is not None]

        videos = []
        # with no hits there is nothing to look up, so skip the second request
        if video_ids:
            query_results = {'part': 'contentDetails,snippet,statistics',
                             'maxResults': 50,
                             'id': ','.join(video_ids)}
            log.debug('query_results: {0}'.format(query_results))

            vdata = pafy.call_gdata('videos', query_results)

            for x in vdata['items']:
                # NOTE(review): parseISO8591 presumably yields the duration in
                # seconds (it is stored under 'seconds') - confirm against pafy
                duration_s = pafy.playlist.parseISO8591(x['contentDetails']['duration'])
                youtubedetails = {'link': x['id'], 'title': x['snippet']['title'],
                                  'videotime': internals.videotime_from_seconds(duration_s),
                                  'seconds': duration_s}
                videos.append(youtubedetails)

        if bestmatch:
            return self._best_match(videos)

        return videos