From 7314e17125aa78fd4e6b28e26fda7ec7db7e0147 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ali=20Parlak=C3=A7=C4=B1?=
Date: Mon, 23 Jul 2018 23:16:56 +0300
Subject: [PATCH] Added erome support

---
 script.py         |   9 ++--
 src/downloader.py | 131 +++++++++++++++++++++++++++++++++++++++++++++-
 src/searcher.py   |  37 ++++++++-----
 3 files changed, 158 insertions(+), 19 deletions(-)

diff --git a/script.py b/script.py
index 76b1dcc..14d5b54 100644
--- a/script.py
+++ b/script.py
@@ -13,7 +13,7 @@ import time
 from io import StringIO
 from pathlib import Path, PurePath
 
-from src.downloader import Direct, Gfycat, Imgur, Self
+from src.downloader import Direct, Gfycat, Imgur, Self, Erome
 from src.errors import *
 from src.parser import LinkDesigner
 from src.searcher import getPosts
@@ -322,7 +322,6 @@ class PromptUser:
             GLOBAL.arguments.log = input("\nlog file directory:")
             if Path(GLOBAL.arguments.log ).is_file():
                 break
-
         while True:
             try:
                 GLOBAL.arguments.limit = int(input("\nlimit (0 for none): "))
@@ -447,7 +446,9 @@ def downloadPost(SUBMISSION):
 
     global lastRequestTime
 
-    downloaders = {"imgur":Imgur,"gfycat":Gfycat,"direct":Direct,"self":Self}
+    downloaders = {
+        "imgur":Imgur,"gfycat":Gfycat,"erome":Erome,"direct":Direct,"self":Self
+    }
 
     if SUBMISSION['postType'] in downloaders:
 
@@ -572,8 +573,6 @@ def download(submissions):
     else:
         print(" Total of {} links downloaded!".format(downloadedCount))
 
-    return None
-
 def main():
     GLOBAL.arguments = parseArguments()
 
diff --git a/src/downloader.py b/src/downloader.py
index e7a93c8..88775e1 100644
--- a/src/downloader.py
+++ b/src/downloader.py
@@ -2,6 +2,7 @@ import io
 import os
 import sys
 import urllib.request
+from html.parser import HTMLParser
 from pathlib import Path
 
 import imgurpython
@@ -69,6 +70,134 @@ def getFile(fileDir,tempDir,imageURL,indent=0):
     else:
         raise FileAlreadyExistsError
 
+class Erome:
+    def __init__(self,directory,post):
+        # try:
+        #     IMAGES = self.getLinks(post['postURL'])
+        # except IndexError:
+        #     # raise NotADownloadableLinkError("Could not read the page source")
+        #     pass
+        # except Exception as exception:
+        #     pass
+        #     # raise NotADownloadableLinkError("Could not read the page source")
+        IMAGES = self.getLinks(post['postURL'])
+
+        imagesLenght = len(IMAGES)
+        howManyDownloaded = imagesLenght
+        duplicates = 0
+
+        if imagesLenght == 1:
+
+            extension = getExtension(IMAGES[0])
+
+            title = nameCorrector(post['postTitle'])
+            print(title+"_"+post['postId']+extension)
+
+            fileDir = title + "_" + post['postId'] + extension
+            fileDir = directory / fileDir
+
+            tempDir = title + "_" + post['postId'] + '.tmp'
+            tempDir = directory / tempDir
+
+            imageURL = "https:" + IMAGES[0]
+
+            try:
+                getFile(fileDir,tempDir,imageURL)
+            except FileNameTooLong:
+                fileDir = directory / (post['postId'] + extension)
+                tempDir = directory / (post['postId'] + '.tmp')
+                getFile(fileDir,tempDir,imageURL)
+
+        else:
+            title = nameCorrector(post['postTitle'])
+            print(title+"_"+post['postId'],end="\n\n")
+
+            folderDir = directory / (title+"_"+post['postId'])
+
+            try:
+                if not os.path.exists(folderDir):
+                    os.makedirs(folderDir)
+            except FileNotFoundError:
+                folderDir = directory / post['postId']
+                os.makedirs(folderDir)
+
+            for i in range(imagesLenght):
+
+                extension = getExtension(IMAGES[i])
+
+                fileName = str(i+1)
+                imageURL = "https:" + IMAGES[i]
+
+                fileDir = folderDir / (fileName + extension)
+                tempDir = folderDir / (fileName + ".tmp")
+
+                print("  ({}/{})".format(i+1,imagesLenght))
+                print("  {}".format(fileName+extension))
+
+                try:
+                    getFile(fileDir,tempDir,imageURL,indent=2)
+                    print()
+                except FileAlreadyExistsError:
+                    print("  The file already exists" + " "*10,end="\n\n")
+                    duplicates += 1
+                    howManyDownloaded -= 1
+
+                except Exception as exception:
+                    # raise exception
+                    print("\n  Could not get the file")
+                    print("  " + str(exception) + "\n")
+                    exceptionType = exception
+                    howManyDownloaded -= 1
+
+            if duplicates == imagesLenght:
+                raise FileAlreadyExistsError
+            elif howManyDownloaded + duplicates < imagesLenght:
+                raise AlbumNotDownloadedCompletely(
+                    "Album Not Downloaded Completely"
+                )
+
+    def getLinks(self,url):
+
+        content = []
+        lineNumber = None
+
+        class EromeParser(HTMLParser):
+            tag = None
+            def handle_starttag(self, tag, attrs):
+                self.tag = {tag:{attr[0]: attr[1] for attr in attrs}}
+
+        pageSource = (urllib.request.urlopen(url).read().decode().split('\n'))
+
+        # find where the album starts in order not to pick up unrelated links
+        for i in range(len(pageSource)):
+            obj = EromeParser()
+            obj.feed(pageSource[i])
+            tag = obj.tag
+
+            if tag is not None:
+                if "div" in tag:
+                    if "id" in tag["div"]:
+                        if tag["div"]["id"] == "album":
+                            lineNumber = i
+                            break
+
+        for line in pageSource[lineNumber:]:
+            obj = EromeParser()
+            obj.feed(line)
+            tag = obj.tag
+            if tag is not None:
+                if "img" in tag:
+                    if "class" in tag["img"]:
+                        if tag["img"]["class"]=="img-front":
+                            content.append(tag["img"]["src"])
+                elif "source" in tag:
+                    content.append(tag["source"]["src"])
+
+        return [
+            link for link in content
+            if link.endswith("_480p.mp4") or not link.endswith(".mp4")
+        ]
+
 class Imgur:
     def __init__(self,directory,post):
         self.imgurClient = self.initImgur()
@@ -171,7 +300,7 @@ class Imgur:
 
             if duplicates == imagesLenght:
                 raise FileAlreadyExistsError
-            elif howManyDownloaded < imagesLenght:
+            elif howManyDownloaded + duplicates < imagesLenght:
                 raise AlbumNotDownloadedCompletely(
                     "Album Not Downloaded Completely"
                 )
diff --git a/src/searcher.py b/src/searcher.py
index 1a8ab4e..ea0d0ec 100644
--- a/src/searcher.py
+++ b/src/searcher.py
@@ -299,6 +299,8 @@ def redditSearcher(posts,SINGLE_POST=False):
     gfycatCount = 0
     global imgurCount
     imgurCount = 0
+    global eromeCount
+    eromeCount = 0
     global directCount
     directCount = 0
     global selfCount
@@ -360,8 +362,15 @@ def redditSearcher(posts,SINGLE_POST=False):
     if not len(subList) == 0:
         print(
             "\nTotal of {} submissions found!\n"\
-            "{} GFYCATs, {} IMGURs, {} DIRECTs and {} SELF POSTS\n"
-            .format(len(subList),gfycatCount,imgurCount,directCount,selfCount)
+            "{} GFYCATs, {} IMGURs, {} EROMEs, {} DIRECTs and {} SELF POSTS\n"
+            .format(
+                len(subList),
+                gfycatCount,
+                imgurCount,
+                eromeCount,
+                directCount,
+                selfCount
+            )
         )
         return subList
     else:
@@ -370,6 +379,7 @@ def checkIfMatching(submission):
     global gfycatCount
     global imgurCount
+    global eromeCount
     global directCount
     global selfCount
 
@@ -383,19 +393,20 @@ def checkIfMatching(submission):
     except AttributeError:
         return None
 
-    if ('gfycat' in submission.domain) or \
-       ('imgur' in submission.domain):
+    if 'gfycat' in submission.domain:
+        details['postType'] = 'gfycat'
+        gfycatCount += 1
+        return details
 
-        if 'gfycat' in submission.domain:
-            details['postType'] = 'gfycat'
-            gfycatCount += 1
-            return details
+    elif 'imgur' in submission.domain:
+        details['postType'] = 'imgur'
+        imgurCount += 1
+        return details
 
-        elif 'imgur' in submission.domain:
-            details['postType'] = 'imgur'
-
-            imgurCount += 1
-            return details
+    elif 'erome' in submission.domain:
+        details['postType'] = 'erome'
+        eromeCount += 1
+        return details
 
     elif isDirectLink(submission.url) is not False:
         details['postType'] = 'direct'
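Below the patch, a minimal usage sketch (not part of the diff) of how the new Erome class can be driven on its own, outside the downloaders dispatch in downloadPost. The target directory and the post dictionary are illustrative assumptions; only the key names (postId, postTitle, postURL, postType) and the raised error types are taken from the patch itself.

```python
from pathlib import Path

from src.downloader import Erome
from src.errors import FileAlreadyExistsError, AlbumNotDownloadedCompletely

# Hypothetical post dict; keys mirror those the Erome class reads above.
post = {
    "postId": "abc123",                           # assumed post id
    "postTitle": "Example album",                 # assumed title
    "postURL": "https://www.erome.com/a/abc123",  # assumed album URL
    "postType": "erome",
}

downloadDir = Path("downloads")          # assumed target directory
downloadDir.mkdir(exist_ok=True)

try:
    # Erome(directory, post) fetches a single file or the whole album into
    # the given directory, mirroring the behaviour of the Imgur class.
    Erome(downloadDir, post)
except FileAlreadyExistsError:
    print("Already downloaded.")
except AlbumNotDownloadedCompletely:
    print("Some files in the album could not be downloaded.")
```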