11 Commits

Author         SHA1        Message                                   Date
Ali Parlakci   a6997898ce  Update version                            2018-07-23 23:37:27 +03:00
Ali Parlakci   61632c7143  Improve error handling                    2018-07-23 23:33:11 +03:00
Ali Parlakçı   9bff3399a8  Typo fix                                  2018-07-23 23:19:29 +03:00
Ali Parlakçı   b00d185f67  Update changelog                          2018-07-23 23:18:10 +03:00
Ali Parlakçı   7314e17125  Added erome support                       2018-07-23 23:16:56 +03:00
Ali Parlakci   2d334d56bf  remove exclude mode                       2018-07-23 22:57:54 +03:00
Ali Parlakçı   974517928f  Update README.md                          2018-07-23 22:07:28 +03:00
Ali Parlakçı   bcae177b1e  Split download function                   2018-07-23 22:06:33 +03:00
Ali Parlakci   229def6578  Merge branch 'master' of https://github.com/aliparlakci/bulk-downloader-for-reddit  2018-07-23 18:49:39 +03:00
Ali Parlakci   59b0376d6e  Added directions for frontpage            2018-07-23 18:49:28 +03:00
Ali Parlakçı   cf1dc7d08c  Rename changelog section                  2018-07-22 18:16:11 +03:00
6 changed files with 261 additions and 174 deletions


@@ -52,7 +52,13 @@ It should redirect to a page which shows your **imgur_client_id** and **imgur_cl
- All of the user data is held in the **config.json** file, which is in a folder named "Bulk Downloader for Reddit" in your **Home** directory. You can edit them there.
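
A minimal sketch of how that file can be located and read, assuming only the folder and file names given above; the `imgur_client_id` key is the credential mentioned earlier in this README, and the rest of the schema is not documented here:

```python
import json
from pathlib import Path

# Folder and file name as described in the note above.
config_path = Path.home() / "Bulk Downloader for Reddit" / "config.json"

# Load the stored settings; editing the file by hand works just as well.
with config_path.open(encoding="utf-8") as config_file:
    config = json.load(config_file)

# Assumed key name for illustration -- the README mentions an imgur_client_id credential.
print(config.get("imgur_client_id"))
```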
## Changelog
## Changes on *master*
### [23/07/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/7314e17125aa78fd4e6b28e26fda7ec7db7e0147)
- Split download() function
- Added erome support
- Remove exclude feature
- Bug fix
### [22/07/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/a67da461d2fcd70672effcb20c8179e3224091bb)
- Put log files in a folder named "LOG_FILES"
- Fixed the bug that makes multireddit mode unusable


@@ -40,8 +40,6 @@ optional arguments:
all
--NoDownload Just gets the posts and store them in a file for
downloading later
--exclude {imgur,gfycat,direct,self} [{imgur,gfycat,direct,self} ...]
Do not download specified links
```
# Examples
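
A hedged illustration of the `--NoDownload` option documented above: the flag itself appears in the help text, while `--directory`, `--subreddit`, and `--log` are assumptions inferred from the prompts and `GLOBAL.arguments` fields in the script diff further down, and the log file path is a placeholder.

```
# First pass: only collect the posts into a log file, without downloading anything.
python script.py --directory ./downloads --subreddit pics --NoDownload

# Later pass: feed a previously written log file back in to download the stored posts
# (the script diff below reads GLOBAL.arguments.log and calls postFromLog()).
python script.py --directory ./downloads --log ./downloads/LOG_FILES/POSTS.json
```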

script.py

@@ -13,7 +13,7 @@ import time
from io import StringIO
from pathlib import Path, PurePath
from src.downloader import Direct, Gfycat, Imgur, Self
from src.downloader import Direct, Gfycat, Imgur, Self, Erome
from src.errors import *
from src.parser import LinkDesigner
from src.searcher import getPosts
@@ -22,7 +22,7 @@ from src.tools import (GLOBAL, createLogFile, jsonFile, nameCorrector,
__author__ = "Ali Parlakci"
__license__ = "GPL"
__version__ = "1.3.1"
__version__ = "1.4.0"
__maintainer__ = "Ali Parlakci"
__email__ = "parlakciali@gmail.com"
@@ -144,11 +144,6 @@ def parseArguments(arguments=[]):
action="store_true",
default=False)
parser.add_argument("--exclude",
nargs="+",
help="Do not download specified links",
choices=["imgur","gfycat","direct","self"],
type=str)
if arguments == []:
return parser.parse_args()
@@ -242,10 +237,10 @@ class PromptUser:
if programMode == "subreddit":
subredditInput = input("subreddit: ")
subredditInput = input("subreddit (enter frontpage for frontpage): ")
GLOBAL.arguments.subreddit = subredditInput
while not subredditInput == "":
while not (subredditInput == "" or subredditInput.lower() == "frontpage"):
subredditInput = input("subreddit: ")
GLOBAL.arguments.subreddit += "+" + subredditInput
@@ -253,7 +248,8 @@ class PromptUser:
GLOBAL.arguments.subreddit = "+".join(GLOBAL.arguments.subreddit.split())
# DELETE THE PLUS (+) AT THE END
GLOBAL.arguments.subreddit = GLOBAL.arguments.subreddit[:-1]
if not subredditInput.lower() == "frontpage":
GLOBAL.arguments.subreddit = GLOBAL.arguments.subreddit[:-1]
print("\nselect sort type:")
sortTypes = [
@@ -326,33 +322,6 @@ class PromptUser:
GLOBAL.arguments.log = input("\nlog file directory:")
if Path(GLOBAL.arguments.log ).is_file():
break
GLOBAL.arguments.exclude = []
sites = ["imgur","gfycat","direct","self"]
excludeInput = input("exclude: ").lower()
if excludeInput in sites and excludeInput != "":
GLOBAL.arguments.exclude = [excludeInput]
while not excludeInput == "":
while True:
excludeInput = input("exclude: ").lower()
if not excludeInput in sites or excludeInput in GLOBAL.arguments.exclude:
break
elif excludeInput == "":
break
else:
GLOBAL.arguments.exclude.append(excludeInput)
for i in range(len(GLOBAL.arguments.exclude)):
if " " in GLOBAL.arguments.exclude[i]:
inputWithWhitespace = GLOBAL.arguments.exclude[i]
del GLOBAL.arguments.exclude[i]
for siteInput in inputWithWhitespace.split():
if siteInput in sites and siteInput not in GLOBAL.arguments.exclude:
GLOBAL.arguments.exclude.append(siteInput)
while True:
try:
GLOBAL.arguments.limit = int(input("\nlimit (0 for none): "))
@@ -472,20 +441,76 @@ def postExists(POST):
else:
return False
def downloadPost(SUBMISSION):
directory = GLOBAL.directory / SUBMISSION['postSubreddit']
global lastRequestTime
downloaders = {
"imgur":Imgur,"gfycat":Gfycat,"erome":Erome,"direct":Direct,"self":Self
}
if SUBMISSION['postType'] in downloaders:
print(SUBMISSION['postType'].upper())
if SUBMISSION['postType'] == "imgur":
if int(time.time() - lastRequestTime) <= 2:
pass
credit = Imgur.get_credits()
IMGUR_RESET_TIME = credit['UserReset']-time.time()
USER_RESET = ("after " \
+ str(int(IMGUR_RESET_TIME/60)) \
+ " Minutes " \
+ str(int(IMGUR_RESET_TIME%60)) \
+ " Seconds")
print(
"Client: {} - User: {} - Reset {}".format(
credit['ClientRemaining'],
credit['UserRemaining'],
USER_RESET
)
)
if not (credit['UserRemaining'] == 0 or \
credit['ClientRemaining'] == 0):
"""This block of code is needed
"""
if int(time.time() - lastRequestTime) <= 2:
pass
lastRequestTime = time.time()
else:
if credit['UserRemaining'] == 0:
KEYWORD = "user"
elif credit['ClientRemaining'] == 0:
KEYWORD = "client"
raise ImgurLimitError('{} LIMIT EXCEEDED\n'.format(KEYWORD.upper()))
downloaders[SUBMISSION['postType']] (directory,SUBMISSION)
else:
raise NoSuitablePost
return None
def download(submissions):
"""Analyze list of submissions and call the right function
to download each one, catch errors, update the log files
"""
subsLenght = len(submissions)
global lastRequestTime
lastRequestTime = 0
downloadedCount = subsLenght
duplicates = 0
BACKUP = {}
if GLOBAL.arguments.exclude is not None:
ToBeDownloaded = GLOBAL.arguments.exclude
else:
ToBeDownloaded = []
FAILED_FILE = createLogFile("FAILED")
@@ -499,131 +524,45 @@ def download(submissions):
)
if postExists(submissions[i]):
result = False
print(submissions[i]['postType'].upper())
print("It already exists")
duplicates += 1
downloadedCount -= 1
continue
directory = GLOBAL.directory / submissions[i]['postSubreddit']
if submissions[i]['postType'] == 'imgur' and not 'imgur' in ToBeDownloaded:
print("IMGUR",end="")
while int(time.time() - lastRequestTime) <= 2:
pass
credit = Imgur.get_credits()
IMGUR_RESET_TIME = credit['UserReset']-time.time()
USER_RESET = ("after " \
+ str(int(IMGUR_RESET_TIME/60)) \
+ " Minutes " \
+ str(int(IMGUR_RESET_TIME%60)) \
+ " Seconds")
print(
" => Client: {} - User: {} - Reset {}".format(
credit['ClientRemaining'],
credit['UserRemaining'],
USER_RESET
)
)
if not (credit['UserRemaining'] == 0 or \
credit['ClientRemaining'] == 0):
"""This block of code is needed
"""
while int(time.time() - lastRequestTime) <= 2:
pass
lastRequestTime = time.time()
try:
Imgur(directory,submissions[i])
except FileAlreadyExistsError:
print("It already exists")
duplicates += 1
downloadedCount -= 1
except ImgurLoginError:
print(
"Imgur login failed. Quitting the program "\
"as unexpected errors might occur."
)
sys.exit()
except Exception as exception:
print(exception)
FAILED_FILE.add({int(i+1):[str(exception),submissions[i]]})
downloadedCount -= 1
else:
if credit['UserRemaining'] == 0:
KEYWORD = "user"
elif credit['ClientRemaining'] == 0:
KEYWORD = "client"
print('{} LIMIT EXCEEDED\n'.format(KEYWORD.upper()))
FAILED_FILE.add(
{int(i+1):['{} LIMIT EXCEEDED\n'.format(KEYWORD.upper()),
submissions[i]]}
)
downloadedCount -= 1
elif submissions[i]['postType'] == 'gfycat' and not 'gfycat' in ToBeDownloaded:
print("GFYCAT")
try:
Gfycat(directory,submissions[i])
except FileAlreadyExistsError:
print("It already exists")
duplicates += 1
downloadedCount -= 1
except NotADownloadableLinkError as exception:
print(exception)
FAILED_FILE.add({int(i+1):[str(exception),submissions[i]]})
downloadedCount -= 1
except Exception as exception:
print(exception)
FAILED_FILE.add({int(i+1):[str(exception),submissions[i]]})
downloadedCount -= 1
elif submissions[i]['postType'] == 'direct' and not 'direct' in ToBeDownloaded:
print("DIRECT")
try:
Direct(directory,submissions[i])
except FileAlreadyExistsError:
print("It already exists")
downloadedCount -= 1
duplicates += 1
except Exception as exception:
print(exception)
FAILED_FILE.add({int(i+1):[str(exception),submissions[i]]})
downloadedCount -= 1
try:
downloadPost(submissions[i])
elif submissions[i]['postType'] == 'self' and not 'self' in ToBeDownloaded:
print("SELF")
try:
Self(directory,submissions[i])
except FileAlreadyExistsError:
print("It already exists")
duplicates += 1
downloadedCount -= 1
except FileAlreadyExistsError:
print("It already exists")
downloadedCount -= 1
duplicates += 1
except ImgurLoginError:
print(
"Imgur login failed. \nQuitting the program "\
"as unexpected errors might occur."
)
sys.exit()
except Exception as exception:
print(exception)
FAILED_FILE.add({int(i+1):[str(exception),submissions[i]]})
downloadedCount -= 1
except ImgurLimitError as exception:
FAILED_FILE.add({int(i+1):[str(exception),submissions[i]]})
downloadedCount -= 1
else:
except NotADownloadableLinkError as exception:
print(exception)
FAILED_FILE.add({int(i+1):[str(exception),submissions[i]]})
downloadedCount -= 1
except NoSuitablePost:
print("No match found, skipping...")
downloadedCount -= 1
except Exception as exception:
# raise exception
print(exception)
FAILED_FILE.add({int(i+1):[str(exception),submissions[i]]})
downloadedCount -= 1
if duplicates:
print("\n There was {} duplicates".format(duplicates))
@@ -660,7 +599,7 @@ def main():
logDir = Path(GLOBAL.arguments.log)
download(postFromLog(logDir))
sys.exit()
try:
POSTS = getPosts(prepareAttributes())
except InsufficientPermission:
@@ -705,10 +644,12 @@ if __name__ == "__main__":
print = printToFile
GLOBAL.RUN_TIME = time.time()
main()
except KeyboardInterrupt:
if GLOBAL.directory is None:
GLOBAL.directory = Path(".\\")
print("\nQUITTING...")
except Exception as exception:
if GLOBAL.directory is None:
GLOBAL.directory = Path(".\\")
@@ -716,4 +657,4 @@ if __name__ == "__main__":
exc_info=full_exc_info(sys.exc_info()))
print(log_stream.getvalue())
input("Press enter to quit\n")
input("\nPress enter to quit\n")


@@ -2,7 +2,9 @@ import io
import os
import sys
import urllib.request
from html.parser import HTMLParser
from pathlib import Path
from urllib.error import HTTPError
import imgurpython
from multiprocessing import Queue
@@ -69,6 +71,129 @@ def getFile(fileDir,tempDir,imageURL,indent=0):
else:
raise FileAlreadyExistsError
class Erome:
def __init__(self,directory,post):
try:
IMAGES = self.getLinks(post['postURL'])
except urllib.error.HTTPError:
raise NotADownloadableLinkError("Not a downloadable link")
imagesLenght = len(IMAGES)
howManyDownloaded = imagesLenght
duplicates = 0
if imagesLenght == 1:
extension = getExtension(IMAGES[0])
title = nameCorrector(post['postTitle'])
print(title+"_" +post['postId']+extension)
fileDir = title + "_" + post['postId'] + extension
fileDir = directory / fileDir
tempDir = title + "_" + post['postId'] + '.tmp'
tempDir = directory / tempDir
imageURL = "https:" + IMAGES[0]
try:
getFile(fileDir,tempDir,imageURL)
except FileNameTooLong:
fileDir = directory / (post['postId'] + extension)
tempDir = directory / (post['postId'] + '.tmp')
getFile(fileDir,tempDir,imageURL)
else:
title = nameCorrector(post['postTitle'])
print(title+"_"+post['postId'],end="\n\n")
folderDir = directory / (title+"_"+post['postId'])
try:
if not os.path.exists(folderDir):
os.makedirs(folderDir)
except FileNotFoundError:
folderDir = directory / post['postId']
os.makedirs(folderDir)
for i in range(imagesLenght):
extension = getExtension(IMAGES[i])
fileName = str(i+1)
imageURL = "https:" + IMAGES[i]
fileDir = folderDir / (fileName + extension)
tempDir = folderDir / (fileName + ".tmp")
print(" ({}/{})".format(i+1,imagesLenght))
print(" {}".format(fileName+extension))
try:
getFile(fileDir,tempDir,imageURL,indent=2)
print()
except FileAlreadyExistsError:
print(" The file already exists" + " "*10,end="\n\n")
duplicates += 1
howManyDownloaded -= 1
except Exception as exception:
raise exception
print("\n Could not get the file")
print(" " + str(exception) + "\n")
exceptionType = exception
howManyDownloaded -= 1
if duplicates == imagesLenght:
raise FileAlreadyExistsError
elif howManyDownloaded + duplicates < imagesLenght:
raise AlbumNotDownloadedCompletely(
"Album Not Downloaded Completely"
)
def getLinks(self,url,lineNumber=129):
content = []
lineNumber = None
class EromeParser(HTMLParser):
tag = None
def handle_starttag(self, tag, attrs):
self.tag = {tag:{attr[0]: attr[1] for attr in attrs}}
pageSource = (urllib.request.urlopen(url).read().decode().split('\n'))
""" FIND WHERE ALBUM STARTS IN ORDER NOT TO GET WRONG LINKS"""
for i in range(len(pageSource)):
obj = EromeParser()
obj.feed(pageSource[i])
tag = obj.tag
if tag is not None:
if "div" in tag:
if "id" in tag["div"]:
if tag["div"]["id"] == "album":
lineNumber = i
break
for line in pageSource[lineNumber:]:
obj = EromeParser()
obj.feed(line)
tag = obj.tag
if tag is not None:
if "img" in tag:
if "class" in tag["img"]:
if tag["img"]["class"]=="img-front":
content.append(tag["img"]["src"])
elif "source" in tag:
content.append(tag["source"]["src"])
return [
link for link in content \
if link.endswith("_480p.mp4") or not link.endswith(".mp4")
]
class Imgur:
def __init__(self,directory,post):
self.imgurClient = self.initImgur()
@@ -171,7 +296,7 @@ class Imgur:
if duplicates == imagesLenght:
raise FileAlreadyExistsError
elif howManyDownloaded < imagesLenght:
elif howManyDownloaded + duplicates < imagesLenght:
raise AlbumNotDownloadedCompletely(
"Album Not Downloaded Completely"
)
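
To show how the new class slots into the existing downloader convention, a minimal usage sketch: the constructor signature and the post-dictionary keys mirror the diff above, while the concrete values are made-up placeholders.

```python
from pathlib import Path

from src.downloader import Erome  # same import path the script.py diff adds

# Keys mirror what the Erome class above reads (postURL, postTitle, postId);
# postSubreddit is used by downloadPost() to pick the target folder.
# All values here are placeholders for illustration only.
post = {
    "postSubreddit": "example",
    "postTitle": "An example album",
    "postId": "abc123",
    "postURL": "https://www.erome.com/a/placeholder",  # placeholder URL
    "postType": "erome",
}

directory = Path("downloads") / post["postSubreddit"]

# Single images are saved as <title>_<id>.<ext>, albums go into a <title>_<id>/ folder;
# on failure the class raises the exceptions used above (FileAlreadyExistsError,
# NotADownloadableLinkError, AlbumNotDownloadedCompletely).
Erome(directory, post)
```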


@@ -80,4 +80,10 @@ class InvalidSortingType(Exception):
pass
class FileNotFoundError(Exception):
pass
class NoSuitablePost(Exception):
pass
class ImgurLimitError(Exception):
pass


@@ -299,6 +299,8 @@ def redditSearcher(posts,SINGLE_POST=False):
gfycatCount = 0
global imgurCount
imgurCount = 0
global eromeCount
eromeCount = 0
global directCount
directCount = 0
global selfCount
@@ -360,8 +362,15 @@ def redditSearcher(posts,SINGLE_POST=False):
if not len(subList) == 0:
print(
"\nTotal of {} submissions found!\n"\
"{} GFYCATs, {} IMGURs, {} DIRECTs and {} SELF POSTS\n"
.format(len(subList),gfycatCount,imgurCount,directCount,selfCount)
"{} GFYCATs, {} IMGURs, {} EROMEs, {} DIRECTs and {} SELF POSTS\n"
.format(
len(subList),
gfycatCount,
imgurCount,
eromeCount,
directCount,
selfCount
)
)
return subList
else:
@@ -370,6 +379,7 @@ def redditSearcher(posts,SINGLE_POST=False):
def checkIfMatching(submission):
global gfycatCount
global imgurCount
global eromeCount
global directCount
global selfCount
@@ -383,19 +393,20 @@ def checkIfMatching(submission):
except AttributeError:
return None
if ('gfycat' in submission.domain) or \
('imgur' in submission.domain):
if 'gfycat' in submission.domain:
details['postType'] = 'gfycat'
gfycatCount += 1
return details
if 'gfycat' in submission.domain:
details['postType'] = 'gfycat'
gfycatCount += 1
return details
elif 'imgur' in submission.domain:
details['postType'] = 'imgur'
imgurCount += 1
return details
elif 'imgur' in submission.domain:
details['postType'] = 'imgur'
imgurCount += 1
return details
elif 'erome' in submission.domain:
details['postType'] = 'erome'
eromeCount += 1
return details
elif isDirectLink(submission.url) is not False:
details['postType'] = 'direct'