Added docstrings to all functions. 🎉

2017-12-22 12:52:38 +01:00
parent 6bdf904a40
commit 98c05a380d
4 changed files with 195 additions and 160 deletions

View File

@@ -8,26 +8,41 @@ from urllib.error import URLError
 logger = logging.getLogger('torrentSearch')

-def build_url(ssl, baseUrl, path, args_dict=[]):
+def build_url(ssl, baseUrl, path, args_dict={}):
+    """
+    Given the parameters, joins them together into a complete url
+    :param bool ssl: whether ssl (https) is to be used or not
+    :param str baseUrl: the start of the url (http://thepiratebay.org)
+    :param list path: the rest of the path to the url (['search', 'lucifer', '0'])
+    :param dict args_dict: a dict with the query elements we want to append to the url
+    :return: complete url based on the inputs
+    :rtype: str
+    """
     url_parts = list(parse.urlparse(baseUrl))
     url_parts[0] = 'https' if ssl else 'http'
-    if type(path) is list:
-        url_parts[2] = '/'.join(path)
-    else:
-        url_parts[2] = path
+    url_parts[2] = '/'.join(path)
     url_parts[4] = parse.urlencode(args_dict)
     return parse.urlunparse(url_parts)

-# Converts a input string or list to percent-encoded string,
-# this is for encoding information in a Uniform Resource
-# Identifier (URI) using urllib
 def convert_query_to_percent_encoded_octets(input_query):
+    """
+    Converts a string with spaces to a string separated by '%20'
+    :param str input_query: the query string, or list of words, to encode
+    :return: string with spaces replaced with '%20' if any were found
+    :rtype: str
+    """
     if type(input_query) is list:
         input_query = ' '.join(input_query)
     return parse.quote(input_query)

 def fetch_url(url):
+    """
+    Call and get output for a given url
+    :param str url: the url we want to make a request to
+    :return: a response object with contents and status code of the request
+    :rtype: http.client.HTTPResponse
+    """
     logger.debug('Fetching query: {}'.format(url))
     req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
     try:
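
For reference, a quick standalone check of how the changed build_url behaves (a sketch: the function body is copied from the hunk above, and the urllib import is an assumption since the module's import block is not shown on this page):

    from urllib import parse

    def build_url(ssl, baseUrl, path, args_dict={}):
        # as committed above: scheme chosen by the ssl flag, path segments
        # joined with '/', query string encoded from args_dict
        url_parts = list(parse.urlparse(baseUrl))
        url_parts[0] = 'https' if ssl else 'http'
        url_parts[2] = '/'.join(path)
        url_parts[4] = parse.urlencode(args_dict)
        return parse.urlunparse(url_parts)

    print(build_url(True, 'http://thepiratebay.org', ['search', 'lucifer', '0'],
                    {'page': 1}))
    # https://thepiratebay.org/search/lucifer/0?page=1

Since the string branch was dropped, path must now always be a list of segments; passing a plain string would interleave '/' between its characters.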

View File

@@ -12,89 +12,99 @@ from torrentSearch.utils import humansize, representsInteger
 logger = logging.getLogger('torrentSearch')

 class Jackett(object):
     """docstring for Jackett"""
     def __init__(self, apikey, host, path, limit, ssl):
         super(Jackett, self).__init__()
         self.apikey = apikey
         self.host = host
         self.path = path
         self.page_limit = limit
         self.ssl = ssl

-    # Returns the api key set in the initiator
-    # return [string]
     def get_apikey(self):
         logger.debug('Using api key: {}'.format(self.apikey))
         return self.apikey

-    # Returns the path set in the initiator
-    # return [string]
     def get_path(self):
         return self.path

-    # Returns the page_limit set in the initiator
-    # return [string]
     def get_page_limit(self):
         logger.debug('Current page limit: {} pages'.format(self.page_limit))
         return self.page_limit

-    # Starts the call to getting result from our indexer
-    # query [string]
-    # returns [List of Torrent objects]
     def search(self, query):
+        """
+        Starts the call to get results from our indexer
+        :param jackett.Jackett self: object instance
+        :param str query: query we want to search for
+        :return: list of results we found from scraping jackett output based on query
+        :rtype: list
+        """
         baseUrl = 'http://' + self.host
         path = self.get_path()
         url_args = {
             'apikey': self.get_apikey(),
             'limit': self.get_page_limit(),
             'q': query
         }
         logger.debug('Url arguments for jackett search: {}'.format(url_args))
         url = build_url(self.ssl, baseUrl, path, url_args)
         res = fetch_url(url)
         return self.parse_xml_for_torrents(res.read())

-    # def __init__(self, name, magnet=None, size=None, uploader=None, date=None,
-    #              seed_count=None, leech_count=None, url=None):
     def find_xml_attribute(self, xml_element, attr):
+        """
+        Finds a specific XML attribute given an element name
+        :param jackett.Jackett self: object instance
+        :param xml.etree.ElementTree.Element xml_element: the xml tree we want to search
+        :param str attr: the attribute/element name we want to find in the xml tree
+        :return: the value of the element given the attr/element name
+        :rtype: str
+        """
         value = xml_element.find(attr)
         if (value != None):
             logger.debug('Found attribute: {}'.format(attr))
             return value.text
         else:
             logger.warning('Could not find attribute: {}'.format(attr))
             return ''

     def parse_xml_for_torrents(self, raw_xml):
+        """
+        Parses the raw XML returned by jackett and builds Torrent objects from it
+        :param jackett.Jackett self: object instance
+        :param bytes raw_xml: the xml page returned by querying jackett
+        :return: all the torrents we found in the xml page
+        :rtype: list
+        """
         tree = ET.fromstring(raw_xml)
         channel = tree.find('channel')
         results = []
         for child in channel.findall('item'):
             title = self.find_xml_attribute(child, 'title')
             date = self.find_xml_attribute(child, 'pubDate')
             magnet = self.find_xml_attribute(child, 'link')
             size = self.find_xml_attribute(child, 'size')
             files = self.find_xml_attribute(child, 'files')
             seeders = 0
             peers = 0
             for elm in child.findall('{http://torznab.com/schemas/2015/feed}attr'):
                 if elm.get('name') == 'seeders':
                     seeders = elm.get('value')
                 if elm.get('name') == 'peers':
                     peers = elm.get('value')

             if (size != '' and representsInteger(size)):
                 size = humansize(int(size))

             logger.debug('Found torrent with info: \n\ttitle: {}\n\tmagnet: {}\n\tsize: {}\n\tdate: {}\
                 \n\tseeders: {}\n\tpeers: {}'.format(title, magnet, size, date, seeders, peers))
             torrent = Torrent(title, magnet=magnet, size=size, date=date, seed_count=seeders, leech_count=peers)
             results.append(torrent)
         return results
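
To make the torznab parsing concrete, here is a minimal item run through the same ElementTree calls that parse_xml_for_torrents uses (the XML below is invented for illustration; real jackett output carries more fields):

    import xml.etree.ElementTree as ET

    raw_xml = b"""<rss><channel><item>
      <title>Lucifer.S01E01</title>
      <link>magnet:?xt=urn:btih:abc</link>
      <size>734003200</size>
      <torznab:attr xmlns:torznab="http://torznab.com/schemas/2015/feed"
                    name="seeders" value="42"/>
    </item></channel></rss>"""

    channel = ET.fromstring(raw_xml).find('channel')
    for item in channel.findall('item'):
        # the real code reads these via find_xml_attribute
        print(item.find('title').text)   # Lucifer.S01E01
        # seeders/peers live in namespaced torznab:attr elements
        for elm in item.findall('{http://torznab.com/schemas/2015/feed}attr'):
            if elm.get('name') == 'seeders':
                print(elm.get('value'))  # 42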

View File

@@ -11,98 +11,97 @@ from torrentSearch.torrent import Torrent
 logger = logging.getLogger('torrentSearch')

 class Piratebay(object):
     """docstring for Piratebay"""
     def __init__(self, host, path, limit, ssl):
         super(Piratebay, self).__init__()
         self.host = host
         self.path = path
         self.page_limit = limit
         self.ssl = ssl
         self.page = 0
         self.total_pages = -1

-    # Returns the path set in the initiator
-    # return [string]
     def get_path(self):
         return self.path

-    # Returns the page_limit set in the initiator
-    # return [string]
     def get_page_limit(self):
         return self.page_limit

-    # Starts the call to getting result from our indexer
-    # query [string]
-    # returns [List of Torrent objects]
     def search(self, query):
+        """
+        Starts the call to get results from thepiratebay
+        :param piratebay.Piratebay self: object instance
+        :param str query: query we want to search for
+        :return: list of results we found from scraping thepiratebay based on query
+        :rtype: list
+        """
         search_query = convert_query_to_percent_encoded_octets(query)
         baseUrl = 'http://' + self.host
         path = [self.get_path(), search_query, str(self.page)]
         url = build_url(self.ssl, baseUrl, path)
         res = fetch_url(url)
         return self.parse_raw_page_for_torrents(res.read())

     def removeHeader(self, bs4_element):
         if ('header' in bs4_element['class']):
             return bs4_element.find_next('tr')
         return bs4_element

     def has_magnet(self, href):
         return href and re.compile('magnet').search(href)

     def parse_raw_page_for_torrents(self, content):
         soup = BeautifulSoup(content, 'html.parser')
         content_searchResult = soup.body.find(id='searchResult')
         if content_searchResult is None:
             logging.info('No torrents found for the search criteria.')
             return None
         listElements = content_searchResult.tr

         torrentWrapper = self.removeHeader(listElements)
         torrents_found = []
         for torrentElement in torrentWrapper.find_all_next('td'):
             if torrentElement.find_all("div", class_='detName'):
                 name = torrentElement.find('a', class_='detLink').get_text()
                 url = torrentElement.find('a', class_='detLink')['href']
                 magnet = torrentElement.find(href=self.has_magnet)
                 uploader = torrentElement.find('a', class_='detDesc')

                 if uploader is None:
                     uploader = torrentElement.find('i')
                 uploader = uploader.get_text()

                 info_text = torrentElement.find('font', class_='detDesc').get_text()
                 date = return_re_match(info_text, r"(\d+\-\d+\s\d+)|(Y\-day\s\d{2}\:\d{2})")
                 size = return_re_match(info_text, r"(\d+(\.\d+)?\s[a-zA-Z]+)")
                 byteSize = deHumansize(size)

                 # COULD NOT FIND HREF!
                 if (magnet is None):
                     logger.warning('Could not find magnet for {}'.format(name))
                     continue

                 seed_and_leech = torrentElement.find_all_next(attrs={"align": "right"})
                 seed = seed_and_leech[0].get_text()
                 leech = seed_and_leech[1].get_text()

                 torrent = Torrent(name, magnet['href'], byteSize, uploader, date, seed, leech, url)
                 torrents_found.append(torrent)
             else:
                 logger.warning('Could not find torrent element on thepiratebay webpage.')
                 continue

         logging.info('Found %s torrents for given search criteria.' % len(torrents_found))
         return torrents_found
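
The date and size are pulled out of the detDesc text by the two regular expressions above; a quick check against a typical description line (the sample string approximates thepiratebay's page format, and return_re_match is stood in for by re.search here — both are assumptions):

    import re

    info_text = 'Uploaded 12-21 2017, Size 700.12 MiB, ULed by someUploader'

    # same patterns as in parse_raw_page_for_torrents
    date = re.search(r"(\d+\-\d+\s\d+)|(Y\-day\s\d{2}\:\d{2})", info_text).group()
    size = re.search(r"(\d+(\.\d+)?\s[a-zA-Z]+)", info_text).group()
    print(date)  # 12-21 2017
    print(size)  # 700.12 MiB

The second alternative in the date pattern covers listings uploaded yesterday, which the site renders as e.g. 'Y-day 12:52'.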

View File

@@ -79,7 +79,9 @@ def main():
 def getConfig():
     """
     Read path and get configuration file with site settings
-    Returns config [configparser]
+
+    :return: config settings read from 'config.ini'
+    :rtype: configparser.ConfigParser
     """
     config = configparser.ConfigParser()
     config_dir = os.path.join(BASE_DIR, 'config.ini')
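
A sketch of how the returned ConfigParser is then consumed; the section and key names below are assumptions inferred from the Jackett and Piratebay constructors above, not read from the repository's config.ini:

    import configparser
    import textwrap

    config = configparser.ConfigParser()
    # hypothetical config.ini contents, key names inferred from the constructors
    config.read_string(textwrap.dedent('''
        [piratebay]
        host = thepiratebay.org
        path = search
        limit = 1
        ssl = true
    '''))

    site = config['piratebay']
    print(site['host'], site.getint('limit'), site.getboolean('ssl'))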
@@ -90,7 +92,10 @@ def getConfig():
 def createJSONList(torrents):
     """
     Iterates over all torrent objects in torrents and gets all attributes which are appended to a list
-    Returns: List of torrents with all their info in a JSON format
+
+    :param list torrents: list of Torrent objects to serialize
+    :return: list of torrents with all their info in a JSON format
+    :rtype: str
     """
     jsonList = []
     for torrent in torrents:
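
A sketch of the contract this docstring now records (a stand-in only: the attribute names come from the Torrent(...) calls in the hunks above, and the real loop body continues below the fold):

    import json

    # hypothetical stand-in for one Torrent object's attributes
    torrent_info = {'name': 'Lucifer.S01E01', 'size': '700.12 MiB',
                    'seed_count': '42', 'leech_count': '7'}

    jsonList = []
    jsonList.append(torrent_info)
    print(json.dumps(jsonList))  # a JSON string, matching :rtype: str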
@@ -129,7 +134,13 @@ def chooseCandidate(torrent_list):
 def searchTorrentSite(config, query, site, print_result):
     """
     Selects site based on input and finds torrents for that site based on query
-    Returns json list with results. If print_results is True in args then also prints the output to terminal
+
+    :param configparser.ConfigParser config: configuration with the site settings
+    :param str query: query to search torrents for
+    :param str site: the site we want to index/scrape
+    :param boolean print_result: if the results should be printed to the terminal
+    :return: json list with results
+    :rtype: str
     """
     logger.debug('Searching for query {} at {}'.format(query, site))
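
Putting the documented pieces together, a usage sketch: it assumes these functions are importable from the package's main module and that 'piratebay' is an accepted value for site, both inferred from the classes above rather than shown on this page:

    config = getConfig()
    results = searchTorrentSite(config, 'lucifer', 'piratebay', print_result=True)
    print(results)  # json list of the torrents found, as a str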