Since The Pirate Bay reports file sizes in human-readable form, it is hard to sort results by file size. We now have a deHumansize function in our utils that converts these human-readable sizes to a byte count.

2017-11-19 00:07:40 +01:00
parent 8ec95cf216
commit cc672efed2
2 changed files with 45 additions and 3 deletions
@@ -4,7 +4,7 @@ import re, logging
 from bs4 import BeautifulSoup

 from http_utils import convert_query_to_percent_encoded_octets, build_url, fetch_url
-from utils import return_re_match
+from utils import return_re_match, deHumansize
 from torrent import Torrent

 class Piratebay(object):
@@ -85,6 +85,7 @@ class Piratebay(object):
 				
 				date = return_re_match(info_text, r"(\d+\-\d+\s\d+)|(Y\-day\s\d{2}\:\d{2})")
 				size = return_re_match(info_text, r"(\d+(\.\d+)?\s[a-zA-Z]+)")
+				byteSize = deHumansize(size)

 				# COULD NOT FIND HREF!
 				if (magnet is None):
@@ -94,7 +95,7 @@ class Piratebay(object):
 				seed = seed_and_leech[0].get_text()
 				leech = seed_and_leech[1].get_text()

-				torrent = Torrent(name, magnet['href'], size, uploader, date, seed, leech, url)
+				torrent = Torrent(name, magnet['href'], byteSize, uploader, date, seed, leech, url)

 				torrents_found.append(torrent)
 			else:
@@ -3,11 +3,20 @@
 # @Author: KevinMidboe
 # @Date:   2017-11-01 15:57:23
 # @Last Modified by:   KevinMidboe
-# @Last Modified time: 2017-11-02 16:20:29
+# @Last Modified time: 2017-11-19 00:05:10

 import re
 from datetime import datetime

+# Unit symbols understood by deHumansize, grouped by naming scheme.
+# Within each tuple the position is the power of 1024 the symbol stands
+# for: index 0 is plain bytes, index 1 is 1024 bytes, index 2 is 1024**2,
+# and so on.
+SYMBOLS = {
+	'customary'     : ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'),
+	'customary_ext' : ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
+					   'zetta', 'yotta'),
+	'iec'           : ('BiB', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'),
+	'iec_ext'       : ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
+					   'zebi', 'yobi'),
+}
+
 def sanitize(string, ignore_characters=None, replace_characters=None):
 	"""Sanitize a string to strip special characters.

@@ -49,6 +58,38 @@ def pagesToCount(multiple, total):
 		return total
 	return multiple

+def deHumansize(s):
+	"""
+	Convert a human-readable file size such as "1.5 GiB" into bytes.
+
+	The leading run of digits and dots is parsed as the number; whatever
+	follows (surrounding whitespace stripped) is looked up in the SYMBOLS
+	sets to find the multiplier. Each step up a symbol set is a factor
+	of 1024. A lone lowercase 'k' is accepted as an alias for 'K'.
+
+	:param str s: human file size that we want to convert
+	:return: the size in bytes, truncated to an integer
+	:rtype: int
+	:raises ValueError: if s is not string-like, has no parsable leading
+		number, or its unit is not found in SYMBOLS
+	"""
+
+	try:
+		text = s.strip()
+	except AttributeError:
+		# None or another non-string value: report it the same way as any
+		# other unrecognized input instead of leaking a TypeError.
+		raise ValueError("can't interpret %r" % (s,))
+
+	# Find where the numeric part (digits and dots) ends.
+	end = 0
+	while end < len(text) and (text[end].isdigit() or text[end] == '.'):
+		end += 1
+
+	try:
+		num = float(text[:end])
+	except ValueError:
+		# Empty or malformed number, e.g. "GiB" or "1.2.3 MiB".
+		raise ValueError("can't interpret %r" % (s,))
+
+	letter = text[end:].strip()
+	for sset in SYMBOLS.values():
+		if letter in sset:
+			break
+	else:
+		if letter == 'k':
+			# treat 'k' as an alias for 'K' as per: http://goo.gl/kTQMs
+			sset = SYMBOLS['customary']
+			letter = letter.upper()
+		else:
+			raise ValueError("can't interpret %r" % (s,))
+
+	# The position inside the symbol set is the power of 1024.
+	return int(num * (1 << 10 * sset.index(letter)))

 def humansize(nbytes):
 	suffixes = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']