add multithreading and object orientation

sigvartmh
2017-05-12 01:22:39 +02:00
parent e438fd3287
commit 18225b375b

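The "multithreading" in the commit title is implemented with the multiprocessing module: each file download (and each course's folder walk) is handed to its own Process and started immediately. A minimal, self-contained sketch of that fire-and-forget pattern, with a placeholder fetch() worker and example URLs that are not part of the commit:

from multiprocessing import Process
import requests

def fetch(url, filename):
    # Stand-in worker: stream a URL to disk, mirroring download_link's chunked write.
    r = requests.get(url, stream=True)
    with open(filename, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)

if __name__ == "__main__":
    urls = [("https://example.org/a.pdf", "a.pdf"),
            ("https://example.org/b.pdf", "b.pdf")]
    workers = [Process(target=fetch, args=(url, name)) for url, name in urls]
    for w in workers:
        w.start()
    for w in workers:
        w.join()  # the scraper itself fires .start() without joining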

@@ -1,102 +1,97 @@
 import os
+import sys
 import re
 import argparse
 import requests as rq
 import mechanicalsoup as ms
 import html2text
+from multiprocessing import Process
 from bs4 import BeautifulSoup as bs
 from getpass import getpass
-b = ms.StatefulBrowser()
-h = html2text.HTML2Text()
-url = "https://innsida.ntnu.no/lms-ntnu"
-login = input("Enter NTNU-username: ")
-password = getpass("Enter your NTNU-password: ")
-b.open(url)
-b.select_form("form[name=f]")
-b["feidename"]=login
-b["password"]=password
-b.submit_selected()
-b.select_form("form[action=https://sats.itea.ntnu.no/sso-wrapper/feidelogin]")
-b.submit_selected()
-key = b.session.cookies.get_dict()
-cookie={"JSESSIONID": key["JSESSIONID"]}
-print(cookie)
-r = rq.get(url, cookies=cookie)
-print(r.url)
-tc = r.request.headers["Cookie"].split(";")
-sp_tc = [[elm.split("=",1)[0].strip(), elm.split("=",1)[1].strip()] for elm in tc]
-itl_cookies=dict(sp_tc)
-print(itl_cookies)
 base_url = "https://ntnu.itslearning.com/"
+folder_url = "https://ntnu.itslearning.com/Folder/processfolder.aspx?FolderID="
-c = rq.get("https://ntnu.itslearning.com/Course/AllCourses.aspx", cookies=itl_cookies)
-print(c.url)
-p = bs(c.text, "html.parser")
-course = p.find("table",{"class":"h-table-show-first-4-columns"})
-t = course.find_all("a",{"class":"ccl-iconlink"})
-print(course)
-print(t)
-courses = []
-course_title = {}
-for link in t:
-    title = link.contents[0].contents[0]
-    course_title[link.get("href")]=title
-    courses.append(link.get("href"))
-print(courses)
-print(course_title)
-path = os.path.abspath(os.path.curdir)
-newpath = os.path.join(path,"scrape")
-global failure
-failure=0
-if not os.path.exists(newpath):
-    os.makedirs(newpath)
-print(path)
-print(newpath)
-os.chdir(newpath)
 def make_folder(curpath, title):
     folder_path = os.path.join(curpath,title)
-    print("making dir:",folder_path)
+    #print("making dir:",folder_path)
     if not os.path.exists(folder_path):
         os.makedirs(folder_path)
     os.chdir(folder_path)
-def find_folder_table(html):
+class itslearning_scraper():
+    def __init__(self):
+        self.failure=0
+        self.browser = ms.StatefulBrowser()
+        self.html2text = html2text.HTML2Text()
+        self.start_url = "https://innsida.ntnu.no/lms-ntnu"
+        path = os.path.abspath(os.path.curdir)
+        newpath = os.path.join(path,"scraped")
+        if not os.path.exists(newpath):
+            os.makedirs(newpath)
+        os.chdir(newpath)
+    def login(self, username, password):
+        self.browser.open(self.start_url)
+        self.browser.select_form("form[name=f]")
+        self.browser["feidename"]=username
+        self.browser["password"]=password
+        self.browser.submit_selected()
+        self.browser.select_form("form[action=https://sats.itea.ntnu.no/sso-wrapper/feidelogin]")
+        self.browser.submit_selected()
+        self.key = self.browser.session.cookies.get_dict()
+        self.jsession={"JSESSIONID": self.key["JSESSIONID"]}
+        resp = rq.get(self.start_url, cookies=self.jsession)
+        self.get_cookies(resp)
+        self.find_courses()
+    def get_cookies(self, resp):
+        split_cookie = resp.request.headers["Cookie"].split(";")
+        self.cookies = dict([[elm.split("=",1)[0].strip(), elm.split("=",1)[1].strip()] for elm in split_cookie])
+    def enter(self):
+        username = input("Enter NTNU-username: ")
+        password = getpass("Enter your NTNU-password: ")
+        self.login(username,password)
+    def find_courses(self):
+        resp = rq.get("https://ntnu.itslearning.com/Course/AllCourses.aspx", cookies=self.cookies)
+        print(resp.url)
+        three = bs(resp.text, "html.parser")
+        course = three.find("table",{"class":"h-table-show-first-4-columns"})
+        active_courses = course.find_all("a",{"class":"ccl-iconlink"})
+        courses = {}
+        for link in active_courses:
+            courses[link.get("href")]=link.contents[0].contents[0]
+        self.courses = courses
+    def find_folder_table(self,html):
         three = bs(html, "html.parser")
         folders = three.find('table',{"id":"ctl00_ContentPlaceHolder_ProcessFolderGrid_T"})
         return folders
-def download_link(link, title):
+    def download_link(self, link, title):
         print("Trying to download: {}".format(link))
-        r = rq.get(link, cookies=itl_cookies, stream=True)
-        print(r.url)
+        r = rq.get(link, cookies=self.cookies, stream=True)
         try:
             filename = re.search('FileName=(.+?)&',r.url).group(1)
         except:
             filename = title
-            global failure
-            failure += 1
-        print(filename)
+            self.failure += 1
+        print("File created with name:",filename)
         filename = os.path.join(os.path.abspath(os.path.curdir),filename)
         with open(filename, 'wb') as f:
             for chunk in r.iter_content(chunk_size=1024):
                 if chunk:
                     f.write(chunk)
         print("complete")
-import sys
-def find_file(html):
+    def find_file(self, html):
         try:
             three = bs(html, "html.parser")
         except:
@@ -105,42 +100,47 @@ def find_file(html):
             print(html.find_all('a'))
             sys.exit(1)
         links = three.find_all('a')
-        #print(links)
         for link in links:
             if "download.aspx" in link.get("href"):
-                download_link(base_url+link.get("href")[2:], failure)
+                Process(target=self.download_link, args=(base_url+link.get("href")[2:], self.failure)).start()
             elif "DownloadRedirect.ashx" in link.get("href"):
                 title = link.contents[1].contents[0]
-                download_link(link.get("href"), title)
+                Process(target=self.download_link, args=(link.get("href"), title)).start()
-        #print(r.text)
-def find_essay_files(html):
+    def find_essay_files(self, html):
         three = bs(html, "html.parser")
         attached_files=three.find("div", {"id":"EssayDetailedInformation_FileListWrapper_FileList"})
         handin_files=three.find("div", {"id":"DF_FileList"})
         text=three.find("div", {"class":"h-userinput itsl-assignment-description"})
         if text:
             title = three.find("span", {"id":"ctl05_TT"}).contents[0]
-            text = h.handle(str(text))
+            text = self.html2text.handle(str(text))
             fp = os.path.join(os.path.abspath(os.path.curdir), title)
             #convert to md?
             with open(fp+".md", "wb") as note:
                 note.write(bytes(text.encode("utf-8")))
                 note.close()
-    if attached_files:
-        find_file(str(attached_files))
-    if handin_files:
-        find_file(str(handin_files))
-def find_link(html):
+        if attached_files:
+            self.find_file(str(attached_files))
+        if handin_files:
+            self.find_file(str(handin_files))
+    def find_link(self,html):
         three = bs(html, "html.parser")
         section_link = three.find("a", {"id":"ctl00_ctl00_MainFormContent_ResourceContent_Link"})
         if section_link is None:
             link = three.find("a",{"id":"ctl00_ctl00_MainFormContent_ResourceContent_DownloadButton_DownloadLink"})
+            try:
                 print(link.get("download"))
-            download_link(link.get("href"), link.get("download"))
+                Process(target=self.download_link,args=(link.get("href"), link.get("download"))).start()
+            except:
+                print("Broken download link")
+                pass
             return
         print(section_link)
@@ -153,49 +153,49 @@ def find_link(html):
             shortcut.write(bytes(r'URL={}'.format(target).encode("utf-8")))
             shortcut.close()
-def save_note(html):
+    def save_note(self,html):
         three = bs(html, "html.parser")
         title = three.find("h1").contents[0].contents[1]
         print(title)
         text = three.find("div", {"class":"h-userinput"})
         print(text.contents[0])
-        text = h.handle(str(text))
+        text = self.html2text.handle(str(text))
         fp = os.path.join(os.path.abspath(os.path.curdir), title)
         #convert to md?
         with open(fp+".md", "wb") as note:
             note.write(bytes(text.encode("utf-8")))
             note.close()
-def find_files(folders):
+    def find_files(self,folders):
         for link in folders.find_all('a'):
             if "File" in link.get("href"):
-                r = rq.get(base_url+link.get("href"), cookies=itl_cookies)
-                find_file(r.text)
+                r = rq.get(base_url+link.get("href"), cookies=self.cookies)
+                self.find_file(r.text)
             elif "LearningToolElement" in link.get("href"):
-                r = rq.get(base_url+link.get("href"), cookies=itl_cookies)
+                r = rq.get(base_url+link.get("href"), cookies=self.cookies)
                 three = bs(r.text, "html.parser")
                 iframe = three.find('iframe')
                 print(iframe.get("src"))
                 if iframe is not None:
                     url = iframe.get("src")
-                    r = rq.get(url, cookies=itl_cookies)
-                    link = find_link(r.text)
+                    r = rq.get(url, cookies=self.cookies)
+                    link = self.find_link(r.text)
             elif "/note/View_Note" in link.get("href"):
-                r = rq.get(base_url+link.get("href"), cookies=itl_cookies)
+                r = rq.get(base_url+link.get("href"), cookies=self.cookies)
                 print(r.url)
-                save_note(r.text)
+                self.save_note(r.text)
             elif "folder" in link.get("href"):
                 #print(link)
                 itl_path = os.path.join(os.path.abspath(os.path.curdir))
                 title = link.contents[0]
                 make_folder(itl_path, title)
-                r = rq.get(base_url+link.get("href"), cookies=itl_cookies)
-                table = find_folder_table(r.text)
+                r = rq.get(base_url+link.get("href"), cookies=self.cookies)
+                table = self.find_folder_table(r.text)
                 #print(table)
-                find_files(table)
+                self.find_files(table)
                 os.chdir('..')
                 #print(r.url)
@@ -204,37 +204,43 @@ def find_files(folders):
                 itl_path = os.path.join(os.path.abspath(os.path.curdir))
                 title = link.contents[0]
                 make_folder(itl_path, title)
-                r = rq.get(base_url+link.get("href"), cookies=itl_cookies)
-                find_essay_files(r.text)
+                r = rq.get(base_url+link.get("href"), cookies=self.cookies)
+                self.find_essay_files(r.text)
                 os.chdir('..')
+    def download_all(self):
+        p = []
+        for link in self.courses:
+            r = rq.get(base_url+link, cookies=self.cookies)
+            course_path = os.path.join(os.path.abspath(os.path.curdir))
+            make_folder(course_path, self.courses[link])
+            folder_id = re.search("FolderID=(.+?)'",r.text).group(1)
+            print("folder id",folder_id)
+            print("folder_url"+folder_id)
+            r = rq.get(folder_url+folder_id, cookies=self.cookies)
+            print(r.url)
+            table = self.find_folder_table(r.text)
+            Process(target=self.find_files, args=(table,)).start()
+            os.chdir('..')
+    def download_one(self):
-#print(args.session_cookie)
-#key = args.session_cookie
-folder_url = "https://ntnu.itslearning.com/Folder/processfolder.aspx?FolderID="
-course_url = input("Emne link or leave blank to download all:")
-if course_url:
+        course_url = input("Emne link:")
         folder_title=input("folder title:")
-        r = rq.get(course_url, cookies=itl_cookies)
+        r = rq.get(course_url, cookies=self.cookies)
         course_path = os.path.join(os.path.abspath(os.path.curdir))
         make_folder(course_path, folder_title)
         folder_id = re.search("FolderID=(.+?)'",r.text).group(1)
         r = rq.get(folder_url+folder_id, cookies=itl_cookies)
         r = rq.get(folder_url+folder_id, cookies=itl_cookies)
-        table = find_folder_table(r.text)
-        find_files(table)
+        table = self.find_folder_table(r.text)
+        self.find_files(table)
-else:
-    for course in courses:
-        r = rq.get(base_url+course, cookies=itl_cookies)
-        course_path = os.path.join(os.path.abspath(os.path.curdir))
-        make_folder(course_path, course_title[course])
-        folder_id = re.search("FolderID=(.+?)'",r.text).group(1)
-        print("folder id",folder_id)
-        print("folder_url"+folder_id)
-        r = rq.get(folder_url+folder_id, cookies=itl_cookies)
-        print(r.url)
-        table = find_folder_table(r.text)
-        find_files(table)
-        os.chdir('..')
+    def get_courses(self):
+        return self.courses
+#print(args.session_cookie)
+#key = args.session_cookie
+if __name__ == '__main__':
+    scraper = itslearning_scraper()
+    scraper.enter()
+    scraper.download_all()
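For the object-orientation half of the change, the class can also be driven without the interactive enter() prompt by calling login() directly; a hedged sketch of an alternative __main__ block (the credential handling shown here is illustrative and not part of the commit):

if __name__ == '__main__':
    # Illustrative driver: authenticate directly, list the detected courses,
    # then mirror everything. login() also populates scraper.courses.
    username = input("Enter NTNU-username: ")
    password = getpass("Enter your NTNU-password: ")
    scraper = itslearning_scraper()
    scraper.login(username, password)
    for href, title in scraper.get_courses().items():
        print(title, "->", href)
    scraper.download_all()  # one Process per course folder table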