From a047b6c3d1b92ab7cc259d4765fa6f50c242464b Mon Sep 17 00:00:00 2001
From: sigvartmh
Date: Mon, 8 May 2017 23:21:16 +0200
Subject: [PATCH] Initial commit of code

---
 itl_scrape.py    | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   3 +
 2 files changed, 192 insertions(+)
 create mode 100644 itl_scrape.py
 create mode 100644 requirements.txt

diff --git a/itl_scrape.py b/itl_scrape.py
new file mode 100644
index 0000000..eb5c6c3
--- /dev/null
+++ b/itl_scrape.py
@@ -0,0 +1,189 @@
+import os
+import re
+import sys
+import requests as rq
+import html2text
+from bs4 import BeautifulSoup as bs
+
+# Bootstrap authentication: a valid JSESSIONID from an innsida.ntnu.no
+# session is exchanged for the itslearning session cookies via the LMS link.
+key = input("Enter sessionID:")
+print(key)
+cookie = {"JSESSIONID": key}
+print(cookie)
+login = "https://innsida.ntnu.no/lms-ntnu"
+
+r = rq.get(login, cookies=cookie)
+print(r.url)
+
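+# NOTE: a less manual alternative (a sketch, untested against the real
+# login flow) would be to let a requests.Session carry the cookies itself:
+#
+#     session = rq.Session()
+#     session.cookies.set("JSESSIONID", key)
+#     session.get(login)  # the session jar now holds the itslearning cookies
+#
+# The explicit Cookie-header parsing below is kept from the original.
+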
"wb") as shortcut: + shortcut.write(b'[InternetShortcut]\n') + shortcut.write(bytes(r'URL={}'.format(target).encode("utf-8"))) + shortcut.close() +import html2text +def save_note(html): + three = bs(html, "html.parser") + title = three.find("h1").contents[0].contents[1] + print(title) + text = three.find("div", {"class":"h-userinput"}) + print(text.contents[0]) + h = html2text.HTML2Text() + text = h.handle(str(text)) + fp = os.path.join(os.path.abspath(os.path.curdir), title) + #convert to md? + with open(fp+".md", "wb") as note: + note.write(bytes(text.encode("utf-8"))) + note.close() + +def find_files(folders): + for link in folders.find_all('a'): + if "File" in link.get("href"): + r = rq.get(base_url+link.get("href"), cookies=itl_cookies) + find_file(r.text) + + elif "LearningToolElement" in link.get("href"): + r = rq.get(base_url+link.get("href"), cookies=itl_cookies) + three = bs(r.text, "html.parser") + iframe = three.find('iframe') + print(iframe.get("src")) + if iframe is not None: + url = iframe.get("src") + r = rq.get(url, cookies=itl_cookies) + link = find_link(r.text) + + elif "/note/View_Note" in link.get("href"): + r = rq.get(base_url+link.get("href"), cookies=itl_cookies) + print(r.url) + save_note(r.text) + + elif "folder" in link.get("href"): + #print(link) + itl_path = os.path.join(os.path.abspath(os.path.curdir)) + title = link.contents[0] + make_folder(itl_path, title) + r = rq.get(base_url+link.get("href"), cookies=itl_cookies) + table = find_folder_table(r.text) + #print(table) + find_files(table) + os.chdir('..') + #print(r.url) + + elif "read_essay" in link.get("href"): + print("read_essay:",link.get("href")) + itl_path = os.path.join(os.path.abspath(os.path.curdir)) + title = link.contents[0] + make_folder(itl_path, title) + r = rq.get(base_url+link.get("href"), cookies=itl_cookies) + find_essay_files(r.text) + os.chdir('..') + + + +#print(args.session_cookie) +#key = args.session_cookie + +r = rq.get(url, cookies=itl_cookies) +print(r.url) +table = find_folder_table(r.text) +find_files(table) + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..31ed7ef --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +beautifulsoup4==4.6.0 +html2text==2016.9.19 +requests==2.13.0