Files
ITLscraper/itl_scrape.py
2017-05-11 23:03:14 +02:00

241 lines
8.0 KiB
Python

import os
import re
import argparse
import requests as rq
import mechanicalsoup as ms
import html2text
from bs4 import BeautifulSoup as bs
from getpass import getpass
# --- Log in through the NTNU portal and collect the cookies used for itslearning ---
b = ms.StatefulBrowser()
h = html2text.HTML2Text()
url = "https://innsida.ntnu.no/lms-ntnu"
login = input("Enter NTNU-username: ")
password = getpass("Enter your NTNU-password: ")

# Fill in and submit the credential form, then submit the follow-up SSO form.
b.open(url)
b.select_form("form[name=f]")
b["feidename"] = login
b["password"] = password
b.submit_selected()
b.select_form("form[action=https://sats.itea.ntnu.no/sso-wrapper/feidelogin]")
b.submit_selected()

key = b.session.cookies.get_dict()
if "JSESSIONID" not in key:
    # Fail with a readable message instead of a bare KeyError.
    raise SystemExit("No JSESSIONID cookie was set after login; cannot continue.")
cookie = {"JSESSIONID": key["JSESSIONID"]}
# Only cookie *names* are echoed: the values are session secrets and must not
# end up in terminal scrollback or log files.
print(sorted(cookie))

r = rq.get(url, cookies=cookie)
print(r.url)
# The Cookie header of the last request that was sent is split back into a
# dict. NOTE(review): presumably this is how cookies set along the redirect
# chain are picked up - confirm against the requests documentation.
tc = r.request.headers.get("Cookie", "").split(";")
# partition() tolerates fragments without "=", which made split("=", 1)[1]
# raise IndexError; fragments with an empty name are dropped.
sp_tc = [
    [name.strip(), value.strip()]
    for name, _, value in (elm.partition("=") for elm in tc)
    if name.strip()
]
itl_cookies = dict(sp_tc)
print(sorted(itl_cookies))
# --- Fetch the course overview page and index every course link found in it ---
base_url = "https://ntnu.itslearning.com/"
c = rq.get(
    "https://ntnu.itslearning.com/Course/AllCourses.aspx",
    cookies=itl_cookies,
)
print(c.url)

p = bs(c.text, "html.parser")
course = p.find("table", attrs={"class": "h-table-show-first-4-columns"})
t = course.find_all("a", attrs={"class": "ccl-iconlink"})
print(course)
print(t)

# courses: hrefs in page order; course_title: href -> text nested in the anchor
courses = [anchor.get("href") for anchor in t]
course_title = {
    anchor.get("href"): anchor.contents[0].contents[0] for anchor in t
}
print(courses)
print(course_title)
# --- Create ./scrape below the start directory and work from inside it ---
path = os.path.abspath(os.path.curdir)
newpath = os.path.join(path, "scrape")
# Module-level counter; download_link increments it whenever it has to fall
# back to the caller-supplied title. (A `global` statement is a no-op at
# module level, so none is needed here.)
failure = 0
# exist_ok avoids the check-then-create race of testing os.path.exists first.
os.makedirs(newpath, exist_ok=True)
print(path)
print(newpath)
os.chdir(newpath)
def make_folder(curpath, title):
    """Create the directory ``curpath/title`` if needed and chdir into it.

    The title is flattened to a single path component first: every path
    separator in it is replaced by ``_``. Without this, a title such as
    ``"week 1/notes"`` would create nested directories, so a caller that
    later returns with ``os.chdir('..')`` would end up one level too deep;
    a title starting with a separator would even make ``os.path.join``
    discard ``curpath`` altogether.

    Args:
        curpath: Directory in which the folder is created.
        title: Desired folder name; converted with ``str()``. An empty
            title falls back to ``"untitled"``.
    """
    name = str(title)
    for sep in (os.sep, os.altsep):
        if sep:  # os.altsep is None on platforms with a single separator
            name = name.replace(sep, "_")
    name = name or "untitled"

    folder_path = os.path.join(curpath, name)
    print("making dir:", folder_path)
    os.makedirs(folder_path, exist_ok=True)
    os.chdir(folder_path)
def find_folder_table(html):
    """Parse *html* and return the folder-listing table element.

    The table is looked up by its element id; the result is whatever
    ``find`` yields for that lookup (``None`` when there is no match).
    """
    table_id = "ctl00_ContentPlaceHolder_ProcessFolderGrid_T"
    return bs(html, "html.parser").find('table', attrs={"id": table_id})
def download_link(link, title):
    """Download *link* into the current working directory.

    The file name is taken from the ``FileName=`` query parameter of the
    final (post-redirect) URL and percent-decoded. When the URL carries no
    such parameter, *title* is used as the name instead and the module-level
    ``failure`` counter is incremented.

    Args:
        link: URL to fetch; the request is sent with ``itl_cookies``.
        title: Fallback file name. May be any object (``find_file`` passes
            the integer ``failure`` counter); it is converted with ``str()``.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import unquote

    global failure

    print("Trying to download: {}".format(link))
    r = rq.get(link, cookies=itl_cookies, stream=True)
    print(r.url)

    # [^&]+ also matches when FileName is the last query parameter; the old
    # pattern required a trailing "&".
    match = re.search(r'FileName=([^&]+)', r.url)
    if match is not None:
        filename = unquote(match.group(1))
    else:
        # A None title would otherwise become the literal name "None".
        filename = str(title) if title is not None else "download_{}".format(failure)
        failure += 1

    # Keep the name a single path component so it can neither escape the
    # current directory nor point into a directory that does not exist.
    for sep in (os.sep, os.altsep):
        if sep:
            filename = filename.replace(sep, "_")
    print(filename)

    filename = os.path.join(os.path.abspath(os.path.curdir), filename)
    with open(filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # skip empty chunks
                f.write(chunk)
    print("complete")
import sys
def find_file(html):
    """Download every file linked from the page markup in *html*.

    Two kinds of anchors are followed:

    * hrefs containing ``download.aspx`` - the first two characters of the
      href are dropped and the rest is appended to ``base_url``
      (presumably a leading ``..``; TODO confirm). The current ``failure``
      count is passed on as the fallback file name.
    * hrefs containing ``DownloadRedirect.ashx`` - fetched as-is, with the
      text nested in the anchor's second child as the fallback file name.

    Anchors without an ``href`` attribute are skipped.

    Args:
        html: Page markup as a string.
    """
    try:
        soup = bs(html, "html.parser")
    except Exception:
        # Show what could not be parsed, then stop. The former
        # html.find_all('a') call was removed from this handler: on the str
        # inputs every caller passes it raised AttributeError, so the exit
        # below was never reached.
        print(html)
        print(type(html))
        sys.exit(1)

    for link in soup.find_all('a'):
        href = link.get("href")
        if not href:
            # `"..." in None` would raise TypeError
            continue
        if "download.aspx" in href:
            download_link(base_url + href[2:], failure)
        elif "DownloadRedirect.ashx" in href:
            title = link.contents[1].contents[0]
            download_link(href, title)
def find_essay_files(html):
    """Save an assignment page: its description text and its listed files.

    If the page has a description block, it is converted with the shared
    html2text converter ``h`` and written as UTF-8 to ``<title>.md`` in the
    current directory, where the title is read from the ``ctl05_TT`` span.
    Afterwards each of the two file-list containers that is present is
    handed to ``find_file``.

    Args:
        html: Page markup as a string.
    """
    soup = bs(html, "html.parser")

    description = soup.find("div", {"class": "h-userinput itsl-assignment-description"})
    if description:
        title = soup.find("span", {"id": "ctl05_TT"}).contents[0]
        markdown = h.handle(str(description))
        note_path = os.path.join(os.path.abspath(os.path.curdir), title) + ".md"
        with open(note_path, "wb") as note:
            note.write(markdown.encode("utf-8"))

    # attached files first, then hand-in files
    for container_id in ("EssayDetailedInformation_FileListWrapper_FileList", "DF_FileList"):
        file_list = soup.find("div", {"id": container_id})
        if file_list:
            find_file(str(file_list))
def find_link(html):
    """Store the resource shown on a learning-tool page.

    If the page contains the resource link anchor, an ``[InternetShortcut]``
    ``.url`` file pointing at its href is written to the current directory;
    the file name is the href with every ``/`` removed. Otherwise the
    download-button anchor is looked up and its target is fetched through
    ``download_link``, using the anchor's ``download`` attribute as the
    fallback file name. A page with neither anchor is skipped with a
    message instead of raising AttributeError.

    Args:
        html: Page markup as a string.

    Returns:
        None.
    """
    soup = bs(html, "html.parser")
    section_link = soup.find("a", {"id": "ctl00_ctl00_MainFormContent_ResourceContent_Link"})

    if section_link is None:
        link = soup.find("a", {"id": "ctl00_ctl00_MainFormContent_ResourceContent_DownloadButton_DownloadLink"})
        if link is None:
            print("no resource link or download button found, skipping")
            return
        print(link.get("download"))
        download_link(link.get("href"), link.get("download"))
        return

    print(section_link)
    target = section_link.get("href")
    print(target)
    if not target:
        # nothing to point the shortcut at
        return

    fp = os.path.join(os.path.abspath(os.path.curdir), "".join(target.split('/')))
    print("filepath:", fp)
    with open(fp + ".url", "wb") as shortcut:
        shortcut.write(b'[InternetShortcut]\n')
        shortcut.write('URL={}'.format(target).encode("utf-8"))
def save_note(html):
    """Write a note page to ``<title>.md`` in the current directory.

    The title is read from inside the page's first ``h1``; the note body is
    the first ``div`` with class ``h-userinput``, converted with the shared
    html2text converter ``h`` and stored UTF-8 encoded.

    Args:
        html: Page markup as a string.
    """
    soup = bs(html, "html.parser")

    heading = soup.find("h1")
    title = heading.contents[0].contents[1]
    print(title)

    body = soup.find("div", {"class": "h-userinput"})
    print(body.contents[0])
    markdown = h.handle(str(body))

    note_path = os.path.join(os.path.abspath(os.path.curdir), title) + ".md"
    with open(note_path, "wb") as note:
        note.write(markdown.encode("utf-8"))
def find_files(folders):
    """Walk a folder-listing table and save everything it links to.

    Each anchor is dispatched on a substring of its href; the first match
    in this order wins:

    * ``File`` - the page is fetched and passed to ``find_file``.
    * ``LearningToolElement`` - the page is fetched and, if it embeds an
      ``iframe``, the iframe source is fetched and passed to ``find_link``.
    * ``/note/View_Note`` - the page is fetched and passed to ``save_note``.
    * ``folder`` - a directory named after the anchor's first child is
      created and entered, and the sub-folder's table is processed
      recursively.
    * ``read_essay`` - a directory is created and entered in the same way,
      and the page is passed to ``find_essay_files``.

    After a sub-directory has been handled, the working directory is reset
    to the exact directory that was current before, rather than to ``'..'``,
    so the walk cannot drift even if more than one level was created.

    Args:
        folders: Parsed table element as returned by ``find_folder_table``;
            ``None`` (no table found) is reported and skipped.
    """
    if folders is None:
        print("no folder table found, skipping")
        return

    for link in folders.find_all('a'):
        href = link.get("href")
        if not href:
            # anchors without href would make the substring tests raise
            continue

        if "File" in href:
            r = rq.get(base_url + href, cookies=itl_cookies)
            find_file(r.text)
        elif "LearningToolElement" in href:
            r = rq.get(base_url + href, cookies=itl_cookies)
            iframe = bs(r.text, "html.parser").find('iframe')
            # The None check now comes before the first use of iframe.
            if iframe is not None:
                url = iframe.get("src")
                print(url)
                r = rq.get(url, cookies=itl_cookies)
                find_link(r.text)
        elif "/note/View_Note" in href:
            r = rq.get(base_url + href, cookies=itl_cookies)
            print(r.url)
            save_note(r.text)
        elif "folder" in href:
            itl_path = os.path.abspath(os.path.curdir)
            title = link.contents[0]
            make_folder(itl_path, title)
            r = rq.get(base_url + href, cookies=itl_cookies)
            find_files(find_folder_table(r.text))
            os.chdir(itl_path)
        elif "read_essay" in href:
            print("read_essay:", href)
            itl_path = os.path.abspath(os.path.curdir)
            title = link.contents[0]
            make_folder(itl_path, title)
            r = rq.get(base_url + href, cookies=itl_cookies)
            find_essay_files(r.text)
            os.chdir(itl_path)
# --- Entry point: scrape one course given by URL, or every course found above ---
folder_url = "https://ntnu.itslearning.com/Folder/processfolder.aspx?FolderID="


def _scrape_course(page_url, title):
    """Download one course into a directory named *title*.

    The course page is fetched and its root folder id is read from the
    first ``FolderID=...'`` occurrence in the markup. The root folder
    listing is then fetched and walked with ``find_files``. A page without
    a folder id, or a listing without a folder table, is reported and
    skipped so one odd course does not abort the whole run. The working
    directory is restored before returning.

    Args:
        page_url: URL of the course page.
        title: Name of the directory the course is saved into.
    """
    start_dir = os.path.abspath(os.path.curdir)
    page = rq.get(page_url, cookies=itl_cookies)

    match = re.search("FolderID=(.+?)'", page.text)
    if match is None:
        print("no FolderID found on", page.url, "- skipping")
        return
    folder_id = match.group(1)
    print("folder id", folder_id)
    # Print the real URL; the original printed the literal text "folder_url".
    print(folder_url + folder_id)

    make_folder(start_dir, title)
    try:
        # Fetched once; the original single-course branch sent it twice.
        listing = rq.get(folder_url + folder_id, cookies=itl_cookies)
        print(listing.url)
        table = find_folder_table(listing.text)
        if table is None:
            print("no folder table found for", title, "- skipping")
        else:
            find_files(table)
    finally:
        os.chdir(start_dir)


course_url = input("Emne link or leave blank to download all:")
if course_url:
    folder_title = input("folder title:")
    _scrape_course(course_url, folder_title)
else:
    for course in courses:
        _scrape_course(base_url + course, course_title[course])