# Files
# ITLscraper/itl_scrape.py
# 2017-05-11 16:57:52 +02:00
#
# 219 lines
# 7.0 KiB
# Python
import os
import re
import argparse
import requests as rq
import mechanicalsoup as ms
from bs4 import BeautifulSoup as bs
from getpass import getpass
# Feide/Innsida SSO login flow: following this URL through the two login
# forms yields the session cookies needed to scrape it's learning.
url = "https://innsida.ntnu.no/lms-ntnu"
b = ms.StatefulBrowser()
login = input("Enter NTNU-username: ")
password = getpass("Enter your NTNU-password: ")
# Step 1: submit the Feide login form with the user's credentials.
b.open(url)
b.select_form("form[name=f]")
b["feidename"]=login
b["password"]=password
b.submit_selected()
# Step 2: submit the SSO wrapper form that forwards the authenticated
# session on to it's learning.
b.select_form("form[action=https://sats.itea.ntnu.no/sso-wrapper/feidelogin]")
b.submit_selected()
# Re-fetch the start URL with plain requests, carrying only the JSESSIONID
# from the browser session; the redirects populate the full cookie set.
key = b.session.cookies.get_dict()
cookie={"JSESSIONID": key["JSESSIONID"]}
print(cookie)
r = rq.get(url, cookies=cookie)
print(r.url)
# The request's own Cookie header holds every cookie requests ended up
# sending ("name=value; name=value ..."); parse it back into a dict so the
# rest of the scraper can reuse it for direct it's learning requests.
tc = r.request.headers["Cookie"].split(";")
sp_tc = [[elm.split("=",1)[0].strip(), elm.split("=",1)[1].strip()] for elm in tc]
itl_cookies=dict(sp_tc)
print(itl_cookies)
# Fetch the "All courses" overview and collect one link per course.
base_url = "https://ntnu.itslearning.com/"
c = rq.get("https://ntnu.itslearning.com/Course/AllCourses.aspx", cookies=itl_cookies)
print(c.url)
p = bs(c.text, "html.parser")
# The course list is the table limited to four visible columns; each course
# anchor inside it carries the ccl-iconlink class.
course = p.find("table", {"class": "h-table-show-first-4-columns"})
t = course.find_all("a", {"class": "ccl-iconlink"})
print(course)
print(t)
courses = []
course_title = {}
for link in t:
    # The first nested tag of the anchor wraps the human-readable title.
    title = link.contents[0].contents[0]
    course_title[link.get("href")] = title
    courses.append(link.get("href"))
print(courses)
print(course_title)
# Everything is downloaded under ./scrape; the scraper chdir()s around
# inside it to mirror the course/folder hierarchy on disk.
path = os.path.abspath(os.path.curdir)
newpath = os.path.join(path, "scrape")
# Counts downloads whose real filename could not be recovered from the URL;
# incremented by download_link(). (A `global` statement at module level is
# a no-op, so the original one was dropped.)
failure = 0
if not os.path.exists(newpath):
    os.makedirs(newpath)
print(path)
print(newpath)
os.chdir(newpath)
def make_folder(curpath, title):
    """Create curpath/title if it does not exist and chdir into it.

    NOTE: relies on os.chdir, so the process working directory is global
    state shared by the whole scraper.
    """
    folder_path = os.path.join(curpath, title)
    print("making dir:", folder_path)
    # Only create when missing, so re-runs over an existing tree work.
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    os.chdir(folder_path)
def find_folder_table(html):
    """Return the <table> holding a folder's content listing, or None.

    The id is the ASP.NET control id it's learning uses for its process
    folder grid; pages without such a grid yield None.
    """
    three = bs(html, "html.parser")
    folders = three.find('table', {"id": "ctl00_ContentPlaceHolder_ProcessFolderGrid_T"})
    return folders
def download_link(link, title):
    """Stream the resource behind `link` into the current directory.

    The filename is taken from the FileName= query parameter of the final
    (redirected) URL; when that is absent, `title` is used instead and the
    module-level `failure` counter is incremented.
    """
    global failure
    print("Trying to download: {}".format(link))
    r = rq.get(link, cookies=itl_cookies, stream=True)
    print(r.url)
    match = re.search('FileName=(.+?)&', r.url)
    if match:
        filename = match.group(1)
    else:
        # No FileName parameter in the final URL: fall back to the
        # caller-supplied title (str() guards against callers passing the
        # int failure counter as a stand-in name) and count the miss.
        filename = str(title)
        failure += 1
    print(filename)
    filename = os.path.join(os.path.abspath(os.path.curdir), filename)
    # Stream to disk in 1 KiB chunks so large files never sit in memory.
    with open(filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    print("complete")
import sys
def find_file(html):
    """Scan an it's learning file page for download anchors and fetch each.

    Handles both plain download.aspx links (filename is recovered later
    from the redirect URL) and DownloadRedirect.ashx links (title scraped
    from the anchor contents).
    """
    try:
        three = bs(html, "html.parser")
    except Exception:
        # Debug aid: dump whatever unexpected object we got, then bail.
        print(html)
        print(type(html))
        print(html.find_all('a'))
        sys.exit(1)
    links = three.find_all('a')
    #print(links)
    for link in links:
        if "download.aspx" in link.get("href"):
            # These anchors carry no usable title; pass the failure counter
            # as a stand-in (download_link usually recovers the real
            # filename from the redirect URL anyway).
            download_link(base_url + link.get("href")[2:], failure)
        elif "DownloadRedirect.ashx" in link.get("href"):
            title = link.contents[1].contents[0]
            download_link(link.get("href"), title)
    #print(r.text)
def find_essay_files(html):
    """Download both the attached files and the hand-in files of an essay page."""
    three = bs(html, "html.parser")
    attached_files = three.find("div", {"id": "EssayDetailedInformation_FileListWrapper_FileList"})
    handin_files = three.find("div", {"id": "DF_FileList"})
    if attached_files:
        find_file(str(attached_files))
    if handin_files:
        find_file(str(handin_files))
def find_link(html):
    """Handle a LearningToolElement page: save a .url shortcut or download.

    If the page exposes an external resource link, a Windows-style
    [InternetShortcut] .url file pointing at it is written to the current
    directory; otherwise the page's download button is followed.
    """
    three = bs(html, "html.parser")
    section_link = three.find("a", {"id": "ctl00_ctl00_MainFormContent_ResourceContent_Link"})
    if section_link is None:
        # No external link: the element is a downloadable resource.
        link = three.find("a", {"id": "ctl00_ctl00_MainFormContent_ResourceContent_DownloadButton_DownloadLink"})
        print(link.get("download"))
        download_link(link.get("href"), link.get("download"))
        return
    print(section_link)
    target = section_link.get("href")
    print(target)
    # Flatten the URL path into a single filename for the shortcut.
    fp = os.path.join(os.path.abspath(os.path.curdir), "".join(target.split('/')))
    print("filepath:", fp)
    with open(fp + ".url", "wb") as shortcut:
        shortcut.write(b'[InternetShortcut]\n')
        shortcut.write('URL={}'.format(target).encode("utf-8"))
import html2text
def save_note(html):
    """Convert a note page's user content to Markdown and save it as <title>.md."""
    three = bs(html, "html.parser")
    # The <h1> wraps the title in one extra tag; dig out the text node.
    title = three.find("h1").contents[0].contents[1]
    print(title)
    text = three.find("div", {"class": "h-userinput"})
    print(text.contents[0])
    h = html2text.HTML2Text()
    text = h.handle(str(text))
    fp = os.path.join(os.path.abspath(os.path.curdir), title)
    with open(fp + ".md", "wb") as note:
        note.write(text.encode("utf-8"))
def find_files(folders):
    """Recursively walk a folder table, downloading every known element type.

    Dispatches on each anchor's href: plain files, learning-tool elements,
    notes, sub-folders (recursed into) and essays. Uses make_folder/os.chdir
    to mirror the folder hierarchy on disk.
    """
    for link in folders.find_all('a'):
        href = link.get("href")
        if "File" in href:
            r = rq.get(base_url + href, cookies=itl_cookies)
            find_file(r.text)
        elif "LearningToolElement" in href:
            r = rq.get(base_url + href, cookies=itl_cookies)
            three = bs(r.text, "html.parser")
            iframe = three.find('iframe')
            # BUGFIX: the original printed iframe.get("src") before the
            # None check, crashing on pages without an iframe.
            if iframe is not None:
                print(iframe.get("src"))
                url = iframe.get("src")
                r = rq.get(url, cookies=itl_cookies)
                find_link(r.text)
        elif "/note/View_Note" in href:
            r = rq.get(base_url + href, cookies=itl_cookies)
            print(r.url)
            save_note(r.text)
        elif "folder" in href:
            #print(link)
            itl_path = os.path.join(os.path.abspath(os.path.curdir))
            title = link.contents[0]
            make_folder(itl_path, title)
            r = rq.get(base_url + href, cookies=itl_cookies)
            table = find_folder_table(r.text)
            #print(table)
            # Recurse into the sub-folder, then climb back out.
            find_files(table)
            os.chdir('..')
            #print(r.url)
        elif "read_essay" in href:
            print("read_essay:", href)
            itl_path = os.path.join(os.path.abspath(os.path.curdir))
            title = link.contents[0]
            make_folder(itl_path, title)
            r = rq.get(base_url + href, cookies=itl_cookies)
            find_essay_files(r.text)
            os.chdir('..')
#print(args.session_cookie)
#key = args.session_cookie
# Each course page embeds a FolderID for its root folder; extract it and
# walk that folder tree, mirroring it under ./scrape/<course title>/.
folder_url = "https://ntnu.itslearning.com/Folder/processfolder.aspx?FolderID="
for course in courses:
    r = rq.get(base_url + course, cookies=itl_cookies)
    course_path = os.path.join(os.path.abspath(os.path.curdir))
    make_folder(course_path, course_title[course])
    folder_id = re.search("FolderID=(.+?)'", r.text).group(1)
    print("folder id", folder_id)
    print("folder_url" + folder_id)
    r = rq.get(folder_url + folder_id, cookies=itl_cookies)
    print(r.url)
    table = find_folder_table(r.text)
    find_files(table)
    os.chdir('..')