add multithreading and object orientation

2026-03-11 08:26:01 +00:00 · 2017-05-12 01:22:39 +02:00
parent e438fd3287
commit 18225b375b
1 changed files with 185 additions and 179 deletions
--- a/itl_scrape.py
+++ b/itl_scrape.py
@@ -1,201 +1,201 @@
 import os
 import sys
 import re
 import argparse
 import requests as rq
 import mechanicalsoup as ms
 import html2text
 from multiprocessing import Process
 from bs4 import BeautifulSoup as bs
 from getpass import getpass
 b = ms.StatefulBrowser()
 h = html2text.HTML2Text()
 url = "https://innsida.ntnu.no/lms-ntnu"
 login = input("Enter NTNU-username: ")
 password = getpass("Enter your NTNU-password: ")
 b.open(url)
 b.select_form("form[name=f]")
 b["feidename"]=login
 b["password"]=password
 b.submit_selected()
 b.select_form("form[action=https://sats.itea.ntnu.no/sso-wrapper/feidelogin]")
 b.submit_selected()
 key = b.session.cookies.get_dict()
 cookie={"JSESSIONID": key["JSESSIONID"]}
 print(cookie)
 r = rq.get(url, cookies=cookie)
 print(r.url)
 tc = r.request.headers["Cookie"].split(";")
 sp_tc = [[elm.split("=",1)[0].strip(), elm.split("=",1)[1].strip()] for elm in tc]
 itl_cookies=dict(sp_tc)
 print(itl_cookies)
 base_url = "https://ntnu.itslearning.com/"
-
+folder_url = "https://ntnu.itslearning.com/Folder/processfolder.aspx?FolderID="
 c = rq.get("https://ntnu.itslearning.com/Course/AllCourses.aspx", cookies=itl_cookies)
 print(c.url)
 p = bs(c.text, "html.parser")
 course = p.find("table",{"class":"h-table-show-first-4-columns"})
 t = course.find_all("a",{"class":"ccl-iconlink"})
 print(course)
 print(t)
 courses = []
 course_title = {}
 for link in t:
    title = link.contents[0].contents[0]
    course_title[link.get("href")]=title
    courses.append(link.get("href"))
 print(courses)
 print(course_title)
 path = os.path.abspath(os.path.curdir)
 newpath = os.path.join(path,"scrape")
 global failure
 failure=0
 if not os.path.exists(newpath):
    os.makedirs(newpath)
 print(path)
 print(newpath)
 os.chdir(newpath)
 def make_folder(curpath, title):
    folder_path = os.path.join(curpath,title)
-    print("making dir:",folder_path)
+    #print("making dir:",folder_path)
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    os.chdir(folder_path)
-def find_folder_table(html):
+class itslearning_scraper():
-    three = bs(html, "html.parser")
+    def __init__(self):
-    folders = three.find('table',{"id":"ctl00_ContentPlaceHolder_ProcessFolderGrid_T"})
+        self.failure=0
-    return folders
+        self.browser = ms.StatefulBrowser()
        self.html2text = html2text.HTML2Text()
        self.start_url = "https://innsida.ntnu.no/lms-ntnu"
        path = os.path.abspath(os.path.curdir)
        newpath = os.path.join(path,"scraped")
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        os.chdir(newpath)
    def login(self, username, password):
        self.browser.open(self.start_url)
        self.browser.select_form("form[name=f]")
        self.browser["feidename"]=username
        self.browser["password"]=password
        self.browser.submit_selected()
        self.browser.select_form("form[action=https://sats.itea.ntnu.no/sso-wrapper/feidelogin]")
        self.browser.submit_selected()
        self.key = self.browser.session.cookies.get_dict()
        self.jsession={"JSESSIONID": self.key["JSESSIONID"]}
        resp = rq.get(self.start_url, cookies=self.jsession)
        self.get_cookies(resp)
        self.find_courses()
    def get_cookies(self, resp):
        split_cookie = resp.request.headers["Cookie"].split(";")
        self.cookies = dict([[elm.split("=",1)[0].strip(), elm.split("=",1)[1].strip()] for elm in split_cookie])
    def enter(self):
        username = input("Enter NTNU-username: ")
        password = getpass("Enter your NTNU-password: ")
        self.login(username,password)
    def find_courses(self):
        resp = rq.get("https://ntnu.itslearning.com/Course/AllCourses.aspx", cookies=self.cookies)
        print(resp.url)
        three = bs(resp.text, "html.parser")
        course = three.find("table",{"class":"h-table-show-first-4-columns"})
        active_courses = course.find_all("a",{"class":"ccl-iconlink"})
        courses = {}
        for link in active_courses:
            courses[link.get("href")]=link.contents[0].contents[0]
        self.courses = courses
-def download_link(link, title):
+    def find_folder_table(self,html):
            print("Trying to download: {}".format(link))
            r = rq.get(link, cookies=itl_cookies, stream=True)
            print(r.url)
            try:
                filename = re.search('FileName=(.+?)&',r.url).group(1)
            except:
                filename = title
                global failure
                failure += 1
            print(filename)
            filename = os.path.join(os.path.abspath(os.path.curdir),filename)
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            print("complete")
 import sys
 def find_file(html):
    try:
        three = bs(html, "html.parser")
-    except:
+        folders = three.find('table',{"id":"ctl00_ContentPlaceHolder_ProcessFolderGrid_T"})
-        print(html)
+        return folders
        print(type(html))
        print(html.find_all('a'))
        sys.exit(1)
    links = three.find_all('a')
    #print(links)
-    for link in links:
+    def download_link(self, link, title):
-        if "download.aspx" in link.get("href"):
+        print("Trying to download: {}".format(link))
-            download_link(base_url+link.get("href")[2:], failure)
+        r = rq.get(link, cookies=self.cookies, stream=True)
-        elif "DownloadRedirect.ashx" in link.get("href"):
+        try:
-            title = link.contents[1].contents[0]
+            filename = re.search('FileName=(.+?)&',r.url).group(1)
-            download_link(link.get("href"), title)
+        except:
-            #print(r.text)
+            filename = title
            self.failure += 1
        print("File created with name:",filename)
        filename = os.path.join(os.path.abspath(os.path.curdir),filename)
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
        print("complete")
-def find_essay_files(html):
+    def find_file(self, html):
-    three = bs(html, "html.parser")
+        try:
-    attached_files=three.find("div", {"id":"EssayDetailedInformation_FileListWrapper_FileList"})
+            three = bs(html, "html.parser")
-    handin_files=three.find("div", {"id":"DF_FileList"})
+        except:
-    text=three.find("div", {"class":"h-userinput itsl-assignment-description"})
+            print(html)
-    if text:
+            print(type(html))
-        title = three.find("span", {"id":"ctl05_TT"}).contents[0]
+            print(html.find_all('a'))
-        text = h.handle(str(text))
+            sys.exit(1)
        links = three.find_all('a')
        for link in links:
            if "download.aspx" in link.get("href"):
                Process(target=self.download_link, args=(base_url+link.get("href")[2:], self.failure)).start()
            elif "DownloadRedirect.ashx" in link.get("href"):
                title = link.contents[1].contents[0]
                Process(target=self.download_link, args=(link.get("href"), title)).start()
    def find_essay_files(self, html):
        three = bs(html, "html.parser")
        attached_files=three.find("div", {"id":"EssayDetailedInformation_FileListWrapper_FileList"})
        handin_files=three.find("div", {"id":"DF_FileList"})
        text=three.find("div", {"class":"h-userinput itsl-assignment-description"})
        if text:
            title = three.find("span", {"id":"ctl05_TT"}).contents[0]
            text = self.html2text.handle(str(text))
            fp = os.path.join(os.path.abspath(os.path.curdir), title)
            #convert to md?
            with open(fp+".md", "wb") as note:
                note.write(bytes(text.encode("utf-8")))
                note.close()
        if attached_files:
            self.find_file(str(attached_files))
        if handin_files:
            self.find_file(str(handin_files))
    def find_link(self,html):
        three = bs(html, "html.parser")
        section_link = three.find("a", {"id":"ctl00_ctl00_MainFormContent_ResourceContent_Link"})
        if section_link is None:
            link = three.find("a",{"id":"ctl00_ctl00_MainFormContent_ResourceContent_DownloadButton_DownloadLink"})
            try:
                print(link.get("download"))
                Process(target=self.download_link,args=(link.get("href"), link.get("download"))).start()
            except:
                print("Broken download link")
                pass
            return
        print(section_link)
        target = section_link.get("href")
        print(target)
        fp = os.path.join(os.path.abspath(os.path.curdir), "".join(target.split('/')))
        print("filepath:", fp)
        with  open(fp+".url", "wb") as shortcut:
            shortcut.write(b'[InternetShortcut]\n')
            shortcut.write(bytes(r'URL={}'.format(target).encode("utf-8")))
            shortcut.close()
    def save_note(self,html):
        three = bs(html, "html.parser")
        title = three.find("h1").contents[0].contents[1]
        print(title)
        text = three.find("div", {"class":"h-userinput"})
        print(text.contents[0])
        text = self.html2text.handle(str(text))
        fp = os.path.join(os.path.abspath(os.path.curdir), title)
        #convert to md?
        with open(fp+".md", "wb") as note:
            note.write(bytes(text.encode("utf-8")))
            note.close()
    if attached_files:
        find_file(str(attached_files))
    if handin_files:
        find_file(str(handin_files))
-def find_link(html):
+    def find_files(self,folders):
    three = bs(html, "html.parser")
    section_link = three.find("a", {"id":"ctl00_ctl00_MainFormContent_ResourceContent_Link"})
    if section_link is None:
        link = three.find("a",{"id":"ctl00_ctl00_MainFormContent_ResourceContent_DownloadButton_DownloadLink"})
        print(link.get("download"))
        download_link(link.get("href"), link.get("download"))
        return
    print(section_link)
    target = section_link.get("href")
    print(target)
    fp = os.path.join(os.path.abspath(os.path.curdir), "".join(target.split('/')))
    print("filepath:", fp)
    with  open(fp+".url", "wb") as shortcut:
        shortcut.write(b'[InternetShortcut]\n')
        shortcut.write(bytes(r'URL={}'.format(target).encode("utf-8")))
        shortcut.close()
 def save_note(html):
    three = bs(html, "html.parser")
    title = three.find("h1").contents[0].contents[1]
    print(title)
    text = three.find("div", {"class":"h-userinput"})
    print(text.contents[0])
    text = h.handle(str(text))
    fp = os.path.join(os.path.abspath(os.path.curdir), title)
    #convert to md?
    with open(fp+".md", "wb") as note:
        note.write(bytes(text.encode("utf-8")))
        note.close()
 def find_files(folders):
        for link in folders.find_all('a'):
            if "File" in link.get("href"):
-                r = rq.get(base_url+link.get("href"), cookies=itl_cookies)
+                r = rq.get(base_url+link.get("href"), cookies=self.cookies)
-                find_file(r.text)
+                self.find_file(r.text)
            elif "LearningToolElement" in link.get("href"):
-                r = rq.get(base_url+link.get("href"), cookies=itl_cookies)
+                r = rq.get(base_url+link.get("href"), cookies=self.cookies)
                three = bs(r.text, "html.parser")
                iframe = three.find('iframe')
                print(iframe.get("src"))
                if iframe is not None:
                    url = iframe.get("src")
-                    r = rq.get(url, cookies=itl_cookies)
+                    r = rq.get(url, cookies=self.cookies)
-                    link = find_link(r.text)
+                    link = self.find_link(r.text)
            elif "/note/View_Note" in link.get("href"):
-                r = rq.get(base_url+link.get("href"), cookies=itl_cookies)
+                r = rq.get(base_url+link.get("href"), cookies=self.cookies)
                print(r.url)
-                save_note(r.text)
+                self.save_note(r.text)
            elif "folder" in link.get("href"):
                #print(link)
                itl_path = os.path.join(os.path.abspath(os.path.curdir))
                title = link.contents[0]
                make_folder(itl_path, title)
-                r = rq.get(base_url+link.get("href"), cookies=itl_cookies)
+                r = rq.get(base_url+link.get("href"), cookies=self.cookies)
-                table = find_folder_table(r.text)
+                table = self.find_folder_table(r.text)
                #print(table)
-                find_files(table)
+                self.find_files(table)
                os.chdir('..')
                #print(r.url)
@@ -204,37 +204,43 @@ def find_files(folders):
                itl_path = os.path.join(os.path.abspath(os.path.curdir))
                title = link.contents[0]
                make_folder(itl_path, title)
-                r = rq.get(base_url+link.get("href"), cookies=itl_cookies)
+                r = rq.get(base_url+link.get("href"), cookies=self.cookies)
-                find_essay_files(r.text)
+                self.find_essay_files(r.text)
                os.chdir('..')
    def download_all(self):
        p  = []
        for link in self.courses:
            r = rq.get(base_url+link, cookies=self.cookies)
            course_path = os.path.join(os.path.abspath(os.path.curdir))
            make_folder(course_path, self.courses[link])
            folder_id = re.search("FolderID=(.+?)'",r.text).group(1)
            print("folder id",folder_id)
            print("folder_url"+folder_id)
            r = rq.get(folder_url+folder_id, cookies=self.cookies)
            print(r.url)
            table = self.find_folder_table(r.text)
            Process(target=self.find_files, args=(table,)).start()
            os.chdir('..')
    def download_one(self):
        course_url = input("Emne link:")
        folder_title=input("folder title:")
        r = rq.get(course_url, cookies=self.cookies)
        course_path = os.path.join(os.path.abspath(os.path.curdir))
        make_folder(course_path, folder_title)
        folder_id = re.search("FolderID=(.+?)'",r.text).group(1)
        r = rq.get(folder_url+folder_id, cookies=itl_cookies)
        r = rq.get(folder_url+folder_id, cookies=itl_cookies)
        table = self.find_folder_table(r.text)
        self.find_files(table)
    def get_courses(self):
        return self.courses
 #print(args.session_cookie)
 #key = args.session_cookie
-folder_url = "https://ntnu.itslearning.com/Folder/processfolder.aspx?FolderID="
+if __name__ == '__main__':
-course_url = input("Emne link or leave blank to download all:")
+    scraper = itslearning_scraper()
-if course_url:
+    scraper.enter()
-    folder_title=input("folder title:")
+    scraper.download_all()
    r = rq.get(course_url, cookies=itl_cookies)
    course_path = os.path.join(os.path.abspath(os.path.curdir))
    make_folder(course_path, folder_title)
    folder_id = re.search("FolderID=(.+?)'",r.text).group(1)
    r = rq.get(folder_url+folder_id, cookies=itl_cookies)
    r = rq.get(folder_url+folder_id, cookies=itl_cookies)
    table = find_folder_table(r.text)
    find_files(table)
 else:
    for course in courses:
        r = rq.get(base_url+course, cookies=itl_cookies)
        course_path = os.path.join(os.path.abspath(os.path.curdir))
        make_folder(course_path, course_title[course])
        folder_id = re.search("FolderID=(.+?)'",r.text).group(1)
        print("folder id",folder_id)
        print("folder_url"+folder_id)
        r = rq.get(folder_url+folder_id, cookies=itl_cookies)
        print(r.url)
        table = find_folder_table(r.text)
        find_files(table)
        os.chdir('..')