Mirror of https://github.com/KevinMidboe/ITLscraper.git, synced 2025-10-29 01:30:14 +00:00

Commit: add multithreading and object orientation
Changed file: itl_scrape.py (364 changed lines)
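The commit message says multithreading, but the new code actually fans work out with multiprocessing.Process: each matched download link and each course folder is handed to its own process (see find_file and download_all in the diff below). A minimal sketch of that pattern, with a hypothetical fetch() standing in for the scraper's download_link method:

from multiprocessing import Process

def fetch(url, title):
    # hypothetical stand-in for itslearning_scraper.download_link(url, title)
    print("downloading", url, "as", title)

if __name__ == '__main__':
    # example data only; the real code builds these pairs from the course pages
    jobs = [Process(target=fetch, args=(u, t))
            for u, t in [("https://example.com/a.pdf", "a.pdf"),
                         ("https://example.com/b.pdf", "b.pdf")]]
    for job in jobs:
        job.start()
    for job in jobs:
        job.join()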
@@ -1,201 +1,201 @@
import os
import sys
import re
import argparse
import requests as rq
import mechanicalsoup as ms
import html2text

from multiprocessing import Process
from bs4 import BeautifulSoup as bs
from getpass import getpass

b = ms.StatefulBrowser()
h = html2text.HTML2Text()
url = "https://innsida.ntnu.no/lms-ntnu"

login = input("Enter NTNU-username: ")
password = getpass("Enter your NTNU-password: ")
b.open(url)
b.select_form("form[name=f]")
b["feidename"]=login
b["password"]=password
b.submit_selected()
b.select_form("form[action=https://sats.itea.ntnu.no/sso-wrapper/feidelogin]")
b.submit_selected()
key = b.session.cookies.get_dict()

cookie={"JSESSIONID": key["JSESSIONID"]}
print(cookie)

r = rq.get(url, cookies=cookie)
print(r.url)

tc = r.request.headers["Cookie"].split(";")
sp_tc = [[elm.split("=",1)[0].strip(), elm.split("=",1)[1].strip()] for elm in tc]
itl_cookies=dict(sp_tc)
print(itl_cookies)

base_url = "https://ntnu.itslearning.com/"

c = rq.get("https://ntnu.itslearning.com/Course/AllCourses.aspx", cookies=itl_cookies)
print(c.url)
p = bs(c.text, "html.parser")
course = p.find("table",{"class":"h-table-show-first-4-columns"})
t = course.find_all("a",{"class":"ccl-iconlink"})
print(course)
print(t)
courses = []
course_title = {}
for link in t:
    title = link.contents[0].contents[0]
    course_title[link.get("href")]=title
    courses.append(link.get("href"))
print(courses)
print(course_title)


path = os.path.abspath(os.path.curdir)
newpath = os.path.join(path,"scrape")
global failure
failure=0

if not os.path.exists(newpath):
    os.makedirs(newpath)
print(path)
print(newpath)
os.chdir(newpath)

folder_url = "https://ntnu.itslearning.com/Folder/processfolder.aspx?FolderID="
def make_folder(curpath, title):
    folder_path = os.path.join(curpath,title)
    print("making dir:",folder_path)
    #print("making dir:",folder_path)
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    os.chdir(folder_path)

def find_folder_table(html):
    three = bs(html, "html.parser")
    folders = three.find('table',{"id":"ctl00_ContentPlaceHolder_ProcessFolderGrid_T"})
    return folders
class itslearning_scraper():
    def __init__(self):
        self.failure=0
        self.browser = ms.StatefulBrowser()
        self.html2text = html2text.HTML2Text()
        self.start_url = "https://innsida.ntnu.no/lms-ntnu"
        path = os.path.abspath(os.path.curdir)
        newpath = os.path.join(path,"scraped")
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        os.chdir(newpath)

    def login(self, username, password):
        self.browser.open(self.start_url)
        self.browser.select_form("form[name=f]")
        self.browser["feidename"]=username
        self.browser["password"]=password
        self.browser.submit_selected()
        self.browser.select_form("form[action=https://sats.itea.ntnu.no/sso-wrapper/feidelogin]")
        self.browser.submit_selected()

        self.key = self.browser.session.cookies.get_dict()
        self.jsession={"JSESSIONID": self.key["JSESSIONID"]}
        resp = rq.get(self.start_url, cookies=self.jsession)

        self.get_cookies(resp)
        self.find_courses()

    def get_cookies(self, resp):
        split_cookie = resp.request.headers["Cookie"].split(";")
        self.cookies = dict([[elm.split("=",1)[0].strip(), elm.split("=",1)[1].strip()] for elm in split_cookie])

    def enter(self):
        username = input("Enter NTNU-username: ")
        password = getpass("Enter your NTNU-password: ")
        self.login(username,password)

    def find_courses(self):
        resp = rq.get("https://ntnu.itslearning.com/Course/AllCourses.aspx", cookies=self.cookies)
        print(resp.url)

        three = bs(resp.text, "html.parser")
        course = three.find("table",{"class":"h-table-show-first-4-columns"})
        active_courses = course.find_all("a",{"class":"ccl-iconlink"})
        courses = {}

        for link in active_courses:
            courses[link.get("href")]=link.contents[0].contents[0]
        self.courses = courses

def download_link(link, title):
    print("Trying to download: {}".format(link))
    r = rq.get(link, cookies=itl_cookies, stream=True)
    print(r.url)
    try:
        filename = re.search('FileName=(.+?)&',r.url).group(1)
    except:
        filename = title
        global failure
        failure += 1
    print(filename)
    filename = os.path.join(os.path.abspath(os.path.curdir),filename)
    with open(filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    print("complete")
import sys

def find_file(html):
    try:
    def find_folder_table(self,html):
        three = bs(html, "html.parser")
    except:
        print(html)
        print(type(html))
        print(html.find_all('a'))
        sys.exit(1)
    links = three.find_all('a')
    #print(links)
        folders = three.find('table',{"id":"ctl00_ContentPlaceHolder_ProcessFolderGrid_T"})
        return folders

    for link in links:
        if "download.aspx" in link.get("href"):
            download_link(base_url+link.get("href")[2:], failure)
        elif "DownloadRedirect.ashx" in link.get("href"):
            title = link.contents[1].contents[0]
            download_link(link.get("href"), title)
    #print(r.text)
    def download_link(self, link, title):
        print("Trying to download: {}".format(link))
        r = rq.get(link, cookies=self.cookies, stream=True)
        try:
            filename = re.search('FileName=(.+?)&',r.url).group(1)
        except:
            filename = title
            self.failure += 1
        print("File created with name:",filename)
        filename = os.path.join(os.path.abspath(os.path.curdir),filename)
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
        print("complete")

def find_essay_files(html):
    three = bs(html, "html.parser")
    attached_files=three.find("div", {"id":"EssayDetailedInformation_FileListWrapper_FileList"})
    handin_files=three.find("div", {"id":"DF_FileList"})
    text=three.find("div", {"class":"h-userinput itsl-assignment-description"})
    if text:
        title = three.find("span", {"id":"ctl05_TT"}).contents[0]
        text = h.handle(str(text))
    def find_file(self, html):
        try:
            three = bs(html, "html.parser")
        except:
            print(html)
            print(type(html))
            print(html.find_all('a'))
            sys.exit(1)
        links = three.find_all('a')

        for link in links:
            if "download.aspx" in link.get("href"):
                Process(target=self.download_link, args=(base_url+link.get("href")[2:], self.failure)).start()
            elif "DownloadRedirect.ashx" in link.get("href"):
                title = link.contents[1].contents[0]
                Process(target=self.download_link, args=(link.get("href"), title)).start()

    def find_essay_files(self, html):
        three = bs(html, "html.parser")
        attached_files=three.find("div", {"id":"EssayDetailedInformation_FileListWrapper_FileList"})
        handin_files=three.find("div", {"id":"DF_FileList"})
        text=three.find("div", {"class":"h-userinput itsl-assignment-description"})

        if text:
            title = three.find("span", {"id":"ctl05_TT"}).contents[0]
            text = self.html2text.handle(str(text))
            fp = os.path.join(os.path.abspath(os.path.curdir), title)
            #convert to md?
            with open(fp+".md", "wb") as note:
                note.write(bytes(text.encode("utf-8")))
                note.close()

        if attached_files:
            self.find_file(str(attached_files))

        if handin_files:
            self.find_file(str(handin_files))

    def find_link(self,html):

        three = bs(html, "html.parser")
        section_link = three.find("a", {"id":"ctl00_ctl00_MainFormContent_ResourceContent_Link"})
        if section_link is None:
            link = three.find("a",{"id":"ctl00_ctl00_MainFormContent_ResourceContent_DownloadButton_DownloadLink"})
            try:
                print(link.get("download"))
                Process(target=self.download_link,args=(link.get("href"), link.get("download"))).start()
            except:
                print("Broken download link")
                pass
            return

        print(section_link)
        target = section_link.get("href")
        print(target)
        fp = os.path.join(os.path.abspath(os.path.curdir), "".join(target.split('/')))
        print("filepath:", fp)
        with open(fp+".url", "wb") as shortcut:
            shortcut.write(b'[InternetShortcut]\n')
            shortcut.write(bytes(r'URL={}'.format(target).encode("utf-8")))
            shortcut.close()

    def save_note(self,html):
        three = bs(html, "html.parser")
        title = three.find("h1").contents[0].contents[1]
        print(title)
        text = three.find("div", {"class":"h-userinput"})
        print(text.contents[0])
        text = self.html2text.handle(str(text))
        fp = os.path.join(os.path.abspath(os.path.curdir), title)
        #convert to md?
        with open(fp+".md", "wb") as note:
            note.write(bytes(text.encode("utf-8")))
            note.close()
    if attached_files:
        find_file(str(attached_files))
    if handin_files:
        find_file(str(handin_files))

def find_link(html):

    three = bs(html, "html.parser")
    section_link = three.find("a", {"id":"ctl00_ctl00_MainFormContent_ResourceContent_Link"})
    if section_link is None:
        link = three.find("a",{"id":"ctl00_ctl00_MainFormContent_ResourceContent_DownloadButton_DownloadLink"})
        print(link.get("download"))
        download_link(link.get("href"), link.get("download"))
        return

    print(section_link)
    target = section_link.get("href")
    print(target)
    fp = os.path.join(os.path.abspath(os.path.curdir), "".join(target.split('/')))
    print("filepath:", fp)
    with open(fp+".url", "wb") as shortcut:
        shortcut.write(b'[InternetShortcut]\n')
        shortcut.write(bytes(r'URL={}'.format(target).encode("utf-8")))
        shortcut.close()

def save_note(html):
    three = bs(html, "html.parser")
    title = three.find("h1").contents[0].contents[1]
    print(title)
    text = three.find("div", {"class":"h-userinput"})
    print(text.contents[0])
    text = h.handle(str(text))
    fp = os.path.join(os.path.abspath(os.path.curdir), title)
    #convert to md?
    with open(fp+".md", "wb") as note:
        note.write(bytes(text.encode("utf-8")))
        note.close()

def find_files(folders):
    def find_files(self,folders):
    for link in folders.find_all('a'):
        if "File" in link.get("href"):
            r = rq.get(base_url+link.get("href"), cookies=itl_cookies)
            find_file(r.text)
            r = rq.get(base_url+link.get("href"), cookies=self.cookies)
            self.find_file(r.text)

        elif "LearningToolElement" in link.get("href"):
            r = rq.get(base_url+link.get("href"), cookies=itl_cookies)
            r = rq.get(base_url+link.get("href"), cookies=self.cookies)
            three = bs(r.text, "html.parser")
            iframe = three.find('iframe')
            print(iframe.get("src"))
            if iframe is not None:
                url = iframe.get("src")
                r = rq.get(url, cookies=itl_cookies)
                link = find_link(r.text)
                r = rq.get(url, cookies=self.cookies)
                link = self.find_link(r.text)

        elif "/note/View_Note" in link.get("href"):
            r = rq.get(base_url+link.get("href"), cookies=itl_cookies)
            r = rq.get(base_url+link.get("href"), cookies=self.cookies)
            print(r.url)
            save_note(r.text)
            self.save_note(r.text)

        elif "folder" in link.get("href"):
            #print(link)
            itl_path = os.path.join(os.path.abspath(os.path.curdir))
            title = link.contents[0]
            make_folder(itl_path, title)
            r = rq.get(base_url+link.get("href"), cookies=itl_cookies)
            table = find_folder_table(r.text)
            r = rq.get(base_url+link.get("href"), cookies=self.cookies)
            table = self.find_folder_table(r.text)
            #print(table)
            find_files(table)
            self.find_files(table)
            os.chdir('..')
            #print(r.url)

@@ -204,37 +204,43 @@ def find_files(folders):
            itl_path = os.path.join(os.path.abspath(os.path.curdir))
            title = link.contents[0]
            make_folder(itl_path, title)
            r = rq.get(base_url+link.get("href"), cookies=itl_cookies)
            find_essay_files(r.text)
            r = rq.get(base_url+link.get("href"), cookies=self.cookies)
            self.find_essay_files(r.text)
            os.chdir('..')

    def download_all(self):
        p = []
        for link in self.courses:
            r = rq.get(base_url+link, cookies=self.cookies)
            course_path = os.path.join(os.path.abspath(os.path.curdir))
            make_folder(course_path, self.courses[link])
            folder_id = re.search("FolderID=(.+?)'",r.text).group(1)
            print("folder id",folder_id)
            print("folder_url"+folder_id)
            r = rq.get(folder_url+folder_id, cookies=self.cookies)
            print(r.url)
            table = self.find_folder_table(r.text)
            Process(target=self.find_files, args=(table,)).start()
            os.chdir('..')

    def download_one(self):
        course_url = input("Emne link:")
        folder_title=input("folder title:")
        r = rq.get(course_url, cookies=self.cookies)
        course_path = os.path.join(os.path.abspath(os.path.curdir))
        make_folder(course_path, folder_title)
        folder_id = re.search("FolderID=(.+?)'",r.text).group(1)
        r = rq.get(folder_url+folder_id, cookies=itl_cookies)
        r = rq.get(folder_url+folder_id, cookies=itl_cookies)
        table = self.find_folder_table(r.text)
        self.find_files(table)

    def get_courses(self):
        return self.courses

#print(args.session_cookie)
#key = args.session_cookie
folder_url = "https://ntnu.itslearning.com/Folder/processfolder.aspx?FolderID="
course_url = input("Emne link or leave blank to download all:")
if course_url:
    folder_title=input("folder title:")
    r = rq.get(course_url, cookies=itl_cookies)
    course_path = os.path.join(os.path.abspath(os.path.curdir))
    make_folder(course_path, folder_title)
    folder_id = re.search("FolderID=(.+?)'",r.text).group(1)
    r = rq.get(folder_url+folder_id, cookies=itl_cookies)
    r = rq.get(folder_url+folder_id, cookies=itl_cookies)
    table = find_folder_table(r.text)
    find_files(table)
else:
    for course in courses:
        r = rq.get(base_url+course, cookies=itl_cookies)
        course_path = os.path.join(os.path.abspath(os.path.curdir))
        make_folder(course_path, course_title[course])
        folder_id = re.search("FolderID=(.+?)'",r.text).group(1)
        print("folder id",folder_id)
        print("folder_url"+folder_id)
        r = rq.get(folder_url+folder_id, cookies=itl_cookies)
        print(r.url)
        table = find_folder_table(r.text)
        find_files(table)
        os.chdir('..')

if __name__ == '__main__':
    scraper = itslearning_scraper()
    scraper.enter()
    scraper.download_all()
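In the new version, download_all declares p = [] but never fills it, and the Process objects started in find_file, find_files, and download_all are never tracked or explicitly joined. A minimal sketch of keeping the handles and joining them, with scrape_course as a hypothetical stand-in for the per-course work inside download_all's loop:

from multiprocessing import Process

def scrape_course(href, title):
    # hypothetical stand-in for the body of download_all's loop
    print("scraping", title, "from", href)

if __name__ == '__main__':
    courses = {"/course-a": "Course A", "/course-b": "Course B"}  # example data only
    p = []
    for href, title in courses.items():
        proc = Process(target=scrape_course, args=(href, title))
        proc.start()
        p.append(proc)
    for proc in p:
        proc.join()  # wait for every course before moving on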