mirror of
https://github.com/KevinMidboe/ITLscraper.git
synced 2025-10-29 09:40:13 +00:00
add slugify to handle special characters
This commit is contained in:
@@ -11,13 +11,14 @@ import getpass
|
|||||||
from multiprocessing import Process
|
from multiprocessing import Process
|
||||||
from bs4 import BeautifulSoup as bs
|
from bs4 import BeautifulSoup as bs
|
||||||
from getpass import getpass
|
from getpass import getpass
|
||||||
|
from slugify import slugify
|
||||||
|
|
||||||
|
|
||||||
base_url = "https://ntnu.itslearning.com/"
|
base_url = "https://ntnu.itslearning.com/"
|
||||||
folder_url = "https://ntnu.itslearning.com/Folder/processfolder.aspx?FolderID="
|
folder_url = "https://ntnu.itslearning.com/Folder/processfolder.aspx?FolderID="
|
||||||
|
|
||||||
def make_folder(curpath, title):
|
def make_folder(curpath, title):
|
||||||
|
title = slugify(title)
|
||||||
folder_path = os.path.join(curpath,title)
|
folder_path = os.path.join(curpath,title)
|
||||||
#print("making dir:",folder_path)
|
#print("making dir:",folder_path)
|
||||||
if not os.path.exists(folder_path):
|
if not os.path.exists(folder_path):
|
||||||
@@ -146,6 +147,7 @@ class itslearning_scraper():
|
|||||||
filename = title
|
filename = title
|
||||||
self.failure += 1
|
self.failure += 1
|
||||||
print("File created with name:",filename)
|
print("File created with name:",filename)
|
||||||
|
filename = slugify(filename)
|
||||||
filename = os.path.join(os.path.abspath(os.path.curdir),filename)
|
filename = os.path.join(os.path.abspath(os.path.curdir),filename)
|
||||||
with open(filename, 'wb') as f:
|
with open(filename, 'wb') as f:
|
||||||
for chunk in r.iter_content(chunk_size=1024):
|
for chunk in r.iter_content(chunk_size=1024):
|
||||||
@@ -179,6 +181,7 @@ class itslearning_scraper():
|
|||||||
if text:
|
if text:
|
||||||
title = three.find("span", {"id":"ctl05_TT"}).contents[0]
|
title = three.find("span", {"id":"ctl05_TT"}).contents[0]
|
||||||
text = self.html2text.handle(str(text))
|
text = self.html2text.handle(str(text))
|
||||||
|
title = slugify(title)
|
||||||
fp = os.path.join(os.path.abspath(os.path.curdir), title)
|
fp = os.path.join(os.path.abspath(os.path.curdir), title)
|
||||||
#convert to md?
|
#convert to md?
|
||||||
with open(fp+".md", "wb") as note:
|
with open(fp+".md", "wb") as note:
|
||||||
@@ -208,7 +211,8 @@ class itslearning_scraper():
|
|||||||
print(section_link)
|
print(section_link)
|
||||||
target = section_link.get("href")
|
target = section_link.get("href")
|
||||||
print(target)
|
print(target)
|
||||||
fp = os.path.join(os.path.abspath(os.path.curdir), "".join(target.split('/')))
|
|
||||||
|
fp = os.path.join(os.path.abspath(os.path.curdir), slugify("".join(target.split('/'))))
|
||||||
print("filepath:", fp)
|
print("filepath:", fp)
|
||||||
with open(fp+".url", "wb") as shortcut:
|
with open(fp+".url", "wb") as shortcut:
|
||||||
shortcut.write(b'[InternetShortcut]\n')
|
shortcut.write(b'[InternetShortcut]\n')
|
||||||
@@ -222,6 +226,7 @@ class itslearning_scraper():
|
|||||||
text = three.find("div", {"class":"h-userinput"})
|
text = three.find("div", {"class":"h-userinput"})
|
||||||
print(text.contents[0])
|
print(text.contents[0])
|
||||||
text = self.html2text.handle(str(text))
|
text = self.html2text.handle(str(text))
|
||||||
|
title = slugify(title)
|
||||||
fp = os.path.join(os.path.abspath(os.path.curdir), title)
|
fp = os.path.join(os.path.abspath(os.path.curdir), title)
|
||||||
#convert to md?
|
#convert to md?
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -3,3 +3,4 @@ MechanicalSoup==0.7.0
|
|||||||
html2text==2016.9.19
|
html2text==2016.9.19
|
||||||
requests==2.13.0
|
requests==2.13.0
|
||||||
appJar==0.52
|
appJar==0.52
|
||||||
|
python-slugify==1.2.4
|
||||||
|
|||||||
Reference in New Issue
Block a user