add slugify to handle special characters

This commit is contained in:
sigvartmh
2017-05-14 00:47:17 +02:00
parent b1e236e445
commit 9223a837cd
2 changed files with 8 additions and 2 deletions

View File

@@ -11,13 +11,14 @@ import getpass
from multiprocessing import Process
from bs4 import BeautifulSoup as bs
from getpass import getpass
from slugify import slugify
base_url = "https://ntnu.itslearning.com/"
folder_url = "https://ntnu.itslearning.com/Folder/processfolder.aspx?FolderID="
def make_folder(curpath, title):
    title = slugify(title)
    folder_path = os.path.join(curpath,title)
    #print("making dir:",folder_path)
    if not os.path.exists(folder_path):
@@ -146,6 +147,7 @@ class itslearning_scraper():
filename = title
self.failure += 1
print("File created with name:",filename)
filename = slugify(filename)
filename = os.path.join(os.path.abspath(os.path.curdir),filename)
with open(filename, 'wb') as f:
    for chunk in r.iter_content(chunk_size=1024):
@@ -179,6 +181,7 @@ class itslearning_scraper():
if text:
    title = three.find("span", {"id":"ctl05_TT"}).contents[0]
    text = self.html2text.handle(str(text))
    title = slugify(title)
    fp = os.path.join(os.path.abspath(os.path.curdir), title)
    #convert to md?
    with open(fp+".md", "wb") as note:
@@ -208,7 +211,8 @@ class itslearning_scraper():
print(section_link)
target = section_link.get("href")
print(target)
fp = os.path.join(os.path.abspath(os.path.curdir), "".join(target.split('/')))
fp = os.path.join(os.path.abspath(os.path.curdir), slugify("".join(target.split('/'))))
print("filepath:", fp)
with open(fp+".url", "wb") as shortcut:
    shortcut.write(b'[InternetShortcut]\n')
@@ -222,6 +226,7 @@ class itslearning_scraper():
text = three.find("div", {"class":"h-userinput"})
print(text.contents[0])
text = self.html2text.handle(str(text))
title = slugify(title)
fp = os.path.join(os.path.abspath(os.path.curdir), title)
#convert to md?
try:

View File

@@ -3,3 +3,4 @@ MechanicalSoup==0.7.0
html2text==2016.9.19
requests==2.13.0
appJar==0.52
python-slugify==1.2.4