mtayseer · berdario · Nov 4, 2014
diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@
 ## Installation
 On Windows, just download [this file](dist/infoq_downloader.exe?raw=true)
 
-On Linux, run the following
+On Linux, ensure you have the development headers needed by lxml (`sudo apt-get install libxml2-dev libxslt1-dev` on Ubuntu) and run the following:
 
 ```sh
 git clone https://github.com/mtayseer/infoq-downloader.git

diff --git a/infoq_downloader.py b/infoq_downloader.py
@@ -2,6 +2,7 @@
 
 from __future__ import division, print_function
 import os
+from pathlib import Path
 import sys
 import re
 import argparse
@@ -15,8 +16,9 @@
 else:
     text_type = unicode
 
+
 # Some settings
-download_directory = 'downloads'
+download_directory = Path('downloads')
 cleanup_elements = [
     '#footer', '#header', '#topInfo', '.share_this', '.random_links',
     '.vendor_vs_popular', '.bottomContent', '#id_300x250_banner_top',
@@ -45,9 +47,11 @@
 # Start downloading
 print('Downloading HTML file')
 
-content = requests.get(url, headers={'User-Agent': user_agent}).content
+content = requests.get(url, headers={'User-Agent': user_agent}).text
 html_doc = lxml.html.fromstring(content)
 title = html_doc.find(".//title").text
+if not isinstance(title, text_type):
+    title = title.decode() # lxml python2 fix
 video_url = html_doc.cssselect('video > source')[0].attrib['src']
 video_file = os.path.split(video_url)[1]
 html_doc.cssselect('video > source')[0].attrib['src'] = video_file
@@ -63,67 +67,60 @@
 slides = slides_re.findall(content)
 
 # Create a directory for the downloaded presentation if it doesn't exist
-if not os.path.exists(download_directory):
-    os.makedirs(download_directory)
 
-# presentation folder path
-if isinstance(title, text_type):
-    normalized_title = unicodedata.normalize('NFKD', title)
-else:
-    normalized_title = text_type(title)
-presentation_directory = os.path.join(download_directory, normalized_title)
-# Create a folder with the name of the presentation
-if not os.path.exists(presentation_directory):
-    os.makedirs(presentation_directory)
+def sanitize_path(name):  # return `name` NFKD-normalized; on win32 also filter the characters listed below
+    sanitized = unicodedata.normalize('NFKD', name)  # Unicode compatibility decomposition
+    if sys.platform == 'win32':  # drop control characters (ord <= 31) and characters reserved in Windows file names
+        sanitized = u''.join(c for c in sanitized if ord(c) > 31 and c not in u'<>:"/\\|?*')
+    return sanitized
+
+presentation_directory = download_directory / sanitize_path(title)
+slides_dir = presentation_directory / 'slides'
 
 # Create a slides folder inside the presentation folder
-if not os.path.exists('{}/slides'.format(presentation_directory)):
-    os.makedirs('{}/slides'.format(presentation_directory))
+if not slides_dir.exists():
+    slides_dir.mkdir(parents=True)
+
 
 #Write content
 content = re.sub(r"/resource/presentations/[^']*?/en/", '', content)
-with open('{}/index.html'.format(presentation_directory), 'w') as f:
+with (presentation_directory / 'index.html').open('w') as f:
     f.write(content)
-    f.flush()
 
 # Download slides
-slides_dir = os.path.join(presentation_directory, 'slides')
-if not os.path.isdir(slides_dir):
-    os.makedirs(slides_dir)
 for i, slide in enumerate(slides):
-    filename = os.path.split(slide)[1]
-    full_path = os.path.join(slides_dir, '{0}'.format(filename))
-    if os.path.exists(full_path):
+    full_path = slides_dir / os.path.split(slide)[1]
+    if full_path.exists():
         continue
     print('\rDownloading slide {0} of {1}'.format(i+1, len(slides)), end='')
     sys.stdout.flush()  # Hack for Python 2
     url = 'http://www.infoq.com{0}'.format(slide)
-    with open(full_path, 'wb') as f:
+
+    with full_path.open('wb') as f:
         f.write(requests.get(url).content)
 
 print()
 
 # If the video file is already downloaded successfully, don't do anything else
-if os.path.exists(video_file):
+video_path = presentation_directory / video_file
+if video_path.exists():
     print('Video file already exists')
     sys.exit()
 
 # Download the video file. stream=True here is important to allow me to iterate
 # over content
-downloaded_file = os.path.join(
-    presentation_directory, '{}.part'.format(video_file)
-)
+downloaded_file = presentation_directory / '{}.part'.format(video_file)
 
-if os.path.exists(downloaded_file):
-    bytes_downloaded = os.stat(downloaded_file).st_size
+if downloaded_file.exists():
+    bytes_downloaded = downloaded_file.stat().st_size
 else:
     bytes_downloaded = 0
 
 r = requests.get(video_url, stream=True,
                  headers={'Range': 'bytes={0}-'.format(bytes_downloaded)})
 content_length = int(r.headers['content-length']) + bytes_downloaded
 
-with open(downloaded_file, 'ab') as f:
+with downloaded_file.open('ab') as f:
     for chunk in r.iter_content(10 * 1024):
         f.write(chunk)
         f.flush()
@@ -135,5 +132,4 @@
         print('\rDownloading video {0:.2f}%'.format(percent), end='')
         sys.stdout.flush()  # Hack for Python 2
 
-final_video_name = os.path.join(presentation_directory, video_file)
-os.rename(downloaded_file, final_video_name)
+downloaded_file.rename(video_path)
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
 requests
 lxml
-cssselect
+cssselect
+pathlib