diff --git a/README.md b/README.md index e26ea75..ba5709e 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ ## Installation On Windows, just download [this file](dist/infoq_downloader.exe?raw=true) -On Linux, run the following +On Linux, ensure you have the development headers needed by lxml (`sudo apt-get install libxml2-dev libxslt1-dev` on Ubuntu) and run the following: ```sh git clone https://github.com/mtayseer/infoq-downloader.git diff --git a/infoq_downloader.py b/infoq_downloader.py index c8f3bbb..dfcec48 100644 --- a/infoq_downloader.py +++ b/infoq_downloader.py @@ -2,6 +2,7 @@ from __future__ import division, print_function import os +from pathlib import Path import sys import re import argparse @@ -15,8 +16,9 @@ else: text_type = unicode + # Some settings -download_directory = 'downloads' +download_directory = Path('downloads') cleanup_elements = [ '#footer', '#header', '#topInfo', '.share_this', '.random_links', '.vendor_vs_popular', '.bottomContent', '#id_300x250_banner_top', @@ -45,9 +47,11 @@ # Start downloading print('Downloading HTML file') -content = requests.get(url, headers={'User-Agent': user_agent}).content +content = requests.get(url, headers={'User-Agent': user_agent}).text html_doc = lxml.html.fromstring(content) title = html_doc.find(".//title").text +if not isinstance(title, text_type): + title = title.decode() # lxml python2 fix video_url = html_doc.cssselect('video > source')[0].attrib['src'] video_file = os.path.split(video_url)[1] html_doc.cssselect('video > source')[0].attrib['src'] = video_file @@ -63,59 +67,52 @@ slides = slides_re.findall(content) # Create a directory for the downloaded presentation if it doesn't exist -if not os.path.exists(download_directory): - os.makedirs(download_directory) -# presentation folder path -if isinstance(title, text_type): - normalized_title = unicodedata.normalize('NFKD', title) -else: - normalized_title = text_type(title) -presentation_directory = os.path.join(download_directory, 
normalized_title) -# Create a folder with the name of the presentation -if not os.path.exists(presentation_directory): - os.makedirs(presentation_directory) +def sanitize_path(name): + sanitized = unicodedata.normalize('NFKD', name) + if sys.platform == 'win32': + sanitized = u''.join(c for c in sanitized if ord(c) > 31 and c not in u'<>:"/\|?*') + return sanitized + +presentation_directory = download_directory / sanitize_path(title) +slides_dir = presentation_directory / 'slides' # Create a slides folder inside the presentation folder -if not os.path.exists('{}/slides'.format(presentation_directory)): - os.makedirs('{}/slides'.format(presentation_directory)) +if not slides_dir.exists(): + slides_dir.mkdir(parents=True) + #Write content content = re.sub(r"/resource/presentations/[^']*?/en/", '', content) -with open('{}/index.html'.format(presentation_directory), 'w') as f: +with (presentation_directory / 'index.html').open('w') as f: f.write(content) - f.flush() # Download slides -slides_dir = os.path.join(presentation_directory, 'slides') -if not os.path.isdir(slides_dir): - os.makedirs(slides_dir) for i, slide in enumerate(slides): - filename = os.path.split(slide)[1] - full_path = os.path.join(slides_dir, '{0}'.format(filename)) - if os.path.exists(full_path): + full_path = slides_dir / os.path.split(slide)[1] + if full_path.exists(): continue print('\rDownloading slide {0} of {1}'.format(i+1, len(slides)), end='') sys.stdout.flush() # Hack for Python 2 url = 'http://www.infoq.com{0}'.format(slide) - with open(full_path, 'wb') as f: + + with full_path.open('wb') as f: f.write(requests.get(url).content) print() # If the video file is already downloaded successfully, don't do anything else -if os.path.exists(video_file): +video_path = presentation_directory / video_file +if video_path.exists(): print('Video file already exists') sys.exit() # Download the video file. 
stream=True here is important to allow me to iterate # over content -downloaded_file = os.path.join( - presentation_directory, '{}.part'.format(video_file) -) +downloaded_file = presentation_directory / '{}.part'.format(video_file) -if os.path.exists(downloaded_file): - bytes_downloaded = os.stat(downloaded_file).st_size +if downloaded_file.exists(): + bytes_downloaded = downloaded_file.stat().st_size else: bytes_downloaded = 0 @@ -123,7 +120,7 @@ headers={'Range': 'bytes={0}-'.format(bytes_downloaded)}) content_length = int(r.headers['content-length']) + bytes_downloaded -with open(downloaded_file, 'ab') as f: +with downloaded_file.open('ab') as f: for chunk in r.iter_content(10 * 1024): f.write(chunk) f.flush() @@ -135,5 +132,4 @@ print('\rDownloading video {0:.2f}%'.format(percent), end='') sys.stdout.flush() # Hack for Python 2 -final_video_name = os.path.join(presentation_directory, video_file) -os.rename(downloaded_file, final_video_name) +downloaded_file.rename(video_path) diff --git a/requirements.txt b/requirements.txt index 5cdb6d9..d90ed41 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ requests lxml -cssselect \ No newline at end of file +cssselect +pathlib