forked from mtayseer/infoq-downloader
-
Notifications
You must be signed in to change notification settings - Fork 0
/
infoq_downloader.py
139 lines (117 loc) · 4.73 KB
/
infoq_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env python
from __future__ import division, print_function
import os
import sys
import re
import argparse
import requests
import cssselect
import lxml.html
import unicodedata
# Python 2/3 compatibility: name the interpreter's unicode text type.
# The conditional expression only evaluates `unicode` when not on Python 3.
text_type = str if sys.version_info.major == 3 else unicode
# Settings
# Folder (relative to the working directory) that receives the downloads
download_directory = 'downloads'

# CSS selectors of the page elements stripped from the saved HTML copy
cleanup_elements = [
    '#footer',
    '#header',
    '#topInfo',
    '.share_this',
    '.random_links',
    '.vendor_vs_popular',
    '.bottomContent',
    '#id_300x250_banner_top',
    '.presentation_type',
    '#conference',
    '#imgPreload',
    '#text_height_fix_box',
    '.download_presentation',
    '.recorded',
    'script[async]',
    'script[src*=addthis]',
]
# Command-line interface: a single positional argument, the presentation URL
parser = argparse.ArgumentParser(description='Download InfoQ presentations.')
parser.add_argument(
    'url',
    metavar='URL',
    type=str,
    help='URL of the presentation to download',
)
# Parse the arguments passed to the script and keep the URL at hand
args = parser.parse_args()
url = args.url
# Tell infoq that I'm an iPad, so it gives me simpler HTML to parse & an mp4
# file to download.
# The value must end right after the Safari version: a trailing "')" here
# would be sent to the server as part of the header.
user_agent = (
    "Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) "
    "AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b "
    "Safari/531.21.10"
)
# Start downloading
print('Downloading HTML file')
response = requests.get(url, headers={'User-Agent': user_agent})
# Stop on an HTTP error status instead of parsing the error page
response.raise_for_status()
content = response.content
html_doc = lxml.html.fromstring(content)

# A page without a <title> yields None here rather than an AttributeError
title_element = html_doc.find(".//title")
title = title_element.text if title_element is not None else None

# Look the <source> element up once: its src is read, then rewritten
video_sources = html_doc.cssselect('video > source')
if not video_sources:
    sys.exit('No video source found in {0}'.format(url))
video_source = video_sources[0]
video_url = video_source.attrib['src']
video_file = os.path.split(video_url)[1]
# Make the saved page reference the video by its bare file name
video_source.attrib['src'] = video_file
# Clean the page: remove every element matched by the cleanup selectors
for elt in html_doc.cssselect(', '.join(cleanup_elements)):
    elt.getparent().remove(elt)
# Resetting the wrapper background is cosmetic, so a page without a #wrapper
# element must not abort the download (the slice yields zero or one element)
for wrapper in html_doc.cssselect('#wrapper')[:1]:
    wrapper.attrib['style'] = 'background: none'
content = lxml.html.tostring(html_doc).decode('utf-8')
# Collect the single-quoted slide paths referenced in the page; group 1 of
# each match is the path without the surrounding quotes
slides_re = re.compile(r"'(/resource/presentations/[^']*?/en/slides/[^']*?)'")
slides = [match.group(1) for match in slides_re.finditer(content)]
def _safe_directory_name(name):
    """Return *name* made usable as a single folder name.

    Path separators are replaced with '_' so the name maps to exactly one
    folder level. On Windows the reserved characters < > : " | ? * are
    replaced as well, since they are not allowed in file names there.
    """
    for separator in (os.sep, os.altsep):
        if separator:  # os.altsep is None on POSIX
            name = name.replace(separator, '_')
    if os.name == 'nt':
        name = re.sub(r'[<>:"|?*]', '_', name)
    return name


# presentation folder path, named after the page title
if isinstance(title, text_type):
    normalized_title = unicodedata.normalize('NFKD', title)
else:
    normalized_title = text_type(title)
presentation_directory = os.path.join(
    download_directory, _safe_directory_name(normalized_title)
)
# os.makedirs creates the missing parents too, so one call covers the
# download directory, the presentation folder and its slides folder
slides_directory = os.path.join(presentation_directory, 'slides')
if not os.path.isdir(slides_directory):
    os.makedirs(slides_directory)
# Write content: drop the remote presentation prefix from the page so the
# remaining paths are relative, then store the page as index.html
remote_prefix_re = re.compile(r"/resource/presentations/[^']*?/en/")
content = remote_prefix_re.sub('', content)
index_path = '{}/index.html'.format(presentation_directory)
with open(index_path, 'w') as f:
    f.write(content)
    f.flush()
# Download slides
slides_dir = os.path.join(presentation_directory, 'slides')
if not os.path.isdir(slides_dir):
    os.makedirs(slides_dir)
for i, slide in enumerate(slides):
    filename = os.path.split(slide)[1]
    full_path = os.path.join(slides_dir, filename)
    # Slides already on disk are not fetched again
    if os.path.exists(full_path):
        continue
    print('\rDownloading slide {0} of {1}'.format(i+1, len(slides)), end='')
    sys.stdout.flush()  # Hack for Python 2
    # A dedicated name keeps the presentation URL in `url` intact
    slide_url = 'http://www.infoq.com{0}'.format(slide)
    slide_response = requests.get(slide_url)
    if not slide_response.ok:
        # Saving an error page under the slide's name would make the
        # exists-check above skip this slide on every later run
        print('\nCould not download {0} (HTTP {1})'.format(
            slide_url, slide_response.status_code))
        continue
    # The file is only created once the slide has been fetched successfully
    with open(full_path, 'wb') as f:
        f.write(slide_response.content)
# Terminate the progress line, which is printed without a newline
print()
# If the video file is already downloaded successfully, don't do anything else.
# The finished video is stored inside the presentation folder, so it has to be
# looked up there and not in the current working directory.
if os.path.exists(os.path.join(presentation_directory, video_file)):
    print('Video file already exists')
    sys.exit()
# Download the video file into a '.part' file and rename it when finished.
# stream=True here is important to allow me to iterate over content
downloaded_file = os.path.join(
    presentation_directory, '{}.part'.format(video_file)
)
# Resume from the size of an existing partial download, if there is one
if os.path.exists(downloaded_file):
    bytes_downloaded = os.stat(downloaded_file).st_size
else:
    bytes_downloaded = 0
r = requests.get(video_url, stream=True,
                 headers={'Range': 'bytes={0}-'.format(bytes_downloaded)})
if r.status_code == 416 and bytes_downloaded:
    # 416 (Range Not Satisfiable) on a resume is taken to mean that the
    # partial file already holds the whole video: nothing left to fetch
    r.close()
else:
    # Never write the body of an error response into the video file
    r.raise_for_status()
    if r.status_code != 206:
        # The server ignored the Range header and sends the complete file;
        # start over instead of appending it to the partial download
        bytes_downloaded = 0
    # Content-Length covers only the remaining bytes; without the header the
    # total size is unknown and no percentage is shown
    remaining = r.headers.get('content-length')
    if remaining is None:
        content_length = None
    else:
        content_length = int(remaining) + bytes_downloaded
    with open(downloaded_file, 'ab' if bytes_downloaded else 'wb') as f:
        for chunk in r.iter_content(10 * 1024):
            f.write(chunk)
            f.flush()
            bytes_downloaded += len(chunk)
            if content_length:
                # \r returns the cursor to the beginning of the line and
                # end='' suppresses the newline, so the progress stays on a
                # single line
                percent = bytes_downloaded / content_length * 100
                print('\rDownloading video {0:.2f}%'.format(percent), end='')
                sys.stdout.flush()  # Hack for Python 2
    # Terminate the progress line
    print()
final_video_name = os.path.join(presentation_directory, video_file)
os.rename(downloaded_file, final_video_name)