forked from victor-o-silva/game-programming-patterns-epub
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_epub.py
More file actions
165 lines (132 loc) · 5.6 KB
/
create_epub.py
File metadata and controls
165 lines (132 loc) · 5.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# std lib
import mimetypes
import os
from urllib.parse import urljoin, urlparse

# third party
import requests
from bs4 import BeautifulSoup
from ebooklib import epub
# Local paths: the directory holding this script, and the "epubs"
# sub-directory inside it.
BASE_PATH = os.path.dirname(os.path.abspath(__file__))
EPUB_PATH = os.path.join(BASE_PATH, 'epubs')

# Remote locations: the book's base URL, the bare "scheme://host" of its
# server, and the table-of-contents page.
BOOK_BASE_URL = 'http://gameprogrammingpatterns.com'
parsed_url = urlparse(BOOK_BASE_URL)
BOOK_SERVER_URL = '{}://{}'.format(parsed_url.scheme, parsed_url.netloc)
BOOK_TOC_URL = urljoin(BOOK_BASE_URL, '/contents.html')
def get_index_links():
    """Fetch links from the index of the book.

    Return a list of sections, where each section contains a list of its
    links. Every link is a dict with two keys: ``title`` (the anchor text)
    and ``url`` (the anchor's href resolved against BOOK_BASE_URL).

    Raises ``requests.HTTPError`` if the table-of-contents page does not
    come back with a successful status.
    """
    print('Fetching links from index')
    # A timeout keeps an unresponsive server from hanging the script forever.
    response = requests.get(BOOK_TOC_URL, timeout=30)
    response.raise_for_status()
    # Hand BeautifulSoup the raw bytes: ``from_encoding`` is ignored (with a
    # warning) when the markup is an already-decoded str, and
    # ``response.text`` may have been decoded with a wrongly guessed charset.
    soup = BeautifulSoup(response.content, 'lxml', from_encoding='UTF-8')

    sections = []
    # Each top-level item of the roman-numbered list is one section.
    for section in soup.select('ol[type=I] > li'):
        sections.append([
            {
                'title': anchor.get_text(),
                'url': urljoin(BOOK_BASE_URL, anchor['href']),
            }
            for anchor in section.select('a[href]')
        ])
    return sections
def _fetch_chapter_images(content, page_url, section_index, link_index):
    """Download the images of one chapter and rewrite its <img> tags.

    ``content`` is the parsed chapter element; it is modified in place so
    that every ``<img src>`` points at the file name used inside the epub
    (or at an empty string when the download did not return HTTP 200).
    ``page_url`` is the chapter's own URL, used to resolve image paths.

    Return the list of ``epub.EpubItem`` objects created for the images.
    """
    image_items = []
    for img_index, img_tag in enumerate(content.select('img[src]')):
        original_src = img_tag['src']
        # urljoin leaves absolute URLs alone and resolves both
        # server-absolute and relative paths against the chapter's URL.
        img_url = urljoin(page_url, original_src)

        # Take the extension from the URL path only, so a query string or
        # fragment never ends up in the file name.
        img_extension = os.path.splitext(urlparse(img_url).path)[1].lstrip('.')
        img_file_name = 's{:02d}_c{:02d}_i{:02d}.{}'.format(
            section_index, link_index, img_index, img_extension
        )

        print(' - Fetching image {}'.format(original_src))
        response = requests.get(img_url, timeout=30)
        if response.status_code != 200:
            # Best effort: a missing image must not abort the whole book.
            img_tag['src'] = ''
            continue

        # Derive a real MIME type (e.g. jpg -> image/jpeg); fall back to
        # "image/<extension>" for extensions mimetypes does not know.
        media_type = (mimetypes.guess_type(img_file_name)[0]
                      or 'image/{}'.format(img_extension))
        image_items.append(epub.EpubItem(
            file_name=img_file_name,
            media_type=media_type,
            content=response.content
        ))
        # Point the tag at the image item we just created.
        img_tag['src'] = img_file_name
    return image_items


def fetch_links_contents(sections):
    """Fetch data for the links in the sections and update them.

    ``sections`` is a list of lists of link dicts, each having at least the
    keys ``title`` and ``url``. Every link dict is updated in place with:

    - ``content``: the chapter's HTML, wrapped in a minimal document;
    - ``file_name``: the chapter's file name inside the epub;
    - ``images_items``: the ``epub.EpubItem`` objects for its images.

    Raises ``requests.HTTPError`` if a chapter page does not come back with
    a successful status, and ``IndexError`` if a page has no ``div.content``.
    """
    for section_index, section in enumerate(sections):
        for link_index, link in enumerate(section):
            print('Fetching chapter "{}"'.format(link['title']))

            # Fetch content. Parse the raw bytes: ``from_encoding`` is
            # ignored when BeautifulSoup receives an already-decoded str.
            response = requests.get(link['url'], timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'lxml',
                                 from_encoding='UTF-8')
            content = soup.select_one('div.content')
            if content is None:
                raise IndexError(
                    'no div.content found in {}'.format(link['url'])
                )

            # Remove <nav>s
            for nav in content.select('nav'):
                nav.extract()

            # Replace <a>s with their text
            for anchor in content.select('a'):
                anchor.replace_with(anchor.get_text())

            # Fetch images' contents and create image items
            chapter_images = _fetch_chapter_images(
                content, link['url'], section_index, link_index
            )

            # Update link with content, file name and images
            html = ('<html><head><meta charset="UTF-8"></head><body>'
                    '{}</body></html>'.format(content.prettify()))
            # ".xhtml" matches the application/xhtml+xml media type the
            # chapter is registered with when the book is assembled.
            file_name = 's{:02d}_c{:02d}.xhtml'.format(section_index,
                                                       link_index)
            link.update({
                'content': html,
                'file_name': file_name,
                'images_items': chapter_images
            })
def create_book(sections):
    """Receive the sections list and create the epub file.

    ``sections`` is a list of lists of link dicts. Every link must carry the
    keys ``title``, ``file_name``, ``content`` and ``images_items``. The book
    is written to ``game-programming-patterns.epub`` inside EPUB_PATH, which
    is created if it does not exist yet.
    """
    print('Creating ebook...')
    book = epub.EpubBook()

    # set metadata
    book.set_identifier('gpp')
    book.set_title('Game Programming Patterns')
    book.set_language('en')
    book.add_author('Robert Nystrom')

    # create chapters
    chapters = []
    for section in sections:
        for link_index, link in enumerate(section):
            title = link['title']
            if link_index > 0:
                # Every link after a section's first one gets a " - " prefix
                # so it reads as a sub-entry in the flat table of contents.
                title = ' - {}'.format(title)
            chapter = epub.EpubHtml(title=title,
                                    file_name=link['file_name'],
                                    media_type='application/xhtml+xml',
                                    content=link['content'])
            book.add_item(chapter)
            chapters.append(chapter)
            for image_item in link['images_items']:
                book.add_item(image_item)

    # book's Table of contents
    book.toc = chapters

    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # book's spine
    book.spine = chapters

    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(EPUB_PATH, exist_ok=True)
    file_path = os.path.join(EPUB_PATH, 'game-programming-patterns.epub')
    epub.write_epub(file_path, book, {})
    print('Book created: {}'.format(file_path))
def generate():
    """Run the whole pipeline: read the index, download it, write the epub."""
    book_sections = get_index_links()
    # The link dicts are filled in place, so the same list is passed on.
    fetch_links_contents(book_sections)
    create_book(book_sections)
# Build the book only when this file is run as a script, not when imported.
if __name__ == "__main__":
    generate()