-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraped2epub.py
89 lines (71 loc) · 3.28 KB
/
scraped2epub.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from ebooklib import epub
import os
from bs4 import BeautifulSoup
# When true, cv2 is imported and every chapter image is resized and
# re-encoded as JPEG before being embedded; when false, image files are
# embedded exactly as read from disk.
OPTIMIZE_IMAGE = True
# Upper bound used by calc_ideal_size. Despite the name, it is applied to
# the image height as well as the width.
MAXIMAL_IMAGE_WIDTH = 800
# Value passed to OpenCV as cv2.IMWRITE_JPEG_QUALITY when re-encoding images.
JPEG_QUALITY = 70
def calc_ideal_size(width, height, max_side=None):
    """Return a (width, height) pair scaled down to fit a square bounding box.

    The aspect ratio is preserved. Images that already fit are returned
    unchanged (apart from the conversion to ``int``).

    Args:
        width: Source image width.
        height: Source image height.
        max_side: Largest allowed value for either dimension. Defaults to
            the module-level ``MAXIMAL_IMAGE_WIDTH``.

    Returns:
        A tuple ``(width, height)`` of ints, each at least 1.
    """
    if max_side is None:
        max_side = MAXIMAL_IMAGE_WIDTH

    # First bring the width inside the limit, scaling the height with it.
    if width > max_side:
        ratio = max_side / width
        width = max_side
        height = height * ratio

    # The height may still be too large (portrait images); shrink again.
    if height > max_side:
        ratio = max_side / height
        height = max_side
        width = width * ratio

    # Truncation can produce 0 for extreme aspect ratios, which is not a
    # valid image dimension, so clamp both sides to at least one pixel.
    return max(1, int(width)), max(1, int(height))
# OpenCV is only used on the image-optimisation path, so it is imported
# only when that path is enabled.
if OPTIMIZE_IMAGE:
    import cv2

book = epub.EpubBook()

# set metadata
book.set_identifier('chinadaily_scraper')
book.set_title('China Daily')
book.set_language('en')
book.add_author('China Daily Editor')
book.add_author('Tsumugi Shirogane as scraper script developer')

# cover: read through a context manager so the file handle is closed
# as soon as the bytes are in memory
with open('chinadailycover.jpg', 'rb') as cover_file:
    image_content = cover_file.read()
book.set_cover('cover.jpg', image_content, True)

# disclaimer page; it is the first entry in both the TOC and the spine
chapter_Disclaimer = epub.EpubHtml(title='Disclaimer', file_name='disclaimer.xhtml', lang='en')
chapter_Disclaimer.content=u"""
<h1>Disclaimer</h1>
<p> this ebook is generated by a scraper script, which is not affiliated with China Daily. </p>
<p> China Daily Copyright: Copyright 1995 - 2022 . All rights reserved. The content (including but not limited to text, photo, multimedia information, etc) published in this site belongs to China Daily Information Co (CDIC). Without written authorization from CDIC, such content shall not be republished or used in any form. </p>
<p> This scraper script can found on <a href="https://github.com/inapp123/chinadaily2epub">https://github.com/inapp123/chinadaily2epub</a></p>
"""
book.add_item(chapter_Disclaimer)

# define Table Of Contents
book.toc = [epub.Link("disclaimer.xhtml" ,"Disclaimer", "disclaimer")]

# basic spine
book.spine = ['nav',chapter_Disclaimer]
# add chapters
def _read_image_bytes(path):
    """Return the bytes to embed in the book for the image file at *path*.

    With OPTIMIZE_IMAGE enabled the image is resized to the dimensions
    given by calc_ideal_size and re-encoded as JPEG at JPEG_QUALITY.
    If OpenCV cannot decode or encode the image, or optimisation is
    disabled, the file's raw bytes are returned instead.

    Raises:
        OSError: If the raw file has to be read and cannot be opened.
    """
    if OPTIMIZE_IMAGE:
        imgmat = cv2.imread(path)
        # cv2.imread signals failure by returning None rather than raising.
        if imgmat is not None:
            dstwidth, dstheight = calc_ideal_size(imgmat.shape[1], imgmat.shape[0])
            imgmat = cv2.resize(imgmat, (dstwidth, dstheight))
            encoded_ok, encoded = cv2.imencode(
                '.jpg', imgmat, [cv2.IMWRITE_JPEG_QUALITY, JPEG_QUALITY]
            )
            if encoded_ok:
                return encoded.tobytes()
    with open(path, 'rb') as image_file:
        return image_file.read()


# Image sources already added to the book; the same picture may be
# referenced more than once and must only be added to the book once.
_added_images = set()

# Sort the listing so the chapter order is deterministic across runs.
for filename in sorted(os.listdir("data")):
    chapter_path = os.path.join("data", filename)
    # "img" (and any other sub-directory) holds assets, not chapters.
    if filename == "img" or not os.path.isfile(chapter_path):
        continue

    with open(chapter_path, "r", encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Pages without a heading are not treated as chapters.
    heading = soup.find(name="h1")
    if heading is None:
        continue
    title = heading.get_text()

    chapter = epub.EpubHtml(title=title, file_name=filename + ".xhtml", lang='en')
    chapter.content = str(soup.find(name="body"))

    for img in soup.find_all(name="img"):
        imgsrc = img.get('src')
        if not imgsrc or imgsrc in _added_images:
            continue
        _added_images.add(imgsrc)

        # The src may use backslashes as separators; translate them so the
        # file is also found on platforms whose separator is "/".
        local_path = os.path.join("data", imgsrc.replace("\\", os.sep))
        image_content = _read_image_bytes(local_path)

        # Use the file name as the item id, whichever separator the src uses.
        image_uid = os.path.basename(imgsrc.replace("\\", "/")) or imgsrc
        book.add_item(epub.EpubItem(uid=image_uid, file_name=imgsrc, media_type='image/jpeg', content=image_content))

    book.add_item(chapter)
    book.toc.append(epub.Link(filename + ".xhtml", title, filename))
    book.spine.append(chapter)
# register the default NCX and Nav navigation documents with the book
for navigation_item in (epub.EpubNcx(), epub.EpubNav()):
    book.add_item(navigation_item)

# serialise the assembled book to disk
epub.write_epub('chinadaily.epub', book, {})