# scraped2epub.py

from ebooklib import epub
import os
from bs4 import BeautifulSoup


# When True, cv2 is imported and chapter images are resized and re-encoded
# as JPEG before being embedded; when False the image files are embedded as-is.
OPTIMIZE_IMAGE = True
# Upper bound in pixels used by calc_ideal_size; despite the name it is
# applied to the height as well as the width.
MAXIMAL_IMAGE_WIDTH = 800
# Value passed as cv2.IMWRITE_JPEG_QUALITY when images are re-encoded.
JPEG_QUALITY = 70

def calc_ideal_size(width, height, max_size=None):
    """Return a (width, height) that fits inside a square bounding box.

    The aspect ratio is preserved: the width is shrunk to the limit first,
    then the height is checked against the same limit and both sides are
    shrunk again if needed. Images already inside the box are returned
    unchanged (apart from conversion to ``int``).

    Args:
        width: Source width in pixels.
        height: Source height in pixels.
        max_size: Longest allowed side. Defaults to ``MAXIMAL_IMAGE_WIDTH``
            when omitted, which is how existing two-argument callers behave.

    Returns:
        A tuple ``(width, height)`` of ints, each at least 1.
    """
    limit = MAXIMAL_IMAGE_WIDTH if max_size is None else max_size
    if width > limit:
        ratio = limit / width
        width = limit
        height = height * ratio
    if height > limit:
        ratio = limit / height
        height = limit
        width = width * ratio
    # int() truncates, so a very elongated image (e.g. 10000x5) would end up
    # with a 0-pixel side, which is not a valid target size for a resize.
    return max(1, int(width)), max(1, int(height))

if OPTIMIZE_IMAGE:
    import cv2


# The EPUB container that every chapter and image below is added to.
book = epub.EpubBook()

# set metadata
book.set_identifier('chinadaily_scraper')
book.set_title('China Daily')
book.set_language('en')

book.add_author('China Daily Editor')
book.add_author('Tsumugi Shirogane as scraper script developer')

# cover
# Read the cover inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open('chinadailycover.jpg', 'rb') as cover_file:
    image_content = cover_file.read()
# The third positional argument is ebooklib's create_page flag.
book.set_cover('cover.jpg', image_content, True)


# Front-matter page stating that the book is machine generated and who owns
# the scraped content.
disclaimer_markup = u"""
<h1>Disclaimer</h1>
<p> this ebook is generated by a scraper script, which is not affiliated with China Daily. </p>
<p> China Daily Copyright: Copyright 1995 - 2022 . All rights reserved. The content (including but not limited to text, photo, multimedia information, etc) published in this site belongs to China Daily Information Co (CDIC). Without written authorization from CDIC, such content shall not be republished or used in any form. </p>
<p> This scraper script can found on <a href="https://github.com/inapp123/chinadaily2epub">https://github.com/inapp123/chinadaily2epub</a></p>
"""
chapter_Disclaimer = epub.EpubHtml(
    title='Disclaimer',
    file_name='disclaimer.xhtml',
    lang='en',
)
chapter_Disclaimer.content = disclaimer_markup
book.add_item(chapter_Disclaimer)

# The table of contents starts out with the disclaimer as its only entry;
# chapter links are appended to this list later.
disclaimer_link = epub.Link("disclaimer.xhtml", "Disclaimer", "disclaimer")
book.toc = [disclaimer_link]

# Initial spine: the 'nav' entry first, then the disclaimer page.
book.spine = ['nav', chapter_Disclaimer]

def _read_image_bytes(path):
    """Return the bytes to embed in the book for the image file at *path*.

    With OPTIMIZE_IMAGE enabled the image is resized via calc_ideal_size and
    re-encoded as JPEG. If OpenCV cannot decode or encode the file, or when
    optimisation is disabled, the file's bytes are returned unchanged.
    """
    if OPTIMIZE_IMAGE:
        imgmat = cv2.imread(path)
        # cv2.imread signals failure by returning None instead of raising,
        # so guard before touching .shape.
        if imgmat is not None:
            dstwidth, dstheight = calc_ideal_size(imgmat.shape[1], imgmat.shape[0])
            imgmat = cv2.resize(imgmat, (dstwidth, dstheight))
            ok, encoded = cv2.imencode('.jpg', imgmat, [cv2.IMWRITE_JPEG_QUALITY, JPEG_QUALITY])
            if ok:
                return encoded.tobytes()
    with open(path, 'rb') as image_file:
        return image_file.read()


# add chapters
# src values already embedded, so an image referenced more than once is only
# added to the book a single time (duplicate ids/file names break the EPUB).
added_images = set()

# os.listdir returns entries in arbitrary order; sorting keeps the chapter
# order reproducible between runs.
for filename in sorted(os.listdir("data")):
    filepath = os.path.join("data", filename)
    # "img" holds the pictures; any other non-file entry cannot be read as a
    # chapter either.
    if filename == "img" or not os.path.isfile(filepath):
        continue
    with open(filepath, "r", encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')
    # Files without a heading are skipped: the <h1> text is the chapter title.
    heading = soup.find(name="h1")
    if heading is None:
        continue
    title = heading.get_text()
    chapter = epub.EpubHtml(title=title, file_name=filename + ".xhtml", lang='en')
    # Fall back to the whole document when there is no <body>, rather than
    # storing the literal string "None" as the chapter content.
    body = soup.find(name="body")
    chapter.content = str(body if body is not None else soup)
    for img in soup.find_all(name="img"):
        imgsrc = img.get('src')
        if not imgsrc or imgsrc in added_images:
            continue
        imgpath = os.path.join("data", imgsrc)
        if not os.path.isfile(imgpath):
            # A missing picture should not abort the whole book.
            print("warning: image not found, skipping:", imgpath)
            continue
        image_content = _read_image_bytes(imgpath)
        # The uid is the file's base name. Accept both "\\" and "/" as
        # separators so a src without a backslash no longer raises IndexError.
        imguid = imgsrc.replace("\\", "/").rsplit("/", 1)[-1]
        # NOTE(review): media_type is always image/jpeg, which is only
        # accurate when the image was re-encoded or already is a JPEG.
        book.add_item(epub.EpubItem(uid=imguid, file_name=imgsrc, media_type='image/jpeg', content=image_content))
        added_images.add(imgsrc)
    book.add_item(chapter)
    book.toc.append(epub.Link(filename + ".xhtml", title, filename))
    book.spine.append(chapter)


# Register the default NCX and Nav items with the book, in that order.
for nav_item in (epub.EpubNcx(), epub.EpubNav()):
    book.add_item(nav_item)

# Write the assembled book to chinadaily.epub with an empty options dict.
epub.write_epub('chinadaily.epub', book, {})