Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

把pdf文档的飞书链接转成内链 #10

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 53 additions & 2 deletions main_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
import logging
import os.path
from asyncio import Task
from typing import List, TypedDict
from typing import List, TypedDict, Any, Optional
import shutil
import time
import random
import pymupdf
from pymupdf import Page, Document
from pathlib import Path

import lark_oapi as lark
Expand Down Expand Up @@ -220,8 +221,10 @@ async def download_exported_pdf(client: lark.Client, task: ExportTask, path: str
f.write(resp.raw.content)
return path

# TODO Create dedicated class for toc?
type Toc = list[tuple[int, str, int]]

def generate_toc(nodes: List[DocTreeNode], tempdir) -> List[tuple[int, str, int]]:
def generate_toc(nodes: List[DocTreeNode], tempdir) -> Toc:
toc_no_page_numbers = list(map(lambda node: node.toc_entry, nodes))
toc = []
page_number = 1
Expand All @@ -238,6 +241,53 @@ def generate_toc(nodes: List[DocTreeNode], tempdir) -> List[tuple[int, str, int]

return toc

def is_lark_link(link: dict[str, Any]) -> bool:
return "larksuite.com" in link["uri"]

def link_text(page: Page, link: dict[str, Any]) -> str:
return page.get_textbox(link["from"])

def points_to(text: str, section: str) -> bool:
# 有的链接的锚文本跟链接所指向的章节的名字不一样,例如
# 「III.6.1 ⾸次办理居留证」的第四章的第一个链接「《居留证的延
# 期》」,它所指向的章节的全名则是「III.6.2 居留证的延期」。
#
# 其次,从飞书导出的pdf文档的精度不够高,用pymupdf取得以上例子里的
# 链接的锚文本时会发现锚文本实是「《居留证的延期》和」而非天书文档
# 的「《居留证的延》」。有时候锚文本还会跨越两行。
import re
section_split = section.split()
section_no_levels = section_split[1] if len(section_split) > 1 else section
# also match newline characters in the prefix
regex = "(?s).*" + "\\s?".join(list(section_no_levels)) + ".*"

return section.endswith(text)\
or section.startswith(text)\
or section_no_levels.endswith(text)\
or section_no_levels.startswith(text)\
or re.match(regex, text)

def localize_link(link: dict[str, Any], text: str, toc: Toc) -> Optional[dict[str, Any]]:
toc_entries = [entry for entry in toc if points_to(text, entry[1])]
if len(toc_entries) > 0:
entry = toc_entries[0]
link["kind"] = 1 # LINK_GOTO
link["page"] = entry[2] - 1 # Counts from 0
link["uri"] = None
return link
return None

def localize_links_in_doc(doc, toc: Toc):
for page in doc:
for link in page.links():
if is_lark_link(link):
# u200b是隐形的空格unicode符号
text = link_text(page, link).replace("\u200b", "").strip()
new_link = localize_link(link, text, toc)
if new_link:
page.update_link(new_link)
else:
logging.info(f"no link found for '{text}'")

async def main():
logging.basicConfig(level=logging.INFO)
Expand Down Expand Up @@ -301,6 +351,7 @@ class ExportJob(TypedDict):
# incremental save is not allowed on a new document
with pymupdf.open(output_path) as doc:
doc.set_toc(toc)
localize_links_in_doc(doc, toc)
doc.saveIncr()

# cleanup temporary directory
Expand Down