Skip to content

Commit

Permalink
fix: show inner HTML in place of non-existent sounds
Browse files Browse the repository at this point in the history
  • Loading branch information
Crissium committed Sep 28, 2024
1 parent 17bf759 commit e75f082
Showing 1 changed file with 36 additions and 23 deletions.
59 changes: 36 additions & 23 deletions server/app/dicts/mdict/html_cleaner.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,22 @@
import os
import re
import shutil
from pathlib import Path
import re
from html.parser import HTMLParser


class HTMLCleaner:
class _Concatenator(HTMLParser):
def __init__(self) -> None:
super().__init__()
self._buf = []

def handle_data(self, data: str) -> None:
self._buf.append(data)

def get_text(self) -> str:
return ''.join(self._buf)

_re_non_printing_chars = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]')
_re_compact_html_index = re.compile(r'`(\d+)`')
_re_single_quotes = re.compile(r"=\'([^']*)\'(?=[ >])")
Expand Down Expand Up @@ -180,27 +192,21 @@ def _fix_internal_href(self, definition_html: str) -> str:
# That is, links like entry://#81305a5747ca42b28f2b50de9b762963_nav2
return definition_html.replace('entry://#', '#')

def _flatten_nested_a(self, definition_html: str, depth: int) -> str:
def _flatten_nested_a(self, definition_html: str) -> str:
# Sometimes there're multiple inner elements inside the <a> element, which should be removed
# For example, in my Fr-En En-Fr Collins Dictionary, there's a <span> element inside the <a> element
# The text within the <span> should be preserved, though
# <a class="ref" href="/lookup/collinse22f/badly" title="Translation of badly"><span class="orth">badly</span></a>
if depth == 0:
return definition_html
else:
a_closing_tag_pos = 0
while (a_tag_start_pos := definition_html.find('<a', a_closing_tag_pos)) != -1:
a_tag_end_pos = definition_html.find('>', a_tag_start_pos)
inner_html_start_pos = definition_html.find('>', a_tag_end_pos + 1) + 1
if (a_closing_tag_pos := definition_html.find('</a>', a_tag_end_pos, inner_html_start_pos)) != -1:
continue
inner_html_end_pos = definition_html.find('</', inner_html_start_pos)
inner_html = definition_html[inner_html_start_pos:inner_html_end_pos]
a_closing_tag_pos = definition_html.find('</a>', inner_html_end_pos)
if definition_html.find('href', a_tag_start_pos, a_tag_end_pos) != -1:
definition_html = definition_html[:a_tag_end_pos + 1] +\
inner_html + definition_html[a_closing_tag_pos:]
return self._flatten_nested_a(definition_html, depth - 1)
a_closing_tag_pos = 0
while (a_tag_start_pos := definition_html.find('<a', a_closing_tag_pos)) != -1:
a_tag_end_pos = definition_html.find('>', a_tag_start_pos)
a_closing_tag_pos = definition_html.find('</a>', a_tag_end_pos)
a_inner_html = definition_html[a_tag_end_pos+1:a_closing_tag_pos]
concatenator = self._Concatenator()
concatenator.feed(a_inner_html)
a_inner_text = concatenator.get_text()
definition_html = definition_html[:a_tag_end_pos+1] + a_inner_text + definition_html[a_closing_tag_pos:]
return definition_html

def _fix_entry_cross_ref(self, definition_html: str) -> str:
if definition_html.startswith('@@@LINK='): # strange special case
Expand All @@ -211,27 +217,34 @@ def _fix_entry_cross_ref(self, definition_html: str) -> str:
return f'<a href="{self._lookup_url_root + entry_linked}">{entry_linked}</a>'
else:
definition_html = definition_html.replace('entry://', self._lookup_url_root)
# fingers crossed there are no more than three layers
return self._flatten_nested_a(definition_html, 3)
return self._flatten_nested_a(definition_html)

def _fix_sound_link(self, definition_html: str) -> str:
# Use HTML sound element instead of the original <a> element, which looks like this:
# <a class="hwd_sound sound audio_play_button icon-volume-up ptr fa fa-volume-up" data-lang="en_GB" data-src-mp3="https://www.collinsdictionary.com/sounds/hwd_sounds/EN-GB-W0020530.mp3" href="sound://audio/ef/7650.mp3" title="Pronunciation for "><img class="soundpng" src="/api/cache/collinse22f/img/sound.png"></a>
autoplay_string = 'autoplay'
sound_element_template = '<audio controls %s src="%s">%s</audio>'
not_found_fallback = '<span>%s</span>'
while (sound_link_start_pos := definition_html.find('sound://')) != -1:
sound_link_end_pos = definition_html.find('"', sound_link_start_pos)
original_sound_link = definition_html[sound_link_start_pos:sound_link_end_pos]
sound_link = original_sound_link.replace('sound://', self._href_root_dir)
sound_path = os.path.join(self._resources_dir, sound_link[len(self._href_root_dir):])

inner_html_start_pos = definition_html.find('>', sound_link_end_pos) + 1
inner_html_end_pos = definition_html.find('</a>', inner_html_start_pos)
inner_html = definition_html[inner_html_start_pos:inner_html_end_pos]
outer_html_start_pos = definition_html.rfind('<a', 0, sound_link_start_pos)
outer_html_end_pos = definition_html.find('</a>', inner_html_end_pos) + len('</a>')
definition_html = definition_html[:outer_html_start_pos] +\
sound_element_template % (autoplay_string, sound_link, inner_html) +\

if os.path.isfile(sound_path):
sound_html = sound_element_template % (autoplay_string, sound_link, inner_html)
autoplay_string = ''
else:
sound_html = not_found_fallback % inner_html

definition_html = definition_html[:outer_html_start_pos] + sound_html +\
definition_html[outer_html_end_pos:]
autoplay_string = ''

return definition_html

Expand Down

0 comments on commit e75f082

Please sign in to comment.