Skip to content

Commit ffea5e9

Browse files
committed
bugfix: HTML tags inside code blocks won't be removed now.
Beta
1 parent 22007a7 commit ffea5e9

File tree

1 file changed

+21
-5
lines changed

1 file changed

+21
-5
lines changed

modules/utils.py

Lines changed: 21 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -245,11 +245,27 @@ def convert_mdtext(md_text): # deprecated
245 245

246 246
def remove_html_tags(data):
247 247
def clean_text(text):
248-
# Remove all HTML tags
249-
cleaned = re.sub(r'<[^>]+>', '', text)
250-
# Remove any remaining HTML entities
251-
cleaned = re.sub(r'&[#\w]+;', '', cleaned)
252-
return cleaned.strip()
248+
# Regular expression to match code blocks, including all newlines
249+
code_block_pattern = r'(```[\s\S]*?```)'
250+
251+
# Split the text into code blocks and non-code blocks
252+
parts = re.split(code_block_pattern, text)
253+
254+
cleaned_parts = []
255+
for part in parts:
256+
if part.startswith('```') and part.endswith('```'):
257+
# This is a code block, keep it exactly as is
258+
cleaned_parts.append(part)
259+
else:
260+
# This is not a code block, remove HTML tags
261+
# Remove all HTML tags
262+
cleaned = re.sub(r'<[^>]+>', '', part)
263+
# Remove any remaining HTML entities
264+
cleaned = re.sub(r'&[#\w]+;', '', cleaned)
265+
cleaned_parts.append(cleaned) # Don't strip here to preserve newlines
266+
267+
# Join the cleaned parts back together
268+
return ''.join(cleaned_parts)
253 269

254 270
return [
255 271
[clean_text(item) for item in sublist]

0 commit comments

Comments
 (0)