bugfix: HTML tags inside code blocks are no longer removed.

GaiZhenbiao · GaiZhenbiao · commit ffea5e9419ed · 2024-09-19T18:11:45.000+08:00
Beta
diff --git a/modules/utils.py b/modules/utils.py
@@ -245,11 +245,27 @@ def convert_mdtext(md_text):  # deprecated
 
 def remove_html_tags(data):
     def clean_text(text):
-        # Remove all HTML tags
-        cleaned = re.sub(r'<[^>]+>', '', text)
-        # Remove any remaining HTML entities
-        cleaned = re.sub(r'&[#\w]+;', '', cleaned)
-        return cleaned.strip()
+        # Regular expression to match code blocks, including all newlines
+        code_block_pattern = r'(```[\s\S]*?```)'
+
+        # Split the text into code blocks and non-code blocks
+        parts = re.split(code_block_pattern, text)
+
+        cleaned_parts = []
+        for part in parts:
+            if part.startswith('```') and part.endswith('```'):
+                # This is a code block, keep it exactly as is
+                cleaned_parts.append(part)
+            else:
+                # This is not a code block, remove HTML tags
+                # Remove all HTML tags
+                cleaned = re.sub(r'<[^>]+>', '', part)
+                # Remove any remaining HTML entities
+                cleaned = re.sub(r'&[#\w]+;', '', cleaned)
+                cleaned_parts.append(cleaned)  # Don't strip here to preserve newlines
+
+        # Join the cleaned parts back together
+        return ''.join(cleaned_parts)
 
     return [
         [clean_text(item) for item in sublist]