@@ -245,11 +245,27 @@ def convert_mdtext(md_text): # deprecated
245
245
246
246
def remove_html_tags (data ):
247
247
def clean_text(text):
    """Remove HTML tags and entities from *text*, leaving fenced code intact.

    Segments delimited by triple backticks are copied through unchanged so
    that markup inside code samples survives. Everything outside those
    fences has ``<...>`` tags and ``&...;`` entities deleted.

    The result is deliberately not stripped: leading and trailing
    whitespace (newlines in particular) is preserved.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, with fenced code blocks byte-for-byte unchanged.
    """
    # The capturing group makes re.split keep the fenced blocks in its
    # result, so prose segments and code segments come back interleaved.
    segments = re.split(r'(```[\s\S]*?```)', text)

    cleaned_parts = []
    for segment in segments:
        if segment.startswith('```') and segment.endswith('```'):
            # Fenced code block: keep it exactly as written.
            cleaned_parts.append(segment)
            continue

        # Prose: drop tags first, then whatever entities remain.
        without_tags = re.sub(r'<[^>]+>', '', segment)
        without_entities = re.sub(r'&[#\w]+;', '', without_tags)
        # No strip here, so newlines around code fences are kept.
        cleaned_parts.append(without_entities)

    return ''.join(cleaned_parts)
253
269
254
270
return [
255
271
[clean_text (item ) for item in sublist ]
0 commit comments