From ffea5e9419ed517e8b80208567f4f70404047cbe Mon Sep 17 00:00:00 2001 From: Tuchuanhuhuhu Date: Thu, 19 Sep 2024 18:11:45 +0800 Subject: [PATCH] bugfix: HTML tags inside code blocks won't be removed now. Beta --- modules/utils.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/modules/utils.py b/modules/utils.py index bef80a73..f2baefad 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -245,11 +245,27 @@ def convert_mdtext(md_text): # deprecated def remove_html_tags(data): def clean_text(text): - # Remove all HTML tags - cleaned = re.sub(r'<[^>]+>', '', text) - # Remove any remaining HTML entities - cleaned = re.sub(r'&[#\w]+;', '', cleaned) - return cleaned.strip() + # Regular expression to match code blocks, including all newlines + code_block_pattern = r'(```[\s\S]*?```)' + + # Split the text into code blocks and non-code blocks + parts = re.split(code_block_pattern, text) + + cleaned_parts = [] + for part in parts: + if part.startswith('```') and part.endswith('```'): + # This is a code block, keep it exactly as is + cleaned_parts.append(part) + else: + # This is not a code block, remove HTML tags + # Remove all HTML tags + cleaned = re.sub(r'<[^>]+>', '', part) + # Remove any remaining HTML entities + cleaned = re.sub(r'&[#\w]+;', '', cleaned) + cleaned_parts.append(cleaned) # Don't strip here to preserve newlines + + # Join the cleaned parts back together + return ''.join(cleaned_parts) return [ [clean_text(item) for item in sublist]