From ffea5e9419ed517e8b80208567f4f70404047cbe Mon Sep 17 00:00:00 2001
From: Tuchuanhuhuhu <gzblog@hdu.edu.cn>
Date: Thu, 19 Sep 2024 18:11:45 +0800
Subject: [PATCH] bugfix: HTML tags inside code blocks won't be removed now.
 Beta

---
 modules/utils.py | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/modules/utils.py b/modules/utils.py
index bef80a73..f2baefad 100644
--- a/modules/utils.py
+++ b/modules/utils.py
@@ -245,11 +245,27 @@ def convert_mdtext(md_text):  # deprecated
 
 def remove_html_tags(data):
     def clean_text(text):
-        # Remove all HTML tags
-        cleaned = re.sub(r'<[^>]+>', '', text)
-        # Remove any remaining HTML entities
-        cleaned = re.sub(r'&[#\w]+;', '', cleaned)
-        return cleaned.strip()
+        # Regular expression to match code blocks, including all newlines
+        code_block_pattern = r'(```[\s\S]*?```)'
+
+        # Split the text into code blocks and non-code blocks
+        parts = re.split(code_block_pattern, text)
+
+        cleaned_parts = []
+        for part in parts:
+            if part.startswith('```') and part.endswith('```'):
+                # This is a code block, keep it exactly as is
+                cleaned_parts.append(part)
+            else:
+                # This is not a code block, remove HTML tags
+                # Remove all HTML tags
+                cleaned = re.sub(r'<[^>]+>', '', part)
+                # Remove any remaining HTML entities
+                cleaned = re.sub(r'&[#\w]+;', '', cleaned)
+                cleaned_parts.append(cleaned)  # Don't strip here to preserve newlines
+
+        # Join the cleaned parts back together
+        return ''.join(cleaned_parts)
 
     return [
         [clean_text(item) for item in sublist]