From b3e2a59a41239feb1b1bc0dff9969803084aa68f Mon Sep 17 00:00:00 2001 From: Bojie Li Date: Fri, 23 Aug 2024 17:43:24 +0800 Subject: [PATCH] [fix] highlight search results: split English and Chinese parts --- app/utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/app/utils.py b/app/utils.py index d95025a6..50d3d259 100644 --- a/app/utils.py +++ b/app/utils.py @@ -15,6 +15,7 @@ import lxml.html from hashlib import sha256 import pdfkit +from app.views.search import filter mail = Mail(app) @@ -166,7 +167,11 @@ def abstract_by_keyword(content, keyword): plaintext = Markup(content).striptags() lower_plaintext = plaintext.lower() keyword = keyword.lower() - words = keyword.split() + # remove English and Chinese sentence separators + sentence = filter(keyword) + # split sentence into English and Chinese parts + words = re.findall(r'[A-Za-z0-9]+|[\u4e00-\u9fff]+', sentence) + first_index = len(plaintext) for word in words: index = lower_plaintext.find(word)