Skip to content

Commit

Permalink
[fix] highlight search results: split English and Chinese parts
Browse files (browse the repository at this point in the history)
  • Loading branch information
bojieli committed Aug 23, 2024
1 parent 07e0d0b commit b3e2a59
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion app/utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -15,6 +15,7 @@
import lxml.html
from hashlib import sha256
import pdfkit
from app.views.search import filter


mail = Mail(app)
Expand Down Expand Up @@ -166,7 +167,11 @@ def abstract_by_keyword(content, keyword):
plaintext = Markup(content).striptags()
lower_plaintext = plaintext.lower()
keyword = keyword.lower()
words = keyword.split()
# remove English and Chinese sentence separators
sentence = filter(keyword)
# split sentence into English and Chinese parts
words = re.findall(r'[A-Za-z0-9]+|[\u4e00-\u9fff]+', sentence)

first_index = len(plaintext)
for word in words:
index = lower_plaintext.find(word)
Expand Down

0 comments on commit b3e2a59

Please sign in to comment.