Skip to content

Commit

Permalink
移除冗余
Browse files: browse the repository at this point in the history
  • Loading branch information
AlongWY committed Jun 18, 2020
1 parent 7d36eaa commit edd549c
Showing 1 changed file with 10 additions and 15 deletions.
25 changes: 10 additions & 15 deletions ltp/utils/sent_split.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,40 +3,35 @@
# Author: jeffrey
# Changed:
# sen -> sent / auto strip: ylfeng


import re
import itertools
from typing import List


def split_sentence(document: str, flag: str = "all", limit: int = 512):
def split_sentence(document: str, flag: str = "all", limit: int = 510) -> List[str]:
"""
Args:
document:
flag: Type:str, "all" 中英文标点分句,"zh" 中文标点分句,"en" 英文标点分句
limit: 限制最大长度为512个字符
limit: 默认单句最大长度为510个字符
Returns: Type:list
"""
sent_list = []
try:
if flag == "zh":
document = re.sub('(?P<quotation_mark>([。?!。!?…](?![”’"\'])))', r'\g<quotation_mark>\n', document) # 单字符断句符
document = re.sub('(?P<quotation_mark>([。?!。!?]|…{1,2})[”’"\'])', r'\g<quotation_mark>\n', document) # 特殊引号

document = re.sub('(?P<quotation_mark>([。?!…](?![”’"\'])))', r'\g<quotation_mark>\n', document) # 单字符断句符
document = re.sub('(?P<quotation_mark>([。?!]|…{1,2})[”’"\'])', r'\g<quotation_mark>\n', document) # 特殊引号
elif flag == "en":
document = re.sub('(?P<quotation_mark>([\\.?!](?![”’"\'])))', r'\g<quotation_mark>\n', document) # 英文单字符断句符
document = re.sub('(?P<quotation_mark>([?!\\.]["\']))', r'\g<quotation_mark>\n', document) # 特殊引号

document = re.sub('(?P<quotation_mark>([.?!](?![”’"\'])))', r'\g<quotation_mark>\n', document) # 英文单字符断句符
document = re.sub('(?P<quotation_mark>([?!.]["\']))', r'\g<quotation_mark>\n', document) # 特殊引号
else:
document = re.sub('(?P<quotation_mark>([。?!。!?…\\.?!](?![”’"\'])))', r'\g<quotation_mark>\n',
document) # 单字符断句符
document = re.sub('(?P<quotation_mark>(([。?!。!?\\.!?]|\\…{1,2})[”’"\']))', r'\g<quotation_mark>\n',
document = re.sub('(?P<quotation_mark>([。?!….?!](?![”’"\'])))', r'\g<quotation_mark>\n', document) # 单字符断句符
document = re.sub('(?P<quotation_mark>(([。?!.!?]|…{1,2})[”’"\']))', r'\g<quotation_mark>\n',
document) # 特殊引号

sent_list_ori = document.splitlines()
sent_list = []
for sent in sent_list_ori:
sent = sent.strip()
if not sent:
Expand Down

0 comments on commit edd549c

Please sign in to comment.