DFA算法实现,本地文件存储相关敏感词汇,然后对输入的文本进行检测
/**
 Rebuilds the DFA (a trie of nested dictionaries) of sensitive words from a
 word-list file on disk.

 The file is read as UTF-8 text containing one sensitive word per line. Every
 non-empty line is trimmed of surrounding whitespace, lowercased and handed to
 -insertNodeWithBadword:. Any previously built structure is discarded first, so
 `self.badwordsNode` is left nil when the URL is nil or the file yields no text.

 @note References: https://github.com/hailin0/sensitive-word-filter
       https://www.cnblogs.com/zyguo/p/4705086.html
 @param diskCachePath File URL of the sensitive-word list on disk.
 */
- (void)buildDFABadWordsFromDiskCachePath:(NSURL *)diskCachePath
{
    // Always start from a clean slate so stale words never survive a rebuild.
    self.badwordsNode = nil;
    if (diskCachePath == nil) {
        return;
    }

    @autoreleasepool {
        // A failed read returns nil and is handled by the emptiness check below
        // (assumes hw_stringEmpty reports nil as empty -- TODO confirm), so the
        // NSError detail is not requested.
        NSString *contents = [NSString stringWithContentsOfURL:diskCachePath
                                                      encoding:NSUTF8StringEncoding
                                                         error:NULL];
        if (hw_stringEmpty(contents)) {
            return;
        }

        // Split on any newline character rather than one fixed separator string,
        // so "\n", "\r\n" and space-padded line breaks are all handled. Empty
        // fragments produced between "\r" and "\n" are skipped below.
        NSArray<NSString *> *lines =
            [contents componentsSeparatedByCharactersInSet:[NSCharacterSet newlineCharacterSet]];
        NSCharacterSet *blanks = [NSCharacterSet whitespaceAndNewlineCharacterSet];

        for (NSString *line in lines) {
            if (hw_stringEmpty(line)) {
                continue;
            }
            // Lowercase here; lookups lowercase their input too, which makes
            // matching case-insensitive.
            NSString *badword = [[line stringByTrimmingCharactersInSet:blanks] lowercaseString];
            // -insertNodeWithBadword: ignores words that are empty after trimming.
            [self insertNodeWithBadword:badword];
        }
    }
}
/**
 Adds a single sensitive word to the DFA.

 The word is walked one composed character sequence at a time. Each character
 becomes a key in the current dictionary whose value is the dictionary for the
 next character; missing levels are created on the way down. The dictionary
 reached after the last character is tagged with kEndKey = 1 to mark that a
 complete sensitive word ends there.

 @param badword The sensitive word to insert. Empty words are ignored.
 */
- (void)insertNodeWithBadword:(NSString *)badword
{
    if (hw_stringEmpty(badword)) {
        return;
    }

    // Create the root lazily on first insertion.
    if (!self.badwordsNode) {
        self.badwordsNode = [[NSMutableDictionary alloc] init];
    }

    // `cursor` is reassigned inside the block, hence __block.
    __block NSMutableDictionary *cursor = self.badwordsNode;
    NSRange wholeWord = NSMakeRange(0, badword.length);
    [badword enumerateSubstringsInRange:wholeWord
                                options:NSStringEnumerationByComposedCharacterSequences
                             usingBlock:^(NSString *_Nullable character,
                                          NSRange characterRange,
                                          NSRange enclosingRange,
                                          BOOL *_Nonnull stop) {
        NSMutableDictionary *child = cursor[character];
        if (child == nil) {
            child = [NSMutableDictionary dictionary];
            cursor[character] = child;
        }
        // Descend one level for the next character.
        cursor = child;
    }];

    // Mark the node of the final character as the end of a sensitive word.
    cursor[kEndKey] = @1;
}
/**
 Checks whether a text contains any sensitive word stored in the given DFA.

 The text is lowercased and split into composed character sequences. Starting
 from every character position in turn, the DFA is walked from its root for as
 long as the next character has a child node; the search succeeds as soon as a
 visited node carries the end-of-word marker (kEndKey == 1).

 @param words The text to inspect.
 @param badwordsNode Root node of the DFA. It is only read, never modified.
 @return YES if the text contains at least one sensitive word; NO otherwise,
         including when the text or the DFA is empty.
 */
- (BOOL)containBadWordsWithWords:(NSString *)words dfaBadwordsNode:(NSMutableDictionary *)badwordsNode
{
    if (hw_stringEmpty(words) || badwordsNode == nil || badwordsNode.count == 0) {
        return NO;
    }

    // Tokenise once, using the same composed-character enumeration as insertion.
    NSString *loweredWords = [words lowercaseString];
    NSMutableArray<NSString *> *characters = [NSMutableArray arrayWithCapacity:loweredWords.length];
    [loweredWords enumerateSubstringsInRange:NSMakeRange(0, loweredWords.length)
                                     options:NSStringEnumerationByComposedCharacterSequences
                                  usingBlock:^(NSString *_Nullable substring,
                                               NSRange substringRange,
                                               NSRange enclosingRange,
                                               BOOL *_Nonnull stop) {
        [characters addObject:substring];
    }];

    NSUInteger characterCount = characters.count;
    for (NSUInteger start = 0; start < characterCount; start++) {
        // The walk only reads the DFA, so begin at the root itself instead of
        // copying the root dictionary for every start position.
        NSDictionary *node = badwordsNode;
        for (NSUInteger index = start; index < characterCount; index++) {
            id child = node[characters[index]];
            // No transition for this character (nil), or the value under this key
            // is not a node: this start position cannot match.
            if (![child isKindOfClass:[NSDictionary class]]) {
                break;
            }
            node = child;

            // A missing marker yields nil, and messaging nil returns 0, so no
            // separate nil check is needed.
            if ([node[kEndKey] integerValue] == 1) {
                return YES;
            }
        }
    }
    return NO;
}