Skip to content

Latest commit

 

History

History
108 lines (103 loc) · 4.15 KB

敏感词汇本地检测.md

File metadata and controls

108 lines (103 loc) · 4.15 KB

DFA算法实现,本地文件存储相关敏感词汇,然后对输入的文本进行检测

  • 效率比较高,灵活性一般
/**
 Rebuilds the bad-word DFA from a word-list file on disk.

 The existing `badwordsNode` is always cleared first. The file is then read as UTF-8 text and
 split on "\n"; each line that hw_stringEmpty() does not reject is trimmed of surrounding
 whitespace/newlines, lowercased, and passed to -insertNodeWithBadword:.

 @note References: https://github.com/hailin0/sensitive-word-filter
                   https://www.cnblogs.com/zyguo/p/4705086.html
 @param diskCachePath URL of the word-list file. When nil, the DFA is only cleared.
 */
- (void)buildDFABadWordsFromDiskCachePath:(NSURL *)diskCachePath
{
    self.badwordsNode = nil;
    if (!diskCachePath) {
        return;
    }
    @autoreleasepool {
        // The read error is intentionally not inspected: an unreadable file yields a nil string,
        // which is presumably rejected by hw_stringEmpty() below (helper not visible here).
        NSError *readError;
        NSString *fileContents = [NSString stringWithContentsOfURL:diskCachePath
                                                          encoding:NSUTF8StringEncoding
                                                             error:&readError];
        if (hw_stringEmpty(fileContents)) {
            return;
        }

        NSCharacterSet *blanks = [NSCharacterSet whitespaceAndNewlineCharacterSet];
        NSArray<NSString *> *lines = [fileContents componentsSeparatedByString:@"\n"];
        // NOTE(review): hw_map is a project category; presumably it drops nil results — confirm.
        NSArray<NSString *> *normalizedWords =
            [lines hw_map:^id _Nullable(NSString *_Nonnull line, NSUInteger idx) {
                if (hw_stringEmpty(line)) {
                    return nil;
                }
                return [[line stringByTrimmingCharactersInSet:blanks] lowercaseString];
            }];

        // Fast enumeration over a nil or empty array is a no-op, so no explicit count check.
        for (NSString *word in normalizedWords) {
            [self insertNodeWithBadword:word];
        }
    }
}

/**
 Adds one bad word to the DFA, creating whatever nodes are missing along its path.

 The DFA is a tree of nested mutable dictionaries rooted at `badwordsNode`: each key is one
 composed character sequence of a bad word and each value is the child node for that character.
 The node reached after the last character is tagged with `kEndKey` -> 1.

 @param badword The bad word to insert. Nothing happens when hw_stringEmpty() rejects it.
 */
- (void)insertNodeWithBadword:(NSString *)badword
{
    if (hw_stringEmpty(badword)) {
        return;
    }

    // Create the root lazily on first insertion.
    if (!self.badwordsNode) {
        self.badwordsNode = [NSMutableDictionary dictionary];
    }

    __block NSMutableDictionary *cursor = self.badwordsNode;
    [badword enumerateSubstringsInRange:NSMakeRange(0, badword.length)
                                options:NSStringEnumerationByComposedCharacterSequences
                             usingBlock:^(NSString *_Nullable character,
                                          NSRange characterRange,
                                          NSRange enclosingRange,
                                          BOOL *_Nonnull stop) {
                                 // Reuse the child for this character, or create it.
                                 NSMutableDictionary *child = cursor[character];
                                 if (!child) {
                                     child = [NSMutableDictionary dictionary];
                                     cursor[character] = child;
                                 }
                                 cursor = child;
                             }];
    // Mark the node of the final character as the end of a complete bad word.
    cursor[kEndKey] = @1;
}

/**
 Checks whether a text contains any bad word stored in the given DFA.

 The text is lowercased and split into composed character sequences. From every start position
 the DFA is walked character by character until either a node tagged with `kEndKey` == 1 is
 reached (match) or the next character has no child node (try the next start position).

 The DFA is only read, never modified, so it is traversed in place; no per-position copy of the
 root dictionary or of the remaining characters is made.

 @param words        The text to inspect.
 @param badwordsNode Root node of the DFA (nested dictionaries keyed by character).
 @return YES as soon as any bad word is found starting at any position; NO otherwise, including
         when hw_stringEmpty() rejects `words` or the DFA is nil/empty.
 */
- (BOOL)containBadWordsWithWords:(NSString *)words dfaBadwordsNode:(NSMutableDictionary *)badwordsNode
{
    // Messaging nil returns 0, so the count check also covers a nil root.
    if (hw_stringEmpty(words) || badwordsNode.count == 0) {
        return NO;
    }

    NSString *lowercased = [words lowercaseString];
    NSMutableArray<NSString *> *characters = [NSMutableArray arrayWithCapacity:lowercased.length];
    [lowercased enumerateSubstringsInRange:NSMakeRange(0, lowercased.length)
                                   options:NSStringEnumerationByComposedCharacterSequences
                                usingBlock:^(NSString *_Nullable substring,
                                             NSRange substringRange,
                                             NSRange enclosingRange,
                                             BOOL *_Nonnull stop) {
                                    [characters addObject:substring];
                                }];

    NSUInteger characterCount = characters.count;
    for (NSUInteger start = 0; start < characterCount; start++) {
        // Restart from the root for every candidate start position.
        NSDictionary *node = badwordsNode;
        for (NSUInteger offset = start; offset < characterCount; offset++) {
            NSDictionary *child = node[characters[offset]];
            if (child == nil) {
                // No bad word continues with this character from this start position.
                break;
            }
            node = child;
            // A missing end marker yields 0 from -integerValue on nil, i.e. "not an end".
            if ([node[kEndKey] integerValue] == 1) {
                return YES;
            }
        }
    }
    return NO;
}