Sensitive word filtering

hutool-dfa/src/main/java/cn/hutool/dfa/FoundWord.java (new file, 50 lines added)
@@ -0,0 +1,50 @@
package cn.hutool.dfa;

/**
 * A matched sensitive word: the sensitive word itself, the content in the text that matched it,
 * and the indexes of the matched content within the text. The indexes can be used for further
 * processing of the match, such as replacing it with **.
 *
 * @author 肖海斌
 * @Date 2020-12-05
 */
public class FoundWord {
	/**
	 * The sensitive word that matched
	 */
	private String word;
	/**
	 * The content in the text matched by the sensitive word
	 */
	private String foundWord;
	/**
	 * Start index of the matched content in the analyzed string
	 */
	private int startIndex;
	/**
	 * End index of the matched content in the analyzed string
	 */
	private int endIndex;

	public FoundWord(String word, String foundWord, int start, int end) {
		this.word = word;
		this.foundWord = foundWord;
		this.startIndex = start;
		this.endIndex = end;
	}

	public String getWord() {
		return word;
	}

	public String getFoundWord() {
		return foundWord;
	}

	public int getStartIndex() {
		return startIndex;
	}

	public int getEndIndex() {
		return endIndex;
	}
}
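For orientation (not part of the commit): the WordTree change later in this diff constructs new FoundWord(key, matchedText, i, j), so startIndex and endIndex appear to be the inclusive indexes of the first and last matched character. A hypothetical instance for the keyword "foo" found at offset 4 of "say foo now":

// Hypothetical example; the values mirror what WordTree.matchAll would produce for this input.
FoundWord fw = new FoundWord("foo", "foo", 4, 6);
System.out.println(fw.getWord());        // foo
System.out.println(fw.getFoundWord());   // foo
System.out.println(fw.getStartIndex());  // 4
System.out.println(fw.getEndIndex());    // 6, inclusive index of the last matched character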

hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveProcessor.java (new file, 23 lines added)

@@ -0,0 +1,23 @@
package cn.hutool.dfa;

/**
 * Sensitive word filtering processor. By default every character of the match is replaced with *.
 *
 * @author 肖海斌
 * @Date 2020-12-05
 */
public interface SensitiveProcessor {

	/**
	 * Filters a matched sensitive word.
	 *
	 * @param foundWord the matched sensitive word
	 * @return the filtered content; by default each character is replaced with *
	 */
	default String process(FoundWord foundWord) {
		int length = foundWord.getFoundWord().length();
		StringBuilder sb = new StringBuilder(length);
		for (int i = 0; i < length; i++) {
			sb.append("*");
		}
		return sb.toString();
	}
}
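Because process() is a default method, SensitiveProcessor has no abstract method and cannot be used as a lambda target; a custom strategy is supplied as an anonymous or named implementation. A minimal sketch (not part of the commit) that keeps the first character of the match visible and masks the rest:

// Hypothetical custom processor: keep the first character, mask the remainder.
SensitiveProcessor keepFirstChar = new SensitiveProcessor() {
	@Override
	public String process(FoundWord foundWord) {
		String matched = foundWord.getFoundWord();
		StringBuilder sb = new StringBuilder(matched.length());
		sb.append(matched.charAt(0));
		for (int i = 1; i < matched.length(); i++) {
			sb.append('*');
		}
		return sb.toString();
	}
};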

hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java

@@ -1,77 +1,84 @@
package cn.hutool.dfa;

import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.core.lang.Filter;
import cn.hutool.core.thread.ThreadUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.json.JSONUtil;

import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Sensitive word utility class
 *
 * @author Looly
 */
public final class SensitiveUtil {

	public static final char DEFAULT_SEPARATOR = StrUtil.C_COMMA;
	private static final WordTree sensitiveTree = new WordTree();

	/**
	 * @return whether the sensitive word tree has been initialized
	 */
-	public static boolean isInited(){
+	public static boolean isInited() {
		return !sensitiveTree.isEmpty();
	}

	/**
	 * Initializes the sensitive word tree.
	 *
	 * @param isAsync        whether to initialize asynchronously
	 * @param sensitiveWords list of sensitive words
	 */
-	public static void init(final Collection<String> sensitiveWords, boolean isAsync){
-		if(isAsync){
+	public static void init(final Collection<String> sensitiveWords, boolean isAsync) {
+		if (isAsync) {
			ThreadUtil.execAsync(() -> {
				init(sensitiveWords);
				return true;
			});
-		}else{
+		} else {
			init(sensitiveWords);
		}
	}

	/**
	 * Initializes the sensitive word tree.
	 *
	 * @param sensitiveWords list of sensitive words
	 */
-	public static void init(Collection<String> sensitiveWords){
+	public static void init(Collection<String> sensitiveWords) {
		sensitiveTree.clear();
		sensitiveTree.addWords(sensitiveWords);
		// log.debug("Sensitive init finished, sensitives: {}", sensitiveWords);
	}

	/**
	 * Initializes the sensitive word tree.
	 *
	 * @param sensitiveWords string made up of sensitive words
	 * @param isAsync        whether to initialize asynchronously
	 * @param separator      separator between words
	 */
-	public static void init(String sensitiveWords, char separator, boolean isAsync){
-		if(StrUtil.isNotBlank(sensitiveWords)){
+	public static void init(String sensitiveWords, char separator, boolean isAsync) {
+		if (StrUtil.isNotBlank(sensitiveWords)) {
			init(StrUtil.split(sensitiveWords, separator), isAsync);
		}
	}

	/**
	 * Initializes the sensitive word tree, with words separated by commas.
	 *
	 * @param sensitiveWords string made up of sensitive words
	 * @param isAsync        whether to initialize asynchronously
	 */
-	public static void init(String sensitiveWords, boolean isAsync){
+	public static void init(String sensitiveWords, boolean isAsync) {
		init(sensitiveWords, DEFAULT_SEPARATOR, isAsync);
	}

	/**
	 * Sets the character filter rule, which filters out unwanted characters<br>
	 * When accept is false, the character does not take part in matching
@@ -80,90 +87,144 @@ public final class SensitiveUtil {
	 * @since 5.4.4
	 */
	public static void setCharFilter(Filter<Character> charFilter) {
-		if(charFilter != null) {
+		if (charFilter != null) {
			sensitiveTree.setCharFilter(charFilter);
		}
	}

	/**
	 * Checks whether the text contains sensitive words.
	 *
	 * @param text the text
	 * @return whether sensitive words are contained
	 */
-	public static boolean containsSensitive(String text){
+	public static boolean containsSensitive(String text) {
		return sensitiveTree.isMatch(text);
	}

	/**
	 * Checks whether the object contains sensitive words.
	 *
	 * @param obj a bean, converted to a JSON string
	 * @return whether sensitive words are contained
	 */
-	public static boolean containsSensitive(Object obj){
+	public static boolean containsSensitive(Object obj) {
		return sensitiveTree.isMatch(JSONUtil.toJsonStr(obj));
	}

	/**
	 * Finds sensitive words and returns the first one found.
	 *
	 * @param text the text
	 * @return the sensitive word
	 */
-	public static String getFindedFirstSensitive(String text){
+	public static FoundWord getFindedFirstSensitive(String text) {
		return sensitiveTree.match(text);
	}

	/**
	 * Finds sensitive words and returns the first one found.
	 *
	 * @param obj a bean, converted to a JSON string
	 * @return the sensitive word
	 */
-	public static String getFindedFirstSensitive(Object obj){
+	public static FoundWord getFindedFirstSensitive(Object obj) {
		return sensitiveTree.match(JSONUtil.toJsonStr(obj));
	}

	/**
	 * Finds sensitive words and returns all of them.
	 *
	 * @param text the text
	 * @return the sensitive words
	 */
-	public static List<String> getFindedAllSensitive(String text){
+	public static List<FoundWord> getFindedAllSensitive(String text) {
		return sensitiveTree.matchAll(text);
	}

	/**
	 * Finds sensitive words and returns all of them.<br>
	 * Density matching: given the keywords ab and b and the text abab, the matches are [ab, b, ab]<br>
	 * Greedy (longest) matching: given the keywords a and ab, the longest match yields [a, ab]
	 *
	 * @param text           the text
	 * @param isDensityMatch whether to use density matching
	 * @param isGreedMatch   whether to use greedy (longest) matching
	 * @return the sensitive words
	 */
-	public static List<String> getFindedAllSensitive(String text, boolean isDensityMatch, boolean isGreedMatch){
+	public static List<FoundWord> getFindedAllSensitive(String text, boolean isDensityMatch, boolean isGreedMatch) {
		return sensitiveTree.matchAll(text, -1, isDensityMatch, isGreedMatch);
	}

	/**
	 * Finds sensitive words and returns all of them.
	 *
	 * @param bean an object, converted to JSON
	 * @return the sensitive words
	 */
-	public static List<String> getFindedAllSensitive(Object bean){
+	public static List<FoundWord> getFindedAllSensitive(Object bean) {
		return sensitiveTree.matchAll(JSONUtil.toJsonStr(bean));
	}

	/**
	 * Finds sensitive words and returns all of them.<br>
	 * Density matching: given the keywords ab and b and the text abab, the matches are [ab, b, ab]<br>
	 * Greedy (longest) matching: given the keywords a and ab, the longest match yields [a, ab]
	 *
	 * @param bean           an object, converted to JSON
	 * @param isDensityMatch whether to use density matching
	 * @param isGreedMatch   whether to use greedy (longest) matching
	 * @return the sensitive words
	 */
-	public static List<String> getFindedAllSensitive(Object bean, boolean isDensityMatch, boolean isGreedMatch){
+	public static List<FoundWord> getFindedAllSensitive(Object bean, boolean isDensityMatch, boolean isGreedMatch) {
		return getFindedAllSensitive(JSONUtil.toJsonStr(bean), isDensityMatch, isGreedMatch);
	}

+	/**
+	 * Filters sensitive words.
+	 *
+	 * @param bean               an object, converted to JSON
+	 * @param isGreedMatch       greedy (longest) matching: given the keywords a and ab, the longest match yields [a, ab]
+	 * @param sensitiveProcessor sensitive word processor; by default each character of the match is replaced with *
+	 * @param <T>                the class type of the bean
+	 * @return the bean after sensitive word filtering
+	 */
+	public static <T> T sensitiveFilter(T bean, boolean isGreedMatch, SensitiveProcessor sensitiveProcessor) {
+		sensitiveProcessor = sensitiveProcessor == null ? new SensitiveProcessor() {
+		} : sensitiveProcessor;
+		String jsonText = JSONUtil.toJsonStr(bean);
+		Class<T> c = (Class) bean.getClass();
+		return JSONUtil.toBean(sensitiveFilter(jsonText, isGreedMatch, sensitiveProcessor), c);
+	}

+	/**
+	 * @param text               the text
+	 * @param isGreedMatch       greedy (longest) matching: given the keywords a and ab, the longest match yields [a, ab]
+	 * @param sensitiveProcessor sensitive word processor; by default each character of the match is replaced with *
+	 * @return the text after sensitive word filtering
+	 */
+	public static String sensitiveFilter(String text, boolean isGreedMatch, SensitiveProcessor sensitiveProcessor) {
+		if (null == text || text.trim().equals("")) {
+			return text;
+		}
+		// density matching is not needed when filtering sensitive words
+		List<FoundWord> foundWordList = getFindedAllSensitive(text, false, isGreedMatch);
+		if (CollectionUtil.isEmpty(foundWordList)) {
+			return text;
+		}
+		Map<Integer, FoundWord> foundWordMap = new HashMap<>(foundWordList.size());
+		foundWordList.forEach(foundWord -> foundWordMap.put(foundWord.getStartIndex(), foundWord));
+		int length = text.length();
+		StringBuilder textStringBuilder = new StringBuilder();
+		for (int i = 0; i < length; i++) {
+			FoundWord fw = foundWordMap.get(i);
+			if (fw != null) {
+				textStringBuilder.append(sensitiveProcessor.process(fw));
+				i = fw.getEndIndex();
+			} else {
+				textStringBuilder.append(text.charAt(i));
+			}
+		}
+		return textStringBuilder.toString();
+	}
}
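A usage sketch of the updated API (not part of the commit; the word list and text are made up). An empty anonymous SensitiveProcessor falls back to the default all-asterisk masking; note that only the bean overload of sensitiveFilter null-checks the processor, so the String overload is given one explicitly here:

import cn.hutool.dfa.FoundWord;
import cn.hutool.dfa.SensitiveProcessor;
import cn.hutool.dfa.SensitiveUtil;

import java.util.Arrays;
import java.util.List;

public class SensitiveUtilDemo {
	public static void main(String[] args) {
		// Hypothetical word list, initialized synchronously.
		SensitiveUtil.init(Arrays.asList("foo", "bad"), false);

		String text = "say foo and bad things";
		System.out.println(SensitiveUtil.containsSensitive(text)); // true

		// New in this commit: results are FoundWord objects instead of plain strings.
		List<FoundWord> found = SensitiveUtil.getFindedAllSensitive(text);
		for (FoundWord fw : found) {
			System.out.println(fw.getWord() + " at [" + fw.getStartIndex() + ", " + fw.getEndIndex() + "]");
		}

		// Default masking via an empty anonymous processor: "say *** and *** things"
		String filtered = SensitiveUtil.sensitiveFilter(text, false, new SensitiveProcessor() {
		});
		System.out.println(filtered);
	}
}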

hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java

@@ -5,12 +5,7 @@ import cn.hutool.core.lang.Filter;
import cn.hutool.core.text.StrBuilder;
import cn.hutool.core.util.StrUtil;

-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
+import java.util.*;

/**
 * DFA (Deterministic Finite Automaton)
@@ -140,11 +135,11 @@ public class WordTree extends HashMap<Character, WordTree> {
	 * @param text the text to check
	 * @return the matched keyword
	 */
-	public String match(String text) {
+	public FoundWord match(String text) {
		if (null == text) {
			return null;
		}
-		List<String> matchAll = matchAll(text, 1);
+		List<FoundWord> matchAll = matchAll(text, 1);
		if (CollectionUtil.isNotEmpty(matchAll)) {
			return matchAll.get(0);
		}
@@ -159,7 +154,7 @@ public class WordTree extends HashMap<Character, WordTree> {
	 * @param text the text to check
	 * @return the list of matched words
	 */
-	public List<String> matchAll(String text) {
+	public List<FoundWord> matchAll(String text) {
		return matchAll(text, -1);
	}

@@ -170,7 +165,7 @@ public class WordTree extends HashMap<Character, WordTree> {
	 * @param limit maximum number of matches
	 * @return the list of matched words
	 */
-	public List<String> matchAll(String text, int limit) {
+	public List<FoundWord> matchAll(String text, int limit) {
		return matchAll(text, limit, false, false);
	}

@@ -185,20 +180,22 @@ public class WordTree extends HashMap<Character, WordTree> {
	 * @param isGreedMatch whether to use greedy (longest) matching
	 * @return the list of matched words
	 */
-	public List<String> matchAll(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) {
+	public List<FoundWord> matchAll(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) {
		if (null == text) {
			return null;
		}

-		List<String> foundWords = new ArrayList<>();
+		List<FoundWord> foundWords = new ArrayList<>();
		WordTree current = this;
		int length = text.length();
		final Filter<Character> charFilter = this.charFilter;
		// buffer for the characters found so far; when a complete word appears it is added to foundWords, otherwise it is cleared
		final StrBuilder wordBuffer = StrUtil.strBuilder();
+		final StrBuilder keyBuffer = StrUtil.strBuilder();
		char currentChar;
		for (int i = 0; i < length; i++) {
			wordBuffer.reset();
+			keyBuffer.reset();
			for (int j = i; j < length; j++) {
				currentChar = text.charAt(j);
				// Console.log("i: {}, j: {}, currentChar: {}", i, j, currentChar);
@@ -216,9 +213,10 @@ public class WordTree extends HashMap<Character, WordTree> {
					break;
				}
				wordBuffer.append(currentChar);
+				keyBuffer.append(currentChar);
				if (current.isEnd(currentChar)) {
					// end of a keyword reached, the match is valid; continue searching from the position after this word
-					foundWords.add(wordBuffer.toString());
+					foundWords.add(new FoundWord(keyBuffer.toString(), wordBuffer.toString(), i, j));
					if (limit > 0 && foundWords.size() >= limit) {
						// match limit reached, return immediately
						return foundWords;
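To illustrate the new return type and the density flag at the WordTree level, a small sketch (not part of the commit) built on the Javadoc example above, where the keywords are ab and b and the text is abab:

import cn.hutool.dfa.FoundWord;
import cn.hutool.dfa.WordTree;

import java.util.Arrays;
import java.util.List;

public class WordTreeDemo {
	public static void main(String[] args) {
		WordTree tree = new WordTree();
		tree.addWords(Arrays.asList("ab", "b"));

		// Non-dense matching skips ahead after each match.
		List<FoundWord> sparse = tree.matchAll("abab", -1, false, false);
		// Dense matching also reports overlapping matches; the Javadoc gives [ab, b, ab] for this input.
		List<FoundWord> dense = tree.matchAll("abab", -1, true, false);

		System.out.println("sparse: " + sparse.size() + ", dense: " + dense.size());
		for (FoundWord fw : dense) {
			System.out.println(fw.getWord() + " -> '" + fw.getFoundWord() + "' [" + fw.getStartIndex() + "-" + fw.getEndIndex() + "]");
		}
	}
}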