mirror of
https://gitee.com/chinabugotech/hutool.git
synced 2025-08-18 20:38:02 +08:00
add FoundWord
This commit is contained in:
@@ -1,49 +1,61 @@
|
||||
package cn.hutool.dfa;
|
||||
|
||||
/**
|
||||
* @author 肖海斌
|
||||
* <p>
|
||||
* 匹配到的敏感词,包含敏感词,text中匹配敏感词的内容,以及匹配内容在text中的下标,
|
||||
* 下标可以用来做敏感词的进一步处理,如果替换成**
|
||||
*/
|
||||
public class FoundWord {
|
||||
/**
|
||||
* 生效的敏感词
|
||||
*/
|
||||
private String word;
|
||||
/**
|
||||
* 敏感词匹配到的内容
|
||||
*/
|
||||
private String foundWord;
|
||||
/**
|
||||
* 匹配内容在待分析字符串中的开始位置
|
||||
*/
|
||||
private int startIndex;
|
||||
/**
|
||||
* 匹配内容在待分析字符串中的结束位置
|
||||
*/
|
||||
private int endIndex;
|
||||
import cn.hutool.core.lang.DefaultSegment;
|
||||
|
||||
public FoundWord(String word, String foundWord, int start, int end) {
|
||||
/**
|
||||
* <p>
|
||||
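* A matched word: holds the word itself, the content in the text that matched it, and the indices of that content within the text.
|
||||
* The indices can be used for further processing of the word, for example replacing it with **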
|
||||
*
|
||||
* @author 肖海斌
|
||||
*/
|
||||
public class FoundWord extends DefaultSegment<Integer> {
|
||||
/**
|
||||
* 生效的单词,即单词树中的词
|
||||
*/
|
||||
private final String word;
|
||||
/**
|
||||
* 单词匹配到的内容,即文中的单词
|
||||
*/
|
||||
private final String foundWord;
|
||||
|
||||
/**
|
||||
* 构造
|
||||
*
|
||||
* @param word 生效的单词,即单词树中的词
|
||||
* @param foundWord 单词匹配到的内容,即文中的单词
|
||||
* @param startIndex 起始位置(包含)
|
||||
* @param endIndex 结束位置(包含)
|
||||
*/
|
||||
public FoundWord(String word, String foundWord, int startIndex, int endIndex) {
|
||||
super(startIndex, endIndex);
|
||||
this.word = word;
|
||||
this.foundWord = foundWord;
|
||||
this.startIndex = start;
|
||||
this.endIndex = end;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the effective word, i.e. the word as stored in the word tree.
|
||||
*
|
||||
* @return the effective word
|
||||
*/
|
||||
public String getWord() {
|
||||
return word;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the content matched by the word, i.e. the word as it appears in the text.
|
||||
* @return the content matched by the word
|
||||
*/
|
||||
public String getFoundWord() {
|
||||
return foundWord;
|
||||
}
|
||||
|
||||
public int getStartIndex() {
|
||||
return startIndex;
|
||||
}
|
||||
|
||||
public int getEndIndex() {
|
||||
return endIndex;
|
||||
/**
|
||||
* By default, outputs only the matched keyword (the {@code foundWord} field).
|
||||
* @return the matched keyword
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
return this.foundWord;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
package cn.hutool.dfa;
|
||||
|
||||
import cn.hutool.core.collection.CollectionUtil;
|
||||
import cn.hutool.core.collection.CollUtil;
|
||||
import cn.hutool.core.lang.Filter;
|
||||
import cn.hutool.core.thread.ThreadUtil;
|
||||
import cn.hutool.core.util.StrUtil;
|
||||
@@ -25,7 +25,7 @@ public final class SensitiveUtil {
|
||||
* @return 是否已经被初始化
|
||||
*/
|
||||
public static boolean isInited() {
|
||||
return !sensitiveTree.isEmpty();
|
||||
return false == sensitiveTree.isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -117,19 +117,44 @@ public final class SensitiveUtil {
|
||||
*
|
||||
* @param text 文本
|
||||
* @return 敏感词
|
||||
* @deprecated 请使用 {@link #getFoundFirstSensitive(String)}
|
||||
*/
|
||||
public static FoundWord getFindedFirstSensitive(String text) {
|
||||
@Deprecated
|
||||
public static String getFindedFirstSensitive(String text) {
|
||||
return sensitiveTree.match(text);
|
||||
}
|
||||
|
||||
/**
|
||||
* Searches for sensitive words and returns the first one found; delegates to {@code sensitiveTree.matchWord(text)}.
|
||||
*
|
||||
* @param text the text to search
|
||||
* @return the sensitive word that was found
|
||||
* @since 5.5.3
|
||||
*/
|
||||
public static FoundWord getFoundFirstSensitive(String text) {
|
||||
return sensitiveTree.matchWord(text);
|
||||
}
|
||||
|
||||
/**
|
||||
* Searches for sensitive words and returns the first one found.
|
||||
*
|
||||
* @param obj a bean; it is converted to a JSON string before matching
|
||||
* @return the sensitive word, as a string
|
||||
* @deprecated use {@link #getFoundFirstSensitive(Object)} instead
|
||||
*/
|
||||
@Deprecated
|
||||
public static String getFindedFirstSensitive(Object obj) {
|
||||
return sensitiveTree.match(JSONUtil.toJsonStr(obj));
|
||||
}
|
||||
|
||||
/**
|
||||
* 查找敏感词,返回找到的第一个敏感词
|
||||
*
|
||||
* @param obj bean,会被转为JSON字符串
|
||||
* @return 敏感词
|
||||
*/
|
||||
public static FoundWord getFindedFirstSensitive(Object obj) {
|
||||
return sensitiveTree.match(JSONUtil.toJsonStr(obj));
|
||||
public static FoundWord getFoundFirstSensitive(Object obj) {
|
||||
return sensitiveTree.matchWord(JSONUtil.toJsonStr(obj));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -137,11 +162,40 @@ public final class SensitiveUtil {
|
||||
*
|
||||
* @param text 文本
|
||||
* @return 敏感词
|
||||
* @deprecated 请使用 {@link #getFoundAllSensitive(String)}
|
||||
*/
|
||||
public static List<FoundWord> getFindedAllSensitive(String text) {
|
||||
@Deprecated
|
||||
public static List<String> getFindedAllSensitive(String text) {
|
||||
return sensitiveTree.matchAll(text);
|
||||
}
|
||||
|
||||
/**
|
||||
* Searches for sensitive words and returns all that were found; delegates to {@code sensitiveTree.matchAllWords(text)}.
|
||||
*
|
||||
* @param text the text to search
|
||||
* @return the sensitive words that were found
|
||||
* @since 5.5.3
|
||||
*/
|
||||
public static List<FoundWord> getFoundAllSensitive(String text) {
|
||||
return sensitiveTree.matchAllWords(text);
|
||||
}
|
||||
|
||||
/**
|
||||
* Searches for sensitive words and returns all that were found.<br>
|
||||
* Dense matching: given the keywords ab and b and the text abab, the matches are [ab,b,ab]<br>
|
||||
* Greedy (longest) matching: given the keywords a and ab, longest matching yields [a, ab]
|
||||
*
|
||||
* @param text the text to search
|
||||
* @param isDensityMatch whether to use dense matching
|
||||
* @param isGreedMatch whether to use greedy (longest) matching
|
||||
* @return the sensitive words, as strings
|
||||
* @deprecated use {@link #getFoundAllSensitive(String, boolean, boolean)} instead
|
||||
*/
|
||||
@Deprecated
|
||||
public static List<String> getFindedAllSensitive(String text, boolean isDensityMatch, boolean isGreedMatch) {
|
||||
return sensitiveTree.matchAll(text, -1, isDensityMatch, isGreedMatch);
|
||||
}
|
||||
|
||||
/**
|
||||
* 查找敏感词,返回找到的所有敏感词<br>
|
||||
* 密集匹配原则:假如关键词有 ab,b,文本是abab,将匹配 [ab,b,ab]<br>
|
||||
@@ -152,8 +206,8 @@ public final class SensitiveUtil {
|
||||
* @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则
|
||||
* @return 敏感词
|
||||
*/
|
||||
public static List<FoundWord> getFindedAllSensitive(String text, boolean isDensityMatch, boolean isGreedMatch) {
|
||||
return sensitiveTree.matchAll(text, -1, isDensityMatch, isGreedMatch);
|
||||
public static List<FoundWord> getFoundAllSensitive(String text, boolean isDensityMatch, boolean isGreedMatch) {
|
||||
return sensitiveTree.matchAllWords(text, -1, isDensityMatch, isGreedMatch);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -161,11 +215,24 @@ public final class SensitiveUtil {
|
||||
*
|
||||
* @param bean 对象,会被转为JSON
|
||||
* @return 敏感词
|
||||
* @deprecated 请使用 {@link #getFoundAllSensitive(Object)}
|
||||
*/
|
||||
public static List<FoundWord> getFindedAllSensitive(Object bean) {
|
||||
@Deprecated
|
||||
public static List<String> getFindedAllSensitive(Object bean) {
|
||||
return sensitiveTree.matchAll(JSONUtil.toJsonStr(bean));
|
||||
}
|
||||
|
||||
/**
|
||||
* Searches for sensitive words and returns all that were found.
|
||||
*
|
||||
* @param bean an object; it is converted to a JSON string before matching
|
||||
* @return the sensitive words that were found
|
||||
* @since 5.5.3
|
||||
*/
|
||||
public static List<FoundWord> getFoundAllSensitive(Object bean) {
|
||||
return sensitiveTree.matchAllWords(JSONUtil.toJsonStr(bean));
|
||||
}
|
||||
|
||||
/**
|
||||
* 查找敏感词,返回找到的所有敏感词<br>
|
||||
* 密集匹配原则:假如关键词有 ab,b,文本是abab,将匹配 [ab,b,ab]<br>
|
||||
@@ -175,9 +242,26 @@ public final class SensitiveUtil {
|
||||
* @param isDensityMatch 是否使用密集匹配原则
|
||||
* @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则
|
||||
* @return 敏感词
|
||||
* @deprecated 请使用 {@link #getFoundAllSensitive(Object, boolean, boolean)}
|
||||
*/
|
||||
public static List<FoundWord> getFindedAllSensitive(Object bean, boolean isDensityMatch, boolean isGreedMatch) {
|
||||
return getFindedAllSensitive(JSONUtil.toJsonStr(bean), isDensityMatch, isGreedMatch);
|
||||
@Deprecated
|
||||
public static List<String> getFindedAllSensitive(Object bean, boolean isDensityMatch, boolean isGreedMatch) {
|
||||
return sensitiveTree.matchAll(JSONUtil.toJsonStr(bean), -1, isDensityMatch, isGreedMatch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Searches for sensitive words and returns all that were found.<br>
|
||||
* Dense matching: given the keywords ab and b and the text abab, the matches are [ab,b,ab]<br>
|
||||
* Greedy (longest) matching: given the keywords a and ab, longest matching yields [a, ab]
|
||||
*
|
||||
* @param bean an object; it is converted to a JSON string before matching
|
||||
* @param isDensityMatch whether to use dense matching
|
||||
* @param isGreedMatch whether to use greedy (longest) matching
|
||||
* @return the sensitive words that were found
|
||||
* @since 5.5.3
|
||||
*/
|
||||
public static List<FoundWord> getFoundAllSensitive(Object bean, boolean isDensityMatch, boolean isGreedMatch) {
|
||||
return getFoundAllSensitive(JSONUtil.toJsonStr(bean), isDensityMatch, isGreedMatch);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -191,23 +275,27 @@ public final class SensitiveUtil {
|
||||
*/
|
||||
public static <T> T sensitiveFilter(T bean, boolean isGreedMatch, SensitiveProcessor sensitiveProcessor) {
|
||||
String jsonText = JSONUtil.toJsonStr(bean);
|
||||
Class<T> c = (Class) bean.getClass();
|
||||
@SuppressWarnings("unchecked")
|
||||
final Class<T> c = (Class<T>) bean.getClass();
|
||||
return JSONUtil.toBean(sensitiveFilter(jsonText, isGreedMatch, sensitiveProcessor), c);
|
||||
}
|
||||
|
||||
/**
|
||||
* 处理过滤文本中的敏感词,默认替换成*
|
||||
*
|
||||
* @param text 文本
|
||||
* @param isGreedMatch 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab]
|
||||
* @param sensitiveProcessor 敏感词处理器,默认按匹配内容的字符数替换成*
|
||||
* @return 敏感词过滤处理后的文本
|
||||
*/
|
||||
public static String sensitiveFilter(String text, boolean isGreedMatch, SensitiveProcessor sensitiveProcessor) {
|
||||
if (null == text || text.trim().equals("")) {
|
||||
if (StrUtil.isEmpty(text)) {
|
||||
return text;
|
||||
}
|
||||
|
||||
//敏感词过滤场景下,不需要密集匹配
|
||||
List<FoundWord> foundWordList = getFindedAllSensitive(text, false, isGreedMatch);
|
||||
if (CollectionUtil.isEmpty(foundWordList)) {
|
||||
List<FoundWord> foundWordList = getFoundAllSensitive(text, false, isGreedMatch);
|
||||
if (CollUtil.isEmpty(foundWordList)) {
|
||||
return text;
|
||||
}
|
||||
sensitiveProcessor = sensitiveProcessor == null ? new SensitiveProcessor() {
|
||||
|
||||
@@ -1,11 +1,17 @@
|
||||
package cn.hutool.dfa;
|
||||
|
||||
import cn.hutool.core.collection.CollUtil;
|
||||
import cn.hutool.core.collection.CollectionUtil;
|
||||
import cn.hutool.core.lang.Filter;
|
||||
import cn.hutool.core.text.StrBuilder;
|
||||
import cn.hutool.core.util.StrUtil;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* DFA(Deterministic Finite Automaton 确定有穷自动机)
|
||||
@@ -26,7 +32,7 @@ public class WordTree extends HashMap<Character, WordTree> {
|
||||
private static final long serialVersionUID = -4646423269465809276L;
|
||||
|
||||
/**
|
||||
* 敏感词字符末尾标识,用于标识单词末尾字符
|
||||
* 单词字符末尾标识,用于标识单词末尾字符
|
||||
*/
|
||||
private final Set<Character> endCharacterSet = new HashSet<>();
|
||||
/**
|
||||
@@ -62,26 +68,30 @@ public class WordTree extends HashMap<Character, WordTree> {
|
||||
* 增加一组单词
|
||||
*
|
||||
* @param words 单词集合
|
||||
* @return this
|
||||
*/
|
||||
public void addWords(Collection<String> words) {
|
||||
public WordTree addWords(Collection<String> words) {
|
||||
if (false == (words instanceof Set)) {
|
||||
words = new HashSet<>(words);
|
||||
}
|
||||
for (String word : words) {
|
||||
addWord(word);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* 增加一组单词
|
||||
*
|
||||
* @param words 单词数组
|
||||
* @return this
|
||||
*/
|
||||
public void addWords(String... words) {
|
||||
public WordTree addWords(String... words) {
|
||||
HashSet<String> wordsSet = CollectionUtil.newHashSet(words);
|
||||
for (String word : wordsSet) {
|
||||
addWord(word);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -89,7 +99,7 @@ public class WordTree extends HashMap<Character, WordTree> {
|
||||
*
|
||||
* @param word 单词
|
||||
*/
|
||||
public void addWord(String word) {
|
||||
public WordTree addWord(String word) {
|
||||
final Filter<Character> charFilter = this.charFilter;
|
||||
WordTree parent = null;
|
||||
WordTree current = this;
|
||||
@@ -112,8 +122,8 @@ public class WordTree extends HashMap<Character, WordTree> {
|
||||
if (null != parent) {
|
||||
parent.setEnd(currentChar);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------- match
|
||||
|
||||
/**
|
||||
@@ -126,7 +136,7 @@ public class WordTree extends HashMap<Character, WordTree> {
|
||||
if (null == text) {
|
||||
return false;
|
||||
}
|
||||
return null != match(text);
|
||||
return null != matchWord(text);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -135,15 +145,24 @@ public class WordTree extends HashMap<Character, WordTree> {
|
||||
* @param text 被检查的文本
|
||||
* @return 匹配到的关键字
|
||||
*/
|
||||
public FoundWord match(String text) {
|
||||
public String match(String text) {
|
||||
final FoundWord foundWord = matchWord(text);
|
||||
return null != foundWord ? foundWord.toString() : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获得第一个匹配的关键字
|
||||
*
|
||||
* @param text 被检查的文本
|
||||
* @return 匹配到的关键字
|
||||
* @since 5.5.3
|
||||
*/
|
||||
public FoundWord matchWord(String text) {
|
||||
if (null == text) {
|
||||
return null;
|
||||
}
|
||||
List<FoundWord> matchAll = matchAll(text, 1);
|
||||
if (CollectionUtil.isNotEmpty(matchAll)) {
|
||||
return matchAll.get(0);
|
||||
}
|
||||
return null;
|
||||
final List<FoundWord> matchAll = matchAllWords(text, 1);
|
||||
return CollUtil.get(matchAll, 0);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------- match all
|
||||
@@ -154,10 +173,21 @@ public class WordTree extends HashMap<Character, WordTree> {
|
||||
* @param text 被检查的文本
|
||||
* @return 匹配的词列表
|
||||
*/
|
||||
public List<FoundWord> matchAll(String text) {
|
||||
public List<String> matchAll(String text) {
|
||||
return matchAll(text, -1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds all matching keywords; passes -1 as the limit (presumably meaning "no limit" - confirm against the full overload).
|
||||
*
|
||||
* @param text the text to check
|
||||
* @return the list of matched words
|
||||
* @since 5.5.3
|
||||
*/
|
||||
public List<FoundWord> matchAllWords(String text) {
|
||||
return matchAllWords(text, -1);
|
||||
}
|
||||
|
||||
/**
|
||||
* 找出所有匹配的关键字
|
||||
*
|
||||
@@ -165,10 +195,22 @@ public class WordTree extends HashMap<Character, WordTree> {
|
||||
* @param limit 限制匹配个数
|
||||
* @return 匹配的词列表
|
||||
*/
|
||||
public List<FoundWord> matchAll(String text, int limit) {
|
||||
public List<String> matchAll(String text, int limit) {
|
||||
return matchAll(text, limit, false, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds all matching keywords, with dense matching and greedy matching both disabled.
|
||||
*
|
||||
* @param text the text to check
|
||||
* @param limit the maximum number of matches
|
||||
* @return the list of matched words
|
||||
* @since 5.5.3
|
||||
*/
|
||||
public List<FoundWord> matchAllWords(String text, int limit) {
|
||||
return matchAllWords(text, limit, false, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* 找出所有匹配的关键字<br>
|
||||
* 密集匹配原则:假如关键词有 ab,b,文本是abab,将匹配 [ab,b,ab]<br>
|
||||
@@ -180,7 +222,24 @@ public class WordTree extends HashMap<Character, WordTree> {
|
||||
* @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则
|
||||
* @return 匹配的词列表
|
||||
*/
|
||||
public List<FoundWord> matchAll(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) {
|
||||
public List<String> matchAll(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) {
|
||||
final List<FoundWord> matchAllWords = matchAllWords(text, limit, isDensityMatch, isGreedMatch);
|
||||
return CollUtil.map(matchAllWords, FoundWord::toString, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* 找出所有匹配的关键字<br>
|
||||
* 密集匹配原则:假如关键词有 ab,b,文本是abab,将匹配 [ab,b,ab]<br>
|
||||
* 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab]
|
||||
*
|
||||
* @param text 被检查的文本
|
||||
* @param limit 限制匹配个数
|
||||
* @param isDensityMatch 是否使用密集匹配原则
|
||||
* @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则
|
||||
* @return 匹配的词列表
|
||||
* @since 5.5.3
|
||||
*/
|
||||
public List<FoundWord> matchAllWords(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) {
|
||||
if (null == text) {
|
||||
return null;
|
||||
}
|
||||
@@ -239,8 +298,6 @@ public class WordTree extends HashMap<Character, WordTree> {
|
||||
}
|
||||
return foundWords;
|
||||
}
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------------- Private method start
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user