add FoundWord

2025-08-18 20:38:02 +08:00 · 2020-12-08 04:54:44 +08:00
parent fbc80c27ac
commit 850c766213
12 changed files with 389 additions and 148 deletions
--- a/hutool-dfa/src/main/java/cn/hutool/dfa/FoundWord.java
+++ b/hutool-dfa/src/main/java/cn/hutool/dfa/FoundWord.java
@@ -1,49 +1,61 @@
 package cn.hutool.dfa;

-/**
- * @author 肖海斌
- * <p>
- * 匹配到的敏感词，包含敏感词，text中匹配敏感词的内容，以及匹配内容在text中的下标，
- * 下标可以用来做敏感词的进一步处理，如果替换成**
- */
-public class FoundWord {
-	/**
-	 * 生效的敏感词
-	 */
-	private String word;
-	/**
-	 * 敏感词匹配到的内容
-	 */
-	private String foundWord;
-	/**
-	 * 匹配内容在待分析字符串中的开始位置
-	 */
-	private int startIndex;
-	/**
-	 * 匹配内容在待分析字符串中的结束位置
-	 */
-	private int endIndex;
+import cn.hutool.core.lang.DefaultSegment;

-	public FoundWord(String word, String foundWord, int start, int end) {
+/**
+ * <p>
+ * 匹配到的单词，包含单词，text中匹配单词的内容，以及匹配内容在text中的下标，
+ * 下标可以用来做单词的进一步处理，例如替换成**
+ *
+ * @author 肖海斌
+ */
+public class FoundWord extends DefaultSegment<Integer> {
+	/**
+	 * 生效的单词，即单词树中的词
+	 */
+	private final String word;
+	/**
+	 * 单词匹配到的内容，即文中的单词
+	 */
+	private final String foundWord;
+
+	/**
+	 * 构造
+	 *
+	 * @param word 生效的单词，即单词树中的词
+	 * @param foundWord 单词匹配到的内容，即文中的单词
+	 * @param startIndex 起始位置（包含）
+	 * @param endIndex 结束位置（包含）
+	 */
+	public FoundWord(String word, String foundWord, int startIndex, int endIndex) {
+		super(startIndex, endIndex);
 		this.word = word;
 		this.foundWord = foundWord;
-		this.startIndex = start;
-		this.endIndex = end;
 	}

+	/**
+	 * 获取生效的单词，即单词树中的词
+	 *
+	 * @return 生效的单词
+	 */
 	public String getWord() {
 		return word;
 	}

+	/**
+	 * 获取单词匹配到的内容，即文中的单词
+	 * @return 单词匹配到的内容
+	 */
 	public String getFoundWord() {
 		return foundWord;
 	}

-	public int getStartIndex() {
-		return startIndex;
-	}
-
-	public int getEndIndex() {
-		return endIndex;
+	/**
+	 * 默认的，只输出匹配到的关键字
+	 * @return 匹配到的关键字
+	 */
+	@Override
+	public String toString() {
+		return this.foundWord;
 	}
 }
--- a/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java
+++ b/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java
@@ -1,6 +1,6 @@
 package cn.hutool.dfa;

-import cn.hutool.core.collection.CollectionUtil;
+import cn.hutool.core.collection.CollUtil;
 import cn.hutool.core.lang.Filter;
 import cn.hutool.core.thread.ThreadUtil;
 import cn.hutool.core.util.StrUtil;
@@ -25,7 +25,7 @@ public final class SensitiveUtil {
 	 * @return 是否已经被初始化
 	 */
 	public static boolean isInited() {
-		return !sensitiveTree.isEmpty();
+		return false == sensitiveTree.isEmpty();
 	}

 	/**
@@ -117,19 +117,44 @@ public final class SensitiveUtil {
 	 *
 	 * @param text 文本
 	 * @return 敏感词
+	 * @deprecated 请使用 {@link #getFoundFirstSensitive(String)}
 	 */
-	public static FoundWord getFindedFirstSensitive(String text) {
+	@Deprecated
+	public static String getFindedFirstSensitive(String text) {
 		return sensitiveTree.match(text);
 	}

+	/**
+	 * 查找敏感词，返回找到的第一个敏感词
+	 *
+	 * @param text 文本
+	 * @return 敏感词
+	 * @since 5.5.3
+	 */
+	public static FoundWord getFoundFirstSensitive(String text) {
+		return sensitiveTree.matchWord(text);
+	}
+
+	/**
+	 * 查找敏感词，返回找到的第一个敏感词
+	 *
+	 * @param obj bean，会被转为JSON字符串
+	 * @return 敏感词
+	 * @deprecated 请使用 {@link #getFoundFirstSensitive(Object)}
+	 */
+	@Deprecated
+	public static String getFindedFirstSensitive(Object obj) {
+		return sensitiveTree.match(JSONUtil.toJsonStr(obj));
+	}
+
 	/**
 	 * 查找敏感词，返回找到的第一个敏感词
 	 *
 	 * @param obj bean，会被转为JSON字符串
 	 * @return 敏感词
 	 */
-	public static FoundWord getFindedFirstSensitive(Object obj) {
-		return sensitiveTree.match(JSONUtil.toJsonStr(obj));
+	public static FoundWord getFoundFirstSensitive(Object obj) {
+		return sensitiveTree.matchWord(JSONUtil.toJsonStr(obj));
 	}

 	/**
@@ -137,11 +162,40 @@ public final class SensitiveUtil {
 	 *
 	 * @param text 文本
 	 * @return 敏感词
+	 * @deprecated 请使用 {@link #getFoundAllSensitive(String)}
 	 */
-	public static List<FoundWord> getFindedAllSensitive(String text) {
+	@Deprecated
+	public static List<String> getFindedAllSensitive(String text) {
 		return sensitiveTree.matchAll(text);
 	}

+	/**
+	 * 查找敏感词，返回找到的所有敏感词
+	 *
+	 * @param text 文本
+	 * @return 敏感词
+	 * @since 5.5.3
+	 */
+	public static List<FoundWord> getFoundAllSensitive(String text) {
+		return sensitiveTree.matchAllWords(text);
+	}
+
+	/**
+	 * 查找敏感词，返回找到的所有敏感词<br>
+	 * 密集匹配原则：假如关键词有 ab,b，文本是abab，将匹配 [ab,b,ab]<br>
+	 * 贪婪匹配（最长匹配）原则：假如关键字a,ab，最长匹配将匹配[a, ab]
+	 *
+	 * @param text           文本
+	 * @param isDensityMatch 是否使用密集匹配原则
+	 * @param isGreedMatch   是否使用贪婪匹配（最长匹配）原则
+	 * @return 敏感词
+	 * @deprecated 请使用 {@link #getFoundAllSensitive(String, boolean, boolean)}
+	 */
+	@Deprecated
+	public static List<String> getFindedAllSensitive(String text, boolean isDensityMatch, boolean isGreedMatch) {
+		return sensitiveTree.matchAll(text, -1, isDensityMatch, isGreedMatch);
+	}
+
 	/**
 	 * 查找敏感词，返回找到的所有敏感词<br>
 	 * 密集匹配原则：假如关键词有 ab,b，文本是abab，将匹配 [ab,b,ab]<br>
@@ -152,8 +206,8 @@ public final class SensitiveUtil {
 	 * @param isGreedMatch   是否使用贪婪匹配（最长匹配）原则
 	 * @return 敏感词
 	 */
-	public static List<FoundWord> getFindedAllSensitive(String text, boolean isDensityMatch, boolean isGreedMatch) {
-		return sensitiveTree.matchAll(text, -1, isDensityMatch, isGreedMatch);
+	public static List<FoundWord> getFoundAllSensitive(String text, boolean isDensityMatch, boolean isGreedMatch) {
+		return sensitiveTree.matchAllWords(text, -1, isDensityMatch, isGreedMatch);
 	}

 	/**
@@ -161,11 +215,24 @@ public final class SensitiveUtil {
 	 *
 	 * @param bean 对象，会被转为JSON
 	 * @return 敏感词
+	 * @deprecated 请使用 {@link #getFoundAllSensitive(Object)}
 	 */
-	public static List<FoundWord> getFindedAllSensitive(Object bean) {
+	@Deprecated
+	public static List<String> getFindedAllSensitive(Object bean) {
 		return sensitiveTree.matchAll(JSONUtil.toJsonStr(bean));
 	}

+	/**
+	 * 查找敏感词，返回找到的所有敏感词
+	 *
+	 * @param bean 对象，会被转为JSON
+	 * @return 敏感词
+	 * @since 5.5.3
+	 */
+	public static List<FoundWord> getFoundAllSensitive(Object bean) {
+		return sensitiveTree.matchAllWords(JSONUtil.toJsonStr(bean));
+	}
+
 	/**
 	 * 查找敏感词，返回找到的所有敏感词<br>
 	 * 密集匹配原则：假如关键词有 ab,b，文本是abab，将匹配 [ab,b,ab]<br>
@@ -175,9 +242,26 @@ public final class SensitiveUtil {
 	 * @param isDensityMatch 是否使用密集匹配原则
 	 * @param isGreedMatch   是否使用贪婪匹配（最长匹配）原则
 	 * @return 敏感词
+	 * @deprecated 请使用 {@link #getFoundAllSensitive(Object, boolean, boolean)}
 	 */
-	public static List<FoundWord> getFindedAllSensitive(Object bean, boolean isDensityMatch, boolean isGreedMatch) {
-		return getFindedAllSensitive(JSONUtil.toJsonStr(bean), isDensityMatch, isGreedMatch);
+	@Deprecated
+	public static List<String> getFindedAllSensitive(Object bean, boolean isDensityMatch, boolean isGreedMatch) {
+		return sensitiveTree.matchAll(JSONUtil.toJsonStr(bean), -1, isDensityMatch, isGreedMatch);
+	}
+
+	/**
+	 * 查找敏感词，返回找到的所有敏感词<br>
+	 * 密集匹配原则：假如关键词有 ab,b，文本是abab，将匹配 [ab,b,ab]<br>
+	 * 贪婪匹配（最长匹配）原则：假如关键字a,ab，最长匹配将匹配[a, ab]
+	 *
+	 * @param bean           对象，会被转为JSON
+	 * @param isDensityMatch 是否使用密集匹配原则
+	 * @param isGreedMatch   是否使用贪婪匹配（最长匹配）原则
+	 * @return 敏感词
+	 * @since 5.5.3
+	 */
+	public static List<FoundWord> getFoundAllSensitive(Object bean, boolean isDensityMatch, boolean isGreedMatch) {
+		return getFoundAllSensitive(JSONUtil.toJsonStr(bean), isDensityMatch, isGreedMatch);
 	}

 	/**
@@ -191,23 +275,27 @@ public final class SensitiveUtil {
 	 */
 	public static <T> T sensitiveFilter(T bean, boolean isGreedMatch, SensitiveProcessor sensitiveProcessor) {
 		String jsonText = JSONUtil.toJsonStr(bean);
-		Class<T> c = (Class) bean.getClass();
+		@SuppressWarnings("unchecked")
+		final Class<T> c = (Class<T>) bean.getClass();
 		return JSONUtil.toBean(sensitiveFilter(jsonText, isGreedMatch, sensitiveProcessor), c);
 	}

 	/**
+	 * 处理过滤文本中的敏感词，默认替换成*
+	 *
 	 * @param text               文本
 	 * @param isGreedMatch       贪婪匹配（最长匹配）原则：假如关键字a,ab，最长匹配将匹配[a, ab]
 	 * @param sensitiveProcessor 敏感词处理器，默认按匹配内容的字符数替换成*
 	 * @return 敏感词过滤处理后的文本
 	 */
 	public static String sensitiveFilter(String text, boolean isGreedMatch, SensitiveProcessor sensitiveProcessor) {
-		if (null == text || text.trim().equals("")) {
+		if (StrUtil.isEmpty(text)) {
 			return text;
 		}
+
 		//敏感词过滤场景下，不需要密集匹配
-		List<FoundWord> foundWordList = getFindedAllSensitive(text, false, isGreedMatch);
-		if (CollectionUtil.isEmpty(foundWordList)) {
+		List<FoundWord> foundWordList = getFoundAllSensitive(text, false, isGreedMatch);
+		if (CollUtil.isEmpty(foundWordList)) {
 			return text;
 		}
 		sensitiveProcessor = sensitiveProcessor == null ? new SensitiveProcessor() {
--- a/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java
+++ b/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java
@@ -1,11 +1,17 @@
 package cn.hutool.dfa;

+import cn.hutool.core.collection.CollUtil;
 import cn.hutool.core.collection.CollectionUtil;
 import cn.hutool.core.lang.Filter;
 import cn.hutool.core.text.StrBuilder;
 import cn.hutool.core.util.StrUtil;

-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;

 /**
 * DFA（Deterministic Finite Automaton 确定有穷自动机）
@@ -26,7 +32,7 @@ public class WordTree extends HashMap<Character, WordTree> {
 	private static final long serialVersionUID = -4646423269465809276L;

 	/**
-	 * 敏感词字符末尾标识，用于标识单词末尾字符
+	 * 单词字符末尾标识，用于标识单词末尾字符
 	 */
 	private final Set<Character> endCharacterSet = new HashSet<>();
 	/**
@@ -62,26 +68,30 @@ public class WordTree extends HashMap<Character, WordTree> {
 	 * 增加一组单词
 	 *
 	 * @param words 单词集合
+	 * @return this
 	 */
-	public void addWords(Collection<String> words) {
+	public WordTree addWords(Collection<String> words) {
 		if (false == (words instanceof Set)) {
 			words = new HashSet<>(words);
 		}
 		for (String word : words) {
 			addWord(word);
 		}
+		return this;
 	}

 	/**
 	 * 增加一组单词
 	 *
 	 * @param words 单词数组
+	 * @return this
 	 */
-	public void addWords(String... words) {
+	public WordTree addWords(String... words) {
 		HashSet<String> wordsSet = CollectionUtil.newHashSet(words);
 		for (String word : wordsSet) {
 			addWord(word);
 		}
+		return this;
 	}

 	/**
@@ -89,7 +99,7 @@ public class WordTree extends HashMap<Character, WordTree> {
 	 *
 	 * @param word 单词
 	 */
-	public void addWord(String word) {
+	public WordTree addWord(String word) {
 		final Filter<Character> charFilter = this.charFilter;
 		WordTree parent = null;
 		WordTree current = this;
@@ -112,8 +122,8 @@ public class WordTree extends HashMap<Character, WordTree> {
 		if (null != parent) {
 			parent.setEnd(currentChar);
 		}
+		return this;
 	}
-
 	//------------------------------------------------------------------------------- match

 	/**
@@ -126,7 +136,7 @@ public class WordTree extends HashMap<Character, WordTree> {
 		if (null == text) {
 			return false;
 		}
-		return null != match(text);
+		return null != matchWord(text);
 	}

 	/**
@@ -135,15 +145,24 @@ public class WordTree extends HashMap<Character, WordTree> {
 	 * @param text 被检查的文本
 	 * @return 匹配到的关键字
 	 */
-	public FoundWord match(String text) {
+	public String match(String text) {
+		final FoundWord foundWord = matchWord(text);
+		return null != foundWord ? foundWord.toString() : null;
+	}
+
+	/**
+	 * 获得第一个匹配的关键字
+	 *
+	 * @param text 被检查的文本
+	 * @return 匹配到的关键字
+	 * @since 5.5.3
+	 */
+	public FoundWord matchWord(String text) {
 		if (null == text) {
 			return null;
 		}
-		List<FoundWord> matchAll = matchAll(text, 1);
-		if (CollectionUtil.isNotEmpty(matchAll)) {
-			return matchAll.get(0);
-		}
-		return null;
+		final List<FoundWord> matchAll = matchAllWords(text, 1);
+		return CollUtil.get(matchAll, 0);
 	}

 	//------------------------------------------------------------------------------- match all
@@ -154,10 +173,21 @@ public class WordTree extends HashMap<Character, WordTree> {
 	 * @param text 被检查的文本
 	 * @return 匹配的词列表
 	 */
-	public List<FoundWord> matchAll(String text) {
+	public List<String> matchAll(String text) {
 		return matchAll(text, -1);
 	}

+	/**
+	 * 找出所有匹配的关键字
+	 *
+	 * @param text 被检查的文本
+	 * @return 匹配的词列表
+	 * @since 5.5.3
+	 */
+	public List<FoundWord> matchAllWords(String text) {
+		return matchAllWords(text, -1);
+	}
+
 	/**
 	 * 找出所有匹配的关键字
 	 *
@@ -165,10 +195,22 @@ public class WordTree extends HashMap<Character, WordTree> {
 	 * @param limit 限制匹配个数
 	 * @return 匹配的词列表
 	 */
-	public List<FoundWord> matchAll(String text, int limit) {
+	public List<String> matchAll(String text, int limit) {
 		return matchAll(text, limit, false, false);
 	}

+	/**
+	 * 找出所有匹配的关键字
+	 *
+	 * @param text  被检查的文本
+	 * @param limit 限制匹配个数
+	 * @return 匹配的词列表
+	 * @since 5.5.3
+	 */
+	public List<FoundWord> matchAllWords(String text, int limit) {
+		return matchAllWords(text, limit, false, false);
+	}
+
 	/**
 	 * 找出所有匹配的关键字<br>
 	 * 密集匹配原则：假如关键词有 ab,b，文本是abab，将匹配 [ab,b,ab]<br>
@@ -180,7 +222,24 @@ public class WordTree extends HashMap<Character, WordTree> {
 	 * @param isGreedMatch   是否使用贪婪匹配（最长匹配）原则
 	 * @return 匹配的词列表
 	 */
-	public List<FoundWord> matchAll(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) {
+	public List<String> matchAll(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) {
+		final List<FoundWord> matchAllWords = matchAllWords(text, limit, isDensityMatch, isGreedMatch);
+		return CollUtil.map(matchAllWords, FoundWord::toString, true);
+	}
+
+	/**
+	 * 找出所有匹配的关键字<br>
+	 * 密集匹配原则：假如关键词有 ab,b，文本是abab，将匹配 [ab,b,ab]<br>
+	 * 贪婪匹配（最长匹配）原则：假如关键字a,ab，最长匹配将匹配[a, ab]
+	 *
+	 * @param text           被检查的文本
+	 * @param limit          限制匹配个数
+	 * @param isDensityMatch 是否使用密集匹配原则
+	 * @param isGreedMatch   是否使用贪婪匹配（最长匹配）原则
+	 * @return 匹配的词列表
+	 * @since 5.5.3
+	 */
+	public List<FoundWord> matchAllWords(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) {
 		if (null == text) {
 			return null;
 		}
@@ -239,8 +298,6 @@ public class WordTree extends HashMap<Character, WordTree> {
 		}
 		return foundWords;
 	}
-
-
 	//--------------------------------------------------------------------------------------- Private method start

 	/**