clean history

2025-07-21 15:09:48 +08:00 · 2019-08-14 10:02:32 +08:00
commit 6b011af032
1215 changed files with 159913 additions and 0 deletions
--- a/hutool-dfa/pom.xml
+++ b/hutool-dfa/pom.xml
@@ -0,0 +1,29 @@
+<?xml version='1.0' encoding='utf-8'?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+	<modelVersion>4.0.0</modelVersion>
+
+	<packaging>jar</packaging>
+	
+	<parent>
+		<groupId>cn.hutool</groupId>
+		<artifactId>hutool-parent</artifactId>
+		<version>4.6.2-SNAPSHOT</version>
+	</parent>
+
+	<artifactId>hutool-dfa</artifactId>
+	<name>${project.artifactId}</name>
+	<description>Hutool 基于DFA的关键词查找</description>
+	
+	<dependencies>
+		<dependency>
+			<groupId>cn.hutool</groupId>
+			<artifactId>hutool-core</artifactId>
+			<version>${project.parent.version}</version>
+		</dependency>
+		<dependency>
+			<groupId>cn.hutool</groupId>
+			<artifactId>hutool-json</artifactId>
+			<version>${project.parent.version}</version>
+		</dependency>
+	</dependencies>
+</project>
--- a/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java
+++ b/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java
@@ -0,0 +1,161 @@
+package cn.hutool.dfa;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.Callable;
+
+import cn.hutool.core.thread.ThreadUtil;
+import cn.hutool.core.util.StrUtil;
+import cn.hutool.json.JSONUtil;
+
+/**
+ * Sensitive-word utility: a static facade over one shared {@link WordTree}.
+ * <p>
+ * Every {@code init} overload builds a brand-new tree and publishes it through a volatile
+ * field. Lookups that run while an (optionally asynchronous) initialisation is in progress
+ * therefore keep using the previous, fully built tree instead of an empty or half-populated one.
+ *
+ * @author Looly
+ */
+public final class SensitiveUtil {
+
+	/** Default separator (a comma) used when the word list is passed as a single string. */
+	public static final char DEFAULT_SEPARATOR = StrUtil.C_COMMA;
+
+	/**
+	 * The tree used by all lookups. It is never mutated after being assigned here;
+	 * re-initialisation swaps in a new instance.
+	 */
+	private static volatile WordTree sensitiveTree = new WordTree();
+
+	/**
+	 * @return {@code true} once a non-empty word tree has been installed
+	 */
+	public static boolean isInited(){
+		return !sensitiveTree.isEmpty();
+	}
+
+	/**
+	 * Initialises the sensitive-word tree, optionally on a background thread.
+	 *
+	 * @param sensitiveWords the sensitive words
+	 * @param isAsync {@code true} to build the tree asynchronously via {@link ThreadUtil#execAsync}
+	 */
+	public static void init(final Collection<String> sensitiveWords, boolean isAsync){
+		if(false == isAsync){
+			init(sensitiveWords);
+			return;
+		}
+		ThreadUtil.execAsync(new Callable<Boolean>(){
+			@Override
+			public Boolean call() throws Exception {
+				init(sensitiveWords);
+				return true;
+			}
+		});
+	}
+
+	/**
+	 * Initialises the sensitive-word tree, replacing any previously installed words.
+	 * <p>
+	 * The new tree is built completely before it becomes visible to the lookup methods.
+	 *
+	 * @param sensitiveWords the sensitive words; must not be {@code null}
+	 */
+	public static void init(Collection<String> sensitiveWords){
+		final WordTree freshTree = new WordTree();
+		freshTree.addWords(sensitiveWords);
+		// Single volatile write: readers see either the old tree or the finished new one.
+		sensitiveTree = freshTree;
+	}
+
+	/**
+	 * Initialises the sensitive-word tree from a delimited string.
+	 * A blank string is ignored and leaves the current tree untouched.
+	 *
+	 * @param sensitiveWords the sensitive words joined into one string
+	 * @param separator the character separating the words
+	 * @param isAsync {@code true} to build the tree asynchronously
+	 */
+	public static void init(String sensitiveWords, char separator, boolean isAsync){
+		if(StrUtil.isNotBlank(sensitiveWords)){
+			init(StrUtil.split(sensitiveWords, separator), isAsync);
+		}
+	}
+
+	/**
+	 * Initialises the sensitive-word tree from a string whose words are separated by
+	 * {@link #DEFAULT_SEPARATOR}.
+	 *
+	 * @param sensitiveWords the sensitive words joined into one string
+	 * @param isAsync {@code true} to build the tree asynchronously
+	 */
+	public static void init(String sensitiveWords, boolean isAsync){
+		init(sensitiveWords, DEFAULT_SEPARATOR, isAsync);
+	}
+
+	/**
+	 * Checks whether the text contains a sensitive word.
+	 *
+	 * @param text the text to inspect
+	 * @return {@code true} if at least one sensitive word is found
+	 */
+	public static boolean containsSensitive(String text){
+		return sensitiveTree.isMatch(text);
+	}
+
+	/**
+	 * Checks whether the JSON representation of an object contains a sensitive word.
+	 *
+	 * @param obj the bean; it is converted to a JSON string before matching
+	 * @return {@code true} if at least one sensitive word is found
+	 */
+	public static boolean containsSensitive(Object obj){
+		return sensitiveTree.isMatch(JSONUtil.toJsonStr(obj));
+	}
+
+	/**
+	 * Returns the first sensitive word found in the text.
+	 *
+	 * @param text the text to inspect
+	 * @return the first sensitive word, or {@code null} if none is found
+	 */
+	public static String getFindedFirstSensitive(String text){
+		return sensitiveTree.match(text);
+	}
+
+	/**
+	 * Returns the first sensitive word found in the JSON representation of an object.
+	 *
+	 * @param obj the bean; it is converted to a JSON string before matching
+	 * @return the first sensitive word, or {@code null} if none is found
+	 */
+	public static String getFindedFirstSensitive(Object obj){
+		return sensitiveTree.match(JSONUtil.toJsonStr(obj));
+	}
+
+	/**
+	 * Returns every sensitive word found in the text, using non-dense, non-greedy matching.
+	 *
+	 * @param text the text to inspect
+	 * @return the sensitive words found
+	 */
+	public static List<String> getFindedAllSensitive(String text){
+		return sensitiveTree.matchAll(text);
+	}
+
+	/**
+	 * Returns every sensitive word found in the text.<br>
+	 * Dense matching: with keywords ab,b and the text abab, the result is [ab,b,ab].<br>
+	 * Greedy (longest) matching: with keywords a,ab, the result is [a, ab].
+	 *
+	 * @param text the text to inspect
+	 * @param isDensityMatch whether to use dense matching
+	 * @param isGreedMatch whether to use greedy (longest) matching
+	 * @return the sensitive words found
+	 */
+	public static List<String> getFindedAllSensitive(String text, boolean isDensityMatch, boolean isGreedMatch){
+		return sensitiveTree.matchAll(text, -1, isDensityMatch, isGreedMatch);
+	}
+
+	/**
+	 * Returns every sensitive word found in the JSON representation of an object,
+	 * using non-dense, non-greedy matching.
+	 *
+	 * @param bean the object; it is converted to JSON before matching
+	 * @return the sensitive words found
+	 */
+	public static List<String> getFindedAllSensitive(Object bean){
+		return sensitiveTree.matchAll(JSONUtil.toJsonStr(bean));
+	}
+
+	/**
+	 * Returns every sensitive word found in the JSON representation of an object.<br>
+	 * Dense matching: with keywords ab,b and the text abab, the result is [ab,b,ab].<br>
+	 * Greedy (longest) matching: with keywords a,ab, the result is [a, ab].
+	 *
+	 * @param bean the object; it is converted to JSON before matching
+	 * @param isDensityMatch whether to use dense matching
+	 * @param isGreedMatch whether to use greedy (longest) matching
+	 * @return the sensitive words found
+	 */
+	public static List<String> getFindedAllSensitive(Object bean, boolean isDensityMatch, boolean isGreedMatch){
+		return getFindedAllSensitive(JSONUtil.toJsonStr(bean), isDensityMatch, isGreedMatch);
+	}
+}
--- a/hutool-dfa/src/main/java/cn/hutool/dfa/StopChar.java
+++ b/hutool-dfa/src/main/java/cn/hutool/dfa/StopChar.java
@@ -0,0 +1,49 @@
+package cn.hutool.dfa;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * 过滤词及一些简单处理
+ * 
+ * @author Looly
+ */
+public class StopChar {
+	/** 不需要处理的词，如标点符号、空格等 */
+	public static final Set<Character> STOP_WORD = new HashSet<>(Arrays.asList(new Character[] { ' ', '\'', '、', '。',
+			'·', 'ˉ', 'ˇ', '々', '—', '～', '‖', '…', '‘', '’', '“', '”', '〔', '〕', '〈', '〉', '《', '》', '「', '」', '『',
+			'』', '〖', '〗', '【', '】', '±', '＋', '－', '×', '÷', '∧', '∨', '∑', '∏', '∪', '∩', '∈', '√', '⊥', '⊙', '∫',
+			'∮', '≡', '≌', '≈', '∽', '∝', '≠', '≮', '≯', '≤', '≥', '∞', '∶', '∵', '∴', '∷', '♂', '♀', '°', '′', '〃',
+			'℃', '＄', '¤', '￠', '￡', '‰', '§', '☆', '★', '〇', '○', '●', '◎', '◇', '◆', '□', '■', '△', '▽', '⊿', '▲',
+			'▼', '◣', '◤', '◢', '◥', '▁', '▂', '▃', '▄', '▅', '▆', '▇', '█', '▉', '▊', '▋', '▌', '▍', '▎', '▏', '▓',
+			'※', '→', '←', '↑', '↓', '↖', '↗', '↘', '↙', '〓', 'ⅰ', 'ⅱ', 'ⅲ', 'ⅳ', 'ⅴ', 'ⅵ', 'ⅶ', 'ⅷ', 'ⅸ', 'ⅹ', '①',
+			'②', '③', '④', '⑤', '⑥', '⑦', '⑧', '⑨', '⑩', '⒈', '⒉', '⒊', '⒋', '⒌', '⒍', '⒎', '⒏', '⒐', '⒑', '⒒', '⒓',
+			'⒔', '⒕', '⒖', '⒗', '⒘', '⒙', '⒚', '⒛', '⑴', '⑵', '⑶', '⑷', '⑸', '⑹', '⑺', '⑻', '⑼', '⑽', '⑾', '⑿', '⒀',
+			'⒁', '⒂', '⒃', '⒄', '⒅', '⒆', '⒇', 'Ⅰ', 'Ⅱ', 'Ⅲ', 'Ⅳ', 'Ⅴ', 'Ⅵ', 'Ⅶ', 'Ⅷ', 'Ⅸ', 'Ⅹ', 'Ⅺ', 'Ⅻ', '！', '”',
+			'＃', '￥', '％', '＆', '’', '（', '）', '＊', '＋', '，', '－', '．', '／', '０', '１', '２', '３', '４', '５', '６', '７',
+			'８', '９', '：', '；', '＜', '＝', '＞', '？', '＠', '〔', '＼', '〕', '＾', '＿', '‘', '｛', '｜', '｝', '∏', 'Ρ', '∑',
+			'Υ', 'Φ', 'Χ', 'Ψ', 'Ω', 'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π',
+			'ρ', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω', '（', '）', '〔', '〕', '＾', '﹊', '﹍', '╭', '╮', '╰', '╯', '', '_',
+			'', '^', '（', '^', '：', '！', '/', '\\', '\"', '<', '>', '`', '·', '。', '{', '}', '~', '～', '(', ')', '-',
+			'√', '$', '@', '*', '&', '#', '卐', '㎎', '㎏', '㎜', '㎝', '㎞', '㎡', '㏄', '㏎', '㏑', '㏒', '㏕' }));
+
+	/**
+	 * Tells whether the given character is a stop character, i.e. one that is not processed.
+	 * A character qualifies when {@link Character#isWhitespace(char)} accepts it or when it is
+	 * contained in {@link #STOP_WORD}.
+	 * 
+	 * @param ch the character to test
+	 * @return {@code true} if the character is a stop character
+	 */
+	public static boolean isStopChar(char ch) {
+		if (Character.isWhitespace(ch)) {
+			return true;
+		}
+		return STOP_WORD.contains(ch);
+	}
+	
+	/**
+	 * Tells whether the given character is a regular character that should be processed;
+	 * the exact negation of {@link #isStopChar(char)}.
+	 * 
+	 * @param ch the character to test
+	 * @return {@code true} if the character is not a stop character
+	 */
+	public static boolean isNotStopChar(char ch) {
+		return !isStopChar(ch);
+	}
+}
--- a/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java
+++ b/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java
@@ -0,0 +1,233 @@
+package cn.hutool.dfa;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import cn.hutool.core.collection.CollectionUtil;
+import cn.hutool.core.util.StrUtil;
+
+/**
+ * DFA (Deterministic Finite Automaton) word tree, used to quickly find out whether any of a
+ * set of keywords occurs in a larger piece of text.<br>
+ * The keywords are stored as a tree of characters.<br>
+ * For example, after adding 红领巾 and 红河 the tree looks like this:<br>
+ *                 红                    <br>
+ *              /      \                 <br>
+ *           领         河             <br>
+ *          /                            <br>
+ *        巾                            <br>
+ * Every node is itself a WordTree; matching walks the tree from the top down.<br>
+ * Stop characters (see {@link StopChar}) are skipped both when words are added and when text is matched.
+ * @author Looly
+ *
+ */
+public class WordTree extends HashMap<Character, WordTree>{
+	private static final long serialVersionUID = -4646423269465809276L;
+	
+	/**
+	 * End-of-word markers: a character in this set, looked up from this node,
+	 * is the final character of a complete keyword.
+	 */
+	private Set<Character> endCharacterSet = new HashSet<>();
+	
+	//--------------------------------------------------------------------------------------- Constructor start
+	/**
+	 * Creates an empty tree.
+	 */
+	public WordTree() {
+	}
+	//--------------------------------------------------------------------------------------- Constructor end
+	
+	//------------------------------------------------------------------------------- add word
+	
+	/**
+	 * Adds a group of words. Duplicates are removed first by copying into a set
+	 * unless the argument already is one.
+	 * @param words the words to add; must not be {@code null}
+	 */
+	public void addWords(Collection<String> words){
+		if(false == (words instanceof Set)){
+			words = new HashSet<>(words);
+		}
+		for (String word : words) {
+			addWord(word);
+		}
+	}
+	
+	/**
+	 * Adds a group of words. Duplicates are removed first.
+	 * @param words the words to add
+	 */
+	public void addWords(String... words){
+		HashSet<String> wordsSet = CollectionUtil.newHashSet(words);
+		for (String word : wordsSet) {
+			addWord(word);
+		}
+	}
+	
+	/**
+	 * Adds a single word. Stop characters inside the word are skipped; a {@code null} word,
+	 * or a word made up only of stop characters, adds nothing.
+	 * @param word the word to add
+	 */
+	public void addWord(String word) {
+		if(null == word){
+			return;
+		}
+		WordTree parent = null;
+		WordTree current = this;
+		WordTree child;
+		char currentChar;
+		// Last character actually inserted into the tree. The end marker must use this one,
+		// not the last character read, otherwise a word ending in a stop character can never match.
+		char lastAddedChar = 0;
+		final int length = word.length();
+		for(int i = 0; i < length; i++){
+			currentChar = word.charAt(i);
+			if(StopChar.isStopChar(currentChar)){
+				// stop characters are not part of the tree
+				continue;
+			}
+			child = current.get(currentChar);
+			if(child == null){
+				// no child for this character yet: create the node that will hold the next character
+				child = new WordTree();
+				current.put(currentChar, child);
+			}
+			parent = current;
+			current = child;
+			lastAddedChar = currentChar;
+		}
+		if(null != parent){
+			parent.setEnd(lastAddedChar);
+		}
+	}
+	
+	/**
+	 * Removes all words from this node. In addition to the child nodes, the end-of-word markers
+	 * are cleared, so that markers of removed words cannot produce matches after the tree is refilled.
+	 */
+	@Override
+	public void clear() {
+		super.clear();
+		this.endCharacterSet.clear();
+	}
+	
+	//------------------------------------------------------------------------------- match
+	/**
+	 * Checks whether the text contains any word of this tree.
+	 * @param text the text to inspect
+	 * @return {@code true} if a word is found; {@code false} otherwise or if the text is {@code null}
+	 */
+	public boolean isMatch(String text){
+		if(null == text){
+			return false;
+		}
+		return null != match(text);
+	}
+	
+	/**
+	 * Returns the first keyword found in the text.
+	 * @param text the text to inspect
+	 * @return the matched keyword, or {@code null} if nothing matches or the text is {@code null}
+	 */
+	public String match(String text){
+		if(null == text){
+			return null;
+		}
+		List<String> matchAll = matchAll(text, 1);
+		if(CollectionUtil.isNotEmpty(matchAll)){
+			return matchAll.get(0);
+		}
+		return null;
+	}
+	
+	//------------------------------------------------------------------------------- match all
+	/**
+	 * Finds all keywords in the text, without a limit, using non-dense, non-greedy matching.
+	 * @param text the text to inspect
+	 * @return the matched words, or {@code null} if the text is {@code null}
+	 */
+	public List<String> matchAll(String text) {
+		return matchAll(text, -1);
+	}
+	
+	/**
+	 * Finds keywords in the text using non-dense, non-greedy matching.
+	 * @param text the text to inspect
+	 * @param limit maximum number of matches to return; a value that is not positive means no limit
+	 * @return the matched words, or {@code null} if the text is {@code null}
+	 */
+	public List<String> matchAll(String text, int limit) {
+		return matchAll(text, limit, false, false);
+	}
+	
+	/**
+	 * Finds keywords in the text.<br>
+	 * Dense matching: with keywords ab,b and the text abab, the result is [ab,b,ab].<br>
+	 * Greedy (longest) matching: with keywords a,ab, the result is [a, ab].
+	 * 
+	 * @param text the text to inspect
+	 * @param limit maximum number of matches to return; a value that is not positive means no limit
+	 * @param isDensityMatch whether to use dense matching (do not skip over text that already matched)
+	 * @param isGreedMatch whether to use greedy (longest) matching (keep going after the first word end)
+	 * @return the matched words, or {@code null} if the text is {@code null}
+	 */
+	public List<String> matchAll(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) {
+		if(null == text){
+			return null;
+		}
+		
+		List<String> findedWords = new ArrayList<String>();
+		WordTree current = this;
+		int length = text.length();
+		StringBuilder wordBuffer;// characters collected for the current candidate; a fresh buffer is used for every start position
+		char currentChar;
+		for (int i = 0; i < length; i++) {
+			wordBuffer = StrUtil.builder();
+			for (int j = i; j < length; j++) {
+				currentChar = text.charAt(j);
+				if(StopChar.isStopChar(currentChar)){
+					if(wordBuffer.length() > 0){
+						// a stop character in the middle of a keyword is returned as part of the matched text
+						wordBuffer.append(currentChar);
+					}else{
+						// a stop character before the first keyword character is skipped
+						i++;
+					}
+					continue;
+				}else if(false == current.containsKey(currentChar)){
+					// not a keyword character here: give up on this start position and restart from the next one
+					break;
+				}
+				wordBuffer.append(currentChar);
+				if(current.isEnd(currentChar)){
+					// reached the end of a word: the keyword is complete
+					findedWords.add(wordBuffer.toString());
+					if(limit > 0 && findedWords.size() >= limit){
+						// match limit reached, return immediately
+						return findedWords;
+					}
+					if(false == isDensityMatch){
+						// non-dense matching: continue the search after the matched word
+						i = j;
+					}
+					if(false == isGreedMatch){
+						// lazy (non-greedy) matching: stop this round at the first end marker
+						break;
+					}
+				}
+				current = current.get(currentChar);
+				if(null == current){
+					break;
+				}
+			}
+			// every start position is matched from the root again
+			current = this;
+		}
+		return findedWords;
+	}
+	
+	
+	//--------------------------------------------------------------------------------------- Private method start
+	/**
+	 * Tells whether the character ends a word at this node.
+	 * @param c the character to check
+	 * @return {@code true} if the character is marked as a word end
+	 */
+	private boolean isEnd(Character c){
+		return this.endCharacterSet.contains(c);
+	}
+	
+	/**
+	 * Marks the character as ending a word at this node; {@code null} is ignored.
+	 * @param c the final character of a word
+	 */
+	private void setEnd(Character c){
+		if(null != c){
+			this.endCharacterSet.add(c);
+		}
+	}
+	//--------------------------------------------------------------------------------------- Private method end
+}
--- a/hutool-dfa/src/main/java/cn/hutool/dfa/package-info.java
+++ b/hutool-dfa/src/main/java/cn/hutool/dfa/package-info.java
@@ -0,0 +1,9 @@
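+/**
+ * DFA stands for Deterministic Finite Automaton.<br>
+ * The principle is simple: all keywords are built into a tree and the text is then walked against that tree; reaching a node marked as the end of a word (not necessarily a leaf, since one keyword may be a prefix of another) means the text contains that keyword.<br>
+ * Leaving aside the time needed to build the keyword tree, a search is roughly O(n) in the length of the text, because the look-ahead from each start position is bounded by the depth of the tree.<br>
+ * 
+ * @author looly
+ *
+ */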
+package cn.hutool.dfa;
--- a/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java
+++ b/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java
@@ -0,0 +1,113 @@
+package cn.hutool.dfa.test;
+
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import cn.hutool.core.collection.CollectionUtil;
+import cn.hutool.dfa.WordTree;
+
+/**
+ * Unit tests for the DFA word tree.
+ * 
+ * @author Looly
+ *
+ */
+public class DfaTest {
+
+	// the text that is searched
+	String text = "我有一颗大土豆，刚出锅的";
+
+	/**
+	 * Standard matching: neither dense nor greedy.
+	 */
+	@Test
+	public void matchAllTest() {
+		// build the word tree
+		WordTree tree = buildWordTree();
+
+		// -----------------------------------------------------------------------------------------------------------------------------------
+		// Case 1: standard matching, the shortest keyword wins and matched text is skipped.
+		// 【大】 matches and matching stops there, so 【大土豆】 is not reported.
+		// 【刚出锅】 matches and its three characters are skipped, so 【出锅】 is not reported
+		// (刚 is reached first, so the longer word matches; "shortest" only applies to words sharing the first character).
+		List<String> matchAll = tree.matchAll(text, -1, false, false);
+		Assert.assertEquals(CollectionUtil.newArrayList("大", "土豆", "刚出锅"), matchAll);
+	}
+
+	/**
+	 * Dense matching (shortest match) test.
+	 */
+	@Test
+	public void densityMatchTest() {
+		// build the word tree
+		WordTree tree = buildWordTree();
+
+		// -----------------------------------------------------------------------------------------------------------------------------------
+		// Case 2: the shortest keyword wins, matched text is NOT skipped.
+		// 【大】 matches; because of shortest matching 【大土豆】 is not reported, then 【土豆】 matches.
+		// 【刚出锅】 matches; because matched text is not skipped, 【出锅】 matches as well.
+		List<String> matchAll = tree.matchAll(text, -1, true, false);
+		Assert.assertEquals(CollectionUtil.newArrayList("大", "土豆", "刚出锅", "出锅"), matchAll);
+	}
+
+	/**
+	 * Greedy matching test.
+	 */
+	@Test
+	public void greedMatchTest() {
+		// build the word tree
+		WordTree tree = buildWordTree();
+
+		// -----------------------------------------------------------------------------------------------------------------------------------
+		// Case 3: the longest keyword is matched, matched text is skipped.
+		// 【大】 matches; with longest matching 【大土豆】 is matched next.
+		// Because 【大土豆】 matched, 【土豆】 is skipped; because 【刚出锅】 matched, 【出锅】 is skipped.
+		List<String> matchAll = tree.matchAll(text, -1, false, true);
+		Assert.assertEquals(CollectionUtil.newArrayList("大", "大土豆", "刚出锅"), matchAll);
+
+	}
+
+	/**
+	 * Dense matching combined with greedy matching.
+	 */
+	@Test
+	public void densityAndGreedMatchTest() {
+		// build the word tree
+		WordTree tree = buildWordTree();
+
+		// -----------------------------------------------------------------------------------------------------------------------------------
+		// Case 4: the longest keyword is matched, matched text is NOT skipped (the most complete result).
+		// 【大】 matches; with longest matching 【大土豆】 is matched next; since matched text is not skipped, 【土豆】 matches too.
+		// 【刚出锅】 matches; because matched text is not skipped, 【出锅】 matches as well.
+		List<String> matchAll = tree.matchAll(text, -1, true, true);
+		Assert.assertEquals(CollectionUtil.newArrayList("大", "大土豆", "土豆", "刚出锅", "出锅"), matchAll);
+
+	}
+
+	/**
+	 * Stop-character test: a stop character inside a keyword is returned as part of the match.
+	 */
+	@Test
+	public void stopWordTest() {
+		WordTree tree = new WordTree();
+		tree.addWord("tio");
+
+		List<String> all = tree.matchAll("AAAAAAAt-ioBBBBBBB");
+		Assert.assertEquals(CollectionUtil.newArrayList("t-io"), all);
+	}
+	
+	// ----------------------------------------------------------------------------------------------------------
+	/**
+	 * Builds the word tree shared by the tests.
+	 * 
+	 * @return the word tree
+	 */
+	private WordTree buildWordTree() {
+		// build the word tree
+		WordTree tree = new WordTree();
+		tree.addWord("大");
+		tree.addWord("大土豆");
+		tree.addWord("土豆");
+		tree.addWord("刚出锅");
+		tree.addWord("出锅");
+		return tree;
+	}
+}