add filter support

2025-08-18 20:38:02 +08:00 · 2020-03-03 11:18:55 +08:00
parent 8fba51f62b
commit 2e2d43d764
5 changed files with 99 additions and 60 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@
 * 【crypto】     RSA算法中，BlockSize长度策略调整（issue#721@Github）
 * 【crypto】     删除SM2Engine，使用BC库中的对象替代
 * 【crypto】     增加PemUtil工具类
 * 【dfa   】     WordTree增加Filter，支持自定义特殊字符过滤器
 ### Bug修复
--- a/hutool-core/src/main/java/cn/hutool/core/lang/Filter.java
+++ b/hutool-core/src/main/java/cn/hutool/core/lang/Filter.java
@@ -2,13 +2,14 @@ package cn.hutool.core.lang;
 /**
 * 过滤器接口
 * @author Looly
 *
 * @author Looly
 */
@FunctionalInterface
 public interface Filter<T> {
 	/**
 	 * 是否接受对象
 	 *
 	 * @param t 检查的对象
 	 * @return 是否接受对象
 	 */
--- a/hutool-crypto/src/test/java/cn/hutool/crypto/test/BCUtilTest.java
+++ b/hutool-crypto/src/test/java/cn/hutool/crypto/test/BCUtilTest.java
@@ -9,6 +9,9 @@ import org.junit.Test;
 public class BCUtilTest {
 	/**
 	 * 密钥生成来自：https://i.goto327.top/CryptTools/SM2.aspx?tdsourcetag=s_pctim_aiomsg
 	 */
 	@Test
 	public void createECPublicKeyParametersTest() {
 		String x = "706AD9DAA3E5CEAC3DA59F583429E8043BAFC576BE10092C4EA4D8E19846CA62";
--- a/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java
+++ b/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java
@@ -1,4 +1,5 @@
 package cn.hutool.dfa;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
@@ -7,6 +8,7 @@ import java.util.List;
 import java.util.Set;
 import cn.hutool.core.collection.CollectionUtil;
 import cn.hutool.core.lang.Filter;
 import cn.hutool.core.text.StrBuilder;
 import cn.hutool.core.util.StrUtil;
@@ -22,8 +24,8 @@ import cn.hutool.core.util.StrUtil;
 * /                            <br>
 * 巾                            <br>
 * 其中每个节点都是一个WordTree对象，查找时从上向下查找。<br>
 * @author Looly
 *
 * @author Looly
 */
 public class WordTree extends HashMap<Character, WordTree> {
 	private static final long serialVersionUID = -4646423269465809276L;
@@ -32,8 +34,13 @@ public class WordTree extends HashMap<Character, WordTree>{
 	 * 敏感词字符末尾标识，用于标识单词末尾字符
 	 */
 	private Set<Character> endCharacterSet = new HashSet<>();
 	/**
 	 * 字符过滤规则，通过定义字符串过滤规则，过滤不需要的字符，当accept为false时，此字符不参与匹配
 	 */
 	private Filter<Character> charFilter = StopChar::isNotStopChar;
 	//--------------------------------------------------------------------------------------- Constructor start
 	/**
 	 * 默认构造
 	 */
@@ -41,10 +48,24 @@ public class WordTree extends HashMap<Character, WordTree>{
 	}
 	//--------------------------------------------------------------------------------------- Constructor end
 	/**
 	 * Sets the character filter rule used by this tree to filter out unwanted characters.<br>
 	 * A character for which the filter's {@code accept} returns {@code false} does not take part in matching.
 	 * <p>
 	 * NOTE(review): no null check is performed here; {@code addWord} calls {@code charFilter.accept(...)},
 	 * so a {@code null} filter would presumably fail there rather than here - confirm whether null should be rejected.
 	 *
 	 * @param charFilter the filter function
 	 * @return this
 	 * @since 5.2.0
 	 */
 	public WordTree setCharFilter(Filter<Character> charFilter) {
 		this.charFilter = charFilter;
 		return this;
 	}
 	//------------------------------------------------------------------------------- add word
 	/**
 	 * 增加一组单词
 	 *
 	 * @param words 单词集合
 	 */
 	public void addWords(Collection<String> words) {
@@ -58,6 +79,7 @@ public class WordTree extends HashMap<Character, WordTree>{
 	/**
 	 * 增加一组单词
 	 *
 	 * @param words 单词数组
 	 */
 	public void addWords(String... words) {
@@ -69,9 +91,11 @@ public class WordTree extends HashMap<Character, WordTree>{
 	/**
 	 * 添加单词，使用默认类型
 	 *
 	 * @param word 单词
 	 */
 	public void addWord(String word) {
 		final Filter<Character> charFilter = this.charFilter;
 		WordTree parent = null;
 		WordTree current = this;
 		WordTree child;
@@ -79,7 +103,7 @@ public class WordTree extends HashMap<Character, WordTree>{
 		int length = word.length();
 		for (int i = 0; i < length; i++) {
 			currentChar = word.charAt(i);
-			if(false == StopChar.isStopChar(currentChar)){//只处理合法字符
+			if (charFilter.accept(currentChar)) {//只处理合法字符
 				child = current.get(currentChar);
 				if (child == null) {
 					//无子类，新建一个子节点后存放下一个字符
@@ -96,8 +120,10 @@ public class WordTree extends HashMap<Character, WordTree>{
 	}
 	//------------------------------------------------------------------------------- match
 	/**
 	 * 指定文本是否包含树中的词
 	 *
 	 * @param text 被检查的文本
 	 * @return 是否包含
 	 */
@@ -110,6 +136,7 @@ public class WordTree extends HashMap<Character, WordTree>{
 	/**
 	 * 获得第一个匹配的关键字
 	 *
 	 * @param text 被检查的文本
 	 * @return 匹配到的关键字
 	 */
@@ -125,8 +152,10 @@ public class WordTree extends HashMap<Character, WordTree>{
 	}
 	//------------------------------------------------------------------------------- match all
 	/**
 	 * 找出所有匹配的关键字
 	 *
 	 * @param text 被检查的文本
 	 * @return 匹配的词列表
 	 */
@@ -136,6 +165,7 @@ public class WordTree extends HashMap<Character, WordTree>{
 	/**
 	 * 找出所有匹配的关键字
 	 *
 	 * @param text  被检查的文本
 	 * @param limit 限制匹配个数
 	 * @return 匹配的词列表
@@ -163,6 +193,7 @@ public class WordTree extends HashMap<Character, WordTree>{
 		List<String> foundWords = new ArrayList<>();
 		WordTree current = this;
 		int length = text.length();
 		final Filter<Character> charFilter = this.charFilter;
 		//存放查找到的字符缓存。完整出现一个词时加到foundWords中，否则清空
 		final StrBuilder wordBuffer = StrUtil.strBuilder();
 		char currentChar;
@@ -171,7 +202,7 @@ public class WordTree extends HashMap<Character, WordTree>{
 			for (int j = i; j < length; j++) {
 				currentChar = text.charAt(j);
 //				Console.log("i: {}, j: {}, currentChar: {}", i, j, currentChar);
-				if(StopChar.isStopChar(currentChar)){
+				if (false == charFilter.accept(currentChar)) {
 					if (wordBuffer.length() > 0) {
 						//做为关键词中间的停顿词被当作关键词的一部分被返回
 						wordBuffer.append(currentChar);
@@ -213,8 +244,10 @@ public class WordTree extends HashMap<Character, WordTree>{
 	//--------------------------------------------------------------------------------------- Private method start
 	/**
 	 * 是否末尾
 	 *
 	 * @param c 检查的字符
 	 * @return 是否末尾
 	 */
@@ -224,6 +257,7 @@ public class WordTree extends HashMap<Character, WordTree>{
 	/**
 	 * 设置是否到达末尾
 	 *
 	 * @param c 设置结尾的字符
 	 */
 	private void setEnd(Character c) {
--- a/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java
+++ b/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java
@@ -16,8 +16,8 @@ import cn.hutool.dfa.WordTree;
 */
 public class DfaTest {
-	// 构建被查询的文本
+	// 构建被查询的文本，包含停顿词
-	String text = "我有一颗大土豆，刚出锅的";
+	String text = "我有一颗$大土^豆，刚出锅的";
 	@Test
 	public void matchAllTest() {
@@ -29,7 +29,7 @@ public class DfaTest {
 		// 匹配到【大】，就不再继续匹配了，因此【大土豆】不匹配
 		// 匹配到【刚出锅】，就跳过这三个字了，因此【出锅】不匹配（由于刚首先被匹配，因此长的被匹配，最短匹配只针对第一个字相同选最短）
 		List<String> matchAll = tree.matchAll(text, -1, false, false);
-		Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "土豆", "刚出锅"));
+		Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "土^豆", "刚出锅"));
 	}
 	/**
@@ -45,7 +45,7 @@ public class DfaTest {
 		// 【大】被匹配，最短匹配原则【大土豆】被跳过，【土豆继续被匹配】
 		// 【刚出锅】被匹配，由于不跳过已经匹配的词，【出锅】被匹配
 		List<String> matchAll = tree.matchAll(text, -1, true, false);
-		Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "土豆", "刚出锅", "出锅"));
+		Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "土^豆", "刚出锅", "出锅"));
 	}
 	/**
@@ -61,7 +61,7 @@ public class DfaTest {
 		// 匹配到【大】，由于到最长匹配，因此【大土豆】接着被匹配
 		// 由于【大土豆】被匹配，【土豆】被跳过，由于【刚出锅】被匹配，【出锅】被跳过
 		List<String> matchAll = tree.matchAll(text, -1, false, true);
-		Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "大土豆", "刚出锅"));
+		Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "大土^豆", "刚出锅"));
 	}
@@ -78,7 +78,7 @@ public class DfaTest {
 		// 匹配到【大】，由于到最长匹配，因此【大土豆】接着被匹配，由于不跳过已经匹配的关键词，土豆继续被匹配
 		// 【刚出锅】被匹配，由于不跳过已经匹配的词，【出锅】被匹配
 		List<String> matchAll = tree.matchAll(text, -1, true, true);
-		Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "大土豆", "土豆", "刚出锅", "出锅"));
+		Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "大土^豆", "土^豆", "刚出锅", "出锅"));
 	}