mirror of
https://gitee.com/chinabugotech/hutool.git
synced 2025-07-21 15:09:48 +08:00
clean history
This commit is contained in:
29
hutool-dfa/pom.xml
Normal file
29
hutool-dfa/pom.xml
Normal file
@@ -0,0 +1,29 @@
|
||||
<?xml version='1.0' encoding='utf-8'?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
	<modelVersion>4.0.0</modelVersion>

	<packaging>jar</packaging>

	<!-- Parent POM; this module declares no groupId/version of its own and inherits them from here -->
	<parent>
		<groupId>cn.hutool</groupId>
		<artifactId>hutool-parent</artifactId>
		<version>4.6.2-SNAPSHOT</version>
	</parent>

	<artifactId>hutool-dfa</artifactId>
	<name>${project.artifactId}</name>
	<description>Hutool 基于DFA的关键词查找</description>

	<!-- Sibling Hutool modules, pinned to the same version as the parent -->
	<dependencies>
		<!-- Core utilities (StrUtil, CollectionUtil, ThreadUtil) -->
		<dependency>
			<groupId>cn.hutool</groupId>
			<artifactId>hutool-core</artifactId>
			<version>${project.parent.version}</version>
		</dependency>
		<!-- JSON support (JSONUtil), used to turn beans into text before keyword matching -->
		<dependency>
			<groupId>cn.hutool</groupId>
			<artifactId>hutool-json</artifactId>
			<version>${project.parent.version}</version>
		</dependency>
	</dependencies>
</project>
|
161
hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java
Normal file
161
hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java
Normal file
@@ -0,0 +1,161 @@
|
||||
package cn.hutool.dfa;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.Callable;
|
||||
|
||||
import cn.hutool.core.thread.ThreadUtil;
|
||||
import cn.hutool.core.util.StrUtil;
|
||||
import cn.hutool.json.JSONUtil;
|
||||
|
||||
/**
|
||||
* 敏感词工具类
|
||||
* @author Looly
|
||||
*
|
||||
*/
|
||||
public final class SensitiveUtil {
|
||||
// private static final Log log = LogFactory.get();
|
||||
|
||||
public static final char DEFAULT_SEPARATOR = StrUtil.C_COMMA;
|
||||
private static WordTree sensitiveTree = new WordTree();
|
||||
|
||||
/**
|
||||
* @return 是否已经被初始化
|
||||
*/
|
||||
public static boolean isInited(){
|
||||
return !sensitiveTree.isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
* 初始化敏感词树
|
||||
* @param isAsync 是否异步初始化
|
||||
* @param sensitiveWords 敏感词列表
|
||||
*/
|
||||
public static void init(final Collection<String> sensitiveWords, boolean isAsync){
|
||||
if(isAsync){
|
||||
ThreadUtil.execAsync(new Callable<Boolean>(){
|
||||
@Override
|
||||
public Boolean call() throws Exception {
|
||||
init(sensitiveWords);
|
||||
return true;
|
||||
}
|
||||
|
||||
});
|
||||
}else{
|
||||
init(sensitiveWords);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 初始化敏感词树
|
||||
* @param sensitiveWords 敏感词列表
|
||||
*/
|
||||
public static void init(Collection<String> sensitiveWords){
|
||||
sensitiveTree.clear();
|
||||
sensitiveTree.addWords(sensitiveWords);
|
||||
// log.debug("Sensitive init finished, sensitives: {}", sensitiveWords);
|
||||
}
|
||||
|
||||
/**
|
||||
* 初始化敏感词树
|
||||
* @param sensitiveWords 敏感词列表组成的字符串
|
||||
* @param isAsync 是否异步初始化
|
||||
* @param separator 分隔符
|
||||
*/
|
||||
public static void init(String sensitiveWords, char separator, boolean isAsync){
|
||||
if(StrUtil.isNotBlank(sensitiveWords)){
|
||||
init(StrUtil.split(sensitiveWords, separator), isAsync);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 初始化敏感词树,使用逗号分隔每个单词
|
||||
* @param sensitiveWords 敏感词列表组成的字符串
|
||||
* @param isAsync 是否异步初始化
|
||||
*/
|
||||
public static void init(String sensitiveWords, boolean isAsync){
|
||||
init(sensitiveWords, DEFAULT_SEPARATOR, isAsync);
|
||||
}
|
||||
|
||||
/**
|
||||
* 是否包含敏感词
|
||||
* @param text 文本
|
||||
* @return 是否包含
|
||||
*/
|
||||
public static boolean containsSensitive(String text){
|
||||
return sensitiveTree.isMatch(text);
|
||||
}
|
||||
|
||||
/**
|
||||
* 是否包含敏感词
|
||||
* @param obj bean,会被转为JSON字符串
|
||||
* @return 是否包含
|
||||
*/
|
||||
public static boolean containsSensitive(Object obj){
|
||||
return sensitiveTree.isMatch(JSONUtil.toJsonStr(obj));
|
||||
}
|
||||
|
||||
/**
|
||||
* 查找敏感词,返回找到的第一个敏感词
|
||||
* @param text 文本
|
||||
* @return 敏感词
|
||||
*/
|
||||
public static String getFindedFirstSensitive(String text){
|
||||
return sensitiveTree.match(text);
|
||||
}
|
||||
|
||||
/**
|
||||
* 查找敏感词,返回找到的第一个敏感词
|
||||
* @param obj bean,会被转为JSON字符串
|
||||
* @return 敏感词
|
||||
*/
|
||||
public static String getFindedFirstSensitive(Object obj){
|
||||
return sensitiveTree.match(JSONUtil.toJsonStr(obj));
|
||||
}
|
||||
|
||||
/**
|
||||
* 查找敏感词,返回找到的所有敏感词
|
||||
* @param text 文本
|
||||
* @return 敏感词
|
||||
*/
|
||||
public static List<String> getFindedAllSensitive(String text){
|
||||
return sensitiveTree.matchAll(text);
|
||||
}
|
||||
|
||||
/**
|
||||
* 查找敏感词,返回找到的所有敏感词<br>
|
||||
* 密集匹配原则:假如关键词有 ab,b,文本是abab,将匹配 [ab,b,ab]<br>
|
||||
* 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab]
|
||||
*
|
||||
* @param text 文本
|
||||
* @param isDensityMatch 是否使用密集匹配原则
|
||||
* @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则
|
||||
* @return 敏感词
|
||||
*/
|
||||
public static List<String> getFindedAllSensitive(String text, boolean isDensityMatch, boolean isGreedMatch){
|
||||
return sensitiveTree.matchAll(text, -1, isDensityMatch, isGreedMatch);
|
||||
}
|
||||
|
||||
/**
|
||||
* 查找敏感词,返回找到的所有敏感词
|
||||
* @param bean 对象,会被转为JSON
|
||||
* @return 敏感词
|
||||
*/
|
||||
public static List<String> getFindedAllSensitive(Object bean){
|
||||
return sensitiveTree.matchAll(JSONUtil.toJsonStr(bean));
|
||||
}
|
||||
|
||||
/**
|
||||
* 查找敏感词,返回找到的所有敏感词<br>
|
||||
* 密集匹配原则:假如关键词有 ab,b,文本是abab,将匹配 [ab,b,ab]<br>
|
||||
* 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab]
|
||||
*
|
||||
* @param bean 对象,会被转为JSON
|
||||
* @param isDensityMatch 是否使用密集匹配原则
|
||||
* @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则
|
||||
* @return 敏感词
|
||||
*/
|
||||
public static List<String> getFindedAllSensitive(Object bean, boolean isDensityMatch, boolean isGreedMatch){
|
||||
return getFindedAllSensitive(JSONUtil.toJsonStr(bean), isDensityMatch, isGreedMatch);
|
||||
}
|
||||
}
|
49
hutool-dfa/src/main/java/cn/hutool/dfa/StopChar.java
Normal file
49
hutool-dfa/src/main/java/cn/hutool/dfa/StopChar.java
Normal file
@@ -0,0 +1,49 @@
|
||||
package cn.hutool.dfa;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* 过滤词及一些简单处理
|
||||
*
|
||||
* @author Looly
|
||||
*/
|
||||
public class StopChar {
|
||||
/** Characters that are not processed (stop characters), such as punctuation marks and spaces */
|
||||
public static final Set<Character> STOP_WORD = new HashSet<>(Arrays.asList(new Character[] { ' ', '\'', '、', '。',
|
||||
'·', 'ˉ', 'ˇ', '々', '—', '~', '‖', '…', '‘', '’', '“', '”', '〔', '〕', '〈', '〉', '《', '》', '「', '」', '『',
|
||||
'』', '〖', '〗', '【', '】', '±', '+', '-', '×', '÷', '∧', '∨', '∑', '∏', '∪', '∩', '∈', '√', '⊥', '⊙', '∫',
|
||||
'∮', '≡', '≌', '≈', '∽', '∝', '≠', '≮', '≯', '≤', '≥', '∞', '∶', '∵', '∴', '∷', '♂', '♀', '°', '′', '〃',
|
||||
'℃', '$', '¤', '¢', '£', '‰', '§', '☆', '★', '〇', '○', '●', '◎', '◇', '◆', '□', '■', '△', '▽', '⊿', '▲',
|
||||
'▼', '◣', '◤', '◢', '◥', '▁', '▂', '▃', '▄', '▅', '▆', '▇', '█', '▉', '▊', '▋', '▌', '▍', '▎', '▏', '▓',
|
||||
'※', '→', '←', '↑', '↓', '↖', '↗', '↘', '↙', '〓', 'ⅰ', 'ⅱ', 'ⅲ', 'ⅳ', 'ⅴ', 'ⅵ', 'ⅶ', 'ⅷ', 'ⅸ', 'ⅹ', '①',
|
||||
'②', '③', '④', '⑤', '⑥', '⑦', '⑧', '⑨', '⑩', '⒈', '⒉', '⒊', '⒋', '⒌', '⒍', '⒎', '⒏', '⒐', '⒑', '⒒', '⒓',
|
||||
'⒔', '⒕', '⒖', '⒗', '⒘', '⒙', '⒚', '⒛', '⑴', '⑵', '⑶', '⑷', '⑸', '⑹', '⑺', '⑻', '⑼', '⑽', '⑾', '⑿', '⒀',
|
||||
'⒁', '⒂', '⒃', '⒄', '⒅', '⒆', '⒇', 'Ⅰ', 'Ⅱ', 'Ⅲ', 'Ⅳ', 'Ⅴ', 'Ⅵ', 'Ⅶ', 'Ⅷ', 'Ⅸ', 'Ⅹ', 'Ⅺ', 'Ⅻ', '!', '”',
|
||||
'#', '¥', '%', '&', '’', '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7',
|
||||
'8', '9', ':', ';', '<', '=', '>', '?', '@', '〔', '\', '〕', '^', '_', '‘', '{', '|', '}', '∏', 'Ρ', '∑',
|
||||
'Υ', 'Φ', 'Χ', 'Ψ', 'Ω', 'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π',
|
||||
'ρ', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω', '(', ')', '〔', '〕', '^', '﹊', '﹍', '╭', '╮', '╰', '╯', '', '_',
|
||||
'', '^', '(', '^', ':', '!', '/', '\\', '\"', '<', '>', '`', '·', '。', '{', '}', '~', '~', '(', ')', '-',
|
||||
'√', '$', '@', '*', '&', '#', '卐', '㎎', '㎏', '㎜', '㎝', '㎞', '㎡', '㏄', '㏎', '㏑', '㏒', '㏕' }));
|
||||
|
||||
/**
|
||||
* 判断指定的词是否是不处理的词。
|
||||
* 如果参数为空,则返回true,因为空也属于不处理的字符。
|
||||
*
|
||||
* @param ch 指定的词
|
||||
* @return 是否是不处理的词
|
||||
*/
|
||||
public static boolean isStopChar(char ch) {
|
||||
return Character.isWhitespace(ch) || STOP_WORD.contains(ch);
|
||||
}
|
||||
|
||||
/**
|
||||
* 是否为合法字符(待处理字符)
|
||||
* @param ch 指定的词
|
||||
* @return 是否为合法字符(待处理字符)
|
||||
*/
|
||||
public static boolean isNotStopChar(char ch) {
|
||||
return false == isStopChar(ch);
|
||||
}
|
||||
}
|
233
hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java
Normal file
233
hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java
Normal file
@@ -0,0 +1,233 @@
|
||||
package cn.hutool.dfa;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import cn.hutool.core.collection.CollectionUtil;
|
||||
import cn.hutool.core.util.StrUtil;
|
||||
|
||||
/**
|
||||
* DFA(Deterministic Finite Automaton 确定有穷自动机)
|
||||
* DFA单词树(以下简称单词树),常用于在某大段文字中快速查找某几个关键词是否存在。<br>
|
||||
* 单词树使用group区分不同的关键字集合,不同的分组可以共享树枝,避免重复建树。<br>
|
||||
* 单词树使用树状结构表示一组单词。<br>
|
||||
* 例如:红领巾,红河构建树后为:<br>
|
||||
* 红 <br>
|
||||
* / \ <br>
|
||||
* 领 河 <br>
|
||||
* / <br>
|
||||
* 巾 <br>
|
||||
*其中每个节点都是一个WordTree对象,查找时从上向下查找。<br>
|
||||
* @author Looly
|
||||
*
|
||||
*/
|
||||
public class WordTree extends HashMap<Character, WordTree>{
|
||||
private static final long serialVersionUID = -4646423269465809276L;
|
||||
|
||||
/**
|
||||
* 敏感词字符末尾标识,用于标识单词末尾字符
|
||||
*/
|
||||
private Set<Character> endCharacterSet = new HashSet<>();
|
||||
|
||||
//--------------------------------------------------------------------------------------- Constructor start
|
||||
/**
|
||||
* 默认构造
|
||||
*/
|
||||
public WordTree() {
|
||||
}
|
||||
//--------------------------------------------------------------------------------------- Constructor start
|
||||
|
||||
//------------------------------------------------------------------------------- add word
|
||||
|
||||
/**
|
||||
* 增加一组单词
|
||||
* @param words 单词集合
|
||||
*/
|
||||
public void addWords(Collection<String> words){
|
||||
if(false == (words instanceof Set)){
|
||||
words = new HashSet<>(words);
|
||||
}
|
||||
for (String word : words) {
|
||||
addWord(word);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 增加一组单词
|
||||
* @param words 单词数组
|
||||
*/
|
||||
public void addWords(String... words){
|
||||
HashSet<String> wordsSet = CollectionUtil.newHashSet(words);
|
||||
for (String word : wordsSet) {
|
||||
addWord(word);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 添加单词,使用默认类型
|
||||
* @param word 单词
|
||||
*/
|
||||
public void addWord(String word) {
|
||||
WordTree parent = null;
|
||||
WordTree current = this;
|
||||
WordTree child;
|
||||
char currentChar = 0;
|
||||
int length = word.length();
|
||||
for(int i = 0; i < length; i++){
|
||||
currentChar = word.charAt(i);
|
||||
if(false == StopChar.isStopChar(currentChar)){//只处理合法字符
|
||||
child = current.get(currentChar);
|
||||
if(child == null){
|
||||
//无子类,新建一个子节点后存放下一个字符
|
||||
child = new WordTree();
|
||||
current.put(currentChar, child);
|
||||
}
|
||||
parent = current;
|
||||
current = child;
|
||||
}
|
||||
}
|
||||
if(null != parent){
|
||||
parent.setEnd(currentChar);
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------- match
|
||||
/**
|
||||
* 指定文本是否包含树中的词
|
||||
* @param text 被检查的文本
|
||||
* @return 是否包含
|
||||
*/
|
||||
public boolean isMatch(String text){
|
||||
if(null == text){
|
||||
return false;
|
||||
}
|
||||
return null != match(text);
|
||||
}
|
||||
|
||||
/**
|
||||
* 获得第一个匹配的关键字
|
||||
* @param text 被检查的文本
|
||||
* @return 匹配到的关键字
|
||||
*/
|
||||
public String match(String text){
|
||||
if(null == text){
|
||||
return null;
|
||||
}
|
||||
List<String> matchAll = matchAll(text, 1);
|
||||
if(CollectionUtil.isNotEmpty(matchAll)){
|
||||
return matchAll.get(0);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------- match all
|
||||
/**
|
||||
* 找出所有匹配的关键字
|
||||
* @param text 被检查的文本
|
||||
* @return 匹配的词列表
|
||||
*/
|
||||
public List<String> matchAll(String text) {
|
||||
return matchAll(text, -1);
|
||||
}
|
||||
|
||||
/**
|
||||
* 找出所有匹配的关键字
|
||||
* @param text 被检查的文本
|
||||
* @param limit 限制匹配个数
|
||||
* @return 匹配的词列表
|
||||
*/
|
||||
public List<String> matchAll(String text, int limit) {
|
||||
return matchAll(text, limit, false, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* 找出所有匹配的关键字<br>
|
||||
* 密集匹配原则:假如关键词有 ab,b,文本是abab,将匹配 [ab,b,ab]<br>
|
||||
* 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab]
|
||||
*
|
||||
* @param text 被检查的文本
|
||||
* @param limit 限制匹配个数
|
||||
* @param isDensityMatch 是否使用密集匹配原则
|
||||
* @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则
|
||||
* @return 匹配的词列表
|
||||
*/
|
||||
public List<String> matchAll(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) {
|
||||
if(null == text){
|
||||
return null;
|
||||
}
|
||||
|
||||
List<String> findedWords = new ArrayList<String>();
|
||||
WordTree current = this;
|
||||
int length = text.length();
|
||||
StringBuilder wordBuffer;//存放查找到的字符缓存。完整出现一个词时加到findedWords中,否则清空
|
||||
char currentChar;
|
||||
for (int i = 0; i < length; i++) {
|
||||
wordBuffer = StrUtil.builder();
|
||||
for (int j = i; j < length; j++) {
|
||||
currentChar = text.charAt(j);
|
||||
// Console.log("i: {}, j: {}, currentChar: {}", i, j, currentChar);
|
||||
if(StopChar.isStopChar(currentChar)){
|
||||
if(wordBuffer.length() > 0){
|
||||
//做为关键词中间的停顿词被当作关键词的一部分被返回
|
||||
wordBuffer.append(currentChar);
|
||||
}else{
|
||||
//停顿词做为关键词的第一个字符时需要跳过
|
||||
i++;
|
||||
}
|
||||
continue;
|
||||
}else if(false == current.containsKey(currentChar)){
|
||||
//非关键字符被整体略过,重新以下个字符开始检查
|
||||
break;
|
||||
}
|
||||
wordBuffer.append(currentChar);
|
||||
if(current.isEnd(currentChar)){
|
||||
//到达单词末尾,关键词成立,从此词的下一个位置开始查找
|
||||
findedWords.add(wordBuffer.toString());
|
||||
if(limit > 0 && findedWords.size() >= limit){
|
||||
//超过匹配限制个数,直接返回
|
||||
return findedWords;
|
||||
}
|
||||
if(false == isDensityMatch){
|
||||
//如果非密度匹配,跳过匹配到的词
|
||||
i = j;
|
||||
}
|
||||
if(false == isGreedMatch){
|
||||
//如果懒惰匹配(非贪婪匹配)。当遇到第一个结尾标记就结束本轮匹配
|
||||
break;
|
||||
}
|
||||
}
|
||||
current = current.get(currentChar);
|
||||
if(null == current){
|
||||
break;
|
||||
}
|
||||
}
|
||||
current = this;
|
||||
}
|
||||
return findedWords;
|
||||
}
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------------- Private method start
|
||||
/**
|
||||
* 是否末尾
|
||||
* @param c 检查的字符
|
||||
* @return 是否末尾
|
||||
*/
|
||||
private boolean isEnd(Character c){
|
||||
return this.endCharacterSet.contains(c);
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置是否到达末尾
|
||||
* @param c 设置结尾的字符
|
||||
*/
|
||||
private void setEnd(Character c){
|
||||
if(null != c){
|
||||
this.endCharacterSet.add(c);
|
||||
}
|
||||
}
|
||||
//--------------------------------------------------------------------------------------- Private method end
|
||||
}
|
9
hutool-dfa/src/main/java/cn/hutool/dfa/package-info.java
Normal file
9
hutool-dfa/src/main/java/cn/hutool/dfa/package-info.java
Normal file
@@ -0,0 +1,9 @@
|
||||
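/**
 * DFA stands for Deterministic Finite Automaton.<br>
 * The principle is simple: build a tree from all keywords, then walk that tree with the text being checked; reaching a node that is marked as the end of a word means the text contains that keyword.<br>
 * Leaving aside the time needed to build the keyword tree, each search of a text costs roughly O(n) as long as the keywords are short compared with the text.<br>
 *
 * @author looly
 *
 */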
|
||||
package cn.hutool.dfa;
|
113
hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java
Normal file
113
hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java
Normal file
@@ -0,0 +1,113 @@
|
||||
package cn.hutool.dfa.test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import cn.hutool.core.collection.CollectionUtil;
|
||||
import cn.hutool.dfa.WordTree;
|
||||
|
||||
/**
|
||||
* DFA单元测试
|
||||
*
|
||||
* @author Looly
|
||||
*
|
||||
*/
|
||||
public class DfaTest {
|
||||
|
||||
// 构建被查询的文本
|
||||
String text = "我有一颗大土豆,刚出锅的";
|
||||
|
||||
@Test
|
||||
public void matchAllTest() {
|
||||
// 构建查询树
|
||||
WordTree tree = buildWordTree();
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------------------------
|
||||
// 情况一:标准匹配,匹配到最短关键词,并跳过已经匹配的关键词
|
||||
// 匹配到【大】,就不再继续匹配了,因此【大土豆】不匹配
|
||||
// 匹配到【刚出锅】,就跳过这三个字了,因此【出锅】不匹配(由于刚首先被匹配,因此长的被匹配,最短匹配只针对第一个字相同选最短)
|
||||
List<String> matchAll = tree.matchAll(text, -1, false, false);
|
||||
Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "土豆", "刚出锅"));
|
||||
}
|
||||
|
||||
/**
|
||||
* 密集匹配原则(最短匹配)测试
|
||||
*/
|
||||
@Test
|
||||
public void densityMatchTest() {
|
||||
// 构建查询树
|
||||
WordTree tree = buildWordTree();
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------------------------
|
||||
// 情况二:匹配到最短关键词,不跳过已经匹配的关键词
|
||||
// 【大】被匹配,最短匹配原则【大土豆】被跳过,【土豆继续被匹配】
|
||||
// 【刚出锅】被匹配,由于不跳过已经匹配的词,【出锅】被匹配
|
||||
List<String> matchAll = tree.matchAll(text, -1, true, false);
|
||||
Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "土豆", "刚出锅", "出锅"));
|
||||
}
|
||||
|
||||
/**
|
||||
* 贪婪匹配原则测试
|
||||
*/
|
||||
@Test
|
||||
public void greedMatchTest() {
|
||||
// 构建查询树
|
||||
WordTree tree = buildWordTree();
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------------------------
|
||||
// 情况三:匹配到最长关键词,跳过已经匹配的关键词
|
||||
// 匹配到【大】,由于到最长匹配,因此【大土豆】接着被匹配
|
||||
// 由于【大土豆】被匹配,【土豆】被跳过,由于【刚出锅】被匹配,【出锅】被跳过
|
||||
List<String> matchAll = tree.matchAll(text, -1, false, true);
|
||||
Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "大土豆", "刚出锅"));
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 密集匹配原则(最短匹配)和贪婪匹配原则测试
|
||||
*/
|
||||
@Test
|
||||
public void densityAndGreedMatchTest() {
|
||||
// 构建查询树
|
||||
WordTree tree = buildWordTree();
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------------------------
|
||||
// 情况四:匹配到最长关键词,不跳过已经匹配的关键词(最全关键词)
|
||||
// 匹配到【大】,由于到最长匹配,因此【大土豆】接着被匹配,由于不跳过已经匹配的关键词,土豆继续被匹配
|
||||
// 【刚出锅】被匹配,由于不跳过已经匹配的词,【出锅】被匹配
|
||||
List<String> matchAll = tree.matchAll(text, -1, true, true);
|
||||
Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "大土豆", "土豆", "刚出锅", "出锅"));
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 停顿词测试
|
||||
*/
|
||||
@Test
|
||||
public void stopWordTest() {
|
||||
WordTree tree = new WordTree();
|
||||
tree.addWord("tio");
|
||||
|
||||
List<String> all = tree.matchAll("AAAAAAAt-ioBBBBBBB");
|
||||
Assert.assertEquals(all, CollectionUtil.newArrayList("t-io"));
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------------------------
|
||||
/**
|
||||
* 构建查找树
|
||||
*
|
||||
* @return 查找树
|
||||
*/
|
||||
private WordTree buildWordTree() {
|
||||
// 构建查询树
|
||||
WordTree tree = new WordTree();
|
||||
tree.addWord("大");
|
||||
tree.addWord("大土豆");
|
||||
tree.addWord("土豆");
|
||||
tree.addWord("刚出锅");
|
||||
tree.addWord("出锅");
|
||||
return tree;
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user