diff --git a/hutool-core/src/main/java/cn/hutool/core/text/dfa/NFA.java b/hutool-core/src/main/java/cn/hutool/core/text/dfa/NFA.java index 468658a62..3d791679c 100644 --- a/hutool-core/src/main/java/cn/hutool/core/text/dfa/NFA.java +++ b/hutool-core/src/main/java/cn/hutool/core/text/dfa/NFA.java @@ -4,142 +4,186 @@ import java.util.*; /** *

- * * 基于非确定性有穷自动机(NFA) 实现的多模匹配工具 + *

* * @author renyp */ public class NFA { - private final Node root; + /** + * AC树的根节点 + */ + private final Node root; + /** + * 标记是否需要构建AC自动机,做树优化 + */ + private volatile boolean needBuildAc; - /** - * 默认构造 - */ - public NFA() { - this.root = new Node(); - } + /** + * 内置锁,防止并发场景,并行建AC树,造成不可预知结果 + */ + private final Object buildAcLock; - /** - * 构造函数 并 初始化词库 - * - * @param words 添加的新词 - */ - public NFA(final String... words) { - this(); - this.insert(words); - } + /** + * 内置锁,防止并行插入,新节点建立后,被挂载到树上前 被篡改 + */ + private final Object insertTreeLock; - /** - * 词库添加新词,初始化查找树 - * - * @param word 添加的新词 - */ - public void insert(final String word) { - Node p = root; - for (final char curr : word.toCharArray()) { - if (p.next.get((int) curr) == null) { - p.next.put((int) curr, new Node()); - } - p = p.next.get((int) curr); - } - p.flag = true; - p.str = word; - } + /** + * 默认构造 + */ + public NFA() { + this.root = new Node(); + this.needBuildAc = true; + this.buildAcLock = new Object(); + this.insertTreeLock = new Object(); + } - /** - * 词库批量添加新词,初始化查找树 - * - * @param words 添加的新词 - */ - public void insert(final String... words) { - for (final String word : words) { - this.insert(word); - } - } + /** + * 构造函数 并 初始化词库 + * + * @param words 添加的新词 + */ + public NFA(String... words) { + this(); + this.insert(words); + } - /** - * 构建基于NFA模型的 AC自动机 - */ - public void buildAc() { - final Queue queue = new LinkedList<>(); - final Node p = root; - for (final Integer key : p.next.keySet()) { - p.next.get(key).fail = root; - queue.offer(p.next.get(key)); - } - while (!queue.isEmpty()) { - final Node curr = queue.poll(); - for (final Integer key : curr.next.keySet()) { - Node fail = curr.fail; - // 查找当前节点匹配失败,他对应等效匹配的节点是哪个 - while (fail != null && fail.next.get(key) == null) { - fail = fail.fail; - } - // 代码到这,有两种可能,fail不为null,说明找到了fail;fail为null,没有找到,那么就把fail指向root节点(当到该节点匹配失败,那么从root节点开始重新匹配) - if (fail != null) { - fail = fail.next.get(key); - } else { - fail = root; - } - curr.next.get(key).fail = fail; - queue.offer(curr.next.get(key)); - } - } - } + /** + * 词库添加新词,初始化查找树 + * + * @param word 添加的新词 + */ + public void insert(String word) { + synchronized (insertTreeLock) { + needBuildAc = true; + Node p = root; + for (char curr : word.toCharArray()) { + int ind = curr; + if (p.next.get(ind) == null) { + p.next.put(ind, new Node()); + } + p = p.next.get(ind); + } + p.flag = true; + p.str = word; + } + } - /** - * @param text 查询的文本(母串) - * @return 关键字列表 - */ - public List find(final String text) { - return this.find(text, true); - } + /** + * 词库批量添加新词,初始化查找树 + * + * @param words 添加的新词 + */ + public void insert(String... words) { + for (String word : words) { + this.insert(word); + } + } - /** - * @param text 查找的文本(母串) - * @param isDensityMatch 是否密集匹配 - * @return 关键字列表 - */ - public List find(final String text, final boolean isDensityMatch) { - final List ans = new ArrayList<>(); - Node p = root, k; - for (int i = 0, len = text.length(); i < len; i++) { - final int ind = text.charAt(i); - // 状态转移(沿着fail指针链接的链表,此处区别于DFA模型) - while (p != null && p.next.get(ind) == null) { - p = p.fail; - } - if (p == null) { - p = root; - } else { - p = p.next.get(ind); - } - // 提取结果(沿着fail指针链接的链表,此处区别于DFA模型) - k = p; - while (k != null) { - if (k.flag) { - ans.add(new FoundWord(k.str, k.str, i - k.str.length() + 1, i)); - if (!isDensityMatch) { - p = root; - break; - } - } - k = k.fail; - } - } - return ans; - } + /** + * 构建基于NFA模型的 AC自动机 + */ + private void buildAc() { + Queue queue = new LinkedList<>(); + Node p = root; + for (Integer key : p.next.keySet()) { + p.next.get(key).fail = root; + queue.offer(p.next.get(key)); + } + while (!queue.isEmpty()) { + Node curr = queue.poll(); + for (Integer key : curr.next.keySet()) { + Node fail = curr.fail; + // 查找当前节点匹配失败,他对应等效匹配的节点是哪个 + while (fail != null && fail.next.get(key) == null) { + fail = fail.fail; + } + // 代码到这,有两种可能,fail不为null,说明找到了fail;fail为null,没有找到,那么就把fail指向root节点(当到该节点匹配失败,那么从root节点开始重新匹配) + if (fail != null) { + fail = fail.next.get(key); + } else { + fail = root; + } + curr.next.get(key).fail = fail; + queue.offer(curr.next.get(key)); + } + } + needBuildAc = false; + } - private static class Node { + /** + * @param text 查询的文本(母串) + */ + public List find(String text) { + return this.find(text, true); + } - boolean flag; - Node fail; - String str; - Map next; + /** + * @param text 查找的文本(母串) + * @param isDensityMatch 是否密集匹配 + */ + public List find(String text, boolean isDensityMatch) { + // double check,防止重复无用的 buildAC + if (needBuildAc) { + synchronized (buildAcLock) { + if (needBuildAc) { + this.buildAc(); + } + } + } + List ans = new ArrayList<>(); + Node p = root, k = null; + for (int i = 0, len = text.length(); i < len; i++) { + int ind = text.charAt(i); + // 状态转移(沿着fail指针链接的链表,此处区别于DFA模型) + while (p != null && p.next.get(ind) == null) { + p = p.fail; + } + if (p == null) { + p = root; + } else { + p = p.next.get(ind); + } + // 提取结果(沿着fail指针链接的链表,此处区别于DFA模型) + k = p; + while (k != null) { + if (k.flag) { + ans.add(new FoundWord(k.str, k.str, i - k.str.length() + 1, i)); + if (!isDensityMatch) { + p = root; + break; + } + } + k = k.fail; + } + } + return ans; + } - public Node() { - this.flag = false; - next = new HashMap<>(); - } - } + + private static class Node { + + /** + * 当前节点是否是一个单词的结尾 + */ + boolean flag; + /** + * 指向 当前节点匹配失败应该跳转的下个节点 + */ + Node fail; + /** + * 以当前节点结尾的单词 + */ + String str; + /** + * 当前节点的子节点 + */ + Map next; + + public Node() { + this.flag = false; + next = new HashMap<>(); + } + } } diff --git a/hutool-core/src/test/java/cn/hutool/core/text/dfa/NFATest.java b/hutool-core/src/test/java/cn/hutool/core/text/dfa/NFATest.java index e38be87f3..978d1ee23 100644 --- a/hutool-core/src/test/java/cn/hutool/core/text/dfa/NFATest.java +++ b/hutool-core/src/test/java/cn/hutool/core/text/dfa/NFATest.java @@ -16,7 +16,7 @@ public class NFATest { public void testFind() { final NFA NFA = new NFA(); NFA.insert("say", "her", "he", "she", "shr"); - NFA.buildAc(); +// NFA.buildAc(); final WordTree wordTree = new WordTree(); wordTree.addWords("say", "her", "he", "she", "shr"); @@ -53,7 +53,7 @@ public class NFATest { public void testFindNotDensity() { final NFA NFA = new NFA(); NFA.insert("say", "her", "he", "she", "shr"); - NFA.buildAc(); +// NFA.buildAc(); final WordTree wordTree = new WordTree(); wordTree.addWords("say", "her", "he", "she", "shr"); @@ -89,7 +89,7 @@ public class NFATest { stopWatch.start("automaton_char_buid_find"); final NFA NFALocal = new NFA(); NFALocal.insert("say", "her", "he", "she", "shr"); - NFALocal.buildAc(); +// NFALocal.buildAc(); final List ans1 = NFALocal.find(input); stopWatch.stop(); @@ -124,7 +124,7 @@ public class NFATest { stopWatch.start("automaton_cn_build_find"); final NFA NFALocal = new NFA(); NFALocal.insert("赵", "赵啊", "赵啊三"); - NFALocal.buildAc(); +// NFALocal.buildAc(); final List result = NFALocal.find(input); stopWatch.stop(); @@ -161,7 +161,7 @@ public class NFATest { final NFA NFALocal = new NFA(); NFALocal.insert("赵", "赵啊", "赵啊三"); - NFALocal.buildAc(); +// NFALocal.buildAc(); stopWatch.start("automaton_cn_find"); final List result = NFALocal.find(input); @@ -200,7 +200,7 @@ public class NFATest { final NFA NFALocal = new NFA(); NFALocal.insert("赵", "赵啊", "赵啊三"); - NFALocal.buildAc(); +// NFALocal.buildAc(); stopWatch.start("automaton_cn_find_not_density"); final List result = NFALocal.find(input, false);