diff --git a/CHANGELOG.md b/CHANGELOG.md
index 916243740..2c6215634 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@
* 【extra】 Sftp得put方法增加进度支持(issue#518@Github)
* 【core】 ArrayUtil增加distinct方法
* 【http】 去除log模块依赖,Cookie中去除日志提示,body方法传入JSON对象废弃,未来移除json模块依赖
+* 【extra】 添加MyNLP支持(issue#519@Github)
### Bug修复
diff --git a/hutool-extra/pom.xml b/hutool-extra/pom.xml
index 038948df5..c885011bd 100644
--- a/hutool-extra/pom.xml
+++ b/hutool-extra/pom.xml
@@ -200,5 +200,11 @@
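 			<version>1.2</version>
 			<optional>true</optional>
 		</dependency>
+		<dependency>
+			<groupId>com.mayabot.mynlp</groupId>
+			<artifactId>mynlp-segment</artifactId>
+			<version>3.0.0</version>
+			<optional>true</optional>
+		</dependency>
 	</dependencies>
 </project>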
diff --git a/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/TokenizerFactory.java b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/TokenizerFactory.java
index 5309ee384..9e0ef1a6b 100644
--- a/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/TokenizerFactory.java
+++ b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/TokenizerFactory.java
@@ -10,6 +10,7 @@ import cn.hutool.extra.tokenizer.engine.ikanalyzer.IKAnalyzerEngine;
import cn.hutool.extra.tokenizer.engine.jcseg.JcsegEngine;
import cn.hutool.extra.tokenizer.engine.jieba.JiebaEngine;
import cn.hutool.extra.tokenizer.engine.mmseg.MmsegEngine;
+import cn.hutool.extra.tokenizer.engine.mynlp.MynlpEngine;
import cn.hutool.extra.tokenizer.engine.word.WordEngine;
import cn.hutool.log.StaticLog;
@@ -77,6 +78,11 @@ public class TokenizerFactory {
} catch (NoClassDefFoundError e) {
// ignore
}
+ try {
+ return new MynlpEngine();
+ } catch (NoClassDefFoundError e) {
+ // ignore
+ }
throw new TokenizerException("No tokenizer found ! Please add some tokenizer jar to your project !");
}
}
diff --git a/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpEngine.java b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpEngine.java
new file mode 100644
index 000000000..640a7defc
--- /dev/null
+++ b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpEngine.java
@@ -0,0 +1,44 @@
+package cn.hutool.extra.tokenizer.engine.mynlp;
+
+import com.mayabot.nlp.segment.Lexer;
+import com.mayabot.nlp.segment.Lexers;
+import com.mayabot.nlp.segment.Sentence;
+
+import cn.hutool.core.util.StrUtil;
+import cn.hutool.extra.tokenizer.Result;
+import cn.hutool.extra.tokenizer.TokenizerEngine;
+
+/**
+ * Tokenizer engine backed by the MYNLP Chinese NLP toolkit.
+ * Project page: https://github.com/mayabot/mynlp/
+ *
+ * @author looly
+ */
+public class MynlpEngine implements TokenizerEngine {
+
+	private Lexer lexer;
+
+	/**
+	 * Creates an engine that uses the lexer returned by {@link Lexers#core()}.
+	 */
+	public MynlpEngine() {
+		this(Lexers.core());
+	}
+
+	/**
+	 * Creates an engine that uses the supplied lexer.
+	 *
+	 * @param lexer the {@link Lexer} used to scan text
+	 */
+	public MynlpEngine(Lexer lexer) {
+		this.lexer = lexer;
+	}
+
+	@Override
+	public Result parse(CharSequence text) {
+		// Convert the input to a String, scan it, and wrap the resulting sentence.
+		final String input = StrUtil.str(text);
+		final Sentence scanned = this.lexer.scan(input);
+		return new MynlpResult(scanned);
+	}
+}
diff --git a/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpResult.java b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpResult.java
new file mode 100644
index 000000000..5fc0f73ea
--- /dev/null
+++ b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpResult.java
@@ -0,0 +1,50 @@
+package cn.hutool.extra.tokenizer.engine.mynlp;
+
+import java.util.Iterator;
+
+import com.mayabot.nlp.segment.Sentence;
+import com.mayabot.nlp.segment.WordTerm;
+
+import cn.hutool.extra.tokenizer.Result;
+import cn.hutool.extra.tokenizer.Word;
+
+/**
+ * Tokenization result backed by the MYNLP Chinese NLP toolkit.
+ * Project page: https://github.com/mayabot/mynlp/
+ *
+ * @author looly
+ */
+public class MynlpResult implements Result {
+	private final Iterator<WordTerm> result;
+
+	/**
+	 * Wraps the terms of a scanned sentence.
+	 *
+	 * @param sentence the segmented sentence whose terms are iterated
+	 */
+	public MynlpResult(Sentence sentence) {
+		this.result = sentence.iterator();
+	}
+
+	@Override
+	public boolean hasNext() {
+		return result.hasNext();
+	}
+
+	@Override
+	public Word next() {
+		// The typed iterator yields WordTerm, which is what MynlpWord's constructor takes.
+		return new MynlpWord(result.next());
+	}
+
+	@Override
+	public void remove() {
+		result.remove();
+	}
+
+	@Override
+	public Iterator<Word> iterator() {
+		// Returns this same object, so the result can only be traversed once.
+		return this;
+	}
+}
diff --git a/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpWord.java b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpWord.java
new file mode 100644
index 000000000..a273c75dd
--- /dev/null
+++ b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpWord.java
@@ -0,0 +1,45 @@
+package cn.hutool.extra.tokenizer.engine.mynlp;
+
+import com.mayabot.nlp.segment.WordTerm;
+
+import cn.hutool.extra.tokenizer.Word;
+
+/**
+ * Adapter exposing a single MYNLP {@link WordTerm} as a {@link Word}.
+ *
+ * @author looly
+ */
+public class MynlpWord implements Word {
+	private WordTerm word;
+
+	/**
+	 * Wraps the given term.
+	 *
+	 * @param word the {@link WordTerm} to wrap
+	 */
+	public MynlpWord(WordTerm word) {
+		this.word = word;
+	}
+
+	@Override
+	public String getText() {
+		return this.word.getWord();
+	}
+
+	@Override
+	public int getStartOffset() {
+		return word.offset;
+	}
+
+	@Override
+	public int getEndOffset() {
+		// End offset is the start offset plus the length of the term's text.
+		final int start = getStartOffset();
+		return start + this.word.word.length();
+	}
+
+	@Override
+	public String toString() {
+		return this.getText();
+	}
+}
diff --git a/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/package-info.java b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/package-info.java
new file mode 100644
index 000000000..9d528f00a
--- /dev/null
+++ b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/package-info.java
@@ -0,0 +1,8 @@
+/**
+ * MYNLP 中文NLP工具包分词实现
+ * 项目地址:https://github.com/mayabot/mynlp/
+ *
+ * @author Looly
+ * @since 4.6.5
+ */
+package cn.hutool.extra.tokenizer.engine.mynlp;
\ No newline at end of file
diff --git a/hutool-extra/src/test/java/cn/hutool/extra/tokenizer/TokenizerUtilTest.java b/hutool-extra/src/test/java/cn/hutool/extra/tokenizer/TokenizerUtilTest.java
index cc752035a..3b061cab5 100644
--- a/hutool-extra/src/test/java/cn/hutool/extra/tokenizer/TokenizerUtilTest.java
+++ b/hutool-extra/src/test/java/cn/hutool/extra/tokenizer/TokenizerUtilTest.java
@@ -3,6 +3,7 @@ package cn.hutool.extra.tokenizer;
import java.util.Iterator;
import org.junit.Assert;
+import org.junit.Ignore;
import org.junit.Test;
import cn.hutool.core.collection.CollUtil;
@@ -12,6 +13,7 @@ import cn.hutool.extra.tokenizer.engine.ikanalyzer.IKAnalyzerEngine;
import cn.hutool.extra.tokenizer.engine.jcseg.JcsegEngine;
import cn.hutool.extra.tokenizer.engine.jieba.JiebaEngine;
import cn.hutool.extra.tokenizer.engine.mmseg.MmsegEngine;
+import cn.hutool.extra.tokenizer.engine.mynlp.MynlpEngine;
import cn.hutool.extra.tokenizer.engine.word.WordEngine;
/**
@@ -86,6 +88,16 @@ public class TokenizerUtilTest {
Assert.assertEquals("这两个 方法 的 区别 在于 返回值", resultStr);
}
+	@Test
+	@Ignore
+	public void mynlpTest() {
+		// Ignored by default: this unit test requires JDK 8.
+		final Result words = new MynlpEngine().parse(text);
+		final String joined = CollUtil.join((Iterator)words, " ");
+		final String expected = "这 两个 方法 的 区别 在于 返回 值";
+		Assert.assertEquals(expected, joined);
+	}
+
private void checkResult(Result result) {
String resultStr = CollUtil.join((Iterator)result, " ");
Assert.assertEquals("这 两个 方法 的 区别 在于 返回 值", resultStr);
diff --git a/pom.xml b/pom.xml
index a93108a31..efa60d0b1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -89,6 +89,7 @@
${compile.version}
${compile.version}
+ true