mirror of
https://gitee.com/chinabugotech/hutool.git
synced 2025-07-21 15:09:48 +08:00
add mynlp
This commit is contained in:
@@ -10,6 +10,7 @@ import cn.hutool.extra.tokenizer.engine.ikanalyzer.IKAnalyzerEngine;
|
||||
import cn.hutool.extra.tokenizer.engine.jcseg.JcsegEngine;
|
||||
import cn.hutool.extra.tokenizer.engine.jieba.JiebaEngine;
|
||||
import cn.hutool.extra.tokenizer.engine.mmseg.MmsegEngine;
|
||||
import cn.hutool.extra.tokenizer.engine.mynlp.MynlpEngine;
|
||||
import cn.hutool.extra.tokenizer.engine.word.WordEngine;
|
||||
import cn.hutool.log.StaticLog;
|
||||
|
||||
@@ -77,6 +78,11 @@ public class TokenizerFactory {
|
||||
} catch (NoClassDefFoundError e) {
|
||||
// ignore
|
||||
}
|
||||
try {
|
||||
return new MynlpEngine();
|
||||
} catch (NoClassDefFoundError e) {
|
||||
// ignore
|
||||
}
|
||||
throw new TokenizerException("No tokenizer found ! Please add some tokenizer jar to your project !");
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,44 @@
|
||||
package cn.hutool.extra.tokenizer.engine.mynlp;
|
||||
|
||||
import com.mayabot.nlp.segment.Lexer;
|
||||
import com.mayabot.nlp.segment.Lexers;
|
||||
import com.mayabot.nlp.segment.Sentence;
|
||||
|
||||
import cn.hutool.core.util.StrUtil;
|
||||
import cn.hutool.extra.tokenizer.Result;
|
||||
import cn.hutool.extra.tokenizer.TokenizerEngine;
|
||||
|
||||
/**
|
||||
* MYNLP 中文NLP工具包分词实现<br>
|
||||
* 项目地址:https://github.com/mayabot/mynlp/
|
||||
*
|
||||
* @author looly
|
||||
*
|
||||
*/
|
||||
public class MynlpEngine implements TokenizerEngine {
|
||||
|
||||
private Lexer lexer;
|
||||
|
||||
/**
|
||||
* 构造
|
||||
*/
|
||||
public MynlpEngine() {
|
||||
this.lexer = Lexers.core();
|
||||
}
|
||||
|
||||
/**
|
||||
* 构造
|
||||
*
|
||||
* @param lexer 分词器接口{@link Lexer}
|
||||
*/
|
||||
public MynlpEngine(Lexer lexer) {
|
||||
this.lexer = lexer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Result parse(CharSequence text) {
|
||||
final Sentence sentence = this.lexer.scan(StrUtil.str(text));
|
||||
return new MynlpResult(sentence);
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,50 @@
|
||||
package cn.hutool.extra.tokenizer.engine.mynlp;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
import com.mayabot.nlp.segment.Sentence;
|
||||
import com.mayabot.nlp.segment.WordTerm;
|
||||
|
||||
import cn.hutool.extra.tokenizer.Result;
|
||||
import cn.hutool.extra.tokenizer.Word;
|
||||
|
||||
/**
|
||||
* MYNLP 中文NLP工具包分词结果实现<br>
|
||||
* 项目地址:https://github.com/mayabot/mynlp/
|
||||
*
|
||||
* @author looly
|
||||
*
|
||||
*/
|
||||
public class MynlpResult implements Result {
|
||||
|
||||
private Iterator<WordTerm> result;
|
||||
|
||||
/**
|
||||
* 构造
|
||||
*
|
||||
* @param sentence 分词结果(中文句子)
|
||||
*/
|
||||
public MynlpResult(Sentence sentence) {
|
||||
this.result = sentence.iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return result.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Word next() {
|
||||
return new MynlpWord(result.next());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
result.remove();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<Word> iterator() {
|
||||
return this;
|
||||
}
|
||||
}
|
@@ -0,0 +1,45 @@
|
||||
package cn.hutool.extra.tokenizer.engine.mynlp;
|
||||
|
||||
import com.mayabot.nlp.segment.WordTerm;
|
||||
|
||||
import cn.hutool.extra.tokenizer.Word;
|
||||
|
||||
/**
|
||||
* mmseg分词中的一个单词包装
|
||||
*
|
||||
* @author looly
|
||||
*
|
||||
*/
|
||||
public class MynlpWord implements Word {
|
||||
|
||||
private WordTerm word;
|
||||
|
||||
/**
|
||||
* 构造
|
||||
*
|
||||
* @param word {@link WordTerm}
|
||||
*/
|
||||
public MynlpWord(WordTerm word) {
|
||||
this.word = word;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getText() {
|
||||
return word.getWord();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getStartOffset() {
|
||||
return this.word.offset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getEndOffset() {
|
||||
return getStartOffset() + word.word.length();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getText();
|
||||
}
|
||||
}
|
@@ -0,0 +1,8 @@
|
||||
/**
|
||||
* MYNLP 中文NLP工具包分词实现<br>
|
||||
* 项目地址:https://github.com/mayabot/mynlp/
|
||||
*
|
||||
* @author Looly
|
||||
* @since 4.6.5
|
||||
*/
|
||||
package cn.hutool.extra.tokenizer.engine.mynlp;
|
Reference in New Issue
Block a user