This commit is contained in:
Looly
2021-09-25 11:25:51 +08:00
parent 35b9212103
commit e16ec27c40
21 changed files with 183 additions and 251 deletions

View File

@@ -1,56 +1,26 @@
package cn.hutool.extra.tokenizer;
import cn.hutool.core.collection.ComputeIter;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
* 对于未实现{@link Iterator}接口的普通结果类,装饰为{@link Result}<br>
* 普通的结果类只需实现{@link #nextWord()} 即可
*
*
* @author looly
*
*/
public abstract class AbstractResult implements Result{
private Word cachedWord;
@Override
public boolean hasNext() {
if (this.cachedWord != null) {
return true;
}
public abstract class AbstractResult extends ComputeIter<Word> implements Result{
final Word next = nextWord();
if(null != next) {
this.cachedWord = next;
return true;
}
return false;
}
/**
* 下一个单词,通过实现此方法获取下一个单词,null表示无下一个结果。
* @return 下一个单词或null
*/
protected abstract Word nextWord();
@Override
public Word next() {
if (false == hasNext()) {
throw new NoSuchElementException("No more word !");
}
final Word currentWord = this.cachedWord;
this.cachedWord = null;
return currentWord;
}
@Override
public void remove() {
throw new UnsupportedOperationException("Jcseg result not allow to remove !");
}
@Override
public Iterator<Word> iterator() {
return this;
protected Word computeNext() {
return nextWord();
}
}

View File

@@ -1,14 +1,14 @@
package cn.hutool.extra.tokenizer;
import java.util.Iterator;
import cn.hutool.core.collection.IterableIter;
/**
* 分词结果接口定义<br>
* 实现此接口包装分词器的分词结果,通过实现Iterator相应方法获取分词中的单词
*
*
* @author looly
*
*/
public interface Result extends Iterator<Word>, Iterable<Word>{
public interface Result extends IterableIter<Word> {
}

View File

@@ -1,25 +1,24 @@
package cn.hutool.extra.tokenizer.engine.ansj;
import java.util.Iterator;
import org.ansj.domain.Term;
import cn.hutool.extra.tokenizer.Result;
import cn.hutool.extra.tokenizer.Word;
import org.ansj.domain.Term;
import java.util.Iterator;
/**
* Ansj分词结果实现<br>
* 项目地址:https://github.com/NLPchina/ansj_seg
*
* @author looly
*
* @author looly
*/
public class AnsjResult implements Result{
public class AnsjResult implements Result {
private final Iterator<Term> result;
/**
* 构造
*
* @param ansjResult 分词结果
*/
public AnsjResult(org.ansj.domain.Result ansjResult) {

View File

@@ -1,24 +1,23 @@
package cn.hutool.extra.tokenizer.engine.hanlp;
import cn.hutool.extra.tokenizer.Result;
import cn.hutool.extra.tokenizer.Word;
import com.hankcs.hanlp.seg.common.Term;
import java.util.Iterator;
import java.util.List;
import com.hankcs.hanlp.seg.common.Term;
import cn.hutool.extra.tokenizer.Result;
import cn.hutool.extra.tokenizer.Word;
/**
* HanLP分词结果实现<br>
* 项目地址:https://github.com/hankcs/HanLP
*
*
* @author looly
*
*/
public class HanLPResult implements Result {
Iterator<Term> result;
public HanLPResult(List<Term> termList) {
this.result = termList.iterator();
}
@@ -37,11 +36,4 @@ public class HanLPResult implements Result {
public void remove() {
result.remove();
}
@Override
public Iterator<Word> iterator() {
return this;
}
}

View File

@@ -11,7 +11,7 @@ import java.io.IOException;
/**
* IKAnalyzer分词结果实现<br>
* 项目地址:https://github.com/yozhao/IKAnalyzer
*
*
* @author looly
*
*/
@@ -21,7 +21,7 @@ public class IKAnalyzerResult extends AbstractResult {
/**
* 构造
*
*
* @param seg 分词结果
*/
public IKAnalyzerResult(IKSegmenter seg) {
@@ -36,9 +36,9 @@ public class IKAnalyzerResult extends AbstractResult {
} catch (IOException e) {
throw new TokenizerException(e);
}
if (null != next) {
return new IKAnalyzerWord(next);
if (null == next) {
return null;
}
return null;
return new IKAnalyzerWord(next);
}
}

View File

@@ -1,27 +1,24 @@
package cn.hutool.extra.tokenizer.engine.jcseg;
import cn.hutool.extra.tokenizer.Result;
import cn.hutool.extra.tokenizer.AbstractResult;
import cn.hutool.extra.tokenizer.TokenizerException;
import cn.hutool.extra.tokenizer.Word;
import org.lionsoul.jcseg.ISegment;
import org.lionsoul.jcseg.IWord;
import java.io.IOException;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
* Jcseg分词结果包装<br>
* 项目地址:https://gitee.com/lionsoul/jcseg
*
*
* @author looly
*
*/
public class JcsegResult implements Result{
public class JcsegResult extends AbstractResult {
private final ISegment result;
private Word cachedWord;
/**
* 构造
* @param segment 分词结果
@@ -31,41 +28,16 @@ public class JcsegResult implements Result{
}
@Override
public boolean hasNext() {
if (this.cachedWord != null) {
return true;
}
IWord next;
protected Word nextWord() {
IWord word;
try {
next = this.result.next();
word = this.result.next();
} catch (IOException e) {
throw new TokenizerException(e);
throw new TokenizerException(e);
}
if(null != next) {
this.cachedWord = new JcsegWord(next);
return true;
if(null == word){
return null;
}
return false;
return new JcsegWord(word);
}
@Override
public Word next() {
if (false == hasNext()) {
throw new NoSuchElementException("No more word !");
}
final Word currentWord = this.cachedWord;
this.cachedWord = null;
return currentWord;
}
@Override
public void remove() {
throw new UnsupportedOperationException("Jcseg result not allow to remove !");
}
@Override
public Iterator<Word> iterator() {
return this;
}
}

View File

@@ -1,24 +1,23 @@
package cn.hutool.extra.tokenizer.engine.jieba;
import cn.hutool.extra.tokenizer.Result;
import cn.hutool.extra.tokenizer.Word;
import com.huaban.analysis.jieba.SegToken;
import java.util.Iterator;
import java.util.List;
import com.huaban.analysis.jieba.SegToken;
import cn.hutool.extra.tokenizer.Result;
import cn.hutool.extra.tokenizer.Word;
/**
* Jieba分词结果实现<br>
* 项目地址:https://github.com/huaban/jieba-analysis
*
*
* @author looly
*
*/
public class JiebaResult implements Result{
Iterator<SegToken> result;
/**
* 构造
* @param segTokenList 分词结果
@@ -41,10 +40,4 @@ public class JiebaResult implements Result{
public void remove() {
result.remove();
}
@Override
public Iterator<Word> iterator() {
return this;
}
}

View File

@@ -10,7 +10,7 @@ import java.io.IOException;
/**
* mmseg4j分词结果实现<br>
* 项目地址:https://github.com/chenlb/mmseg4j-core
*
*
* @author looly
*
*/
@@ -20,7 +20,7 @@ public class MmsegResult extends AbstractResult {
/**
* 构造
*
*
* @param mmSeg 分词结果
*/
public MmsegResult(MMSeg mmSeg) {
@@ -35,9 +35,9 @@ public class MmsegResult extends AbstractResult {
} catch (IOException e) {
throw new TokenizerException(e);
}
if (null != next) {
return new MmsegWord(next);
if (null == next) {
return null;
}
return null;
return new MmsegWord(next);
}
}

View File

@@ -10,17 +10,17 @@ import java.util.Iterator;
/**
* MYNLP 中文NLP工具包分词结果实现<br>
* 项目地址:https://github.com/mayabot/mynlp/
*
*
* @author looly
*
*/
public class MynlpResult implements Result {
private final Iterator<WordTerm> result;
/**
* 构造
*
*
* @param sentence 分词结果(中文句子)
*/
public MynlpResult(Sentence sentence) {
@@ -41,9 +41,4 @@ public class MynlpResult implements Result {
public void remove() {
result.remove();
}
@Override
public Iterator<Word> iterator() {
return this;
}
}

View File

@@ -9,7 +9,7 @@ import java.util.List;
/**
* Word分词结果实现<br>
* 项目地址:https://github.com/ysc/word
*
*
* @author looly
*
*/
@@ -19,7 +19,7 @@ public class WordResult implements Result{
/**
* 构造
*
*
* @param result 分词结果
*/
public WordResult(List<org.apdplat.word.segmentation.Word> result) {
@@ -40,10 +40,4 @@ public class WordResult implements Result{
public void remove() {
this.wordIter.remove();
}
@Override
public Iterator<Word> iterator() {
return this;
}
}