use of org.wltea.analyzer.core.Lexeme in project elasticsearch-analysis-ik by medcl.
the class IKTokenizer method incrementToken.
/* (non-Javadoc)
 * @see org.apache.lucene.analysis.TokenStream#incrementToken()
 */
@Override
public boolean incrementToken() throws IOException {
    // Clear all attributes left over from the previous token
    clearAttributes();
    skippedPositions = 0;
    Lexeme nextLexeme = _IKImplement.next();
    if (nextLexeme != null) {
        posIncrAtt.setPositionIncrement(skippedPositions + 1);
        // Convert the Lexeme into Lucene attributes
        // Set the token text
        termAtt.append(nextLexeme.getLexemeText());
        // Set the token length
        termAtt.setLength(nextLexeme.getLength());
        // Set the token offsets
        offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));
        // Record the end position of the segmentation
        endPosition = nextLexeme.getEndPosition();
        // Record the token type
        typeAtt.setType(nextLexeme.getLexemeTypeString());
        // Return true to signal that another token is available
        return true;
    }
    // Return false to signal that all tokens have been emitted
    return false;
}
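For orientation, here is a minimal sketch of how a tokenizer like this is typically driven through the standard Lucene TokenStream contract (reset, incrementToken, end, close). The helper class and method names are illustrative, and it assumes the caller has already constructed the tokenizer and supplied its input via setReader(...):

import java.io.IOException;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public final class TokenStreamDemo {

    // Drives any Tokenizer (for example an already-constructed IKTokenizer whose
    // reader has been set) through the standard consume cycle and prints each
    // token together with its offsets and type.
    static void printTokens(Tokenizer tokenizer) throws IOException {
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
        TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
        tokenizer.reset();                   // required before the first incrementToken()
        while (tokenizer.incrementToken()) { // each call fills the attributes for one token
            System.out.printf("%s [%d,%d] %s%n",
                    term.toString(), offset.startOffset(), offset.endOffset(), type.type());
        }
        tokenizer.end();                     // finalizes offset state
        tokenizer.close();
    }
}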
use of org.wltea.analyzer.core.Lexeme in project jstarcraft-nlp by HongZhaoHua.
the class IkTokenizer method incrementToken.
/*
 * (non-Javadoc)
 *
 * @see org.apache.lucene.analysis.TokenStream#incrementToken()
 */
@Override
public boolean incrementToken() throws IOException {
    // Clear all attributes left over from the previous token
    clearAttributes();
    Lexeme nextLexeme = _IKImplement.next();
    if (nextLexeme != null) {
        // Convert the Lexeme into Lucene attributes
        // Set the token text
        termAttribute.append(nextLexeme.getLexemeText());
        // Set the token length
        termAttribute.setLength(nextLexeme.getLength());
        // Set the token offsets
        offsetAttribute.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
        // Record the end position of the segmentation
        endPosition = nextLexeme.getEndPosition();
        // Record the token type
        typeAttribute.setType(nextLexeme.getLexemeTypeString());
        // Return true to signal that another token is available
        return true;
    }
    // Return false to signal that all tokens have been emitted
    return false;
}
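Compared with the elasticsearch-analysis-ik variant above, this version does not track skippedPositions or set a position increment, and it passes the raw begin/end positions to the offset attribute without routing them through correctOffset(...).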
use of org.wltea.analyzer.core.Lexeme in project jstarcraft-nlp by HongZhaoHua.
the class IkTokenizer method tokenize.
@Override
public Iterable<IkToken> tokenize(CharSequence text) {
    try {
        // Point the segmenter at the new text
        segmenter.reset(new StringReader(text.toString()));
        // Drain the segmenter eagerly into a list of lexemes
        LinkedList<Lexeme> iterator = new LinkedList<>();
        while (true) {
            Lexeme lexeme = segmenter.next();
            if (lexeme != null) {
                iterator.add(lexeme);
            } else {
                break;
            }
        }
        // Wrap the collected lexemes in an IkToken iterable
        IkToken iterable = new IkToken(iterator.iterator());
        return iterable;
    } catch (Exception exception) {
        throw new RuntimeException(exception);
    }
}
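A possible call site, sketched under assumptions: the sample text is illustrative, the IkTokenizer instance is assumed to be constructed elsewhere, and since IkToken's accessors are not shown in the snippet the loop simply relies on toString():

// Illustrative only; imports for IkTokenizer and IkToken depend on the jstarcraft-nlp package layout.
static void dumpTokens(IkTokenizer tokenizer, String text) {
    for (IkToken token : tokenizer.tokenize(text)) {
        System.out.println(token);
    }
}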
use of org.wltea.analyzer.core.Lexeme in project jivejdon by banq.
the class MessageSearchProxy method getTextDef.
public static Map<String, Integer> getTextDef(String text) throws IOException {
    Map<String, Integer> wordsFren = new HashMap<String, Integer>();
    // Segment the text in smart mode and count the frequency of each token
    IKSegmenter ikSegmenter = new IKSegmenter(new StringReader(text), true);
    Lexeme lexeme;
    while ((lexeme = ikSegmenter.next()) != null) {
        // Ignore single-character tokens
        if (lexeme.getLexemeText().length() > 1) {
            if (wordsFren.containsKey(lexeme.getLexemeText())) {
                wordsFren.put(lexeme.getLexemeText(), wordsFren.get(lexeme.getLexemeText()) + 1);
            } else {
                wordsFren.put(lexeme.getLexemeText(), 1);
            }
        }
    }
    return wordsFren;
}
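As a side note on the counting branch, the containsKey/get/put sequence can be written more compactly with Map.merge from java.util.Map (Java 8+); a sketch of an equivalent loop body, not the project's actual code:

// Equivalent to the if/else above: start the count at 1, otherwise add 1 to the existing count.
String word = lexeme.getLexemeText();
if (word.length() > 1) {
    wordsFren.merge(word, 1, Integer::sum);
}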
use of org.wltea.analyzer.core.Lexeme in project incubator-hugegraph by apache.
the class IKAnalyzer method segment.
@Override
public Set<String> segment(String text) {
    Set<String> result = InsertionOrderUtil.newSet();
    IKSegmenter ik = new IKSegmenter(new StringReader(text), this.smartSegMode);
    try {
        // Collect the text of every lexeme, preserving insertion order
        Lexeme word = null;
        while ((word = ik.next()) != null) {
            result.add(word.getLexemeText());
        }
    } catch (Exception e) {
        throw new HugeException("IKAnalyzer segment text '%s' failed", e, text);
    }
    return result;
}
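For reference, the same IKSegmenter loop can also be run standalone. The sketch below uses only the API already visible in the snippets above (the IKSegmenter(Reader, boolean) constructor, next(), and getLexemeText()); the sample text and printing are illustrative, and the boolean selects smart (coarse) segmentation versus fine-grained output:

import java.io.IOException;
import java.io.StringReader;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IkSegmentDemo {
    public static void main(String[] args) throws IOException {
        // true = smart (coarse-grained) mode, false = finest-grained segmentation
        IKSegmenter segmenter = new IKSegmenter(new StringReader("中华人民共和国国歌"), true);
        Lexeme lexeme;
        while ((lexeme = segmenter.next()) != null) {
            System.out.println(lexeme.getLexemeText());
        }
    }
}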