use of com.chenlb.mmseg4j.Word in project java-basic by tzuyichao.
the class TestMMSeg4J method main.
/**
 * Segments a fixed sample sentence with mmseg4j and prints every resulting
 * word on its own line to standard output.
 *
 * @param args command-line arguments (not used)
 * @throws IOException if the segmenter fails while reading the input text
 */
public static void main(String[] args) throws IOException {
    Dictionary dictionary = Dictionary.getInstance();
    MMSeg mmSeg = new MMSeg(
            new StringReader("上一堂課之後跑18km與2500rpm的挑戰"),
            new ComplexSeg(dictionary));
    // next() returns null once the input has been fully consumed.
    Word word;
    while ((word = mmSeg.next()) != null) {
        System.out.println(word.getString());
    }
}
use of com.chenlb.mmseg4j.Word in project jstarcraft-nlp by HongZhaoHua.
the class MmsegTokenizer method incrementToken.
/*
 * Lucene 2.9 and below:
 *
 * public Token next(Token reusableToken) throws IOException {
 *     Token token = null;
 *     Word word = mmSeg.next();
 *     if (word != null) {
 *         // lucene 2.3
 *         reusableToken.clear();
 *         reusableToken.setTermBuffer(word.getSen(), word.getWordOffset(), word.getLength());
 *         reusableToken.setStartOffset(word.getStartOffset());
 *         reusableToken.setEndOffset(word.getEndOffset());
 *         reusableToken.setType(word.getType());
 *         token = reusableToken;
 *         // lucene 2.4
 *         // token = reusableToken.reinit(word.getSen(), word.getWordOffset(), word.getLength(),
 *         //         word.getStartOffset(), word.getEndOffset(), word.getType());
 *     }
 *     return token;
 * }
 */
// lucene 2.9/3.0
/**
 * Advances to the next word produced by the segmenter and publishes it
 * through the term, offset and type attributes.
 *
 * @return {@code true} if a word was available and the attributes were
 *         populated; {@code false} once the segmenter is exhausted, in
 *         which case {@code end()} has been invoked first
 * @throws IOException if the underlying segmenter fails to read its input
 */
@Override
public boolean incrementToken() throws IOException {
    clearAttributes();
    Word next = mmSeg.get().next();
    if (next == null) {
        // No more words: finish the stream before reporting exhaustion.
        end();
        return false;
    }
    // copyBuffer is the lucene 3.1 form; lucene 3.0 used
    // termAtt.setTermBuffer(sen, wordOffset, length) instead.
    termAttribute.copyBuffer(next.getSen(), next.getWordOffset(), next.getLength());
    offsetAttribute.setOffset(next.getStartOffset(), next.getEndOffset());
    typeAttribute.setType(next.getType());
    return true;
}
use of com.chenlb.mmseg4j.Word in project jstarcraft-nlp by HongZhaoHua.
the class MmsegTokenizer method tokenize.
/**
 * Segments {@code text} and returns the resulting words wrapped in a
 * {@link MmsegToken}.
 *
 * <p>The segmenter is reset onto the text, drained completely into a list,
 * and the list's iterator is handed to the returned token object.
 *
 * @param text the text to segment
 * @return a {@code MmsegToken} backed by an iterator over the segmented words
 * @throws RuntimeException wrapping any exception raised while resetting or
 *         reading from the segmenter
 */
@Override
public Iterable<MmsegToken> tokenize(CharSequence text) {
    try {
        mmSeg.reset(new StringReader(text.toString()));
        LinkedList<Word> words = new LinkedList<>();
        // next() yields null when the input is exhausted.
        for (Word current = mmSeg.next(); current != null; current = mmSeg.next()) {
            words.add(current);
        }
        return new MmsegToken(words.iterator());
    } catch (Exception exception) {
        throw new RuntimeException(exception);
    }
}
use of com.chenlb.mmseg4j.Word in project incubator-hugegraph by apache.
the class MMSeg4JAnalyzer method segment.
/**
 * Splits {@code text} into words with MMSeg4j and returns their string
 * forms.
 *
 * @param text the text to segment
 * @return the set obtained from {@code InsertionOrderUtil.newSet()},
 *         filled with the string of each word in the order produced
 * @throws HugeException if reading words from the segmenter fails; the
 *         original exception is attached as the cause
 */
@Override
public Set<String> segment(String text) {
    Set<String> tokens = InsertionOrderUtil.newSet();
    MMSeg segmenter = new MMSeg(new StringReader(text), this.seg);
    try {
        // next() yields null when the input is exhausted.
        for (Word current = segmenter.next(); current != null; current = segmenter.next()) {
            tokens.add(current.getString());
        }
    } catch (Exception e) {
        throw new HugeException("MMSeg4j segment text '%s' failed", e, text);
    }
    return tokens;
}
Aggregations