Example use of com.twitter.common.text.token.attribute.TokenGroupAttributeImpl in the Twitter commons project.
From the class TokenizedCharSequence, method createFromTokenGroupsIn.
/**
 * Turns every token of {@code stream} into its own {@link TokenizedCharSequence}.
 *
 * <p>For each token emitted by {@code stream}, a new sequence is built whose text is the token's
 * term and whose tokens are those produced by the token-group stream attached to that token.
 * Token offsets are stored relative to the offset of the enclosing token. Part of speech,
 * position increment and nested token group are taken from the group stream when it carries the
 * corresponding attribute; otherwise {@code Token.DEFAULT_PART_OF_SPEECH}, {@code 1} and
 * {@code null} are used respectively.
 *
 * @param stream the stream to read; its {@code TokenGroupAttribute} is fetched unconditionally
 * @return one sequence per token of {@code stream}, in stream order (empty if there are no tokens)
 */
public static final List<TokenizedCharSequence> createFromTokenGroupsIn(TwitterTokenStream stream) {
  TokenGroupAttribute outerGroups = stream.getAttribute(TokenGroupAttribute.class);
  List<TokenizedCharSequence> result = Lists.newArrayList();

  while (stream.incrementToken()) {
    Builder sequenceBuilder = new Builder(stream.term());
    TwitterTokenStream inner = outerGroups.getTokenGroupStream();

    // Optional attributes of the group stream; null means "not provided, fall back to default".
    PartOfSpeechAttribute pos = inner.hasAttribute(PartOfSpeechAttribute.class)
        ? inner.getAttribute(PartOfSpeechAttribute.class)
        : null;
    PositionIncrementAttribute increment = inner.hasAttribute(PositionIncrementAttribute.class)
        ? inner.getAttribute(PositionIncrementAttribute.class)
        : null;
    TokenGroupAttributeImpl nested = inner.hasAttribute(TokenGroupAttribute.class)
        ? (TokenGroupAttributeImpl) inner.getAttribute(TokenGroupAttribute.class)
        : null;

    while (inner.incrementToken()) {
      // Offsets inside the new sequence are relative to the start of the enclosing token.
      int relativeOffset = inner.offset() - stream.offset();
      sequenceBuilder.addToken(
          relativeOffset,
          inner.length(),
          inner.type(),
          pos != null ? pos.getPOS() : Token.DEFAULT_PART_OF_SPEECH,
          increment != null ? increment.getPositionIncrement() : 1,
          // Reuse an already materialized nested sequence, otherwise build it from the
          // nested group's stream.
          nested != null && !nested.isEmpty()
              ? (nested.getSequence() != null
                  ? nested.getSequence()
                  : createFrom(nested.getTokenGroupStream()))
              : null);
    }

    result.add(sequenceBuilder.build());
  }

  return result;
}
Example use of com.twitter.common.text.token.attribute.TokenGroupAttributeImpl in the Twitter commons project.
From the class TokenizedCharSequence, method createFrom.
/**
 * Consumes {@code tokenizer} and collects all of its tokens into a single
 * {@link TokenizedCharSequence}.
 *
 * <p>The text of the sequence is the character sequence reported by the term attribute after the
 * first successful {@code incrementToken()}. Part of speech, position increment and token group
 * are read from the tokenizer when it carries the corresponding attribute; otherwise
 * {@code Token.DEFAULT_PART_OF_SPEECH}, {@code 1} and {@code null} are used. A non-empty token
 * group contributes its existing sequence if it has one, or a sequence built recursively from
 * the group's own stream.
 *
 * @param tokenizer the stream to consume; its term and type attributes are fetched unconditionally
 * @return the sequence of all tokens, or a sequence over the empty string if the tokenizer
 *     produced no token
 */
public static final TokenizedCharSequence createFrom(TwitterTokenStream tokenizer) {
  CharSequenceTermAttribute term = tokenizer.getAttribute(CharSequenceTermAttribute.class);
  TokenTypeAttribute tokenType = tokenizer.getAttribute(TokenTypeAttribute.class);

  // Optional attributes; null means "not provided, fall back to default".
  PartOfSpeechAttribute pos = tokenizer.hasAttribute(PartOfSpeechAttribute.class)
      ? tokenizer.getAttribute(PartOfSpeechAttribute.class)
      : null;
  PositionIncrementAttribute increment = tokenizer.hasAttribute(PositionIncrementAttribute.class)
      ? tokenizer.getAttribute(PositionIncrementAttribute.class)
      : null;
  TokenGroupAttributeImpl group = tokenizer.hasAttribute(TokenGroupAttribute.class)
      ? (TokenGroupAttributeImpl) tokenizer.getAttribute(TokenGroupAttribute.class)
      : null;

  if (!tokenizer.incrementToken()) {
    // The tokenizer produced nothing: build a sequence over the empty string.
    return new TokenizedCharSequence.Builder("").build();
  }

  // The term attribute only carries the proper char sequence once the first token has been
  // read, so the builder cannot be created any earlier than this point.
  TokenizedCharSequence.Builder sequenceBuilder =
      new TokenizedCharSequence.Builder(term.getCharSequence());
  do {
    sequenceBuilder.addToken(
        term.getOffset(),
        term.getLength(),
        tokenType.getType(),
        pos != null ? pos.getPOS() : Token.DEFAULT_PART_OF_SPEECH,
        increment != null ? increment.getPositionIncrement() : 1,
        // Reuse an already materialized group sequence, otherwise build it from the
        // group's stream.
        group != null && !group.isEmpty()
            ? (group.getSequence() != null
                ? group.getSequence()
                : createFrom(group.getTokenGroupStream()))
            : null);
  } while (tokenizer.incrementToken());

  return sequenceBuilder.build();
}
Aggregations