Example use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in the project corelib by europeana.
From the class QueryExtractor, method extractTokens:
/**
 * Runs {@code text} through {@code analyzer} (field name {@code "text"}) and records one
 * {@link QueryTermPosition} per emitted token.
 *
 * <p>Each entry carries the token's start and end offsets, the analyzed term, the slice of
 * {@code text} between those offsets, and a running index that starts at 0.
 *
 * <p>The token stream is always closed, including when tokenization fails part-way through.
 *
 * @param text the raw text to tokenize
 * @return the token positions in emission order; if an {@link IOException} occurs, the stack
 *         trace is printed and the tokens collected up to that point are returned
 */
private List<QueryTermPosition> extractTokens(String text) {
    List<QueryTermPosition> queryTerms = new ArrayList<>();
    // try-with-resources guarantees close() even if incrementToken(), end() or
    // substring() throws; previously the stream leaked on any failure.
    try (TokenStream ts = analyzer.tokenStream("text", new StringReader(text))) {
        OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int i = 0;
        while (ts.incrementToken()) {
            int start = offsetAttribute.startOffset();
            int end = offsetAttribute.endOffset();
            String term = charTermAttribute.toString();
            // When the term contains a ':', move the start offset to just past the first
            // colon (presumably to skip a "field:" prefix -- confirm against callers).
            int colon = term.indexOf(":");
            if (colon >= 0) {
                start = start + colon + 1;
            }
            queryTerms.add(new QueryTermPosition(start, end, term, text.substring(start, end), i++));
        }
        ts.end();
    } catch (IOException e) {
        // Best-effort: report the failure and return whatever was collected so far.
        e.printStackTrace();
    }
    return queryTerms;
}
Example use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in the project omegat by omegat-org.
From the class BaseTokenizer, method tokenize:
/**
 * Splits {@code strOrig} into {@link Token}s using the stream returned by
 * {@code getTokenStream(strOrig, stemsAllowed, stopWordsAllowed)}.
 *
 * <p>Each accepted token stores its text, its start offset and its length
 * (end offset minus start offset). Tokens rejected by
 * {@code acceptToken(text, filterDigits, filterWhitespace)} are skipped.
 *
 * @param strOrig          the string to tokenize
 * @param stemsAllowed     forwarded to {@code getTokenStream}
 * @param stopWordsAllowed forwarded to {@code getTokenStream}
 * @param filterDigits     forwarded to {@code acceptToken}
 * @param filterWhitespace forwarded to {@code acceptToken}
 * @return {@code EMPTY_TOKENS_LIST} when {@code StringUtil.isEmpty(strOrig)} is true;
 *         otherwise the accepted tokens in stream order. If an {@link IOException}
 *         occurs it is logged and the tokens gathered so far are returned.
 */
protected Token[] tokenize(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed, final boolean filterDigits, final boolean filterWhitespace) {
    if (StringUtil.isEmpty(strOrig)) {
        return EMPTY_TOKENS_LIST;
    }

    final List<Token> tokens = new ArrayList<>(64);
    try (TokenStream stream = getTokenStream(strOrig, stemsAllowed, stopWordsAllowed)) {
        // addAttribute registers the attribute if needed and returns the instance.
        final CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        final OffsetAttribute offsetAttr = stream.addAttribute(OffsetAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            final String text = termAttr.toString();
            if (!acceptToken(text, filterDigits, filterWhitespace)) {
                continue;
            }
            final int begin = offsetAttr.startOffset();
            final int length = offsetAttr.endOffset() - offsetAttr.startOffset();
            tokens.add(new Token(text, begin, length));
        }
        stream.end();
    } catch (IOException ex) {
        Log.log(ex);
    }

    final Token[] out = new Token[tokens.size()];
    return tokens.toArray(out);
}
Aggregations