Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project omegat by omegat-org.
The class BaseTokenizer, method tokenizeToStrings.
protected String[] tokenizeToStrings(String str, boolean stemsAllowed, boolean stopWordsAllowed,
        boolean filterDigits, boolean filterWhitespace) {
    if (StringUtil.isEmpty(str)) {
        return EMPTY_STRING_LIST;
    }
    List<String> result = new ArrayList<String>(64);
    try (TokenStream in = getTokenStream(str, stemsAllowed, stopWordsAllowed)) {
        in.addAttribute(CharTermAttribute.class);
        in.addAttribute(OffsetAttribute.class);
        CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
        OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
        Locale loc = stemsAllowed ? getEffectiveLanguage().getLocale() : null;
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(tokenText);
                // When stemming is enabled, also keep the original surface form
                // (recovered via the token's offsets) if it differs from the stem.
                if (stemsAllowed) {
                    String origText = str.substring(off.startOffset(), off.endOffset());
                    if (!origText.toLowerCase(loc).equals(tokenText.toLowerCase(loc))) {
                        result.add(origText);
                    }
                }
            }
        }
        in.end();
    } catch (IOException ex) {
        Log.log(ex);
    }
    return result.toArray(new String[result.size()]);
}
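The method above follows the standard Lucene TokenStream consumption contract: reset(), a loop over incrementToken(), then end(), with close() handled by try-with-resources. A minimal self-contained sketch of that contract, using a plain StandardAnalyzer in place of OmegaT's getTokenStream() (the analyzer choice and sample text are assumptions for illustration):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

// Sketch: StandardAnalyzer stands in for OmegaT's getTokenStream() here.
try (Analyzer analyzer = new StandardAnalyzer();
        TokenStream in = analyzer.tokenStream("", "Quick brown foxes")) {
    CharTermAttribute term = in.addAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.addAttribute(OffsetAttribute.class);
    in.reset(); // mandatory before the first incrementToken()
    while (in.incrementToken()) {
        System.out.println(term + " [" + off.startOffset() + ", " + off.endOffset() + ")");
    }
    in.end(); // records the final offset state
}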
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project elasticsearch by elastic.
The class PlainHighlighter, method findGoodEndForNoHighlightExcerpt.
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer,
        String fieldName, String contents) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        tokenStream.end();
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    }
}
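Outside the highlighter, the same technique — remembering the last token end offset that still fits a length budget — works with any analyzer that produces offsets. A minimal sketch under stated assumptions: WhitespaceAnalyzer, the field name "body", and the helper name lastTokenBoundary are all illustrative choices, not taken from Elasticsearch:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

// Return the last token boundary at or before maxLen, or -1 if no token fits.
static int lastTokenBoundary(String text, int maxLen) throws IOException {
    try (WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
            TokenStream ts = analyzer.tokenStream("body", text)) {
        OffsetAttribute off = ts.addAttribute(OffsetAttribute.class);
        int end = -1;
        ts.reset();
        while (ts.incrementToken()) {
            if (off.endOffset() > maxLen) {
                break; // this token would cross the length budget
            }
            end = off.endOffset(); // last boundary that still fits
        }
        ts.end();
        return end;
    }
}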
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project OpenGrok by OpenGrok.
The class JFlexTokenizerTest, method testOffsetAttribute.
/**
 * Helper method for {@link #testOffsetAttribute()} that runs the test on
 * one single implementation class with the specified input text and
 * expected tokens.
 */
private void testOffsetAttribute(Class<? extends JFlexTokenizer> klass, String inputText,
        String[] expectedTokens) throws Exception {
    JFlexTokenizer tokenizer = klass.getConstructor(Reader.class)
            .newInstance(new StringReader(inputText));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
    tokenizer.reset(); // the TokenStream contract requires reset() before incrementToken()
    int count = 0;
    while (tokenizer.incrementToken()) {
        assertTrue("too many tokens", count < expectedTokens.length);
        String expected = expectedTokens[count];
        assertEquals("term", expected, term.toString());
        assertEquals("start", inputText.indexOf(expected), offset.startOffset());
        assertEquals("end", inputText.indexOf(expected) + expected.length(), offset.endOffset());
        count++;
    }
    assertEquals("wrong number of tokens", expectedTokens.length, count);
}
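The invariant this test asserts — that startOffset()/endOffset() point back at the token's exact position in the input — should hold for any well-behaved Tokenizer. A minimal sketch of the same check against Lucene's core WhitespaceTokenizer (the input string is an assumption, not OpenGrok test data):

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

String input = "alpha beta gamma";
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(input));
CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
tokenizer.reset();
while (tokenizer.incrementToken()) {
    // The offsets must slice the original token text back out of the input.
    String slice = input.substring(offset.startOffset(), offset.endOffset());
    assert slice.equals(term.toString());
}
tokenizer.end();
tokenizer.close();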
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class SimpleQueryConverter, method convert.
@Override
public Collection<Token> convert(String origQuery) {
    Collection<Token> result = new HashSet<>();
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
    try (TokenStream ts = analyzer.tokenStream("", origQuery)) {
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            Token tok = new Token();
            tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            tok.setFlags(flagsAtt.getFlags());
            tok.setPayload(payloadAtt.getPayload());
            tok.setPositionIncrement(posIncAtt.getPositionIncrement());
            tok.setType(typeAtt.type());
            result.add(tok);
        }
        ts.end();
        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
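A possible invocation of this converter, as a sketch: the query string is arbitrary, direct construction skips the usual QueryConverter init lifecycle, and note that the HashSet result carries no ordering guarantee:

// Hypothetical usage: convert a raw query and inspect each token's offsets.
SimpleQueryConverter converter = new SimpleQueryConverter();
for (Token tok : converter.convert("quick brown fox")) {
    System.out.println(tok + " [" + tok.startOffset() + ", " + tok.endOffset() + ")");
}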
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class TokenOffsetPayloadTokenFilterTest, method test.
public void test() throws IOException {
    String test = "The quick red fox jumped over the lazy brown dogs";
    TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(whitespaceMockTokenizer(test));
    int count = 0;
    PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);
    OffsetAttribute offsetAtt = nptf.getAttribute(OffsetAttribute.class);
    nptf.reset();
    while (nptf.incrementToken()) {
        BytesRef pay = payloadAtt.getPayload();
        assertTrue("pay is null and it shouldn't be", pay != null);
        // The filter encodes the token's start offset in bytes 0-3 of the
        // payload and its end offset in bytes 4-7.
        byte[] data = pay.bytes;
        int start = PayloadHelper.decodeInt(data, 0);
        assertTrue(start + " does not equal: " + offsetAtt.startOffset(), start == offsetAtt.startOffset());
        int end = PayloadHelper.decodeInt(data, 4);
        assertTrue(end + " does not equal: " + offsetAtt.endOffset(), end == offsetAtt.endOffset());
        count++;
    }
    assertTrue(count + " does not equal: " + 10, count == 10);
}
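The payload being decoded is an 8-byte layout: the start offset in bytes 0-3 and the end offset in bytes 4-7, each written with PayloadHelper. A minimal round-trip sketch of that encoding (the offset values 3 and 9 are arbitrary):

import org.apache.lucene.analysis.payloads.PayloadHelper;

byte[] data = new byte[8];
System.arraycopy(PayloadHelper.encodeInt(3), 0, data, 0, 4); // token start offset
System.arraycopy(PayloadHelper.encodeInt(9), 0, data, 4, 4); // token end offset
assert PayloadHelper.decodeInt(data, 0) == 3;
assert PayloadHelper.decodeInt(data, 4) == 9;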