Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project ddf by codice.
The class ContextualEvaluator, method logTokens.
private static void logTokens(Analyzer analyzer, String fieldName, String fullDocument, String analyzerName) throws IOException {
    if (!LOGGER.isDebugEnabled()) {
        return;
    }
    TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(fullDocument));
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
    // The stream must be reset before the first call to incrementToken().
    tokenStream.reset();
    LOGGER.debug("----- {} tokens -----", analyzerName);
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String term = termAttribute.term();
        // Log each term together with its character offsets.
        LOGGER.debug("{} [{}, {}]", term, startOffset, endOffset);
    }
    LOGGER.debug("----- END: {} tokens -----", analyzerName);
    tokenStream.end();
    tokenStream.close();
}
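For orientation, a minimal sketch of a call site for this logging helper. The analyzer, field name, and sample text below are assumptions for illustration, not taken from ddf; the TermAttribute API above implies a Lucene 3.x-era dependency, hence the Version argument.

// Hypothetical caller; analyzer choice and inputs are illustrative only.
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
logTokens(analyzer, "contents", "The quick brown fox jumps over the lazy dog.", "StandardAnalyzer");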
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project elasticsearch by elastic.
The class PlainHighlighter, method findGoodEndForNoHighlightExcerpt.
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName, String contents) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        // The attribute instance is stable across tokens, so fetch it once.
        OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        tokenStream.end();
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    }
}
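A hedged sketch of how the return value might be consumed; the variable names and the 100-character budget are assumptions for illustration.

// Hypothetical usage: find a term boundary at or before 100 characters.
int end = findGoodEndForNoHighlightExcerpt(100, analyzer, "body", contents);
// A non-positive result means no usable boundary was found; otherwise cut there.
String excerpt = end > 0 ? contents.substring(0, end) : "";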
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project OpenGrok by OpenGrok.
The class JFlexTokenizerTest, method testOffsetAttribute.
/**
 * Helper method for {@link #testOffsetAttribute()} that runs the test on
 * a single implementation class with the specified input text and
 * expected tokens.
 */
private void testOffsetAttribute(Class<? extends JFlexTokenizer> klass, String inputText, String[] expectedTokens) throws Exception {
    JFlexTokenizer tokenizer = klass.getConstructor(Reader.class).newInstance(new StringReader(inputText));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
    int count = 0;
    // Streams must be reset before consumption in recent Lucene versions.
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        assertTrue("too many tokens", count < expectedTokens.length);
        String expected = expectedTokens[count];
        assertEquals("term", expected, term.toString());
        assertEquals("start", inputText.indexOf(expected), offset.startOffset());
        assertEquals("end", inputText.indexOf(expected) + expected.length(), offset.endOffset());
        count++;
    }
    tokenizer.close();
    assertEquals("wrong number of tokens", expectedTokens.length, count);
}
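A sketch of how this helper might be invoked from a test case; the tokenizer class and the token expectations here are assumptions for illustration, not taken from the OpenGrok test suite.

// Hypothetical invocation; PlainFullTokenizer is assumed to be one of the
// JFlexTokenizer implementations under test.
testOffsetAttribute(PlainFullTokenizer.class,
        "hi, hello world!",
        new String[] {"hi", "hello", "world"});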
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project OpenGrok by OpenGrok.
The class PathTokenizerTest, method testIncrementToken.
/**
 * Test of the incrementToken method, of class PathTokenizer.
 */
@Test
public void testIncrementToken() throws Exception {
    String inputText = "alpha/beta/gamma/delta.ext";
    String[] expectedTokens = inputText.split("[/.]");
    PathTokenizer tokenizer = new PathTokenizer();
    tokenizer.setReader(new StringReader(inputText));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
    int count = 0;
    int dots = 0;
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        // PathTokenizer also emits the '.' separator as a token; count it and stop there.
        if (term.toString().equals(".")) {
            dots++;
            break;
        }
        assertTrue("too many tokens", count < expectedTokens.length);
        String expected = expectedTokens[count];
        assertEquals("term", expected, term.toString());
        assertEquals("start", inputText.indexOf(expected), offset.startOffset());
        assertEquals("end", inputText.indexOf(expected) + expected.length(), offset.endOffset());
        count++;
    }
    tokenizer.end();
    tokenizer.close();
    // The components consumed plus the '.' token should total the number of split parts.
    assertEquals("wrong number of tokens", expectedTokens.length, count + dots);
}
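The test above follows the standard Lucene TokenStream lifecycle: setReader, reset, an incrementToken loop, then end and close. A minimal standalone sketch of the same contract, with an assumed input path:

// Minimal consumption sketch; the input path is illustrative.
try (PathTokenizer t = new PathTokenizer()) {
    t.setReader(new StringReader("a/b/c.txt"));
    CharTermAttribute term = t.addAttribute(CharTermAttribute.class);
    t.reset();
    while (t.incrementToken()) {
        System.out.println(term.toString());  // print each emitted token
    }
    t.end();
}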
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project textdb by TextDB.
The class DataflowUtils, method generatePayload.
public static List<Span> generatePayload(String attributeName, String fieldValue, Analyzer luceneAnalyzer) {
    List<Span> payload = new ArrayList<>();
    // try-with-resources ensures the token stream is closed even if an exception occurs.
    try (TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(fieldValue))) {
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
        int tokenPositionCounter = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokenPositionCounter += positionIncrementAttribute.getPositionIncrement();
            int tokenPosition = tokenPositionCounter;
            int charStart = offsetAttribute.startOffset();
            int charEnd = offsetAttribute.endOffset();
            String analyzedTermStr = charTermAttribute.toString();
            // Recover the original surface form from the source text via the offsets.
            String originalTermStr = fieldValue.substring(charStart, charEnd);
            payload.add(new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition));
        }
    } catch (IOException e) {
        // On failure, return an empty payload rather than a partial one.
        payload.clear();
    }
    return payload;
}
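A sketch of a possible call site; the attribute name, input text, and analyzer below are assumptions for illustration.

// Hypothetical usage; StandardAnalyzer stands in for whatever analyzer the engine is configured with.
Analyzer analyzer = new StandardAnalyzer();
List<Span> payload = DataflowUtils.generatePayload("content", "drink milk tea", analyzer);
// Each Span records the attribute name, [start, end) character offsets,
// the analyzed term, the original surface form, and the token position.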