Search in sources:

Example 31 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project ddf by codice.

From the class ContextualEvaluator, method logTokens.

/**
 * Logs every token produced by {@code analyzer} for {@code fullDocument}, one debug line
 * per token with its character offsets. No-op unless debug logging is enabled.
 *
 * @param analyzer     analyzer whose token stream is inspected
 * @param fieldName    field name passed to {@link Analyzer#tokenStream}
 * @param fullDocument text to tokenize
 * @param analyzerName label used in the begin/end marker log lines
 * @throws IOException if the token stream fails while advancing
 */
private static void logTokens(Analyzer analyzer, String fieldName, String fullDocument, String analyzerName) throws IOException {
    if (!LOGGER.isDebugEnabled()) {
        return;
    }
    TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(fullDocument));
    try {
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
        LOGGER.debug("-----  {} tokens  -----", analyzerName);
        // reset() puts the stream in a consumable state; the original skipped it,
        // which modern TokenStream implementations reject with an IllegalStateException.
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            // The original computed the offsets but never logged them; include them here.
            LOGGER.debug("{} [{}-{}]", termAttribute.term(), offsetAttribute.startOffset(), offsetAttribute.endOffset());
        }
        tokenStream.end();
        LOGGER.debug("-----  END:  {} tokens  -----", analyzerName);
    } finally {
        // TokenStream holds the underlying Reader; the original leaked it.
        tokenStream.close();
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) TermAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute)

Example 32 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project elasticsearch by elastic.

From the class PlainHighlighter, method findGoodEndForNoHighlightExcerpt.

/**
 * Finds a token boundary at or before {@code noMatchSize} characters into {@code contents},
 * suitable as the end of a no-highlight excerpt.
 *
 * @return the end offset of the last token that fits within {@code noMatchSize}
 *         (or exactly {@code noMatchSize} if a token ends there), the end of the last
 *         token when the stream is shorter than the limit, or -1 when the analyzer
 *         produces no offsets or no token fits.
 * @throws IOException if the token stream fails while advancing
 */
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName, String contents) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        // Attribute instances are reused for every token, so look the attribute up
        // once instead of on every loop iteration as the original did.
        OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        tokenStream.end();
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute)

Example 33 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project OpenGrok by OpenGrok.

From the class JFlexTokenizerTest, method testOffsetAttribute.

/**
     * Helper method for {@link #testOffsetAttribute()} that runs the test on
     * one single implementation class with the specified input text and
     * expected tokens.
     */
/**
     * Helper method for {@link #testOffsetAttribute()} that runs the test on
     * one single implementation class with the specified input text and
     * expected tokens.
     */
private void testOffsetAttribute(Class<? extends JFlexTokenizer> klass, String inputText, String[] expectedTokens) throws Exception {
    JFlexTokenizer tokenizer = klass.getConstructor(Reader.class).newInstance(new StringReader(inputText));
    try {
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
        int count = 0;
        // reset() before consuming, matching the PathTokenizerTest in this project;
        // the original omitted it, which modern Tokenizer implementations reject.
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            assertTrue("too many tokens", count < expectedTokens.length);
            String expected = expectedTokens[count];
            assertEquals("term", expected, term.toString());
            // NOTE(review): indexOf finds the FIRST occurrence, so the offset checks
            // assume each expected token occurs only once in inputText — confirm callers.
            assertEquals("start", inputText.indexOf(expected), offset.startOffset());
            assertEquals("end", inputText.indexOf(expected) + expected.length(), offset.endOffset());
            count++;
        }
        tokenizer.end();
        assertEquals("wrong number of tokens", expectedTokens.length, count);
    } finally {
        // Close in finally so a failing assertion does not leak the tokenizer;
        // the original never closed it at all.
        tokenizer.close();
    }
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Reader(java.io.Reader) StringReader(java.io.StringReader)

Example 34 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project OpenGrok by OpenGrok.

From the class PathTokenizerTest, method testIncrementToken.

/**
     * Test of incrementToken method, of class PathTokenizer.
     */
/**
     * Test of incrementToken method, of class PathTokenizer.
     * Verifies the term text and character offsets of each path component,
     * stopping at the first "." delimiter token.
     */
@Test
public void testIncrementToken() throws Exception {
    String inputText = "alpha/beta/gamma/delta.ext";
    String[] expectedTokens = inputText.split("[/.]");
    // try-with-resources guarantees the tokenizer is closed even when an assertion
    // fails mid-test; the original only closed it on the success path.
    try (PathTokenizer tokenizer = new PathTokenizer()) {
        tokenizer.setReader(new StringReader(inputText));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
        int count = 0;
        int dots = 0;
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            if (term.toString().equals(".")) {
                // Stop at the extension separator; it is counted so the final
                // token-count assertion still balances.
                dots++;
                break;
            }
            assertTrue("too many tokens", count < expectedTokens.length);
            String expected = expectedTokens[count];
            assertEquals("term", expected, term.toString());
            assertEquals("start", inputText.indexOf(expected), offset.startOffset());
            assertEquals("end", inputText.indexOf(expected) + expected.length(), offset.endOffset());
            count++;
        }
        tokenizer.end();
        assertEquals("wrong number of tokens", expectedTokens.length, count + dots);
    }
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Test(org.junit.Test)

Example 35 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project textdb by TextDB.

From the class DataflowUtils, method generatePayload.

/**
 * Tokenizes {@code fieldValue} with the given analyzer and builds one {@link Span}
 * per token, carrying character offsets, the analyzed term, the original text slice,
 * and the token position (accumulated from position increments).
 *
 * @param attributeName  attribute name recorded on every produced span
 * @param fieldValue     raw field text to analyze
 * @param luceneAnalyzer analyzer used to produce the token stream
 * @return list of spans, or an empty list if analysis fails with an IOException
 */
public static List<Span> generatePayload(String attributeName, String fieldValue, Analyzer luceneAnalyzer) {
    List<Span> payload = new ArrayList<>();
    // try-with-resources closes the stream even when an IOException is thrown
    // mid-iteration; the original's unconditional close() was skipped on that path.
    try (TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(fieldValue))) {
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
        int tokenPositionCounter = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokenPositionCounter += positionIncrementAttribute.getPositionIncrement();
            int tokenPosition = tokenPositionCounter;
            int charStart = offsetAttribute.startOffset();
            int charEnd = offsetAttribute.endOffset();
            String analyzedTermStr = charTermAttribute.toString();
            String originalTermStr = fieldValue.substring(charStart, charEnd);
            payload.add(new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition));
        }
        // Finish the stream per the TokenStream contract before it is closed.
        tokenStream.end();
    } catch (IOException e) {
        // Deliberate best-effort: return empty payload on analysis failure
        // rather than propagating the exception to the caller.
        payload.clear();
    }
    return payload;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) ArrayList(java.util.ArrayList) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) IOException(java.io.IOException) Span(edu.uci.ics.textdb.api.span.Span) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Aggregations

OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)53 TokenStream (org.apache.lucene.analysis.TokenStream)35 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)33 PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)25 StringReader (java.io.StringReader)20 IOException (java.io.IOException)15 ArrayList (java.util.ArrayList)14 BytesRef (org.apache.lucene.util.BytesRef)14 PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute)12 TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute)10 Tokenizer (org.apache.lucene.analysis.Tokenizer)9 Token (org.apache.lucene.analysis.Token)7 FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute)7 TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute)6 List (java.util.List)5 Analyzer (org.apache.lucene.analysis.Analyzer)5 PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute)5 IndexReader (org.apache.lucene.index.IndexReader)5 CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream)4 Document (org.apache.lucene.document.Document)4