use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project OpenGrok by OpenGrok.
the class CustomAssertions method assertSymbolStream.
/**
 * Asserts that the specified tokenizer class produces an expected stream of
 * symbols from the specified input.
 * @param klass the tokenizer class under test
 * @param iss the input stream
 * @param expectedTokens the expected, ordered token list
 * @throws java.lang.Exception if an error occurs constructing a
 * {@code klass} instance or testing the stream
 */
public static void assertSymbolStream(Class<? extends JFlexSymbolMatcher> klass,
        InputStream iss, List<String> expectedTokens) throws Exception {

    byte[] inputCopy = copyStream(iss);
    String input = new String(inputCopy, StandardCharsets.UTF_8);
    JFlexTokenizer tokenizer = new JFlexTokenizer(klass.getConstructor(Reader.class)
            .newInstance(new InputStreamReader(new ByteArrayInputStream(inputCopy),
                    StandardCharsets.UTF_8)));

    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offs = tokenizer.addAttribute(OffsetAttribute.class);

    int count = 0;
    List<String> tokens = new ArrayList<>();
    while (tokenizer.incrementToken()) {
        String termValue = term.toString();
        tokens.add(termValue);
        // The reported offsets must cut the original input back to the term value.
        String cutValue = input.substring(offs.startOffset(), offs.endOffset());
        assertEquals("cut term" + (1 + count), cutValue, termValue);
        ++count;
    }

    count = 0;
    for (String token : tokens) {
        // Assertion messages use 1-based numbering to accord with term position.
        if (count >= expectedTokens.size()) {
            printTokens(tokens);
            assertTrue("too many tokens at term" + (1 + count) + ": " + token,
                    count < expectedTokens.size());
        }
        String expected = expectedTokens.get(count);
        if (!token.equals(expected)) {
            printTokens(tokens);
            assertEquals("term" + (1 + count), expected, token);
        }
        count++;
    }
    assertEquals("wrong number of tokens", expectedTokens.size(), count);
}
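For context, a caller passes the symbol-matcher class, an input stream over a source file, and the ordered list of expected symbols. A minimal usage sketch, in which the tokenizer class, resource path, and expected symbols are hypothetical stand-ins:

// Hypothetical test; SampleSymbolTokenizer and the resource name are illustrative only.
@Test
public void sampleSymbolStream() throws Exception {
    try (InputStream iss = getClass().getResourceAsStream("/analysis/sample.c")) {
        List<String> expected = Arrays.asList("main", "argc", "argv");
        CustomAssertions.assertSymbolStream(SampleSymbolTokenizer.class, iss, expected);
    }
}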
use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project SearchServices by Alfresco.
the class PathTokenFilterTest method testAttributesAfterStreamEnd.
public void testAttributesAfterStreamEnd() throws IOException {
    final String path = "uri1:one";
    StringReader reader = new StringReader(path);
    PathTokenFilter ts = new PathTokenFilter(PathTokenFilter.PATH_SEPARATOR,
            PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
            PathTokenFilter.NAMESPACE_START_DELIMITER,
            PathTokenFilter.NAMESPACE_END_DELIMITER, true);
    ts.setReader(reader);

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);

    // PathTokenFilter.end() will be called after all tokens are consumed.
    tokenise(ts, new String[] { "uri1", "one" });

    // Check that the attributes were cleaned up.
    assertEquals("", termAtt.toString());
    // "word" is the default token type.
    assertEquals("word", typeAtt.type());
    assertEquals(0, posIncAtt.getPositionIncrement());
    // The final offset points past the end of the input.
    assertEquals(path.length(), offsetAtt.startOffset());
    assertEquals(path.length(), offsetAtt.endOffset());
}
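The tokenise(..) helper is not shown in this snippet. A hypothetical reconstruction, assuming it follows the standard Lucene consumer contract (reset, iterate, end, close) and asserts each expected term:

// Hypothetical reconstruction of the tokenise(..) helper; the real
// implementation in PathTokenFilterTest may differ in details.
private void tokenise(TokenStream ts, String[] expectedTerms) throws IOException {
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    for (String expected : expectedTerms) {
        assertTrue("stream ended early", ts.incrementToken());
        assertEquals(expected, termAtt.toString());
    }
    assertFalse("stream produced extra tokens", ts.incrementToken());
    ts.end(); // establishes the final attribute state that the test asserts
    ts.close();
}

Whatever its exact body, the helper must call end(), since the assertions above check the attribute state that end() establishes.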
use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project SearchServices by Alfresco.
the class Solr4QueryParser method getToken.
protected String getToken(String field, String value, AnalysisMode analysisMode) throws ParseException {
    try (TokenStream source = getAnalyzer().tokenStream(field, new StringReader(value))) {
        String tokenised = null;
        source.reset(); // the TokenStream contract requires reset() before incrementToken()
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            // Copy the current attribute state into a standalone token; the loop
            // runs to the end of the stream, so the last token wins.
            PackedTokenAttributeImpl token = new PackedTokenAttributeImpl();
            token.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                token.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                token.setPositionIncrement(posIncAtt.getPositionIncrement());
            }
            tokenised = token.toString();
        }
        return tokenised;
    } catch (IOException e) {
        throw new ParseException("IO: " + e.getMessage());
    }
}
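A minimal sketch of a call site; the field name and AnalysisMode constant below are hypothetical stand-ins, and the exact output depends on the analyzer configured for the field:

// Hypothetical usage: reduce a raw value to its last analyzed token.
// "cm:name" and AnalysisMode.TOKENISE are illustrative only.
String analyzed = getToken("cm:name", "Quick Brown Foxes", AnalysisMode.TOKENISE);
// With a stemming English analyzer this might yield "fox"; because the loop
// runs the stream to exhaustion, only the final token is returned.

Since the method discards all but the last token, it is presumably only meaningful for values expected to analyze to a single term.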
use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project SearchServices by Alfresco.
the class Solr4QueryParser method getFirstTokenForRange.
private String getFirstTokenForRange(String string, FieldInstance field) throws IOException {
    PackedTokenAttributeImpl nextToken;
    TokenStream source = null;
    try {
        source = getAnalyzer().tokenStream(field.getField(), new StringReader(string));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }
            // Return on the first token; the rest of the stream is not consumed.
            return nextToken.toString();
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return null;
}
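Since this method consumes at most one token, it could be written more compactly with try-with-resources. A sketch, renamed to mark it as an illustration; one behavioral difference is that an IOException thrown by close() would propagate here, whereas the finally block above deliberately swallows it:

// Sketch only: equivalent first-token extraction with try-with-resources.
private String getFirstTokenForRangeSketch(String string, FieldInstance field) throws IOException {
    try (TokenStream source = getAnalyzer().tokenStream(field.getField(), new StringReader(string))) {
        source.reset();
        if (source.incrementToken()) {
            // CharTermAttribute.toString() returns the term text, so copying the
            // full attribute state is unnecessary when only the text is needed.
            return source.getAttribute(CharTermAttribute.class).toString();
        }
        return null;
    }
}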
use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project epadd by ePADD.
the class Highlighter method dumpTokenStream.
/**
 * @param content - text to be highlighted
 * @param term - this can be a generic query passed to the Lucene search, for example: elate|happy|invite, hope, "Robert Creeley", /guth.+/ , /[0-9\\-]*[0-9]{3}[- ][0-9]{2}[- ][0-9]{4}[0-9\\-]+/ are all valid terms
 * @param preTag - HTML pre-tag, for example: <B>
 * @param postTag - HTML post-tag, for example: </B>
 * The highlighted content will have [preTag] matching term [postTag].
 * When the term is "Robert Creeley", the output is "On Tue, Jun 24, 2014 at 11:56 AM, [preTag]Robert Creeley's[postTag] <creeley@acsu.buffalo.edu> wrote:"
 * (This Javadoc describes the class's highlighting parameters; the helper below is a debug aid.)
 */
/**
 * Debug method only.
 */
private static String dumpTokenStream(Analyzer analyzer, TokenStream tokenStream) throws IOException {
    // Taken from https://stackoverflow.com/questions/2638200/how-to-get-a-token-from-a-lucene-tokenstream
    // Note: the analyzer parameter is not used by the dump itself.
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    StringBuilder sb = new StringBuilder();
    sb.append("Tokens:\n");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String term = charTermAttribute.toString();
        sb.append(term).append(" (offsets: ").append(startOffset).append(", ").append(endOffset).append(")\n");
    }
    // Reset again so the stream is left in a consistent state for the caller.
    tokenStream.reset();
    return sb.toString();
}
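A hypothetical call site inside the same class, assuming a StandardAnalyzer and a throwaway field name; note the helper resets the stream itself, so the caller should not call reset() first:

// Illustrative only; "body" and the sample text are arbitrary.
Analyzer analyzer = new StandardAnalyzer();
try (TokenStream ts = analyzer.tokenStream("body", new StringReader("Robert Creeley wrote back"))) {
    System.out.println(dumpTokenStream(analyzer, ts));
}

With the default StandardAnalyzer chain this prints each lowercased term together with its character offsets into the original text.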