Example 46 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project omegat by omegat-org.

From the class BaseTokenizer, method tokenizeToStrings.

protected String[] tokenizeToStrings(String str, boolean stemsAllowed, boolean stopWordsAllowed, boolean filterDigits, boolean filterWhitespace) {
    if (StringUtil.isEmpty(str)) {
        return EMPTY_STRING_LIST;
    }
    List<String> result = new ArrayList<>(64);
    try (TokenStream in = getTokenStream(str, stemsAllowed, stopWordsAllowed)) {
        // addAttribute returns the attribute instance, so separate getAttribute calls are redundant.
        CharTermAttribute cattr = in.addAttribute(CharTermAttribute.class);
        OffsetAttribute off = in.addAttribute(OffsetAttribute.class);
        Locale loc = stemsAllowed ? getEffectiveLanguage().getLocale() : null;
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(tokenText);
                if (stemsAllowed) {
                    // Also keep the original surface form when stemming changed the token.
                    String origText = str.substring(off.startOffset(), off.endOffset());
                    if (!origText.toLowerCase(loc).equals(tokenText.toLowerCase(loc))) {
                        result.add(origText);
                    }
                }
            }
        }
        in.end();
    } catch (IOException ex) {
        Log.log(ex);
    }
    return result.toArray(new String[result.size()]);
}
Also used: Locale (java.util.Locale), TokenStream (org.apache.lucene.analysis.TokenStream), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), ArrayList (java.util.ArrayList), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), IOException (java.io.IOException)
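
The pattern above hinges on OffsetAttribute recording where each token came from in the raw input, so the original surface form can be recovered with substring even after stemming and lowercasing. A minimal standalone sketch of that idea, assuming Lucene's EnglishAnalyzer (any stemming analyzer would do; the field name "f" is arbitrary):

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class OffsetDemo {
    public static void main(String[] args) throws Exception {
        String text = "Running quickly";
        try (EnglishAnalyzer analyzer = new EnglishAnalyzer();
                TokenStream ts = analyzer.tokenStream("f", text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute off = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // Print each stemmed token next to the exact input characters it covers.
                String original = text.substring(off.startOffset(), off.endOffset());
                System.out.println(term + " <- " + original);
            }
            ts.end();
        }
    }
}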

Example 47 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project elasticsearch by elastic.

From the class PlainHighlighter, method findGoodEndForNoHighlightExcerpt.

private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName, String contents) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        // The attribute instance is reused across tokens, so fetch it once up front.
        OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        tokenStream.end();
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    }
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)
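
The loop above snaps the excerpt end to a token boundary: it remembers the last endOffset that still fits the budget and stops once a token would overflow it. A simplified standalone sketch of the same idea (findGoodEnd and the field name "f" are hypothetical; WhitespaceAnalyzer stands in for whatever analyzer the field actually uses):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class ExcerptDemo {
    // Hypothetical simplified variant of findGoodEndForNoHighlightExcerpt above.
    static int findGoodEnd(int budget, Analyzer analyzer, String text) throws Exception {
        try (TokenStream ts = analyzer.tokenStream("f", text)) {
            OffsetAttribute off = ts.addAttribute(OffsetAttribute.class);
            int end = -1;
            ts.reset();
            while (ts.incrementToken()) {
                if (off.endOffset() > budget) {
                    return end; // this token would overflow the excerpt
                }
                end = off.endOffset();
            }
            ts.end();
            return end; // stream exhausted: the whole text fits
        }
    }

    public static void main(String[] args) throws Exception {
        String text = "the quick brown fox";
        try (Analyzer analyzer = new WhitespaceAnalyzer()) {
            int end = findGoodEnd(12, analyzer, text);
            System.out.println(text.substring(0, end)); // "the quick" (no mid-word cut)
        }
    }
}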

Example 48 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project OpenGrok by OpenGrok.

From the class JFlexTokenizerTest, method testOffsetAttribute.

/**
     * Helper method for {@link #testOffsetAttribute()} that runs the test on
     * one single implementation class with the specified input text and
     * expected tokens.
     */
private void testOffsetAttribute(Class<? extends JFlexTokenizer> klass, String inputText, String[] expectedTokens) throws Exception {
    JFlexTokenizer tokenizer = klass.getConstructor(Reader.class).newInstance(new StringReader(inputText));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
    int count = 0;
    // The TokenStream contract requires reset() before the first incrementToken() call.
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        assertTrue("too many tokens", count < expectedTokens.length);
        String expected = expectedTokens[count];
        assertEquals("term", expected, term.toString());
        assertEquals("start", inputText.indexOf(expected), offset.startOffset());
        assertEquals("end", inputText.indexOf(expected) + expected.length(), offset.endOffset());
        count++;
    }
    assertEquals("wrong number of tokens", expectedTokens.length, count);
}
Also used: CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), StringReader (java.io.StringReader), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), Reader (java.io.Reader)
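
The verification loop is not tied to OpenGrok's JFlexTokenizer; the identical check runs against any Lucene Tokenizer. A sketch using WhitespaceTokenizer, assuming a Lucene version (5.x or later) where Tokenizer takes its input via setReader (note that computing expected offsets with indexOf, as above, only works when each token occurs once in the input):

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class OffsetCheckDemo {
    public static void main(String[] args) throws Exception {
        String input = "alpha beta gamma";
        Tokenizer tok = new WhitespaceTokenizer();
        tok.setReader(new StringReader(input));
        CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
        OffsetAttribute off = tok.addAttribute(OffsetAttribute.class);
        tok.reset();
        while (tok.incrementToken()) {
            // Each token's offsets must point at its exact position in the input.
            if (input.indexOf(term.toString()) != off.startOffset()) {
                throw new AssertionError("bad startOffset for " + term);
            }
            System.out.printf("%s [%d,%d)%n", term, off.startOffset(), off.endOffset());
        }
        tok.end();
        tok.close();
    }
}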

Example 49 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.

From the class SimpleQueryConverter, method convert.

@Override
public Collection<Token> convert(String origQuery) {
    Collection<Token> result = new HashSet<>();
    // Close the analyzer as well as the stream; both hold resources.
    try (WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
            TokenStream ts = analyzer.tokenStream("", origQuery)) {
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            Token tok = new Token();
            tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            tok.setFlags(flagsAtt.getFlags());
            tok.setPayload(payloadAtt.getPayload());
            tok.setPositionIncrement(posIncAtt.getPositionIncrement());
            tok.setType(typeAtt.type());
            result.add(tok);
        }
        ts.end();
        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Also used: WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer), TokenStream (org.apache.lucene.analysis.TokenStream), FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute), PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), Token (org.apache.lucene.analysis.Token), IOException (java.io.IOException), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), HashSet (java.util.HashSet)
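
A hypothetical driver for the converter, printing back the fields the loop copies onto each Token (this is illustration only and assumes the Solr test classpath, where SimpleQueryConverter lives):

// Hypothetical usage sketch; Token's accessors mirror the setters used above.
SimpleQueryConverter converter = new SimpleQueryConverter();
for (Token tok : converter.convert("hello world")) {
    System.out.printf("%s [%d,%d) posInc=%d type=%s%n",
            tok, tok.startOffset(), tok.endOffset(),
            tok.getPositionIncrement(), tok.type());
}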

Example 50 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.

From the class TokenOffsetPayloadTokenFilterTest, method test.

public void test() throws IOException {
    String test = "The quick red fox jumped over the lazy brown dogs";
    TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(whitespaceMockTokenizer(test));
    int count = 0;
    PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);
    OffsetAttribute offsetAtt = nptf.getAttribute(OffsetAttribute.class);
    nptf.reset();
    while (nptf.incrementToken()) {
        BytesRef pay = payloadAtt.getPayload();
        assertTrue("pay is null and it shouldn't be", pay != null);
        byte[] data = pay.bytes;
        int start = PayloadHelper.decodeInt(data, 0);
        assertTrue(start + " does not equal: " + offsetAtt.startOffset(), start == offsetAtt.startOffset());
        int end = PayloadHelper.decodeInt(data, 4);
        assertTrue(end + " does not equal: " + offsetAtt.endOffset(), end == offsetAtt.endOffset());
        count++;
    }
    nptf.end();
    nptf.close();
    assertTrue(count + " does not equal: " + 10, count == 10);
}
Also used: PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), BytesRef (org.apache.lucene.util.BytesRef)
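
The filter encodes startOffset and endOffset as two 4-byte integers in a single 8-byte payload, which is why the test decodes at byte 0 and byte 4. The round trip in isolation, using the same PayloadHelper the test uses (a sketch; the values are arbitrary):

import org.apache.lucene.analysis.payloads.PayloadHelper;

public class PayloadRoundTrip {
    public static void main(String[] args) {
        int start = 4;
        int end = 9;
        // Pack both offsets into one 8-byte payload, as the filter does.
        byte[] data = new byte[8];
        PayloadHelper.encodeInt(start, data, 0);
        PayloadHelper.encodeInt(end, data, 4);
        System.out.println(PayloadHelper.decodeInt(data, 0)); // 4
        System.out.println(PayloadHelper.decodeInt(data, 4)); // 9
    }
}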

Aggregations

OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 82 uses
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 59 uses
TokenStream (org.apache.lucene.analysis.TokenStream): 47 uses
StringReader (java.io.StringReader): 36 uses
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 33 uses
IOException (java.io.IOException): 25 uses
ArrayList (java.util.ArrayList): 23 uses
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 17 uses
BytesRef (org.apache.lucene.util.BytesRef): 14 uses
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 12 uses
Tokenizer (org.apache.lucene.analysis.Tokenizer): 10 uses
Reader (java.io.Reader): 9 uses
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 8 uses
Analyzer (org.apache.lucene.analysis.Analyzer): 7 uses
Token (org.apache.lucene.analysis.Token): 7 uses
TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute): 7 uses
List (java.util.List): 6 uses
PackedTokenAttributeImpl (org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl): 5 uses
PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute): 5 uses
IndexReader (org.apache.lucene.index.IndexReader): 5 uses