Search in sources :

Example 16 with TermToBytesRefAttribute

use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.

The class GraphTokenStreamFiniteStrings, method build.

/**
 * Build an automaton from the provided {@link TokenStream}.
 */
private Automaton build(final TokenStream in) throws IOException {
    Automaton.Builder builder = new Automaton.Builder();
    // Attribute views onto the stream's current token: term bytes plus the
    // position increment/length needed to place the token in the graph.
    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
    in.reset();
    // Current position in the token graph; -1 until the first token is seen.
    int pos = -1;
    // Last non-zero increment seen; passed to getTermID along with the current
    // increment. NOTE(review): getTermID is defined elsewhere in this class —
    // presumably it encodes the term plus its gap info into a transition id; confirm.
    int prevIncr = 1;
    // Highest automaton state index created so far (-1 = none yet).
    int state = -1;
    while (in.incrementToken()) {
        int currentIncr = posIncAtt.getPositionIncrement();
        if (pos == -1 && currentIncr < 1) {
            throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
        }
        // always use inc 1 while building, but save original increment
        int incr = Math.min(1, currentIncr);
        // incr == 0 only for stacked tokens (original increment 0): they share
        // the current position instead of advancing it.
        if (incr > 0) {
            pos += incr;
        }
        int endPos = pos + posLengthAtt.getPositionLength();
        // Lazily create states up through this token's end position.
        while (state < endPos) {
            state = builder.createState();
        }
        BytesRef term = termBytesAtt.getBytesRef();
        int id = getTermID(currentIncr, prevIncr, term);
        builder.addTransition(pos, endPos, id);
        // only save last increment on non-zero increment in case we have multiple stacked tokens
        if (currentIncr > 0) {
            prevIncr = currentIncr;
        }
    }
    in.end();
    // The last state created covers the final position reached; accept there.
    if (state != -1) {
        builder.setAccept(state, true);
    }
    return builder.finish();
}
Also used : PositionLengthAttribute(org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute) Automaton(org.apache.lucene.util.automaton.Automaton) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) BytesRef(org.apache.lucene.util.BytesRef) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 17 with TermToBytesRefAttribute

use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.

The class ReadTokensTask, method doLogic.

/**
 * Runs the analyzer over every indexed, tokenized field of the current
 * document, counting the tokens produced.
 *
 * @return the number of tokens read from this document's fields
 * @throws Exception if analysis of any field fails
 */
@Override
public int doLogic() throws Exception {
    List<IndexableField> fields = doc.getFields();
    Analyzer analyzer = getRunData().getAnalyzer();
    int tokenCount = 0;
    for (final IndexableField field : fields) {
        // Non-indexed or non-tokenized fields produce no token stream; skip them.
        if (field.fieldType().indexOptions() == IndexOptions.NONE || field.fieldType().tokenized() == false) {
            continue;
        }
        // try-with-resources guarantees the stream is closed even when reset()
        // or incrementToken() throws — the original explicit close() leaked on error.
        try (TokenStream stream = field.tokenStream(analyzer, null)) {
            // reset the TokenStream to the first token
            stream.reset();
            TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
            while (stream.incrementToken()) {
                // Materialize the term bytes; we only need the side effect of reading.
                termAtt.getBytesRef();
                tokenCount++;
            }
            stream.end();
        }
    }
    totalTokenCount += tokenCount;
    return tokenCount;
}
Also used : IndexableField(org.apache.lucene.index.IndexableField) TokenStream(org.apache.lucene.analysis.TokenStream) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) Analyzer(org.apache.lucene.analysis.Analyzer)

Example 18 with TermToBytesRefAttribute

use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.

The class TestLongPostings, method getRandomTerm.

// Produces a realistic unicode random string that
// survives MockAnalyzer unchanged:
private String getRandomTerm(String other) throws IOException {
    final Analyzer analyzer = new MockAnalyzer(random());
    for (;;) {
        final String candidate = TestUtil.randomRealisticUnicodeString(random());
        // The returned term must differ from the caller-supplied one.
        if (other != null && candidate.equals(other)) {
            continue;
        }
        try (TokenStream tokens = analyzer.tokenStream("foo", candidate)) {
            final TermToBytesRefAttribute termAtt = tokens.getAttribute(TermToBytesRefAttribute.class);
            tokens.reset();
            int tokenCount = 0;
            boolean altered = false;
            while (tokens.incrementToken()) {
                final BytesRef termBytes = termAtt.getBytesRef();
                if (tokenCount == 0 && !termBytes.utf8ToString().equals(candidate)) {
                    // Analysis changed the value; keep consuming so the
                    // stream is fully exhausted before we retry.
                    altered = true;
                }
                tokenCount++;
            }
            tokens.end();
            // Accept only a string that came through as exactly one unchanged token.
            if (tokenCount == 1 && !altered) {
                return candidate;
            }
        }
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) BytesRef(org.apache.lucene.util.BytesRef)

Example 19 with TermToBytesRefAttribute

use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project janusgraph by JanusGraph.

The class LuceneIndex, method customTokenize.

// adapted from SolrIndex
/**
 * Tokenizes {@code value} for {@code fieldName} with the given analyzer and
 * returns every produced term as a UTF-8 string, in stream order.
 *
 * @param analyzer  analyzer used to produce the token stream
 * @param fieldName field name passed to the analyzer
 * @param value     raw text to tokenize
 * @return the list of term strings (possibly empty)
 * @throws IllegalArgumentException if tokenization fails with an {@link IOException}
 */
private List<String> customTokenize(Analyzer analyzer, String fieldName, String value) {
    final List<String> terms = new ArrayList<>();
    try (CachingTokenFilter stream = new CachingTokenFilter(analyzer.tokenStream(fieldName, value))) {
        final TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            terms.add(termAtt.getBytesRef().utf8ToString());
        }
        // The TokenStream workflow contract requires end() after the last
        // incrementToken() and before close(); the original skipped it.
        stream.end();
        return terms;
    } catch (IOException e) {
        throw new IllegalArgumentException(e.getMessage(), e);
    }
}
Also used : CachingTokenFilter(org.apache.lucene.analysis.CachingTokenFilter) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) IOException(java.io.IOException)

Example 20 with TermToBytesRefAttribute

use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.

The class Analyzer, method normalize.

/**
   * Normalize a string down to the representation that it would have in the
   * index.
   * <p>
   * This is typically used by query parsers in order to generate a query on
   * a given term, without tokenizing or stemming, which are undesirable if
   * the string to analyze is a partial word (eg. in case of a wildcard or
   * fuzzy query).
   * <p>
   * This method uses {@link #initReaderForNormalization(String, Reader)} in
   * order to apply necessary character-level normalization and then
   * {@link #normalize(String, TokenStream)} in order to apply the normalizing
   * token filters.
   */
public final BytesRef normalize(final String fieldName, final String text) {
    try {
        // apply char filters
        final String filteredText;
        try (Reader reader = new StringReader(text)) {
            Reader filterReader = initReaderForNormalization(fieldName, reader);
            // Drain the (possibly char-filtered) reader into a String.
            char[] buffer = new char[64];
            StringBuilder builder = new StringBuilder();
            for (; ; ) {
                final int read = filterReader.read(buffer, 0, buffer.length);
                if (read == -1) {
                    break;
                }
                builder.append(buffer, 0, read);
            }
            filteredText = builder.toString();
        } catch (IOException e) {
            // Fixed typo in message: "exeption" -> "exception".
            throw new IllegalStateException("Normalization threw an unexpected exception", e);
        }
        final AttributeFactory attributeFactory = attributeFactory(fieldName);
        // Run the normalization chain; it must emit exactly one token.
        try (TokenStream ts = normalize(fieldName, new StringTokenStream(attributeFactory, filteredText, text.length()))) {
            final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
            ts.reset();
            if (ts.incrementToken() == false) {
                throw new IllegalStateException("The normalization token stream is " + "expected to produce exactly 1 token, but got 0 for analyzer " + this + " and input \"" + text + "\"");
            }
            // Deep-copy: the attribute's BytesRef is reused by the stream.
            final BytesRef term = BytesRef.deepCopyOf(termAtt.getBytesRef());
            if (ts.incrementToken()) {
                throw new IllegalStateException("The normalization token stream is " + "expected to produce exactly 1 token, but got 2+ for analyzer " + this + " and input \"" + text + "\"");
            }
            ts.end();
            return term;
        }
    } catch (IOException e) {
        // Fixed typo in message: "exeption" -> "exception".
        throw new IllegalStateException("Normalization threw an unexpected exception", e);
    }
}
Also used : Reader(java.io.Reader) StringReader(java.io.StringReader) AttributeFactory(org.apache.lucene.util.AttributeFactory) IOException(java.io.IOException) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) StringReader(java.io.StringReader) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute)34 BytesRef (org.apache.lucene.util.BytesRef)17 TokenStream (org.apache.lucene.analysis.TokenStream)16 IOException (java.io.IOException)13 PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)12 ArrayList (java.util.ArrayList)10 Term (org.apache.lucene.index.Term)9 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)6 PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute)5 TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute)4 Analyzer (org.apache.lucene.analysis.Analyzer)3 CachingTokenFilter (org.apache.lucene.analysis.CachingTokenFilter)3 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)3 PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute)3 SpanTermQuery (org.apache.lucene.search.spans.SpanTermQuery)3 StringReader (java.io.StringReader)2 HashSet (java.util.HashSet)2 FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute)2 BooleanQuery (org.apache.lucene.search.BooleanQuery)2 MultiPhraseQuery (org.apache.lucene.search.MultiPhraseQuery)2