Search in sources:

Example 1 with TermToBytesRefAttribute

Use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in the Apache lucene-solr project.

From the class TestEmptyTokenStream, method testIndexWriter_LUCENE4656.

/**
 * Regression test for LUCENE-4656: indexing a TokenStream that does not expose a
 * TermToBytesRefAttribute must not fail — the field simply contributes no terms.
 *
 * @throws IOException if the index writer or directory fails
 */
public void testIndexWriter_LUCENE4656() throws IOException {
    // try-with-resources guarantees the writer and directory are closed even if
    // addDocument or an assertion throws (the original leaked both on failure).
    // Resources close in reverse declaration order: writer first, then directory,
    // matching the original explicit close order.
    try (Directory directory = newDirectory();
         IndexWriter writer = new IndexWriter(directory, newIndexWriterConfig(null))) {
        TokenStream ts = new EmptyTokenStream();
        // Precondition of the regression: the stream really has no term attribute.
        assertFalse(ts.hasAttribute(TermToBytesRefAttribute.class));
        Document doc = new Document();
        doc.add(new StringField("id", "0", Field.Store.YES));
        doc.add(new TextField("description", ts));
        // this should not fail because we have no TermToBytesRefAttribute
        writer.addDocument(doc);
        assertEquals(1, writer.numDocs());
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) IndexWriter(org.apache.lucene.index.IndexWriter) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) StringField(org.apache.lucene.document.StringField) TextField(org.apache.lucene.document.TextField) Document(org.apache.lucene.document.Document) Directory(org.apache.lucene.store.Directory)

Example 2 with TermToBytesRefAttribute

Use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in the Apache lucene-solr project.

From the class GraphTokenStreamFiniteStrings, method build.

/**
   * Build an automaton from the provided {@link TokenStream}.
   * <p>
   * Token positions map to automaton states and each token becomes a transition
   * labeled with a term id; a token whose position length is greater than one
   * spans several states (graph tokens such as multi-word synonyms).
   */
private Automaton build(final TokenStream in) throws IOException {
    Automaton.Builder builder = new Automaton.Builder();
    // Attributes for reading each token's term bytes, position increment and
    // position length from the stream.
    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
    in.reset();
    // pos == -1 means no token has been consumed yet.
    int pos = -1;
    int prevIncr = 1;
    // Highest state index created so far; stays -1 if the stream is empty.
    int state = -1;
    while (in.incrementToken()) {
        int currentIncr = posIncAtt.getPositionIncrement();
        if (pos == -1 && currentIncr < 1) {
            throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
        }
        // always use inc 1 while building, but save original increment
        int incr = Math.min(1, currentIncr);
        // A zero increment (stacked token/synonym) keeps pos at the same state.
        if (incr > 0) {
            pos += incr;
        }
        int endPos = pos + posLengthAtt.getPositionLength();
        // Ensure every state up to endPos exists before adding the transition.
        while (state < endPos) {
            state = builder.createState();
        }
        BytesRef term = termBytesAtt.getBytesRef();
        // NOTE(review): getTermID is defined elsewhere; it appears to fold the
        // original increments into the term's id — confirm against its definition.
        int id = getTermID(currentIncr, prevIncr, term);
        builder.addTransition(pos, endPos, id);
        // only save last increment on non-zero increment in case we have multiple stacked tokens
        if (currentIncr > 0) {
            prevIncr = currentIncr;
        }
    }
    in.end();
    // Accept at the last created state; state == -1 means no token was produced.
    if (state != -1) {
        builder.setAccept(state, true);
    }
    return builder.finish();
}
Also used : PositionLengthAttribute(org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute) Automaton(org.apache.lucene.util.automaton.Automaton) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) BytesRef(org.apache.lucene.util.BytesRef) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 3 with TermToBytesRefAttribute

Use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in the Apache lucene-solr project.

From the class ReadTokensTask, method doLogic.

/**
 * Tokenizes every indexed, tokenized field of the current document and counts
 * the tokens produced. Also accumulates the count into {@code totalTokenCount}.
 *
 * @return the number of tokens read from this document's fields
 * @throws Exception if analysis of any field fails
 */
@Override
public int doLogic() throws Exception {
    List<IndexableField> fields = doc.getFields();
    Analyzer analyzer = getRunData().getAnalyzer();
    int tokenCount = 0;
    for (final IndexableField field : fields) {
        // Skip fields that are not indexed or not tokenized — they yield no tokens.
        if (field.fieldType().indexOptions() == IndexOptions.NONE || field.fieldType().tokenized() == false) {
            continue;
        }
        // try-with-resources closes the stream even when reset()/incrementToken()/
        // end() throws; the original's bare close() call was skipped on exception.
        try (TokenStream stream = field.tokenStream(analyzer, null)) {
            // reset the TokenStream to the first token
            stream.reset();
            TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
            while (stream.incrementToken()) {
                // Touch the term bytes so the analysis work is actually performed.
                termAtt.getBytesRef();
                tokenCount++;
            }
            stream.end();
        }
    }
    totalTokenCount += tokenCount;
    return tokenCount;
}
Also used : IndexableField(org.apache.lucene.index.IndexableField) TokenStream(org.apache.lucene.analysis.TokenStream) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) Analyzer(org.apache.lucene.analysis.Analyzer)

Example 4 with TermToBytesRefAttribute

Use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in the Apache lucene-solr project.

From the class TestLongPostings, method getRandomTerm.

/**
 * Produces a realistic random unicode string that survives MockAnalyzer
 * unchanged: the analyzer must emit exactly one token whose bytes round-trip
 * back to the original string.
 *
 * @param other a value to avoid returning, or null if any string is acceptable
 */
private String getRandomTerm(String other) throws IOException {
    Analyzer analyzer = new MockAnalyzer(random());
    for (;;) {
        String candidate = TestUtil.randomRealisticUnicodeString(random());
        // equals(null) is false, so this also covers the other == null case.
        if (candidate.equals(other)) {
            continue;
        }
        try (TokenStream tokens = analyzer.tokenStream("foo", candidate)) {
            final TermToBytesRefAttribute termAtt = tokens.getAttribute(TermToBytesRefAttribute.class);
            tokens.reset();
            int tokenCount = 0;
            boolean altered = false;
            while (tokens.incrementToken()) {
                // Keep draining even after a mismatch so the stream's
                // workflow contract (consume to exhaustion) is honored.
                final BytesRef termBytes = termAtt.getBytesRef();
                if (tokenCount == 0 && !termBytes.utf8ToString().equals(candidate)) {
                    altered = true;
                }
                tokenCount++;
            }
            tokens.end();
            // Accept only a single, unmodified token.
            if (tokenCount == 1 && !altered) {
                return candidate;
            }
        }
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) BytesRef(org.apache.lucene.util.BytesRef)

Example 5 with TermToBytesRefAttribute

Use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in the Apache lucene-solr project.

From the class QueryBuilder, method analyzeMultiPhrase.

/**
   * Creates a complex phrase query from the cached tokenstream contents.
   * Tokens with a zero position increment are treated as synonyms stacked at
   * the same position and are added to the query together.
   */
protected Query analyzeMultiPhrase(String field, TokenStream stream, int slop) throws IOException {
    MultiPhraseQuery.Builder builder = newMultiPhraseQueryBuilder();
    builder.setSlop(slop);
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
    List<Term> currentStack = new ArrayList<>();
    int position = -1;
    stream.reset();
    while (stream.incrementToken()) {
        int increment = posIncrAtt.getPositionIncrement();
        // A positive increment starts a new position: flush the synonym stack
        // accumulated at the previous position before moving on.
        if (increment > 0 && !currentStack.isEmpty()) {
            Term[] stacked = currentStack.toArray(new Term[0]);
            if (enablePositionIncrements) {
                builder.add(stacked, position);
            } else {
                builder.add(stacked);
            }
            currentStack.clear();
        }
        position += increment;
        currentStack.add(new Term(field, termAtt.getBytesRef()));
    }
    // Flush the final stack of terms left over after the last token.
    Term[] remaining = currentStack.toArray(new Term[0]);
    if (enablePositionIncrements) {
        builder.add(remaining, position);
    } else {
        builder.add(remaining);
    }
    return builder.build();
}
Also used : TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) ArrayList(java.util.ArrayList) MultiPhraseQuery(org.apache.lucene.search.MultiPhraseQuery) Term(org.apache.lucene.index.Term) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Aggregations

TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute)32 BytesRef (org.apache.lucene.util.BytesRef)17 TokenStream (org.apache.lucene.analysis.TokenStream)16 PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)12 IOException (java.io.IOException)11 ArrayList (java.util.ArrayList)9 Term (org.apache.lucene.index.Term)9 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)6 PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute)5 TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute)4 Analyzer (org.apache.lucene.analysis.Analyzer)3 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)3 PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute)3 SpanTermQuery (org.apache.lucene.search.spans.SpanTermQuery)3 HashSet (java.util.HashSet)2 FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute)2 BooleanQuery (org.apache.lucene.search.BooleanQuery)2 MultiPhraseQuery (org.apache.lucene.search.MultiPhraseQuery)2 SpanNearQuery (org.apache.lucene.search.spans.SpanNearQuery)2 SpanQuery (org.apache.lucene.search.spans.SpanQuery)2