Search in sources :

Example 6 with TermToBytesRefAttribute

use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.

From the class AnalysisRequestHandlerBase, the method convertTokensToNamedLists:

/**
   * Converts the list of Tokens to a list of NamedLists representing the tokens.
   *
   * @param tokenList  Tokens to convert
   * @param context The analysis context
   *
   * @return List of NamedLists containing the relevant information taken from the tokens
   */
private List<NamedList> convertTokensToNamedLists(final List<AttributeSource> tokenList, AnalysisContext context) {
    final List<NamedList> tokensNamedLists = new ArrayList<>();
    final FieldType fieldType = context.getFieldType();
    final AttributeSource[] tokens = tokenList.toArray(new AttributeSource[tokenList.size()]);
    // sort the tokens by absolute position
    ArrayUtil.timSort(tokens, new Comparator<AttributeSource>() {

        @Override
        public int compare(AttributeSource a, AttributeSource b) {
            return arrayCompare(a.getAttribute(TokenTrackingAttribute.class).getPositions(), b.getAttribute(TokenTrackingAttribute.class).getPositions());
        }

        // Lexicographic comparison of two position paths.
        private int arrayCompare(int[] a, int[] b) {
            int p = 0;
            final int stop = Math.min(a.length, b.length);
            while (p < stop) {
                // Integer.compare instead of plain subtraction: a[p] - b[p] can
                // overflow and return a wrong sign for extreme int values.
                int diff = Integer.compare(a[p], b[p]);
                if (diff != 0)
                    return diff;
                p++;
            }
            // One is a prefix of the other, or, they are equal:
            return Integer.compare(a.length, b.length);
        }
    });
    for (int i = 0; i < tokens.length; i++) {
        AttributeSource token = tokens[i];
        final NamedList<Object> tokenNamedList = new SimpleOrderedMap<>();
        final TermToBytesRefAttribute termAtt = token.getAttribute(TermToBytesRefAttribute.class);
        BytesRef rawBytes = termAtt.getBytesRef();
        // Human-readable form of the indexed bytes, per the field type's conversion.
        final String text = fieldType.indexedToReadable(rawBytes, new CharsRefBuilder()).toString();
        tokenNamedList.add("text", text);
        // Only emit raw_text when it differs from the readable form.
        if (token.hasAttribute(CharTermAttribute.class)) {
            final String rawText = token.getAttribute(CharTermAttribute.class).toString();
            if (!rawText.equals(text)) {
                tokenNamedList.add("raw_text", rawText);
            }
        }
        tokenNamedList.add("raw_bytes", rawBytes.toString());
        // Flag tokens that match one of the query terms supplied in the context.
        if (context.getTermsToMatch().contains(rawBytes)) {
            tokenNamedList.add("match", true);
        }
        token.reflectWith(new AttributeReflector() {

            @Override
            public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
                // leave out position and bytes term
                if (TermToBytesRefAttribute.class.isAssignableFrom(attClass))
                    return;
                if (CharTermAttribute.class.isAssignableFrom(attClass))
                    return;
                if (PositionIncrementAttribute.class.isAssignableFrom(attClass))
                    return;
                String k = attClass.getName() + '#' + key;
                // map keys for "standard attributes":
                if (ATTRIBUTE_MAPPING.containsKey(k)) {
                    k = ATTRIBUTE_MAPPING.get(k);
                }
                // BytesRef values are rendered as their string form.
                if (value instanceof BytesRef) {
                    final BytesRef p = (BytesRef) value;
                    value = p.toString();
                }
                tokenNamedList.add(k, value);
            }
        });
        tokensNamedLists.add(tokenNamedList);
    }
    return tokensNamedLists;
}
Also used : AttributeSource(org.apache.lucene.util.AttributeSource) NamedList(org.apache.solr.common.util.NamedList) AttributeReflector(org.apache.lucene.util.AttributeReflector) ArrayList(java.util.ArrayList) SimpleOrderedMap(org.apache.solr.common.util.SimpleOrderedMap) FieldType(org.apache.solr.schema.FieldType) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef)

Example 7 with TermToBytesRefAttribute

use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.

From the class PayloadUtils, the method createSpanQuery:

/**
   * Builds a span query over the analyzed tokens of {@code value}: a single
   * SpanTermQuery when one token is emitted, or an ordered, zero-slop
   * SpanNearQuery when several are, or {@code null} when none are.
   */
public static SpanQuery createSpanQuery(String field, String value, Analyzer analyzer) throws IOException {
    // adapted this from QueryBuilder.createSpanQuery (which isn't currently public) and added reset(), end(), and close() calls
    final List<SpanTermQuery> termQueries = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream(field, value)) {
        stream.reset();
        final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
        while (stream.incrementToken()) {
            termQueries.add(new SpanTermQuery(new Term(field, bytesAtt.getBytesRef())));
        }
        stream.end();
    }
    // Collapse to the simplest span form for the number of tokens seen.
    switch (termQueries.size()) {
        case 0:
            return null;
        case 1:
            return termQueries.get(0);
        default:
            return new SpanNearQuery(termQueries.toArray(new SpanTermQuery[0]), 0, true);
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) ArrayList(java.util.ArrayList) Term(org.apache.lucene.index.Term) SpanNearQuery(org.apache.lucene.search.spans.SpanNearQuery) SpanQuery(org.apache.lucene.search.spans.SpanQuery)

Example 8 with TermToBytesRefAttribute

use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.

From the class MemoryIndex, the method storeTerms:

// Consumes (and closes) the given token stream, appending its tokens to the
// in-memory postings of {@code info}. Positions and offsets continue from the
// previous value of the same field, shifted by the supplied gaps.
private void storeTerms(Info info, TokenStream tokenStream, int positionIncrementGap, int offsetGap) {
    int pos = -1;
    int offset = 0;
    // A previous value exists for this field: continue after the configured gaps
    // (multi-valued field handling — presumably; confirm against the caller).
    if (info.numTokens > 0) {
        pos = info.lastPosition + positionIncrementGap;
        offset = info.lastOffset + offsetGap;
    }
    try (TokenStream stream = tokenStream) {
        TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
        PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        // Payload attribute is only looked up when payload storage is enabled.
        PayloadAttribute payloadAtt = storePayloads ? stream.addAttribute(PayloadAttribute.class) : null;
        stream.reset();
        while (stream.incrementToken()) {
            //        if (DEBUG) System.err.println("token='" + term + "'");
            info.numTokens++;
            final int posIncr = posIncrAttribute.getPositionIncrement();
            // Zero increment: this token occupies the same position as the previous one.
            if (posIncr == 0) {
                info.numOverlapTokens++;
            }
            pos += posIncr;
            // add() result is negative when the term already exists in the hash.
            int ord = info.terms.add(termAtt.getBytesRef());
            if (ord < 0) {
                // Existing term: decode its ord and continue its posting slice.
                ord = (-ord) - 1;
                postingsWriter.reset(info.sliceArray.end[ord]);
            } else {
                // New term: open a fresh posting slice and record where it starts.
                info.sliceArray.start[ord] = postingsWriter.startNewSlice();
            }
            info.sliceArray.freq[ord]++;
            info.sumTotalTermFreq++;
            postingsWriter.writeInt(pos);
            if (storeOffsets) {
                // Offsets are shifted by the accumulated multi-value offset.
                postingsWriter.writeInt(offsetAtt.startOffset() + offset);
                postingsWriter.writeInt(offsetAtt.endOffset() + offset);
            }
            if (storePayloads) {
                final BytesRef payload = payloadAtt.getPayload();
                final int pIndex;
                // -1 is written as the sentinel for "no payload at this position".
                if (payload == null || payload.length == 0) {
                    pIndex = -1;
                } else {
                    pIndex = payloadsBytesRefs.append(payload);
                }
                postingsWriter.writeInt(pIndex);
            }
            // Track the slice end so the next occurrence of this term can resume it.
            info.sliceArray.end[ord] = postingsWriter.getCurrentOffset();
        }
        stream.end();
        // Remember where this value ended so gaps can be applied to the next value.
        if (info.numTokens > 0) {
            info.lastPosition = pos;
            info.lastOffset = offsetAtt.endOffset() + offset;
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) IOException(java.io.IOException) BytesRef(org.apache.lucene.util.BytesRef) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 9 with TermToBytesRefAttribute

use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.

From the class ICUCollationField, the method getCollationKey:

/**
   * Produces the collation key for {@code text} by running it through the
   * analyzer rather than calling the collator directly: ICU collators are not
   * thread safe, and the reused token stream already holds a threadlocal clone.
   */
private BytesRef getCollationKey(String field, String text) {
    try (TokenStream ts = analyzer.tokenStream(field, text)) {
        ts.reset();
        final TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class);
        // we control the analyzer here: most errors are impossible
        if (!ts.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned no terms for text: " + text);
        }
        // Deep-copy before the stream is closed/reused; the attribute's BytesRef is shared.
        final BytesRef key = BytesRef.deepCopyOf(termAtt.getBytesRef());
        // A collation analyzer is expected to emit exactly one token.
        assert !ts.incrementToken();
        ts.end();
        return key;
    } catch (IOException e) {
        throw new RuntimeException("Unable to analyze text: " + text, e);
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) IOException(java.io.IOException) BytesRef(org.apache.lucene.util.BytesRef)

Example 10 with TermToBytesRefAttribute

use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.

From the class QueryBuilder, the method analyzeMultiPhrase:

/**
   * Builds a multi-phrase query from the cached token stream, grouping tokens
   * that share a position (zero position increment) into one term array.
   */
protected Query analyzeMultiPhrase(String field, TokenStream stream, int slop) throws IOException {
    final MultiPhraseQuery.Builder builder = newMultiPhraseQueryBuilder();
    builder.setSlop(slop);
    final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute incrAtt = stream.getAttribute(PositionIncrementAttribute.class);
    int position = -1;
    final List<Term> currentStack = new ArrayList<>();
    stream.reset();
    while (stream.incrementToken()) {
        final int increment = incrAtt.getPositionIncrement();
        // A positive increment closes the stack of same-position terms collected so far.
        if (increment > 0 && !currentStack.isEmpty()) {
            if (enablePositionIncrements) {
                builder.add(currentStack.toArray(new Term[0]), position);
            } else {
                builder.add(currentStack.toArray(new Term[0]));
            }
            currentStack.clear();
        }
        position += increment;
        currentStack.add(new Term(field, bytesAtt.getBytesRef()));
    }
    // Flush the final stack of terms after the stream is exhausted.
    if (enablePositionIncrements) {
        builder.add(currentStack.toArray(new Term[0]), position);
    } else {
        builder.add(currentStack.toArray(new Term[0]));
    }
    return builder.build();
}
Also used : TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) ArrayList(java.util.ArrayList) MultiPhraseQuery(org.apache.lucene.search.MultiPhraseQuery) Term(org.apache.lucene.index.Term) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Aggregations

TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute)34 BytesRef (org.apache.lucene.util.BytesRef)17 TokenStream (org.apache.lucene.analysis.TokenStream)16 IOException (java.io.IOException)13 PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)12 ArrayList (java.util.ArrayList)10 Term (org.apache.lucene.index.Term)9 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)6 PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute)5 TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute)4 Analyzer (org.apache.lucene.analysis.Analyzer)3 CachingTokenFilter (org.apache.lucene.analysis.CachingTokenFilter)3 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)3 PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute)3 SpanTermQuery (org.apache.lucene.search.spans.SpanTermQuery)3 StringReader (java.io.StringReader)2 HashSet (java.util.HashSet)2 FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute)2 BooleanQuery (org.apache.lucene.search.BooleanQuery)2 MultiPhraseQuery (org.apache.lucene.search.MultiPhraseQuery)2