Example 16 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in the lucene-solr project by Apache.

From the class AnalysisRequestHandlerBase, the method convertTokensToNamedLists.

/**
   * Converts the list of Tokens to a list of NamedLists representing the tokens.
   *
   * @param tokenList  Tokens to convert
   * @param context The analysis context
   *
   * @return List of NamedLists containing the relevant information taken from the tokens
   */
private List<NamedList> convertTokensToNamedLists(final List<AttributeSource> tokenList, AnalysisContext context) {
    final List<NamedList> tokensNamedLists = new ArrayList<>();
    final FieldType fieldType = context.getFieldType();
    final AttributeSource[] tokens = tokenList.toArray(new AttributeSource[tokenList.size()]);
    // sort the tokens by absolute position
    ArrayUtil.timSort(tokens, new Comparator<AttributeSource>() {

        @Override
        public int compare(AttributeSource a, AttributeSource b) {
            return arrayCompare(a.getAttribute(TokenTrackingAttribute.class).getPositions(), b.getAttribute(TokenTrackingAttribute.class).getPositions());
        }

        private int arrayCompare(int[] a, int[] b) {
            int p = 0;
            final int stop = Math.min(a.length, b.length);
            while (p < stop) {
                int diff = a[p] - b[p];
                if (diff != 0)
                    return diff;
                p++;
            }
            // One is a prefix of the other, or, they are equal:
            return a.length - b.length;
        }
    });
    for (int i = 0; i < tokens.length; i++) {
        AttributeSource token = tokens[i];
        final NamedList<Object> tokenNamedList = new SimpleOrderedMap<>();
        final TermToBytesRefAttribute termAtt = token.getAttribute(TermToBytesRefAttribute.class);
        BytesRef rawBytes = termAtt.getBytesRef();
        final String text = fieldType.indexedToReadable(rawBytes, new CharsRefBuilder()).toString();
        tokenNamedList.add("text", text);
        if (token.hasAttribute(CharTermAttribute.class)) {
            final String rawText = token.getAttribute(CharTermAttribute.class).toString();
            if (!rawText.equals(text)) {
                tokenNamedList.add("raw_text", rawText);
            }
        }
        tokenNamedList.add("raw_bytes", rawBytes.toString());
        if (context.getTermsToMatch().contains(rawBytes)) {
            tokenNamedList.add("match", true);
        }
        token.reflectWith(new AttributeReflector() {

            @Override
            public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
                // leave out position and bytes term
                if (TermToBytesRefAttribute.class.isAssignableFrom(attClass))
                    return;
                if (CharTermAttribute.class.isAssignableFrom(attClass))
                    return;
                if (PositionIncrementAttribute.class.isAssignableFrom(attClass))
                    return;
                String k = attClass.getName() + '#' + key;
                // map keys for "standard attributes":
                if (ATTRIBUTE_MAPPING.containsKey(k)) {
                    k = ATTRIBUTE_MAPPING.get(k);
                }
                if (value instanceof BytesRef) {
                    final BytesRef p = (BytesRef) value;
                    value = p.toString();
                }
                tokenNamedList.add(k, value);
            }
        });
        tokensNamedLists.add(tokenNamedList);
    }
    return tokensNamedLists;
}
Also used: AttributeSource (org.apache.lucene.util.AttributeSource), NamedList (org.apache.solr.common.util.NamedList), AttributeReflector (org.apache.lucene.util.AttributeReflector), ArrayList (java.util.ArrayList), SimpleOrderedMap (org.apache.solr.common.util.SimpleOrderedMap), FieldType (org.apache.solr.schema.FieldType), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute), CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder), BytesRef (org.apache.lucene.util.BytesRef)
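
The essential CharsRefBuilder pattern in this example is converting a token's indexed BytesRef back into readable text. A minimal, self-contained sketch of that byte-to-char round trip (the term value here is invented for illustration):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

public class BytesToCharsDemo {
    public static void main(String[] args) {
        // an indexed term, stored as UTF-8 bytes
        BytesRef rawBytes = new BytesRef("lucene");
        // CharsRefBuilder owns a growable char[] buffer that can be reused
        CharsRefBuilder builder = new CharsRefBuilder();
        // decode the UTF-8 bytes into the builder's char buffer
        builder.copyUTF8Bytes(rawBytes);
        // materialize a String only once the chars are final
        System.out.println(builder.toString() + " (" + builder.length() + " chars)");
    }
}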

Example 17 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in the lucene-solr project by Apache.

From the class SimpleFacets, the method getFacetTermEnumCounts.

/**
   * Returns a list of terms in the specified field along with the 
   * corresponding count of documents in the set that match that constraint.
   * This method uses the FilterCache to get the intersection count between <code>docs</code>
   * and the DocSet for each term in the filter.
   *
   * @see FacetParams#FACET_LIMIT
   * @see FacetParams#FACET_ZEROS
   * @see FacetParams#FACET_MISSING
   */
public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate<BytesRef> termFilter, boolean intersectsCheck) throws IOException {
    /* :TODO: potential optimization...
    * cache the Terms with the highest docFreq and try them first
    * don't enum if we get our max from them
    */
    // Minimum term docFreq in order to use the filterCache for that term.
    int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);
    // make sure we have a set that is fast for random access, if we will use it for that
    DocSet fastForRandomSet = docs;
    if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) {
        SortedIntDocSet sset = (SortedIntDocSet) docs;
        fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
    }
    IndexSchema schema = searcher.getSchema();
    FieldType ft = schema.getFieldType(field);
    assert !ft.isPointField() : "Point Fields don't support enum method";
    LeafReader r = searcher.getSlowAtomicReader();
    boolean sortByCount = sort.equals("count") || sort.equals("true");
    final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
    final BoundedTreeSet<CountPair<BytesRef, Integer>> queue = sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null;
    final NamedList<Integer> res = new NamedList<>();
    // the smallest value in the top 'N' values    
    int min = mincount - 1;
    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;
    BytesRef prefixTermBytes = null;
    if (prefix != null) {
        String indexedPrefix = ft.toInternal(prefix);
        prefixTermBytes = new BytesRef(indexedPrefix);
    }
    Fields fields = r.fields();
    Terms terms = fields == null ? null : fields.terms(field);
    TermsEnum termsEnum = null;
    SolrIndexSearcher.DocsEnumState deState = null;
    BytesRef term = null;
    if (terms != null) {
        termsEnum = terms.iterator();
        if (prefixTermBytes != null) {
            if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) {
                termsEnum = null;
            } else {
                term = termsEnum.term();
            }
        } else {
            // position termsEnum on first term
            term = termsEnum.next();
        }
    }
    PostingsEnum postingsEnum = null;
    CharsRefBuilder charsRef = new CharsRefBuilder();
    if (docs.size() >= mincount) {
        while (term != null) {
            if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes))
                break;
            if (termFilter == null || termFilter.test(term)) {
                int df = termsEnum.docFreq();
                // If we are sorting, we can use df > min (rather than >=) since we are
                // going in index order. For certain term distributions this can
                // make a large difference (for example, many terms with df=1).
                if (df > 0 && df > min) {
                    int c;
                    if (df >= minDfFilterCache) {
                        if (deState == null) {
                            deState = new SolrIndexSearcher.DocsEnumState();
                            deState.fieldName = field;
                            deState.liveDocs = r.getLiveDocs();
                            deState.termsEnum = termsEnum;
                            deState.postingsEnum = postingsEnum;
                        }
                        if (intersectsCheck) {
                            c = searcher.intersects(docs, deState) ? 1 : 0;
                        } else {
                            c = searcher.numDocs(docs, deState);
                        }
                        postingsEnum = deState.postingsEnum;
                    } else {
                        // iterate over TermDocs to calculate the intersection
                        // TODO: specialize when base docset is a bitset or hash set (skipDocs)?  or does it matter for this?
                        // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class impl)
                        // TODO: would passing deleted docs lead to better efficiency over checking the fastForRandomSet?
                        postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
                        c = 0;
                        if (postingsEnum instanceof MultiPostingsEnum) {
                            MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs();
                            int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
                            SEGMENTS_LOOP: for (int subindex = 0; subindex < numSubs; subindex++) {
                                MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
                                if (sub.postingsEnum == null)
                                    continue;
                                int base = sub.slice.start;
                                int docid;
                                while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                                    if (fastForRandomSet.exists(docid + base)) {
                                        c++;
                                        if (intersectsCheck) {
                                            assert c == 1;
                                            break SEGMENTS_LOOP;
                                        }
                                    }
                                }
                            }
                        } else {
                            int docid;
                            while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                                if (fastForRandomSet.exists(docid)) {
                                    c++;
                                    if (intersectsCheck) {
                                        assert c == 1;
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    if (sortByCount) {
                        if (c > min) {
                            BytesRef termCopy = BytesRef.deepCopyOf(term);
                            queue.add(new CountPair<>(termCopy, c));
                            if (queue.size() >= maxsize)
                                min = queue.last().val;
                        }
                    } else {
                        if (c >= mincount && --off < 0) {
                            if (--lim < 0)
                                break;
                            ft.indexedToReadable(term, charsRef);
                            res.add(charsRef.toString(), c);
                        }
                    }
                }
            }
            term = termsEnum.next();
        }
    }
    if (sortByCount) {
        for (CountPair<BytesRef, Integer> p : queue) {
            if (--off >= 0)
                continue;
            if (--lim < 0)
                break;
            ft.indexedToReadable(p.key, charsRef);
            res.add(charsRef.toString(), p.val);
        }
    }
    if (missing) {
        res.add(null, getFieldMissingCount(searcher, docs, field));
    }
    return res;
}
Also used: SortedIntDocSet (org.apache.solr.search.SortedIntDocSet), HashDocSet (org.apache.solr.search.HashDocSet), TermsEnum (org.apache.lucene.index.TermsEnum), CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder), MultiPostingsEnum (org.apache.lucene.index.MultiPostingsEnum), PostingsEnum (org.apache.lucene.index.PostingsEnum), BytesRef (org.apache.lucene.util.BytesRef), LeafReader (org.apache.lucene.index.LeafReader), NamedList (org.apache.solr.common.util.NamedList), Terms (org.apache.lucene.index.Terms), SolrIndexSearcher (org.apache.solr.search.SolrIndexSearcher), FieldType (org.apache.solr.schema.FieldType), Fields (org.apache.lucene.index.Fields), IndexSchema (org.apache.solr.schema.IndexSchema), DocSet (org.apache.solr.search.DocSet), BitDocSet (org.apache.solr.search.BitDocSet)
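
Note how the example allocates a single CharsRefBuilder (charsRef) up front and reuses it for every term added to the result list, so no per-term char[] is allocated. A stripped-down sketch of that reuse pattern over a TermsEnum (the reader and field name are placeholders, and the readable form is taken directly from the UTF-8 bytes rather than going through FieldType.indexedToReadable):

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

public class TermEnumDemo {
    // prints every term of a field, reusing one CharsRefBuilder for decoding
    static void printTerms(LeafReader reader, String field) throws IOException {
        Terms terms = reader.terms(field);
        if (terms == null) {
            return;
        }
        TermsEnum termsEnum = terms.iterator();
        // allocated once, overwritten on each iteration
        CharsRefBuilder charsRef = new CharsRefBuilder();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            charsRef.copyUTF8Bytes(term);
            System.out.println(charsRef.toString() + " df=" + termsEnum.docFreq());
        }
    }
}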

Example 18 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in the lucene-solr project by Apache.

From the class FieldOffsetStrategy, the method createAutomataOffsetsFromTerms.

protected List<OffsetsEnum> createAutomataOffsetsFromTerms(Terms termsIndex, int doc) throws IOException {
    List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
    for (int i = 0; i < automata.length; i++) {
        automataPostings.add(new ArrayList<>());
    }
    TermsEnum termsEnum = termsIndex.iterator();
    BytesRef term;
    CharsRefBuilder refBuilder = new CharsRefBuilder();
    while ((term = termsEnum.next()) != null) {
        for (int i = 0; i < automata.length; i++) {
            CharacterRunAutomaton automaton = automata[i];
            refBuilder.copyUTF8Bytes(term);
            if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) {
                PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
                if (doc == postings.advance(doc)) {
                    automataPostings.get(i).add(postings);
                }
            }
        }
    }
    // will be at most this long
    List<OffsetsEnum> offsetsEnums = new ArrayList<>(automata.length);
    for (int i = 0; i < automata.length; i++) {
        CharacterRunAutomaton automaton = automata[i];
        List<PostingsEnum> postingsEnums = automataPostings.get(i);
        int size = postingsEnums.size();
        if (size > 0) {
            // only add if we have offsets
            BytesRef wildcardTerm = new BytesRef(automaton.toString());
            if (size == 1) {
                // don't wrap in a composite if there's only one OffsetsEnum
                offsetsEnums.add(new OffsetsEnum(wildcardTerm, postingsEnums.get(0)));
            } else {
                offsetsEnums.add(new OffsetsEnum(wildcardTerm, new CompositeOffsetsPostingsEnum(postingsEnums)));
            }
        }
    }
    return offsetsEnums;
}
Also used: CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton), ArrayList (java.util.ArrayList), TermsEnum (org.apache.lucene.index.TermsEnum), List (java.util.List), CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder), PostingsEnum (org.apache.lucene.index.PostingsEnum), BytesRef (org.apache.lucene.util.BytesRef)
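
The key trick here is that CharacterRunAutomaton.run operates directly on the builder's backing char[], so each term can be tested against the wildcard automaton without allocating a String. A small sketch of that matching step in isolation (the regular expression and terms are arbitrary examples):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class AutomatonMatchDemo {
    public static void main(String[] args) {
        // automaton equivalent of the wildcard pattern "lu*"
        CharacterRunAutomaton automaton =
            new CharacterRunAutomaton(new RegExp("lu.*").toAutomaton());
        CharsRefBuilder refBuilder = new CharsRefBuilder();
        for (String s : new String[] { "lucene", "solr" }) {
            // a term as it would come off a TermsEnum
            BytesRef term = new BytesRef(s);
            // decode into the reusable char buffer
            refBuilder.copyUTF8Bytes(term);
            // run the automaton over the raw char[], no String allocation
            boolean matches = automaton.run(refBuilder.chars(), 0, refBuilder.length());
            System.out.println(s + " -> " + matches);
        }
    }
}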

Example 19 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in the lucene-solr project by Apache.

From the class NRTSuggester, the method lookup.

/**
   * Collects at most {@link TopSuggestDocsCollector#getCountToCollect()} completions that
   * match the provided {@link CompletionScorer}.
   * <p>
   * The {@link CompletionScorer#automaton} is intersected with the {@link #fst}.
   * {@link CompletionScorer#weight} is used to compute boosts and/or extract context
   * for each matched partial path. A top N search is executed on {@link #fst} seeded with
   * the matched partial paths. Upon reaching a completed path, {@link CompletionScorer#accept(int, Bits)}
   * and {@link CompletionScorer#score(float, float)} are used on the document id, index weight
   * and query boost to filter and score the entry, before being collected via
   * {@link TopSuggestDocsCollector#collect(int, CharSequence, CharSequence, float)}
   */
public void lookup(final CompletionScorer scorer, Bits acceptDocs, final TopSuggestDocsCollector collector) throws IOException {
    final double liveDocsRatio = calculateLiveDocRatio(scorer.reader.numDocs(), scorer.reader.maxDoc());
    if (liveDocsRatio == -1) {
        return;
    }
    final List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(scorer.automaton, fst);
    // The topN is increased by a factor of # of intersected path
    // to ensure search admissibility. For example, one suggestion can
    // have multiple contexts, resulting in num_context paths for the
    // suggestion instead of 1 in the FST. When queried for the suggestion,
    // the topN value ensures that all paths to the suggestion are evaluated
    // (in case of a match all context query).
    // Note that collectors will early terminate as soon as enough suggestions
    // have been collected, regardless of the set topN value. This value is the
    // maximum number of suggestions that can be collected.
    final int topN = collector.getCountToCollect() * prefixPaths.size();
    final int queueSize = getMaxTopNSearcherQueueSize(topN, scorer.reader.numDocs(), liveDocsRatio, scorer.filtered);
    final CharsRefBuilder spare = new CharsRefBuilder();
    Comparator<Pair<Long, BytesRef>> comparator = getComparator();
    Util.TopNSearcher<Pair<Long, BytesRef>> searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, topN, queueSize, comparator, new ScoringPathComparator(scorer)) {

        private final ByteArrayDataInput scratchInput = new ByteArrayDataInput();

        @Override
        protected boolean acceptPartialPath(Util.FSTPath<Pair<Long, BytesRef>> path) {
            if (collector.doSkipDuplicates()) {
                // We are removing dups
                if (path.payload == -1) {
                    // This path didn't yet see the complete surface form; let's see if it just did with the arc output we just added:
                    BytesRef arcOutput = path.arc.output.output2;
                    BytesRef output = path.output.output2;
                    for (int i = 0; i < arcOutput.length; i++) {
                        if (arcOutput.bytes[arcOutput.offset + i] == payloadSep) {
                            // OK this arc that the path was just extended by contains the payloadSep, so we now have a full surface form in this path
                            path.payload = output.length - arcOutput.length + i;
                            assert output.bytes[output.offset + path.payload] == payloadSep;
                            break;
                        }
                    }
                }
                if (path.payload != -1) {
                    BytesRef output = path.output.output2;
                    spare.copyUTF8Bytes(output.bytes, output.offset, path.payload);
                    if (collector.seenSurfaceForms.contains(spare.chars(), 0, spare.length())) {
                        return false;
                    }
                }
            }
            return true;
        }

        @Override
        protected boolean acceptResult(Util.FSTPath<Pair<Long, BytesRef>> path) {
            BytesRef output = path.output.output2;
            int payloadSepIndex;
            if (path.payload != -1) {
                payloadSepIndex = path.payload;
                spare.copyUTF8Bytes(output.bytes, output.offset, payloadSepIndex);
            } else {
                assert collector.doSkipDuplicates() == false;
                payloadSepIndex = parseSurfaceForm(output, payloadSep, spare);
            }
            scratchInput.reset(output.bytes, output.offset + payloadSepIndex + 1, output.length - payloadSepIndex - 1);
            int docID = scratchInput.readVInt();
            if (!scorer.accept(docID, acceptDocs)) {
                return false;
            }
            if (collector.doSkipDuplicates()) {
                // now record that we've seen this surface form:
                char[] key = new char[spare.length()];
                System.arraycopy(spare.chars(), 0, key, 0, spare.length());
                if (collector.seenSurfaceForms.contains(key)) {
                    // we already collected a higher scoring document with this key, in this segment:
                    return false;
                }
                collector.seenSurfaceForms.add(key);
            }
            try {
                float score = scorer.score(decode(path.output.output1), path.boost);
                collector.collect(docID, spare.toCharsRef(), path.context, score);
                return true;
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    };
    for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
        scorer.weight.setNextMatch(path.input.get());
        BytesRef output = path.output.output2;
        int payload = -1;
        if (collector.doSkipDuplicates()) {
            for (int j = 0; j < output.length; j++) {
                if (output.bytes[output.offset + j] == payloadSep) {
                    // Important to cache this, else we have a possibly O(N^2) cost where N is the length of suggestions
                    payload = j;
                    break;
                }
            }
        }
        searcher.addStartPaths(path.fstNode, path.output, false, path.input, scorer.weight.boost(), scorer.weight.context(), payload);
    }
    // hits are also returned by search();
    // we do not use them here, since results are collected in acceptResult instead
    searcher.search();
// search admissibility is not guaranteed
// see comment on getMaxTopNSearcherQueueSize
// assert  search.isComplete;
}
Also used: FSTUtil (org.apache.lucene.search.suggest.analyzing.FSTUtil), Util (org.apache.lucene.util.fst.Util), IOException (java.io.IOException), ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput), CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder), BytesRef (org.apache.lucene.util.BytesRef), Pair (org.apache.lucene.util.fst.PairOutputs.Pair)
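
A recurring detail above is the packed FST output: the surface form's UTF-8 bytes, a payload separator byte, then the VInt-encoded doc ID, with CharsRefBuilder decoding only the bytes before the separator. A simplified sketch of splitting such an output apart (the separator value and the packing are assumptions for illustration; the real NRTSuggester encoding is more involved):

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

public class SurfaceFormDemo {
    // assumed separator byte, chosen arbitrarily for this sketch
    static final byte PAYLOAD_SEP = 0x1f;

    public static void main(String[] args) throws Exception {
        // pack "hello" + separator + doc ID, mimicking the output layout
        byte[] buf = new byte[64];
        ByteArrayDataOutput out = new ByteArrayDataOutput(buf);
        BytesRef surface = new BytesRef("hello");
        out.writeBytes(surface.bytes, surface.offset, surface.length);
        out.writeByte(PAYLOAD_SEP);
        out.writeVInt(42);
        BytesRef output = new BytesRef(buf, 0, out.getPosition());

        // find the separator, as the prefix-path loop above does
        int sep = -1;
        for (int i = 0; i < output.length; i++) {
            if (output.bytes[output.offset + i] == PAYLOAD_SEP) {
                sep = i;
                break;
            }
        }
        // chars before the separator are the surface form
        CharsRefBuilder spare = new CharsRefBuilder();
        spare.copyUTF8Bytes(output.bytes, output.offset, sep);
        // bytes after it hold the VInt doc ID
        ByteArrayDataInput in = new ByteArrayDataInput();
        in.reset(output.bytes, output.offset + sep + 1, output.length - sep - 1);
        System.out.println(spare.toString() + " -> doc " + in.readVInt());
    }
}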

Example 20 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in the lucene-solr project by Apache.

From the class WFSTCompletionLookup, the method lookup.

@Override
public List<LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
    if (contexts != null) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    assert num > 0;
    if (onlyMorePopular) {
        throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (fst == null) {
        return Collections.emptyList();
    }
    BytesRefBuilder scratch = new BytesRefBuilder();
    scratch.copyChars(key);
    int prefixLength = scratch.length();
    Arc<Long> arc = new Arc<>();
    // match the prefix portion exactly
    Long prefixOutput = null;
    try {
        prefixOutput = lookupPrefix(scratch.get(), arc);
    } catch (IOException bogus) {
        throw new RuntimeException(bogus);
    }
    if (prefixOutput == null) {
        return Collections.emptyList();
    }
    List<LookupResult> results = new ArrayList<>(num);
    CharsRefBuilder spare = new CharsRefBuilder();
    if (exactFirst && arc.isFinal()) {
        spare.copyUTF8Bytes(scratch.get());
        results.add(new LookupResult(spare.toString(), decodeWeight(prefixOutput + arc.nextFinalOutput)));
        if (--num == 0) {
            // that was quick
            return results;
        }
    }
    // complete top-N
    TopResults<Long> completions = null;
    try {
        completions = Util.shortestPaths(fst, arc, prefixOutput, weightComparator, num, !exactFirst);
        assert completions.isComplete;
    } catch (IOException bogus) {
        throw new RuntimeException(bogus);
    }
    BytesRefBuilder suffix = new BytesRefBuilder();
    for (Result<Long> completion : completions) {
        scratch.setLength(prefixLength);
        // append suffix
        Util.toBytesRef(completion.input, suffix);
        scratch.append(suffix);
        spare.copyUTF8Bytes(scratch.get());
        results.add(new LookupResult(spare.toString(), decodeWeight(completion.output)));
    }
    return results;
}
Also used: BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), ArrayList (java.util.ArrayList), IOException (java.io.IOException), Arc (org.apache.lucene.util.fst.FST.Arc), CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)
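
The completion loop shows the standard builder idiom: rewind the BytesRefBuilder to the prefix length, append each completion's suffix, and decode the assembled bytes once with CharsRefBuilder. A minimal sketch of that assembly (the prefix and suffixes are invented values standing in for FST outputs):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;

public class PrefixSuffixDemo {
    public static void main(String[] args) {
        BytesRefBuilder scratch = new BytesRefBuilder();
        // the user-typed prefix
        scratch.copyChars("luc");
        int prefixLength = scratch.length();
        CharsRefBuilder spare = new CharsRefBuilder();
        for (String suffixText : new String[] { "ene", "ky" }) {
            // stand-in for a suffix produced by an FST path
            BytesRef suffix = new BytesRef(suffixText);
            // rewind to just the prefix, then append this completion's suffix
            scratch.setLength(prefixLength);
            scratch.append(suffix);
            // decode the full prefix+suffix bytes to chars
            spare.copyUTF8Bytes(scratch.get());
            System.out.println(spare.toString());
        }
    }
}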

Aggregations

CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 52
BytesRef (org.apache.lucene.util.BytesRef): 30
ArrayList (java.util.ArrayList): 11
IOException (java.io.IOException): 10
NamedList (org.apache.solr.common.util.NamedList): 10
FieldType (org.apache.solr.schema.FieldType): 10
TermsEnum (org.apache.lucene.index.TermsEnum): 9
SchemaField (org.apache.solr.schema.SchemaField): 7
BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 6
HashSet (java.util.HashSet): 5
Test (org.junit.Test): 5
TokenStream (org.apache.lucene.analysis.TokenStream): 4
PostingsEnum (org.apache.lucene.index.PostingsEnum): 4
Terms (org.apache.lucene.index.Terms): 4
SimpleOrderedMap (org.apache.solr.common.util.SimpleOrderedMap): 4
LeafReader (org.apache.lucene.index.LeafReader): 3
LeafReaderContext (org.apache.lucene.index.LeafReaderContext): 3
CharsRef (org.apache.lucene.util.CharsRef): 3
Util (org.apache.lucene.util.fst.Util): 3
SolrException (org.apache.solr.common.SolrException): 3