Example 11 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.

Class NumericFacets, method getCountsSingleValue.

private static NamedList<Integer> getCountsSingleValue(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort) throws IOException {
    boolean zeros = mincount <= 0;
    mincount = Math.max(mincount, 1);
    final SchemaField sf = searcher.getSchema().getField(fieldName);
    final FieldType ft = sf.getType();
    final NumberType numericType = ft.getNumberType();
    if (numericType == null) {
        throw new IllegalStateException();
    }
    // We don't return zeros when using PointFields or when index=false
    zeros = zeros && !ft.isPointField() && sf.indexed();
    final List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
    // 1. accumulate
    final HashTable hashTable = new HashTable(true);
    final Iterator<LeafReaderContext> ctxIt = leaves.iterator();
    LeafReaderContext ctx = null;
    NumericDocValues longs = null;
    int missingCount = 0;
    for (DocIterator docsIt = docs.iterator(); docsIt.hasNext(); ) {
        final int doc = docsIt.nextDoc();
        if (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()) {
            do {
                ctx = ctxIt.next();
            } while (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc());
            assert doc >= ctx.docBase;
            switch(numericType) {
                case LONG:
                case DATE:
                case INTEGER:
                    // Long, Date and Integer
                    longs = DocValues.getNumeric(ctx.reader(), fieldName);
                    break;
                case FLOAT:
                    // TODO: this bit flipping should probably be moved to tie-break in the PQ comparator
                    longs = new FilterNumericDocValues(DocValues.getNumeric(ctx.reader(), fieldName)) {

                        @Override
                        public long longValue() throws IOException {
                            long bits = super.longValue();
                            if (bits < 0)
                                bits ^= 0x7fffffffffffffffL;
                            return bits;
                        }
                    };
                    break;
                case DOUBLE:
                    // TODO: this bit flipping should probably be moved to tie-break in the PQ comparator
                    longs = new FilterNumericDocValues(DocValues.getNumeric(ctx.reader(), fieldName)) {

                        @Override
                        public long longValue() throws IOException {
                            long bits = super.longValue();
                            if (bits < 0)
                                bits ^= 0x7fffffffffffffffL;
                            return bits;
                        }
                    };
                    break;
                default:
                    throw new AssertionError("Unexpected type: " + numericType);
            }
        }
        int valuesDocID = longs.docID();
        if (valuesDocID < doc - ctx.docBase) {
            valuesDocID = longs.advance(doc - ctx.docBase);
        }
        if (valuesDocID == doc - ctx.docBase) {
            hashTable.add(doc, longs.longValue(), 1);
        } else {
            ++missingCount;
        }
    }
    // 2. select top-k facet values
    final int pqSize = limit < 0 ? hashTable.size : Math.min(offset + limit, hashTable.size);
    final PriorityQueue<Entry> pq;
    if (FacetParams.FACET_SORT_COUNT.equals(sort) || FacetParams.FACET_SORT_COUNT_LEGACY.equals(sort)) {
        pq = new PriorityQueue<Entry>(pqSize) {

            @Override
            protected boolean lessThan(Entry a, Entry b) {
                if (a.count < b.count || (a.count == b.count && a.bits > b.bits)) {
                    return true;
                } else {
                    return false;
                }
            }
        };
    } else {
        pq = new PriorityQueue<Entry>(pqSize) {

            @Override
            protected boolean lessThan(Entry a, Entry b) {
                return a.bits > b.bits;
            }
        };
    }
    Entry e = null;
    for (int i = 0; i < hashTable.bits.length; ++i) {
        if (hashTable.counts[i] >= mincount) {
            if (e == null) {
                e = new Entry();
            }
            e.bits = hashTable.bits[i];
            e.count = hashTable.counts[i];
            e.docID = hashTable.docIDs[i];
            e = pq.insertWithOverflow(e);
        }
    }
    // 3. build the NamedList
    final ValueSource vs = ft.getValueSource(sf, null);
    final NamedList<Integer> result = new NamedList<>();
    // the PQ entries may still need to be merged with terms from the terms dict
    if (!zeros || FacetParams.FACET_SORT_COUNT.equals(sort) || FacetParams.FACET_SORT_COUNT_LEGACY.equals(sort)) {
        // Only keep items we're interested in
        final Deque<Entry> counts = new ArrayDeque<>();
        while (pq.size() > offset) {
            counts.addFirst(pq.pop());
        }
        // Entries from the PQ first, then using the terms dictionary
        for (Entry entry : counts) {
            final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves);
            final FunctionValues values = vs.getValues(Collections.emptyMap(), leaves.get(readerIdx));
            result.add(values.strVal(entry.docID - leaves.get(readerIdx).docBase), entry.count);
        }
        if (zeros && (limit < 0 || result.size() < limit)) {
            // need to merge with the term dict
            if (!sf.indexed() && !sf.hasDocValues()) {
                throw new IllegalStateException("Cannot use " + FacetParams.FACET_MINCOUNT + "=0 on field " + sf.getName() + " which is neither indexed nor docValues");
            }
            // Add zeros until there are limit results
            final Set<String> alreadySeen = new HashSet<>();
            while (pq.size() > 0) {
                Entry entry = pq.pop();
                final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves);
                final FunctionValues values = vs.getValues(Collections.emptyMap(), leaves.get(readerIdx));
                alreadySeen.add(values.strVal(entry.docID - leaves.get(readerIdx).docBase));
            }
            for (int i = 0; i < result.size(); ++i) {
                alreadySeen.add(result.getName(i));
            }
            final Terms terms = searcher.getSlowAtomicReader().terms(fieldName);
            if (terms != null) {
                final String prefixStr = TrieField.getMainValuePrefix(ft);
                final BytesRef prefix;
                if (prefixStr != null) {
                    prefix = new BytesRef(prefixStr);
                } else {
                    prefix = new BytesRef();
                }
                final TermsEnum termsEnum = terms.iterator();
                BytesRef term;
                switch(termsEnum.seekCeil(prefix)) {
                    case FOUND:
                    case NOT_FOUND:
                        term = termsEnum.term();
                        break;
                    case END:
                        term = null;
                        break;
                    default:
                        throw new AssertionError();
                }
                final CharsRefBuilder spare = new CharsRefBuilder();
                for (int skipped = hashTable.size; skipped < offset && term != null && StringHelper.startsWith(term, prefix); ) {
                    ft.indexedToReadable(term, spare);
                    final String termStr = spare.toString();
                    if (!alreadySeen.contains(termStr)) {
                        ++skipped;
                    }
                    term = termsEnum.next();
                }
                for (; term != null && StringHelper.startsWith(term, prefix) && (limit < 0 || result.size() < limit); term = termsEnum.next()) {
                    ft.indexedToReadable(term, spare);
                    final String termStr = spare.toString();
                    if (!alreadySeen.contains(termStr)) {
                        result.add(termStr, 0);
                    }
                }
            }
        }
    } else {
        // => Merge the PQ and the terms dictionary on the fly
        if (!sf.indexed()) {
            throw new IllegalStateException("Cannot use " + FacetParams.FACET_SORT + "=" + FacetParams.FACET_SORT_INDEX + " on a field which is not indexed");
        }
        final Map<String, Integer> counts = new HashMap<>();
        while (pq.size() > 0) {
            final Entry entry = pq.pop();
            final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves);
            final FunctionValues values = vs.getValues(Collections.emptyMap(), leaves.get(readerIdx));
            counts.put(values.strVal(entry.docID - leaves.get(readerIdx).docBase), entry.count);
        }
        final Terms terms = searcher.getSlowAtomicReader().terms(fieldName);
        if (terms != null) {
            final String prefixStr = TrieField.getMainValuePrefix(ft);
            final BytesRef prefix;
            if (prefixStr != null) {
                prefix = new BytesRef(prefixStr);
            } else {
                prefix = new BytesRef();
            }
            final TermsEnum termsEnum = terms.iterator();
            BytesRef term;
            switch(termsEnum.seekCeil(prefix)) {
                case FOUND:
                case NOT_FOUND:
                    term = termsEnum.term();
                    break;
                case END:
                    term = null;
                    break;
                default:
                    throw new AssertionError();
            }
            final CharsRefBuilder spare = new CharsRefBuilder();
            for (int i = 0; i < offset && term != null && StringHelper.startsWith(term, prefix); ++i) {
                term = termsEnum.next();
            }
            for (; term != null && StringHelper.startsWith(term, prefix) && (limit < 0 || result.size() < limit); term = termsEnum.next()) {
                ft.indexedToReadable(term, spare);
                final String termStr = spare.toString();
                Integer count = counts.get(termStr);
                if (count == null) {
                    count = 0;
                }
                result.add(termStr, count);
            }
        }
    }
    if (missing) {
        result.add(null, missingCount);
    }
    return result;
}
Also used : FilterNumericDocValues(org.apache.lucene.index.FilterNumericDocValues) NumericDocValues(org.apache.lucene.index.NumericDocValues) SortedNumericDocValues(org.apache.lucene.index.SortedNumericDocValues) DocIterator(org.apache.solr.search.DocIterator) HashMap(java.util.HashMap) TermsEnum(org.apache.lucene.index.TermsEnum) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet) NamedList(org.apache.solr.common.util.NamedList) Terms(org.apache.lucene.index.Terms) IOException(java.io.IOException) FilterNumericDocValues(org.apache.lucene.index.FilterNumericDocValues) ArrayDeque(java.util.ArrayDeque) FieldType(org.apache.solr.schema.FieldType) SchemaField(org.apache.solr.schema.SchemaField) NumberType(org.apache.solr.schema.NumberType) ValueSource(org.apache.lucene.queries.function.ValueSource) FunctionValues(org.apache.lucene.queries.function.FunctionValues)
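
Note the CharsRefBuilder pattern above: a single builder (spare) is reused for every term, refilled via ft.indexedToReadable(term, spare) and materialized with spare.toString(), so no per-term char[] is allocated. Below is a minimal standalone sketch of the same reuse idea; the class name TermStrings is hypothetical, and it decodes with copyUTF8Bytes, which only suits plain string terms (trie-encoded terms need indexedToReadable as in the example).

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

final class TermStrings {

    // Collect every term of a field as a String, reusing one CharsRefBuilder
    // so the underlying char[] grows once instead of being allocated per term.
    static List<String> collect(Terms terms) throws IOException {
        final List<String> out = new ArrayList<>();
        final TermsEnum termsEnum = terms.iterator();
        final CharsRefBuilder spare = new CharsRefBuilder();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            // Decode the term's UTF-8 bytes into the reused char buffer.
            spare.copyUTF8Bytes(term);
            out.add(spare.toString());
        }
        return out;
    }
}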

Example 12 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.

Class SolrIndexSplitter, method split.

FixedBitSet[] split(LeafReaderContext readerContext) throws IOException {
    LeafReader reader = readerContext.reader();
    FixedBitSet[] docSets = new FixedBitSet[numPieces];
    for (int i = 0; i < docSets.length; i++) {
        docSets[i] = new FixedBitSet(reader.maxDoc());
    }
    Bits liveDocs = reader.getLiveDocs();
    Fields fields = reader.fields();
    Terms terms = fields == null ? null : fields.terms(field.getName());
    TermsEnum termsEnum = terms == null ? null : terms.iterator();
    if (termsEnum == null)
        return docSets;
    BytesRef term = null;
    PostingsEnum postingsEnum = null;
    int[] docsMatchingRanges = null;
    if (ranges != null) {
        // +1 because documents can belong to *zero*, one, several or all ranges in rangesArr
        docsMatchingRanges = new int[rangesArr.length + 1];
    }
    CharsRefBuilder idRef = new CharsRefBuilder();
    for (; ; ) {
        term = termsEnum.next();
        if (term == null)
            break;
        // figure out the hash for the term
        // FUTURE: if conversion to strings costs too much, we could
        // specialize and use the hash function that can work over bytes.
        field.getType().indexedToReadable(term, idRef);
        String idString = idRef.toString();
        if (splitKey != null) {
            // TODO: have composite routers support this kind of thing instead
            String part1 = getRouteKey(idString);
            if (part1 == null)
                continue;
            if (!splitKey.equals(part1)) {
                continue;
            }
        }
        int hash = 0;
        if (hashRouter != null) {
            hash = hashRouter.sliceHash(idString, null, null, null);
        }
        postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
        postingsEnum = BitsFilteredPostingsEnum.wrap(postingsEnum, liveDocs);
        for (; ; ) {
            int doc = postingsEnum.nextDoc();
            if (doc == DocIdSetIterator.NO_MORE_DOCS)
                break;
            if (ranges == null) {
                docSets[currPartition].set(doc);
                currPartition = (currPartition + 1) % numPieces;
            } else {
                int matchingRangesCount = 0;
                for (int i = 0; i < rangesArr.length; i++) {
                    // inner-loop: use array here for extra speed.
                    if (rangesArr[i].includes(hash)) {
                        docSets[i].set(doc);
                        ++matchingRangesCount;
                    }
                }
                docsMatchingRanges[matchingRangesCount]++;
            }
        }
    }
    if (docsMatchingRanges != null) {
        for (int ii = 0; ii < docsMatchingRanges.length; ii++) {
            if (0 == docsMatchingRanges[ii])
                continue;
            switch(ii) {
                case 0:
                    // document loss
                    log.error("Splitting {}: {} documents belong to no shards and will be dropped", reader, docsMatchingRanges[ii]);
                    break;
                case 1:
                    // normal case, each document moves to one of the sub-shards
                    log.info("Splitting {}: {} documents will move into a sub-shard", reader, docsMatchingRanges[ii]);
                    break;
                default:
                    // document duplication
                    log.error("Splitting {}: {} documents will be moved to multiple ({}) sub-shards", reader, docsMatchingRanges[ii], ii);
                    break;
            }
        }
    }
    return docSets;
}
Also used : LeafReader(org.apache.lucene.index.LeafReader) Terms(org.apache.lucene.index.Terms) TermsEnum(org.apache.lucene.index.TermsEnum) Fields(org.apache.lucene.index.Fields) FixedBitSet(org.apache.lucene.util.FixedBitSet) Bits(org.apache.lucene.util.Bits) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) BitsFilteredPostingsEnum(org.apache.solr.search.BitsFilteredPostingsEnum) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef)
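
When no hash ranges are supplied, the method above deals live documents out round-robin (the ranges == null branch); CharsRefBuilder only enters the picture to convert each indexed id term to a String for the route-key and hash checks. A compact sketch of just the round-robin branch, detached from Solr (class and method names are hypothetical):

import org.apache.lucene.util.FixedBitSet;

final class RoundRobinSplit {

    // Partition doc ids 0..maxDoc-1 into numPieces bit sets, one doc per
    // piece in turn, mirroring the ranges == null branch of split above.
    static FixedBitSet[] partition(int maxDoc, int numPieces) {
        final FixedBitSet[] docSets = new FixedBitSet[numPieces];
        for (int i = 0; i < numPieces; i++) {
            docSets[i] = new FixedBitSet(maxDoc);
        }
        int currPartition = 0;
        for (int doc = 0; doc < maxDoc; doc++) {
            docSets[currPartition].set(doc);
            currPartition = (currPartition + 1) % numPieces;
        }
        return docSets;
    }
}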

Example 13 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.

Class MoreLikeThis, method addTermFrequencies.

/**
   * Adds the terms and frequencies found in vector to the per-field map field2termFreqMap.
   *
   * @param field2termFreqMap a Map of terms and their frequencies per field
   * @param vector the terms and their frequencies for a doc/field
   * @param fieldName the field the vector belongs to
   */
private void addTermFrequencies(Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName) throws IOException {
    Map<String, Int> termFreqMap = field2termFreqMap.get(fieldName);
    if (termFreqMap == null) {
        termFreqMap = new HashMap<>();
        field2termFreqMap.put(fieldName, termFreqMap);
    }
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        spare.copyUTF8Bytes(text);
        final String term = spare.toString();
        if (isNoiseWord(term)) {
            continue;
        }
        final int freq = (int) termsEnum.totalTermFreq();
        // increment frequency
        Int cnt = termFreqMap.get(term);
        if (cnt == null) {
            cnt = new Int();
            termFreqMap.put(term, cnt);
            cnt.x = freq;
        } else {
            cnt.x += freq;
        }
    }
}
Also used : CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)
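
Int is MoreLikeThis's small mutable int wrapper, used so a count can be bumped in place instead of re-boxing an Integer on every increment. As a hedged driver sketch, the Terms vector passed in typically comes from a stored term vector; the class name TermFreqs below is hypothetical, and Map.merge stands in for the mutable-wrapper trick for brevity:

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

final class TermFreqs {

    // Count term frequencies from a stored term vector, reusing one
    // CharsRefBuilder to decode each term's bytes as in addTermFrequencies.
    static Map<String, Long> count(IndexReader reader, int docId, String field) throws IOException {
        final Map<String, Long> freqs = new HashMap<>();
        final Terms vector = reader.getTermVector(docId, field);
        if (vector == null) {
            return freqs; // no term vector stored for this doc/field
        }
        final TermsEnum termsEnum = vector.iterator();
        final CharsRefBuilder spare = new CharsRefBuilder();
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            spare.copyUTF8Bytes(text);
            freqs.merge(spare.toString(), termsEnum.totalTermFreq(), Long::sum);
        }
        return freqs;
    }
}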

Example 14 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.

Class FreeTextSuggester, method lookup.

/** Retrieve suggestions. */
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, int num) throws IOException {
    if (contexts != null) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    if (fst == null) {
        throw new IllegalStateException("Lookup not supported at this time");
    }
    try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
        TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        BytesRefBuilder[] lastTokens = new BytesRefBuilder[grams];
        //System.out.println("lookup: key='" + key + "'");
        // Run full analysis, but save only the
        // last 1gram, last 2gram, etc.:
        int maxEndOffset = -1;
        boolean sawRealToken = false;
        while (ts.incrementToken()) {
            BytesRef tokenBytes = termBytesAtt.getBytesRef();
            sawRealToken |= tokenBytes.length > 0;
            // TODO: this is somewhat iffy; today, ShingleFilter
            // sets posLen to the gram count; maybe we should make
            // a separate dedicated att for this?
            int gramCount = posLenAtt.getPositionLength();
            assert gramCount <= grams;
            // Safety: make sure the recalculated count "agrees":
            if (countGrams(tokenBytes) != gramCount) {
                throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
            }
            maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
            BytesRefBuilder b = new BytesRefBuilder();
            b.append(tokenBytes);
            lastTokens[gramCount - 1] = b;
        }
        ts.end();
        if (!sawRealToken) {
            throw new IllegalArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
        }
        // Carefully fill last tokens with _ tokens;
        // ShingleFilter apparently won't emit "only hole"
        // tokens:
        int endPosInc = posIncAtt.getPositionIncrement();
        // Note this will also be true if input is the empty
        // string (in which case we saw no tokens and
        // maxEndOffset is still -1), which in fact works out OK
        // because we fill the unigram with an empty BytesRef
        // below:
        boolean lastTokenEnded = offsetAtt.endOffset() > maxEndOffset || endPosInc > 0;
        if (lastTokenEnded) {
            // If the user hit space after the last token, then
            // "upgrade" all tokens.  This way "foo " will suggest
            // all bigrams starting w/ foo, and not any unigrams
            // starting with "foo":
            for (int i = grams - 1; i > 0; i--) {
                BytesRefBuilder token = lastTokens[i - 1];
                if (token == null) {
                    continue;
                }
                token.append(separator);
                lastTokens[i] = token;
            }
            lastTokens[0] = new BytesRefBuilder();
        }
        Arc<Long> arc = new Arc<>();
        BytesReader bytesReader = fst.getBytesReader();
        // Try highest order models first, and if they return
        // results, return that; else, fallback:
        double backoff = 1.0;
        List<LookupResult> results = new ArrayList<>(num);
        // We only add a given suffix once, from the highest
        // order model that saw it; for subsequent lower order
        // models we skip it:
        final Set<BytesRef> seen = new HashSet<>();
        for (int gram = grams - 1; gram >= 0; gram--) {
            BytesRefBuilder token = lastTokens[gram];
            // Don't make unigram predictions from empty string:
            if (token == null || (token.length() == 0 && key.length() > 0)) {
                //System.out.println("  gram=" + gram + ": skip: not enough input");
                continue;
            }
            if (endPosInc > 0 && gram <= endPosInc) {
                //System.out.println("  break: only holes now");
                break;
            }
            //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());
            // TODO: we could add fuzziness here
            // match the prefix portion exactly
            //Pair<Long,BytesRef> prefixOutput = null;
            Long prefixOutput = null;
            try {
                prefixOutput = lookupPrefix(fst, bytesReader, token.get(), arc);
            } catch (IOException bogus) {
                throw new RuntimeException(bogus);
            }
            if (prefixOutput == null) {
                // This model never saw this prefix, e.g. the
                // trigram model never saw context "purple mushroom"
                backoff *= ALPHA;
                continue;
            }
            // TODO: we could do this division at build time, and
            // bake it into the FST?
            // Denominator for computing scores from current
            // model's predictions:
            long contextCount = totTokens;
            BytesRef lastTokenFragment = null;
            for (int i = token.length() - 1; i >= 0; i--) {
                if (token.byteAt(i) == separator) {
                    BytesRef context = new BytesRef(token.bytes(), 0, i);
                    Long output = Util.get(fst, Util.toIntsRef(context, new IntsRefBuilder()));
                    assert output != null;
                    contextCount = decodeWeight(output);
                    lastTokenFragment = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
                    break;
                }
            }
            final BytesRefBuilder finalLastToken = new BytesRefBuilder();
            if (lastTokenFragment == null) {
                finalLastToken.copyBytes(token.get());
            } else {
                finalLastToken.copyBytes(lastTokenFragment);
            }
            CharsRefBuilder spare = new CharsRefBuilder();
            // complete top-N
            TopResults<Long> completions = null;
            try {
                // Because we store multiple models in one FST
                // (1gram, 2gram, 3gram), we must restrict the
                // search so that it only considers the current
                // model.  For highest order model, this is not
                // necessary since all completions in the FST
                // must be from this model, but for lower order
                // models we have to filter out the higher order
                // ones:
                // Must do num+seen.size() for queue depth because we may
                // reject up to seen.size() paths in acceptResult():
                Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num + seen.size(), weightComparator) {

                    BytesRefBuilder scratchBytes = new BytesRefBuilder();

                    @Override
                    protected void addIfCompetitive(Util.FSTPath<Long> path) {
                        if (path.arc.label != separator) {
                            //System.out.println("    keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
                            super.addIfCompetitive(path);
                        } else {
                        //System.out.println("    prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
                        }
                    }

                    @Override
                    protected boolean acceptResult(IntsRef input, Long output) {
                        Util.toBytesRef(input, scratchBytes);
                        finalLastToken.grow(finalLastToken.length() + scratchBytes.length());
                        int lenSav = finalLastToken.length();
                        finalLastToken.append(scratchBytes);
                        //System.out.println("    accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
                        boolean ret = seen.contains(finalLastToken.get()) == false;
                        finalLastToken.setLength(lenSav);
                        return ret;
                    }
                };
                // since this search is initialized with a single start node 
                // it is okay to start with an empty input path here
                searcher.addStartPaths(arc, prefixOutput, true, new IntsRefBuilder());
                completions = searcher.search();
                assert completions.isComplete;
            } catch (IOException bogus) {
                throw new RuntimeException(bogus);
            }
            int prefixLength = token.length();
            BytesRefBuilder suffix = new BytesRefBuilder();
            nextCompletion: for (Result<Long> completion : completions) {
                token.setLength(prefixLength);
                // append suffix
                Util.toBytesRef(completion.input, suffix);
                token.append(suffix);
                //System.out.println("    completion " + token.utf8ToString());
                // Skip this path if a higher-order model already
                // saw/predicted its last token:
                BytesRef lastToken = token.get();
                for (int i = token.length() - 1; i >= 0; i--) {
                    if (token.byteAt(i) == separator) {
                        assert token.length() - i - 1 > 0;
                        lastToken = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
                        break;
                    }
                }
                if (seen.contains(lastToken)) {
                    //System.out.println("      skip dup " + lastToken.utf8ToString());
                    continue nextCompletion;
                }
                seen.add(BytesRef.deepCopyOf(lastToken));
                spare.copyUTF8Bytes(token.get());
                LookupResult result = new LookupResult(spare.toString(), (long) (Long.MAX_VALUE * backoff * ((double) decodeWeight(completion.output)) / contextCount));
                results.add(result);
                assert results.size() == seen.size();
            //System.out.println("  add result=" + result);
            }
            backoff *= ALPHA;
        }
        Collections.sort(results, new Comparator<LookupResult>() {

            @Override
            public int compare(LookupResult a, LookupResult b) {
                if (a.value > b.value) {
                    return -1;
                } else if (a.value < b.value) {
                    return 1;
                } else {
                    // Tie break by UTF16 sort order:
                    return ((String) a.key).compareTo((String) b.key);
                }
            }
        });
        if (results.size() > num) {
            results.subList(num, results.size()).clear();
        }
        return results;
    }
}
Also used : PositionLengthAttribute(org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute) TokenStream(org.apache.lucene.analysis.TokenStream) ArrayList(java.util.ArrayList) Util(org.apache.lucene.util.fst.Util) CodecUtil(org.apache.lucene.codecs.CodecUtil) Result(org.apache.lucene.util.fst.Util.Result) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IOException(java.io.IOException) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) BytesReader(org.apache.lucene.util.fst.FST.BytesReader) Arc(org.apache.lucene.util.fst.FST.Arc) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute)
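
For orientation, here is a minimal, hedged usage sketch of FreeTextSuggester: it learns token n-grams from a corpus and completes the trailing token conditioned on the preceding ones. The corpus lines are made up, and PlainTextDictionary (from the spellchecker module) simply treats each line as one input:

import java.io.Reader;
import java.io.StringReader;
import java.util.List;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.spell.PlainTextDictionary;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.analyzing.FreeTextSuggester;

final class FreeTextDemo {

    public static void main(String[] args) throws Exception {
        // One "document" per line; the suggester builds 1/2/3-gram models from it.
        Reader corpus = new StringReader("purple mushroom\npurple rain\npurple haze\n");
        FreeTextSuggester suggester = new FreeTextSuggester(new StandardAnalyzer());
        suggester.build(new PlainTextDictionary(corpus));
        // Complete the trailing token, falling back to lower-order models as needed.
        List<LookupResult> hits = suggester.lookup("purple mu", false, 5);
        for (LookupResult hit : hits) {
            System.out.println(hit.key + " -> " + hit.value);
        }
    }
}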

Example 15 with CharsRefBuilder

Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by apache.

Class AnalyzingSuggester, method lookup.

@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
    assert num > 0;
    if (onlyMorePopular) {
        throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (contexts != null) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    if (fst == null) {
        return Collections.emptyList();
    }
    //System.out.println("lookup key=" + key + " num=" + num);
    for (int i = 0; i < key.length(); i++) {
        if (key.charAt(i) == 0x1E) {
            throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
        }
        if (key.charAt(i) == 0x1F) {
            throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
        }
    }
    final BytesRef utf8Key = new BytesRef(key);
    try {
        Automaton lookupAutomaton = toLookupAutomaton(key);
        final CharsRefBuilder spare = new CharsRefBuilder();
        //System.out.println("  now intersect exactFirst=" + exactFirst);
        // Intersect automaton w/ suggest wFST and get all
        // prefix starting nodes & their outputs:
        //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
        //System.out.println("  prefixPaths: " + prefixPaths.size());
        BytesReader bytesReader = fst.getBytesReader();
        FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();
        final List<LookupResult> results = new ArrayList<>();
        List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
        if (exactFirst) {
            int count = 0;
            for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
                if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    count++;
                }
            }
            // Searcher just to find the single exact only
            // match, if present:
            Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
            searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
            // NOTE: we could almost get away with only using
            // the first start node.  The only catch is if
            // maxSurfaceFormsPerAnalyzedForm had kicked in and
            // pruned our exact match from one of these nodes
            // ...:
            for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
                if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
                }
            }
            TopResults<Pair<Long, BytesRef>> completions = searcher.search();
            assert completions.isComplete;
            // NOTE: this is rather inefficient: we enumerate
            // every matching "exactly the same analyzed form"
            // path and then do a linear scan to see if one of
            // them exactly matches the input; the scan is
            // bounded by the number of prefix start nodes and
            // maxSurfaceFormsPerAnalyzedForm:
            for (Result<Pair<Long, BytesRef>> completion : completions) {
                BytesRef output2 = completion.output.output2;
                if (sameSurfaceForm(utf8Key, output2)) {
                    results.add(getLookupResult(completion.output.output1, output2, spare));
                    break;
                }
            }
            if (results.size() == num) {
                // That was quick:
                return results;
            }
        }
        Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
        searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {

            private final Set<BytesRef> seen = new HashSet<>();

            @Override
            protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
                // Dedup: when the analysis we do produces graphs we
                // can get duplicate surface forms:
                if (seen.contains(output.output2)) {
                    return false;
                }
                seen.add(output.output2);
                if (!exactFirst) {
                    return true;
                } else {
                    // In exactFirst mode, don't accept any paths
                    // matching the surface form since that will
                    // create duplicate results:
                    if (sameSurfaceForm(utf8Key, output.output2)) {
                        // We found an exact match, which means we should
                        // have already found it in the first search:
                        assert results.size() == 1;
                        return false;
                    } else {
                        return true;
                    }
                }
            }
        };
        prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
        for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
            searcher.addStartPaths(path.fstNode, path.output, true, path.input);
        }
        TopResults<Pair<Long, BytesRef>> completions = searcher.search();
        assert completions.isComplete;
        for (Result<Pair<Long, BytesRef>> completion : completions) {
            LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
            // TODO: for fuzzy case would be nice to return
            // how many edits were required
            //System.out.println("    result=" + result);
            results.add(result);
            if (results.size() == num) {
                // In the exactFirst=true case the search may
                // produce one extra path
                break;
            }
        }
        return results;
    } catch (IOException bogus) {
        throw new RuntimeException(bogus);
    }
}
Also used : ArrayList(java.util.ArrayList) Util(org.apache.lucene.util.fst.Util) CodecUtil(org.apache.lucene.codecs.CodecUtil) ArrayUtil(org.apache.lucene.util.ArrayUtil) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) Pair(org.apache.lucene.util.fst.PairOutputs.Pair) HashSet(java.util.HashSet) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton) Automaton(org.apache.lucene.util.automaton.Automaton) FST(org.apache.lucene.util.fst.FST) IOException(java.io.IOException) BytesReader(org.apache.lucene.util.fst.FST.BytesReader)
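
getLookupResult (not shown above) is where the CharsRefBuilder passed around as spare does its work: it decodes each surface-form BytesRef carried in the FST output into chars, reusing the same buffer for every result. The following is a hedged reconstruction of the no-payloads case, not the verbatim private method:

import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

final class SurfaceForms {

    // Decode the surface form into the reused builder, then materialize the
    // String for the result; the weight is assumed to be decoded already.
    static LookupResult toLookupResult(long weight, BytesRef surfaceForm, CharsRefBuilder spare) {
        spare.copyUTF8Bytes(surfaceForm);
        return new LookupResult(spare.toString(), weight);
    }
}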

Aggregations

CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 52
BytesRef (org.apache.lucene.util.BytesRef): 30
ArrayList (java.util.ArrayList): 11
IOException (java.io.IOException): 10
NamedList (org.apache.solr.common.util.NamedList): 10
FieldType (org.apache.solr.schema.FieldType): 10
TermsEnum (org.apache.lucene.index.TermsEnum): 9
SchemaField (org.apache.solr.schema.SchemaField): 7
BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 6
HashSet (java.util.HashSet): 5
Test (org.junit.Test): 5
TokenStream (org.apache.lucene.analysis.TokenStream): 4
PostingsEnum (org.apache.lucene.index.PostingsEnum): 4
Terms (org.apache.lucene.index.Terms): 4
SimpleOrderedMap (org.apache.solr.common.util.SimpleOrderedMap): 4
LeafReader (org.apache.lucene.index.LeafReader): 3
LeafReaderContext (org.apache.lucene.index.LeafReaderContext): 3
CharsRef (org.apache.lucene.util.CharsRef): 3
Util (org.apache.lucene.util.fst.Util): 3
SolrException (org.apache.solr.common.SolrException): 3