Search in sources :

Example 1 with BoundedTreeSet

Use of org.apache.solr.util.BoundedTreeSet in the lucene-solr project by Apache.

The process method of the class TermsComponent.

/**
 * Builds the "terms" section of the response for the current request.
 *
 * <p>Nothing is done unless the {@code TermsParams.TERMS} parameter is exactly {@code "true"}.
 * Otherwise a "terms" entry is always added to the response (it stays empty when no
 * {@code TermsParams.TERMS_FIELD} is given), and an "indexstats" entry is added when
 * {@code TermsParams.TERMS_STATS} is set.
 *
 * <p>If {@code TermsParams.TERMS_LIST} is present the work is delegated to {@code fetchTerms}.
 * Otherwise, for every requested field, the field's terms are enumerated starting at the lower
 * bound (or the prefix when no lower bound is given) and filtered by prefix, upper bound,
 * regular expression and min/max document frequency. At most {@code limit} terms are reported
 * per field (default 10; a negative limit means {@code Integer.MAX_VALUE}).
 *
 * @param rb the response builder providing the request (params, searcher, schema) and the
 *           response the results are added to
 * @throws IOException declared for the index-access calls made while reading terms
 */
@Override
public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.get(TermsParams.TERMS, "false").equals("true")) {
        return;
    }
    String[] fields = params.getParams(TermsParams.TERMS_FIELD);
    NamedList<Object> termsResult = new SimpleOrderedMap<>();
    rb.rsp.add("terms", termsResult);
    if (fields == null || fields.length == 0) {
        return;
    }
    boolean termStats = params.getBool(TermsParams.TERMS_STATS, false);
    if (termStats) {
        NamedList<Number> stats = new SimpleOrderedMap<>();
        rb.rsp.add("indexstats", stats);
        collectStats(rb.req.getSearcher(), stats);
    }
    String termList = params.get(TermsParams.TERMS_LIST);
    if (termList != null) {
        // An explicit list of terms was requested: look those up instead of enumerating a range.
        boolean includeTotalTermFreq = params.getBool(TermsParams.TERMS_TTF, false);
        fetchTerms(rb.req.getSearcher(), fields, termList, includeTotalTermFreq, termsResult);
        return;
    }
    int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
    if (limit < 0) {
        limit = Integer.MAX_VALUE;
    }
    String lowerStr = params.get(TermsParams.TERMS_LOWER);
    String upperStr = params.get(TermsParams.TERMS_UPPER);
    boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
    boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
    // "sort" is true for every sort value other than index order (the default is count).
    boolean sort = !TermsParams.TERMS_SORT_INDEX.equals(params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
    int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1);
    int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT);
    if (freqmax < 0) {
        freqmax = Integer.MAX_VALUE;
    }
    String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
    String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
    Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;
    boolean raw = params.getBool(TermsParams.TERMS_RAW, false);
    final LeafReader indexReader = rb.req.getSearcher().getSlowAtomicReader();
    Fields lfields = indexReader.fields();
    for (String field : fields) {
        // Every requested field gets an entry in the response, even if it has no terms.
        NamedList<Integer> fieldTerms = new NamedList<>();
        termsResult.add(field, fieldTerms);
        Terms terms = lfields.terms(field);
        if (terms == null) {
            // field does not exist
            continue;
        }
        // In raw mode, or when the schema does not know the field, fall back to StrField.
        FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
        if (ft == null) {
            ft = new StrField();
        }
        // prefix must currently be text
        BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix);
        BytesRef upperBytes = null;
        if (upperStr != null) {
            BytesRefBuilder b = new BytesRefBuilder();
            ft.readableToIndexed(upperStr, b);
            upperBytes = b.get();
        }
        BytesRef lowerBytes;
        if (lowerStr == null) {
            // If no lower bound was specified, use the prefix
            lowerBytes = prefixBytes;
        } else if (raw) {
            // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists
            // perhaps we detect if the FieldType is non-character and expect hex if so?
            lowerBytes = new BytesRef(lowerStr);
        } else {
            BytesRefBuilder b = new BytesRefBuilder();
            ft.readableToIndexed(lowerStr, b);
            lowerBytes = b.get();
        }
        TermsEnum termsEnum = terms.iterator();
        BytesRef term = null;
        if (lowerBytes != null) {
            if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) {
                // No term at or after the lower bound; "term" stays null so the loop is skipped.
                termsEnum = null;
            } else {
                term = termsEnum.term();
                // Only advance the enum if we are excluding the lower bound and the lower term actually matches
                if (!lowerIncl && term.equals(lowerBytes)) {
                    term = termsEnum.next();
                }
            }
        } else {
            // position termsEnum on first term
            term = termsEnum.next();
        }
        int i = 0;
        // When sorting, candidates are collected in a set constructed with the limit and emitted
        // afterwards; otherwise terms are emitted directly in enumeration order.
        BoundedTreeSet<CountPair<BytesRef, Integer>> queue = (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null);
        CharsRefBuilder external = new CharsRefBuilder();
        while (term != null && (i < limit || sort)) {
            // did we fill in "external" yet for this term?
            boolean externalized = false;
            // stop if the prefix doesn't match
            if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes)) {
                break;
            }
            // Check the upper bound before the (more expensive) regex so that enumeration stops
            // at the bound instead of continuing through later terms that fail the pattern.
            if (upperBytes != null) {
                int upperCmp = term.compareTo(upperBytes);
                // if we are past the upper term, or equal to it (when don't include upper) then stop.
                if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) {
                    break;
                }
            }
            if (pattern != null) {
                // indexed text or external text?
                // TODO: support "raw" mode?
                ft.indexedToReadable(term, external);
                externalized = true;
                if (!pattern.matcher(external.get()).matches()) {
                    term = termsEnum.next();
                    continue;
                }
            }
            // This is a good term in the range.  Check if mincount/maxcount conditions are satisfied.
            int docFreq = termsEnum.docFreq();
            if (docFreq >= freqmin && docFreq <= freqmax) {
                // add the term to the list
                if (sort) {
                    // Deep copy: the "term" reference is replaced by the next call to termsEnum.next().
                    queue.add(new CountPair<>(BytesRef.deepCopyOf(term), docFreq));
                } else {
                    // TODO: handle raw somehow
                    if (!externalized) {
                        ft.indexedToReadable(term, external);
                    }
                    fieldTerms.add(external.toString(), docFreq);
                    i++;
                }
            }
            term = termsEnum.next();
        }
        if (sort) {
            // Emit the collected terms in the set's iteration order, up to the limit.
            for (CountPair<BytesRef, Integer> item : queue) {
                if (i >= limit) {
                    break;
                }
                ft.indexedToReadable(item.key, external);
                fieldTerms.add(external.toString(), item.val);
                i++;
            }
        }
    }
}
Also used : StrField(org.apache.solr.schema.StrField) BoundedTreeSet(org.apache.solr.util.BoundedTreeSet) SimpleOrderedMap(org.apache.solr.common.util.SimpleOrderedMap) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) BytesRef(org.apache.lucene.util.BytesRef) Pattern(java.util.regex.Pattern) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) NamedList(org.apache.solr.common.util.NamedList) CountPair(org.apache.solr.request.SimpleFacets.CountPair) FieldType(org.apache.solr.schema.FieldType)

Aggregations

Pattern (java.util.regex.Pattern)1 BytesRef (org.apache.lucene.util.BytesRef)1 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)1 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)1 NamedList (org.apache.solr.common.util.NamedList)1 SimpleOrderedMap (org.apache.solr.common.util.SimpleOrderedMap)1 CountPair (org.apache.solr.request.SimpleFacets.CountPair)1 FieldType (org.apache.solr.schema.FieldType)1 StrField (org.apache.solr.schema.StrField)1 BoundedTreeSet (org.apache.solr.util.BoundedTreeSet)1