Search in sources :

Example 26 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

the class TFValueSource method getValues.

@Override
public FunctionValues getValues(Map context, LeafReaderContext readerContext) throws IOException {
    Fields fields = readerContext.reader().fields();
    final Terms terms = fields.terms(indexedField);
    IndexSearcher searcher = (IndexSearcher) context.get("searcher");
    final TFIDFSimilarity similarity = IDFValueSource.asTFIDF(searcher.getSimilarity(true), indexedField);
    if (similarity == null) {
        throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as ClassicSimilarity)");
    }
    return new FloatDocValues(this) {

        PostingsEnum docs;

        int atDoc;

        int lastDocRequested = -1;

        {
            reset();
        }

        public void reset() throws IOException {
            if (terms != null) {
                final TermsEnum termsEnum = terms.iterator();
                if (termsEnum.seekExact(indexedBytes)) {
                    docs = termsEnum.postings(null);
                } else {
                    docs = null;
                }
            } else {
                docs = null;
            }
            if (docs == null) {
                docs = new PostingsEnum() {

                    @Override
                    public int freq() {
                        return 0;
                    }

                    @Override
                    public int nextPosition() throws IOException {
                        return -1;
                    }

                    @Override
                    public int startOffset() throws IOException {
                        return -1;
                    }

                    @Override
                    public int endOffset() throws IOException {
                        return -1;
                    }

                    @Override
                    public BytesRef getPayload() throws IOException {
                        return null;
                    }

                    @Override
                    public int docID() {
                        return DocIdSetIterator.NO_MORE_DOCS;
                    }

                    @Override
                    public int nextDoc() {
                        return DocIdSetIterator.NO_MORE_DOCS;
                    }

                    @Override
                    public int advance(int target) {
                        return DocIdSetIterator.NO_MORE_DOCS;
                    }

                    @Override
                    public long cost() {
                        return 0;
                    }
                };
            }
            atDoc = -1;
        }

        @Override
        public float floatVal(int doc) {
            try {
                if (doc < lastDocRequested) {
                    // out-of-order access.... reset
                    reset();
                }
                lastDocRequested = doc;
                if (atDoc < doc) {
                    atDoc = docs.advance(doc);
                }
                if (atDoc > doc) {
                    // end, or because the next doc is after this doc.
                    return similarity.tf(0);
                }
                // a match!
                return similarity.tf(docs.freq());
            } catch (IOException e) {
                throw new RuntimeException("caught exception in function " + description() + " : doc=" + doc, e);
            }
        }
    };
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Terms(org.apache.lucene.index.Terms) IOException(java.io.IOException) TermsEnum(org.apache.lucene.index.TermsEnum) Fields(org.apache.lucene.index.Fields) FloatDocValues(org.apache.lucene.queries.function.docvalues.FloatDocValues) TFIDFSimilarity(org.apache.lucene.search.similarities.TFIDFSimilarity) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef)

Example 27 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

the class TermFreqValueSource method getValues.

@Override
public FunctionValues getValues(Map context, LeafReaderContext readerContext) throws IOException {
    Fields fields = readerContext.reader().fields();
    final Terms terms = fields.terms(indexedField);
    return new IntDocValues(this) {

        PostingsEnum docs;

        int atDoc;

        int lastDocRequested = -1;

        {
            reset();
        }

        public void reset() throws IOException {
            if (terms != null) {
                final TermsEnum termsEnum = terms.iterator();
                if (termsEnum.seekExact(indexedBytes)) {
                    docs = termsEnum.postings(null);
                } else {
                    docs = null;
                }
            } else {
                docs = null;
            }
            if (docs == null) {
                docs = new PostingsEnum() {

                    @Override
                    public int freq() {
                        return 0;
                    }

                    @Override
                    public int nextPosition() throws IOException {
                        return -1;
                    }

                    @Override
                    public int startOffset() throws IOException {
                        return -1;
                    }

                    @Override
                    public int endOffset() throws IOException {
                        return -1;
                    }

                    @Override
                    public BytesRef getPayload() throws IOException {
                        throw new UnsupportedOperationException();
                    }

                    @Override
                    public int docID() {
                        return DocIdSetIterator.NO_MORE_DOCS;
                    }

                    @Override
                    public int nextDoc() {
                        return DocIdSetIterator.NO_MORE_DOCS;
                    }

                    @Override
                    public int advance(int target) {
                        return DocIdSetIterator.NO_MORE_DOCS;
                    }

                    @Override
                    public long cost() {
                        return 0;
                    }
                };
            }
            atDoc = -1;
        }

        @Override
        public int intVal(int doc) {
            try {
                if (doc < lastDocRequested) {
                    // out-of-order access.... reset
                    reset();
                }
                lastDocRequested = doc;
                if (atDoc < doc) {
                    atDoc = docs.advance(doc);
                }
                if (atDoc > doc) {
                    // end, or because the next doc is after this doc.
                    return 0;
                }
                // a match!
                return docs.freq();
            } catch (IOException e) {
                throw new RuntimeException("caught exception in function " + description() + " : doc=" + doc, e);
            }
        }
    };
}
Also used : Fields(org.apache.lucene.index.Fields) Terms(org.apache.lucene.index.Terms) IOException(java.io.IOException) IntDocValues(org.apache.lucene.queries.function.docvalues.IntDocValues) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 28 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

the class FieldOptions method mapOneVector.

private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID, TermsEnum termsEnum, String field) throws IOException {
    NamedList<Object> fieldNL = new NamedList<>();
    docNL.add(field, fieldNL);
    BytesRef text;
    PostingsEnum dpEnum = null;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        NamedList<Object> termInfo = new NamedList<>();
        fieldNL.add(term, termInfo);
        final int freq = (int) termsEnum.totalTermFreq();
        if (fieldOptions.termFreq == true) {
            termInfo.add("tf", freq);
        }
        int dpEnumFlags = 0;
        dpEnumFlags |= fieldOptions.positions ? PostingsEnum.POSITIONS : 0;
        //payloads require offsets
        dpEnumFlags |= (fieldOptions.offsets || fieldOptions.payloads) ? PostingsEnum.OFFSETS : 0;
        dpEnumFlags |= fieldOptions.payloads ? PostingsEnum.PAYLOADS : 0;
        dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
        boolean atNextDoc = false;
        if (dpEnum != null) {
            dpEnum.nextDoc();
            atNextDoc = true;
        }
        if (atNextDoc && dpEnumFlags != 0) {
            NamedList<Integer> positionsNL = null;
            NamedList<Number> theOffsets = null;
            NamedList<String> thePayloads = null;
            for (int i = 0; i < freq; i++) {
                final int pos = dpEnum.nextPosition();
                if (fieldOptions.positions && pos >= 0) {
                    if (positionsNL == null) {
                        positionsNL = new NamedList<>();
                        termInfo.add("positions", positionsNL);
                    }
                    positionsNL.add("position", pos);
                }
                int startOffset = fieldOptions.offsets ? dpEnum.startOffset() : -1;
                if (startOffset >= 0) {
                    if (theOffsets == null) {
                        theOffsets = new NamedList<>();
                        termInfo.add("offsets", theOffsets);
                    }
                    theOffsets.add("start", dpEnum.startOffset());
                    theOffsets.add("end", dpEnum.endOffset());
                }
                BytesRef payload = fieldOptions.payloads ? dpEnum.getPayload() : null;
                if (payload != null) {
                    if (thePayloads == null) {
                        thePayloads = new NamedList<>();
                        termInfo.add("payloads", thePayloads);
                    }
                    thePayloads.add("payload", Base64.byteArrayToBase64(payload.bytes, payload.offset, payload.length));
                }
            }
        }
        int df = 0;
        if (fieldOptions.docFreq || fieldOptions.tfIdf) {
            df = reader.docFreq(new Term(field, text));
        }
        if (fieldOptions.docFreq) {
            termInfo.add("df", df);
        }
        // TODO: this is not TF/IDF by anyone's definition!
        if (fieldOptions.tfIdf) {
            double tfIdfVal = ((double) freq) / df;
            termInfo.add("tf-idf", tfIdfVal);
        }
    }
}
Also used : NamedList(org.apache.solr.common.util.NamedList) Term(org.apache.lucene.index.Term) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef)

Example 29 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

the class SimpleFacets method getFacetTermEnumCounts.

/**
   * Returns a list of terms in the specified field along with the 
   * corresponding count of documents in the set that match that constraint.
   * This method uses the FilterCache to get the intersection count between <code>docs</code>
   * and the DocSet for each term in the filter.
   *
   * @see FacetParams#FACET_LIMIT
   * @see FacetParams#FACET_ZEROS
   * @see FacetParams#FACET_MISSING
   */
public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate<BytesRef> termFilter, boolean intersectsCheck) throws IOException {
    /* :TODO: potential optimization...
    * cache the Terms with the highest docFreq and try them first
    * don't enum if we get our max from them
    */
    // Minimum term docFreq in order to use the filterCache for that term.
    int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);
    // make sure we have a set that is fast for random access, if we will use it for that
    DocSet fastForRandomSet = docs;
    if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) {
        SortedIntDocSet sset = (SortedIntDocSet) docs;
        fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
    }
    IndexSchema schema = searcher.getSchema();
    FieldType ft = schema.getFieldType(field);
    assert !ft.isPointField() : "Point Fields don't support enum method";
    LeafReader r = searcher.getSlowAtomicReader();
    boolean sortByCount = sort.equals("count") || sort.equals("true");
    final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
    final BoundedTreeSet<CountPair<BytesRef, Integer>> queue = sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null;
    final NamedList<Integer> res = new NamedList<>();
    // the smallest value in the top 'N' values    
    int min = mincount - 1;
    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;
    BytesRef prefixTermBytes = null;
    if (prefix != null) {
        String indexedPrefix = ft.toInternal(prefix);
        prefixTermBytes = new BytesRef(indexedPrefix);
    }
    Fields fields = r.fields();
    Terms terms = fields == null ? null : fields.terms(field);
    TermsEnum termsEnum = null;
    SolrIndexSearcher.DocsEnumState deState = null;
    BytesRef term = null;
    if (terms != null) {
        termsEnum = terms.iterator();
        if (prefixTermBytes != null) {
            if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) {
                termsEnum = null;
            } else {
                term = termsEnum.term();
            }
        } else {
            // position termsEnum on first term
            term = termsEnum.next();
        }
    }
    PostingsEnum postingsEnum = null;
    CharsRefBuilder charsRef = new CharsRefBuilder();
    if (docs.size() >= mincount) {
        while (term != null) {
            if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes))
                break;
            if (termFilter == null || termFilter.test(term)) {
                int df = termsEnum.docFreq();
                // make a large difference (for example, many terms with df=1).
                if (df > 0 && df > min) {
                    int c;
                    if (df >= minDfFilterCache) {
                        if (deState == null) {
                            deState = new SolrIndexSearcher.DocsEnumState();
                            deState.fieldName = field;
                            deState.liveDocs = r.getLiveDocs();
                            deState.termsEnum = termsEnum;
                            deState.postingsEnum = postingsEnum;
                        }
                        if (intersectsCheck) {
                            c = searcher.intersects(docs, deState) ? 1 : 0;
                        } else {
                            c = searcher.numDocs(docs, deState);
                        }
                        postingsEnum = deState.postingsEnum;
                    } else {
                        // iterate over TermDocs to calculate the intersection
                        // TODO: specialize when base docset is a bitset or hash set (skipDocs)?  or does it matter for this?
                        // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class impl)
                        // TODO: would passing deleted docs lead to better efficiency over checking the fastForRandomSet?
                        postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
                        c = 0;
                        if (postingsEnum instanceof MultiPostingsEnum) {
                            MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs();
                            int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
                            SEGMENTS_LOOP: for (int subindex = 0; subindex < numSubs; subindex++) {
                                MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
                                if (sub.postingsEnum == null)
                                    continue;
                                int base = sub.slice.start;
                                int docid;
                                while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                                    if (fastForRandomSet.exists(docid + base)) {
                                        c++;
                                        if (intersectsCheck) {
                                            assert c == 1;
                                            break SEGMENTS_LOOP;
                                        }
                                    }
                                }
                            }
                        } else {
                            int docid;
                            while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                                if (fastForRandomSet.exists(docid)) {
                                    c++;
                                    if (intersectsCheck) {
                                        assert c == 1;
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    if (sortByCount) {
                        if (c > min) {
                            BytesRef termCopy = BytesRef.deepCopyOf(term);
                            queue.add(new CountPair<>(termCopy, c));
                            if (queue.size() >= maxsize)
                                min = queue.last().val;
                        }
                    } else {
                        if (c >= mincount && --off < 0) {
                            if (--lim < 0)
                                break;
                            ft.indexedToReadable(term, charsRef);
                            res.add(charsRef.toString(), c);
                        }
                    }
                }
            }
            term = termsEnum.next();
        }
    }
    if (sortByCount) {
        for (CountPair<BytesRef, Integer> p : queue) {
            if (--off >= 0)
                continue;
            if (--lim < 0)
                break;
            ft.indexedToReadable(p.key, charsRef);
            res.add(charsRef.toString(), p.val);
        }
    }
    if (missing) {
        res.add(null, getFieldMissingCount(searcher, docs, field));
    }
    return res;
}
Also used : SortedIntDocSet(org.apache.solr.search.SortedIntDocSet) HashDocSet(org.apache.solr.search.HashDocSet) TermsEnum(org.apache.lucene.index.TermsEnum) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) MultiPostingsEnum(org.apache.lucene.index.MultiPostingsEnum) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) LeafReader(org.apache.lucene.index.LeafReader) NamedList(org.apache.solr.common.util.NamedList) Terms(org.apache.lucene.index.Terms) SolrIndexSearcher(org.apache.solr.search.SolrIndexSearcher) FieldType(org.apache.solr.schema.FieldType) Fields(org.apache.lucene.index.Fields) MultiPostingsEnum(org.apache.lucene.index.MultiPostingsEnum) IndexSchema(org.apache.solr.schema.IndexSchema) HashDocSet(org.apache.solr.search.HashDocSet) DocSet(org.apache.solr.search.DocSet) SortedIntDocSet(org.apache.solr.search.SortedIntDocSet) BitDocSet(org.apache.solr.search.BitDocSet)

Example 30 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

the class SolrIndexSearcher method getDocSet.

/** @lucene.internal */
public DocSet getDocSet(DocsEnumState deState) throws IOException {
    int largestPossible = deState.termsEnum.docFreq();
    boolean useCache = filterCache != null && largestPossible >= deState.minSetSizeCached;
    TermQuery key = null;
    if (useCache) {
        key = new TermQuery(new Term(deState.fieldName, deState.termsEnum.term()));
        DocSet result = filterCache.get(key);
        if (result != null)
            return result;
    }
    int smallSetSize = DocSetUtil.smallSetSize(maxDoc());
    int scratchSize = Math.min(smallSetSize, largestPossible);
    if (deState.scratch == null || deState.scratch.length < scratchSize)
        deState.scratch = new int[scratchSize];
    final int[] docs = deState.scratch;
    int upto = 0;
    int bitsSet = 0;
    FixedBitSet fbs = null;
    PostingsEnum postingsEnum = deState.termsEnum.postings(deState.postingsEnum, PostingsEnum.NONE);
    postingsEnum = BitsFilteredPostingsEnum.wrap(postingsEnum, deState.liveDocs);
    if (deState.postingsEnum == null) {
        deState.postingsEnum = postingsEnum;
    }
    if (postingsEnum instanceof MultiPostingsEnum) {
        MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs();
        int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
        for (int subindex = 0; subindex < numSubs; subindex++) {
            MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
            if (sub.postingsEnum == null)
                continue;
            int base = sub.slice.start;
            int docid;
            if (largestPossible > docs.length) {
                if (fbs == null)
                    fbs = new FixedBitSet(maxDoc());
                while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    fbs.set(docid + base);
                    bitsSet++;
                }
            } else {
                while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    docs[upto++] = docid + base;
                }
            }
        }
    } else {
        int docid;
        if (largestPossible > docs.length) {
            fbs = new FixedBitSet(maxDoc());
            while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                fbs.set(docid);
                bitsSet++;
            }
        } else {
            while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                docs[upto++] = docid;
            }
        }
    }
    DocSet result;
    if (fbs != null) {
        for (int i = 0; i < upto; i++) {
            fbs.set(docs[i]);
        }
        bitsSet += upto;
        result = new BitDocSet(fbs, bitsSet);
    } else {
        result = upto == 0 ? DocSet.EMPTY : new SortedIntDocSet(Arrays.copyOf(docs, upto));
    }
    if (useCache) {
        filterCache.put(key, result);
    }
    return result;
}
Also used : Term(org.apache.lucene.index.Term) IndexFingerprint(org.apache.solr.update.IndexFingerprint) MultiPostingsEnum(org.apache.lucene.index.MultiPostingsEnum) FixedBitSet(org.apache.lucene.util.FixedBitSet) MultiPostingsEnum(org.apache.lucene.index.MultiPostingsEnum) PostingsEnum(org.apache.lucene.index.PostingsEnum)

Aggregations

PostingsEnum (org.apache.lucene.index.PostingsEnum)73 BytesRef (org.apache.lucene.util.BytesRef)55 TermsEnum (org.apache.lucene.index.TermsEnum)50 Terms (org.apache.lucene.index.Terms)42 Fields (org.apache.lucene.index.Fields)18 LeafReader (org.apache.lucene.index.LeafReader)17 Term (org.apache.lucene.index.Term)16 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)15 Document (org.apache.lucene.document.Document)13 ArrayList (java.util.ArrayList)10 IndexReader (org.apache.lucene.index.IndexReader)10 TextField (org.apache.lucene.document.TextField)9 Directory (org.apache.lucene.store.Directory)9 Bits (org.apache.lucene.util.Bits)9 IOException (java.io.IOException)8 DirectoryReader (org.apache.lucene.index.DirectoryReader)7 IndexWriter (org.apache.lucene.index.IndexWriter)6 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)6 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)5 XContentBuilder (org.elasticsearch.common.xcontent.XContentBuilder)5