Use of org.apache.lucene.index.SortedDocValues in project lucene-solr by Apache.
Class DocValuesConsumer, method mergeSortedField.
/**
 * Merges the sorted docvalues from <code>toMerge</code>.
 * <p>
 * The default implementation calls {@link #addSortedField}, passing
 * an Iterable that merges ordinals and values and filters deleted documents.
 */
public void mergeSortedField(FieldInfo fieldInfo, final MergeState mergeState) throws IOException {
  List<SortedDocValues> toMerge = new ArrayList<>();
  for (int i = 0; i < mergeState.docValuesProducers.length; i++) {
    SortedDocValues values = null;
    DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i];
    if (docValuesProducer != null) {
      FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name);
      if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) {
        values = docValuesProducer.getSorted(readerFieldInfo);
      }
    }
    if (values == null) {
      values = DocValues.emptySorted();
    }
    toMerge.add(values);
  }
  final int numReaders = toMerge.size();
  final SortedDocValues[] dvs = toMerge.toArray(new SortedDocValues[numReaders]);

  // step 1: iterate through each sub and mark terms still in use
  TermsEnum[] liveTerms = new TermsEnum[dvs.length];
  long[] weights = new long[liveTerms.length];
  for (int sub = 0; sub < numReaders; sub++) {
    SortedDocValues dv = dvs[sub];
    Bits liveDocs = mergeState.liveDocs[sub];
    if (liveDocs == null) {
      liveTerms[sub] = dv.termsEnum();
      weights[sub] = dv.getValueCount();
    } else {
      LongBitSet bitset = new LongBitSet(dv.getValueCount());
      int docID;
      while ((docID = dv.nextDoc()) != NO_MORE_DOCS) {
        if (liveDocs.get(docID)) {
          int ord = dv.ordValue();
          if (ord >= 0) {
            bitset.set(ord);
          }
        }
      }
      liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
      weights[sub] = bitset.cardinality();
    }
  }

  // step 2: create ordinal map (this conceptually does the "merging")
  final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT);

  // step 3: add field
  addSortedField(fieldInfo, new EmptyDocValuesProducer() {
    @Override
    public SortedDocValues getSorted(FieldInfo fieldInfoIn) throws IOException {
      if (fieldInfoIn != fieldInfo) {
        throw new IllegalArgumentException("wrong FieldInfo");
      }
      // We must make new iterators + DocIDMerger for each iterator:
      List<SortedDocValuesSub> subs = new ArrayList<>();
      long cost = 0;
      for (int i = 0; i < mergeState.docValuesProducers.length; i++) {
        SortedDocValues values = null;
        DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i];
        if (docValuesProducer != null) {
          FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name);
          if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) {
            values = docValuesProducer.getSorted(readerFieldInfo);
          }
        }
        if (values == null) {
          values = DocValues.emptySorted();
        }
        cost += values.cost();
        subs.add(new SortedDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i)));
      }
      final long finalCost = cost;
      final DocIDMerger<SortedDocValuesSub> docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);
      return new SortedDocValues() {
        private int docID = -1;
        private int ord;

        @Override
        public int docID() {
          return docID;
        }

        @Override
        public int nextDoc() throws IOException {
          SortedDocValuesSub sub = docIDMerger.next();
          if (sub == null) {
            return docID = NO_MORE_DOCS;
          }
          int subOrd = sub.values.ordValue();
          assert subOrd != -1;
          ord = (int) sub.map.get(subOrd);
          docID = sub.mappedDocID;
          return docID;
        }

        @Override
        public int ordValue() {
          return ord;
        }

        @Override
        public int advance(int target) {
          throw new UnsupportedOperationException();
        }

        @Override
        public boolean advanceExact(int target) throws IOException {
          throw new UnsupportedOperationException();
        }

        @Override
        public long cost() {
          return finalCost;
        }

        @Override
        public int getValueCount() {
          return (int) map.getValueCount();
        }

        @Override
        public BytesRef lookupOrd(int ord) throws IOException {
          int segmentNumber = map.getFirstSegmentNumber(ord);
          int segmentOrd = (int) map.getFirstSegmentOrd(ord);
          return dvs[segmentNumber].lookupOrd(segmentOrd);
        }
      };
    }
  });
}
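To make step 2 concrete: OrdinalMap conceptually merges the per-segment sorted term dictionaries into one global sorted dictionary, and getGlobalOrds(i) is the segment-ordinal to global-ordinal mapping for segment i. Below is a minimal, self-contained plain-Java sketch of that idea (no Lucene types; all names are illustrative, not Lucene's API):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.TreeSet;

public class OrdinalMergeSketch {
  public static void main(String[] args) {
    // Per-segment sorted term dictionaries; a term's index is its segment ordinal.
    String[][] segments = {
        {"apple", "banana", "cherry"},  // segment 0
        {"banana", "date"}              // segment 1
    };
    // The merged "global" dictionary: the sorted union of all per-segment terms.
    TreeSet<String> merged = new TreeSet<>();
    for (String[] seg : segments) {
      merged.addAll(Arrays.asList(seg));
    }
    List<String> globalTerms = new ArrayList<>(merged);
    // For each segment, derive segment ordinal -> global ordinal
    // (the role OrdinalMap#getGlobalOrds(i) plays in the merge above).
    for (int seg = 0; seg < segments.length; seg++) {
      int[] toGlobal = new int[segments[seg].length];
      for (int ord = 0; ord < toGlobal.length; ord++) {
        toGlobal[ord] = Collections.binarySearch(globalTerms, segments[seg][ord]);
      }
      System.out.println("segment " + seg + " -> " + Arrays.toString(toGlobal));
    }
    // prints: segment 0 -> [0, 1, 2]
    //         segment 1 -> [1, 3]
  }
}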
Use of org.apache.lucene.index.SortedDocValues in project lucene-solr by Apache.
Class DatasetSplitter, method split.
/**
 * Split a given index into three indexes for training, test and cross validation tasks respectively
 *
 * @param originalIndex an {@link org.apache.lucene.index.IndexReader} on the source index
 * @param trainingIndex a {@link Directory} used to write the training index
 * @param testIndex a {@link Directory} used to write the test index
 * @param crossValidationIndex a {@link Directory} used to write the cross validation index
 * @param analyzer {@link Analyzer} used to create the new docs
 * @param termVectors {@code true} if term vectors should be kept
 * @param classFieldName name of the field used as the label for classification; this must be indexed with sorted doc values
 * @param fieldNames names of fields that need to be put in the new indexes or <code>null</code> if all should be used
 * @throws IOException if any writing operation fails on any of the indexes
 */
public void split(IndexReader originalIndex, Directory trainingIndex, Directory testIndex, Directory crossValidationIndex, Analyzer analyzer, boolean termVectors, String classFieldName, String... fieldNames) throws IOException {
  // create IWs for train / test / cv IDXs
  IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(analyzer));
  IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(analyzer));
  IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(analyzer));
  // get the exact no. of existing classes
  int noOfClasses = 0;
  for (LeafReaderContext leave : originalIndex.leaves()) {
    long valueCount = 0;
    SortedDocValues classValues = leave.reader().getSortedDocValues(classFieldName);
    if (classValues != null) {
      valueCount = classValues.getValueCount();
    } else {
      SortedSetDocValues sortedSetDocValues = leave.reader().getSortedSetDocValues(classFieldName);
      if (sortedSetDocValues != null) {
        valueCount = sortedSetDocValues.getValueCount();
      }
    }
    if (classValues == null) {
      // approximate with no. of terms
      noOfClasses += leave.reader().terms(classFieldName).size();
    }
    noOfClasses += valueCount;
  }
  try {
    IndexSearcher indexSearcher = new IndexSearcher(originalIndex);
    GroupingSearch gs = new GroupingSearch(classFieldName);
    gs.setGroupSort(Sort.INDEXORDER);
    gs.setSortWithinGroup(Sort.INDEXORDER);
    gs.setAllGroups(true);
    gs.setGroupDocsLimit(originalIndex.maxDoc());
    TopGroups<Object> topGroups = gs.search(indexSearcher, new MatchAllDocsQuery(), 0, noOfClasses);
    // set the type to be indexed, stored, with term vectors
    FieldType ft = new FieldType(TextField.TYPE_STORED);
    if (termVectors) {
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorOffsets(true);
      ft.setStoreTermVectorPositions(true);
    }
    int b = 0;
    // iterate over existing documents
    for (GroupDocs<Object> group : topGroups.groups) {
      int totalHits = group.totalHits;
      double testSize = totalHits * testRatio;
      int tc = 0;
      double cvSize = totalHits * crossValidationRatio;
      int cvc = 0;
      for (ScoreDoc scoreDoc : group.scoreDocs) {
        // create a new document for indexing
        Document doc = createNewDoc(originalIndex, ft, scoreDoc, fieldNames);
        // add it to one of the IDXs
        if (b % 2 == 0 && tc < testSize) {
          testWriter.addDocument(doc);
          tc++;
        } else if (cvc < cvSize) {
          cvWriter.addDocument(doc);
          cvc++;
        } else {
          trainingWriter.addDocument(doc);
        }
        b++;
      }
    }
    // commit
    testWriter.commit();
    cvWriter.commit();
    trainingWriter.commit();
    // merge
    testWriter.forceMerge(3);
    cvWriter.forceMerge(3);
    trainingWriter.forceMerge(3);
  } catch (Exception e) {
    throw new IOException(e);
  } finally {
    // close IWs
    testWriter.close();
    cvWriter.close();
    trainingWriter.close();
    originalIndex.close();
  }
}
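For context, a hedged usage sketch of this method, assuming DatasetSplitter's two-ratio constructor (testRatio, crossValidationRatio); the index paths and field names are hypothetical. Note that split(...) closes the source reader in its finally block:

// a minimal sketch, assuming the (testRatio, crossValidationRatio) constructor
Directory trainDir = FSDirectory.open(Paths.get("idx-train"));  // hypothetical paths
Directory testDir = FSDirectory.open(Paths.get("idx-test"));
Directory cvDir = FSDirectory.open(Paths.get("idx-cv"));
IndexReader source = DirectoryReader.open(FSDirectory.open(Paths.get("idx-source")));
DatasetSplitter splitter = new DatasetSplitter(0.2d, 0.1d);     // 20% test, 10% cross validation
splitter.split(source, trainDir, testDir, cvDir, new StandardAnalyzer(), false, "category", "body", "title");
// "source" is closed by split(); only the three writers' directories remain to use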
Use of org.apache.lucene.index.SortedDocValues in project lucene-solr by Apache.
Class ElevationComparatorSource, method newComparator.
@Override
public FieldComparator<Integer> newComparator(final String fieldname, final int numHits, int sortPos, boolean reversed) {
  return new FieldComparator<Integer>() {
    private final int[] values = new int[numHits];
    int bottomVal;

    @Override
    public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException {
      return new LeafFieldComparator() {
        @Override
        public void setBottom(int slot) {
          bottomVal = values[slot];
        }

        @Override
        public int compareTop(int doc) {
          throw new UnsupportedOperationException();
        }

        private int docVal(int doc) throws IOException {
          SortedDocValues idIndex = DocValues.getSorted(context.reader(), fieldname);
          if (idIndex.advance(doc) == doc) {
            final BytesRef term = idIndex.binaryValue();
            Integer prio = priority.get(term);
            return prio == null ? 0 : prio.intValue();
          } else {
            return 0;
          }
        }

        @Override
        public int compareBottom(int doc) throws IOException {
          return docVal(doc) - bottomVal;
        }

        @Override
        public void copy(int slot, int doc) throws IOException {
          values[slot] = docVal(doc);
        }

        @Override
        public void setScorer(Scorer scorer) {
        }
      };
    }

    @Override
    public int compare(int slot1, int slot2) {
      // values will be small enough that there is no overflow concern
      return values[slot2] - values[slot1];
    }

    @Override
    public void setTopValue(Integer value) {
      throw new UnsupportedOperationException();
    }

    @Override
    public Integer value(int slot) {
      return Integer.valueOf(values[slot]);
    }
  };
}
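A hedged usage sketch: because compare(slot1, slot2) returns values[slot2] - values[slot1], documents with higher priority sort first even with reverse=false. This assumes the enclosing class exposes a Map<BytesRef, Integer> priority field and a constructor taking that map; the field name, searcher, and query here are illustrative:

// a minimal sketch, assuming a map-taking ElevationComparatorSource constructor
Map<BytesRef, Integer> priority = new HashMap<>();
priority.put(new BytesRef("doc-42"), 2);  // elevate these ids, highest priority first
priority.put(new BytesRef("doc-7"), 1);
Sort sort = new Sort(
    new SortField("id", new ElevationComparatorSource(priority), false),  // elevated docs first
    SortField.FIELD_SCORE);                // everything else falls back to score order
TopDocs hits = searcher.search(query, 10, sort);  // searcher and query assumed in scope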
Use of org.apache.lucene.index.SortedDocValues in project lucene-solr by Apache.
Class FacetFieldProcessorByArrayDV, method findStartAndEndOrds.
@Override
protected void findStartAndEndOrds() throws IOException {
  if (multiValuedField) {
    si = FieldUtil.getSortedSetDocValues(fcontext.qcontext, sf, null);
    if (si instanceof MultiDocValues.MultiSortedSetDocValues) {
      ordinalMap = ((MultiDocValues.MultiSortedSetDocValues) si).mapping;
    }
  } else {
    // wrap the single-valued docvalues in a multi-valued view
    SortedDocValues single = FieldUtil.getSortedDocValues(fcontext.qcontext, sf, null);
    si = DocValues.singleton(single);
    if (single instanceof MultiDocValues.MultiSortedDocValues) {
      ordinalMap = ((MultiDocValues.MultiSortedDocValues) single).mapping;
    }
  }
  if (si.getValueCount() >= Integer.MAX_VALUE) {
    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Field has too many unique values. field=" + sf + " nterms= " + si.getValueCount());
  }
  if (prefixRef != null) {
    startTermIndex = (int) si.lookupTerm(prefixRef.get());
    if (startTermIndex < 0) {
      startTermIndex = -startTermIndex - 1;
    }
    prefixRef.append(UnicodeUtil.BIG_TERM);
    endTermIndex = (int) si.lookupTerm(prefixRef.get());
    assert endTermIndex < 0;
    endTermIndex = -endTermIndex - 1;
  } else {
    startTermIndex = 0;
    endTermIndex = (int) si.getValueCount();
  }
  nTerms = endTermIndex - startTermIndex;
}
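The prefix branch leans on the lookupTerm contract: when the sought term is absent, the return value encodes the insertion point as -(insertionPoint) - 1, the same convention Arrays.binarySearch uses, and appending UnicodeUtil.BIG_TERM guarantees the second lookup misses just past the last term carrying the prefix. A self-contained plain-Java sketch of the same range computation (illustrative names, not Solr's API):

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class PrefixOrdRangeSketch {
  public static void main(String[] args) {
    // A sorted term dictionary standing in for the docvalues ord space.
    List<String> terms = Arrays.asList("aa", "ab", "ba", "bb", "ca");
    String prefix = "b";
    // lookupTerm/binarySearch contract: a miss returns -(insertionPoint) - 1.
    int start = Collections.binarySearch(terms, prefix);
    if (start < 0) {
      start = -start - 1;                    // first ord whose term is >= prefix
    }
    // Appending a max sentinel (the analogue of UnicodeUtil.BIG_TERM) makes the
    // end lookup land just past the last term with the prefix.
    int end = Collections.binarySearch(terms, prefix + '\uffff');
    assert end < 0;                          // the sentinel is never a real term here
    end = -end - 1;                          // first ord beyond the prefix range
    System.out.println("nTerms=" + (end - start));  // prints: nTerms=2 ("ba", "bb")
  }
}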
Use of org.apache.lucene.index.SortedDocValues in project lucene-solr by Apache.
Class FacetFieldProcessorByArrayDV, method collectDocs.
@Override
protected void collectDocs() throws IOException {
  int domainSize = fcontext.base.size();
  if (nTerms <= 0 || domainSize < effectiveMincount) {
    // TODO: what about allBuckets? missing bucket?
    return;
  }
  // TODO: refactor some of this logic into a base class
  boolean countOnly = collectAcc == null && allBucketsAcc == null;
  boolean fullRange = startTermIndex == 0 && endTermIndex == si.getValueCount();

  // Are we expecting many hits per bucket?
  // FUTURE: pro-rate for nTerms?
  // FUTURE: better take into account number of values in multi-valued fields. This info is available for indexed fields.
  // FUTURE: take into account that bigger ord maps are more expensive than smaller ones
  // One test: 5M doc index, faceting on a single-valued field with almost 1M unique values; the crossover point where
  // global counting was slower than per-segment counting was a domain of 658k docs. At that point, the top 10 buckets
  // had 6-7 matches each. This was for heap docvalues produced by UninvertingReader.
  // Since these values were randomly distributed, let's round our domain multiplier up to account for less random real world data.
  long domainMultiplier = multiValuedField ? 4L : 2L;
  // +3 to increase test coverage with small tests
  boolean manyHitsPerBucket = domainSize * domainMultiplier > (si.getValueCount() + 3);

  // If we're only calculating counts, we're not prefixing, and we expect to collect many documents per unique value,
  // then collect per-segment before mapping to global ords at the end. This will save redundant seg->global ord mappings.
  // FUTURE: there are probably some other non "countOnly" cases where we can use this as well (i.e. those where
  // the docid is not used)
  boolean canDoPerSeg = countOnly && fullRange;
  boolean accumSeg = manyHitsPerBucket && canDoPerSeg;
  // internal - override perSeg heuristic
  if (freq.perSeg != null) {
    accumSeg = canDoPerSeg && freq.perSeg;
  }

  final List<LeafReaderContext> leaves = fcontext.searcher.getIndexReader().leaves();
  Filter filter = fcontext.base.getTopFilter();
  for (int subIdx = 0; subIdx < leaves.size(); subIdx++) {
    LeafReaderContext subCtx = leaves.get(subIdx);
    setNextReaderFirstPhase(subCtx);
    // solr docsets already exclude any deleted docs
    DocIdSet dis = filter.getDocIdSet(subCtx, null);
    DocIdSetIterator disi = dis.iterator();
    SortedDocValues singleDv = null;
    SortedSetDocValues multiDv = null;
    if (multiValuedField) {
      // TODO: get sub from multi?
      multiDv = subCtx.reader().getSortedSetDocValues(sf.getName());
      if (multiDv == null) {
        multiDv = DocValues.emptySortedSet();
      }
      // this will be null if this is not a wrapped single-valued docvalues.
      if (unwrap_singleValued_multiDv) {
        singleDv = DocValues.unwrapSingleton(multiDv);
      }
    } else {
      singleDv = subCtx.reader().getSortedDocValues(sf.getName());
      if (singleDv == null) {
        singleDv = DocValues.emptySorted();
      }
    }
    LongValues toGlobal = ordinalMap == null ? null : ordinalMap.getGlobalOrds(subIdx);
    if (singleDv != null) {
      if (accumSeg) {
        collectPerSeg(singleDv, disi, toGlobal);
      } else {
        if (canDoPerSeg && toGlobal != null) {
          collectCounts(singleDv, disi, toGlobal);
        } else {
          collectDocs(singleDv, disi, toGlobal);
        }
      }
    } else {
      if (accumSeg) {
        collectPerSeg(multiDv, disi, toGlobal);
      } else {
        if (canDoPerSeg && toGlobal != null) {
          collectCounts(multiDv, disi, toGlobal);
        } else {
          collectDocs(multiDv, disi, toGlobal);
        }
      }
    }
  }
  // better GC
  reuse = null;
}
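To see what the accumSeg heuristic buys, compare the two counting strategies it chooses between: mapping every hit through the seg-to-global ordinal map, versus counting per segment and remapping once per unique ordinal at the end. A toy, self-contained sketch (plain Java; toGlobal stands in for what OrdinalMap#getGlobalOrds provides, and all names are illustrative):

import java.util.Arrays;

public class PerSegCountingSketch {
  public static void main(String[] args) {
    int[] segOrdOfDoc = {0, 1, 0, 0, 2, 1, 0};  // segment ordinal of each matching doc
    int[] toGlobal = {5, 9, 12};                // segment ordinal -> global ordinal
    long[] globalCounts = new long[16];

    // Strategy A (global counting): one seg->global lookup per matching document.
    for (int segOrd : segOrdOfDoc) {
      globalCounts[toGlobal[segOrd]]++;
    }

    // Strategy B ("accumSeg"): count into a per-segment array, then remap each
    // unique segment ordinal exactly once. Cheaper when hits per bucket are many.
    long[] segCounts = new long[toGlobal.length];
    for (int segOrd : segOrdOfDoc) {
      segCounts[segOrd]++;
    }
    long[] globalCounts2 = new long[16];
    for (int segOrd = 0; segOrd < segCounts.length; segOrd++) {
      if (segCounts[segOrd] > 0) {
        globalCounts2[toGlobal[segOrd]] += segCounts[segOrd];
      }
    }
    System.out.println(Arrays.equals(globalCounts, globalCounts2));  // prints: true
  }
}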