Search in sources :

Example 56 with SortedDocValues

use of org.apache.lucene.index.SortedDocValues in project lucene-solr by apache.

the class TestJoinUtil method testMinMaxDocs.

public void testMinMaxDocs() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)));
    int minChildDocsPerParent = 2;
    int maxChildDocsPerParent = 16;
    int numParents = RandomNumbers.randomIntBetween(random(), 16, 64);
    int[] childDocsPerParent = new int[numParents];
    for (int p = 0; p < numParents; p++) {
        String parentId = Integer.toString(p);
        Document parentDoc = new Document();
        parentDoc.add(new StringField("id", parentId, Field.Store.YES));
        parentDoc.add(new StringField("type", "to", Field.Store.NO));
        parentDoc.add(new SortedDocValuesField("join_field", new BytesRef(parentId)));
        iw.addDocument(parentDoc);
        int numChildren = RandomNumbers.randomIntBetween(random(), minChildDocsPerParent, maxChildDocsPerParent);
        childDocsPerParent[p] = numChildren;
        for (int c = 0; c < numChildren; c++) {
            String childId = Integer.toString(p + c);
            Document childDoc = new Document();
            childDoc.add(new StringField("id", childId, Field.Store.YES));
            childDoc.add(new StringField("type", "from", Field.Store.NO));
            childDoc.add(new SortedDocValuesField("join_field", new BytesRef(parentId)));
            iw.addDocument(childDoc);
        }
    }
    iw.close();
    IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir));
    SortedDocValues[] values = new SortedDocValues[searcher.getIndexReader().leaves().size()];
    for (LeafReaderContext leadContext : searcher.getIndexReader().leaves()) {
        values[leadContext.ord] = DocValues.getSorted(leadContext.reader(), "join_field");
    }
    MultiDocValues.OrdinalMap ordinalMap = MultiDocValues.OrdinalMap.build(null, values, PackedInts.DEFAULT);
    Query fromQuery = new TermQuery(new Term("type", "from"));
    Query toQuery = new TermQuery(new Term("type", "to"));
    int iters = RandomNumbers.randomIntBetween(random(), 3, 9);
    for (int i = 1; i <= iters; i++) {
        final ScoreMode scoreMode = ScoreMode.values()[random().nextInt(ScoreMode.values().length)];
        int min = RandomNumbers.randomIntBetween(random(), minChildDocsPerParent, maxChildDocsPerParent - 1);
        int max = RandomNumbers.randomIntBetween(random(), min, maxChildDocsPerParent);
        if (VERBOSE) {
            System.out.println("iter=" + i);
            System.out.println("scoreMode=" + scoreMode);
            System.out.println("min=" + min);
            System.out.println("max=" + max);
        }
        Query joinQuery = JoinUtil.createJoinQuery("join_field", fromQuery, toQuery, searcher, scoreMode, ordinalMap, min, max);
        TotalHitCountCollector collector = new TotalHitCountCollector();
        searcher.search(joinQuery, collector);
        int expectedCount = 0;
        for (int numChildDocs : childDocsPerParent) {
            if (numChildDocs >= min && numChildDocs <= max) {
                expectedCount++;
            }
        }
        assertEquals(expectedCount, collector.getTotalHits());
    }
    searcher.getIndexReader().close();
    dir.close();
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TermQuery(org.apache.lucene.search.TermQuery) Query(org.apache.lucene.search.Query) MatchNoDocsQuery(org.apache.lucene.search.MatchNoDocsQuery) FieldValueQuery(org.apache.lucene.search.FieldValueQuery) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) MultiDocValues(org.apache.lucene.index.MultiDocValues) DoublePoint(org.apache.lucene.document.DoublePoint) LongPoint(org.apache.lucene.document.LongPoint) IntPoint(org.apache.lucene.document.IntPoint) FloatPoint(org.apache.lucene.document.FloatPoint) SortedDocValues(org.apache.lucene.index.SortedDocValues) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) StringField(org.apache.lucene.document.StringField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) TotalHitCountCollector(org.apache.lucene.search.TotalHitCountCollector) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) OrdinalMap(org.apache.lucene.index.MultiDocValues.OrdinalMap)

Example 57 with SortedDocValues

use of org.apache.lucene.index.SortedDocValues in project lucene-solr by apache.

the class FieldCacheImpl method getDocTermOrds.

// TODO: this if DocTermsIndex was already created, we
// should share it...
public SortedSetDocValues getDocTermOrds(LeafReader reader, String field, BytesRef prefix) throws IOException {
    // not a general purpose filtering mechanism...
    assert prefix == null || prefix == INT32_TERM_PREFIX || prefix == INT64_TERM_PREFIX;
    SortedSetDocValues dv = reader.getSortedSetDocValues(field);
    if (dv != null) {
        return dv;
    }
    SortedDocValues sdv = reader.getSortedDocValues(field);
    if (sdv != null) {
        return DocValues.singleton(sdv);
    }
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (info == null) {
        return DocValues.emptySortedSet();
    } else if (info.getDocValuesType() != DocValuesType.NONE) {
        throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    } else if (info.getIndexOptions() == IndexOptions.NONE) {
        return DocValues.emptySortedSet();
    }
    // ok we need to uninvert. check if we can optimize a bit.
    Terms terms = reader.terms(field);
    if (terms == null) {
        return DocValues.emptySortedSet();
    } else {
        // if #postings = #docswithfield we know that the field is "single valued enough".
        // it's possible the same term might appear twice in the same document, but SORTED_SET discards frequency.
        // it's still ok with filtering (which we limit to numerics), it just means precisionStep = Inf
        long numPostings = terms.getSumDocFreq();
        if (numPostings != -1 && numPostings == terms.getDocCount()) {
            return DocValues.singleton(getTermsIndex(reader, field));
        }
    }
    DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, prefix));
    return dto.iterator(reader);
}
Also used : SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) Terms(org.apache.lucene.index.Terms) SortedDocValues(org.apache.lucene.index.SortedDocValues) FieldInfo(org.apache.lucene.index.FieldInfo)

Example 58 with SortedDocValues

use of org.apache.lucene.index.SortedDocValues in project lucene-solr by apache.

the class FieldCacheImpl method getTermsIndex.

public SortedDocValues getTermsIndex(LeafReader reader, String field, float acceptableOverheadRatio) throws IOException {
    SortedDocValues valuesIn = reader.getSortedDocValues(field);
    if (valuesIn != null) {
        // per-thread by SegmentReader):
        return valuesIn;
    } else {
        final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
        if (info == null) {
            return DocValues.emptySorted();
        } else if (info.getDocValuesType() != DocValuesType.NONE) {
            // values because dedup can be very costly
            throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
        } else if (info.getIndexOptions() == IndexOptions.NONE) {
            return DocValues.emptySorted();
        }
        SortedDocValuesImpl impl = (SortedDocValuesImpl) caches.get(SortedDocValues.class).get(reader, new CacheKey(field, acceptableOverheadRatio));
        return impl.iterator();
    }
}
Also used : SortedDocValues(org.apache.lucene.index.SortedDocValues) FieldInfo(org.apache.lucene.index.FieldInfo)

Example 59 with SortedDocValues

use of org.apache.lucene.index.SortedDocValues in project lucene-solr by apache.

the class Lucene70DocValuesConsumer method addSortedSetField.

@Override
public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
    meta.writeInt(field.number);
    meta.writeByte(Lucene70DocValuesFormat.SORTED_SET);
    SortedSetDocValues values = valuesProducer.getSortedSet(field);
    int numDocsWithField = 0;
    long numOrds = 0;
    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
        numDocsWithField++;
        for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
            numOrds++;
        }
    }
    if (numDocsWithField == numOrds) {
        meta.writeByte((byte) 0);
        doAddSortedField(field, new EmptyDocValuesProducer() {

            @Override
            public SortedDocValues getSorted(FieldInfo field) throws IOException {
                return SortedSetSelector.wrap(valuesProducer.getSortedSet(field), SortedSetSelector.Type.MIN);
            }
        });
        return;
    }
    meta.writeByte((byte) 1);
    assert numDocsWithField != 0;
    if (numDocsWithField == maxDoc) {
        meta.writeLong(-1);
        meta.writeLong(0L);
    } else {
        long offset = data.getFilePointer();
        meta.writeLong(offset);
        values = valuesProducer.getSortedSet(field);
        IndexedDISI.writeBitSet(values, data);
        meta.writeLong(data.getFilePointer() - offset);
    }
    int numberOfBitsPerOrd = DirectWriter.unsignedBitsRequired(values.getValueCount() - 1);
    meta.writeByte((byte) numberOfBitsPerOrd);
    long start = data.getFilePointer();
    meta.writeLong(start);
    DirectWriter writer = DirectWriter.getInstance(data, numOrds, numberOfBitsPerOrd);
    values = valuesProducer.getSortedSet(field);
    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
        for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
            writer.add(ord);
        }
    }
    writer.finish();
    meta.writeLong(data.getFilePointer() - start);
    meta.writeInt(numDocsWithField);
    start = data.getFilePointer();
    meta.writeLong(start);
    meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
    final DirectMonotonicWriter addressesWriter = DirectMonotonicWriter.getInstance(meta, data, numDocsWithField + 1, DIRECT_MONOTONIC_BLOCK_SHIFT);
    long addr = 0;
    addressesWriter.add(addr);
    values = valuesProducer.getSortedSet(field);
    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
        values.nextOrd();
        addr++;
        while (values.nextOrd() != SortedSetDocValues.NO_MORE_ORDS) {
            addr++;
        }
        addressesWriter.add(addr);
    }
    addressesWriter.finish();
    meta.writeLong(data.getFilePointer() - start);
    addTermsDict(values);
}
Also used : SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) EmptyDocValuesProducer(org.apache.lucene.index.EmptyDocValuesProducer) DirectWriter(org.apache.lucene.util.packed.DirectWriter) IOException(java.io.IOException) DirectMonotonicWriter(org.apache.lucene.util.packed.DirectMonotonicWriter) FieldInfo(org.apache.lucene.index.FieldInfo) SortedDocValues(org.apache.lucene.index.SortedDocValues)

Example 60 with SortedDocValues

use of org.apache.lucene.index.SortedDocValues in project lucene-solr by apache.

the class SortedSetDocValuesRangeQuery method createWeight.

@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    return new ConstantScoreWeight(this, boost) {

        @Override
        public Scorer scorer(LeafReaderContext context) throws IOException {
            SortedSetDocValues values = getValues(context.reader(), field);
            if (values == null) {
                return null;
            }
            final long minOrd;
            if (lowerValue == null) {
                minOrd = 0;
            } else {
                final long ord = values.lookupTerm(lowerValue);
                if (ord < 0) {
                    minOrd = -1 - ord;
                } else if (lowerInclusive) {
                    minOrd = ord;
                } else {
                    minOrd = ord + 1;
                }
            }
            final long maxOrd;
            if (upperValue == null) {
                maxOrd = values.getValueCount() - 1;
            } else {
                final long ord = values.lookupTerm(upperValue);
                if (ord < 0) {
                    maxOrd = -2 - ord;
                } else if (upperInclusive) {
                    maxOrd = ord;
                } else {
                    maxOrd = ord - 1;
                }
            }
            if (minOrd > maxOrd) {
                return null;
            }
            final SortedDocValues singleton = DocValues.unwrapSingleton(values);
            final TwoPhaseIterator iterator;
            if (singleton != null) {
                iterator = new TwoPhaseIterator(singleton) {

                    @Override
                    public boolean matches() throws IOException {
                        final long ord = singleton.ordValue();
                        return ord >= minOrd && ord <= maxOrd;
                    }

                    @Override
                    public float matchCost() {
                        // 2 comparisons
                        return 2;
                    }
                };
            } else {
                iterator = new TwoPhaseIterator(values) {

                    @Override
                    public boolean matches() throws IOException {
                        for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
                            if (ord < minOrd) {
                                continue;
                            }
                            // Values are sorted, so the first ord that is >= minOrd is our best candidate
                            return ord <= maxOrd;
                        }
                        // all ords were < minOrd
                        return false;
                    }

                    @Override
                    public float matchCost() {
                        // 2 comparisons
                        return 2;
                    }
                };
            }
            return new ConstantScoreScorer(this, score(), iterator);
        }
    };
}
Also used : TwoPhaseIterator(org.apache.lucene.search.TwoPhaseIterator) SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) ConstantScoreScorer(org.apache.lucene.search.ConstantScoreScorer) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) IOException(java.io.IOException) ConstantScoreWeight(org.apache.lucene.search.ConstantScoreWeight) SortedDocValues(org.apache.lucene.index.SortedDocValues)

Aggregations

SortedDocValues (org.apache.lucene.index.SortedDocValues)65 BytesRef (org.apache.lucene.util.BytesRef)32 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)27 LeafReader (org.apache.lucene.index.LeafReader)22 SortedSetDocValues (org.apache.lucene.index.SortedSetDocValues)22 Document (org.apache.lucene.document.Document)21 Directory (org.apache.lucene.store.Directory)15 NumericDocValues (org.apache.lucene.index.NumericDocValues)14 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)14 SortedDocValuesField (org.apache.lucene.document.SortedDocValuesField)13 IOException (java.io.IOException)12 BinaryDocValues (org.apache.lucene.index.BinaryDocValues)11 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)10 MultiDocValues (org.apache.lucene.index.MultiDocValues)10 IndexReader (org.apache.lucene.index.IndexReader)9 OrdinalMap (org.apache.lucene.index.MultiDocValues.OrdinalMap)9 ArrayList (java.util.ArrayList)8 DoublePoint (org.apache.lucene.document.DoublePoint)8 FloatPoint (org.apache.lucene.document.FloatPoint)8 IntPoint (org.apache.lucene.document.IntPoint)8