Use of org.apache.lucene.index.SortedDocValues in project lucene-solr by apache.
From the class TestJoinUtil, method testMinMaxDocs:
public void testMinMaxDocs() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir,
      newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)));
  int minChildDocsPerParent = 2;
  int maxChildDocsPerParent = 16;
  int numParents = RandomNumbers.randomIntBetween(random(), 16, 64);
  int[] childDocsPerParent = new int[numParents];
  // index each parent ("to" side) followed by a random number of children ("from" side),
  // all sharing the parent id in the "join_field" doc values
  for (int p = 0; p < numParents; p++) {
    String parentId = Integer.toString(p);
    Document parentDoc = new Document();
    parentDoc.add(new StringField("id", parentId, Field.Store.YES));
    parentDoc.add(new StringField("type", "to", Field.Store.NO));
    parentDoc.add(new SortedDocValuesField("join_field", new BytesRef(parentId)));
    iw.addDocument(parentDoc);
    int numChildren = RandomNumbers.randomIntBetween(random(), minChildDocsPerParent, maxChildDocsPerParent);
    childDocsPerParent[p] = numChildren;
    for (int c = 0; c < numChildren; c++) {
      String childId = Integer.toString(p + c);
      Document childDoc = new Document();
      childDoc.add(new StringField("id", childId, Field.Store.YES));
      childDoc.add(new StringField("type", "from", Field.Store.NO));
      childDoc.add(new SortedDocValuesField("join_field", new BytesRef(parentId)));
      iw.addDocument(childDoc);
    }
  }
  iw.close();
  IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir));
  // collect the per-segment SortedDocValues and merge their ordinals into one global map
  SortedDocValues[] values = new SortedDocValues[searcher.getIndexReader().leaves().size()];
  for (LeafReaderContext leadContext : searcher.getIndexReader().leaves()) {
    values[leadContext.ord] = DocValues.getSorted(leadContext.reader(), "join_field");
  }
  MultiDocValues.OrdinalMap ordinalMap = MultiDocValues.OrdinalMap.build(null, values, PackedInts.DEFAULT);
  Query fromQuery = new TermQuery(new Term("type", "from"));
  Query toQuery = new TermQuery(new Term("type", "to"));
  int iters = RandomNumbers.randomIntBetween(random(), 3, 9);
  for (int i = 1; i <= iters; i++) {
    final ScoreMode scoreMode = ScoreMode.values()[random().nextInt(ScoreMode.values().length)];
    int min = RandomNumbers.randomIntBetween(random(), minChildDocsPerParent, maxChildDocsPerParent - 1);
    int max = RandomNumbers.randomIntBetween(random(), min, maxChildDocsPerParent);
    if (VERBOSE) {
      System.out.println("iter=" + i);
      System.out.println("scoreMode=" + scoreMode);
      System.out.println("min=" + min);
      System.out.println("max=" + max);
    }
    // a parent only matches if its number of children falls within [min, max]
    Query joinQuery = JoinUtil.createJoinQuery("join_field", fromQuery, toQuery, searcher, scoreMode, ordinalMap, min, max);
    TotalHitCountCollector collector = new TotalHitCountCollector();
    searcher.search(joinQuery, collector);
    int expectedCount = 0;
    for (int numChildDocs : childDocsPerParent) {
      if (numChildDocs >= min && numChildDocs <= max) {
        expectedCount++;
      }
    }
    assertEquals(expectedCount, collector.getTotalHits());
  }
  searcher.getIndexReader().close();
  dir.close();
}
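The test exercises SortedDocValues indirectly, through JoinUtil and MultiDocValues.OrdinalMap. For readers new to the API, here is a minimal sketch of reading a per-document value back out of the same "join_field"; it is an illustration rather than part of the test, and it assumes a LeafReader named leafReader and a document id docId are in scope inside a method that declares IOException:

  SortedDocValues dv = DocValues.getSorted(leafReader, "join_field");
  if (dv.advanceExact(docId)) {          // position the iterator on this document, if it has a value
    int ord = dv.ordValue();             // per-segment ordinal of the document's value
    BytesRef value = dv.lookupOrd(ord);  // resolve the ordinal back to the indexed bytes
    System.out.println(value.utf8ToString());
  }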
Use of org.apache.lucene.index.SortedDocValues in project lucene-solr by apache.
From the class FieldCacheImpl, method getDocTermOrds:
// TODO: if a DocTermsIndex was already created, we
// should share it...
public SortedSetDocValues getDocTermOrds(LeafReader reader, String field, BytesRef prefix) throws IOException {
  // not a general purpose filtering mechanism...
  assert prefix == null || prefix == INT32_TERM_PREFIX || prefix == INT64_TERM_PREFIX;
  SortedSetDocValues dv = reader.getSortedSetDocValues(field);
  if (dv != null) {
    return dv;
  }
  SortedDocValues sdv = reader.getSortedDocValues(field);
  if (sdv != null) {
    // a single-valued field can be exposed as a one-element set
    return DocValues.singleton(sdv);
  }
  final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
  if (info == null) {
    return DocValues.emptySortedSet();
  } else if (info.getDocValuesType() != DocValuesType.NONE) {
    throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
  } else if (info.getIndexOptions() == IndexOptions.NONE) {
    return DocValues.emptySortedSet();
  }
  // ok we need to uninvert. check if we can optimize a bit.
  Terms terms = reader.terms(field);
  if (terms == null) {
    return DocValues.emptySortedSet();
  } else {
    // if #postings == #docsWithField we know that the field is "single valued enough".
    // it's possible the same term might appear twice in the same document, but SORTED_SET discards frequency.
    // it's still ok with filtering (which we limit to numerics), it just means precisionStep = Inf
    long numPostings = terms.getSumDocFreq();
    if (numPostings != -1 && numPostings == terms.getDocCount()) {
      return DocValues.singleton(getTermsIndex(reader, field));
    }
  }
  DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, prefix));
  return dto.iterator(reader);
}
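A typical consumption pattern for the returned SortedSetDocValues is a two-level loop over documents and ordinals. The following is a sketch under assumptions (a FieldCacheImpl instance named cache, a LeafReader named leafReader, and a hypothetical multi-valued field "category"), not code from the repository:

  SortedSetDocValues ssdv = cache.getDocTermOrds(leafReader, "category", null);
  for (int doc = ssdv.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = ssdv.nextDoc()) {
    for (long ord = ssdv.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = ssdv.nextOrd()) {
      BytesRef term = ssdv.lookupOrd(ord); // resolve the ordinal to the term bytes
      // ... use the (doc, term) pair ...
    }
  }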
Use of org.apache.lucene.index.SortedDocValues in project lucene-solr by apache.
From the class FieldCacheImpl, method getTermsIndex:
public SortedDocValues getTermsIndex(LeafReader reader, String field, float acceptableOverheadRatio) throws IOException {
  SortedDocValues valuesIn = reader.getSortedDocValues(field);
  if (valuesIn != null) {
    // Not cached here by FieldCacheImpl (cached instead
    // per-thread by SegmentReader):
    return valuesIn;
  } else {
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (info == null) {
      return DocValues.emptySorted();
    } else if (info.getDocValuesType() != DocValuesType.NONE) {
      // we don't try to build a sorted instance from numeric/binary doc
      // values because dedup can be very costly
      throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    } else if (info.getIndexOptions() == IndexOptions.NONE) {
      return DocValues.emptySorted();
    }
    SortedDocValuesImpl impl = (SortedDocValuesImpl) caches.get(SortedDocValues.class).get(reader, new CacheKey(field, acceptableOverheadRatio));
    return impl.iterator();
  }
}
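Callers usually resolve terms against the returned SortedDocValues with lookupTerm, which returns the term's ordinal on an exact match and -insertionPoint - 1 otherwise. A hedged sketch, where cache, leafReader, and the field name "id" are assumed for illustration:

  SortedDocValues sorted = cache.getTermsIndex(leafReader, "id", PackedInts.DEFAULT);
  int ord = sorted.lookupTerm(new BytesRef("42"));
  if (ord >= 0) {
    // "42" exists; ord is its position in the sorted term dictionary
  } else {
    int insertionPoint = -ord - 1; // where "42" would sort if it were present
  }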
Use of org.apache.lucene.index.SortedDocValues in project lucene-solr by apache.
From the class Lucene70DocValuesConsumer, method addSortedSetField:
@Override
public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
  meta.writeInt(field.number);
  meta.writeByte(Lucene70DocValuesFormat.SORTED_SET);
  // first pass: count documents with a value and the total number of ordinals
  SortedSetDocValues values = valuesProducer.getSortedSet(field);
  int numDocsWithField = 0;
  long numOrds = 0;
  for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
    numDocsWithField++;
    for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
      numOrds++;
    }
  }
  if (numDocsWithField == numOrds) {
    // every document with a value has exactly one ordinal: encode as a plain SORTED field
    meta.writeByte((byte) 0);
    doAddSortedField(field, new EmptyDocValuesProducer() {
      @Override
      public SortedDocValues getSorted(FieldInfo field) throws IOException {
        return SortedSetSelector.wrap(valuesProducer.getSortedSet(field), SortedSetSelector.Type.MIN);
      }
    });
    return;
  }
  meta.writeByte((byte) 1);
  assert numDocsWithField != 0;
  if (numDocsWithField == maxDoc) {
    // dense: every document has a value, no docs-with-field bitset needed
    meta.writeLong(-1);
    meta.writeLong(0L);
  } else {
    long offset = data.getFilePointer();
    meta.writeLong(offset);
    values = valuesProducer.getSortedSet(field);
    IndexedDISI.writeBitSet(values, data);
    meta.writeLong(data.getFilePointer() - offset);
  }
  int numberOfBitsPerOrd = DirectWriter.unsignedBitsRequired(values.getValueCount() - 1);
  meta.writeByte((byte) numberOfBitsPerOrd);
  long start = data.getFilePointer();
  meta.writeLong(start);
  // second pass: write the flat stream of ordinals
  DirectWriter writer = DirectWriter.getInstance(data, numOrds, numberOfBitsPerOrd);
  values = valuesProducer.getSortedSet(field);
  for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
    for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
      writer.add(ord);
    }
  }
  writer.finish();
  meta.writeLong(data.getFilePointer() - start);
  meta.writeInt(numDocsWithField);
  start = data.getFilePointer();
  meta.writeLong(start);
  meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
  // third pass: write per-document addresses into the ordinal stream
  final DirectMonotonicWriter addressesWriter = DirectMonotonicWriter.getInstance(meta, data, numDocsWithField + 1, DIRECT_MONOTONIC_BLOCK_SHIFT);
  long addr = 0;
  addressesWriter.add(addr);
  values = valuesProducer.getSortedSet(field);
  for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
    values.nextOrd();
    addr++;
    while (values.nextOrd() != SortedSetDocValues.NO_MORE_ORDS) {
      addr++;
    }
    addressesWriter.add(addr);
  }
  addressesWriter.finish();
  meta.writeLong(data.getFilePointer() - start);
  addTermsDict(values);
}
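Two details are worth calling out: doc-values iterators are forward-only and single-use, which is why each writing pass re-pulls a fresh iterator via valuesProducer.getSortedSet(field), and the ordinal stream is packed with DirectWriter at a fixed bit width. The DirectWriter pattern in isolation, as a sketch with assumed variables out (an IndexOutput), maxValue, and longs:

  int bits = DirectWriter.unsignedBitsRequired(maxValue); // bits needed for the largest value
  DirectWriter w = DirectWriter.getInstance(out, longs.length, bits);
  for (long v : longs) {
    w.add(v); // exactly the declared number of values must be added, in order
  }
  w.finish();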
Use of org.apache.lucene.index.SortedDocValues in project lucene-solr by apache.
From the class SortedSetDocValuesRangeQuery, method createWeight:
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
  return new ConstantScoreWeight(this, boost) {
    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {
      SortedSetDocValues values = getValues(context.reader(), field);
      if (values == null) {
        return null;
      }
      final long minOrd;
      if (lowerValue == null) {
        minOrd = 0;
      } else {
        final long ord = values.lookupTerm(lowerValue);
        if (ord < 0) {
          minOrd = -1 - ord;
        } else if (lowerInclusive) {
          minOrd = ord;
        } else {
          minOrd = ord + 1;
        }
      }
      final long maxOrd;
      if (upperValue == null) {
        maxOrd = values.getValueCount() - 1;
      } else {
        final long ord = values.lookupTerm(upperValue);
        if (ord < 0) {
          maxOrd = -2 - ord;
        } else if (upperInclusive) {
          maxOrd = ord;
        } else {
          maxOrd = ord - 1;
        }
      }
      if (minOrd > maxOrd) {
        return null;
      }
      final SortedDocValues singleton = DocValues.unwrapSingleton(values);
      final TwoPhaseIterator iterator;
      if (singleton != null) {
        iterator = new TwoPhaseIterator(singleton) {
          @Override
          public boolean matches() throws IOException {
            final long ord = singleton.ordValue();
            return ord >= minOrd && ord <= maxOrd;
          }

          @Override
          public float matchCost() {
            // 2 comparisons
            return 2;
          }
        };
      } else {
        iterator = new TwoPhaseIterator(values) {
          @Override
          public boolean matches() throws IOException {
            for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
              if (ord < minOrd) {
                continue;
              }
              // Values are sorted, so the first ord that is >= minOrd is our best candidate
              return ord <= maxOrd;
            }
            // all ords were < minOrd
            return false;
          }

          @Override
          public float matchCost() {
            // 2 comparisons
            return 2;
          }
        };
      }
      return new ConstantScoreScorer(this, score(), iterator);
    }
  };
}
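The -1 - ord and -2 - ord arithmetic above decodes lookupTerm's convention: on a miss it returns -insertionPoint - 1. A hypothetical worked example, assuming the segment's sorted values are ["b", "d", "f"] with ords 0 to 2:

  long ord = values.lookupTerm(new BytesRef("c")); // miss: insertion point 1, returned as -1 - 1 = -2
  long minOrd = -1 - ord; // = 1 -> first ord whose value is >= "c" ("d")
  long maxOrd = -2 - ord; // = 0 -> last ord whose value is <= "c" ("b")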