Use of org.apache.lucene.index.SortedSetDocValues in project lucene-solr by apache.
The class TestLucene70DocValuesFormat, method testSortedSetAroundBlockSize.
@Slow
public void testSortedSetAroundBlockSize() throws IOException {
  final int frontier = 1 << Lucene70DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
  for (int maxDoc = frontier - 1; maxDoc <= frontier + 1; ++maxDoc) {
    final Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));
    // side buffer recording the expected values for each document
    RAMFile buffer = new RAMFile();
    RAMOutputStream out = new RAMOutputStream(buffer, false);
    Document doc = new Document();
    // two values per document, both written into the same sorted-set field
    SortedSetDocValuesField field1 = new SortedSetDocValuesField("sset", new BytesRef());
    doc.add(field1);
    SortedSetDocValuesField field2 = new SortedSetDocValuesField("sset", new BytesRef());
    doc.add(field2);
    for (int i = 0; i < maxDoc; ++i) {
      BytesRef s1 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
      BytesRef s2 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
      field1.setBytesValue(s1);
      field2.setBytesValue(s2);
      w.addDocument(doc);
      // a TreeSet gives the deduplicated, sorted view the codec must return
      Set<BytesRef> set = new TreeSet<>(Arrays.asList(s1, s2));
      out.writeVInt(set.size());
      for (BytesRef ref : set) {
        out.writeVInt(ref.length);
        out.writeBytes(ref.bytes, ref.offset, ref.length);
      }
    }
    out.close();
    w.forceMerge(1);
    DirectoryReader r = DirectoryReader.open(w);
    w.close();
    LeafReader sr = getOnlyLeafReader(r);
    assertEquals(maxDoc, sr.maxDoc());
    SortedSetDocValues values = sr.getSortedSetDocValues("sset");
    assertNotNull(values);
    // replay the expected values and check each document's ords against them
    RAMInputStream in = new RAMInputStream("", buffer);
    BytesRefBuilder b = new BytesRefBuilder();
    for (int i = 0; i < maxDoc; ++i) {
      assertEquals(i, values.nextDoc());
      final int numValues = in.readVInt();
      for (int j = 0; j < numValues; ++j) {
        b.setLength(in.readVInt());
        b.grow(b.length());
        in.readBytes(b.bytes(), 0, b.length());
        assertEquals(b.get(), values.lookupOrd(values.nextOrd()));
      }
      assertEquals(SortedSetDocValues.NO_MORE_ORDS, values.nextOrd());
    }
    r.close();
    dir.close();
  }
}
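The assertions above exercise the standard consumption pattern for a sorted-set field: advance with nextDoc(), drain ordinals with nextOrd() until NO_MORE_ORDS, and resolve each ordinal with lookupOrd(). A minimal sketch of that pattern outside the test, as a hypothetical dumpSortedSet helper (assuming the usual org.apache.lucene.index and org.apache.lucene.search imports):

// Hypothetical helper, not part of the test above: prints every value of a
// sorted-set field by walking the iterator contract the test verifies.
static void dumpSortedSet(LeafReader reader, String field) throws IOException {
  SortedSetDocValues values = reader.getSortedSetDocValues(field);
  if (values == null) {
    return; // no sorted-set doc values for this field in this segment
  }
  for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
    // ords come back per document in increasing order, deduplicated
    for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
      BytesRef term = values.lookupOrd(ord);
      System.out.println(doc + " -> " + term.utf8ToString());
    }
  }
}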
Use of org.apache.lucene.index.SortedSetDocValues in project lucene-solr by apache.
The class TestLucene70DocValuesFormat, method doTestSparseDocValuesVsStoredFields.
private void doTestSparseDocValuesVsStoredFields() throws Exception {
  final long[] values = new long[TestUtil.nextInt(random(), 1, 500)];
  for (int i = 0; i < values.length; ++i) {
    values[i] = random().nextLong();
  }
  Directory dir = newFSDirectory(createTempDir());
  IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
  conf.setMergeScheduler(new SerialMergeScheduler());
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
  // sparse compression is only enabled if less than 1% of docs have a value
  final int avgGap = 100;
  final int numDocs = atLeast(200);
  for (int i = random().nextInt(avgGap * 2); i >= 0; --i) {
    writer.addDocument(new Document());
  }
  final int maxNumValuesPerDoc = random().nextBoolean() ? 1 : TestUtil.nextInt(random(), 2, 5);
  for (int i = 0; i < numDocs; ++i) {
    Document doc = new Document();
    // single-valued
    long docValue = values[random().nextInt(values.length)];
    doc.add(new NumericDocValuesField("numeric", docValue));
    doc.add(new SortedDocValuesField("sorted", new BytesRef(Long.toString(docValue))));
    doc.add(new BinaryDocValuesField("binary", new BytesRef(Long.toString(docValue))));
    doc.add(new StoredField("value", docValue));
    // multi-valued
    final int numValues = TestUtil.nextInt(random(), 1, maxNumValuesPerDoc);
    for (int j = 0; j < numValues; ++j) {
      docValue = values[random().nextInt(values.length)];
      doc.add(new SortedNumericDocValuesField("sorted_numeric", docValue));
      doc.add(new SortedSetDocValuesField("sorted_set", new BytesRef(Long.toString(docValue))));
      doc.add(new StoredField("values", docValue));
    }
    writer.addDocument(doc);
    // add a gap
    for (int j = TestUtil.nextInt(random(), 0, avgGap * 2); j >= 0; --j) {
      writer.addDocument(new Document());
    }
  }
  if (random().nextBoolean()) {
    writer.forceMerge(1);
  }
  final IndexReader indexReader = writer.getReader();
  writer.close();
  for (LeafReaderContext context : indexReader.leaves()) {
    final LeafReader reader = context.reader();
    final NumericDocValues numeric = DocValues.getNumeric(reader, "numeric");
    final SortedDocValues sorted = DocValues.getSorted(reader, "sorted");
    final BinaryDocValues binary = DocValues.getBinary(reader, "binary");
    final SortedNumericDocValues sortedNumeric = DocValues.getSortedNumeric(reader, "sorted_numeric");
    final SortedSetDocValues sortedSet = DocValues.getSortedSet(reader, "sorted_set");
    for (int i = 0; i < reader.maxDoc(); ++i) {
      final Document doc = reader.document(i);
      final IndexableField valueField = doc.getField("value");
      final Long value = valueField == null ? null : valueField.numericValue().longValue();
      if (value == null) {
        assertTrue(numeric.docID() + " vs " + i, numeric.docID() < i);
      } else {
        assertEquals(i, numeric.nextDoc());
        assertEquals(i, binary.nextDoc());
        assertEquals(i, sorted.nextDoc());
        assertEquals(value.longValue(), numeric.longValue());
        assertTrue(sorted.ordValue() >= 0);
        assertEquals(new BytesRef(Long.toString(value)), sorted.lookupOrd(sorted.ordValue()));
        assertEquals(new BytesRef(Long.toString(value)), binary.binaryValue());
      }
      final IndexableField[] valuesFields = doc.getFields("values");
      if (valuesFields.length == 0) {
        assertTrue(sortedNumeric.docID() + " vs " + i, sortedNumeric.docID() < i);
      } else {
        final Set<Long> valueSet = new HashSet<>();
        for (IndexableField sf : valuesFields) {
          valueSet.add(sf.numericValue().longValue());
        }
        assertEquals(i, sortedNumeric.nextDoc());
        assertEquals(valuesFields.length, sortedNumeric.docValueCount());
        for (int j = 0; j < sortedNumeric.docValueCount(); ++j) {
          assertTrue(valueSet.contains(sortedNumeric.nextValue()));
        }
        assertEquals(i, sortedSet.nextDoc());
        int sortedSetCount = 0;
        while (true) {
          long ord = sortedSet.nextOrd();
          if (ord == SortedSetDocValues.NO_MORE_ORDS) {
            break;
          }
          assertTrue(valueSet.contains(Long.parseLong(sortedSet.lookupOrd(ord).utf8ToString())));
          sortedSetCount++;
        }
        assertEquals(valueSet.size(), sortedSetCount);
      }
    }
  }
  indexReader.close();
  dir.close();
}
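The test drives every iterator strictly forward via nextDoc() and docID() comparisons. For random access, Lucene 7 doc-values iterators also expose advanceExact(); a minimal sketch against the same "numeric" field, where docID is a hypothetical target document:

// Sketch: advanceExact positions the iterator on the target document and
// reports whether that document actually has a value (sparse fields may not).
NumericDocValues numeric = DocValues.getNumeric(reader, "numeric");
if (numeric.advanceExact(docID)) {
  long v = numeric.longValue(); // only meaningful when advanceExact returned true
}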
Use of org.apache.lucene.index.SortedSetDocValues in project lucene-solr by apache.
The class TestLucene70DocValuesFormat, method doTestTermsEnumRandom.
// TODO: try to refactor this and some termsenum tests into the base class.
// to do this we need to fix the test class to get a DVF not a Codec so we can
// set up the postings format correctly.
private void doTestTermsEnumRandom(int numDocs, Supplier<String> valuesProducer) throws Exception {
  Directory dir = newFSDirectory(createTempDir());
  IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
  conf.setMergeScheduler(new SerialMergeScheduler());
  // set to duel against a codec which has ordinals:
  final PostingsFormat pf = TestUtil.getPostingsFormatWithOrds(random());
  final DocValuesFormat dv = new Lucene70DocValuesFormat();
  conf.setCodec(new AssertingCodec() {
    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
      return pf;
    }

    @Override
    public DocValuesFormat getDocValuesFormatForField(String field) {
      return dv;
    }
  });
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
  // index some docs
  for (int i = 0; i < numDocs; i++) {
    Document doc = new Document();
    Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
    doc.add(idField);
    int numValues = random().nextInt(17);
    // create a random list of strings
    List<String> values = new ArrayList<>();
    for (int v = 0; v < numValues; v++) {
      values.add(valuesProducer.get());
    }
    // add in any order to the indexed field
    ArrayList<String> unordered = new ArrayList<>(values);
    Collections.shuffle(unordered, random());
    for (String v : unordered) {
      doc.add(newStringField("indexed", v, Field.Store.NO));
    }
    // add in any order to the dv field
    ArrayList<String> unordered2 = new ArrayList<>(values);
    Collections.shuffle(unordered2, random());
    for (String v : unordered2) {
      doc.add(new SortedSetDocValuesField("dv", new BytesRef(v)));
    }
    writer.addDocument(doc);
    if (random().nextInt(31) == 0) {
      writer.commit();
    }
  }
  // delete some docs
  int numDeletions = random().nextInt(numDocs / 10);
  for (int i = 0; i < numDeletions; i++) {
    int id = random().nextInt(numDocs);
    writer.deleteDocuments(new Term("id", Integer.toString(id)));
  }
  // compare per-segment
  DirectoryReader ir = writer.getReader();
  for (LeafReaderContext context : ir.leaves()) {
    LeafReader r = context.reader();
    Terms terms = r.terms("indexed");
    if (terms != null) {
      SortedSetDocValues ssdv = r.getSortedSetDocValues("dv");
      assertEquals(terms.size(), ssdv.getValueCount());
      TermsEnum expected = terms.iterator();
      TermsEnum actual = r.getSortedSetDocValues("dv").termsEnum();
      assertEquals(terms.size(), expected, actual);
      doTestSortedSetEnumAdvanceIndependently(ssdv);
    }
  }
  ir.close();
  writer.forceMerge(1);
  // now compare again after the merge
  ir = writer.getReader();
  LeafReader ar = getOnlyLeafReader(ir);
  Terms terms = ar.terms("indexed");
  if (terms != null) {
    assertEquals(terms.size(), ar.getSortedSetDocValues("dv").getValueCount());
    TermsEnum expected = terms.iterator();
    TermsEnum actual = ar.getSortedSetDocValues("dv").termsEnum();
    assertEquals(terms.size(), expected, actual);
  }
  ir.close();
  writer.close();
  dir.close();
}
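The duel above works because SortedSetDocValues.termsEnum() exposes the value dictionary as a real TermsEnum, including seeking. A hedged sketch using ssdv from the per-segment loop above (the term "foo" is illustrative, not from the test):

// Sketch: seek by term, then cross-check the ord against the dv dictionary.
TermsEnum te = ssdv.termsEnum();
if (te.seekExact(new BytesRef("foo"))) {
  long ord = te.ord(); // ordinal of "foo" within the sorted dictionary
  assert ssdv.lookupOrd(ord).utf8ToString().equals("foo");
}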
Use of org.apache.lucene.index.SortedSetDocValues in project lucene-solr by apache.
The class SimpleTextDocValuesWriter, method addSortedSetField.
@Override
public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
  assert fieldSeen(field.name);
  assert field.getDocValuesType() == DocValuesType.SORTED_SET;
  writeFieldEntry(field, DocValuesType.SORTED_SET);
  long valueCount = 0;
  int maxLength = 0;
  TermsEnum terms = valuesProducer.getSortedSet(field).termsEnum();
  for (BytesRef value = terms.next(); value != null; value = terms.next()) {
    maxLength = Math.max(maxLength, value.length);
    valueCount++;
  }
  // write numValues
  SimpleTextUtil.write(data, NUMVALUES);
  SimpleTextUtil.write(data, Long.toString(valueCount), scratch);
  SimpleTextUtil.writeNewline(data);
  // write maxLength
  SimpleTextUtil.write(data, MAXLENGTH);
  SimpleTextUtil.write(data, Integer.toString(maxLength), scratch);
  SimpleTextUtil.writeNewline(data);
  int maxBytesLength = Integer.toString(maxLength).length();
  StringBuilder sb = new StringBuilder();
  for (int i = 0; i < maxBytesLength; i++) {
    sb.append('0');
  }
  // write our pattern for encoding lengths
  SimpleTextUtil.write(data, PATTERN);
  SimpleTextUtil.write(data, sb.toString(), scratch);
  SimpleTextUtil.writeNewline(data);
  final DecimalFormat encoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT));
  // compute ord pattern: this is funny, we encode all values for all docs to find the maximum length
  int maxOrdListLength = 0;
  StringBuilder sb2 = new StringBuilder();
  SortedSetDocValues values = valuesProducer.getSortedSet(field);
  for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
    sb2.setLength(0);
    for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
      if (sb2.length() > 0) {
        sb2.append(",");
      }
      sb2.append(Long.toString(ord));
    }
    maxOrdListLength = Math.max(maxOrdListLength, sb2.length());
  }
  sb2.setLength(0);
  for (int i = 0; i < maxOrdListLength; i++) {
    sb2.append('X');
  }
  // write our pattern for ord lists
  SimpleTextUtil.write(data, ORDPATTERN);
  SimpleTextUtil.write(data, sb2.toString(), scratch);
  SimpleTextUtil.writeNewline(data);
  // for asserts:
  long valuesSeen = 0;
  terms = valuesProducer.getSortedSet(field).termsEnum();
  for (BytesRef value = terms.next(); value != null; value = terms.next()) {
    // write length
    SimpleTextUtil.write(data, LENGTH);
    SimpleTextUtil.write(data, encoder.format(value.length), scratch);
    SimpleTextUtil.writeNewline(data);
    // write bytes -- don't use SimpleText.write
    // because it escapes:
    data.writeBytes(value.bytes, value.offset, value.length);
    // pad to fit
    for (int i = value.length; i < maxLength; i++) {
      data.writeByte((byte) ' ');
    }
    SimpleTextUtil.writeNewline(data);
    valuesSeen++;
    assert valuesSeen <= valueCount;
  }
  assert valuesSeen == valueCount;
  values = valuesProducer.getSortedSet(field);
  // write the ords for each doc comma-separated
  for (int i = 0; i < numDocs; ++i) {
    if (values.docID() < i) {
      values.nextDoc();
      assert values.docID() >= i;
    }
    sb2.setLength(0);
    if (values.docID() == i) {
      for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
        if (sb2.length() > 0) {
          sb2.append(",");
        }
        sb2.append(Long.toString(ord));
      }
    }
    // now pad to fit: these are numbers so spaces work well. reader calls trim()
    int numPadding = maxOrdListLength - sb2.length();
    for (int j = 0; j < numPadding; j++) {
      sb2.append(' ');
    }
    SimpleTextUtil.write(data, sb2.toString(), scratch);
    SimpleTextUtil.writeNewline(data);
  }
}
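The '0' pattern built above is what keeps SimpleText records fixed-width: DecimalFormat left-pads every length to the digit count of maxLength, so the reader can rely on constant offsets. A standalone sketch of just that trick (plain java.text, independent of Lucene):

// Sketch: a pattern with one '0' per digit of the largest value pads all
// smaller numbers to the same width.
DecimalFormat encoder = new DecimalFormat("000", new DecimalFormatSymbols(Locale.ROOT));
System.out.println(encoder.format(7));   // prints 007
System.out.println(encoder.format(123)); // prints 123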
Use of org.apache.lucene.index.SortedSetDocValues in project lucene-solr by apache.
The class DocValuesConsumer, method mergeSortedSetField.
/**
 * Merges the sortedset docvalues from <code>toMerge</code>.
 * <p>
 * The default implementation calls {@link #addSortedSetField}, passing
 * an Iterable that merges ordinals and values and filters deleted documents.
 */
public void mergeSortedSetField(FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException {
  List<SortedSetDocValues> toMerge = new ArrayList<>();
  for (int i = 0; i < mergeState.docValuesProducers.length; i++) {
    SortedSetDocValues values = null;
    DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i];
    if (docValuesProducer != null) {
      FieldInfo fieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name);
      if (fieldInfo != null && fieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) {
        values = docValuesProducer.getSortedSet(fieldInfo);
      }
    }
    if (values == null) {
      values = DocValues.emptySortedSet();
    }
    toMerge.add(values);
  }
  // step 1: iterate thru each sub and mark terms still in use
  TermsEnum[] liveTerms = new TermsEnum[toMerge.size()];
  long[] weights = new long[liveTerms.length];
  for (int sub = 0; sub < liveTerms.length; sub++) {
    SortedSetDocValues dv = toMerge.get(sub);
    Bits liveDocs = mergeState.liveDocs[sub];
    if (liveDocs == null) {
      liveTerms[sub] = dv.termsEnum();
      weights[sub] = dv.getValueCount();
    } else {
      LongBitSet bitset = new LongBitSet(dv.getValueCount());
      int docID;
      while ((docID = dv.nextDoc()) != NO_MORE_DOCS) {
        if (liveDocs.get(docID)) {
          long ord;
          while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
            bitset.set(ord);
          }
        }
      }
      liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
      weights[sub] = bitset.cardinality();
    }
  }
  // step 2: create ordinal map (this conceptually does the "merging")
  final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT);
  // step 3: add field
  addSortedSetField(mergeFieldInfo, new EmptyDocValuesProducer() {
    @Override
    public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException {
      if (fieldInfo != mergeFieldInfo) {
        throw new IllegalArgumentException("wrong FieldInfo");
      }
      // We must make new iterators + DocIDMerger for each iterator:
      List<SortedSetDocValuesSub> subs = new ArrayList<>();
      long cost = 0;
      for (int i = 0; i < mergeState.docValuesProducers.length; i++) {
        SortedSetDocValues values = null;
        DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i];
        if (docValuesProducer != null) {
          FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name);
          if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) {
            values = docValuesProducer.getSortedSet(readerFieldInfo);
          }
        }
        if (values == null) {
          values = DocValues.emptySortedSet();
        }
        cost += values.cost();
        subs.add(new SortedSetDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i)));
      }
      final DocIDMerger<SortedSetDocValuesSub> docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);
      final long finalCost = cost;
      return new SortedSetDocValues() {
        private int docID = -1;
        private SortedSetDocValuesSub currentSub;

        @Override
        public int docID() {
          return docID;
        }

        @Override
        public int nextDoc() throws IOException {
          currentSub = docIDMerger.next();
          if (currentSub == null) {
            docID = NO_MORE_DOCS;
          } else {
            docID = currentSub.mappedDocID;
          }
          return docID;
        }

        @Override
        public int advance(int target) throws IOException {
          throw new UnsupportedOperationException();
        }

        @Override
        public boolean advanceExact(int target) throws IOException {
          throw new UnsupportedOperationException();
        }

        @Override
        public long nextOrd() throws IOException {
          long subOrd = currentSub.values.nextOrd();
          if (subOrd == NO_MORE_ORDS) {
            return NO_MORE_ORDS;
          }
          return currentSub.map.get(subOrd);
        }

        @Override
        public long cost() {
          return finalCost;
        }

        @Override
        public BytesRef lookupOrd(long ord) throws IOException {
          int segmentNumber = map.getFirstSegmentNumber(ord);
          long segmentOrd = map.getFirstSegmentOrd(ord);
          return toMerge.get(segmentNumber).lookupOrd(segmentOrd);
        }

        @Override
        public long getValueCount() {
          return map.getValueCount();
        }
      };
    }
  });
}
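The OrdinalMap built in step 2 can be consulted in both directions, which is exactly what the anonymous SortedSetDocValues above does. A minimal sketch restricted to the calls that appear in the method (map and toMerge as above; segmentOrd is a hypothetical per-segment ordinal):

// Sketch: segment ord -> global ord (the nextOrd path), and
// global ord -> term bytes (the lookupOrd path).
LongValues toGlobal = map.getGlobalOrds(0);         // remapping for segment 0
long globalOrd = toGlobal.get(segmentOrd);
int seg = map.getFirstSegmentNumber(globalOrd);     // a segment containing the term
long segOrd = map.getFirstSegmentOrd(globalOrd);
BytesRef term = toMerge.get(seg).lookupOrd(segOrd); // resolve the bytes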