Use of org.apache.lucene.document.SortedDocValuesField in project lucene-solr by apache.
In class TestIndexSorting, method testRandom2:
public void testRandom2() throws Exception {
  int numDocs = atLeast(100);

  FieldType POSITIONS_TYPE = new FieldType(TextField.TYPE_NOT_STORED);
  POSITIONS_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  POSITIONS_TYPE.freeze();

  FieldType TERM_VECTORS_TYPE = new FieldType(TextField.TYPE_NOT_STORED);
  TERM_VECTORS_TYPE.setStoreTermVectors(true);
  TERM_VECTORS_TYPE.freeze();

  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer();
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };

  List<Document> docs = new ArrayList<>();
  for (int i = 0; i < numDocs; i++) {
    int id = i * 10;
    Document doc = new Document();
    doc.add(new StringField("id", Integer.toString(id), Store.YES));
    doc.add(new StringField("docs", "#all#", Store.NO));
    PositionsTokenStream positions = new PositionsTokenStream();
    positions.setId(id);
    doc.add(new Field("positions", positions, POSITIONS_TYPE));
    doc.add(new NumericDocValuesField("numeric", id));
    String value = IntStream.range(0, id).mapToObj(k -> Integer.toString(id)).collect(Collectors.joining(" "));
    TextField norms = new TextField("norms", value, Store.NO);
    doc.add(norms);
    doc.add(new BinaryDocValuesField("binary", new BytesRef(Integer.toString(id))));
    doc.add(new SortedDocValuesField("sorted", new BytesRef(Integer.toString(id))));
    doc.add(new SortedSetDocValuesField("multi_valued_string", new BytesRef(Integer.toString(id))));
    doc.add(new SortedSetDocValuesField("multi_valued_string", new BytesRef(Integer.toString(id + 1))));
    doc.add(new SortedNumericDocValuesField("multi_valued_numeric", id));
    doc.add(new SortedNumericDocValuesField("multi_valued_numeric", id + 1));
    doc.add(new Field("term_vectors", Integer.toString(id), TERM_VECTORS_TYPE));
    byte[] bytes = new byte[4];
    NumericUtils.intToSortableBytes(id, bytes, 0);
    doc.add(new BinaryPoint("points", bytes));
    docs.add(doc);
  }

  // Must use the same seed for both RandomIndexWriters so they behave identically.
  long seed = random().nextLong();

  // We add documents already in ID order for the first writer:
  Directory dir1 = newFSDirectory(createTempDir());
  Random random1 = new Random(seed);
  IndexWriterConfig iwc1 = newIndexWriterConfig(random1, a);
  // for testing the norms field
  iwc1.setSimilarity(new NormsSimilarity(iwc1.getSimilarity()));
  // preserve docIDs
  iwc1.setMergePolicy(newLogMergePolicy());
  if (VERBOSE) {
    System.out.println("TEST: now index pre-sorted");
  }
  RandomIndexWriter w1 = new RandomIndexWriter(random1, dir1, iwc1);
  for (Document doc : docs) {
    ((PositionsTokenStream) ((Field) doc.getField("positions")).tokenStreamValue()).setId(Integer.parseInt(doc.get("id")));
    w1.addDocument(doc);
  }

  // We shuffle the documents, but set an index sort, for the second writer:
  Directory dir2 = newFSDirectory(createTempDir());
  Random random2 = new Random(seed);
  IndexWriterConfig iwc2 = newIndexWriterConfig(random2, a);
  // for testing the norms field
  iwc2.setSimilarity(new NormsSimilarity(iwc2.getSimilarity()));
  Sort sort = new Sort(new SortField("numeric", SortField.Type.INT));
  iwc2.setIndexSort(sort);
  Collections.shuffle(docs, random());
  if (VERBOSE) {
    System.out.println("TEST: now index with index-time sorting");
  }
  RandomIndexWriter w2 = new RandomIndexWriter(random2, dir2, iwc2);
  int count = 0;
  int commitAtCount = TestUtil.nextInt(random(), 1, numDocs - 1);
  for (Document doc : docs) {
    ((PositionsTokenStream) ((Field) doc.getField("positions")).tokenStreamValue()).setId(Integer.parseInt(doc.get("id")));
    if (count++ == commitAtCount) {
      // Ensure forceMerge really does merge:
      w2.commit();
    }
    w2.addDocument(doc);
  }

  if (VERBOSE) {
    System.out.println("TEST: now force merge");
  }
  w2.forceMerge(1);

  DirectoryReader r1 = w1.getReader();
  DirectoryReader r2 = w2.getReader();
  if (VERBOSE) {
    System.out.println("TEST: now compare r1=" + r1 + " r2=" + r2);
  }
  assertEquals(sort, getOnlyLeafReader(r2).getMetaData().getSort());
  assertReaderEquals("left: sorted by hand; right: sorted by Lucene", r1, r2);
  IOUtils.close(w1, w2, r1, r2, dir1, dir2);
}
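testRandom2 relies on two helpers defined elsewhere in TestIndexSorting that do not appear in this excerpt: NormsSimilarity (a Similarity wrapper used to give the "norms" field predictable norms) and PositionsTokenStream. A minimal sketch of what a PositionsTokenStream could look like follows; the class name and setId method match the usage above, but the body is an assumption, not the actual lucene-solr implementation:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.BytesRef;

// Hypothetical sketch: emits a single "#all#" token whose payload carries the
// doc's id, so the test can verify that postings data follows the index sort.
final class PositionsTokenStream extends TokenStream {
  private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
  private final PayloadAttribute payload = addAttribute(PayloadAttribute.class);
  private final OffsetAttribute offset = addAttribute(OffsetAttribute.class);
  private boolean exhausted;
  private int id;

  public void setId(int id) {
    this.id = id;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (exhausted) {
      return false;
    }
    clearAttributes();
    term.append("#all#");
    payload.setPayload(new BytesRef(Integer.toString(id))); // id travels in the payload
    offset.setOffset(0, 5);
    exhausted = true;
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    exhausted = false;
  }
}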
Use of org.apache.lucene.document.SortedDocValuesField in project lucene-solr by apache.
In class TestIndexSorting, method testRandom3:
// pits index-time sorting against query-time sorting
public void testRandom3() throws Exception {
  int numDocs;
  if (TEST_NIGHTLY) {
    numDocs = atLeast(100000);
  } else {
    numDocs = atLeast(1000);
  }
  List<RandomDoc> docs = new ArrayList<>();

  Sort sort = randomSort();
  if (VERBOSE) {
    System.out.println("TEST: numDocs=" + numDocs + " use sort=" + sort);
  }

  // no index sorting, all search-time sorting:
  Directory dir1 = newFSDirectory(createTempDir());
  IndexWriterConfig iwc1 = newIndexWriterConfig(new MockAnalyzer(random()));
  IndexWriter w1 = new IndexWriter(dir1, iwc1);

  // use index sorting:
  Directory dir2 = newFSDirectory(createTempDir());
  IndexWriterConfig iwc2 = newIndexWriterConfig(new MockAnalyzer(random()));
  iwc2.setIndexSort(sort);
  IndexWriter w2 = new IndexWriter(dir2, iwc2);

  Set<Integer> toDelete = new HashSet<>();
  double deleteChance = random().nextDouble();

  for (int id = 0; id < numDocs; id++) {
    RandomDoc docValues = new RandomDoc(id);
    docs.add(docValues);
    if (VERBOSE) {
      System.out.println("TEST: doc id=" + id);
      System.out.println("  int=" + docValues.intValue);
      System.out.println("  long=" + docValues.longValue);
      System.out.println("  float=" + docValues.floatValue);
      System.out.println("  double=" + docValues.doubleValue);
      System.out.println("  bytes=" + new BytesRef(docValues.bytesValue));
    }
    Document doc = new Document();
    doc.add(new StringField("id", Integer.toString(id), Field.Store.YES));
    doc.add(new NumericDocValuesField("id", id));
    doc.add(new NumericDocValuesField("int", docValues.intValue));
    doc.add(new NumericDocValuesField("long", docValues.longValue));
    doc.add(new DoubleDocValuesField("double", docValues.doubleValue));
    doc.add(new FloatDocValuesField("float", docValues.floatValue));
    doc.add(new SortedDocValuesField("bytes", new BytesRef(docValues.bytesValue)));
    for (int value : docValues.intValues) {
      doc.add(new SortedNumericDocValuesField("multi_valued_int", value));
    }
    for (long value : docValues.longValues) {
      doc.add(new SortedNumericDocValuesField("multi_valued_long", value));
    }
    for (float value : docValues.floatValues) {
      doc.add(new SortedNumericDocValuesField("multi_valued_float", NumericUtils.floatToSortableInt(value)));
    }
    for (double value : docValues.doubleValues) {
      doc.add(new SortedNumericDocValuesField("multi_valued_double", NumericUtils.doubleToSortableLong(value)));
    }
    for (byte[] value : docValues.bytesValues) {
      doc.add(new SortedSetDocValuesField("multi_valued_bytes", new BytesRef(value)));
    }
    w1.addDocument(doc);
    w2.addDocument(doc);
    if (random().nextDouble() < deleteChance) {
      toDelete.add(id);
    }
  }

  for (int id : toDelete) {
    w1.deleteDocuments(new Term("id", Integer.toString(id)));
    w2.deleteDocuments(new Term("id", Integer.toString(id)));
  }

  DirectoryReader r1 = DirectoryReader.open(w1);
  IndexSearcher s1 = newSearcher(r1);

  if (random().nextBoolean()) {
    int maxSegmentCount = TestUtil.nextInt(random(), 1, 5);
    if (VERBOSE) {
      System.out.println("TEST: now forceMerge(" + maxSegmentCount + ")");
    }
    w2.forceMerge(maxSegmentCount);
  }

  DirectoryReader r2 = DirectoryReader.open(w2);
  IndexSearcher s2 = newSearcher(r2);

  for (int iter = 0; iter < 100; iter++) {
    int numHits = TestUtil.nextInt(random(), 1, numDocs);
    if (VERBOSE) {
      System.out.println("TEST: iter=" + iter + " numHits=" + numHits);
    }

    TopFieldCollector c1 = TopFieldCollector.create(sort, numHits, true, true, true);
    s1.search(new MatchAllDocsQuery(), c1);
    TopDocs hits1 = c1.topDocs();

    TopFieldCollector c2 = TopFieldCollector.create(sort, numHits, true, true, true);
    EarlyTerminatingSortingCollector c3 = new EarlyTerminatingSortingCollector(c2, sort, numHits);
    s2.search(new MatchAllDocsQuery(), c3);
    TopDocs hits2 = c2.topDocs();

    if (VERBOSE) {
      System.out.println("  topDocs query-time sort: totalHits=" + hits1.totalHits);
      for (ScoreDoc scoreDoc : hits1.scoreDocs) {
        System.out.println("    " + scoreDoc.doc);
      }
      System.out.println("  topDocs index-time sort: totalHits=" + hits2.totalHits);
      for (ScoreDoc scoreDoc : hits2.scoreDocs) {
        System.out.println("    " + scoreDoc.doc);
      }
    }

    assertTrue(hits2.totalHits <= hits1.totalHits);
    assertEquals(hits2.scoreDocs.length, hits1.scoreDocs.length);
    for (int i = 0; i < hits2.scoreDocs.length; i++) {
      ScoreDoc hit1 = hits1.scoreDocs[i];
      ScoreDoc hit2 = hits2.scoreDocs[i];
      assertEquals(r1.document(hit1.doc).get("id"), r2.document(hit2.doc).get("id"));
      assertEquals(((FieldDoc) hit1).fields, ((FieldDoc) hit2).fields);
    }
  }
  IOUtils.close(r1, r2, w1, w2, dir1, dir2);
}
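testRandom3 references a RandomDoc holder plus a randomSort() helper, both defined elsewhere in TestIndexSorting. Below is a plausible reconstruction of RandomDoc, inferred purely from the fields the test reads above; the initialization shown is an assumption, and it presumes the LuceneTestCase context for random() and TestUtil:

// Hypothetical sketch of the RandomDoc holder used by testRandom3. Field
// names match the usages above; the actual value generation may differ.
private static class RandomDoc {
  final int id;
  final int intValue;
  final long longValue;
  final float floatValue;
  final double doubleValue;
  final byte[] bytesValue;
  final int[] intValues;
  final long[] longValues;
  final float[] floatValues;
  final double[] doubleValues;
  final byte[][] bytesValues;

  RandomDoc(int id) {
    this.id = id;
    intValue = random().nextInt();
    longValue = random().nextLong();
    floatValue = random().nextFloat();
    doubleValue = random().nextDouble();
    bytesValue = new byte[TestUtil.nextInt(random(), 1, 50)];
    random().nextBytes(bytesValue);

    // A small random number of entries for each multi-valued field:
    int numValues = random().nextInt(10);
    intValues = new int[numValues];
    longValues = new long[numValues];
    floatValues = new float[numValues];
    doubleValues = new double[numValues];
    bytesValues = new byte[numValues][];
    for (int i = 0; i < numValues; i++) {
      intValues[i] = random().nextInt();
      longValues[i] = random().nextLong();
      floatValues[i] = random().nextFloat();
      doubleValues[i] = random().nextDouble();
      bytesValues[i] = new byte[TestUtil.nextInt(random(), 1, 50)];
      random().nextBytes(bytesValues[i]);
    }
  }
}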
Use of org.apache.lucene.document.SortedDocValuesField in project lucene-solr by apache.
In class TestIndexSorting, method testBasicString:
public void testBasicString() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
  Sort indexSort = new Sort(new SortField("foo", SortField.Type.STRING));
  iwc.setIndexSort(indexSort);
  IndexWriter w = new IndexWriter(dir, iwc);

  Document doc = new Document();
  doc.add(new SortedDocValuesField("foo", new BytesRef("zzz")));
  w.addDocument(doc);
  // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging:
  w.commit();

  doc = new Document();
  doc.add(new SortedDocValuesField("foo", new BytesRef("aaa")));
  w.addDocument(doc);
  w.commit();

  doc = new Document();
  doc.add(new SortedDocValuesField("foo", new BytesRef("mmm")));
  w.addDocument(doc);
  w.forceMerge(1);

  DirectoryReader r = DirectoryReader.open(w);
  LeafReader leaf = getOnlyLeafReader(r);
  assertEquals(3, leaf.maxDoc());
  SortedDocValues values = leaf.getSortedDocValues("foo");
  assertEquals(0, values.nextDoc());
  assertEquals("aaa", values.binaryValue().utf8ToString());
  assertEquals(1, values.nextDoc());
  assertEquals("mmm", values.binaryValue().utf8ToString());
  assertEquals(2, values.nextDoc());
  assertEquals("zzz", values.binaryValue().utf8ToString());
  r.close();
  w.close();
  dir.close();
}
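As an illustrative follow-up (not part of the original test), the merged segment's metadata could also be checked against the configured sort before closing the reader, mirroring the assertion testRandom2 makes above; this reuses the indexSort and r variables from testBasicString:

// Sketch: the single merged segment should record the index sort we configured.
assertEquals(indexSort, getOnlyLeafReader(r).getMetaData().getSort());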
Use of org.apache.lucene.document.SortedDocValuesField in project lucene-solr by apache.
In class TestIndexSorting, method testTieBreak:
public void testTieBreak() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
  iwc.setIndexSort(new Sort(new SortField("foo", SortField.Type.STRING)));
  iwc.setMergePolicy(newLogMergePolicy());
  IndexWriter w = new IndexWriter(dir, iwc);
  for (int id = 0; id < 1000; id++) {
    Document doc = new Document();
    doc.add(new StoredField("id", id));
    String value;
    if (id < 500) {
      value = "bar2";
    } else {
      value = "bar1";
    }
    doc.add(new SortedDocValuesField("foo", new BytesRef(value)));
    w.addDocument(doc);
    if (id == 500) {
      w.commit();
    }
  }
  w.forceMerge(1);

  DirectoryReader r = DirectoryReader.open(w);
  // The index sort breaks ties by original docID: all "bar1" docs (original
  // ids 500..999) come first, in their original order, followed by the
  // "bar2" docs (ids 0..499).
  for (int docID = 0; docID < 1000; docID++) {
    int expectedID;
    if (docID < 500) {
      expectedID = 500 + docID;
    } else {
      expectedID = docID - 500;
    }
    assertEquals(expectedID, r.document(docID).getField("id").numericValue().intValue());
  }
  IOUtils.close(r, w, dir);
}
Use of org.apache.lucene.document.SortedDocValuesField in project lucene-solr by apache.
In class BaseTestRangeFilter, method build:
private static IndexReader build(Random random, TestIndex index) throws IOException {
  /* build an index */
  Document doc = new Document();
  Field idField = newStringField(random, "id", "", Field.Store.YES);
  Field idDVField = new SortedDocValuesField("id", new BytesRef());
  Field intIdField = new IntPoint("id_int", 0);
  Field intDVField = new NumericDocValuesField("id_int", 0);
  Field floatIdField = new FloatPoint("id_float", 0);
  Field floatDVField = new NumericDocValuesField("id_float", 0);
  Field longIdField = new LongPoint("id_long", 0);
  Field longDVField = new NumericDocValuesField("id_long", 0);
  Field doubleIdField = new DoublePoint("id_double", 0);
  Field doubleDVField = new NumericDocValuesField("id_double", 0);
  Field randField = newStringField(random, "rand", "", Field.Store.YES);
  Field randDVField = new SortedDocValuesField("rand", new BytesRef());
  Field bodyField = newStringField(random, "body", "", Field.Store.NO);
  Field bodyDVField = new SortedDocValuesField("body", new BytesRef());
  doc.add(idField);
  doc.add(idDVField);
  doc.add(intIdField);
  doc.add(intDVField);
  doc.add(floatIdField);
  doc.add(floatDVField);
  doc.add(longIdField);
  doc.add(longDVField);
  doc.add(doubleIdField);
  doc.add(doubleDVField);
  doc.add(randField);
  doc.add(randDVField);
  doc.add(bodyField);
  doc.add(bodyDVField);

  RandomIndexWriter writer = new RandomIndexWriter(random, index.index,
      newIndexWriterConfig(random, new MockAnalyzer(random))
          .setOpenMode(OpenMode.CREATE)
          .setMaxBufferedDocs(TestUtil.nextInt(random, 50, 1000))
          .setMergePolicy(newLogMergePolicy()));
  TestUtil.reduceOpenFiles(writer.w);

  while (true) {
    int minCount = 0;
    int maxCount = 0;
    for (int d = minId; d <= maxId; d++) {
      idField.setStringValue(pad(d));
      idDVField.setBytesValue(new BytesRef(pad(d)));
      intIdField.setIntValue(d);
      intDVField.setLongValue(d);
      floatIdField.setFloatValue(d);
      floatDVField.setLongValue(Float.floatToRawIntBits(d));
      longIdField.setLongValue(d);
      longDVField.setLongValue(d);
      doubleIdField.setDoubleValue(d);
      doubleDVField.setLongValue(Double.doubleToRawLongBits(d));
      int r = index.allowNegativeRandomInts ? random.nextInt() : random.nextInt(Integer.MAX_VALUE);
      if (index.maxR < r) {
        index.maxR = r;
        maxCount = 1;
      } else if (index.maxR == r) {
        maxCount++;
      }
      if (r < index.minR) {
        index.minR = r;
        minCount = 1;
      } else if (r == index.minR) {
        minCount++;
      }
      randField.setStringValue(pad(r));
      randDVField.setBytesValue(new BytesRef(pad(r)));
      bodyField.setStringValue("body");
      bodyDVField.setBytesValue(new BytesRef("body"));
      writer.addDocument(doc);
    }
    if (minCount == 1 && maxCount == 1) {
      // Our subclasses rely on only 1 doc having the min or max, so we loop
      // until we satisfy that. It should be exceedingly rare (Yonik calculates
      // 1 in ~429,000 times) that this loop requires more than one try:
      IndexReader ir = writer.getReader();
      writer.close();
      return ir;
    }
    // try again
    writer.deleteAll();
  }
}
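build() depends on a pad() helper, defined elsewhere in BaseTestRangeFilter and not shown in this excerpt, that renders ints as fixed-width strings so that lexicographic order matches numeric order. A hedged sketch of such a helper for the non-negative case follows; the real one also has to handle the negative ints that allowNegativeRandomInts can produce:

import java.util.Locale;

// Hypothetical pad() sketch: zero-pads to a fixed width so "00000000042"
// sorts after "00000000009", matching numeric order for non-negative ints.
static String pad(int n) {
  return String.format(Locale.ROOT, "%011d", n);
}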