Use of org.apache.lucene.index.SortedSetDocValues in project lucene-solr by Apache.
The class DatasetSplitter, method split:
/**
 * Split a given index into three indexes for training, test and cross validation tasks respectively
 *
 * @param originalIndex        an {@link org.apache.lucene.index.IndexReader} on the source index
 * @param trainingIndex        a {@link Directory} used to write the training index
 * @param testIndex            a {@link Directory} used to write the test index
 * @param crossValidationIndex a {@link Directory} used to write the cross validation index
 * @param analyzer             {@link Analyzer} used to create the new docs
 * @param termVectors          {@code true} if term vectors should be kept
 * @param classFieldName       name of the field used as the label for classification; this must be indexed with sorted doc values
 * @param fieldNames           names of fields that need to be put in the new indexes or {@code null} if all should be used
 * @throws IOException if any writing operation fails on any of the indexes
 */
public void split(IndexReader originalIndex, Directory trainingIndex, Directory testIndex, Directory crossValidationIndex, Analyzer analyzer, boolean termVectors, String classFieldName, String... fieldNames) throws IOException {
  // create IWs for train / test / cv IDXs
  IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(analyzer));
  IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(analyzer));
  IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(analyzer));
  // get the exact no. of existing classes
  int noOfClasses = 0;
  for (LeafReaderContext leaf : originalIndex.leaves()) {
    long valueCount = 0;
    SortedDocValues classValues = leaf.reader().getSortedDocValues(classFieldName);
    SortedSetDocValues sortedSetDocValues = null;
    if (classValues != null) {
      valueCount = classValues.getValueCount();
    } else {
      sortedSetDocValues = leaf.reader().getSortedSetDocValues(classFieldName);
      if (sortedSetDocValues != null) {
        valueCount = sortedSetDocValues.getValueCount();
      }
    }
    if (classValues == null && sortedSetDocValues == null) {
      // no doc values at all: approximate with the no. of indexed terms
      Terms terms = leaf.reader().terms(classFieldName);
      valueCount = terms == null ? 0 : terms.size();
    }
    noOfClasses += valueCount;
  }
  try {
    IndexSearcher indexSearcher = new IndexSearcher(originalIndex);
    GroupingSearch gs = new GroupingSearch(classFieldName);
    gs.setGroupSort(Sort.INDEXORDER);
    gs.setSortWithinGroup(Sort.INDEXORDER);
    gs.setAllGroups(true);
    gs.setGroupDocsLimit(originalIndex.maxDoc());
    TopGroups<Object> topGroups = gs.search(indexSearcher, new MatchAllDocsQuery(), 0, noOfClasses);
    // set the type to be indexed, stored, with term vectors
    FieldType ft = new FieldType(TextField.TYPE_STORED);
    if (termVectors) {
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorOffsets(true);
      ft.setStoreTermVectorPositions(true);
    }
    int b = 0;
    // iterate over existing documents
    for (GroupDocs<Object> group : topGroups.groups) {
      int totalHits = group.totalHits;
      double testSize = totalHits * testRatio;
      int tc = 0;
      double cvSize = totalHits * crossValidationRatio;
      int cvc = 0;
      for (ScoreDoc scoreDoc : group.scoreDocs) {
        // create a new document for indexing
        Document doc = createNewDoc(originalIndex, ft, scoreDoc, fieldNames);
        // add it to one of the IDXs
        if (b % 2 == 0 && tc < testSize) {
          testWriter.addDocument(doc);
          tc++;
        } else if (cvc < cvSize) {
          cvWriter.addDocument(doc);
          cvc++;
        } else {
          trainingWriter.addDocument(doc);
        }
        b++;
      }
    }
    // commit
    testWriter.commit();
    cvWriter.commit();
    trainingWriter.commit();
    // merge
    testWriter.forceMerge(3);
    cvWriter.forceMerge(3);
    trainingWriter.forceMerge(3);
  } catch (Exception e) {
    throw new IOException(e);
  } finally {
    // close IWs
    testWriter.close();
    cvWriter.close();
    trainingWriter.close();
    originalIndex.close();
  }
}
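
For context, here is a minimal sketch of how this splitter might be driven. It assumes the DatasetSplitter(testRatio, crossValidationRatio) constructor; the paths, analyzer choice, and field names ("category", "body", "title") are purely illustrative:

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.classification.utils.DatasetSplitter;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class SplitExample {
  public static void main(String[] args) throws Exception {
    // 10% test, 10% cross validation, remaining 80% training (assumed constructor semantics)
    DatasetSplitter splitter = new DatasetSplitter(0.1d, 0.1d);
    IndexReader originalIndex = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/source-index")));
    try (Directory train = FSDirectory.open(Paths.get("/path/to/train"));
         Directory test = FSDirectory.open(Paths.get("/path/to/test"));
         Directory cv = FSDirectory.open(Paths.get("/path/to/cv"))) {
      // "category" must be indexed with sorted (set) doc values, per the javadoc above;
      // note that split() closes originalIndex in its finally block
      splitter.split(originalIndex, train, test, cv, new StandardAnalyzer(), true, "category", "body", "title");
    }
  }
}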
Use of org.apache.lucene.index.SortedSetDocValues in project lucene-solr by Apache.
The class FacetFieldProcessorByArrayDV, method collectDocs:
@Override
protected void collectDocs() throws IOException {
  int domainSize = fcontext.base.size();
  if (nTerms <= 0 || domainSize < effectiveMincount) {
    // TODO: what about allBuckets? missing bucket?
    return;
  }
  // TODO: refactor some of this logic into a base class
  boolean countOnly = collectAcc == null && allBucketsAcc == null;
  boolean fullRange = startTermIndex == 0 && endTermIndex == si.getValueCount();
  // Are we expecting many hits per bucket?
  // FUTURE: pro-rate for nTerms?
  // FUTURE: better take into account number of values in multi-valued fields. This info is available for indexed fields.
  // FUTURE: take into account that bigger ord maps are more expensive than smaller ones
  // One data point: on a 5M doc index, faceting on a single-valued field with almost 1M unique values, the crossover
  // point where global counting became slower than per-segment counting was a domain of 658k docs. At that point the
  // top 10 buckets had 6-7 matches each. This was for heap docvalues produced by UninvertingReader.
  // Since those values were randomly distributed, round the domain multiplier up to account for less random real world data.
  long domainMultiplier = multiValuedField ? 4L : 2L;
  // +3 to increase test coverage with small tests
  boolean manyHitsPerBucket = domainSize * domainMultiplier > (si.getValueCount() + 3);
  // If we're only calculating counts, we're not prefixing, and we expect to collect many documents per unique value,
  // then collect per-segment before mapping to global ords at the end. This will save redundant seg->global ord mappings.
  // FUTURE: there are probably some other non "countOnly" cases where we can use this as well (i.e. those where
  // the docid is not used)
  boolean canDoPerSeg = countOnly && fullRange;
  boolean accumSeg = manyHitsPerBucket && canDoPerSeg;
  if (freq.perSeg != null) {
    // internal - override the perSeg heuristic
    accumSeg = canDoPerSeg && freq.perSeg;
  }
  final List<LeafReaderContext> leaves = fcontext.searcher.getIndexReader().leaves();
  Filter filter = fcontext.base.getTopFilter();
  for (int subIdx = 0; subIdx < leaves.size(); subIdx++) {
    LeafReaderContext subCtx = leaves.get(subIdx);
    setNextReaderFirstPhase(subCtx);
    // solr docsets already exclude any deleted docs
    DocIdSet dis = filter.getDocIdSet(subCtx, null);
    DocIdSetIterator disi = dis.iterator();
    SortedDocValues singleDv = null;
    SortedSetDocValues multiDv = null;
    if (multiValuedField) {
      // TODO: get sub from multi?
      multiDv = subCtx.reader().getSortedSetDocValues(sf.getName());
      if (multiDv == null) {
        multiDv = DocValues.emptySortedSet();
      }
      // unwrapSingleton returns null if this is not a wrapped single-valued docvalues instance
      if (unwrap_singleValued_multiDv) {
        singleDv = DocValues.unwrapSingleton(multiDv);
      }
    } else {
      singleDv = subCtx.reader().getSortedDocValues(sf.getName());
      if (singleDv == null) {
        singleDv = DocValues.emptySorted();
      }
    }
    LongValues toGlobal = ordinalMap == null ? null : ordinalMap.getGlobalOrds(subIdx);
    if (singleDv != null) {
      if (accumSeg) {
        collectPerSeg(singleDv, disi, toGlobal);
      } else if (canDoPerSeg && toGlobal != null) {
        collectCounts(singleDv, disi, toGlobal);
      } else {
        collectDocs(singleDv, disi, toGlobal);
      }
    } else {
      if (accumSeg) {
        collectPerSeg(multiDv, disi, toGlobal);
      } else if (canDoPerSeg && toGlobal != null) {
        collectCounts(multiDv, disi, toGlobal);
      } else {
        collectDocs(multiDv, disi, toGlobal);
      }
    }
  }
  // better GC
  reuse = null;
}
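
The payoff of the accumSeg path above is that the expensive seg->global ord translation happens once per unique segment ordinal rather than once per collected document. A hedged sketch of the mapping step such per-segment accumulation defers to the end of each segment (the method and array names are assumptions for illustration, not the actual collectPerSeg internals):

// Map per-segment counts into the global count array, one LongValues lookup
// per unique segment ord instead of one per hit (names are illustrative).
private void mapSegCountsToGlobal(int[] segCounts, LongValues toGlobal, int[] globalCounts) {
  for (int segOrd = 0; segOrd < segCounts.length; segOrd++) {
    int count = segCounts[segOrd];
    if (count > 0) {
      // toGlobal is null when the segment ord space is already global (single segment)
      int globalOrd = toGlobal == null ? segOrd : (int) toGlobal.get(segOrd);
      globalCounts[globalOrd] += count;
    }
  }
}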
Use of org.apache.lucene.index.SortedSetDocValues in project lucene-solr by Apache.
The class TestMemoryIndex, method testDocValues:
public void testDocValues() throws Exception {
  Document doc = new Document();
  doc.add(new NumericDocValuesField("numeric", 29L));
  doc.add(new SortedNumericDocValuesField("sorted_numeric", 33L));
  doc.add(new SortedNumericDocValuesField("sorted_numeric", 32L));
  doc.add(new SortedNumericDocValuesField("sorted_numeric", 32L));
  doc.add(new SortedNumericDocValuesField("sorted_numeric", 31L));
  doc.add(new SortedNumericDocValuesField("sorted_numeric", 30L));
  doc.add(new BinaryDocValuesField("binary", new BytesRef("a")));
  doc.add(new SortedDocValuesField("sorted", new BytesRef("b")));
  doc.add(new SortedSetDocValuesField("sorted_set", new BytesRef("f")));
  doc.add(new SortedSetDocValuesField("sorted_set", new BytesRef("d")));
  doc.add(new SortedSetDocValuesField("sorted_set", new BytesRef("d")));
  doc.add(new SortedSetDocValuesField("sorted_set", new BytesRef("c")));
  MemoryIndex mi = MemoryIndex.fromDocument(doc, analyzer);
  LeafReader leafReader = mi.createSearcher().getIndexReader().leaves().get(0).reader();
  NumericDocValues numericDocValues = leafReader.getNumericDocValues("numeric");
  assertEquals(0, numericDocValues.nextDoc());
  assertEquals(29L, numericDocValues.longValue());
  SortedNumericDocValues sortedNumericDocValues = leafReader.getSortedNumericDocValues("sorted_numeric");
  assertEquals(0, sortedNumericDocValues.nextDoc());
  assertEquals(5, sortedNumericDocValues.docValueCount());
  assertEquals(30L, sortedNumericDocValues.nextValue());
  assertEquals(31L, sortedNumericDocValues.nextValue());
  assertEquals(32L, sortedNumericDocValues.nextValue());
  assertEquals(32L, sortedNumericDocValues.nextValue());
  assertEquals(33L, sortedNumericDocValues.nextValue());
  BinaryDocValues binaryDocValues = leafReader.getBinaryDocValues("binary");
  assertEquals(0, binaryDocValues.nextDoc());
  assertEquals("a", binaryDocValues.binaryValue().utf8ToString());
  SortedDocValues sortedDocValues = leafReader.getSortedDocValues("sorted");
  assertEquals(0, sortedDocValues.nextDoc());
  assertEquals("b", sortedDocValues.binaryValue().utf8ToString());
  assertEquals(0, sortedDocValues.ordValue());
  assertEquals("b", sortedDocValues.lookupOrd(0).utf8ToString());
  SortedSetDocValues sortedSetDocValues = leafReader.getSortedSetDocValues("sorted_set");
  assertEquals(3, sortedSetDocValues.getValueCount());
  assertEquals(0, sortedSetDocValues.nextDoc());
  assertEquals(0L, sortedSetDocValues.nextOrd());
  assertEquals(1L, sortedSetDocValues.nextOrd());
  assertEquals(2L, sortedSetDocValues.nextOrd());
  assertEquals(SortedSetDocValues.NO_MORE_ORDS, sortedSetDocValues.nextOrd());
  assertEquals("c", sortedSetDocValues.lookupOrd(0L).utf8ToString());
  assertEquals("d", sortedSetDocValues.lookupOrd(1L).utf8ToString());
  assertEquals("f", sortedSetDocValues.lookupOrd(2L).utf8ToString());
}
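
The test above walks one document's ords by hand; the same nextDoc()/nextOrd() pattern generalizes to any reader. A minimal sketch, assuming this pre-9.x iterator API where NO_MORE_ORDS terminates a document's ords (the helper name and its use of System.out are illustrative):

// Print every deduplicated, sorted value of a sorted-set field, doc by doc.
static void dumpSortedSet(LeafReader reader, String field) throws IOException {
  SortedSetDocValues dv = reader.getSortedSetDocValues(field);
  if (dv == null) {
    return; // field has no sorted-set doc values
  }
  while (dv.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
    long ord;
    while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
      System.out.println(dv.lookupOrd(ord).utf8ToString());
    }
  }
}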
Use of org.apache.lucene.index.SortedSetDocValues in project lucene-solr by Apache.
The class DocValuesMultiTest, method testDocValues:
@Test
public void testDocValues() throws IOException {
  assertU(adoc("id", "1", "floatdv", "4.5", "intdv", "-1", "intdv", "3", "stringdv", "value1", "stringdv", "value2", "booldv", "false", "booldv", "true"));
  assertU(commit());
  try (SolrCore core = h.getCoreInc()) {
    final RefCounted<SolrIndexSearcher> searcherRef = core.openNewSearcher(true, true);
    final SolrIndexSearcher searcher = searcherRef.get();
    try {
      final LeafReader reader = searcher.getSlowAtomicReader();
      assertEquals(1, reader.numDocs());
      final FieldInfos infos = reader.getFieldInfos();
      assertEquals(DocValuesType.SORTED_SET, infos.fieldInfo("stringdv").getDocValuesType());
      assertEquals(DocValuesType.SORTED_SET, infos.fieldInfo("booldv").getDocValuesType());
      assertEquals(DocValuesType.SORTED_SET, infos.fieldInfo("floatdv").getDocValuesType());
      assertEquals(DocValuesType.SORTED_SET, infos.fieldInfo("intdv").getDocValuesType());
      SortedSetDocValues dv = reader.getSortedSetDocValues("stringdv");
      assertEquals(0, dv.nextDoc());
      assertEquals(0, dv.nextOrd());
      assertEquals(1, dv.nextOrd());
      assertEquals(SortedSetDocValues.NO_MORE_ORDS, dv.nextOrd());
      dv = reader.getSortedSetDocValues("booldv");
      assertEquals(0, dv.nextDoc());
      assertEquals(0, dv.nextOrd());
      assertEquals(1, dv.nextOrd());
      assertEquals(SortedSetDocValues.NO_MORE_ORDS, dv.nextOrd());
    } finally {
      searcherRef.decref();
    }
  }
}
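
The two ords asserted for booldv come from Solr's BoolField, which (an assumption here, not shown in this test) indexes booleans as the single-character terms "F" and "T", so "F" sorts at ord 0. A hedged extension of the assertions above would resolve the ords back to terms:

// Hypothetical follow-on: resolve booldv's ords to their indexed terms,
// assuming BoolField's "F"/"T" encoding for false/true.
dv = reader.getSortedSetDocValues("booldv");
assertEquals(0, dv.nextDoc());
assertEquals("F", dv.lookupOrd(0).utf8ToString());
assertEquals("T", dv.lookupOrd(1).utf8ToString());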
Use of org.apache.lucene.index.SortedSetDocValues in project lucene-solr by Apache.
The class TestFieldCacheVsDocValues, method doTestSortedSetVsUninvertedField:
private void doTestSortedSetVsUninvertedField(int minLength, int maxLength) throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig conf = new IndexWriterConfig(new MockAnalyzer(random()));
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
  // index some docs
  int numDocs = atLeast(300);
  for (int i = 0; i < numDocs; i++) {
    Document doc = new Document();
    Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
    doc.add(idField);
    final int length = TestUtil.nextInt(random(), minLength, maxLength);
    int numValues = random().nextInt(17);
    // create a random list of strings
    List<String> values = new ArrayList<>();
    for (int v = 0; v < numValues; v++) {
      values.add(TestUtil.randomSimpleString(random(), minLength, length));
    }
    // add in any order to the indexed field
    ArrayList<String> unordered = new ArrayList<>(values);
    Collections.shuffle(unordered, random());
    for (String v : unordered) {
      doc.add(newStringField("indexed", v, Field.Store.NO));
    }
    // add in any order to the dv field
    ArrayList<String> unordered2 = new ArrayList<>(values);
    Collections.shuffle(unordered2, random());
    for (String v : unordered2) {
      doc.add(new SortedSetDocValuesField("dv", new BytesRef(v)));
    }
    writer.addDocument(doc);
    if (random().nextInt(31) == 0) {
      writer.commit();
    }
  }
  // delete some docs
  int numDeletions = random().nextInt(numDocs / 10);
  for (int i = 0; i < numDeletions; i++) {
    int id = random().nextInt(numDocs);
    writer.deleteDocuments(new Term("id", Integer.toString(id)));
  }
  // compare per-segment
  DirectoryReader ir = writer.getReader();
  for (LeafReaderContext context : ir.leaves()) {
    LeafReader r = context.reader();
    SortedSetDocValues expected = FieldCache.DEFAULT.getDocTermOrds(r, "indexed", null);
    SortedSetDocValues actual = r.getSortedSetDocValues("dv");
    assertEquals(r.maxDoc(), expected, actual);
  }
  ir.close();
  writer.forceMerge(1);
  // now compare again after the merge
  ir = writer.getReader();
  LeafReader ar = getOnlyLeafReader(ir);
  SortedSetDocValues expected = FieldCache.DEFAULT.getDocTermOrds(ar, "indexed", null);
  SortedSetDocValues actual = ar.getSortedSetDocValues("dv");
  assertEquals(ir.maxDoc(), expected, actual);
  ir.close();
  writer.close();
  dir.close();
}
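
Note that assertEquals(r.maxDoc(), expected, actual) above is a comparison helper defined elsewhere in this test class, not a JUnit overload. A plausible sketch of such a doc-by-doc, ord-by-ord comparison (assertSameOrds and its internals are assumptions, not the actual helper):

// Compare two SortedSetDocValues by resolved terms, since the uninverted
// field and the dv field may assign different ord spaces.
static void assertSameOrds(int maxDoc, SortedSetDocValues expected, SortedSetDocValues actual) throws IOException {
  // maxDoc retained only to mirror the call shape used in the test above
  int docID;
  while ((docID = expected.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    assertEquals(docID, actual.nextDoc());
    long expectedOrd;
    while ((expectedOrd = expected.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
      long actualOrd = actual.nextOrd();
      assertTrue(actualOrd != SortedSetDocValues.NO_MORE_ORDS);
      assertEquals(expected.lookupOrd(expectedOrd), actual.lookupOrd(actualOrd));
    }
    assertEquals(SortedSetDocValues.NO_MORE_ORDS, actual.nextOrd());
  }
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, actual.nextDoc());
}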