Use of org.apache.lucene.search.DocIdSetIterator in project lucene-solr by apache.
From the class PointInSetIncludingScoreQuery, method createWeight.
@Override
public final Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
  return new Weight(this) {

    @Override
    public void extractTerms(Set<Term> terms) {
    }

    @Override
    public Explanation explain(LeafReaderContext context, int doc) throws IOException {
      Scorer scorer = scorer(context);
      if (scorer != null) {
        int target = scorer.iterator().advance(doc);
        if (doc == target) {
          return Explanation.match(scorer.score(), "A match");
        }
      }
      return Explanation.noMatch("Not a match");
    }

    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {
      LeafReader reader = context.reader();
      FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
      if (fieldInfo == null) {
        return null;
      }
      if (fieldInfo.getPointDimensionCount() != 1) {
        throw new IllegalArgumentException("field=\"" + field + "\" was indexed with numDims=" + fieldInfo.getPointDimensionCount() + " but this query has numDims=1");
      }
      if (fieldInfo.getPointNumBytes() != bytesPerDim) {
        throw new IllegalArgumentException("field=\"" + field + "\" was indexed with bytesPerDim=" + fieldInfo.getPointNumBytes() + " but this query has bytesPerDim=" + bytesPerDim);
      }
      PointValues values = reader.getPointValues(field);
      if (values == null) {
        return null;
      }
      FixedBitSet result = new FixedBitSet(reader.maxDoc());
      float[] scores = new float[reader.maxDoc()];
      values.intersect(new MergePointVisitor(sortedPackedPoints, result, scores));
      return new Scorer(this) {

        DocIdSetIterator disi = new BitSetIterator(result, 10L);

        @Override
        public float score() throws IOException {
          return scores[docID()];
        }

        @Override
        public int freq() throws IOException {
          return 1;
        }

        @Override
        public int docID() {
          return disi.docID();
        }

        @Override
        public DocIdSetIterator iterator() {
          return disi;
        }
      };
    }
  };
}
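For context, the BitSetIterator returned by Scorer.iterator() above is consumed like any other DocIdSetIterator. A minimal sketch (hypothetical helper, not part of PointInSetIncludingScoreQuery) of draining such a scorer:

// Visits matching doc IDs in increasing order; nextDoc() returns NO_MORE_DOCS when exhausted.
// Scorer.score() is only valid while the iterator is positioned on a matching document.
static void drainScorer(Scorer scorer) throws IOException {
  DocIdSetIterator disi = scorer.iterator();
  for (int doc = disi.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.nextDoc()) {
    float score = scorer.score();  // for the Scorer above this reads scores[doc]
    System.out.println("doc=" + doc + " score=" + score);
  }
}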
Use of org.apache.lucene.search.DocIdSetIterator in project lucene-solr by apache.
From the class AbstractPrefixTreeQuery, method createWeight.
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
  return new ConstantScoreWeight(this, boost) {

    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {
      DocIdSet docSet = getDocIdSet(context);
      if (docSet == null) {
        return null;
      }
      DocIdSetIterator disi = docSet.iterator();
      if (disi == null) {
        return null;
      }
      return new ConstantScoreScorer(this, score(), disi);
    }
  };
}
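Both null checks above are part of the DocIdSet contract: getDocIdSet(context) may return null for a segment with no matches, and DocIdSet.iterator() itself may return null. Returning a null Scorer lets the search skip the segment; a hedged alternative sketch (hypothetical helper, not a Lucene API) that normalizes both cases to an empty iterator instead:

// Never returns null, so callers can iterate unconditionally.
static DocIdSetIterator iteratorOrEmpty(DocIdSet docSet) throws IOException {
  if (docSet == null) {
    return DocIdSetIterator.empty();   // no documents in this segment
  }
  DocIdSetIterator disi = docSet.iterator();
  return disi == null ? DocIdSetIterator.empty() : disi;
}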
Use of org.apache.lucene.search.DocIdSetIterator in project lucene-solr by apache.
From the class TestRandomSamplingFacetsCollector, method testRandomSampling.
public void testRandomSampling() throws Exception {
  Directory dir = newDirectory();
  Directory taxoDir = newDirectory();
  Random random = random();
  DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
  RandomIndexWriter writer = new RandomIndexWriter(random, dir);
  FacetsConfig config = new FacetsConfig();
  final int numCategories = 10;
  int numDocs = atLeast(10000);
  for (int i = 0; i < numDocs; i++) {
    Document doc = new Document();
    doc.add(new StringField("EvenOdd", (i % 2 == 0) ? "even" : "odd", Store.NO));
    doc.add(new FacetField("iMod10", Integer.toString(i % numCategories)));
    writer.addDocument(config.build(taxoWriter, doc));
  }
  writer.forceMerge(CHI_SQUARE_VALUES.length - 1);
  // NRT open
  IndexSearcher searcher = newSearcher(writer.getReader());
  TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
  IOUtils.close(writer, taxoWriter);
  // Test empty results
  RandomSamplingFacetsCollector collectRandomZeroResults = new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());
  // There should be no divisions by zero
  searcher.search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);
  // There should be no divisions by zero and no null result
  assertNotNull(collectRandomZeroResults.getMatchingDocs());
  // There should be no results at all
  for (MatchingDocs doc : collectRandomZeroResults.getMatchingDocs()) {
    assertEquals(0, doc.totalHits);
  }
  // Now start searching and retrieve results.
  // Use a query to select half of the documents.
  TermQuery query = new TermQuery(new Term("EvenOdd", "even"));
  // 10% of total docs, 20% of the hits
  RandomSamplingFacetsCollector random10Percent = new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());
  FacetsCollector fc = new FacetsCollector();
  searcher.search(query, MultiCollector.wrap(fc, random10Percent));
  final List<MatchingDocs> matchingDocs = random10Percent.getMatchingDocs();
  // count the total hits and sampled docs, also store the number of sampled
  // docs per segment
  int totalSampledDocs = 0, totalHits = 0;
  int[] numSampledDocs = new int[matchingDocs.size()];
  // System.out.println("numSegments=" + numSampledDocs.length);
  for (int i = 0; i < numSampledDocs.length; i++) {
    MatchingDocs md = matchingDocs.get(i);
    final DocIdSetIterator iter = md.bits.iterator();
    while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) ++numSampledDocs[i];
    totalSampledDocs += numSampledDocs[i];
    totalHits += md.totalHits;
  }
  // compute the chi-square value for the sampled documents' distribution
  float chi_square = 0;
  for (int i = 0; i < numSampledDocs.length; i++) {
    MatchingDocs md = matchingDocs.get(i);
    float ei = (float) md.totalHits / totalHits;
    if (ei > 0.0f) {
      float oi = (float) numSampledDocs[i] / totalSampledDocs;
      chi_square += (Math.pow(ei - oi, 2) / ei);
    }
  }
  // Verify that the chi-square value isn't too big. According to
  // http://en.wikipedia.org/wiki/Chi-squared_distribution#Table_of_.CF.872_value_vs_p-value,
  // we basically verify that there is a really small chance of hitting a very
  // bad sample (p-value < 0.05), for n degrees of freedom. The number 'n' depends
  // on the number of segments.
  assertTrue("chisquare not statistically significant enough: " + chi_square, chi_square < CHI_SQUARE_VALUES[numSampledDocs.length]);
  // Test amortized counts - should be 5X the sampled count, but maximum numDocs/10
  final FastTaxonomyFacetCounts random10FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
  final FacetResult random10Result = random10FacetCounts.getTopChildren(10, "iMod10");
  final FacetResult amortized10Result = random10Percent.amortizeFacetCounts(random10Result, config, searcher);
  for (int i = 0; i < amortized10Result.labelValues.length; i++) {
    LabelAndValue amortized = amortized10Result.labelValues[i];
    LabelAndValue sampled = random10Result.labelValues[i];
    // since numDocs may not divide by 10 exactly, allow for some slack in the amortized count
    assertEquals(amortized.value.floatValue(), Math.min(5 * sampled.value.floatValue(), numDocs / 10.f), 1.0);
  }
  IOUtils.close(searcher.getIndexReader(), taxoReader, dir, taxoDir);
}
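The while loop over md.bits.iterator() above is the standard way to count the documents recorded in a DocIdSet. A small hedged helper (hypothetical, extracted purely for illustration) capturing just that counting step:

// Counts the documents produced by an iterator by walking it to exhaustion.
static int countDocs(DocIdSetIterator iter) throws IOException {
  int count = 0;
  while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
    count++;
  }
  return count;
}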
Use of org.apache.lucene.search.DocIdSetIterator in project lucene-solr by apache.
From the class UnifiedHighlighter, method highlightFieldsAsObjects.
/**
 * Expert: highlights the top-N passages from multiple fields,
 * for the provided int[] docids, to custom Object as
 * returned by the {@link PassageFormatter}. Use
 * this API to render to something other than String.
 *
 * @param fieldsIn      field names to highlight. Must have a stored string value.
 * @param query         query to highlight.
 * @param docIdsIn      containing the document IDs to highlight.
 * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to
 *                      form the highlighted snippets.
 * @return Map keyed on field name, containing the array of formatted snippets
 *         corresponding to the documents in <code>docIdsIn</code>.
 *         If no highlights were found for a document, the
 *         first {@code maxPassages} from the field will
 *         be returned.
 * @throws IOException              if an I/O error occurred during processing
 * @throws IllegalArgumentException if <code>field</code> was indexed without
 *                                  {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
 */
protected Map<String, Object[]> highlightFieldsAsObjects(String[] fieldsIn, Query query, int[] docIdsIn, int[] maxPassagesIn) throws IOException {
  if (fieldsIn.length < 1) {
    throw new IllegalArgumentException("fieldsIn must not be empty");
  }
  if (fieldsIn.length != maxPassagesIn.length) {
    throw new IllegalArgumentException("invalid number of maxPassagesIn");
  }
  if (searcher == null) {
    throw new IllegalStateException("This method requires that an indexSearcher was passed in the " + "constructor. Perhaps you mean to call highlightWithoutSearcher?");
  }
  // Sort docs & fields for sequential i/o
  // Sort doc IDs w/ index to original order: (copy input arrays since we sort in-place)
  int[] docIds = new int[docIdsIn.length];
  // fill in ascending order; points into docIdsIn[]
  int[] docInIndexes = new int[docIds.length];
  // latter 2 are "out" params
  copyAndSortDocIdsWithIndex(docIdsIn, docIds, docInIndexes);
  // Sort fields w/ maxPassages pair: (copy input arrays since we sort in-place)
  final String[] fields = new String[fieldsIn.length];
  final int[] maxPassages = new int[maxPassagesIn.length];
  // latter 2 are "out" params
  copyAndSortFieldsWithMaxPassages(fieldsIn, maxPassagesIn, fields, maxPassages);
  // Init field highlighters (where most of the highlight logic lives, and on a per field basis)
  Set<Term> queryTerms = extractTerms(query);
  FieldHighlighter[] fieldHighlighters = new FieldHighlighter[fields.length];
  int numTermVectors = 0;
  int numPostings = 0;
  for (int f = 0; f < fields.length; f++) {
    FieldHighlighter fieldHighlighter = getFieldHighlighter(fields[f], query, queryTerms, maxPassages[f]);
    fieldHighlighters[f] = fieldHighlighter;
    switch (fieldHighlighter.getOffsetSource()) {
      case TERM_VECTORS:
        numTermVectors++;
        break;
      case POSTINGS:
        numPostings++;
        break;
      case POSTINGS_WITH_TERM_VECTORS:
        numTermVectors++;
        numPostings++;
        break;
      case ANALYSIS:
      case NONE_NEEDED:
      default:
        // do nothing
        break;
    }
  }
  int cacheCharsThreshold = calculateOptimalCacheCharsThreshold(numTermVectors, numPostings);
  IndexReader indexReaderWithTermVecCache = (numTermVectors >= 2) ? TermVectorReusingLeafReader.wrap(searcher.getIndexReader()) : null;
  // [fieldIdx][docIdInIndex] of highlightDoc result
  Object[][] highlightDocsInByField = new Object[fields.length][docIds.length];
  // Highlight in doc batches determined by loadFieldValues (consumes from docIdIter)
  DocIdSetIterator docIdIter = asDocIdSetIterator(docIds);
  for (int batchDocIdx = 0; batchDocIdx < docIds.length; ) {
    // Load the field values of the first batch of document(s) (note: commonly all docs are in this batch)
    List<CharSequence[]> fieldValsByDoc = loadFieldValues(fields, docIdIter, cacheCharsThreshold);
    // Highlight in per-field order first, then by doc (better I/O pattern)
    for (int fieldIdx = 0; fieldIdx < fields.length; fieldIdx++) {
      // parallel to docIdsIn
      Object[] resultByDocIn = highlightDocsInByField[fieldIdx];
      FieldHighlighter fieldHighlighter = fieldHighlighters[fieldIdx];
      for (int docIdx = batchDocIdx; docIdx - batchDocIdx < fieldValsByDoc.size(); docIdx++) {
        // sorted order
        int docId = docIds[docIdx];
        CharSequence content = fieldValsByDoc.get(docIdx - batchDocIdx)[fieldIdx];
        if (content == null) {
          continue;
        }
        IndexReader indexReader = (fieldHighlighter.getOffsetSource() == OffsetSource.TERM_VECTORS && indexReaderWithTermVecCache != null) ? indexReaderWithTermVecCache : searcher.getIndexReader();
        // original input order
        int docInIndex = docInIndexes[docIdx];
        assert resultByDocIn[docInIndex] == null;
        resultByDocIn[docInIndex] = fieldHighlighter.highlightFieldForDoc(indexReader, docId, content.toString());
      }
    }
    batchDocIdx += fieldValsByDoc.size();
  }
  assert docIdIter.docID() == DocIdSetIterator.NO_MORE_DOCS || docIdIter.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;
  // TODO reconsider the return type; since this is an "advanced" method, lets not return a Map? Notice the only
  // caller simply iterates it to build another structure.
  // field -> object highlights parallel to docIdsIn
  Map<String, Object[]> resultMap = new HashMap<>(fields.length);
  for (int f = 0; f < fields.length; f++) {
    resultMap.put(fields[f], highlightDocsInByField[f]);
  }
  return resultMap;
}
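Since highlightFieldsAsObjects is a protected expert hook, typical callers go through the public highlightFields overloads, which build on this method and return formatted Strings. A minimal usage sketch (the "body" field, analyzer, and query are assumptions for illustration):

UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
Query query = new TermQuery(new Term("body", "lucene"));
TopDocs topDocs = searcher.search(query, 10);
// one snippet array per requested field, parallel to topDocs.scoreDocs
Map<String, String[]> snippets = highlighter.highlightFields(new String[] {"body"}, query, topDocs);
String firstHit = snippets.get("body")[0];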
Use of org.apache.lucene.search.DocIdSetIterator in project lucene-solr by apache.
From the class FacetFieldProcessorByArrayDV, method collectDocs.
@Override
protected void collectDocs() throws IOException {
  int domainSize = fcontext.base.size();
  if (nTerms <= 0 || domainSize < effectiveMincount) {
    // TODO: what about allBuckets? missing bucket?
    return;
  }
  // TODO: refactor some of this logic into a base class
  boolean countOnly = collectAcc == null && allBucketsAcc == null;
  boolean fullRange = startTermIndex == 0 && endTermIndex == si.getValueCount();
  // Are we expecting many hits per bucket?
  // FUTURE: pro-rate for nTerms?
  // FUTURE: better take into account number of values in multi-valued fields. This info is available for indexed fields.
  // FUTURE: take into account that bigger ord maps are more expensive than smaller ones
  // One test: 5M doc index, faceting on a single-valued field with almost 1M unique values, crossover point where global counting was slower
  // than per-segment counting was a domain of 658k docs. At that point, top 10 buckets had 6-7 matches each.
  // this was for heap docvalues produced by UninvertingReader
  // Since these values were randomly distributed, lets round our domain multiplier up to account for less random real world data.
  long domainMultiplier = multiValuedField ? 4L : 2L;
  // +3 to increase test coverage with small tests
  boolean manyHitsPerBucket = domainSize * domainMultiplier > (si.getValueCount() + 3);
  // If we're only calculating counts, we're not prefixing, and we expect to collect many documents per unique value,
  // then collect per-segment before mapping to global ords at the end. This will save redundant seg->global ord mappings.
  // FUTURE: there are probably some other non "countOnly" cases where we can use this as well (i.e. those where
  // the docid is not used)
  boolean canDoPerSeg = countOnly && fullRange;
  boolean accumSeg = manyHitsPerBucket && canDoPerSeg;
  // internal - override perSeg heuristic
  if (freq.perSeg != null)
    accumSeg = canDoPerSeg && freq.perSeg;
  final List<LeafReaderContext> leaves = fcontext.searcher.getIndexReader().leaves();
  Filter filter = fcontext.base.getTopFilter();
  for (int subIdx = 0; subIdx < leaves.size(); subIdx++) {
    LeafReaderContext subCtx = leaves.get(subIdx);
    setNextReaderFirstPhase(subCtx);
    // solr docsets already exclude any deleted docs
    DocIdSet dis = filter.getDocIdSet(subCtx, null);
    DocIdSetIterator disi = dis.iterator();
    SortedDocValues singleDv = null;
    SortedSetDocValues multiDv = null;
    if (multiValuedField) {
      // TODO: get sub from multi?
      multiDv = subCtx.reader().getSortedSetDocValues(sf.getName());
      if (multiDv == null) {
        multiDv = DocValues.emptySortedSet();
      }
      // this will be null if this is not a wrapped single valued docvalues.
      if (unwrap_singleValued_multiDv) {
        singleDv = DocValues.unwrapSingleton(multiDv);
      }
    } else {
      singleDv = subCtx.reader().getSortedDocValues(sf.getName());
      if (singleDv == null) {
        singleDv = DocValues.emptySorted();
      }
    }
    LongValues toGlobal = ordinalMap == null ? null : ordinalMap.getGlobalOrds(subIdx);
    if (singleDv != null) {
      if (accumSeg) {
        collectPerSeg(singleDv, disi, toGlobal);
      } else {
        if (canDoPerSeg && toGlobal != null) {
          collectCounts(singleDv, disi, toGlobal);
        } else {
          collectDocs(singleDv, disi, toGlobal);
        }
      }
    } else {
      if (accumSeg) {
        collectPerSeg(multiDv, disi, toGlobal);
      } else {
        if (canDoPerSeg && toGlobal != null) {
          collectCounts(multiDv, disi, toGlobal);
        } else {
          collectDocs(multiDv, disi, toGlobal);
        }
      }
    }
  }
  // better GC
  reuse = null;
}
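For reference, the collectDocs(singleDv, disi, toGlobal) call above boils down to walking the segment's domain iterator, positioning the doc values on each document, and translating segment ordinals through the OrdinalMap's LongValues when faceting spans multiple segments. A hedged sketch of that inner loop (hypothetical helper and counts array, not the Solr implementation):

// Counts one bucket per global ordinal; toGlobal == null means segment ords are already global
// (single segment or no OrdinalMap).
static void countSingleValued(SortedDocValues dv, DocIdSetIterator disi, LongValues toGlobal, int[] counts) throws IOException {
  for (int doc = disi.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.nextDoc()) {
    if (dv.advanceExact(doc)) {                 // document has a value for this field
      int segOrd = dv.ordValue();
      int ord = toGlobal == null ? segOrd : (int) toGlobal.get(segOrd);
      counts[ord]++;
    }
  }
}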