Use of org.apache.lucene.index.TermContext in project elasticsearch by elastic.
From class DfsPhase, the execute method:
@Override
public void execute(SearchContext context) {
    final ObjectHashSet<Term> termsSet = new ObjectHashSet<>();
    try {
        context.searcher().createNormalizedWeight(context.query(), true).extractTerms(new DelegateSet(termsSet));
        for (RescoreSearchContext rescoreContext : context.rescore()) {
            rescoreContext.rescorer().extractTerms(context, rescoreContext, new DelegateSet(termsSet));
        }
        Term[] terms = termsSet.toArray(Term.class);
        TermStatistics[] termStatistics = new TermStatistics[terms.length];
        IndexReaderContext indexReaderContext = context.searcher().getTopReaderContext();
        for (int i = 0; i < terms.length; i++) {
            if (context.isCancelled()) {
                throw new TaskCancelledException("cancelled");
            }
            // LUCENE 4 UPGRADE: cache TermContext?
            TermContext termContext = TermContext.build(indexReaderContext, terms[i]);
            termStatistics[i] = context.searcher().termStatistics(terms[i], termContext);
        }
        ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics = HppcMaps.newNoNullKeysMap();
        for (Term term : terms) {
            assert term.field() != null : "field is null";
            if (!fieldStatistics.containsKey(term.field())) {
                final CollectionStatistics collectionStatistics = context.searcher().collectionStatistics(term.field());
                fieldStatistics.put(term.field(), collectionStatistics);
                if (context.isCancelled()) {
                    throw new TaskCancelledException("cancelled");
                }
            }
        }
        context.dfsResult().termsStatistics(terms, termStatistics)
            .fieldStatistics(fieldStatistics)
            .maxDoc(context.searcher().getIndexReader().maxDoc());
    } catch (Exception e) {
        throw new DfsPhaseExecutionException(context, "Exception during dfs phase", e);
    } finally {
        // don't hold on to terms
        termsSet.clear();
    }
}
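The TermContext pattern in this phase is small: build the per-segment term states once against the top-level reader context, then hand them to the searcher when asking for statistics. A minimal standalone sketch of that pattern (the DfsStatsSketch class and printStats method are hypothetical, not Elasticsearch code; Lucene 6 API assumed):

import java.io.IOException;

import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermStatistics;

class DfsStatsSketch {

    // Build the term states once, then derive the term-level and field-level
    // statistics that the dfs phase above collects for each extracted term.
    static void printStats(IndexSearcher searcher, Term term) throws IOException {
        IndexReaderContext topContext = searcher.getTopReaderContext();
        TermContext termContext = TermContext.build(topContext, term);
        TermStatistics termStats = searcher.termStatistics(term, termContext);
        CollectionStatistics fieldStats = searcher.collectionStatistics(term.field());
        System.out.println("docFreq=" + termStats.docFreq()
            + " totalTermFreq=" + termStats.totalTermFreq()
            + " fieldMaxDoc=" + fieldStats.maxDoc());
    }
}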
Use of org.apache.lucene.index.TermContext in project elasticsearch by elastic.
From class BlendedTermQuery, the adjustTTF method:
private TermContext adjustTTF(IndexReaderContext readerContext, TermContext termContext, long sumTTF) {
    assert termContext.wasBuiltFor(readerContext);
    if (sumTTF == -1 && termContext.totalTermFreq() == -1) {
        return termContext;
    }
    // rebuild the context with the same per-segment states but the adjusted totalTermFreq
    TermContext newTermContext = new TermContext(readerContext);
    List<LeafReaderContext> leaves = readerContext.leaves();
    final int len;
    if (leaves == null) {
        len = 1;
    } else {
        len = leaves.size();
    }
    int df = termContext.docFreq();
    long ttf = sumTTF;
    for (int i = 0; i < len; i++) {
        TermState termState = termContext.get(i);
        if (termState == null) {
            continue;
        }
        // attach the aggregate statistics to the first registered state only
        newTermContext.register(termState, i, df, ttf);
        df = 0;
        ttf = 0;
    }
    return newTermContext;
}
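Rewriting the aggregates this way works because TermContext simply accumulates the docFreq and totalTermFreq values passed to register(), and IndexSearcher.termStatistics() reads those accumulated totals back off the context rather than the index. A short hypothetical check of that relationship (AdjustedStatsCheck is not part of BlendedTermQuery; Lucene 6 API assumed):

import java.io.IOException;

import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermStatistics;

class AdjustedStatsCheck {

    // The statistics a Similarity ends up seeing come straight from the
    // (possibly rewritten) TermContext, not from the term dictionary.
    static TermStatistics statsFor(IndexSearcher searcher, Term term, TermContext adjusted) throws IOException {
        TermStatistics stats = searcher.termStatistics(term, adjusted);
        assert stats.docFreq() == adjusted.docFreq();
        assert stats.totalTermFreq() == adjusted.totalTermFreq();
        return stats;
    }
}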
Use of org.apache.lucene.index.TermContext in project elasticsearch by elastic.
From class BlendedTermQuery, the blend method:
protected void blend(final TermContext[] contexts, int maxDoc, IndexReader reader) throws IOException {
    if (contexts.length <= 1) {
        return;
    }
    int max = 0;
    long minSumTTF = Long.MAX_VALUE;
    for (int i = 0; i < contexts.length; i++) {
        TermContext ctx = contexts[i];
        int df = ctx.docFreq();
        // we use the max here since it's the only "true" estimation we can make here
        // at least max(df) documents have that term. Sums or averages don't seem
        // to have a significant meaning here.
        // TODO: Maybe it could also make sense to assume independent distributions of documents and eg. have:
        // df = df1 + df2 - (df1 * df2 / maxDoc)?
        max = Math.max(df, max);
        if (minSumTTF != -1 && ctx.totalTermFreq() != -1) {
            // we need to find out the minimum sumTTF to adjust the statistics
            // otherwise the statistics don't match
            minSumTTF = Math.min(minSumTTF, reader.getSumTotalTermFreq(terms[i].field()));
        } else {
            minSumTTF = -1;
        }
    }
    if (minSumTTF != -1 && maxDoc > minSumTTF) {
        maxDoc = (int) minSumTTF;
    }
    if (max == 0) {
        // we are done, that term doesn't exist at all
        return;
    }
    long sumTTF = minSumTTF == -1 ? -1 : 0;
    final int[] tieBreak = new int[contexts.length];
    for (int i = 0; i < tieBreak.length; ++i) {
        tieBreak[i] = i;
    }
    new InPlaceMergeSorter() {
        @Override
        protected void swap(int i, int j) {
            final int tmp = tieBreak[i];
            tieBreak[i] = tieBreak[j];
            tieBreak[j] = tmp;
        }

        @Override
        protected int compare(int i, int j) {
            return Integer.compare(contexts[tieBreak[j]].docFreq(), contexts[tieBreak[i]].docFreq());
        }
    }.sort(0, tieBreak.length);
    int prev = contexts[tieBreak[0]].docFreq();
    int actualDf = Math.min(maxDoc, max);
    assert actualDf >= 0 : "DF must be >= 0";
    // add a small bias towards the more popular (more frequent) fields,
    // which acts as a tie breaker
    for (int i : tieBreak) {
        TermContext ctx = contexts[i];
        if (ctx.docFreq() == 0) {
            break;
        }
        final int current = ctx.docFreq();
        if (prev > current) {
            actualDf++;
        }
        contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(maxDoc, actualDf));
        prev = current;
        if (sumTTF >= 0 && ctx.totalTermFreq() >= 0) {
            sumTTF += ctx.totalTermFreq();
        } else {
            // omit once TF is omitted anywhere!
            sumTTF = -1;
        }
    }
    sumTTF = Math.min(sumTTF, minSumTTF);
    for (int i = 0; i < contexts.length; i++) {
        int df = contexts[i].docFreq();
        if (df == 0) {
            continue;
        }
        // the blended sumTTF can't be greater than the sumTTF on the field
        final long fixedTTF = sumTTF == -1 ? -1 : sumTTF;
        contexts[i] = adjustTTF(reader.getContext(), contexts[i], fixedTTF);
    }
}
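The effect of the tie-break loop is easiest to see with concrete numbers. The following hypothetical sketch (BlendedDfSketch is not Elasticsearch code) applies the same arithmetic to a plain array of document frequencies: every term ends up with roughly max(df), bumped one higher each time the original df strictly drops, so the most frequent field keeps the lowest blended df and therefore the slightly better idf.

import java.util.Arrays;

class BlendedDfSketch {

    // Mirror of the df arithmetic in blend(): sort descending, start from max(df),
    // and add +1 whenever the df strictly drops, so ties share the same blended df.
    static int[] blendDocFreqs(int[] dfs, int maxDoc) {
        Integer[] order = new Integer[dfs.length];
        for (int i = 0; i < order.length; i++) {
            order[i] = i;
        }
        Arrays.sort(order, (a, b) -> Integer.compare(dfs[b], dfs[a]));
        int[] blended = new int[dfs.length];
        int max = Arrays.stream(dfs).max().orElse(0);
        int actualDf = Math.min(maxDoc, max);
        int prev = dfs[order[0]];
        for (int i : order) {
            if (dfs[i] == 0) {
                break; // remaining entries all have df == 0 and stay untouched
            }
            if (prev > dfs[i]) {
                actualDf++;
            }
            blended[i] = Math.min(maxDoc, actualDf);
            prev = dfs[i];
        }
        return blended;
    }

    public static void main(String[] args) {
        // dfs {12, 7, 7, 3} blend to {12, 13, 13, 14} when maxDoc is large enough
        System.out.println(Arrays.toString(blendDocFreqs(new int[] {12, 7, 7, 3}, 1000)));
    }
}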
Use of org.apache.lucene.index.TermContext in project elasticsearch by elastic.
From class AllTermQuery, the createWeight method:
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
    if (needsScores == false) {
        return new TermQuery(term).createWeight(searcher, needsScores);
    }
    final TermContext termStates = TermContext.build(searcher.getTopReaderContext(), term);
    final CollectionStatistics collectionStats = searcher.collectionStatistics(term.field());
    final TermStatistics termStats = searcher.termStatistics(term, termStates);
    final Similarity similarity = searcher.getSimilarity(needsScores);
    final SimWeight stats = similarity.computeWeight(collectionStats, termStats);
    return new Weight(this) {

        @Override
        public float getValueForNormalization() throws IOException {
            return stats.getValueForNormalization();
        }

        @Override
        public void normalize(float norm, float topLevelBoost) {
            stats.normalize(norm, topLevelBoost);
        }

        @Override
        public void extractTerms(Set<Term> terms) {
            terms.add(term);
        }

        @Override
        public Explanation explain(LeafReaderContext context, int doc) throws IOException {
            AllTermScorer scorer = scorer(context);
            if (scorer != null) {
                int newDoc = scorer.iterator().advance(doc);
                if (newDoc == doc) {
                    float score = scorer.score();
                    float freq = scorer.freq();
                    SimScorer docScorer = similarity.simScorer(stats, context);
                    Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq);
                    Explanation termScoreExplanation = docScorer.explain(doc, freqExplanation);
                    Explanation payloadBoostExplanation =
                        Explanation.match(scorer.payloadBoost(), "payloadBoost=" + scorer.payloadBoost());
                    return Explanation.match(
                        score,
                        "weight(" + getQuery() + " in " + doc + ") ["
                            + similarity.getClass().getSimpleName() + "], product of:",
                        termScoreExplanation, payloadBoostExplanation);
                }
            }
            return Explanation.noMatch("no matching term");
        }

        @Override
        public AllTermScorer scorer(LeafReaderContext context) throws IOException {
            final Terms terms = context.reader().terms(term.field());
            if (terms == null) {
                return null;
            }
            final TermsEnum termsEnum = terms.iterator();
            if (termsEnum == null) {
                return null;
            }
            final TermState state = termStates.get(context.ord);
            if (state == null) {
                // Term does not exist in this segment
                return null;
            }
            termsEnum.seekExact(term.bytes(), state);
            PostingsEnum docs = termsEnum.postings(null, PostingsEnum.PAYLOADS);
            assert docs != null;
            return new AllTermScorer(this, docs, similarity.simScorer(stats, context));
        }
    };
}
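The scorer() method above shows the standard per-segment pattern for a cached TermContext: look up the leaf's TermState by context.ord, and if it is non-null, seek the TermsEnum directly to that state instead of resolving the term through the dictionary again. A hypothetical standalone sketch of the same pattern (CachedTermStateSketch is not Elasticsearch code; Lucene 6 API assumed):

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;

class CachedTermStateSketch {

    // Count the documents containing a term by reusing per-segment term states
    // built once at the top level, mirroring AllTermQuery's scorer() lookup.
    static long countDocs(IndexReader reader, Term term) throws IOException {
        TermContext termStates = TermContext.build(reader.getContext(), term);
        long count = 0;
        for (LeafReaderContext leaf : reader.leaves()) {
            TermState state = termStates.get(leaf.ord);
            if (state == null) {
                continue; // term absent from this segment
            }
            Terms terms = leaf.reader().terms(term.field());
            TermsEnum termsEnum = terms.iterator();
            termsEnum.seekExact(term.bytes(), state); // no dictionary lookup needed
            PostingsEnum postings = termsEnum.postings(null, PostingsEnum.NONE);
            while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                count++;
            }
        }
        return count;
    }
}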
Use of org.apache.lucene.index.TermContext in project lucene-solr by apache.
From class TermAutomatonQuery, the createWeight method:
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    IndexReaderContext context = searcher.getTopReaderContext();
    Map<Integer, TermContext> termStates = new HashMap<>();
    for (Map.Entry<BytesRef, Integer> ent : termToID.entrySet()) {
        if (ent.getKey() != null) {
            termStates.put(ent.getValue(), TermContext.build(context, new Term(field, ent.getKey())));
        }
    }
    return new TermAutomatonWeight(det, searcher, termStates, boost);
}
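The same pre-building idea applies to any query that matches several terms: resolve each term to a TermContext once against the top-level reader context, then hand the map to the per-segment weight so no leaf has to re-resolve terms. A hypothetical sketch of that step in isolation (MultiTermStatesSketch is not lucene-solr code; Lucene 6 API assumed):

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.BytesRef;

class MultiTermStatesSketch {

    // Resolve every term once at the top level; a per-segment weight can then
    // fetch TermState objects by leaf ordinal without touching the term dictionary again.
    static Map<BytesRef, TermContext> buildStates(IndexSearcher searcher, String field,
                                                  Iterable<BytesRef> termBytes) throws IOException {
        IndexReaderContext topContext = searcher.getTopReaderContext();
        Map<BytesRef, TermContext> states = new HashMap<>();
        for (BytesRef bytes : termBytes) {
            states.put(bytes, TermContext.build(topContext, new Term(field, bytes)));
        }
        return states;
    }
}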