Use of org.apache.lucene.index.TermContext in the lucene-solr project (Apache).
Class TopTermsRewrite, method rewrite.
// Rewrites a MultiTermQuery by enumerating its matching terms and keeping only the
// top `size` highest-boosted ones in a bounded priority queue, then delegating to
// the subclass builder (addClause/build) to assemble the final query.
@Override
public final Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
// Cap the queue at the smaller of the requested size and the rewrite's hard limit.
final int maxSize = Math.min(size, getMaxSize());
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>();
collectTerms(reader, query, new TermCollector() {
private final MaxNonCompetitiveBoostAttribute maxBoostAtt = attributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);
// Maps a term's bytes to its ScoreTerm so stats from multiple segments merge into one entry.
private final Map<BytesRef, ScoreTerm> visitedTerms = new HashMap<>();
private TermsEnum termsEnum;
private BoostAttribute boostAtt;
// Spare ScoreTerm, reused across collect() calls until it is actually inserted into the queue.
private ScoreTerm st;
@Override
public void setNextEnum(TermsEnum termsEnum) {
this.termsEnum = termsEnum;
// Passing null resets the assert-only term-order tracker for the new segment enum.
assert compareToLastTerm(null);
// lazy init the initial ScoreTerm because comparator is not known on ctor:
if (st == null)
st = new ScoreTerm(new TermContext(topReaderContext));
boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
}
// for assert:
private BytesRefBuilder lastTerm;
// Assert helper: verifies each enum delivers terms in strictly increasing order.
// Always returns true so it can be used inside an `assert` statement.
private boolean compareToLastTerm(BytesRef t) {
if (lastTerm == null && t != null) {
lastTerm = new BytesRefBuilder();
lastTerm.append(t);
} else if (t == null) {
lastTerm = null;
} else {
assert lastTerm.get().compareTo(t) < 0 : "lastTerm=" + lastTerm + " t=" + t;
lastTerm.copyBytes(t);
}
return true;
}
@Override
public boolean collect(BytesRef bytes) throws IOException {
final float boost = boostAtt.getBoost();
// terms in order
assert compareToLastTerm(bytes);
// ignore uncompetitive hits
if (stQueue.size() == maxSize) {
// Queue is full: skip terms that cannot beat the current weakest entry.
final ScoreTerm t = stQueue.peek();
if (boost < t.boost)
return true;
// Equal boost: ties are broken by term order, so a larger term loses.
if (boost == t.boost && bytes.compareTo(t.bytes.get()) > 0)
return true;
}
ScoreTerm t = visitedTerms.get(bytes);
final TermState state = termsEnum.termState();
assert state != null;
if (t != null) {
// if the term is already in the PQ, only update docFreq of term in PQ
assert t.boost == boost : "boost should be equal in all segment TermsEnums";
t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
} else {
// add new entry in PQ, we must clone the term, else it may get overwritten!
st.bytes.copyBytes(bytes);
st.boost = boost;
visitedTerms.put(st.bytes.get(), st);
assert st.termState.docFreq() == 0;
st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
stQueue.offer(st);
// possibly drop entries from queue
if (stQueue.size() > maxSize) {
// Evict the weakest entry and recycle it as the next spare ScoreTerm.
st = stQueue.poll();
visitedTerms.remove(st.bytes.get());
// reset the termstate!
st.termState.clear();
} else {
// Spare was consumed by the queue; allocate a fresh one.
st = new ScoreTerm(new TermContext(topReaderContext));
}
assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
// set maxBoostAtt with values to help FuzzyTermsEnum to optimize
if (stQueue.size() == maxSize) {
t = stQueue.peek();
maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
maxBoostAtt.setCompetitiveTerm(t.bytes.get());
}
}
return true;
}
});
final B b = getTopLevelBuilder();
// Drain the queue into an array and sort by term so clause order is deterministic.
final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
ArrayUtil.timSort(scoreTerms, scoreTermSortByTermComp);
for (final ScoreTerm st : scoreTerms) {
final Term term = new Term(query.field, st.bytes.toBytesRef());
// We allow negative term scores (fuzzy query does this, for example) while collecting the terms,
// but truncate such boosts to 0.0f when building the query:
// add to query
addClause(b, term, st.termState.docFreq(), Math.max(0.0f, st.boost), st.termState);
}
return build(b);
}
Use of org.apache.lucene.index.TermContext in the lucene-solr project (Apache).
Class TestSimilarityBase, method testLengthEncodingBackwardCompatibility.
// Verifies that a SimilarityBase implementation decodes the index-time length
// normalization to the same value for an index stamped with an older major
// version (LUCENE_6_0_0) and for the current one. The tested lengths (1, 2, 4)
// are encoded losslessly in both encodings.
//
// Fix: the original closed the Directory, IndexWriter, and IndexReader with
// bare close() calls, leaking all three whenever the assertion (or any earlier
// call) threw. try-with-resources guarantees cleanup on every path.
public void testLengthEncodingBackwardCompatibility() throws IOException {
Similarity similarity = RandomPicks.randomFrom(random(), sims);
for (int indexCreatedVersionMajor : new int[] { Version.LUCENE_6_0_0.major, Version.LATEST.major }) {
for (int length : new int[] { 1, 2, 4 }) {
try (Directory dir = newDirectory()) {
// Stamp the requested index-created major version on the directory
// before any writer touches it.
new SegmentInfos(indexCreatedVersionMajor).commit(dir);
try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setSimilarity(similarity))) {
Document doc = new Document();
// A document of `length` identical tokens "b", e.g. "b b b b" for length 4.
String value = IntStream.range(0, length).mapToObj(i -> "b").collect(Collectors.joining(" "));
doc.add(new TextField("foo", value, Store.NO));
w.addDocument(doc);
try (IndexReader reader = DirectoryReader.open(w)) {
IndexSearcher searcher = newSearcher(reader);
searcher.setSimilarity(similarity);
Term term = new Term("foo", "b");
TermContext context = TermContext.build(reader.getContext(), term);
SimWeight simWeight = similarity.computeWeight(1f, searcher.collectionStatistics("foo"), searcher.termStatistics(term, context));
SimilarityBase.BasicSimScorer simScorer = (SimilarityBase.BasicSimScorer) similarity.simScorer(simWeight, reader.leaves().get(0));
// The decoded norm must round-trip back to the original token count.
float docLength = simScorer.getLengthValue(0);
assertEquals(length, (int) docLength);
}
}
}
}
}
}
Use of org.apache.lucene.index.TermContext in the lucene-solr project (Apache).
Class CommonTermsQuery, method collectTermContext.
// For every segment, looks up each query term and accumulates its per-segment
// statistics (doc freq / total term freq) into the matching slot of
// contextArray, creating the TermContext on first sight of the term.
public void collectTermContext(IndexReader reader, List<LeafReaderContext> leaves, TermContext[] contextArray, Term[] queryTerms) throws IOException {
for (LeafReaderContext leaf : leaves) {
final Fields segmentFields = leaf.reader().fields();
for (int idx = 0; idx < queryTerms.length; idx++) {
final Term queryTerm = queryTerms[idx];
final Terms fieldTerms = segmentFields.terms(queryTerm.field());
if (fieldTerms == null) {
// Field does not exist in this segment.
continue;
}
final TermsEnum iterator = fieldTerms.iterator();
assert iterator != null;
if (iterator == TermsEnum.EMPTY) {
continue;
}
if (iterator.seekExact(queryTerm.bytes())) {
final TermContext existing = contextArray[idx];
if (existing == null) {
// First segment containing this term: start a fresh context.
contextArray[idx] = new TermContext(reader.getContext(), iterator.termState(), leaf.ord, iterator.docFreq(), iterator.totalTermFreq());
} else {
// Merge this segment's stats into the running totals.
existing.register(iterator.termState(), leaf.ord, iterator.docFreq(), iterator.totalTermFreq());
}
}
}
}
}
Use of org.apache.lucene.index.TermContext in the lucene-solr project (Apache).
Class CommonTermsQuery, method buildQuery.
// Splits the query terms into low- and high-frequency buckets based on the
// configured maxTermFrequency cutoff, then assembles a two-level BooleanQuery:
// the low-frequency clause group is required (MUST), the high-frequency group
// is optional (SHOULD), each wrapped with its configured boost.
protected Query buildQuery(final int maxDoc, final TermContext[] contextArray, final Term[] queryTerms) {
final List<Query> lowFreq = new ArrayList<>();
final List<Query> highFreq = new ArrayList<>();
for (int i = 0; i < queryTerms.length; i++) {
final TermContext ctx = contextArray[i];
if (ctx == null) {
// Term never seen in the index: treat it as low-frequency.
lowFreq.add(newTermQuery(queryTerms[i], null));
continue;
}
// Absolute cutoff (maxTermFrequency >= 1 means a raw doc count) or
// relative cutoff (fraction of maxDoc).
final boolean isHighFreq = (maxTermFrequency >= 1f && ctx.docFreq() > maxTermFrequency)
|| (ctx.docFreq() > (int) Math.ceil(maxTermFrequency * (float) maxDoc));
if (isHighFreq) {
highFreq.add(newTermQuery(queryTerms[i], ctx));
} else {
lowFreq.add(newTermQuery(queryTerms[i], ctx));
}
}
Occur lowOccur = lowFreqOccur;
Occur highOccur = highFreqOccur;
// minimumNumberShouldMatch only applies to SHOULD groups with at least one clause.
final int lowMsm = (lowOccur == Occur.SHOULD && !lowFreq.isEmpty())
? calcLowFreqMinimumNumberShouldMatch(lowFreq.size()) : 0;
int highMsm = (highOccur == Occur.SHOULD && !highFreq.isEmpty())
? calcHighFreqMinimumNumberShouldMatch(highFreq.size()) : 0;
if (lowFreq.isEmpty() && highMsm == 0 && highOccur != Occur.MUST) {
// If there are no low-frequency clauses, rewrite the high-frequency terms
// into a conjunction to prevent slow queries.
highOccur = Occur.MUST;
}
final BooleanQuery.Builder topLevel = new BooleanQuery.Builder();
if (!lowFreq.isEmpty()) {
final BooleanQuery.Builder group = new BooleanQuery.Builder();
for (Query clause : lowFreq) {
group.add(clause, lowOccur);
}
group.setMinimumNumberShouldMatch(lowMsm);
topLevel.add(new BoostQuery(group.build(), lowFreqBoost), Occur.MUST);
}
if (!highFreq.isEmpty()) {
final BooleanQuery.Builder group = new BooleanQuery.Builder();
for (Query clause : highFreq) {
group.add(clause, highOccur);
}
group.setMinimumNumberShouldMatch(highMsm);
topLevel.add(new BoostQuery(group.build(), highFreqBoost), Occur.SHOULD);
}
return topLevel.build();
}
Use of org.apache.lucene.index.TermContext in the lucene-solr project (Apache).
Class FuzzyLikeThisQuery, method newTermQuery.
// Wraps a term in a TermQuery. When term frequencies are ignored, a
// constant-score wrapper is returned; otherwise the query carries an
// artificial TermContext whose overall doc freq and total term freq are
// forced to exactly 1, flattening the statistics across all terms.
private Query newTermQuery(IndexReader reader, Term term) throws IOException {
if (ignoreTF) {
return new ConstantScoreQuery(new TermQuery(term));
}
// Build an artificial TermContext: after registering every segment, the
// cumulative df and ttf must both equal 1.
final TermContext artificial = new TermContext(reader.getContext());
for (LeafReaderContext leaf : reader.leaves()) {
final Terms fieldTerms = leaf.reader().terms(term.field());
if (fieldTerms == null) {
continue;
}
final TermsEnum te = fieldTerms.iterator();
if (te.seekExact(term.bytes())) {
// Register the remainder needed to keep the running totals at 1:
// the first matching segment contributes 1, all later ones 0.
final int remaining = 1 - artificial.docFreq();
artificial.register(te.termState(), leaf.ord, remaining, remaining);
}
}
return new TermQuery(term, artificial);
}
Aggregations