Use of org.apache.lucene.search.TermStatistics in project lucene-solr by apache.
The class BM25Similarity, method idfExplain:
/**
 * Computes a score factor for a phrase.
 *
 * <p>
 * The default implementation sums the idf factor for
 * each term in the phrase.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an Explanation object that includes both an idf
 *         score factor for the phrase and an explanation
 *         for each term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
  // sum into a double before casting into a float
  double idf = 0d;
  List<Explanation> details = new ArrayList<>();
  for (final TermStatistics stat : termStats) {
    Explanation idfExplain = idfExplain(collectionStats, stat);
    details.add(idfExplain);
    idf += idfExplain.getValue();
  }
  return Explanation.match((float) idf, "idf(), sum of:", details);
}
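For context, the loop above delegates to a single-term overload of idfExplain. The following is a minimal sketch of what that overload looks like, assuming BM25's standard idf formula log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)); the exact shape varies across Lucene versions, so treat this as illustrative rather than a verbatim excerpt.

// Sketch of the per-term overload invoked in the loop above (assumed shape,
// not copied from a specific Lucene release).
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long df = termStats.docFreq();
  // Fall back to maxDoc when the per-field docCount is unavailable (-1).
  final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
  final float idf = (float) Math.log(1 + (docCount - df + 0.5D) / (df + 0.5D));
  return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
}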
Use of org.apache.lucene.search.TermStatistics in project lucene-solr by apache.
The class TFIDFSimilarity, method idfExplain:
/**
 * Computes a score factor for a phrase.
 *
 * <p>
 * The default implementation sums the idf factor for
 * each term in the phrase.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an Explanation object that includes both an idf
 *         score factor for the phrase and an explanation
 *         for each term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
  // sum into a double before casting into a float
  double idf = 0d;
  List<Explanation> subs = new ArrayList<>();
  for (final TermStatistics stat : termStats) {
    Explanation idfExplain = idfExplain(collectionStats, stat);
    subs.add(idfExplain);
    idf += idfExplain.getValue();
  }
  return Explanation.match((float) idf, "idf(), sum of:", subs);
}
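Here too the loop delegates to a single-term overload, but in TFIDFSimilarity the idf function itself is abstract and supplied by a subclass. As a point of comparison with the BM25 sketch above, a minimal sketch of the formula ClassicSimilarity plugs in, roughly log((docCount + 1) / (docFreq + 1)) + 1; treat the exact expression as version-dependent.

// Sketch of the subclass hook (assumed shape): TFIDFSimilarity declares
// idf(docFreq, docCount) abstract; ClassicSimilarity implements it roughly as:
protected float idf(long docFreq, long docCount) {
  return (float) (Math.log((docCount + 1) / (double) (docFreq + 1)) + 1.0);
}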
Use of org.apache.lucene.search.TermStatistics in project lucene-solr by apache.
The class TestMemoryIndex, method testSimilarities:
@Test
public void testSimilarities() throws IOException {
  MemoryIndex mi = new MemoryIndex();
  mi.addField("f1", "a long text field that contains many many terms", analyzer);
  IndexSearcher searcher = mi.createSearcher();
  LeafReader reader = (LeafReader) searcher.getIndexReader();
  NumericDocValues norms = reader.getNormValues("f1");
  assertEquals(0, norms.nextDoc());
  float n1 = norms.longValue();

  // Norms are re-computed when we change the Similarity
  mi.setSimilarity(new Similarity() {

    @Override
    public long computeNorm(FieldInvertState state) {
      return 74;
    }

    @Override
    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
      throw new UnsupportedOperationException();
    }

    @Override
    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
      throw new UnsupportedOperationException();
    }
  });
  norms = reader.getNormValues("f1");
  assertEquals(0, norms.nextDoc());
  float n2 = norms.longValue();
  assertTrue(n1 != n2);
  TestUtil.checkReader(reader);
}
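Note that only computeNorm matters to this test: as the inline comment says, MemoryIndex recomputes norms with the current Similarity when they are fetched again, so the scoring hooks (computeWeight and simScorer) are never invoked here and can safely throw UnsupportedOperationException.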
Use of org.apache.lucene.search.TermStatistics in project elasticsearch by elastic.
The class TermVectorsWriter, method setFields:
void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, @Nullable AggregatedDfs dfs, @Nullable TermVectorsFilter termVectorsFilter) throws IOException {
  int numFieldsWritten = 0;
  PostingsEnum docsAndPosEnum = null;
  PostingsEnum docsEnum = null;
  boolean hasScores = termVectorsFilter != null;
  for (String field : termVectorsByField) {
    if ((selectedFields != null) && (!selectedFields.contains(field))) {
      continue;
    }
    Terms fieldTermVector = termVectorsByField.terms(field);
    Terms topLevelTerms = topLevelFields.terms(field);
    // if no terms found, take the retrieved term vector fields for stats
    if (topLevelTerms == null) {
      topLevelTerms = EMPTY_TERMS;
    }
    TermsEnum topLevelIterator = topLevelTerms.iterator();
    boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions();
    boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets();
    boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads();
    long termsSize = fieldTermVector.size();
    if (hasScores) {
      termsSize = Math.min(termsSize, termVectorsFilter.size(field));
    }
    startField(field, termsSize, positions, offsets, payloads);
    if (flags.contains(Flag.FieldStatistics)) {
      if (dfs != null) {
        writeFieldStatistics(dfs.fieldStatistics().get(field));
      } else {
        writeFieldStatistics(topLevelTerms);
      }
    }
    TermsEnum iterator = fieldTermVector.iterator();
    final boolean useDocsAndPos = positions || offsets || payloads;
    while (iterator.next() != null) {
      // iterate all terms of the current field
      BytesRef termBytesRef = iterator.term();
      Term term = new Term(field, termBytesRef);
      // with filtering we only keep the best terms
      if (hasScores && !termVectorsFilter.hasScoreTerm(term)) {
        continue;
      }
      startTerm(termBytesRef);
      if (flags.contains(Flag.TermStatistics)) {
        // get the doc frequency
        if (dfs != null) {
          final TermStatistics statistics = dfs.termStatistics().get(term);
          writeTermStatistics(statistics == null ? new TermStatistics(termBytesRef, 0, 0) : statistics);
        } else {
          boolean foundTerm = topLevelIterator.seekExact(termBytesRef);
          if (foundTerm) {
            writeTermStatistics(topLevelIterator);
          } else {
            writeTermStatistics(new TermStatistics(termBytesRef, 0, 0));
          }
        }
      }
      if (useDocsAndPos) {
        // given we have pos or offsets
        docsAndPosEnum = writeTermWithDocsAndPos(iterator, docsAndPosEnum, positions, offsets, payloads);
      } else {
        // if we do not have the positions stored, we need to
        // get the frequency from a PostingsEnum.
        docsEnum = writeTermWithDocsOnly(iterator, docsEnum);
      }
      if (hasScores) {
        writeScoreTerm(termVectorsFilter.getScoreTerm(term));
      }
    }
    numFieldsWritten++;
  }
  response.setTermVectorsField(output);
  response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics), flags.contains(Flag.FieldStatistics), hasScores));
}
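The per-term statistics logic above follows a three-way fallback: prefer the aggregated distributed-frequency stats, otherwise look the term up in the local shard, and emit zeroed stats when the term is unknown. A minimal sketch of that fallback distilled into one helper; the helper name resolveTermStatistics is hypothetical, and it returns the stats rather than writing them.

// Hypothetical helper illustrating the fallback in setFields above:
// dfs stats first, then the local top-level TermsEnum, then zeroed stats.
static TermStatistics resolveTermStatistics(@Nullable AggregatedDfs dfs, TermsEnum topLevelIterator,
                                            Term term, BytesRef termBytesRef) throws IOException {
  if (dfs != null) {
    TermStatistics statistics = dfs.termStatistics().get(term);
    return statistics == null ? new TermStatistics(termBytesRef, 0, 0) : statistics;
  }
  if (topLevelIterator.seekExact(termBytesRef)) {
    // Term exists locally: read its doc and collection frequencies.
    return new TermStatistics(topLevelIterator.term(), topLevelIterator.docFreq(), topLevelIterator.totalTermFreq());
  }
  return new TermStatistics(termBytesRef, 0, 0);
}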
Use of org.apache.lucene.search.TermStatistics in project elasticsearch by elastic.
The class DfsPhase, method execute:
@Override
public void execute(SearchContext context) {
  final ObjectHashSet<Term> termsSet = new ObjectHashSet<>();
  try {
    context.searcher().createNormalizedWeight(context.query(), true).extractTerms(new DelegateSet(termsSet));
    for (RescoreSearchContext rescoreContext : context.rescore()) {
      rescoreContext.rescorer().extractTerms(context, rescoreContext, new DelegateSet(termsSet));
    }
    Term[] terms = termsSet.toArray(Term.class);
    TermStatistics[] termStatistics = new TermStatistics[terms.length];
    IndexReaderContext indexReaderContext = context.searcher().getTopReaderContext();
    for (int i = 0; i < terms.length; i++) {
      if (context.isCancelled()) {
        throw new TaskCancelledException("cancelled");
      }
      // LUCENE 4 UPGRADE: cache TermContext?
      TermContext termContext = TermContext.build(indexReaderContext, terms[i]);
      termStatistics[i] = context.searcher().termStatistics(terms[i], termContext);
    }
    ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics = HppcMaps.newNoNullKeysMap();
    for (Term term : terms) {
      assert term.field() != null : "field is null";
      if (!fieldStatistics.containsKey(term.field())) {
        final CollectionStatistics collectionStatistics = context.searcher().collectionStatistics(term.field());
        fieldStatistics.put(term.field(), collectionStatistics);
        if (context.isCancelled()) {
          throw new TaskCancelledException("cancelled");
        }
      }
    }
    context.dfsResult().termsStatistics(terms, termStatistics).fieldStatistics(fieldStatistics).maxDoc(context.searcher().getIndexReader().maxDoc());
  } catch (Exception e) {
    throw new DfsPhaseExecutionException(context, "Exception during dfs phase", e);
  } finally {
    // don't hold on to terms
    termsSet.clear();
  }
}
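Stripped of the Elasticsearch plumbing (cancellation checks, HPPC maps, the dfs result object), the core of this phase is a plain Lucene stats-gathering loop. A minimal sketch against a bare IndexSearcher; the method name collectStatistics is hypothetical, and the TermContext-based termStatistics signature shown is the one used above (it changed in later Lucene versions).

// Hypothetical distillation of the per-term loop in DfsPhase.execute:
// resolve each term's TermContext, then ask the searcher for its stats.
static TermStatistics[] collectStatistics(IndexSearcher searcher, Term[] terms) throws IOException {
  IndexReaderContext ctx = searcher.getTopReaderContext();
  TermStatistics[] stats = new TermStatistics[terms.length];
  for (int i = 0; i < terms.length; i++) {
    stats[i] = searcher.termStatistics(terms[i], TermContext.build(ctx, terms[i]));
  }
  return stats;
}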