Example usage of org.apache.lucene.search.similarities.TFIDFSimilarity in the lucene-solr project (Apache), from the class TestFunctionQuery, method testTFIDFFunctions:
public void testTFIDFFunctions() {
  clearIndex();

  // The expected scores below are computed with the very similarity instance the
  // *_tfidf field types are configured with, so resolve and sanity-check it first.
  final Similarity schemaSim = h.getCore().getLatestSchema().getFieldType("a_tfidf").getSimilarity();
  assertNotNull("Test needs *_tfidf to use a TFIDFSimilarity ... who broke the config?", schemaSim);
  assertTrue("Test needs *_tfidf to use a TFIDFSimilarity ... who broke the config: " + schemaSim.getClass(),
      schemaSim instanceof TFIDFSimilarity);
  final TFIDFSimilarity similarity = (TFIDFSimilarity) schemaSim;

  // Index six docs, committing in between so the index spans multiple segments.
  assertU(adoc("id", "1", "a_tdt", "2009-08-31T12:10:10.123Z", "b_tdt", "2009-08-31T12:10:10.124Z"));
  assertU(adoc("id", "2", "a_tfidf", "how now brown cow"));
  assertU(commit()); // create more than one segment
  assertU(adoc("id", "3", "a_tfidf", "brown cow"));
  assertU(adoc("id", "4"));
  assertU(commit()); // create more than one segment
  assertU(adoc("id", "5"));
  assertU(adoc("id", "6", "a_tfidf", "cow cow cow cow cow"));
  assertU(commit());

  // make sure it doesn't get a NPE if no terms are present in a field.
  assertQ(req("fl", "*,score", "q", "{!func}idf(nofield_tfidf,cow)", "fq", "id:6"),
      "//float[@name='score']='" + similarity.idf(0, 6) + "'");
  assertQ(req("fl", "*,score", "q", "{!func}tf(nofield_tfidf,cow)", "fq", "id:6"),
      "//float[@name='score']='" + similarity.tf(0) + "'");

  // fields with real values
  assertQ(req("fl", "*,score", "q", "{!func}idf(a_tfidf,cow)", "fq", "id:6"),
      "//float[@name='score']='" + similarity.idf(3, 6) + "'");
  assertQ(req("fl", "*,score", "q", "{!func}tf(a_tfidf,cow)", "fq", "id:6"),
      "//float[@name='score']='" + similarity.tf(5) + "'");
  assertQ(req("fl", "*,score", "q", "{!func}norm(a_tfidf)", "fq", "id:2"), // 1/sqrt(4)==1/2==0.5
      "//float[@name='score']='0.5'");
}
Example usage of org.apache.lucene.search.similarities.TFIDFSimilarity in the lucene-solr project (Apache), from the class IDFValueSource, method getValues:
@Override
public FunctionValues getValues(Map context, LeafReaderContext readerContext) throws IOException {
  // The searcher is stashed in the shared context map under the "searcher" key.
  final IndexSearcher searcher = (IndexSearcher) context.get("searcher");
  final TFIDFSimilarity sim = asTFIDF(searcher.getSimilarity(true), field);
  if (sim == null) {
    throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as ClassicSimilarity)");
  }
  // idf is the same for every document, so it is exposed as a constant value source.
  final int df = searcher.getIndexReader().docFreq(new Term(indexedField, indexedBytes));
  final float idfValue = sim.idf(df, searcher.getIndexReader().maxDoc());
  return new ConstDoubleDocValues(idfValue, this);
}
Example usage of org.apache.lucene.search.similarities.TFIDFSimilarity in the lucene-solr project (Apache), from the class SweetSpotSimilarityTest, method testSweetSpotTf:
public void testSweetSpotTf() {
  SweetSpotSimilarity ss = new SweetSpotSimilarity();
  final TFIDFSimilarity classic = new ClassicSimilarity();
  final TFIDFSimilarity sweet = ss;

  // With both baseline factors zeroed, the sweet-spot tf curve matches the classic one.
  ss.setBaselineTfFactors(0.0f, 0.0f);
  for (int freq = 1; freq < 1000; freq++) {
    assertEquals("tf: i=" + freq, classic.tf(freq), sweet.tf(freq), 0.0f);
  }

  // A positive base raises tf above classic everywhere.
  ss.setBaselineTfFactors(1.0f, 0.0f);
  for (int freq = 1; freq < 1000; freq++) {
    assertTrue("tf: i=" + freq + " : d=" + classic.tf(freq) + " < s=" + sweet.tf(freq),
        classic.tf(freq) < sweet.tf(freq));
  }

  // Below the min threshold, tf is flat at the base value.
  ss.setBaselineTfFactors(1.0f, 6.0f);
  for (int freq = 1; freq <= 6; freq++) {
    assertEquals("tf flat1: i=" + freq, 1.0f, sweet.tf(freq), 0.0f);
  }
  ss.setBaselineTfFactors(2.0f, 6.0f);
  for (int freq = 1; freq <= 6; freq++) {
    assertEquals("tf flat2: i=" + freq, 2.0f, sweet.tf(freq), 0.0f);
  }
  // Past the threshold the sweet-spot curve stays below classic.
  for (int freq = 6; freq <= 1000; freq++) {
    assertTrue("tf: i=" + freq + " : s=" + sweet.tf(freq) + " < d=" + classic.tf(freq),
        sweet.tf(freq) < classic.tf(freq));
  }

  // stupidity
  assertEquals("tf zero", 0.0f, sweet.tf(0), 0.0f);
}
Example usage of org.apache.lucene.search.similarities.TFIDFSimilarity in the lucene-solr project (Apache), from the class TFValueSource, method getValues:
@Override
public FunctionValues getValues(Map context, LeafReaderContext readerContext) throws IOException {
  // Terms for the target field in this segment; null when the segment has no
  // indexed terms for the field.
  Fields fields = readerContext.reader().fields();
  final Terms terms = fields.terms(indexedField);
  // The searcher is stashed in the shared context map under the "searcher" key.
  IndexSearcher searcher = (IndexSearcher) context.get("searcher");
  final TFIDFSimilarity similarity = IDFValueSource.asTFIDF(searcher.getSimilarity(true), indexedField);
  if (similarity == null) {
    throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as ClassicSimilarity)");
  }
  return new FloatDocValues(this) {
    // Postings for the requested term; never null after reset() (an empty
    // stand-in is installed when the term or field is absent).
    PostingsEnum docs;
    // Doc id the postings enum is currently positioned at.
    int atDoc;
    // Last doc handed to floatVal(); used to detect out-of-order access.
    int lastDocRequested = -1;

    // Instance initializer: position the postings enum before first use.
    {
      reset();
    }

    // (Re)seek the postings enum to the target term from the start of the segment.
    public void reset() throws IOException {
      if (terms != null) {
        final TermsEnum termsEnum = terms.iterator();
        if (termsEnum.seekExact(indexedBytes)) {
          docs = termsEnum.postings(null);
        } else {
          docs = null;
        }
      } else {
        docs = null;
      }
      // Term (or field) not present: substitute an exhausted postings enum so
      // floatVal() needs no null checks and every doc yields tf(0).
      if (docs == null) {
        docs = new PostingsEnum() {
          @Override
          public int freq() {
            return 0;
          }
          @Override
          public int nextPosition() throws IOException {
            return -1;
          }
          @Override
          public int startOffset() throws IOException {
            return -1;
          }
          @Override
          public int endOffset() throws IOException {
            return -1;
          }
          @Override
          public BytesRef getPayload() throws IOException {
            return null;
          }
          @Override
          public int docID() {
            return DocIdSetIterator.NO_MORE_DOCS;
          }
          @Override
          public int nextDoc() {
            return DocIdSetIterator.NO_MORE_DOCS;
          }
          @Override
          public int advance(int target) {
            return DocIdSetIterator.NO_MORE_DOCS;
          }
          @Override
          public long cost() {
            return 0;
          }
        };
      }
      atDoc = -1;
    }

    // Returns the similarity's tf() for the term's frequency in this doc, or
    // tf(0) if the doc does not contain the term.
    @Override
    public float floatVal(int doc) {
      try {
        if (doc < lastDocRequested) {
          // out-of-order access.... reset
          reset();
        }
        lastDocRequested = doc;
        if (atDoc < doc) {
          atDoc = docs.advance(doc);
        }
        if (atDoc > doc) {
          // end, or because the next doc is after this doc.
          return similarity.tf(0);
        }
        // a match!
        return similarity.tf(docs.freq());
      } catch (IOException e) {
        throw new RuntimeException("caught exception in function " + description() + " : doc=" + doc, e);
      }
    }
  };
}
Example usage of org.apache.lucene.search.similarities.TFIDFSimilarity in the Anserini project (castorini), from the class TFIDFFeatureExtractor, method extract:
@Override
public float extract(Document doc, Terms terms, RerankerContext context) {
  float score = 0.0f;
  // Per-query-token frequency of the term within this document's term vector.
  Map<String, Long> countMap = new HashMap<>();
  // Per-query-token document frequency across the whole index.
  Map<String, Integer> docFreqs = new HashMap<>();
  IndexReader reader = context.getIndexSearcher().getIndexReader();
  long numDocs = reader.numDocs();

  for (String queryToken : context.getQueryTokens()) {
    try {
      docFreqs.put(queryToken, reader.docFreq(new Term(context.getField(), queryToken)));
    } catch (IOException e) {
      // Fall back to df=0 for this token, but keep the cause in the log
      // instead of silently discarding the stack trace.
      LOG.error("Error trying to read document frequency", e);
      docFreqs.put(queryToken, 0);
    }
  }

  try {
    TermsEnum termsEnum = terms.iterator();
    while (termsEnum.next() != null) {
      String termString = termsEnum.term().utf8ToString();
      if (context.getQueryTokens().contains(termString)) {
        countMap.put(termString, termsEnum.totalTermFreq());
      }
    }
  } catch (IOException e) {
    LOG.error("Error while accessing term vector", e);
  }

  TFIDFSimilarity similarity = new ClassicSimilarity();
  // Coordination factor: how many of our query tokens were found in the doc.
  float coord = similarity.coord(countMap.size(), context.getQueryTokens().size());

  for (String token : context.getQueryTokens()) {
    // getOrDefault avoids the containsKey + get double lookup.
    long termFreq = countMap.getOrDefault(token, 0L);
    long docFreq = docFreqs.getOrDefault(token, 0);
    float tf = similarity.tf(termFreq);
    float idf = similarity.idf(docFreq, numDocs);
    // Classic Lucene scoring applies idf on both the query and document side,
    // hence idf squared.
    score += tf * idf * idf;
  }
  score *= coord;
  return score;
}
Aggregations