Use of org.apache.lucene.search.similarities.BM25Similarity in the Anserini project by castorini.
From the class SdmQueryTest, method spanQueriesTest.
/**
 * Exercises span queries against the test index and checks that the queries
 * built by {@code SdmQueryGenerator} have the expected string form and that
 * their top score equals the weighted sum of the component query scores
 * (bag-of-words terms, ordered windows, unordered windows).
 *
 * @throws Exception if the index cannot be opened or searched
 */
@Test
public void spanQueriesTest() throws Exception {
  // try-with-resources closes the reader and the directory even when an
  // assertion fails part-way through the test.
  try (Directory dir = FSDirectory.open(tempDir1);
       IndexReader reader = DirectoryReader.open(dir)) {
    IndexSearcher searcher = newSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());

    SpanTermQuery t1 = new SpanTermQuery(new Term(field, "john"));
    SpanTermQuery t2 = new SpanTermQuery(new Term(field, "bush"));

    // In-order "john ... bush" with slop 3: no hit expected.
    SpanNearQuery q = new SpanNearQuery(new SpanQuery[] { t1, t2 }, 3, true);
    TopDocs rs = searcher.search(q, 1);
    assertEquals(0, rs.scoreDocs.length);

    // Same order with slop 8: one hit expected.
    q = new SpanNearQuery(new SpanQuery[] { t1, t2 }, 8, true);
    rs = searcher.search(q, 1);
    assertEquals(1, rs.scoreDocs.length);

    // Reversed clause order, still in-order matching: no hit expected.
    q = new SpanNearQuery(new SpanQuery[] { t2, t1 }, 8, true);
    rs = searcher.search(q, 1);
    assertEquals(0, rs.scoreDocs.length);

    // Reversed clause order with unordered matching: one hit expected.
    q = new SpanNearQuery(new SpanQuery[] { t2, t1 }, 8, false);
    rs = searcher.search(q, 1);
    assertEquals(1, rs.scoreDocs.length);

    // A larger unordered slop still yields one hit.
    q = new SpanNearQuery(new SpanQuery[] { t2, t1 }, 16, false);
    rs = searcher.search(q, 1);
    assertEquals(1, rs.scoreDocs.length);

    String sdmQueryStr = "fox information river";

    // Term weight only: the SDM score must equal the bag-of-words score.
    Query sdmQuery1 = new SdmQueryGenerator(1.0f, 0.0f, 0.0f).buildQuery(field, analyzer, sdmQueryStr);
    assertEquals(
        "(text:fox text:inform text:river)^1.0 "
            + "(spanNear([text:fox, text:inform], 1, true) spanNear([text:inform, text:river], 1, true))^0.0 "
            + "(spanNear([text:fox, text:inform], 8, false) spanNear([text:inform, text:river], 8, false))^0.0",
        sdmQuery1.toString());
    TopDocs rs1 = searcher.search(sdmQuery1, 1);
    Query termQuery = new BagOfWordsQueryGenerator().buildQuery(field, analyzer, sdmQueryStr);
    TopDocs rsTerm = searcher.search(termQuery, 1);
    assertEquals(rs1.scoreDocs[0].score, rsTerm.scoreDocs[0].score, 1e-6f);

    // Ordered-window weight only: the SDM score must equal the sum of the two
    // ordered span-near scores.
    Query sdmQuery2 = new SdmQueryGenerator(0.0f, 1.0f, 0.0f).buildQuery(field, analyzer, sdmQueryStr);
    assertEquals(
        "(text:fox text:inform text:river)^0.0 "
            + "(spanNear([text:fox, text:inform], 1, true) spanNear([text:inform, text:river], 1, true))^1.0 "
            + "(spanNear([text:fox, text:inform], 8, false) spanNear([text:inform, text:river], 8, false))^0.0",
        sdmQuery2.toString());
    TopDocs rs2 = searcher.search(sdmQuery2, 1);
    Query orderedWindowQuery1 = new SpanNearQuery(
        new SpanQuery[] {
            new SpanTermQuery(new Term(field, "fox")),
            new SpanTermQuery(new Term(field, "inform")) },
        1, true);
    Query orderedWindowQuery2 = new SpanNearQuery(
        new SpanQuery[] {
            new SpanTermQuery(new Term(field, "inform")),
            new SpanTermQuery(new Term(field, "river")) },
        1, true);
    TopDocs rsOrderedWindow1 = searcher.search(orderedWindowQuery1, 1);
    TopDocs rsOrderedWindow2 = searcher.search(orderedWindowQuery2, 1);
    assertEquals(rs2.scoreDocs[0].score,
        rsOrderedWindow1.scoreDocs[0].score + rsOrderedWindow2.scoreDocs[0].score, 1e-6f);

    // Unordered-window weight only: the SDM score must equal the sum of the two
    // unordered span-near scores.
    Query sdmQuery3 = new SdmQueryGenerator(0.0f, 0.0f, 1.0f).buildQuery(field, analyzer, sdmQueryStr);
    assertEquals(
        "(text:fox text:inform text:river)^0.0 "
            + "(spanNear([text:fox, text:inform], 1, true) spanNear([text:inform, text:river], 1, true))^0.0 "
            + "(spanNear([text:fox, text:inform], 8, false) spanNear([text:inform, text:river], 8, false))^1.0",
        sdmQuery3.toString());
    TopDocs rs3 = searcher.search(sdmQuery3, 1);
    Query unorderedWindowQuery1 = new SpanNearQuery(
        new SpanQuery[] {
            new SpanTermQuery(new Term(field, "fox")),
            new SpanTermQuery(new Term(field, "inform")) },
        8, false);
    Query unorderedWindowQuery2 = new SpanNearQuery(
        new SpanQuery[] {
            new SpanTermQuery(new Term(field, "inform")),
            new SpanTermQuery(new Term(field, "river")) },
        8, false);
    TopDocs rsUnorderedWindow1 = searcher.search(unorderedWindowQuery1, 1);
    TopDocs rsUnorderedWindow2 = searcher.search(unorderedWindowQuery2, 1);
    assertEquals(rs3.scoreDocs[0].score,
        rsUnorderedWindow1.scoreDocs[0].score + rsUnorderedWindow2.scoreDocs[0].score, 1e-6f);

    // Mixed weights: the SDM score must equal the weighted sum of all three
    // component scores computed above.
    Query sdmQuery4 = new SdmQueryGenerator(0.85f, 0.1f, 0.05f).buildQuery(field, analyzer, sdmQueryStr);
    assertEquals(
        "(text:fox text:inform text:river)^0.85 "
            + "(spanNear([text:fox, text:inform], 1, true) spanNear([text:inform, text:river], 1, true))^0.1 "
            + "(spanNear([text:fox, text:inform], 8, false) spanNear([text:inform, text:river], 8, false))^0.05",
        sdmQuery4.toString());
    TopDocs rs4 = searcher.search(sdmQuery4, 1);
    assertEquals(rs4.scoreDocs[0].score,
        rsTerm.scoreDocs[0].score * 0.85f
            + (rsOrderedWindow1.scoreDocs[0].score + rsOrderedWindow2.scoreDocs[0].score) * 0.1f
            + (rsUnorderedWindow1.scoreDocs[0].score + rsUnorderedWindow2.scoreDocs[0].score) * 0.05f,
        1e-6f);
  }
}
Use of org.apache.lucene.search.similarities.BM25Similarity in the Anserini project by castorini.
From the class BM25PrfReranker, method rerank.
/**
 * Re-ranks by expanding the original query with feedback terms and re-running
 * the expanded query with a {@code BM25PrfSimilarity} temporarily installed on
 * the context's searcher.
 *
 * <p>The searcher's original similarity is restored on every exit path,
 * including when query expansion or the search itself throws.
 *
 * @param docs the initial ranking used as the feedback set
 * @param context supplies the searcher, query text, query id and search arguments
 * @return the results of the expanded query, or {@code docs} unchanged if the
 *     search fails with an {@link IOException}
 */
@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
  IndexSearcher searcher = context.getIndexSearcher();
  BM25Similarity originalSimilarity = (BM25Similarity) searcher.getSimilarity();

  // set similarity to BM25PRF
  searcher.setSimilarity(new BM25PrfSimilarity(k1, b));

  TopDocs rs;
  try {
    IndexReader reader = searcher.getIndexReader();
    List<String> originalQueryTerms = AnalyzerUtils.analyze(analyzer, context.getQueryText());
    boolean useRf = (context.getSearchArgs().rf_qrels != null);
    PrfFeatures fv = expandQuery(originalQueryTerms, docs, reader, useRf);
    Query newQuery = fv.toQuery();

    if (this.outputQuery) {
      LOG.info("QID: " + context.getQueryId());
      LOG.info("Original Query: " + context.getQuery().toString(this.field));
      LOG.info("Running new query: " + newQuery.toString(this.field));
      LOG.info("Features: " + fv.toString());
    }

    // Figure out how to break the scoring ties.
    if (context.getSearchArgs().arbitraryScoreTieBreak) {
      rs = searcher.search(newQuery, context.getSearchArgs().hits);
    } else if (context.getSearchArgs().searchtweets) {
      rs = searcher.search(newQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_TWEETID, true);
    } else {
      rs = searcher.search(newQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_DOCID, true);
    }
  } catch (IOException e) {
    // Best effort: fall back to the incoming ranking, but record the failure
    // through the logger instead of dumping it to stderr.
    LOG.error("Search with expanded query failed; returning original ranking", e);
    return docs;
  } finally {
    // set similarity back, whatever happened above, so the searcher handed
    // out by the context is not left with the PRF similarity installed
    searcher.setSimilarity(originalSimilarity);
  }

  return ScoredDocuments.fromTopDocs(rs, searcher);
}
Use of org.apache.lucene.search.similarities.BM25Similarity in the Anserini project by castorini.
From the class IndexReaderUtilsTest, method computeAllTermBM25Weights.
/**
 * Builds a term/document score matrix by running a {@code TermQuery} for every
 * term in the "contents" field, then walks each document's term vector and
 * checks that {@code IndexReaderUtils.getBM25AnalyzedTermWeight} returns the
 * same weight for every (document, term) pair.
 *
 * @throws Exception if the index cannot be opened or read
 */
@Test
public void computeAllTermBM25Weights() throws Exception {
  SearchArgs args = new SearchArgs();
  Similarity similarity = new BM25Similarity(Float.parseFloat(args.bm25_k1[0]), Float.parseFloat(args.bm25_b[0]));

  // try-with-resources closes the reader and then the directory even when an
  // assertion fails.
  try (Directory dir = FSDirectory.open(tempDir1);
       IndexReader reader = DirectoryReader.open(dir)) {
    // One searcher serves every term query; it does not depend on the term.
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);

    // Ask for as many hits as there are documents so that no posting is
    // dropped from the matrix (search requires a limit of at least 1).
    int maxHits = Math.max(1, reader.numDocs());

    // The complete term/doc matrix
    Map<String, Map<String, Float>> termDocMatrix = new HashMap<>();

    // We're going to iterate through all the terms in the dictionary to build the term/doc matrix
    Terms terms = MultiTerms.getTerms(reader, "contents");
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
      String term = text.utf8ToString();
      TopDocs rs = searcher.search(new TermQuery(new Term("contents", term)), maxHits);
      for (int i = 0; i < rs.scoreDocs.length; i++) {
        String docid = reader.document(rs.scoreDocs[i].doc).getField("id").stringValue();
        termDocMatrix.computeIfAbsent(term, key -> new HashMap<>()).put(docid, rs.scoreDocs[i].score);
      }
    }

    int numDocs = reader.numDocs();
    // Iterate through the document vectors, and verify that we have the same values as in the term/doc matrix
    for (int i = 0; i < numDocs; i++) {
      Terms termVector = reader.getTermVector(i, "contents");
      String docid = IndexReaderUtils.convertLuceneDocidToDocid(reader, i);

      // For this document, iterate through the terms.
      termsEnum = termVector.iterator();
      while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        float weight = IndexReaderUtils.getBM25AnalyzedTermWeight(reader, docid, term);
        assertEquals(termDocMatrix.get(term).get(docid), weight, 10e-6);
      }
    }
  }
}
Use of org.apache.lucene.search.similarities.BM25Similarity in the Anserini project by castorini.
From the class IndexReaderUtilsTest, method testComputeQueryDocumentScore.
/**
 * For a handful of queries, checks that the score
 * {@code IndexReaderUtils.computeQueryDocumentScoreWithSimilarity} computes for
 * each retrieved document matches the score reported by {@code SimpleSearcher},
 * and that a document retrieved by none of the queries ("doc3") scores zero.
 *
 * @throws Exception if the index cannot be opened or searched
 */
@Test
public void testComputeQueryDocumentScore() throws Exception {
  Similarity similarity = new BM25Similarity(0.9f, 0.4f);

  // A bunch of test queries...
  String[] queries = { "text city", "text", "city" };

  // try-with-resources releases the searcher, reader and directory even when
  // an assertion fails; the original never closed the SimpleSearcher.
  try (SimpleSearcher searcher = new SimpleSearcher(tempDir1.toString());
       Directory dir = FSDirectory.open(tempDir1);
       IndexReader reader = DirectoryReader.open(dir)) {
    for (String query : queries) {
      SimpleSearcher.Result[] results = searcher.search(query);

      // Strategy is to loop over the results, compute query-document score individually, and compare.
      for (int i = 0; i < results.length; i++) {
        float score = IndexReaderUtils.computeQueryDocumentScoreWithSimilarity(
            reader, results[i].docid, query, similarity);
        assertEquals(score, results[i].score, 10e-5);
      }

      // This is hard coded - doc3 isn't retrieved by any of the queries.
      assertEquals(0.0f,
          IndexReaderUtils.computeQueryDocumentScoreWithSimilarity(reader, "doc3", query, similarity), 10e-6);
    }
  }
}
Aggregations