Search in sources:

Example 26 with BM25Similarity

use of org.apache.lucene.search.similarities.BM25Similarity in project Anserini by castorini.

In class SdmQueryTest, method spanQueriesTest:

/**
 * Exercises span queries against the test index and checks that {@code SdmQueryGenerator}
 * builds the expected query string and score for four weight settings.
 *
 * <p>The first part asserts how many hits a {@code SpanNearQuery} over "john" and "bush"
 * returns for different slop values, term orders and the in-order flag. The second part
 * builds SDM queries for "fox information river" with all weight on one component at a
 * time (term, ordered window, unordered window) and compares each score against the
 * corresponding stand-alone query; the last case checks the weighted combination.
 *
 * @throws Exception if the index cannot be opened or searched
 */
@Test
public void spanQueriesTest() throws Exception {
    // try-with-resources closes the reader and then the directory, even when an assertion
    // fails part-way through the test.
    try (Directory dir = FSDirectory.open(tempDir1);
         IndexReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = newSearcher(reader);
        searcher.setSimilarity(new BM25Similarity());

        SpanTermQuery t1 = new SpanTermQuery(new Term(field, "john"));
        SpanTermQuery t2 = new SpanTermQuery(new Term(field, "bush"));
        SpanNearQuery q;
        TopDocs rs;

        // Ordered "john ... bush": no hit with slop 3, one hit with slop 8.
        q = new SpanNearQuery(new SpanQuery[] { t1, t2 }, 3, true);
        rs = searcher.search(q, 1);
        assertEquals(0, rs.scoreDocs.length);
        q = new SpanNearQuery(new SpanQuery[] { t1, t2 }, 8, true);
        rs = searcher.search(q, 1);
        assertEquals(1, rs.scoreDocs.length);

        // Reversed term order: no hit when order is enforced, one hit when it is not.
        q = new SpanNearQuery(new SpanQuery[] { t2, t1 }, 8, true);
        rs = searcher.search(q, 1);
        assertEquals(0, rs.scoreDocs.length);
        q = new SpanNearQuery(new SpanQuery[] { t2, t1 }, 8, false);
        rs = searcher.search(q, 1);
        assertEquals(1, rs.scoreDocs.length);
        q = new SpanNearQuery(new SpanQuery[] { t2, t1 }, 16, false);
        rs = searcher.search(q, 1);
        assertEquals(1, rs.scoreDocs.length);

        String sdmQueryStr = "fox information river";

        // All weight on the term component: score must equal the bag-of-words score.
        Query sdmQuery1 = new SdmQueryGenerator(1.0f, 0.0f, 0.0f).buildQuery(field, analyzer, sdmQueryStr);
        assertEquals("(text:fox text:inform text:river)^1.0 " + "(spanNear([text:fox, text:inform], 1, true) spanNear([text:inform, text:river], 1, true))^0.0 " + "(spanNear([text:fox, text:inform], 8, false) spanNear([text:inform, text:river], 8, false))^0.0", sdmQuery1.toString());
        TopDocs rs1 = searcher.search(sdmQuery1, 1);
        Query termQuery = new BagOfWordsQueryGenerator().buildQuery(field, analyzer, sdmQueryStr);
        TopDocs rsTerm = searcher.search(termQuery, 1);
        assertEquals(rsTerm.scoreDocs[0].score, rs1.scoreDocs[0].score, 1e-6f);

        // All weight on the ordered-window component: score must equal the sum of the two
        // ordered span queries (slop 1, in order).
        Query sdmQuery2 = new SdmQueryGenerator(0.0f, 1.0f, 0.0f).buildQuery(field, analyzer, sdmQueryStr);
        assertEquals("(text:fox text:inform text:river)^0.0 " + "(spanNear([text:fox, text:inform], 1, true) spanNear([text:inform, text:river], 1, true))^1.0 " + "(spanNear([text:fox, text:inform], 8, false) spanNear([text:inform, text:river], 8, false))^0.0", sdmQuery2.toString());
        TopDocs rs2 = searcher.search(sdmQuery2, 1);
        Query orderedWindowQuery1 = new SpanNearQuery(new SpanQuery[] { new SpanTermQuery(new Term(field, "fox")), new SpanTermQuery(new Term(field, "inform")) }, 1, true);
        Query orderedWindowQuery2 = new SpanNearQuery(new SpanQuery[] { new SpanTermQuery(new Term(field, "inform")), new SpanTermQuery(new Term(field, "river")) }, 1, true);
        TopDocs rsOrderedWindow1 = searcher.search(orderedWindowQuery1, 1);
        TopDocs rsOrderedWindow2 = searcher.search(orderedWindowQuery2, 1);
        assertEquals(rsOrderedWindow1.scoreDocs[0].score + rsOrderedWindow2.scoreDocs[0].score, rs2.scoreDocs[0].score, 1e-6f);

        // All weight on the unordered-window component: score must equal the sum of the two
        // unordered span queries (slop 8, any order).
        Query sdmQuery3 = new SdmQueryGenerator(0.0f, 0.0f, 1.0f).buildQuery(field, analyzer, sdmQueryStr);
        assertEquals("(text:fox text:inform text:river)^0.0 " + "(spanNear([text:fox, text:inform], 1, true) spanNear([text:inform, text:river], 1, true))^0.0 " + "(spanNear([text:fox, text:inform], 8, false) spanNear([text:inform, text:river], 8, false))^1.0", sdmQuery3.toString());
        TopDocs rs3 = searcher.search(sdmQuery3, 1);
        Query unorderedWindowQuery1 = new SpanNearQuery(new SpanQuery[] { new SpanTermQuery(new Term(field, "fox")), new SpanTermQuery(new Term(field, "inform")) }, 8, false);
        Query unorderedWindowQuery2 = new SpanNearQuery(new SpanQuery[] { new SpanTermQuery(new Term(field, "inform")), new SpanTermQuery(new Term(field, "river")) }, 8, false);
        TopDocs rsUnorderedWindow1 = searcher.search(unorderedWindowQuery1, 1);
        TopDocs rsUnorderedWindow2 = searcher.search(unorderedWindowQuery2, 1);
        assertEquals(rsUnorderedWindow1.scoreDocs[0].score + rsUnorderedWindow2.scoreDocs[0].score, rs3.scoreDocs[0].score, 1e-6f);

        // Mixed weights: score must equal the weighted sum of the three component scores.
        Query sdmQuery4 = new SdmQueryGenerator(0.85f, 0.1f, 0.05f).buildQuery(field, analyzer, sdmQueryStr);
        assertEquals("(text:fox text:inform text:river)^0.85 " + "(spanNear([text:fox, text:inform], 1, true) spanNear([text:inform, text:river], 1, true))^0.1 " + "(spanNear([text:fox, text:inform], 8, false) spanNear([text:inform, text:river], 8, false))^0.05", sdmQuery4.toString());
        TopDocs rs4 = searcher.search(sdmQuery4, 1);
        float expectedCombined = rsTerm.scoreDocs[0].score * 0.85f + (rsOrderedWindow1.scoreDocs[0].score + rsOrderedWindow2.scoreDocs[0].score) * 0.1f + (rsUnorderedWindow1.scoreDocs[0].score + rsUnorderedWindow2.scoreDocs[0].score) * 0.05f;
        assertEquals(expectedCombined, rs4.scoreDocs[0].score, 1e-6f);
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Query(org.apache.lucene.search.Query) SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) SpanQuery(org.apache.lucene.search.spans.SpanQuery) SpanNearQuery(org.apache.lucene.search.spans.SpanNearQuery) Term(org.apache.lucene.index.Term) SpanQuery(org.apache.lucene.search.spans.SpanQuery) TopDocs(org.apache.lucene.search.TopDocs) SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) IndexReader(org.apache.lucene.index.IndexReader) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) SpanNearQuery(org.apache.lucene.search.spans.SpanNearQuery) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Test(org.junit.Test)

Example 27 with BM25Similarity

use of org.apache.lucene.search.similarities.BM25Similarity in project Anserini by castorini.

In class BM25PrfReranker, method rerank:

/**
 * Re-runs retrieval with a query expanded from the initial results, scoring with
 * {@code BM25PrfSimilarity}, and returns the new ranking.
 *
 * <p>The searcher obtained from {@code context} has its similarity swapped for the
 * duration of the call. The original similarity is restored in a {@code finally} block,
 * so the searcher is left as it was found on success, on {@code IOException}, and when
 * query expansion throws an unchecked exception.
 *
 * @param docs    the initial results used for query expansion; returned unchanged if the
 *                search throws an {@code IOException}
 * @param context supplies the index searcher, query text, query id and search arguments
 * @return the results of the expanded query, or {@code docs} if the search failed
 */
@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
    IndexSearcher searcher = context.getIndexSearcher();
    // NOTE(review): this cast throws ClassCastException if the searcher is not configured
    // with a BM25Similarity; the value is only stored and set back, so a wider type may do.
    BM25Similarity originalSimilarity = (BM25Similarity) searcher.getSimilarity();

    // set similarity to BM25PRF
    searcher.setSimilarity(new BM25PrfSimilarity(k1, b));

    TopDocs rs;
    try {
        IndexReader reader = searcher.getIndexReader();
        List<String> originalQueryTerms = AnalyzerUtils.analyze(analyzer, context.getQueryText());
        boolean useRf = (context.getSearchArgs().rf_qrels != null);
        PrfFeatures fv = expandQuery(originalQueryTerms, docs, reader, useRf);
        Query newQuery = fv.toQuery();

        if (this.outputQuery) {
            LOG.info("QID: " + context.getQueryId());
            LOG.info("Original Query: " + context.getQuery().toString(this.field));
            LOG.info("Running new query: " + newQuery.toString(this.field));
            LOG.info("Features: " + fv.toString());
        }

        // Figure out how to break the scoring ties.
        if (context.getSearchArgs().arbitraryScoreTieBreak) {
            rs = searcher.search(newQuery, context.getSearchArgs().hits);
        } else if (context.getSearchArgs().searchtweets) {
            rs = searcher.search(newQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_TWEETID, true);
        } else {
            rs = searcher.search(newQuery, context.getSearchArgs().hits, BREAK_SCORE_TIES_BY_DOCID, true);
        }
    } catch (IOException e) {
        // Best effort: fall back to the initial ranking, but record the failure in the log
        // instead of printing the stack trace to stderr.
        LOG.error("Search with expanded query failed; returning original documents", e);
        return docs;
    } finally {
        // set similarity back, on every exit path
        searcher.setSimilarity(originalSimilarity);
    }

    return ScoredDocuments.fromTopDocs(rs, searcher);
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TopDocs(org.apache.lucene.search.TopDocs) Query(org.apache.lucene.search.Query) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) BoostQuery(org.apache.lucene.search.BoostQuery) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) IndexReader(org.apache.lucene.index.IndexReader) IOException(java.io.IOException)

Example 28 with BM25Similarity

use of org.apache.lucene.search.similarities.BM25Similarity in project Anserini by castorini.

In class IndexReaderUtilsTest, method computeAllTermBM25Weights:

/**
 * Checks that {@code IndexReaderUtils.getBM25AnalyzedTermWeight} agrees with the scores
 * Lucene returns for single-term queries.
 *
 * <p>First a term/document score matrix is built by running a {@code TermQuery} for every
 * term of the "contents" field with a {@code BM25Similarity} configured from the default
 * {@code SearchArgs}. Then each document's term vector is walked and the weight reported
 * by {@code IndexReaderUtils} is compared against the matrix entry.
 *
 * @throws Exception if the index cannot be opened, read or searched
 */
@Test
public void computeAllTermBM25Weights() throws Exception {
    SearchArgs args = new SearchArgs();
    Similarity similarity = new BM25Similarity(Float.parseFloat(args.bm25_k1[0]), Float.parseFloat(args.bm25_b[0]));

    // try-with-resources closes the reader and then the directory, even when an assertion fails.
    try (Directory dir = FSDirectory.open(tempDir1);
         IndexReader reader = DirectoryReader.open(dir)) {
        // One searcher serves every term query; it does not depend on the term, so it is
        // created once rather than once per dictionary entry.
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setSimilarity(similarity);

        // The complete term/doc matrix
        Map<String, Map<String, Float>> termDocMatrix = new HashMap<>();

        // We're going to iterate through all the terms in the dictionary to build the term/doc matrix
        Terms terms = MultiTerms.getTerms(reader, "contents");
        TermsEnum termsEnum = terms.iterator();
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();
            // Only the top 3 hits per term are recorded.
            // NOTE(review): presumably the test index has at most 3 documents — confirm.
            TopDocs rs = searcher.search(new TermQuery(new Term("contents", term)), 3);
            for (int i = 0; i < rs.scoreDocs.length; i++) {
                String docid = reader.document(rs.scoreDocs[i].doc).getField("id").stringValue();
                termDocMatrix.computeIfAbsent(term, key -> new HashMap<>()).put(docid, rs.scoreDocs[i].score);
            }
        }

        int numDocs = reader.numDocs();
        // Iterate through the document vectors, and verify that we have the same values as in the term/doc matrix
        for (int i = 0; i < numDocs; i++) {
            Terms termVector = reader.getTermVector(i, "contents");
            String docid = IndexReaderUtils.convertLuceneDocidToDocid(reader, i);

            // For this document, iterate through the terms.
            termsEnum = termVector.iterator();
            while ((text = termsEnum.next()) != null) {
                String term = text.utf8ToString();
                float weight = IndexReaderUtils.getBM25AnalyzedTermWeight(reader, docid, term);
                assertEquals(termDocMatrix.get(term).get(docid), weight, 10e-6);
            }
        }
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TermQuery(org.apache.lucene.search.TermQuery) Similarity(org.apache.lucene.search.similarities.Similarity) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) HashMap(java.util.HashMap) Terms(org.apache.lucene.index.Terms) MultiTerms(org.apache.lucene.index.MultiTerms) Term(org.apache.lucene.index.Term) TermsEnum(org.apache.lucene.index.TermsEnum) TopDocs(org.apache.lucene.search.TopDocs) SearchArgs(io.anserini.search.SearchArgs) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) IndexReader(org.apache.lucene.index.IndexReader) HashMap(java.util.HashMap) Map(java.util.Map) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Test(org.junit.Test)

Example 29 with BM25Similarity

use of org.apache.lucene.search.similarities.BM25Similarity in project Anserini by castorini.

In class IndexReaderUtilsTest, method testComputeQueryDocumentScore:

/**
 * Checks that {@code IndexReaderUtils.computeQueryDocumentScoreWithSimilarity} reproduces
 * the scores returned by {@code SimpleSearcher} for a few queries, and returns 0 for a
 * document ("doc3") that none of the queries retrieves.
 *
 * @throws Exception if the index cannot be opened or searched
 */
@Test
public void testComputeQueryDocumentScore() throws Exception {
    // NOTE(review): this searcher is never closed in this test; if SimpleSearcher holds an
    // open index and exposes close(), it should be released as well — confirm.
    SimpleSearcher searcher = new SimpleSearcher(tempDir1.toString());
    // NOTE(review): presumably these parameters match SimpleSearcher's own scoring — confirm.
    Similarity similarity = new BM25Similarity(0.9f, 0.4f);

    // A bunch of test queries...
    String[] queries = { "text city", "text", "city" };

    // try-with-resources closes the reader and then the directory, even when an assertion fails.
    try (Directory dir = FSDirectory.open(tempDir1);
         IndexReader reader = DirectoryReader.open(dir)) {
        for (String query : queries) {
            SimpleSearcher.Result[] results = searcher.search(query);

            // Strategy is to loop over the results, compute query-document score individually, and compare.
            for (SimpleSearcher.Result result : results) {
                float score = IndexReaderUtils.computeQueryDocumentScoreWithSimilarity(reader, result.docid, query, similarity);
                // The searcher's score is the reference value; the recomputed score is under test.
                assertEquals(result.score, score, 10e-5);
            }

            // This is hard coded - doc3 isn't retrieved by any of the queries.
            assertEquals(0.0f, IndexReaderUtils.computeQueryDocumentScoreWithSimilarity(reader, "doc3", query, similarity), 10e-6);
        }
    }
}
Also used : SimpleSearcher(io.anserini.search.SimpleSearcher) Similarity(org.apache.lucene.search.similarities.Similarity) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) IndexReader(org.apache.lucene.index.IndexReader) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Test(org.junit.Test)

Aggregations

BM25Similarity (org.apache.lucene.search.similarities.BM25Similarity)29 Directory (org.apache.lucene.store.Directory)12 IndexSearcher (org.apache.lucene.search.IndexSearcher)11 IndexReader (org.apache.lucene.index.IndexReader)10 Similarity (org.apache.lucene.search.similarities.Similarity)9 FSDirectory (org.apache.lucene.store.FSDirectory)9 Query (org.apache.lucene.search.Query)8 TopDocs (org.apache.lucene.search.TopDocs)8 TermQuery (org.apache.lucene.search.TermQuery)7 ClassicSimilarity (org.apache.lucene.search.similarities.ClassicSimilarity)7 Test (org.junit.Test)7 Term (org.apache.lucene.index.Term)6 RerankerCascade (io.anserini.rerank.RerankerCascade)5 BooleanQuery (org.apache.lucene.search.BooleanQuery)5 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)4 FeatureExtractors (io.anserini.ltr.feature.FeatureExtractors)3 IdentityReranker (io.anserini.rerank.IdentityReranker)3 ScoredDocuments (io.anserini.rerank.ScoredDocuments)3 Qrels (io.anserini.util.Qrels)3 PrintStream (java.io.PrintStream)3