Search in sources :

Example 6 with SearchArgs

use of io.anserini.search.SearchArgs in project Anserini by castorini.

the class ZhEndToEndTest method setSearchGroundTruth.

/**
 * Registers the fixture for the Chinese-language end-to-end search test: the TSV topic file,
 * a single BM25 configuration with {@code language = "zh"}, the tokens expected for
 * query {@code "1"}, and the single expected run line for the {@code "bm25"} configuration.
 */
@Override
protected void setSearchGroundTruth() {
    // Topic source.
    topicReader = "TsvString";
    topicFile = "src/test/resources/sample_topics/zh_topics.tsv";

    // One BM25 run, switched to Chinese.
    SearchArgs zhBm25Args = createDefaultSearchArgs().bm25();
    zhBm25Args.language = "zh";
    testQueries.put("bm25", zhBm25Args);

    // Tokens expected for query "1": two overlapping two-character tokens.
    ArrayList<String> expectedTokens = new ArrayList<>();
    expectedTokens.add("滑铁");
    expectedTokens.add("铁卢");
    queryTokens.put("1", expectedTokens);

    // Expected run output for the "bm25" configuration.
    String[] expectedRun = { "1 Q0 doc1 1 1.337800 Anserini" };
    referenceRunOutput.put("bm25", expectedRun);
}
Also used : SearchArgs(io.anserini.search.SearchArgs)

Example 7 with SearchArgs

use of io.anserini.search.SearchArgs in project Anserini by castorini.

the class AxiomReranker method processExternalContext.

/**
 * Optionally replaces the initial ranking with a ranking obtained from an external index.
 *
 * <p>When {@code externalIndexPath} is {@code null} the supplied {@code docs} are returned
 * untouched. Otherwise the external index is opened, a searcher using the same similarity as
 * the one in {@code context} is created, and the result of {@code searchTopDocs} against that
 * index (asking for {@code R} hits) is returned.
 *
 * @param docs The initial ranking results against the target index; returned as-is when no
 *             external index is configured.
 * @param context The reranking context of the current query; its query fields, filter,
 *                similarity and two of its search flags are carried over to the external search.
 *
 * @return Top ranked ScoredDocuments from searching the external index, or {@code docs}
 * @throws IOException if opening the external index fails (may also come from the search itself)
 * @throws IllegalArgumentException if the configured path is missing, not a directory, or unreadable
 */
private ScoredDocuments processExternalContext(ScoredDocuments docs, RerankerContext<T> context) throws IOException {
    // Nothing to do without an external index: keep the original ranking.
    if (this.externalIndexPath == null) {
        return docs;
    }

    Path externalDir = Paths.get(this.externalIndexPath);
    boolean usable = Files.exists(externalDir) && Files.isDirectory(externalDir) && Files.isReadable(externalDir);
    if (!usable) {
        throw new IllegalArgumentException(this.externalIndexPath + " does not exist or is not a directory.");
    }

    // NOTE(review): this reader is never closed in this method; confirm whether the returned
    // ScoredDocuments (or later reranking steps) still need it before adding a close().
    IndexReader externalReader = DirectoryReader.open(FSDirectory.open(externalDir));
    IndexSearcher externalSearcher = new IndexSearcher(externalReader);
    // Score the external index the same way as the target index.
    externalSearcher.setSimilarity(context.getIndexSearcher().getSimilarity());

    // Fresh search arguments: only the hit count and two flags are set, everything else keeps its default.
    SearchArgs externalArgs = new SearchArgs();
    externalArgs.hits = this.R;
    externalArgs.arbitraryScoreTieBreak = context.getSearchArgs().arbitraryScoreTieBreak;
    externalArgs.searchtweets = context.getSearchArgs().searchtweets;

    RerankerContext<T> externalContext = new RerankerContext<>(
        externalSearcher,
        context.getQueryId(),
        context.getQuery(),
        context.getQueryDocId(),
        context.getQueryText(),
        context.getQueryTokens(),
        context.getFilter(),
        externalArgs);
    return searchTopDocs(null, externalContext);
}
Also used : Path(java.nio.file.Path) IndexSearcher(org.apache.lucene.search.IndexSearcher) SearchArgs(io.anserini.search.SearchArgs) IndexReader(org.apache.lucene.index.IndexReader) RerankerContext(io.anserini.rerank.RerankerContext)

Example 8 with SearchArgs

use of io.anserini.search.SearchArgs in project Anserini by castorini.

the class IndexReaderUtilsTest method computeBM25Weights.

/**
 * Checks the BM25 weight of single terms in individual documents of the test index, using
 * both the unanalyzed-term entry point (which is handed {@code IndexCollection.DEFAULT_ANALYZER})
 * and the analyzed-term entry point.
 *
 * <p>The trailing {@code 0.9f, 0.4f} arguments are presumably the BM25 k1 and b parameters;
 * verify against {@code IndexReaderUtils}.
 *
 * @throws Exception if the index cannot be opened or read
 */
@Test
public void computeBM25Weights() throws Exception {
    // try-with-resources so the reader and directory are released even when an assertion fails;
    // resources close in reverse order (reader first, then directory), as the original did.
    try (Directory dir = FSDirectory.open(tempDir1);
         IndexReader reader = DirectoryReader.open(dir)) {
        // "city" (raw) and "citi" (already analyzed) are expected to give the same weight in doc1.
        assertEquals(0.43400, IndexReaderUtils.getBM25UnanalyzedTermWeightWithParameters(reader, "doc1", "city", IndexCollection.DEFAULT_ANALYZER, 0.9f, 0.4f), 10e-5);
        assertEquals(0.43400, IndexReaderUtils.getBM25AnalyzedTermWeightWithParameters(reader, "doc1", "citi", 0.9f, 0.4f), 10e-5);
        // The same term is expected to have zero weight in doc2.
        assertEquals(0.0f, IndexReaderUtils.getBM25UnanalyzedTermWeightWithParameters(reader, "doc2", "city", IndexCollection.DEFAULT_ANALYZER, 0.9f, 0.4f), 10e-5);
        assertEquals(0.0f, IndexReaderUtils.getBM25AnalyzedTermWeightWithParameters(reader, "doc2", "citi", 0.9f, 0.4f), 10e-5);
        // "test" is passed unchanged to both entry points for doc3.
        assertEquals(0.570250, IndexReaderUtils.getBM25UnanalyzedTermWeightWithParameters(reader, "doc3", "test", IndexCollection.DEFAULT_ANALYZER, 0.9f, 0.4f), 10e-5);
        assertEquals(0.570250, IndexReaderUtils.getBM25AnalyzedTermWeightWithParameters(reader, "doc3", "test", 0.9f, 0.4f), 10e-5);
    }
}
Also used : SearchArgs(io.anserini.search.SearchArgs) IndexReader(org.apache.lucene.index.IndexReader) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Test(org.junit.Test)

Example 9 with SearchArgs

use of io.anserini.search.SearchArgs in project Anserini by castorini.

the class IndexReaderUtilsTest method computeAllTermBM25Weights.

/**
 * Cross-checks {@code IndexReaderUtils.getBM25AnalyzedTermWeight} against scores obtained by
 * running a {@code TermQuery} for every term of the "contents" field with a
 * {@code BM25Similarity} built from the default {@code SearchArgs} parameters.
 *
 * <p>Step 1 builds a term -> (docid -> score) matrix from search results; step 2 walks every
 * document's term vector and asserts that the utility method returns the same weight.
 *
 * @throws Exception if the index cannot be opened or read
 */
@Test
public void computeAllTermBM25Weights() throws Exception {
    SearchArgs args = new SearchArgs();
    Similarity similarity = new BM25Similarity(Float.parseFloat(args.bm25_k1[0]), Float.parseFloat(args.bm25_b[0]));

    // try-with-resources so the reader and directory are released even when an assertion fails;
    // resources close in reverse order (reader first, then directory), as the original did.
    try (Directory dir = FSDirectory.open(tempDir1);
         IndexReader reader = DirectoryReader.open(dir)) {
        // The searcher does not depend on the term, so build it once instead of once per term.
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setSimilarity(similarity);

        // The complete term/doc matrix
        Map<String, Map<String, Float>> termDocMatrix = new HashMap<>();

        // We're going to iterate through all the terms in the dictionary to build the term/doc matrix
        Terms terms = MultiTerms.getTerms(reader, "contents");
        TermsEnum termsEnum = terms.iterator();
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();
            // At most 3 hits are requested per term; presumably the test index holds 3 documents.
            TopDocs rs = searcher.search(new TermQuery(new Term("contents", term)), 3);
            for (int i = 0; i < rs.scoreDocs.length; i++) {
                String docid = reader.document(rs.scoreDocs[i].doc).getField("id").stringValue();
                termDocMatrix.computeIfAbsent(term, unused -> new HashMap<>()).put(docid, rs.scoreDocs[i].score);
            }
        }

        int numDocs = reader.numDocs();
        // Iterate through the document vectors, and verify that we have the same values as in the term/doc matrix
        for (int i = 0; i < numDocs; i++) {
            Terms termVector = reader.getTermVector(i, "contents");
            String docid = IndexReaderUtils.convertLuceneDocidToDocid(reader, i);
            // For this document, iterate through the terms.
            termsEnum = termVector.iterator();
            while ((text = termsEnum.next()) != null) {
                String term = text.utf8ToString();
                float weight = IndexReaderUtils.getBM25AnalyzedTermWeight(reader, docid, term);
                assertEquals(termDocMatrix.get(term).get(docid), weight, 10e-6);
            }
        }
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TermQuery(org.apache.lucene.search.TermQuery) Similarity(org.apache.lucene.search.similarities.Similarity) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) HashMap(java.util.HashMap) Terms(org.apache.lucene.index.Terms) MultiTerms(org.apache.lucene.index.MultiTerms) Term(org.apache.lucene.index.Term) TermsEnum(org.apache.lucene.index.TermsEnum) TopDocs(org.apache.lucene.search.TopDocs) SearchArgs(io.anserini.search.SearchArgs) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) IndexReader(org.apache.lucene.index.IndexReader) HashMap(java.util.HashMap) Map(java.util.Map) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Test(org.junit.Test)

Example 10 with SearchArgs

use of io.anserini.search.SearchArgs in project Anserini by castorini.

the class MultiThreadingSearchTest method setSearchGroundTruth.

/**
 * Registers the fixture for the multi-threaded search test on the TREC sample topics.
 *
 * <p>Four configurations are registered ("bm25", "bm25rm3-1", "bm25rm3-2", "qld"). Each one
 * lists several parameter values, and for each configuration the set of run names it is
 * expected to produce is recorded in {@code runsForQuery}, with the expected run lines per
 * run name in {@code groundTruthRuns}.
 */
@Override
protected void setSearchGroundTruth() {
    topicReader = "Trec";
    topicFile = "src/test/resources/sample_topics/Trec";

    // ---- "bm25": BM25 with two values of b ----
    final String bm25RunB02 = "e2eTestSearchTrec_bm25(k1=0.9,b=0.2)_default";
    final String bm25RunB08 = "e2eTestSearchTrec_bm25(k1=0.9,b=0.8)_default";
    SearchArgs bm25Args = createDefaultSearchArgs().bm25();
    bm25Args.bm25_b = new String[] { "0.2", "0.8" };
    testQueries.put("bm25", bm25Args);
    runsForQuery.put("bm25", Set.of(bm25RunB02, bm25RunB08));
    groundTruthRuns.put(bm25RunB02, new String[] {
        "1 Q0 DOC222 1 0.346600 Anserini",
        "1 Q0 TREC_DOC_1 2 0.325400 Anserini",
        "1 Q0 WSJ_1 3 0.069500 Anserini" });
    groundTruthRuns.put(bm25RunB08, new String[] {
        "1 Q0 TREC_DOC_1 1 0.350900 Anserini",
        "1 Q0 DOC222 2 0.336600 Anserini",
        "1 Q0 WSJ_1 3 0.067100 Anserini" });

    // ---- "bm25rm3-1": BM25 with two values of b, RM3 enabled with its default parameters ----
    final String rm3RunB02 = "e2eTestSearchTrec_bm25(k1=0.9,b=0.2)_rm3(fbTerms=10,fbDocs=10,originalQueryWeight=0.5)";
    final String rm3RunB08 = "e2eTestSearchTrec_bm25(k1=0.9,b=0.8)_rm3(fbTerms=10,fbDocs=10,originalQueryWeight=0.5)";
    SearchArgs rm3Args = createDefaultSearchArgs().bm25();
    rm3Args.bm25_b = new String[] { "0.2", "0.8" };
    rm3Args.rm3 = true;
    testQueries.put("bm25rm3-1", rm3Args);
    runsForQuery.put("bm25rm3-1", Set.of(rm3RunB02, rm3RunB08));
    groundTruthRuns.put(rm3RunB02, new String[] {
        "1 Q0 DOC222 1 0.086700 Anserini",
        "1 Q0 TREC_DOC_1 2 0.081300 Anserini",
        "1 Q0 WSJ_1 3 0.017400 Anserini" });
    groundTruthRuns.put(rm3RunB08, new String[] {
        "1 Q0 TREC_DOC_1 1 0.087700 Anserini",
        "1 Q0 DOC222 2 0.084100 Anserini",
        "1 Q0 WSJ_1 3 0.016800 Anserini" });

    // ---- "bm25rm3-2": two values of b crossed with two RM3 original-query weights (4 runs) ----
    final String rm3RunB04W02 = "e2eTestSearchTrec_bm25(k1=0.9,b=0.4)_rm3(fbTerms=10,fbDocs=10,originalQueryWeight=0.2)";
    final String rm3RunB04W09 = "e2eTestSearchTrec_bm25(k1=0.9,b=0.4)_rm3(fbTerms=10,fbDocs=10,originalQueryWeight=0.9)";
    final String rm3RunB05W02 = "e2eTestSearchTrec_bm25(k1=0.9,b=0.5)_rm3(fbTerms=10,fbDocs=10,originalQueryWeight=0.2)";
    final String rm3RunB05W09 = "e2eTestSearchTrec_bm25(k1=0.9,b=0.5)_rm3(fbTerms=10,fbDocs=10,originalQueryWeight=0.9)";
    SearchArgs rm3WeightArgs = createDefaultSearchArgs().bm25();
    rm3WeightArgs.bm25_b = new String[] { "0.4", "0.5" };
    rm3WeightArgs.rm3 = true;
    rm3WeightArgs.rm3_originalQueryWeight = new String[] { "0.2", "0.9" };
    testQueries.put("bm25rm3-2", rm3WeightArgs);
    runsForQuery.put("bm25rm3-2", Set.of(rm3RunB04W02, rm3RunB04W09, rm3RunB05W02, rm3RunB05W09));
    groundTruthRuns.put(rm3RunB04W02, new String[] {
        "1 Q0 DOC222 1 0.034300 Anserini",
        "1 Q0 TREC_DOC_1 2 0.033300 Anserini",
        "1 Q0 WSJ_1 3 0.006900 Anserini" });
    groundTruthRuns.put(rm3RunB04W09, new String[] {
        "1 Q0 DOC222 1 0.154400 Anserini",
        "1 Q0 TREC_DOC_1 2 0.150100 Anserini",
        "1 Q0 WSJ_1 3 0.030900 Anserini" });
    groundTruthRuns.put(rm3RunB05W02, new String[] {
        "1 Q0 DOC222 1 0.034200 Anserini",
        "1 Q0 TREC_DOC_1 2 0.033800 Anserini",
        "1 Q0 WSJ_1 3 0.006800 Anserini" });
    groundTruthRuns.put(rm3RunB05W09, new String[] {
        "1 Q0 DOC222 1 0.153700 Anserini",
        "1 Q0 TREC_DOC_1 2 0.151900 Anserini",
        "1 Q0 WSJ_1 3 0.030700 Anserini" });

    // ---- "qld": query likelihood with two values of mu ----
    final String qldRunMu1000 = "e2eTestSearchTrec_qld(mu=1000)_default";
    final String qldRunMu2000 = "e2eTestSearchTrec_qld(mu=2000)_default";
    SearchArgs qldArgs = createDefaultSearchArgs().qld();
    qldArgs.qld_mu = new String[] { "1000", "2000" };
    testQueries.put("qld", qldArgs);
    runsForQuery.put("qld", Set.of(qldRunMu1000, qldRunMu2000));
    groundTruthRuns.put(qldRunMu1000, new String[] {
        "1 Q0 DOC222 1 0.002500 Anserini",
        "1 Q0 TREC_DOC_1 2 0.001700 Anserini",
        "1 Q0 WSJ_1 3 0.000000 Anserini" });
    groundTruthRuns.put(qldRunMu2000, new String[] {
        "1 Q0 DOC222 1 0.001200 Anserini",
        "1 Q0 TREC_DOC_1 2 0.000800 Anserini",
        "1 Q0 WSJ_1 3 0.000000 Anserini" });
}
Also used : SearchArgs(io.anserini.search.SearchArgs)

Aggregations

SearchArgs (io.anserini.search.SearchArgs)10 IndexReader (org.apache.lucene.index.IndexReader)3 Test (org.junit.Test)3 HashMap (java.util.HashMap)2 Map (java.util.Map)2 IndexSearcher (org.apache.lucene.search.IndexSearcher)2 Directory (org.apache.lucene.store.Directory)2 FSDirectory (org.apache.lucene.store.FSDirectory)2 NotStoredException (io.anserini.index.NotStoredException)1 RerankerContext (io.anserini.rerank.RerankerContext)1 SearchCollection (io.anserini.search.SearchCollection)1 File (java.io.File)1 IOException (java.io.IOException)1 Path (java.nio.file.Path)1 ArrayList (java.util.ArrayList)1 List (java.util.List)1 MultiTerms (org.apache.lucene.index.MultiTerms)1 Term (org.apache.lucene.index.Term)1 Terms (org.apache.lucene.index.Terms)1 TermsEnum (org.apache.lucene.index.TermsEnum)1