Use of io.anserini.search.SearchArgs in project Anserini by castorini.
Class ZhEndToEndTest, method setSearchGroundTruth.
/**
 * Configures the topic source, the single "bm25" test query (with its language set to "zh"),
 * the tokens registered for topic "1", and the reference run output for "bm25".
 */
@Override
protected void setSearchGroundTruth() {
  topicReader = "TsvString";
  topicFile = "src/test/resources/sample_topics/zh_topics.tsv";

  SearchArgs zhBm25Args = createDefaultSearchArgs().bm25();
  zhBm25Args.language = "zh";
  testQueries.put("bm25", zhBm25Args);

  // Assemble the token list for topic "1" first, then register it in one step.
  ArrayList<String> topicOneTokens = new ArrayList<>();
  topicOneTokens.add("滑铁");
  topicOneTokens.add("铁卢");
  queryTokens.put("1", topicOneTokens);

  String[] bm25Reference = { "1 Q0 doc1 1 1.337800 Anserini" };
  referenceRunOutput.put("bm25", bm25Reference);
}
Use of io.anserini.search.SearchArgs in project Anserini by castorini.
Class AxiomReranker, method processExternalContext.
/**
 * Searches the external index when {@code externalIndexPath} is set; otherwise hands the initial
 * ranking back untouched.
 *
 * <p>The external search reuses the similarity of the searcher in {@code context}, as well as the
 * query id, query, query doc id, query text, query tokens and filter of {@code context}. A fresh
 * {@code SearchArgs} is used whose {@code hits} is {@code R} and whose
 * {@code arbitraryScoreTieBreak} and {@code searchtweets} are copied from {@code context}.
 *
 * @param docs the initial ranking results against the target index; returned as-is when no
 *             external index path is configured
 * @param context the reranking context of the current query
 * @return the result of {@code searchTopDocs} run with the external context, or {@code docs} when
 *         {@code externalIndexPath} is null
 * @throws IOException if accessing the external index fails
 * @throws IllegalArgumentException if the external index path does not exist, is not a directory,
 *         or is not readable
 */
private ScoredDocuments processExternalContext(ScoredDocuments docs, RerankerContext<T> context) throws IOException {
  // No external index configured: nothing to do.
  if (this.externalIndexPath == null) {
    return docs;
  }

  Path externalPath = Paths.get(this.externalIndexPath);
  boolean usable = Files.exists(externalPath)
      && Files.isDirectory(externalPath)
      && Files.isReadable(externalPath);
  if (!usable) {
    throw new IllegalArgumentException(this.externalIndexPath + " does not exist or is not a directory.");
  }

  // NOTE(review): the reader/directory opened here are never closed in this method. Whether they
  // can be closed right after searchTopDocs depends on what the returned ScoredDocuments keeps
  // hold of -- confirm before adding a close.
  IndexReader externalReader = DirectoryReader.open(FSDirectory.open(externalPath));
  IndexSearcher externalSearcher = new IndexSearcher(externalReader);
  externalSearcher.setSimilarity(context.getIndexSearcher().getSimilarity());

  SearchArgs externalArgs = new SearchArgs();
  externalArgs.hits = this.R;
  externalArgs.arbitraryScoreTieBreak = context.getSearchArgs().arbitraryScoreTieBreak;
  externalArgs.searchtweets = context.getSearchArgs().searchtweets;

  RerankerContext<T> externalContext = new RerankerContext<>(
      externalSearcher,
      context.getQueryId(),
      context.getQuery(),
      context.getQueryDocId(),
      context.getQueryText(),
      context.getQueryTokens(),
      context.getFilter(),
      externalArgs);
  return searchTopDocs(null, externalContext);
}
Use of io.anserini.search.SearchArgs in project Anserini by castorini.
Class IndexReaderUtilsTest, method computeBM25Weights.
/**
 * Verifies the BM25 term weights returned by {@code IndexReaderUtils} for a handful of
 * (document, term) pairs, using both the unanalyzed-term and the analyzed-term entry points with
 * the explicit parameters {@code 0.9f} and {@code 0.4f} (presumably k1 and b -- confirm against
 * {@code IndexReaderUtils}).
 *
 * <p>Each unanalyzed term ("city", "test") is paired with an analyzed form ("citi", "test") and
 * both are expected to produce the same weight; a weight of 0 is expected for doc2.
 *
 * @throws Exception if the index in {@code tempDir1} cannot be opened or read
 */
@Test
public void computeBM25Weights() throws Exception {
  // try-with-resources closes the reader and then the directory even when an assertion fails,
  // so a failing check no longer leaks the open index.
  try (Directory dir = FSDirectory.open(tempDir1);
       IndexReader reader = DirectoryReader.open(dir)) {
    assertEquals(0.43400, IndexReaderUtils.getBM25UnanalyzedTermWeightWithParameters(reader, "doc1", "city", IndexCollection.DEFAULT_ANALYZER, 0.9f, 0.4f), 10e-5);
    assertEquals(0.43400, IndexReaderUtils.getBM25AnalyzedTermWeightWithParameters(reader, "doc1", "citi", 0.9f, 0.4f), 10e-5);

    assertEquals(0.0f, IndexReaderUtils.getBM25UnanalyzedTermWeightWithParameters(reader, "doc2", "city", IndexCollection.DEFAULT_ANALYZER, 0.9f, 0.4f), 10e-5);
    assertEquals(0.0f, IndexReaderUtils.getBM25AnalyzedTermWeightWithParameters(reader, "doc2", "citi", 0.9f, 0.4f), 10e-5);

    assertEquals(0.570250, IndexReaderUtils.getBM25UnanalyzedTermWeightWithParameters(reader, "doc3", "test", IndexCollection.DEFAULT_ANALYZER, 0.9f, 0.4f), 10e-5);
    assertEquals(0.570250, IndexReaderUtils.getBM25AnalyzedTermWeightWithParameters(reader, "doc3", "test", 0.9f, 0.4f), 10e-5);
  }
}
Use of io.anserini.search.SearchArgs in project Anserini by castorini.
Class IndexReaderUtilsTest, method computeAllTermBM25Weights.
/**
 * Cross-checks {@code IndexReaderUtils.getBM25AnalyzedTermWeight} against scores obtained directly
 * from Lucene.
 *
 * <p>First, every term in the "contents" field is run as a {@code TermQuery} under a
 * {@code BM25Similarity} built from the default {@code SearchArgs} k1/b values, and the resulting
 * scores are collected into a term/doc matrix. Then each document's term vector is walked and the
 * weight reported by {@code IndexReaderUtils} is compared with the matrix entry.
 *
 * @throws Exception if the index in {@code tempDir1} cannot be opened or read
 */
@Test
public void computeAllTermBM25Weights() throws Exception {
  SearchArgs args = new SearchArgs();
  Similarity similarity = new BM25Similarity(Float.parseFloat(args.bm25_k1[0]), Float.parseFloat(args.bm25_b[0]));

  // try-with-resources closes the reader and then the directory even when an assertion fails.
  try (Directory dir = FSDirectory.open(tempDir1);
       IndexReader reader = DirectoryReader.open(dir)) {
    // The searcher does not depend on the term, so build it once instead of once per term.
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);

    // The complete term/doc matrix: term -> (docid -> BM25 score).
    Map<String, Map<String, Float>> termDocMatrix = new HashMap<>();

    // We're going to iterate through all the terms in the dictionary to build the term/doc matrix
    Terms terms = MultiTerms.getTerms(reader, "contents");
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
      String term = text.utf8ToString();
      // Only the top 3 hits are requested -- presumably the test index holds at most 3 documents.
      TopDocs rs = searcher.search(new TermQuery(new Term("contents", term)), 3);
      for (int i = 0; i < rs.scoreDocs.length; i++) {
        String docid = reader.document(rs.scoreDocs[i].doc).getField("id").stringValue();
        termDocMatrix.computeIfAbsent(term, unused -> new HashMap<>()).put(docid, rs.scoreDocs[i].score);
      }
    }

    int numDocs = reader.numDocs();
    // Iterate through the document vectors, and verify that we have the same values as in the term/doc matrix
    for (int i = 0; i < numDocs; i++) {
      Terms termVector = reader.getTermVector(i, "contents");
      String docid = IndexReaderUtils.convertLuceneDocidToDocid(reader, i);

      // For this document, iterate through the terms.
      termsEnum = termVector.iterator();
      while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        float weight = IndexReaderUtils.getBM25AnalyzedTermWeight(reader, docid, term);
        assertEquals(termDocMatrix.get(term).get(docid), weight, 10e-6);
      }
    }
  }
}
Use of io.anserini.search.SearchArgs in project Anserini by castorini.
Class MultiThreadingSearchTest, method setSearchGroundTruth.
/**
 * Configures the Trec topic source and registers four test queries ("bm25", "bm25rm3-1",
 * "bm25rm3-2", "qld"). Each query carries more than one value for at least one parameter, and for
 * each query the set of run tags it maps to and the ground-truth lines of every run are recorded.
 */
@Override
protected void setSearchGroundTruth() {
  topicReader = "Trec";
  topicFile = "src/test/resources/sample_topics/Trec";

  // --- "bm25": two values of b ---
  final String bm25B02 = "e2eTestSearchTrec_bm25(k1=0.9,b=0.2)_default";
  final String bm25B08 = "e2eTestSearchTrec_bm25(k1=0.9,b=0.8)_default";

  SearchArgs bm25Args = createDefaultSearchArgs().bm25();
  bm25Args.bm25_b = new String[] { "0.2", "0.8" };
  testQueries.put("bm25", bm25Args);
  runsForQuery.put("bm25", Set.of(bm25B02, bm25B08));
  groundTruthRuns.put(bm25B02, new String[] {
      "1 Q0 DOC222 1 0.346600 Anserini",
      "1 Q0 TREC_DOC_1 2 0.325400 Anserini",
      "1 Q0 WSJ_1 3 0.069500 Anserini" });
  groundTruthRuns.put(bm25B08, new String[] {
      "1 Q0 TREC_DOC_1 1 0.350900 Anserini",
      "1 Q0 DOC222 2 0.336600 Anserini",
      "1 Q0 WSJ_1 3 0.067100 Anserini" });

  // --- "bm25rm3-1": two values of b, rm3 switched on ---
  final String rm3B02 = "e2eTestSearchTrec_bm25(k1=0.9,b=0.2)_rm3(fbTerms=10,fbDocs=10,originalQueryWeight=0.5)";
  final String rm3B08 = "e2eTestSearchTrec_bm25(k1=0.9,b=0.8)_rm3(fbTerms=10,fbDocs=10,originalQueryWeight=0.5)";

  SearchArgs rm3Args = createDefaultSearchArgs().bm25();
  rm3Args.bm25_b = new String[] { "0.2", "0.8" };
  rm3Args.rm3 = true;
  testQueries.put("bm25rm3-1", rm3Args);
  runsForQuery.put("bm25rm3-1", Set.of(rm3B02, rm3B08));
  groundTruthRuns.put(rm3B02, new String[] {
      "1 Q0 DOC222 1 0.086700 Anserini",
      "1 Q0 TREC_DOC_1 2 0.081300 Anserini",
      "1 Q0 WSJ_1 3 0.017400 Anserini" });
  groundTruthRuns.put(rm3B08, new String[] {
      "1 Q0 TREC_DOC_1 1 0.087700 Anserini",
      "1 Q0 DOC222 2 0.084100 Anserini",
      "1 Q0 WSJ_1 3 0.016800 Anserini" });

  // --- "bm25rm3-2": two values of b crossed with two original-query weights ---
  final String rm3B04W02 = "e2eTestSearchTrec_bm25(k1=0.9,b=0.4)_rm3(fbTerms=10,fbDocs=10,originalQueryWeight=0.2)";
  final String rm3B04W09 = "e2eTestSearchTrec_bm25(k1=0.9,b=0.4)_rm3(fbTerms=10,fbDocs=10,originalQueryWeight=0.9)";
  final String rm3B05W02 = "e2eTestSearchTrec_bm25(k1=0.9,b=0.5)_rm3(fbTerms=10,fbDocs=10,originalQueryWeight=0.2)";
  final String rm3B05W09 = "e2eTestSearchTrec_bm25(k1=0.9,b=0.5)_rm3(fbTerms=10,fbDocs=10,originalQueryWeight=0.9)";

  SearchArgs rm3GridArgs = createDefaultSearchArgs().bm25();
  rm3GridArgs.bm25_b = new String[] { "0.4", "0.5" };
  rm3GridArgs.rm3 = true;
  rm3GridArgs.rm3_originalQueryWeight = new String[] { "0.2", "0.9" };
  testQueries.put("bm25rm3-2", rm3GridArgs);
  runsForQuery.put("bm25rm3-2", Set.of(rm3B04W02, rm3B04W09, rm3B05W02, rm3B05W09));
  groundTruthRuns.put(rm3B04W02, new String[] {
      "1 Q0 DOC222 1 0.034300 Anserini",
      "1 Q0 TREC_DOC_1 2 0.033300 Anserini",
      "1 Q0 WSJ_1 3 0.006900 Anserini" });
  groundTruthRuns.put(rm3B04W09, new String[] {
      "1 Q0 DOC222 1 0.154400 Anserini",
      "1 Q0 TREC_DOC_1 2 0.150100 Anserini",
      "1 Q0 WSJ_1 3 0.030900 Anserini" });
  groundTruthRuns.put(rm3B05W02, new String[] {
      "1 Q0 DOC222 1 0.034200 Anserini",
      "1 Q0 TREC_DOC_1 2 0.033800 Anserini",
      "1 Q0 WSJ_1 3 0.006800 Anserini" });
  groundTruthRuns.put(rm3B05W09, new String[] {
      "1 Q0 DOC222 1 0.153700 Anserini",
      "1 Q0 TREC_DOC_1 2 0.151900 Anserini",
      "1 Q0 WSJ_1 3 0.030700 Anserini" });

  // --- "qld": two values of mu ---
  final String qldMu1000 = "e2eTestSearchTrec_qld(mu=1000)_default";
  final String qldMu2000 = "e2eTestSearchTrec_qld(mu=2000)_default";

  SearchArgs qldArgs = createDefaultSearchArgs().qld();
  qldArgs.qld_mu = new String[] { "1000", "2000" };
  testQueries.put("qld", qldArgs);
  runsForQuery.put("qld", Set.of(qldMu1000, qldMu2000));
  groundTruthRuns.put(qldMu1000, new String[] {
      "1 Q0 DOC222 1 0.002500 Anserini",
      "1 Q0 TREC_DOC_1 2 0.001700 Anserini",
      "1 Q0 WSJ_1 3 0.000000 Anserini" });
  groundTruthRuns.put(qldMu2000, new String[] {
      "1 Q0 DOC222 1 0.001200 Anserini",
      "1 Q0 TREC_DOC_1 2 0.000800 Anserini",
      "1 Q0 WSJ_1 3 0.000000 Anserini" });
}
Aggregations