Use of io.anserini.rerank.RerankerContext in project Anserini by castorini.
The class SimpleSearcher, method search:
// Internal implementation.
protected Result[] search(Query query, List<String> queryTokens, String queryString, int k) throws IOException {
  // Create an IndexSearcher only once. Note that the object is thread safe.
  if (searcher == null) {
    searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);
  }

  SearchArgs searchArgs = new SearchArgs();
  searchArgs.arbitraryScoreTieBreak = false;
  searchArgs.hits = k;

  TopDocs rs;
  RerankerContext context;
  rs = searcher.search(query, useRM3 ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_DOCID, true);
  context = new RerankerContext<>(searcher, null, query, null, queryString, queryTokens, null, searchArgs);

  // Run the reranker cascade over the initial retrieval results.
  ScoredDocuments hits = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);

  Result[] results = new Result[hits.ids.length];
  for (int i = 0; i < hits.ids.length; i++) {
    Document doc = hits.documents[i];
    String docid = doc.getField(IndexArgs.ID).stringValue();

    IndexableField field;
    field = doc.getField(IndexArgs.CONTENTS);
    String contents = field == null ? null : field.stringValue();

    field = doc.getField(IndexArgs.RAW);
    String raw = field == null ? null : field.stringValue();

    results[i] = new Result(docid, hits.ids[i], hits.scores[i], contents, raw, doc);
  }

  return results;
}
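For context, a minimal sketch of reaching this internal method through SimpleSearcher's public API. The index path and query are placeholders, and the field names on Result are assumptions inferred from the constructor call above:

// Sketch only: the index path and query are placeholders, and the
// Result field names (docid, score) are assumptions.
SimpleSearcher searcher = new SimpleSearcher("lucene-index.example");
SimpleSearcher.Result[] hits = searcher.search("information retrieval", 10);
for (SimpleSearcher.Result r : hits) {
  System.out.println(String.format("%s %.4f", r.docid, r.score));
}
searcher.close();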
Use of io.anserini.rerank.RerankerContext in project Anserini by castorini.
The class DumpTweetsLtrData, method main:
public static void main(String[] argv) throws Exception {
  long curTime = System.nanoTime();
  LtrArgs args = new LtrArgs();
  CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(90));

  try {
    parser.parseArgument(argv);
  } catch (CmdLineException e) {
    System.err.println(e.getMessage());
    parser.printUsage(System.err);
    System.err.println("Example: DumpTweetsLtrData" + parser.printExample(OptionHandlerFilter.REQUIRED));
    return;
  }

  LOG.info("Reading index at " + args.index);
  Directory dir = FSDirectory.open(Paths.get(args.index));
  IndexReader reader = DirectoryReader.open(dir);
  IndexSearcher searcher = new IndexSearcher(reader);

  if (args.ql) {
    LOG.info("Using QL scoring model");
    searcher.setSimilarity(new LMDirichletSimilarity(args.mu));
  } else if (args.bm25) {
    LOG.info("Using BM25 scoring model");
    searcher.setSimilarity(new BM25Similarity(args.k1, args.b));
  } else {
    LOG.error("Error: Must specify scoring model!");
    System.exit(-1);
  }

  Qrels qrels = new Qrels(args.qrels);

  FeatureExtractors extractors = null;
  if (args.extractors != null) {
    extractors = FeatureExtractors.loadExtractor(args.extractors);
  }

  PrintStream out = new PrintStream(new FileOutputStream(new File(args.output)));

  RerankerCascade cascade = new RerankerCascade();
  cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
  cascade.add(new TweetsLtrDataGenerator(out, qrels, extractors));

  MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(args.topics));
  LOG.info("Initialization complete! (elapsed time = " + (System.nanoTime() - curTime) / 1000000 + "ms)");

  long totalTime = 0;
  int cnt = 0;
  for (MicroblogTopic topic : topics) {
    long curQueryTime = System.nanoTime();

    // Restrict the search to tweets posted before the query time.
    Query filter = LongPoint.newRangeQuery(StatusField.ID.name, 0L, topic.getQueryTweetTime());
    Query query = AnalyzerUtils.buildBagOfWordsQuery(StatusField.TEXT.name, IndexTweets.ANALYZER, topic.getQuery());

    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(filter, BooleanClause.Occur.FILTER);
    builder.add(query, BooleanClause.Occur.MUST);
    Query q = builder.build();

    TopDocs rs = searcher.search(q, args.hits);

    List<String> queryTokens = AnalyzerUtils.tokenize(IndexTweets.ANALYZER, topic.getQuery());
    RerankerContext context = new RerankerContext(searcher, query, topic.getId(), topic.getQuery(),
        queryTokens, StatusField.TEXT.name, filter);

    cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);

    long qtime = (System.nanoTime() - curQueryTime) / 1000000;
    LOG.info("Query " + topic.getId() + " (elapsed time = " + qtime + "ms)");
    totalTime += qtime;
    cnt++;
  }

  LOG.info("All queries completed!");
  LOG.info("Total elapsed time = " + totalTime + "ms");
  LOG.info("Average query latency = " + (totalTime / cnt) + "ms");

  reader.close();
  out.close();
}
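The cascade here is just an ordered list of rerankers, each handed the same RerankerContext. Below is a hedged sketch of a pass-through reranker that could be appended with cascade.add(...); the Reranker interface shape (rerank plus tag) and the getQueryId accessor are assumed from later Anserini versions, not taken from this snippet:

// Sketch only: the interface shape and accessors are assumptions.
public class LoggingReranker implements Reranker {
  @Override
  public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
    // The context carries the searcher, query, query id, and query tokens.
    System.out.println("Query " + context.getQueryId() + ": " + docs.documents.length + " candidates");
    return docs;  // pass results through unchanged
  }

  @Override
  public String tag() {
    return "logging";  // identifier for this reranker stage
  }
}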
Use of io.anserini.rerank.RerankerContext in project Anserini by castorini.
The class PyseriniEntryPoint, method search:
/**
 * Searches the index for the given topics and returns a map from docids to scores.
 *
 * @param topics queries
 * @param similarity similarity
 * @param numHits number of hits to return per query
 * @param cascade reranker cascade to apply
 * @param useQueryParser whether to parse queries with Lucene's QueryParser
 * @param keepstopwords whether to retain stopwords during analysis
 * @return map from docids to retrieval scores
 * @throws IOException
 * @throws ParseException
 */
public Map<String, Float> search(SortedMap<Integer, String> topics, Similarity similarity, int numHits,
    RerankerCascade cascade, boolean useQueryParser, boolean keepstopwords) throws IOException, ParseException {
  Map<String, Float> scoredDocs = new LinkedHashMap<>();
  IndexSearcher searcher = new IndexSearcher(reader);
  searcher.setSimilarity(similarity);

  EnglishAnalyzer ea = keepstopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer();
  QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
  queryParser.setDefaultOperator(QueryParser.Operator.OR);

  for (Map.Entry<Integer, String> entry : topics.entrySet()) {
    int qID = entry.getKey();
    String queryString = entry.getValue();
    Query query = useQueryParser ? queryParser.parse(queryString)
        : AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);

    TopDocs rs = searcher.search(query, numHits);
    ScoreDoc[] hits = rs.scoreDocs;
    List<String> queryTokens = AnalyzerUtils.tokenize(ea, queryString);
    RerankerContext context = new RerankerContext(searcher, query, String.valueOf(qID), queryString,
        queryTokens, FIELD_BODY, null);
    ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);

    for (int i = 0; i < docs.documents.length; i++) {
      String docid = docs.documents[i].getField(FIELD_ID).stringValue();
      float score = docs.scores[i];
      scoredDocs.put(docid, score);
    }
  }
  return scoredDocs;
}
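A hedged sketch of driving this method from Java. The initializeWithIndex call, IdentityReranker, and the BM25 parameters are assumptions based on typical Anserini usage, not on this snippet:

// Sketch only: initializeWithIndex and IdentityReranker are assumptions.
PyseriniEntryPoint entryPoint = new PyseriniEntryPoint();
entryPoint.initializeWithIndex("lucene-index.example");

SortedMap<Integer, String> topics = new TreeMap<>();
topics.put(1, "black bear attacks");  // hypothetical topic

RerankerCascade cascade = new RerankerCascade();
cascade.add(new IdentityReranker());

Map<String, Float> run = entryPoint.search(topics, new BM25Similarity(0.9f, 0.4f), 1000, cascade, false, false);
run.forEach((docid, score) -> System.out.println(docid + "\t" + score));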
Use of io.anserini.rerank.RerankerContext in project Anserini by castorini.
The class SearchWebCollection, method search:
/**
 * Writes a TREC submission file for the given topics.
 *
 * @param topics queries
 * @param submissionFile path of the run file to write
 * @param similarity similarity
 * @param numHits number of hits to return per query
 * @param cascade reranker cascade to apply
 * @param useQueryParser whether to parse queries with Lucene's QueryParser
 * @param keepstopwords whether to retain stopwords during analysis
 * @throws IOException
 * @throws ParseException
 */
public void search(SortedMap<Integer, String> topics, String submissionFile, Similarity similarity, int numHits,
    RerankerCascade cascade, boolean useQueryParser, boolean keepstopwords) throws IOException, ParseException {
  IndexSearcher searcher = new IndexSearcher(reader);
  searcher.setSimilarity(similarity);

  final String runTag = "BM25_EnglishAnalyzer_" + (keepstopwords ? "KeepStopwords_" : "") + FIELD_BODY + "_" + similarity.toString();
  PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII));

  EnglishAnalyzer ea = keepstopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer();
  QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
  queryParser.setDefaultOperator(QueryParser.Operator.OR);

  for (Map.Entry<Integer, String> entry : topics.entrySet()) {
    int qID = entry.getKey();
    String queryString = entry.getValue();
    Query query = useQueryParser ? queryParser.parse(queryString)
        : AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);

    // For Web Tracks 2010, 2011, and 2012, an experimental run consists of the
    // top 10,000 documents for each topic query.
    TopDocs rs = searcher.search(query, numHits);
    ScoreDoc[] hits = rs.scoreDocs;
    List<String> queryTokens = AnalyzerUtils.tokenize(ea, queryString);
    RerankerContext context = new RerankerContext(searcher, query, String.valueOf(qID), queryString,
        queryTokens, FIELD_BODY, null);
    ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);

    // TREC submission format:
    //   the first column is the topic number;
    //   the second column is currently unused and should always be "Q0";
    //   the third column is the official document identifier of the retrieved document;
    //   the fourth column is the rank at which the document is retrieved;
    //   the fifth column shows the score (integer or floating point) that generated the ranking;
    //   the sixth column is called the "run tag" and should be a unique identifier for your
    //   group and the method used.
    for (int i = 0; i < docs.documents.length; i++) {
      out.println(String.format("%d Q0 %s %d %f %s", qID, docs.documents[i].getField(FIELD_ID).stringValue(),
          (i + 1), docs.scores[i], runTag));
    }
  }
  out.flush();
  out.close();
}
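Assuming FIELD_BODY is "body" and BM25 with k1 = 0.9 and b = 0.4 (Lucene's BM25Similarity.toString() renders as BM25(k1=...,b=...)), the format string above would produce lines like the following; the topic number, docids, and scores are made up for illustration:

51 Q0 clueweb09-en0000-00-00000 1 12.462400 BM25_EnglishAnalyzer_body_BM25(k1=0.9,b=0.4)
51 Q0 clueweb09-en0000-00-00001 2 11.903100 BM25_EnglishAnalyzer_body_BM25(k1=0.9,b=0.4)
51 Q0 clueweb09-en0000-00-00002 3 11.214700 BM25_EnglishAnalyzer_body_BM25(k1=0.9,b=0.4)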
Use of io.anserini.rerank.RerankerContext in project Anserini by castorini.
The class BaseFeatureExtractorTest, method assertFeatureValues:
/**
 * Used to test features involving multiple documents in the collection at the same time.
 *
 * @param expected An array of expected values for the computed features
 * @param queryText Query
 * @param docTexts A list of document texts representing documents in the collection
 * @param extractors The chain of feature extractors to use
 * @param docToExtract Index of the document we want to compute features for
 * @throws IOException
 */
protected void assertFeatureValues(float[] expected, String queryText, List<String> docTexts,
    FeatureExtractors extractors, int docToExtract) throws IOException {
  List<Document> addedDocs = new ArrayList<>();
  for (String docText : docTexts) {
    Document testDoc = addTestDocument(docText);
    addedDocs.add(testDoc);
  }
  // Merge to a single segment so Lucene docids line up with insertion order.
  testWriter.forceMerge(1);

  Document testDoc = addedDocs.get(docToExtract);
  RerankerContext context = makeTestContext(queryText);
  IndexReader reader = context.getIndexSearcher().getIndexReader();
  Terms terms = reader.getTermVector(docToExtract, TEST_FIELD_NAME);

  float[] extractedFeatureValues = extractors.extractAll(testDoc, terms, context);
  assertArrayEquals(expected, extractedFeatureValues, DELTA);
}
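For illustration, a hedged sketch of a test built on this helper. SomeFeatureExtractor is hypothetical, the add(...) chain-construction call is an assumption rather than a documented API, and the expected value is illustrative only:

// Sketch only: SomeFeatureExtractor is a placeholder, and the chain is
// assembled with an assumed add(...) method.
@Test
public void testFeatureAcrossDocuments() throws IOException {
  FeatureExtractors extractors = new FeatureExtractors();
  extractors.add(new SomeFeatureExtractor());

  List<String> docs = Arrays.asList("first test document", "second test document");
  float[] expected = {0.5f};  // illustrative value, not a real feature score

  // Compute features for the second document (index 1) against the query.
  assertFeatureValues(expected, "test query", docs, extractors, 1);
}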