Example use of io.anserini.rerank.ScoredDocuments in the Anserini project by castorini: the search method of the LookupTopic class.
/**
 * Prints query results to the standard output stream. Tries three strategies in
 * order, stopping at the first that yields hits: exact match on the Wikipedia
 * title, exact match on the W3 label, and finally BM25 ranking over the
 * title/label/text fields.
 *
 * @param queryName the entity name to search
 * @throws Exception on error
 */
public void search(String queryName) throws Exception {
  LOG.info("Querying started...");
  // Initialize index searcher
  IndexSearcher searcher = new IndexSearcher(reader);
  SimpleAnalyzer analyzer = new SimpleAnalyzer();
  int numHits = 20;

  // Stage 1: exact match on the Wikipedia title field.
  QueryParser titleParser = new QueryParser(TopicLuceneDocumentGenerator.FIELD_TITLE, analyzer);
  Query titleQuery = titleParser.parse(queryName);
  TopDocs rs = searcher.search(titleQuery, numHits);
  ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
  printTopicHits(docs, true);
  if (docs.documents.length != 0) {
    System.out.println("Exact WIKI_TITLE found! Ending search.");
    return;
  } else {
    System.out.println("Exact WIKI_TITLE not found. Searching for the label...");
  }
  System.out.println();

  // Stage 2: exact match on the W3 label field.
  QueryParser labelParser = new QueryParser(TopicLuceneDocumentGenerator.FIELD_LABEL, analyzer);
  Query labelQuery = labelParser.parse(queryName);
  rs = searcher.search(labelQuery, numHits);
  docs = ScoredDocuments.fromTopDocs(rs, searcher);
  printTopicHits(docs, true);
  if (docs.documents.length != 0) {
    System.out.println("Exact W3_LABEL found! Ending search.");
    return;
  } else {
    System.out.println("Exact W3_LABEL not found. Ranking the topics using BM25 according the text/title/label...");
  }
  System.out.println();

  // Stage 3: BM25-ranked retrieval across the title, label, and text fields.
  float k1 = 1.5f;
  float b = 0.75f;
  Similarity similarity = new BM25Similarity(k1, b);
  searcher.setSimilarity(similarity);
  MultiFieldQueryParser queryParser = new MultiFieldQueryParser(
      new String[] { TopicLuceneDocumentGenerator.FIELD_TITLE,
                     TopicLuceneDocumentGenerator.FIELD_LABEL,
                     TopicLuceneDocumentGenerator.FIELD_TEXT }, analyzer);
  queryParser.setDefaultOperator(QueryParser.Operator.OR);
  Query query = queryParser.parse(queryName);
  rs = searcher.search(query, numHits);
  docs = ScoredDocuments.fromTopDocs(rs, searcher);
  printTopicHits(docs, false);
  LOG.info("Querying completed.");
}

/**
 * Prints one formatted entry per hit: rank, score, topic mid, wiki title, and
 * W3 label. Extracted from three duplicated loops in {@code search}.
 *
 * @param docs scored documents to print
 * @param extraBlankLine whether each entry ends with an extra blank line
 *     (matches the original output of the exact-match stages)
 */
private void printTopicHits(ScoredDocuments docs, boolean extraBlankLine) {
  String fmt = extraBlankLine
      ? "%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n\n"
      : "%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n";
  for (int i = 0; i < docs.documents.length; i++) {
    String resultDoc = String.format(fmt, (i + 1), docs.scores[i],
        docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TOPIC_MID).stringValue(),
        docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TITLE).stringValue(),
        docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_LABEL).stringValue());
    System.out.println(resultDoc);
  }
}
Example use of io.anserini.rerank.ScoredDocuments in the Anserini project by castorini: the exactQuerySearch method of the EntityLinking class.
/**
 * Returns a list of query results from an exact (phrase) search on the entity
 * name field.
 *
 * @param queryName the entity name to search
 * @param numHits maximum number of hits to return
 * @throws Exception on error
 * @return a list of top ranked entities
 */
public List<RankedEntity> exactQuerySearch(String queryName, int numHits) throws Exception {
  List<RankedEntity> rankedEntities = new ArrayList<>();
  // Initialize index searcher
  IndexSearcher searcher = new IndexSearcher(reader);
  // Do an exact search on the query name.
  QueryParser queryParser = new QueryParser(IndexTopics.FIELD_NAME, new SimpleAnalyzer());
  queryParser.setAutoGeneratePhraseQueries(true);
  queryParser.setPhraseSlop(3);
  // Fix: escape Lucene query syntax in the raw entity name before wrapping it
  // in quotes, so characters such as '"', '+', or ':' cannot break the phrase
  // query (the original passed the name through unescaped and would throw a
  // ParseException on such input).
  queryName = "\"" + QueryParser.escape(queryName) + "\"";
  Query query = queryParser.parse(queryName);
  TopDocs rs = searcher.search(query, numHits);
  ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
  for (int i = 0; i < docs.documents.length; i++) {
    float score = docs.scores[i];
    String mid = docs.documents[i].getField(IndexTopics.FIELD_TOPIC_MID).stringValue();
    // Mids are stored long-form; convert to the short form used downstream.
    String shortMid = getShortMid(mid);
    String name = docs.documents[i].getField(IndexTopics.FIELD_NAME).stringValue();
    String label = docs.documents[i].getField(IndexTopics.FIELD_LABEL).stringValue();
    rankedEntities.add(new RankedEntity(shortMid, score, name, label));
  }
  return rankedEntities;
}
Example use of io.anserini.rerank.ScoredDocuments in the Anserini project by castorini: the two-argument search method of the LookupTopic class.
/**
 * Prints the top-ranked topics matching the query name, searching across the
 * name, label, and alias fields. (The original javadoc claimed this prints all
 * known facts about a mid, but the code searches by topic name.)
 *
 * @param queryName query topic name
 * @param numHits maximum number of hits to print
 * @throws Exception on error
 */
public void search(String queryName, int numHits) throws Exception {
  // Initialize index searcher
  IndexSearcher searcher = new IndexSearcher(reader);
  // Search for the query in multiple fields with OR semantics.
  MultiFieldQueryParser queryParser = new MultiFieldQueryParser(
      new String[] { IndexTopics.FIELD_NAME, IndexTopics.FIELD_LABEL, IndexTopics.FIELD_ALIAS },
      new SimpleAnalyzer());
  queryParser.setDefaultOperator(QueryParser.Operator.OR);
  Query query = queryParser.parse(queryName);
  TopDocs rs = searcher.search(query, numHits);
  ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
  for (int i = 0; i < docs.documents.length; i++) {
    // Fix: the third value printed is FIELD_ALIAS, but the original header
    // mislabeled it "WIKI_TITLE"; label it OBJECT_ALIAS to match the data.
    String resultDoc = String.format(
        "%d - SCORE: %f\nTOPIC_MID: %s\nOBJECT_NAME: %s\nOBJECT_ALIAS: %s\nW3_LABEL: %s\n",
        (i + 1), docs.scores[i],
        docs.documents[i].getField(IndexTopics.FIELD_TOPIC_MID).stringValue(),
        docs.documents[i].getField(IndexTopics.FIELD_NAME).stringValue(),
        docs.documents[i].getField(IndexTopics.FIELD_ALIAS).stringValue(),
        docs.documents[i].getField(IndexTopics.FIELD_LABEL).stringValue());
    System.out.println(resultDoc);
  }
}
Example use of io.anserini.rerank.ScoredDocuments in the Anserini project by castorini: the rerank method of the RemoveRetweetsTemporalTiebreakReranker class.
@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
// Re-sorts the incoming results and perturbs tied scores so the output scores
// strictly decrease. NOTE(review): the ordering comes from Result's compareTo,
// which is not visible in this chunk — presumably score-descending with larger
// (more recent) docid winning ties, per the comment below; confirm. Despite the
// class name, no retweet filtering is visible here — presumably handled
// elsewhere (e.g. in the comparator or upstream); confirm.
// Resort results based on score, breaking ties by larger docid first (i.e., recent first).
SortedSet<Result> sortedResults = new TreeSet<Result>();
for (int i = 0; i < docs.documents.length; i++) {
Result result = new Result();
result.document = docs.documents[i];
result.score = docs.scores[i];
result.id = docs.ids[i];
// Tweet ids increase over time, so the numeric docid doubles as a recency key.
// NOTE(review): assumes FIELD_ID is always a numeric string — a non-numeric id
// would throw NumberFormatException here.
result.docid = Long.parseLong(docs.documents[i].getField(FIELD_ID).stringValue());
sortedResults.add(result);
}
// NOTE(review): numResults can be smaller than docs.documents.length if two
// Results compare equal (TreeSet deduplicates) — confirm compareTo never
// returns 0 for distinct documents.
int numResults = sortedResults.size();
ScoredDocuments rerankedDocs = new ScoredDocuments();
rerankedDocs.documents = new Document[numResults];
rerankedDocs.ids = new int[numResults];
rerankedDocs.scores = new float[numResults];
int i = 0;
// dup counts consecutive results whose score is within 0.001 of the previous
// (unperturbed) score; each such tie is nudged down by a further 1e-6.
int dup = 0;
float prevScore = 0;
for (Result result : sortedResults) {
float curScore = result.score;
// If we encounter ties, we want to perturb the final score a bit.
if (Math.abs(curScore - prevScore) > 0.001f) {
dup = 0;
} else {
dup++;
curScore = curScore - 0.000001f * dup;
}
rerankedDocs.documents[i] = result.document;
rerankedDocs.ids[i] = result.id;
rerankedDocs.scores[i] = (float) curScore;
// Compare the next score against the unperturbed score, not the nudged one,
// so a run of ties all measures against the same baseline.
prevScore = result.score;
i++;
}
return rerankedDocs;
}
Example use of io.anserini.rerank.ScoredDocuments in the Anserini project by castorini: the main method of the SearchTweets class.
/**
 * Entry point: runs a set of microblog (tweet) topics against a Lucene index
 * and writes a TREC-format run file ("qid Q0 docid rank score runtag").
 *
 * <p>Pipeline: parse CLI args, open the index, configure the scoring model
 * (QL or BM25), assemble the reranker cascade, then execute each topic.
 *
 * @param args command-line arguments parsed into {@code SearchArgs}
 * @throws Exception on any I/O or parsing error
 */
public static void main(String[] args) throws Exception {
  long initializationTime = System.currentTimeMillis();
  SearchArgs searchArgs = new SearchArgs();
  CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90));
  try {
    parser.parseArgument(args);
  } catch (CmdLineException e) {
    System.err.println(e.getMessage());
    parser.printUsage(System.err);
    System.err.println("Example: SearchTweets" + parser.printExample(OptionHandlerFilter.REQUIRED));
    return;
  }
  LOG.info("Reading index at " + searchArgs.index);
  Directory dir;
  if (searchArgs.inmem) {
    LOG.info("Using MMapDirectory with preload");
    dir = new MMapDirectory(Paths.get(searchArgs.index));
    ((MMapDirectory) dir).setPreload(true);
  } else {
    LOG.info("Using default FSDirectory");
    dir = FSDirectory.open(Paths.get(searchArgs.index));
  }
  // Fix: try-with-resources guarantees the reader and the run-file stream are
  // closed even if a query throws (the original closed them only on success).
  try (IndexReader reader = DirectoryReader.open(dir);
       PrintStream out = new PrintStream(new FileOutputStream(new File(searchArgs.output)))) {
    IndexSearcher searcher = new IndexSearcher(reader);
    if (searchArgs.ql) {
      LOG.info("Using QL scoring model");
      searcher.setSimilarity(new LMDirichletSimilarity(searchArgs.mu));
    } else if (searchArgs.bm25) {
      LOG.info("Using BM25 scoring model");
      searcher.setSimilarity(new BM25Similarity(searchArgs.k1, searchArgs.b));
    } else {
      LOG.error("Error: Must specify scoring model!");
      System.exit(-1);
    }
    RerankerCascade cascade = new RerankerCascade();
    EnglishAnalyzer englishAnalyzer = new EnglishAnalyzer();
    // The retweet/tiebreak reranker runs in both configurations; RM3 expansion
    // is prepended when requested (hoisted from the original duplicated branches).
    if (searchArgs.rm3) {
      cascade.add(new Rm3Reranker(englishAnalyzer, FIELD_BODY, "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.twitter.txt"));
    }
    cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
    if (!searchArgs.model.isEmpty() && searchArgs.extractors != null) {
      LOG.debug(String.format("Ranklib model used, model loaded from %s", searchArgs.model));
      cascade.add(new RankLibReranker(searchArgs.model, FIELD_BODY, searchArgs.extractors));
    }
    FeatureExtractors extractorChain = null;
    if (searchArgs.extractors != null) {
      extractorChain = FeatureExtractors.loadExtractor(searchArgs.extractors);
    }
    if (searchArgs.dumpFeatures) {
      // NOTE(review): this stream is handed to the reranker and deliberately not
      // closed here; confirm TweetsLtrDataGenerator takes ownership of it.
      PrintStream featureOut = new PrintStream(searchArgs.featureFile);
      Qrels qrels = new Qrels(searchArgs.qrels);
      cascade.add(new TweetsLtrDataGenerator(featureOut, qrels, extractorChain));
    }
    MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(searchArgs.topics));
    LOG.info("Writing output to " + searchArgs.output);
    LOG.info("Initialization complete! (elapsed time = " + (System.currentTimeMillis() - initializationTime) + "ms)");
    long totalTime = 0;
    int cnt = 0;
    for (MicroblogTopic topic : topics) {
      long curQueryTime = System.currentTimeMillis();
      // Do not consider tweets with ids beyond the queryTweetTime: the
      // <querytweettime> tag contains the timestamp of the query in terms of
      // the chronologically nearest tweet id within the corpus.
      Query filter = TermRangeQuery.newStringRange(FIELD_ID, "0", String.valueOf(topic.getQueryTweetTime()), true, true);
      Query query = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, englishAnalyzer, topic.getQuery());
      BooleanQuery.Builder builder = new BooleanQuery.Builder();
      builder.add(filter, BooleanClause.Occur.FILTER);
      builder.add(query, BooleanClause.Occur.MUST);
      Query q = builder.build();
      TopDocs rs = searcher.search(q, searchArgs.hits);
      List<String> queryTokens = AnalyzerUtils.tokenize(englishAnalyzer, topic.getQuery());
      RerankerContext context = new RerankerContext(searcher, query, topic.getId(), topic.getQuery(), queryTokens, FIELD_BODY, filter);
      ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
      long queryTime = (System.currentTimeMillis() - curQueryTime);
      for (int i = 0; i < docs.documents.length; i++) {
        // TREC qid: strip the "MB" prefix and leading zeros from the topic id.
        String qid = topic.getId().replaceFirst("^MB0*", "");
        out.println(String.format("%s Q0 %s %d %f %s", qid, docs.documents[i].getField(FIELD_ID).stringValue(), (i + 1), docs.scores[i], searchArgs.runtag));
      }
      LOG.info("Query " + topic.getId() + " (elapsed time = " + queryTime + "ms)");
      totalTime += queryTime;
      cnt++;
    }
    LOG.info("All queries completed!");
    LOG.info("Total elapsed time = " + totalTime + "ms");
    // Fix: guard against division by zero when the topic file is empty.
    if (cnt > 0) {
      LOG.info("Average query latency = " + (totalTime / cnt) + "ms");
    }
  }
}
Aggregations