
Example 1 with ScoredDocuments

Use of io.anserini.rerank.ScoredDocuments in project Anserini by castorini, taken from the search method of the class LookupTopic.

/**
 * Prints query results to the standard output stream.
 *
 * @param queryName the entity name to search
 * @throws Exception on error
 */
public void search(String queryName) throws Exception {
    LOG.info("Querying started...");
    // Initialize index searcher
    IndexSearcher searcher = new IndexSearcher(reader);
    SimpleAnalyzer analyzer = new SimpleAnalyzer();
    int numHits = 20;
    // find exact title
    QueryParser titleParser = new QueryParser(TopicLuceneDocumentGenerator.FIELD_TITLE, analyzer);
    Query titleQuery = titleParser.parse(queryName);
    TopDocs rs = searcher.search(titleQuery, numHits);
    ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
        String resultDoc = String.format("%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n\n", (i + 1), docs.scores[i], docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TOPIC_MID).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TITLE).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_LABEL).stringValue());
        System.out.println(resultDoc);
    }
    if (docs.documents.length != 0) {
        System.out.println("Exact WIKI_TITLE found! Ending search.");
        return;
    } else {
        System.out.println("Exact WIKI_TITLE not found. Searching for the label...");
    }
    System.out.println();
    // find exact label
    QueryParser labelParser = new QueryParser(TopicLuceneDocumentGenerator.FIELD_LABEL, analyzer);
    Query labelQuery = labelParser.parse(queryName);
    rs = searcher.search(labelQuery, numHits);
    docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
        String resultDoc = String.format("%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n\n", (i + 1), docs.scores[i], docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TOPIC_MID).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TITLE).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_LABEL).stringValue());
        System.out.println(resultDoc);
    }
    if (docs.documents.length != 0) {
        System.out.println("Exact W3_LABEL found! Ending search.");
        return;
    } else {
        System.out.println("Exact W3_LABEL not found. Ranking the topics using BM25 according the text/title/label...");
    }
    System.out.println();
    float k1 = 1.5f;
    float b = 0.75f;
    Similarity similarity = new BM25Similarity(k1, b);
    searcher.setSimilarity(similarity);
    MultiFieldQueryParser queryParser = new MultiFieldQueryParser(new String[] { TopicLuceneDocumentGenerator.FIELD_TITLE, TopicLuceneDocumentGenerator.FIELD_LABEL, TopicLuceneDocumentGenerator.FIELD_TEXT }, analyzer);
    queryParser.setDefaultOperator(QueryParser.Operator.OR);
    Query query = queryParser.parse(queryName);
    rs = searcher.search(query, numHits);
    docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
        String resultDoc = String.format("%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n", (i + 1), docs.scores[i], docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TOPIC_MID).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TITLE).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_LABEL).stringValue());
        System.out.println(resultDoc);
    }
    LOG.info("Querying completed.");
}
Also used: IndexSearcher (org.apache.lucene.search.IndexSearcher), TopDocs (org.apache.lucene.search.TopDocs), MultiFieldQueryParser (org.apache.lucene.queryparser.classic.MultiFieldQueryParser), QueryParser (org.apache.lucene.queryparser.classic.QueryParser), Query (org.apache.lucene.search.Query), Similarity (org.apache.lucene.search.similarities.Similarity), BM25Similarity (org.apache.lucene.search.similarities.BM25Similarity), SimpleAnalyzer (org.apache.lucene.analysis.core.SimpleAnalyzer), ScoredDocuments (io.anserini.rerank.ScoredDocuments)
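
The core pattern shared by these examples is to run a Lucene query and wrap the resulting TopDocs in ScoredDocuments, which exposes the hits as parallel arrays (documents, scores, ids). A minimal standalone sketch of that pattern, assuming an IndexReader opened elsewhere and a hypothetical field name "title":

import io.anserini.rerank.ScoredDocuments;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;

public class ScoredDocumentsSketch {
    // reader is assumed to be an IndexReader opened elsewhere over an existing index;
    // "title" is a hypothetical field name used only for this illustration.
    public static void printTopHits(IndexReader reader, String queryString) throws Exception {
        IndexSearcher searcher = new IndexSearcher(reader);
        Query query = new QueryParser("title", new SimpleAnalyzer()).parse(queryString);
        TopDocs rs = searcher.search(query, 20);
        // Convert Lucene's TopDocs into Anserini's ScoredDocuments, which holds the
        // hits as parallel arrays: documents, scores, and ids.
        ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
        for (int i = 0; i < docs.documents.length; i++) {
            System.out.printf("%d: score=%f docid=%d%n", i + 1, docs.scores[i], docs.ids[i]);
        }
    }
}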

Example 2 with ScoredDocuments

Use of io.anserini.rerank.ScoredDocuments in project Anserini by castorini, taken from the exactQuerySearch method of the class EntityLinking.

/**
 * Returns a list of query results.
 *
 * @param queryName the entity name to search
 * @param numHits maximum number of hits to return
 * @return a list of top ranked entities
 * @throws Exception on error
 */
public List<RankedEntity> exactQuerySearch(String queryName, int numHits) throws Exception {
    List<RankedEntity> rankedEntities = new ArrayList<>();
    // Initialize index searcher
    IndexSearcher searcher = new IndexSearcher(reader);
    // do exact search on query name
    QueryParser queryParser = new QueryParser(IndexTopics.FIELD_NAME, new SimpleAnalyzer());
    queryParser.setAutoGeneratePhraseQueries(true);
    queryParser.setPhraseSlop(3);
    queryName = "\"" + queryName + "\"";
    Query query = queryParser.parse(queryName);
    TopDocs rs = searcher.search(query, numHits);
    ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
        float score = docs.scores[i];
        String mid = docs.documents[i].getField(IndexTopics.FIELD_TOPIC_MID).stringValue();
        String shortMid = getShortMid(mid);
        String name = docs.documents[i].getField(IndexTopics.FIELD_NAME).stringValue();
        String label = docs.documents[i].getField(IndexTopics.FIELD_LABEL).stringValue();
        rankedEntities.add(new RankedEntity(shortMid, score, name, label));
    }
    return rankedEntities;
}
Also used: IndexSearcher (org.apache.lucene.search.IndexSearcher), TopDocs (org.apache.lucene.search.TopDocs), MultiFieldQueryParser (org.apache.lucene.queryparser.classic.MultiFieldQueryParser), QueryParser (org.apache.lucene.queryparser.classic.QueryParser), Query (org.apache.lucene.search.Query), TermQuery (org.apache.lucene.search.TermQuery), SimpleAnalyzer (org.apache.lucene.analysis.core.SimpleAnalyzer), ArrayList (java.util.ArrayList), ScoredDocuments (io.anserini.rerank.ScoredDocuments)
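
The exact-match behaviour above comes from quoting the entity name so the classic QueryParser builds a phrase query, with a slop of 3 to tolerate small variations. A standalone sketch of just that parsing step, using a hypothetical field name "name" and example query text:

import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;

public class PhraseParseSketch {
    public static void main(String[] args) throws Exception {
        QueryParser parser = new QueryParser("name", new SimpleAnalyzer());
        parser.setAutoGeneratePhraseQueries(true);
        parser.setPhraseSlop(3);
        // Quoting the name forces a phrase query rather than an OR over terms;
        // the slop of 3 tolerates small reorderings or gaps between the words.
        Query q = parser.parse("\"barack hussein obama\"");
        System.out.println(q); // e.g. name:"barack hussein obama"~3
    }
}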

Example 3 with ScoredDocuments

Use of io.anserini.rerank.ScoredDocuments in project Anserini by castorini, taken from the search method of the class LookupTopic.

/**
 * Prints the topics that best match the query name.
 *
 * @param queryName query topic name
 * @param numHits maximum number of hits to return
 * @throws Exception on error
 */
public void search(String queryName, int numHits) throws Exception {
    // Initialize index searcher
    IndexSearcher searcher = new IndexSearcher(reader);
    // search for query in multiple fields
    MultiFieldQueryParser queryParser = new MultiFieldQueryParser(new String[] { IndexTopics.FIELD_NAME, IndexTopics.FIELD_LABEL, IndexTopics.FIELD_ALIAS }, new SimpleAnalyzer());
    queryParser.setDefaultOperator(QueryParser.Operator.OR);
    Query query = queryParser.parse(queryName);
    TopDocs rs = searcher.search(query, numHits);
    ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
        String resultDoc = String.format("%d - SCORE: %f\nTOPIC_MID: %s\nOBJECT_NAME: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n", (i + 1), docs.scores[i], docs.documents[i].getField(IndexTopics.FIELD_TOPIC_MID).stringValue(), docs.documents[i].getField(IndexTopics.FIELD_NAME).stringValue(), docs.documents[i].getField(IndexTopics.FIELD_ALIAS).stringValue(), docs.documents[i].getField(IndexTopics.FIELD_LABEL).stringValue());
        System.out.println(resultDoc);
    }
}
Also used: IndexSearcher (org.apache.lucene.search.IndexSearcher), TopDocs (org.apache.lucene.search.TopDocs), MultiFieldQueryParser (org.apache.lucene.queryparser.classic.MultiFieldQueryParser), Query (org.apache.lucene.search.Query), SimpleAnalyzer (org.apache.lucene.analysis.core.SimpleAnalyzer), ScoredDocuments (io.anserini.rerank.ScoredDocuments)
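
Here the lookup is fuzzier: MultiFieldQueryParser with the default OR operator matches the query terms against any of the name, label, or alias fields. A minimal sketch of that parsing step, with hypothetical field names standing in for the IndexTopics constants:

import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;

public class MultiFieldParseSketch {
    public static void main(String[] args) throws Exception {
        // "name", "label", and "alias" stand in for the index's real field names.
        MultiFieldQueryParser parser = new MultiFieldQueryParser(
                new String[] { "name", "label", "alias" }, new SimpleAnalyzer());
        parser.setDefaultOperator(QueryParser.Operator.OR);
        // Each query term is expanded across all three fields and OR-ed together.
        Query q = parser.parse("barack obama");
        System.out.println(q);
    }
}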

Example 4 with ScoredDocuments

Use of io.anserini.rerank.ScoredDocuments in project Anserini by castorini, taken from the rerank method of the class RemoveRetweetsTemporalTiebreakReranker.

@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
    // Resort results based on score, breaking ties by larger docid first (i.e., recent first).
    SortedSet<Result> sortedResults = new TreeSet<Result>();
    for (int i = 0; i < docs.documents.length; i++) {
        Result result = new Result();
        result.document = docs.documents[i];
        result.score = docs.scores[i];
        result.id = docs.ids[i];
        result.docid = Long.parseLong(docs.documents[i].getField(FIELD_ID).stringValue());
        sortedResults.add(result);
    }
    int numResults = sortedResults.size();
    ScoredDocuments rerankedDocs = new ScoredDocuments();
    rerankedDocs.documents = new Document[numResults];
    rerankedDocs.ids = new int[numResults];
    rerankedDocs.scores = new float[numResults];
    int i = 0;
    int dup = 0;
    float prevScore = 0;
    for (Result result : sortedResults) {
        float curScore = result.score;
        // If we encounter ties, we want to perturb the final score a bit.
        if (Math.abs(curScore - prevScore) > 0.001f) {
            dup = 0;
        } else {
            dup++;
            curScore = curScore - 0.000001f * dup;
        }
        rerankedDocs.documents[i] = result.document;
        rerankedDocs.ids[i] = result.id;
        rerankedDocs.scores[i] = (float) curScore;
        prevScore = result.score;
        i++;
    }
    return rerankedDocs;
}
Also used: TreeSet (java.util.TreeSet), ScoredDocuments (io.anserini.rerank.ScoredDocuments)
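
This reranker is normally not invoked directly; it is added to a RerankerCascade that converts a Lucene result set into ScoredDocuments and passes it through each stage, as the SearchTweets example below shows. A minimal sketch, assuming the searcher, hits, and RerankerContext are built as in that example:

import io.anserini.rerank.RerankerCascade;
import io.anserini.rerank.RerankerContext;
import io.anserini.rerank.ScoredDocuments;
import io.anserini.rerank.twitter.RemoveRetweetsTemporalTiebreakReranker;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopDocs;

public class TiebreakRerankSketch {
    // searcher, hits, and context are assumed to be built as in the SearchTweets example below.
    public static ScoredDocuments rerank(IndexSearcher searcher, TopDocs hits, RerankerContext context) {
        RerankerCascade cascade = new RerankerCascade();
        cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
        // The cascade wraps the raw hits once and passes ScoredDocuments through each reranker;
        // ties are perturbed so the final ranking is strict.
        return cascade.run(ScoredDocuments.fromTopDocs(hits, searcher), context);
    }
}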

Example 5 with ScoredDocuments

Use of io.anserini.rerank.ScoredDocuments in project Anserini by castorini, taken from the main method of the class SearchTweets.

public static void main(String[] args) throws Exception {
    long initializationTime = System.currentTimeMillis();
    SearchArgs searchArgs = new SearchArgs();
    CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: SearchTweets" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    LOG.info("Reading index at " + searchArgs.index);
    Directory dir;
    if (searchArgs.inmem) {
        LOG.info("Using MMapDirectory with preload");
        dir = new MMapDirectory(Paths.get(searchArgs.index));
        ((MMapDirectory) dir).setPreload(true);
    } else {
        LOG.info("Using default FSDirectory");
        dir = FSDirectory.open(Paths.get(searchArgs.index));
    }
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    if (searchArgs.ql) {
        LOG.info("Using QL scoring model");
        searcher.setSimilarity(new LMDirichletSimilarity(searchArgs.mu));
    } else if (searchArgs.bm25) {
        LOG.info("Using BM25 scoring model");
        searcher.setSimilarity(new BM25Similarity(searchArgs.k1, searchArgs.b));
    } else {
        LOG.error("Error: Must specify scoring model!");
        System.exit(-1);
    }
    RerankerCascade cascade = new RerankerCascade();
    EnglishAnalyzer englishAnalyzer = new EnglishAnalyzer();
    if (searchArgs.rm3) {
        cascade.add(new Rm3Reranker(englishAnalyzer, FIELD_BODY, "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.twitter.txt"));
        cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
    } else {
        cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
    }
    if (!searchArgs.model.isEmpty() && searchArgs.extractors != null) {
        LOG.debug(String.format("Ranklib model used, modeled loaded from %s", searchArgs.model));
        cascade.add(new RankLibReranker(searchArgs.model, FIELD_BODY, searchArgs.extractors));
    }
    FeatureExtractors extractorChain = null;
    if (searchArgs.extractors != null) {
        extractorChain = FeatureExtractors.loadExtractor(searchArgs.extractors);
    }
    if (searchArgs.dumpFeatures) {
        PrintStream out = new PrintStream(searchArgs.featureFile);
        Qrels qrels = new Qrels(searchArgs.qrels);
        cascade.add(new TweetsLtrDataGenerator(out, qrels, extractorChain));
    }
    MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(searchArgs.topics));
    PrintStream out = new PrintStream(new FileOutputStream(new File(searchArgs.output)));
    LOG.info("Writing output to " + searchArgs.output);
    LOG.info("Initialized complete! (elapsed time = " + (System.currentTimeMillis() - initializationTime) + "ms)");
    long totalTime = 0;
    int cnt = 0;
    for (MicroblogTopic topic : topics) {
        long curQueryTime = System.currentTimeMillis();
        // do not consider tweets whose ids are beyond the queryTweetTime
        // <querytweettime> tag contains the timestamp of the query in terms of the
        // chronologically nearest tweet id within the corpus
        Query filter = TermRangeQuery.newStringRange(FIELD_ID, "0", String.valueOf(topic.getQueryTweetTime()), true, true);
        Query query = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, englishAnalyzer, topic.getQuery());
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        builder.add(filter, BooleanClause.Occur.FILTER);
        builder.add(query, BooleanClause.Occur.MUST);
        Query q = builder.build();
        TopDocs rs = searcher.search(q, searchArgs.hits);
        List<String> queryTokens = AnalyzerUtils.tokenize(englishAnalyzer, topic.getQuery());
        RerankerContext context = new RerankerContext(searcher, query, topic.getId(), topic.getQuery(), queryTokens, FIELD_BODY, filter);
        ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
        long queryTime = (System.currentTimeMillis() - curQueryTime);
        for (int i = 0; i < docs.documents.length; i++) {
            String qid = topic.getId().replaceFirst("^MB0*", "");
            out.println(String.format("%s Q0 %s %d %f %s", qid, docs.documents[i].getField(FIELD_ID).stringValue(), (i + 1), docs.scores[i], searchArgs.runtag));
        }
        LOG.info("Query " + topic.getId() + " (elapsed time = " + queryTime + "ms)");
        totalTime += queryTime;
        cnt++;
    }
    LOG.info("All queries completed!");
    LOG.info("Total elapsed time = " + totalTime + "ms");
    LOG.info("Average query latency = " + (totalTime / cnt) + "ms");
    reader.close();
    out.close();
}
Also used: RemoveRetweetsTemporalTiebreakReranker (io.anserini.rerank.twitter.RemoveRetweetsTemporalTiebreakReranker), ScoredDocuments (io.anserini.rerank.ScoredDocuments), RerankerCascade (io.anserini.rerank.RerankerCascade), Rm3Reranker (io.anserini.rerank.rm3.Rm3Reranker), RankLibReranker (io.anserini.rerank.RankLibReranker), MMapDirectory (org.apache.lucene.store.MMapDirectory), Directory (org.apache.lucene.store.Directory), FSDirectory (org.apache.lucene.store.FSDirectory), PrintStream (java.io.PrintStream), Qrels (io.anserini.util.Qrels), CmdLineParser (org.kohsuke.args4j.CmdLineParser), EnglishAnalyzer (org.apache.lucene.analysis.en.EnglishAnalyzer), FeatureExtractors (io.anserini.ltr.feature.FeatureExtractors), TweetsLtrDataGenerator (io.anserini.ltr.TweetsLtrDataGenerator), FileOutputStream (java.io.FileOutputStream), IndexReader (org.apache.lucene.index.IndexReader), BM25Similarity (org.apache.lucene.search.similarities.BM25Similarity), LMDirichletSimilarity (org.apache.lucene.search.similarities.LMDirichletSimilarity), File (java.io.File), CmdLineException (org.kohsuke.args4j.CmdLineException), RerankerContext (io.anserini.rerank.RerankerContext)
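
The central query construction in this example is the temporal constraint: a TermRangeQuery over the tweet id field is attached as a non-scoring FILTER clause next to the scoring MUST clause. A minimal sketch of just that combination, with a hypothetical id field name and a classic QueryParser standing in for AnalyzerUtils.buildBagOfWordsQuery:

import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;

public class TemporalFilterSketch {
    // "id" and "body" are hypothetical field names; maxTweetId plays the role of queryTweetTime.
    public static Query build(String queryText, long maxTweetId) throws Exception {
        // Non-scoring range filter: only tweets with ids up to maxTweetId are admitted.
        Query filter = TermRangeQuery.newStringRange("id", "0", String.valueOf(maxTweetId), true, true);
        // Scoring clause: the analyzed query terms (a classic QueryParser stands in here
        // for the bag-of-words query built in the example above).
        Query keywords = new QueryParser("body", new EnglishAnalyzer()).parse(queryText);
        return new BooleanQuery.Builder()
                .add(filter, BooleanClause.Occur.FILTER)
                .add(keywords, BooleanClause.Occur.MUST)
                .build();
    }
}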

Aggregations

ScoredDocuments (io.anserini.rerank.ScoredDocuments): 18 usages
TopDocs (org.apache.lucene.search.TopDocs): 15 usages
Query (org.apache.lucene.search.Query): 12 usages
IndexSearcher (org.apache.lucene.search.IndexSearcher): 11 usages
RerankerContext (io.anserini.rerank.RerankerContext): 9 usages
QueryParser (org.apache.lucene.queryparser.classic.QueryParser): 6 usages
Document (org.apache.lucene.document.Document): 5 usages
SimpleAnalyzer (org.apache.lucene.analysis.core.SimpleAnalyzer): 4 usages
EnglishAnalyzer (org.apache.lucene.analysis.en.EnglishAnalyzer): 4 usages
IndexableField (org.apache.lucene.index.IndexableField): 4 usages
MultiFieldQueryParser (org.apache.lucene.queryparser.classic.MultiFieldQueryParser): 4 usages
BooleanQuery (org.apache.lucene.search.BooleanQuery): 4 usages
RerankerCascade (io.anserini.rerank.RerankerCascade): 3 usages
ArrayList (java.util.ArrayList): 3 usages
QueryNodeException (org.apache.lucene.queryparser.flexible.core.QueryNodeException): 3 usages
ScoreDoc (org.apache.lucene.search.ScoreDoc): 3 usages
TermInSetQuery (org.apache.lucene.search.TermInSetQuery): 3 usages
BM25Similarity (org.apache.lucene.search.similarities.BM25Similarity): 3 usages
Similarity (org.apache.lucene.search.similarities.Similarity): 3 usages
ScoreTiesAdjusterReranker (io.anserini.rerank.lib.ScoreTiesAdjusterReranker): 2 usages