
Example 1 with QueryGenerator

Use of io.anserini.search.query.QueryGenerator in the Anserini project by castorini.

In class SearchCollection, method searchTweets.

public <K> ScoredDocuments searchTweets(IndexSearcher searcher, K qid, String queryString, long t, RerankerCascade cascade, ScoredDocuments queryQrels, boolean hasRelDocs) throws IOException {
    Query keywordQuery;
    if (args.sdm) {
        keywordQuery = new SdmQueryGenerator(args.sdm_tw, args.sdm_ow, args.sdm_uw).buildQuery(IndexArgs.CONTENTS, analyzer, queryString);
    } else {
        try {
            QueryGenerator generator = (QueryGenerator) Class.forName("io.anserini.search.query." + args.queryGenerator).getConstructor().newInstance();
            keywordQuery = generator.buildQuery(IndexArgs.CONTENTS, analyzer, queryString);
        } catch (Exception e) {
            e.printStackTrace();
            throw new IllegalArgumentException("Unable to load QueryGenerator: " + args.queryGenerator);
        }
    }
    List<String> queryTokens = AnalyzerUtils.analyze(analyzer, queryString);
    // Do not consider tweets whose ids fall after the query time t; the
    // <querytweettime> tag gives the query's timestamp as the chronologically
    // nearest tweet id within the corpus.
    Query filter = LongPoint.newRangeQuery(TweetGenerator.TweetField.ID_LONG.name, 0L, t);
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(filter, BooleanClause.Occur.FILTER);
    builder.add(keywordQuery, BooleanClause.Occur.MUST);
    Query compositeQuery = builder.build();
    TopDocs rs = new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[] {});
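    // Run first-stage retrieval unless we are reranking from relevance-feedback qrels
    // and this query already has relevant documents (those are reused as feedback docs below).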
    if (!isRerank || (args.rerankcutoff > 0 && args.rf_qrels == null) || (args.rf_qrels != null && !hasRelDocs)) {
        if (args.arbitraryScoreTieBreak) {
            // Break score ties arbitrarily (Lucene's default internal ordering).
            rs = searcher.search(compositeQuery, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits);
        } else {
            rs = searcher.search(compositeQuery, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_TWEETID, true);
        }
    }
    RerankerContext context = new RerankerContext<>(searcher, qid, keywordQuery, null, queryString, queryTokens, filter, args);
    ScoredDocuments scoredFbDocs;
    if (isRerank && args.rf_qrels != null) {
        if (hasRelDocs) {
            scoredFbDocs = queryQrels;
        } else {
            // If there are no relevant documents for this query, fall back to the first-stage
            // results and only apply score-tie adjustment in the (replaced) cascade.
            scoredFbDocs = ScoredDocuments.fromTopDocs(rs, searcher);
            cascade = new RerankerCascade();
            cascade.add(new ScoreTiesAdjusterReranker());
        }
    } else {
        scoredFbDocs = ScoredDocuments.fromTopDocs(rs, searcher);
    }
    return cascade.run(scoredFbDocs, context);
}
Also used : TotalHits(org.apache.lucene.search.TotalHits) BooleanQuery(org.apache.lucene.search.BooleanQuery) Query(org.apache.lucene.search.Query) TermInSetQuery(org.apache.lucene.search.TermInSetQuery) ScoredDocuments(io.anserini.rerank.ScoredDocuments) QueryNodeException(org.apache.lucene.queryparser.flexible.core.QueryNodeException) IOException(java.io.IOException) CompletionException(java.util.concurrent.CompletionException) CmdLineException(org.kohsuke.args4j.CmdLineException) AtomicMoveNotSupportedException(java.nio.file.AtomicMoveNotSupportedException) TopDocs(org.apache.lucene.search.TopDocs) RerankerCascade(io.anserini.rerank.RerankerCascade) QueryGenerator(io.anserini.search.query.QueryGenerator) SdmQueryGenerator(io.anserini.search.query.SdmQueryGenerator) ScoreTiesAdjusterReranker(io.anserini.rerank.lib.ScoreTiesAdjusterReranker) RerankerContext(io.anserini.rerank.RerankerContext)
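
For reference, the filtering pattern in searchTweets can be reduced to a standalone sketch: a QueryGenerator builds the keyword clause, a LongPoint range query keeps only tweets whose id falls at or before the query time, and the two are combined in a BooleanQuery. This is not Anserini code; the index path, query string, timestamp, analyzer choice, and the literal field names are placeholders (the original resolves the field names via IndexArgs.CONTENTS and TweetGenerator.TweetField.ID_LONG.name), and it assumes the usual Lucene imports (LongPoint, BooleanClause, DirectoryReader, FSDirectory, IndexReader, IndexSearcher, Paths) in addition to those listed above.

public static void tweetTimeFilterSketch() throws IOException {
    // Any Analyzer works here; a whitespace analyzer is used purely for illustration.
    Analyzer analyzer = new WhitespaceAnalyzer();
    QueryGenerator generator = new BagOfWordsQueryGenerator();
    // Keyword clause over the contents field ("contents" is the value of IndexArgs.CONTENTS).
    Query keywordQuery = generator.buildQuery("contents", analyzer, "bbc world service staff cuts");
    // Keep only tweets whose id is chronologically at or before the query time t.
    long t = 34952194402811905L; // hypothetical <querytweettime> value
    Query filter = LongPoint.newRangeQuery("id_long", 0L, t);
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(filter, BooleanClause.Occur.FILTER); // FILTER clause: matches but does not contribute to the score
    builder.add(keywordQuery, BooleanClause.Occur.MUST);
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/tweet-index")))) {
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs rs = searcher.search(builder.build(), 1000);
        System.out.println("hits: " + rs.totalHits);
    }
}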

Example 2 with QueryGenerator

Use of io.anserini.search.query.QueryGenerator in the Anserini project by castorini.

In class SearchCollection, method search.

public <K> ScoredDocuments search(IndexSearcher searcher, K qid, String queryString, RerankerCascade cascade, ScoredDocuments queryQrels, boolean hasRelDocs) throws IOException {
    Query query = null;
    if (args.sdm) {
        query = new SdmQueryGenerator(args.sdm_tw, args.sdm_ow, args.sdm_uw).buildQuery(IndexArgs.CONTENTS, analyzer, queryString);
    } else {
        QueryGenerator generator;
        try {
            generator = (QueryGenerator) Class.forName("io.anserini.search.query." + args.queryGenerator).getConstructor().newInstance();
        } catch (Exception e) {
            e.printStackTrace();
            throw new IllegalArgumentException("Unable to load QueryGenerator: " + args.queryGenerator);
        }
        query = generator.buildQuery(IndexArgs.CONTENTS, analyzer, queryString);
    }
    TopDocs rs = new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[] {});
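    // Run first-stage retrieval unless we are reranking from relevance-feedback qrels
    // and this query already has relevant documents (those are reused as feedback docs below).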
    if (!isRerank || (args.rerankcutoff > 0 && args.rf_qrels == null) || (args.rf_qrels != null && !hasRelDocs)) {
        if (args.arbitraryScoreTieBreak) {
            // Break score ties arbitrarily (Lucene's default internal ordering).
            rs = searcher.search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits);
        } else {
            rs = searcher.search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_DOCID, true);
        }
    }
    List<String> queryTokens = AnalyzerUtils.analyze(analyzer, queryString);
    queries.put(qid.toString(), queryTokens);
    RerankerContext context = new RerankerContext<>(searcher, qid, query, null, queryString, queryTokens, null, args);
    ScoredDocuments scoredFbDocs;
    if (isRerank && args.rf_qrels != null) {
        if (hasRelDocs) {
            scoredFbDocs = queryQrels;
        } else {
            // If there are no relevant documents for this query, fall back to the first-stage
            // results and only apply score-tie adjustment in the (replaced) cascade.
            LOG.info("No relevant documents for " + qid.toString());
            scoredFbDocs = ScoredDocuments.fromTopDocs(rs, searcher);
            cascade = new RerankerCascade();
            cascade.add(new ScoreTiesAdjusterReranker());
        }
    } else {
        scoredFbDocs = ScoredDocuments.fromTopDocs(rs, searcher);
    }
    return cascade.run(scoredFbDocs, context);
}
Also used : TotalHits(org.apache.lucene.search.TotalHits) Query(org.apache.lucene.search.Query) TermInSetQuery(org.apache.lucene.search.TermInSetQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) ScoredDocuments(io.anserini.rerank.ScoredDocuments) QueryNodeException(org.apache.lucene.queryparser.flexible.core.QueryNodeException) IOException(java.io.IOException) CompletionException(java.util.concurrent.CompletionException) CmdLineException(org.kohsuke.args4j.CmdLineException) AtomicMoveNotSupportedException(java.nio.file.AtomicMoveNotSupportedException) TopDocs(org.apache.lucene.search.TopDocs) RerankerCascade(io.anserini.rerank.RerankerCascade) QueryGenerator(io.anserini.search.query.QueryGenerator) SdmQueryGenerator(io.anserini.search.query.SdmQueryGenerator) ScoreTiesAdjusterReranker(io.anserini.rerank.lib.ScoreTiesAdjusterReranker) RerankerContext(io.anserini.rerank.RerankerContext)
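
The main difference from Example 1 is that the generator class is loaded dynamically from args.queryGenerator by reflection. Below is a minimal sketch of just that loading step; loadAndBuildQuery is a hypothetical helper name, the hard-coded class name stands in for the argument, and "contents" stands in for IndexArgs.CONTENTS.

public static Query loadAndBuildQuery(String generatorClass, Analyzer analyzer, String queryString) {
    QueryGenerator generator;
    try {
        // Resolve the simple class name against the io.anserini.search.query package and
        // instantiate it through its no-arg constructor.
        generator = (QueryGenerator) Class.forName("io.anserini.search.query." + generatorClass)
            .getConstructor().newInstance();
    } catch (Exception e) {
        throw new IllegalArgumentException("Unable to load QueryGenerator: " + generatorClass, e);
    }
    return generator.buildQuery("contents", analyzer, queryString);
}

// Example call: loadAndBuildQuery("BagOfWordsQueryGenerator", analyzer, "what is a lobster roll");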

Example 3 with QueryGenerator

Use of io.anserini.search.query.QueryGenerator in the Anserini project by castorini.

In class SearchMsmarco, method main.

public static void main(String[] args) throws Exception {
    Args retrieveArgs = new Args();
    CmdLineParser parser = new CmdLineParser(retrieveArgs, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: Eval " + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    System.out.println("###############################################################################");
    System.out.println("WARNING: This class has been deprecated and may be removed in a future release!");
    System.out.println("###############################################################################\n");
    long totalStartTime = System.nanoTime();
    Analyzer analyzer;
    if (retrieveArgs.pretokenized) {
        analyzer = new WhitespaceAnalyzer();
        System.out.println("Initializing whilte space analyzer");
    } else {
        analyzer = DefaultEnglishAnalyzer.fromArguments(retrieveArgs.stemmer, retrieveArgs.keepstop, retrieveArgs.stopwords);
        System.out.println("Initializing analyzer with stemmer=" + retrieveArgs.stemmer + ", keepstop=" + retrieveArgs.keepstop + ", stopwords=" + retrieveArgs.stopwords);
    }
    SimpleSearcher searcher = new SimpleSearcher(retrieveArgs.index, analyzer);
    searcher.setBM25(retrieveArgs.k1, retrieveArgs.b);
    System.out.println("Initializing BM25, setting k1=" + retrieveArgs.k1 + " and b=" + retrieveArgs.b + "");
    if (retrieveArgs.rm3) {
        searcher.setRM3(retrieveArgs.fbTerms, retrieveArgs.fbDocs, retrieveArgs.originalQueryWeight);
        System.out.println("Initializing RM3, setting fbTerms=" + retrieveArgs.fbTerms + ", fbDocs=" + retrieveArgs.fbDocs + " and originalQueryWeight=" + retrieveArgs.originalQueryWeight);
    }
    Map<String, Float> fields = new HashMap<>();
    retrieveArgs.fields.forEach((key, value) -> fields.put(key, Float.valueOf(value)));
    if (retrieveArgs.fields.size() > 0) {
        System.out.println("Performing weighted field search with fields=" + retrieveArgs.fields);
    }
    QueryGenerator queryGenerator;
    if (retrieveArgs.dismax) {
        queryGenerator = new DisjunctionMaxQueryGenerator(retrieveArgs.dismax_tiebreaker);
        System.out.println("Initializing dismax query generator, with tiebreaker=" + retrieveArgs.dismax_tiebreaker);
    } else {
        queryGenerator = new BagOfWordsQueryGenerator();
        System.out.println("Initializing bag-of-words query generator.");
    }
    PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(retrieveArgs.output), StandardCharsets.US_ASCII));
    if (retrieveArgs.threads == 1) {
        // single-threaded retrieval
        long startTime = System.nanoTime();
        List<String> lines = FileUtils.readLines(new File(retrieveArgs.qid_queries), "utf-8");
        for (int lineNumber = 0; lineNumber < lines.size(); ++lineNumber) {
            String line = lines.get(lineNumber);
            String[] split = line.trim().split("\t");
            String qid = split[0];
            String query = split[1];
            SimpleSearcher.Result[] hits;
            if (retrieveArgs.fields.size() > 0) {
                hits = searcher.searchFields(queryGenerator, query, fields, retrieveArgs.hits);
            } else {
                hits = searcher.search(queryGenerator, query, retrieveArgs.hits);
            }
            if (lineNumber % 100 == 0) {
                double timePerQuery = (double) (System.nanoTime() - startTime) / (lineNumber + 1) / 1e9;
                System.out.format("Retrieving query " + lineNumber + " (%.3f s/query)\n", timePerQuery);
            }
            for (int rank = 0; rank < hits.length; ++rank) {
                String docno = hits[rank].docid;
                out.println(qid + "\t" + docno + "\t" + (rank + 1));
            }
        }
    } else {
        // multithreaded batch retrieval
        List<String> lines = FileUtils.readLines(new File(retrieveArgs.qid_queries), "utf-8");
        List<String> queries = lines.stream().map(x -> x.trim().split("\t")[1]).collect(Collectors.toList());
        List<String> qids = lines.stream().map(x -> x.trim().split("\t")[0]).collect(Collectors.toList());
        Map<String, SimpleSearcher.Result[]> results;
        if (retrieveArgs.fields.size() > 0) {
            results = searcher.batchSearchFields(queryGenerator, queries, qids, retrieveArgs.hits, retrieveArgs.threads, fields);
        } else {
            results = searcher.batchSearch(queryGenerator, queries, qids, retrieveArgs.hits, retrieveArgs.threads);
        }
        for (String qid : qids) {
            SimpleSearcher.Result[] hits = results.get(qid);
            for (int rank = 0; rank < hits.length; ++rank) {
                String docno = hits[rank].docid;
                out.println(qid + "\t" + docno + "\t" + (rank + 1));
            }
        }
    }
    searcher.close();
    out.flush();
    out.close();
    double totalTime = (double) (System.nanoTime() - totalStartTime) / 1e9;
    System.out.format("Total retrieval time: %.3f s\n", totalTime);
    System.out.println("Done!");
}
Also used : WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) PrintWriter(java.io.PrintWriter) CmdLineParser(org.kohsuke.args4j.CmdLineParser) OptionHandlerFilter(org.kohsuke.args4j.OptionHandlerFilter) Files(java.nio.file.Files) Analyzer(org.apache.lucene.analysis.Analyzer) MapOptionHandler(org.kohsuke.args4j.spi.MapOptionHandler) FileUtils(org.apache.commons.io.FileUtils) HashMap(java.util.HashMap) Option(org.kohsuke.args4j.Option) Collectors(java.util.stream.Collectors) DefaultEnglishAnalyzer(io.anserini.analysis.DefaultEnglishAnalyzer) ParserProperties(org.kohsuke.args4j.ParserProperties) File(java.io.File) StandardCharsets(java.nio.charset.StandardCharsets) CmdLineException(org.kohsuke.args4j.CmdLineException) List(java.util.List) Paths(java.nio.file.Paths) Map(java.util.Map) QueryGenerator(io.anserini.search.query.QueryGenerator) BagOfWordsQueryGenerator(io.anserini.search.query.BagOfWordsQueryGenerator) DisjunctionMaxQueryGenerator(io.anserini.search.query.DisjunctionMaxQueryGenerator)
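
A minimal single-query version of the retrieval loop above, using the same SimpleSearcher calls. The index path, query, qid, and BM25 parameters are placeholders, and the analyzer arguments mirror what appear to be the example's defaults (Porter stemming, stopwords removed, no custom stopword list):

public static void simpleSearchSketch() throws Exception {
    Analyzer analyzer = DefaultEnglishAnalyzer.fromArguments("porter", false, null);
    SimpleSearcher searcher = new SimpleSearcher("/path/to/msmarco-passage-index", analyzer);
    searcher.setBM25(0.82f, 0.68f); // illustrative k1 and b values
    QueryGenerator queryGenerator = new BagOfWordsQueryGenerator();
    SimpleSearcher.Result[] hits = searcher.search(queryGenerator, "what is paranoid schizophrenia", 10);
    // Emit the same qid \t docid \t rank format as SearchMsmarco, with a placeholder qid.
    for (int rank = 0; rank < hits.length; rank++) {
        System.out.println("q1\t" + hits[rank].docid + "\t" + (rank + 1));
    }
    searcher.close();
}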

Aggregations

QueryGenerator (io.anserini.search.query.QueryGenerator): 3
CmdLineException (org.kohsuke.args4j.CmdLineException): 3
RerankerCascade (io.anserini.rerank.RerankerCascade): 2
RerankerContext (io.anserini.rerank.RerankerContext): 2
ScoredDocuments (io.anserini.rerank.ScoredDocuments): 2
ScoreTiesAdjusterReranker (io.anserini.rerank.lib.ScoreTiesAdjusterReranker): 2
SdmQueryGenerator (io.anserini.search.query.SdmQueryGenerator): 2
IOException (java.io.IOException): 2
AtomicMoveNotSupportedException (java.nio.file.AtomicMoveNotSupportedException): 2
CompletionException (java.util.concurrent.CompletionException): 2
QueryNodeException (org.apache.lucene.queryparser.flexible.core.QueryNodeException): 2
BooleanQuery (org.apache.lucene.search.BooleanQuery): 2
Query (org.apache.lucene.search.Query): 2
TermInSetQuery (org.apache.lucene.search.TermInSetQuery): 2
TopDocs (org.apache.lucene.search.TopDocs): 2
TotalHits (org.apache.lucene.search.TotalHits): 2
DefaultEnglishAnalyzer (io.anserini.analysis.DefaultEnglishAnalyzer): 1
BagOfWordsQueryGenerator (io.anserini.search.query.BagOfWordsQueryGenerator): 1
DisjunctionMaxQueryGenerator (io.anserini.search.query.DisjunctionMaxQueryGenerator): 1
File (java.io.File): 1