Search in sources :

Example 1 with BagOfWordsQueryGenerator

use of io.anserini.search.query.BagOfWordsQueryGenerator in project Anserini by castorini.

The class SimpleTweetSearcher, method searchTweets:

/**
 * Searches tweets in the collection, returning a specified number of hits.
 *
 * @param q query string
 * @param k number of hits to return
 * @param t reference epoch time passed through to the lower-level overload
 * @return array of search results
 * @throws IOException if error encountered during search
 */
public Result[] searchTweets(String q, int k, long t) throws IOException {
    // Build a bag-of-words query over the analyzed terms of q against the contents field.
    Query bagOfWordsQuery = new BagOfWordsQueryGenerator().buildQuery(IndexArgs.CONTENTS, analyzer, q);
    // The analyzed tokens are forwarded alongside the query to the lower-level overload.
    List<String> analyzedTokens = AnalyzerUtils.analyze(analyzer, q);
    return searchTweets(bagOfWordsQuery, analyzedTokens, q, k, t);
}
Also used : BagOfWordsQueryGenerator(io.anserini.search.query.BagOfWordsQueryGenerator) Query(org.apache.lucene.search.Query) BooleanQuery(org.apache.lucene.search.BooleanQuery)

Example 2 with BagOfWordsQueryGenerator

use of io.anserini.search.query.BagOfWordsQueryGenerator in project Anserini by castorini.

The class IndexReaderUtils, method computeQueryDocumentScoreWithSimilarityAndAnalyzer:

/**
 * Computes the score of a document with respect to a query given a scoring function and an analyzer.
 *
 * <p>The query-document score is obtained by issuing the query with an additional filter clause
 * restricting results to the single docid in question, then reading back the retrieval score.
 *
 * @param reader index reader
 * @param docid docid of the document to score
 * @param q query
 * @param similarity scoring function
 * @param analyzer analyzer used to tokenize the query
 * @return the score of the document with respect to the query, or 0 if no query term occurs in the document
 * @throws IOException if error encountered during query
 */
public static float computeQueryDocumentScoreWithSimilarityAndAnalyzer(IndexReader reader, String docid, String q, Similarity similarity, Analyzer analyzer) throws IOException {
    // We compute the query-document score by issuing the query with an additional filter clause that restricts
    // consideration to only the docid in question, and then returning the retrieval score.
    // 
    // This implementation is inefficient, but has the advantage of using the existing Lucene similarity, which means
    // that we don't need to copy the scoring function and keep it in sync wrt code updates.
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);
    // Bag-of-words query over the analyzed query terms in the contents field.
    Query query = new BagOfWordsQueryGenerator().buildQuery(IndexArgs.CONTENTS, analyzer, q);
    // ConstantScoreQuery wrapper makes the docid filter contribute a fixed score of 1 instead of a
    // term-frequency-based score, so it can be cleanly subtracted out below.
    Query filterQuery = new ConstantScoreQuery(new TermQuery(new Term(IndexArgs.ID, docid)));
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(filterQuery, BooleanClause.Occur.MUST);
    builder.add(query, BooleanClause.Occur.MUST);
    Query finalQuery = builder.build();
    // Only one hit is possible since the filter clause matches a single docid.
    TopDocs rs = searcher.search(finalQuery, 1);
    // If we get zero results, indicates that no query term is found in the document.
    // Otherwise, subtract the constant 1 contributed by the filter clause to recover the similarity score.
    return rs.scoreDocs.length == 0 ? 0 : rs.scoreDocs[0].score - 1;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TopDocs(org.apache.lucene.search.TopDocs) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) BagOfWordsQueryGenerator(io.anserini.search.query.BagOfWordsQueryGenerator) Query(org.apache.lucene.search.Query) ConstantScoreQuery(org.apache.lucene.search.ConstantScoreQuery) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) ConstantScoreQuery(org.apache.lucene.search.ConstantScoreQuery) Term(org.apache.lucene.index.Term)

Example 3 with BagOfWordsQueryGenerator

use of io.anserini.search.query.BagOfWordsQueryGenerator in project Anserini by castorini.

The class SearchMsmarco, method main:

/**
 * Entry point for (deprecated) MS MARCO passage retrieval. Parses command-line arguments,
 * builds an analyzer and a BM25 {@code SimpleSearcher} (optionally with RM3, weighted
 * fields, and a dismax query generator), runs single- or multi-threaded retrieval over a
 * TSV file of {@code qid\tquery} lines, and writes {@code qid\tdocid\trank} triples to
 * the output file.
 *
 * @param args command-line arguments, parsed into {@code Args}
 * @throws Exception if argument parsing, index access, or file I/O fails
 */
public static void main(String[] args) throws Exception {
    Args retrieveArgs = new Args();
    CmdLineParser parser = new CmdLineParser(retrieveArgs, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: Eval " + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    System.out.println("###############################################################################");
    System.out.println("WARNING: This class has been deprecated and may be removed in a future release!");
    System.out.println("###############################################################################\n");
    long totalStartTime = System.nanoTime();
    Analyzer analyzer;
    if (retrieveArgs.pretokenized) {
        // Queries are already tokenized; just split on whitespace.
        analyzer = new WhitespaceAnalyzer();
        // Fixed typo in the original message ("whilte space").
        System.out.println("Initializing whitespace analyzer");
    } else {
        analyzer = DefaultEnglishAnalyzer.fromArguments(retrieveArgs.stemmer, retrieveArgs.keepstop, retrieveArgs.stopwords);
        System.out.println("Initializing analyzer with stemmer=" + retrieveArgs.stemmer + ", keepstop=" + retrieveArgs.keepstop + ", stopwords=" + retrieveArgs.stopwords);
    }
    SimpleSearcher searcher = new SimpleSearcher(retrieveArgs.index, analyzer);
    searcher.setBM25(retrieveArgs.k1, retrieveArgs.b);
    System.out.println("Initializing BM25, setting k1=" + retrieveArgs.k1 + " and b=" + retrieveArgs.b + "");
    if (retrieveArgs.rm3) {
        searcher.setRM3(retrieveArgs.fbTerms, retrieveArgs.fbDocs, retrieveArgs.originalQueryWeight);
        System.out.println("Initializing RM3, setting fbTerms=" + retrieveArgs.fbTerms + ", fbDocs=" + retrieveArgs.fbDocs + " and originalQueryWeight=" + retrieveArgs.originalQueryWeight);
    }
    Map<String, Float> fields = new HashMap<>();
    retrieveArgs.fields.forEach((key, value) -> fields.put(key, Float.valueOf(value)));
    if (retrieveArgs.fields.size() > 0) {
        System.out.println("Performing weighted field search with fields=" + retrieveArgs.fields);
    }
    QueryGenerator queryGenerator;
    if (retrieveArgs.dismax) {
        queryGenerator = new DisjunctionMaxQueryGenerator(retrieveArgs.dismax_tiebreaker);
        System.out.println("Initializing dismax query generator, with tiebreaker=" + retrieveArgs.dismax_tiebreaker);
    } else {
        queryGenerator = new BagOfWordsQueryGenerator();
        System.out.println("Initializing bag-of-words query generator.");
    }
    // try-with-resources guarantees the output file is closed even if retrieval throws.
    try (PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(retrieveArgs.output), StandardCharsets.US_ASCII))) {
        if (retrieveArgs.threads == 1) {
            // single-threaded retrieval
            long startTime = System.nanoTime();
            List<String> lines = FileUtils.readLines(new File(retrieveArgs.qid_queries), "utf-8");
            for (int lineNumber = 0; lineNumber < lines.size(); ++lineNumber) {
                String line = lines.get(lineNumber);
                String[] split = line.trim().split("\t");
                String qid = split[0];
                String query = split[1];
                SimpleSearcher.Result[] hits;
                if (retrieveArgs.fields.size() > 0) {
                    hits = searcher.searchFields(queryGenerator, query, fields, retrieveArgs.hits);
                } else {
                    hits = searcher.search(queryGenerator, query, retrieveArgs.hits);
                }
                if (lineNumber % 100 == 0) {
                    double timePerQuery = (double) (System.nanoTime() - startTime) / (lineNumber + 1) / 1e9;
                    System.out.format("Retrieving query " + lineNumber + " (%.3f s/query)\n", timePerQuery);
                }
                writeHits(out, qid, hits);
            }
        } else {
            // multithreaded batch retrieval
            List<String> lines = FileUtils.readLines(new File(retrieveArgs.qid_queries), "utf-8");
            // Split each line only once (the original split every line twice).
            List<String[]> parsed = lines.stream().map(x -> x.trim().split("\t")).collect(Collectors.toList());
            List<String> qids = parsed.stream().map(x -> x[0]).collect(Collectors.toList());
            List<String> queries = parsed.stream().map(x -> x[1]).collect(Collectors.toList());
            Map<String, SimpleSearcher.Result[]> results;
            if (retrieveArgs.fields.size() > 0) {
                results = searcher.batchSearchFields(queryGenerator, queries, qids, retrieveArgs.hits, retrieveArgs.threads, fields);
            } else {
                results = searcher.batchSearch(queryGenerator, queries, qids, retrieveArgs.hits, retrieveArgs.threads);
            }
            for (String qid : qids) {
                writeHits(out, qid, results.get(qid));
            }
        }
        searcher.close();
        out.flush();
    }
    double totalTime = (double) (System.nanoTime() - totalStartTime) / 1e9;
    System.out.format("Total retrieval time: %.3f s\n", totalTime);
    System.out.println("Done!");
}

/** Writes one {@code qid\tdocid\trank} line per hit, with ranks starting at 1. */
private static void writeHits(PrintWriter out, String qid, SimpleSearcher.Result[] hits) {
    for (int rank = 0; rank < hits.length; ++rank) {
        String docno = hits[rank].docid;
        out.println(qid + "\t" + docno + "\t" + (rank + 1));
    }
}
Also used : WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) PrintWriter(java.io.PrintWriter) CmdLineParser(org.kohsuke.args4j.CmdLineParser) OptionHandlerFilter(org.kohsuke.args4j.OptionHandlerFilter) Files(java.nio.file.Files) Analyzer(org.apache.lucene.analysis.Analyzer) MapOptionHandler(org.kohsuke.args4j.spi.MapOptionHandler) FileUtils(org.apache.commons.io.FileUtils) HashMap(java.util.HashMap) Option(org.kohsuke.args4j.Option) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) Collectors(java.util.stream.Collectors) DefaultEnglishAnalyzer(io.anserini.analysis.DefaultEnglishAnalyzer) ParserProperties(org.kohsuke.args4j.ParserProperties) File(java.io.File) StandardCharsets(java.nio.charset.StandardCharsets) CmdLineException(org.kohsuke.args4j.CmdLineException) List(java.util.List) Paths(java.nio.file.Paths) Map(java.util.Map) QueryGenerator(io.anserini.search.query.QueryGenerator) BagOfWordsQueryGenerator(io.anserini.search.query.BagOfWordsQueryGenerator) DisjunctionMaxQueryGenerator(io.anserini.search.query.DisjunctionMaxQueryGenerator) CmdLineParser(org.kohsuke.args4j.CmdLineParser) HashMap(java.util.HashMap) Analyzer(org.apache.lucene.analysis.Analyzer) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) DefaultEnglishAnalyzer(io.anserini.analysis.DefaultEnglishAnalyzer) BagOfWordsQueryGenerator(io.anserini.search.query.BagOfWordsQueryGenerator) QueryGenerator(io.anserini.search.query.QueryGenerator) BagOfWordsQueryGenerator(io.anserini.search.query.BagOfWordsQueryGenerator) DisjunctionMaxQueryGenerator(io.anserini.search.query.DisjunctionMaxQueryGenerator) DisjunctionMaxQueryGenerator(io.anserini.search.query.DisjunctionMaxQueryGenerator) File(java.io.File) CmdLineException(org.kohsuke.args4j.CmdLineException) PrintWriter(java.io.PrintWriter)

Example 4 with BagOfWordsQueryGenerator

use of io.anserini.search.query.BagOfWordsQueryGenerator in project Anserini by castorini.

The class SimpleSearcher, method search:

/**
 * Searches the collection, returning a specified number of hits.
 *
 * @param q query string
 * @param k number of hits to return
 * @return array of search results
 * @throws IOException if error encountered during search
 */
public Result[] search(String q, int k) throws IOException {
    // Tokenize the query with the searcher's analyzer; the tokens are forwarded
    // alongside the built query to the lower-level overload.
    List<String> analyzedTokens = AnalyzerUtils.analyze(analyzer, q);
    // Bag-of-words query over the analyzed terms of q against the contents field.
    Query bagOfWordsQuery = new BagOfWordsQueryGenerator().buildQuery(IndexArgs.CONTENTS, analyzer, q);
    return search(bagOfWordsQuery, analyzedTokens, q, k);
}
Also used : BagOfWordsQueryGenerator(io.anserini.search.query.BagOfWordsQueryGenerator) Query(org.apache.lucene.search.Query)

Aggregations

BagOfWordsQueryGenerator (io.anserini.search.query.BagOfWordsQueryGenerator)4 Query (org.apache.lucene.search.Query)3 BooleanQuery (org.apache.lucene.search.BooleanQuery)2 DefaultEnglishAnalyzer (io.anserini.analysis.DefaultEnglishAnalyzer)1 DisjunctionMaxQueryGenerator (io.anserini.search.query.DisjunctionMaxQueryGenerator)1 QueryGenerator (io.anserini.search.query.QueryGenerator)1 File (java.io.File)1 PrintWriter (java.io.PrintWriter)1 StandardCharsets (java.nio.charset.StandardCharsets)1 Files (java.nio.file.Files)1 Paths (java.nio.file.Paths)1 HashMap (java.util.HashMap)1 List (java.util.List)1 Map (java.util.Map)1 Collectors (java.util.stream.Collectors)1 FileUtils (org.apache.commons.io.FileUtils)1 Analyzer (org.apache.lucene.analysis.Analyzer)1 WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer)1 Term (org.apache.lucene.index.Term)1 ConstantScoreQuery (org.apache.lucene.search.ConstantScoreQuery)1