Search in sources :

Example 1 with TopicReader

use of io.anserini.search.query.TopicReader in project Anserini by castorini.

In the class SearchWebCollection, the method main.

/**
 * Command-line entry point: parses the arguments into {@code SearchArgs}, selects a scoring
 * model, assembles the reranker cascade, loads the topics file and runs the search, then logs
 * how long the run took.
 *
 * <p>Resources opened here (the index {@code Directory} and the optional feature-dump
 * {@code PrintStream}) are released in a {@code finally} block, and the searcher is closed even
 * when the search itself throws.
 *
 * @param args command-line arguments, parsed by args4j; on a parse error the usage text is
 *     printed to standard error and the method returns without searching
 * @throws IllegalArgumentException if the topics file is missing, not a regular file, or not
 *     readable
 * @throws Exception any other failure raised while opening the index, loading the topic reader
 *     reflectively, or searching is propagated unchanged
 */
public static void main(String[] args) throws Exception {
    SearchArgs searchArgs = new SearchArgs();
    CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: SearchWebCollection" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    LOG.info("Reading index at " + searchArgs.index);
    // NOTE(review): this Directory is not handed to the searcher below, which is constructed
    // from searchArgs.index directly. It is kept so the existing open/log behavior is
    // unchanged, but it is now closed on exit instead of being leaked.
    Directory dir;
    if (searchArgs.inmem) {
        LOG.info("Using MMapDirectory with preload");
        dir = new MMapDirectory(Paths.get(searchArgs.index));
        ((MMapDirectory) dir).setPreload(true);
    } else {
        LOG.info("Using default FSDirectory");
        dir = FSDirectory.open(Paths.get(searchArgs.index));
    }
    // Declared outside the try block so the finally clause can close it if it was opened.
    PrintStream out = null;
    try {
        Similarity similarity = null;
        if (searchArgs.ql) {
            LOG.info("Using QL scoring model");
            similarity = new LMDirichletSimilarity(searchArgs.mu);
        } else if (searchArgs.bm25) {
            LOG.info("Using BM25 scoring model");
            similarity = new BM25Similarity(searchArgs.k1, searchArgs.b);
        } else {
            LOG.error("Error: Must specify scoring model!");
            System.exit(-1);
        }
        RerankerCascade cascade = new RerankerCascade();
        // Set only when the RM3 reranker is added; passed through to search() below.
        boolean useQueryParser = false;
        if (searchArgs.rm3) {
            cascade.add(new Rm3Reranker(new EnglishAnalyzer(), FIELD_BODY, "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.gov2.txt"));
            useQueryParser = true;
        } else {
            cascade.add(new IdentityReranker());
        }
        FeatureExtractors extractors = null;
        if (searchArgs.extractors != null) {
            extractors = FeatureExtractors.loadExtractor(searchArgs.extractors);
        }
        if (searchArgs.dumpFeatures) {
            out = new PrintStream(searchArgs.featureFile);
            Qrels qrels = new Qrels(searchArgs.qrels);
            cascade.add(new WebCollectionLtrDataGenerator(out, qrels, extractors));
        }
        Path topicsFile = Paths.get(searchArgs.topics);
        if (!Files.exists(topicsFile) || !Files.isRegularFile(topicsFile) || !Files.isReadable(topicsFile)) {
            throw new IllegalArgumentException("Topics file : " + topicsFile + " does not exist or is not a (readable) file.");
        }
        // The reader class is resolved by name: "io.anserini.search.query.<topicReader>TopicReader".
        TopicReader tr = (TopicReader) Class.forName("io.anserini.search.query." + searchArgs.topicReader + "TopicReader").getConstructor(Path.class).newInstance(topicsFile);
        SortedMap<Integer, String> topics = tr.read();
        final long start = System.nanoTime();
        SearchWebCollection searcher = new SearchWebCollection(searchArgs.index);
        try {
            searcher.search(topics, searchArgs.output, similarity, searchArgs.hits, cascade, useQueryParser, searchArgs.keepstop);
        } finally {
            // Release the searcher even when search() fails.
            searcher.close();
        }
        final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
        LOG.info("Total " + topics.size() + " topics searched in " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
    } finally {
        // Closing the PrintStream flushes it, so buffered feature output is written out.
        if (out != null) {
            out.close();
        }
        dir.close();
    }
}
Also used : LMDirichletSimilarity(org.apache.lucene.search.similarities.LMDirichletSimilarity) Similarity(org.apache.lucene.search.similarities.Similarity) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) IdentityReranker(io.anserini.rerank.IdentityReranker) RerankerCascade(io.anserini.rerank.RerankerCascade) TopicReader(io.anserini.search.query.TopicReader) Rm3Reranker(io.anserini.rerank.rm3.Rm3Reranker) WebCollectionLtrDataGenerator(io.anserini.ltr.WebCollectionLtrDataGenerator) MMapDirectory(org.apache.lucene.store.MMapDirectory) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Path(java.nio.file.Path) PrintStream(java.io.PrintStream) Qrels(io.anserini.util.Qrels) CmdLineParser(org.kohsuke.args4j.CmdLineParser) EnglishAnalyzer(org.apache.lucene.analysis.en.EnglishAnalyzer) MMapDirectory(org.apache.lucene.store.MMapDirectory) FeatureExtractors(io.anserini.ltr.feature.FeatureExtractors) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) LMDirichletSimilarity(org.apache.lucene.search.similarities.LMDirichletSimilarity) CmdLineException(org.kohsuke.args4j.CmdLineException)

Example 2 with TopicReader

use of io.anserini.search.query.TopicReader in project Anserini by castorini.

In the class SearchTimeUtil, the method main.

/**
 * Runs each of six hard-coded web topic files against the index three times with BM25
 * ({@code k1 = 0.9}, {@code b = 0.4}, 1000 hits, identity reranker) and prints the elapsed time
 * of every run to standard output.
 *
 * <p>Each run writes its results to a fresh temporary file. The measured interval starts before
 * that temporary file is created, so the reported time includes the file creation and cascade
 * setup as well as the search itself.
 *
 * <p>The searcher is closed in a {@code finally} block so that it is released even when reading
 * a topic file or running a search throws.
 *
 * @param args exactly one element: the index directory; otherwise usage is printed to standard
 *     error and the process exits with status 1
 */
public static void main(String[] args) throws IOException, ParseException, ClassNotFoundException, NoSuchMethodException, InvocationTargetException, IllegalAccessException, InstantiationException {
    if (args.length != 1) {
        System.err.println("Usage: SearchTimeUtil <indexDir>");
        System.err.println("indexDir: index directory");
        System.exit(1);
    }
    String[] topics = { "topics.web.1-50.txt", "topics.web.51-100.txt", "topics.web.101-150.txt", "topics.web.151-200.txt", "topics.web.201-250.txt", "topics.web.251-300.txt" };
    SearchWebCollection searcher = new SearchWebCollection(args[0]);
    try {
        for (String topicFile : topics) {
            Path topicsFile = Paths.get("src/resources/topics-and-qrels/", topicFile);
            // The reader class is resolved by name: io.anserini.search.query.WebxmlTopicReader.
            TopicReader tr = (TopicReader) Class.forName("io.anserini.search.query." + "Webxml" + "TopicReader").getConstructor(Path.class).newInstance(topicsFile);
            SortedMap<Integer, String> queries = tr.read();
            // Three timed repetitions per topic file.
            for (int i = 1; i <= 3; i++) {
                final long start = System.nanoTime();
                String submissionFile = File.createTempFile(topicFile + "_" + i, ".tmp").getAbsolutePath();
                RerankerCascade cascade = new RerankerCascade();
                cascade.add(new IdentityReranker());
                searcher.search(queries, submissionFile, new BM25Similarity(0.9f, 0.4f), 1000, cascade);
                final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
                System.out.println(topicFile + "_" + i + " search completed in " + DurationFormatUtils.formatDuration(durationMillis, "mm:ss:SSS"));
            }
        }
    } finally {
        // Release the searcher on both normal completion and failure.
        searcher.close();
    }
}
Also used : Path(java.nio.file.Path) RerankerCascade(io.anserini.rerank.RerankerCascade) TopicReader(io.anserini.search.query.TopicReader) IdentityReranker(io.anserini.rerank.IdentityReranker) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) SearchWebCollection(io.anserini.search.SearchWebCollection)

Aggregations

IdentityReranker (io.anserini.rerank.IdentityReranker)2 RerankerCascade (io.anserini.rerank.RerankerCascade)2 TopicReader (io.anserini.search.query.TopicReader)2 Path (java.nio.file.Path)2 BM25Similarity (org.apache.lucene.search.similarities.BM25Similarity)2 WebCollectionLtrDataGenerator (io.anserini.ltr.WebCollectionLtrDataGenerator)1 FeatureExtractors (io.anserini.ltr.feature.FeatureExtractors)1 Rm3Reranker (io.anserini.rerank.rm3.Rm3Reranker)1 SearchWebCollection (io.anserini.search.SearchWebCollection)1 Qrels (io.anserini.util.Qrels)1 PrintStream (java.io.PrintStream)1 EnglishAnalyzer (org.apache.lucene.analysis.en.EnglishAnalyzer)1 LMDirichletSimilarity (org.apache.lucene.search.similarities.LMDirichletSimilarity)1 Similarity (org.apache.lucene.search.similarities.Similarity)1 Directory (org.apache.lucene.store.Directory)1 FSDirectory (org.apache.lucene.store.FSDirectory)1 MMapDirectory (org.apache.lucene.store.MMapDirectory)1 CmdLineException (org.kohsuke.args4j.CmdLineException)1 CmdLineParser (org.kohsuke.args4j.CmdLineParser)1