Search in sources :

Example 21 with FSDirectory

use of org.apache.lucene.store.FSDirectory in project Anserini by castorini.

the class SearchTweets method main.

public static void main(String[] args) throws Exception {
    long curTime = System.nanoTime();
    SearchArgs searchArgs = new SearchArgs();
    CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: SearchTweets" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    LOG.info("Reading index at " + searchArgs.index);
    Directory dir;
    if (searchArgs.inmem) {
        LOG.info("Using MMapDirectory with preload");
        dir = new MMapDirectory(Paths.get(searchArgs.index));
        ((MMapDirectory) dir).setPreload(true);
    } else {
        LOG.info("Using default FSDirectory");
        dir = FSDirectory.open(Paths.get(searchArgs.index));
    }
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    if (searchArgs.ql) {
        LOG.info("Using QL scoring model");
        searcher.setSimilarity(new LMDirichletSimilarity(searchArgs.mu));
    } else if (searchArgs.bm25) {
        LOG.info("Using BM25 scoring model");
        searcher.setSimilarity(new BM25Similarity(searchArgs.k1, searchArgs.b));
    } else {
        LOG.error("Error: Must specify scoring model!");
        System.exit(-1);
    }
    RerankerCascade cascade = new RerankerCascade();
    if (searchArgs.rm3) {
        cascade.add(new Rm3Reranker(IndexTweets.ANALYZER, StatusField.TEXT.name, "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.twitter.txt"));
        cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
    } else {
        cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
    }
    if (!searchArgs.model.isEmpty() && searchArgs.extractors != null) {
        LOG.debug(String.format("Ranklib model used, modeled loaded from %s", searchArgs.model));
        cascade.add(new RankLibReranker(searchArgs.model, StatusField.TEXT.name, searchArgs.extractors));
    }
    FeatureExtractors extractorChain = null;
    if (searchArgs.extractors != null) {
        extractorChain = FeatureExtractors.loadExtractor(searchArgs.extractors);
    }
    if (searchArgs.dumpFeatures) {
        PrintStream out = new PrintStream(searchArgs.featureFile);
        Qrels qrels = new Qrels(searchArgs.qrels);
        cascade.add(new TweetsLtrDataGenerator(out, qrels, extractorChain));
    }
    MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(searchArgs.topics));
    PrintStream out = new PrintStream(new FileOutputStream(new File(searchArgs.output)));
    LOG.info("Writing output to " + searchArgs.output);
    LOG.info("Initialized complete! (elapsed time = " + (System.nanoTime() - curTime) / 1000000 + "ms)");
    long totalTime = 0;
    int cnt = 0;
    for (MicroblogTopic topic : topics) {
        long curQueryTime = System.nanoTime();
        Query filter = LongPoint.newRangeQuery(StatusField.ID.name, 0L, topic.getQueryTweetTime());
        Query query = AnalyzerUtils.buildBagOfWordsQuery(StatusField.TEXT.name, IndexTweets.ANALYZER, topic.getQuery());
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        builder.add(filter, BooleanClause.Occur.FILTER);
        builder.add(query, BooleanClause.Occur.MUST);
        Query q = builder.build();
        TopDocs rs = searcher.search(q, searchArgs.hits);
        List<String> queryTokens = AnalyzerUtils.tokenize(IndexTweets.ANALYZER, topic.getQuery());
        RerankerContext context = new RerankerContext(searcher, query, topic.getId(), topic.getQuery(), queryTokens, StatusField.TEXT.name, filter);
        ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
        for (int i = 0; i < docs.documents.length; i++) {
            String qid = topic.getId().replaceFirst("^MB0*", "");
            out.println(String.format("%s Q0 %s %d %f %s", qid, docs.documents[i].getField(StatusField.ID.name).numericValue(), (i + 1), docs.scores[i], searchArgs.runtag));
        }
        long qtime = (System.nanoTime() - curQueryTime) / 1000000;
        LOG.info("Query " + topic.getId() + " (elapsed time = " + qtime + "ms)");
        totalTime += qtime;
        cnt++;
    }
    LOG.info("All queries completed!");
    LOG.info("Total elapsed time = " + totalTime + "ms");
    LOG.info("Average query latency = " + (totalTime / cnt) + "ms");
    reader.close();
    out.close();
}
Also used : RemoveRetweetsTemporalTiebreakReranker(io.anserini.rerank.twitter.RemoveRetweetsTemporalTiebreakReranker) ScoredDocuments(io.anserini.rerank.ScoredDocuments) RerankerCascade(io.anserini.rerank.RerankerCascade) Rm3Reranker(io.anserini.rerank.rm3.Rm3Reranker) RankLibReranker(io.anserini.rerank.RankLibReranker) MMapDirectory(org.apache.lucene.store.MMapDirectory) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) PrintStream(java.io.PrintStream) Qrels(io.anserini.util.Qrels) CmdLineParser(org.kohsuke.args4j.CmdLineParser) MMapDirectory(org.apache.lucene.store.MMapDirectory) LongPoint(org.apache.lucene.document.LongPoint) FeatureExtractors(io.anserini.ltr.feature.FeatureExtractors) TweetsLtrDataGenerator(io.anserini.ltr.TweetsLtrDataGenerator) FileOutputStream(java.io.FileOutputStream) IndexReader(org.apache.lucene.index.IndexReader) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) LMDirichletSimilarity(org.apache.lucene.search.similarities.LMDirichletSimilarity) File(java.io.File) CmdLineException(org.kohsuke.args4j.CmdLineException) RerankerContext(io.anserini.rerank.RerankerContext)

Example 22 with FSDirectory

use of org.apache.lucene.store.FSDirectory in project Anserini by castorini.

the class SearchWebCollection method main.

public static void main(String[] args) throws Exception {
    SearchArgs searchArgs = new SearchArgs();
    CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: SearchWebCollection" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    LOG.info("Reading index at " + searchArgs.index);
    Directory dir;
    if (searchArgs.inmem) {
        LOG.info("Using MMapDirectory with preload");
        dir = new MMapDirectory(Paths.get(searchArgs.index));
        ((MMapDirectory) dir).setPreload(true);
    } else {
        LOG.info("Using default FSDirectory");
        dir = FSDirectory.open(Paths.get(searchArgs.index));
    }
    Similarity similarity = null;
    if (searchArgs.ql) {
        LOG.info("Using QL scoring model");
        similarity = new LMDirichletSimilarity(searchArgs.mu);
    } else if (searchArgs.bm25) {
        LOG.info("Using BM25 scoring model");
        similarity = new BM25Similarity(searchArgs.k1, searchArgs.b);
    } else {
        LOG.error("Error: Must specify scoring model!");
        System.exit(-1);
    }
    RerankerCascade cascade = new RerankerCascade();
    boolean useQueryParser = false;
    if (searchArgs.rm3) {
        cascade.add(new Rm3Reranker(new EnglishAnalyzer(), FIELD_BODY, "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.gov2.txt"));
        useQueryParser = true;
    } else {
        cascade.add(new IdentityReranker());
    }
    FeatureExtractors extractors = null;
    if (searchArgs.extractors != null) {
        extractors = FeatureExtractors.loadExtractor(searchArgs.extractors);
    }
    if (searchArgs.dumpFeatures) {
        PrintStream out = new PrintStream(searchArgs.featureFile);
        Qrels qrels = new Qrels(searchArgs.qrels);
        cascade.add(new WebCollectionLtrDataGenerator(out, qrels, extractors));
    }
    Path topicsFile = Paths.get(searchArgs.topics);
    if (!Files.exists(topicsFile) || !Files.isRegularFile(topicsFile) || !Files.isReadable(topicsFile)) {
        throw new IllegalArgumentException("Topics file : " + topicsFile + " does not exist or is not a (readable) file.");
    }
    TopicReader tr = (TopicReader) Class.forName("io.anserini.search.query." + searchArgs.topicReader + "TopicReader").getConstructor(Path.class).newInstance(topicsFile);
    SortedMap<Integer, String> topics = tr.read();
    final long start = System.nanoTime();
    SearchWebCollection searcher = new SearchWebCollection(searchArgs.index);
    searcher.search(topics, searchArgs.output, similarity, searchArgs.hits, cascade, useQueryParser, searchArgs.keepstop);
    searcher.close();
    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    LOG.info("Total " + topics.size() + " topics searched in " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
}
Also used : LMDirichletSimilarity(org.apache.lucene.search.similarities.LMDirichletSimilarity) Similarity(org.apache.lucene.search.similarities.Similarity) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) IdentityReranker(io.anserini.rerank.IdentityReranker) RerankerCascade(io.anserini.rerank.RerankerCascade) TopicReader(io.anserini.search.query.TopicReader) Rm3Reranker(io.anserini.rerank.rm3.Rm3Reranker) WebCollectionLtrDataGenerator(io.anserini.ltr.WebCollectionLtrDataGenerator) MMapDirectory(org.apache.lucene.store.MMapDirectory) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Path(java.nio.file.Path) PrintStream(java.io.PrintStream) Qrels(io.anserini.util.Qrels) CmdLineParser(org.kohsuke.args4j.CmdLineParser) EnglishAnalyzer(org.apache.lucene.analysis.en.EnglishAnalyzer) MMapDirectory(org.apache.lucene.store.MMapDirectory) FeatureExtractors(io.anserini.ltr.feature.FeatureExtractors) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) LMDirichletSimilarity(org.apache.lucene.search.similarities.LMDirichletSimilarity) CmdLineException(org.kohsuke.args4j.CmdLineException)

Example 23 with FSDirectory

use of org.apache.lucene.store.FSDirectory in project Anserini by castorini.

the class DumpDocids method readIndex.

public void readIndex(String indexDir, String docIdPath, String docId) throws IOException, IDNameException {
    FSDirectory dir = FSDirectory.open(new File(indexDir).toPath());
    DirectoryReader reader = DirectoryReader.open(dir);
    FileWriter fw = new FileWriter(new File(docIdPath));
    BufferedWriter bw = new BufferedWriter(fw);
    int len = reader.numDocs();
    if (len > 0) {
        String docName = reader.document(0).get(docId);
        if (docName == null) {
            LOG.info(docId + " is a wrong document ID field name!");
            bw.close();
            throw new IDNameException(docId + " is a wrong document ID field name!");
        }
    } else {
        bw.close();
        throw new IDNameException("No document is in the index!");
    }
    for (int i = 0; i < len; i++) {
        String docName = reader.document(i).get(docId);
        bw.write(docName + "\n");
        if ((i % 100000) == 0) {
            LOG.info("DumpDocids: " + i + " docs got");
        }
    }
    bw.close();
}
Also used : DirectoryReader(org.apache.lucene.index.DirectoryReader) FileWriter(java.io.FileWriter) FSDirectory(org.apache.lucene.store.FSDirectory) File(java.io.File) BufferedWriter(java.io.BufferedWriter)

Example 24 with FSDirectory

use of org.apache.lucene.store.FSDirectory in project elasticsearch by elastic.

the class DirectoryUtilsTests method testGetLeave.

public void testGetLeave() throws IOException {
    Path file = createTempDir();
    final int iters = scaledRandomIntBetween(10, 100);
    for (int i = 0; i < iters; i++) {
        {
            BaseDirectoryWrapper dir = newFSDirectory(file);
            FSDirectory directory = DirectoryUtils.getLeaf(new FilterDirectory(dir) {
            }, FSDirectory.class, null);
            assertThat(directory, notNullValue());
            assertThat(directory, sameInstance(DirectoryUtils.getLeafDirectory(dir, null)));
            dir.close();
        }
        {
            BaseDirectoryWrapper dir = newFSDirectory(file);
            FSDirectory directory = DirectoryUtils.getLeaf(dir, FSDirectory.class, null);
            assertThat(directory, notNullValue());
            assertThat(directory, sameInstance(DirectoryUtils.getLeafDirectory(dir, null)));
            dir.close();
        }
        {
            Set<String> stringSet = Collections.emptySet();
            BaseDirectoryWrapper dir = newFSDirectory(file);
            FSDirectory directory = DirectoryUtils.getLeaf(new FileSwitchDirectory(stringSet, dir, dir, random().nextBoolean()), FSDirectory.class, null);
            assertThat(directory, notNullValue());
            assertThat(directory, sameInstance(DirectoryUtils.getLeafDirectory(dir, null)));
            dir.close();
        }
        {
            Set<String> stringSet = Collections.emptySet();
            BaseDirectoryWrapper dir = newFSDirectory(file);
            FSDirectory directory = DirectoryUtils.getLeaf(new FilterDirectory(new FileSwitchDirectory(stringSet, dir, dir, random().nextBoolean())) {
            }, FSDirectory.class, null);
            assertThat(directory, notNullValue());
            assertThat(directory, sameInstance(DirectoryUtils.getLeafDirectory(dir, null)));
            dir.close();
        }
        {
            Set<String> stringSet = Collections.emptySet();
            BaseDirectoryWrapper dir = newFSDirectory(file);
            RAMDirectory directory = DirectoryUtils.getLeaf(new FilterDirectory(new FileSwitchDirectory(stringSet, dir, dir, random().nextBoolean())) {
            }, RAMDirectory.class, null);
            assertThat(directory, nullValue());
            dir.close();
        }
    }
}
Also used : Path(java.nio.file.Path) Set(java.util.Set) FilterDirectory(org.apache.lucene.store.FilterDirectory) BaseDirectoryWrapper(org.apache.lucene.store.BaseDirectoryWrapper) FileSwitchDirectory(org.apache.lucene.store.FileSwitchDirectory) FSDirectory(org.apache.lucene.store.FSDirectory) RAMDirectory(org.apache.lucene.store.RAMDirectory)

Example 25 with FSDirectory

use of org.apache.lucene.store.FSDirectory in project languagetool by languagetool-org.

the class FrequencyIndexCreatorTest method testReadPerformance.

@Test
@Ignore("Interactive use only")
public void testReadPerformance() throws IOException {
    try (FSDirectory directory = FSDirectory.open(INDEX_DIR.toPath())) {
        DirectoryReader reader = DirectoryReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);
        try (Scanner scanner = new Scanner(new File("/lt/performance-test/en.txt"))) {
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine();
                String[] parts = line.split(" ");
                accessNgrams(parts, searcher);
            }
        }
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Scanner(java.util.Scanner) DirectoryReader(org.apache.lucene.index.DirectoryReader) FSDirectory(org.apache.lucene.store.FSDirectory) File(java.io.File) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

FSDirectory (org.apache.lucene.store.FSDirectory)43 File (java.io.File)18 Directory (org.apache.lucene.store.Directory)12 IOException (java.io.IOException)10 Path (java.nio.file.Path)10 IndexSearcher (org.apache.lucene.search.IndexSearcher)9 FileNotFoundException (java.io.FileNotFoundException)5 FileSystem (java.nio.file.FileSystem)5 Document (org.apache.lucene.document.Document)5 IndexReader (org.apache.lucene.index.IndexReader)5 MMapDirectory (org.apache.lucene.store.MMapDirectory)5 NIOFSDirectory (org.apache.lucene.store.NIOFSDirectory)5 FilterDirectory (org.apache.lucene.store.FilterDirectory)4 SimpleFSDirectory (org.apache.lucene.store.SimpleFSDirectory)4 PrintStream (java.io.PrintStream)3 ArrayList (java.util.ArrayList)3 DirectoryReader (org.apache.lucene.index.DirectoryReader)3 Term (org.apache.lucene.index.Term)3 WindowsFS (org.apache.lucene.mockfile.WindowsFS)3 TermQuery (org.apache.lucene.search.TermQuery)3