Search in sources :

Example 6 with MMapDirectory

use of org.apache.lucene.store.MMapDirectory in project lucene-solr by apache.

the class BaseStoredFieldsFormatTestCase method testBigDocuments.

/**
 * Indexes a random mix of three document templates (an empty one, one carrying a very large
 * number of small stored fields, and one carrying a single very large stored field),
 * force-merges the index to one segment, and then checks that every document can be found by
 * its id with the expected number of "fld" values and the expected first value.
 *
 * @throws IOException if the underlying directory, writer or reader fails
 */
@Nightly
public void testBigDocuments() throws IOException {
    assumeWorkingMMapOnWindows();
    // "big" means "much bigger than the chunk size".
    // A file-system directory is forced for this test: newFSDirectory cannot be used because
    // the test doesn't really index anything, so NRTCachingDir+SimpleText would build massive
    // stored fields and OOM (LUCENE-4484).
    final MockDirectoryWrapper directory = new MockDirectoryWrapper(random(), new MMapDirectory(createTempDir("testBigDocuments")));
    final IndexWriterConfig config = newIndexWriterConfig(new MockAnalyzer(random()));
    config.setMaxBufferedDocs(RandomNumbers.randomIntBetween(random(), 2, 30));
    final RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);
    directory.setThrottling(Throttling.NEVER);

    // One id field instance is shared by all three templates; its value is reset before each add.
    final Field id = new StringField("id", "", Store.NO);
    // template without any "fld" value
    final Document empty = new Document();
    empty.add(id);
    // template with a lot of small fields
    final Document manySmallFields = new Document();
    manySmallFields.add(id);
    // template with 1 very big field
    final Document oneHugeField = new Document();
    oneHugeField.add(id);

    final FieldType storedOnly = new FieldType(StringField.TYPE_STORED);
    storedOnly.setIndexOptions(IndexOptions.NONE);

    final Field small = new Field("fld", randomByteArray(random().nextInt(10), 256), storedOnly);
    final int smallFieldCount = RandomNumbers.randomIntBetween(random(), 500000, 1000000);
    for (int remaining = smallFieldCount; remaining > 0; --remaining) {
        manySmallFields.add(small);
    }
    final int hugeLength = RandomNumbers.randomIntBetween(random(), 1000000, 5000000);
    oneHugeField.add(new Field("fld", randomByteArray(hugeLength, 2), storedOnly));

    // Choose all templates first, then index them; the two loops are kept separate on purpose.
    final int docCount = atLeast(5);
    final Document[] picked = new Document[docCount];
    for (int slot = 0; slot < docCount; ++slot) {
        picked[slot] = RandomPicks.randomFrom(random(), Arrays.asList(empty, manySmallFields, oneHugeField));
    }
    for (int slot = 0; slot < docCount; ++slot) {
        id.setStringValue(Integer.toString(slot));
        writer.addDocument(picked[slot]);
        if (random().nextInt(docCount) == 0) {
            writer.commit();
        }
    }
    writer.commit();
    // look at what happens when big docs are merged
    writer.forceMerge(1);

    final DirectoryReader reader = DirectoryReader.open(directory);
    final IndexSearcher searcher = new IndexSearcher(reader);
    for (int slot = 0; slot < docCount; ++slot) {
        final String idValue = Integer.toString(slot);
        final TopDocs hits = searcher.search(new TermQuery(new Term("id", idValue)), 1);
        assertEquals(idValue, 1, hits.totalHits);
        final Document stored = reader.document(hits.scoreDocs[0].doc);
        assertNotNull(stored);
        final IndexableField[] expected = picked[slot].getFields("fld");
        final IndexableField[] actual = stored.getFields("fld");
        assertEquals(expected.length, actual.length);
        if (actual.length > 0) {
            assertEquals(expected[0].binaryValue(), actual[0].binaryValue());
        }
    }
    reader.close();
    writer.close();
    directory.close();
}
Also used : MockDirectoryWrapper(org.apache.lucene.store.MockDirectoryWrapper) IndexSearcher(org.apache.lucene.search.IndexSearcher) TermQuery(org.apache.lucene.search.TermQuery) Query(org.apache.lucene.search.Query) TermQuery(org.apache.lucene.search.TermQuery) Document(org.apache.lucene.document.Document) MMapDirectory(org.apache.lucene.store.MMapDirectory) IntPoint(org.apache.lucene.document.IntPoint) FieldType(org.apache.lucene.document.FieldType) TopDocs(org.apache.lucene.search.TopDocs) StringField(org.apache.lucene.document.StringField) StoredField(org.apache.lucene.document.StoredField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) StringField(org.apache.lucene.document.StringField) MMapDirectory(org.apache.lucene.store.MMapDirectory) Directory(org.apache.lucene.store.Directory)

Example 7 with MMapDirectory

use of org.apache.lucene.store.MMapDirectory in project Anserini by castorini.

the class SearchTweets method main.

/**
 * Command-line entry point: parses the search arguments, opens the index, configures the
 * scoring model and reranker cascade, runs every topic from the topics file against the
 * index, and writes one result line per returned document to the output file.
 *
 * @param args command-line arguments, parsed into a {@code SearchArgs} instance
 * @throws Exception if the index, topics file, feature file or output file cannot be
 *     opened, or if searching or reranking fails
 */
public static void main(String[] args) throws Exception {
    long curTime = System.nanoTime();
    SearchArgs searchArgs = new SearchArgs();
    CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        // Invalid arguments: print the problem plus usage and stop without touching the index.
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: SearchTweets" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    LOG.info("Reading index at " + searchArgs.index);
    Directory dir;
    if (searchArgs.inmem) {
        LOG.info("Using MMapDirectory with preload");
        dir = new MMapDirectory(Paths.get(searchArgs.index));
        ((MMapDirectory) dir).setPreload(true);
    } else {
        LOG.info("Using default FSDirectory");
        dir = FSDirectory.open(Paths.get(searchArgs.index));
    }
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    if (searchArgs.ql) {
        LOG.info("Using QL scoring model");
        searcher.setSimilarity(new LMDirichletSimilarity(searchArgs.mu));
    } else if (searchArgs.bm25) {
        LOG.info("Using BM25 scoring model");
        searcher.setSimilarity(new BM25Similarity(searchArgs.k1, searchArgs.b));
    } else {
        LOG.error("Error: Must specify scoring model!");
        System.exit(-1);
    }
    RerankerCascade cascade = new RerankerCascade();
    if (searchArgs.rm3) {
        cascade.add(new Rm3Reranker(IndexTweets.ANALYZER, StatusField.TEXT.name, "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.twitter.txt"));
        cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
    } else {
        cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
    }
    if (!searchArgs.model.isEmpty() && searchArgs.extractors != null) {
        LOG.debug(String.format("Ranklib model used, modeled loaded from %s", searchArgs.model));
        cascade.add(new RankLibReranker(searchArgs.model, StatusField.TEXT.name, searchArgs.extractors));
    }
    FeatureExtractors extractorChain = null;
    if (searchArgs.extractors != null) {
        extractorChain = FeatureExtractors.loadExtractor(searchArgs.extractors);
    }
    if (searchArgs.dumpFeatures) {
        // NOTE(review): this stream is handed to the data generator and is not closed here;
        // confirm whether TweetsLtrDataGenerator takes ownership of it.
        PrintStream out = new PrintStream(searchArgs.featureFile);
        Qrels qrels = new Qrels(searchArgs.qrels);
        cascade.add(new TweetsLtrDataGenerator(out, qrels, extractorChain));
    }
    MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(searchArgs.topics));
    PrintStream out = new PrintStream(new FileOutputStream(new File(searchArgs.output)));
    LOG.info("Writing output to " + searchArgs.output);
    LOG.info("Initialized complete! (elapsed time = " + (System.nanoTime() - curTime) / 1000000 + "ms)");
    long totalTime = 0;
    int cnt = 0;
    for (MicroblogTopic topic : topics) {
        long curQueryTime = System.nanoTime();
        // Restrict candidates to ids in [0, query tweet time], then require the bag-of-words match.
        Query filter = LongPoint.newRangeQuery(StatusField.ID.name, 0L, topic.getQueryTweetTime());
        Query query = AnalyzerUtils.buildBagOfWordsQuery(StatusField.TEXT.name, IndexTweets.ANALYZER, topic.getQuery());
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        builder.add(filter, BooleanClause.Occur.FILTER);
        builder.add(query, BooleanClause.Occur.MUST);
        Query q = builder.build();
        TopDocs rs = searcher.search(q, searchArgs.hits);
        List<String> queryTokens = AnalyzerUtils.tokenize(IndexTweets.ANALYZER, topic.getQuery());
        RerankerContext context = new RerankerContext(searcher, query, topic.getId(), topic.getQuery(), queryTokens, StatusField.TEXT.name, filter);
        ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
        for (int i = 0; i < docs.documents.length; i++) {
            // The topic id is written without its "MB" prefix and leading zeros.
            String qid = topic.getId().replaceFirst("^MB0*", "");
            out.println(String.format("%s Q0 %s %d %f %s", qid, docs.documents[i].getField(StatusField.ID.name).numericValue(), (i + 1), docs.scores[i], searchArgs.runtag));
        }
        long qtime = (System.nanoTime() - curQueryTime) / 1000000;
        LOG.info("Query " + topic.getId() + " (elapsed time = " + qtime + "ms)");
        totalTime += qtime;
        cnt++;
    }
    LOG.info("All queries completed!");
    LOG.info("Total elapsed time = " + totalTime + "ms");
    // Guard the average: an empty topic set would otherwise throw ArithmeticException
    // (integer division by zero) and skip the close() calls below.
    long averageLatency = (cnt == 0) ? 0 : totalTime / cnt;
    LOG.info("Average query latency = " + averageLatency + "ms");
    reader.close();
    // The directory was opened by this method, so it is released here as well.
    dir.close();
    out.close();
}
Also used : RemoveRetweetsTemporalTiebreakReranker(io.anserini.rerank.twitter.RemoveRetweetsTemporalTiebreakReranker) ScoredDocuments(io.anserini.rerank.ScoredDocuments) RerankerCascade(io.anserini.rerank.RerankerCascade) Rm3Reranker(io.anserini.rerank.rm3.Rm3Reranker) RankLibReranker(io.anserini.rerank.RankLibReranker) MMapDirectory(org.apache.lucene.store.MMapDirectory) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) PrintStream(java.io.PrintStream) Qrels(io.anserini.util.Qrels) CmdLineParser(org.kohsuke.args4j.CmdLineParser) MMapDirectory(org.apache.lucene.store.MMapDirectory) LongPoint(org.apache.lucene.document.LongPoint) FeatureExtractors(io.anserini.ltr.feature.FeatureExtractors) TweetsLtrDataGenerator(io.anserini.ltr.TweetsLtrDataGenerator) FileOutputStream(java.io.FileOutputStream) IndexReader(org.apache.lucene.index.IndexReader) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) LMDirichletSimilarity(org.apache.lucene.search.similarities.LMDirichletSimilarity) File(java.io.File) CmdLineException(org.kohsuke.args4j.CmdLineException) RerankerContext(io.anserini.rerank.RerankerContext)

Example 8 with MMapDirectory

use of org.apache.lucene.store.MMapDirectory in project Anserini by castorini.

the class SearchWebCollection method main.

/**
 * Command-line entry point: parses the search arguments, selects the similarity and the
 * reranker cascade, loads the topics with the reflectively chosen topic reader, and runs
 * all topics through a {@code SearchWebCollection} instance opened on the index path.
 *
 * @param args command-line arguments, parsed into a {@code SearchArgs} instance
 * @throws IllegalArgumentException if the topics file is missing, not a regular file or
 *     not readable
 * @throws Exception if the index or feature file cannot be opened, the topic reader class
 *     cannot be instantiated, or searching fails
 */
public static void main(String[] args) throws Exception {
    SearchArgs searchArgs = new SearchArgs();
    CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        // Invalid arguments: print the problem plus usage and stop without touching the index.
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: SearchWebCollection" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    LOG.info("Reading index at " + searchArgs.index);
    // NOTE(review): this directory is opened but nothing below reads from it; the searcher is
    // constructed from the index path instead. It is kept (and now closed at the end) so that
    // any side effects of opening it are preserved.
    Directory dir;
    if (searchArgs.inmem) {
        LOG.info("Using MMapDirectory with preload");
        dir = new MMapDirectory(Paths.get(searchArgs.index));
        ((MMapDirectory) dir).setPreload(true);
    } else {
        LOG.info("Using default FSDirectory");
        dir = FSDirectory.open(Paths.get(searchArgs.index));
    }
    Similarity similarity = null;
    if (searchArgs.ql) {
        LOG.info("Using QL scoring model");
        similarity = new LMDirichletSimilarity(searchArgs.mu);
    } else if (searchArgs.bm25) {
        LOG.info("Using BM25 scoring model");
        similarity = new BM25Similarity(searchArgs.k1, searchArgs.b);
    } else {
        LOG.error("Error: Must specify scoring model!");
        System.exit(-1);
    }
    RerankerCascade cascade = new RerankerCascade();
    boolean useQueryParser = false;
    if (searchArgs.rm3) {
        cascade.add(new Rm3Reranker(new EnglishAnalyzer(), FIELD_BODY, "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.gov2.txt"));
        useQueryParser = true;
    } else {
        cascade.add(new IdentityReranker());
    }
    FeatureExtractors extractors = null;
    if (searchArgs.extractors != null) {
        extractors = FeatureExtractors.loadExtractor(searchArgs.extractors);
    }
    if (searchArgs.dumpFeatures) {
        PrintStream out = new PrintStream(searchArgs.featureFile);
        Qrels qrels = new Qrels(searchArgs.qrels);
        cascade.add(new WebCollectionLtrDataGenerator(out, qrels, extractors));
    }
    Path topicsFile = Paths.get(searchArgs.topics);
    if (!Files.exists(topicsFile) || !Files.isRegularFile(topicsFile) || !Files.isReadable(topicsFile)) {
        throw new IllegalArgumentException("Topics file : " + topicsFile + " does not exist or is not a (readable) file.");
    }
    // The reader class is resolved by name: io.anserini.search.query.<topicReader>TopicReader.
    TopicReader tr = (TopicReader) Class.forName("io.anserini.search.query." + searchArgs.topicReader + "TopicReader").getConstructor(Path.class).newInstance(topicsFile);
    SortedMap<Integer, String> topics = tr.read();
    final long start = System.nanoTime();
    SearchWebCollection searcher = new SearchWebCollection(searchArgs.index);
    searcher.search(topics, searchArgs.output, similarity, searchArgs.hits, cascade, useQueryParser, searchArgs.keepstop);
    searcher.close();
    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    LOG.info("Total " + topics.size() + " topics searched in " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
    // Release the directory opened at the top of this method; it was previously never closed.
    dir.close();
}
Also used : LMDirichletSimilarity(org.apache.lucene.search.similarities.LMDirichletSimilarity) Similarity(org.apache.lucene.search.similarities.Similarity) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) IdentityReranker(io.anserini.rerank.IdentityReranker) RerankerCascade(io.anserini.rerank.RerankerCascade) TopicReader(io.anserini.search.query.TopicReader) Rm3Reranker(io.anserini.rerank.rm3.Rm3Reranker) WebCollectionLtrDataGenerator(io.anserini.ltr.WebCollectionLtrDataGenerator) MMapDirectory(org.apache.lucene.store.MMapDirectory) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Path(java.nio.file.Path) PrintStream(java.io.PrintStream) Qrels(io.anserini.util.Qrels) CmdLineParser(org.kohsuke.args4j.CmdLineParser) EnglishAnalyzer(org.apache.lucene.analysis.en.EnglishAnalyzer) MMapDirectory(org.apache.lucene.store.MMapDirectory) FeatureExtractors(io.anserini.ltr.feature.FeatureExtractors) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) LMDirichletSimilarity(org.apache.lucene.search.similarities.LMDirichletSimilarity) CmdLineException(org.kohsuke.args4j.CmdLineException)

Example 9 with MMapDirectory

use of org.apache.lucene.store.MMapDirectory in project elasticsearch by elastic.

the class FsDirectoryServiceTests method doTestPreload.

/**
 * Creates an "mmapfs" index store configured with the given pre-load setting values and
 * asserts on the type and preload flag of the directory produced by
 * {@code FsDirectoryService.newDirectory()}.
 *
 * @param preload values placed under the index store pre-load setting; may be empty
 * @throws IOException if the shard directory cannot be created or the directory cannot be opened
 */
private void doTestPreload(String... preload) throws IOException {
    Settings.Builder settingsBuilder = Settings.builder();
    settingsBuilder = settingsBuilder.put(IndexModule.INDEX_STORE_TYPE_SETTING.getKey(), "mmapfs");
    settingsBuilder = settingsBuilder.putArray(IndexModule.INDEX_STORE_PRE_LOAD_SETTING.getKey(), preload);
    final Settings storeSettings = settingsBuilder.build();

    final IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("foo", storeSettings);
    final IndexStore indexStore = new IndexStore(indexSettings);

    // Shard data lives under <tmp>/<index uuid>/0.
    final Path shardDir = createTempDir().resolve(indexSettings.getUUID()).resolve("0");
    Files.createDirectories(shardDir);
    final ShardId shardId = new ShardId(indexSettings.getIndex(), 0);
    final ShardPath shardPath = new ShardPath(false, shardDir, shardDir, shardId);

    final FsDirectoryService service = new FsDirectoryService(indexSettings, indexStore, shardPath);
    final Directory created = service.newDirectory();
    assertFalse(created instanceof SleepingLockWrapper);

    // Nothing to pre-load: a plain MMapDirectory with preload switched off.
    if (preload.length == 0) {
        assertTrue(created.toString(), created instanceof MMapDirectory);
        assertFalse(((MMapDirectory) created).getPreload());
        return;
    }

    // Wildcard: a plain MMapDirectory with preload switched on.
    if (Arrays.asList(preload).contains("*")) {
        assertTrue(created.toString(), created instanceof MMapDirectory);
        assertTrue(((MMapDirectory) created).getPreload());
        return;
    }

    // Anything else: a FileSwitchDirectory whose primary side preloads and whose secondary
    // side does not.
    assertTrue(created.toString(), created instanceof FileSwitchDirectory);
    final FileSwitchDirectory switching = (FileSwitchDirectory) created;
    assertTrue(switching.getPrimaryDir() instanceof MMapDirectory);
    assertTrue(((MMapDirectory) switching.getPrimaryDir()).getPreload());
    assertTrue(switching.getSecondaryDir() instanceof MMapDirectory);
    assertFalse(((MMapDirectory) switching.getSecondaryDir()).getPreload());
}
Also used : ShardPath(org.elasticsearch.index.shard.ShardPath) Path(java.nio.file.Path) ShardId(org.elasticsearch.index.shard.ShardId) ShardPath(org.elasticsearch.index.shard.ShardPath) IndexSettings(org.elasticsearch.index.IndexSettings) SleepingLockWrapper(org.apache.lucene.store.SleepingLockWrapper) FileSwitchDirectory(org.apache.lucene.store.FileSwitchDirectory) MMapDirectory(org.apache.lucene.store.MMapDirectory) Settings(org.elasticsearch.common.settings.Settings) IndexSettings(org.elasticsearch.index.IndexSettings) SimpleFSDirectory(org.apache.lucene.store.SimpleFSDirectory) MMapDirectory(org.apache.lucene.store.MMapDirectory) Directory(org.apache.lucene.store.Directory) FileSwitchDirectory(org.apache.lucene.store.FileSwitchDirectory)

Example 10 with MMapDirectory

use of org.apache.lucene.store.MMapDirectory in project lucene-solr by apache.

the class TestIndexWriter method testDeleteUnusedFiles.

/**
 * Checks that a compound segment file that was merged away stays on disk while a reader
 * still has it open, and is gone once that reader is closed.
 *
 * The body runs twice: iteration 0 obtains the reader from the writer (NRT), iteration 1
 * commits and opens the reader from the directory. The index path is wrapped in a
 * {@code WindowsFS} mock file system so the test can rely on Windows file semantics.
 *
 * @throws Exception if any index or file-system operation fails
 */
public void testDeleteUnusedFiles() throws Exception {
    // The assertions below check concrete file names such as "_0.cfs", so skip codecs that
    // use other names.
    assumeFalse("test relies on exact filenames", Codec.getDefault() instanceof SimpleTextCodec);
    assumeWorkingMMapOnWindows();
    for (int iter = 0; iter < 2; iter++) {
        // relies on windows semantics
        Path path = createTempDir();
        FileSystem fs = new WindowsFS(path.getFileSystem()).getFileSystem(URI.create("file:///"));
        Path indexPath = new FilterPath(path, fs);
        // NOTE: on Unix, we cannot use MMapDir, because WindowsFS doesn't see/think it keeps file handles open.  Yet, on Windows, we MUST use
        // MMapDir because the windows OS will in fact prevent file deletion for us, and fails otherwise:
        FSDirectory dir;
        if (Constants.WINDOWS) {
            dir = new MMapDirectory(indexPath);
        } else {
            dir = new NIOFSDirectory(indexPath);
        }
        MergePolicy mergePolicy = newLogMergePolicy(true);
        // This test expects all of its segments to be in CFS
        mergePolicy.setNoCFSRatio(1.0);
        mergePolicy.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
        IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(mergePolicy).setUseCompoundFile(true));
        Document doc = new Document();
        doc.add(newTextField("field", "go", Field.Store.NO));
        w.addDocument(doc);
        // Open the first reader; it is kept open across the merge below.
        DirectoryReader r;
        if (iter == 0) {
            // use NRT
            r = w.getReader();
        } else {
            // don't use NRT
            w.commit();
            r = DirectoryReader.open(dir);
        }
        // The first segment must exist as a compound file in both modes.
        assertTrue(Files.exists(indexPath.resolve("_0.cfs")));
        assertTrue(Files.exists(indexPath.resolve("_0.cfe")));
        assertTrue(Files.exists(indexPath.resolve("_0.si")));
        if (iter == 1) {
            // we run a full commit so there should be a segments file etc.
            assertTrue(Files.exists(indexPath.resolve("segments_1")));
        } else {
            // this is an NRT reopen - no segments files yet
            assertFalse(Files.exists(indexPath.resolve("segments_1")));
        }
        // Add a second document and merge down to one segment, so segment _0 is merged away.
        w.addDocument(doc);
        w.forceMerge(1);
        if (iter == 1) {
            w.commit();
        }
        // A changed index must yield a new reader instance distinct from r.
        IndexReader r2 = DirectoryReader.openIfChanged(r);
        assertNotNull(r2);
        assertTrue(r != r2);
        // NOTE: here we rely on "Windows" behavior, ie, even
        // though IW wanted to delete _0.cfs since it was
        // merged away, because we have a reader open
        // against this file, it should still be here:
        assertTrue(Files.exists(indexPath.resolve("_0.cfs")));
        // forceMerge created this
        //assertTrue(files.contains("_2.cfs"));
        w.deleteUnusedFiles();
        // r still holds this file open
        assertTrue(Files.exists(indexPath.resolve("_0.cfs")));
        //assertTrue(files.contains("_2.cfs"));
        r.close();
        if (iter == 0) {
            // on closing NRT reader, it calls writer.deleteUnusedFiles
            assertFalse(Files.exists(indexPath.resolve("_0.cfs")));
        } else {
            // now FSDir can remove it
            dir.deletePendingFiles();
            assertFalse(Files.exists(indexPath.resolve("_0.cfs")));
        }
        // Release the writer, the second reader and the directory before the next iteration.
        w.close();
        r2.close();
        dir.close();
    }
}
Also used : FilterPath(org.apache.lucene.mockfile.FilterPath) Path(java.nio.file.Path) NIOFSDirectory(org.apache.lucene.store.NIOFSDirectory) FilterPath(org.apache.lucene.mockfile.FilterPath) SimpleTextCodec(org.apache.lucene.codecs.simpletext.SimpleTextCodec) FSDirectory(org.apache.lucene.store.FSDirectory) SimpleFSDirectory(org.apache.lucene.store.SimpleFSDirectory) NIOFSDirectory(org.apache.lucene.store.NIOFSDirectory) Document(org.apache.lucene.document.Document) MMapDirectory(org.apache.lucene.store.MMapDirectory) WindowsFS(org.apache.lucene.mockfile.WindowsFS) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) FileSystem(java.nio.file.FileSystem)

Aggregations

MMapDirectory (org.apache.lucene.store.MMapDirectory)13 Directory (org.apache.lucene.store.Directory)7 Path (java.nio.file.Path)4 FSDirectory (org.apache.lucene.store.FSDirectory)4 NIOFSDirectory (org.apache.lucene.store.NIOFSDirectory)4 SimpleFSDirectory (org.apache.lucene.store.SimpleFSDirectory)4 File (java.io.File)3 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)3 Document (org.apache.lucene.document.Document)3 FeatureExtractors (io.anserini.ltr.feature.FeatureExtractors)2 RerankerCascade (io.anserini.rerank.RerankerCascade)2 Rm3Reranker (io.anserini.rerank.rm3.Rm3Reranker)2 Qrels (io.anserini.util.Qrels)2 PrintStream (java.io.PrintStream)2 Field (org.apache.lucene.document.Field)2 FieldType (org.apache.lucene.document.FieldType)2 BM25Similarity (org.apache.lucene.search.similarities.BM25Similarity)2 LMDirichletSimilarity (org.apache.lucene.search.similarities.LMDirichletSimilarity)2 MockDirectoryWrapper (org.apache.lucene.store.MockDirectoryWrapper)2 CmdLineException (org.kohsuke.args4j.CmdLineException)2