Use of org.apache.lucene.store.MMapDirectory in the project lucene-solr by apache.
From the class BaseStoredFieldsFormatTestCase, method testBigDocuments:
@Nightly
public void testBigDocuments() throws IOException {
  assumeWorkingMMapOnWindows();
  // "big" as "much bigger than the chunk size"
  // for this test we force a FS dir
  // we can't just use newFSDirectory, because this test doesn't really index anything.
  // so if we get NRTCachingDir+SimpleText, we make massive stored fields and OOM (LUCENE-4484)
  Directory dir = new MockDirectoryWrapper(random(), new MMapDirectory(createTempDir("testBigDocuments")));
  IndexWriterConfig iwConf = newIndexWriterConfig(new MockAnalyzer(random()));
  iwConf.setMaxBufferedDocs(RandomNumbers.randomIntBetween(random(), 2, 30));
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConf);
  if (dir instanceof MockDirectoryWrapper) {
    ((MockDirectoryWrapper) dir).setThrottling(Throttling.NEVER);
  }
  // empty doc
  final Document emptyDoc = new Document();
  // lots of small fields
  final Document bigDoc1 = new Document();
  // 1 very big field
  final Document bigDoc2 = new Document();
  final Field idField = new StringField("id", "", Store.NO);
  emptyDoc.add(idField);
  bigDoc1.add(idField);
  bigDoc2.add(idField);
  final FieldType onlyStored = new FieldType(StringField.TYPE_STORED);
  onlyStored.setIndexOptions(IndexOptions.NONE);
  final Field smallField = new Field("fld", randomByteArray(random().nextInt(10), 256), onlyStored);
  final int numFields = RandomNumbers.randomIntBetween(random(), 500000, 1000000);
  for (int i = 0; i < numFields; ++i) {
    bigDoc1.add(smallField);
  }
  final Field bigField = new Field("fld", randomByteArray(RandomNumbers.randomIntBetween(random(), 1000000, 5000000), 2), onlyStored);
  bigDoc2.add(bigField);
  final int numDocs = atLeast(5);
  final Document[] docs = new Document[numDocs];
  for (int i = 0; i < numDocs; ++i) {
    docs[i] = RandomPicks.randomFrom(random(), Arrays.asList(emptyDoc, bigDoc1, bigDoc2));
  }
  for (int i = 0; i < numDocs; ++i) {
    idField.setStringValue("" + i);
    iw.addDocument(docs[i]);
    if (random().nextInt(numDocs) == 0) {
      iw.commit();
    }
  }
  iw.commit();
  // look at what happens when big docs are merged
  iw.forceMerge(1);
  final DirectoryReader rd = DirectoryReader.open(dir);
  final IndexSearcher searcher = new IndexSearcher(rd);
  for (int i = 0; i < numDocs; ++i) {
    final Query query = new TermQuery(new Term("id", "" + i));
    final TopDocs topDocs = searcher.search(query, 1);
    assertEquals("" + i, 1, topDocs.totalHits);
    final Document doc = rd.document(topDocs.scoreDocs[0].doc);
    assertNotNull(doc);
    final IndexableField[] fieldValues = doc.getFields("fld");
    assertEquals(docs[i].getFields("fld").length, fieldValues.length);
    if (fieldValues.length > 0) {
      assertEquals(docs[i].getFields("fld")[0].binaryValue(), fieldValues[0].binaryValue());
    }
  }
  rd.close();
  iw.close();
  dir.close();
}
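A minimal standalone sketch of the same stored-fields-over-MMapDirectory pattern, outside the test framework (the path, analyzer, and field names here are illustrative assumptions, not taken from the test):

// Sketch only: store one document with a binary stored field in an
// MMapDirectory-backed index and read it back. Assumes the usual
// org.apache.lucene.* imports and an analyzer such as StandardAnalyzer.
Path indexDir = Files.createTempDirectory("mmap-stored-fields");
try (Directory dir = new MMapDirectory(indexDir);
     IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
  Document doc = new Document();
  doc.add(new StringField("id", "1", Field.Store.NO));
  doc.add(new StoredField("payload", new byte[1 << 20])); // 1 MB stored blob
  writer.addDocument(doc);
  writer.commit();
  try (DirectoryReader reader = DirectoryReader.open(dir)) {
    Document stored = reader.document(0);
    System.out.println(stored.getBinaryValue("payload").length); // 1048576
  }
}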
Use of org.apache.lucene.store.MMapDirectory in the project Anserini by castorini.
From the class SearchTweets, method main:
public static void main(String[] args) throws Exception {
  long curTime = System.nanoTime();
  SearchArgs searchArgs = new SearchArgs();
  CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90));
  try {
    parser.parseArgument(args);
  } catch (CmdLineException e) {
    System.err.println(e.getMessage());
    parser.printUsage(System.err);
    System.err.println("Example: SearchTweets" + parser.printExample(OptionHandlerFilter.REQUIRED));
    return;
  }
  LOG.info("Reading index at " + searchArgs.index);
  Directory dir;
  if (searchArgs.inmem) {
    LOG.info("Using MMapDirectory with preload");
    dir = new MMapDirectory(Paths.get(searchArgs.index));
    ((MMapDirectory) dir).setPreload(true);
  } else {
    LOG.info("Using default FSDirectory");
    dir = FSDirectory.open(Paths.get(searchArgs.index));
  }
  IndexReader reader = DirectoryReader.open(dir);
  IndexSearcher searcher = new IndexSearcher(reader);
  if (searchArgs.ql) {
    LOG.info("Using QL scoring model");
    searcher.setSimilarity(new LMDirichletSimilarity(searchArgs.mu));
  } else if (searchArgs.bm25) {
    LOG.info("Using BM25 scoring model");
    searcher.setSimilarity(new BM25Similarity(searchArgs.k1, searchArgs.b));
  } else {
    LOG.error("Error: Must specify scoring model!");
    System.exit(-1);
  }
  RerankerCascade cascade = new RerankerCascade();
  if (searchArgs.rm3) {
    cascade.add(new Rm3Reranker(IndexTweets.ANALYZER, StatusField.TEXT.name, "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.twitter.txt"));
    cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
  } else {
    cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
  }
  if (!searchArgs.model.isEmpty() && searchArgs.extractors != null) {
    LOG.debug(String.format("RankLib model used, model loaded from %s", searchArgs.model));
    cascade.add(new RankLibReranker(searchArgs.model, StatusField.TEXT.name, searchArgs.extractors));
  }
  FeatureExtractors extractorChain = null;
  if (searchArgs.extractors != null) {
    extractorChain = FeatureExtractors.loadExtractor(searchArgs.extractors);
  }
  if (searchArgs.dumpFeatures) {
    PrintStream out = new PrintStream(searchArgs.featureFile);
    Qrels qrels = new Qrels(searchArgs.qrels);
    cascade.add(new TweetsLtrDataGenerator(out, qrels, extractorChain));
  }
  MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(searchArgs.topics));
  PrintStream out = new PrintStream(new FileOutputStream(new File(searchArgs.output)));
  LOG.info("Writing output to " + searchArgs.output);
  LOG.info("Initialization complete! (elapsed time = " + (System.nanoTime() - curTime) / 1000000 + "ms)");
  long totalTime = 0;
  int cnt = 0;
  for (MicroblogTopic topic : topics) {
    long curQueryTime = System.nanoTime();
    Query filter = LongPoint.newRangeQuery(StatusField.ID.name, 0L, topic.getQueryTweetTime());
    Query query = AnalyzerUtils.buildBagOfWordsQuery(StatusField.TEXT.name, IndexTweets.ANALYZER, topic.getQuery());
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(filter, BooleanClause.Occur.FILTER);
    builder.add(query, BooleanClause.Occur.MUST);
    Query q = builder.build();
    TopDocs rs = searcher.search(q, searchArgs.hits);
    List<String> queryTokens = AnalyzerUtils.tokenize(IndexTweets.ANALYZER, topic.getQuery());
    RerankerContext context = new RerankerContext(searcher, query, topic.getId(), topic.getQuery(), queryTokens, StatusField.TEXT.name, filter);
    ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
    for (int i = 0; i < docs.documents.length; i++) {
      String qid = topic.getId().replaceFirst("^MB0*", "");
      out.println(String.format("%s Q0 %s %d %f %s", qid, docs.documents[i].getField(StatusField.ID.name).numericValue(), (i + 1), docs.scores[i], searchArgs.runtag));
    }
    long qtime = (System.nanoTime() - curQueryTime) / 1000000;
    LOG.info("Query " + topic.getId() + " (elapsed time = " + qtime + "ms)");
    totalTime += qtime;
    cnt++;
  }
  LOG.info("All queries completed!");
  LOG.info("Total elapsed time = " + totalTime + "ms");
  LOG.info("Average query latency = " + (totalTime / cnt) + "ms");
  reader.close();
  out.close();
}
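The MMapDirectory-specific part of this tool is the optional preload flag; a minimal sketch of just that decision, assuming the index path is already known (the path below is a placeholder):

// Sketch of the directory selection shown above.
Path indexPath = Paths.get("/path/to/index"); // placeholder
Directory dir;
boolean loadIntoMemory = true; // corresponds to searchArgs.inmem
if (loadIntoMemory) {
  MMapDirectory mmapDir = new MMapDirectory(indexPath);
  // Ask the OS to touch every page of each mapped file up front so later
  // reads do not page-fault; the data still lives in the OS page cache.
  mmapDir.setPreload(true);
  dir = mmapDir;
} else {
  // FSDirectory.open picks a sensible default (MMapDirectory on most 64-bit JVMs).
  dir = FSDirectory.open(indexPath);
}
IndexReader reader = DirectoryReader.open(dir);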
Use of org.apache.lucene.store.MMapDirectory in the project Anserini by castorini.
From the class SearchWebCollection, method main:
public static void main(String[] args) throws Exception {
  SearchArgs searchArgs = new SearchArgs();
  CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90));
  try {
    parser.parseArgument(args);
  } catch (CmdLineException e) {
    System.err.println(e.getMessage());
    parser.printUsage(System.err);
    System.err.println("Example: SearchWebCollection" + parser.printExample(OptionHandlerFilter.REQUIRED));
    return;
  }
  LOG.info("Reading index at " + searchArgs.index);
  Directory dir;
  if (searchArgs.inmem) {
    LOG.info("Using MMapDirectory with preload");
    dir = new MMapDirectory(Paths.get(searchArgs.index));
    ((MMapDirectory) dir).setPreload(true);
  } else {
    LOG.info("Using default FSDirectory");
    dir = FSDirectory.open(Paths.get(searchArgs.index));
  }
  Similarity similarity = null;
  if (searchArgs.ql) {
    LOG.info("Using QL scoring model");
    similarity = new LMDirichletSimilarity(searchArgs.mu);
  } else if (searchArgs.bm25) {
    LOG.info("Using BM25 scoring model");
    similarity = new BM25Similarity(searchArgs.k1, searchArgs.b);
  } else {
    LOG.error("Error: Must specify scoring model!");
    System.exit(-1);
  }
  RerankerCascade cascade = new RerankerCascade();
  boolean useQueryParser = false;
  if (searchArgs.rm3) {
    cascade.add(new Rm3Reranker(new EnglishAnalyzer(), FIELD_BODY, "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.gov2.txt"));
    useQueryParser = true;
  } else {
    cascade.add(new IdentityReranker());
  }
  FeatureExtractors extractors = null;
  if (searchArgs.extractors != null) {
    extractors = FeatureExtractors.loadExtractor(searchArgs.extractors);
  }
  if (searchArgs.dumpFeatures) {
    PrintStream out = new PrintStream(searchArgs.featureFile);
    Qrels qrels = new Qrels(searchArgs.qrels);
    cascade.add(new WebCollectionLtrDataGenerator(out, qrels, extractors));
  }
  Path topicsFile = Paths.get(searchArgs.topics);
  if (!Files.exists(topicsFile) || !Files.isRegularFile(topicsFile) || !Files.isReadable(topicsFile)) {
    throw new IllegalArgumentException("Topics file : " + topicsFile + " does not exist or is not a (readable) file.");
  }
  TopicReader tr = (TopicReader) Class.forName("io.anserini.search.query." + searchArgs.topicReader + "TopicReader").getConstructor(Path.class).newInstance(topicsFile);
  SortedMap<Integer, String> topics = tr.read();
  final long start = System.nanoTime();
  SearchWebCollection searcher = new SearchWebCollection(searchArgs.index);
  searcher.search(topics, searchArgs.output, similarity, searchArgs.hits, cascade, useQueryParser, searchArgs.keepstop);
  searcher.close();
  final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
  LOG.info("Total " + topics.size() + " topics searched in " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
}
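SearchWebCollection chooses its directory exactly as SearchTweets does, so only the scoring-model selection differs in shape (a Similarity object is built first and attached later). A small isolated sketch of that choice; the parameter values and field name are illustrative, not Anserini defaults:

// Build a Similarity up front, then attach it to the searcher (values are illustrative).
IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index"))); // placeholder path
IndexSearcher searcher = new IndexSearcher(reader);
boolean useQueryLikelihood = true; // corresponds to searchArgs.ql
if (useQueryLikelihood) {
  searcher.setSimilarity(new LMDirichletSimilarity(1000f)); // mu smoothing parameter
} else {
  searcher.setSimilarity(new BM25Similarity(0.9f, 0.4f));   // k1, b
}
TopDocs hits = searcher.search(new TermQuery(new Term("contents", "lucene")), 10);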
Use of org.apache.lucene.store.MMapDirectory in the project elasticsearch by elastic.
From the class FsDirectoryServiceTests, method doTestPreload:
private void doTestPreload(String... preload) throws IOException {
  Settings build = Settings.builder()
      .put(IndexModule.INDEX_STORE_TYPE_SETTING.getKey(), "mmapfs")
      .putArray(IndexModule.INDEX_STORE_PRE_LOAD_SETTING.getKey(), preload)
      .build();
  IndexSettings settings = IndexSettingsModule.newIndexSettings("foo", build);
  IndexStore store = new IndexStore(settings);
  Path tempDir = createTempDir().resolve(settings.getUUID()).resolve("0");
  Files.createDirectories(tempDir);
  ShardPath path = new ShardPath(false, tempDir, tempDir, new ShardId(settings.getIndex(), 0));
  FsDirectoryService fsDirectoryService = new FsDirectoryService(settings, store, path);
  Directory directory = fsDirectoryService.newDirectory();
  assertFalse(directory instanceof SleepingLockWrapper);
  if (preload.length == 0) {
    assertTrue(directory.toString(), directory instanceof MMapDirectory);
    assertFalse(((MMapDirectory) directory).getPreload());
  } else if (Arrays.asList(preload).contains("*")) {
    assertTrue(directory.toString(), directory instanceof MMapDirectory);
    assertTrue(((MMapDirectory) directory).getPreload());
  } else {
    assertTrue(directory.toString(), directory instanceof FileSwitchDirectory);
    FileSwitchDirectory fsd = (FileSwitchDirectory) directory;
    assertTrue(fsd.getPrimaryDir() instanceof MMapDirectory);
    assertTrue(((MMapDirectory) fsd.getPrimaryDir()).getPreload());
    assertTrue(fsd.getSecondaryDir() instanceof MMapDirectory);
    assertFalse(((MMapDirectory) fsd.getSecondaryDir()).getPreload());
  }
}
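The mixed case that the test asserts corresponds roughly to the following wiring, shown here as a hand-rolled sketch with plain Lucene classes rather than the Elasticsearch FsDirectoryService internals (the path and extension list are illustrative):

// Sketch: preload only the listed file extensions, serve everything else without preload.
Path shardPath = Files.createTempDirectory("preload-sketch");
Set<String> preloadExtensions = new HashSet<>(Arrays.asList("dvd", "nvd")); // illustrative
MMapDirectory primary = new MMapDirectory(shardPath);
primary.setPreload(true);   // files with a listed extension get preloaded
MMapDirectory secondary = new MMapDirectory(shardPath);
secondary.setPreload(false); // everything else is mapped lazily
// FileSwitchDirectory routes files whose extension is in preloadExtensions to the primary dir.
Directory directory = new FileSwitchDirectory(preloadExtensions, primary, secondary, true);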
Use of org.apache.lucene.store.MMapDirectory in the project lucene-solr by apache.
From the class TestIndexWriter, method testDeleteUnusedFiles:
public void testDeleteUnusedFiles() throws Exception {
  assumeFalse("test relies on exact filenames", Codec.getDefault() instanceof SimpleTextCodec);
  assumeWorkingMMapOnWindows();
  for (int iter = 0; iter < 2; iter++) {
    // relies on windows semantics
    Path path = createTempDir();
    FileSystem fs = new WindowsFS(path.getFileSystem()).getFileSystem(URI.create("file:///"));
    Path indexPath = new FilterPath(path, fs);
    // NOTE: on Unix, we cannot use MMapDir, because WindowsFS doesn't see/think it keeps file handles open. Yet, on Windows, we MUST use
    // MMapDir because the windows OS will in fact prevent file deletion for us, and fails otherwise:
    FSDirectory dir;
    if (Constants.WINDOWS) {
      dir = new MMapDirectory(indexPath);
    } else {
      dir = new NIOFSDirectory(indexPath);
    }
    MergePolicy mergePolicy = newLogMergePolicy(true);
    // This test expects all of its segments to be in CFS
    mergePolicy.setNoCFSRatio(1.0);
    mergePolicy.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(mergePolicy).setUseCompoundFile(true));
    Document doc = new Document();
    doc.add(newTextField("field", "go", Field.Store.NO));
    w.addDocument(doc);
    DirectoryReader r;
    if (iter == 0) {
      // use NRT
      r = w.getReader();
    } else {
      // don't use NRT
      w.commit();
      r = DirectoryReader.open(dir);
    }
    assertTrue(Files.exists(indexPath.resolve("_0.cfs")));
    assertTrue(Files.exists(indexPath.resolve("_0.cfe")));
    assertTrue(Files.exists(indexPath.resolve("_0.si")));
    if (iter == 1) {
      // we run a full commit so there should be a segments file etc.
      assertTrue(Files.exists(indexPath.resolve("segments_1")));
    } else {
      // this is an NRT reopen - no segments files yet
      assertFalse(Files.exists(indexPath.resolve("segments_1")));
    }
    w.addDocument(doc);
    w.forceMerge(1);
    if (iter == 1) {
      w.commit();
    }
    IndexReader r2 = DirectoryReader.openIfChanged(r);
    assertNotNull(r2);
    assertTrue(r != r2);
    // NOTE: here we rely on "Windows" behavior, ie, even
    // though IW wanted to delete _0.cfs since it was
    // merged away, because we have a reader open
    // against this file, it should still be here:
    assertTrue(Files.exists(indexPath.resolve("_0.cfs")));
    // forceMerge created this
    //assertTrue(files.contains("_2.cfs"));
    w.deleteUnusedFiles();
    // r still holds this file open
    assertTrue(Files.exists(indexPath.resolve("_0.cfs")));
    //assertTrue(files.contains("_2.cfs"));
    r.close();
    if (iter == 0) {
      // on closing NRT reader, it calls writer.deleteUnusedFiles
      assertFalse(Files.exists(indexPath.resolve("_0.cfs")));
    } else {
      // now FSDir can remove it
      dir.deletePendingFiles();
      assertFalse(Files.exists(indexPath.resolve("_0.cfs")));
    }
    w.close();
    r2.close();
    dir.close();
  }
}
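Outside of tests like this one, which pin the directory implementation per platform, the choice is usually left to Lucene. A tiny sketch, assuming only lucene-core:

// FSDirectory.open picks the implementation for the platform
// (typically MMapDirectory on 64-bit JVMs that support unmapping).
Path indexPath = Files.createTempDirectory("fsdir-default");
try (FSDirectory dir = FSDirectory.open(indexPath)) {
  System.out.println(dir.getClass().getSimpleName());          // e.g. MMapDirectory
  System.out.println("unmap supported: " + MMapDirectory.UNMAP_SUPPORTED);
}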