Search in sources :

Example 6 with PostingsFormat

use of org.apache.lucene.codecs.PostingsFormat in project lucene-solr by apache.

the class TestLucene54DocValuesFormat method doTestTermsEnumRandom.

// TODO: try to refactor this and some termsenum tests into the base class.
// to do this we need to fix the test class to get a DVF not a Codec so we can setup
// the postings format correctly.
private void doTestTermsEnumRandom(int numDocs, int minLength, int maxLength) throws Exception {
    Directory dir = newFSDirectory(createTempDir());
    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
    conf.setMergeScheduler(new SerialMergeScheduler());
    // set to duel against a codec which has ordinals:
    final PostingsFormat pf = TestUtil.getPostingsFormatWithOrds(random());
    final DocValuesFormat dv = new Lucene54DocValuesFormat();
    conf.setCodec(new AssertingCodec() {

        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            return pf;
        }

        @Override
        public DocValuesFormat getDocValuesFormatForField(String field) {
            return dv;
        }
    });
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
    // index some docs
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
        doc.add(idField);
        final int length = TestUtil.nextInt(random(), minLength, maxLength);
        int numValues = random().nextInt(17);
        // create a random list of strings
        List<String> values = new ArrayList<>();
        for (int v = 0; v < numValues; v++) {
            values.add(TestUtil.randomSimpleString(random(), minLength, length));
        }
        // add in any order to the indexed field
        ArrayList<String> unordered = new ArrayList<>(values);
        Collections.shuffle(unordered, random());
        for (String v : values) {
            doc.add(newStringField("indexed", v, Field.Store.NO));
        }
        // add in any order to the dv field
        ArrayList<String> unordered2 = new ArrayList<>(values);
        Collections.shuffle(unordered2, random());
        for (String v : unordered2) {
            doc.add(new SortedSetDocValuesField("dv", new BytesRef(v)));
        }
        writer.addDocument(doc);
        if (random().nextInt(31) == 0) {
            writer.commit();
        }
    }
    // delete some docs
    int numDeletions = random().nextInt(numDocs / 10);
    for (int i = 0; i < numDeletions; i++) {
        int id = random().nextInt(numDocs);
        writer.deleteDocuments(new Term("id", Integer.toString(id)));
    }
    // compare per-segment
    DirectoryReader ir = writer.getReader();
    for (LeafReaderContext context : ir.leaves()) {
        LeafReader r = context.reader();
        Terms terms = r.terms("indexed");
        if (terms != null) {
            SortedSetDocValues ssdv = r.getSortedSetDocValues("dv");
            assertEquals(terms.size(), ssdv.getValueCount());
            TermsEnum expected = terms.iterator();
            TermsEnum actual = r.getSortedSetDocValues("dv").termsEnum();
            assertEquals(terms.size(), expected, actual);
            doTestSortedSetEnumAdvanceIndependently(ssdv);
        }
    }
    ir.close();
    writer.forceMerge(1);
    // now compare again after the merge
    ir = writer.getReader();
    LeafReader ar = getOnlyLeafReader(ir);
    Terms terms = ar.terms("indexed");
    if (terms != null) {
        assertEquals(terms.size(), ar.getSortedSetDocValues("dv").getValueCount());
        TermsEnum expected = terms.iterator();
        TermsEnum actual = ar.getSortedSetDocValues("dv").termsEnum();
        assertEquals(terms.size(), expected, actual);
    }
    ir.close();
    writer.close();
    dir.close();
}
Also used : ArrayList(java.util.ArrayList) Document(org.apache.lucene.document.Document) DocValuesFormat(org.apache.lucene.codecs.DocValuesFormat) TermsEnum(org.apache.lucene.index.TermsEnum) SerialMergeScheduler(org.apache.lucene.index.SerialMergeScheduler) IndexableField(org.apache.lucene.index.IndexableField) SortedNumericDocValuesField(org.apache.lucene.document.SortedNumericDocValuesField) StoredField(org.apache.lucene.document.StoredField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) BinaryDocValuesField(org.apache.lucene.document.BinaryDocValuesField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) StringField(org.apache.lucene.document.StringField) Field(org.apache.lucene.document.Field) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) AssertingCodec(org.apache.lucene.codecs.asserting.AssertingCodec) LeafReader(org.apache.lucene.index.LeafReader) DirectoryReader(org.apache.lucene.index.DirectoryReader) Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) PostingsFormat(org.apache.lucene.codecs.PostingsFormat) StringField(org.apache.lucene.document.StringField) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 7 with PostingsFormat

use of org.apache.lucene.codecs.PostingsFormat in project lucene-solr by apache.

the class TestLucene70DocValuesFormat method doTestTermsEnumRandom.

// TODO: try to refactor this and some termsenum tests into the base class.
// to do this we need to fix the test class to get a DVF not a Codec so we can setup
// the postings format correctly.
private void doTestTermsEnumRandom(int numDocs, Supplier<String> valuesProducer) throws Exception {
    Directory dir = newFSDirectory(createTempDir());
    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
    conf.setMergeScheduler(new SerialMergeScheduler());
    // set to duel against a codec which has ordinals:
    final PostingsFormat pf = TestUtil.getPostingsFormatWithOrds(random());
    final DocValuesFormat dv = new Lucene70DocValuesFormat();
    conf.setCodec(new AssertingCodec() {

        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            return pf;
        }

        @Override
        public DocValuesFormat getDocValuesFormatForField(String field) {
            return dv;
        }
    });
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
    // index some docs
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
        doc.add(idField);
        int numValues = random().nextInt(17);
        // create a random list of strings
        List<String> values = new ArrayList<>();
        for (int v = 0; v < numValues; v++) {
            values.add(valuesProducer.get());
        }
        // add in any order to the indexed field
        ArrayList<String> unordered = new ArrayList<>(values);
        Collections.shuffle(unordered, random());
        for (String v : values) {
            doc.add(newStringField("indexed", v, Field.Store.NO));
        }
        // add in any order to the dv field
        ArrayList<String> unordered2 = new ArrayList<>(values);
        Collections.shuffle(unordered2, random());
        for (String v : unordered2) {
            doc.add(new SortedSetDocValuesField("dv", new BytesRef(v)));
        }
        writer.addDocument(doc);
        if (random().nextInt(31) == 0) {
            writer.commit();
        }
    }
    // delete some docs
    int numDeletions = random().nextInt(numDocs / 10);
    for (int i = 0; i < numDeletions; i++) {
        int id = random().nextInt(numDocs);
        writer.deleteDocuments(new Term("id", Integer.toString(id)));
    }
    // compare per-segment
    DirectoryReader ir = writer.getReader();
    for (LeafReaderContext context : ir.leaves()) {
        LeafReader r = context.reader();
        Terms terms = r.terms("indexed");
        if (terms != null) {
            SortedSetDocValues ssdv = r.getSortedSetDocValues("dv");
            assertEquals(terms.size(), ssdv.getValueCount());
            TermsEnum expected = terms.iterator();
            TermsEnum actual = r.getSortedSetDocValues("dv").termsEnum();
            assertEquals(terms.size(), expected, actual);
            doTestSortedSetEnumAdvanceIndependently(ssdv);
        }
    }
    ir.close();
    writer.forceMerge(1);
    // now compare again after the merge
    ir = writer.getReader();
    LeafReader ar = getOnlyLeafReader(ir);
    Terms terms = ar.terms("indexed");
    if (terms != null) {
        assertEquals(terms.size(), ar.getSortedSetDocValues("dv").getValueCount());
        TermsEnum expected = terms.iterator();
        TermsEnum actual = ar.getSortedSetDocValues("dv").termsEnum();
        assertEquals(terms.size(), expected, actual);
    }
    ir.close();
    writer.close();
    dir.close();
}
Also used : Lucene70DocValuesFormat(org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat) ArrayList(java.util.ArrayList) Document(org.apache.lucene.document.Document) Lucene70DocValuesFormat(org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat) DocValuesFormat(org.apache.lucene.codecs.DocValuesFormat) TermsEnum(org.apache.lucene.index.TermsEnum) SerialMergeScheduler(org.apache.lucene.index.SerialMergeScheduler) IndexableField(org.apache.lucene.index.IndexableField) SortedNumericDocValuesField(org.apache.lucene.document.SortedNumericDocValuesField) StoredField(org.apache.lucene.document.StoredField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) BinaryDocValuesField(org.apache.lucene.document.BinaryDocValuesField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) StringField(org.apache.lucene.document.StringField) Field(org.apache.lucene.document.Field) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) AssertingCodec(org.apache.lucene.codecs.asserting.AssertingCodec) LeafReader(org.apache.lucene.index.LeafReader) DirectoryReader(org.apache.lucene.index.DirectoryReader) Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) PostingsFormat(org.apache.lucene.codecs.PostingsFormat) StringField(org.apache.lucene.document.StringField) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 8 with PostingsFormat

use of org.apache.lucene.codecs.PostingsFormat in project lucene-solr by apache.

the class CreateIndexTask method createWriterConfig.

public static IndexWriterConfig createWriterConfig(Config config, PerfRunData runData, OpenMode mode, IndexCommit commit) {
    @SuppressWarnings("deprecation") IndexWriterConfig iwConf = new IndexWriterConfig(runData.getAnalyzer());
    iwConf.setOpenMode(mode);
    IndexDeletionPolicy indexDeletionPolicy = getIndexDeletionPolicy(config);
    iwConf.setIndexDeletionPolicy(indexDeletionPolicy);
    if (commit != null) {
        iwConf.setIndexCommit(commit);
    }
    final String mergeScheduler = config.get("merge.scheduler", "org.apache.lucene.index.ConcurrentMergeScheduler");
    if (mergeScheduler.equals(NoMergeScheduler.class.getName())) {
        iwConf.setMergeScheduler(NoMergeScheduler.INSTANCE);
    } else {
        try {
            iwConf.setMergeScheduler(Class.forName(mergeScheduler).asSubclass(MergeScheduler.class).newInstance());
        } catch (Exception e) {
            throw new RuntimeException("unable to instantiate class '" + mergeScheduler + "' as merge scheduler", e);
        }
        if (mergeScheduler.equals("org.apache.lucene.index.ConcurrentMergeScheduler")) {
            ConcurrentMergeScheduler cms = (ConcurrentMergeScheduler) iwConf.getMergeScheduler();
            int maxThreadCount = config.get("concurrent.merge.scheduler.max.thread.count", ConcurrentMergeScheduler.AUTO_DETECT_MERGES_AND_THREADS);
            int maxMergeCount = config.get("concurrent.merge.scheduler.max.merge.count", ConcurrentMergeScheduler.AUTO_DETECT_MERGES_AND_THREADS);
            cms.setMaxMergesAndThreads(maxMergeCount, maxThreadCount);
        }
    }
    final String defaultCodec = config.get("default.codec", null);
    if (defaultCodec != null) {
        try {
            Class<? extends Codec> clazz = Class.forName(defaultCodec).asSubclass(Codec.class);
            iwConf.setCodec(clazz.newInstance());
        } catch (Exception e) {
            throw new RuntimeException("Couldn't instantiate Codec: " + defaultCodec, e);
        }
    }
    final String postingsFormat = config.get("codec.postingsFormat", null);
    if (defaultCodec == null && postingsFormat != null) {
        try {
            final PostingsFormat postingsFormatChosen = PostingsFormat.forName(postingsFormat);
            iwConf.setCodec(new Lucene70Codec() {

                @Override
                public PostingsFormat getPostingsFormatForField(String field) {
                    return postingsFormatChosen;
                }
            });
        } catch (Exception e) {
            throw new RuntimeException("Couldn't instantiate Postings Format: " + postingsFormat, e);
        }
    }
    final String mergePolicy = config.get("merge.policy", "org.apache.lucene.index.LogByteSizeMergePolicy");
    boolean isCompound = config.get("compound", true);
    iwConf.setUseCompoundFile(isCompound);
    if (mergePolicy.equals(NoMergePolicy.class.getName())) {
        iwConf.setMergePolicy(NoMergePolicy.INSTANCE);
    } else {
        try {
            iwConf.setMergePolicy(Class.forName(mergePolicy).asSubclass(MergePolicy.class).newInstance());
        } catch (Exception e) {
            throw new RuntimeException("unable to instantiate class '" + mergePolicy + "' as merge policy", e);
        }
        iwConf.getMergePolicy().setNoCFSRatio(isCompound ? 1.0 : 0.0);
        if (iwConf.getMergePolicy() instanceof LogMergePolicy) {
            LogMergePolicy logMergePolicy = (LogMergePolicy) iwConf.getMergePolicy();
            logMergePolicy.setMergeFactor(config.get("merge.factor", OpenIndexTask.DEFAULT_MERGE_PFACTOR));
        }
    }
    final double ramBuffer = config.get("ram.flush.mb", OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
    final int maxBuffered = config.get("max.buffered", OpenIndexTask.DEFAULT_MAX_BUFFERED);
    if (maxBuffered == IndexWriterConfig.DISABLE_AUTO_FLUSH) {
        iwConf.setRAMBufferSizeMB(ramBuffer);
        iwConf.setMaxBufferedDocs(maxBuffered);
    } else {
        iwConf.setMaxBufferedDocs(maxBuffered);
        iwConf.setRAMBufferSizeMB(ramBuffer);
    }
    return iwConf;
}
Also used : ConcurrentMergeScheduler(org.apache.lucene.index.ConcurrentMergeScheduler) IOException(java.io.IOException) NoMergeScheduler(org.apache.lucene.index.NoMergeScheduler) PostingsFormat(org.apache.lucene.codecs.PostingsFormat) Lucene70Codec(org.apache.lucene.codecs.lucene70.Lucene70Codec) NoMergePolicy(org.apache.lucene.index.NoMergePolicy) LogMergePolicy(org.apache.lucene.index.LogMergePolicy) IndexDeletionPolicy(org.apache.lucene.index.IndexDeletionPolicy) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 9 with PostingsFormat

use of org.apache.lucene.codecs.PostingsFormat in project neo4j by neo4j.

the class IndexWriterConfigs method standard.

public static IndexWriterConfig standard() {
    IndexWriterConfig writerConfig = new IndexWriterConfig(LuceneDataSource.KEYWORD_ANALYZER);
    writerConfig.setMaxBufferedDocs(MAX_BUFFERED_DOCS);
    writerConfig.setMaxBufferedDeleteTerms(MAX_BUFFERED_DELETE_TERMS);
    writerConfig.setIndexDeletionPolicy(new MultipleBackupDeletionPolicy());
    writerConfig.setUseCompoundFile(true);
    writerConfig.setRAMBufferSizeMB(STANDARD_RAM_BUFFER_SIZE_MB);
    writerConfig.setCodec(new Lucene54Codec() {

        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            PostingsFormat postingFormat = super.getPostingsFormatForField(field);
            return CODEC_BLOCK_TREE_ORDS_POSTING_FORMAT ? blockTreeOrdsPostingsFormat : postingFormat;
        }
    });
    LogByteSizeMergePolicy mergePolicy = new LogByteSizeMergePolicy();
    mergePolicy.setNoCFSRatio(MERGE_POLICY_NO_CFS_RATIO);
    mergePolicy.setMinMergeMB(MERGE_POLICY_MIN_MERGE_MB);
    mergePolicy.setMergeFactor(MERGE_POLICY_MERGE_FACTOR);
    writerConfig.setMergePolicy(mergePolicy);
    return writerConfig;
}
Also used : MultipleBackupDeletionPolicy(org.neo4j.index.impl.lucene.legacy.MultipleBackupDeletionPolicy) LogByteSizeMergePolicy(org.apache.lucene.index.LogByteSizeMergePolicy) BlockTreeOrdsPostingsFormat(org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat) PostingsFormat(org.apache.lucene.codecs.PostingsFormat) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) Lucene54Codec(org.apache.lucene.codecs.lucene54.Lucene54Codec)

Example 10 with PostingsFormat

use of org.apache.lucene.codecs.PostingsFormat in project lucene-solr by apache.

the class TestPerFieldPostingsFormat2 method testMergeCalledOnTwoFormats.

@SuppressWarnings("deprecation")
public void testMergeCalledOnTwoFormats() throws IOException {
    MergeRecordingPostingsFormatWrapper pf1 = new MergeRecordingPostingsFormatWrapper(TestUtil.getDefaultPostingsFormat());
    MergeRecordingPostingsFormatWrapper pf2 = new MergeRecordingPostingsFormatWrapper(TestUtil.getDefaultPostingsFormat());
    IndexWriterConfig iwc = new IndexWriterConfig();
    iwc.setCodec(new AssertingCodec() {

        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            switch(field) {
                case "f1":
                case "f2":
                    return pf1;
                case "f3":
                case "f4":
                    return pf2;
                default:
                    return super.getPostingsFormatForField(field);
            }
        }
    });
    Directory directory = newDirectory();
    IndexWriter iwriter = new IndexWriter(directory, iwc);
    Document doc = new Document();
    doc.add(new StringField("f1", "val1", Field.Store.NO));
    doc.add(new StringField("f2", "val2", Field.Store.YES));
    // Points are not indexed as postings and should not appear in the merge fields
    doc.add(new IntPoint("f3", 3));
    doc.add(new StringField("f4", "val4", Field.Store.NO));
    iwriter.addDocument(doc);
    iwriter.commit();
    doc = new Document();
    doc.add(new StringField("f1", "val5", Field.Store.NO));
    doc.add(new StringField("f2", "val6", Field.Store.YES));
    doc.add(new IntPoint("f3", 7));
    doc.add(new StringField("f4", "val8", Field.Store.NO));
    iwriter.addDocument(doc);
    iwriter.commit();
    iwriter.forceMerge(1, true);
    iwriter.close();
    assertEquals(1, pf1.nbMergeCalls);
    assertEquals(new HashSet<>(Arrays.asList("f1", "f2")), new HashSet<>(pf1.fieldNames));
    assertEquals(1, pf2.nbMergeCalls);
    assertEquals(Collections.singletonList("f4"), pf2.fieldNames);
    directory.close();
}
Also used : AssertingCodec(org.apache.lucene.codecs.asserting.AssertingCodec) IntPoint(org.apache.lucene.document.IntPoint) IndexWriter(org.apache.lucene.index.IndexWriter) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) DirectPostingsFormat(org.apache.lucene.codecs.memory.DirectPostingsFormat) PostingsFormat(org.apache.lucene.codecs.PostingsFormat) MemoryPostingsFormat(org.apache.lucene.codecs.memory.MemoryPostingsFormat) StringField(org.apache.lucene.document.StringField) Document(org.apache.lucene.document.Document) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) Directory(org.apache.lucene.store.Directory)

Aggregations

PostingsFormat (org.apache.lucene.codecs.PostingsFormat)12 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)6 DocValuesFormat (org.apache.lucene.codecs.DocValuesFormat)4 AssertingCodec (org.apache.lucene.codecs.asserting.AssertingCodec)4 AssertingPostingsFormat (org.apache.lucene.codecs.asserting.AssertingPostingsFormat)4 Lucene70Codec (org.apache.lucene.codecs.lucene70.Lucene70Codec)4 MockRandomPostingsFormat (org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat)4 Document (org.apache.lucene.document.Document)4 Directory (org.apache.lucene.store.Directory)4 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)3 BlockTreeOrdsPostingsFormat (org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat)3 DirectPostingsFormat (org.apache.lucene.codecs.memory.DirectPostingsFormat)3 MemoryPostingsFormat (org.apache.lucene.codecs.memory.MemoryPostingsFormat)3 StringField (org.apache.lucene.document.StringField)3 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)3 BytesRef (org.apache.lucene.util.BytesRef)3 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 FSTOrdPostingsFormat (org.apache.lucene.codecs.memory.FSTOrdPostingsFormat)2 FSTPostingsFormat (org.apache.lucene.codecs.memory.FSTPostingsFormat)2