Example 66 with LeafReader

Use of org.apache.lucene.index.LeafReader in the project lucene-solr by apache.

From the class BooleanPerceptronClassifierTest, method testBasicUsage:

@Test
public void testBasicUsage() throws Exception {
    LeafReader leafReader = null;
    try {
        MockAnalyzer analyzer = new MockAnalyzer(random());
        leafReader = getSampleIndex(analyzer);
        BooleanPerceptronClassifier classifier = new BooleanPerceptronClassifier(leafReader, analyzer, null, 1, null, booleanFieldName, textFieldName);
        checkCorrectClassification(classifier, TECHNOLOGY_INPUT, false);
        checkCorrectClassification(classifier, POLITICS_INPUT, true);
    } finally {
        if (leafReader != null) {
            leafReader.close();
        }
    }
}
Also used: LeafReader (org.apache.lucene.index.LeafReader), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), Test (org.junit.Test)
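
Note: getSampleIndex(analyzer) is a helper from the classification test base class and is not shown here. As a rough, hypothetical sketch of how a LeafReader might be obtained outside that framework, assuming an already-populated Directory named dir (an assumption, not part of the test above):

// Hypothetical sketch, not taken from the test above. Assumes "dir" is an
// existing, populated org.apache.lucene.store.Directory.
DirectoryReader reader = DirectoryReader.open(dir);
try {
    // A DirectoryReader is composed of one LeafReader per segment; for a
    // single-segment index the first leaf covers the whole index.
    LeafReader leaf = reader.leaves().get(0).reader();
    // ... use the LeafReader, e.g. hand it to a classifier ...
} finally {
    // Closing the top-level reader releases the per-segment leaves as well.
    reader.close();
}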

Example 67 with LeafReader

Use of org.apache.lucene.index.LeafReader in the project lucene-solr by apache.

From the class TestLucene54DocValuesFormat, method testSortedNumericAroundBlockSize:

@Slow
public void testSortedNumericAroundBlockSize() throws IOException {
    final int frontier = 1 << Lucene54DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
    for (int maxDoc = frontier - 1; maxDoc <= frontier + 1; ++maxDoc) {
        final Directory dir = newDirectory();
        IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));
        RAMFile buffer = new RAMFile();
        RAMOutputStream out = new RAMOutputStream(buffer, false);
        Document doc = new Document();
        SortedNumericDocValuesField field1 = new SortedNumericDocValuesField("snum", 0L);
        doc.add(field1);
        SortedNumericDocValuesField field2 = new SortedNumericDocValuesField("snum", 0L);
        doc.add(field2);
        for (int i = 0; i < maxDoc; ++i) {
            long s1 = random().nextInt(100);
            long s2 = random().nextInt(100);
            field1.setLongValue(s1);
            field2.setLongValue(s2);
            w.addDocument(doc);
            out.writeVLong(Math.min(s1, s2));
            out.writeVLong(Math.max(s1, s2));
        }
        out.close();
        w.forceMerge(1);
        DirectoryReader r = DirectoryReader.open(w);
        w.close();
        LeafReader sr = getOnlyLeafReader(r);
        assertEquals(maxDoc, sr.maxDoc());
        SortedNumericDocValues values = sr.getSortedNumericDocValues("snum");
        assertNotNull(values);
        RAMInputStream in = new RAMInputStream("", buffer);
        for (int i = 0; i < maxDoc; ++i) {
            assertEquals(i, values.nextDoc());
            assertEquals(2, values.docValueCount());
            assertEquals(in.readVLong(), values.nextValue());
            assertEquals(in.readVLong(), values.nextValue());
        }
        r.close();
        dir.close();
    }
}
Also used: RAMFile (org.apache.lucene.store.RAMFile), SortedNumericDocValuesField (org.apache.lucene.document.SortedNumericDocValuesField), SortedNumericDocValues (org.apache.lucene.index.SortedNumericDocValues), LeafReader (org.apache.lucene.index.LeafReader), IndexWriter (org.apache.lucene.index.IndexWriter), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), DirectoryReader (org.apache.lucene.index.DirectoryReader), RAMInputStream (org.apache.lucene.store.RAMInputStream), RAMOutputStream (org.apache.lucene.store.RAMOutputStream), Document (org.apache.lucene.document.Document), Directory (org.apache.lucene.store.Directory)
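
For reference, the read side exercised by the assertions above follows the iterator contract that SortedNumericDocValues shares with org.apache.lucene.search.DocIdSetIterator. A minimal sketch, not part of the test, assuming a LeafReader named leafReader whose "snum" field carries sorted numeric doc values:

// Minimal sketch: iterate every document that has a value for "snum" and
// read that document's values in sorted order.
SortedNumericDocValues dv = leafReader.getSortedNumericDocValues("snum");
if (dv != null) { // null when no document in this segment has the field
    for (int doc = dv.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = dv.nextDoc()) {
        for (int i = 0; i < dv.docValueCount(); i++) {
            long value = dv.nextValue(); // values come back in ascending order per document
        }
    }
}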

Example 68 with LeafReader

Use of org.apache.lucene.index.LeafReader in the project lucene-solr by apache.

From the class TestLucene54DocValuesFormat, method doTestTermsEnumRandom:

// TODO: try to refactor this and some termsenum tests into the base class.
// to do this we need to fix the test class to get a DVF not a Codec so we can setup
// the postings format correctly.
private void doTestTermsEnumRandom(int numDocs, int minLength, int maxLength) throws Exception {
    Directory dir = newFSDirectory(createTempDir());
    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
    conf.setMergeScheduler(new SerialMergeScheduler());
    // set to duel against a codec which has ordinals:
    final PostingsFormat pf = TestUtil.getPostingsFormatWithOrds(random());
    final DocValuesFormat dv = new Lucene54DocValuesFormat();
    conf.setCodec(new AssertingCodec() {

        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            return pf;
        }

        @Override
        public DocValuesFormat getDocValuesFormatForField(String field) {
            return dv;
        }
    });
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
    // index some docs
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
        doc.add(idField);
        final int length = TestUtil.nextInt(random(), minLength, maxLength);
        int numValues = random().nextInt(17);
        // create a random list of strings
        List<String> values = new ArrayList<>();
        for (int v = 0; v < numValues; v++) {
            values.add(TestUtil.randomSimpleString(random(), minLength, length));
        }
        // add in any order to the indexed field
        ArrayList<String> unordered = new ArrayList<>(values);
        Collections.shuffle(unordered, random());
        for (String v : values) {
            doc.add(newStringField("indexed", v, Field.Store.NO));
        }
        // add in any order to the dv field
        ArrayList<String> unordered2 = new ArrayList<>(values);
        Collections.shuffle(unordered2, random());
        for (String v : unordered2) {
            doc.add(new SortedSetDocValuesField("dv", new BytesRef(v)));
        }
        writer.addDocument(doc);
        if (random().nextInt(31) == 0) {
            writer.commit();
        }
    }
    // delete some docs
    int numDeletions = random().nextInt(numDocs / 10);
    for (int i = 0; i < numDeletions; i++) {
        int id = random().nextInt(numDocs);
        writer.deleteDocuments(new Term("id", Integer.toString(id)));
    }
    // compare per-segment
    DirectoryReader ir = writer.getReader();
    for (LeafReaderContext context : ir.leaves()) {
        LeafReader r = context.reader();
        Terms terms = r.terms("indexed");
        if (terms != null) {
            SortedSetDocValues ssdv = r.getSortedSetDocValues("dv");
            assertEquals(terms.size(), ssdv.getValueCount());
            TermsEnum expected = terms.iterator();
            TermsEnum actual = r.getSortedSetDocValues("dv").termsEnum();
            assertEquals(terms.size(), expected, actual);
            doTestSortedSetEnumAdvanceIndependently(ssdv);
        }
    }
    ir.close();
    writer.forceMerge(1);
    // now compare again after the merge
    ir = writer.getReader();
    LeafReader ar = getOnlyLeafReader(ir);
    Terms terms = ar.terms("indexed");
    if (terms != null) {
        assertEquals(terms.size(), ar.getSortedSetDocValues("dv").getValueCount());
        TermsEnum expected = terms.iterator();
        TermsEnum actual = ar.getSortedSetDocValues("dv").termsEnum();
        assertEquals(terms.size(), expected, actual);
    }
    ir.close();
    writer.close();
    dir.close();
}
Also used: ArrayList (java.util.ArrayList), Document (org.apache.lucene.document.Document), DocValuesFormat (org.apache.lucene.codecs.DocValuesFormat), TermsEnum (org.apache.lucene.index.TermsEnum), SerialMergeScheduler (org.apache.lucene.index.SerialMergeScheduler), IndexableField (org.apache.lucene.index.IndexableField), SortedNumericDocValuesField (org.apache.lucene.document.SortedNumericDocValuesField), StoredField (org.apache.lucene.document.StoredField), NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField), SortedSetDocValuesField (org.apache.lucene.document.SortedSetDocValuesField), BinaryDocValuesField (org.apache.lucene.document.BinaryDocValuesField), SortedDocValuesField (org.apache.lucene.document.SortedDocValuesField), StringField (org.apache.lucene.document.StringField), Field (org.apache.lucene.document.Field), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), LeafReaderContext (org.apache.lucene.index.LeafReaderContext), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory), AssertingCodec (org.apache.lucene.codecs.asserting.AssertingCodec), LeafReader (org.apache.lucene.index.LeafReader), DirectoryReader (org.apache.lucene.index.DirectoryReader), Terms (org.apache.lucene.index.Terms), Term (org.apache.lucene.index.Term), SortedSetDocValues (org.apache.lucene.index.SortedSetDocValues), PostingsFormat (org.apache.lucene.codecs.PostingsFormat), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)
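
Note that the three-argument assertEquals(terms.size(), expected, actual) used above is a helper defined in the doc values test base class, not a JUnit assertion. A hedged sketch of the lockstep comparison it is expected to perform over the two enums, reusing terms and ssdv from the per-segment loop (the real helper also exercises seeking and ord lookups):

// Rough sketch only: walk both term dictionaries in lockstep and check
// they yield the same terms in the same order.
TermsEnum indexedTerms = terms.iterator();
TermsEnum dvTerms = ssdv.termsEnum();
BytesRef indexedTerm;
while ((indexedTerm = indexedTerms.next()) != null) {
    assertEquals(indexedTerm, dvTerms.next());
}
assertNull(dvTerms.next()); // both enums must be exhausted at the same time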

Example 69 with LeafReader

Use of org.apache.lucene.index.LeafReader in the project lucene-solr by apache.

From the class TestLucene54DocValuesFormat, method doTestSparseDocValuesVsStoredFields:

private void doTestSparseDocValuesVsStoredFields() throws Exception {
    final long[] values = new long[TestUtil.nextInt(random(), 1, 500)];
    for (int i = 0; i < values.length; ++i) {
        values[i] = random().nextLong();
    }
    Directory dir = newFSDirectory(createTempDir());
    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
    conf.setMergeScheduler(new SerialMergeScheduler());
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
    // sparse compression is only enabled if less than 1% of docs have a value
    final int avgGap = 100;
    final int numDocs = atLeast(200);
    for (int i = random().nextInt(avgGap * 2); i >= 0; --i) {
        writer.addDocument(new Document());
    }
    final int maxNumValuesPerDoc = random().nextBoolean() ? 1 : TestUtil.nextInt(random(), 2, 5);
    for (int i = 0; i < numDocs; ++i) {
        Document doc = new Document();
        // single-valued
        long docValue = values[random().nextInt(values.length)];
        doc.add(new NumericDocValuesField("numeric", docValue));
        doc.add(new SortedDocValuesField("sorted", new BytesRef(Long.toString(docValue))));
        doc.add(new BinaryDocValuesField("binary", new BytesRef(Long.toString(docValue))));
        doc.add(new StoredField("value", docValue));
        // multi-valued
        final int numValues = TestUtil.nextInt(random(), 1, maxNumValuesPerDoc);
        for (int j = 0; j < numValues; ++j) {
            docValue = values[random().nextInt(values.length)];
            doc.add(new SortedNumericDocValuesField("sorted_numeric", docValue));
            doc.add(new SortedSetDocValuesField("sorted_set", new BytesRef(Long.toString(docValue))));
            doc.add(new StoredField("values", docValue));
        }
        writer.addDocument(doc);
        // add a gap
        for (int j = TestUtil.nextInt(random(), 0, avgGap * 2); j >= 0; --j) {
            writer.addDocument(new Document());
        }
    }
    if (random().nextBoolean()) {
        writer.forceMerge(1);
    }
    final IndexReader indexReader = writer.getReader();
    TestUtil.checkReader(indexReader);
    writer.close();
    for (LeafReaderContext context : indexReader.leaves()) {
        final LeafReader reader = context.reader();
        final NumericDocValues numeric = DocValues.getNumeric(reader, "numeric");
        final SortedDocValues sorted = DocValues.getSorted(reader, "sorted");
        final BinaryDocValues binary = DocValues.getBinary(reader, "binary");
        final SortedNumericDocValues sortedNumeric = DocValues.getSortedNumeric(reader, "sorted_numeric");
        final SortedSetDocValues sortedSet = DocValues.getSortedSet(reader, "sorted_set");
        for (int i = 0; i < reader.maxDoc(); ++i) {
            final Document doc = reader.document(i);
            final IndexableField valueField = doc.getField("value");
            final Long value = valueField == null ? null : valueField.numericValue().longValue();
            if (value == null) {
                assertTrue(numeric.docID() + " vs " + i, numeric.docID() < i);
            } else {
                assertEquals(i, numeric.nextDoc());
                assertEquals(i, binary.nextDoc());
                assertEquals(i, sorted.nextDoc());
                assertEquals(value.longValue(), numeric.longValue());
                assertTrue(sorted.ordValue() >= 0);
                assertEquals(new BytesRef(Long.toString(value)), sorted.lookupOrd(sorted.ordValue()));
                assertEquals(new BytesRef(Long.toString(value)), binary.binaryValue());
            }
            final IndexableField[] valuesFields = doc.getFields("values");
            if (valuesFields.length == 0) {
                assertTrue(sortedNumeric.docID() + " vs " + i, sortedNumeric.docID() < i);
            } else {
                final Set<Long> valueSet = new HashSet<>();
                for (IndexableField sf : valuesFields) {
                    valueSet.add(sf.numericValue().longValue());
                }
                assertEquals(i, sortedNumeric.nextDoc());
                assertEquals(valuesFields.length, sortedNumeric.docValueCount());
                for (int j = 0; j < sortedNumeric.docValueCount(); ++j) {
                    assertTrue(valueSet.contains(sortedNumeric.nextValue()));
                }
                assertEquals(i, sortedSet.nextDoc());
                int sortedSetCount = 0;
                while (true) {
                    long ord = sortedSet.nextOrd();
                    if (ord == SortedSetDocValues.NO_MORE_ORDS) {
                        break;
                    }
                    assertTrue(valueSet.contains(Long.parseLong(sortedSet.lookupOrd(ord).utf8ToString())));
                    sortedSetCount++;
                }
                assertEquals(valueSet.size(), sortedSetCount);
            }
        }
    }
    indexReader.close();
    dir.close();
}
Also used: SortedNumericDocValues (org.apache.lucene.index.SortedNumericDocValues), NumericDocValues (org.apache.lucene.index.NumericDocValues), SparseNumericDocValues (org.apache.lucene.codecs.lucene54.Lucene54DocValuesProducer.SparseNumericDocValues), Document (org.apache.lucene.document.Document), BinaryDocValues (org.apache.lucene.index.BinaryDocValues), SerialMergeScheduler (org.apache.lucene.index.SerialMergeScheduler), StoredField (org.apache.lucene.document.StoredField), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), SortedNumericDocValuesField (org.apache.lucene.document.SortedNumericDocValuesField), NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField), SortedDocValuesField (org.apache.lucene.document.SortedDocValuesField), LeafReaderContext (org.apache.lucene.index.LeafReaderContext), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory), HashSet (java.util.HashSet), LeafReader (org.apache.lucene.index.LeafReader), BinaryDocValuesField (org.apache.lucene.document.BinaryDocValuesField), SortedDocValues (org.apache.lucene.index.SortedDocValues), IndexableField (org.apache.lucene.index.IndexableField), SortedSetDocValues (org.apache.lucene.index.SortedSetDocValues), IndexReader (org.apache.lucene.index.IndexReader), SortedSetDocValuesField (org.apache.lucene.document.SortedSetDocValuesField), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)
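
Unlike LeafReader.getNumericDocValues and its siblings, which return null for a missing field, the DocValues.getNumeric/getSorted/getBinary helpers used above substitute empty instances, which is why the verification loop can call docID() and nextDoc() unconditionally. When access is driven by document id rather than by iteration, advanceExact is the usual pattern; a minimal sketch (not part of the test) against the same reader and field:

// Minimal sketch: advanceExact reports whether the given document has a
// value, which suits sparse fields like "numeric".
NumericDocValues numeric = DocValues.getNumeric(reader, "numeric");
for (int doc = 0; doc < reader.maxDoc(); ++doc) {
    if (numeric.advanceExact(doc)) {
        long v = numeric.longValue(); // this document has a value
    }
    // otherwise the document has no value for "numeric"
}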

Example 70 with LeafReader

Use of org.apache.lucene.index.LeafReader in the project lucene-solr by apache.

From the class TestLucene54DocValuesFormat, method testSortedSetAroundBlockSize:

@Slow
public void testSortedSetAroundBlockSize() throws IOException {
    final int frontier = 1 << Lucene54DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
    for (int maxDoc = frontier - 1; maxDoc <= frontier + 1; ++maxDoc) {
        final Directory dir = newDirectory();
        IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));
        RAMFile buffer = new RAMFile();
        RAMOutputStream out = new RAMOutputStream(buffer, false);
        Document doc = new Document();
        SortedSetDocValuesField field1 = new SortedSetDocValuesField("sset", new BytesRef());
        doc.add(field1);
        SortedSetDocValuesField field2 = new SortedSetDocValuesField("sset", new BytesRef());
        doc.add(field2);
        for (int i = 0; i < maxDoc; ++i) {
            BytesRef s1 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
            BytesRef s2 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
            field1.setBytesValue(s1);
            field2.setBytesValue(s2);
            w.addDocument(doc);
            Set<BytesRef> set = new TreeSet<>(Arrays.asList(s1, s2));
            out.writeVInt(set.size());
            for (BytesRef ref : set) {
                out.writeVInt(ref.length);
                out.writeBytes(ref.bytes, ref.offset, ref.length);
            }
        }
        out.close();
        w.forceMerge(1);
        DirectoryReader r = DirectoryReader.open(w);
        w.close();
        LeafReader sr = getOnlyLeafReader(r);
        assertEquals(maxDoc, sr.maxDoc());
        SortedSetDocValues values = sr.getSortedSetDocValues("sset");
        assertNotNull(values);
        RAMInputStream in = new RAMInputStream("", buffer);
        BytesRefBuilder b = new BytesRefBuilder();
        for (int i = 0; i < maxDoc; ++i) {
            assertEquals(i, values.nextDoc());
            final int numValues = in.readVInt();
            for (int j = 0; j < numValues; ++j) {
                b.setLength(in.readVInt());
                b.grow(b.length());
                in.readBytes(b.bytes(), 0, b.length());
                assertEquals(b.get(), values.lookupOrd(values.nextOrd()));
            }
            assertEquals(SortedSetDocValues.NO_MORE_ORDS, values.nextOrd());
        }
        r.close();
        dir.close();
    }
}
Also used: BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), LeafReader (org.apache.lucene.index.LeafReader), DirectoryReader (org.apache.lucene.index.DirectoryReader), RAMInputStream (org.apache.lucene.store.RAMInputStream), Document (org.apache.lucene.document.Document), RAMFile (org.apache.lucene.store.RAMFile), SortedSetDocValues (org.apache.lucene.index.SortedSetDocValues), IndexWriter (org.apache.lucene.index.IndexWriter), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), TreeSet (java.util.TreeSet), RAMOutputStream (org.apache.lucene.store.RAMOutputStream), SortedSetDocValuesField (org.apache.lucene.document.SortedSetDocValuesField), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory)
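
Outside a verification loop, the same ordinal-based access is how SortedSetDocValues is usually consumed per document. A small sketch, assuming the leaf reader sr from the test and a document id docId valid for that leaf (docId is a placeholder, not a variable from the test):

// Hypothetical sketch: read every distinct value of "sset" for one document.
// getSortedSetDocValues returns null when the segment has no such field.
SortedSetDocValues ssdv = sr.getSortedSetDocValues("sset");
if (ssdv != null && ssdv.advanceExact(docId)) {
    long ord;
    while ((ord = ssdv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
        BytesRef value = ssdv.lookupOrd(ord); // ordinals come back in increasing order
    }
}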

Aggregations

LeafReader (org.apache.lucene.index.LeafReader): 168 usages
BytesRef (org.apache.lucene.util.BytesRef): 65 usages
Document (org.apache.lucene.document.Document): 61 usages
LeafReaderContext (org.apache.lucene.index.LeafReaderContext): 58 usages
Directory (org.apache.lucene.store.Directory): 56 usages
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 55 usages
DirectoryReader (org.apache.lucene.index.DirectoryReader): 47 usages
RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 42 usages
Test (org.junit.Test): 36 usages
IndexWriter (org.apache.lucene.index.IndexWriter): 32 usages
Terms (org.apache.lucene.index.Terms): 30 usages
TermsEnum (org.apache.lucene.index.TermsEnum): 28 usages
NumericDocValues (org.apache.lucene.index.NumericDocValues): 24 usages
SortedSetDocValues (org.apache.lucene.index.SortedSetDocValues): 24 usages
SortedDocValues (org.apache.lucene.index.SortedDocValues): 22 usages
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 21 usages
IndexReader (org.apache.lucene.index.IndexReader): 20 usages
Term (org.apache.lucene.index.Term): 20 usages
SortedDocValuesField (org.apache.lucene.document.SortedDocValuesField): 18 usages
Bits (org.apache.lucene.util.Bits): 18 usages