Search in sources :

Example 26 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class DocValuesConsumer method mergeSortedSetField.

/**
   * Merges the sortedset docvalues from <code>toMerge</code>.
   * <p>
   * The default implementation calls {@link #addSortedSetField}, passing
   * an Iterable that merges ordinals and values and filters deleted documents .
   */
public void mergeSortedSetField(FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException {
    List<SortedSetDocValues> toMerge = new ArrayList<>();
    for (int i = 0; i < mergeState.docValuesProducers.length; i++) {
        SortedSetDocValues values = null;
        DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i];
        if (docValuesProducer != null) {
            FieldInfo fieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name);
            if (fieldInfo != null && fieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) {
                values = docValuesProducer.getSortedSet(fieldInfo);
            }
        }
        if (values == null) {
            values = DocValues.emptySortedSet();
        }
        toMerge.add(values);
    }
    // step 1: iterate thru each sub and mark terms still in use
    TermsEnum[] liveTerms = new TermsEnum[toMerge.size()];
    long[] weights = new long[liveTerms.length];
    for (int sub = 0; sub < liveTerms.length; sub++) {
        SortedSetDocValues dv = toMerge.get(sub);
        Bits liveDocs = mergeState.liveDocs[sub];
        if (liveDocs == null) {
            liveTerms[sub] = dv.termsEnum();
            weights[sub] = dv.getValueCount();
        } else {
            LongBitSet bitset = new LongBitSet(dv.getValueCount());
            int docID;
            while ((docID = dv.nextDoc()) != NO_MORE_DOCS) {
                if (liveDocs.get(docID)) {
                    long ord;
                    while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
                        bitset.set(ord);
                    }
                }
            }
            liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
            weights[sub] = bitset.cardinality();
        }
    }
    // step 2: create ordinal map (this conceptually does the "merging")
    final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT);
    // step 3: add field
    addSortedSetField(mergeFieldInfo, new EmptyDocValuesProducer() {

        @Override
        public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException {
            if (fieldInfo != mergeFieldInfo) {
                throw new IllegalArgumentException("wrong FieldInfo");
            }
            // We must make new iterators + DocIDMerger for each iterator:
            List<SortedSetDocValuesSub> subs = new ArrayList<>();
            long cost = 0;
            for (int i = 0; i < mergeState.docValuesProducers.length; i++) {
                SortedSetDocValues values = null;
                DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i];
                if (docValuesProducer != null) {
                    FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name);
                    if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) {
                        values = docValuesProducer.getSortedSet(readerFieldInfo);
                    }
                }
                if (values == null) {
                    values = DocValues.emptySortedSet();
                }
                cost += values.cost();
                subs.add(new SortedSetDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i)));
            }
            final DocIDMerger<SortedSetDocValuesSub> docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);
            final long finalCost = cost;
            return new SortedSetDocValues() {

                private int docID = -1;

                private SortedSetDocValuesSub currentSub;

                @Override
                public int docID() {
                    return docID;
                }

                @Override
                public int nextDoc() throws IOException {
                    currentSub = docIDMerger.next();
                    if (currentSub == null) {
                        docID = NO_MORE_DOCS;
                    } else {
                        docID = currentSub.mappedDocID;
                    }
                    return docID;
                }

                @Override
                public int advance(int target) throws IOException {
                    throw new UnsupportedOperationException();
                }

                @Override
                public boolean advanceExact(int target) throws IOException {
                    throw new UnsupportedOperationException();
                }

                @Override
                public long nextOrd() throws IOException {
                    long subOrd = currentSub.values.nextOrd();
                    if (subOrd == NO_MORE_ORDS) {
                        return NO_MORE_ORDS;
                    }
                    return currentSub.map.get(subOrd);
                }

                @Override
                public long cost() {
                    return finalCost;
                }

                @Override
                public BytesRef lookupOrd(long ord) throws IOException {
                    int segmentNumber = map.getFirstSegmentNumber(ord);
                    long segmentOrd = map.getFirstSegmentOrd(ord);
                    return toMerge.get(segmentNumber).lookupOrd(segmentOrd);
                }

                @Override
                public long getValueCount() {
                    return map.getValueCount();
                }
            };
        }
    });
}
Also used : ArrayList(java.util.ArrayList) EmptyDocValuesProducer(org.apache.lucene.index.EmptyDocValuesProducer) LongBitSet(org.apache.lucene.util.LongBitSet) IOException(java.io.IOException) OrdinalMap(org.apache.lucene.index.MultiDocValues.OrdinalMap) TermsEnum(org.apache.lucene.index.TermsEnum) FilteredTermsEnum(org.apache.lucene.index.FilteredTermsEnum) SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) EmptyDocValuesProducer(org.apache.lucene.index.EmptyDocValuesProducer) DocIDMerger(org.apache.lucene.index.DocIDMerger) Bits(org.apache.lucene.util.Bits) ArrayList(java.util.ArrayList) List(java.util.List) FieldInfo(org.apache.lucene.index.FieldInfo) BytesRef(org.apache.lucene.util.BytesRef)

Example 27 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class TestOrdsBlockTree method testBasic.

public void testBasic() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    doc.add(newTextField("field", "a b c", Field.Store.NO));
    w.addDocument(doc);
    IndexReader r = w.getReader();
    TermsEnum te = MultiFields.getTerms(r, "field").iterator();
    // Test next()
    assertEquals(new BytesRef("a"), te.next());
    assertEquals(0L, te.ord());
    assertEquals(new BytesRef("b"), te.next());
    assertEquals(1L, te.ord());
    assertEquals(new BytesRef("c"), te.next());
    assertEquals(2L, te.ord());
    assertNull(te.next());
    // Test seekExact by term
    assertTrue(te.seekExact(new BytesRef("b")));
    assertEquals(1, te.ord());
    assertTrue(te.seekExact(new BytesRef("a")));
    assertEquals(0, te.ord());
    assertTrue(te.seekExact(new BytesRef("c")));
    assertEquals(2, te.ord());
    // Test seekExact by ord
    te.seekExact(1);
    assertEquals(new BytesRef("b"), te.term());
    te.seekExact(0);
    assertEquals(new BytesRef("a"), te.term());
    te.seekExact(2);
    assertEquals(new BytesRef("c"), te.term());
    r.close();
    w.close();
    dir.close();
}
Also used : IndexReader(org.apache.lucene.index.IndexReader) Document(org.apache.lucene.document.Document) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 28 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class TestOrdsBlockTree method testThreeBlocks.

public void testThreeBlocks() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    List<String> terms = new ArrayList<>();
    for (int i = 0; i < 36; i++) {
        Document doc = new Document();
        String term = "" + (char) (97 + i);
        terms.add(term);
        if (VERBOSE) {
            System.out.println("i=" + i + " term=" + term);
        }
        doc.add(newTextField("field", term, Field.Store.NO));
        w.addDocument(doc);
    }
    for (int i = 0; i < 36; i++) {
        Document doc = new Document();
        String term = "m" + (char) (97 + i);
        terms.add(term);
        if (VERBOSE) {
            System.out.println("i=" + i + " term=" + term);
        }
        doc.add(newTextField("field", term, Field.Store.NO));
        w.addDocument(doc);
    }
    for (int i = 0; i < 36; i++) {
        Document doc = new Document();
        String term = "mo" + (char) (97 + i);
        terms.add(term);
        if (VERBOSE) {
            System.out.println("i=" + i + " term=" + term);
        }
        doc.add(newTextField("field", term, Field.Store.NO));
        w.addDocument(doc);
    }
    w.forceMerge(1);
    IndexReader r = w.getReader();
    TermsEnum te = MultiFields.getTerms(r, "field").iterator();
    if (VERBOSE) {
        while (te.next() != null) {
            System.out.println("TERM: " + te.ord() + " " + te.term().utf8ToString());
        }
    }
    assertTrue(te.seekExact(new BytesRef("mo")));
    assertEquals(27, te.ord());
    te.seekExact(90);
    assertEquals(new BytesRef("s"), te.term());
    testEnum(te, terms);
    r.close();
    w.close();
    dir.close();
}
Also used : ArrayList(java.util.ArrayList) IndexReader(org.apache.lucene.index.IndexReader) Document(org.apache.lucene.document.Document) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 29 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class TestOrdsBlockTree method testFloorBlocks.

public void testFloorBlocks() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    IndexWriter w = new IndexWriter(dir, iwc);
    for (int i = 0; i < 128; i++) {
        Document doc = new Document();
        String term = "" + (char) i;
        if (VERBOSE) {
            System.out.println("i=" + i + " term=" + term + " bytes=" + new BytesRef(term));
        }
        doc.add(newStringField("field", term, Field.Store.NO));
        w.addDocument(doc);
    }
    w.forceMerge(1);
    IndexReader r = DirectoryReader.open(w);
    TermsEnum te = MultiFields.getTerms(r, "field").iterator();
    if (VERBOSE) {
        BytesRef term;
        while ((term = te.next()) != null) {
            System.out.println("  " + te.ord() + ": " + term.utf8ToString());
        }
    }
    assertTrue(te.seekExact(new BytesRef("a")));
    assertEquals(97, te.ord());
    te.seekExact(98);
    assertEquals(new BytesRef("b"), te.term());
    assertTrue(te.seekExact(new BytesRef("z")));
    assertEquals(122, te.ord());
    r.close();
    w.close();
    dir.close();
}
Also used : MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) IndexWriter(org.apache.lucene.index.IndexWriter) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) IndexReader(org.apache.lucene.index.IndexReader) Document(org.apache.lucene.document.Document) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 30 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class TestOrdsBlockTree method testSeveralNonRootBlocks.

public void testSeveralNonRootBlocks() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    IndexWriter w = new IndexWriter(dir, iwc);
    List<String> terms = new ArrayList<>();
    for (int i = 0; i < 30; i++) {
        for (int j = 0; j < 30; j++) {
            Document doc = new Document();
            String term = "" + (char) (97 + i) + (char) (97 + j);
            terms.add(term);
            if (VERBOSE) {
                System.out.println("term=" + term);
            }
            doc.add(newTextField("body", term, Field.Store.NO));
            w.addDocument(doc);
        }
    }
    w.forceMerge(1);
    IndexReader r = DirectoryReader.open(w);
    TermsEnum te = MultiFields.getTerms(r, "body").iterator();
    for (int i = 0; i < 30; i++) {
        for (int j = 0; j < 30; j++) {
            String term = "" + (char) (97 + i) + (char) (97 + j);
            if (VERBOSE) {
                System.out.println("TEST: check term=" + term);
            }
            assertEquals(term, te.next().utf8ToString());
            assertEquals(30 * i + j, te.ord());
        }
    }
    testEnum(te, terms);
    te.seekExact(0);
    assertEquals("aa", te.term().utf8ToString());
    r.close();
    w.close();
    dir.close();
}
Also used : MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) IndexWriter(org.apache.lucene.index.IndexWriter) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) ArrayList(java.util.ArrayList) IndexReader(org.apache.lucene.index.IndexReader) Document(org.apache.lucene.document.Document) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) TermsEnum(org.apache.lucene.index.TermsEnum)

Aggregations

TermsEnum (org.apache.lucene.index.TermsEnum)153 BytesRef (org.apache.lucene.util.BytesRef)116 Terms (org.apache.lucene.index.Terms)101 PostingsEnum (org.apache.lucene.index.PostingsEnum)51 Term (org.apache.lucene.index.Term)31 ArrayList (java.util.ArrayList)30 IndexReader (org.apache.lucene.index.IndexReader)28 LeafReader (org.apache.lucene.index.LeafReader)28 Fields (org.apache.lucene.index.Fields)26 IOException (java.io.IOException)25 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)25 Document (org.apache.lucene.document.Document)24 Directory (org.apache.lucene.store.Directory)24 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)19 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)18 HashMap (java.util.HashMap)11 HashSet (java.util.HashSet)11 DirectoryReader (org.apache.lucene.index.DirectoryReader)11 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)10 Bits (org.apache.lucene.util.Bits)10