
Example 41 with FieldInfo

Use of org.apache.lucene.index.FieldInfo in project lucene-solr by apache.

From the class TermVectorsWriter, the method addAllDocVectors:

/** Safe (but, slowish) default method to write every
   *  vector field in the document. */
protected final void addAllDocVectors(Fields vectors, MergeState mergeState) throws IOException {
    if (vectors == null) {
        startDocument(0);
        finishDocument();
        return;
    }
    int numFields = vectors.size();
    if (numFields == -1) {
        // count manually! TODO: Maybe enforce that Fields.size() returns something valid?
        numFields = 0;
        for (final Iterator<String> it = vectors.iterator(); it.hasNext(); ) {
            it.next();
            numFields++;
        }
    }
    startDocument(numFields);
    String lastFieldName = null;
    TermsEnum termsEnum = null;
    PostingsEnum docsAndPositionsEnum = null;
    int fieldCount = 0;
    for (String fieldName : vectors) {
        fieldCount++;
        final FieldInfo fieldInfo = mergeState.mergeFieldInfos.fieldInfo(fieldName);
        assert lastFieldName == null || fieldName.compareTo(lastFieldName) > 0 : "lastFieldName=" + lastFieldName + " fieldName=" + fieldName;
        lastFieldName = fieldName;
        final Terms terms = vectors.terms(fieldName);
        if (terms == null) {
            // FieldsEnum shouldn't lie...
            continue;
        }
        final boolean hasPositions = terms.hasPositions();
        final boolean hasOffsets = terms.hasOffsets();
        final boolean hasPayloads = terms.hasPayloads();
        assert !hasPayloads || hasPositions;
        int numTerms = (int) terms.size();
        if (numTerms == -1) {
            // count manually. It is stupid, but needed, as Terms.size() is not a mandatory statistics function
            numTerms = 0;
            termsEnum = terms.iterator();
            while (termsEnum.next() != null) {
                numTerms++;
            }
        }
        startField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
        termsEnum = terms.iterator();
        int termCount = 0;
        while (termsEnum.next() != null) {
            termCount++;
            final int freq = (int) termsEnum.totalTermFreq();
            startTerm(termsEnum.term(), freq);
            if (hasPositions || hasOffsets) {
                docsAndPositionsEnum = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);
                assert docsAndPositionsEnum != null;
                final int docID = docsAndPositionsEnum.nextDoc();
                assert docID != DocIdSetIterator.NO_MORE_DOCS;
                assert docsAndPositionsEnum.freq() == freq;
                for (int posUpto = 0; posUpto < freq; posUpto++) {
                    final int pos = docsAndPositionsEnum.nextPosition();
                    final int startOffset = docsAndPositionsEnum.startOffset();
                    final int endOffset = docsAndPositionsEnum.endOffset();
                    final BytesRef payload = docsAndPositionsEnum.getPayload();
                    assert !hasPositions || pos >= 0;
                    addPosition(pos, startOffset, endOffset, payload);
                }
            }
            finishTerm();
        }
        assert termCount == numTerms;
        finishField();
    }
    assert fieldCount == numFields;
    finishDocument();
}
Also used: Terms (org.apache.lucene.index.Terms), PostingsEnum (org.apache.lucene.index.PostingsEnum), FieldInfo (org.apache.lucene.index.FieldInfo), BytesRef (org.apache.lucene.util.BytesRef), TermsEnum (org.apache.lucene.index.TermsEnum)
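
On the read side, the same Fields -> Terms -> TermsEnum -> PostingsEnum walk applies. Below is a minimal sketch, not taken from the lucene-solr sources, that dumps one document's term vectors; the index path and document id are placeholder values for illustration.

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class DumpTermVectors {
    public static void main(String[] args) throws IOException {
        // Placeholder index path and document id for this sketch:
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")))) {
            Fields vectors = reader.getTermVectors(0);
            if (vectors == null) {
                return; // the document stored no term vectors
            }
            for (String field : vectors) {
                Terms terms = vectors.terms(field);
                if (terms == null) {
                    continue;
                }
                TermsEnum termsEnum = terms.iterator();
                PostingsEnum postings = null;
                BytesRef term;
                while ((term = termsEnum.next()) != null) {
                    // For term vectors, totalTermFreq() is the within-document frequency.
                    System.out.println(field + ":" + term.utf8ToString() + " freq=" + termsEnum.totalTermFreq());
                    if (terms.hasPositions()) {
                        postings = termsEnum.postings(postings, PostingsEnum.ALL);
                        postings.nextDoc(); // a term-vector enum holds exactly one document
                        for (int i = 0; i < postings.freq(); i++) {
                            System.out.println("  pos=" + postings.nextPosition());
                        }
                    }
                }
            }
        }
    }
}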

Example 42 with FieldInfo

Use of org.apache.lucene.index.FieldInfo in project lucene-solr by apache.

From the class SimpleTextFieldsWriter, the method write:

public void write(FieldInfos fieldInfos, Fields fields) throws IOException {
    // for each field
    for (String field : fields) {
        Terms terms = fields.terms(field);
        if (terms == null) {
            // Annoyingly, this can happen!
            continue;
        }
        FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
        boolean wroteField = false;
        boolean hasPositions = terms.hasPositions();
        boolean hasFreqs = terms.hasFreqs();
        boolean hasPayloads = fieldInfo.hasPayloads();
        boolean hasOffsets = terms.hasOffsets();
        int flags = 0;
        if (hasPositions) {
            flags = PostingsEnum.POSITIONS;
            if (hasPayloads) {
                flags = flags | PostingsEnum.PAYLOADS;
            }
            if (hasOffsets) {
                flags = flags | PostingsEnum.OFFSETS;
            }
        } else {
            if (hasFreqs) {
                flags = flags | PostingsEnum.FREQS;
            }
        }
        TermsEnum termsEnum = terms.iterator();
        PostingsEnum postingsEnum = null;
        // for each term in field
        while (true) {
            BytesRef term = termsEnum.next();
            if (term == null) {
                break;
            }
            postingsEnum = termsEnum.postings(postingsEnum, flags);
            assert postingsEnum != null : "termsEnum=" + termsEnum + " hasPos=" + hasPositions + " flags=" + flags;
            boolean wroteTerm = false;
            // for each doc in field+term
            while (true) {
                int doc = postingsEnum.nextDoc();
                if (doc == PostingsEnum.NO_MORE_DOCS) {
                    break;
                }
                if (!wroteTerm) {
                    if (!wroteField) {
                        // we lazily do this, in case the field had
                        // no terms              
                        write(FIELD);
                        write(field);
                        newline();
                        wroteField = true;
                    }
                    // we lazily do this, in case the term had
                    // zero docs
                    write(TERM);
                    write(term);
                    newline();
                    wroteTerm = true;
                }
                write(DOC);
                write(Integer.toString(doc));
                newline();
                if (hasFreqs) {
                    int freq = postingsEnum.freq();
                    write(FREQ);
                    write(Integer.toString(freq));
                    newline();
                    if (hasPositions) {
                        // for assert:
                        int lastStartOffset = 0;
                        // for each pos in field+term+doc
                        for (int i = 0; i < freq; i++) {
                            int position = postingsEnum.nextPosition();
                            write(POS);
                            write(Integer.toString(position));
                            newline();
                            if (hasOffsets) {
                                int startOffset = postingsEnum.startOffset();
                                int endOffset = postingsEnum.endOffset();
                                assert endOffset >= startOffset;
                                assert startOffset >= lastStartOffset : "startOffset=" + startOffset + " lastStartOffset=" + lastStartOffset;
                                lastStartOffset = startOffset;
                                write(START_OFFSET);
                                write(Integer.toString(startOffset));
                                newline();
                                write(END_OFFSET);
                                write(Integer.toString(endOffset));
                                newline();
                            }
                            BytesRef payload = postingsEnum.getPayload();
                            if (payload != null && payload.length > 0) {
                                assert payload.length != 0;
                                write(PAYLOAD);
                                write(payload);
                                newline();
                            }
                        }
                    }
                }
            }
        }
    }
}
Also used: Terms (org.apache.lucene.index.Terms), PostingsEnum (org.apache.lucene.index.PostingsEnum), FieldInfo (org.apache.lucene.index.FieldInfo), BytesRef (org.apache.lucene.util.BytesRef), TermsEnum (org.apache.lucene.index.TermsEnum)
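
The flags value computed above is a bitmask of PostingsEnum feature constants, requesting only what the field actually indexed. A minimal sketch of the same flag computation on the consuming side, assuming an already-open LeafReader and a hypothetical field name "body":

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

final class PostingsDump {
    // "body" is a placeholder field name for this sketch.
    static void dump(LeafReader leafReader) throws IOException {
        Terms terms = leafReader.terms("body");
        if (terms == null) {
            return; // field does not exist in this segment
        }
        int flags = 0;
        if (terms.hasPositions()) {
            flags |= PostingsEnum.POSITIONS;
            if (terms.hasPayloads()) {
                flags |= PostingsEnum.PAYLOADS;
            }
            if (terms.hasOffsets()) {
                flags |= PostingsEnum.OFFSETS;
            }
        } else if (terms.hasFreqs()) {
            flags |= PostingsEnum.FREQS;
        }
        TermsEnum termsEnum = terms.iterator();
        PostingsEnum postings = null;
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            postings = termsEnum.postings(postings, flags);
            int doc;
            while ((doc = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                // freq()/nextPosition() are only defined when the matching flag was requested.
                System.out.println(term.utf8ToString() + " doc=" + doc);
            }
        }
    }
}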

Example 43 with FieldInfo

Use of org.apache.lucene.index.FieldInfo in project lucene-solr by apache.

From the class SimpleTextStoredFieldsReader, the method visitDocument:

@Override
public void visitDocument(int n, StoredFieldVisitor visitor) throws IOException {
    in.seek(offsets[n]);
    while (true) {
        readLine();
        if (StringHelper.startsWith(scratch.get(), FIELD) == false) {
            break;
        }
        int fieldNumber = parseIntAt(FIELD.length);
        FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
        readLine();
        assert StringHelper.startsWith(scratch.get(), NAME);
        readLine();
        assert StringHelper.startsWith(scratch.get(), TYPE);
        final BytesRef type;
        if (equalsAt(TYPE_STRING, scratch.get(), TYPE.length)) {
            type = TYPE_STRING;
        } else if (equalsAt(TYPE_BINARY, scratch.get(), TYPE.length)) {
            type = TYPE_BINARY;
        } else if (equalsAt(TYPE_INT, scratch.get(), TYPE.length)) {
            type = TYPE_INT;
        } else if (equalsAt(TYPE_LONG, scratch.get(), TYPE.length)) {
            type = TYPE_LONG;
        } else if (equalsAt(TYPE_FLOAT, scratch.get(), TYPE.length)) {
            type = TYPE_FLOAT;
        } else if (equalsAt(TYPE_DOUBLE, scratch.get(), TYPE.length)) {
            type = TYPE_DOUBLE;
        } else {
            throw new RuntimeException("unknown field type");
        }
        switch(visitor.needsField(fieldInfo)) {
            case YES:
                readField(type, fieldInfo, visitor);
                break;
            case NO:
                readLine();
                assert StringHelper.startsWith(scratch.get(), VALUE);
                break;
            case STOP:
                return;
        }
    }
}
Also used: FieldInfo (org.apache.lucene.index.FieldInfo), BytesRef (org.apache.lucene.util.BytesRef)
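
The switch over visitor.needsField(fieldInfo) follows the StoredFieldVisitor contract: YES asks the reader to decode the value, NO skips it, and STOP aborts the rest of the document. A minimal sketch of a custom visitor that loads a single string field and then stops; the stringField(FieldInfo, byte[]) signature matches the Lucene snapshot shown here, and the target field name is whatever the caller passes in:

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.StoredFieldVisitor;

// Loads one stored string field, then tells the reader to stop early.
final class SingleFieldVisitor extends StoredFieldVisitor {
    private final String wanted;
    private String value;

    SingleFieldVisitor(String wanted) {
        this.wanted = wanted;
    }

    @Override
    public Status needsField(FieldInfo fieldInfo) {
        if (value != null) {
            return Status.STOP; // already found it; skip the rest of the document
        }
        return fieldInfo.name.equals(wanted) ? Status.YES : Status.NO;
    }

    @Override
    public void stringField(FieldInfo fieldInfo, byte[] bytes) throws IOException {
        value = new String(bytes, StandardCharsets.UTF_8);
    }

    String value() {
        return value;
    }
}

It would be driven as reader.document(docID, visitor); afterwards visitor.value() holds the field's value if the document stored it.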

Example 44 with FieldInfo

Use of org.apache.lucene.index.FieldInfo in project lucene-solr by apache.

From the class PointsWriter, the method mergeOneField:

/** Default naive merge implementation for one field: it just re-indexes all the values
   *  from the incoming segment.  The default codec overrides this for 1D fields and uses
   *  a faster but more complex implementation. */
protected void mergeOneField(MergeState mergeState, FieldInfo fieldInfo) throws IOException {
    long maxPointCount = 0;
    int docCount = 0;
    for (int i = 0; i < mergeState.pointsReaders.length; i++) {
        PointsReader pointsReader = mergeState.pointsReaders[i];
        if (pointsReader != null) {
            FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name);
            if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) {
                PointValues values = pointsReader.getValues(fieldInfo.name);
                if (values != null) {
                    maxPointCount += values.size();
                    docCount += values.getDocCount();
                }
            }
        }
    }
    final long finalMaxPointCount = maxPointCount;
    final int finalDocCount = docCount;
    writeField(fieldInfo, new PointsReader() {

        @Override
        public long ramBytesUsed() {
            return 0;
        }

        @Override
        public void close() throws IOException {
        }

        @Override
        public PointValues getValues(String fieldName) {
            if (fieldName.equals(fieldInfo.name) == false) {
                throw new IllegalArgumentException("field name must match the field being merged");
            }
            return new PointValues() {

                @Override
                public void intersect(IntersectVisitor mergedVisitor) throws IOException {
                    for (int i = 0; i < mergeState.pointsReaders.length; i++) {
                        PointsReader pointsReader = mergeState.pointsReaders[i];
                        if (pointsReader == null) {
                            // This segment has no points
                            continue;
                        }
                        FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldName);
                        if (readerFieldInfo == null) {
                            // This segment never saw this field
                            continue;
                        }
                        if (readerFieldInfo.getPointDimensionCount() == 0) {
                            // This segment saw this field, but the field did not index points in it:
                            continue;
                        }
                        PointValues values = pointsReader.getValues(fieldName);
                        if (values == null) {
                            continue;
                        }
                        MergeState.DocMap docMap = mergeState.docMaps[i];
                        values.intersect(new IntersectVisitor() {

                            @Override
                            public void visit(int docID) {
                                // Should never be called because our compare method never returns Relation.CELL_INSIDE_QUERY
                                throw new IllegalStateException();
                            }

                            @Override
                            public void visit(int docID, byte[] packedValue) throws IOException {
                                int newDocID = docMap.get(docID);
                                if (newDocID != -1) {
                                    // Not deleted:
                                    mergedVisitor.visit(newDocID, packedValue);
                                }
                            }

                            @Override
                            public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
                                // Forces this segment's PointsReader to always visit all docs + values:
                                return Relation.CELL_CROSSES_QUERY;
                            }
                        });
                    }
                }

                @Override
                public long estimatePointCount(IntersectVisitor visitor) {
                    throw new UnsupportedOperationException();
                }

                @Override
                public byte[] getMinPackedValue() {
                    throw new UnsupportedOperationException();
                }

                @Override
                public byte[] getMaxPackedValue() {
                    throw new UnsupportedOperationException();
                }

                @Override
                public int getNumDimensions() {
                    throw new UnsupportedOperationException();
                }

                @Override
                public int getBytesPerDimension() {
                    throw new UnsupportedOperationException();
                }

                @Override
                public long size() {
                    return finalMaxPointCount;
                }

                @Override
                public int getDocCount() {
                    return finalDocCount;
                }
            };
        }

        @Override
        public void checkIntegrity() throws IOException {
            throw new UnsupportedOperationException();
        }
    });
}
Also used: IOException (java.io.IOException), PointValues (org.apache.lucene.index.PointValues), FieldInfo (org.apache.lucene.index.FieldInfo)
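
The anonymous IntersectVisitor above never answers Relation.CELL_INSIDE_QUERY from compare, so the BKD tree must call visit(docID, packedValue) for every surviving point. The same trick can enumerate all points of a field at search time. A minimal sketch, assuming a Lucene version with the per-field LeafReader.getPointValues(String) accessor (matching this snapshot's per-field PointsReader.getValues):

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;

final class PointDumper {
    // Visits every point of `field` in one segment; the field name is supplied by the caller.
    static void dumpAllPoints(LeafReader leafReader, String field) throws IOException {
        PointValues values = leafReader.getPointValues(field);
        if (values == null) {
            return; // this segment indexed no points for the field
        }
        values.intersect(new IntersectVisitor() {
            @Override
            public void visit(int docID) {
                // Unreachable: compare() below never answers CELL_INSIDE_QUERY.
                throw new IllegalStateException();
            }

            @Override
            public void visit(int docID, byte[] packedValue) {
                System.out.println("doc=" + docID + " packedLength=" + packedValue.length);
            }

            @Override
            public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
                // Forces a full visit of every doc + value:
                return Relation.CELL_CROSSES_QUERY;
            }
        });
    }
}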

Example 45 with FieldInfo

Use of org.apache.lucene.index.FieldInfo in project lucene-solr by apache.

From the class DocValuesConsumer, the method mergeSortedField:

/**
   * Merges the sorted docvalues from <code>toMerge</code>.
   * <p>
   * The default implementation calls {@link #addSortedField}, passing
   * an Iterable that merges ordinals and values and filters deleted documents.
   */
public void mergeSortedField(FieldInfo fieldInfo, final MergeState mergeState) throws IOException {
    List<SortedDocValues> toMerge = new ArrayList<>();
    for (int i = 0; i < mergeState.docValuesProducers.length; i++) {
        SortedDocValues values = null;
        DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i];
        if (docValuesProducer != null) {
            FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name);
            if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) {
                values = docValuesProducer.getSorted(fieldInfo);
            }
        }
        if (values == null) {
            values = DocValues.emptySorted();
        }
        toMerge.add(values);
    }
    final int numReaders = toMerge.size();
    final SortedDocValues[] dvs = toMerge.toArray(new SortedDocValues[numReaders]);
    // step 1: iterate thru each sub and mark terms still in use
    TermsEnum[] liveTerms = new TermsEnum[dvs.length];
    long[] weights = new long[liveTerms.length];
    for (int sub = 0; sub < numReaders; sub++) {
        SortedDocValues dv = dvs[sub];
        Bits liveDocs = mergeState.liveDocs[sub];
        if (liveDocs == null) {
            liveTerms[sub] = dv.termsEnum();
            weights[sub] = dv.getValueCount();
        } else {
            LongBitSet bitset = new LongBitSet(dv.getValueCount());
            int docID;
            while ((docID = dv.nextDoc()) != NO_MORE_DOCS) {
                if (liveDocs.get(docID)) {
                    int ord = dv.ordValue();
                    if (ord >= 0) {
                        bitset.set(ord);
                    }
                }
            }
            liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
            weights[sub] = bitset.cardinality();
        }
    }
    // step 2: create ordinal map (this conceptually does the "merging")
    final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT);
    // step 3: add field
    addSortedField(fieldInfo, new EmptyDocValuesProducer() {

        @Override
        public SortedDocValues getSorted(FieldInfo fieldInfoIn) throws IOException {
            if (fieldInfoIn != fieldInfo) {
                throw new IllegalArgumentException("wrong FieldInfo");
            }
            // We must make new iterators + DocIDMerger for each iterator:
            List<SortedDocValuesSub> subs = new ArrayList<>();
            long cost = 0;
            for (int i = 0; i < mergeState.docValuesProducers.length; i++) {
                SortedDocValues values = null;
                DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i];
                if (docValuesProducer != null) {
                    FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name);
                    if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) {
                        values = docValuesProducer.getSorted(readerFieldInfo);
                    }
                }
                if (values == null) {
                    values = DocValues.emptySorted();
                }
                cost += values.cost();
                subs.add(new SortedDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i)));
            }
            final long finalCost = cost;
            final DocIDMerger<SortedDocValuesSub> docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);
            return new SortedDocValues() {

                private int docID = -1;

                private int ord;

                @Override
                public int docID() {
                    return docID;
                }

                @Override
                public int nextDoc() throws IOException {
                    SortedDocValuesSub sub = docIDMerger.next();
                    if (sub == null) {
                        return docID = NO_MORE_DOCS;
                    }
                    int subOrd = sub.values.ordValue();
                    assert subOrd != -1;
                    ord = (int) sub.map.get(subOrd);
                    docID = sub.mappedDocID;
                    return docID;
                }

                @Override
                public int ordValue() {
                    return ord;
                }

                @Override
                public int advance(int target) {
                    throw new UnsupportedOperationException();
                }

                @Override
                public boolean advanceExact(int target) throws IOException {
                    throw new UnsupportedOperationException();
                }

                @Override
                public long cost() {
                    return finalCost;
                }

                @Override
                public int getValueCount() {
                    return (int) map.getValueCount();
                }

                @Override
                public BytesRef lookupOrd(int ord) throws IOException {
                    int segmentNumber = map.getFirstSegmentNumber(ord);
                    int segmentOrd = (int) map.getFirstSegmentOrd(ord);
                    return dvs[segmentNumber].lookupOrd(segmentOrd);
                }
            };
        }
    });
}
Also used: ArrayList (java.util.ArrayList), List (java.util.List), EmptyDocValuesProducer (org.apache.lucene.index.EmptyDocValuesProducer), LongBitSet (org.apache.lucene.util.LongBitSet), IOException (java.io.IOException), SortedDocValues (org.apache.lucene.index.SortedDocValues), OrdinalMap (org.apache.lucene.index.MultiDocValues.OrdinalMap), TermsEnum (org.apache.lucene.index.TermsEnum), FilteredTermsEnum (org.apache.lucene.index.FilteredTermsEnum), DocIDMerger (org.apache.lucene.index.DocIDMerger), Bits (org.apache.lucene.util.Bits), FieldInfo (org.apache.lucene.index.FieldInfo), BytesRef (org.apache.lucene.util.BytesRef)
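
The OrdinalMap built in step 2 is the same utility used at search time to fold per-segment ordinals into one global ordinal space. A minimal sketch, not from the lucene-solr sources, of building such a map over the segments of an open DirectoryReader; the field name "category" is a placeholder and is assumed to carry SORTED doc values:

import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues.OrdinalMap;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.packed.PackedInts;

final class GlobalOrds {
    static void printFirstGlobalTerm(DirectoryReader reader) throws IOException {
        int numLeaves = reader.leaves().size();
        SortedDocValues[] perSegment = new SortedDocValues[numLeaves];
        TermsEnum[] termEnums = new TermsEnum[numLeaves];
        long[] weights = new long[numLeaves];
        for (int i = 0; i < numLeaves; i++) {
            LeafReaderContext ctx = reader.leaves().get(i);
            SortedDocValues dv = ctx.reader().getSortedDocValues("category");
            perSegment[i] = dv == null ? DocValues.emptySorted() : dv;
            termEnums[i] = perSegment[i].termsEnum();
            weights[i] = perSegment[i].getValueCount();
        }
        OrdinalMap map = OrdinalMap.build(null, termEnums, weights, PackedInts.DEFAULT);
        if (map.getValueCount() == 0) {
            return; // no values in any segment
        }
        // Resolve global ordinal 0 back to its term via the owning segment:
        long globalOrd = 0;
        int segment = map.getFirstSegmentNumber(globalOrd);
        int segmentOrd = (int) map.getFirstSegmentOrd(globalOrd);
        BytesRef term = perSegment[segment].lookupOrd(segmentOrd);
        System.out.println("global ord 0 -> " + term.utf8ToString());
    }
}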

Aggregations

FieldInfo (org.apache.lucene.index.FieldInfo): 53
BytesRef (org.apache.lucene.util.BytesRef): 13
LeafReader (org.apache.lucene.index.LeafReader): 12
ArrayList (java.util.ArrayList): 10
Terms (org.apache.lucene.index.Terms): 9
TermsEnum (org.apache.lucene.index.TermsEnum): 9
IOException (java.io.IOException): 8
FieldInfos (org.apache.lucene.index.FieldInfos): 8
HashMap (java.util.HashMap): 7
LeafReaderContext (org.apache.lucene.index.LeafReaderContext): 7
DocValuesType (org.apache.lucene.index.DocValuesType): 6
PointValues (org.apache.lucene.index.PointValues): 6
IndexOutput (org.apache.lucene.store.IndexOutput): 6
CorruptIndexException (org.apache.lucene.index.CorruptIndexException): 5
SortedSetDocValues (org.apache.lucene.index.SortedSetDocValues): 5
StoredFieldVisitor (org.apache.lucene.index.StoredFieldVisitor): 5
Map (java.util.Map): 4
Document (org.apache.lucene.document.Document): 4
EmptyDocValuesProducer (org.apache.lucene.index.EmptyDocValuesProducer): 4
IndexReader (org.apache.lucene.index.IndexReader): 4