Search in sources :

Example 26 with CorruptIndexException

use of org.apache.lucene.index.CorruptIndexException in project lucene-solr by apache.

the class CompressingTermVectorsReader method get.

/**
 * Decodes the term vectors of document {@code doc} from the chunk read off {@code vectorsStream}.
 *
 * <p>Visible flow: read the chunk header ({@code docBase}, {@code chunkDocs}) and validate it,
 * work out how many fields precede the requested document in the chunk ({@code skip}), read the
 * field numbers, per-field flags, per-field term counts, prefix/suffix lengths and term
 * frequencies, then positions, offsets and payload lengths, and finally decompress the term and
 * payload bytes and wrap everything in a {@code TVFields}.
 *
 * <p>NOTE(review): this listing is extraction-damaged. Closing braces are absent and several
 * expressions are truncated (e.g. {@code sum +=;}, {@code (int);},
 * {@code final LongsRef next = - j);}), so it does not compile as shown. The inline review notes
 * below mark the places where a statement appears to be missing; confirm each against the
 * upstream Lucene source before relying on this text.
 *
 * @param doc the document whose term vectors are requested
 * @return the decoded vectors, or {@code null} when the document has no fields with vectors
 * @throws IOException if reading from the underlying stream fails; a {@code CorruptIndexException}
 *     is thrown when the chunk header is inconsistent with {@code doc} or {@code numDocs}
 */
public Fields get(int doc) throws IOException {
    // seek to the right place
    // NOTE(review): startPointer is computed but never used in the visible text; a seek of
    // vectorsStream to this pointer presumably belongs here - confirm.
        final long startPointer = indexReader.getStartPointer(doc);;
    // decode
    // - docBase: first doc ID of the chunk
    // - chunkDocs: number of docs of the chunk
    final int docBase = vectorsStream.readVInt();
    final int chunkDocs = vectorsStream.readVInt();
    // the requested doc must lie inside [docBase, docBase + chunkDocs) and the chunk must not
    // extend past numDocs
    if (doc < docBase || doc >= docBase + chunkDocs || docBase + chunkDocs > numDocs) {
        throw new CorruptIndexException("docBase=" + docBase + ",chunkDocs=" + chunkDocs + ",doc=" + doc, vectorsStream);
    // number of fields to skip
    final int skip;
    // number of fields of the document we're looking for
    final int numFields;
    // total number of fields of the chunk (sum for all docs)
    final int totalFields;
    if (chunkDocs == 1) {
        // single-document chunk: the field count is stored as a plain VInt
        skip = 0;
        numFields = totalFields = vectorsStream.readVInt();
    } else {
        reader.reset(vectorsStream, chunkDocs);
        int sum = 0;
        // NOTE(review): the right-hand sides of the three truncated statements below are missing;
        // presumably each consumes the next per-document field count from reader - confirm.
        for (int i = docBase; i < doc; ++i) {
            sum +=;
        skip = sum;
        numFields = (int);
        sum += numFields;
        for (int i = doc + 1; i < docBase + chunkDocs; ++i) {
            sum +=;
        totalFields = sum;
    if (numFields == 0) {
        // no vectors
        return null;
    // read field numbers that have term vectors
    final int[] fieldNums;
        final int token = vectorsStream.readByte() & 0xFF;
        // means no term vectors, cannot happen since we checked for numFields == 0
        assert token != 0;
        // low 5 bits of the token: bit width of each field number
        final int bitsPerFieldNum = token & 0x1F;
        // high 3 bits of the token: distinct field count; 0x07 means the remainder follows as a VInt
        int totalDistinctFields = token >>> 5;
        if (totalDistinctFields == 0x07) {
            totalDistinctFields += vectorsStream.readVInt();
        final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalDistinctFields, bitsPerFieldNum, 1);
        fieldNums = new int[totalDistinctFields];
        for (int i = 0; i < totalDistinctFields; ++i) {
            // NOTE(review): truncated; presumably the next value of the iterator `it` - confirm.
            fieldNums[i] = (int);
    // read field numbers and flags
    final int[] fieldNumOffs = new int[numFields];
    final PackedInts.Reader flags;
        final int bitsPerOff = PackedInts.bitsRequired(fieldNums.length - 1);
        final PackedInts.Reader allFieldNumOffs = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, bitsPerOff);
        // NOTE(review): no break statements or default label are visible in this switch; as shown,
        // case 0 would fall through into case 1 and the AssertionError - confirm against upstream.
        switch(vectorsStream.readVInt()) {
            case 0:
                // flags are stored once per distinct field number and expanded to one entry per field
                final PackedInts.Reader fieldFlags = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, fieldNums.length, FLAGS_BITS);
                PackedInts.Mutable f = PackedInts.getMutable(totalFields, FLAGS_BITS, PackedInts.COMPACT);
                for (int i = 0; i < totalFields; ++i) {
                    final int fieldNumOff = (int) allFieldNumOffs.get(i);
                    assert fieldNumOff >= 0 && fieldNumOff < fieldNums.length;
                    final int fgs = (int) fieldFlags.get(fieldNumOff);
                    f.set(i, fgs);
                flags = f;
            case 1:
                // flags are stored directly, one entry per field of the chunk
                flags = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, FLAGS_BITS);
                throw new AssertionError();
        // keep only the field-number offsets of the requested document
        for (int i = 0; i < numFields; ++i) {
            fieldNumOffs[i] = (int) allFieldNumOffs.get(skip + i);
    // number of terms per field for all fields
    final PackedInts.Reader numTerms;
    final int totalTerms;
        final int bitsRequired = vectorsStream.readVInt();
        numTerms = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, bitsRequired);
        int sum = 0;
        for (int i = 0; i < totalFields; ++i) {
            sum += numTerms.get(i);
        totalTerms = sum;
    // term lengths
    int docOff = 0, docLen = 0, totalLen;
    final int[] fieldLengths = new int[numFields];
    final int[][] prefixLengths = new int[numFields][];
    final int[][] suffixLengths = new int[numFields][];
        reader.reset(vectorsStream, totalTerms);
        // skip
        // NOTE(review): toSkip is accumulated but never consumed in the visible text; a skip on
        // reader presumably follows this loop - confirm.
        int toSkip = 0;
        for (int i = 0; i < skip; ++i) {
            toSkip += numTerms.get(i);
        // read prefix lengths
        for (int i = 0; i < numFields; ++i) {
            final int termCount = (int) numTerms.get(skip + i);
            final int[] fieldPrefixLengths = new int[termCount];
            prefixLengths[i] = fieldPrefixLengths;
            for (int j = 0; j < termCount; ) {
                // NOTE(review): truncated; presumably a bulk read of up to (termCount - j) values.
                final LongsRef next = - j);
                for (int k = 0; k < next.length; ++k) {
                    fieldPrefixLengths[j++] = (int) next.longs[next.offset + k];
        // move past the prefix lengths of the remaining documents, then start on suffix lengths
        reader.skip(totalTerms - reader.ord());
        reader.reset(vectorsStream, totalTerms);
        // skip
        toSkip = 0;
        for (int i = 0; i < skip; ++i) {
            for (int j = 0; j < numTerms.get(i); ++j) {
                // NOTE(review): truncated right-hand side.
                docOff +=;
        for (int i = 0; i < numFields; ++i) {
            final int termCount = (int) numTerms.get(skip + i);
            final int[] fieldSuffixLengths = new int[termCount];
            suffixLengths[i] = fieldSuffixLengths;
            for (int j = 0; j < termCount; ) {
                // NOTE(review): truncated, as for the prefix lengths above.
                final LongsRef next = - j);
                for (int k = 0; k < next.length; ++k) {
                    fieldSuffixLengths[j++] = (int) next.longs[next.offset + k];
            fieldLengths[i] = sum(suffixLengths[i]);
            docLen += fieldLengths[i];
        totalLen = docOff + docLen;
        for (int i = skip + numFields; i < totalFields; ++i) {
            for (int j = 0; j < numTerms.get(i); ++j) {
                // NOTE(review): truncated right-hand side.
                totalLen +=;
    // term freqs
    final int[] termFreqs = new int[totalTerms];
        reader.reset(vectorsStream, totalTerms);
        for (int i = 0; i < totalTerms; ) {
            // NOTE(review): truncated; presumably a bulk read of up to (totalTerms - i) values.
            final LongsRef next = - i);
            for (int k = 0; k < next.length; ++k) {
                // decoded values are offset by one: the frequency is 1 + the decoded value
                termFreqs[i++] = 1 + (int) next.longs[next.offset + k];
    // total number of positions, offsets and payloads
    int totalPositions = 0, totalOffsets = 0, totalPayloads = 0;
    for (int i = 0, termIndex = 0; i < totalFields; ++i) {
        final int f = (int) flags.get(i);
        final int termCount = (int) numTerms.get(i);
        for (int j = 0; j < termCount; ++j) {
            final int freq = termFreqs[termIndex++];
            if ((f & POSITIONS) != 0) {
                totalPositions += freq;
            if ((f & OFFSETS) != 0) {
                totalOffsets += freq;
            if ((f & PAYLOADS) != 0) {
                totalPayloads += freq;
        assert i != totalFields - 1 || termIndex == totalTerms : termIndex + " " + totalTerms;
    final int[][] positionIndex = positionIndex(skip, numFields, numTerms, termFreqs);
    final int[][] positions, startOffsets, lengths;
    if (totalPositions > 0) {
        positions = readPositions(skip, numFields, flags, numTerms, termFreqs, POSITIONS, totalPositions, positionIndex);
    } else {
        positions = new int[numFields][];
    if (totalOffsets > 0) {
        // average number of chars per term
        final float[] charsPerTerm = new float[fieldNums.length];
        for (int i = 0; i < charsPerTerm.length; ++i) {
            charsPerTerm[i] = Float.intBitsToFloat(vectorsStream.readInt());
        startOffsets = readPositions(skip, numFields, flags, numTerms, termFreqs, OFFSETS, totalOffsets, positionIndex);
        lengths = readPositions(skip, numFields, flags, numTerms, termFreqs, OFFSETS, totalOffsets, positionIndex);
        for (int i = 0; i < numFields; ++i) {
            final int[] fStartOffsets = startOffsets[i];
            final int[] fPositions = positions[i];
            // patch offsets from positions
            if (fStartOffsets != null && fPositions != null) {
                final float fieldCharsPerTerm = charsPerTerm[fieldNumOffs[i]];
                for (int j = 0; j < startOffsets[i].length; ++j) {
                    fStartOffsets[j] += (int) (fieldCharsPerTerm * fPositions[j]);
            if (fStartOffsets != null) {
                final int[] fPrefixLengths = prefixLengths[i];
                final int[] fSuffixLengths = suffixLengths[i];
                final int[] fLengths = lengths[i];
                for (int j = 0, end = (int) numTerms.get(skip + i); j < end; ++j) {
                    // delta-decode start offsets and  patch lengths using term lengths
                    final int termLength = fPrefixLengths[j] + fSuffixLengths[j];
                    lengths[i][positionIndex[i][j]] += termLength;
                    for (int k = positionIndex[i][j] + 1; k < positionIndex[i][j + 1]; ++k) {
                        fStartOffsets[k] += fStartOffsets[k - 1];
                        fLengths[k] += termLength;
    } else {
        startOffsets = lengths = new int[numFields][];
    if (totalPositions > 0) {
        // delta-decode positions
        for (int i = 0; i < numFields; ++i) {
            final int[] fPositions = positions[i];
            final int[] fpositionIndex = positionIndex[i];
            if (fPositions != null) {
                for (int j = 0, end = (int) numTerms.get(skip + i); j < end; ++j) {
                    // delta-decode start offsets
                    for (int k = fpositionIndex[j] + 1; k < fpositionIndex[j + 1]; ++k) {
                        fPositions[k] += fPositions[k - 1];
    // payload lengths
    final int[][] payloadIndex = new int[numFields][];
    int totalPayloadLength = 0;
    int payloadOff = 0;
    int payloadLen = 0;
    if (totalPayloads > 0) {
        reader.reset(vectorsStream, totalPayloads);
        // skip
        int termIndex = 0;
        for (int i = 0; i < skip; ++i) {
            final int f = (int) flags.get(i);
            final int termCount = (int) numTerms.get(i);
            if ((f & PAYLOADS) != 0) {
                for (int j = 0; j < termCount; ++j) {
                    final int freq = termFreqs[termIndex + j];
                    for (int k = 0; k < freq; ++k) {
                        // NOTE(review): truncated initializer.
                        final int l = (int);
                        payloadOff += l;
            termIndex += termCount;
        totalPayloadLength = payloadOff;
        // read doc payload lengths
        for (int i = 0; i < numFields; ++i) {
            final int f = (int) flags.get(skip + i);
            final int termCount = (int) numTerms.get(skip + i);
            if ((f & PAYLOADS) != 0) {
                final int totalFreq = positionIndex[i][termCount];
                payloadIndex[i] = new int[totalFreq + 1];
                int posIdx = 0;
                payloadIndex[i][posIdx] = payloadLen;
                for (int j = 0; j < termCount; ++j) {
                    final int freq = termFreqs[termIndex + j];
                    for (int k = 0; k < freq; ++k) {
                        // NOTE(review): truncated initializer.
                        final int payloadLength = (int);
                        payloadLen += payloadLength;
                        payloadIndex[i][posIdx + 1] = payloadLen;
                        // NOTE(review): posIdx is never advanced in the visible text although the
                        // assertion below expects it to reach totalFreq - an increment appears to
                        // be missing here; confirm.
                assert posIdx == totalFreq;
            termIndex += termCount;
        totalPayloadLength += payloadLen;
        for (int i = skip + numFields; i < totalFields; ++i) {
            final int f = (int) flags.get(i);
            final int termCount = (int) numTerms.get(i);
            if ((f & PAYLOADS) != 0) {
                for (int j = 0; j < termCount; ++j) {
                    final int freq = termFreqs[termIndex + j];
                    for (int k = 0; k < freq; ++k) {
                        // NOTE(review): truncated right-hand side.
                        totalPayloadLength +=;
            termIndex += termCount;
        assert termIndex == totalTerms : termIndex + " " + totalTerms;
    // decompress data
    final BytesRef suffixBytes = new BytesRef();
    decompressor.decompress(vectorsStream, totalLen + totalPayloadLength, docOff + payloadOff, docLen + payloadLen, suffixBytes);
    // the decompressed slice holds docLen bytes of term suffixes followed by payloadLen bytes of payloads
    suffixBytes.length = docLen;
    final BytesRef payloadBytes = new BytesRef(suffixBytes.bytes, suffixBytes.offset + docLen, payloadLen);
    // copy the per-field slices that belong to the requested document
    final int[] fieldFlags = new int[numFields];
    for (int i = 0; i < numFields; ++i) {
        fieldFlags[i] = (int) flags.get(skip + i);
    final int[] fieldNumTerms = new int[numFields];
    for (int i = 0; i < numFields; ++i) {
        fieldNumTerms[i] = (int) numTerms.get(skip + i);
    final int[][] fieldTermFreqs = new int[numFields][];
        int termIdx = 0;
        for (int i = 0; i < skip; ++i) {
            termIdx += numTerms.get(i);
        for (int i = 0; i < numFields; ++i) {
            final int termCount = (int) numTerms.get(skip + i);
            fieldTermFreqs[i] = new int[termCount];
            for (int j = 0; j < termCount; ++j) {
                fieldTermFreqs[i][j] = termFreqs[termIdx++];
    assert sum(fieldLengths) == docLen : sum(fieldLengths) + " != " + docLen;
    return new TVFields(fieldNums, fieldFlags, fieldNumOffs, fieldNumTerms, fieldLengths, prefixLengths, suffixLengths, fieldTermFreqs, positionIndex, positions, startOffsets, lengths, payloadBytes, payloadIndex, suffixBytes);
Also used : CorruptIndexException(org.apache.lucene.index.CorruptIndexException) TermVectorsReader(org.apache.lucene.codecs.TermVectorsReader) PackedInts(org.apache.lucene.util.packed.PackedInts) BlockPackedReaderIterator(org.apache.lucene.util.packed.BlockPackedReaderIterator) BytesRef(org.apache.lucene.util.BytesRef) LongsRef(org.apache.lucene.util.LongsRef)

Example 27 with CorruptIndexException

use of org.apache.lucene.index.CorruptIndexException in project lucene-solr by apache.

the class CompressingTermVectorsWriter method merge.

/**
 * Merges the term vectors of every reader in {@code mergeState} into this writer.
 *
 * <p>When the merge needs an index sort the work is delegated to {@code super.merge}. Otherwise
 * each reader is either bulk-copied chunk by chunk (when it is a compatible
 * {@code CompressingTermVectorsReader} with no deletions and is not "too dirty") or merged
 * document by document through {@code addAllDocVectors}.
 *
 * <p>NOTE(review): this listing is extraction-damaged. Closing braces are absent and some
 * statement bodies are empty; the inline review notes mark those places. Confirm against the
 * upstream Lucene source before relying on this text.
 *
 * @param mergeState the readers, live docs and field infos taking part in the merge
 * @return the number of documents counted in {@code docCount}
 * @throws IOException if reading or writing fails; a {@code CorruptIndexException} is thrown when
 *     a chunk header or the final file pointer of a bulk-copied reader is inconsistent
 */
public int merge(MergeState mergeState) throws IOException {
    if (mergeState.needsIndexSort) {
        // being copied over...?
        return super.merge(mergeState);
    int docCount = 0;
    int numReaders = mergeState.maxDocs.length;
    MatchingReaders matching = new MatchingReaders(mergeState);
    for (int readerIndex = 0; readerIndex < numReaders; readerIndex++) {
        CompressingTermVectorsReader matchingVectorsReader = null;
        final TermVectorsReader vectorsReader = mergeState.termVectorsReaders[readerIndex];
        if (matching.matchingReaders[readerIndex]) {
            // we can only bulk-copy if the matching reader is also a CompressingTermVectorsReader
            if (vectorsReader != null && vectorsReader instanceof CompressingTermVectorsReader) {
                matchingVectorsReader = (CompressingTermVectorsReader) vectorsReader;
        final int maxDoc = mergeState.maxDocs[readerIndex];
        final Bits liveDocs = mergeState.liveDocs[readerIndex];
        // bulk copy requires identical compression mode, chunk size, format version and packed-ints
        // version, bulk merging enabled, no deletions, and a reader that is not too dirty
        if (matchingVectorsReader != null && matchingVectorsReader.getCompressionMode() == compressionMode && matchingVectorsReader.getChunkSize() == chunkSize && matchingVectorsReader.getVersion() == VERSION_CURRENT && matchingVectorsReader.getPackedIntsVersion() == PackedInts.VERSION_CURRENT && BULK_MERGE_ENABLED && liveDocs == null && !tooDirty(matchingVectorsReader)) {
            // optimized merge, raw byte copy
            // its not worth fine-graining this if there are deletions.
            // flush any pending chunks
            // NOTE(review): the body of this `if` is empty in the visible text; per the comments a
            // forced flush (counted as a dirty chunk) presumably belongs here - confirm.
            if (!pendingDocs.isEmpty()) {
                // incomplete: we had to force this flush
            // iterate over each chunk. we use the vectors index to find chunk boundaries,
            // read the docstart + doccount from the chunk header (we write a new header, since doc numbers will change),
            // and just copy the bytes directly.
            IndexInput rawDocs = matchingVectorsReader.getVectorsStream();
            CompressingStoredFieldsIndexReader index = matchingVectorsReader.getIndexReader();
            int docID = 0;
            while (docID < maxDoc) {
                // read header
                int base = rawDocs.readVInt();
                // chunks must be contiguous: each chunk starts where the previous one ended
                if (base != docID) {
                    throw new CorruptIndexException("invalid state: base=" + base + ", docID=" + docID, rawDocs);
                int bufferedDocs = rawDocs.readVInt();
                // write a new index entry and new header for this chunk.
                indexWriter.writeIndex(bufferedDocs, vectorsStream.getFilePointer());
                // rebase
                // NOTE(review): no statement writing the new chunk header to vectorsStream is visible
                // here, although the comments above say one is written - confirm.
                docID += bufferedDocs;
                docCount += bufferedDocs;
                numDocs += bufferedDocs;
                if (docID > maxDoc) {
                    throw new CorruptIndexException("invalid state: base=" + base + ", count=" + bufferedDocs + ", maxDoc=" + maxDoc, rawDocs);
                // copy bytes until the next chunk boundary (or end of chunk data).
                // using the stored fields index for this isn't the most efficient, but fast enough
                // and is a source of redundancy for detecting bad things.
                final long end;
                if (docID == maxDoc) {
                    end = matchingVectorsReader.getMaxPointer();
                } else {
                    end = index.getStartPointer(docID);
                vectorsStream.copyBytes(rawDocs, end - rawDocs.getFilePointer());
            // after the last chunk the raw stream must sit exactly at the reader's max pointer
            if (rawDocs.getFilePointer() != matchingVectorsReader.getMaxPointer()) {
                throw new CorruptIndexException("invalid state: pos=" + rawDocs.getFilePointer() + ", max=" + matchingVectorsReader.getMaxPointer(), rawDocs);
            // since we bulk merged all chunks, we inherit any dirty ones from this segment.
            numChunks += matchingVectorsReader.getNumChunks();
            numDirtyChunks += matchingVectorsReader.getNumDirtyChunks();
        } else {
            // naive merge...
            // NOTE(review): the body of this `if` is empty in the visible text - confirm what is
            // done with a non-null reader before the per-document loop.
            if (vectorsReader != null) {
            for (int i = 0; i < maxDoc; i++) {
                // NOTE(review): the body of this `if` is empty in the visible text; presumably
                // deleted documents are skipped here - confirm.
                if (liveDocs != null && liveDocs.get(i) == false) {
                Fields vectors;
                if (vectorsReader == null) {
                    vectors = null;
                } else {
                    vectors = vectorsReader.get(i);
                addAllDocVectors(vectors, mergeState);
    finish(mergeState.mergeFieldInfos, docCount);
    return docCount;
Also used : Fields(org.apache.lucene.index.Fields) Bits(org.apache.lucene.util.Bits) IndexInput(org.apache.lucene.store.IndexInput) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) TermVectorsReader(org.apache.lucene.codecs.TermVectorsReader)

Example 28 with CorruptIndexException

use of org.apache.lucene.index.CorruptIndexException in project lucene-solr by apache.

the class Lucene70DocValuesProducer method readFields.

private void readFields(ChecksumIndexInput meta, FieldInfos infos) throws IOException {
    for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) {
        FieldInfo info = infos.fieldInfo(fieldNumber);
        if (info == null) {
            throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
        byte type = meta.readByte();
        if (type == Lucene70DocValuesFormat.NUMERIC) {
            numerics.put(, readNumeric(meta));
        } else if (type == Lucene70DocValuesFormat.BINARY) {
            binaries.put(, readBinary(meta));
        } else if (type == Lucene70DocValuesFormat.SORTED) {
            sorted.put(, readSorted(meta));
        } else if (type == Lucene70DocValuesFormat.SORTED_SET) {
            sortedSets.put(, readSortedSet(meta));
        } else if (type == Lucene70DocValuesFormat.SORTED_NUMERIC) {
            sortedNumerics.put(, readSortedNumeric(meta));
        } else {
            throw new CorruptIndexException("invalid type: " + type, meta);
Also used : CorruptIndexException(org.apache.lucene.index.CorruptIndexException) FieldInfo(org.apache.lucene.index.FieldInfo)

Example 29 with CorruptIndexException

use of org.apache.lucene.index.CorruptIndexException in project lucene-solr by apache.

the class Lucene70NormsProducer method readFields.

private void readFields(IndexInput meta, FieldInfos infos) throws IOException {
    for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) {
        FieldInfo info = infos.fieldInfo(fieldNumber);
        if (info == null) {
            throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
        } else if (!info.hasNorms()) {
            throw new CorruptIndexException("Invalid field: " +, meta);
        NormsEntry entry = new NormsEntry();
        entry.docsWithFieldOffset = meta.readLong();
        entry.docsWithFieldLength = meta.readLong();
        entry.numDocsWithField = meta.readInt();
        entry.bytesPerNorm = meta.readByte();
        switch(entry.bytesPerNorm) {
            case 0:
            case 1:
            case 2:
            case 4:
            case 8:
                throw new CorruptIndexException("Invalid bytesPerValue: " + entry.bytesPerNorm + ", field: " +, meta);
        entry.normsOffset = meta.readLong();
        norms.put(info.number, entry);
Also used : CorruptIndexException(org.apache.lucene.index.CorruptIndexException) FieldInfo(org.apache.lucene.index.FieldInfo)

Example 30 with CorruptIndexException

use of org.apache.lucene.index.CorruptIndexException in project lucene-solr by apache.

the class Lucene50FieldInfosFormat method read.

public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, IOContext context) throws IOException {
    final String fileName = IndexFileNames.segmentFileName(, segmentSuffix, EXTENSION);
    try (ChecksumIndexInput input = directory.openChecksumInput(fileName, context)) {
        Throwable priorE = null;
        FieldInfo[] infos = null;
        try {
            CodecUtil.checkIndexHeader(input, Lucene50FieldInfosFormat.CODEC_NAME, Lucene50FieldInfosFormat.FORMAT_START, Lucene50FieldInfosFormat.FORMAT_CURRENT, segmentInfo.getId(), segmentSuffix);
            //read in the size
            final int size = input.readVInt();
            infos = new FieldInfo[size];
            // previous field's attribute map, we share when possible:
            Map<String, String> lastAttributes = Collections.emptyMap();
            for (int i = 0; i < size; i++) {
                String name = input.readString();
                final int fieldNumber = input.readVInt();
                if (fieldNumber < 0) {
                    throw new CorruptIndexException("invalid field number for field: " + name + ", fieldNumber=" + fieldNumber, input);
                byte bits = input.readByte();
                boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0;
                boolean omitNorms = (bits & OMIT_NORMS) != 0;
                boolean storePayloads = (bits & STORE_PAYLOADS) != 0;
                final IndexOptions indexOptions = getIndexOptions(input, input.readByte());
                // DV Types are packed in one byte
                final DocValuesType docValuesType = getDocValuesType(input, input.readByte());
                final long dvGen = input.readLong();
                Map<String, String> attributes = input.readMapOfStrings();
                // just use the last field's map if its the same
                if (attributes.equals(lastAttributes)) {
                    attributes = lastAttributes;
                lastAttributes = attributes;
                try {
                    infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValuesType, dvGen, attributes, 0, 0);
                } catch (IllegalStateException e) {
                    throw new CorruptIndexException("invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e);
        } catch (Throwable exception) {
            priorE = exception;
        } finally {
            CodecUtil.checkFooter(input, priorE);
        return new FieldInfos(infos);
Also used : ChecksumIndexInput(org.apache.lucene.store.ChecksumIndexInput) IndexOptions(org.apache.lucene.index.IndexOptions) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) FieldInfos(org.apache.lucene.index.FieldInfos) DocValuesType(org.apache.lucene.index.DocValuesType) FieldInfo(org.apache.lucene.index.FieldInfo)


CorruptIndexException (org.apache.lucene.index.CorruptIndexException)64 ChecksumIndexInput ( IndexFormatTooNewException (org.apache.lucene.index.IndexFormatTooNewException)17 IndexFormatTooOldException (org.apache.lucene.index.IndexFormatTooOldException)17 Directory ( IndexInput ( IndexOutput ( IOException ( ArrayList (java.util.ArrayList)9 FileNotFoundException ( RAMDirectory ( BytesRef (org.apache.lucene.util.BytesRef)8 EOFException ( HashMap (java.util.HashMap)7 IOContext ( NoSuchFileException (java.nio.file.NoSuchFileException)6 AlreadyClosedException ( List (java.util.List)5 ElasticsearchException (org.elasticsearch.ElasticsearchException)5 AccessDeniedException (java.nio.file.AccessDeniedException)4