Search in sources:

Example 1 with OrcCorruptionException

use of io.trino.orc.OrcCorruptionException in project trino by trinodb.

the class UnionColumnReader method readBlock.

/**
 * Reads the next batch of union values and returns them as a {@link RowBlock}.
 *
 * <p>Any pending {@code readOffset} is consumed first: the skipped tags are read from the
 * data stream and each field reader is told how many of its own values to skip. The batch
 * itself is then materialized through {@code getBlocks}, or as empty field blocks when every
 * position in the batch is null.
 *
 * @return a row block with {@code nextBatchSize} positions built from the field blocks
 * @throws OrcCorruptionException if non-null values are expected but the data stream is
 *         missing, or if a skipped tag does not address one of the field readers
 * @throws IOException if reading from an underlying stream fails
 */
@Override
public Block readBlock() throws IOException {
    if (!rowGroupOpen) {
        openRowGroup();
    }
    if (readOffset > 0) {
        if (presentStream != null) {
            // skip ahead the present bit reader, but count the set bits
            // and use this as the number of tags to consume from the data stream
            readOffset = presentStream.countBitsSet(readOffset);
        }
        if (readOffset > 0) {
            if (dataStream == null) {
                throw new OrcCorruptionException(column.getOrcDataSourceId(), "Value is not null but data stream is missing");
            }
            // count, per field reader, how many of the skipped values belong to it
            int[] readOffsets = new int[fieldReaders.size()];
            for (byte tag : dataStream.next(readOffset)) {
                // the tag is read from the data stream; reject values that do not index a field reader
                // instead of failing with an ArrayIndexOutOfBoundsException
                if (tag < 0 || tag >= readOffsets.length) {
                    throw new OrcCorruptionException(column.getOrcDataSourceId(), "Union tag " + tag + " is out of range for " + readOffsets.length + " fields");
                }
                readOffsets[tag]++;
            }
            for (int i = 0; i < fieldReaders.size(); i++) {
                fieldReaders.get(i).prepareNextRead(readOffsets[i]);
            }
        }
    }
    boolean[] nullVector = null;
    Block[] blocks;
    if (presentStream == null) {
        // no present stream: every position in the batch has a value
        blocks = getBlocks(nextBatchSize);
    } else {
        nullVector = new boolean[nextBatchSize];
        int nullValues = presentStream.getUnsetBits(nextBatchSize, nullVector);
        if (nullValues != nextBatchSize) {
            // only the non-null positions have values to read
            blocks = getBlocks(nextBatchSize - nullValues);
        } else {
            // all positions are null: build empty field blocks (a TINYINT block followed by one per type parameter)
            List<Type> typeParameters = type.getTypeParameters();
            blocks = new Block[typeParameters.size() + 1];
            blocks[0] = TINYINT.createBlockBuilder(null, 0).build();
            for (int i = 0; i < typeParameters.size(); i++) {
                blocks[i + 1] = typeParameters.get(i).createBlockBuilder(null, 0).build();
            }
        }
    }
    // all field blocks must have the same position count
    verify(Arrays.stream(blocks).mapToInt(Block::getPositionCount).distinct().count() == 1);
    Block rowBlock = RowBlock.fromFieldBlocks(nextBatchSize, Optional.ofNullable(nullVector), blocks);
    readOffset = 0;
    nextBatchSize = 0;
    return rowBlock;
}
Also used : Type(io.trino.spi.type.Type) RowType(io.trino.spi.type.RowType) ReaderUtils.verifyStreamType(io.trino.orc.reader.ReaderUtils.verifyStreamType) LazyBlock(io.trino.spi.block.LazyBlock) Block(io.trino.spi.block.Block) RunLengthEncodedBlock(io.trino.spi.block.RunLengthEncodedBlock) RowBlock(io.trino.spi.block.RowBlock) ByteArrayBlock(io.trino.spi.block.ByteArrayBlock) OrcCorruptionException(io.trino.orc.OrcCorruptionException)

Example 2 with OrcCorruptionException

use of io.trino.orc.OrcCorruptionException in project trino by trinodb.

the class SliceDictionaryColumnReader method readBlock.

/**
 * Reads the next batch of values and returns them as a block.
 *
 * <p>Pending skipped rows ({@code readOffset}) are consumed from the present and data streams
 * before the batch is read. Depending on which streams exist and how many positions are null,
 * the batch is produced by {@code readAllNullsBlock}, {@code readNonNullBlock} or
 * {@code readNullBlock}.
 *
 * @return the block for the current batch
 * @throws OrcCorruptionException if a stream required by the data is missing
 * @throws IOException if reading from an underlying stream fails
 */
@Override
public Block readBlock() throws IOException {
    if (!rowGroupOpen) {
        openRowGroup();
    }

    // Translate the number of skipped rows into the number of skipped non-null values
    if (readOffset > 0 && presentStream != null) {
        readOffset = presentStream.countBitsSet(readOffset);
    }
    // Whatever is left after discounting nulls must be skipped in the data stream
    if (readOffset > 0) {
        if (dataStream == null) {
            throw new OrcCorruptionException(column.getOrcDataSourceId(), "Value is not null but data stream is missing");
        }
        dataStream.skip(readOffset);
    }

    final Block result;
    if (dataStream != null && presentStream != null) {
        boolean[] nullFlags = new boolean[nextBatchSize];
        int nulls = presentStream.getUnsetBits(nextBatchSize, nullFlags);
        if (nulls == 0) {
            result = readNonNullBlock();
        } else if (nulls == nextBatchSize) {
            result = readAllNullsBlock();
        } else {
            result = readNullBlock(nullFlags, nextBatchSize - nulls);
        }
    } else if (dataStream != null) {
        // no present stream, so nothing in the batch is null
        result = readNonNullBlock();
    } else {
        // no data stream, so the whole batch has to be null
        if (presentStream == null) {
            throw new OrcCorruptionException(column.getOrcDataSourceId(), "Value is null but present stream is missing");
        }
        presentStream.skip(nextBatchSize);
        result = readAllNullsBlock();
    }

    readOffset = 0;
    nextBatchSize = 0;
    return result;
}
Also used : DictionaryBlock(io.trino.spi.block.DictionaryBlock) Block(io.trino.spi.block.Block) RunLengthEncodedBlock(io.trino.spi.block.RunLengthEncodedBlock) VariableWidthBlock(io.trino.spi.block.VariableWidthBlock) OrcCorruptionException(io.trino.orc.OrcCorruptionException)

Example 3 with OrcCorruptionException

use of io.trino.orc.OrcCorruptionException in project trino by trinodb.

the class SliceDirectColumnReader method readBlock.

/**
 * Reads the next batch of variable-width values and returns them as a block.
 *
 * <p>Pending skipped rows ({@code readOffset}) are consumed first from the present, length and
 * data streams. The lengths of the batch are then read into an offset vector, the bytes are read
 * from the data stream, and the lengths are converted in place to offsets. When
 * {@code maxCodePointCount} is non-negative each value is truncated through
 * {@code computeTruncatedLength} while the offsets are built.
 *
 * @return a {@link VariableWidthBlock} for the batch, or the result of {@code readAllNullsBlock}
 *         when every position is null
 * @throws OrcCorruptionException if a stream required by the data is missing
 * @throws TrinoException if the total length of the batch exceeds {@code ONE_GIGABYTE}
 * @throws IOException if reading from an underlying stream fails
 */
@Override
public Block readBlock() throws IOException {
    if (!rowGroupOpen) {
        openRowGroup();
    }
    if (readOffset > 0) {
        if (presentStream != null) {
            // skip ahead the present bit reader, but count the set bits
            // and use this as the skip size for the length reader
            readOffset = presentStream.countBitsSet(readOffset);
        }
        if (readOffset > 0) {
            if (lengthStream == null) {
                throw new OrcCorruptionException(column.getOrcDataSourceId(), "Value is not null but length stream is missing");
            }
            // the sum of the skipped lengths is the number of bytes to skip in the data stream
            long dataSkipSize = lengthStream.sum(readOffset);
            if (dataSkipSize > 0) {
                if (dataStream == null) {
                    throw new OrcCorruptionException(column.getOrcDataSourceId(), "Value is not null but data stream is missing");
                }
                dataStream.skip(dataSkipSize);
            }
        }
    }
    if (lengthStream == null) {
        // without a length stream the whole batch has to be null
        if (presentStream == null) {
            throw new OrcCorruptionException(column.getOrcDataSourceId(), "Value is null but present stream is missing");
        }
        presentStream.skip(nextBatchSize);
        Block nullValueBlock = readAllNullsBlock();
        readOffset = 0;
        nextBatchSize = 0;
        return nullValueBlock;
    }
    // create new isNullVector and offsetVector for VariableWidthBlock
    boolean[] isNullVector = null;
    // We will use the offsetVector as the buffer to read the length values from lengthStream,
    // and the length values will be converted in-place to an offset vector.
    int[] offsetVector = new int[nextBatchSize + 1];
    if (presentStream == null) {
        lengthStream.next(offsetVector, nextBatchSize);
    } else {
        isNullVector = new boolean[nextBatchSize];
        int nullCount = presentStream.getUnsetBits(nextBatchSize, isNullVector);
        if (nullCount == nextBatchSize) {
            // all nulls
            Block nullValueBlock = readAllNullsBlock();
            readOffset = 0;
            nextBatchSize = 0;
            return nullValueBlock;
        }
        // lengthStream is known to be non-null here: the null case returned above
        if (nullCount == 0) {
            isNullVector = null;
            lengthStream.next(offsetVector, nextBatchSize);
        } else {
            lengthStream.next(offsetVector, nextBatchSize - nullCount);
            unpackLengthNulls(offsetVector, isNullVector, nextBatchSize - nullCount);
        }
    }
    // Calculate the total length for all entries. Note that the values in the offsetVector are still length values now.
    long totalLength = 0;
    for (int i = 0; i < nextBatchSize; i++) {
        totalLength += offsetVector[i];
    }
    // remember the batch size before resetting the reader state; everything below must use currentBatchSize
    int currentBatchSize = nextBatchSize;
    readOffset = 0;
    nextBatchSize = 0;
    if (totalLength == 0) {
        return new VariableWidthBlock(currentBatchSize, EMPTY_SLICE, offsetVector, Optional.ofNullable(isNullVector));
    }
    if (totalLength > ONE_GIGABYTE) {
        // report currentBatchSize: nextBatchSize has already been reset to 0 above
        throw new TrinoException(GENERIC_INTERNAL_ERROR, format("Values in column \"%s\" are too large to process for Trino. %s column values are larger than 1GB [%s]", column.getPath(), currentBatchSize, column.getOrcDataSourceId()));
    }
    if (dataStream == null) {
        throw new OrcCorruptionException(column.getOrcDataSourceId(), "Value is not null but data stream is missing");
    }
    // allocate enough space to read
    byte[] data = new byte[toIntExact(totalLength)];
    Slice slice = Slices.wrappedBuffer(data);
    if (maxCodePointCount < 0) {
        // unbounded, simply read all data in one shot
        dataStream.next(data, 0, data.length);
        convertLengthVectorToOffsetVector(offsetVector);
    } else {
        // We do the following operations together in the for loop:
        // * truncate strings
        // * convert original length values in offsetVector into truncated offset values
        int currentLength = offsetVector[0];
        offsetVector[0] = 0;
        for (int i = 1; i <= currentBatchSize; i++) {
            // offsetVector[i] still holds the length of entry i; save it before it is overwritten with an offset
            int nextLength = offsetVector[i];
            if (isNullVector != null && isNullVector[i - 1]) {
                checkState(currentLength == 0, "Corruption in slice direct stream: length is non-zero for null entry");
                offsetVector[i] = offsetVector[i - 1];
                currentLength = nextLength;
                continue;
            }
            int offset = offsetVector[i - 1];
            // read data without truncation
            dataStream.next(data, offset, offset + currentLength);
            // adjust offsetVector with truncated length
            int truncatedLength = computeTruncatedLength(slice, offset, currentLength, maxCodePointCount, isCharType);
            verify(truncatedLength >= 0);
            offsetVector[i] = offset + truncatedLength;
            currentLength = nextLength;
        }
    }
    // this can lead to over-retention but unlikely to happen given truncation rarely happens
    return new VariableWidthBlock(currentBatchSize, slice, offsetVector, Optional.ofNullable(isNullVector));
}
Also used : Slice(io.airlift.slice.Slice) Block(io.trino.spi.block.Block) RunLengthEncodedBlock(io.trino.spi.block.RunLengthEncodedBlock) VariableWidthBlock(io.trino.spi.block.VariableWidthBlock) TrinoException(io.trino.spi.TrinoException) OrcCorruptionException(io.trino.orc.OrcCorruptionException) VariableWidthBlock(io.trino.spi.block.VariableWidthBlock)

Example 4 with OrcCorruptionException

use of io.trino.orc.OrcCorruptionException in project trino by trinodb.

the class ByteInputStream method next.

/**
 * Copies the next {@code items} bytes of the stream into {@code values}, starting at index 0.
 *
 * <p>Bytes are taken from the internal buffer; whenever the buffer is exhausted
 * {@code readNextBlock} is called to refill it.
 *
 * @param values destination array
 * @param items number of bytes to copy
 * @throws OrcCorruptionException if the buffer is empty while bytes are still needed
 * @throws IOException if refilling the buffer fails
 */
public void next(byte[] values, int items) throws IOException {
    for (int written = 0; written < items; ) {
        if (offset == length) {
            readNextBlock();
        }
        if (length == 0) {
            throw new OrcCorruptionException(input.getOrcDataSourceId(), "Unexpected end of stream");
        }
        // copy as much as is both still wanted and currently buffered
        int wanted = items - written;
        int buffered = length - offset;
        int copyCount = min(wanted, buffered);
        System.arraycopy(buffer, offset, values, written, copyCount);
        offset += copyCount;
        written += copyCount;
    }
}
Also used : OrcCorruptionException(io.trino.orc.OrcCorruptionException) ByteStreamCheckpoint(io.trino.orc.checkpoint.ByteStreamCheckpoint)

Example 5 with OrcCorruptionException

use of io.trino.orc.OrcCorruptionException in project trino by trinodb.

the class CompressedOrcChunkLoader method seekToCheckpoint.

/**
 * Positions this loader at the given checkpoint.
 *
 * <p>The compressed offset decoded from the checkpoint either repositions the current
 * compressed buffer, when it falls inside it, or discards the buffer and records the offset
 * as the new buffer start. The decompressed offset and the checkpoint itself are then stored.
 *
 * @param checkpoint encoded checkpoint to seek to
 * @throws OrcCorruptionException if the compressed offset is at or beyond the size of the data reader
 * @throws IOException declared by the overridden method
 */
@Override
public void seekToCheckpoint(long checkpoint) throws IOException {
    int targetOffset = decodeCompressedBlockOffset(checkpoint);
    if (targetOffset >= dataReader.getSize()) {
        throw new OrcCorruptionException(dataReader.getOrcDataSourceId(), "Seek past end of stream");
    }

    // does the target fall outside the current compressed buffer?
    boolean outsideCurrentBuffer = targetOffset < compressedBufferStart
            || targetOffset >= compressedBufferStart + compressedBufferStream.length();
    if (outsideCurrentBuffer) {
        // drop the buffered data and restart from the target offset
        compressedBufferStart = targetOffset;
        compressedBufferStream = EMPTY_SLICE.getInput();
    } else {
        compressedBufferStream.setPosition(targetOffset - compressedBufferStart);
    }

    nextUncompressedOffset = decodeDecompressedOffset(checkpoint);
    lastCheckpoint = checkpoint;
}
Also used : OrcCorruptionException(io.trino.orc.OrcCorruptionException) InputStreamCheckpoint.createInputStreamCheckpoint(io.trino.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint)

Aggregations

OrcCorruptionException (io.trino.orc.OrcCorruptionException)33 Block (io.trino.spi.block.Block)13 RunLengthEncodedBlock (io.trino.spi.block.RunLengthEncodedBlock)11 LongStreamCheckpoint (io.trino.orc.checkpoint.LongStreamCheckpoint)7 DecimalStreamCheckpoint (io.trino.orc.checkpoint.DecimalStreamCheckpoint)4 InputStreamCheckpoint.createInputStreamCheckpoint (io.trino.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint)4 LongStreamV2Checkpoint (io.trino.orc.checkpoint.LongStreamV2Checkpoint)4 ByteArrayBlock (io.trino.spi.block.ByteArrayBlock)4 LongArrayBlock (io.trino.spi.block.LongArrayBlock)4 Slice (io.airlift.slice.Slice)3 LongStreamV1Checkpoint (io.trino.orc.checkpoint.LongStreamV1Checkpoint)3 ByteStreamCheckpoint (io.trino.orc.checkpoint.ByteStreamCheckpoint)2 ReaderUtils.verifyStreamType (io.trino.orc.reader.ReaderUtils.verifyStreamType)2 IntArrayBlock (io.trino.spi.block.IntArrayBlock)2 LazyBlock (io.trino.spi.block.LazyBlock)2 RowBlock (io.trino.spi.block.RowBlock)2 VariableWidthBlock (io.trino.spi.block.VariableWidthBlock)2 RowType (io.trino.spi.type.RowType)2 Type (io.trino.spi.type.Type)2 ColumnReaders.createColumnReader (io.trino.orc.reader.ColumnReaders.createColumnReader)1