Search in sources :

Example 1 with RcFileDecoderUtils.findFirstSyncPosition

Use of io.trino.rcfile.RcFileDecoderUtils.findFirstSyncPosition in the project trino by trinodb.

The method advance of the class RcFileReader.

/**
 * Advances the reader to the next chunk of rows.
 *
 * <p>Rows of a row group are exposed in chunks of at most {@code ColumnData.MAX_SIZE} rows.
 * While the current row group still has rows beyond the current position, this method only
 * moves the position forward. Otherwise it reads the next row group from {@code input}:
 * the optional sync sequence, the (possibly compressed) row group header, and the data
 * buffers of the columns present in {@code readColumns}; data of all other columns is skipped.
 *
 * <p>The reads from {@code input} and from the header are strictly sequential, so the order
 * of the statements below is significant.
 *
 * @return the number of rows in the new current chunk, or {@code -1} if the reader is already
 *         closed, the input has no bytes remaining, or a sync sequence starts at or beyond
 *         {@code end} (in the latter two cases {@code close()} is called first)
 * @throws IOException declared by the signature; presumably raised by the underlying input
 *         reads and by the {@code verify}/{@code corrupt} helpers on malformed data
 *         (those helpers are defined outside this block -- confirm)
 */
public int advance() throws IOException {
    if (closed) {
        return -1;
    }
    // move past the chunk exposed by the previous call (always a full MAX_SIZE step)
    rowGroupPosition += ColumnData.MAX_SIZE;
    // rows left in the row group from the new position, capped at one chunk;
    // a non-positive value means the row group is exhausted
    currentChunkRowCount = min(ColumnData.MAX_SIZE, rowGroupRowCount - rowGroupPosition);
    // do we still have rows in the current row group
    if (currentChunkRowCount > 0) {
        // write-validation hook; its semantics are defined outside this block
        validateWritePageChecksum();
        return currentChunkRowCount;
    }
    // are we at the end?
    if (input.remaining() == 0) {
        close();
        return -1;
    }
    // read uncompressed size of row group (which is useless information)
    verify(input.remaining() >= SIZE_OF_INT, "RCFile truncated %s", dataSource.getId());
    // the int is byte-swapped after reading; presumably the file and the input use
    // opposite byte orders -- confirm against the input implementation
    int unusedRowGroupSize = Integer.reverseBytes(input.readInt());
    // read sequence sync if present: a size of -1 is the marker that a sync sequence follows
    if (unusedRowGroupSize == -1) {
        // need two longs (the sync values) plus the int holding the real row group size
        verify(input.remaining() >= SIZE_OF_LONG + SIZE_OF_LONG + SIZE_OF_INT, "RCFile truncated %s", dataSource.getId());
        // The marker int has already been consumed, so its start is position - SIZE_OF_INT.
        // If the sync sequence starts at or beyond `end`, stop reading here.
        // NOTE: this decision must agree with RcFileDecoderUtils.findFirstSyncPosition
        if (input.position() - SIZE_OF_INT >= end) {
            close();
            return -1;
        }
        // both longs must equal the expected sync values; because of short-circuit
        // evaluation the second long is read only when the first one matches
        verify(syncFirst == input.readLong() && syncSecond == input.readLong(), "Invalid sync in RCFile %s", dataSource.getId());
        // read the useless uncompressed length
        unusedRowGroupSize = Integer.reverseBytes(input.readInt());
    } else if (rowsRead > 0) {
        // no sync marker although rows were already read from an earlier row group;
        // the predicate always returns false, so this presumably always reports a
        // failure when write validation is active -- confirm in validateWrite
        validateWrite(writeValidation -> false, "Expected sync sequence for every row group except the first one");
    }
    verify(unusedRowGroupSize > 0, "Invalid uncompressed row group length %s", unusedRowGroupSize);
    // read row group header: uncompressed size, compressed size, then the header bytes
    int uncompressedHeaderSize = Integer.reverseBytes(input.readInt());
    int compressedHeaderSize = Integer.reverseBytes(input.readInt());
    // the buffer is kept in a field and only replaced when it is too small
    if (compressedHeaderSize > compressedHeaderBuffer.length()) {
        compressedHeaderBuffer = Slices.allocate(compressedHeaderSize);
    }
    input.readBytes(compressedHeaderBuffer, 0, compressedHeaderSize);
    // decompress row group header
    Slice header;
    if (decompressor != null) {
        // grow the reusable output buffer if needed, then decompress into a view of
        // exactly uncompressedHeaderSize bytes
        if (headerBuffer.length() < uncompressedHeaderSize) {
            headerBuffer = Slices.allocate(uncompressedHeaderSize);
        }
        Slice buffer = headerBuffer.slice(0, uncompressedHeaderSize);
        decompressor.decompress(compressedHeaderBuffer, buffer);
        header = buffer;
    } else {
        // without a decompressor the two sizes must agree and the bytes are used as read;
        // note the whole buffer is used, which may be longer than compressedHeaderSize
        // when it was sized by an earlier, larger header
        verify(compressedHeaderSize == uncompressedHeaderSize, "Invalid RCFile %s", dataSource.getId());
        header = compressedHeaderBuffer;
    }
    BasicSliceInput headerInput = header.getInput();
    // read number of rows in row group
    rowGroupRowCount = toIntExact(readVInt(headerInput));
    rowsRead += rowGroupRowCount;
    // start at the beginning of the new row group; the first chunk is capped at MAX_SIZE
    rowGroupPosition = 0;
    currentChunkRowCount = min(ColumnData.MAX_SIZE, rowGroupRowCount);
    // set column buffers
    int totalCompressedDataSize = 0;
    for (int columnIndex = 0; columnIndex < columnCount; columnIndex++) {
        // per-column header entries, in order: compressed data size, uncompressed data
        // size, size of the lengths section, then the lengths bytes themselves
        int compressedDataSize = toIntExact(readVInt(headerInput));
        totalCompressedDataSize += compressedDataSize;
        int uncompressedDataSize = toIntExact(readVInt(headerInput));
        if (decompressor == null && compressedDataSize != uncompressedDataSize) {
            throw corrupt("Invalid RCFile %s", dataSource.getId());
        }
        int lengthsSize = toIntExact(readVInt(headerInput));
        Slice lengthsBuffer = headerInput.readSlice(lengthsSize);
        if (readColumns.containsKey(columnIndex)) {
            // selected column: take its data bytes from the main input
            Slice dataBuffer = input.readSlice(compressedDataSize);
            columns[columnIndex].setBuffers(lengthsBuffer, dataBuffer, uncompressedDataSize);
        } else {
            // unselected column: skip its data bytes so the input stays aligned with
            // the next column's data
            skipFully(input, compressedDataSize);
        }
    }
    // this value is not used but validate it is correct since it might signal corruption
    verify(unusedRowGroupSize == totalCompressedDataSize + uncompressedHeaderSize, "Invalid row group size");
    // write-validation hooks; their semantics are defined outside this block
    validateWriteRowGroupChecksum();
    validateWritePageChecksum();
    return currentChunkRowCount;
}
Also used : Slice(io.airlift.slice.Slice) BufferReference(io.airlift.slice.ChunkedSliceInput.BufferReference) Type(io.trino.spi.type.Type) Page(io.trino.spi.Page) SliceInput(io.airlift.slice.SliceInput) Unit(io.airlift.units.DataSize.Unit) WriteChecksum(io.trino.rcfile.RcFileWriteValidation.WriteChecksum) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) RcFileDecoderUtils.findFirstSyncPosition(io.trino.rcfile.RcFileDecoderUtils.findFirstSyncPosition) WriteChecksumBuilder.createWriteChecksumBuilder(io.trino.rcfile.RcFileWriteValidation.WriteChecksumBuilder.createWriteChecksumBuilder) Block(io.trino.spi.block.Block) Slices(io.airlift.slice.Slices) Map(java.util.Map) Objects.requireNonNull(java.util.Objects.requireNonNull) Math.toIntExact(java.lang.Math.toIntExact) RunLengthEncodedBlock(io.trino.spi.block.RunLengthEncodedBlock) SIZE_OF_INT(io.airlift.slice.SizeOf.SIZE_OF_INT) SliceLoader(io.airlift.slice.ChunkedSliceInput.SliceLoader) ByteStreams.skipFully(com.google.common.io.ByteStreams.skipFully) ImmutableMap(com.google.common.collect.ImmutableMap) Predicate(java.util.function.Predicate) RcFileDecoderUtils.readVInt(io.trino.rcfile.RcFileDecoderUtils.readVInt) IOException(java.io.IOException) Math.min(java.lang.Math.min) BasicSliceInput(io.airlift.slice.BasicSliceInput) Preconditions.checkState(com.google.common.base.Preconditions.checkState) UncheckedIOException(java.io.UncheckedIOException) DataSize(io.airlift.units.DataSize) List(java.util.List) SIZE_OF_LONG(io.airlift.slice.SizeOf.SIZE_OF_LONG) Closeable(java.io.Closeable) Entry(java.util.Map.Entry) WriteChecksumBuilder(io.trino.rcfile.RcFileWriteValidation.WriteChecksumBuilder) Optional(java.util.Optional) ChunkedSliceInput(io.airlift.slice.ChunkedSliceInput) Slice(io.airlift.slice.Slice) BasicSliceInput(io.airlift.slice.BasicSliceInput)

Aggregations

Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)1 Preconditions.checkState (com.google.common.base.Preconditions.checkState)1 ImmutableMap (com.google.common.collect.ImmutableMap)1 ByteStreams.skipFully (com.google.common.io.ByteStreams.skipFully)1 BasicSliceInput (io.airlift.slice.BasicSliceInput)1 ChunkedSliceInput (io.airlift.slice.ChunkedSliceInput)1 BufferReference (io.airlift.slice.ChunkedSliceInput.BufferReference)1 SliceLoader (io.airlift.slice.ChunkedSliceInput.SliceLoader)1 SIZE_OF_INT (io.airlift.slice.SizeOf.SIZE_OF_INT)1 SIZE_OF_LONG (io.airlift.slice.SizeOf.SIZE_OF_LONG)1 Slice (io.airlift.slice.Slice)1 SliceInput (io.airlift.slice.SliceInput)1 Slices (io.airlift.slice.Slices)1 DataSize (io.airlift.units.DataSize)1 Unit (io.airlift.units.DataSize.Unit)1 RcFileDecoderUtils.findFirstSyncPosition (io.trino.rcfile.RcFileDecoderUtils.findFirstSyncPosition)1 RcFileDecoderUtils.readVInt (io.trino.rcfile.RcFileDecoderUtils.readVInt)1 WriteChecksum (io.trino.rcfile.RcFileWriteValidation.WriteChecksum)1 WriteChecksumBuilder (io.trino.rcfile.RcFileWriteValidation.WriteChecksumBuilder)1 WriteChecksumBuilder.createWriteChecksumBuilder (io.trino.rcfile.RcFileWriteValidation.WriteChecksumBuilder.createWriteChecksumBuilder)1