Search in sources :

Example 21 with OrcCorruptionException

use of com.facebook.presto.orc.OrcCorruptionException in project presto by prestodb.

the class TestDwrfMetadataReader method testReadStripeFooterThrowsForLargeStreams.

/**
 * Checks that reading a stripe footer whose only stream declares a length of
 * {@code Long.MAX_VALUE} is rejected with an {@code OrcCorruptionException}
 * carrying the expected message.
 */
@Test
public void testReadStripeFooterThrowsForLargeStreams() {
    // Serialize a stripe footer that holds a single DATA stream with an oversized length.
    DwrfProto.Stream oversizedStream = DwrfProto.Stream.newBuilder()
            .setKind(DATA)
            .setLength(Long.MAX_VALUE)
            .build();
    byte[] serializedFooter = DwrfProto.StripeFooter.newBuilder()
            .addStreams(oversizedStream)
            .build()
            .toByteArray();
    InputStream footerInput = new ByteArrayInputStream(serializedFooter);
    OrcDataSourceId dataSourceId = new OrcDataSourceId("test");

    // The reader is expected to fail rather than accept the declared stream size.
    OrcCorruptionException thrown = expectThrows(
            OrcCorruptionException.class,
            () -> dwrfMetadataReader.readStripeFooter(dataSourceId, ImmutableList.of(), footerInput));

    assertEquals(thrown.getMessage(), "java.io.IOException: Malformed ORC file. Stream size 9223372036854775807 of one of the streams for column 0 is larger than supported size 2147483647 [test]");
}
Also used : OrcDataSourceId(com.facebook.presto.orc.OrcDataSourceId) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) DwrfProto(com.facebook.presto.orc.proto.DwrfProto) OrcCorruptionException(com.facebook.presto.orc.OrcCorruptionException) Test(org.testng.annotations.Test)

Example 22 with OrcCorruptionException

use of com.facebook.presto.orc.OrcCorruptionException in project presto by prestodb.

the class FloatBatchStreamReader method readBlock.

/**
 * Produces the block for the pending batch of {@code nextBatchSize} positions.
 *
 * <p>Opens the row group on first use, applies any pending {@code readOffset} skip,
 * and then either returns a run-length encoded block of nulls (when there is a present
 * stream but no data stream) or builds a REAL block from the data stream. Both
 * {@code readOffset} and {@code nextBatchSize} are reset to zero before returning.
 *
 * @return the block holding the values of the batch
 * @throws OrcCorruptionException if a non-null value is expected but the data stream is missing
 * @throws IOException if reading from the underlying streams fails
 */
@Override
public Block readBlock() throws IOException {
    if (!rowGroupOpen) {
        openRowGroup();
    }

    // Consume the pending skip, if any.
    if (readOffset > 0) {
        if (presentStream != null) {
            // Advance the present-bit reader; the number of set bits it passed over
            // becomes the number of values to skip in the data reader.
            readOffset = presentStream.countBitsSet(readOffset);
        }
        if (readOffset > 0) {
            if (dataStream == null) {
                throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Value is not null but data stream is not present");
            }
            dataStream.skip(readOffset);
        }
    }

    // A present stream without a data stream: emit a run-length encoded block of nulls.
    if (presentStream != null && dataStream == null) {
        presentStream.skip(nextBatchSize);
        Block allNulls = RunLengthEncodedBlock.create(REAL, null, nextBatchSize);
        readOffset = 0;
        nextBatchSize = 0;
        return allNulls;
    }

    BlockBuilder output = REAL.createBlockBuilder(null, nextBatchSize);
    if (presentStream != null) {
        // Interleave nulls and values according to the present bits.
        int position = 0;
        while (position < nextBatchSize) {
            if (presentStream.nextBit()) {
                REAL.writeLong(output, floatToRawIntBits(dataStream.next()));
            }
            else {
                output.appendNull();
            }
            position++;
        }
    }
    else {
        // No present stream: every position must have a value in the data stream.
        if (dataStream == null) {
            throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Value is not null but data stream is not present");
        }
        dataStream.nextVector(REAL, nextBatchSize, output);
    }

    readOffset = 0;
    nextBatchSize = 0;
    return output.build();
}
Also used : RunLengthEncodedBlock(com.facebook.presto.common.block.RunLengthEncodedBlock) Block(com.facebook.presto.common.block.Block) OrcCorruptionException(com.facebook.presto.orc.OrcCorruptionException) BlockBuilder(com.facebook.presto.common.block.BlockBuilder)

Example 23 with OrcCorruptionException

use of com.facebook.presto.orc.OrcCorruptionException in project presto by prestodb.

the class SliceDictionarySelectiveReader method openRowGroup.

/**
 * Prepares this reader for the current row group.
 *
 * <p>If the stripe dictionary has not been opened yet, its lengths and data are loaded
 * (or the empty-dictionary constants are used when {@code stripeDictionarySize} is zero).
 * Then, if a row group dictionary length stream is available, the row group dictionary is
 * loaded starting at index {@code stripeDictionarySize}. Finally the present, in-dictionary
 * and data streams are opened and {@code rowGroupOpen} is set.
 *
 * @throws OrcCorruptionException if the stripe dictionary is non-empty but its length stream is missing
 * @throws IOException if reading from the underlying streams fails
 */
private void openRowGroup() throws IOException {
    // read the dictionary
    if (!stripeDictionaryOpen) {
        if (stripeDictionarySize > 0) {
            // resize the dictionary lengths array if necessary
            if (stripeDictionaryLength.length < stripeDictionarySize) {
                stripeDictionaryLength = new int[stripeDictionarySize];
            }
            // read the lengths
            LongInputStream lengthStream = stripeDictionaryLengthStreamSource.openStream();
            if (lengthStream == null) {
                throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Dictionary is not empty but dictionary length stream is not present");
            }
            lengthStream.nextIntVector(stripeDictionarySize, stripeDictionaryLength, 0);
            // total byte length of all stripe dictionary entries; accumulated as long and
            // narrowed with toIntExact below (presumably Math.toIntExact, which throws on overflow)
            long dataLength = 0;
            for (int i = 0; i < stripeDictionarySize; i++) {
                dataLength += stripeDictionaryLength[i];
            }
            dictionaryData = ensureCapacity(dictionaryData, toIntExact(dataLength));
            // NOTE(review): the "+ 2" presumably leaves room for the end offset of the last entry
            // plus the extra trailing entry written at the end of this method — confirm
            dictionaryOffsetVector = ensureCapacity(dictionaryOffsetVector, stripeDictionarySize + 2);
            // read dictionary values
            ByteArrayInputStream dictionaryDataStream = stripeDictionaryDataStreamSource.openStream();
            readDictionary(dictionaryDataStream, stripeDictionarySize, stripeDictionaryLength, 0, dictionaryData, dictionaryOffsetVector, maxCodePointCount, isCharType);
        } else {
            dictionaryData = EMPTY_DICTIONARY_DATA;
            dictionaryOffsetVector = EMPTY_DICTIONARY_OFFSETS;
        }
        // If there is no rowgroup dictionary, we only need to wrap the stripe dictionary once per stripe because wrapping dictionary is very expensive.
        dictionaryWrapped = false;
    }
    // read row group dictionary
    RowGroupDictionaryLengthInputStream dictionaryLengthStream = rowGroupDictionaryLengthStreamSource.openStream();
    if (dictionaryLengthStream != null) {
        int rowGroupDictionarySize = dictionaryLengthStream.getEntryCount();
        rowGroupDictionaryLength = ensureCapacity(rowGroupDictionaryLength, rowGroupDictionarySize);
        // read the lengths
        dictionaryLengthStream.nextIntVector(rowGroupDictionarySize, rowGroupDictionaryLength, 0);
        long dataLength = 0;
        for (int i = 0; i < rowGroupDictionarySize; i++) {
            dataLength += rowGroupDictionaryLength[i];
        }
        // grow both buffers so the row group entries fit after the stripe entries;
        // dictionaryOffsetVector[stripeDictionarySize] is used as the size of the stripe dictionary data.
        // NOTE(review): the PRESERVE argument presumably keeps the already-loaded stripe dictionary contents — confirm
        dictionaryData = ensureCapacity(dictionaryData, dictionaryOffsetVector[stripeDictionarySize] + toIntExact(dataLength), MEDIUM, PRESERVE);
        dictionaryOffsetVector = ensureCapacity(dictionaryOffsetVector, stripeDictionarySize + rowGroupDictionarySize + 2, MEDIUM, PRESERVE);
        // the combined dictionary changed, so any previous wrapping is marked stale
        dictionaryWrapped = false;
        // read dictionary values
        ByteArrayInputStream dictionaryDataStream = rowGroupDictionaryDataStreamSource.openStream();
        // row group entries are read starting at index stripeDictionarySize
        readDictionary(dictionaryDataStream, rowGroupDictionarySize, rowGroupDictionaryLength, stripeDictionarySize, dictionaryData, dictionaryOffsetVector, maxCodePointCount, isCharType);
        currentDictionarySize = stripeDictionarySize + rowGroupDictionarySize + 1;
        initiateEvaluationStatus(stripeDictionarySize + rowGroupDictionarySize + 1);
    } else {
        // there is no row group dictionary so use the stripe dictionary
        currentDictionarySize = stripeDictionarySize + 1;
        initiateEvaluationStatus(stripeDictionarySize + 1);
    }
    // copy the previous offset into the trailing slot.
    // NOTE(review): this appears to make the extra "+ 1" entry zero-length (likely the null entry) — confirm
    dictionaryOffsetVector[currentDictionarySize] = dictionaryOffsetVector[currentDictionarySize - 1];
    // mark the stripe dictionary as loaded so later calls skip the stripe dictionary section above
    stripeDictionaryOpen = true;
    presentStream = presentStreamSource.openStream();
    inDictionaryStream = inDictionaryStreamSource.openStream();
    dataStream = dataStreamSource.openStream();
    rowGroupOpen = true;
}
Also used : ByteArrayInputStream(com.facebook.presto.orc.stream.ByteArrayInputStream) RowGroupDictionaryLengthInputStream(com.facebook.presto.orc.stream.RowGroupDictionaryLengthInputStream) OrcCorruptionException(com.facebook.presto.orc.OrcCorruptionException) LongInputStream(com.facebook.presto.orc.stream.LongInputStream)

Example 24 with OrcCorruptionException

use of com.facebook.presto.orc.OrcCorruptionException in project presto by prestodb.

the class ListBatchStreamReader method readBlock.

/**
 * Produces the array block for the pending batch of {@code nextBatchSize} positions.
 *
 * <p>Opens the row group on first use and applies any pending {@code readOffset} skip by
 * summing the skipped lengths and forwarding that element count to the element reader.
 * The lengths of the batch are then read into a buffer, converted in place into offsets,
 * and combined with the element block (and the null flags, when a present stream exists).
 * Both {@code readOffset} and {@code nextBatchSize} are reset to zero before returning.
 *
 * @return the array block for the batch
 * @throws OrcCorruptionException if a non-null value is expected but the length stream is missing
 * @throws IOException if reading from the underlying streams fails
 */
@Override
public Block readBlock() throws IOException {
    if (!rowGroupOpen) {
        openRowGroup();
    }

    // Consume the pending skip, if any.
    if (readOffset > 0) {
        if (presentStream != null) {
            // Advance the present-bit reader; the number of set bits it passed over
            // becomes the number of lengths to skip.
            readOffset = presentStream.countBitsSet(readOffset);
        }
        if (readOffset > 0) {
            if (lengthStream == null) {
                throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Value is not null but data stream is not present");
            }
            // The skipped lengths add up to the number of elements the element reader must skip.
            elementStreamReader.prepareNextRead(toIntExact(lengthStream.sum(readOffset)));
        }
    }

    // This buffer first receives the raw lengths and is then rewritten in place as offsets.
    int[] offsets = new int[nextBatchSize + 1];
    boolean[] nulls = null;
    if (presentStream != null) {
        nulls = new boolean[nextBatchSize];
        int nullCount = presentStream.getUnsetBits(nextBatchSize, nulls);
        int nonNullCount = nextBatchSize - nullCount;
        if (nonNullCount != 0) {
            if (lengthStream == null) {
                throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Value is not null but data stream is not present");
            }
            lengthStream.next(offsets, nonNullCount);
            unpackLengthNulls(offsets, nulls, nonNullCount);
        }
    }
    else {
        // No present stream: every position has a length.
        if (lengthStream == null) {
            throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Value is not null but data stream is not present");
        }
        lengthStream.next(offsets, nextBatchSize);
    }
    convertLengthVectorToOffsetVector(offsets);

    // The last offset is the total number of elements referenced by this batch.
    int totalElements = offsets[offsets.length - 1];
    Block elementBlock;
    if (totalElements <= 0) {
        elementBlock = elementType.createBlockBuilder(null, 0).build();
    }
    else {
        elementStreamReader.prepareNextRead(totalElements);
        elementBlock = elementStreamReader.readBlock();
    }

    Block result = ArrayBlock.fromElementBlock(nextBatchSize, Optional.ofNullable(nulls), offsets, elementBlock);
    readOffset = 0;
    nextBatchSize = 0;
    return result;
}
Also used : ArrayBlock(com.facebook.presto.common.block.ArrayBlock) Block(com.facebook.presto.common.block.Block) OrcCorruptionException(com.facebook.presto.orc.OrcCorruptionException)

Example 25 with OrcCorruptionException

use of com.facebook.presto.orc.OrcCorruptionException in project presto by prestodb.

the class DwrfMetadataReader method decryptAndCombineFileStatistics.

/**
 * Returns the file-level column statistics with the entries of requested encrypted nodes
 * replaced by their decrypted versions.
 *
 * <p>When {@code nodeToIntermediateKeys} or {@code fileStats} is empty, {@code fileStats} is
 * returned as is. Otherwise each encryption group is visited; for every node of the group that
 * has an intermediate key, the group's data key is decrypted (once per group), the node's
 * encrypted statistics are parsed, and the resulting statistics overwrite the entries starting
 * at the node's index.
 *
 * @param hiveWriterVersion passed through to {@code toColumnStatistics}
 * @param dwrfEncryption source of the encryption groups; must not be null
 * @param encryptionLibrary used to decrypt data keys and statistics; must not be null
 * @param fileStats the statistics read from the file, indexed by node
 * @param fileStripes stripes of the file; the key metadata of the first stripe is the fallback key source
 * @param nodeToIntermediateKeys intermediate keys for the nodes whose statistics should be decrypted
 * @param orcDataSource data source whose id is used for the input stream and error reporting
 * @param decompressor optional decompressor handed to the input stream
 * @return an immutable list of the combined statistics, or {@code fileStats} itself on the early exit
 * @throws OrcCorruptionException if the encrypted statistics of a node cannot be read or decrypted
 */
private List<ColumnStatistics> decryptAndCombineFileStatistics(HiveWriterVersion hiveWriterVersion, DwrfEncryption dwrfEncryption, EncryptionLibrary encryptionLibrary, List<ColumnStatistics> fileStats, List<StripeInformation> fileStripes, Map<Integer, Slice> nodeToIntermediateKeys, OrcDataSource orcDataSource, Optional<OrcDecompressor> decompressor) {
    requireNonNull(dwrfEncryption, "dwrfEncryption is null");
    requireNonNull(encryptionLibrary, "encryptionLibrary is null");

    // Nothing was requested for decryption, or there is nothing to merge into.
    if (nodeToIntermediateKeys.isEmpty() || fileStats.isEmpty()) {
        return fileStats;
    }

    ColumnStatistics[] combinedStats = fileStats.toArray(new ColumnStatistics[0]);
    List<EncryptionGroup> groups = dwrfEncryption.getEncryptionGroups();

    // Keys stored in the first stripe act as a fallback when a group carries no key of its own.
    List<byte[]> firstStripeKeys = null;
    if (!fileStripes.isEmpty()) {
        List<byte[]> keyMetadata = fileStripes.get(0).getKeyMetadata();
        if (!keyMetadata.isEmpty()) {
            firstStripeKeys = keyMetadata;
            checkState(firstStripeKeys.size() == groups.size(), "Number of keys in the first stripe must be the same as the number of encryption groups");
        }
    }

    for (int groupIdx = 0; groupIdx < groups.size(); groupIdx++) {
        EncryptionGroup group = groups.get(groupIdx);
        List<Integer> groupNodes = group.getNodes();
        // Created lazily, at most once per group, on the first requested node.
        DwrfDataEncryptor groupDecryptor = null;

        for (int nodeIdx = 0; nodeIdx < groupNodes.size(); nodeIdx++) {
            Integer nodeId = groupNodes.get(nodeIdx);
            // do decryption only for those nodes that are requested (part of the projection)
            if (!nodeToIntermediateKeys.containsKey(nodeId)) {
                continue;
            }

            if (groupDecryptor == null) {
                // DEK for the FileStats can be stored either in the footer or/and in the first stripe.
                // The key in the footer takes priority over the key in the first stripe.
                byte[] encryptedDek;
                if (group.getKeyMetadata().isPresent()) {
                    encryptedDek = group.getKeyMetadata().get().byteArray();
                }
                else if (firstStripeKeys != null) {
                    encryptedDek = firstStripeKeys.get(groupIdx);
                }
                else {
                    encryptedDek = null;
                }
                checkState(encryptedDek != null, "DEK for %s encryption group is null", groupIdx);

                // decrypt the DEK which is encrypted using the IEK passed into a record reader
                byte[] intermediateKey = nodeToIntermediateKeys.get(nodeId).byteArray();
                byte[] dataKey = encryptionLibrary.decryptKey(intermediateKey, encryptedDek, 0, encryptedDek.length);
                groupDecryptor = new DwrfDataEncryptor(dataKey, encryptionLibrary);
            }

            // decrypt the FileStats
            Slice encryptedStats = group.getStatistics().get(nodeIdx);
            try (OrcInputStream statsStream = new OrcInputStream(
                    orcDataSource.getId(),
                    // Memory is not accounted as the buffer is expected to be tiny and will be immediately discarded
                    new SharedBuffer(NOOP_ORC_LOCAL_MEMORY_CONTEXT),
                    new BasicSliceInput(encryptedStats),
                    decompressor,
                    Optional.of(groupDecryptor),
                    NOOP_ORC_AGGREGATED_MEMORY_CONTEXT,
                    encryptedStats.length())) {
                CodedInputStream codedInput = CodedInputStream.newInstance(statsStream);
                DwrfProto.FileStatistics subtreeStats = DwrfProto.FileStatistics.parseFrom(codedInput);
                // FileStatistics contains ColumnStatistics for the node and all its child nodes (subtree)
                int offset = 0;
                while (offset < subtreeStats.getStatisticsCount()) {
                    combinedStats[nodeId + offset] = toColumnStatistics(hiveWriterVersion, subtreeStats.getStatistics(offset), false, null);
                    offset++;
                }
            }
            catch (IOException e) {
                throw new OrcCorruptionException(e, orcDataSource.getId(), "Failed to read or decrypt FileStatistics for node %s", nodeId);
            }
        }
    }
    return ImmutableList.copyOf(combinedStats);
}
Also used : ColumnStatistics.createColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics.createColumnStatistics) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) DwrfDataEncryptor(com.facebook.presto.orc.DwrfDataEncryptor) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) CodedInputStream(com.facebook.presto.orc.protobuf.CodedInputStream) DwrfProto(com.facebook.presto.orc.proto.DwrfProto) IOException(java.io.IOException) BasicSliceInput(io.airlift.slice.BasicSliceInput) SharedBuffer(com.facebook.presto.orc.stream.SharedBuffer) OrcMetadataReader.byteStringToSlice(com.facebook.presto.orc.metadata.OrcMetadataReader.byteStringToSlice) Slice(io.airlift.slice.Slice) OrcCorruptionException(com.facebook.presto.orc.OrcCorruptionException)

Aggregations

OrcCorruptionException (com.facebook.presto.orc.OrcCorruptionException)53 Block (com.facebook.presto.common.block.Block)12 LongStreamCheckpoint (com.facebook.presto.orc.checkpoint.LongStreamCheckpoint)10 BlockBuilderStatus (com.facebook.presto.spi.block.BlockBuilderStatus)10 RunLengthEncodedBlock (com.facebook.presto.common.block.RunLengthEncodedBlock)9 BlockBuilder (com.facebook.presto.spi.block.BlockBuilder)8 LongStreamV2Checkpoint (com.facebook.presto.orc.checkpoint.LongStreamV2Checkpoint)6 InputStreamCheckpoint.createInputStreamCheckpoint (com.facebook.presto.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint)5 Slice (io.airlift.slice.Slice)5 BlockBuilder (com.facebook.presto.common.block.BlockBuilder)4 LongStreamV1Checkpoint (com.facebook.presto.orc.checkpoint.LongStreamV1Checkpoint)4 ByteStreamCheckpoint (com.facebook.presto.orc.checkpoint.ByteStreamCheckpoint)3 LongInputStream (com.facebook.presto.orc.stream.LongInputStream)3 Block (com.facebook.presto.spi.block.Block)3 ByteArrayBlock (com.facebook.presto.common.block.ByteArrayBlock)2 LongArrayBlock (com.facebook.presto.common.block.LongArrayBlock)2 VariableWidthBlock (com.facebook.presto.common.block.VariableWidthBlock)2 DecimalStreamCheckpoint (com.facebook.presto.orc.checkpoint.DecimalStreamCheckpoint)2 DwrfProto (com.facebook.presto.orc.proto.DwrfProto)2 ByteArrayInputStream (com.facebook.presto.orc.stream.ByteArrayInputStream)2