Use of com.facebook.presto.orc.OrcCorruptionException in project presto by prestodb.
The class TestDwrfMetadataReader, method testReadStripeFooterThrowsForLargeStreams:

@Test
public void testReadStripeFooterThrowsForLargeStreams()
{
    // a single stream whose declared length cannot fit in a signed 32-bit int
    DwrfProto.Stream stream = DwrfProto.Stream.newBuilder().setKind(DATA).setLength(Long.MAX_VALUE).build();
    DwrfProto.StripeFooter protoStripeFooter = DwrfProto.StripeFooter.newBuilder().addStreams(stream).build();
    byte[] data = protoStripeFooter.toByteArray();
    InputStream inputStream = new ByteArrayInputStream(data);

    OrcDataSourceId orcDataSourceId = new OrcDataSourceId("test");
    OrcCorruptionException ex = expectThrows(OrcCorruptionException.class, () -> dwrfMetadataReader.readStripeFooter(orcDataSourceId, ImmutableList.of(), inputStream));
    assertEquals(ex.getMessage(), "java.io.IOException: Malformed ORC file. Stream size 9223372036854775807 of one of the streams for column 0 is larger than supported size 2147483647 [test]");
}
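For context, the check this test exercises rejects any stream whose declared size exceeds Integer.MAX_VALUE. A minimal sketch of such a guard, assuming a hypothetical helper (verifyStreamLength is illustrative, not the actual readStripeFooter code; the printf-style OrcCorruptionException constructor is the one used elsewhere on this page):

// Hypothetical guard matching the behavior asserted above; the exception class itself
// adds the "Malformed ORC file." prefix and the "[test]" data source suffix to the message.
private static void verifyStreamLength(OrcDataSourceId dataSourceId, long streamLength, int column)
        throws OrcCorruptionException
{
    if (streamLength > Integer.MAX_VALUE) {
        throw new OrcCorruptionException(dataSourceId, "Stream size %s of one of the streams for column %s is larger than supported size %s", streamLength, column, Integer.MAX_VALUE);
    }
}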
Use of com.facebook.presto.orc.OrcCorruptionException in project presto by prestodb.
The class FloatBatchStreamReader, method readBlock:

@Override
public Block readBlock()
        throws IOException
{
    if (!rowGroupOpen) {
        openRowGroup();
    }

    if (readOffset > 0) {
        if (presentStream != null) {
            // skip ahead the present bit reader, but count the set bits
            // and use this as the skip size for the data reader
            readOffset = presentStream.countBitsSet(readOffset);
        }
        if (readOffset > 0) {
            if (dataStream == null) {
                throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Value is not null but data stream is not present");
            }
            dataStream.skip(readOffset);
        }
    }

    if (dataStream == null && presentStream != null) {
        // the data stream is missing entirely, so every value in this batch is null
        presentStream.skip(nextBatchSize);
        Block nullValueBlock = RunLengthEncodedBlock.create(REAL, null, nextBatchSize);
        readOffset = 0;
        nextBatchSize = 0;
        return nullValueBlock;
    }

    BlockBuilder builder = REAL.createBlockBuilder(null, nextBatchSize);
    if (presentStream == null) {
        if (dataStream == null) {
            throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Value is not null but data stream is not present");
        }
        dataStream.nextVector(REAL, nextBatchSize, builder);
    }
    else {
        for (int i = 0; i < nextBatchSize; i++) {
            if (presentStream.nextBit()) {
                REAL.writeLong(builder, floatToRawIntBits(dataStream.next()));
            }
            else {
                builder.appendNull();
            }
        }
    }

    readOffset = 0;
    nextBatchSize = 0;
    return builder.build();
}
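As background, callers drive these batch readers with a two-step protocol: prepareNextRead(batchSize) sets the batch size, then readBlock() consumes it. A minimal sketch of the calling pattern, assuming a hypothetical floatReader and process() consumer (reader wiring and row counts are illustrative):

// Hypothetical driver loop showing only the calling pattern, not reader construction.
int remainingRows = 10_000;                  // rows left in the row group (assumed)
while (remainingRows > 0) {
    int batchSize = Math.min(remainingRows, 1024);
    floatReader.prepareNextRead(batchSize);  // records how many values the next readBlock() returns
    Block block = floatReader.readBlock();   // REAL-typed block; nulls come from the present stream
    process(block);                          // hypothetical consumer
    remainingRows -= batchSize;
}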
Use of com.facebook.presto.orc.OrcCorruptionException in project presto by prestodb.
The class SliceDictionarySelectiveReader, method openRowGroup:

private void openRowGroup()
        throws IOException
{
    // read the dictionary
    if (!stripeDictionaryOpen) {
        if (stripeDictionarySize > 0) {
            // resize the dictionary lengths array if necessary
            if (stripeDictionaryLength.length < stripeDictionarySize) {
                stripeDictionaryLength = new int[stripeDictionarySize];
            }

            // read the lengths
            LongInputStream lengthStream = stripeDictionaryLengthStreamSource.openStream();
            if (lengthStream == null) {
                throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Dictionary is not empty but dictionary length stream is not present");
            }
            lengthStream.nextIntVector(stripeDictionarySize, stripeDictionaryLength, 0);

            long dataLength = 0;
            for (int i = 0; i < stripeDictionarySize; i++) {
                dataLength += stripeDictionaryLength[i];
            }

            dictionaryData = ensureCapacity(dictionaryData, toIntExact(dataLength));
            dictionaryOffsetVector = ensureCapacity(dictionaryOffsetVector, stripeDictionarySize + 2);

            // read dictionary values
            ByteArrayInputStream dictionaryDataStream = stripeDictionaryDataStreamSource.openStream();
            readDictionary(dictionaryDataStream, stripeDictionarySize, stripeDictionaryLength, 0, dictionaryData, dictionaryOffsetVector, maxCodePointCount, isCharType);
        }
        else {
            dictionaryData = EMPTY_DICTIONARY_DATA;
            dictionaryOffsetVector = EMPTY_DICTIONARY_OFFSETS;
        }

        // If there is no rowgroup dictionary, we only need to wrap the stripe dictionary once per stripe because wrapping dictionary is very expensive.
        dictionaryWrapped = false;
    }

    // read row group dictionary
    RowGroupDictionaryLengthInputStream dictionaryLengthStream = rowGroupDictionaryLengthStreamSource.openStream();
    if (dictionaryLengthStream != null) {
        int rowGroupDictionarySize = dictionaryLengthStream.getEntryCount();
        rowGroupDictionaryLength = ensureCapacity(rowGroupDictionaryLength, rowGroupDictionarySize);

        // read the lengths
        dictionaryLengthStream.nextIntVector(rowGroupDictionarySize, rowGroupDictionaryLength, 0);

        long dataLength = 0;
        for (int i = 0; i < rowGroupDictionarySize; i++) {
            dataLength += rowGroupDictionaryLength[i];
        }

        dictionaryData = ensureCapacity(dictionaryData, dictionaryOffsetVector[stripeDictionarySize] + toIntExact(dataLength), MEDIUM, PRESERVE);
        dictionaryOffsetVector = ensureCapacity(dictionaryOffsetVector, stripeDictionarySize + rowGroupDictionarySize + 2, MEDIUM, PRESERVE);
        dictionaryWrapped = false;

        // read dictionary values
        ByteArrayInputStream dictionaryDataStream = rowGroupDictionaryDataStreamSource.openStream();
        readDictionary(dictionaryDataStream, rowGroupDictionarySize, rowGroupDictionaryLength, stripeDictionarySize, dictionaryData, dictionaryOffsetVector, maxCodePointCount, isCharType);

        currentDictionarySize = stripeDictionarySize + rowGroupDictionarySize + 1;
        initiateEvaluationStatus(stripeDictionarySize + rowGroupDictionarySize + 1);
    }
    else {
        // there is no row group dictionary so use the stripe dictionary
        currentDictionarySize = stripeDictionarySize + 1;
        initiateEvaluationStatus(stripeDictionarySize + 1);
    }

    dictionaryOffsetVector[currentDictionarySize] = dictionaryOffsetVector[currentDictionarySize - 1];
    stripeDictionaryOpen = true;

    presentStream = presentStreamSource.openStream();
    inDictionaryStream = inDictionaryStreamSource.openStream();
    dataStream = dataStreamSource.openStream();
    rowGroupOpen = true;
}
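The bookkeeping here hinges on dictionaryOffsetVector: slot i holds the byte offset where entry i starts in dictionaryData, the row-group entries append after the stripe entries, and the final assignment above gives the trailing null entry a length of zero. A minimal sketch of the length-to-offset conversion that readDictionary performs, under the assumption that offsets are plain prefix sums (hypothetical helper, not Presto's implementation):

// Hypothetical sketch: turn per-entry lengths into a prefix-sum offset vector,
// starting at an existing base so row-group entries land after stripe entries.
static void lengthsToOffsets(int[] lengths, int entryCount, int startIndex, int[] offsets)
{
    // offsets[startIndex] must already hold the byte offset where these entries begin
    for (int i = 0; i < entryCount; i++) {
        offsets[startIndex + i + 1] = offsets[startIndex + i] + lengths[i];
    }
}

This layout is why the vector is sized stripeDictionarySize + rowGroupDictionarySize + 2: one slot per entry, one for the null entry, and one terminating offset.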
Use of com.facebook.presto.orc.OrcCorruptionException in project presto by prestodb.
The class ListBatchStreamReader, method readBlock:

@Override
public Block readBlock()
        throws IOException
{
    if (!rowGroupOpen) {
        openRowGroup();
    }

    if (readOffset > 0) {
        if (presentStream != null) {
            // skip ahead the present bit reader, but count the set bits
            // and use this as the skip size for the data reader
            readOffset = presentStream.countBitsSet(readOffset);
        }
        if (readOffset > 0) {
            if (lengthStream == null) {
                throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Value is not null but data stream is not present");
            }
            long elementSkipSize = lengthStream.sum(readOffset);
            elementStreamReader.prepareNextRead(toIntExact(elementSkipSize));
        }
    }

    // We will use the offsetVector as the buffer to read the length values from lengthStream,
    // and the length values will be converted in-place to an offset vector.
    int[] offsetVector = new int[nextBatchSize + 1];
    boolean[] nullVector = null;
    if (presentStream == null) {
        if (lengthStream == null) {
            throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Value is not null but data stream is not present");
        }
        lengthStream.next(offsetVector, nextBatchSize);
    }
    else {
        nullVector = new boolean[nextBatchSize];
        int nullValues = presentStream.getUnsetBits(nextBatchSize, nullVector);
        if (nullValues != nextBatchSize) {
            if (lengthStream == null) {
                throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Value is not null but data stream is not present");
            }
            lengthStream.next(offsetVector, nextBatchSize - nullValues);
            unpackLengthNulls(offsetVector, nullVector, nextBatchSize - nullValues);
        }
    }

    convertLengthVectorToOffsetVector(offsetVector);
    int elementCount = offsetVector[offsetVector.length - 1];

    Block elements;
    if (elementCount > 0) {
        elementStreamReader.prepareNextRead(elementCount);
        elements = elementStreamReader.readBlock();
    }
    else {
        elements = elementType.createBlockBuilder(null, 0).build();
    }
    Block arrayBlock = ArrayBlock.fromElementBlock(nextBatchSize, Optional.ofNullable(nullVector), offsetVector, elements);

    readOffset = 0;
    nextBatchSize = 0;
    return arrayBlock;
}
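The two helpers above, unpackLengthNulls and convertLengthVectorToOffsetVector, do the vector fix-ups in place. Minimal sketches of what they plausibly do, assuming null rows carry zero length (hypothetical implementations, not the actual Presto helpers):

// Hypothetical sketch of unpackLengthNulls: the length stream stores lengths only for
// non-null rows, so spread the packed values back to their row positions (nulls get 0).
// Walking backwards keeps each packed value readable before its slot is overwritten.
static void spreadLengthsOverNulls(int[] vector, boolean[] isNull, int nonNullCount)
{
    int source = nonNullCount - 1;              // last packed length
    for (int target = isNull.length - 1; target >= 0; target--) {
        vector[target] = isNull[target] ? 0 : vector[source--];
    }
}

// Hypothetical sketch of convertLengthVectorToOffsetVector: in-place prefix sum, so
// vector[i] becomes the start offset of row i and the last slot the total element count.
static void lengthsToOffsets(int[] vector)
{
    int running = 0;
    for (int i = 0; i < vector.length; i++) {
        int length = vector[i];                 // the extra last slot is 0 in a fresh array
        vector[i] = running;
        running += length;
    }
}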
Use of com.facebook.presto.orc.OrcCorruptionException in project presto by prestodb.
The class DwrfMetadataReader, method decryptAndCombineFileStatistics:

private List<ColumnStatistics> decryptAndCombineFileStatistics(
        HiveWriterVersion hiveWriterVersion,
        DwrfEncryption dwrfEncryption,
        EncryptionLibrary encryptionLibrary,
        List<ColumnStatistics> fileStats,
        List<StripeInformation> fileStripes,
        Map<Integer, Slice> nodeToIntermediateKeys,
        OrcDataSource orcDataSource,
        Optional<OrcDecompressor> decompressor)
{
    requireNonNull(dwrfEncryption, "dwrfEncryption is null");
    requireNonNull(encryptionLibrary, "encryptionLibrary is null");

    if (nodeToIntermediateKeys.isEmpty() || fileStats.isEmpty()) {
        return fileStats;
    }

    ColumnStatistics[] decryptedFileStats = fileStats.toArray(new ColumnStatistics[0]);
    List<EncryptionGroup> encryptionGroups = dwrfEncryption.getEncryptionGroups();

    List<byte[]> stripeKeys = null;
    if (!fileStripes.isEmpty() && !fileStripes.get(0).getKeyMetadata().isEmpty()) {
        stripeKeys = fileStripes.get(0).getKeyMetadata();
        checkState(stripeKeys.size() == encryptionGroups.size(), "Number of keys in the first stripe must be the same as the number of encryption groups");
    }

    // node is added to the encryption group
    for (int groupIdx = 0; groupIdx < encryptionGroups.size(); groupIdx++) {
        EncryptionGroup encryptionGroup = encryptionGroups.get(groupIdx);
        DwrfDataEncryptor decryptor = null;
        List<Integer> nodes = encryptionGroup.getNodes();
        for (int i = 0; i < nodes.size(); i++) {
            Integer nodeId = nodes.get(i);

            // do decryption only for those nodes that are requested (part of the projection)
            if (!nodeToIntermediateKeys.containsKey(nodeId)) {
                continue;
            }

            if (decryptor == null) {
                // DEK for the FileStats can be stored either in the footer or/and in the first stripe.
                // The key in the footer takes priority over the key in the first stripe.
                byte[] encryptedDataKeyWithMeta = null;
                if (encryptionGroup.getKeyMetadata().isPresent()) {
                    encryptedDataKeyWithMeta = encryptionGroup.getKeyMetadata().get().byteArray();
                }
                else if (stripeKeys != null) {
                    encryptedDataKeyWithMeta = stripeKeys.get(groupIdx);
                }
                checkState(encryptedDataKeyWithMeta != null, "DEK for %s encryption group is null", groupIdx);

                // decrypt the DEK which is encrypted using the IEK passed into a record reader
                byte[] intermediateKey = nodeToIntermediateKeys.get(nodeId).byteArray();
                byte[] dataKey = encryptionLibrary.decryptKey(intermediateKey, encryptedDataKeyWithMeta, 0, encryptedDataKeyWithMeta.length);
                decryptor = new DwrfDataEncryptor(dataKey, encryptionLibrary);
            }

            // decrypt the FileStats
            Slice encryptedFileStats = encryptionGroup.getStatistics().get(i);
            try (OrcInputStream inputStream = new OrcInputStream(
                    orcDataSource.getId(),
                    // Memory is not accounted as the buffer is expected to be tiny and will be immediately discarded
                    new SharedBuffer(NOOP_ORC_LOCAL_MEMORY_CONTEXT),
                    new BasicSliceInput(encryptedFileStats),
                    decompressor,
                    Optional.of(decryptor),
                    NOOP_ORC_AGGREGATED_MEMORY_CONTEXT,
                    encryptedFileStats.length())) {
                CodedInputStream input = CodedInputStream.newInstance(inputStream);
                DwrfProto.FileStatistics nodeStats = DwrfProto.FileStatistics.parseFrom(input);

                // FileStatistics contains ColumnStatistics for the node and all its child nodes (subtree)
                for (int statsIdx = 0; statsIdx < nodeStats.getStatisticsCount(); statsIdx++) {
                    decryptedFileStats[nodeId + statsIdx] = toColumnStatistics(hiveWriterVersion, nodeStats.getStatistics(statsIdx), false, null);
                }
            }
            catch (IOException e) {
                throw new OrcCorruptionException(e, orcDataSource.getId(), "Failed to read or decrypt FileStatistics for node %s", nodeId);
            }
        }
    }
    return ImmutableList.copyOf(decryptedFileStats);
}
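The key handling is a two-level envelope scheme: the intermediate encryption key (IEK) passed to the record reader unwraps the data encryption key (DEK), and the DEK decrypts the serialized statistics. A condensed sketch of just that unwrapping, reusing the decryptKey and DwrfDataEncryptor calls shown above (the helper itself is hypothetical):

// Hypothetical helper condensing the envelope decryption above: unwrap the DEK with the
// caller-provided IEK, preferring the footer copy of the wrapped key over the stripe copy.
private static DwrfDataEncryptor createDecryptor(
        EncryptionLibrary encryptionLibrary,
        byte[] intermediateKey,           // IEK passed into the record reader
        Optional<byte[]> footerKey,       // wrapped DEK from the encryption group, if present
        byte[] stripeKey)                 // wrapped DEK from the first stripe, the fallback
{
    byte[] wrappedDek = footerKey.orElse(stripeKey);
    byte[] dataKey = encryptionLibrary.decryptKey(intermediateKey, wrappedDek, 0, wrappedDek.length);
    return new DwrfDataEncryptor(dataKey, encryptionLibrary);
}

The footer-over-stripe preference mirrors the priority rule stated in the comment inside the method.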