Use of com.facebook.presto.orc.stream.OrcInputStream in project presto by prestodb.
The class DwrfMetadataReader, method decryptAndCombineFileStatistics.
private List<ColumnStatistics> decryptAndCombineFileStatistics(
        HiveWriterVersion hiveWriterVersion,
        DwrfEncryption dwrfEncryption,
        EncryptionLibrary encryptionLibrary,
        List<ColumnStatistics> fileStats,
        List<StripeInformation> fileStripes,
        Map<Integer, Slice> nodeToIntermediateKeys,
        OrcDataSource orcDataSource,
        Optional<OrcDecompressor> decompressor)
{
    requireNonNull(dwrfEncryption, "dwrfEncryption is null");
    requireNonNull(encryptionLibrary, "encryptionLibrary is null");
    if (nodeToIntermediateKeys.isEmpty() || fileStats.isEmpty()) {
        return fileStats;
    }

    ColumnStatistics[] decryptedFileStats = fileStats.toArray(new ColumnStatistics[0]);
    List<EncryptionGroup> encryptionGroups = dwrfEncryption.getEncryptionGroups();
    List<byte[]> stripeKeys = null;
    if (!fileStripes.isEmpty() && !fileStripes.get(0).getKeyMetadata().isEmpty()) {
        stripeKeys = fileStripes.get(0).getKeyMetadata();
        checkState(stripeKeys.size() == encryptionGroups.size(),
                "Number of keys in the first stripe must be the same as the number of encryption groups");
    }

    // walk over every encryption group and the nodes added to it
    for (int groupIdx = 0; groupIdx < encryptionGroups.size(); groupIdx++) {
        EncryptionGroup encryptionGroup = encryptionGroups.get(groupIdx);
        DwrfDataEncryptor decryptor = null;
        List<Integer> nodes = encryptionGroup.getNodes();
        for (int i = 0; i < nodes.size(); i++) {
            Integer nodeId = nodes.get(i);

            // do decryption only for those nodes that are requested (part of the projection)
            if (!nodeToIntermediateKeys.containsKey(nodeId)) {
                continue;
            }

            if (decryptor == null) {
                // DEK for the FileStats can be stored either in the footer and/or in the first stripe.
                // The key in the footer takes priority over the key in the first stripe.
                byte[] encryptedDataKeyWithMeta = null;
                if (encryptionGroup.getKeyMetadata().isPresent()) {
                    encryptedDataKeyWithMeta = encryptionGroup.getKeyMetadata().get().byteArray();
                }
                else if (stripeKeys != null) {
                    encryptedDataKeyWithMeta = stripeKeys.get(groupIdx);
                }
                checkState(encryptedDataKeyWithMeta != null, "DEK for %s encryption group is null", groupIdx);

                // decrypt the DEK, which is encrypted using the IEK passed into a record reader
                byte[] intermediateKey = nodeToIntermediateKeys.get(nodeId).byteArray();
                byte[] dataKey = encryptionLibrary.decryptKey(intermediateKey, encryptedDataKeyWithMeta, 0, encryptedDataKeyWithMeta.length);
                decryptor = new DwrfDataEncryptor(dataKey, encryptionLibrary);
            }

            // decrypt the FileStats
            Slice encryptedFileStats = encryptionGroup.getStatistics().get(i);
            try (OrcInputStream inputStream = new OrcInputStream(
                    orcDataSource.getId(),
                    // memory is not accounted for because the buffer is expected to be tiny and is immediately discarded
                    new SharedBuffer(NOOP_ORC_LOCAL_MEMORY_CONTEXT),
                    new BasicSliceInput(encryptedFileStats),
                    decompressor,
                    Optional.of(decryptor),
                    NOOP_ORC_AGGREGATED_MEMORY_CONTEXT,
                    encryptedFileStats.length())) {
                CodedInputStream input = CodedInputStream.newInstance(inputStream);
                DwrfProto.FileStatistics nodeStats = DwrfProto.FileStatistics.parseFrom(input);

                // FileStatistics contains ColumnStatistics for the node and all its child nodes (subtree)
                for (int statsIdx = 0; statsIdx < nodeStats.getStatisticsCount(); statsIdx++) {
                    decryptedFileStats[nodeId + statsIdx] =
                            toColumnStatistics(hiveWriterVersion, nodeStats.getStatistics(statsIdx), false, null);
                }
            }
            catch (IOException e) {
                throw new OrcCorruptionException(e, orcDataSource.getId(), "Failed to read or decrypt FileStatistics for node %s", nodeId);
            }
        }
    }
    return ImmutableList.copyOf(decryptedFileStats);
}
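EncryptionLibrary is a pluggable interface, so the cipher behind decryptKey is implementation-specific. As a rough illustration of what one such implementation might look like, here is a minimal AES-GCM key-unwrap sketch in plain Java; the 12-byte IV prefix layout, the class name, and the 128-bit tag length are assumptions made for the example, not Presto's actual key format.

import javax.crypto.Cipher;
import javax.crypto.spec.GCMParameterSpec;
import javax.crypto.spec.SecretKeySpec;

// Hypothetical key-unwrap helper; assumes the encrypted DEK is laid out as a
// 12-byte IV followed by the AES-GCM ciphertext and authentication tag.
public final class AesGcmKeyUnwrapSketch
{
    private static final int IV_LENGTH = 12;
    private static final int TAG_LENGTH_BITS = 128;

    public static byte[] decryptKey(byte[] intermediateKey, byte[] input, int offset, int length)
            throws Exception
    {
        SecretKeySpec key = new SecretKeySpec(intermediateKey, "AES");
        // the IV is read from the start of the input region
        GCMParameterSpec spec = new GCMParameterSpec(TAG_LENGTH_BITS, input, offset, IV_LENGTH);
        Cipher cipher = Cipher.getInstance("AES/GCM/NoPadding");
        cipher.init(Cipher.DECRYPT_MODE, key, spec);
        // decrypt and authenticate the remainder, returning the plaintext data key
        return cipher.doFinal(input, offset + IV_LENGTH, length - IV_LENGTH);
    }
}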
Use of com.facebook.presto.orc.stream.OrcInputStream in project presto by prestodb.
The class StripeReader, method readStripeFooter.
public StripeFooter readStripeFooter(StripeInformation stripe, AbstractAggregatedMemoryContext systemMemoryUsage)
        throws IOException
{
    long offset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength();
    int tailLength = toIntExact(stripe.getFooterLength());

    // read the footer
    byte[] tailBuffer = new byte[tailLength];
    orcDataSource.readFully(offset, tailBuffer);
    try (InputStream inputStream = new OrcInputStream(orcDataSource.toString(), Slices.wrappedBuffer(tailBuffer).getInput(), compressionKind, bufferSize, systemMemoryUsage)) {
        return metadataReader.readStripeFooter(hiveWriterVersion, types, inputStream);
    }
}
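The method relies on a positioned read: the stripe footer starts at offset + indexLength + dataLength, and readFully pulls exactly footerLength bytes before the buffer is wrapped in an input stream. A minimal sketch of the same pattern using only standard Java I/O (the method name and file handle are placeholders, not Presto API):

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;

// Read an exact byte range at a known offset and expose it as a stream,
// mirroring orcDataSource.readFully followed by wrapping the tail buffer.
static InputStream readTail(RandomAccessFile file, long offset, int length)
        throws IOException
{
    byte[] buffer = new byte[length];
    file.seek(offset);
    file.readFully(buffer);
    return new ByteArrayInputStream(buffer);
}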
Use of com.facebook.presto.orc.stream.OrcInputStream in project presto by prestodb.
The class StripeReader, method readColumnIndexes.
private Map<Integer, List<RowGroupIndex>> readColumnIndexes(Map<StreamId, Stream> streams, Map<StreamId, OrcInputStream> streamsData, Map<Integer, List<HiveBloomFilter>> bloomFilterIndexes)
        throws IOException
{
    ImmutableMap.Builder<Integer, List<RowGroupIndex>> columnIndexes = ImmutableMap.builder();
    for (Entry<StreamId, Stream> entry : streams.entrySet()) {
        Stream stream = entry.getValue();
        if (stream.getStreamKind() == ROW_INDEX) {
            OrcInputStream inputStream = streamsData.get(entry.getKey());
            List<HiveBloomFilter> bloomFilters = bloomFilterIndexes.get(stream.getColumn());
            List<RowGroupIndex> rowGroupIndexes = metadataReader.readRowIndexes(hiveWriterVersion, inputStream);
            if (bloomFilters != null && !bloomFilters.isEmpty()) {
                // attach bloom filter i to row group i, rebuilding the list immutably
                ImmutableList.Builder<RowGroupIndex> newRowGroupIndexes = ImmutableList.builder();
                for (int i = 0; i < rowGroupIndexes.size(); i++) {
                    RowGroupIndex rowGroupIndex = rowGroupIndexes.get(i);
                    ColumnStatistics columnStatistics = rowGroupIndex.getColumnStatistics().withBloomFilter(bloomFilters.get(i));
                    newRowGroupIndexes.add(new RowGroupIndex(rowGroupIndex.getPositions(), columnStatistics));
                }
                rowGroupIndexes = newRowGroupIndexes.build();
            }
            columnIndexes.put(stream.getColumn(), rowGroupIndexes);
        }
    }
    return columnIndexes.build();
}
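The bloom filter merge is a parallel-list walk: the lists are index-aligned, and the result is rebuilt rather than mutated because RowGroupIndex and ColumnStatistics are immutable. A generic sketch of that rebuild pattern (all names are illustrative, not Presto code):

import java.util.ArrayList;
import java.util.List;
import java.util.function.BiFunction;

// Combine two index-aligned lists element by element into a new list, as the
// loop above does with row group indexes and bloom filters. Assumes the two
// lists have the same length.
static <A, B, C> List<C> zipWith(List<A> left, List<B> right, BiFunction<A, B, C> combiner)
{
    List<C> result = new ArrayList<>(left.size());
    for (int i = 0; i < left.size(); i++) {
        result.add(combiner.apply(left.get(i), right.get(i)));
    }
    return result;
}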
Use of com.facebook.presto.orc.stream.OrcInputStream in project presto by prestodb.
The class StripeReader, method readStripe.
public Stripe readStripe(StripeInformation stripe, AggregatedMemoryContext systemMemoryUsage)
        throws IOException
{
    // read the stripe footer
    StripeFooter stripeFooter = readStripeFooter(stripe, systemMemoryUsage);
    List<ColumnEncoding> columnEncodings = stripeFooter.getColumnEncodings();

    // get streams for selected columns
    Map<StreamId, Stream> streams = new HashMap<>();
    boolean hasRowGroupDictionary = false;
    for (Stream stream : stripeFooter.getStreams()) {
        if (includedOrcColumns.contains(stream.getColumn())) {
            streams.put(new StreamId(stream), stream);
            ColumnEncodingKind columnEncoding = columnEncodings.get(stream.getColumn()).getColumnEncodingKind();
            if (columnEncoding == DICTIONARY && stream.getStreamKind() == StreamKind.IN_DICTIONARY) {
                hasRowGroupDictionary = true;
            }
        }
    }

    // handle stripes with more than one row group or a dictionary
    if ((stripe.getNumberOfRows() > rowsInRowGroup) || hasRowGroupDictionary) {
        // determine ranges of the stripe to read
        Map<StreamId, DiskRange> diskRanges = getDiskRanges(stripeFooter.getStreams());
        diskRanges = Maps.filterKeys(diskRanges, Predicates.in(streams.keySet()));

        // read the file regions
        Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, systemMemoryUsage);

        // read the bloom filter for each column
        Map<Integer, List<HiveBloomFilter>> bloomFilterIndexes = readBloomFilterIndexes(streams, streamsData);

        // read the row index for each column
        Map<Integer, List<RowGroupIndex>> columnIndexes = readColumnIndexes(streams, streamsData, bloomFilterIndexes);

        // select the row groups matching the tuple domain
        Set<Integer> selectedRowGroups = selectRowGroups(stripe, columnIndexes);

        // if all row groups are skipped, return null
        if (selectedRowGroups.isEmpty()) {
            // set accounted memory usage to zero
            systemMemoryUsage.close();
            return null;
        }

        // value streams
        Map<StreamId, ValueStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings);

        // build the dictionary streams
        StreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);

        // build the row groups
        try {
            List<RowGroup> rowGroups = createRowGroups(stripe.getNumberOfRows(), streams, valueStreams, columnIndexes, selectedRowGroups, columnEncodings);
            return new Stripe(stripe.getNumberOfRows(), columnEncodings, rowGroups, dictionaryStreamSources);
        }
        catch (InvalidCheckpointException e) {
            // we must fail here because the length of the row group dictionary is contained in the
            // checkpoint stream; without a dictionary we can fall through and read the stripe as a
            // single row group
            if (hasRowGroupDictionary) {
                throw new OrcCorruptionException(e, "ORC file %s has corrupt checkpoints", orcDataSource);
            }
        }
    }

    // stripe only has one row group and no dictionary
    ImmutableMap.Builder<StreamId, DiskRange> diskRangesBuilder = ImmutableMap.builder();
    for (Entry<StreamId, DiskRange> entry : getDiskRanges(stripeFooter.getStreams()).entrySet()) {
        StreamId streamId = entry.getKey();
        if (streamId.getStreamKind() != ROW_INDEX && streams.containsKey(streamId)) {
            diskRangesBuilder.put(entry);
        }
    }
    ImmutableMap<StreamId, DiskRange> diskRanges = diskRangesBuilder.build();

    // read the file regions
    Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, systemMemoryUsage);

    // value streams
    Map<StreamId, ValueStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings);

    // build the dictionary streams
    StreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);

    // build the single row group covering the whole stripe
    ImmutableMap.Builder<StreamId, StreamSource<?>> builder = ImmutableMap.builder();
    for (Entry<StreamId, ValueStream<?>> entry : valueStreams.entrySet()) {
        builder.put(entry.getKey(), new ValueStreamSource<>(entry.getValue()));
    }
    RowGroup rowGroup = new RowGroup(0, 0, stripe.getNumberOfRows(), new StreamSources(builder.build()));
    return new Stripe(stripe.getNumberOfRows(), columnEncodings, ImmutableList.of(rowGroup), dictionaryStreamSources);
}
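Before reading, readStripe narrows the disk ranges to the selected streams with Guava's Maps.filterKeys. A small standalone sketch of that filtering step (generic types and the method name are illustrative, not Presto code):

import com.google.common.base.Predicates;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import java.util.Map;
import java.util.Set;

// Keep only the map entries whose keys are in the selected set; the lazy view
// returned by filterKeys is copied so later mutations cannot leak through.
static <K, V> Map<K, V> restrictTo(Map<K, V> map, Set<K> selected)
{
    return ImmutableMap.copyOf(Maps.filterKeys(map, Predicates.in(selected)));
}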
Use of com.facebook.presto.orc.stream.OrcInputStream in project presto by prestodb.
The class OrcTester, method getFileMetadata.
public static FileMetadata getFileMetadata(File inputFile, OrcEncoding encoding)
        throws IOException
{
    boolean zstdJniDecompressionEnabled = true;
    DataSize dataSize = new DataSize(1, MEGABYTE);
    OrcDataSource orcDataSource = new FileOrcDataSource(inputFile, dataSize, dataSize, dataSize, true);
    RuntimeStats runtimeStats = new RuntimeStats();
    OrcReader reader = new OrcReader(
            orcDataSource,
            encoding,
            new StorageOrcFileTailSource(),
            new StorageStripeMetadataSource(),
            NOOP_ORC_AGGREGATED_MEMORY_CONTEXT,
            new OrcReaderOptions(dataSize, dataSize, dataSize, zstdJniDecompressionEnabled),
            false,
            NO_ENCRYPTION,
            DwrfKeyProvider.EMPTY,
            runtimeStats);
    Footer footer = reader.getFooter();
    Optional<OrcDecompressor> decompressor = createOrcDecompressor(orcDataSource.getId(), reader.getCompressionKind(), reader.getBufferSize(), zstdJniDecompressionEnabled);
    ImmutableList.Builder<StripeFooter> stripes = new ImmutableList.Builder<>();
    for (StripeInformation stripe : footer.getStripes()) {
        // read the stripe footer from the tail of the stripe
        byte[] tailBuffer = new byte[toIntExact(stripe.getFooterLength())];
        orcDataSource.readFully(stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength(), tailBuffer);
        try (InputStream inputStream = new OrcInputStream(
                orcDataSource.getId(),
                new SharedBuffer(NOOP_ORC_LOCAL_MEMORY_CONTEXT),
                Slices.wrappedBuffer(tailBuffer).getInput(),
                decompressor,
                Optional.empty(),
                new TestingHiveOrcAggregatedMemoryContext(),
                tailBuffer.length)) {
            StripeFooter stripeFooter = encoding.createMetadataReader(runtimeStats).readStripeFooter(orcDataSource.getId(), footer.getTypes(), inputStream);
            stripes.add(stripeFooter);
        }
    }
    return new FileMetadata(footer, stripes.build());
}
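A possible invocation of the helper above. The file path is a placeholder, DWRF is chosen arbitrarily as the encoding, and the getStripeFooters() accessor on FileMetadata is an assumption made for the example:

import java.io.File;
import java.io.IOException;

public class GetFileMetadataExample
{
    public static void main(String[] args) throws IOException
    {
        // hypothetical file path and encoding
        FileMetadata metadata = OrcTester.getFileMetadata(new File("example.orc"), OrcEncoding.DWRF);
        // getStripeFooters() is assumed to expose the per-stripe footers collected above
        System.out.println("stripe footers read: " + metadata.getStripeFooters().size());
    }
}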