Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
From the class TestDecryption, method validateFileStatistics.
/**
 * Checks the file-level column statistics exposed by the footer of a DWRF file, both when the
 * file is opened without any keys and when it is opened with every subset of the supplied
 * intermediate keys.
 *
 * <p>For a file without stripes the only expectation is that the footer carries no file stats.
 * When {@code dwrfWriterEncryption} is absent no further validation is performed.
 *
 * @param tempFile the written file to re-open
 * @param dwrfWriterEncryption the encryption settings the writer used, if any
 * @param readerIntermediateKeys intermediate keys by node id; every subset of this key set is tried
 * @throws IOException if a reader cannot be created
 */
private static void validateFileStatistics(TempFile tempFile, Optional<DwrfWriterEncryption> dwrfWriterEncryption, Map<Integer, Slice> readerIntermediateKeys) throws IOException {
    OrcReader readerNoKeys = OrcTester.createCustomOrcReader(tempFile, DWRF, false, ImmutableMap.of());
    if (readerNoKeys.getFooter().getStripes().isEmpty()) {
        // files w/o stripes don't have stats
        assertEquals(readerNoKeys.getFooter().getFileStats().size(), 0);
        return;
    }
    if (dwrfWriterEncryption.isPresent()) {
        List<OrcType> types = readerNoKeys.getTypes();
        List<ColumnStatistics> fileStatsNoKey = readerNoKeys.getFooter().getFileStats();
        assertEquals(fileStatsNoKey.size(), types.size());

        // every node covered by any writer encryption group, including the nodes of its subtree
        Set<Integer> allEncryptedNodes = dwrfWriterEncryption.get().getWriterEncryptionGroups().stream()
                .flatMap(group -> group.getNodes().stream())
                .flatMap(node -> collectNodeTree(types, node).stream())
                .collect(Collectors.toSet());

        for (Set<Integer> readerKeyNodes : Sets.powerSet(readerIntermediateKeys.keySet())) {
            Map<Integer, Slice> readerKeys = new HashMap<>();
            readerKeyNodes.forEach(node -> readerKeys.put(node, readerIntermediateKeys.get(node)));

            // nodes that are supposed to be decrypted by the reader
            Set<Integer> decryptedNodes = readerKeys.keySet().stream()
                    .flatMap(node -> collectNodeTree(types, node).stream())
                    .collect(Collectors.toSet());

            // decryptedNodes should be a subset of encrypted nodes
            assertTrue(allEncryptedNodes.containsAll(decryptedNodes));

            // Open the file with only this subset of keys. Passing the full key map here would make
            // every iteration identical and leave partial-key combinations untested.
            OrcReader readerWithKeys = OrcTester.createCustomOrcReader(tempFile, DWRF, false, readerKeys);
            List<ColumnStatistics> fileStatsWithKey = readerWithKeys.getFooter().getFileStats();
            assertEquals(fileStatsWithKey.size(), types.size());

            for (int node = 0; node < types.size(); node++) {
                ColumnStatistics statsWithKey = fileStatsWithKey.get(node);
                ColumnStatistics statsNoKey = fileStatsNoKey.get(node);
                OrcType type = types.get(node);

                // encrypted nodes should have no type info
                if (allEncryptedNodes.contains(node)) {
                    assertTrue(hasNoTypeStats(statsNoKey));
                } else {
                    // unencrypted nodes must look the same with and without keys
                    assertStatsTypeMatch(statsNoKey, type);
                    assertStatsTypeMatch(statsWithKey, type);
                    assertEquals(statsNoKey, statsWithKey);
                }

                // nodes the reader holds a key for must expose typed stats
                if (decryptedNodes.contains(node)) {
                    assertStatsTypeMatch(statsWithKey, type);
                }
            }
        }
    }
}
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
From the class DwrfMetadataReader, method decryptAndCombineFileStatistics.
/**
 * Returns the file statistics with the entries of decryptable nodes replaced by their decrypted
 * counterparts.
 *
 * <p>Only nodes that have an entry in {@code nodeToIntermediateKeys} are decrypted. If that map
 * or {@code fileStats} is empty, {@code fileStats} is returned as is.
 *
 * @param hiveWriterVersion passed through to the statistics conversion
 * @param dwrfEncryption encryption metadata providing the encryption groups; must not be null
 * @param encryptionLibrary used to decrypt the data key and the statistics; must not be null
 * @param fileStats the file statistics read from the footer
 * @param fileStripes the stripes of the file; the first stripe may carry the data keys
 * @param nodeToIntermediateKeys intermediate keys by node id
 * @param orcDataSource the data source, used for its id
 * @param decompressor optional decompressor for the encrypted statistics stream
 * @return the combined statistics as an immutable list, or {@code fileStats} itself when there is nothing to decrypt
 * @throws IllegalStateException if no data key is found for a group that needs decryption, or the
 *         first stripe's key count differs from the number of encryption groups
 * @throws OrcCorruptionException if reading or decrypting the statistics of a node fails with an IOException
 */
private List<ColumnStatistics> decryptAndCombineFileStatistics(HiveWriterVersion hiveWriterVersion, DwrfEncryption dwrfEncryption, EncryptionLibrary encryptionLibrary, List<ColumnStatistics> fileStats, List<StripeInformation> fileStripes, Map<Integer, Slice> nodeToIntermediateKeys, OrcDataSource orcDataSource, Optional<OrcDecompressor> decompressor) {
    requireNonNull(dwrfEncryption, "dwrfEncryption is null");
    requireNonNull(encryptionLibrary, "encryptionLibrary is null");

    // nothing to decrypt, or nothing to decrypt into
    if (nodeToIntermediateKeys.isEmpty() || fileStats.isEmpty()) {
        return fileStats;
    }

    ColumnStatistics[] combinedStats = fileStats.toArray(new ColumnStatistics[0]);
    List<EncryptionGroup> groups = dwrfEncryption.getEncryptionGroups();

    // keys stored in the first stripe, used as a fallback when a group has no key in the footer
    List<byte[]> firstStripeKeys = null;
    if (!fileStripes.isEmpty()) {
        List<byte[]> candidateKeys = fileStripes.get(0).getKeyMetadata();
        if (!candidateKeys.isEmpty()) {
            firstStripeKeys = candidateKeys;
            checkState(firstStripeKeys.size() == groups.size(), "Number of keys in the first stripe must be the same as the number of encryption groups");
        }
    }

    for (int groupIdx = 0; groupIdx < groups.size(); groupIdx++) {
        EncryptionGroup group = groups.get(groupIdx);
        List<Integer> groupNodes = group.getNodes();

        // created lazily, once per group, on the first node that has an intermediate key
        DwrfDataEncryptor groupDecryptor = null;

        for (int position = 0; position < groupNodes.size(); position++) {
            Integer nodeId = groupNodes.get(position);

            // do decryption only for those nodes that are requested (part of the projection)
            if (!nodeToIntermediateKeys.containsKey(nodeId)) {
                continue;
            }

            if (groupDecryptor == null) {
                // DEK for the FileStats can be stored either in the footer or/and in the first stripe.
                // The key in the footer takes priority over the key in the first stripe.
                byte[] wrappedDataKey;
                if (group.getKeyMetadata().isPresent()) {
                    wrappedDataKey = group.getKeyMetadata().get().byteArray();
                } else {
                    wrappedDataKey = (firstStripeKeys == null) ? null : firstStripeKeys.get(groupIdx);
                }
                checkState(wrappedDataKey != null, "DEK for %s encryption group is null", groupIdx);

                // decrypt the DEK which is encrypted using the IEK passed into a record reader
                byte[] intermediateKey = nodeToIntermediateKeys.get(nodeId).byteArray();
                byte[] dataKey = encryptionLibrary.decryptKey(intermediateKey, wrappedDataKey, 0, wrappedDataKey.length);
                groupDecryptor = new DwrfDataEncryptor(dataKey, encryptionLibrary);
            }

            // the group's statistics are looked up by the node's position within the group
            Slice encryptedStats = group.getStatistics().get(position);

            // Memory is not accounted as the buffer is expected to be tiny and will be immediately discarded
            try (OrcInputStream statsStream = new OrcInputStream(
                    orcDataSource.getId(),
                    new SharedBuffer(NOOP_ORC_LOCAL_MEMORY_CONTEXT),
                    new BasicSliceInput(encryptedStats),
                    decompressor,
                    Optional.of(groupDecryptor),
                    NOOP_ORC_AGGREGATED_MEMORY_CONTEXT,
                    encryptedStats.length())) {
                DwrfProto.FileStatistics subtreeStats = DwrfProto.FileStatistics.parseFrom(CodedInputStream.newInstance(statsStream));

                // FileStatistics contains ColumnStatistics for the node and all its child nodes (subtree)
                for (int offset = 0; offset < subtreeStats.getStatisticsCount(); offset++) {
                    combinedStats[nodeId + offset] = toColumnStatistics(hiveWriterVersion, subtreeStats.getStatistics(offset), false, null);
                }
            } catch (IOException e) {
                throw new OrcCorruptionException(e, orcDataSource.getId(), "Failed to read or decrypt FileStatistics for node %s", nodeId);
            }
        }
    }

    return ImmutableList.copyOf(combinedStats);
}
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
From the class DwrfMetadataReader, method readFooter.
/**
 * Parses a DWRF footer from {@code inputStream} and converts it into a {@link Footer}.
 *
 * <p>When the footer carries encryption metadata, the file statistics are passed through
 * {@code decryptAndCombineFileStatistics} using the intermediate keys obtained from
 * {@code dwrfKeyProvider}. The CPU time spent up to that point is recorded under the
 * {@code DwrfReadFooterTimeNanos} metric.
 *
 * @param hiveWriterVersion passed through to the statistics conversion
 * @param inputStream stream positioned at the serialized footer
 * @param dwrfEncryptionProvider supplies the encryption library for the footer's key provider
 * @param dwrfKeyProvider supplies the intermediate keys for the footer's types
 * @param orcDataSource the data source being read
 * @param decompressor optional decompressor used when decrypting file statistics
 * @return the converted footer
 * @throws IOException if the footer cannot be parsed
 */
@Override
public Footer readFooter(HiveWriterVersion hiveWriterVersion, InputStream inputStream, DwrfEncryptionProvider dwrfEncryptionProvider, DwrfKeyProvider dwrfKeyProvider, OrcDataSource orcDataSource, Optional<OrcDecompressor> decompressor) throws IOException {
    long startCpuNanos = THREAD_MX_BEAN.getCurrentThreadCpuTime();

    DwrfProto.Footer protoFooter = DwrfProto.Footer.parseFrom(CodedInputStream.newInstance(inputStream));

    List<ColumnStatistics> fileStats = toColumnStatistics(hiveWriterVersion, protoFooter.getStatisticsList(), false);
    List<StripeInformation> stripes = toStripeInformation(protoFooter.getStripesList());
    List<OrcType> types = toType(protoFooter.getTypesList());

    Optional<DwrfEncryption> encryption;
    if (protoFooter.hasEncryption()) {
        encryption = Optional.of(toEncryption(protoFooter.getEncryption()));
    } else {
        encryption = Optional.empty();
    }

    Optional<List<Integer>> stripeCacheOffsets = Optional.of(protoFooter.getStripeCacheOffsetsList());

    if (encryption.isPresent()) {
        DwrfEncryption dwrfEncryption = encryption.get();
        Map<Integer, Slice> intermediateKeys = dwrfKeyProvider.getIntermediateKeys(types);
        EncryptionLibrary encryptionLibrary = dwrfEncryptionProvider.getEncryptionLibrary(dwrfEncryption.getKeyProvider());
        fileStats = decryptAndCombineFileStatistics(hiveWriterVersion, dwrfEncryption, encryptionLibrary, fileStats, stripes, intermediateKeys, orcDataSource, decompressor);
    }

    long elapsedCpuNanos = THREAD_MX_BEAN.getCurrentThreadCpuTime() - startCpuNanos;
    runtimeStats.addMetricValue("DwrfReadFooterTimeNanos", elapsedCpuNanos);

    OptionalLong rawSize;
    if (protoFooter.hasRawDataSize()) {
        rawSize = OptionalLong.of(protoFooter.getRawDataSize());
    } else {
        rawSize = OptionalLong.empty();
    }

    return new Footer(
            protoFooter.getNumberOfRows(),
            protoFooter.getRowIndexStride(),
            rawSize,
            stripes,
            types,
            fileStats,
            toUserMetadata(protoFooter.getMetadataList()),
            encryption,
            stripeCacheOffsets);
}
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
From the class StructColumnWriter, method getIndexStreams.
/**
 * Produces the index streams of this struct column followed by those of its fields.
 *
 * <p>One row group index is built per entry of {@code rowGroupColumnStatistics}, pairing the
 * statistics with a position list derived from the present-stream checkpoint of the same
 * row group (when checkpoints are available). The indexes are serialized into a single
 * {@code ROW_INDEX} stream for this column.
 *
 * @return this column's row index stream, then the index streams of every struct field in order
 * @throws IllegalStateException if the writer has not been closed
 * @throws IOException declared by this method and by the field writers' {@code getIndexStreams()}
 */
@Override
public List<StreamDataOutput> getIndexStreams() throws IOException {
    checkState(closed);

    Optional<List<BooleanStreamCheckpoint>> presentCheckpoints = presentStream.getCheckpoints();

    ImmutableList.Builder<RowGroupIndex> indexBuilder = ImmutableList.builder();
    for (int rowGroup = 0; rowGroup < rowGroupColumnStatistics.size(); rowGroup++) {
        ColumnStatistics rowGroupStatistics = rowGroupColumnStatistics.get(rowGroup);

        // pick the checkpoint of this row group, if the present stream produced checkpoints at all
        Optional<BooleanStreamCheckpoint> presentCheckpoint;
        if (presentCheckpoints.isPresent()) {
            presentCheckpoint = Optional.ofNullable(presentCheckpoints.get().get(rowGroup));
        } else {
            presentCheckpoint = Optional.empty();
        }

        List<Integer> positions = createStructColumnPositionList(compressed, presentCheckpoint);
        indexBuilder.add(new RowGroupIndex(positions, rowGroupStatistics));
    }

    Slice serializedIndexes = metadataWriter.writeRowIndexes(indexBuilder.build());
    Stream rowIndexStream = new Stream(column, StreamKind.ROW_INDEX, serializedIndexes.length(), false);

    ImmutableList.Builder<StreamDataOutput> result = ImmutableList.builder();
    result.add(new StreamDataOutput(serializedIndexes, rowIndexStream));
    for (ColumnWriter fieldWriter : structFields) {
        result.addAll(fieldWriter.getIndexStreams());
    }
    return result.build();
}
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
From the class LongColumnWriter, method finishRowGroup.
/**
 * Closes out the current row group.
 *
 * <p>Builds the statistics from the current builder, appends them to
 * {@code rowGroupColumnStatistics}, adds their retained size to the running total, and
 * replaces the builder with a new one from {@code statisticsBuilderSupplier}.
 *
 * @return a single-entry map from this writer's {@code column} to the row group statistics
 * @throws IllegalStateException if the writer is already closed
 */
@Override
public Map<Integer, ColumnStatistics> finishRowGroup() {
    checkState(!closed);

    ColumnStatistics rowGroupStatistics = statisticsBuilder.buildColumnStatistics();
    rowGroupColumnStatistics.add(rowGroupStatistics);
    columnStatisticsRetainedSizeInBytes += rowGroupStatistics.getRetainedSizeInBytes();

    // the next row group starts from a new builder
    statisticsBuilder = statisticsBuilderSupplier.get();

    return ImmutableMap.of(column, rowGroupStatistics);
}
Aggregations