Search in sources:

Example 16 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.

the class TestDecryption method validateFileStatistics.

/**
 * Validates the file-level column statistics stored in the footer of {@code tempFile}.
 *
 * <p>The file is read once without any keys and then once for every subset of
 * {@code readerIntermediateKeys}. For each node the following is asserted:
 * <ul>
 * <li>nodes covered by a writer encryption group carry no type-specific stats when read without keys;</li>
 * <li>nodes outside every encryption group have type-matching stats that are equal with and without keys;</li>
 * <li>nodes whose subtree root key is supplied to the reader have type-matching stats.</li>
 * </ul>
 * Files without stripes are only checked to have no file stats. Nothing else is validated when
 * {@code dwrfWriterEncryption} is absent.
 *
 * @param tempFile file to read back
 * @param dwrfWriterEncryption encryption settings the file was written with, if any
 * @param readerIntermediateKeys intermediate keys keyed by node id; every subset of these keys is tried
 * @throws IOException propagated from the reader construction
 */
private static void validateFileStatistics(TempFile tempFile, Optional<DwrfWriterEncryption> dwrfWriterEncryption, Map<Integer, Slice> readerIntermediateKeys) throws IOException {
    OrcReader readerNoKeys = OrcTester.createCustomOrcReader(tempFile, DWRF, false, ImmutableMap.of());
    if (readerNoKeys.getFooter().getStripes().isEmpty()) {
        // files w/o stripes don't have stats
        assertEquals(readerNoKeys.getFooter().getFileStats().size(), 0);
        return;
    }
    if (dwrfWriterEncryption.isPresent()) {
        List<OrcType> types = readerNoKeys.getTypes();
        List<ColumnStatistics> fileStatsNoKey = readerNoKeys.getFooter().getFileStats();
        assertEquals(fileStatsNoKey.size(), types.size());
        // every node that belongs to some encryption group, including the children of the group nodes
        Set<Integer> allEncryptedNodes = dwrfWriterEncryption.get().getWriterEncryptionGroups().stream()
                .flatMap(group -> group.getNodes().stream())
                .flatMap(node -> collectNodeTree(types, node).stream())
                .collect(Collectors.toSet());
        for (Set<Integer> readerKeyNodes : Sets.powerSet(readerIntermediateKeys.keySet())) {
            Map<Integer, Slice> readerKeys = new HashMap<>();
            readerKeyNodes.forEach(node -> readerKeys.put(node, readerIntermediateKeys.get(node)));
            // nodes that are supposed to be decrypted by the reader
            Set<Integer> decryptedNodes = readerKeys.keySet().stream()
                    .flatMap(node -> collectNodeTree(types, node).stream())
                    .collect(Collectors.toSet());
            // decryptedNodes should be a subset of encrypted nodes
            assertTrue(allEncryptedNodes.containsAll(decryptedNodes));
            // read with the current key subset only; passing the full key map here would make
            // every iteration identical and leave the partial-key cases untested
            OrcReader readerWithKeys = OrcTester.createCustomOrcReader(tempFile, DWRF, false, readerKeys);
            List<ColumnStatistics> fileStatsWithKey = readerWithKeys.getFooter().getFileStats();
            assertEquals(fileStatsWithKey.size(), types.size());
            for (int node = 0; node < types.size(); node++) {
                ColumnStatistics statsWithKey = fileStatsWithKey.get(node);
                ColumnStatistics statsNoKey = fileStatsNoKey.get(node);
                OrcType type = types.get(node);
                // encrypted nodes should have no type info
                if (allEncryptedNodes.contains(node)) {
                    assertTrue(hasNoTypeStats(statsNoKey));
                } else {
                    // unencrypted nodes must look the same regardless of the keys supplied
                    assertStatsTypeMatch(statsNoKey, type);
                    assertStatsTypeMatch(statsWithKey, type);
                    assertEquals(statsNoKey, statsWithKey);
                }
                if (decryptedNodes.contains(node)) {
                    assertStatsTypeMatch(statsWithKey, type);
                }
            }
        }
    }
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) Test(org.testng.annotations.Test) OrcReader.validateEncryption(com.facebook.presto.orc.OrcReader.validateEncryption) DwrfEncryption(com.facebook.presto.orc.metadata.DwrfEncryption) StripeReader.getDiskRanges(com.facebook.presto.orc.StripeReader.getDiskRanges) INT(com.facebook.presto.orc.metadata.OrcType.OrcTypeKind.INT) Arrays.asList(java.util.Arrays.asList) Slices(io.airlift.slice.Slices) Map(java.util.Map) HIVE_STORAGE_TIME_ZONE(com.facebook.presto.orc.OrcTester.HIVE_STORAGE_TIME_ZONE) StripeInformation(com.facebook.presto.orc.metadata.StripeInformation) RuntimeStats(com.facebook.presto.common.RuntimeStats) ImmutableMap(com.google.common.collect.ImmutableMap) Footer(com.facebook.presto.orc.metadata.Footer) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) OrcTester.assertFileContentsPresto(com.facebook.presto.orc.OrcTester.assertFileContentsPresto) Assert.assertNotNull(org.testng.Assert.assertNotNull) Collectors(java.util.stream.Collectors) Sets(com.google.common.collect.Sets) NOOP_ORC_AGGREGATED_MEMORY_CONTEXT(com.facebook.presto.orc.NoopOrcAggregatedMemoryContext.NOOP_ORC_AGGREGATED_MEMORY_CONTEXT) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) DEFAULT_SEQUENCE_ID(com.facebook.presto.orc.metadata.ColumnEncoding.DEFAULT_SEQUENCE_ID) Resources.getResource(com.google.common.io.Resources.getResource) DataSize(io.airlift.units.DataSize) List(java.util.List) DWRF(com.facebook.presto.orc.OrcEncoding.DWRF) Optional(java.util.Optional) AbstractOrcRecordReader.getDecryptionKeyMetadata(com.facebook.presto.orc.AbstractOrcRecordReader.getDecryptionKeyMetadata) MAP(com.facebook.presto.orc.metadata.OrcType.OrcTypeKind.MAP) IntStream(java.util.stream.IntStream) MAX_BLOCK_SIZE(com.facebook.presto.orc.OrcTester.MAX_BLOCK_SIZE) LIST(com.facebook.presto.orc.metadata.OrcType.OrcTypeKind.LIST) 
ROW_INDEX(com.facebook.presto.orc.metadata.Stream.StreamKind.ROW_INDEX) Slice(io.airlift.slice.Slice) VARCHAR(com.facebook.presto.common.type.VarcharType.VARCHAR) AbstractTestOrcReader.intsBetween(com.facebook.presto.orc.AbstractTestOrcReader.intsBetween) MEGABYTE(io.airlift.units.DataSize.Unit.MEGABYTE) Assert.assertEquals(org.testng.Assert.assertEquals) HashMap(java.util.HashMap) UNKNOWN(com.facebook.presto.orc.metadata.KeyProvider.UNKNOWN) HashSet(java.util.HashSet) OptionalLong(java.util.OptionalLong) Subfield(com.facebook.presto.common.Subfield) ImmutableList(com.google.common.collect.ImmutableList) OrcTester.writeOrcColumnsPresto(com.facebook.presto.orc.OrcTester.writeOrcColumnsPresto) OrcType(com.facebook.presto.orc.metadata.OrcType) Type(com.facebook.presto.common.type.Type) MAX_BATCH_SIZE(com.facebook.presto.orc.OrcReader.MAX_BATCH_SIZE) BIGINT(com.facebook.presto.common.type.BigintType.BIGINT) STRUCT(com.facebook.presto.orc.metadata.OrcType.OrcTypeKind.STRUCT) IOException(java.io.IOException) File(java.io.File) Stream(com.facebook.presto.orc.metadata.Stream) EncryptionGroup(com.facebook.presto.orc.metadata.EncryptionGroup) StorageOrcFileTailSource(com.facebook.presto.orc.cache.StorageOrcFileTailSource) Collectors.toList(java.util.stream.Collectors.toList) OrcTester.rowType(com.facebook.presto.orc.OrcTester.rowType) Assert.assertTrue(org.testng.Assert.assertTrue) DATA(com.facebook.presto.orc.metadata.Stream.StreamKind.DATA) DwrfEncryptionInfo.createNodeToGroupMap(com.facebook.presto.orc.DwrfEncryptionInfo.createNodeToGroupMap) ZSTD(com.facebook.presto.orc.metadata.CompressionKind.ZSTD) OrcType(com.facebook.presto.orc.metadata.OrcType) HashMap(java.util.HashMap) Slice(io.airlift.slice.Slice)

Example 17 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.

the class DwrfMetadataReader method decryptAndCombineFileStatistics.

/**
 * Overlays decrypted file-level statistics onto {@code fileStats}.
 *
 * <p>For every encryption group, each group node that has an entry in
 * {@code nodeToIntermediateKeys} gets its encrypted {@code FileStatistics} decrypted and parsed.
 * The resulting column statistics are written into a copy of {@code fileStats}, starting at the
 * node's index and continuing for as many entries as the parsed message contains.
 *
 * @return {@code fileStats} itself when there are no keys or no stats; otherwise an immutable copy
 * with the decrypted entries substituted
 * @throws OrcCorruptionException when reading or decrypting the statistics of a node fails with an {@link IOException}
 */
private List<ColumnStatistics> decryptAndCombineFileStatistics(HiveWriterVersion hiveWriterVersion, DwrfEncryption dwrfEncryption, EncryptionLibrary encryptionLibrary, List<ColumnStatistics> fileStats, List<StripeInformation> fileStripes, Map<Integer, Slice> nodeToIntermediateKeys, OrcDataSource orcDataSource, Optional<OrcDecompressor> decompressor) {
    requireNonNull(dwrfEncryption, "dwrfEncryption is null");
    requireNonNull(encryptionLibrary, "encryptionLibrary is null");

    // without keys or without stats there is nothing to decrypt or combine
    if (nodeToIntermediateKeys.isEmpty() || fileStats.isEmpty()) {
        return fileStats;
    }

    ColumnStatistics[] combinedStats = fileStats.toArray(new ColumnStatistics[0]);
    List<EncryptionGroup> groups = dwrfEncryption.getEncryptionGroups();

    // keys kept in the first stripe serve as a fallback when a group has no key in the footer
    List<byte[]> firstStripeKeys = null;
    if (!fileStripes.isEmpty()) {
        List<byte[]> candidateKeys = fileStripes.get(0).getKeyMetadata();
        if (!candidateKeys.isEmpty()) {
            firstStripeKeys = candidateKeys;
            checkState(firstStripeKeys.size() == groups.size(), "Number of keys in the first stripe must be the same as the number of encryption groups");
        }
    }

    for (int groupIndex = 0; groupIndex < groups.size(); groupIndex++) {
        EncryptionGroup group = groups.get(groupIndex);
        List<Integer> groupNodes = group.getNodes();
        // created on the first requested node of the group and reused for the rest of the group
        DwrfDataEncryptor groupDecryptor = null;

        for (int position = 0; position < groupNodes.size(); position++) {
            Integer node = groupNodes.get(position);

            // only nodes with a supplied intermediate key (part of the projection) are decrypted
            if (nodeToIntermediateKeys.containsKey(node)) {
                if (groupDecryptor == null) {
                    // The DEK for the FileStats may live in the footer and/or in the first stripe;
                    // the footer key wins when both are available.
                    byte[] wrappedDataKey = group.getKeyMetadata().isPresent()
                            ? group.getKeyMetadata().get().byteArray()
                            : (firstStripeKeys != null ? firstStripeKeys.get(groupIndex) : null);
                    checkState(wrappedDataKey != null, "DEK for %s encryption group is null", groupIndex);

                    // unwrap the DEK with the IEK that was passed into the record reader
                    byte[] intermediateKey = nodeToIntermediateKeys.get(node).byteArray();
                    byte[] dataKey = encryptionLibrary.decryptKey(intermediateKey, wrappedDataKey, 0, wrappedDataKey.length);
                    groupDecryptor = new DwrfDataEncryptor(dataKey, encryptionLibrary);
                }

                Slice encryptedStats = group.getStatistics().get(position);
                // Memory is not accounted as the buffer is expected to be tiny and will be immediately discarded
                try (OrcInputStream statsStream = new OrcInputStream(
                        orcDataSource.getId(),
                        new SharedBuffer(NOOP_ORC_LOCAL_MEMORY_CONTEXT),
                        new BasicSliceInput(encryptedStats),
                        decompressor,
                        Optional.of(groupDecryptor),
                        NOOP_ORC_AGGREGATED_MEMORY_CONTEXT,
                        encryptedStats.length())) {
                    DwrfProto.FileStatistics subtreeStats = DwrfProto.FileStatistics.parseFrom(CodedInputStream.newInstance(statsStream));
                    // the message holds stats for the node followed by all nodes of its subtree
                    for (int offset = 0, count = subtreeStats.getStatisticsCount(); offset < count; offset++) {
                        combinedStats[node + offset] = toColumnStatistics(hiveWriterVersion, subtreeStats.getStatistics(offset), false, null);
                    }
                } catch (IOException e) {
                    throw new OrcCorruptionException(e, orcDataSource.getId(), "Failed to read or decrypt FileStatistics for node %s", node);
                }
            }
        }
    }
    return ImmutableList.copyOf(combinedStats);
}
Also used : ColumnStatistics.createColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics.createColumnStatistics) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) DwrfDataEncryptor(com.facebook.presto.orc.DwrfDataEncryptor) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) CodedInputStream(com.facebook.presto.orc.protobuf.CodedInputStream) DwrfProto(com.facebook.presto.orc.proto.DwrfProto) IOException(java.io.IOException) BasicSliceInput(io.airlift.slice.BasicSliceInput) SharedBuffer(com.facebook.presto.orc.stream.SharedBuffer) OrcMetadataReader.byteStringToSlice(com.facebook.presto.orc.metadata.OrcMetadataReader.byteStringToSlice) Slice(io.airlift.slice.Slice) OrcCorruptionException(com.facebook.presto.orc.OrcCorruptionException)

Example 18 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.

the class DwrfMetadataReader method readFooter.

/**
 * Parses a DWRF footer from {@code inputStream} and converts it into a {@link Footer}.
 *
 * <p>When the footer declares encryption, the intermediate keys obtained from
 * {@code dwrfKeyProvider} are used to decrypt the file statistics before they are placed in the
 * result. The thread CPU time spent from the start of the method up to that point is recorded
 * under the {@code DwrfReadFooterTimeNanos} metric.
 *
 * @throws IOException declared by the overridden method; may come from parsing the footer
 */
@Override
public Footer readFooter(HiveWriterVersion hiveWriterVersion, InputStream inputStream, DwrfEncryptionProvider dwrfEncryptionProvider, DwrfKeyProvider dwrfKeyProvider, OrcDataSource orcDataSource, Optional<OrcDecompressor> decompressor) throws IOException {
    long startCpuNanos = THREAD_MX_BEAN.getCurrentThreadCpuTime();

    DwrfProto.Footer protoFooter = DwrfProto.Footer.parseFrom(CodedInputStream.newInstance(inputStream));
    List<ColumnStatistics> fileStatistics = toColumnStatistics(hiveWriterVersion, protoFooter.getStatisticsList(), false);
    List<StripeInformation> stripes = toStripeInformation(protoFooter.getStripesList());
    List<OrcType> orcTypes = toType(protoFooter.getTypesList());

    Optional<DwrfEncryption> encryption;
    if (protoFooter.hasEncryption()) {
        encryption = Optional.of(toEncryption(protoFooter.getEncryption()));
    } else {
        encryption = Optional.empty();
    }
    Optional<List<Integer>> stripeCacheOffsets = Optional.of(protoFooter.getStripeCacheOffsetsList());

    if (encryption.isPresent()) {
        // swap in decrypted stats for the nodes the key provider has keys for
        DwrfEncryption dwrfEncryption = encryption.get();
        Map<Integer, Slice> intermediateKeys = dwrfKeyProvider.getIntermediateKeys(orcTypes);
        EncryptionLibrary library = dwrfEncryptionProvider.getEncryptionLibrary(dwrfEncryption.getKeyProvider());
        fileStatistics = decryptAndCombineFileStatistics(hiveWriterVersion, dwrfEncryption, library, fileStatistics, stripes, intermediateKeys, orcDataSource, decompressor);
    }

    long elapsedCpuNanos = THREAD_MX_BEAN.getCurrentThreadCpuTime() - startCpuNanos;
    runtimeStats.addMetricValue("DwrfReadFooterTimeNanos", elapsedCpuNanos);

    OptionalLong rawDataSize;
    if (protoFooter.hasRawDataSize()) {
        rawDataSize = OptionalLong.of(protoFooter.getRawDataSize());
    } else {
        rawDataSize = OptionalLong.empty();
    }

    return new Footer(
            protoFooter.getNumberOfRows(),
            protoFooter.getRowIndexStride(),
            rawDataSize,
            stripes,
            orcTypes,
            fileStatistics,
            toUserMetadata(protoFooter.getMetadataList()),
            encryption,
            stripeCacheOffsets);
}
Also used : ColumnStatistics.createColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics.createColumnStatistics) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) CodedInputStream(com.facebook.presto.orc.protobuf.CodedInputStream) DwrfProto(com.facebook.presto.orc.proto.DwrfProto) EncryptionLibrary(com.facebook.presto.orc.EncryptionLibrary) OrcMetadataReader.byteStringToSlice(com.facebook.presto.orc.metadata.OrcMetadataReader.byteStringToSlice) Slice(io.airlift.slice.Slice) OptionalLong(java.util.OptionalLong) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList)

Example 19 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.

the class StructColumnWriter method getIndexStreams.

/**
 * Builds the ROW_INDEX stream of this struct column and appends the index streams of all
 * struct fields.
 *
 * <p>One {@link RowGroupIndex} is produced per entry of {@code rowGroupColumnStatistics}, pairing
 * the row group's statistics with the position list derived from its present-stream checkpoint
 * (when checkpoints are available). The writer must already be closed.
 *
 * @return this column's index stream followed by the index streams of every struct field
 * @throws IOException propagated from the struct field writers
 */
@Override
public List<StreamDataOutput> getIndexStreams() throws IOException {
    checkState(closed);

    ImmutableList.Builder<RowGroupIndex> indexes = ImmutableList.builder();
    Optional<List<BooleanStreamCheckpoint>> presentCheckpoints = presentStream.getCheckpoints();
    for (int rowGroup = 0; rowGroup < rowGroupColumnStatistics.size(); rowGroup++) {
        ColumnStatistics statistics = rowGroupColumnStatistics.get(rowGroup);
        // pick this row group's checkpoint, if the present stream produced checkpoints at all
        Optional<BooleanStreamCheckpoint> presentCheckpoint = presentCheckpoints.isPresent()
                ? Optional.ofNullable(presentCheckpoints.get().get(rowGroup))
                : Optional.empty();
        indexes.add(new RowGroupIndex(createStructColumnPositionList(compressed, presentCheckpoint), statistics));
    }

    Slice rowIndexSlice = metadataWriter.writeRowIndexes(indexes.build());
    StreamDataOutput ownIndex = new StreamDataOutput(rowIndexSlice, new Stream(column, StreamKind.ROW_INDEX, rowIndexSlice.length(), false));

    ImmutableList.Builder<StreamDataOutput> result = ImmutableList.builder();
    result.add(ownIndex);
    // nested field indexes follow the struct's own index
    for (ColumnWriter fieldWriter : structFields) {
        result.addAll(fieldWriter.getIndexStreams());
    }
    return result.build();
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) BooleanStreamCheckpoint(com.facebook.presto.orc.checkpoint.BooleanStreamCheckpoint) ImmutableList(com.google.common.collect.ImmutableList) StreamDataOutput(com.facebook.presto.orc.stream.StreamDataOutput) BooleanStreamCheckpoint(com.facebook.presto.orc.checkpoint.BooleanStreamCheckpoint) RowGroupIndex(com.facebook.presto.orc.metadata.RowGroupIndex) Slice(io.airlift.slice.Slice) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) PresentOutputStream(com.facebook.presto.orc.stream.PresentOutputStream) Stream(com.facebook.presto.orc.metadata.Stream)

Example 20 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.

the class LongColumnWriter method finishRowGroup.

/**
 * Closes out the current row group: builds its statistics, records them, and starts a fresh
 * statistics builder for the next row group. The writer must not be closed.
 *
 * @return a single-entry map from this writer's column to the statistics of the finished row group
 */
@Override
public Map<Integer, ColumnStatistics> finishRowGroup() {
    checkState(!closed);

    ColumnStatistics rowGroupStatistics = statisticsBuilder.buildColumnStatistics();

    // remember the stats and keep the retained-size bookkeeping in sync
    rowGroupColumnStatistics.add(rowGroupStatistics);
    columnStatisticsRetainedSizeInBytes += rowGroupStatistics.getRetainedSizeInBytes();

    // the next row group accumulates into a new builder
    statisticsBuilder = statisticsBuilderSupplier.get();

    return ImmutableMap.of(column, rowGroupStatistics);
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics)

Aggregations

ColumnStatistics (com.facebook.presto.orc.metadata.statistics.ColumnStatistics)46 ImmutableList (com.google.common.collect.ImmutableList)22 Slice (io.airlift.slice.Slice)22 List (java.util.List)22 ArrayList (java.util.ArrayList)19 Stream (com.facebook.presto.orc.metadata.Stream)18 StreamDataOutput (com.facebook.presto.orc.stream.StreamDataOutput)15 RowGroupIndex (com.facebook.presto.orc.metadata.RowGroupIndex)14 BooleanStreamCheckpoint (com.facebook.presto.orc.checkpoint.BooleanStreamCheckpoint)12 PresentOutputStream (com.facebook.presto.orc.stream.PresentOutputStream)12 ImmutableMap (com.google.common.collect.ImmutableMap)11 LongStreamCheckpoint (com.facebook.presto.orc.checkpoint.LongStreamCheckpoint)7 OrcType (com.facebook.presto.orc.metadata.OrcType)7 LongOutputStream (com.facebook.presto.orc.stream.LongOutputStream)7 Map (java.util.Map)7 Type (com.facebook.presto.common.type.Type)6 IOException (java.io.IOException)6 HashMap (java.util.HashMap)6 ColumnEncoding (com.facebook.presto.orc.metadata.ColumnEncoding)5 StripeEncryptionGroup (com.facebook.presto.orc.metadata.StripeEncryptionGroup)5