Search in sources:

Example 71 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.

the class StripeReader method selectRowGroups.

/**
 * Picks the row groups of a stripe that the predicate cannot rule out.
 *
 * <p>The stripe is walked one row group at a time. Each group is assumed to hold
 * {@code rowsInRowGroup} rows, except the last one, which holds whatever remains.
 * For every group the statistics are gathered via {@code getRowGroupStatistics}
 * and handed to {@code predicate.matches} together with the group's row count.
 *
 * @param stripe the stripe whose row count determines how many groups are examined
 * @param columnIndexes row group indexes keyed by stream
 * @return the zero-based indexes of the row groups for which the predicate matched
 */
private Set<Integer> selectRowGroups(StripeInformation stripe, Map<StreamId, List<RowGroupIndex>> columnIndexes) {
    long stripeRowCount = stripe.getNumberOfRows();
    int rowGroupCount = ceil(stripeRowCount, rowsInRowGroup);

    ImmutableSet.Builder<Integer> matchingGroups = ImmutableSet.builder();
    long rowsLeft = stripeRowCount;
    int groupId = 0;
    while (groupId < rowGroupCount) {
        // a full group, or the shorter tail when fewer rows are left
        int groupRowCount = toIntExact(Math.min(rowsLeft, rowsInRowGroup));
        Map<Integer, ColumnStatistics> groupStatistics = getRowGroupStatistics(types.get(0), columnIndexes, groupId);
        boolean mayContainMatches = predicate.matches(groupRowCount, groupStatistics);
        if (mayContainMatches) {
            matchingGroups.add(groupId);
        }
        rowsLeft -= groupRowCount;
        groupId++;
    }
    return matchingGroups.build();
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) ColumnStatistics.mergeColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics.mergeColumnStatistics) ImmutableSet(com.google.common.collect.ImmutableSet) Checkpoints.getDictionaryStreamCheckpoint(com.facebook.presto.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(com.facebook.presto.orc.checkpoint.StreamCheckpoint)

Example 72 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.

the class StripeReader method getRowGroupStatistics.

/**
 * Collects the statistics of one row group, keyed by the ordinal of the field in the root struct.
 *
 * <p>Statistics are first grouped by the column id of the stream they belong to.
 * Streams with an empty index list, or with a {@code null} index at {@code rowGroup},
 * contribute nothing. Fields without any statistics are left out of the result.
 *
 * @param rootStructType the root type; must be of kind {@code STRUCT}
 * @param columnIndexes row group indexes keyed by stream
 * @param rowGroup zero-based row group index; must not be negative
 * @return an immutable map from field ordinal to that field's (possibly merged) statistics
 */
private static Map<Integer, ColumnStatistics> getRowGroupStatistics(OrcType rootStructType, Map<StreamId, List<RowGroupIndex>> columnIndexes, int rowGroup) {
    requireNonNull(rootStructType, "rootStructType is null");
    checkArgument(rootStructType.getOrcTypeKind() == STRUCT);
    requireNonNull(columnIndexes, "columnIndexes is null");
    checkArgument(rowGroup >= 0, "rowGroup is negative");

    // column id -> statistics of every stream of that column for the requested row group
    Map<Integer, List<ColumnStatistics>> statisticsByColumn = new HashMap<>();
    for (Entry<StreamId, List<RowGroupIndex>> streamIndexes : columnIndexes.entrySet()) {
        List<RowGroupIndex> rowGroupIndexes = streamIndexes.getValue();
        if (rowGroupIndexes.isEmpty()) {
            continue;
        }
        RowGroupIndex rowGroupIndex = rowGroupIndexes.get(rowGroup);
        if (rowGroupIndex == null) {
            continue;
        }
        statisticsByColumn
                .computeIfAbsent(streamIndexes.getKey().getColumn(), column -> new ArrayList<>())
                .add(rowGroupIndex.getColumnStatistics());
    }

    ImmutableMap.Builder<Integer, ColumnStatistics> result = ImmutableMap.builder();
    for (int field = 0; field < rootStructType.getFieldCount(); field++) {
        List<ColumnStatistics> fieldStatistics = statisticsByColumn.get(rootStructType.getFieldTypeIndex(field));
        if (fieldStatistics == null) {
            continue;
        }
        // More than one entry means the statistics come from different streams and must be merged.
        // This can happen if map is represented as struct (DWRF only)
        ColumnStatistics fieldResult = fieldStatistics.size() == 1
                ? getOnlyElement(fieldStatistics)
                : mergeColumnStatistics(fieldStatistics);
        result.put(field, fieldResult);
    }
    return result.build();
}
Also used : StripeEncryptionGroup(com.facebook.presto.orc.metadata.StripeEncryptionGroup) OrcTypeKind(com.facebook.presto.orc.metadata.OrcType.OrcTypeKind) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) INDEX(com.facebook.presto.orc.metadata.Stream.StreamArea.INDEX) RowGroupIndex(com.facebook.presto.orc.metadata.RowGroupIndex) Map(java.util.Map) StripeInformation(com.facebook.presto.orc.metadata.StripeInformation) RuntimeStats(com.facebook.presto.common.RuntimeStats) StreamKind(com.facebook.presto.orc.metadata.Stream.StreamKind) InvalidCheckpointException(com.facebook.presto.orc.checkpoint.InvalidCheckpointException) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) InputStreamSource(com.facebook.presto.orc.stream.InputStreamSource) Collection(java.util.Collection) Set(java.util.Set) CheckpointInputStreamSource.createCheckpointStreamSource(com.facebook.presto.orc.stream.CheckpointInputStreamSource.createCheckpointStreamSource) ValueInputStreamSource(com.facebook.presto.orc.stream.ValueInputStreamSource) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) ColumnEncoding(com.facebook.presto.orc.metadata.ColumnEncoding) List(java.util.List) Entry(java.util.Map.Entry) InputStreamSources(com.facebook.presto.orc.stream.InputStreamSources) Optional(java.util.Optional) ValueInputStream(com.facebook.presto.orc.stream.ValueInputStream) DICTIONARY(com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY) DwrfSequenceEncoding(com.facebook.presto.orc.metadata.DwrfSequenceEncoding) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) SortedMap(java.util.SortedMap) DICTIONARY_DATA(com.facebook.presto.orc.metadata.Stream.StreamKind.DICTIONARY_DATA) MoreObjects.toStringHelper(com.google.common.base.MoreObjects.toStringHelper) 
ROW_INDEX(com.facebook.presto.orc.metadata.Stream.StreamKind.ROW_INDEX) Slice(io.airlift.slice.Slice) LENGTH(com.facebook.presto.orc.metadata.Stream.StreamKind.LENGTH) HashMap(java.util.HashMap) Checkpoints.getDictionaryStreamCheckpoint(com.facebook.presto.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) DwrfMetadataReader.toStripeEncryptionGroup(com.facebook.presto.orc.metadata.DwrfMetadataReader.toStripeEncryptionGroup) Multimap(com.google.common.collect.Multimap) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) HiveBloomFilter(com.facebook.presto.orc.metadata.statistics.HiveBloomFilter) HiveWriterVersion(com.facebook.presto.orc.metadata.PostScript.HiveWriterVersion) Objects.requireNonNull(java.util.Objects.requireNonNull) Predicates(com.google.common.base.Predicates) OrcType(com.facebook.presto.orc.metadata.OrcType) Math.toIntExact(java.lang.Math.toIntExact) ImmutableMultimap(com.google.common.collect.ImmutableMultimap) ColumnStatistics.mergeColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics.mergeColumnStatistics) BLOOM_FILTER(com.facebook.presto.orc.metadata.Stream.StreamKind.BLOOM_FILTER) NOOP_ORC_LOCAL_MEMORY_CONTEXT(com.facebook.presto.orc.NoopOrcLocalMemoryContext.NOOP_ORC_LOCAL_MEMORY_CONTEXT) SharedBuffer(com.facebook.presto.orc.stream.SharedBuffer) ColumnEncodingKind(com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind) DICTIONARY_V2(com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY_V2) STRUCT(com.facebook.presto.orc.metadata.OrcType.OrcTypeKind.STRUCT) StreamCheckpoint(com.facebook.presto.orc.checkpoint.StreamCheckpoint) IOException(java.io.IOException) Iterables.getOnlyElement(com.google.common.collect.Iterables.getOnlyElement) Maps(com.google.common.collect.Maps) Stream(com.facebook.presto.orc.metadata.Stream) Math.multiplyExact(java.lang.Math.multiplyExact) StripeFooter(com.facebook.presto.orc.metadata.StripeFooter) 
Checkpoints.getStreamCheckpoints(com.facebook.presto.orc.checkpoint.Checkpoints.getStreamCheckpoints) ValueStreams(com.facebook.presto.orc.stream.ValueStreams) VisibleForTesting(com.google.common.annotations.VisibleForTesting) MetadataReader(com.facebook.presto.orc.metadata.MetadataReader) InputStream(java.io.InputStream) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) ColumnStatistics.mergeColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics.mergeColumnStatistics) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ImmutableMap(com.google.common.collect.ImmutableMap) Checkpoints.getDictionaryStreamCheckpoint(com.facebook.presto.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(com.facebook.presto.orc.checkpoint.StreamCheckpoint) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList)

Example 73 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.

the class DwrfMetadataReader method decryptAndCombineFileStatistics.

/**
 * Returns the file-level column statistics with the entries of requested encrypted nodes
 * replaced by their decrypted versions.
 *
 * <p>If {@code nodeToIntermediateKeys} or {@code fileStats} is empty, {@code fileStats} is
 * returned as is. Otherwise every encryption group of {@code dwrfEncryption} is visited, and
 * for each of its nodes that has an entry in {@code nodeToIntermediateKeys} the encrypted
 * statistics are read, decrypted, parsed and written over the matching positions of a copy
 * of {@code fileStats}. Nodes without an intermediate key are skipped.
 *
 * <p>One decryptor is created per encryption group, on the first requested node of that group.
 * Its data key comes from the group's own key metadata when present, otherwise from the key
 * metadata of the first stripe.
 *
 * @param hiveWriterVersion passed through to {@code toColumnStatistics}
 * @param dwrfEncryption source of the encryption groups; must not be null
 * @param encryptionLibrary used to decrypt the data key and to build the decryptor; must not be null
 * @param fileStats the file statistics as read; decrypted entries overwrite a copy of this list
 * @param fileStripes stripes of the file; only the first stripe's key metadata is consulted
 * @param nodeToIntermediateKeys intermediate keys by node id; also decides which nodes get decrypted
 * @param orcDataSource supplies the data source id for the input stream and for error reporting
 * @param decompressor handed to the {@code OrcInputStream} that reads the encrypted statistics
 * @return {@code fileStats} itself when nothing has to be decrypted, otherwise an immutable copy
 *         containing the decrypted entries
 * @throws NullPointerException if {@code dwrfEncryption} or {@code encryptionLibrary} is null
 * @throws OrcCorruptionException if reading or decrypting the statistics of a node fails with an {@code IOException}
 */
private List<ColumnStatistics> decryptAndCombineFileStatistics(HiveWriterVersion hiveWriterVersion, DwrfEncryption dwrfEncryption, EncryptionLibrary encryptionLibrary, List<ColumnStatistics> fileStats, List<StripeInformation> fileStripes, Map<Integer, Slice> nodeToIntermediateKeys, OrcDataSource orcDataSource, Optional<OrcDecompressor> decompressor) {
    requireNonNull(dwrfEncryption, "dwrfEncryption is null");
    requireNonNull(encryptionLibrary, "encryptionLibrary is null");
    // nothing requested or nothing to patch: hand back the input untouched
    if (nodeToIntermediateKeys.isEmpty() || fileStats.isEmpty()) {
        return fileStats;
    }
    // working copy; decrypted entries are written into it in place below
    ColumnStatistics[] decryptedFileStats = fileStats.toArray(new ColumnStatistics[0]);
    List<EncryptionGroup> encryptionGroups = dwrfEncryption.getEncryptionGroups();
    // fallback keys from the first stripe, expected to line up one-to-one with the encryption groups
    List<byte[]> stripeKeys = null;
    if (!fileStripes.isEmpty() && !fileStripes.get(0).getKeyMetadata().isEmpty()) {
        stripeKeys = fileStripes.get(0).getKeyMetadata();
        checkState(stripeKeys.size() == encryptionGroups.size(), "Number of keys in the first stripe must be the same as the number of encryption groups");
    }
    // NOTE(review): the comment below looks like the tail of a longer, truncated sentence - confirm the original wording
    // node is added to the encryption group
    for (int groupIdx = 0; groupIdx < encryptionGroups.size(); groupIdx++) {
        EncryptionGroup encryptionGroup = encryptionGroups.get(groupIdx);
        // created lazily on the first requested node and reused for the remaining nodes of this group
        DwrfDataEncryptor decryptor = null;
        List<Integer> nodes = encryptionGroup.getNodes();
        for (int i = 0; i < nodes.size(); i++) {
            Integer nodeId = nodes.get(i);
            // do decryption only for those nodes that are requested (part of the projection)
            if (!nodeToIntermediateKeys.containsKey(nodeId)) {
                continue;
            }
            if (decryptor == null) {
                // DEK for the FileStats can be stored either in the footer or/and in the first stripe.
                // The key in the footer takes priority over the key in the first stripe.
                byte[] encryptedDataKeyWithMeta = null;
                if (encryptionGroup.getKeyMetadata().isPresent()) {
                    encryptedDataKeyWithMeta = encryptionGroup.getKeyMetadata().get().byteArray();
                } else if (stripeKeys != null) {
                    encryptedDataKeyWithMeta = stripeKeys.get(groupIdx);
                }
                checkState(encryptedDataKeyWithMeta != null, "DEK for %s encryption group is null", groupIdx);
                // decrypt the DEK which is encrypted using the IEK passed into a record reader
                byte[] intermediateKey = nodeToIntermediateKeys.get(nodeId).byteArray();
                byte[] dataKey = encryptionLibrary.decryptKey(intermediateKey, encryptedDataKeyWithMeta, 0, encryptedDataKeyWithMeta.length);
                decryptor = new DwrfDataEncryptor(dataKey, encryptionLibrary);
            }
            // decrypt the FileStats
            // the i-th statistics slice of the group is read for the i-th node of the group
            Slice encryptedFileStats = encryptionGroup.getStatistics().get(i);
            try (OrcInputStream inputStream = new OrcInputStream(orcDataSource.getId(), // Memory is not accounted as the buffer is expected to be tiny and will be immediately discarded
            new SharedBuffer(NOOP_ORC_LOCAL_MEMORY_CONTEXT), new BasicSliceInput(encryptedFileStats), decompressor, Optional.of(decryptor), NOOP_ORC_AGGREGATED_MEMORY_CONTEXT, encryptedFileStats.length())) {
                CodedInputStream input = CodedInputStream.newInstance(inputStream);
                DwrfProto.FileStatistics nodeStats = DwrfProto.FileStatistics.parseFrom(input);
                // FileStatistics contains ColumnStatistics for the node and all its child nodes (subtree)
                // presumably the subtree occupies consecutive node ids starting at nodeId, and fileStats
                // is indexed by node id - TODO confirm, since nodeId + statsIdx is not bounds-checked here
                for (int statsIdx = 0; statsIdx < nodeStats.getStatisticsCount(); statsIdx++) {
                    decryptedFileStats[nodeId + statsIdx] = toColumnStatistics(hiveWriterVersion, nodeStats.getStatistics(statsIdx), false, null);
                }
            } catch (IOException e) {
                throw new OrcCorruptionException(e, orcDataSource.getId(), "Failed to read or decrypt FileStatistics for node %s", nodeId);
            }
        }
    }
    return ImmutableList.copyOf(decryptedFileStats);
}
Also used : ColumnStatistics.createColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics.createColumnStatistics) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) DwrfDataEncryptor(com.facebook.presto.orc.DwrfDataEncryptor) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) CodedInputStream(com.facebook.presto.orc.protobuf.CodedInputStream) DwrfProto(com.facebook.presto.orc.proto.DwrfProto) IOException(java.io.IOException) BasicSliceInput(io.airlift.slice.BasicSliceInput) SharedBuffer(com.facebook.presto.orc.stream.SharedBuffer) OrcMetadataReader.byteStringToSlice(com.facebook.presto.orc.metadata.OrcMetadataReader.byteStringToSlice) Slice(io.airlift.slice.Slice) OrcCorruptionException(com.facebook.presto.orc.OrcCorruptionException)

Example 74 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.

the class TupleDomainOrcPredicate method matches.

/**
 * Decides whether a section of the file may contain rows satisfying the effective predicate.
 *
 * <p>A section is rejected when the effective predicate has no domains at all, or when
 * {@code columnOverlaps} reports no overlap for some referenced column that has both a
 * predicate domain and statistics. Columns lacking either one cannot be used to exclude
 * the section and are ignored.
 *
 * @param numberOfRows number of rows in the section, forwarded to {@code columnOverlaps}
 * @param statisticsByColumnIndex statistics of the section keyed by column ordinal
 * @return {@code false} if the section can be skipped, {@code true} otherwise
 */
@Override
public boolean matches(long numberOfRows, Map<Integer, ColumnStatistics> statisticsByColumnIndex) {
    Optional<Map<C, Domain>> domains = effectivePredicate.getDomains();
    if (!domains.isPresent()) {
        // effective predicate is none, so skip this section
        return false;
    }

    Map<C, Domain> domainsByColumn = domains.get();
    for (ColumnReference<C> reference : columnReferences) {
        Domain domain = domainsByColumn.get(reference.getColumn());
        if (domain != null) {
            ColumnStatistics statistics = statisticsByColumnIndex.get(reference.getOrdinal());
            // without statistics for the column the section cannot be excluded
            boolean excluded = statistics != null && !columnOverlaps(reference, domain, numberOfRows, statistics);
            if (excluded) {
                return false;
            }
        }
        // no predicate on the column: nothing to check for it
    }

    // no column ruled the section out
    return true;
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) Domain(com.facebook.presto.common.predicate.Domain) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) Map(java.util.Map)

Example 75 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.

the class IcebergOrcFileWriter method computeMetrics.

/**
 * Builds Iceberg {@code Metrics} for a written ORC file from its column statistics.
 *
 * <p>When {@code columnStatistics} is empty only the row count is reported. Otherwise every
 * ORC column except the root (index 0) and the columns returned by {@code getExcludedColumns}
 * contributes a value count, a null count when the number of values is known, and lower and
 * upper bounds when {@code toIcebergMinMax} yields them. Maps that end up empty are passed
 * to {@code Metrics} as {@code null}.
 *
 * @param icebergSchema schema used to resolve the Iceberg field of each ORC column
 * @param orcRowTypes flattened ORC types; index 0 is the root
 * @param fileRowCount number of rows in the file
 * @param columnStatistics file-level statistics, read at the same index as {@code orcRowTypes}
 * @return the metrics for the file
 */
private static Metrics computeMetrics(Schema icebergSchema, List<OrcType> orcRowTypes, long fileRowCount, List<ColumnStatistics> columnStatistics) {
    if (columnStatistics.isEmpty()) {
        return new Metrics(fileRowCount, null, null, null, null, null, null);
    }

    // Descendants of LIST or MAP types are left out for two reasons:
    // 1. Apache Iceberg does not use their stats to filter out data files
    // 2. Their record count can exceed the table-level row count, so nullCounts cannot be derived reliably.
    // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
    Set<Integer> skippedColumns = getExcludedColumns(orcRowTypes);

    ImmutableMap.Builder<Integer, Long> valueCountsByFieldId = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, Long> nullCountsByFieldId = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> lowerBoundsByFieldId = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> upperBoundsByFieldId = ImmutableMap.builder();

    // start at 1: OrcColumnId(0) is the root column that represents file-level schema
    for (int columnId = 1; columnId < orcRowTypes.size(); columnId++) {
        if (!skippedColumns.contains(columnId)) {
            OrcType columnType = orcRowTypes.get(columnId);
            ColumnStatistics stats = columnStatistics.get(columnId);
            int fieldId = getIcebergId(columnType);
            NestedField field = icebergSchema.findField(fieldId);
            verify(field != null, "Cannot find Iceberg column with ID %s in schema %s", fieldId, icebergSchema);

            valueCountsByFieldId.put(fieldId, fileRowCount);
            if (stats.hasNumberOfValues()) {
                long nullCount = fileRowCount - stats.getNumberOfValues();
                nullCountsByFieldId.put(fieldId, nullCount);
            }
            toIcebergMinMax(stats, field.type()).ifPresent(bounds -> {
                lowerBoundsByFieldId.put(fieldId, bounds.getMin());
                upperBoundsByFieldId.put(fieldId, bounds.getMax());
            });
        }
    }

    Map<Integer, Long> valueCounts = valueCountsByFieldId.build();
    Map<Integer, Long> nullCounts = nullCountsByFieldId.build();
    Map<Integer, ByteBuffer> lowerBounds = lowerBoundsByFieldId.build();
    Map<Integer, ByteBuffer> upperBounds = upperBoundsByFieldId.build();

    return new Metrics(
            fileRowCount,
            // TODO: Add column size accounting to ORC column writers
            null,
            valueCounts.isEmpty() ? null : valueCounts,
            nullCounts.isEmpty() ? null : nullCounts,
            null,
            lowerBounds.isEmpty() ? null : lowerBounds,
            upperBounds.isEmpty() ? null : upperBounds);
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) ByteBuffer(java.nio.ByteBuffer) ImmutableMap(com.google.common.collect.ImmutableMap) NestedField(org.apache.iceberg.types.Types.NestedField) Metrics(org.apache.iceberg.Metrics) OrcType(com.facebook.presto.orc.metadata.OrcType)

Aggregations

ColumnStatistics (com.facebook.presto.orc.metadata.statistics.ColumnStatistics): 99; ImmutableList (com.google.common.collect.ImmutableList): 46; Slice (io.airlift.slice.Slice): 46; List (java.util.List): 46; Stream (com.facebook.presto.orc.metadata.Stream): 38; ArrayList (java.util.ArrayList): 38; RowGroupIndex (com.facebook.presto.orc.metadata.RowGroupIndex): 32; StreamDataOutput (com.facebook.presto.orc.stream.StreamDataOutput): 32; BooleanStreamCheckpoint (com.facebook.presto.orc.checkpoint.BooleanStreamCheckpoint): 26; PresentOutputStream (com.facebook.presto.orc.stream.PresentOutputStream): 26; ImmutableMap (com.google.common.collect.ImmutableMap): 23; LongOutputStream (com.facebook.presto.orc.stream.LongOutputStream): 16; OrcType (com.facebook.presto.orc.metadata.OrcType): 15; LongStreamCheckpoint (com.facebook.presto.orc.checkpoint.LongStreamCheckpoint): 14; Map (java.util.Map): 14; Type (com.facebook.presto.common.type.Type): 13; ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList): 12; IOException (java.io.IOException): 12; HashMap (java.util.HashMap): 12; Optional (java.util.Optional): 12