Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.
The class StripeReader, method selectRowGroups.
private Set<Integer> selectRowGroups(StripeInformation stripe, Map<StreamId, List<RowGroupIndex>> columnIndexes) {
    long rowsInStripe = stripe.getNumberOfRows();
    int groupsInStripe = ceil(rowsInStripe, rowsInRowGroup);
    ImmutableSet.Builder<Integer> selectedRowGroups = ImmutableSet.builder();
    long remainingRows = rowsInStripe;
    for (int rowGroup = 0; rowGroup < groupsInStripe; ++rowGroup) {
        int rows = toIntExact(Math.min(remainingRows, rowsInRowGroup));
        Map<Integer, ColumnStatistics> statistics = getRowGroupStatistics(types.get(0), columnIndexes, rowGroup);
        if (predicate.matches(rows, statistics)) {
            selectedRowGroups.add(rowGroup);
        }
        remainingRows -= rows;
    }
    return selectedRowGroups.build();
}
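The ceil helper called above is not shown in this snippet. A plausible reconstruction, assuming it performs ceiling division so a partially filled final row group is still counted:

// Hypothetical reconstruction of the ceil helper; the signature and behavior
// are inferred from the call site, not taken from the project source.
private static int ceil(long dividend, int divisor) {
    return toIntExact((dividend + divisor - 1) / divisor);
}

With rowsInStripe = 10_000 and rowsInRowGroup = 4_096, this yields 3 row groups: two full groups of 4_096 rows and a final group of 1_808 rows.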
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.
The class StripeReader, method getRowGroupStatistics.
private static Map<Integer, ColumnStatistics> getRowGroupStatistics(OrcType rootStructType, Map<StreamId, List<RowGroupIndex>> columnIndexes, int rowGroup) {
    requireNonNull(rootStructType, "rootStructType is null");
    checkArgument(rootStructType.getOrcTypeKind() == STRUCT);
    requireNonNull(columnIndexes, "columnIndexes is null");
    checkArgument(rowGroup >= 0, "rowGroup is negative");
    Map<Integer, List<ColumnStatistics>> groupedColumnStatistics = new HashMap<>();
    for (Entry<StreamId, List<RowGroupIndex>> entry : columnIndexes.entrySet()) {
        if (!entry.getValue().isEmpty() && entry.getValue().get(rowGroup) != null) {
            groupedColumnStatistics.computeIfAbsent(entry.getKey().getColumn(), key -> new ArrayList<>())
                    .add(entry.getValue().get(rowGroup).getColumnStatistics());
        }
    }
    ImmutableMap.Builder<Integer, ColumnStatistics> statistics = ImmutableMap.builder();
    for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) {
        List<ColumnStatistics> columnStatistics = groupedColumnStatistics.get(rootStructType.getFieldTypeIndex(ordinal));
        if (columnStatistics != null) {
            if (columnStatistics.size() == 1) {
                statistics.put(ordinal, getOnlyElement(columnStatistics));
            } else {
                // Merge statistics from different streams
                // This can happen if map is represented as struct (DWRF only)
                statistics.put(ordinal, mergeColumnStatistics(columnStatistics));
            }
        }
    }
    return statistics.build();
}
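The computeIfAbsent idiom above collects one ColumnStatistics per stream under its column number, so a column backed by several streams ends up with a list that the merge branch combines. A minimal standalone sketch of the same grouping pattern, with strings standing in for the ORC types:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class GroupingSketch {
    public static void main(String[] args) {
        // Hypothetical stream-name -> column-number mapping; two streams share
        // column 3, which is the case the merge branch above handles.
        Map<String, Integer> streamToColumn = Map.of("data", 3, "length", 3, "present", 7);

        Map<Integer, List<String>> grouped = new HashMap<>();
        for (Map.Entry<String, Integer> entry : streamToColumn.entrySet()) {
            grouped.computeIfAbsent(entry.getValue(), key -> new ArrayList<>()).add(entry.getKey());
        }
        // Column 3 now holds two entries (would be merged); column 7 holds one.
        System.out.println(grouped);
    }
}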
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.
The class DwrfMetadataReader, method decryptAndCombineFileStatistics.
private List<ColumnStatistics> decryptAndCombineFileStatistics(HiveWriterVersion hiveWriterVersion, DwrfEncryption dwrfEncryption, EncryptionLibrary encryptionLibrary, List<ColumnStatistics> fileStats, List<StripeInformation> fileStripes, Map<Integer, Slice> nodeToIntermediateKeys, OrcDataSource orcDataSource, Optional<OrcDecompressor> decompressor) {
    requireNonNull(dwrfEncryption, "dwrfEncryption is null");
    requireNonNull(encryptionLibrary, "encryptionLibrary is null");
    if (nodeToIntermediateKeys.isEmpty() || fileStats.isEmpty()) {
        return fileStats;
    }
    ColumnStatistics[] decryptedFileStats = fileStats.toArray(new ColumnStatistics[0]);
    List<EncryptionGroup> encryptionGroups = dwrfEncryption.getEncryptionGroups();
    List<byte[]> stripeKeys = null;
    if (!fileStripes.isEmpty() && !fileStripes.get(0).getKeyMetadata().isEmpty()) {
        stripeKeys = fileStripes.get(0).getKeyMetadata();
        checkState(stripeKeys.size() == encryptionGroups.size(), "Number of keys in the first stripe must be the same as the number of encryption groups");
    }
    // process each encryption group; a decryptor is created lazily for the first requested node in the group
    for (int groupIdx = 0; groupIdx < encryptionGroups.size(); groupIdx++) {
        EncryptionGroup encryptionGroup = encryptionGroups.get(groupIdx);
        DwrfDataEncryptor decryptor = null;
        List<Integer> nodes = encryptionGroup.getNodes();
        for (int i = 0; i < nodes.size(); i++) {
            Integer nodeId = nodes.get(i);
            // do decryption only for those nodes that are requested (part of the projection)
            if (!nodeToIntermediateKeys.containsKey(nodeId)) {
                continue;
            }
            if (decryptor == null) {
                // The DEK for the FileStats can be stored in the footer and/or in the first stripe.
                // The key in the footer takes priority over the key in the first stripe.
                byte[] encryptedDataKeyWithMeta = null;
                if (encryptionGroup.getKeyMetadata().isPresent()) {
                    encryptedDataKeyWithMeta = encryptionGroup.getKeyMetadata().get().byteArray();
                } else if (stripeKeys != null) {
                    encryptedDataKeyWithMeta = stripeKeys.get(groupIdx);
                }
                checkState(encryptedDataKeyWithMeta != null, "DEK for %s encryption group is null", groupIdx);
                // decrypt the DEK, which is encrypted using the IEK passed into a record reader
                byte[] intermediateKey = nodeToIntermediateKeys.get(nodeId).byteArray();
                byte[] dataKey = encryptionLibrary.decryptKey(intermediateKey, encryptedDataKeyWithMeta, 0, encryptedDataKeyWithMeta.length);
                decryptor = new DwrfDataEncryptor(dataKey, encryptionLibrary);
            }
            // decrypt the FileStats
            Slice encryptedFileStats = encryptionGroup.getStatistics().get(i);
            // Memory is not accounted because the buffer is expected to be tiny and is immediately discarded
            try (OrcInputStream inputStream = new OrcInputStream(
                    orcDataSource.getId(),
                    new SharedBuffer(NOOP_ORC_LOCAL_MEMORY_CONTEXT),
                    new BasicSliceInput(encryptedFileStats),
                    decompressor,
                    Optional.of(decryptor),
                    NOOP_ORC_AGGREGATED_MEMORY_CONTEXT,
                    encryptedFileStats.length())) {
                CodedInputStream input = CodedInputStream.newInstance(inputStream);
                DwrfProto.FileStatistics nodeStats = DwrfProto.FileStatistics.parseFrom(input);
                // FileStatistics contains ColumnStatistics for the node and all its child nodes (subtree)
                for (int statsIdx = 0; statsIdx < nodeStats.getStatisticsCount(); statsIdx++) {
                    decryptedFileStats[nodeId + statsIdx] = toColumnStatistics(hiveWriterVersion, nodeStats.getStatistics(statsIdx), false, null);
                }
            } catch (IOException e) {
                throw new OrcCorruptionException(e, orcDataSource.getId(), "Failed to read or decrypt FileStatistics for node %s", nodeId);
            }
        }
    }
    return ImmutableList.copyOf(decryptedFileStats);
}
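EncryptionLibrary is pluggable, so the exact key-wrapping format is implementation specific. As a rough, self-contained illustration of the envelope scheme the comments describe (an intermediate key, IEK, unwrapping a data key, DEK), here is a JCE-based sketch; the AES/ECB choice and the bare key layout are assumptions for illustration, not Presto's actual library:

import javax.crypto.Cipher;
import javax.crypto.spec.SecretKeySpec;

public final class EnvelopeKeySketch {
    private EnvelopeKeySketch() {}

    // Unwrap a data-encryption key with an intermediate key. Assumes a
    // 16/24/32-byte AES intermediate key and PKCS5-padded ciphertext; the real
    // EncryptionLibrary also carries key metadata inside the byte array.
    public static byte[] decryptKey(byte[] intermediateKey, byte[] encryptedDataKeyWithMeta) throws Exception {
        Cipher cipher = Cipher.getInstance("AES/ECB/PKCS5Padding");
        cipher.init(Cipher.DECRYPT_MODE, new SecretKeySpec(intermediateKey, "AES"));
        return cipher.doFinal(encryptedDataKeyWithMeta);
    }
}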
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.
The class TupleDomainOrcPredicate, method matches.
@Override
public boolean matches(long numberOfRows, Map<Integer, ColumnStatistics> statisticsByColumnIndex) {
    Optional<Map<C, Domain>> optionalEffectivePredicateDomains = effectivePredicate.getDomains();
    if (!optionalEffectivePredicateDomains.isPresent()) {
        // effective predicate is none, so skip this section
        return false;
    }
    Map<C, Domain> effectivePredicateDomains = optionalEffectivePredicateDomains.get();
    for (ColumnReference<C> columnReference : columnReferences) {
        Domain predicateDomain = effectivePredicateDomains.get(columnReference.getColumn());
        if (predicateDomain == null) {
            // no predicate on this column, so we can't exclude this section
            continue;
        }
        ColumnStatistics columnStatistics = statisticsByColumnIndex.get(columnReference.getOrdinal());
        if (columnStatistics == null) {
            // no statistics for this column, so we can't exclude this section
            continue;
        }
        if (!columnOverlaps(columnReference, predicateDomain, numberOfRows, columnStatistics)) {
            return false;
        }
    }
    // this section was not excluded
    return true;
}
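Note the asymmetry in matches: it returns false only when statistics prove the predicate cannot be satisfied; a missing predicate or missing statistics never excludes a section. A tiny standalone sketch of that rule for an integer min/max column (hypothetical types, not Presto's Domain API):

public class PruningSketch {
    // Min/max statistics for a section (stand-in for ColumnStatistics).
    static final class IntStats {
        final int min;
        final int max;

        IntStats(int min, int max) {
            this.min = min;
            this.max = max;
        }
    }

    // Keep the section unless the stats range and the predicate range are provably disjoint.
    static boolean canMatch(IntStats stats, int low, int high) {
        if (stats == null) {
            return true; // no statistics, so we can't exclude
        }
        return stats.max >= low && stats.min <= high;
    }

    public static void main(String[] args) {
        System.out.println(canMatch(new IntStats(0, 9), 5, 20));  // true: ranges overlap
        System.out.println(canMatch(new IntStats(0, 9), 10, 20)); // false: provably disjoint
        System.out.println(canMatch(null, 10, 20));               // true: no stats
    }
}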
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
The class IcebergOrcFileWriter, method computeMetrics.
private static Metrics computeMetrics(Schema icebergSchema, List<OrcType> orcRowTypes, long fileRowCount, List<ColumnStatistics> columnStatistics) {
    if (columnStatistics.isEmpty()) {
        return new Metrics(fileRowCount, null, null, null, null, null, null);
    }
    // Columns that are descendants of LIST or MAP types are excluded because:
    // 1. Their stats are not used by Apache Iceberg to filter out data files
    // 2. Their record count can be larger than table-level row count. There's no good way to calculate nullCounts for them.
    // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
    Set<Integer> excludedColumns = getExcludedColumns(orcRowTypes);
    ImmutableMap.Builder<Integer, Long> valueCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, Long> nullCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> lowerBoundsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> upperBoundsBuilder = ImmutableMap.builder();
    // OrcColumnId(0) is the root column that represents file-level schema
    for (int i = 1; i < orcRowTypes.size(); i++) {
        if (excludedColumns.contains(i)) {
            continue;
        }
        OrcType orcColumn = orcRowTypes.get(i);
        ColumnStatistics orcColumnStats = columnStatistics.get(i);
        int icebergId = getIcebergId(orcColumn);
        NestedField icebergField = icebergSchema.findField(icebergId);
        verify(icebergField != null, "Cannot find Iceberg column with ID %s in schema %s", icebergId, icebergSchema);
        valueCountsBuilder.put(icebergId, fileRowCount);
        if (orcColumnStats.hasNumberOfValues()) {
            nullCountsBuilder.put(icebergId, fileRowCount - orcColumnStats.getNumberOfValues());
        }
        toIcebergMinMax(orcColumnStats, icebergField.type()).ifPresent(minMax -> {
            lowerBoundsBuilder.put(icebergId, minMax.getMin());
            upperBoundsBuilder.put(icebergId, minMax.getMax());
        });
    }
    Map<Integer, Long> valueCounts = valueCountsBuilder.build();
    Map<Integer, Long> nullCounts = nullCountsBuilder.build();
    Map<Integer, ByteBuffer> lowerBounds = lowerBoundsBuilder.build();
    Map<Integer, ByteBuffer> upperBounds = upperBoundsBuilder.build();
    // TODO: Add column size accounting to ORC column writers
    return new Metrics(
            fileRowCount,
            null,
            valueCounts.isEmpty() ? null : valueCounts,
            nullCounts.isEmpty() ? null : nullCounts,
            null,
            lowerBounds.isEmpty() ? null : lowerBounds,
            upperBounds.isEmpty() ? null : upperBounds);
}
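The getExcludedColumns helper referenced above is not part of this snippet. A possible sketch, assuming the same class context (Guava's ImmutableSet and Presto's OrcType, whose getOrcTypeKind, getFieldCount, and getFieldTypeIndex accessors appear in the snippets above); the recursion marks every descendant of a LIST or MAP node:

// Hypothetical sketch, not the project's actual implementation.
private static Set<Integer> getExcludedColumns(List<OrcType> orcRowTypes) {
    ImmutableSet.Builder<Integer> excluded = ImmutableSet.builder();
    // Start from the root STRUCT (index 0); nothing is excluded until a
    // LIST or MAP is entered, after which all descendants are excluded.
    markDescendants(orcRowTypes, 0, false, excluded);
    return excluded.build();
}

private static void markDescendants(List<OrcType> types, int index, boolean insideCollection, ImmutableSet.Builder<Integer> excluded) {
    OrcType type = types.get(index);
    if (insideCollection) {
        excluded.add(index);
    }
    boolean childrenInsideCollection = insideCollection
            || type.getOrcTypeKind() == OrcTypeKind.LIST
            || type.getOrcTypeKind() == OrcTypeKind.MAP;
    for (int field = 0; field < type.getFieldCount(); field++) {
        markDescendants(types, type.getFieldTypeIndex(field), childrenInsideCollection, excluded);
    }
}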