Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
Class LongDictionaryColumnWriter, method createColumnStatistics.
/**
 * Builds the column statistics from the current {@code statisticsBuilder} and then
 * replaces the builder with a new {@link IntegerStatisticsBuilder}.
 *
 * @return the statistics produced by the builder that was in place when this method was called
 */
@Override
protected ColumnStatistics createColumnStatistics() {
    // Capture the result first; the builder field is swapped out only after a successful build.
    ColumnStatistics builtStatistics = statisticsBuilder.buildColumnStatistics();
    statisticsBuilder = new IntegerStatisticsBuilder();
    return builtStatistics;
}
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
Class SliceDictionaryColumnWriter, method createColumnStatistics.
/**
 * Builds the column statistics from the current {@code statisticsBuilder} and then
 * replaces the builder with the one returned by {@code newStringStatisticsBuilder()}.
 *
 * @return the statistics produced by the builder that was in place when this method was called
 */
@Override
protected ColumnStatistics createColumnStatistics() {
    // Capture the result first; the builder field is swapped out only after a successful build.
    ColumnStatistics builtStatistics = statisticsBuilder.buildColumnStatistics();
    statisticsBuilder = newStringStatisticsBuilder();
    return builtStatistics;
}
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
Class TestOrcBloomFilters, method testMatches.
/**
 * Simulates a query over two columns where only one of them is used in the WHERE clause,
 * and checks predicate matching with and without a bloom filter.
 */
@Test
public void testMatches() {
    // predicate column references
    List<ColumnReference<String>> references = ImmutableList.<ColumnReference<String>>builder()
            .add(new ColumnReference<>(COLUMN_0, 0, BIGINT))
            .add(new ColumnReference<>(COLUMN_1, 1, BIGINT))
            .build();

    // stripe column: the predicate consists of bigint_0 = 1234
    Domain singleValueDomain = Domain.singleValue(BIGINT, 1234L);
    TupleDomain.ColumnDomain<String> firstColumnDomain = new TupleDomain.ColumnDomain<>(COLUMN_0, singleValueDomain);
    TupleDomain<String> filteringTupleDomain = TupleDomain.fromColumnDomains(Optional.of(ImmutableList.of(firstColumnDomain)));
    TupleDomain<String> unconstrainedTupleDomain = TupleDomain.all();

    TupleDomainOrcPredicate<String> predicate = new TupleDomainOrcPredicate<>(filteringTupleDomain, references, true, Optional.empty());
    TupleDomainOrcPredicate<String> emptyPredicate = new TupleDomainOrcPredicate<>(unconstrainedTupleDomain, references, true, Optional.empty());

    // assemble a matching and a non-matching bloom filter;
    // the empty snapshot must be taken before the value is added
    HiveBloomFilter sourceFilter = new HiveBloomFilter(new BloomFilter(1000, 0.01));
    OrcProto.BloomFilter withoutValue = toOrcBloomFilter(sourceFilter);
    sourceFilter.addLong(1234);
    OrcProto.BloomFilter withValue = toOrcBloomFilter(sourceFilter);

    Map<Integer, ColumnStatistics> matchingStatistics = ImmutableMap.of(
            0,
            new IntegerColumnStatistics(null, toHiveBloomFilter(withValue), new IntegerStatistics(10L, 2000L, null)));
    Map<Integer, ColumnStatistics> nonMatchingStatistics = ImmutableMap.of(
            0,
            new IntegerColumnStatistics(null, toHiveBloomFilter(withoutValue), new IntegerStatistics(10L, 2000L, null)));
    Map<Integer, ColumnStatistics> statisticsWithoutBloomFilter = ImmutableMap.of(
            0,
            new IntegerColumnStatistics(null, null, new IntegerStatistics(10L, 2000L, null)));

    assertTrue(predicate.matches(1L, matchingStatistics));
    assertTrue(predicate.matches(1L, statisticsWithoutBloomFilter));
    assertFalse(predicate.matches(1L, nonMatchingStatistics));
    assertTrue(emptyPredicate.matches(1L, matchingStatistics));
}
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
Class IcebergOrcFileWriter, method computeMetrics.
/**
 * Derives Iceberg {@code Metrics} for a written file from its ORC column statistics.
 *
 * @param icebergSchema    schema used to look up the Iceberg field for each ORC column
 * @param orcRowTypes      ORC types indexed by column position; index 0 is the root column
 * @param fileRowCount     number of rows in the file
 * @param columnStatistics ORC column statistics, read at the same index as {@code orcRowTypes}
 * @return metrics carrying only the row count when {@code columnStatistics} is empty; otherwise
 *         metrics with value counts, null counts and lower/upper bounds, each passed as
 *         {@code null} when no entry was collected for it
 */
private static Metrics computeMetrics(Schema icebergSchema, List<OrcType> orcRowTypes, long fileRowCount, List<ColumnStatistics> columnStatistics) {
    if (columnStatistics.isEmpty()) {
        return new Metrics(fileRowCount, null, null, null, null, null);
    }

    // Columns that are descendants of LIST or MAP types are excluded because:
    // 1. Their stats are not used by Apache Iceberg to filter out data files
    // 2. Their record count can be larger than table-level row count. There's no good way to calculate nullCounts for them.
    // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
    Set<Integer> skippedColumns = getExcludedColumns(orcRowTypes);

    ImmutableMap.Builder<Integer, Long> valueCounts = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, Long> nullCounts = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> lowerBounds = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> upperBounds = ImmutableMap.builder();

    // OrcColumnId(0) is the root column that represents file-level schema
    for (int columnIndex = 1; columnIndex < orcRowTypes.size(); columnIndex++) {
        if (skippedColumns.contains(columnIndex)) {
            continue;
        }

        OrcType orcType = orcRowTypes.get(columnIndex);
        ColumnStatistics stats = columnStatistics.get(columnIndex);
        int fieldId = getIcebergId(orcType);
        NestedField field = icebergSchema.findField(fieldId);
        verify(field != null, "Cannot find Iceberg column with ID %s in schema %s", fieldId, icebergSchema);

        valueCounts.put(fieldId, fileRowCount);
        if (stats.hasNumberOfValues()) {
            // Nulls are the rows of the file that are not counted as values for this column
            nullCounts.put(fieldId, fileRowCount - stats.getNumberOfValues());
        }
        toIcebergMinMax(stats, field.type()).ifPresent(range -> {
            lowerBounds.put(fieldId, range.getMin());
            upperBounds.put(fieldId, range.getMax());
        });
    }

    return new Metrics(
            fileRowCount,
            // TODO: Add column size accounting to ORC column writers
            null,
            nullIfEmptyMetricsMap(valueCounts.build()),
            nullIfEmptyMetricsMap(nullCounts.build()),
            nullIfEmptyMetricsMap(lowerBounds.build()),
            nullIfEmptyMetricsMap(upperBounds.build()));
}

/** Returns {@code null} for an empty map and the map itself otherwise. */
private static <K, V> Map<K, V> nullIfEmptyMetricsMap(Map<K, V> map) {
    return map.isEmpty() ? null : map;
}
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
Class OrcWriteValidation, method validateColumnStatisticsEquivalent.
/**
 * Validates two lists of column statistics position by position, delegating each pair to the
 * single-column overload of {@code validateColumnStatisticsEquivalent}.
 *
 * @param orcDataSourceId          data source identifier passed to any raised exception and to the per-column check
 * @param name                     label for this set of statistics, used in error messages
 * @param actualColumnStatistics   statistics being validated
 * @param expectedColumnStatistics statistics to validate against
 * @throws OrcCorruptionException if the two lists differ in size
 * @throws NullPointerException   if {@code name} or either list is null
 */
private static void validateColumnStatisticsEquivalent(OrcDataSourceId orcDataSourceId, String name, List<ColumnStatistics> actualColumnStatistics, List<ColumnStatistics> expectedColumnStatistics) throws OrcCorruptionException {
    requireNonNull(name, "name is null");
    requireNonNull(actualColumnStatistics, "actualColumnStatistics is null");
    requireNonNull(expectedColumnStatistics, "expectedColumnStatistics is null");

    int columnCount = actualColumnStatistics.size();
    if (columnCount != expectedColumnStatistics.size()) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected number of columns in %s statistics", name);
    }

    for (int column = 0; column < columnCount; column++) {
        // Each pair is checked under a label that includes its column position
        validateColumnStatisticsEquivalent(
                orcDataSourceId,
                name + " column " + column,
                actualColumnStatistics.get(column),
                expectedColumnStatistics.get(column));
    }
}
Aggregations