Search in sources :

Example 21 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.

the class LongDictionaryColumnWriter method createColumnStatistics.

@Override
protected ColumnStatistics createColumnStatistics() {
    ColumnStatistics statistics = statisticsBuilder.buildColumnStatistics();
    statisticsBuilder = new IntegerStatisticsBuilder();
    return statistics;
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) IntegerStatisticsBuilder(com.facebook.presto.orc.metadata.statistics.IntegerStatisticsBuilder)

Example 22 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.

the class SliceDictionaryColumnWriter method createColumnStatistics.

@Override
protected ColumnStatistics createColumnStatistics() {
    ColumnStatistics statistics = statisticsBuilder.buildColumnStatistics();
    statisticsBuilder = newStringStatisticsBuilder();
    return statistics;
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics)

Example 23 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.

the class TestOrcBloomFilters method testMatches.

@Test
public // simulate query on a 2 columns where 1 is used as part of the where, with and without bloom filter
void testMatches() {
    // stripe column
    Domain testingColumnHandleDomain = Domain.singleValue(BIGINT, 1234L);
    TupleDomain.ColumnDomain<String> column0 = new TupleDomain.ColumnDomain<>(COLUMN_0, testingColumnHandleDomain);
    // predicate consist of the bigint_0 = 1234
    TupleDomain<String> effectivePredicate = TupleDomain.fromColumnDomains(Optional.of(ImmutableList.of(column0)));
    TupleDomain<String> emptyEffectivePredicate = TupleDomain.all();
    // predicate column references
    List<ColumnReference<String>> columnReferences = ImmutableList.<ColumnReference<String>>builder().add(new ColumnReference<>(COLUMN_0, 0, BIGINT)).add(new ColumnReference<>(COLUMN_1, 1, BIGINT)).build();
    TupleDomainOrcPredicate<String> predicate = new TupleDomainOrcPredicate<>(effectivePredicate, columnReferences, true, Optional.empty());
    TupleDomainOrcPredicate<String> emptyPredicate = new TupleDomainOrcPredicate<>(emptyEffectivePredicate, columnReferences, true, Optional.empty());
    // assemble a matching and a non-matching bloom filter
    HiveBloomFilter hiveBloomFilter = new HiveBloomFilter(new BloomFilter(1000, 0.01));
    OrcProto.BloomFilter emptyOrcBloomFilter = toOrcBloomFilter(hiveBloomFilter);
    hiveBloomFilter.addLong(1234);
    OrcProto.BloomFilter orcBloomFilter = toOrcBloomFilter(hiveBloomFilter);
    Map<Integer, ColumnStatistics> matchingStatisticsByColumnIndex = ImmutableMap.of(0, new IntegerColumnStatistics(null, toHiveBloomFilter(orcBloomFilter), new IntegerStatistics(10L, 2000L, null)));
    Map<Integer, ColumnStatistics> nonMatchingStatisticsByColumnIndex = ImmutableMap.of(0, new IntegerColumnStatistics(null, toHiveBloomFilter(emptyOrcBloomFilter), new IntegerStatistics(10L, 2000L, null)));
    Map<Integer, ColumnStatistics> withoutBloomFilterStatisticsByColumnIndex = ImmutableMap.of(0, new IntegerColumnStatistics(null, null, new IntegerStatistics(10L, 2000L, null)));
    assertTrue(predicate.matches(1L, matchingStatisticsByColumnIndex));
    assertTrue(predicate.matches(1L, withoutBloomFilterStatisticsByColumnIndex));
    assertFalse(predicate.matches(1L, nonMatchingStatisticsByColumnIndex));
    assertTrue(emptyPredicate.matches(1L, matchingStatisticsByColumnIndex));
}
Also used : IntegerColumnStatistics(com.facebook.presto.orc.metadata.statistics.IntegerColumnStatistics) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) OrcProto(com.facebook.presto.orc.proto.OrcProto) IntegerColumnStatistics(com.facebook.presto.orc.metadata.statistics.IntegerColumnStatistics) HiveBloomFilter(com.facebook.presto.orc.metadata.statistics.HiveBloomFilter) TupleDomainOrcPredicate.checkInBloomFilter(com.facebook.presto.orc.TupleDomainOrcPredicate.checkInBloomFilter) BloomFilter(com.facebook.presto.orc.metadata.statistics.BloomFilter) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) HiveBloomFilter(com.facebook.presto.orc.metadata.statistics.HiveBloomFilter) Domain(com.facebook.presto.common.predicate.Domain) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) ColumnReference(com.facebook.presto.orc.TupleDomainOrcPredicate.ColumnReference) IntegerStatistics(com.facebook.presto.orc.metadata.statistics.IntegerStatistics) Test(org.testng.annotations.Test)

Example 24 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.

the class IcebergOrcFileWriter method computeMetrics.

private static Metrics computeMetrics(Schema icebergSchema, List<OrcType> orcRowTypes, long fileRowCount, List<ColumnStatistics> columnStatistics) {
    if (columnStatistics.isEmpty()) {
        return new Metrics(fileRowCount, null, null, null, null, null);
    }
    // Columns that are descendants of LIST or MAP types are excluded because:
    // 1. Their stats are not used by Apache Iceberg to filter out data files
    // 2. Their record count can be larger than table-level row count. There's no good way to calculate nullCounts for them.
    // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
    Set<Integer> excludedColumns = getExcludedColumns(orcRowTypes);
    ImmutableMap.Builder<Integer, Long> valueCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, Long> nullCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> lowerBoundsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> upperBoundsBuilder = ImmutableMap.builder();
    // OrcColumnId(0) is the root column that represents file-level schema
    for (int i = 1; i < orcRowTypes.size(); i++) {
        if (excludedColumns.contains(i)) {
            continue;
        }
        OrcType orcColumn = orcRowTypes.get(i);
        ColumnStatistics orcColumnStats = columnStatistics.get(i);
        int icebergId = getIcebergId(orcColumn);
        NestedField icebergField = icebergSchema.findField(icebergId);
        verify(icebergField != null, "Cannot find Iceberg column with ID %s in schema %s", icebergId, icebergSchema);
        valueCountsBuilder.put(icebergId, fileRowCount);
        if (orcColumnStats.hasNumberOfValues()) {
            nullCountsBuilder.put(icebergId, fileRowCount - orcColumnStats.getNumberOfValues());
        }
        toIcebergMinMax(orcColumnStats, icebergField.type()).ifPresent(minMax -> {
            lowerBoundsBuilder.put(icebergId, minMax.getMin());
            upperBoundsBuilder.put(icebergId, minMax.getMax());
        });
    }
    Map<Integer, Long> valueCounts = valueCountsBuilder.build();
    Map<Integer, Long> nullCounts = nullCountsBuilder.build();
    Map<Integer, ByteBuffer> lowerBounds = lowerBoundsBuilder.build();
    Map<Integer, ByteBuffer> upperBounds = upperBoundsBuilder.build();
    return new Metrics(fileRowCount, // TODO: Add column size accounting to ORC column writers
    null, valueCounts.isEmpty() ? null : valueCounts, nullCounts.isEmpty() ? null : nullCounts, lowerBounds.isEmpty() ? null : lowerBounds, upperBounds.isEmpty() ? null : upperBounds);
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) ByteBuffer(java.nio.ByteBuffer) ImmutableMap(com.google.common.collect.ImmutableMap) NestedField(org.apache.iceberg.types.Types.NestedField) Metrics(org.apache.iceberg.Metrics) OrcType(com.facebook.presto.orc.metadata.OrcType)

Example 25 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.

the class OrcWriteValidation method validateColumnStatisticsEquivalent.

private static void validateColumnStatisticsEquivalent(OrcDataSourceId orcDataSourceId, String name, List<ColumnStatistics> actualColumnStatistics, List<ColumnStatistics> expectedColumnStatistics) throws OrcCorruptionException {
    requireNonNull(name, "name is null");
    requireNonNull(actualColumnStatistics, "actualColumnStatistics is null");
    requireNonNull(expectedColumnStatistics, "expectedColumnStatistics is null");
    if (actualColumnStatistics.size() != expectedColumnStatistics.size()) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected number of columns in %s statistics", name);
    }
    for (int i = 0; i < actualColumnStatistics.size(); i++) {
        ColumnStatistics actual = actualColumnStatistics.get(i);
        ColumnStatistics expected = expectedColumnStatistics.get(i);
        validateColumnStatisticsEquivalent(orcDataSourceId, name + " column " + i, actual, expected);
    }
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics)

Aggregations

ColumnStatistics (com.facebook.presto.orc.metadata.statistics.ColumnStatistics)46 ImmutableList (com.google.common.collect.ImmutableList)22 Slice (io.airlift.slice.Slice)22 List (java.util.List)22 ArrayList (java.util.ArrayList)19 Stream (com.facebook.presto.orc.metadata.Stream)18 StreamDataOutput (com.facebook.presto.orc.stream.StreamDataOutput)15 RowGroupIndex (com.facebook.presto.orc.metadata.RowGroupIndex)14 BooleanStreamCheckpoint (com.facebook.presto.orc.checkpoint.BooleanStreamCheckpoint)12 PresentOutputStream (com.facebook.presto.orc.stream.PresentOutputStream)12 ImmutableMap (com.google.common.collect.ImmutableMap)11 LongStreamCheckpoint (com.facebook.presto.orc.checkpoint.LongStreamCheckpoint)7 OrcType (com.facebook.presto.orc.metadata.OrcType)7 LongOutputStream (com.facebook.presto.orc.stream.LongOutputStream)7 Map (java.util.Map)7 Type (com.facebook.presto.common.type.Type)6 IOException (java.io.IOException)6 HashMap (java.util.HashMap)6 ColumnEncoding (com.facebook.presto.orc.metadata.ColumnEncoding)5 StripeEncryptionGroup (com.facebook.presto.orc.metadata.StripeEncryptionGroup)5