Search in sources:

Example 11 with ColumnStatistics

use of io.prestosql.spi.statistics.ColumnStatistics in project boostkit-bigdata by kunpengcompute.

the class TestHivePushdownUtil method simulationHiveMetadata.

/**
 * Builds a Mockito-backed {@link HiveMetadata} fixture.
 *
 * <p>Simulation chain: HiveTransactionManager -> HiveMetadata -> ColumnMetadata + TableStatistics + ColumnStatistics.
 * The returned mock answers table-metadata, column-metadata, column-handle and table-statistics
 * lookups for {@code OFFLOAD_SESSION} / {@code OFFLOAD_HIVE_TABLE_HANDLE} with the canned objects
 * assembled below.
 *
 * @return the stubbed {@code HiveMetadata} mock
 */
protected static HiveMetadata simulationHiveMetadata() {
    // Column metadata: a mocked column reporting COLUMN_INT's name and the INTEGER type.
    ColumnMetadata intColumnMetadata = Mockito.mock(ColumnMetadata.class);
    Mockito.when(intColumnMetadata.getName()).thenReturn(COLUMN_INT.getName());
    Mockito.when(intColumnMetadata.getType()).thenReturn(INTEGER);

    // Table metadata: only the storage-format property is populated (ORC).
    HashMap<String, Object> tableProperties = new HashMap<>();
    tableProperties.put(STORAGE_FORMAT_PROPERTY, HiveStorageFormat.ORC);
    ConnectorTableMetadata tableMetadata = Mockito.mock(ConnectorTableMetadata.class);
    Mockito.when(tableMetadata.getProperties()).thenReturn(tableProperties);

    // Statistics: a single entry keyed by COLUMN_INT, wrapped into the table-level statistics.
    ColumnStatistics intColumnStatistics = new ColumnStatistics(
            Estimate.zero(),
            Estimate.of(DISTINICT_COLUMN_NUM),
            Estimate.unknown(),
            Optional.of(new DoubleRange(1, 10)));
    Map<ColumnHandle, ColumnStatistics> statisticsByColumn = new HashMap<>();
    statisticsByColumn.put(COLUMN_INT, intColumnStatistics);
    TableStatistics tableStatistics = new TableStatistics(Estimate.of(OFFLOAD_COLUMN_NUM), 5, 1024, statisticsByColumn);

    // Column handles exposed by the table: just COLUMN_INT, keyed by its name.
    Map<String, ColumnHandle> handlesByName = ImmutableMap.of(COLUMN_INT.getName(), COLUMN_INT);

    // Wire everything into the HiveMetadata mock.
    HiveMetadata hiveMetadata = Mockito.mock(HiveMetadata.class);
    Mockito.when(hiveMetadata.getTableMetadata(OFFLOAD_SESSION, OFFLOAD_HIVE_TABLE_HANDLE))
            .thenReturn(tableMetadata);
    // Any column handle resolves to the same mocked integer column metadata.
    Mockito.when(hiveMetadata.getColumnMetadata(
            Matchers.eq(OFFLOAD_SESSION),
            Matchers.eq(OFFLOAD_HIVE_TABLE_HANDLE),
            Matchers.any(ColumnHandle.class)))
            .thenReturn(intColumnMetadata);
    Mockito.when(hiveMetadata.getColumnHandles(OFFLOAD_SESSION, OFFLOAD_HIVE_TABLE_HANDLE))
            .thenReturn(handlesByName);
    // Statistics are stubbed for any Constraint, but only when the last argument is true.
    Mockito.when(hiveMetadata.getTableStatistics(
            Matchers.eq(OFFLOAD_SESSION),
            Matchers.eq(OFFLOAD_HIVE_TABLE_HANDLE),
            Matchers.any(Constraint.class),
            Matchers.eq(true)))
            .thenReturn(tableStatistics);
    return hiveMetadata;
}
Also used : ColumnStatistics(io.prestosql.spi.statistics.ColumnStatistics) HiveColumnHandle(io.prestosql.plugin.hive.HiveColumnHandle) ColumnHandle(io.prestosql.spi.connector.ColumnHandle) ColumnMetadata(io.prestosql.spi.connector.ColumnMetadata) HashMap(java.util.HashMap) Constraint(io.prestosql.spi.connector.Constraint) DoubleRange(io.prestosql.spi.statistics.DoubleRange) TableStatistics(io.prestosql.spi.statistics.TableStatistics) HiveMetadata(io.prestosql.plugin.hive.HiveMetadata) ConnectorTableMetadata(io.prestosql.spi.connector.ConnectorTableMetadata)

Example 12 with ColumnStatistics

use of io.prestosql.spi.statistics.ColumnStatistics in project hetu-core by openlookeng.

the class TablePushdown method isTableWithUniqueColumnTableStatistics.

/**
 * Checks whether the join column of the given table looks unique according to its statistics,
 * i.e. whether the table's row count equals the column's distinct-values count.
 *
 * <p>The join column is the first of {@code joinCriteriaStrings[0]} and
 * {@code joinCriteriaStrings[1]} that is present among the table's column handles.
 *
 * @param tableStatistics for the current table being parsed in the plan tree.
 * @param tableHandle for the TableScanNode currently being evaluated in the plan tree.
 * @return {@code false} if neither join criterion names a column of the table; otherwise whether
 *         the row count equals the distinct-values count of the matched column
 * @throws NullPointerException if the matched column has no entry in the table's column statistics
 */
private boolean isTableWithUniqueColumnTableStatistics(TableStatistics tableStatistics, TableHandle tableHandle) {
    Map<String, ColumnHandle> handlesByName = metadata.getColumnHandles(ruleContext.getSession(), tableHandle);

    // Pick whichever join criterion names a column of this table; the first one takes precedence.
    String joinColumnName;
    if (handlesByName.containsKey(joinCriteriaStrings[0])) {
        joinColumnName = joinCriteriaStrings[0];
    } else if (handlesByName.containsKey(joinCriteriaStrings[1])) {
        joinColumnName = joinCriteriaStrings[1];
    } else {
        // Neither join column belongs to this table.
        return false;
    }

    ColumnStatistics joinColumnStatistics = tableStatistics.getColumnStatistics().get(handlesByName.get(joinColumnName));
    requireNonNull(joinColumnStatistics, "Column Statistics cannot be null if the column exists for the table");

    double rowCount = tableStatistics.getRowCount().getValue();
    double distinctValues = joinColumnStatistics.getDistinctValuesCount().getValue();
    return rowCount == distinctValues;
}
Also used : ColumnStatistics(io.prestosql.spi.statistics.ColumnStatistics) ColumnHandle(io.prestosql.spi.connector.ColumnHandle)

Example 13 with ColumnStatistics

use of io.prestosql.spi.statistics.ColumnStatistics in project hetu-core by openlookeng.

the class TestConnectorFilterStatsCalculatorService method testTableStatisticsAfterFilter.

/**
 * Exercises {@code assertPredicate} with a series of filter expressions and table statistics.
 * NOTE(review): the arguments are presumably (predicate, input statistics, expected statistics);
 * confirm against {@code assertPredicate}.
 */
@Test
public void testTableStatisticsAfterFilter() {
    // With the all-zero fixture, totalSize is always zero.
    assertPredicate("true", zeroTableStatistics, zeroTableStatistics);
    assertPredicate("x < 3e0", zeroTableStatistics, unknownTableStatistics);
    assertPredicate("false", zeroTableStatistics, zeroTableStatistics);

    // rowCount and totalSize are both NaN.
    assertPredicate("true", TableStatistics.empty(), TableStatistics.empty());

    // rowCount and totalSize go from NaN to 0.0.
    TableStatistics zeroRowsOnly = TableStatistics.builder()
            .setRowCount(Estimate.zero())
            .build();
    assertPredicate("false", TableStatistics.empty(), zeroRowsOnly);

    // "false" applied to the original statistics.
    ColumnStatistics xAfterFalse = new ColumnStatistics(Estimate.of(1.0), Estimate.zero(), Estimate.zero(), Optional.empty());
    TableStatistics filteredToZeroStatistics = TableStatistics.builder()
            .setRowCount(Estimate.zero())
            .setColumnStatistics(xColumn, xAfterFalse)
            .build();
    assertPredicate("false", originalTableStatistics, filteredToZeroStatistics);

    // "x < 0" applied to the original statistics.
    ColumnStatistics xBelowZero = new ColumnStatistics(Estimate.zero(), Estimate.of(20), Estimate.unknown(), Optional.of(new DoubleRange(-10, 0)));
    TableStatistics filteredStatistics = TableStatistics.builder()
            .setRowCount(Estimate.of(37.5))
            .setColumnStatistics(xColumn, xBelowZero)
            .build();
    assertPredicate("x < 0", originalTableStatistics, filteredStatistics);

    // Same predicate and same expected statistics, but the input is the fixture without a total size.
    ColumnStatistics xBelowZeroWithoutTotalSize = new ColumnStatistics(Estimate.zero(), Estimate.of(20), Estimate.unknown(), Optional.of(new DoubleRange(-10, 0)));
    TableStatistics filteredStatisticsWithoutTotalSize = TableStatistics.builder()
            .setRowCount(Estimate.of(37.5))
            .setColumnStatistics(xColumn, xBelowZeroWithoutTotalSize)
            .build();
    assertPredicate("x < 0", originalTableStatisticsWithoutTotalSize, filteredStatisticsWithoutTotalSize);
}
Also used : ColumnStatistics(io.prestosql.spi.statistics.ColumnStatistics) DoubleRange(io.prestosql.spi.statistics.DoubleRange) TableStatistics(io.prestosql.spi.statistics.TableStatistics) Test(org.testng.annotations.Test)

Example 14 with ColumnStatistics

use of io.prestosql.spi.statistics.ColumnStatistics in project hetu-core by openlookeng.

the class MetastoreHiveStatisticsProvider method calculateDataSize.

/**
 * Estimates the total data size of {@code column} by extrapolating the average per-row size
 * observed in the partitions that report both a row count and a total size for the column.
 *
 * @param column name of the column whose size is estimated
 * @param partitionStatistics statistics of the partitions to sample from
 * @param totalRowCount row count the average per-row size is scaled to
 * @return {@code Estimate.unknown()} if no partition has both a row count and a column total size,
 *         or if the known row counts sum to zero; {@code Estimate.zero()} if {@code totalRowCount}
 *         is zero; otherwise (known data size / known row count) * {@code totalRowCount}
 * @throws VerifyException if a sampled partition reports a negative row count or data size
 */
@VisibleForTesting
static Estimate calculateDataSize(String column, Collection<PartitionStatistics> partitionStatistics, double totalRowCount) {
    // Keep only partitions where both the row count and the column's total size are known.
    List<PartitionStatistics> usablePartitions = partitionStatistics.stream()
            .filter(partition -> {
                if (!partition.getBasicStatistics().getRowCount().isPresent()) {
                    return false;
                }
                HiveColumnStatistics forColumn = partition.getColumnStatistics().get(column);
                return forColumn != null && forColumn.getTotalSizeInBytes().isPresent();
            })
            .collect(toImmutableList());
    if (usablePartitions.isEmpty()) {
        return Estimate.unknown();
    }

    long summedRowCount = 0;
    long summedDataSize = 0;
    for (PartitionStatistics partition : usablePartitions) {
        // The filter above guarantees presence; the verifications guard against corrupted values.
        long partitionRows = partition.getBasicStatistics().getRowCount()
                .orElseThrow(() -> new VerifyException("rowCount is not present"));
        verify(partitionRows >= 0, "rowCount must be greater than or equal to zero");

        HiveColumnStatistics forColumn = partition.getColumnStatistics().get(column);
        verify(forColumn != null, "columnStatistics is null");

        long partitionBytes = forColumn.getTotalSizeInBytes()
                .orElseThrow(() -> new VerifyException("totalSizeInBytes is not present"));
        verify(partitionBytes >= 0, "dataSize must be greater than or equal to zero");

        summedRowCount += partitionRows;
        summedDataSize += partitionBytes;
    }

    if (totalRowCount == 0) {
        return Estimate.zero();
    }
    if (summedRowCount == 0) {
        // No rows to average over, so the per-row size cannot be derived.
        return Estimate.unknown();
    }

    double bytesPerRow = ((double) summedDataSize) / summedRowCount;
    return Estimate.of(bytesPerRow * totalRowCount);
}
Also used : HiveBasicStatistics(io.prestosql.plugin.hive.HiveBasicStatistics) TableStatistics(io.prestosql.spi.statistics.TableStatistics) Varchars.isVarcharType(io.prestosql.spi.type.Varchars.isVarcharType) Collections.unmodifiableList(java.util.Collections.unmodifiableList) DecimalType(io.prestosql.spi.type.DecimalType) HiveColumnHandle(io.prestosql.plugin.hive.HiveColumnHandle) NullableValue(io.prestosql.spi.predicate.NullableValue) BigDecimal(java.math.BigDecimal) HiveSessionProperties.isStatisticsEnabled(io.prestosql.plugin.hive.HiveSessionProperties.isStatisticsEnabled) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) ConnectorSession(io.prestosql.spi.connector.ConnectorSession) Maps.immutableEntry(com.google.common.collect.Maps.immutableEntry) Map(java.util.Map) Type(io.prestosql.spi.type.Type) BIGINT(io.prestosql.spi.type.BigintType.BIGINT) Chars.isCharType(io.prestosql.spi.type.Chars.isCharType) Double.parseDouble(java.lang.Double.parseDouble) HiveErrorCode(io.prestosql.plugin.hive.HiveErrorCode) PrestoException(io.prestosql.spi.PrestoException) HiveSessionProperties.getPartitionStatisticsSampleSize(io.prestosql.plugin.hive.HiveSessionProperties.getPartitionStatisticsSampleSize) ImmutableMap(com.google.common.collect.ImmutableMap) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) Decimals.isLongDecimal(io.prestosql.spi.type.Decimals.isLongDecimal) TINYINT(io.prestosql.spi.type.TinyintType.TINYINT) DecimalStatistics(io.prestosql.plugin.hive.metastore.DecimalStatistics) String.format(java.lang.String.format) HivePartition(io.prestosql.plugin.hive.HivePartition) Objects(java.util.Objects) Decimals.isShortDecimal(io.prestosql.spi.type.Decimals.isShortDecimal) List(java.util.List) DoubleStatistics(io.prestosql.plugin.hive.metastore.DoubleStatistics) Table(io.prestosql.plugin.hive.metastore.Table) LocalDate(java.time.LocalDate) 
Optional(java.util.Optional) HashFunction(com.google.common.hash.HashFunction) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) OptionalDouble(java.util.OptionalDouble) Shorts(com.google.common.primitives.Shorts) Decimals(io.prestosql.spi.type.Decimals) INTEGER(io.prestosql.spi.type.IntegerType.INTEGER) DoubleRange(io.prestosql.spi.statistics.DoubleRange) Float.intBitsToFloat(java.lang.Float.intBitsToFloat) ArrayList(java.util.ArrayList) OptionalLong(java.util.OptionalLong) SchemaTableName(io.prestosql.spi.connector.SchemaTableName) UNPARTITIONED_ID(io.prestosql.plugin.hive.HivePartition.UNPARTITIONED_ID) Verify.verify(com.google.common.base.Verify.verify) Objects.requireNonNull(java.util.Objects.requireNonNull) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) DOUBLE(io.prestosql.spi.type.DoubleType.DOUBLE) Double.isFinite(java.lang.Double.isFinite) DateStatistics(io.prestosql.plugin.hive.metastore.DateStatistics) DATE(io.prestosql.spi.type.DateType.DATE) REAL(io.prestosql.spi.type.RealType.REAL) SemiTransactionalHiveMetastore(io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore) ColumnStatistics(io.prestosql.spi.statistics.ColumnStatistics) VerifyException(com.google.common.base.VerifyException) HiveIdentity(io.prestosql.plugin.hive.authentication.HiveIdentity) PartitionStatistics(io.prestosql.plugin.hive.PartitionStatistics) SignedBytes(com.google.common.primitives.SignedBytes) Hashing.murmur3_128(com.google.common.hash.Hashing.murmur3_128) Ints(com.google.common.primitives.Ints) Estimate(io.prestosql.spi.statistics.Estimate) HiveColumnStatistics(io.prestosql.plugin.hive.metastore.HiveColumnStatistics) ColumnHandle(io.prestosql.spi.connector.ColumnHandle) SMALLINT(io.prestosql.spi.type.SmallintType.SMALLINT) Double.isNaN(java.lang.Double.isNaN) IntegerStatistics(io.prestosql.plugin.hive.metastore.IntegerStatistics) VisibleForTesting(com.google.common.annotations.VisibleForTesting) 
Comparator(java.util.Comparator) HiveSessionProperties.isIgnoreCorruptedStatistics(io.prestosql.plugin.hive.HiveSessionProperties.isIgnoreCorruptedStatistics) PartitionStatistics(io.prestosql.plugin.hive.PartitionStatistics) VerifyException(com.google.common.base.VerifyException) HiveColumnStatistics(io.prestosql.plugin.hive.metastore.HiveColumnStatistics) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 15 with ColumnStatistics

use of io.prestosql.spi.statistics.ColumnStatistics in project hetu-core by openlookeng.

the class MetastoreHiveStatisticsProvider method calculateNullsFraction.

/**
 * Estimates the fraction of null values in {@code column} across the partitions that report
 * both a row count and a nulls count for the column.
 *
 * @param column name of the column whose nulls fraction is estimated
 * @param partitionStatistics statistics of the partitions to aggregate
 * @return {@code Estimate.unknown()} if no partition has both a row count and a nulls count;
 *         {@code Estimate.zero()} if the known row counts sum to zero; otherwise
 *         (sum of nulls counts) / (sum of row counts)
 * @throws VerifyException if a sampled partition reports a negative row count or nulls count,
 *         or a nulls count greater than its row count
 */
@VisibleForTesting
static Estimate calculateNullsFraction(String column, Collection<PartitionStatistics> partitionStatistics) {
    // Keep only partitions where both the row count and the column's nulls count are known.
    List<PartitionStatistics> usablePartitions = partitionStatistics.stream()
            .filter(partition -> {
                if (!partition.getBasicStatistics().getRowCount().isPresent()) {
                    return false;
                }
                HiveColumnStatistics forColumn = partition.getColumnStatistics().get(column);
                return forColumn != null && forColumn.getNullsCount().isPresent();
            })
            .collect(toImmutableList());
    if (usablePartitions.isEmpty()) {
        return Estimate.unknown();
    }

    long summedNulls = 0;
    long summedRows = 0;
    for (PartitionStatistics partition : usablePartitions) {
        // The filter above guarantees presence; the verifications guard against corrupted values.
        long partitionRows = partition.getBasicStatistics().getRowCount()
                .orElseThrow(() -> new VerifyException("rowCount is not present"));
        verify(partitionRows >= 0, "rowCount must be greater than or equal to zero");

        HiveColumnStatistics forColumn = partition.getColumnStatistics().get(column);
        verify(forColumn != null, "columnStatistics is null");

        long partitionNulls = forColumn.getNullsCount()
                .orElseThrow(() -> new VerifyException("nullsCount is not present"));
        verify(partitionNulls >= 0, "nullsCount must be greater than or equal to zero");
        verify(partitionNulls <= partitionRows, "nullsCount must be less than or equal to rowCount. nullsCount: %s. rowCount: %s.", partitionNulls, partitionRows);

        summedNulls += partitionNulls;
        summedRows += partitionRows;
    }

    if (summedRows == 0) {
        return Estimate.zero();
    }

    verify(summedNulls <= summedRows, "totalNullsCount must be less than or equal to totalRowCount. totalNullsCount: %s. totalRowCount: %s.", summedNulls, summedRows);
    return Estimate.of(((double) summedNulls) / summedRows);
}
Also used : HiveBasicStatistics(io.prestosql.plugin.hive.HiveBasicStatistics) TableStatistics(io.prestosql.spi.statistics.TableStatistics) Varchars.isVarcharType(io.prestosql.spi.type.Varchars.isVarcharType) Collections.unmodifiableList(java.util.Collections.unmodifiableList) DecimalType(io.prestosql.spi.type.DecimalType) HiveColumnHandle(io.prestosql.plugin.hive.HiveColumnHandle) NullableValue(io.prestosql.spi.predicate.NullableValue) BigDecimal(java.math.BigDecimal) HiveSessionProperties.isStatisticsEnabled(io.prestosql.plugin.hive.HiveSessionProperties.isStatisticsEnabled) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) ConnectorSession(io.prestosql.spi.connector.ConnectorSession) Maps.immutableEntry(com.google.common.collect.Maps.immutableEntry) Map(java.util.Map) Type(io.prestosql.spi.type.Type) BIGINT(io.prestosql.spi.type.BigintType.BIGINT) Chars.isCharType(io.prestosql.spi.type.Chars.isCharType) Double.parseDouble(java.lang.Double.parseDouble) HiveErrorCode(io.prestosql.plugin.hive.HiveErrorCode) PrestoException(io.prestosql.spi.PrestoException) HiveSessionProperties.getPartitionStatisticsSampleSize(io.prestosql.plugin.hive.HiveSessionProperties.getPartitionStatisticsSampleSize) ImmutableMap(com.google.common.collect.ImmutableMap) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) Decimals.isLongDecimal(io.prestosql.spi.type.Decimals.isLongDecimal) TINYINT(io.prestosql.spi.type.TinyintType.TINYINT) DecimalStatistics(io.prestosql.plugin.hive.metastore.DecimalStatistics) String.format(java.lang.String.format) HivePartition(io.prestosql.plugin.hive.HivePartition) Objects(java.util.Objects) Decimals.isShortDecimal(io.prestosql.spi.type.Decimals.isShortDecimal) List(java.util.List) DoubleStatistics(io.prestosql.plugin.hive.metastore.DoubleStatistics) Table(io.prestosql.plugin.hive.metastore.Table) LocalDate(java.time.LocalDate) 
Optional(java.util.Optional) HashFunction(com.google.common.hash.HashFunction) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) OptionalDouble(java.util.OptionalDouble) Shorts(com.google.common.primitives.Shorts) Decimals(io.prestosql.spi.type.Decimals) INTEGER(io.prestosql.spi.type.IntegerType.INTEGER) DoubleRange(io.prestosql.spi.statistics.DoubleRange) Float.intBitsToFloat(java.lang.Float.intBitsToFloat) ArrayList(java.util.ArrayList) OptionalLong(java.util.OptionalLong) SchemaTableName(io.prestosql.spi.connector.SchemaTableName) UNPARTITIONED_ID(io.prestosql.plugin.hive.HivePartition.UNPARTITIONED_ID) Verify.verify(com.google.common.base.Verify.verify) Objects.requireNonNull(java.util.Objects.requireNonNull) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) DOUBLE(io.prestosql.spi.type.DoubleType.DOUBLE) Double.isFinite(java.lang.Double.isFinite) DateStatistics(io.prestosql.plugin.hive.metastore.DateStatistics) DATE(io.prestosql.spi.type.DateType.DATE) REAL(io.prestosql.spi.type.RealType.REAL) SemiTransactionalHiveMetastore(io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore) ColumnStatistics(io.prestosql.spi.statistics.ColumnStatistics) VerifyException(com.google.common.base.VerifyException) HiveIdentity(io.prestosql.plugin.hive.authentication.HiveIdentity) PartitionStatistics(io.prestosql.plugin.hive.PartitionStatistics) SignedBytes(com.google.common.primitives.SignedBytes) Hashing.murmur3_128(com.google.common.hash.Hashing.murmur3_128) Ints(com.google.common.primitives.Ints) Estimate(io.prestosql.spi.statistics.Estimate) HiveColumnStatistics(io.prestosql.plugin.hive.metastore.HiveColumnStatistics) ColumnHandle(io.prestosql.spi.connector.ColumnHandle) SMALLINT(io.prestosql.spi.type.SmallintType.SMALLINT) Double.isNaN(java.lang.Double.isNaN) IntegerStatistics(io.prestosql.plugin.hive.metastore.IntegerStatistics) VisibleForTesting(com.google.common.annotations.VisibleForTesting) 
Comparator(java.util.Comparator) HiveSessionProperties.isIgnoreCorruptedStatistics(io.prestosql.plugin.hive.HiveSessionProperties.isIgnoreCorruptedStatistics) PartitionStatistics(io.prestosql.plugin.hive.PartitionStatistics) VerifyException(com.google.common.base.VerifyException) HiveColumnStatistics(io.prestosql.plugin.hive.metastore.HiveColumnStatistics) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Aggregations

ColumnStatistics (io.prestosql.spi.statistics.ColumnStatistics): 19
TableStatistics (io.prestosql.spi.statistics.TableStatistics): 15
ColumnHandle (io.prestosql.spi.connector.ColumnHandle): 14
Type (io.prestosql.spi.type.Type): 11
Map (java.util.Map): 11
HiveColumnStatistics (io.prestosql.plugin.hive.metastore.HiveColumnStatistics): 10
Chars.isCharType (io.prestosql.spi.type.Chars.isCharType): 10
Varchars.isVarcharType (io.prestosql.spi.type.Varchars.isVarcharType): 10
ImmutableMap (com.google.common.collect.ImmutableMap): 8
DoubleRange (io.prestosql.spi.statistics.DoubleRange): 8
DecimalType (io.prestosql.spi.type.DecimalType): 8
OptionalDouble (java.util.OptionalDouble): 8
Verify.verify (com.google.common.base.Verify.verify): 7
HiveColumnHandle (io.prestosql.plugin.hive.HiveColumnHandle): 7
Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument): 6
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList): 6
ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet): 6
Logger (io.airlift.log.Logger): 6
Slice (io.airlift.slice.Slice): 6
HiveIdentity (io.prestosql.plugin.hive.authentication.HiveIdentity): 6