Search in sources:

Example 31 with PartitionStatistics

Use of io.prestosql.plugin.hive.PartitionStatistics in project boostkit-bigdata by kunpengcompute.

In the class TestSemiTransactionalHiveMetastore, method updatePartitionsStatistics:

/**
 * Builds one update function per entry of {@code partitions} and submits them all through
 * {@code thriftHiveMetastore.updatePartitionsStatistics}.
 *
 * <p>Each update function ignores the statistics it is handed and returns the replacement
 * statistics found at the same index as the partition name.
 */
private void updatePartitionsStatistics() {
    List<PartitionStatistics> replacements = ImmutableList.of(STATISTICS_1, STATISTICS_1);
    Map<String, Function<PartitionStatistics, PartitionStatistics>> updatesByPartitionName = new HashMap<>();
    int position = 0;
    while (position < partitions.size()) {
        // Copied into a per-iteration local so the lambda can capture it.
        PartitionStatistics replacement = replacements.get(position);
        updatesByPartitionName.put(partitions.get(position), ignoredCurrent -> replacement);
        position++;
    }
    thriftHiveMetastore.updatePartitionsStatistics(
            IDENTITY,
            MockThriftMetastoreClient.TEST_DATABASE,
            MockThriftMetastoreClient.TEST_TABLE_UP_NAME,
            updatesByPartitionName);
}
Also used : Function(java.util.function.Function) HashMap(java.util.HashMap) PartitionStatistics(io.prestosql.plugin.hive.PartitionStatistics)

Example 32 with PartitionStatistics

Use of io.prestosql.plugin.hive.PartitionStatistics in project boostkit-bigdata by kunpengcompute.

In the class MetastoreHiveStatisticsProvider, method calculateDataSize:

/**
 * Estimates the data size of {@code column} by averaging the bytes per row over the
 * partitions that report both a row count and a total size in bytes for that column, then
 * scaling that average by {@code totalRowCount}.
 *
 * @param column name used to look up each partition's column statistics
 * @param partitionStatistics statistics of the partitions to consider
 * @param totalRowCount row count the per-row average is multiplied by
 * @return {@code Estimate.unknown()} when no partition carries both values;
 *         otherwise {@code Estimate.zero()} when {@code totalRowCount} is zero;
 *         otherwise {@code Estimate.unknown()} when the usable partitions sum to zero rows;
 *         otherwise the average bytes per row multiplied by {@code totalRowCount}
 * @throws VerifyException if a usable partition reports a negative row count or data size
 */
@VisibleForTesting
static Estimate calculateDataSize(String column, Collection<PartitionStatistics> partitionStatistics, double totalRowCount) {
    // Only partitions exposing both a row count and a byte size for the column can contribute.
    List<PartitionStatistics> usablePartitions = partitionStatistics.stream()
            .filter(candidate -> candidate.getBasicStatistics().getRowCount().isPresent())
            .filter(candidate -> {
                HiveColumnStatistics candidateColumn = candidate.getColumnStatistics().get(column);
                return candidateColumn != null && candidateColumn.getTotalSizeInBytes().isPresent();
            })
            .collect(toImmutableList());
    if (usablePartitions.isEmpty()) {
        return Estimate.unknown();
    }

    long rowsSeen = 0;
    long bytesSeen = 0;
    for (PartitionStatistics partition : usablePartitions) {
        // The filter above already guarantees presence; the checks below re-assert it and reject negatives.
        long partitionRows = partition.getBasicStatistics().getRowCount()
                .orElseThrow(() -> new VerifyException("rowCount is not present"));
        verify(partitionRows >= 0, "rowCount must be greater than or equal to zero");

        HiveColumnStatistics partitionColumn = partition.getColumnStatistics().get(column);
        verify(partitionColumn != null, "columnStatistics is null");

        long partitionBytes = partitionColumn.getTotalSizeInBytes()
                .orElseThrow(() -> new VerifyException("totalSizeInBytes is not present"));
        verify(partitionBytes >= 0, "dataSize must be greater than or equal to zero");

        rowsSeen += partitionRows;
        bytesSeen += partitionBytes;
    }

    if (totalRowCount == 0) {
        return Estimate.zero();
    }
    if (rowsSeen == 0) {
        // No rows to average over, so a per-row size cannot be derived.
        return Estimate.unknown();
    }

    double bytesPerRow = ((double) bytesSeen) / rowsSeen;
    return Estimate.of(bytesPerRow * totalRowCount);
}
Also used : HiveBasicStatistics(io.prestosql.plugin.hive.HiveBasicStatistics) TableStatistics(io.prestosql.spi.statistics.TableStatistics) Varchars.isVarcharType(io.prestosql.spi.type.Varchars.isVarcharType) Collections.unmodifiableList(java.util.Collections.unmodifiableList) DecimalType(io.prestosql.spi.type.DecimalType) HiveColumnHandle(io.prestosql.plugin.hive.HiveColumnHandle) NullableValue(io.prestosql.spi.predicate.NullableValue) BigDecimal(java.math.BigDecimal) HiveSessionProperties.isStatisticsEnabled(io.prestosql.plugin.hive.HiveSessionProperties.isStatisticsEnabled) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) ConnectorSession(io.prestosql.spi.connector.ConnectorSession) Maps.immutableEntry(com.google.common.collect.Maps.immutableEntry) Map(java.util.Map) Type(io.prestosql.spi.type.Type) BIGINT(io.prestosql.spi.type.BigintType.BIGINT) Chars.isCharType(io.prestosql.spi.type.Chars.isCharType) Double.parseDouble(java.lang.Double.parseDouble) HiveErrorCode(io.prestosql.plugin.hive.HiveErrorCode) PrestoException(io.prestosql.spi.PrestoException) HiveSessionProperties.getPartitionStatisticsSampleSize(io.prestosql.plugin.hive.HiveSessionProperties.getPartitionStatisticsSampleSize) ImmutableMap(com.google.common.collect.ImmutableMap) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) Decimals.isLongDecimal(io.prestosql.spi.type.Decimals.isLongDecimal) TINYINT(io.prestosql.spi.type.TinyintType.TINYINT) DecimalStatistics(io.prestosql.plugin.hive.metastore.DecimalStatistics) String.format(java.lang.String.format) HivePartition(io.prestosql.plugin.hive.HivePartition) Objects(java.util.Objects) Decimals.isShortDecimal(io.prestosql.spi.type.Decimals.isShortDecimal) List(java.util.List) DoubleStatistics(io.prestosql.plugin.hive.metastore.DoubleStatistics) Table(io.prestosql.plugin.hive.metastore.Table) LocalDate(java.time.LocalDate) 
Optional(java.util.Optional) HashFunction(com.google.common.hash.HashFunction) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) OptionalDouble(java.util.OptionalDouble) Shorts(com.google.common.primitives.Shorts) Decimals(io.prestosql.spi.type.Decimals) INTEGER(io.prestosql.spi.type.IntegerType.INTEGER) DoubleRange(io.prestosql.spi.statistics.DoubleRange) Float.intBitsToFloat(java.lang.Float.intBitsToFloat) ArrayList(java.util.ArrayList) OptionalLong(java.util.OptionalLong) SchemaTableName(io.prestosql.spi.connector.SchemaTableName) UNPARTITIONED_ID(io.prestosql.plugin.hive.HivePartition.UNPARTITIONED_ID) Verify.verify(com.google.common.base.Verify.verify) Objects.requireNonNull(java.util.Objects.requireNonNull) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) DOUBLE(io.prestosql.spi.type.DoubleType.DOUBLE) Double.isFinite(java.lang.Double.isFinite) DateStatistics(io.prestosql.plugin.hive.metastore.DateStatistics) DATE(io.prestosql.spi.type.DateType.DATE) REAL(io.prestosql.spi.type.RealType.REAL) SemiTransactionalHiveMetastore(io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore) ColumnStatistics(io.prestosql.spi.statistics.ColumnStatistics) VerifyException(com.google.common.base.VerifyException) HiveIdentity(io.prestosql.plugin.hive.authentication.HiveIdentity) PartitionStatistics(io.prestosql.plugin.hive.PartitionStatistics) SignedBytes(com.google.common.primitives.SignedBytes) Hashing.murmur3_128(com.google.common.hash.Hashing.murmur3_128) Ints(com.google.common.primitives.Ints) Estimate(io.prestosql.spi.statistics.Estimate) HiveColumnStatistics(io.prestosql.plugin.hive.metastore.HiveColumnStatistics) ColumnHandle(io.prestosql.spi.connector.ColumnHandle) SMALLINT(io.prestosql.spi.type.SmallintType.SMALLINT) Double.isNaN(java.lang.Double.isNaN) IntegerStatistics(io.prestosql.plugin.hive.metastore.IntegerStatistics) VisibleForTesting(com.google.common.annotations.VisibleForTesting) 
Comparator(java.util.Comparator) HiveSessionProperties.isIgnoreCorruptedStatistics(io.prestosql.plugin.hive.HiveSessionProperties.isIgnoreCorruptedStatistics) PartitionStatistics(io.prestosql.plugin.hive.PartitionStatistics) VerifyException(com.google.common.base.VerifyException) HiveColumnStatistics(io.prestosql.plugin.hive.metastore.HiveColumnStatistics) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 33 with PartitionStatistics

Use of io.prestosql.plugin.hive.PartitionStatistics in project boostkit-bigdata by kunpengcompute.

In the class TestMetastoreHiveStatisticsProvider, method testGetTableStatisticsValidationFailure:

/**
 * Feeds the provider a partition whose basic statistics are built from {@code (-1, 0, 0, 0)}
 * and checks both outcomes: with ignore-corrupted-statistics disabled a {@code PrestoException}
 * carrying {@code HIVE_CORRUPTED_COLUMN_STATISTICS} is thrown; with it enabled the result
 * equals {@code TableStatistics.empty()}.
 */
@Test
public void testGetTableStatisticsValidationFailure() {
    String partitionName = "p1=string1/p2=1234";
    PartitionStatistics corruptedStatistics = PartitionStatistics.builder()
            .setBasicStatistics(new HiveBasicStatistics(-1, 0, 0, 0))
            .build();
    MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider(
            (connectorSession, tableName, requestedPartitions, hiveTable) -> ImmutableMap.of(partitionName, corruptedStatistics));

    // Corrupted statistics must surface as an error when they are not ignored.
    TestingConnectorSession strictSession = new TestingConnectorSession(
            new HiveSessionProperties(
                    new HiveConfig().setIgnoreCorruptedStatistics(false),
                    new OrcFileWriterConfig(),
                    new ParquetFileWriterConfig())
                    .getSessionProperties());
    assertThatThrownBy(() -> statisticsProvider.getTableStatistics(
            strictSession,
            TABLE,
            ImmutableMap.of(),
            ImmutableMap.of(),
            ImmutableList.of(partition(partitionName)),
            true,
            table))
            .isInstanceOf(PrestoException.class)
            .hasFieldOrPropertyWithValue("errorCode", HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS.toErrorCode());

    // With the ignore flag set, the same input yields empty table statistics.
    TestingConnectorSession lenientSession = new TestingConnectorSession(
            new HiveSessionProperties(
                    new HiveConfig().setIgnoreCorruptedStatistics(true),
                    new OrcFileWriterConfig(),
                    new ParquetFileWriterConfig())
                    .getSessionProperties());
    assertEquals(
            statisticsProvider.getTableStatistics(
                    lenientSession,
                    TABLE,
                    ImmutableMap.of(),
                    ImmutableMap.of(),
                    ImmutableList.of(partition(partitionName)),
                    true,
                    table),
            TableStatistics.empty());
}
Also used : MetastoreHiveStatisticsProvider.validatePartitionStatistics(io.prestosql.plugin.hive.statistics.MetastoreHiveStatisticsProvider.validatePartitionStatistics) PartitionStatistics(io.prestosql.plugin.hive.PartitionStatistics) TestingConnectorSession(io.prestosql.testing.TestingConnectorSession) OrcFileWriterConfig(io.prestosql.plugin.hive.OrcFileWriterConfig) PrestoException(io.prestosql.spi.PrestoException) HiveBasicStatistics(io.prestosql.plugin.hive.HiveBasicStatistics) HiveSessionProperties(io.prestosql.plugin.hive.HiveSessionProperties) ParquetFileWriterConfig(io.prestosql.plugin.hive.ParquetFileWriterConfig) HiveConfig(io.prestosql.plugin.hive.HiveConfig) Test(org.testng.annotations.Test)

Example 34 with PartitionStatistics

Use of io.prestosql.plugin.hive.PartitionStatistics in project boostkit-bigdata by kunpengcompute.

In the class TestMetastoreHiveStatisticsProvider, method testGetTableStatisticsUnpartitioned:

/**
 * Supplies statistics keyed by {@code UNPARTITIONED_ID} and checks that the provider returns
 * a row count of 1000 plus, for {@code COLUMN}, a range of -100..100, a nulls fraction of 0.5
 * and 300 distinct values.
 */
@Test
public void testGetTableStatisticsUnpartitioned() {
    HiveBasicStatistics basicStatistics = new HiveBasicStatistics(
            OptionalLong.empty(),
            OptionalLong.of(1000),
            OptionalLong.empty(),
            OptionalLong.empty());
    HiveColumnStatistics integerColumnStatistics = HiveColumnStatistics.createIntegerColumnStatistics(
            OptionalLong.of(-100),
            OptionalLong.of(100),
            OptionalLong.of(500),
            OptionalLong.of(300));
    PartitionStatistics unpartitionedStatistics = PartitionStatistics.builder()
            .setBasicStatistics(basicStatistics)
            .setColumnStatistics(ImmutableMap.of(COLUMN, integerColumnStatistics))
            .build();

    MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider(
            (connectorSession, tableName, requestedPartitions, hiveTable) -> ImmutableMap.of(UNPARTITIONED_ID, unpartitionedStatistics));
    TestingConnectorSession testSession = new TestingConnectorSession(
            new HiveSessionProperties(new HiveConfig(), new OrcFileWriterConfig(), new ParquetFileWriterConfig())
                    .getSessionProperties());
    HiveColumnHandle columnHandle = new HiveColumnHandle(COLUMN, HIVE_LONG, BIGINT.getTypeSignature(), 2, REGULAR, Optional.empty());

    ColumnStatistics expectedColumnStatistics = ColumnStatistics.builder()
            .setRange(new DoubleRange(-100, 100))
            .setNullsFraction(Estimate.of(0.5))
            .setDistinctValuesCount(Estimate.of(300))
            .build();
    TableStatistics expected = TableStatistics.builder()
            .setRowCount(Estimate.of(1000))
            .setColumnStatistics(columnHandle, expectedColumnStatistics)
            .build();

    TableStatistics actual = statisticsProvider.getTableStatistics(
            testSession,
            TABLE,
            ImmutableMap.of(COLUMN, columnHandle),
            ImmutableMap.of(COLUMN, BIGINT),
            ImmutableList.of(new HivePartition(TABLE)),
            true,
            table);
    assertEquals(actual, expected);
}
Also used : DoubleRange(io.prestosql.spi.statistics.DoubleRange) MetastoreHiveStatisticsProvider.validatePartitionStatistics(io.prestosql.plugin.hive.statistics.MetastoreHiveStatisticsProvider.validatePartitionStatistics) PartitionStatistics(io.prestosql.plugin.hive.PartitionStatistics) TestingConnectorSession(io.prestosql.testing.TestingConnectorSession) OrcFileWriterConfig(io.prestosql.plugin.hive.OrcFileWriterConfig) TableStatistics(io.prestosql.spi.statistics.TableStatistics) HiveBasicStatistics(io.prestosql.plugin.hive.HiveBasicStatistics) HiveSessionProperties(io.prestosql.plugin.hive.HiveSessionProperties) ParquetFileWriterConfig(io.prestosql.plugin.hive.ParquetFileWriterConfig) HiveColumnHandle(io.prestosql.plugin.hive.HiveColumnHandle) HiveConfig(io.prestosql.plugin.hive.HiveConfig) HivePartition(io.prestosql.plugin.hive.HivePartition) Test(org.testng.annotations.Test)

Example 35 with PartitionStatistics

Use of io.prestosql.plugin.hive.PartitionStatistics in project hetu-core by openlookeng.

In the class CarbondataMetadata, method finishInsertInNewPartition:

/**
 * Finalizes a single partition update according to its update mode.
 *
 * <p>{@code OVERWRITE} is routed to {@code metastore.finishInsertIntoExistingPartition} using
 * the partition values parsed from the update's name. {@code APPEND} builds a partition object
 * and registers it through {@code metastore.addPartition}. Any other mode is left untouched.
 *
 * @throws PrestoException with {@code HIVE_CONCURRENT_MODIFICATION_DETECTED} when, in
 *         {@code APPEND} mode, the built partition's input format differs from the handle's
 *         partition storage format and the session respects the table format
 */
@Override
protected void finishInsertInNewPartition(ConnectorSession session, HiveInsertTableHandle handle, Table table, Map<String, Type> columnTypes, PartitionUpdate partitionUpdate, Map<List<String>, ComputedStatistics> partitionComputedStatistics, HiveACIDWriteType acidWriteType) {
    PartitionUpdate.UpdateMode updateMode = partitionUpdate.getUpdateMode();

    // overwrite existing partition
    if (updateMode == PartitionUpdate.UpdateMode.OVERWRITE) {
        List<String> overwrittenValues = toPartitionValues(partitionUpdate.getName());
        PartitionStatistics overwriteStatistics = createPartitionStatistics(
                session,
                partitionUpdate.getStatistics(),
                columnTypes,
                getColumnStatistics(partitionComputedStatistics, overwrittenValues));
        metastore.finishInsertIntoExistingPartition(
                session,
                handle.getSchemaName(),
                handle.getTableName(),
                overwrittenValues,
                partitionUpdate.getWritePath(),
                partitionUpdate.getFileNames(),
                overwriteStatistics,
                acidWriteType);
        return;
    }

    if (updateMode != PartitionUpdate.UpdateMode.APPEND) {
        // Neither OVERWRITE nor APPEND: nothing to finalize here.
        return;
    }

    // insert into new partition
    Partition newPartition = buildPartitionObject(session, table, partitionUpdate);
    boolean inputFormatDiffers = !newPartition.getStorage().getStorageFormat().getInputFormat()
            .equals(handle.getPartitionStorageFormat().getInputFormat());
    if (inputFormatDiffers && HiveSessionProperties.isRespectTableFormat(session)) {
        throw new PrestoException(HiveErrorCode.HIVE_CONCURRENT_MODIFICATION_DETECTED, "Partition format changed during insert");
    }
    PartitionStatistics appendStatistics = createPartitionStatistics(
            session,
            partitionUpdate.getStatistics(),
            columnTypes,
            getColumnStatistics(partitionComputedStatistics, newPartition.getValues()));
    metastore.addPartition(
            session,
            handle.getSchemaName(),
            handle.getTableName(),
            newPartition,
            partitionUpdate.getWritePath(),
            appendStatistics,
            acidWriteType);
}
Also used : Partition(io.prestosql.plugin.hive.metastore.Partition) PartitionStatistics(io.prestosql.plugin.hive.PartitionStatistics) PrestoException(io.prestosql.spi.PrestoException)

Aggregations

PartitionStatistics (io.prestosql.plugin.hive.PartitionStatistics)62 PrestoException (io.prestosql.spi.PrestoException)32 HiveBasicStatistics (io.prestosql.plugin.hive.HiveBasicStatistics)31 SchemaTableName (io.prestosql.spi.connector.SchemaTableName)31 HivePartition (io.prestosql.plugin.hive.HivePartition)20 HiveIdentity (io.prestosql.plugin.hive.authentication.HiveIdentity)18 TableNotFoundException (io.prestosql.spi.connector.TableNotFoundException)18 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)16 ImmutableMap (com.google.common.collect.ImmutableMap)16 PartitionNotFoundException (io.prestosql.plugin.hive.PartitionNotFoundException)16 ArrayList (java.util.ArrayList)15 HiveColumnStatistics (io.prestosql.plugin.hive.metastore.HiveColumnStatistics)14 List (java.util.List)14 OptionalLong (java.util.OptionalLong)14 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)12 ImmutableList (com.google.common.collect.ImmutableList)12 Logger (io.airlift.log.Logger)12 HiveErrorCode (io.prestosql.plugin.hive.HiveErrorCode)12 Type (io.prestosql.spi.type.Type)12 Map (java.util.Map)12