
Example 11 with TableStatistics

Use of io.trino.spi.statistics.TableStatistics in project trino by trinodb.

From the class TestDeltaLakeMetastoreStatistics, method testStatisticsInfinityAndNaN.

@Test
public void testStatisticsInfinityAndNaN() {
    // Stats with NaN values cannot be used
    DeltaLakeTableHandle tableHandle = registerTable("infinity_nan");
    TableStatistics stats = deltaLakeMetastore.getTableStatistics(SESSION, tableHandle, Constraint.alwaysTrue());
    ColumnStatistics columnStatistics = stats.getColumnStatistics().get(COLUMN_HANDLE);
    assertEquals(columnStatistics.getRange().get().getMin(), POSITIVE_INFINITY);
    assertEquals(columnStatistics.getRange().get().getMax(), POSITIVE_INFINITY);
}
Also used: io.trino.plugin.deltalake.DeltaLakeTableHandle, io.trino.spi.statistics.ColumnStatistics, io.trino.spi.statistics.TableStatistics, org.testng.annotations.Test
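
For orientation, here is a minimal sketch (not taken from the Trino sources) of how a TableStatistics instance like the one read by this test is assembled with the builder APIs used in Examples 14 and 15 below. The column name c_double and all values are invented, and DOUBLE is assumed to be io.trino.spi.type.DoubleType.DOUBLE.

import static io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR;
import static io.trino.spi.type.DoubleType.DOUBLE;

import io.trino.plugin.deltalake.DeltaLakeColumnHandle;
import io.trino.spi.statistics.ColumnStatistics;
import io.trino.spi.statistics.DoubleRange;
import io.trino.spi.statistics.Estimate;
import io.trino.spi.statistics.TableStatistics;

final class TableStatisticsSketch
{
    static TableStatistics buildExampleStatistics()
    {
        // Hypothetical column; the (name, type, columnType) constructor is the one used in Example 12
        DeltaLakeColumnHandle column = new DeltaLakeColumnHandle("c_double", DOUBLE, REGULAR);

        ColumnStatistics.Builder columnStats = ColumnStatistics.builder();
        columnStats.setNullsFraction(Estimate.of(0.1));       // 10% of rows are NULL
        columnStats.setDistinctValuesCount(Estimate.of(42));  // NDV estimate
        columnStats.setRange(new DoubleRange(-1.5, 99.25));   // observed min/max

        TableStatistics.Builder table = new TableStatistics.Builder().setRowCount(Estimate.of(1000));
        table.setColumnStatistics(column, columnStats.build());
        return table.build();
    }
}

A test like the one above then reads these values back via stats.getColumnStatistics().get(column) and columnStatistics.getRange().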

Example 12 with TableStatistics

Use of io.trino.spi.statistics.TableStatistics in project trino by trinodb.

From the class TestDeltaLakeMetastoreStatistics, method testStatisticsParquetParsedStatisticsNaNValues.

@Test
public void testStatisticsParquetParsedStatisticsNaNValues() {
    // The transaction log for this table was created so that the checkpoints only write struct statistics, not JSON statistics
    // The table has REAL and DOUBLE columns, each with 9 values, one of which is NaN
    DeltaLakeTableHandle tableHandle = registerTable("parquet_struct_statistics_nan");
    TableStatistics stats = deltaLakeMetastore.getTableStatistics(SESSION, tableHandle, Constraint.alwaysTrue());
    assertEquals(stats.getRowCount(), Estimate.of(9));
    Map<ColumnHandle, ColumnStatistics> statisticsMap = stats.getColumnStatistics();
    ColumnStatistics columnStats = statisticsMap.get(new DeltaLakeColumnHandle("fl", REAL, REGULAR));
    assertEquals(columnStats.getNullsFraction(), Estimate.zero());
    assertThat(columnStats.getRange()).isEmpty();
    columnStats = statisticsMap.get(new DeltaLakeColumnHandle("dou", DOUBLE, REGULAR));
    assertEquals(columnStats.getNullsFraction(), Estimate.zero());
    assertThat(columnStats.getRange()).isEmpty();
}
Also used: io.trino.plugin.deltalake.DeltaLakeColumnHandle, io.trino.plugin.deltalake.DeltaLakeTableHandle, io.trino.spi.connector.ColumnHandle, io.trino.spi.statistics.ColumnStatistics, io.trino.spi.statistics.TableStatistics, org.testng.annotations.Test
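
The assertions above revolve around the Estimate type from trino-spi. Below is a small sketch of the pattern, using only the members visible in these examples (Estimate.of, Estimate.zero, Estimate.unknown, getValue) plus an isUnknown() accessor, which is an assumption on my part.

import io.trino.spi.statistics.Estimate;

final class EstimateSketch
{
    public static void main(String[] args)
    {
        Estimate rowCount = Estimate.of(9);       // a known value, as asserted above
        Estimate nullsFraction = Estimate.zero(); // a known zero
        Estimate ndv = Estimate.unknown();        // "no information", e.g. when a file lacks stats

        // Guard with isUnknown() (assumed accessor) before trusting getValue()
        double rows = rowCount.isUnknown() ? Double.NaN : rowCount.getValue();
        System.out.println(rows);                     // 9.0
        System.out.println(nullsFraction.getValue()); // 0.0
        System.out.println(ndv.isUnknown());          // true
    }
}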

Example 13 with TableStatistics

Use of io.trino.spi.statistics.TableStatistics in project trino by trinodb.

From the class TableScanStatsRule, method doCalculate.

@Override
protected Optional<PlanNodeStatsEstimate> doCalculate(TableScanNode node, StatsProvider sourceStats, Lookup lookup, Session session, TypeProvider types) {
    if (isStatisticsPrecalculationForPushdownEnabled(session) && node.getStatistics().isPresent()) {
        return node.getStatistics();
    }
    // TODO Construct predicate like AddExchanges's LayoutConstraintEvaluator
    Constraint constraint = new Constraint(TupleDomain.all());
    TableStatistics tableStatistics = metadata.getTableStatistics(session, node.getTable(), constraint);
    Map<Symbol, SymbolStatsEstimate> outputSymbolStats = new HashMap<>();
    for (Map.Entry<Symbol, ColumnHandle> entry : node.getAssignments().entrySet()) {
        Symbol symbol = entry.getKey();
        Optional<ColumnStatistics> columnStatistics = Optional.ofNullable(tableStatistics.getColumnStatistics().get(entry.getValue()));
        SymbolStatsEstimate symbolStatistics = columnStatistics
                .map(statistics -> toSymbolStatistics(tableStatistics, statistics, types.get(symbol)))
                .orElse(SymbolStatsEstimate.unknown());
        outputSymbolStats.put(symbol, symbolStatistics);
    }
    return Optional.of(PlanNodeStatsEstimate.builder()
            .setOutputRowCount(tableStatistics.getRowCount().getValue())
            .addSymbolStatistics(outputSymbolStats)
            .build());
}
Also used: io.trino.Session, io.trino.SystemSessionProperties.isStatisticsPrecalculationForPushdownEnabled, io.trino.matching.Pattern, io.trino.metadata.Metadata, io.trino.spi.connector.ColumnHandle, io.trino.spi.connector.Constraint, io.trino.spi.predicate.TupleDomain, io.trino.spi.statistics.ColumnStatistics, io.trino.spi.statistics.TableStatistics, io.trino.spi.type.FixedWidthType, io.trino.spi.type.Type, io.trino.sql.planner.Symbol, io.trino.sql.planner.TypeProvider, io.trino.sql.planner.iterative.Lookup, io.trino.sql.planner.plan.Patterns.tableScan, io.trino.sql.planner.plan.TableScanNode, java.lang.Double.NaN, java.util.HashMap, java.util.Map, java.util.Objects.requireNonNull, java.util.Optional
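
toSymbolStatistics is referenced above but not reproduced on this page. The following is a simplified, hypothetical reconstruction of what such a conversion can look like, assuming io.trino.cost.SymbolStatsEstimate exposes a builder with setNullsFraction, setDistinctValuesCount, setLowValue, and setHighValue; the real method also takes the symbol's Type into account (e.g. for average row size), which is omitted here.

import io.trino.cost.SymbolStatsEstimate;
import io.trino.spi.statistics.ColumnStatistics;

final class SymbolStatsSketch
{
    // Hypothetical, simplified stand-in for TableScanStatsRule.toSymbolStatistics
    static SymbolStatsEstimate toSymbolStatisticsSketch(ColumnStatistics columnStatistics)
    {
        SymbolStatsEstimate.Builder result = SymbolStatsEstimate.builder();
        if (!columnStatistics.getNullsFraction().isUnknown()) {
            result.setNullsFraction(columnStatistics.getNullsFraction().getValue());
        }
        if (!columnStatistics.getDistinctValuesCount().isUnknown()) {
            result.setDistinctValuesCount(columnStatistics.getDistinctValuesCount().getValue());
        }
        // DoubleRange carries the min/max observed in the connector's statistics
        columnStatistics.getRange().ifPresent(range -> {
            result.setLowValue(range.getMin());
            result.setHighValue(range.getMax());
        });
        return result.build();
    }
}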

Example 14 with TableStatistics

Use of io.trino.spi.statistics.TableStatistics in project trino by trinodb.

From the class HiveMetastoreBackedDeltaLakeMetastore, method getTableStatistics.

@Override
public TableStatistics getTableStatistics(ConnectorSession session, DeltaLakeTableHandle tableHandle, Constraint constraint) {
    TableSnapshot tableSnapshot = getSnapshot(tableHandle.getSchemaTableName(), session);
    double numRecords = 0;
    MetadataEntry metadata = transactionLogAccess.getMetadataEntry(tableSnapshot, session)
            .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableHandle.getTableName()));
    List<ColumnMetadata> columnMetadata = DeltaLakeSchemaSupport.extractSchema(metadata, typeManager);
    List<DeltaLakeColumnHandle> columns = columnMetadata.stream()
            .map(columnMeta -> new DeltaLakeColumnHandle(
                    columnMeta.getName(),
                    columnMeta.getType(),
                    metadata.getCanonicalPartitionColumns().contains(columnMeta.getName()) ? PARTITION_KEY : REGULAR))
            .collect(toImmutableList());
    Map<DeltaLakeColumnHandle, Double> nullCounts = new HashMap<>();
    columns.forEach(column -> nullCounts.put(column, 0.0));
    Map<DeltaLakeColumnHandle, Double> minValues = new HashMap<>();
    Map<DeltaLakeColumnHandle, Double> maxValues = new HashMap<>();
    Map<DeltaLakeColumnHandle, Set<String>> partitioningColumnsDistinctValues = new HashMap<>();
    columns.stream()
            .filter(column -> column.getColumnType() == PARTITION_KEY)
            .forEach(column -> partitioningColumnsDistinctValues.put(column, new HashSet<>()));
    if (tableHandle.getEnforcedPartitionConstraint().isNone() || tableHandle.getNonPartitionConstraint().isNone() || constraint.getSummary().isNone()) {
        return createZeroStatistics(columns);
    }
    Set<String> predicatedColumnNames = tableHandle.getNonPartitionConstraint().getDomains().orElseThrow().keySet().stream()
            .map(DeltaLakeColumnHandle::getName)
            .collect(toImmutableSet());
    List<ColumnMetadata> predicatedColumns = columnMetadata.stream()
            .filter(column -> predicatedColumnNames.contains(column.getName()))
            .collect(toImmutableList());
    for (AddFileEntry addEntry : transactionLogAccess.getActiveFiles(tableSnapshot, session)) {
        Optional<? extends DeltaLakeFileStatistics> fileStatistics = addEntry.getStats();
        if (fileStatistics.isEmpty()) {
            // Open source Delta Lake does not collect stats
            return TableStatistics.empty();
        }
        DeltaLakeFileStatistics stats = fileStatistics.get();
        if (!partitionMatchesPredicate(addEntry.getCanonicalPartitionValues(), tableHandle.getEnforcedPartitionConstraint().getDomains().orElseThrow())) {
            continue;
        }
        TupleDomain<DeltaLakeColumnHandle> statisticsPredicate = createStatisticsPredicate(addEntry, predicatedColumns, tableHandle.getMetadataEntry().getCanonicalPartitionColumns());
        if (!tableHandle.getNonPartitionConstraint().overlaps(statisticsPredicate)) {
            continue;
        }
        if (stats.getNumRecords().isEmpty()) {
            // Not clear if it's possible for stats to be present with no row count, but bail out if that happens
            return TableStatistics.empty();
        }
        numRecords += stats.getNumRecords().get();
        for (DeltaLakeColumnHandle column : columns) {
            if (column.getColumnType() == PARTITION_KEY) {
                Optional<String> partitionValue = addEntry.getCanonicalPartitionValues().get(column.getName());
                if (partitionValue.isEmpty()) {
                    nullCounts.merge(column, (double) stats.getNumRecords().get(), Double::sum);
                } else {
                    // NULL is not counted as a distinct value
                    // The code below assumes that the values returned by addEntry.getCanonicalPartitionValues() are normalized;
                    // this may not be true for reals, doubles, timestamps, etc.
                    partitioningColumnsDistinctValues.get(column).add(partitionValue.get());
                }
            } else {
                Optional<Long> maybeNullCount = stats.getNullCount(column.getName());
                if (maybeNullCount.isPresent()) {
                    nullCounts.put(column, nullCounts.get(column) + maybeNullCount.get());
                } else {
                    // If any individual file fails to report null counts, fail to calculate the total for the table
                    nullCounts.put(column, NaN);
                }
            }
            // Math.min returns NaN if any operand is NaN
            stats.getMinColumnValue(column)
                    .map(parsedValue -> toStatsRepresentation(column.getType(), parsedValue))
                    .filter(OptionalDouble::isPresent)
                    .map(OptionalDouble::getAsDouble)
                    .ifPresent(parsedValueAsDouble -> minValues.merge(column, parsedValueAsDouble, Math::min));
            stats.getMaxColumnValue(column)
                    .map(parsedValue -> toStatsRepresentation(column.getType(), parsedValue))
                    .filter(OptionalDouble::isPresent)
                    .map(OptionalDouble::getAsDouble)
                    .ifPresent(parsedValueAsDouble -> maxValues.merge(column, parsedValueAsDouble, Math::max));
        }
    }
    if (numRecords == 0) {
        return createZeroStatistics(columns);
    }
    TableStatistics.Builder statsBuilder = new TableStatistics.Builder().setRowCount(Estimate.of(numRecords));
    Optional<DeltaLakeStatistics> statistics = Optional.empty();
    if (isExtendedStatisticsEnabled(session)) {
        statistics = statisticsAccess.readDeltaLakeStatistics(session, tableHandle.getLocation());
    }
    for (DeltaLakeColumnHandle column : columns) {
        ColumnStatistics.Builder columnStatsBuilder = new ColumnStatistics.Builder();
        Double nullCount = nullCounts.get(column);
        columnStatsBuilder.setNullsFraction(nullCount.isNaN() ? Estimate.unknown() : Estimate.of(nullCount / numRecords));
        Double maxValue = maxValues.get(column);
        Double minValue = minValues.get(column);
        if (isValidInRange(maxValue) && isValidInRange(minValue)) {
            columnStatsBuilder.setRange(new DoubleRange(minValue, maxValue));
        } else if (isValidInRange(maxValue)) {
            columnStatsBuilder.setRange(new DoubleRange(NEGATIVE_INFINITY, maxValue));
        } else if (isValidInRange(minValue)) {
            columnStatsBuilder.setRange(new DoubleRange(minValue, POSITIVE_INFINITY));
        }
        // extend statistics with NDV
        if (column.getColumnType() == PARTITION_KEY) {
            columnStatsBuilder.setDistinctValuesCount(Estimate.of(partitioningColumnsDistinctValues.get(column).size()));
        }
        if (statistics.isPresent()) {
            DeltaLakeColumnStatistics deltaLakeColumnStatistics = statistics.get().getColumnStatistics().get(column.getName());
            if (deltaLakeColumnStatistics != null && column.getColumnType() != PARTITION_KEY) {
                columnStatsBuilder.setDistinctValuesCount(Estimate.of(deltaLakeColumnStatistics.getNdvSummary().cardinality()));
            }
        }
        statsBuilder.setColumnStatistics(column, columnStatsBuilder.build());
    }
    return statsBuilder.build();
}
Also used: com.google.common.collect.ImmutableList.toImmutableList, com.google.common.collect.ImmutableSet.toImmutableSet, io.trino.plugin.deltalake.DeltaLakeColumnHandle, io.trino.plugin.deltalake.DeltaLakeColumnType.PARTITION_KEY, io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR, io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_INVALID_SCHEMA, io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_INVALID_TABLE, io.trino.plugin.deltalake.DeltaLakeMetadata.PATH_PROPERTY, io.trino.plugin.deltalake.DeltaLakeMetadata.createStatisticsPredicate, io.trino.plugin.deltalake.DeltaLakeSessionProperties.isExtendedStatisticsEnabled, io.trino.plugin.deltalake.DeltaLakeSplitManager.partitionMatchesPredicate, io.trino.plugin.deltalake.DeltaLakeTableHandle, io.trino.plugin.deltalake.statistics.CachingDeltaLakeStatisticsAccess, io.trino.plugin.deltalake.statistics.DeltaLakeColumnStatistics, io.trino.plugin.deltalake.statistics.DeltaLakeStatistics, io.trino.plugin.deltalake.transactionlog.AddFileEntry, io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport, io.trino.plugin.deltalake.transactionlog.MetadataEntry, io.trino.plugin.deltalake.transactionlog.ProtocolEntry, io.trino.plugin.deltalake.transactionlog.TableSnapshot, io.trino.plugin.deltalake.transactionlog.TransactionLogAccess, io.trino.plugin.deltalake.transactionlog.statistics.DeltaLakeFileStatistics, io.trino.plugin.hive.metastore.Database, io.trino.plugin.hive.metastore.HiveMetastore, io.trino.plugin.hive.metastore.PrincipalPrivileges, io.trino.plugin.hive.metastore.Table, io.trino.spi.TrinoException, io.trino.spi.connector.ColumnMetadata, io.trino.spi.connector.ConnectorSession, io.trino.spi.connector.Constraint, io.trino.spi.connector.SchemaTableName, io.trino.spi.connector.TableNotFoundException, io.trino.spi.predicate.TupleDomain, io.trino.spi.statistics.ColumnStatistics, io.trino.spi.statistics.DoubleRange, io.trino.spi.statistics.Estimate, io.trino.spi.statistics.StatsUtil.toStatsRepresentation, io.trino.spi.statistics.TableStatistics, io.trino.spi.type.TypeManager, java.io.IOException, java.lang.Double.NEGATIVE_INFINITY, java.lang.Double.NaN, java.lang.Double.POSITIVE_INFINITY, java.lang.String.format, java.util.HashMap, java.util.HashSet, java.util.List, java.util.Map, java.util.Objects.requireNonNull, java.util.Optional, java.util.OptionalDouble, java.util.Set, org.apache.hadoop.fs.Path
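
The "Math.min returns NaN if any operand is NaN" comment above is load-bearing: once a NaN reaches the running min/max, the aggregate is poisoned, which is why each per-file value is routed through toStatsRepresentation and an OptionalDouble filter before merging. A self-contained, JDK-only sketch of the merge idiom:

import static java.lang.Double.NaN;

import java.util.HashMap;
import java.util.Map;

final class NanMergeSketch
{
    public static void main(String[] args)
    {
        Map<String, Double> minValues = new HashMap<>();
        minValues.merge("col", 5.0, Math::min);    // first file: running min is 5.0
        minValues.merge("col", 2.0, Math::min);    // second file: running min drops to 2.0
        minValues.merge("col", NaN, Math::min);    // Math.min returns NaN if either operand is NaN
        System.out.println(minValues.get("col"));  // prints NaN: the aggregate is poisoned
    }
}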

Example 15 with TableStatistics

Use of io.trino.spi.statistics.TableStatistics in project trino by trinodb.

From the class HiveMetastoreBackedDeltaLakeMetastore, method createZeroStatistics.

private TableStatistics createZeroStatistics(List<DeltaLakeColumnHandle> columns) {
    TableStatistics.Builder statsBuilder = new TableStatistics.Builder().setRowCount(Estimate.of(0));
    for (DeltaLakeColumnHandle column : columns) {
        ColumnStatistics.Builder columnStatistics = ColumnStatistics.builder();
        columnStatistics.setNullsFraction(Estimate.of(0));
        columnStatistics.setDistinctValuesCount(Estimate.of(0));
        statsBuilder.setColumnStatistics(column, columnStatistics.build());
    }
    return statsBuilder.build();
}
Also used: io.trino.plugin.deltalake.DeltaLakeColumnHandle, io.trino.plugin.deltalake.statistics.DeltaLakeColumnStatistics, io.trino.spi.statistics.ColumnStatistics, io.trino.spi.statistics.TableStatistics
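
As the getTableStatistics method in Example 14 shows, this zero-row fallback is returned when a constraint is provably unsatisfiable (one of the isNone() checks) or when no records survive partition filtering, so downstream consumers see definite zeros rather than unknown estimates.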

Aggregations

TableStatistics (io.trino.spi.statistics.TableStatistics): 35
ColumnStatistics (io.trino.spi.statistics.ColumnStatistics): 23
Test (org.testng.annotations.Test): 20
DeltaLakeTableHandle (io.trino.plugin.deltalake.DeltaLakeTableHandle): 15
ColumnHandle (io.trino.spi.connector.ColumnHandle): 13
SchemaTableName (io.trino.spi.connector.SchemaTableName): 9
Map (java.util.Map): 7
Constraint (io.trino.spi.connector.Constraint): 6
ImmutableMap (com.google.common.collect.ImmutableMap): 5
ConnectorTableHandle (io.trino.spi.connector.ConnectorTableHandle): 5
TupleDomain (io.trino.spi.predicate.TupleDomain): 5
Type (io.trino.spi.type.Type): 5
IOException (java.io.IOException): 5
Objects.requireNonNull (java.util.Objects.requireNonNull): 5
Optional (java.util.Optional): 5
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList): 4
DeltaLakeColumnHandle (io.trino.plugin.deltalake.DeltaLakeColumnHandle): 4
DoubleRange (io.trino.spi.statistics.DoubleRange): 4
Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument): 3
ImmutableList (com.google.common.collect.ImmutableList): 3