Search in sources :

Example 11 with ColumnStatistics

use of io.trino.spi.statistics.ColumnStatistics in project trino by trinodb.

the class TestDeltaLakeMetastoreStatistics method testStatisticsInfinityAndNaN.

/**
 * Checks the value range reported for {@code COLUMN_HANDLE} of the
 * {@code infinity_nan} test table.
 */
@Test
public void testStatisticsInfinityAndNaN() {
    // Stats with NaN values cannot be used; for this table both ends of the
    // reported range are expected to be positive infinity.
    TableStatistics tableStatistics = deltaLakeMetastore.getTableStatistics(
            SESSION,
            registerTable("infinity_nan"),
            Constraint.alwaysTrue());
    ColumnStatistics rangeStatistics = tableStatistics.getColumnStatistics().get(COLUMN_HANDLE);

    assertEquals(rangeStatistics.getRange().get().getMin(), POSITIVE_INFINITY);
    assertEquals(rangeStatistics.getRange().get().getMax(), POSITIVE_INFINITY);
}
Also used : ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) TableStatistics(io.trino.spi.statistics.TableStatistics) DeltaLakeTableHandle(io.trino.plugin.deltalake.DeltaLakeTableHandle) Test(org.testng.annotations.Test)

Example 12 with ColumnStatistics

use of io.trino.spi.statistics.ColumnStatistics in project trino by trinodb.

the class TestDeltaLakeMetastoreStatistics method testStatisticsParquetParsedStatisticsNaNValues.

/**
 * Checks that the REAL column {@code fl} and the DOUBLE column {@code dou} of the
 * {@code parquet_struct_statistics_nan} table report a zero nulls fraction and no range.
 */
@Test
public void testStatisticsParquetParsedStatisticsNaNValues() {
    // The transaction log for this table was created so that the checkpoints
    // only write struct statistics, not json statistics.
    // The table has a REAL and a DOUBLE column, each with 9 values, one of them being NaN.
    DeltaLakeTableHandle nanTable = registerTable("parquet_struct_statistics_nan");
    TableStatistics tableStatistics = deltaLakeMetastore.getTableStatistics(SESSION, nanTable, Constraint.alwaysTrue());
    assertEquals(tableStatistics.getRowCount(), Estimate.of(9));

    Map<ColumnHandle, ColumnStatistics> statisticsByColumn = tableStatistics.getColumnStatistics();

    // REAL column: no nulls, and no range is expected
    ColumnStatistics realStatistics = statisticsByColumn.get(new DeltaLakeColumnHandle("fl", REAL, REGULAR));
    assertEquals(realStatistics.getNullsFraction(), Estimate.zero());
    assertThat(realStatistics.getRange()).isEmpty();

    // DOUBLE column: same expectations as the REAL column
    ColumnStatistics doubleStatistics = statisticsByColumn.get(new DeltaLakeColumnHandle("dou", DOUBLE, REGULAR));
    assertEquals(doubleStatistics.getNullsFraction(), Estimate.zero());
    assertThat(doubleStatistics.getRange()).isEmpty();
}
Also used : ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) DeltaLakeColumnHandle(io.trino.plugin.deltalake.DeltaLakeColumnHandle) ColumnHandle(io.trino.spi.connector.ColumnHandle) TableStatistics(io.trino.spi.statistics.TableStatistics) DeltaLakeTableHandle(io.trino.plugin.deltalake.DeltaLakeTableHandle) Test(org.testng.annotations.Test)

Example 13 with ColumnStatistics

use of io.trino.spi.statistics.ColumnStatistics in project trino by trinodb.

the class TableScanStatsRule method doCalculate.

/**
 * Produces the stats estimate for a table scan.
 *
 * <p>If statistics precalculation for pushdown is enabled for the session and the node
 * already carries statistics, those are returned as-is. Otherwise table statistics are
 * requested from the metadata with an unrestricted constraint, and each assigned symbol
 * gets either an estimate derived from its column's statistics or an unknown estimate
 * when the table statistics have no entry for that column.
 *
 * @param node the table scan node to estimate
 * @param sourceStats not used by this method
 * @param lookup not used by this method
 * @param session the session used for the property check and the metadata call
 * @param types provides the type of each output symbol
 * @return the estimate for the node; the paths visible here return a present value
 *         unless the node's own statistics are returned
 */
@Override
protected Optional<PlanNodeStatsEstimate> doCalculate(TableScanNode node, StatsProvider sourceStats, Lookup lookup, Session session, TypeProvider types) {
    boolean usePrecalculated = isStatisticsPrecalculationForPushdownEnabled(session) && node.getStatistics().isPresent();
    if (usePrecalculated) {
        return node.getStatistics();
    }

    // TODO Construct predicate like AddExchanges's LayoutConstraintEvaluator
    Constraint unrestricted = new Constraint(TupleDomain.all());
    TableStatistics tableStatistics = metadata.getTableStatistics(session, node.getTable(), unrestricted);
    Map<ColumnHandle, ColumnStatistics> statisticsByColumn = tableStatistics.getColumnStatistics();

    Map<Symbol, SymbolStatsEstimate> estimatesBySymbol = new HashMap<>();
    for (Map.Entry<Symbol, ColumnHandle> assignment : node.getAssignments().entrySet()) {
        Symbol outputSymbol = assignment.getKey();
        ColumnHandle column = assignment.getValue();
        // A missing map entry means nothing is known about this column
        SymbolStatsEstimate estimate = Optional.ofNullable(statisticsByColumn.get(column))
                .map(known -> toSymbolStatistics(tableStatistics, known, types.get(outputSymbol)))
                .orElse(SymbolStatsEstimate.unknown());
        estimatesBySymbol.put(outputSymbol, estimate);
    }

    PlanNodeStatsEstimate result = PlanNodeStatsEstimate.builder()
            .setOutputRowCount(tableStatistics.getRowCount().getValue())
            .addSymbolStatistics(estimatesBySymbol)
            .build();
    return Optional.of(result);
}
Also used : ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) Symbol(io.trino.sql.planner.Symbol) Constraint(io.trino.spi.connector.Constraint) Lookup(io.trino.sql.planner.iterative.Lookup) Type(io.trino.spi.type.Type) HashMap(java.util.HashMap) TupleDomain(io.trino.spi.predicate.TupleDomain) Patterns.tableScan(io.trino.sql.planner.plan.Patterns.tableScan) SystemSessionProperties.isStatisticsPrecalculationForPushdownEnabled(io.trino.SystemSessionProperties.isStatisticsPrecalculationForPushdownEnabled) Pattern(io.trino.matching.Pattern) NaN(java.lang.Double.NaN) FixedWidthType(io.trino.spi.type.FixedWidthType) Map(java.util.Map) Objects.requireNonNull(java.util.Objects.requireNonNull) Metadata(io.trino.metadata.Metadata) ColumnHandle(io.trino.spi.connector.ColumnHandle) TableStatistics(io.trino.spi.statistics.TableStatistics) TypeProvider(io.trino.sql.planner.TypeProvider) Optional(java.util.Optional) TableScanNode(io.trino.sql.planner.plan.TableScanNode) Session(io.trino.Session)

Example 14 with ColumnStatistics

use of io.trino.spi.statistics.ColumnStatistics in project trino by trinodb.

the class HiveMetastoreBackedDeltaLakeMetastore method createZeroStatistics.

/**
 * Builds table statistics with a row count of zero in which every given column
 * has a nulls fraction of zero and a distinct values count of zero.
 *
 * @param columns the columns to register statistics for
 * @return the resulting table statistics
 */
private TableStatistics createZeroStatistics(List<DeltaLakeColumnHandle> columns) {
    Estimate zero = Estimate.of(0);
    TableStatistics.Builder tableBuilder = new TableStatistics.Builder().setRowCount(zero);
    columns.forEach(columnHandle -> {
        ColumnStatistics.Builder zeroColumn = ColumnStatistics.builder();
        zeroColumn.setNullsFraction(zero);
        zeroColumn.setDistinctValuesCount(zero);
        tableBuilder.setColumnStatistics(columnHandle, zeroColumn.build());
    });
    return tableBuilder.build();
}
Also used : DeltaLakeColumnStatistics(io.trino.plugin.deltalake.statistics.DeltaLakeColumnStatistics) ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) TableStatistics(io.trino.spi.statistics.TableStatistics) DeltaLakeColumnHandle(io.trino.plugin.deltalake.DeltaLakeColumnHandle)

Example 15 with ColumnStatistics

use of io.trino.spi.statistics.ColumnStatistics in project trino by trinodb.

the class TableStatisticsMaker method makeTableStatistics.

/**
 * Computes table statistics for the given Iceberg table handle by aggregating the
 * per-file column statistics of the data files selected by the constraint.
 *
 * <p>Returns {@code TableStatistics.empty()} when the handle has no snapshot id, when the
 * constraint summary (or its intersection with the handle's enforced predicate) is none,
 * or when no data file was accepted.
 *
 * @param tableHandle supplies the snapshot id and the enforced predicate
 * @param constraint restricts which data files contribute to the statistics
 * @return the aggregated statistics, or empty statistics in the cases listed above
 * @throws UncheckedIOException if closing the planned file scan fails with an {@code IOException}
 */
private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Constraint constraint) {
    // Nothing to compute without a snapshot, or when the constraint can match nothing.
    if (tableHandle.getSnapshotId().isEmpty() || constraint.getSummary().isNone()) {
        return TableStatistics.empty();
    }
    // Combine the caller's constraint with the predicate already enforced on the handle.
    TupleDomain<IcebergColumnHandle> intersection = constraint.getSummary().transformKeys(IcebergColumnHandle.class::cast).intersect(tableHandle.getEnforcedPredicate());
    if (intersection.isNone()) {
        return TableStatistics.empty();
    }
    Schema icebergTableSchema = icebergTable.schema();
    List<Types.NestedField> columns = icebergTableSchema.columns();
    Map<Integer, Type.PrimitiveType> idToTypeMapping = primitiveFieldTypes(icebergTableSchema);
    List<PartitionField> partitionFields = icebergTable.spec().fields();
    // icebergPartitionTypes is indexed in parallel with partitionFields (see the loop below).
    List<Type> icebergPartitionTypes = partitionTypes(partitionFields, idToTypeMapping);
    List<IcebergColumnHandle> columnHandles = getColumns(icebergTableSchema, typeManager);
    Map<Integer, IcebergColumnHandle> idToColumnHandle = columnHandles.stream().collect(toUnmodifiableMap(IcebergColumnHandle::getId, identity()));
    // Map each partition field id to the details handed to dataFileMatches.
    ImmutableMap.Builder<Integer, ColumnFieldDetails> idToDetailsBuilder = ImmutableMap.builder();
    for (int index = 0; index < partitionFields.size(); index++) {
        PartitionField field = partitionFields.get(index);
        Type type = icebergPartitionTypes.get(index);
        idToDetailsBuilder.put(field.fieldId(), new ColumnFieldDetails(field, idToColumnHandle.get(field.sourceId()), type, toTrinoType(type, typeManager), type.typeId().javaClass()));
    }
    Map<Integer, ColumnFieldDetails> idToDetails = idToDetailsBuilder.buildOrThrow();
    // Scan the handle's snapshot, filtered by the intersected predicate, with column stats included.
    TableScan tableScan = icebergTable.newScan().filter(toIcebergExpression(intersection)).useSnapshot(tableHandle.getSnapshotId().get()).includeColumnStats();
    IcebergStatistics.Builder icebergStatisticsBuilder = new IcebergStatistics.Builder(columns, typeManager);
    try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
        for (FileScanTask fileScanTask : fileScanTasks) {
            DataFile dataFile = fileScanTask.file();
            // Skip files that dataFileMatches rejects for this constraint.
            if (!dataFileMatches(dataFile, constraint, partitionFields, idToDetails)) {
                continue;
            }
            icebergStatisticsBuilder.acceptDataFile(dataFile, fileScanTask.spec());
        }
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
    IcebergStatistics summary = icebergStatisticsBuilder.build();
    // No accepted files means there is nothing to report.
    if (summary.getFileCount() == 0) {
        return TableStatistics.empty();
    }
    ImmutableMap.Builder<ColumnHandle, ColumnStatistics> columnHandleBuilder = ImmutableMap.builder();
    // Held as a double so the nulls-fraction division below is floating point.
    double recordCount = summary.getRecordCount();
    for (IcebergColumnHandle columnHandle : idToColumnHandle.values()) {
        int fieldId = columnHandle.getId();
        ColumnStatistics.Builder columnBuilder = new ColumnStatistics.Builder();
        // Each statistic is set only when the summary has a value for this field id.
        Long nullCount = summary.getNullCounts().get(fieldId);
        if (nullCount != null) {
            // NOTE(review): if recordCount can be 0 while the file count is non-zero, this
            // division yields NaN or Infinity - confirm Estimate.of accepts such values.
            columnBuilder.setNullsFraction(Estimate.of(nullCount / recordCount));
        }
        if (summary.getColumnSizes() != null) {
            Long columnSize = summary.getColumnSizes().get(fieldId);
            if (columnSize != null) {
                columnBuilder.setDataSize(Estimate.of(columnSize));
            }
        }
        Object min = summary.getMinValues().get(fieldId);
        Object max = summary.getMaxValues().get(fieldId);
        // A range is reported only when both bounds are available.
        if (min != null && max != null) {
            columnBuilder.setRange(DoubleRange.from(columnHandle.getType(), min, max));
        }
        columnHandleBuilder.put(columnHandle, columnBuilder.build());
    }
    return new TableStatistics(Estimate.of(recordCount), columnHandleBuilder.buildOrThrow());
}
Also used : Schema(org.apache.iceberg.Schema) UncheckedIOException(java.io.UncheckedIOException) DataFile(org.apache.iceberg.DataFile) PartitionField(org.apache.iceberg.PartitionField) ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) TableScan(org.apache.iceberg.TableScan) ColumnHandle(io.trino.spi.connector.ColumnHandle) IOException(java.io.IOException) ImmutableMap(com.google.common.collect.ImmutableMap) Constraint(io.trino.spi.connector.Constraint) TypeConverter.toTrinoType(io.trino.plugin.iceberg.TypeConverter.toTrinoType) Type(org.apache.iceberg.types.Type) TableStatistics(io.trino.spi.statistics.TableStatistics) FileScanTask(org.apache.iceberg.FileScanTask)

Aggregations

ColumnStatistics (io.trino.spi.statistics.ColumnStatistics)24 TableStatistics (io.trino.spi.statistics.TableStatistics)23 Test (org.testng.annotations.Test)15 DeltaLakeTableHandle (io.trino.plugin.deltalake.DeltaLakeTableHandle)14 ColumnHandle (io.trino.spi.connector.ColumnHandle)10 Type (io.trino.spi.type.Type)6 ImmutableMap (com.google.common.collect.ImmutableMap)5 HiveColumnStatistics (io.trino.plugin.hive.metastore.HiveColumnStatistics)5 CharType (io.trino.spi.type.CharType)5 VarcharType (io.trino.spi.type.VarcharType)5 Map (java.util.Map)5 SchemaTableName (io.trino.spi.connector.SchemaTableName)4 DecimalType (io.trino.spi.type.DecimalType)4 Objects.requireNonNull (java.util.Objects.requireNonNull)4 MoreObjects.toStringHelper (com.google.common.base.MoreObjects.toStringHelper)3 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)3 Preconditions.checkState (com.google.common.base.Preconditions.checkState)3 Verify.verify (com.google.common.base.Verify.verify)3 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)3 ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet)3