
Example 16 with Constraint

Use of io.trino.spi.connector.Constraint in project trino by trinodb.

In class HiveMetastoreBackedDeltaLakeMetastore, method getTableStatistics:

@Override
public TableStatistics getTableStatistics(ConnectorSession session, DeltaLakeTableHandle tableHandle, Constraint constraint) {
    TableSnapshot tableSnapshot = getSnapshot(tableHandle.getSchemaTableName(), session);
    double numRecords = 0;
    MetadataEntry metadata = transactionLogAccess.getMetadataEntry(tableSnapshot, session)
            .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableHandle.getTableName()));
    List<ColumnMetadata> columnMetadata = DeltaLakeSchemaSupport.extractSchema(metadata, typeManager);
    List<DeltaLakeColumnHandle> columns = columnMetadata.stream()
            .map(columnMeta -> new DeltaLakeColumnHandle(
                    columnMeta.getName(),
                    columnMeta.getType(),
                    metadata.getCanonicalPartitionColumns().contains(columnMeta.getName()) ? PARTITION_KEY : REGULAR))
            .collect(toImmutableList());
    Map<DeltaLakeColumnHandle, Double> nullCounts = new HashMap<>();
    columns.forEach(column -> nullCounts.put(column, 0.0));
    Map<DeltaLakeColumnHandle, Double> minValues = new HashMap<>();
    Map<DeltaLakeColumnHandle, Double> maxValues = new HashMap<>();
    Map<DeltaLakeColumnHandle, Set<String>> partitioningColumnsDistinctValues = new HashMap<>();
    columns.stream().filter(column -> column.getColumnType() == PARTITION_KEY).forEach(column -> partitioningColumnsDistinctValues.put(column, new HashSet<>()));
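    // A constraint that is NONE (provably no matching rows) short-circuits to all-zero statistics.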
    if (tableHandle.getEnforcedPartitionConstraint().isNone() || tableHandle.getNonPartitionConstraint().isNone() || constraint.getSummary().isNone()) {
        return createZeroStatistics(columns);
    }
    Set<String> predicatedColumnNames = tableHandle.getNonPartitionConstraint().getDomains().orElseThrow().keySet().stream()
            .map(DeltaLakeColumnHandle::getName)
            .collect(toImmutableSet());
    List<ColumnMetadata> predicatedColumns = columnMetadata.stream()
            .filter(column -> predicatedColumnNames.contains(column.getName()))
            .collect(toImmutableList());
    for (AddFileEntry addEntry : transactionLogAccess.getActiveFiles(tableSnapshot, session)) {
        Optional<? extends DeltaLakeFileStatistics> fileStatistics = addEntry.getStats();
        if (fileStatistics.isEmpty()) {
            // Open source Delta Lake does not collect stats
            return TableStatistics.empty();
        }
        DeltaLakeFileStatistics stats = fileStatistics.get();
        if (!partitionMatchesPredicate(addEntry.getCanonicalPartitionValues(), tableHandle.getEnforcedPartitionConstraint().getDomains().orElseThrow())) {
            continue;
        }
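        // Derive the tightest predicate implied by this file's min/max statistics;
        // if it cannot overlap the query's predicate, the file contributes no rows.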
        TupleDomain<DeltaLakeColumnHandle> statisticsPredicate = createStatisticsPredicate(addEntry, predicatedColumns, tableHandle.getMetadataEntry().getCanonicalPartitionColumns());
        if (!tableHandle.getNonPartitionConstraint().overlaps(statisticsPredicate)) {
            continue;
        }
        if (stats.getNumRecords().isEmpty()) {
            // Not clear if it's possible for stats to be present with no row count, but bail out if that happens
            return TableStatistics.empty();
        }
        numRecords += stats.getNumRecords().get();
        for (DeltaLakeColumnHandle column : columns) {
            if (column.getColumnType() == PARTITION_KEY) {
                Optional<String> partitionValue = addEntry.getCanonicalPartitionValues().get(column.getName());
                if (partitionValue.isEmpty()) {
                    nullCounts.merge(column, (double) stats.getNumRecords().get(), Double::sum);
                } else {
                    // NULL is not counted as a distinct value
                    // The code below assumes that the values returned by addEntry.getCanonicalPartitionValues() are normalized;
                    // that may not hold for real, double, timestamp, etc. values
                    partitioningColumnsDistinctValues.get(column).add(partitionValue.get());
                }
            } else {
                Optional<Long> maybeNullCount = stats.getNullCount(column.getName());
                if (maybeNullCount.isPresent()) {
                    nullCounts.put(column, nullCounts.get(column) + maybeNullCount.get());
                } else {
                    // If any individual file fails to report null counts, fail to calculate the total for the table
                    nullCounts.put(column, NaN);
                }
            }
            // Math.min and Math.max return NaN if any operand is NaN
            stats.getMinColumnValue(column)
                    .map(parsedValue -> toStatsRepresentation(column.getType(), parsedValue))
                    .filter(OptionalDouble::isPresent)
                    .map(OptionalDouble::getAsDouble)
                    .ifPresent(parsedValueAsDouble -> minValues.merge(column, parsedValueAsDouble, Math::min));
            stats.getMaxColumnValue(column)
                    .map(parsedValue -> toStatsRepresentation(column.getType(), parsedValue))
                    .filter(OptionalDouble::isPresent)
                    .map(OptionalDouble::getAsDouble)
                    .ifPresent(parsedValueAsDouble -> maxValues.merge(column, parsedValueAsDouble, Math::max));
        }
    }
    if (numRecords == 0) {
        return createZeroStatistics(columns);
    }
    TableStatistics.Builder statsBuilder = new TableStatistics.Builder().setRowCount(Estimate.of(numRecords));
    Optional<DeltaLakeStatistics> statistics = Optional.empty();
    if (isExtendedStatisticsEnabled(session)) {
        statistics = statisticsAccess.readDeltaLakeStatistics(session, tableHandle.getLocation());
    }
    for (DeltaLakeColumnHandle column : columns) {
        ColumnStatistics.Builder columnStatsBuilder = new ColumnStatistics.Builder();
        Double nullCount = nullCounts.get(column);
        columnStatsBuilder.setNullsFraction(nullCount.isNaN() ? Estimate.unknown() : Estimate.of(nullCount / numRecords));
        Double maxValue = maxValues.get(column);
        Double minValue = minValues.get(column);
        if (isValidInRange(maxValue) && isValidInRange(minValue)) {
            columnStatsBuilder.setRange(new DoubleRange(minValue, maxValue));
        } else if (isValidInRange(maxValue)) {
            columnStatsBuilder.setRange(new DoubleRange(NEGATIVE_INFINITY, maxValue));
        } else if (isValidInRange(minValue)) {
            columnStatsBuilder.setRange(new DoubleRange(minValue, POSITIVE_INFINITY));
        }
        // Extend the statistics with NDV (number of distinct values)
        if (column.getColumnType() == PARTITION_KEY) {
            columnStatsBuilder.setDistinctValuesCount(Estimate.of(partitioningColumnsDistinctValues.get(column).size()));
        }
        if (statistics.isPresent()) {
            DeltaLakeColumnStatistics deltaLakeColumnStatistics = statistics.get().getColumnStatistics().get(column.getName());
            if (deltaLakeColumnStatistics != null && column.getColumnType() != PARTITION_KEY) {
                columnStatsBuilder.setDistinctValuesCount(Estimate.of(deltaLakeColumnStatistics.getNdvSummary().cardinality()));
            }
        }
        statsBuilder.setColumnStatistics(column, columnStatsBuilder.build());
    }
    return statsBuilder.build();
}
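The Constraint parameter above carries the query predicate as a TupleDomain summary. As a minimal sketch of how such a constraint is assembled (not taken from the Trino sources above; the column handle is a placeholder supplied by the caller):

import io.trino.spi.connector.ColumnHandle;
import io.trino.spi.connector.Constraint;
import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.TupleDomain;
import java.util.Map;
import static io.trino.spi.type.BigintType.BIGINT;

final class ConstraintSketch {
    // Builds a Constraint whose summary encodes "column = 42". A summary of
    // TupleDomain.none() would make getTableStatistics above return zero
    // statistics immediately, since no rows can match.
    static Constraint equalsFortyTwo(ColumnHandle column) {
        TupleDomain<ColumnHandle> summary = TupleDomain.withColumnDomains(
                Map.of(column, Domain.singleValue(BIGINT, 42L)));
        return new Constraint(summary);
    }
}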
Also used : DeltaLakeStatistics(io.trino.plugin.deltalake.statistics.DeltaLakeStatistics) POSITIVE_INFINITY(java.lang.Double.POSITIVE_INFINITY) PATH_PROPERTY(io.trino.plugin.deltalake.DeltaLakeMetadata.PATH_PROPERTY) TableSnapshot(io.trino.plugin.deltalake.transactionlog.TableSnapshot) Database(io.trino.plugin.hive.metastore.Database) NEGATIVE_INFINITY(java.lang.Double.NEGATIVE_INFINITY) AddFileEntry(io.trino.plugin.deltalake.transactionlog.AddFileEntry) TransactionLogAccess(io.trino.plugin.deltalake.transactionlog.TransactionLogAccess) StatsUtil.toStatsRepresentation(io.trino.spi.statistics.StatsUtil.toStatsRepresentation) TableNotFoundException(io.trino.spi.connector.TableNotFoundException) DeltaLakeMetadata.createStatisticsPredicate(io.trino.plugin.deltalake.DeltaLakeMetadata.createStatisticsPredicate) NaN(java.lang.Double.NaN) DeltaLakeColumnHandle(io.trino.plugin.deltalake.DeltaLakeColumnHandle) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) DeltaLakeTableHandle(io.trino.plugin.deltalake.DeltaLakeTableHandle) Table(io.trino.plugin.hive.metastore.Table) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) MetadataEntry(io.trino.plugin.deltalake.transactionlog.MetadataEntry) Set(java.util.Set) DeltaLakeSplitManager.partitionMatchesPredicate(io.trino.plugin.deltalake.DeltaLakeSplitManager.partitionMatchesPredicate) DeltaLakeSchemaSupport(io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport) TrinoException(io.trino.spi.TrinoException) SchemaTableName(io.trino.spi.connector.SchemaTableName) String.format(java.lang.String.format) List(java.util.List) Optional(java.util.Optional) REGULAR(io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR) DoubleRange(io.trino.spi.statistics.DoubleRange) Constraint(io.trino.spi.connector.Constraint) DeltaLakeFileStatistics(io.trino.plugin.deltalake.transactionlog.statistics.DeltaLakeFileStatistics) CachingDeltaLakeStatisticsAccess(io.trino.plugin.deltalake.statistics.CachingDeltaLakeStatisticsAccess) ColumnMetadata(io.trino.spi.connector.ColumnMetadata) DeltaLakeColumnStatistics(io.trino.plugin.deltalake.statistics.DeltaLakeColumnStatistics) DELTA_LAKE_INVALID_TABLE(io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_INVALID_TABLE) OptionalDouble(java.util.OptionalDouble) HashMap(java.util.HashMap) HashSet(java.util.HashSet) HiveMetastore(io.trino.plugin.hive.metastore.HiveMetastore) Objects.requireNonNull(java.util.Objects.requireNonNull) TableStatistics(io.trino.spi.statistics.TableStatistics) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) DeltaLakeSessionProperties.isExtendedStatisticsEnabled(io.trino.plugin.deltalake.DeltaLakeSessionProperties.isExtendedStatisticsEnabled) Estimate(io.trino.spi.statistics.Estimate) ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) PARTITION_KEY(io.trino.plugin.deltalake.DeltaLakeColumnType.PARTITION_KEY) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) TupleDomain(io.trino.spi.predicate.TupleDomain) DELTA_LAKE_INVALID_SCHEMA(io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_INVALID_SCHEMA) ProtocolEntry(io.trino.plugin.deltalake.transactionlog.ProtocolEntry) PrincipalPrivileges(io.trino.plugin.hive.metastore.PrincipalPrivileges) TypeManager(io.trino.spi.type.TypeManager)

Example 17 with Constraint

Use of io.trino.spi.connector.Constraint in project trino by trinodb.

In class DefaultJdbcMetadata, method applyFilter:

@Override
public Optional<ConstraintApplicationResult<ConnectorTableHandle>> applyFilter(ConnectorSession session, ConnectorTableHandle table, Constraint constraint) {
    JdbcTableHandle handle = (JdbcTableHandle) table;
    if (handle.getSortOrder().isPresent() && handle.getLimit().isPresent()) {
        handle = flushAttributesAsQuery(session, handle);
    }
    TupleDomain<ColumnHandle> oldDomain = handle.getConstraint();
    TupleDomain<ColumnHandle> newDomain = oldDomain.intersect(constraint.getSummary());
    List<String> newConstraintExpressions;
    TupleDomain<ColumnHandle> remainingFilter;
    Optional<ConnectorExpression> remainingExpression;
    if (newDomain.isNone()) {
        newConstraintExpressions = ImmutableList.of();
        remainingFilter = TupleDomain.all();
        remainingExpression = Optional.of(Constant.TRUE);
    } else {
        Map<ColumnHandle, Domain> domains = newDomain.getDomains().orElseThrow();
        List<JdbcColumnHandle> columnHandles = domains.keySet().stream().map(JdbcColumnHandle.class::cast).collect(toImmutableList());
        List<ColumnMapping> columnMappings = jdbcClient.toColumnMappings(session, columnHandles.stream().map(JdbcColumnHandle::getJdbcTypeHandle).collect(toImmutableList()));
        Map<ColumnHandle, Domain> supported = new HashMap<>();
        Map<ColumnHandle, Domain> unsupported = new HashMap<>();
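        // Split each column's domain into a part the remote database can enforce
        // and a remainder that Trino must re-evaluate after the scan.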
        for (int i = 0; i < columnHandles.size(); i++) {
            JdbcColumnHandle column = columnHandles.get(i);
            ColumnMapping mapping = columnMappings.get(i);
            DomainPushdownResult pushdownResult = mapping.getPredicatePushdownController().apply(session, domains.get(column));
            supported.put(column, pushdownResult.getPushedDown());
            unsupported.put(column, pushdownResult.getRemainingFilter());
        }
        newDomain = TupleDomain.withColumnDomains(supported);
        remainingFilter = TupleDomain.withColumnDomains(unsupported);
        if (isComplexExpressionPushdown(session)) {
            List<String> newExpressions = new ArrayList<>();
            List<ConnectorExpression> remainingExpressions = new ArrayList<>();
            for (ConnectorExpression expression : extractConjuncts(constraint.getExpression())) {
                Optional<String> converted = jdbcClient.convertPredicate(session, expression, constraint.getAssignments());
                if (converted.isPresent()) {
                    newExpressions.add(converted.get());
                } else {
                    remainingExpressions.add(expression);
                }
            }
            newConstraintExpressions = ImmutableSet.<String>builder()
                    .addAll(handle.getConstraintExpressions())
                    .addAll(newExpressions)
                    .build()
                    .asList();
            remainingExpression = Optional.of(and(remainingExpressions));
        } else {
            newConstraintExpressions = ImmutableList.of();
            remainingExpression = Optional.empty();
        }
    }
    if (oldDomain.equals(newDomain) && handle.getConstraintExpressions().equals(newConstraintExpressions)) {
        return Optional.empty();
    }
    handle = new JdbcTableHandle(
            handle.getRelationHandle(),
            newDomain,
            newConstraintExpressions,
            handle.getSortOrder(),
            handle.getLimit(),
            handle.getColumns(),
            handle.getOtherReferencedTables(),
            handle.getNextSyntheticColumnId());
    return Optional.of(remainingExpression.isPresent()
            ? new ConstraintApplicationResult<>(handle, remainingFilter, remainingExpression.get(), false)
            : new ConstraintApplicationResult<>(handle, remainingFilter, false));
}
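The pivotal step above is oldDomain.intersect(constraint.getSummary()), which narrows the filter already stored on the handle with the newly pushed-down predicate. A hedged, standalone sketch of that TupleDomain algebra (the column handle is again a placeholder):

import io.trino.spi.connector.ColumnHandle;
import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.TupleDomain;
import java.util.Map;
import static io.trino.spi.type.BigintType.BIGINT;

final class IntersectSketch {
    static TupleDomain<ColumnHandle> narrow(ColumnHandle column) {
        TupleDomain<ColumnHandle> unfiltered = TupleDomain.all(); // no filter yet
        TupleDomain<ColumnHandle> equalsOne = TupleDomain.withColumnDomains(
                Map.of(column, Domain.singleValue(BIGINT, 1L)));
        // ALL intersected with "column = 1" is just "column = 1"; intersecting
        // two disjoint domains instead yields NONE, the case applyFilter above
        // handles by returning an always-false (empty) result.
        return unfiltered.intersect(equalsOne);
    }
}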
Also used : ColumnHandle(io.trino.spi.connector.ColumnHandle) HashMap(java.util.HashMap) ConnectorExpression(io.trino.spi.expression.ConnectorExpression) ArrayList(java.util.ArrayList) Constraint(io.trino.spi.connector.Constraint) ConstraintApplicationResult(io.trino.spi.connector.ConstraintApplicationResult) Domain(io.trino.spi.predicate.Domain) TupleDomain(io.trino.spi.predicate.TupleDomain) DomainPushdownResult(io.trino.plugin.jdbc.PredicatePushdownController.DomainPushdownResult)

Example 18 with Constraint

Use of io.trino.spi.connector.Constraint in project trino by trinodb.

In class BigQueryMetadata, method getViewDefinitionSystemTable:

private Optional<SystemTable> getViewDefinitionSystemTable(ConnectorSession session, SchemaTableName viewDefinitionTableName, SchemaTableName sourceTableName) {
    BigQueryClient client = bigQueryClientFactory.create(session);
    String projectId = getProjectId(client);
    String remoteSchemaName = client.toRemoteDataset(projectId, sourceTableName.getSchemaName())
            .map(RemoteDatabaseObject::getOnlyRemoteName)
            .orElseThrow(() -> new TableNotFoundException(viewDefinitionTableName));
    String remoteTableName = client.toRemoteTable(projectId, remoteSchemaName, sourceTableName.getTableName())
            .map(RemoteDatabaseObject::getOnlyRemoteName)
            .orElseThrow(() -> new TableNotFoundException(viewDefinitionTableName));
    TableInfo tableInfo = client.getTable(TableId.of(projectId, remoteSchemaName, remoteTableName))
            .orElseThrow(() -> new TableNotFoundException(viewDefinitionTableName));
    if (!(tableInfo.getDefinition() instanceof ViewDefinition)) {
        throw new TableNotFoundException(viewDefinitionTableName);
    }
    List<ColumnMetadata> columns = ImmutableList.of(new ColumnMetadata("query", VarcharType.VARCHAR));
    List<Type> types = columns.stream().map(ColumnMetadata::getType).collect(toImmutableList());
    Optional<String> query = Optional.ofNullable(((ViewDefinition) tableInfo.getDefinition()).getQuery());
    Iterable<List<Object>> propertyValues = ImmutableList.of(ImmutableList.of(query.orElse("NULL")));
    return Optional.of(createSystemTable(new ConnectorTableMetadata(sourceTableName, columns), constraint -> new InMemoryRecordSet(types, propertyValues).cursor()));
}
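InMemoryRecordSet, used on the last line above, serves a fixed set of rows through the RecordSet interface. A minimal sketch of the same pattern with a single varchar column and one invented row:

import com.google.common.collect.ImmutableList;
import io.trino.spi.connector.InMemoryRecordSet;
import io.trino.spi.connector.RecordCursor;
import io.trino.spi.type.Type;
import java.util.Collection;
import java.util.List;
import static io.trino.spi.type.VarcharType.VARCHAR;

final class RecordSetSketch {
    static void printRows() {
        List<Type> types = ImmutableList.of(VARCHAR);
        Collection<? extends List<?>> rows = ImmutableList.of(ImmutableList.of("SELECT 1"));
        RecordCursor cursor = new InMemoryRecordSet(types, rows).cursor();
        while (cursor.advanceNextPosition()) {
            // Column 0 is a varchar; getSlice returns its UTF-8 bytes
            System.out.println(cursor.getSlice(0).toStringUtf8());
        }
    }
}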
Also used : ViewDefinition(com.google.cloud.bigquery.ViewDefinition) StandardTableDefinition(com.google.cloud.bigquery.StandardTableDefinition) TableId(com.google.cloud.bigquery.TableId) SchemaNotFoundException(io.trino.spi.connector.SchemaNotFoundException) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) TableNotFoundException(io.trino.spi.connector.TableNotFoundException) VIEW(com.google.cloud.bigquery.TableDefinition.Type.VIEW) Schema(com.google.cloud.bigquery.Schema) ConnectorTableHandle(io.trino.spi.connector.ConnectorTableHandle) Map(java.util.Map) ProjectionApplicationResult(io.trino.spi.connector.ProjectionApplicationResult) RemoteDatabaseObject(io.trino.plugin.bigquery.BigQueryClient.RemoteDatabaseObject) ENGLISH(java.util.Locale.ENGLISH) TABLE(com.google.cloud.bigquery.TableDefinition.Type.TABLE) Field(com.google.cloud.bigquery.Field) TableDefinition(com.google.cloud.bigquery.TableDefinition) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) Streams(com.google.common.collect.Streams) SchemaTableName(io.trino.spi.connector.SchemaTableName) Preconditions.checkState(com.google.common.base.Preconditions.checkState) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Stream(java.util.stream.Stream) BIGQUERY_LISTING_DATASET_ERROR(io.trino.plugin.bigquery.BigQueryErrorCode.BIGQUERY_LISTING_DATASET_ERROR) TrinoPrincipal(io.trino.spi.security.TrinoPrincipal) SchemaTablePrefix(io.trino.spi.connector.SchemaTablePrefix) Assignment(io.trino.spi.connector.Assignment) Function.identity(java.util.function.Function.identity) Optional(java.util.Optional) ConnectorMetadata(io.trino.spi.connector.ConnectorMetadata) SystemTable(io.trino.spi.connector.SystemTable) Constraint(io.trino.spi.connector.Constraint) Logger(io.airlift.log.Logger) ColumnMetadata(io.trino.spi.connector.ColumnMetadata) Type(io.trino.spi.type.Type) BigQueryException(com.google.cloud.bigquery.BigQueryException) BigQueryType.toField(io.trino.plugin.bigquery.BigQueryType.toField) ConnectorTableMetadata(io.trino.spi.connector.ConnectorTableMetadata) DatasetId(com.google.cloud.bigquery.DatasetId) Function(java.util.function.Function) Inject(javax.inject.Inject) VarcharType(io.trino.spi.type.VarcharType) ImmutableList(com.google.common.collect.ImmutableList) Objects.requireNonNull(java.util.Objects.requireNonNull) ColumnHandle(io.trino.spi.connector.ColumnHandle) Table(com.google.cloud.bigquery.Table) ConstraintApplicationResult(io.trino.spi.connector.ConstraintApplicationResult) RecordCursor(io.trino.spi.connector.RecordCursor) ConnectorSession(io.trino.spi.connector.ConnectorSession) TupleDomain(io.trino.spi.predicate.TupleDomain) InMemoryRecordSet(io.trino.spi.connector.InMemoryRecordSet) ConnectorTableProperties(io.trino.spi.connector.ConnectorTableProperties) ConnectorExpression(io.trino.spi.expression.ConnectorExpression) DatasetInfo(com.google.cloud.bigquery.DatasetInfo) TableInfo(com.google.cloud.bigquery.TableInfo) ConnectorTransactionHandle(io.trino.spi.connector.ConnectorTransactionHandle)

Example 19 with Constraint

Use of io.trino.spi.connector.Constraint in project trino by trinodb.

In class BaseIcebergConnectorTest, method assertFilterPushdown:

private void assertFilterPushdown(QualifiedObjectName tableName, Map<String, Domain> filter, Map<String, Domain> expectedEnforcedPredicate, Map<String, Domain> expectedUnenforcedPredicate) {
    Metadata metadata = getQueryRunner().getMetadata();
    newTransaction().execute(getSession(), session -> {
        TableHandle table = metadata.getTableHandle(session, tableName).orElseThrow(() -> new TableNotFoundException(tableName.asSchemaTableName()));
        Map<String, ColumnHandle> columns = metadata.getColumnHandles(session, table);
        TupleDomain<ColumnHandle> domains = TupleDomain.withColumnDomains(filter.entrySet().stream()
                .collect(toImmutableMap(entry -> columns.get(entry.getKey()), Map.Entry::getValue)));
        Optional<ConstraintApplicationResult<TableHandle>> result = metadata.applyFilter(session, table, new Constraint(domains));
        assertTrue(result.isEmpty() == (expectedUnenforcedPredicate == null && expectedEnforcedPredicate == null));
        if (result.isPresent()) {
            IcebergTableHandle newTable = (IcebergTableHandle) result.get().getHandle().getConnectorHandle();
            assertEquals(newTable.getEnforcedPredicate(), TupleDomain.withColumnDomains(expectedEnforcedPredicate.entrySet().stream()
                    .collect(toImmutableMap(entry -> columns.get(entry.getKey()), Map.Entry::getValue))));
            assertEquals(newTable.getUnenforcedPredicate(), TupleDomain.withColumnDomains(expectedUnenforcedPredicate.entrySet().stream()
                    .collect(toImmutableMap(entry -> columns.get(entry.getKey()), Map.Entry::getValue))));
        }
    });
}
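A hypothetical invocation of this helper (the table and the expectations are invented for illustration): asserting that a single-value filter on nation.regionkey is accepted by applyFilter but left unenforced might look like this:

assertFilterPushdown(
        new QualifiedObjectName("iceberg", "tpch", "nation"),
        ImmutableMap.of("regionkey", Domain.singleValue(BIGINT, 1L)),   // filter to apply
        ImmutableMap.of(),                                              // expected enforced predicate
        ImmutableMap.of("regionkey", Domain.singleValue(BIGINT, 1L)));  // expected unenforced predicate

Whether the predicate lands in the enforced or the unenforced map depends on the table's partitioning, so the expected values here are illustrative only.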
Also used : SkipException(org.testng.SkipException) FileSystem(org.apache.hadoop.fs.FileSystem) Test(org.testng.annotations.Test) TestTable(io.trino.testing.sql.TestTable) BROADCAST(io.trino.sql.planner.OptimizerConfig.JoinDistributionType.BROADCAST) TableNotFoundException(io.trino.spi.connector.TableNotFoundException) Matcher(java.util.regex.Matcher) Map(java.util.Map) MaterializedRow(io.trino.testing.MaterializedRow) Path(java.nio.file.Path) Assert.assertFalse(org.testng.Assert.assertFalse) Assert.assertEquals(io.trino.testing.assertions.Assert.assertEquals) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) Assert.assertNotEquals(org.testng.Assert.assertNotEquals) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) Schema(org.apache.avro.Schema) DataProviders(io.trino.testing.DataProviders) Domain(io.trino.spi.predicate.Domain) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) DataFileWriter(org.apache.avro.file.DataFileWriter) HDFS_ENVIRONMENT(io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT) Collectors.joining(java.util.stream.Collectors.joining) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) TestingSession.testSessionBuilder(io.trino.testing.TestingSession.testSessionBuilder) Stream(java.util.stream.Stream) ORC(io.trino.plugin.iceberg.IcebergFileFormat.ORC) Session(io.trino.Session) NullableValue(io.trino.spi.predicate.NullableValue) GenericData(org.apache.avro.generic.GenericData) VARCHAR(io.trino.spi.type.VarcharType.VARCHAR) String.join(java.lang.String.join) ColumnHandle(io.trino.spi.connector.ColumnHandle) ConstraintApplicationResult(io.trino.spi.connector.ConstraintApplicationResult) LongStream(java.util.stream.LongStream) OperatorStats(io.trino.operator.OperatorStats) Language(org.intellij.lang.annotations.Language) Files(java.nio.file.Files) IOException(java.io.IOException) Iterables.getOnlyElement(com.google.common.collect.Iterables.getOnlyElement) File(java.io.File) BaseConnectorTest(io.trino.testing.BaseConnectorTest) DOUBLE(io.trino.spi.type.DoubleType.DOUBLE) TableHandle(io.trino.metadata.TableHandle) QualifiedObjectName(io.trino.metadata.QualifiedObjectName) Paths(java.nio.file.Paths) QueryRunner(io.trino.testing.QueryRunner) Domain.multipleValues(io.trino.spi.predicate.Domain.multipleValues) QueryId(io.trino.spi.QueryId) TransactionBuilder.transaction(io.trino.transaction.TransactionBuilder.transaction) IntStream.range(java.util.stream.IntStream.range) MaterializedResult(io.trino.testing.MaterializedResult) Assertions.assertThat(org.assertj.core.api.Assertions.assertThat) PREFERRED_WRITE_PARTITIONING_MIN_NUMBER_OF_PARTITIONS(io.trino.SystemSessionProperties.PREFERRED_WRITE_PARTITIONING_MIN_NUMBER_OF_PARTITIONS) MoreCollectors.onlyElement(com.google.common.collect.MoreCollectors.onlyElement) ICEBERG_DOMAIN_COMPACTION_THRESHOLD(io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) QueryAssertions.assertEqualsIgnoreOrder(io.trino.testing.QueryAssertions.assertEqualsIgnoreOrder) Locale(java.util.Locale) Iterables.concat(com.google.common.collect.Iterables.concat) TestingConnectorBehavior(io.trino.testing.TestingConnectorBehavior) TestTable.randomTableSuffix(io.trino.testing.sql.TestTable.randomTableSuffix) TpchTable(io.trino.tpch.TpchTable) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) 
Predicate(java.util.function.Predicate) Collections.nCopies(java.util.Collections.nCopies) Collectors(java.util.stream.Collectors) String.format(java.lang.String.format) ICEBERG_CATALOG(io.trino.plugin.iceberg.IcebergQueryRunner.ICEBERG_CATALOG) DataSize(io.airlift.units.DataSize) List(java.util.List) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) BIGINT(io.trino.spi.type.BigintType.BIGINT) Domain.singleValue(io.trino.spi.predicate.Domain.singleValue) Optional(java.util.Optional) Pattern(java.util.regex.Pattern) DataFileReader(org.apache.avro.file.DataFileReader) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) IcebergQueryRunner.createIcebergQueryRunner(io.trino.plugin.iceberg.IcebergQueryRunner.createIcebergQueryRunner) LINE_ITEM(io.trino.tpch.TpchTable.LINE_ITEM) IntStream(java.util.stream.IntStream) Constraint(io.trino.spi.connector.Constraint) DataProvider(org.testng.annotations.DataProvider) PARQUET(io.trino.plugin.iceberg.IcebergFileFormat.PARQUET) ImmutableList(com.google.common.collect.ImmutableList) Assertions.assertThatThrownBy(org.assertj.core.api.Assertions.assertThatThrownBy) Objects.requireNonNull(java.util.Objects.requireNonNull) TableStatistics(io.trino.spi.statistics.TableStatistics) NoSuchElementException(java.util.NoSuchElementException) VerifyException(com.google.common.base.VerifyException) OutputStream(java.io.OutputStream) ResultWithQueryId(io.trino.testing.ResultWithQueryId) TupleDomain(io.trino.spi.predicate.TupleDomain) Consumer(java.util.function.Consumer) Assert.assertEventually(io.trino.testing.assertions.Assert.assertEventually) JOIN_DISTRIBUTION_TYPE(io.trino.SystemSessionProperties.JOIN_DISTRIBUTION_TYPE) Metadata(io.trino.metadata.Metadata) Assert.assertTrue(org.testng.Assert.assertTrue) MaterializedResult.resultBuilder(io.trino.testing.MaterializedResult.resultBuilder)

Example 20 with Constraint

Use of io.trino.spi.connector.Constraint in project trino by trinodb.

In class TableStatisticsMaker, method makeTableStatistics:

private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Constraint constraint) {
    if (tableHandle.getSnapshotId().isEmpty() || constraint.getSummary().isNone()) {
        return TableStatistics.empty();
    }
    TupleDomain<IcebergColumnHandle> intersection = constraint.getSummary()
            .transformKeys(IcebergColumnHandle.class::cast)
            .intersect(tableHandle.getEnforcedPredicate());
    if (intersection.isNone()) {
        return TableStatistics.empty();
    }
    Schema icebergTableSchema = icebergTable.schema();
    List<Types.NestedField> columns = icebergTableSchema.columns();
    Map<Integer, Type.PrimitiveType> idToTypeMapping = primitiveFieldTypes(icebergTableSchema);
    List<PartitionField> partitionFields = icebergTable.spec().fields();
    List<Type> icebergPartitionTypes = partitionTypes(partitionFields, idToTypeMapping);
    List<IcebergColumnHandle> columnHandles = getColumns(icebergTableSchema, typeManager);
    Map<Integer, IcebergColumnHandle> idToColumnHandle = columnHandles.stream().collect(toUnmodifiableMap(IcebergColumnHandle::getId, identity()));
    ImmutableMap.Builder<Integer, ColumnFieldDetails> idToDetailsBuilder = ImmutableMap.builder();
    for (int index = 0; index < partitionFields.size(); index++) {
        PartitionField field = partitionFields.get(index);
        Type type = icebergPartitionTypes.get(index);
        idToDetailsBuilder.put(field.fieldId(), new ColumnFieldDetails(field, idToColumnHandle.get(field.sourceId()), type, toTrinoType(type, typeManager), type.typeId().javaClass()));
    }
    Map<Integer, ColumnFieldDetails> idToDetails = idToDetailsBuilder.buildOrThrow();
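    // Plan a snapshot-pinned scan restricted by the intersected predicate;
    // includeColumnStats() asks Iceberg to include per-file column statistics (min/max, null counts).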
    TableScan tableScan = icebergTable.newScan()
            .filter(toIcebergExpression(intersection))
            .useSnapshot(tableHandle.getSnapshotId().get())
            .includeColumnStats();
    IcebergStatistics.Builder icebergStatisticsBuilder = new IcebergStatistics.Builder(columns, typeManager);
    try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
        for (FileScanTask fileScanTask : fileScanTasks) {
            DataFile dataFile = fileScanTask.file();
            if (!dataFileMatches(dataFile, constraint, partitionFields, idToDetails)) {
                continue;
            }
            icebergStatisticsBuilder.acceptDataFile(dataFile, fileScanTask.spec());
        }
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
    IcebergStatistics summary = icebergStatisticsBuilder.build();
    if (summary.getFileCount() == 0) {
        return TableStatistics.empty();
    }
    ImmutableMap.Builder<ColumnHandle, ColumnStatistics> columnHandleBuilder = ImmutableMap.builder();
    double recordCount = summary.getRecordCount();
    for (IcebergColumnHandle columnHandle : idToColumnHandle.values()) {
        int fieldId = columnHandle.getId();
        ColumnStatistics.Builder columnBuilder = new ColumnStatistics.Builder();
        Long nullCount = summary.getNullCounts().get(fieldId);
        if (nullCount != null) {
            columnBuilder.setNullsFraction(Estimate.of(nullCount / recordCount));
        }
        if (summary.getColumnSizes() != null) {
            Long columnSize = summary.getColumnSizes().get(fieldId);
            if (columnSize != null) {
                columnBuilder.setDataSize(Estimate.of(columnSize));
            }
        }
        Object min = summary.getMinValues().get(fieldId);
        Object max = summary.getMaxValues().get(fieldId);
        if (min != null && max != null) {
            columnBuilder.setRange(DoubleRange.from(columnHandle.getType(), min, max));
        }
        columnHandleBuilder.put(columnHandle, columnBuilder.build());
    }
    return new TableStatistics(Estimate.of(recordCount), columnHandleBuilder.buildOrThrow());
}
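The per-column builder calls at the end come straight from the Trino statistics SPI. A self-contained sketch of the same calls with invented numbers (100 rows, 10 of them null, values spanning 0 to 99):

import io.trino.spi.statistics.ColumnStatistics;
import io.trino.spi.statistics.DoubleRange;
import io.trino.spi.statistics.Estimate;

final class ColumnStatsSketch {
    static ColumnStatistics example() {
        ColumnStatistics.Builder builder = new ColumnStatistics.Builder();
        builder.setNullsFraction(Estimate.of(10.0 / 100)); // 10 nulls out of 100 rows
        builder.setRange(new DoubleRange(0, 99));          // observed min and max
        return builder.build();
    }
}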
Also used : Schema(org.apache.iceberg.Schema) UncheckedIOException(java.io.UncheckedIOException) DataFile(org.apache.iceberg.DataFile) PartitionField(org.apache.iceberg.PartitionField) ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) TableScan(org.apache.iceberg.TableScan) ColumnHandle(io.trino.spi.connector.ColumnHandle) IOException(java.io.IOException) ImmutableMap(com.google.common.collect.ImmutableMap) Constraint(io.trino.spi.connector.Constraint) TypeConverter.toTrinoType(io.trino.plugin.iceberg.TypeConverter.toTrinoType) Type(org.apache.iceberg.types.Type) TableStatistics(io.trino.spi.statistics.TableStatistics) FileScanTask(org.apache.iceberg.FileScanTask)

Aggregations

Constraint (io.trino.spi.connector.Constraint): 41 usages
ColumnHandle (io.trino.spi.connector.ColumnHandle): 32 usages
SchemaTableName (io.trino.spi.connector.SchemaTableName): 28 usages
TupleDomain (io.trino.spi.predicate.TupleDomain): 27 usages
ConnectorSession (io.trino.spi.connector.ConnectorSession): 26 usages
Domain (io.trino.spi.predicate.Domain): 23 usages
Test (org.testng.annotations.Test): 23 usages
ConnectorTableHandle (io.trino.spi.connector.ConnectorTableHandle): 20 usages
ConnectorMetadata (io.trino.spi.connector.ConnectorMetadata): 18 usages
ConstraintApplicationResult (io.trino.spi.connector.ConstraintApplicationResult): 18 usages
Map (java.util.Map): 18 usages
Objects.requireNonNull (java.util.Objects.requireNonNull): 18 usages
Optional (java.util.Optional): 18 usages
ImmutableMap (com.google.common.collect.ImmutableMap): 17 usages
List (java.util.List): 17 usages
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList): 16 usages
ImmutableList (com.google.common.collect.ImmutableList): 15 usages
Set (java.util.Set): 15 usages
ImmutableMap.toImmutableMap (com.google.common.collect.ImmutableMap.toImmutableMap): 14 usages
ColumnMetadata (io.trino.spi.connector.ColumnMetadata): 13 usages