Use of io.trino.spi.connector.Constraint in project trino by trinodb.
From the class HiveMetastoreBackedDeltaLakeMetastore, method getTableStatistics:
@Override
public TableStatistics getTableStatistics(ConnectorSession session, DeltaLakeTableHandle tableHandle, Constraint constraint)
{
    TableSnapshot tableSnapshot = getSnapshot(tableHandle.getSchemaTableName(), session);

    double numRecords = 0L;

    MetadataEntry metadata = transactionLogAccess.getMetadataEntry(tableSnapshot, session)
            .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableHandle.getTableName()));
    List<ColumnMetadata> columnMetadata = DeltaLakeSchemaSupport.extractSchema(metadata, typeManager);
    List<DeltaLakeColumnHandle> columns = columnMetadata.stream()
            .map(columnMeta -> new DeltaLakeColumnHandle(
                    columnMeta.getName(),
                    columnMeta.getType(),
                    metadata.getCanonicalPartitionColumns().contains(columnMeta.getName()) ? PARTITION_KEY : REGULAR))
            .collect(toImmutableList());

    Map<DeltaLakeColumnHandle, Double> nullCounts = new HashMap<>();
    columns.forEach(column -> nullCounts.put(column, 0.0));
    Map<DeltaLakeColumnHandle, Double> minValues = new HashMap<>();
    Map<DeltaLakeColumnHandle, Double> maxValues = new HashMap<>();
    Map<DeltaLakeColumnHandle, Set<String>> partitioningColumnsDistinctValues = new HashMap<>();
    columns.stream()
            .filter(column -> column.getColumnType() == PARTITION_KEY)
            .forEach(column -> partitioningColumnsDistinctValues.put(column, new HashSet<>()));

    if (tableHandle.getEnforcedPartitionConstraint().isNone()
            || tableHandle.getNonPartitionConstraint().isNone()
            || constraint.getSummary().isNone()) {
        return createZeroStatistics(columns);
    }

    Set<String> predicatedColumnNames = tableHandle.getNonPartitionConstraint().getDomains().orElseThrow().keySet().stream()
            .map(DeltaLakeColumnHandle::getName)
            .collect(toImmutableSet());
    List<ColumnMetadata> predicatedColumns = columnMetadata.stream()
            .filter(column -> predicatedColumnNames.contains(column.getName()))
            .collect(toImmutableList());

    for (AddFileEntry addEntry : transactionLogAccess.getActiveFiles(tableSnapshot, session)) {
        Optional<? extends DeltaLakeFileStatistics> fileStatistics = addEntry.getStats();
        if (fileStatistics.isEmpty()) {
            // Open source Delta Lake does not collect stats
            return TableStatistics.empty();
        }
        DeltaLakeFileStatistics stats = fileStatistics.get();

        if (!partitionMatchesPredicate(addEntry.getCanonicalPartitionValues(), tableHandle.getEnforcedPartitionConstraint().getDomains().orElseThrow())) {
            continue;
        }

        TupleDomain<DeltaLakeColumnHandle> statisticsPredicate = createStatisticsPredicate(
                addEntry,
                predicatedColumns,
                tableHandle.getMetadataEntry().getCanonicalPartitionColumns());
        if (!tableHandle.getNonPartitionConstraint().overlaps(statisticsPredicate)) {
            continue;
        }

        if (stats.getNumRecords().isEmpty()) {
            // Not clear if it's possible for stats to be present with no row count, but bail out if that happens
            return TableStatistics.empty();
        }
        numRecords += stats.getNumRecords().get();

        for (DeltaLakeColumnHandle column : columns) {
            if (column.getColumnType() == PARTITION_KEY) {
                Optional<String> partitionValue = addEntry.getCanonicalPartitionValues().get(column.getName());
                if (partitionValue.isEmpty()) {
                    nullCounts.merge(column, (double) stats.getNumRecords().get(), Double::sum);
                }
                else {
                    // NULL is not counted as a distinct value
                    // Code below assumes that values returned by addEntry.getCanonicalPartitionValues() are normalized,
                    // it may not be true in case of real, doubles, timestamps etc
                    partitioningColumnsDistinctValues.get(column).add(partitionValue.get());
                }
            }
            else {
                Optional<Long> maybeNullCount = stats.getNullCount(column.getName());
                if (maybeNullCount.isPresent()) {
                    nullCounts.put(column, nullCounts.get(column) + maybeNullCount.get());
                }
                else {
                    // If any individual file fails to report null counts, fail to calculate the total for the table
                    nullCounts.put(column, NaN);
                }
            }

            // Math.min returns NaN if any operand is NaN
            stats.getMinColumnValue(column)
                    .map(parsedValue -> toStatsRepresentation(column.getType(), parsedValue))
                    .filter(OptionalDouble::isPresent)
                    .map(OptionalDouble::getAsDouble)
                    .ifPresent(parsedValueAsDouble -> minValues.merge(column, parsedValueAsDouble, Math::min));

            stats.getMaxColumnValue(column)
                    .map(parsedValue -> toStatsRepresentation(column.getType(), parsedValue))
                    .filter(OptionalDouble::isPresent)
                    .map(OptionalDouble::getAsDouble)
                    .ifPresent(parsedValueAsDouble -> maxValues.merge(column, parsedValueAsDouble, Math::max));
        }
    }

    if (numRecords == 0) {
        return createZeroStatistics(columns);
    }

    TableStatistics.Builder statsBuilder = new TableStatistics.Builder().setRowCount(Estimate.of(numRecords));

    Optional<DeltaLakeStatistics> statistics = Optional.empty();
    if (isExtendedStatisticsEnabled(session)) {
        statistics = statisticsAccess.readDeltaLakeStatistics(session, tableHandle.getLocation());
    }

    for (DeltaLakeColumnHandle column : columns) {
        ColumnStatistics.Builder columnStatsBuilder = new ColumnStatistics.Builder();
        Double nullCount = nullCounts.get(column);
        columnStatsBuilder.setNullsFraction(nullCount.isNaN() ? Estimate.unknown() : Estimate.of(nullCount / numRecords));

        Double maxValue = maxValues.get(column);
        Double minValue = minValues.get(column);
        if (isValidInRange(maxValue) && isValidInRange(minValue)) {
            columnStatsBuilder.setRange(new DoubleRange(minValue, maxValue));
        }
        else if (isValidInRange(maxValue)) {
            columnStatsBuilder.setRange(new DoubleRange(NEGATIVE_INFINITY, maxValue));
        }
        else if (isValidInRange(minValue)) {
            columnStatsBuilder.setRange(new DoubleRange(minValue, POSITIVE_INFINITY));
        }

        // extend statistics with NDV
        if (column.getColumnType() == PARTITION_KEY) {
            columnStatsBuilder.setDistinctValuesCount(Estimate.of(partitioningColumnsDistinctValues.get(column).size()));
        }
        if (statistics.isPresent()) {
            DeltaLakeColumnStatistics deltaLakeColumnStatistics = statistics.get().getColumnStatistics().get(column.getName());
            if (deltaLakeColumnStatistics != null && column.getColumnType() != PARTITION_KEY) {
                columnStatsBuilder.setDistinctValuesCount(Estimate.of(deltaLakeColumnStatistics.getNdvSummary().cardinality()));
            }
        }

        statsBuilder.setColumnStatistics(column, columnStatsBuilder.build());
    }

    return statsBuilder.build();
}
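For orientation, here is a minimal caller-side sketch of how a Constraint might reach this method. The metastore, session, and tableHandle variables, and the partition column name and value, are hypothetical; the Constraint(TupleDomain) constructor and the DeltaLakeColumnHandle signature match the usage above.

    // Hypothetical sketch: restrict statistics to rows where the (assumed) partition
    // column "region" equals "emea". VARCHAR values are represented as Slices.
    DeltaLakeColumnHandle regionKey = new DeltaLakeColumnHandle("region", VarcharType.VARCHAR, PARTITION_KEY);
    TupleDomain<ColumnHandle> summary = TupleDomain.withColumnDomains(
            ImmutableMap.of(regionKey, Domain.singleValue(VarcharType.VARCHAR, Slices.utf8Slice("emea"))));
    TableStatistics stats = metastore.getTableStatistics(session, tableHandle, new Constraint(summary));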
Use of io.trino.spi.connector.Constraint in project trino by trinodb.
From the class DefaultJdbcMetadata, method applyFilter:
@Override
public Optional<ConstraintApplicationResult<ConnectorTableHandle>> applyFilter(ConnectorSession session, ConnectorTableHandle table, Constraint constraint)
{
    JdbcTableHandle handle = (JdbcTableHandle) table;

    if (handle.getSortOrder().isPresent() && handle.getLimit().isPresent()) {
        handle = flushAttributesAsQuery(session, handle);
    }

    TupleDomain<ColumnHandle> oldDomain = handle.getConstraint();
    TupleDomain<ColumnHandle> newDomain = oldDomain.intersect(constraint.getSummary());
    List<String> newConstraintExpressions;
    TupleDomain<ColumnHandle> remainingFilter;
    Optional<ConnectorExpression> remainingExpression;
    if (newDomain.isNone()) {
        newConstraintExpressions = ImmutableList.of();
        remainingFilter = TupleDomain.all();
        remainingExpression = Optional.of(Constant.TRUE);
    }
    else {
        Map<ColumnHandle, Domain> domains = newDomain.getDomains().orElseThrow();
        List<JdbcColumnHandle> columnHandles = domains.keySet().stream()
                .map(JdbcColumnHandle.class::cast)
                .collect(toImmutableList());

        List<ColumnMapping> columnMappings = jdbcClient.toColumnMappings(
                session,
                columnHandles.stream()
                        .map(JdbcColumnHandle::getJdbcTypeHandle)
                        .collect(toImmutableList()));

        Map<ColumnHandle, Domain> supported = new HashMap<>();
        Map<ColumnHandle, Domain> unsupported = new HashMap<>();
        for (int i = 0; i < columnHandles.size(); i++) {
            JdbcColumnHandle column = columnHandles.get(i);
            ColumnMapping mapping = columnMappings.get(i);
            DomainPushdownResult pushdownResult = mapping.getPredicatePushdownController().apply(session, domains.get(column));
            supported.put(column, pushdownResult.getPushedDown());
            unsupported.put(column, pushdownResult.getRemainingFilter());
        }

        newDomain = TupleDomain.withColumnDomains(supported);
        remainingFilter = TupleDomain.withColumnDomains(unsupported);

        if (isComplexExpressionPushdown(session)) {
            List<String> newExpressions = new ArrayList<>();
            List<ConnectorExpression> remainingExpressions = new ArrayList<>();
            for (ConnectorExpression expression : extractConjuncts(constraint.getExpression())) {
                Optional<String> converted = jdbcClient.convertPredicate(session, expression, constraint.getAssignments());
                if (converted.isPresent()) {
                    newExpressions.add(converted.get());
                }
                else {
                    remainingExpressions.add(expression);
                }
            }
            newConstraintExpressions = ImmutableSet.<String>builder()
                    .addAll(handle.getConstraintExpressions())
                    .addAll(newExpressions)
                    .build()
                    .asList();
            remainingExpression = Optional.of(and(remainingExpressions));
        }
        else {
            newConstraintExpressions = ImmutableList.of();
            remainingExpression = Optional.empty();
        }
    }

    if (oldDomain.equals(newDomain) && handle.getConstraintExpressions().equals(newConstraintExpressions)) {
        return Optional.empty();
    }

    handle = new JdbcTableHandle(
            handle.getRelationHandle(),
            newDomain,
            newConstraintExpressions,
            handle.getSortOrder(),
            handle.getLimit(),
            handle.getColumns(),
            handle.getOtherReferencedTables(),
            handle.getNextSyntheticColumnId());

    return Optional.of(remainingExpression.isPresent()
            ? new ConstraintApplicationResult<>(handle, remainingFilter, remainingExpression.get(), false)
            : new ConstraintApplicationResult<>(handle, remainingFilter, false));
}
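Two details of this method generalize to any connector: intersect the incoming summary with what the handle already holds, and return Optional.empty() when nothing changed, so the optimizer does not loop re-applying the same filter. A stripped-down sketch of that skeleton for a hypothetical connector (MyTableHandle and its withConstraint method are assumptions, not Trino API):

    @Override
    public Optional<ConstraintApplicationResult<ConnectorTableHandle>> applyFilter(ConnectorSession session, ConnectorTableHandle table, Constraint constraint)
    {
        MyTableHandle handle = (MyTableHandle) table;
        TupleDomain<ColumnHandle> newDomain = handle.getConstraint().intersect(constraint.getSummary());
        if (newDomain.equals(handle.getConstraint())) {
            // Nothing new to push down; returning empty stops the optimizer from looping
            return Optional.empty();
        }
        // Claim the full summary as handled and report no remaining filter to the engine
        return Optional.of(new ConstraintApplicationResult<>(handle.withConstraint(newDomain), TupleDomain.all(), false));
    }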
Use of io.trino.spi.connector.Constraint in project trino by trinodb.
From the class BigQueryMetadata, method getViewDefinitionSystemTable:
private Optional<SystemTable> getViewDefinitionSystemTable(ConnectorSession session, SchemaTableName viewDefinitionTableName, SchemaTableName sourceTableName)
{
    BigQueryClient client = bigQueryClientFactory.create(session);
    String projectId = getProjectId(client);
    String remoteSchemaName = client.toRemoteDataset(projectId, sourceTableName.getSchemaName())
            .map(RemoteDatabaseObject::getOnlyRemoteName)
            .orElseThrow(() -> new TableNotFoundException(viewDefinitionTableName));
    String remoteTableName = client.toRemoteTable(projectId, remoteSchemaName, sourceTableName.getTableName())
            .map(RemoteDatabaseObject::getOnlyRemoteName)
            .orElseThrow(() -> new TableNotFoundException(viewDefinitionTableName));
    TableInfo tableInfo = client.getTable(TableId.of(projectId, remoteSchemaName, remoteTableName))
            .orElseThrow(() -> new TableNotFoundException(viewDefinitionTableName));
    if (!(tableInfo.getDefinition() instanceof ViewDefinition)) {
        throw new TableNotFoundException(viewDefinitionTableName);
    }

    List<ColumnMetadata> columns = ImmutableList.of(new ColumnMetadata("query", VarcharType.VARCHAR));
    List<Type> types = columns.stream()
            .map(ColumnMetadata::getType)
            .collect(toImmutableList());
    Optional<String> query = Optional.ofNullable(((ViewDefinition) tableInfo.getDefinition()).getQuery());
    Iterable<List<Object>> propertyValues = ImmutableList.of(ImmutableList.of(query.orElse("NULL")));

    return Optional.of(createSystemTable(
            new ConnectorTableMetadata(sourceTableName, columns),
            constraint -> new InMemoryRecordSet(types, propertyValues).cursor()));
}
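Note that the lambda passed to createSystemTable ignores its constraint argument: the single-row record set is returned no matter what predicate the engine supplies. A self-contained sketch of just the record-set construction, with the query text a placeholder:

    // Single VARCHAR column, single row; plain String values work here,
    // as the method above relies on when passing query.orElse("NULL").
    List<Type> types = ImmutableList.of(VarcharType.VARCHAR);
    List<List<Object>> rows = ImmutableList.of(ImmutableList.of("SELECT 1 AS placeholder"));
    RecordCursor cursor = new InMemoryRecordSet(types, rows).cursor();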
Use of io.trino.spi.connector.Constraint in project trino by trinodb.
From the class BaseIcebergConnectorTest, method assertFilterPushdown:
private void assertFilterPushdown(
        QualifiedObjectName tableName,
        Map<String, Domain> filter,
        Map<String, Domain> expectedEnforcedPredicate,
        Map<String, Domain> expectedUnenforcedPredicate)
{
    Metadata metadata = getQueryRunner().getMetadata();
    newTransaction().execute(getSession(), session -> {
        TableHandle table = metadata.getTableHandle(session, tableName)
                .orElseThrow(() -> new TableNotFoundException(tableName.asSchemaTableName()));
        Map<String, ColumnHandle> columns = metadata.getColumnHandles(session, table);
        TupleDomain<ColumnHandle> domains = TupleDomain.withColumnDomains(filter.entrySet().stream()
                .collect(toImmutableMap(entry -> columns.get(entry.getKey()), Map.Entry::getValue)));
        Optional<ConstraintApplicationResult<TableHandle>> result = metadata.applyFilter(session, table, new Constraint(domains));
        assertTrue(result.isEmpty() == (expectedUnenforcedPredicate == null && expectedEnforcedPredicate == null));
        if (result.isPresent()) {
            IcebergTableHandle newTable = (IcebergTableHandle) result.get().getHandle().getConnectorHandle();
            assertEquals(
                    newTable.getEnforcedPredicate(),
                    TupleDomain.withColumnDomains(expectedEnforcedPredicate.entrySet().stream()
                            .collect(toImmutableMap(entry -> columns.get(entry.getKey()), Map.Entry::getValue))));
            assertEquals(
                    newTable.getUnenforcedPredicate(),
                    TupleDomain.withColumnDomains(expectedUnenforcedPredicate.entrySet().stream()
                            .collect(toImmutableMap(entry -> columns.get(entry.getKey()), Map.Entry::getValue))));
        }
    });
}
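A hypothetical invocation of the helper; the table name, column, and the expected split between enforced and unenforced predicates are all assumed for illustration:

    // Assumed scenario: the orderkey predicate cannot be enforced by partition
    // pruning alone, so it is expected back in full as the unenforced predicate.
    assertFilterPushdown(
            new QualifiedObjectName("iceberg", "tpch", "orders"),
            ImmutableMap.of("orderkey", Domain.singleValue(BIGINT, 42L)),
            ImmutableMap.of(),
            ImmutableMap.of("orderkey", Domain.singleValue(BIGINT, 42L)));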
Use of io.trino.spi.connector.Constraint in project trino by trinodb.
From the class TableStatisticsMaker, method makeTableStatistics:
private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Constraint constraint)
{
    if (tableHandle.getSnapshotId().isEmpty() || constraint.getSummary().isNone()) {
        return TableStatistics.empty();
    }

    TupleDomain<IcebergColumnHandle> intersection = constraint.getSummary()
            .transformKeys(IcebergColumnHandle.class::cast)
            .intersect(tableHandle.getEnforcedPredicate());
    if (intersection.isNone()) {
        return TableStatistics.empty();
    }

    Schema icebergTableSchema = icebergTable.schema();
    List<Types.NestedField> columns = icebergTableSchema.columns();

    Map<Integer, Type.PrimitiveType> idToTypeMapping = primitiveFieldTypes(icebergTableSchema);
    List<PartitionField> partitionFields = icebergTable.spec().fields();
    List<Type> icebergPartitionTypes = partitionTypes(partitionFields, idToTypeMapping);
    List<IcebergColumnHandle> columnHandles = getColumns(icebergTableSchema, typeManager);
    Map<Integer, IcebergColumnHandle> idToColumnHandle = columnHandles.stream()
            .collect(toUnmodifiableMap(IcebergColumnHandle::getId, identity()));

    ImmutableMap.Builder<Integer, ColumnFieldDetails> idToDetailsBuilder = ImmutableMap.builder();
    for (int index = 0; index < partitionFields.size(); index++) {
        PartitionField field = partitionFields.get(index);
        Type type = icebergPartitionTypes.get(index);
        idToDetailsBuilder.put(field.fieldId(), new ColumnFieldDetails(
                field,
                idToColumnHandle.get(field.sourceId()),
                type,
                toTrinoType(type, typeManager),
                type.typeId().javaClass()));
    }
    Map<Integer, ColumnFieldDetails> idToDetails = idToDetailsBuilder.buildOrThrow();

    TableScan tableScan = icebergTable.newScan()
            .filter(toIcebergExpression(intersection))
            .useSnapshot(tableHandle.getSnapshotId().get())
            .includeColumnStats();

    IcebergStatistics.Builder icebergStatisticsBuilder = new IcebergStatistics.Builder(columns, typeManager);
    try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
        for (FileScanTask fileScanTask : fileScanTasks) {
            DataFile dataFile = fileScanTask.file();
            if (!dataFileMatches(dataFile, constraint, partitionFields, idToDetails)) {
                continue;
            }
            icebergStatisticsBuilder.acceptDataFile(dataFile, fileScanTask.spec());
        }
    }
    catch (IOException e) {
        throw new UncheckedIOException(e);
    }

    IcebergStatistics summary = icebergStatisticsBuilder.build();
    if (summary.getFileCount() == 0) {
        return TableStatistics.empty();
    }

    ImmutableMap.Builder<ColumnHandle, ColumnStatistics> columnHandleBuilder = ImmutableMap.builder();
    double recordCount = summary.getRecordCount();
    for (IcebergColumnHandle columnHandle : idToColumnHandle.values()) {
        int fieldId = columnHandle.getId();
        ColumnStatistics.Builder columnBuilder = new ColumnStatistics.Builder();
        Long nullCount = summary.getNullCounts().get(fieldId);
        if (nullCount != null) {
            columnBuilder.setNullsFraction(Estimate.of(nullCount / recordCount));
        }
        if (summary.getColumnSizes() != null) {
            Long columnSize = summary.getColumnSizes().get(fieldId);
            if (columnSize != null) {
                columnBuilder.setDataSize(Estimate.of(columnSize));
            }
        }
        Object min = summary.getMinValues().get(fieldId);
        Object max = summary.getMaxValues().get(fieldId);
        if (min != null && max != null) {
            columnBuilder.setRange(DoubleRange.from(columnHandle.getType(), min, max));
        }
        columnHandleBuilder.put(columnHandle, columnBuilder.build());
    }
    return new TableStatistics(Estimate.of(recordCount), columnHandleBuilder.buildOrThrow());
}
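The two early returns above both reduce to TupleDomain intersection: if the constraint summary contradicts the handle's enforced predicate, the intersection "is none" and statistics collection is skipped. An isolated illustration of that algebra, using String keys and assumed values for brevity:

    // Ranges x > 0 and x < 10 intersect to 0 < x < 10 (still satisfiable);
    // intersecting with x > 20 instead would collapse the domain to none().
    TupleDomain<String> greaterThanZero = TupleDomain.withColumnDomains(ImmutableMap.of(
            "x", Domain.create(ValueSet.ofRanges(Range.greaterThan(BIGINT, 0L)), false)));
    TupleDomain<String> lessThanTen = TupleDomain.withColumnDomains(ImmutableMap.of(
            "x", Domain.create(ValueSet.ofRanges(Range.lessThan(BIGINT, 10L)), false)));
    boolean satisfiable = !greaterThanZero.intersect(lessThanTen).isNone(); // true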