Use of io.trino.spi.predicate.TupleDomain in project trino by trinodb.
From the class TestOrcPageSourceFactory, method readFile:
private static List<Nation> readFile(Map<NationColumn, Integer> columns, OptionalLong nationKeyPredicate, Optional<AcidInfo> acidInfo, String filePath, long fileSize)
{
    // No constraint unless a nation key was supplied
    TupleDomain<HiveColumnHandle> tupleDomain = TupleDomain.all();
    if (nationKeyPredicate.isPresent()) {
        tupleDomain = TupleDomain.withColumnDomains(ImmutableMap.of(
                toHiveColumnHandle(NATION_KEY, 0),
                Domain.singleValue(INTEGER, nationKeyPredicate.getAsLong())));
    }

    List<HiveColumnHandle> columnHandles = columns.entrySet().stream()
            .map(entry -> toHiveColumnHandle(entry.getKey(), entry.getValue()))
            .collect(toImmutableList());
    List<String> columnNames = columnHandles.stream()
            .map(HiveColumnHandle::getName)
            .collect(toImmutableList());

    Optional<ReaderPageSource> pageSourceWithProjections = PAGE_SOURCE_FACTORY.createPageSource(
            new JobConf(new Configuration(false)),
            SESSION,
            new Path(filePath),
            0,
            fileSize,
            fileSize,
            createSchema(),
            columnHandles,
            tupleDomain,
            acidInfo,
            OptionalInt.empty(),
            false,
            NO_ACID_TRANSACTION);
    checkArgument(pageSourceWithProjections.isPresent());
    checkArgument(pageSourceWithProjections.get().getReaderColumns().isEmpty(), "projected columns not expected here");
    ConnectorPageSource pageSource = pageSourceWithProjections.get().get();

    int nationKeyColumn = columnNames.indexOf("n_nationkey");
    int nameColumn = columnNames.indexOf("n_name");
    int regionKeyColumn = columnNames.indexOf("n_regionkey");
    int commentColumn = columnNames.indexOf("n_comment");

    ImmutableList.Builder<Nation> rows = ImmutableList.builder();
    while (!pageSource.isFinished()) {
        Page page = pageSource.getNextPage();
        if (page == null) {
            continue;
        }
        page = page.getLoadedPage();
        for (int position = 0; position < page.getPositionCount(); position++) {
            // Columns that were not read keep their sentinel values
            long nationKey = -42;
            if (nationKeyColumn >= 0) {
                nationKey = BIGINT.getLong(page.getBlock(nationKeyColumn), position);
            }
            String name = "<not read>";
            if (nameColumn >= 0) {
                name = VARCHAR.getSlice(page.getBlock(nameColumn), position).toStringUtf8();
            }
            long regionKey = -42;
            if (regionKeyColumn >= 0) {
                regionKey = BIGINT.getLong(page.getBlock(regionKeyColumn), position);
            }
            String comment = "<not read>";
            if (commentColumn >= 0) {
                comment = VARCHAR.getSlice(page.getBlock(commentColumn), position).toStringUtf8();
            }
            rows.add(new Nation(position, nationKey, name, regionKey, comment));
        }
    }
    return rows.build();
}
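
For reference, a minimal, self-contained sketch of the TupleDomain construction used above, outside the test harness. The String column key and the value 5 are hypothetical; TupleDomain is generic, so any type can serve as the column key.

import static io.trino.spi.type.IntegerType.INTEGER;

import com.google.common.collect.ImmutableMap;
import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.TupleDomain;

public class TupleDomainSketch
{
    public static void main(String[] args)
    {
        // TupleDomain.all() places no constraint on any column
        TupleDomain<String> unconstrained = TupleDomain.all();

        // Restrict a single column to one value, as readFile does for the nation key.
        // INTEGER values are represented as Java longs in the Trino SPI.
        TupleDomain<String> singleNation = TupleDomain.withColumnDomains(
                ImmutableMap.of("n_nationkey", Domain.singleValue(INTEGER, 5L)));

        System.out.println(unconstrained.isAll()); // true
        System.out.println(singleNation.getDomains().orElseThrow()); // single-value domain keyed by column
    }
}
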
Use of io.trino.spi.predicate.TupleDomain in project trino by trinodb.
From the class TestOrcPredicates, method createPageSource:
private ConnectorPageSource createPageSource(TupleDomain<TestColumn> effectivePredicate, List<TestColumn> columnsToRead, ConnectorSession session, FileSplit split)
{
    OrcPageSourceFactory readerFactory = new OrcPageSourceFactory(new OrcReaderOptions(), HDFS_ENVIRONMENT, STATS, UTC);

    Properties splitProperties = new Properties();
    splitProperties.setProperty(FILE_INPUT_FORMAT, ORC.getInputFormat());
    splitProperties.setProperty(SERIALIZATION_LIB, ORC.getSerde());

    // Use full columns in split properties
    ImmutableList.Builder<String> splitPropertiesColumnNames = ImmutableList.builder();
    ImmutableList.Builder<String> splitPropertiesColumnTypes = ImmutableList.builder();
    Set<String> baseColumnNames = new HashSet<>();
    for (TestColumn columnToRead : columnsToRead) {
        String name = columnToRead.getBaseName();
        if (!baseColumnNames.contains(name) && !columnToRead.isPartitionKey()) {
            baseColumnNames.add(name);
            splitPropertiesColumnNames.add(name);
            splitPropertiesColumnTypes.add(columnToRead.getBaseObjectInspector().getTypeName());
        }
    }
    splitProperties.setProperty("columns", String.join(",", splitPropertiesColumnNames.build()));
    splitProperties.setProperty("columns.types", String.join(",", splitPropertiesColumnTypes.build()));

    List<HivePartitionKey> partitionKeys = columnsToRead.stream()
            .filter(TestColumn::isPartitionKey)
            .map(input -> new HivePartitionKey(input.getName(), (String) input.getWriteValue()))
            .collect(toList());
    String partitionName = String.join("/", partitionKeys.stream()
            .map(partitionKey -> format("%s=%s", partitionKey.getName(), partitionKey.getValue()))
            .collect(toImmutableList()));
    List<HiveColumnHandle> columnHandles = getColumnHandles(columnsToRead);

    // Re-key the predicate from TestColumn to the corresponding HiveColumnHandle
    TupleDomain<HiveColumnHandle> predicate = effectivePredicate.transformKeys(testColumn -> {
        Optional<HiveColumnHandle> handle = columnHandles.stream()
                .filter(column -> testColumn.getName().equals(column.getName()))
                .findFirst();
        checkState(handle.isPresent(), "Predicate on invalid column");
        return handle.get();
    });

    List<HivePageSourceProvider.ColumnMapping> columnMappings = buildColumnMappings(
            partitionName, partitionKeys, columnHandles, ImmutableList.of(),
            TableToPartitionMapping.empty(), split.getPath(), OptionalInt.empty(),
            split.getLength(), Instant.now().toEpochMilli());
    Optional<ConnectorPageSource> pageSource = HivePageSourceProvider.createHivePageSource(
            ImmutableSet.of(readerFactory), ImmutableSet.of(), new Configuration(false), session,
            split.getPath(), OptionalInt.empty(), split.getStart(), split.getLength(), split.getLength(),
            splitProperties, predicate, columnHandles, TESTING_TYPE_MANAGER,
            Optional.empty(), Optional.empty(), false, Optional.empty(), false,
            NO_ACID_TRANSACTION, columnMappings);
    assertTrue(pageSource.isPresent());
    return pageSource.get();
}
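
The transformKeys call above is the standard way to re-key a TupleDomain from one column representation to another. A minimal sketch of the same pattern, with hypothetical String and Integer keys standing in for TestColumn and HiveColumnHandle:

import static io.trino.spi.type.BigintType.BIGINT;

import com.google.common.collect.ImmutableMap;
import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.TupleDomain;
import java.util.Map;

public class TransformKeysSketch
{
    public static void main(String[] args)
    {
        TupleDomain<String> byName = TupleDomain.withColumnDomains(
                ImmutableMap.of("user_id", Domain.singleValue(BIGINT, 7L)));

        // Map each logical name to an ordinal, the way the test maps TestColumn to HiveColumnHandle
        Map<String, Integer> ordinals = ImmutableMap.of("user_id", 0);

        // transformKeys must map every key; throwing for an unknown key mirrors
        // the checkState(...) in createPageSource above
        TupleDomain<Integer> byOrdinal = byName.transformKeys(name -> {
            Integer ordinal = ordinals.get(name);
            if (ordinal == null) {
                throw new IllegalStateException("Predicate on invalid column: " + name);
            }
            return ordinal;
        });

        System.out.println(byOrdinal.getDomains().orElseThrow()); // domain now keyed by ordinal 0
    }
}
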
Use of io.trino.spi.predicate.TupleDomain in project trino by trinodb.
From the class Util, method domainsMatch:
static boolean domainsMatch(TupleDomain<Predicate<ColumnHandle>> expected, TupleDomain<ColumnHandle> actual)
{
    Optional<Map<Predicate<ColumnHandle>, Domain>> expectedDomains = expected.getDomains();
    Optional<Map<ColumnHandle, Domain>> actualDomains = actual.getDomains();
    if (expectedDomains.isPresent() != actualDomains.isPresent()) {
        return false;
    }
    if (expectedDomains.isPresent()) {
        if (expectedDomains.get().size() != actualDomains.get().size()) {
            return false;
        }
        for (Map.Entry<Predicate<ColumnHandle>, Domain> entry : expectedDomains.get().entrySet()) {
            // There should be exactly one column matching the expected column matcher
            ColumnHandle actualColumn = Iterables.getOnlyElement(actualDomains.get().keySet().stream()
                    .filter(x -> entry.getKey().test(x))
                    .collect(toImmutableList()));
            if (!actualDomains.get().get(actualColumn).contains(entry.getValue())) {
                return false;
            }
        }
    }
    return true;
}
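
A sketch of how a test might call this helper, assuming it lives in the same package as Util. The NamedHandle record is hypothetical; ColumnHandle is a marker interface in the Trino SPI, so any implementation works:

import static io.trino.spi.type.BigintType.BIGINT;

import com.google.common.collect.ImmutableMap;
import io.trino.spi.connector.ColumnHandle;
import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.TupleDomain;
import java.util.function.Predicate;

public class DomainsMatchSketch
{
    record NamedHandle(String name) implements ColumnHandle {}

    public static void main(String[] args)
    {
        // Actual side: concrete handles, as a connector would report them
        TupleDomain<ColumnHandle> actual = TupleDomain.withColumnDomains(
                ImmutableMap.of(new NamedHandle("orderkey"), Domain.singleValue(BIGINT, 42L)));

        // Expected side: a matcher in place of a concrete handle
        TupleDomain<Predicate<ColumnHandle>> expected = TupleDomain.withColumnDomains(
                ImmutableMap.of(
                        (Predicate<ColumnHandle>) handle -> ((NamedHandle) handle).name().equals("orderkey"),
                        Domain.singleValue(BIGINT, 42L)));

        // true: exactly one actual column matches, and its domain contains the expected one
        System.out.println(Util.domainsMatch(expected, actual));
    }
}
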
Use of io.trino.spi.predicate.TupleDomain in project trino by trinodb.
From the class HiveMetastoreBackedDeltaLakeMetastore, method getTableStatistics:
@Override
public TableStatistics getTableStatistics(ConnectorSession session, DeltaLakeTableHandle tableHandle, Constraint constraint)
{
    TableSnapshot tableSnapshot = getSnapshot(tableHandle.getSchemaTableName(), session);
    double numRecords = 0;

    MetadataEntry metadata = transactionLogAccess.getMetadataEntry(tableSnapshot, session)
            .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableHandle.getTableName()));
    List<ColumnMetadata> columnMetadata = DeltaLakeSchemaSupport.extractSchema(metadata, typeManager);
    List<DeltaLakeColumnHandle> columns = columnMetadata.stream()
            .map(columnMeta -> new DeltaLakeColumnHandle(
                    columnMeta.getName(),
                    columnMeta.getType(),
                    metadata.getCanonicalPartitionColumns().contains(columnMeta.getName()) ? PARTITION_KEY : REGULAR))
            .collect(toImmutableList());

    Map<DeltaLakeColumnHandle, Double> nullCounts = new HashMap<>();
    columns.forEach(column -> nullCounts.put(column, 0.0));
    Map<DeltaLakeColumnHandle, Double> minValues = new HashMap<>();
    Map<DeltaLakeColumnHandle, Double> maxValues = new HashMap<>();
    Map<DeltaLakeColumnHandle, Set<String>> partitioningColumnsDistinctValues = new HashMap<>();
    columns.stream()
            .filter(column -> column.getColumnType() == PARTITION_KEY)
            .forEach(column -> partitioningColumnsDistinctValues.put(column, new HashSet<>()));

    if (tableHandle.getEnforcedPartitionConstraint().isNone() || tableHandle.getNonPartitionConstraint().isNone() || constraint.getSummary().isNone()) {
        return createZeroStatistics(columns);
    }

    Set<String> predicatedColumnNames = tableHandle.getNonPartitionConstraint().getDomains().orElseThrow().keySet().stream()
            .map(DeltaLakeColumnHandle::getName)
            .collect(toImmutableSet());
    List<ColumnMetadata> predicatedColumns = columnMetadata.stream()
            .filter(column -> predicatedColumnNames.contains(column.getName()))
            .collect(toImmutableList());

    for (AddFileEntry addEntry : transactionLogAccess.getActiveFiles(tableSnapshot, session)) {
        Optional<? extends DeltaLakeFileStatistics> fileStatistics = addEntry.getStats();
        if (fileStatistics.isEmpty()) {
            // Open source Delta Lake does not collect stats
            return TableStatistics.empty();
        }
        DeltaLakeFileStatistics stats = fileStatistics.get();
        if (!partitionMatchesPredicate(addEntry.getCanonicalPartitionValues(), tableHandle.getEnforcedPartitionConstraint().getDomains().orElseThrow())) {
            continue;
        }

        TupleDomain<DeltaLakeColumnHandle> statisticsPredicate = createStatisticsPredicate(
                addEntry,
                predicatedColumns,
                tableHandle.getMetadataEntry().getCanonicalPartitionColumns());
        if (!tableHandle.getNonPartitionConstraint().overlaps(statisticsPredicate)) {
            continue;
        }

        if (stats.getNumRecords().isEmpty()) {
            // Not clear if it's possible for stats to be present with no row count, but bail out if that happens
            return TableStatistics.empty();
        }
        numRecords += stats.getNumRecords().get();

        for (DeltaLakeColumnHandle column : columns) {
            if (column.getColumnType() == PARTITION_KEY) {
                Optional<String> partitionValue = addEntry.getCanonicalPartitionValues().get(column.getName());
                if (partitionValue.isEmpty()) {
                    nullCounts.merge(column, (double) stats.getNumRecords().get(), Double::sum);
                }
                else {
                    // NULL is not counted as a distinct value.
                    // The code below assumes that values returned by addEntry.getCanonicalPartitionValues() are normalized;
                    // this may not hold for real, double, timestamp, etc.
                    partitioningColumnsDistinctValues.get(column).add(partitionValue.get());
                }
            }
            else {
                Optional<Long> maybeNullCount = stats.getNullCount(column.getName());
                if (maybeNullCount.isPresent()) {
                    nullCounts.put(column, nullCounts.get(column) + maybeNullCount.get());
                }
                else {
                    // If any individual file fails to report null counts, fail to calculate the total for the table
                    nullCounts.put(column, NaN);
                }
            }

            // Math.min returns NaN if any operand is NaN
            stats.getMinColumnValue(column)
                    .map(parsedValue -> toStatsRepresentation(column.getType(), parsedValue))
                    .filter(OptionalDouble::isPresent)
                    .map(OptionalDouble::getAsDouble)
                    .ifPresent(parsedValueAsDouble -> minValues.merge(column, parsedValueAsDouble, Math::min));
            stats.getMaxColumnValue(column)
                    .map(parsedValue -> toStatsRepresentation(column.getType(), parsedValue))
                    .filter(OptionalDouble::isPresent)
                    .map(OptionalDouble::getAsDouble)
                    .ifPresent(parsedValueAsDouble -> maxValues.merge(column, parsedValueAsDouble, Math::max));
        }
    }

    if (numRecords == 0) {
        return createZeroStatistics(columns);
    }

    TableStatistics.Builder statsBuilder = new TableStatistics.Builder().setRowCount(Estimate.of(numRecords));

    Optional<DeltaLakeStatistics> statistics = Optional.empty();
    if (isExtendedStatisticsEnabled(session)) {
        statistics = statisticsAccess.readDeltaLakeStatistics(session, tableHandle.getLocation());
    }

    for (DeltaLakeColumnHandle column : columns) {
        ColumnStatistics.Builder columnStatsBuilder = new ColumnStatistics.Builder();
        Double nullCount = nullCounts.get(column);
        columnStatsBuilder.setNullsFraction(nullCount.isNaN() ? Estimate.unknown() : Estimate.of(nullCount / numRecords));

        Double maxValue = maxValues.get(column);
        Double minValue = minValues.get(column);
        if (isValidInRange(maxValue) && isValidInRange(minValue)) {
            columnStatsBuilder.setRange(new DoubleRange(minValue, maxValue));
        }
        else if (isValidInRange(maxValue)) {
            columnStatsBuilder.setRange(new DoubleRange(NEGATIVE_INFINITY, maxValue));
        }
        else if (isValidInRange(minValue)) {
            columnStatsBuilder.setRange(new DoubleRange(minValue, POSITIVE_INFINITY));
        }

        // extend statistics with NDV
        if (column.getColumnType() == PARTITION_KEY) {
            columnStatsBuilder.setDistinctValuesCount(Estimate.of(partitioningColumnsDistinctValues.get(column).size()));
        }
        if (statistics.isPresent()) {
            DeltaLakeColumnStatistics deltaLakeColumnStatistics = statistics.get().getColumnStatistics().get(column.getName());
            if (deltaLakeColumnStatistics != null && column.getColumnType() != PARTITION_KEY) {
                columnStatsBuilder.setDistinctValuesCount(Estimate.of(deltaLakeColumnStatistics.getNdvSummary().cardinality()));
            }
        }

        statsBuilder.setColumnStatistics(column, columnStatsBuilder.build());
    }
    return statsBuilder.build();
}
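
The key pruning step above is the overlaps check between the query's non-partition constraint and a per-file statistics predicate built from the file's min/max values. A self-contained sketch of that interaction, with a hypothetical column name, String keys, and made-up bounds:

import static io.trino.spi.type.BigintType.BIGINT;

import com.google.common.collect.ImmutableMap;
import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.Range;
import io.trino.spi.predicate.TupleDomain;
import io.trino.spi.predicate.ValueSet;

public class StatsPruningSketch
{
    public static void main(String[] args)
    {
        // Query constraint: col > 100
        TupleDomain<String> constraint = TupleDomain.withColumnDomains(ImmutableMap.of(
                "col", Domain.create(ValueSet.ofRanges(Range.greaterThan(BIGINT, 100L)), false)));

        // Per-file statistics predicate: min = 1, max = 50 (closed range, nulls allowed)
        TupleDomain<String> fileStats = TupleDomain.withColumnDomains(ImmutableMap.of(
                "col", Domain.create(ValueSet.ofRanges(Range.range(BIGINT, 1L, true, 50L, true)), true)));

        // No overlap: no row in this file can satisfy the predicate, so the file is skipped
        System.out.println(constraint.overlaps(fileStats)); // false
    }
}
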
Use of io.trino.spi.predicate.TupleDomain in project trino by trinodb.
From the class RemoveRedundantPredicateAboveTableScan, method apply:
@Override
public Result apply(FilterNode filterNode, Captures captures, Context context)
{
    Session session = context.getSession();
    TableScanNode node = captures.get(TABLE_SCAN);

    Expression predicate = filterNode.getPredicate();
    Expression deterministicPredicate = filterDeterministicConjuncts(plannerContext.getMetadata(), predicate);
    Expression nonDeterministicPredicate = filterNonDeterministicConjuncts(plannerContext.getMetadata(), predicate);

    ExtractionResult decomposedPredicate = getFullyExtractedPredicates(session, deterministicPredicate, context.getSymbolAllocator().getTypes());
    if (decomposedPredicate.getTupleDomain().isAll()) {
        // no conjunct could be fully converted to tuple domain
        return Result.empty();
    }

    TupleDomain<ColumnHandle> predicateDomain = decomposedPredicate.getTupleDomain().transformKeys(node.getAssignments()::get);
    if (predicateDomain.isNone()) {
        // the predicate is effectively FALSE: turn the subtree into a ValuesNode producing no rows
        return Result.ofPlanNode(new ValuesNode(node.getId(), node.getOutputSymbols(), ImmutableList.of()));
    }

    if (node.getEnforcedConstraint().isNone()) {
        // table scans with none domain should be converted to ValuesNode
        return Result.ofPlanNode(new ValuesNode(node.getId(), node.getOutputSymbols(), ImmutableList.of()));
    }

    Map<ColumnHandle, Domain> enforcedColumnDomains = node.getEnforcedConstraint().getDomains().orElseThrow(); // is not NONE

    TupleDomain<ColumnHandle> unenforcedDomain = predicateDomain.transformDomains((columnHandle, predicateColumnDomain) -> {
        Type type = predicateColumnDomain.getType();
        Domain enforcedColumnDomain = Optional.ofNullable(enforcedColumnDomains.get(columnHandle)).orElseGet(() -> Domain.all(type));
        if (predicateColumnDomain.contains(enforcedColumnDomain)) {
            // fully enforced
            return Domain.all(type);
        }
        return predicateColumnDomain.intersect(enforcedColumnDomain);
    });

    if (unenforcedDomain.equals(predicateDomain)) {
        // no change in filter predicate
        return Result.empty();
    }

    Map<ColumnHandle, Symbol> assignments = ImmutableBiMap.copyOf(node.getAssignments()).inverse();
    Expression resultingPredicate = createResultingPredicate(
            plannerContext,
            session,
            context.getSymbolAllocator(),
            typeAnalyzer,
            // Dynamic filters are included in decomposedPredicate.getRemainingExpression()
            TRUE_LITERAL,
            new DomainTranslator(plannerContext).toPredicate(session, unenforcedDomain.transformKeys(assignments::get)),
            nonDeterministicPredicate,
            decomposedPredicate.getRemainingExpression());

    if (!TRUE_LITERAL.equals(resultingPredicate)) {
        return Result.ofPlanNode(new FilterNode(context.getIdAllocator().getNextId(), node, resultingPredicate));
    }
    return Result.ofPlanNode(node);
}
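
The per-column logic inside transformDomains reduces to a containment test on Domain. A standalone sketch of that decision, with hypothetical ranges:

import static io.trino.spi.type.BigintType.BIGINT;

import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.Range;
import io.trino.spi.predicate.ValueSet;

public class RedundantPredicateSketch
{
    public static void main(String[] args)
    {
        // Filter conjunct: col < 100
        Domain predicateDomain = Domain.create(ValueSet.ofRanges(Range.lessThan(BIGINT, 100L)), false);
        // The scan already enforces: 0 <= col <= 10
        Domain enforcedDomain = Domain.create(ValueSet.ofRanges(Range.range(BIGINT, 0L, true, 10L, true)), false);

        if (predicateDomain.contains(enforcedDomain)) {
            // fully enforced by the scan: the conjunct is redundant
            predicateDomain = Domain.all(BIGINT);
        }
        else {
            // keep only the part the scan does not already guarantee
            predicateDomain = predicateDomain.intersect(enforcedDomain);
        }

        System.out.println(predicateDomain.isAll()); // true: col < 100 contains [0, 10]
    }
}
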