Use of com.facebook.presto.common.predicate.TupleDomain in project presto by prestodb: class IcebergPageSourceProvider, method createBatchOrcPageSource.
private static ConnectorPageSource createBatchOrcPageSource(
        HdfsEnvironment hdfsEnvironment,
        String user,
        Configuration configuration,
        Path path,
        long start,
        long length,
        boolean isCacheable,
        List<IcebergColumnHandle> regularColumns,
        TypeManager typeManager,
        TupleDomain<IcebergColumnHandle> effectivePredicate,
        OrcReaderOptions options,
        OrcEncoding orcEncoding,
        DataSize maxBufferSize,
        DataSize streamBufferSize,
        boolean lazyReadSmallRanges,
        boolean orcBloomFiltersEnabled,
        int domainCompactionThreshold,
        OrcFileTailSource orcFileTailSource,
        StripeMetadataSourceFactory stripeMetadataSourceFactory,
        FileFormatDataSourceStats stats,
        Optional<EncryptionInformation> encryptionInformation,
        DwrfEncryptionProvider dwrfEncryptionProvider) {
OrcDataSource orcDataSource = null;
try {
ExtendedFileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path, configuration);
FileStatus fileStatus = fileSystem.getFileStatus(path);
long fileSize = fileStatus.getLen();
long modificationTime = fileStatus.getModificationTime();
HiveFileContext hiveFileContext = new HiveFileContext(true, NO_CACHE_CONSTRAINTS, Optional.empty(), Optional.of(fileSize), modificationTime, false);
FSDataInputStream inputStream = hdfsEnvironment.doAs(user, () -> fileSystem.openFile(path, hiveFileContext));
orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSize, options.getMaxMergeDistance(), maxBufferSize, streamBufferSize, lazyReadSmallRanges, inputStream, stats);
// Todo: pass real columns to ProjectionBasedDwrfKeyProvider instead of ImmutableList.of()
DwrfKeyProvider dwrfKeyProvider = new ProjectionBasedDwrfKeyProvider(encryptionInformation, ImmutableList.of(), true, path);
RuntimeStats runtimeStats = new RuntimeStats();
OrcReader reader = new OrcReader(orcDataSource, orcEncoding, orcFileTailSource, stripeMetadataSourceFactory, new HiveOrcAggregatedMemoryContext(), options, isCacheable, dwrfEncryptionProvider, dwrfKeyProvider, runtimeStats);
List<HiveColumnHandle> physicalColumnHandles = new ArrayList<>(regularColumns.size());
ImmutableMap.Builder<Integer, Type> includedColumns = ImmutableMap.builder();
ImmutableList.Builder<TupleDomainOrcPredicate.ColumnReference<HiveColumnHandle>> columnReferences = ImmutableList.builder();
List<IcebergOrcColumn> fileOrcColumns = getFileOrcColumns(reader);
Map<Integer, IcebergOrcColumn> fileOrcColumnByIcebergId = fileOrcColumns.stream()
        .filter(orcColumn -> orcColumn.getAttributes().containsKey(ORC_ICEBERG_ID_KEY))
        .collect(toImmutableMap(
                orcColumn -> Integer.parseInt(orcColumn.getAttributes().get(ORC_ICEBERG_ID_KEY)),
                orcColumn -> IcebergOrcColumn.copy(orcColumn).setIcebergColumnId(Optional.of(Integer.parseInt(orcColumn.getAttributes().get(ORC_ICEBERG_ID_KEY))))));
Map<String, IcebergOrcColumn> fileOrcColumnsByName = uniqueIndex(fileOrcColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
int nextMissingColumnIndex = fileOrcColumnsByName.size();
for (IcebergColumnHandle column : regularColumns) {
IcebergOrcColumn icebergOrcColumn;
boolean isExcludeColumn = false;
if (fileOrcColumnByIcebergId.isEmpty()) {
icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
} else {
icebergOrcColumn = fileOrcColumnByIcebergId.get(column.getId());
if (icebergOrcColumn == null) {
// The ORC column could not be found by Iceberg ID in 'fileOrcColumnByIcebergId', which means schema evolution may have happened, so fall back to looking it up by column name.
icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
if (icebergOrcColumn != null) {
isExcludeColumn = true;
}
}
}
if (icebergOrcColumn != null) {
// TODO: use the ORC file column name instead of the Iceberg column name
HiveColumnHandle columnHandle = new HiveColumnHandle(
        column.getName(),
        toHiveType(column.getType()),
        column.getType().getTypeSignature(),
        icebergOrcColumn.getOrcColumnId(),
        icebergOrcColumn.getColumnType(),
        Optional.empty(),
        Optional.empty());
physicalColumnHandles.add(columnHandle);
// Skip SchemaEvolution column
if (!isExcludeColumn) {
includedColumns.put(columnHandle.getHiveColumnIndex(), typeManager.getType(columnHandle.getTypeSignature()));
columnReferences.add(new TupleDomainOrcPredicate.ColumnReference<>(columnHandle, columnHandle.getHiveColumnIndex(), typeManager.getType(columnHandle.getTypeSignature())));
}
} else {
physicalColumnHandles.add(new HiveColumnHandle(column.getName(), toHiveType(column.getType()), column.getType().getTypeSignature(), nextMissingColumnIndex++, REGULAR, Optional.empty(), Optional.empty()));
}
}
TupleDomain<HiveColumnHandle> hiveColumnHandleTupleDomain = effectivePredicate.transform(column -> {
IcebergOrcColumn icebergOrcColumn;
if (fileOrcColumnByIcebergId.isEmpty()) {
icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
} else {
icebergOrcColumn = fileOrcColumnByIcebergId.get(column.getId());
if (icebergOrcColumn == null) {
// The ORC column could not be found by Iceberg ID in 'fileOrcColumnByIcebergId', which means schema evolution may have happened, so fall back to looking it up by column name.
icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
}
}
// Note: HiveColumnHandle.hiveColumnIndex is 0-based while IcebergColumnHandle.id is 1-based
return new HiveColumnHandle(
        column.getName(),
        toHiveType(column.getType()),
        column.getType().getTypeSignature(),
        icebergOrcColumn != null ? icebergOrcColumn.getOrcColumnId() : column.getId() - 1,
        icebergOrcColumn != null ? icebergOrcColumn.getColumnType() : REGULAR,
        Optional.empty(),
        Optional.empty());
});
OrcPredicate predicate = new TupleDomainOrcPredicate<>(hiveColumnHandleTupleDomain, columnReferences.build(), orcBloomFiltersEnabled, Optional.of(domainCompactionThreshold));
OrcAggregatedMemoryContext systemMemoryUsage = new HiveOrcAggregatedMemoryContext();
OrcBatchRecordReader recordReader = reader.createBatchRecordReader(includedColumns.build(), predicate, start, length, UTC, systemMemoryUsage, INITIAL_BATCH_SIZE);
return new OrcBatchPageSource(recordReader, orcDataSource, physicalColumnHandles, typeManager, systemMemoryUsage, stats, runtimeStats);
} catch (Exception e) {
if (orcDataSource != null) {
try {
orcDataSource.close();
} catch (IOException ignored) {
}
}
if (e instanceof PrestoException) {
throw (PrestoException) e;
}
String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
if (e instanceof BlockMissingException) {
throw new PrestoException(ICEBERG_MISSING_DATA, message, e);
}
throw new PrestoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
}
}
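The interesting part of this method is how each requested Iceberg column is resolved against the ORC file schema: by the iceberg.id column attribute when the file carries it, with a fallback to the lower-cased column name when it does not (for example after schema evolution). Below is a minimal sketch of that resolution order, using hypothetical simplified types rather than the actual Presto and ORC classes.
// A minimal sketch (hypothetical simplified types, not the real Presto/ORC classes) of the
// column-resolution order used above: match by the iceberg.id attribute first, then fall back
// to the lower-cased column name when the attribute is absent or the id is missing from the file.
import java.util.Locale;
import java.util.Map;
import java.util.Optional;

class OrcColumnResolutionSketch {
    record FileColumn(int ordinal, String name, Optional<Integer> icebergId) {}

    static Optional<FileColumn> resolve(
            int requestedIcebergId,
            String requestedName,
            Map<Integer, FileColumn> fileColumnsByIcebergId,
            Map<String, FileColumn> fileColumnsByLowerCaseName) {
        if (fileColumnsByIcebergId.isEmpty()) {
            // The file predates Iceberg id attributes entirely: name matching is the only option
            return Optional.ofNullable(fileColumnsByLowerCaseName.get(requestedName.toLowerCase(Locale.ENGLISH)));
        }
        FileColumn column = fileColumnsByIcebergId.get(requestedIcebergId);
        if (column != null) {
            return Optional.of(column);
        }
        // The id is no longer present in the file, so the column was likely added or renamed after
        // the file was written; fall back to the name (the method above excludes such columns from pushdown)
        return Optional.ofNullable(fileColumnsByLowerCaseName.get(requestedName.toLowerCase(Locale.ENGLISH)));
    }
}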
Use of com.facebook.presto.common.predicate.TupleDomain in project presto by prestodb: class TableStatisticsMaker, method makeTableStatistics.
private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Constraint constraint) {
if (!tableHandle.getSnapshotId().isPresent() || constraint.getSummary().isNone()) {
return TableStatistics.empty();
}
TupleDomain<IcebergColumnHandle> intersection = constraint.getSummary().transform(IcebergColumnHandle.class::cast).intersect(tableHandle.getPredicate());
if (intersection.isNone()) {
return TableStatistics.empty();
}
List<Types.NestedField> columns = icebergTable.schema().columns();
Map<Integer, Type.PrimitiveType> idToTypeMapping = columns.stream().filter(column -> column.type().isPrimitiveType()).collect(Collectors.toMap(Types.NestedField::fieldId, column -> column.type().asPrimitiveType()));
List<PartitionField> partitionFields = icebergTable.spec().fields();
Set<Integer> identityPartitionIds = getIdentityPartitions(icebergTable.spec()).keySet().stream().map(PartitionField::sourceId).collect(toSet());
List<Types.NestedField> nonPartitionPrimitiveColumns = columns.stream().filter(column -> !identityPartitionIds.contains(column.fieldId()) && column.type().isPrimitiveType()).collect(toImmutableList());
List<Type> icebergPartitionTypes = partitionTypes(partitionFields, idToTypeMapping);
List<IcebergColumnHandle> columnHandles = getColumns(icebergTable.schema(), typeManager);
Map<Integer, IcebergColumnHandle> idToColumnHandle = columnHandles.stream().collect(toImmutableMap(IcebergColumnHandle::getId, identity()));
ImmutableMap.Builder<Integer, ColumnFieldDetails> idToDetailsBuilder = ImmutableMap.builder();
for (int index = 0; index < partitionFields.size(); index++) {
PartitionField field = partitionFields.get(index);
Type type = icebergPartitionTypes.get(index);
idToDetailsBuilder.put(field.sourceId(), new ColumnFieldDetails(field, idToColumnHandle.get(field.sourceId()), type, toPrestoType(type, typeManager), type.typeId().javaClass()));
}
Map<Integer, ColumnFieldDetails> idToDetails = idToDetailsBuilder.build();
TableScan tableScan = icebergTable.newScan().filter(toIcebergExpression(intersection)).useSnapshot(tableHandle.getSnapshotId().get()).includeColumnStats();
Partition summary = null;
try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
for (FileScanTask fileScanTask : fileScanTasks) {
DataFile dataFile = fileScanTask.file();
if (!dataFileMatches(dataFile, constraint, idToTypeMapping, partitionFields, idToDetails)) {
continue;
}
if (summary == null) {
summary = new Partition(
        idToTypeMapping,
        nonPartitionPrimitiveColumns,
        dataFile.partition(),
        dataFile.recordCount(),
        dataFile.fileSizeInBytes(),
        toMap(idToTypeMapping, dataFile.lowerBounds()),
        toMap(idToTypeMapping, dataFile.upperBounds()),
        dataFile.nullValueCounts(),
        dataFile.columnSizes());
} else {
summary.incrementFileCount();
summary.incrementRecordCount(dataFile.recordCount());
summary.incrementSize(dataFile.fileSizeInBytes());
updateSummaryMin(summary, partitionFields, toMap(idToTypeMapping, dataFile.lowerBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
updateSummaryMax(summary, partitionFields, toMap(idToTypeMapping, dataFile.upperBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
summary.updateNullCount(dataFile.nullValueCounts());
updateColumnSizes(summary, dataFile.columnSizes());
}
}
} catch (IOException e) {
throw new UncheckedIOException(e);
}
if (summary == null) {
return TableStatistics.empty();
}
double recordCount = summary.getRecordCount();
TableStatistics.Builder result = TableStatistics.builder();
result.setRowCount(Estimate.of(recordCount));
result.setTotalSize(Estimate.of(summary.getSize()));
for (IcebergColumnHandle columnHandle : idToColumnHandle.values()) {
int fieldId = columnHandle.getId();
ColumnStatistics.Builder columnBuilder = new ColumnStatistics.Builder();
Long nullCount = summary.getNullCounts().get(fieldId);
if (nullCount != null) {
columnBuilder.setNullsFraction(Estimate.of(nullCount / recordCount));
}
if (summary.getColumnSizes() != null) {
Long columnSize = summary.getColumnSizes().get(fieldId);
if (columnSize != null) {
columnBuilder.setDataSize(Estimate.of(columnSize));
}
}
Object min = summary.getMinValues().get(fieldId);
Object max = summary.getMaxValues().get(fieldId);
if (min instanceof Number && max instanceof Number) {
columnBuilder.setRange(Optional.of(new DoubleRange(((Number) min).doubleValue(), ((Number) max).doubleValue())));
}
result.setColumnStatistics(columnHandle, columnBuilder.build());
}
return result.build();
}
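Once the per-file Partition summary has been accumulated, the per-column statistics are simple arithmetic over it: the nulls fraction is the column's null count divided by the total record count, and a numeric lower/upper bound pair becomes a double range. A minimal sketch of that last step, with made-up toy values standing in for real manifest data:
// Assumed toy inputs standing in for the accumulated Partition summary above.
import java.util.Map;

class ColumnStatisticsSketch {
    public static void main(String[] args) {
        double recordCount = 1_000;
        Map<Integer, Long> nullCounts = Map.of(1, 100L, 2, 0L);  // field id -> null count
        Map<Integer, Object> minValues = Map.of(1, 5L);          // field id -> lower bound
        Map<Integer, Object> maxValues = Map.of(1, 9_999L);      // field id -> upper bound

        for (int fieldId : nullCounts.keySet()) {
            double nullsFraction = nullCounts.get(fieldId) / recordCount;
            Object min = minValues.get(fieldId);
            Object max = maxValues.get(fieldId);
            String range = (min instanceof Number && max instanceof Number)
                    ? "[" + ((Number) min).doubleValue() + ", " + ((Number) max).doubleValue() + "]"
                    : "unknown";
            System.out.printf("field %d: nullsFraction=%.2f, range=%s%n", fieldId, nullsFraction, range);
        }
    }
}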
Use of com.facebook.presto.common.predicate.TupleDomain in project presto by prestodb: class ExpressionConverter, method toIcebergExpression.
public static Expression toIcebergExpression(TupleDomain<IcebergColumnHandle> tupleDomain) {
if (tupleDomain.isAll()) {
return alwaysTrue();
}
if (!tupleDomain.getDomains().isPresent()) {
return alwaysFalse();
}
Map<IcebergColumnHandle, Domain> domainMap = tupleDomain.getDomains().get();
Expression expression = alwaysTrue();
for (Map.Entry<IcebergColumnHandle, Domain> entry : domainMap.entrySet()) {
IcebergColumnHandle columnHandle = entry.getKey();
Domain domain = entry.getValue();
expression = and(expression, toIcebergExpression(columnHandle.getName(), columnHandle.getType(), domain));
}
return expression;
}
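The method folds the per-column domains into a single Iceberg Expression, starting from alwaysTrue() and and-ing in one predicate per column. A minimal usage sketch of the same fold pattern follows; the column names and values are made up for illustration, and it is restricted to single-value constraints so each domain maps to a simple equality predicate.
import java.util.Map;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;

class IcebergExpressionSketch {
    // Fold a map of single-value constraints into one conjunctive Iceberg expression
    static Expression toExpression(Map<String, Object> singleValueConstraints) {
        Expression expression = Expressions.alwaysTrue();
        for (Map.Entry<String, Object> entry : singleValueConstraints.entrySet()) {
            expression = Expressions.and(expression, Expressions.equal(entry.getKey(), entry.getValue()));
        }
        return expression;
    }

    public static void main(String[] args) {
        // Roughly equivalent to a TupleDomain with two single-value domains
        System.out.println(toExpression(Map.of("ds", "2021-01-01", "user_id", 42L)));
    }
}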
Use of com.facebook.presto.common.predicate.TupleDomain in project presto by prestodb: class TableLayoutResult, method computeEnforced.
public static TupleDomain<ColumnHandle> computeEnforced(TupleDomain<ColumnHandle> predicate, TupleDomain<ColumnHandle> unenforced) {
if (predicate.isNone()) {
// The engine requested a layout enforcing a contradictory (none) predicate. Two outcomes are valid:
// * The connector didn't successfully enforce anything. The unenforced predicate stays none and the enforced predicate is TupleDomain.all().
// * The connector enforced the contradiction itself. The unenforced predicate becomes all and the enforced predicate is TupleDomain.none().
if (unenforced.isNone()) {
return TupleDomain.all();
}
if (unenforced.isAll()) {
return TupleDomain.none();
}
throw new IllegalArgumentException();
}
// The engine requested that the connector provide a layout with a non-none TupleDomain.
// A TupleDomain is effectively a list of column-Domain pairs.
// The connector is expected to enforce the respective domain entirely on none, some, or all of the columns.
// 1. When the connector could enforce none of the domains, the unenforced would be equal to predicate;
// 2. When the connector could enforce some of the domains, the unenforced would contain a subset of the column-Domain pairs;
// 3. When the connector could enforce all of the domains, the unenforced would be TupleDomain.all().
// In all 3 cases shown above, the unenforced is not TupleDomain.none().
checkArgument(!unenforced.isNone());
Map<ColumnHandle, Domain> predicateDomains = predicate.getDomains().get();
Map<ColumnHandle, Domain> unenforcedDomains = unenforced.getDomains().get();
ImmutableMap.Builder<ColumnHandle, Domain> enforcedDomainsBuilder = ImmutableMap.builder();
for (Map.Entry<ColumnHandle, Domain> entry : predicateDomains.entrySet()) {
ColumnHandle predicateColumnHandle = entry.getKey();
if (unenforcedDomains.containsKey(predicateColumnHandle)) {
checkArgument(entry.getValue().equals(unenforcedDomains.get(predicateColumnHandle)), "Enforced tuple domain cannot be determined. The connector is expected to enforce the respective domain entirely on none, some, or all of the columns.");
} else {
enforcedDomainsBuilder.put(predicateColumnHandle, entry.getValue());
}
}
Map<ColumnHandle, Domain> enforcedDomains = enforcedDomainsBuilder.build();
checkArgument(enforcedDomains.size() + unenforcedDomains.size() == predicateDomains.size(), "Enforced tuple domain cannot be determined. Connector returned an unenforced TupleDomain that contains columns not in predicate.");
return TupleDomain.withColumnDomains(enforcedDomains);
}
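In the common non-none case, the enforced predicate is simply the set of column-Domain pairs from the original predicate that no longer appear in the unenforced remainder. A minimal sketch of that partitioning, using plain maps with made-up column names instead of the real TupleDomain and ColumnHandle types:
import java.util.HashMap;
import java.util.Map;

class ComputeEnforcedSketch {
    // Simplified analogue of computeEnforced: columns are plain strings and domains are
    // opaque objects compared with equals(), mirroring the all-or-nothing check above.
    static Map<String, Object> computeEnforced(Map<String, Object> predicate, Map<String, Object> unenforced) {
        Map<String, Object> enforced = new HashMap<>();
        for (Map.Entry<String, Object> entry : predicate.entrySet()) {
            Object remaining = unenforced.get(entry.getKey());
            if (remaining == null) {
                // The connector enforced this column-Domain pair itself
                enforced.put(entry.getKey(), entry.getValue());
            }
            else if (!remaining.equals(entry.getValue())) {
                throw new IllegalArgumentException("a domain must be enforced entirely or not at all");
            }
        }
        return enforced;
    }

    public static void main(String[] args) {
        Map<String, Object> predicate = Map.of("ds", "ds = '2021-01-01'", "user_id", "user_id = 42");
        Map<String, Object> unenforced = Map.of("user_id", "user_id = 42");
        // The connector enforced "ds" (e.g. via partition pruning) and left "user_id" to the engine
        System.out.println(computeEnforced(predicate, unenforced)); // prints {ds=ds = '2021-01-01'}
    }
}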
Use of com.facebook.presto.common.predicate.TupleDomain in project presto by prestodb: class TestLocalDynamicFilter, method testMultipleColumns.
@Test
public void testMultipleColumns() throws ExecutionException, InterruptedException {
LocalDynamicFilter filter = new LocalDynamicFilter(
        ImmutableMultimap.of(
                "123", new DynamicFilterPlaceholder("123", new VariableReferenceExpression(Optional.empty(), "a", INTEGER), EQUAL),
                "456", new DynamicFilterPlaceholder("456", new VariableReferenceExpression(Optional.empty(), "b", INTEGER), EQUAL)),
        ImmutableMap.of("123", 0, "456", 1),
        1);
assertEquals(filter.getBuildChannels(), ImmutableMap.of("123", 0, "456", 1));
Consumer<TupleDomain<String>> consumer = filter.getTupleDomainConsumer();
ListenableFuture<TupleDomain<VariableReferenceExpression>> result = filter.getResultFuture();
assertFalse(result.isDone());
consumer.accept(TupleDomain.withColumnDomains(ImmutableMap.of(
        "123", Domain.singleValue(INTEGER, 10L),
        "456", Domain.singleValue(INTEGER, 20L))));
assertEquals(result.get(), TupleDomain.withColumnDomains(ImmutableMap.of(
        new VariableReferenceExpression(Optional.empty(), "a", INTEGER), Domain.singleValue(INTEGER, 10L),
        new VariableReferenceExpression(Optional.empty(), "b", INTEGER), Domain.singleValue(INTEGER, 20L))));
}