use of io.trino.spi.predicate.TupleDomain in project trino by trinodb.
the class IcebergMetadata method getTableProperties.
@Override
public ConnectorTableProperties getTableProperties(ConnectorSession session, ConnectorTableHandle tableHandle) {
IcebergTableHandle table = (IcebergTableHandle) tableHandle;
if (table.getSnapshotId().isEmpty()) {
// TupleDomain.none() as the predicate
return new ConnectorTableProperties(TupleDomain.none(), Optional.empty(), Optional.empty(), Optional.empty(), ImmutableList.of());
}
Table icebergTable = catalog.loadTable(session, table.getSchemaTableName());
// Extract identity partition fields that are present in all partition specs, for creating the discrete predicates.
Set<Integer> partitionSourceIds = identityPartitionColumnsInAllSpecs(icebergTable);
TupleDomain<IcebergColumnHandle> enforcedPredicate = table.getEnforcedPredicate();
DiscretePredicates discretePredicates = null;
if (!partitionSourceIds.isEmpty()) {
// Extract identity partition columns
Map<Integer, IcebergColumnHandle> columns = getColumns(icebergTable.schema(), typeManager).stream().filter(column -> partitionSourceIds.contains(column.getId())).collect(toImmutableMap(IcebergColumnHandle::getId, Function.identity()));
Supplier<List<FileScanTask>> lazyFiles = Suppliers.memoize(() -> {
TableScan tableScan = icebergTable.newScan().useSnapshot(table.getSnapshotId().get()).filter(toIcebergExpression(enforcedPredicate)).includeColumnStats();
try (CloseableIterable<FileScanTask> iterator = tableScan.planFiles()) {
return ImmutableList.copyOf(iterator);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
});
Iterable<FileScanTask> files = () -> lazyFiles.get().iterator();
Iterable<TupleDomain<ColumnHandle>> discreteTupleDomain = Iterables.transform(files, fileScan -> {
// Extract partition values in the data file
Map<Integer, Optional<String>> partitionColumnValueStrings = getPartitionKeys(fileScan);
Map<ColumnHandle, NullableValue> partitionValues = partitionSourceIds.stream().filter(partitionColumnValueStrings::containsKey).collect(toImmutableMap(columns::get, columnId -> {
IcebergColumnHandle column = columns.get(columnId);
Object prestoValue = deserializePartitionValue(column.getType(), partitionColumnValueStrings.get(columnId).orElse(null), column.getName());
return NullableValue.of(column.getType(), prestoValue);
}));
return TupleDomain.fromFixedValues(partitionValues);
});
discretePredicates = new DiscretePredicates(columns.values().stream().map(ColumnHandle.class::cast).collect(toImmutableList()), discreteTupleDomain);
}
return new ConnectorTableProperties(// over all tableScan.planFiles() and caching partition values in table handle.
enforcedPredicate.transformKeys(ColumnHandle.class::cast), // TODO: implement table partitioning
Optional.empty(), Optional.empty(), Optional.ofNullable(discretePredicates), ImmutableList.of());
}
use of io.trino.spi.predicate.TupleDomain in project trino by trinodb.
the class IcebergPageSourceProvider method createParquetPageSource.
private static ReaderPageSource createParquetPageSource(HdfsEnvironment hdfsEnvironment, ConnectorIdentity identity, Configuration configuration, Path path, long start, long length, long fileSize, List<IcebergColumnHandle> regularColumns, ParquetReaderOptions options, TupleDomain<IcebergColumnHandle> effectivePredicate, FileFormatDataSourceStats fileFormatDataSourceStats, Optional<NameMapping> nameMapping) {
AggregatedMemoryContext memoryContext = newSimpleAggregatedMemoryContext();
ParquetDataSource dataSource = null;
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
dataSource = new HdfsParquetDataSource(new ParquetDataSourceId(path.toString()), fileSize, inputStream, fileFormatDataSourceStats, options);
// extra variable required for lambda below
ParquetDataSource theDataSource = dataSource;
ParquetMetadata parquetMetadata = hdfsEnvironment.doAs(identity, () -> MetadataReader.readFooter(theDataSource));
FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
MessageType fileSchema = fileMetaData.getSchema();
if (nameMapping.isPresent() && !ParquetSchemaUtil.hasIds(fileSchema)) {
// NameMapping conversion is necessary because MetadataReader converts all column names to lowercase and NameMapping is case sensitive
fileSchema = ParquetSchemaUtil.applyNameMapping(fileSchema, convertToLowercase(nameMapping.get()));
}
// Mapping from Iceberg field ID to Parquet fields.
Map<Integer, org.apache.parquet.schema.Type> parquetIdToField = fileSchema.getFields().stream().filter(field -> field.getId() != null).collect(toImmutableMap(field -> field.getId().intValue(), Function.identity()));
Optional<ReaderColumns> columnProjections = projectColumns(regularColumns);
List<IcebergColumnHandle> readColumns = columnProjections.map(readerColumns -> (List<IcebergColumnHandle>) readerColumns.get().stream().map(IcebergColumnHandle.class::cast).collect(toImmutableList())).orElse(regularColumns);
List<org.apache.parquet.schema.Type> parquetFields = readColumns.stream().map(column -> parquetIdToField.get(column.getId())).collect(toList());
MessageType requestedSchema = new MessageType(fileSchema.getName(), parquetFields.stream().filter(Objects::nonNull).collect(toImmutableList()));
Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath, UTC);
List<BlockMetaData> blocks = new ArrayList<>();
for (BlockMetaData block : parquetMetadata.getBlocks()) {
long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
if (start <= firstDataPage && firstDataPage < start + length && predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain)) {
blocks.add(block);
}
}
MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
ParquetReader parquetReader = new ParquetReader(Optional.ofNullable(fileMetaData.getCreatedBy()), messageColumnIO, blocks, Optional.empty(), dataSource, UTC, memoryContext, options);
ImmutableList.Builder<Type> trinoTypes = ImmutableList.builder();
ImmutableList.Builder<Optional<Field>> internalFields = ImmutableList.builder();
for (int columnIndex = 0; columnIndex < readColumns.size(); columnIndex++) {
IcebergColumnHandle column = readColumns.get(columnIndex);
org.apache.parquet.schema.Type parquetField = parquetFields.get(columnIndex);
Type trinoType = column.getBaseType();
trinoTypes.add(trinoType);
if (parquetField == null) {
internalFields.add(Optional.empty());
} else {
// The top level columns are already mapped by name/id appropriately.
ColumnIO columnIO = messageColumnIO.getChild(parquetField.getName());
internalFields.add(IcebergParquetColumnIOConverter.constructField(new FieldContext(trinoType, column.getColumnIdentity()), columnIO));
}
}
return new ReaderPageSource(new ParquetPageSource(parquetReader, trinoTypes.build(), internalFields.build()), columnProjections);
} catch (IOException | RuntimeException e) {
try {
if (dataSource != null) {
dataSource.close();
}
} catch (IOException ignored) {
}
if (e instanceof TrinoException) {
throw (TrinoException) e;
}
String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
if (e instanceof ParquetCorruptionException) {
throw new TrinoException(ICEBERG_BAD_DATA, message, e);
}
if (e instanceof BlockMissingException) {
throw new TrinoException(ICEBERG_MISSING_DATA, message, e);
}
throw new TrinoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
}
}
use of io.trino.spi.predicate.TupleDomain in project trino by trinodb.
the class IcebergPageSourceProvider method getParquetTupleDomain.
private static TupleDomain<ColumnDescriptor> getParquetTupleDomain(Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<IcebergColumnHandle> effectivePredicate) {
if (effectivePredicate.isNone()) {
return TupleDomain.none();
}
ImmutableMap.Builder<ColumnDescriptor, Domain> predicate = ImmutableMap.builder();
effectivePredicate.getDomains().get().forEach((columnHandle, domain) -> {
String baseType = columnHandle.getType().getTypeSignature().getBase();
// skip looking up predicates for complex types as Parquet only stores stats for primitives
if (!baseType.equals(StandardTypes.MAP) && !baseType.equals(StandardTypes.ARRAY) && !baseType.equals(StandardTypes.ROW)) {
RichColumnDescriptor descriptor = descriptorsByPath.get(ImmutableList.of(columnHandle.getName()));
if (descriptor != null) {
predicate.put(descriptor, domain);
}
}
});
return TupleDomain.withColumnDomains(predicate.buildOrThrow());
}
use of io.trino.spi.predicate.TupleDomain in project trino by trinodb.
the class IcebergPageSourceProvider method createPageSource.
@Override
public ConnectorPageSource createPageSource(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorSplit connectorSplit, ConnectorTableHandle connectorTable, List<ColumnHandle> columns, DynamicFilter dynamicFilter) {
IcebergSplit split = (IcebergSplit) connectorSplit;
IcebergTableHandle table = (IcebergTableHandle) connectorTable;
List<IcebergColumnHandle> icebergColumns = columns.stream().map(IcebergColumnHandle.class::cast).collect(toImmutableList());
Map<Integer, Optional<String>> partitionKeys = split.getPartitionKeys();
List<IcebergColumnHandle> regularColumns = columns.stream().map(IcebergColumnHandle.class::cast).filter(column -> !partitionKeys.containsKey(column.getId())).collect(toImmutableList());
TupleDomain<IcebergColumnHandle> effectivePredicate = table.getUnenforcedPredicate().intersect(dynamicFilter.getCurrentPredicate().transformKeys(IcebergColumnHandle.class::cast)).simplify(ICEBERG_DOMAIN_COMPACTION_THRESHOLD);
HdfsContext hdfsContext = new HdfsContext(session);
ReaderPageSource dataPageSource = createDataPageSource(session, hdfsContext, new Path(split.getPath()), split.getStart(), split.getLength(), split.getFileSize(), split.getFileFormat(), regularColumns, effectivePredicate, table.getNameMappingJson().map(NameMappingParser::fromJson));
Optional<ReaderProjectionsAdapter> projectionsAdapter = dataPageSource.getReaderColumns().map(readerColumns -> new ReaderProjectionsAdapter(regularColumns, readerColumns, column -> ((IcebergColumnHandle) column).getType(), IcebergPageSourceProvider::applyProjection));
return new IcebergPageSource(icebergColumns, partitionKeys, dataPageSource.get(), projectionsAdapter);
}
use of io.trino.spi.predicate.TupleDomain in project trino by trinodb.
the class ExpressionConverter method toIcebergExpression.
public static Expression toIcebergExpression(TupleDomain<IcebergColumnHandle> tupleDomain) {
if (tupleDomain.isAll()) {
return alwaysTrue();
}
if (tupleDomain.getDomains().isEmpty()) {
return alwaysFalse();
}
Map<IcebergColumnHandle, Domain> domainMap = tupleDomain.getDomains().get();
Expression expression = alwaysTrue();
for (Map.Entry<IcebergColumnHandle, Domain> entry : domainMap.entrySet()) {
IcebergColumnHandle columnHandle = entry.getKey();
Domain domain = entry.getValue();
expression = and(expression, toIcebergExpression(columnHandle.getQualifiedName(), columnHandle.getType(), domain));
}
return expression;
}
Aggregations