use of io.trino.plugin.iceberg.IcebergParquetColumnIOConverter.FieldContext in project trino by trinodb.
the class IcebergPageSourceProvider method createParquetPageSource.
private static ReaderPageSource createParquetPageSource(HdfsEnvironment hdfsEnvironment, ConnectorIdentity identity, Configuration configuration, Path path, long start, long length, long fileSize, List<IcebergColumnHandle> regularColumns, ParquetReaderOptions options, TupleDomain<IcebergColumnHandle> effectivePredicate, FileFormatDataSourceStats fileFormatDataSourceStats, Optional<NameMapping> nameMapping) {
AggregatedMemoryContext memoryContext = newSimpleAggregatedMemoryContext();
ParquetDataSource dataSource = null;
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
dataSource = new HdfsParquetDataSource(new ParquetDataSourceId(path.toString()), fileSize, inputStream, fileFormatDataSourceStats, options);
// extra variable required for lambda below
ParquetDataSource theDataSource = dataSource;
ParquetMetadata parquetMetadata = hdfsEnvironment.doAs(identity, () -> MetadataReader.readFooter(theDataSource));
FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
MessageType fileSchema = fileMetaData.getSchema();
if (nameMapping.isPresent() && !ParquetSchemaUtil.hasIds(fileSchema)) {
// NameMapping conversion is necessary because MetadataReader converts all column names to lowercase and NameMapping is case sensitive
fileSchema = ParquetSchemaUtil.applyNameMapping(fileSchema, convertToLowercase(nameMapping.get()));
}
// Mapping from Iceberg field ID to Parquet fields.
Map<Integer, org.apache.parquet.schema.Type> parquetIdToField = fileSchema.getFields().stream().filter(field -> field.getId() != null).collect(toImmutableMap(field -> field.getId().intValue(), Function.identity()));
Optional<ReaderColumns> columnProjections = projectColumns(regularColumns);
List<IcebergColumnHandle> readColumns = columnProjections.map(readerColumns -> (List<IcebergColumnHandle>) readerColumns.get().stream().map(IcebergColumnHandle.class::cast).collect(toImmutableList())).orElse(regularColumns);
List<org.apache.parquet.schema.Type> parquetFields = readColumns.stream().map(column -> parquetIdToField.get(column.getId())).collect(toList());
MessageType requestedSchema = new MessageType(fileSchema.getName(), parquetFields.stream().filter(Objects::nonNull).collect(toImmutableList()));
Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath, UTC);
List<BlockMetaData> blocks = new ArrayList<>();
for (BlockMetaData block : parquetMetadata.getBlocks()) {
long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
if (start <= firstDataPage && firstDataPage < start + length && predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain)) {
blocks.add(block);
}
}
MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
ParquetReader parquetReader = new ParquetReader(Optional.ofNullable(fileMetaData.getCreatedBy()), messageColumnIO, blocks, Optional.empty(), dataSource, UTC, memoryContext, options);
ImmutableList.Builder<Type> trinoTypes = ImmutableList.builder();
ImmutableList.Builder<Optional<Field>> internalFields = ImmutableList.builder();
for (int columnIndex = 0; columnIndex < readColumns.size(); columnIndex++) {
IcebergColumnHandle column = readColumns.get(columnIndex);
org.apache.parquet.schema.Type parquetField = parquetFields.get(columnIndex);
Type trinoType = column.getBaseType();
trinoTypes.add(trinoType);
if (parquetField == null) {
internalFields.add(Optional.empty());
} else {
// The top level columns are already mapped by name/id appropriately.
ColumnIO columnIO = messageColumnIO.getChild(parquetField.getName());
internalFields.add(IcebergParquetColumnIOConverter.constructField(new FieldContext(trinoType, column.getColumnIdentity()), columnIO));
}
}
return new ReaderPageSource(new ParquetPageSource(parquetReader, trinoTypes.build(), internalFields.build()), columnProjections);
} catch (IOException | RuntimeException e) {
try {
if (dataSource != null) {
dataSource.close();
}
} catch (IOException ignored) {
}
if (e instanceof TrinoException) {
throw (TrinoException) e;
}
String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
if (e instanceof ParquetCorruptionException) {
throw new TrinoException(ICEBERG_BAD_DATA, message, e);
}
if (e instanceof BlockMissingException) {
throw new TrinoException(ICEBERG_MISSING_DATA, message, e);
}
throw new TrinoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
}
}
Aggregations