Use of com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_BAD_DATA in project presto by prestodb.
From the class IcebergPageSourceProvider, method createParquetPageSource. The method opens the Parquet file backing an Iceberg split, prunes row groups against the pushed-down predicate, and wraps the result in a ParquetPageSource; a corrupt file surfaces as a PrestoException carrying ICEBERG_BAD_DATA (see the caller-side sketch after the method).
private static ConnectorPageSource createParquetPageSource(
        HdfsEnvironment hdfsEnvironment,
        String user,
        Configuration configuration,
        Path path,
        long start,
        long length,
        SchemaTableName tableName,
        List<IcebergColumnHandle> regularColumns,
        boolean useParquetColumnNames,
        DataSize maxReadBlockSize,
        boolean batchReaderEnabled,
        boolean verificationEnabled,
        TupleDomain<IcebergColumnHandle> effectivePredicate,
        FileFormatDataSourceStats fileFormatDataSourceStats,
        boolean columnIndexFilterEnabled)
{
    AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext();
    ParquetDataSource dataSource = null;
    try {
        ExtendedFileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path, configuration);
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        long fileSize = fileStatus.getLen();
        long modificationTime = fileStatus.getModificationTime();
        HiveFileContext hiveFileContext = new HiveFileContext(true, NO_CACHE_CONSTRAINTS, Optional.empty(), Optional.of(fileSize), modificationTime, false);
        FSDataInputStream inputStream = fileSystem.openFile(path, hiveFileContext);
        dataSource = buildHdfsParquetDataSource(inputStream, path, fileFormatDataSourceStats);

        // Read the Parquet footer to get the file-level schema and row-group metadata.
        ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource, fileSize).getParquetMetadata();
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        // Mapping from Iceberg field ID to Parquet fields.
        Map<Integer, org.apache.parquet.schema.Type> parquetIdToField = fileSchema.getFields().stream()
                .filter(field -> field.getId() != null)
                .collect(toImmutableMap(field -> field.getId().intValue(), Function.identity()));

        List<org.apache.parquet.schema.Type> parquetFields = regularColumns.stream()
                .map(column -> {
                    if (parquetIdToField.isEmpty()) {
                        // This is a migrated table: the file predates Iceberg and carries
                        // no field IDs, so fall back to resolving columns by name.
                        return getParquetTypeByName(column.getName(), fileSchema);
                    }
                    return parquetIdToField.get(column.getId());
                })
                .collect(toList());
        // TODO: support subfield pushdown
        MessageType requestedSchema = new MessageType(fileSchema.getName(), parquetFields.stream().filter(Objects::nonNull).collect(toImmutableList()));
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath);
        final ParquetDataSource finalDataSource = dataSource;
        List<BlockMetaData> blocks = new ArrayList<>();
        List<ColumnIndexStore> blockIndexStores = new ArrayList<>();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            // A row group belongs to this split when its first data page falls inside
            // [start, start + length), which assigns each row group to exactly one split;
            // it must also pass the statistics- and column-index-based predicate checks.
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            Optional<ColumnIndexStore> columnIndexStore = ColumnIndexFilterUtils.getColumnIndexStore(parquetPredicate, finalDataSource, block, descriptorsByPath, columnIndexFilterEnabled);
            if ((firstDataPage >= start) && (firstDataPage < (start + length)) &&
                    predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain, columnIndexStore, columnIndexFilterEnabled)) {
                blocks.add(block);
                blockIndexStores.add(columnIndexStore.orElse(null));
            }
        }
        MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
        ParquetReader parquetReader = new ParquetReader(messageColumnIO, blocks, dataSource, systemMemoryContext, maxReadBlockSize, batchReaderEnabled, verificationEnabled, parquetPredicate, blockIndexStores, columnIndexFilterEnabled);
        ImmutableList.Builder<String> namesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Type> prestoTypes = ImmutableList.builder();
        ImmutableList.Builder<Optional<Field>> internalFields = ImmutableList.builder();
        for (int columnIndex = 0; columnIndex < regularColumns.size(); columnIndex++) {
            IcebergColumnHandle column = regularColumns.get(columnIndex);
            namesBuilder.add(column.getName());
            org.apache.parquet.schema.Type parquetField = parquetFields.get(columnIndex);
            Type prestoType = column.getType();
            prestoTypes.add(prestoType);
            if (parquetField == null) {
                // Column is absent from this file; the page source emits nulls for it.
                internalFields.add(Optional.empty());
            }
            else {
                internalFields.add(constructField(column.getType(), messageColumnIO.getChild(parquetField.getName())));
            }
        }
        return new ParquetPageSource(parquetReader, prestoTypes.build(), internalFields.build(), namesBuilder.build(), new RuntimeStats());
    }
    catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        }
        catch (IOException ignored) {
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof ParquetCorruptionException) {
            // Corrupt Parquet contents: report as ICEBERG_BAD_DATA.
            throw new PrestoException(ICEBERG_BAD_DATA, message, e);
        }
        if (e instanceof BlockMissingException) {
            // Unreadable or missing HDFS blocks: report as ICEBERG_MISSING_DATA.
            throw new PrestoException(ICEBERG_MISSING_DATA, message, e);
        }
        // Anything else (permissions, transient I/O, and so on): ICEBERG_CANNOT_OPEN_SPLIT.
        throw new PrestoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
    }
}
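
Callers observe these failures as PrestoException instances carrying the error codes above and can branch on them. A minimal caller-side sketch, assuming only the presto-spi and presto-iceberg classes named in the imports (the openSplit helper is hypothetical, a stand-in for whatever produces the page source, not part of Presto):

import com.facebook.presto.spi.ConnectorPageSource;
import com.facebook.presto.spi.PrestoException;

import static com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_BAD_DATA;

public class SplitErrorHandlingExample
{
    // Hypothetical producer of a page source; stands in for createParquetPageSource.
    private static ConnectorPageSource openSplit()
    {
        throw new PrestoException(ICEBERG_BAD_DATA, "simulated corrupt Parquet footer");
    }

    public static void main(String[] args)
    {
        try {
            ConnectorPageSource ignored = openSplit();
        }
        catch (PrestoException e) {
            if (ICEBERG_BAD_DATA.toErrorCode().equals(e.getErrorCode())) {
                // Corrupt data is not a transient failure: retrying the same split
                // cannot succeed, so fail rather than reschedule the split.
                System.err.println("bad data: " + e.getMessage());
            }
            else {
                throw e;
            }
        }
    }
}

The distinction matters because ICEBERG_MISSING_DATA and ICEBERG_CANNOT_OPEN_SPLIT can reflect storage problems that may be transient, while ICEBERG_BAD_DATA indicates the file contents themselves are invalid.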