use of com.facebook.presto.delta.DeltaErrorCode.DELTA_BAD_DATA in project presto by prestodb.
the class DeltaPageSourceProvider method createParquetPageSource.
private static ConnectorPageSource createParquetPageSource(HdfsEnvironment hdfsEnvironment, String user, Configuration configuration, Path path, long start, long length, long fileSize, List<DeltaColumnHandle> columns, SchemaTableName tableName, DataSize maxReadBlockSize, boolean batchReaderEnabled, boolean verificationEnabled, TypeManager typeManager, TupleDomain<DeltaColumnHandle> effectivePredicate, FileFormatDataSourceStats stats, boolean columnIndexFilterEnabled) {
AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext();
ParquetDataSource dataSource = null;
try {
FSDataInputStream inputStream = hdfsEnvironment.getFileSystem(user, path, configuration).open(path);
dataSource = buildHdfsParquetDataSource(inputStream, path, stats);
ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource, fileSize).getParquetMetadata();
FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
MessageType fileSchema = fileMetaData.getSchema();
Optional<MessageType> message = columns.stream().filter(column -> column.getColumnType() == REGULAR || isPushedDownSubfield(column)).map(column -> getColumnType(typeManager.getType(column.getDataType()), fileSchema, column, tableName, path)).filter(Optional::isPresent).map(Optional::get).map(type -> new MessageType(fileSchema.getName(), type)).reduce(MessageType::union);
MessageType requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));
ImmutableList.Builder<BlockMetaData> footerBlocks = ImmutableList.builder();
for (BlockMetaData block : parquetMetadata.getBlocks()) {
long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
if (firstDataPage >= start && firstDataPage < start + length) {
footerBlocks.add(block);
}
}
Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath);
final ParquetDataSource finalDataSource = dataSource;
ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
List<ColumnIndexStore> blockIndexStores = new ArrayList<>();
for (BlockMetaData block : footerBlocks.build()) {
Optional<ColumnIndexStore> columnIndexStore = ColumnIndexFilterUtils.getColumnIndexStore(parquetPredicate, finalDataSource, block, descriptorsByPath, columnIndexFilterEnabled);
if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain, columnIndexStore, columnIndexFilterEnabled)) {
blocks.add(block);
blockIndexStores.add(columnIndexStore.orElse(null));
}
}
MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
ParquetReader parquetReader = new ParquetReader(messageColumnIO, blocks.build(), dataSource, systemMemoryContext, maxReadBlockSize, batchReaderEnabled, verificationEnabled, parquetPredicate, blockIndexStores, columnIndexFilterEnabled);
ImmutableList.Builder<String> namesBuilder = ImmutableList.builder();
ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder();
ImmutableList.Builder<Optional<Field>> fieldsBuilder = ImmutableList.builder();
for (DeltaColumnHandle column : columns) {
checkArgument(column.getColumnType() == REGULAR || column.getColumnType() == SUBFIELD, "column type must be regular or subfield column");
String name = column.getName();
Type type = typeManager.getType(column.getDataType());
namesBuilder.add(name);
typesBuilder.add(type);
if (isPushedDownSubfield(column)) {
Subfield pushedDownSubfield = getPushedDownSubfield(column);
List<String> nestedColumnPath = nestedColumnPath(pushedDownSubfield);
Optional<ColumnIO> columnIO = findNestedColumnIO(lookupColumnByName(messageColumnIO, pushedDownSubfield.getRootName()), nestedColumnPath);
if (columnIO.isPresent()) {
fieldsBuilder.add(constructField(type, columnIO.get()));
} else {
fieldsBuilder.add(Optional.empty());
}
} else if (getParquetType(type, fileSchema, column, tableName, path).isPresent()) {
fieldsBuilder.add(constructField(type, lookupColumnByName(messageColumnIO, name)));
} else {
fieldsBuilder.add(Optional.empty());
}
}
return new ParquetPageSource(parquetReader, typesBuilder.build(), fieldsBuilder.build(), namesBuilder.build(), new RuntimeStats());
} catch (Exception exception) {
try {
if (dataSource != null) {
dataSource.close();
}
} catch (IOException ignored) {
}
if (exception instanceof PrestoException) {
throw (PrestoException) exception;
}
if (exception instanceof ParquetCorruptionException) {
throw new PrestoException(DELTA_BAD_DATA, exception);
}
if (exception instanceof AccessControlException) {
throw new PrestoException(PERMISSION_DENIED, exception.getMessage(), exception);
}
if (nullToEmpty(exception.getMessage()).trim().equals("Filesystem closed") || exception instanceof FileNotFoundException) {
throw new PrestoException(DELTA_CANNOT_OPEN_SPLIT, exception);
}
String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, exception.getMessage());
if (exception.getClass().getSimpleName().equals("BlockMissingException")) {
throw new PrestoException(DELTA_MISSING_DATA, message, exception);
}
throw new PrestoException(DELTA_CANNOT_OPEN_SPLIT, message, exception);
}
}
Aggregations