use of com.facebook.presto.parquet.RichColumnDescriptor in project presto by prestodb.
the class IcebergPageSourceProvider, method createParquetPageSource.
private static ConnectorPageSource createParquetPageSource(
        HdfsEnvironment hdfsEnvironment,
        String user,
        Configuration configuration,
        Path path,
        long start,
        long length,
        SchemaTableName tableName,
        List<IcebergColumnHandle> regularColumns,
        boolean useParquetColumnNames,
        DataSize maxReadBlockSize,
        boolean batchReaderEnabled,
        boolean verificationEnabled,
        TupleDomain<IcebergColumnHandle> effectivePredicate,
        FileFormatDataSourceStats fileFormatDataSourceStats,
        boolean columnIndexFilterEnabled)
{
    AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext();
    ParquetDataSource dataSource = null;
    try {
        // Open the file and read the Parquet footer
        ExtendedFileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path, configuration);
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        long fileSize = fileStatus.getLen();
        long modificationTime = fileStatus.getModificationTime();
        HiveFileContext hiveFileContext = new HiveFileContext(true, NO_CACHE_CONSTRAINTS, Optional.empty(), Optional.of(fileSize), modificationTime, false);
        FSDataInputStream inputStream = fileSystem.openFile(path, hiveFileContext);
        dataSource = buildHdfsParquetDataSource(inputStream, path, fileFormatDataSourceStats);
        ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource, fileSize).getParquetMetadata();
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();

        // Mapping from Iceberg field ID to Parquet fields.
        Map<Integer, org.apache.parquet.schema.Type> parquetIdToField = fileSchema.getFields().stream()
                .filter(field -> field.getId() != null)
                .collect(toImmutableMap(field -> field.getId().intValue(), Function.identity()));

        // Resolve each requested column: by field ID for Iceberg-written files,
        // by name for files inherited from a migrated table.
        List<org.apache.parquet.schema.Type> parquetFields = regularColumns.stream()
                .map(column -> {
                    if (parquetIdToField.isEmpty()) {
                        // This is a migrated table
                        return getParquetTypeByName(column.getName(), fileSchema);
                    }
                    return parquetIdToField.get(column.getId());
                })
                .collect(toList());

        // TODO: support subfield pushdown
        MessageType requestedSchema = new MessageType(fileSchema.getName(), parquetFields.stream().filter(Objects::nonNull).collect(toImmutableList()));
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath);

        // Keep only row groups whose first data page falls inside this split and that pass the predicate
        final ParquetDataSource finalDataSource = dataSource;
        List<BlockMetaData> blocks = new ArrayList<>();
        List<ColumnIndexStore> blockIndexStores = new ArrayList<>();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            Optional<ColumnIndexStore> columnIndexStore = ColumnIndexFilterUtils.getColumnIndexStore(parquetPredicate, finalDataSource, block, descriptorsByPath, columnIndexFilterEnabled);
            if ((firstDataPage >= start) && (firstDataPage < (start + length)) && predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain, columnIndexStore, columnIndexFilterEnabled)) {
                blocks.add(block);
                blockIndexStores.add(columnIndexStore.orElse(null));
            }
        }

        MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
        ParquetReader parquetReader = new ParquetReader(messageColumnIO, blocks, dataSource, systemMemoryContext, maxReadBlockSize, batchReaderEnabled, verificationEnabled, parquetPredicate, blockIndexStores, columnIndexFilterEnabled);

        // Bind each requested column to its Parquet field; columns missing from the file read as nulls
        ImmutableList.Builder<String> namesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Type> prestoTypes = ImmutableList.builder();
        ImmutableList.Builder<Optional<Field>> internalFields = ImmutableList.builder();
        for (int columnIndex = 0; columnIndex < regularColumns.size(); columnIndex++) {
            IcebergColumnHandle column = regularColumns.get(columnIndex);
            namesBuilder.add(column.getName());
            org.apache.parquet.schema.Type parquetField = parquetFields.get(columnIndex);
            Type prestoType = column.getType();
            prestoTypes.add(prestoType);
            if (parquetField == null) {
                internalFields.add(Optional.empty());
            }
            else {
                internalFields.add(constructField(column.getType(), messageColumnIO.getChild(parquetField.getName())));
            }
        }
        return new ParquetPageSource(parquetReader, prestoTypes.build(), internalFields.build(), namesBuilder.build(), new RuntimeStats());
    }
    catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        }
        catch (IOException ignored) {
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof ParquetCorruptionException) {
            throw new PrestoException(ICEBERG_BAD_DATA, message, e);
        }
        if (e instanceof BlockMissingException) {
            throw new PrestoException(ICEBERG_MISSING_DATA, message, e);
        }
        throw new PrestoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
    }
}
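The ID-based column resolution above is the key Iceberg detail: files written by Iceberg carry a field ID on every Parquet column, so columns survive renames, while files inherited from a migrated Hive table carry no IDs and fall back to name lookup. The snippet below is a small standalone sketch of that mapping using only the parquet-mr schema API; the schema, field IDs, and column names are invented for illustration and are not part of the Presto code above.

import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;

public class FieldIdLookupSketch
{
    public static void main(String[] args)
    {
        // A hypothetical file schema whose fields carry Iceberg field IDs
        MessageType fileSchema = Types.buildMessage()
                .required(PrimitiveTypeName.INT64).id(1).named("order_id")
                .optional(PrimitiveTypeName.BINARY).id(2).named("customer")
                .named("table");

        // Same shape as parquetIdToField above: field ID -> Parquet field
        Map<Integer, Type> parquetIdToField = fileSchema.getFields().stream()
                .filter(field -> field.getId() != null)
                .collect(Collectors.toMap(field -> field.getId().intValue(), Function.identity()));

        // Resolve by ID when IDs are present; a migrated table would yield an empty map,
        // and the caller would fall back to name-based lookup instead.
        Type resolved = parquetIdToField.isEmpty()
                ? fileSchema.getType("order_id")  // name-based fallback
                : parquetIdToField.get(1);
        System.out.println(Optional.ofNullable(resolved));
    }
}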
use of com.facebook.presto.parquet.RichColumnDescriptor in project presto by prestodb.
the class ParquetPageSourceFactory, method getParquetTupleDomain.
public static TupleDomain<ColumnDescriptor> getParquetTupleDomain(Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<HiveColumnHandle> effectivePredicate)
{
    if (effectivePredicate.isNone()) {
        return TupleDomain.none();
    }
    ImmutableMap.Builder<ColumnDescriptor, Domain> predicate = ImmutableMap.builder();
    for (Entry<HiveColumnHandle, Domain> entry : effectivePredicate.getDomains().get().entrySet()) {
        HiveColumnHandle columnHandle = entry.getKey();
        // skip looking up predicates for complex types as Parquet only stores stats for primitives
        if (!columnHandle.getHiveType().getCategory().equals(PRIMITIVE)) {
            continue;
        }
        RichColumnDescriptor descriptor;
        if (isPushedDownSubfield(columnHandle)) {
            Subfield pushedDownSubfield = getPushedDownSubfield(columnHandle);
            List<String> subfieldPath = columnPathFromSubfield(pushedDownSubfield);
            descriptor = descriptorsByPath.get(subfieldPath);
        }
        else {
            descriptor = descriptorsByPath.get(ImmutableList.of(columnHandle.getName()));
        }
        if (descriptor != null) {
            predicate.put(descriptor, entry.getValue());
        }
    }
    return TupleDomain.withColumnDomains(predicate.build());
}
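As a point of reference for the lookups above, descriptorsByPath is keyed by the full path of each Parquet leaf column: a top-level column such as price keys as ["price"], and a pushed-down subfield such as address.zip keys as ["address", "zip"]. Here is a minimal sketch of that keying built with plain parquet-mr (ColumnDescriptor rather than Presto's RichColumnDescriptor; the schema and names are hypothetical):

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Types;

public class DescriptorPathSketch
{
    public static void main(String[] args)
    {
        // Hypothetical schema: one top-level column and one nested struct column
        MessageType schema = Types.buildMessage()
                .required(PrimitiveTypeName.DOUBLE).named("price")
                .requiredGroup()
                    .required(PrimitiveTypeName.BINARY).named("zip")
                    .named("address")
                .named("table");

        // Key each leaf descriptor by its full column path, mirroring descriptorsByPath above
        Map<List<String>, ColumnDescriptor> byPath = schema.getColumns().stream()
                .collect(Collectors.toMap(d -> Arrays.asList(d.getPath()), d -> d));

        System.out.println(byPath.get(Arrays.asList("price")));
        System.out.println(byPath.get(Arrays.asList("address", "zip")));
    }
}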
use of com.facebook.presto.parquet.RichColumnDescriptor in project presto by prestodb.
the class ColumnIOConverter, method constructField.
public static Optional<Field> constructField(Type type, ColumnIO columnIO)
{
    if (columnIO == null) {
        return Optional.empty();
    }
    boolean required = columnIO.getType().getRepetition() != OPTIONAL;
    int repetitionLevel = columnRepetitionLevel(columnIO);
    int definitionLevel = columnDefinitionLevel(columnIO);
    if (ROW.equals(type.getTypeSignature().getBase())) {
        // ROW: recurse into each named struct field; drop the struct entirely if none of its fields exist in the file
        GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO;
        List<Type> parameters = type.getTypeParameters();
        ImmutableList.Builder<Optional<Field>> fieldsBuilder = ImmutableList.builder();
        List<TypeSignatureParameter> fields = type.getTypeSignature().getParameters();
        boolean structHasParameters = false;
        for (int i = 0; i < fields.size(); i++) {
            NamedTypeSignature namedTypeSignature = fields.get(i).getNamedTypeSignature();
            String name = namedTypeSignature.getName().get().toLowerCase(Locale.ENGLISH);
            Optional<Field> field = constructField(parameters.get(i), lookupColumnByName(groupColumnIO, name));
            structHasParameters |= field.isPresent();
            fieldsBuilder.add(field);
        }
        if (structHasParameters) {
            return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, fieldsBuilder.build()));
        }
        return Optional.empty();
    }
    else if (MAP.equals(type.getTypeSignature().getBase())) {
        // MAP: recurse into the key and value children of the repeated key/value group
        GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO;
        MapType mapType = (MapType) type;
        GroupColumnIO keyValueColumnIO = getMapKeyValueColumn(groupColumnIO);
        if (keyValueColumnIO.getChildrenCount() != 2) {
            return Optional.empty();
        }
        Optional<Field> keyField = constructField(mapType.getKeyType(), keyValueColumnIO.getChild(0));
        Optional<Field> valueField = constructField(mapType.getValueType(), keyValueColumnIO.getChild(1));
        return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, ImmutableList.of(keyField, valueField)));
    }
    else if (ARRAY.equals(type.getTypeSignature().getBase())) {
        // ARRAY: recurse into the single element column
        GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO;
        List<Type> types = type.getTypeParameters();
        if (groupColumnIO.getChildrenCount() != 1) {
            return Optional.empty();
        }
        Optional<Field> field = constructField(types.get(0), getArrayElementColumn(groupColumnIO.getChild(0)));
        return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, ImmutableList.of(field)));
    }
    // Primitive leaf: wrap the Parquet column descriptor together with its primitive type
    PrimitiveColumnIO primitiveColumnIO = (PrimitiveColumnIO) columnIO;
    RichColumnDescriptor column = new RichColumnDescriptor(primitiveColumnIO.getColumnDescriptor(), columnIO.getType().asPrimitiveType());
    return Optional.of(new PrimitiveField(type, repetitionLevel, definitionLevel, required, column, primitiveColumnIO.getId()));
}
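The repetition and definition levels stored in each Field above are fixed by the column's position in the Parquet schema: every OPTIONAL or REPEATED ancestor (and an optional leaf itself) adds a definition level, and every REPEATED ancestor adds a repetition level. A small sketch using plain parquet-mr, with an invented schema, shows where those numbers come from:

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Types;

public class LevelsSketch
{
    public static void main(String[] args)
    {
        // Hypothetical schema: an optional group containing a repeated group of optional int32 elements
        MessageType schema = Types.buildMessage()
                .optionalGroup()
                    .repeatedGroup()
                        .optional(PrimitiveTypeName.INT32).named("element")
                        .named("list")
                    .named("tags")
                .named("table");

        // Definition level counts OPTIONAL/REPEATED ancestors plus the optional leaf;
        // repetition level counts REPEATED ancestors only.
        System.out.println(schema.getMaxDefinitionLevel("tags", "list", "element"));  // 3
        System.out.println(schema.getMaxRepetitionLevel("tags", "list", "element"));  // 1
    }
}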
use of com.facebook.presto.parquet.RichColumnDescriptor in project presto by prestodb.
the class ParquetPageSourceFactory, method createParquetPageSource.
public static ConnectorPageSource createParquetPageSource(
        HdfsEnvironment hdfsEnvironment,
        String user,
        Configuration configuration,
        Path path,
        long start,
        long length,
        long fileSize,
        List<HiveColumnHandle> columns,
        SchemaTableName tableName,
        boolean useParquetColumnNames,
        DataSize maxReadBlockSize,
        boolean batchReaderEnabled,
        boolean verificationEnabled,
        TypeManager typeManager,
        StandardFunctionResolution functionResolution,
        TupleDomain<HiveColumnHandle> effectivePredicate,
        FileFormatDataSourceStats stats,
        HiveFileContext hiveFileContext,
        ParquetMetadataSource parquetMetadataSource,
        boolean columnIndexFilterEnabled)
{
    AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext();
    ParquetDataSource dataSource = null;
    try {
        // Open the file and load the footer (possibly from the metadata cache)
        FSDataInputStream inputStream = hdfsEnvironment.getFileSystem(user, path, configuration).openFile(path, hiveFileContext);
        dataSource = buildHdfsParquetDataSource(inputStream, path, stats);
        ParquetMetadata parquetMetadata = parquetMetadataSource.getParquetMetadata(dataSource, fileSize, hiveFileContext.isCacheable()).getParquetMetadata();

        // When every requested column is an aggregation, serve it directly from the footer metadata
        if (!columns.isEmpty() && columns.stream().allMatch(hiveColumnHandle -> hiveColumnHandle.getColumnType() == AGGREGATED)) {
            return new AggregatedParquetPageSource(columns, parquetMetadata, typeManager, functionResolution);
        }

        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();

        // Project the file schema down to the requested columns (including pushed-down subfields)
        Optional<MessageType> message = columns.stream()
                .filter(column -> column.getColumnType() == REGULAR || isPushedDownSubfield(column))
                .map(column -> getColumnType(typeManager.getType(column.getTypeSignature()), fileSchema, useParquetColumnNames, column, tableName, path))
                .filter(Optional::isPresent)
                .map(Optional::get)
                .map(type -> new MessageType(fileSchema.getName(), type))
                .reduce(MessageType::union);
        MessageType requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));

        // Keep only row groups whose first data page falls inside this split
        ImmutableList.Builder<BlockMetaData> footerBlocks = ImmutableList.builder();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= start && firstDataPage < start + length) {
                footerBlocks.add(block);
            }
        }

        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath);

        // Further prune row groups with the predicate (statistics and, if enabled, column indexes)
        final ParquetDataSource finalDataSource = dataSource;
        ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
        List<ColumnIndexStore> blockIndexStores = new ArrayList<>();
        for (BlockMetaData block : footerBlocks.build()) {
            Optional<ColumnIndexStore> columnIndexStore = ColumnIndexFilterUtils.getColumnIndexStore(parquetPredicate, finalDataSource, block, descriptorsByPath, columnIndexFilterEnabled);
            if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain, columnIndexStore, columnIndexFilterEnabled)) {
                blocks.add(block);
                blockIndexStores.add(columnIndexStore.orElse(null));
                hiveFileContext.incrementCounter("parquet.blocksRead", 1);
                hiveFileContext.incrementCounter("parquet.rowsRead", block.getRowCount());
                hiveFileContext.incrementCounter("parquet.totalBytesRead", block.getTotalByteSize());
            }
            else {
                hiveFileContext.incrementCounter("parquet.blocksSkipped", 1);
                hiveFileContext.incrementCounter("parquet.rowsSkipped", block.getRowCount());
                hiveFileContext.incrementCounter("parquet.totalBytesSkipped", block.getTotalByteSize());
            }
        }

        MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
        ParquetReader parquetReader = new ParquetReader(messageColumnIO, blocks.build(), dataSource, systemMemoryContext, maxReadBlockSize, batchReaderEnabled, verificationEnabled, parquetPredicate, blockIndexStores, columnIndexFilterEnabled);

        // Bind each requested column to its Parquet field; columns missing from the file read as nulls
        ImmutableList.Builder<String> namesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Optional<Field>> fieldsBuilder = ImmutableList.builder();
        for (HiveColumnHandle column : columns) {
            checkArgument(column.getColumnType() == REGULAR || column.getColumnType() == SYNTHESIZED, "column type must be regular or synthesized column");
            String name = column.getName();
            Type type = typeManager.getType(column.getTypeSignature());
            namesBuilder.add(name);
            typesBuilder.add(type);
            if (column.getColumnType() == SYNTHESIZED) {
                // A synthesized column is a pushed-down subfield; resolve it by walking the nested column path
                Subfield pushedDownSubfield = getPushedDownSubfield(column);
                List<String> nestedColumnPath = nestedColumnPath(pushedDownSubfield);
                Optional<ColumnIO> columnIO = findNestedColumnIO(lookupColumnByName(messageColumnIO, pushedDownSubfield.getRootName()), nestedColumnPath);
                if (columnIO.isPresent()) {
                    fieldsBuilder.add(constructField(type, columnIO.get()));
                }
                else {
                    fieldsBuilder.add(Optional.empty());
                }
            }
            else if (getParquetType(type, fileSchema, useParquetColumnNames, column, tableName, path).isPresent()) {
                String columnName = useParquetColumnNames ? name : fileSchema.getFields().get(column.getHiveColumnIndex()).getName();
                fieldsBuilder.add(constructField(type, lookupColumnByName(messageColumnIO, columnName)));
            }
            else {
                fieldsBuilder.add(Optional.empty());
            }
        }
        return new ParquetPageSource(parquetReader, typesBuilder.build(), fieldsBuilder.build(), namesBuilder.build(), hiveFileContext.getStats());
    }
    catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        }
        catch (IOException ignored) {
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        if (e instanceof ParquetCorruptionException) {
            throw new PrestoException(HIVE_BAD_DATA, e);
        }
        if (e instanceof AccessControlException) {
            throw new PrestoException(PERMISSION_DENIED, e.getMessage(), e);
        }
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e.getClass().getSimpleName().equals("BlockMissingException")) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
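Both createParquetPageSource variants assign row groups to splits with the same rule seen above: a row group is read by the split whose [start, start + length) byte range contains the offset of its first data page, so each row group is processed by exactly one split even when split boundaries fall in the middle of it. A toy illustration of the rule, with invented offsets:

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class SplitAssignmentSketch
{
    // Hypothetical first-data-page offsets of four row groups in one file
    static final List<Long> FIRST_DATA_PAGE_OFFSETS = Arrays.asList(4L, 1_048_580L, 2_097_160L, 3_145_740L);

    // A row group belongs to the split whose range contains its first data page offset
    static List<Long> rowGroupsForSplit(long start, long length)
    {
        return FIRST_DATA_PAGE_OFFSETS.stream()
                .filter(offset -> offset >= start && offset < start + length)
                .collect(Collectors.toList());
    }

    public static void main(String[] args)
    {
        // A split covering the first two megabytes picks up the first two row groups only
        System.out.println(rowGroupsForSplit(0, 2_097_152));
    }
}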
use of com.facebook.presto.parquet.RichColumnDescriptor in project presto by prestodb.
the class IcebergPageSourceProvider, method getParquetTupleDomain.
private static TupleDomain<ColumnDescriptor> getParquetTupleDomain(Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<IcebergColumnHandle> effectivePredicate)
{
    if (effectivePredicate.isNone()) {
        return TupleDomain.none();
    }
    ImmutableMap.Builder<ColumnDescriptor, Domain> predicate = ImmutableMap.builder();
    effectivePredicate.getDomains().get().forEach((columnHandle, domain) -> {
        String baseType = columnHandle.getType().getTypeSignature().getBase();
        // skip looking up predicates for complex types as Parquet only stores stats for primitives
        if (!baseType.equals(StandardTypes.MAP) && !baseType.equals(StandardTypes.ARRAY) && !baseType.equals(StandardTypes.ROW)) {
            RichColumnDescriptor descriptor = descriptorsByPath.get(ImmutableList.of(columnHandle.getName()));
            if (descriptor != null) {
                predicate.put(descriptor, domain);
            }
        }
    });
    return TupleDomain.withColumnDomains(predicate.build());
}