Search in sources :

Example 1 with ParquetReader

use of com.facebook.presto.parquet.reader.ParquetReader in project presto by prestodb.

the class DeltaPageSourceProvider method createParquetPageSource.

/**
 * Opens the Parquet file at {@code path} and builds a page source over the row groups that fall
 * inside the split {@code [start, start + length)}.
 *
 * <p>The method reads the file footer, derives the requested schema from the regular and
 * pushed-down subfield columns, keeps the row groups whose first data page offset lies in the
 * split and which pass {@code predicateMatches}, and finally wires a {@link ParquetReader} into a
 * {@link ParquetPageSource}.
 *
 * @param hdfsEnvironment used to obtain the file system for {@code path}
 * @param user user on whose behalf the file system is obtained
 * @param configuration Hadoop configuration passed to the file system lookup
 * @param path location of the Parquet file
 * @param start first byte offset of the split
 * @param length length of the split in bytes
 * @param fileSize file size handed to the footer reader
 * @param columns requested columns; each must be of type REGULAR or SUBFIELD
 * @param tableName table name forwarded to the Parquet type lookups
 * @param maxReadBlockSize forwarded to the {@link ParquetReader}
 * @param batchReaderEnabled forwarded to the {@link ParquetReader}
 * @param verificationEnabled forwarded to the {@link ParquetReader}
 * @param typeManager resolves each column's data type
 * @param effectivePredicate predicate translated into a Parquet tuple domain for row group filtering
 * @param stats forwarded to the Parquet data source
 * @param columnIndexFilterEnabled forwarded to the column index lookup, predicate matching and reader
 * @return a page source reading the selected row groups
 * @throws PrestoException for any failure; the data source, if already opened, is closed first
 */
private static ConnectorPageSource createParquetPageSource(HdfsEnvironment hdfsEnvironment, String user, Configuration configuration, Path path, long start, long length, long fileSize, List<DeltaColumnHandle> columns, SchemaTableName tableName, DataSize maxReadBlockSize, boolean batchReaderEnabled, boolean verificationEnabled, TypeManager typeManager, TupleDomain<DeltaColumnHandle> effectivePredicate, FileFormatDataSourceStats stats, boolean columnIndexFilterEnabled) {
    AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext();
    // Declared outside the try block so the catch block can close it on failure.
    ParquetDataSource dataSource = null;
    try {
        FSDataInputStream inputStream = hdfsEnvironment.getFileSystem(user, path, configuration).open(path);
        dataSource = buildHdfsParquetDataSource(inputStream, path, stats);
        ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource, fileSize).getParquetMetadata();
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        // Union of the single-column schemas of every requested column that exists in the file.
        Optional<MessageType> message = columns.stream().filter(column -> column.getColumnType() == REGULAR || isPushedDownSubfield(column)).map(column -> getColumnType(typeManager.getType(column.getDataType()), fileSchema, column, tableName, path)).filter(Optional::isPresent).map(Optional::get).map(type -> new MessageType(fileSchema.getName(), type)).reduce(MessageType::union);
        // Build the empty fallback schema only when no requested column was found in the file.
        MessageType requestedSchema = message.orElseGet(() -> new MessageType(fileSchema.getName(), ImmutableList.of()));
        // Keep only the row groups whose first data page starts inside this split.
        ImmutableList.Builder<BlockMetaData> footerBlocks = ImmutableList.builder();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= start && firstDataPage < start + length) {
                footerBlocks.add(block);
            }
        }
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath);
        final ParquetDataSource finalDataSource = dataSource;
        // blocks and blockIndexStores are filled in lock-step: entry i of one belongs to entry i of the other.
        ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
        List<ColumnIndexStore> blockIndexStores = new ArrayList<>();
        for (BlockMetaData block : footerBlocks.build()) {
            Optional<ColumnIndexStore> columnIndexStore = ColumnIndexFilterUtils.getColumnIndexStore(parquetPredicate, finalDataSource, block, descriptorsByPath, columnIndexFilterEnabled);
            if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain, columnIndexStore, columnIndexFilterEnabled)) {
                blocks.add(block);
                // A null entry marks a row group without a column index store.
                blockIndexStores.add(columnIndexStore.orElse(null));
            }
        }
        MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
        ParquetReader parquetReader = new ParquetReader(messageColumnIO, blocks.build(), dataSource, systemMemoryContext, maxReadBlockSize, batchReaderEnabled, verificationEnabled, parquetPredicate, blockIndexStores, columnIndexFilterEnabled);
        ImmutableList.Builder<String> namesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Optional<Field>> fieldsBuilder = ImmutableList.builder();
        for (DeltaColumnHandle column : columns) {
            checkArgument(column.getColumnType() == REGULAR || column.getColumnType() == SUBFIELD, "column type must be regular or subfield column");
            String name = column.getName();
            Type type = typeManager.getType(column.getDataType());
            namesBuilder.add(name);
            typesBuilder.add(type);
            if (isPushedDownSubfield(column)) {
                // Resolve the nested column starting from the subfield's root column.
                Subfield pushedDownSubfield = getPushedDownSubfield(column);
                List<String> nestedColumnPath = nestedColumnPath(pushedDownSubfield);
                Optional<ColumnIO> columnIO = findNestedColumnIO(lookupColumnByName(messageColumnIO, pushedDownSubfield.getRootName()), nestedColumnPath);
                if (columnIO.isPresent()) {
                    fieldsBuilder.add(constructField(type, columnIO.get()));
                } else {
                    fieldsBuilder.add(Optional.empty());
                }
            } else if (getParquetType(type, fileSchema, column, tableName, path).isPresent()) {
                fieldsBuilder.add(constructField(type, lookupColumnByName(messageColumnIO, name)));
            } else {
                // Column is not present in the file: record an empty field for it.
                fieldsBuilder.add(Optional.empty());
            }
        }
        return new ParquetPageSource(parquetReader, typesBuilder.build(), fieldsBuilder.build(), namesBuilder.build(), new RuntimeStats());
    } catch (Exception exception) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        } catch (IOException closeException) {
            // Closing is best effort; keep the close failure visible on the primary exception
            // instead of discarding it.
            if (exception != closeException) {
                exception.addSuppressed(closeException);
            }
        }
        if (exception instanceof PrestoException) {
            throw (PrestoException) exception;
        }
        if (exception instanceof ParquetCorruptionException) {
            throw new PrestoException(DELTA_BAD_DATA, exception);
        }
        if (exception instanceof AccessControlException) {
            throw new PrestoException(PERMISSION_DENIED, exception.getMessage(), exception);
        }
        if (nullToEmpty(exception.getMessage()).trim().equals("Filesystem closed") || exception instanceof FileNotFoundException) {
            throw new PrestoException(DELTA_CANNOT_OPEN_SPLIT, exception);
        }
        String message = format("Error opening Delta split %s (offset=%s, length=%s): %s", path, start, length, exception.getMessage());
        // Matched by simple class name rather than with instanceof.
        if (exception.getClass().getSimpleName().equals("BlockMissingException")) {
            throw new PrestoException(DELTA_MISSING_DATA, message, exception);
        }
        throw new PrestoException(DELTA_CANNOT_OPEN_SPLIT, message, exception);
    }
}
Also used : ColumnIOConverter.constructField(org.apache.parquet.io.ColumnIOConverter.constructField) HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) DeltaColumnHandle.getPushedDownSubfield(com.facebook.presto.delta.DeltaColumnHandle.getPushedDownSubfield) ConnectorTransactionHandle(com.facebook.presto.spi.connector.ConnectorTransactionHandle) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) ParquetTypeUtils.lookupColumnByName(com.facebook.presto.parquet.ParquetTypeUtils.lookupColumnByName) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) SchemaTableName(com.facebook.presto.spi.SchemaTableName) Collectors.toMap(java.util.stream.Collectors.toMap) SplitContext(com.facebook.presto.spi.SplitContext) ParquetTypeUtils.getDescriptors(com.facebook.presto.parquet.ParquetTypeUtils.getDescriptors) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) DeltaColumnHandle.isPushedDownSubfield(com.facebook.presto.delta.DeltaColumnHandle.isPushedDownSubfield) RuntimeStats(com.facebook.presto.common.RuntimeStats) FileFormatDataSourceStats(com.facebook.presto.hive.FileFormatDataSourceStats) HdfsContext(com.facebook.presto.hive.HdfsContext) ConnectorPageSourceProvider(com.facebook.presto.spi.connector.ConnectorPageSourceProvider) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ParquetDataSource(com.facebook.presto.parquet.ParquetDataSource) SUBFIELD(com.facebook.presto.delta.DeltaColumnHandle.ColumnType.SUBFIELD) GroupType(org.apache.parquet.schema.GroupType) ImmutableMap(com.google.common.collect.ImmutableMap) DELTA_MISSING_DATA(com.facebook.presto.delta.DeltaErrorCode.DELTA_MISSING_DATA) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) Collectors(java.util.stream.Collectors) 
ColumnIOConverter.findNestedColumnIO(org.apache.parquet.io.ColumnIOConverter.findNestedColumnIO) FileNotFoundException(java.io.FileNotFoundException) String.format(java.lang.String.format) ColumnIndexFilterUtils(com.facebook.presto.parquet.reader.ColumnIndexFilterUtils) ConnectorSession(com.facebook.presto.spi.ConnectorSession) MessageType(org.apache.parquet.schema.MessageType) DataSize(io.airlift.units.DataSize) List(java.util.List) DELTA_CANNOT_OPEN_SPLIT(com.facebook.presto.delta.DeltaErrorCode.DELTA_CANNOT_OPEN_SPLIT) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetTypeUtils.columnPathFromSubfield(com.facebook.presto.parquet.ParquetTypeUtils.columnPathFromSubfield) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnIO(org.apache.parquet.io.ColumnIO) Optional(java.util.Optional) DELTA_PARQUET_SCHEMA_MISMATCH(com.facebook.presto.delta.DeltaErrorCode.DELTA_PARQUET_SCHEMA_MISMATCH) ParquetPageSource(com.facebook.presto.hive.parquet.ParquetPageSource) REGULAR(com.facebook.presto.delta.DeltaColumnHandle.ColumnType.REGULAR) HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) DeltaSessionProperties.getParquetMaxReadBlockSize(com.facebook.presto.delta.DeltaSessionProperties.getParquetMaxReadBlockSize) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) MetadataReader(com.facebook.presto.parquet.cache.MetadataReader) PARTITION(com.facebook.presto.delta.DeltaColumnHandle.ColumnType.PARTITION) Strings.nullToEmpty(com.google.common.base.Strings.nullToEmpty) Utils(com.facebook.presto.common.Utils) ConnectorTableLayoutHandle(com.facebook.presto.spi.ConnectorTableLayoutHandle) PredicateUtils.predicateMatches(com.facebook.presto.parquet.predicate.PredicateUtils.predicateMatches) PrestoException(com.facebook.presto.spi.PrestoException) 
DeltaSessionProperties.isParquetBatchReaderVerificationEnabled(com.facebook.presto.delta.DeltaSessionProperties.isParquetBatchReaderVerificationEnabled) ArrayList(java.util.ArrayList) ParquetTypeUtils.getSubfieldType(com.facebook.presto.parquet.ParquetTypeUtils.getSubfieldType) Inject(javax.inject.Inject) ParquetTypeUtils.getParquetTypeByName(com.facebook.presto.parquet.ParquetTypeUtils.getParquetTypeByName) Subfield(com.facebook.presto.common.Subfield) ImmutableList(com.google.common.collect.ImmutableList) TypeManager(com.facebook.presto.common.type.TypeManager) Objects.requireNonNull(java.util.Objects.requireNonNull) Predicate(com.facebook.presto.parquet.predicate.Predicate) ParquetPageSourceFactory.checkSchemaMatch(com.facebook.presto.hive.parquet.ParquetPageSourceFactory.checkSchemaMatch) DELTA_BAD_DATA(com.facebook.presto.delta.DeltaErrorCode.DELTA_BAD_DATA) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) PredicateUtils.buildPredicate(com.facebook.presto.parquet.predicate.PredicateUtils.buildPredicate) Type(com.facebook.presto.common.type.Type) ParquetTypeUtils.getColumnIO(com.facebook.presto.parquet.ParquetTypeUtils.getColumnIO) IOException(java.io.IOException) ParquetTypeUtils.nestedColumnPath(com.facebook.presto.parquet.ParquetTypeUtils.nestedColumnPath) DeltaSessionProperties.isParquetBatchReadsEnabled(com.facebook.presto.delta.DeltaSessionProperties.isParquetBatchReadsEnabled) Domain(com.facebook.presto.common.predicate.Domain) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) AggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext) ParquetReader(com.facebook.presto.parquet.reader.ParquetReader) PERMISSION_DENIED(com.facebook.presto.spi.StandardErrorCode.PERMISSION_DENIED) Field(com.facebook.presto.parquet.Field) ConnectorSplit(com.facebook.presto.spi.ConnectorSplit) 
ConnectorPageSource(com.facebook.presto.spi.ConnectorPageSource) DeltaTypeUtils.convertPartitionValue(com.facebook.presto.delta.DeltaTypeUtils.convertPartitionValue) ColumnHandle(com.facebook.presto.spi.ColumnHandle) AccessControlException(org.apache.hadoop.security.AccessControlException) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) Block(com.facebook.presto.common.block.Block) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ImmutableList(com.google.common.collect.ImmutableList) RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) RuntimeStats(com.facebook.presto.common.RuntimeStats) ArrayList(java.util.ArrayList) FileNotFoundException(java.io.FileNotFoundException) PrestoException(com.facebook.presto.spi.PrestoException) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) Predicate(com.facebook.presto.parquet.predicate.Predicate) PredicateUtils.buildPredicate(com.facebook.presto.parquet.predicate.PredicateUtils.buildPredicate) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) MessageType(org.apache.parquet.schema.MessageType) DeltaColumnHandle.getPushedDownSubfield(com.facebook.presto.delta.DeltaColumnHandle.getPushedDownSubfield) DeltaColumnHandle.isPushedDownSubfield(com.facebook.presto.delta.DeltaColumnHandle.isPushedDownSubfield) ParquetTypeUtils.columnPathFromSubfield(com.facebook.presto.parquet.ParquetTypeUtils.columnPathFromSubfield) Subfield(com.facebook.presto.common.Subfield) ParquetDataSource(com.facebook.presto.parquet.ParquetDataSource) 
HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) Optional(java.util.Optional) RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetReader(com.facebook.presto.parquet.reader.ParquetReader) AccessControlException(org.apache.hadoop.security.AccessControlException) ParquetPageSource(com.facebook.presto.hive.parquet.ParquetPageSource) IOException(java.io.IOException) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) AggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext) ColumnIOConverter.findNestedColumnIO(org.apache.parquet.io.ColumnIOConverter.findNestedColumnIO) ColumnIO(org.apache.parquet.io.ColumnIO) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ParquetTypeUtils.getColumnIO(com.facebook.presto.parquet.ParquetTypeUtils.getColumnIO) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) FileNotFoundException(java.io.FileNotFoundException) PrestoException(com.facebook.presto.spi.PrestoException) IOException(java.io.IOException) AccessControlException(org.apache.hadoop.security.AccessControlException) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) ParquetTypeUtils.getSubfieldType(com.facebook.presto.parquet.ParquetTypeUtils.getSubfieldType) Type(com.facebook.presto.common.type.Type) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream)

Example 2 with ParquetReader

use of com.facebook.presto.parquet.reader.ParquetReader in project presto by prestodb.

the class BenchmarkParquetReader method read.

/**
 * Drains the record reader created by {@code data}, reading one block of the benchmark field
 * per batch.
 *
 * @param data benchmark fixture supplying the record reader and the field to read
 * @return the list of blocks read, in batch order
 * @throws Exception if creating, reading or closing the reader fails
 */
private static Object read(BenchmarkData data) throws Exception {
    try (ParquetReader reader = data.createRecordReader()) {
        List<Block> collected = new ArrayList<>();
        while (true) {
            // A non-positive batch size ends the read loop.
            if (!(reader.nextBatch() > 0)) {
                break;
            }
            collected.add(reader.readBlock(data.getField()));
        }
        return collected;
    }
}
Also used : ArrayList(java.util.ArrayList) ParquetReader(com.facebook.presto.parquet.reader.ParquetReader) Block(com.facebook.presto.common.block.Block)

Example 3 with ParquetReader

use of com.facebook.presto.parquet.reader.ParquetReader in project presto by prestodb.

the class IcebergPageSourceProvider method createParquetPageSource.

/**
 * Opens the Parquet file at {@code path} and builds a page source over the row groups that fall
 * inside the split {@code [start, start + length)}.
 *
 * <p>Columns are matched to Parquet fields by field ID when the file schema carries IDs, and by
 * column name otherwise. Columns with no matching Parquet field get an empty field entry.
 *
 * @param hdfsEnvironment used to obtain the file system for {@code path}
 * @param user user on whose behalf the file system is obtained
 * @param configuration Hadoop configuration passed to the file system lookup
 * @param path location of the Parquet file
 * @param start first byte offset of the split
 * @param length length of the split in bytes
 * @param tableName not referenced by this method body
 * @param regularColumns columns to read
 * @param useParquetColumnNames not referenced by this method body
 * @param maxReadBlockSize forwarded to the {@link ParquetReader}
 * @param batchReaderEnabled forwarded to the {@link ParquetReader}
 * @param verificationEnabled forwarded to the {@link ParquetReader}
 * @param effectivePredicate predicate translated into a Parquet tuple domain for row group filtering
 * @param fileFormatDataSourceStats forwarded to the Parquet data source
 * @param columnIndexFilterEnabled forwarded to the column index lookup, predicate matching and reader
 * @return a page source reading the selected row groups
 * @throws PrestoException for any failure; the data source, if already opened, is closed first
 */
private static ConnectorPageSource createParquetPageSource(HdfsEnvironment hdfsEnvironment, String user, Configuration configuration, Path path, long start, long length, SchemaTableName tableName, List<IcebergColumnHandle> regularColumns, boolean useParquetColumnNames, DataSize maxReadBlockSize, boolean batchReaderEnabled, boolean verificationEnabled, TupleDomain<IcebergColumnHandle> effectivePredicate, FileFormatDataSourceStats fileFormatDataSourceStats, boolean columnIndexFilterEnabled) {
    AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext();
    // Declared outside the try block so the catch block can close it on failure.
    ParquetDataSource dataSource = null;
    try {
        ExtendedFileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path, configuration);
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        long fileSize = fileStatus.getLen();
        long modificationTime = fileStatus.getModificationTime();
        HiveFileContext hiveFileContext = new HiveFileContext(true, NO_CACHE_CONSTRAINTS, Optional.empty(), Optional.of(fileSize), modificationTime, false);
        FSDataInputStream inputStream = fileSystem.openFile(path, hiveFileContext);
        dataSource = buildHdfsParquetDataSource(inputStream, path, fileFormatDataSourceStats);
        ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource, fileSize).getParquetMetadata();
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        // Mapping from Iceberg field ID to Parquet fields.
        Map<Integer, org.apache.parquet.schema.Type> parquetIdToField = fileSchema.getFields().stream().filter(field -> field.getId() != null).collect(toImmutableMap(field -> field.getId().intValue(), Function.identity()));
        // One entry per requested column, in column order; null when the file has no matching field.
        List<org.apache.parquet.schema.Type> parquetFields = regularColumns.stream().map(column -> {
            if (parquetIdToField.isEmpty()) {
                // This is a migrated table
                return getParquetTypeByName(column.getName(), fileSchema);
            }
            return parquetIdToField.get(column.getId());
        }).collect(toList());
        // TODO: support subfield pushdown
        MessageType requestedSchema = new MessageType(fileSchema.getName(), parquetFields.stream().filter(Objects::nonNull).collect(toImmutableList()));
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath);
        final ParquetDataSource finalDataSource = dataSource;
        // blocks and blockIndexStores are filled in lock-step: entry i of one belongs to entry i of the other.
        List<BlockMetaData> blocks = new ArrayList<>();
        List<ColumnIndexStore> blockIndexStores = new ArrayList<>();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            // Check the split range first so that no column index store is built for row groups
            // that belong to a different split.
            if (firstDataPage < start || firstDataPage >= start + length) {
                continue;
            }
            Optional<ColumnIndexStore> columnIndexStore = ColumnIndexFilterUtils.getColumnIndexStore(parquetPredicate, finalDataSource, block, descriptorsByPath, columnIndexFilterEnabled);
            if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain, columnIndexStore, columnIndexFilterEnabled)) {
                blocks.add(block);
                // A null entry marks a row group without a column index store.
                blockIndexStores.add(columnIndexStore.orElse(null));
            }
        }
        MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
        ParquetReader parquetReader = new ParquetReader(messageColumnIO, blocks, dataSource, systemMemoryContext, maxReadBlockSize, batchReaderEnabled, verificationEnabled, parquetPredicate, blockIndexStores, columnIndexFilterEnabled);
        ImmutableList.Builder<String> namesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Type> prestoTypes = ImmutableList.builder();
        ImmutableList.Builder<Optional<Field>> internalFields = ImmutableList.builder();
        for (int columnIndex = 0; columnIndex < regularColumns.size(); columnIndex++) {
            IcebergColumnHandle column = regularColumns.get(columnIndex);
            namesBuilder.add(column.getName());
            // parquetFields is index-aligned with regularColumns.
            org.apache.parquet.schema.Type parquetField = parquetFields.get(columnIndex);
            Type prestoType = column.getType();
            prestoTypes.add(prestoType);
            if (parquetField == null) {
                // Column is not present in the file: record an empty field for it.
                internalFields.add(Optional.empty());
            } else {
                internalFields.add(constructField(prestoType, messageColumnIO.getChild(parquetField.getName())));
            }
        }
        return new ParquetPageSource(parquetReader, prestoTypes.build(), internalFields.build(), namesBuilder.build(), new RuntimeStats());
    } catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        } catch (IOException closeException) {
            // Closing is best effort; keep the close failure visible on the primary exception
            // instead of discarding it.
            if (e != closeException) {
                e.addSuppressed(closeException);
            }
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof ParquetCorruptionException) {
            throw new PrestoException(ICEBERG_BAD_DATA, message, e);
        }
        if (e instanceof BlockMissingException) {
            throw new PrestoException(ICEBERG_MISSING_DATA, message, e);
        }
        throw new PrestoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) HiveSessionProperties.isUseParquetColumnNames(com.facebook.presto.hive.HiveSessionProperties.isUseParquetColumnNames) Maps.uniqueIndex(com.google.common.collect.Maps.uniqueIndex) FileStatus(org.apache.hadoop.fs.FileStatus) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ConnectorTransactionHandle(com.facebook.presto.spi.connector.ConnectorTransactionHandle) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) OrcDataSource(com.facebook.presto.orc.OrcDataSource) FileFormatDataSourceStats(com.facebook.presto.hive.FileFormatDataSourceStats) ConnectorPageSourceProvider(com.facebook.presto.spi.connector.ConnectorPageSourceProvider) ENGLISH(java.util.Locale.ENGLISH) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ParquetDataSource(com.facebook.presto.parquet.ParquetDataSource) ORC_ICEBERG_ID_KEY(com.facebook.presto.iceberg.TypeConverter.ORC_ICEBERG_ID_KEY) IcebergSessionProperties.getOrcLazyReadSmallRanges(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcLazyReadSmallRanges) ExtendedFileSystem(com.facebook.presto.hive.filesystem.ExtendedFileSystem) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HiveFileContext(com.facebook.presto.hive.HiveFileContext) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) ConnectorSession(com.facebook.presto.spi.ConnectorSession) ORC(com.facebook.presto.orc.OrcEncoding.ORC) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ICEBERG_BAD_DATA(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_BAD_DATA) ParquetPageSource(com.facebook.presto.hive.parquet.ParquetPageSource) 
HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) MetadataReader(com.facebook.presto.parquet.cache.MetadataReader) StandardTypes(com.facebook.presto.common.type.StandardTypes) REGULAR(com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR) DwrfKeyProvider(com.facebook.presto.orc.DwrfKeyProvider) TypeConverter.toHiveType(com.facebook.presto.iceberg.TypeConverter.toHiveType) OrcReaderOptions(com.facebook.presto.orc.OrcReaderOptions) IcebergSessionProperties.getOrcMaxReadBlockSize(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcMaxReadBlockSize) ArrayList(java.util.ArrayList) IcebergSessionProperties.getOrcTinyStripeThreshold(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcTinyStripeThreshold) ROOT_COLUMN_ID(com.facebook.presto.iceberg.IcebergOrcColumn.ROOT_COLUMN_ID) ICEBERG_MISSING_DATA(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_MISSING_DATA) DwrfEncryptionProvider(com.facebook.presto.orc.DwrfEncryptionProvider) OrcDataSourceId(com.facebook.presto.orc.OrcDataSourceId) IOException(java.io.IOException) UTC(org.joda.time.DateTimeZone.UTC) FileFormat(org.apache.iceberg.FileFormat) Domain(com.facebook.presto.common.predicate.Domain) ParquetReader(com.facebook.presto.parquet.reader.ParquetReader) ConnectorSplit(com.facebook.presto.spi.ConnectorSplit) HiveSessionProperties.getParquetMaxReadBlockSize(com.facebook.presto.hive.HiveSessionProperties.getParquetMaxReadBlockSize) ColumnHandle(com.facebook.presto.spi.ColumnHandle) IcebergSessionProperties.isOrcZstdJniDecompressionEnabled(com.facebook.presto.iceberg.IcebergSessionProperties.isOrcZstdJniDecompressionEnabled) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) OrcReader(com.facebook.presto.orc.OrcReader) ColumnIOConverter.constructField(org.apache.parquet.io.ColumnIOConverter.constructField) 
HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) HdfsOrcDataSource(com.facebook.presto.hive.orc.HdfsOrcDataSource) TupleDomainOrcPredicate(com.facebook.presto.orc.TupleDomainOrcPredicate) NO_CACHE_CONSTRAINTS(com.facebook.presto.hive.CacheQuota.NO_CACHE_CONSTRAINTS) IcebergSessionProperties.getOrcMaxBufferSize(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcMaxBufferSize) OrcBatchPageSource(com.facebook.presto.hive.orc.OrcBatchPageSource) SchemaTableName(com.facebook.presto.spi.SchemaTableName) SplitContext(com.facebook.presto.spi.SplitContext) ParquetTypeUtils.getDescriptors(com.facebook.presto.parquet.ParquetTypeUtils.getDescriptors) Path(org.apache.hadoop.fs.Path) EncryptionInformation(com.facebook.presto.hive.EncryptionInformation) RuntimeStats(com.facebook.presto.common.RuntimeStats) HdfsContext(com.facebook.presto.hive.HdfsContext) ProjectionBasedDwrfKeyProvider(com.facebook.presto.hive.orc.ProjectionBasedDwrfKeyProvider) HiveSessionProperties.isParquetBatchReadsEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReadsEnabled) HiveClientConfig(com.facebook.presto.hive.HiveClientConfig) StripeMetadataSourceFactory(com.facebook.presto.orc.StripeMetadataSourceFactory) ImmutableMap(com.google.common.collect.ImmutableMap) INITIAL_BATCH_SIZE(com.facebook.presto.orc.OrcReader.INITIAL_BATCH_SIZE) OrcPredicate(com.facebook.presto.orc.OrcPredicate) HiveDwrfEncryptionProvider(com.facebook.presto.hive.HiveDwrfEncryptionProvider) String.format(java.lang.String.format) IcebergSessionProperties.isOrcBloomFiltersEnabled(com.facebook.presto.iceberg.IcebergSessionProperties.isOrcBloomFiltersEnabled) ColumnIndexFilterUtils(com.facebook.presto.parquet.reader.ColumnIndexFilterUtils) Objects(java.util.Objects) MessageType(org.apache.parquet.schema.MessageType) DataSize(io.airlift.units.DataSize) List(java.util.List) 
HiveSessionProperties.isParquetBatchReaderVerificationEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReaderVerificationEnabled) NOT_SUPPORTED(com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED) HiveOrcAggregatedMemoryContext(com.facebook.presto.hive.HiveOrcAggregatedMemoryContext) Optional(java.util.Optional) HiveColumnHandle(com.facebook.presto.hive.HiveColumnHandle) OrcBatchRecordReader(com.facebook.presto.orc.OrcBatchRecordReader) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) IntStream(java.util.stream.IntStream) ConnectorTableLayoutHandle(com.facebook.presto.spi.ConnectorTableLayoutHandle) PredicateUtils.predicateMatches(com.facebook.presto.parquet.predicate.PredicateUtils.predicateMatches) PrestoException(com.facebook.presto.spi.PrestoException) Function(java.util.function.Function) Inject(javax.inject.Inject) ParquetTypeUtils.getParquetTypeByName(com.facebook.presto.parquet.ParquetTypeUtils.getParquetTypeByName) ImmutableList(com.google.common.collect.ImmutableList) ICEBERG_CANNOT_OPEN_SPLIT(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_CANNOT_OPEN_SPLIT) TypeManager(com.facebook.presto.common.type.TypeManager) Objects.requireNonNull(java.util.Objects.requireNonNull) Predicate(com.facebook.presto.parquet.predicate.Predicate) OrcType(com.facebook.presto.orc.metadata.OrcType) OrcFileTailSource(com.facebook.presto.orc.cache.OrcFileTailSource) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) PredicateUtils.buildPredicate(com.facebook.presto.parquet.predicate.PredicateUtils.buildPredicate) Type(com.facebook.presto.common.type.Type) IcebergSessionProperties.getOrcMaxMergeDistance(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcMaxMergeDistance) OrcAggregatedMemoryContext(com.facebook.presto.orc.OrcAggregatedMemoryContext) OrcEncoding(com.facebook.presto.orc.OrcEncoding) 
ParquetTypeUtils.getColumnIO(com.facebook.presto.parquet.ParquetTypeUtils.getColumnIO) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) AggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext) Field(com.facebook.presto.parquet.Field) Collectors.toList(java.util.stream.Collectors.toList) ConnectorPageSource(com.facebook.presto.spi.ConnectorPageSource) IcebergSessionProperties.getOrcStreamBufferSize(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcStreamBufferSize) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) FileStatus(org.apache.hadoop.fs.FileStatus) RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) ArrayList(java.util.ArrayList) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) TupleDomainOrcPredicate(com.facebook.presto.orc.TupleDomainOrcPredicate) OrcPredicate(com.facebook.presto.orc.OrcPredicate) Predicate(com.facebook.presto.parquet.predicate.Predicate) PredicateUtils.buildPredicate(com.facebook.presto.parquet.predicate.PredicateUtils.buildPredicate) HiveFileContext(com.facebook.presto.hive.HiveFileContext) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) Collectors.toList(java.util.stream.Collectors.toList) MessageType(org.apache.parquet.schema.MessageType) Optional(java.util.Optional) ParquetReader(com.facebook.presto.parquet.reader.ParquetReader) ParquetPageSource(com.facebook.presto.hive.parquet.ParquetPageSource) Objects(java.util.Objects) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList) 
RuntimeStats(com.facebook.presto.common.RuntimeStats) PrestoException(com.facebook.presto.spi.PrestoException) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) ParquetDataSource(com.facebook.presto.parquet.ParquetDataSource) HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) IOException(java.io.IOException) HiveOrcAggregatedMemoryContext(com.facebook.presto.hive.HiveOrcAggregatedMemoryContext) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) OrcAggregatedMemoryContext(com.facebook.presto.orc.OrcAggregatedMemoryContext) AggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) IOException(java.io.IOException) PrestoException(com.facebook.presto.spi.PrestoException) TypeConverter.toHiveType(com.facebook.presto.iceberg.TypeConverter.toHiveType) MessageType(org.apache.parquet.schema.MessageType) OrcType(com.facebook.presto.orc.metadata.OrcType) Type(com.facebook.presto.common.type.Type) ExtendedFileSystem(com.facebook.presto.hive.filesystem.ExtendedFileSystem)

Example 4 with ParquetReader

use of com.facebook.presto.parquet.reader.ParquetReader in project presto by prestodb.

the class ParquetPageSourceFactory method createParquetPageSource.

/**
 * Opens the Parquet file at {@code path} and builds a page source over the row groups that belong to this split.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Open the file through {@code hdfsEnvironment} and read its footer via {@code parquetMetadataSource}.</li>
 *   <li>If {@code columns} is non-empty and every column is {@code AGGREGATED}, return an
 *       {@code AggregatedParquetPageSource} built from the footer metadata; the data source is closed first
 *       because that page source is never handed it.</li>
 *   <li>Build the requested schema from the {@code REGULAR} and pushed-down-subfield columns.</li>
 *   <li>Keep the row groups whose first column's first data page offset lies in
 *       {@code [start, start + length)}, then drop those rejected by {@code predicateMatches}, recording
 *       read/skipped counters on {@code hiveFileContext}.</li>
 *   <li>Create the {@code ParquetReader} and resolve one optional {@code Field} per requested column.</li>
 * </ol>
 *
 * @param start byte offset where this split begins
 * @param length byte length of this split
 * @param fileSize size passed to the metadata source when reading the footer
 * @param columns columns to read; past the aggregated shortcut each must be {@code REGULAR} or {@code SYNTHESIZED}
 * @param useParquetColumnNames when true, columns are looked up in the file schema by name; otherwise by
 *        {@code column.getHiveColumnIndex()}
 * @param effectivePredicate predicate converted to a Parquet tuple domain and used to prune row groups
 * @param columnIndexFilterEnabled forwarded to the column-index lookup, predicate matching and the reader
 * @return a page source over the selected row groups
 * @throws PrestoException on any failure: an existing {@code PrestoException} is rethrown unchanged;
 *         {@code ParquetCorruptionException} maps to {@code HIVE_BAD_DATA}; {@code AccessControlException}
 *         to {@code PERMISSION_DENIED}; a "Filesystem closed" message or {@code FileNotFoundException} to
 *         {@code HIVE_CANNOT_OPEN_SPLIT}; a {@code BlockMissingException} (matched by simple class name) to
 *         {@code HIVE_MISSING_DATA}; everything else to {@code HIVE_CANNOT_OPEN_SPLIT}
 */
public static ConnectorPageSource createParquetPageSource(HdfsEnvironment hdfsEnvironment, String user, Configuration configuration, Path path, long start, long length, long fileSize, List<HiveColumnHandle> columns, SchemaTableName tableName, boolean useParquetColumnNames, DataSize maxReadBlockSize, boolean batchReaderEnabled, boolean verificationEnabled, TypeManager typeManager, StandardFunctionResolution functionResolution, TupleDomain<HiveColumnHandle> effectivePredicate, FileFormatDataSourceStats stats, HiveFileContext hiveFileContext, ParquetMetadataSource parquetMetadataSource, boolean columnIndexFilterEnabled) {
    AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext();
    ParquetDataSource dataSource = null;
    try {
        FSDataInputStream inputStream = hdfsEnvironment.getFileSystem(user, path, configuration).openFile(path, hiveFileContext);
        dataSource = buildHdfsParquetDataSource(inputStream, path, stats);
        ParquetMetadata parquetMetadata = parquetMetadataSource.getParquetMetadata(dataSource, fileSize, hiveFileContext.isCacheable()).getParquetMetadata();
        if (!columns.isEmpty() && columns.stream().allMatch(hiveColumnHandle -> hiveColumnHandle.getColumnType() == AGGREGATED)) {
            // The aggregated page source is built from the footer metadata only and is never handed the
            // data source, so nobody else would close the open stream. Release it here to avoid a leak.
            try {
                dataSource.close();
            } catch (IOException ignored) {
                // Best effort: the footer has already been read, so a failed close must not fail the query.
            }
            return new AggregatedParquetPageSource(columns, parquetMetadata, typeManager, functionResolution);
        }
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        // Union of the Parquet types of every readable column; columns missing from the file are dropped.
        Optional<MessageType> message = columns.stream().filter(column -> column.getColumnType() == REGULAR || isPushedDownSubfield(column)).map(column -> getColumnType(typeManager.getType(column.getTypeSignature()), fileSchema, useParquetColumnNames, column, tableName, path)).filter(Optional::isPresent).map(Optional::get).map(type -> new MessageType(fileSchema.getName(), type)).reduce(MessageType::union);
        MessageType requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));
        // A row group belongs to this split when its first data page starts inside [start, start + length).
        ImmutableList.Builder<BlockMetaData> footerBlocks = ImmutableList.builder();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= start && firstDataPage < start + length) {
                footerBlocks.add(block);
            }
        }
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath);
        final ParquetDataSource finalDataSource = dataSource;
        ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
        // Kept index-aligned with `blocks`; an entry is null when no column index store is available.
        List<ColumnIndexStore> blockIndexStores = new ArrayList<>();
        for (BlockMetaData block : footerBlocks.build()) {
            Optional<ColumnIndexStore> columnIndexStore = ColumnIndexFilterUtils.getColumnIndexStore(parquetPredicate, finalDataSource, block, descriptorsByPath, columnIndexFilterEnabled);
            if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain, columnIndexStore, columnIndexFilterEnabled)) {
                blocks.add(block);
                blockIndexStores.add(columnIndexStore.orElse(null));
                hiveFileContext.incrementCounter("parquet.blocksRead", 1);
                hiveFileContext.incrementCounter("parquet.rowsRead", block.getRowCount());
                hiveFileContext.incrementCounter("parquet.totalBytesRead", block.getTotalByteSize());
            } else {
                hiveFileContext.incrementCounter("parquet.blocksSkipped", 1);
                hiveFileContext.incrementCounter("parquet.rowsSkipped", block.getRowCount());
                hiveFileContext.incrementCounter("parquet.totalBytesSkipped", block.getTotalByteSize());
            }
        }
        MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
        ParquetReader parquetReader = new ParquetReader(messageColumnIO, blocks.build(), dataSource, systemMemoryContext, maxReadBlockSize, batchReaderEnabled, verificationEnabled, parquetPredicate, blockIndexStores, columnIndexFilterEnabled);
        ImmutableList.Builder<String> namesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Optional<Field>> fieldsBuilder = ImmutableList.builder();
        for (HiveColumnHandle column : columns) {
            checkArgument(column.getColumnType() == REGULAR || column.getColumnType() == SYNTHESIZED, "column type must be regular or synthesized column");
            String name = column.getName();
            Type type = typeManager.getType(column.getTypeSignature());
            namesBuilder.add(name);
            typesBuilder.add(type);
            if (column.getColumnType() == SYNTHESIZED) {
                // Synthesized columns carry a pushed-down subfield: resolve it as a nested path under its root column.
                Subfield pushedDownSubfield = getPushedDownSubfield(column);
                List<String> nestedColumnPath = nestedColumnPath(pushedDownSubfield);
                Optional<ColumnIO> columnIO = findNestedColumnIO(lookupColumnByName(messageColumnIO, pushedDownSubfield.getRootName()), nestedColumnPath);
                if (columnIO.isPresent()) {
                    fieldsBuilder.add(constructField(type, columnIO.get()));
                } else {
                    fieldsBuilder.add(Optional.empty());
                }
            } else if (getParquetType(type, fileSchema, useParquetColumnNames, column, tableName, path).isPresent()) {
                String columnName = useParquetColumnNames ? name : fileSchema.getFields().get(column.getHiveColumnIndex()).getName();
                fieldsBuilder.add(constructField(type, lookupColumnByName(messageColumnIO, columnName)));
            } else {
                // Column is absent from the file schema; it gets an empty field.
                fieldsBuilder.add(Optional.empty());
            }
        }
        return new ParquetPageSource(parquetReader, typesBuilder.build(), fieldsBuilder.build(), namesBuilder.build(), hiveFileContext.getStats());
    } catch (Exception e) {
        // No page source took ownership of the data source, so close it before translating the failure.
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        } catch (IOException ignored) {
            // Best effort: the original failure below is the error worth reporting.
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        if (e instanceof ParquetCorruptionException) {
            throw new PrestoException(HIVE_BAD_DATA, e);
        }
        if (e instanceof AccessControlException) {
            throw new PrestoException(PERMISSION_DENIED, e.getMessage(), e);
        }
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        // Matched by simple class name rather than instanceof, so this block needs no reference to the class itself.
        if (e.getClass().getSimpleName().equals("BlockMissingException")) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) DateTimeZone(org.joda.time.DateTimeZone) TINYINT(com.facebook.presto.common.type.StandardTypes.TINYINT) HIVE_PARTITION_SCHEMA_MISMATCH(com.facebook.presto.hive.HiveErrorCode.HIVE_PARTITION_SCHEMA_MISMATCH) HiveSessionProperties.isUseParquetColumnNames(com.facebook.presto.hive.HiveSessionProperties.isUseParquetColumnNames) ROW(com.facebook.presto.common.type.StandardTypes.ROW) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) AGGREGATED(com.facebook.presto.hive.HiveColumnHandle.ColumnType.AGGREGATED) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) FileFormatDataSourceStats(com.facebook.presto.hive.FileFormatDataSourceStats) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ParquetDataSource(com.facebook.presto.parquet.ParquetDataSource) ParquetMetadataSource(com.facebook.presto.parquet.cache.ParquetMetadataSource) Set(java.util.Set) HiveFileContext(com.facebook.presto.hive.HiveFileContext) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) ARRAY(com.facebook.presto.common.type.StandardTypes.ARRAY) ConnectorSession(com.facebook.presto.spi.ConnectorSession) CHAR(com.facebook.presto.common.type.StandardTypes.CHAR) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetTypeUtils.columnPathFromSubfield(com.facebook.presto.parquet.ParquetTypeUtils.columnPathFromSubfield) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) HiveBatchPageSourceFactory(com.facebook.presto.hive.HiveBatchPageSourceFactory) ColumnIO(org.apache.parquet.io.ColumnIO) HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) StandardTypes(com.facebook.presto.common.type.StandardTypes) REGULAR(com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR) ArrayList(java.util.ArrayList) 
PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName) HIVE_BAD_DATA(com.facebook.presto.hive.HiveErrorCode.HIVE_BAD_DATA) DATE(com.facebook.presto.common.type.StandardTypes.DATE) IOException(java.io.IOException) ParquetTypeUtils.nestedColumnPath(com.facebook.presto.parquet.ParquetTypeUtils.nestedColumnPath) Domain(com.facebook.presto.common.predicate.Domain) INTEGER(com.facebook.presto.common.type.StandardTypes.INTEGER) HiveColumnHandle.getPushedDownSubfield(com.facebook.presto.hive.HiveColumnHandle.getPushedDownSubfield) ParquetReader(com.facebook.presto.parquet.reader.ParquetReader) HiveSessionProperties.getParquetMaxReadBlockSize(com.facebook.presto.hive.HiveSessionProperties.getParquetMaxReadBlockSize) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) REAL(com.facebook.presto.common.type.StandardTypes.REAL) RowType(com.facebook.presto.common.type.RowType) ColumnIOConverter.constructField(org.apache.parquet.io.ColumnIOConverter.constructField) HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) HiveColumnHandle.isPushedDownSubfield(com.facebook.presto.hive.HiveColumnHandle.isPushedDownSubfield) StandardFunctionResolution(com.facebook.presto.spi.function.StandardFunctionResolution) ParquetTypeUtils.lookupColumnByName(com.facebook.presto.parquet.ParquetTypeUtils.lookupColumnByName) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) SchemaTableName(com.facebook.presto.spi.SchemaTableName) MAP(com.facebook.presto.common.type.StandardTypes.MAP) Locale(java.util.Locale) ParquetTypeUtils.getDescriptors(com.facebook.presto.parquet.ParquetTypeUtils.getDescriptors) Path(org.apache.hadoop.fs.Path) EncryptionInformation(com.facebook.presto.hive.EncryptionInformation) BIGINT(com.facebook.presto.common.type.StandardTypes.BIGINT) 
HiveSessionProperties.isParquetBatchReadsEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReadsEnabled) ImmutableSet(com.google.common.collect.ImmutableSet) GroupType(org.apache.parquet.schema.GroupType) ImmutableMap(com.google.common.collect.ImmutableMap) Collectors(java.util.stream.Collectors) ColumnIOConverter.findNestedColumnIO(org.apache.parquet.io.ColumnIOConverter.findNestedColumnIO) FileNotFoundException(java.io.FileNotFoundException) String.format(java.lang.String.format) ColumnIndexFilterUtils(com.facebook.presto.parquet.reader.ColumnIndexFilterUtils) VARCHAR(com.facebook.presto.common.type.StandardTypes.VARCHAR) MessageType(org.apache.parquet.schema.MessageType) DataSize(io.airlift.units.DataSize) List(java.util.List) HiveSessionProperties.isParquetBatchReaderVerificationEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReaderVerificationEnabled) Entry(java.util.Map.Entry) Optional(java.util.Optional) TIMESTAMP(com.facebook.presto.common.type.StandardTypes.TIMESTAMP) HiveColumnHandle(com.facebook.presto.hive.HiveColumnHandle) SMALLINT(com.facebook.presto.common.type.StandardTypes.SMALLINT) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) Strings.nullToEmpty(com.google.common.base.Strings.nullToEmpty) PredicateUtils.predicateMatches(com.facebook.presto.parquet.predicate.PredicateUtils.predicateMatches) PrestoException(com.facebook.presto.spi.PrestoException) HIVE_CANNOT_OPEN_SPLIT(com.facebook.presto.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) HIVE_MISSING_DATA(com.facebook.presto.hive.HiveErrorCode.HIVE_MISSING_DATA) ParquetTypeUtils.getSubfieldType(com.facebook.presto.parquet.ParquetTypeUtils.getSubfieldType) Inject(javax.inject.Inject) ParquetTypeUtils.getParquetTypeByName(com.facebook.presto.parquet.ParquetTypeUtils.getParquetTypeByName) DECIMAL(com.facebook.presto.common.type.StandardTypes.DECIMAL) Subfield(com.facebook.presto.common.Subfield) ImmutableList(com.google.common.collect.ImmutableList) 
TypeManager(com.facebook.presto.common.type.TypeManager) Objects.requireNonNull(java.util.Objects.requireNonNull) Predicate(com.facebook.presto.parquet.predicate.Predicate) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) PredicateUtils.buildPredicate(com.facebook.presto.parquet.predicate.PredicateUtils.buildPredicate) Type(com.facebook.presto.common.type.Type) VARBINARY(com.facebook.presto.common.type.StandardTypes.VARBINARY) Storage(com.facebook.presto.hive.metastore.Storage) ParquetTypeUtils.getColumnIO(com.facebook.presto.parquet.ParquetTypeUtils.getColumnIO) PRIMITIVE(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category.PRIMITIVE) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) AggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext) PERMISSION_DENIED(com.facebook.presto.spi.StandardErrorCode.PERMISSION_DENIED) Field(com.facebook.presto.parquet.Field) ConnectorPageSource(com.facebook.presto.spi.ConnectorPageSource) AccessControlException(org.apache.hadoop.security.AccessControlException) SYNTHESIZED(com.facebook.presto.hive.HiveColumnHandle.ColumnType.SYNTHESIZED) HiveSessionProperties.columnIndexFilterEnabled(com.facebook.presto.hive.HiveSessionProperties.columnIndexFilterEnabled) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ImmutableList(com.google.common.collect.ImmutableList) RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) ArrayList(java.util.ArrayList) FileNotFoundException(java.io.FileNotFoundException) PrestoException(com.facebook.presto.spi.PrestoException) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) Predicate(com.facebook.presto.parquet.predicate.Predicate) PredicateUtils.buildPredicate(com.facebook.presto.parquet.predicate.PredicateUtils.buildPredicate) 
ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) MessageType(org.apache.parquet.schema.MessageType) HiveColumnHandle(com.facebook.presto.hive.HiveColumnHandle) ParquetTypeUtils.columnPathFromSubfield(com.facebook.presto.parquet.ParquetTypeUtils.columnPathFromSubfield) HiveColumnHandle.getPushedDownSubfield(com.facebook.presto.hive.HiveColumnHandle.getPushedDownSubfield) HiveColumnHandle.isPushedDownSubfield(com.facebook.presto.hive.HiveColumnHandle.isPushedDownSubfield) Subfield(com.facebook.presto.common.Subfield) ParquetDataSource(com.facebook.presto.parquet.ParquetDataSource) HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) Optional(java.util.Optional) RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetReader(com.facebook.presto.parquet.reader.ParquetReader) AccessControlException(org.apache.hadoop.security.AccessControlException) IOException(java.io.IOException) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) AggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext) ColumnIO(org.apache.parquet.io.ColumnIO) ColumnIOConverter.findNestedColumnIO(org.apache.parquet.io.ColumnIOConverter.findNestedColumnIO) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ParquetTypeUtils.getColumnIO(com.facebook.presto.parquet.ParquetTypeUtils.getColumnIO) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) IOException(java.io.IOException) 
FileNotFoundException(java.io.FileNotFoundException) PrestoException(com.facebook.presto.spi.PrestoException) AccessControlException(org.apache.hadoop.security.AccessControlException) RowType(com.facebook.presto.common.type.RowType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) ParquetTypeUtils.getSubfieldType(com.facebook.presto.parquet.ParquetTypeUtils.getSubfieldType) Type(com.facebook.presto.common.type.Type) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream)

Aggregations

Domain (com.facebook.presto.common.predicate.Domain)3 TupleDomain (com.facebook.presto.common.predicate.TupleDomain)3 Type (com.facebook.presto.common.type.Type)3 TypeManager (com.facebook.presto.common.type.TypeManager)3 FileFormatDataSourceStats (com.facebook.presto.hive.FileFormatDataSourceStats)3 HdfsEnvironment (com.facebook.presto.hive.HdfsEnvironment)3 ParquetReader (com.facebook.presto.parquet.reader.ParquetReader)3 ArrayList (java.util.ArrayList)3 RuntimeStats (com.facebook.presto.common.RuntimeStats)2 Block (com.facebook.presto.common.block.Block)2 HdfsContext (com.facebook.presto.hive.HdfsContext)2 HdfsParquetDataSource.buildHdfsParquetDataSource (com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource)2 ParquetPageSource (com.facebook.presto.hive.parquet.ParquetPageSource)2 AggregatedMemoryContext (com.facebook.presto.memory.context.AggregatedMemoryContext)2 AggregatedMemoryContext.newSimpleAggregatedMemoryContext (com.facebook.presto.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext)2 Field (com.facebook.presto.parquet.Field)2 ParquetCorruptionException (com.facebook.presto.parquet.ParquetCorruptionException)2 ParquetDataSource (com.facebook.presto.parquet.ParquetDataSource)2 ParquetTypeUtils.getColumnIO (com.facebook.presto.parquet.ParquetTypeUtils.getColumnIO)2 ParquetTypeUtils.getDescriptors (com.facebook.presto.parquet.ParquetTypeUtils.getDescriptors)2