Search in sources :

Example 1 with ParquetPredicate

use of com.facebook.presto.hive.parquet.predicate.ParquetPredicate in project presto by prestodb.

the class ParquetHiveRecordCursor method createParquetRecordReader.

private ParquetRecordReader<FakeParquetRecord> createParquetRecordReader(HdfsEnvironment hdfsEnvironment, String sessionUser, Configuration configuration, Path path, long start, long length, List<HiveColumnHandle> columns, boolean useParquetColumnNames, TypeManager typeManager, boolean predicatePushdownEnabled, TupleDomain<HiveColumnHandle> effectivePredicate) {
    ParquetDataSource dataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(sessionUser, path, configuration);
        dataSource = buildHdfsParquetDataSource(fileSystem, path, start, length);
        ParquetMetadata parquetMetadata = hdfsEnvironment.doAs(sessionUser, () -> ParquetFileReader.readFooter(configuration, path, NO_FILTER));
        List<BlockMetaData> blocks = parquetMetadata.getBlocks();
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        PrestoReadSupport readSupport = new PrestoReadSupport(useParquetColumnNames, columns, fileSchema);
        List<parquet.schema.Type> fields = columns.stream().filter(column -> column.getColumnType() == REGULAR).map(column -> getParquetType(column, fileSchema, useParquetColumnNames)).filter(Objects::nonNull).collect(toList());
        MessageType requestedSchema = new MessageType(fileSchema.getName(), fields);
        LongArrayList offsets = new LongArrayList(blocks.size());
        for (BlockMetaData block : blocks) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= start && firstDataPage < start + length) {
                if (predicatePushdownEnabled) {
                    ParquetPredicate parquetPredicate = buildParquetPredicate(columns, effectivePredicate, fileMetaData.getSchema(), typeManager);
                    if (predicateMatches(parquetPredicate, block, dataSource, requestedSchema, effectivePredicate)) {
                        offsets.add(block.getStartingPos());
                    }
                } else {
                    offsets.add(block.getStartingPos());
                }
            }
        }
        ParquetInputSplit split = new ParquetInputSplit(path, start, start + length, length, null, offsets.toLongArray());
        TaskAttemptContext taskContext = ContextUtil.newTaskAttemptContext(configuration, new TaskAttemptID());
        return hdfsEnvironment.doAs(sessionUser, () -> {
            ParquetRecordReader<FakeParquetRecord> realReader = new PrestoParquetRecordReader(readSupport);
            realReader.initialize(split, taskContext);
            return realReader;
        });
    } catch (Exception e) {
        Throwables.propagateIfInstanceOf(e, PrestoException.class);
        if (e instanceof InterruptedException) {
            Thread.currentThread().interrupt();
            throw Throwables.propagate(e);
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e.getClass().getSimpleName().equals("BlockMissingException")) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    } finally {
        if (dataSource != null) {
            try {
                dataSource.close();
            } catch (IOException ignored) {
            }
        }
    }
}
Also used : HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) Arrays(java.util.Arrays) Block(com.facebook.presto.spi.block.Block) TypeManager(com.facebook.presto.spi.type.TypeManager) FileSystem(org.apache.hadoop.fs.FileSystem) HIVE_CURSOR_ERROR(com.facebook.presto.hive.HiveErrorCode.HIVE_CURSOR_ERROR) LongArrayList(it.unimi.dsi.fastutil.longs.LongArrayList) Slices.wrappedBuffer(io.airlift.slice.Slices.wrappedBuffer) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) DecimalType(com.facebook.presto.spi.type.DecimalType) DecimalMetadata(parquet.schema.DecimalMetadata) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) BigInteger(java.math.BigInteger) PrimitiveType(parquet.schema.PrimitiveType) MAP_KEY_VALUE(parquet.schema.OriginalType.MAP_KEY_VALUE) Decimals(com.facebook.presto.spi.type.Decimals) ReadSupport(parquet.hadoop.api.ReadSupport) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) BlockBuilder(com.facebook.presto.spi.block.BlockBuilder) Math.min(java.lang.Math.min) Chars.trimSpacesAndTruncateToLength(com.facebook.presto.spi.type.Chars.trimSpacesAndTruncateToLength) Binary(parquet.io.api.Binary) String.format(java.lang.String.format) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) TupleDomain(com.facebook.presto.spi.predicate.TupleDomain) ROW(com.facebook.presto.spi.type.StandardTypes.ROW) RecordCursor(com.facebook.presto.spi.RecordCursor) List(java.util.List) ParquetPredicateUtils.buildParquetPredicate(com.facebook.presto.hive.parquet.predicate.ParquetPredicateUtils.buildParquetPredicate) DecimalType.createDecimalType(com.facebook.presto.spi.type.DecimalType.createDecimalType) NO_FILTER(parquet.format.converter.ParquetMetadataConverter.NO_FILTER) Optional(java.util.Optional) Math.max(java.lang.Math.max) Varchars.truncateToLength(com.facebook.presto.spi.type.Varchars.truncateToLength) HiveColumnHandle(com.facebook.presto.hive.HiveColumnHandle) RecordMaterializer(parquet.io.api.RecordMaterializer) Converter(parquet.io.api.Converter) Varchars.isVarcharType(com.facebook.presto.spi.type.Varchars.isVarcharType) HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) GroupConverter(parquet.io.api.GroupConverter) ParquetTypeUtils.getParquetType(com.facebook.presto.hive.parquet.ParquetTypeUtils.getParquetType) Slice(io.airlift.slice.Slice) ParquetFileReader(parquet.hadoop.ParquetFileReader) REGULAR(com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR) ParquetRecordReader(parquet.hadoop.ParquetRecordReader) PrestoException(com.facebook.presto.spi.PrestoException) PrimitiveConverter(parquet.io.api.PrimitiveConverter) HIVE_CANNOT_OPEN_SPLIT(com.facebook.presto.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) HIVE_MISSING_DATA(com.facebook.presto.hive.HiveErrorCode.HIVE_MISSING_DATA) MAP(com.facebook.presto.spi.type.StandardTypes.MAP) ParquetPredicate(com.facebook.presto.hive.parquet.predicate.ParquetPredicate) DecimalUtils(com.facebook.presto.hive.util.DecimalUtils) ARRAY(com.facebook.presto.spi.type.StandardTypes.ARRAY) Float.floatToRawIntBits(java.lang.Float.floatToRawIntBits) ImmutableList(com.google.common.collect.ImmutableList) HiveUtil.closeWithSuppression(com.facebook.presto.hive.HiveUtil.closeWithSuppression) Type(com.facebook.presto.spi.type.Type) ParquetMetadata(parquet.hadoop.metadata.ParquetMetadata) Objects.requireNonNull(java.util.Objects.requireNonNull) DECIMAL(parquet.schema.OriginalType.DECIMAL) BlockBuilderStatus(com.facebook.presto.spi.block.BlockBuilderStatus) Dictionary(parquet.column.Dictionary) TIMESTAMP(com.facebook.presto.spi.type.TimestampType.TIMESTAMP) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) MessageType(parquet.schema.MessageType) Properties(java.util.Properties) ParquetPredicateUtils.predicateMatches(com.facebook.presto.hive.parquet.predicate.ParquetPredicateUtils.predicateMatches) HiveUtil.getDecimalType(com.facebook.presto.hive.HiveUtil.getDecimalType) ContextUtil(parquet.hadoop.util.ContextUtil) Throwables(com.google.common.base.Throwables) IOException(java.io.IOException) FileMetaData(parquet.hadoop.metadata.FileMetaData) BlockMetaData(parquet.hadoop.metadata.BlockMetaData) Collectors.toList(java.util.stream.Collectors.toList) GroupType(parquet.schema.GroupType) Chars.isCharType(com.facebook.presto.spi.type.Chars.isCharType) ParquetInputSplit(parquet.hadoop.ParquetInputSplit) BlockMetaData(parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(parquet.hadoop.metadata.ParquetMetadata) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) PrestoException(com.facebook.presto.spi.PrestoException) FileSystem(org.apache.hadoop.fs.FileSystem) FileMetaData(parquet.hadoop.metadata.FileMetaData) MessageType(parquet.schema.MessageType) ParquetPredicateUtils.buildParquetPredicate(com.facebook.presto.hive.parquet.predicate.ParquetPredicateUtils.buildParquetPredicate) ParquetPredicate(com.facebook.presto.hive.parquet.predicate.ParquetPredicate) HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) LongArrayList(it.unimi.dsi.fastutil.longs.LongArrayList) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) IOException(java.io.IOException) PrestoException(com.facebook.presto.spi.PrestoException) IOException(java.io.IOException) DecimalType(com.facebook.presto.spi.type.DecimalType) PrimitiveType(parquet.schema.PrimitiveType) DecimalType.createDecimalType(com.facebook.presto.spi.type.DecimalType.createDecimalType) Varchars.isVarcharType(com.facebook.presto.spi.type.Varchars.isVarcharType) ParquetTypeUtils.getParquetType(com.facebook.presto.hive.parquet.ParquetTypeUtils.getParquetType) Type(com.facebook.presto.spi.type.Type) MessageType(parquet.schema.MessageType) HiveUtil.getDecimalType(com.facebook.presto.hive.HiveUtil.getDecimalType) GroupType(parquet.schema.GroupType) Chars.isCharType(com.facebook.presto.spi.type.Chars.isCharType) ParquetInputSplit(parquet.hadoop.ParquetInputSplit)

Example 2 with ParquetPredicate

use of com.facebook.presto.hive.parquet.predicate.ParquetPredicate in project presto by prestodb.

the class ParquetPageSourceFactory method createParquetPageSource.

public static ParquetPageSource createParquetPageSource(HdfsEnvironment hdfsEnvironment, String user, Configuration configuration, Path path, long start, long length, Properties schema, List<HiveColumnHandle> columns, boolean useParquetColumnNames, TypeManager typeManager, boolean predicatePushdownEnabled, TupleDomain<HiveColumnHandle> effectivePredicate) {
    AggregatedMemoryContext systemMemoryContext = new AggregatedMemoryContext();
    ParquetDataSource dataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path, configuration);
        dataSource = buildHdfsParquetDataSource(fileSystem, path, start, length);
        ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, path);
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        List<parquet.schema.Type> fields = columns.stream().filter(column -> column.getColumnType() == REGULAR).map(column -> getParquetType(column, fileSchema, useParquetColumnNames)).filter(Objects::nonNull).collect(toList());
        MessageType requestedSchema = new MessageType(fileSchema.getName(), fields);
        List<BlockMetaData> blocks = new ArrayList<>();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= start && firstDataPage < start + length) {
                blocks.add(block);
            }
        }
        if (predicatePushdownEnabled) {
            ParquetPredicate parquetPredicate = buildParquetPredicate(columns, effectivePredicate, fileMetaData.getSchema(), typeManager);
            final ParquetDataSource finalDataSource = dataSource;
            blocks = blocks.stream().filter(block -> predicateMatches(parquetPredicate, block, finalDataSource, requestedSchema, effectivePredicate)).collect(toList());
        }
        ParquetReader parquetReader = new ParquetReader(fileSchema, requestedSchema, blocks, dataSource, typeManager, systemMemoryContext);
        return new ParquetPageSource(parquetReader, dataSource, fileSchema, requestedSchema, length, schema, columns, effectivePredicate, typeManager, useParquetColumnNames, systemMemoryContext);
    } catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        } catch (IOException ignored) {
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e.getClass().getSimpleName().equals("BlockMissingException")) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) DateTimeZone(org.joda.time.DateTimeZone) ParquetTypeUtils.getParquetType(com.facebook.presto.hive.parquet.ParquetTypeUtils.getParquetType) TypeManager(com.facebook.presto.spi.type.TypeManager) FileSystem(org.apache.hadoop.fs.FileSystem) REGULAR(com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR) PrestoException(com.facebook.presto.spi.PrestoException) HIVE_CANNOT_OPEN_SPLIT(com.facebook.presto.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) HIVE_MISSING_DATA(com.facebook.presto.hive.HiveErrorCode.HIVE_MISSING_DATA) HiveSessionProperties.isParquetOptimizedReaderEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetOptimizedReaderEnabled) ArrayList(java.util.ArrayList) ParquetReader(com.facebook.presto.hive.parquet.reader.ParquetReader) Inject(javax.inject.Inject) ParquetPredicate(com.facebook.presto.hive.parquet.predicate.ParquetPredicate) Configuration(org.apache.hadoop.conf.Configuration) ParquetMetadata(parquet.hadoop.metadata.ParquetMetadata) Objects.requireNonNull(java.util.Objects.requireNonNull) Path(org.apache.hadoop.fs.Path) ParquetMetadataReader(com.facebook.presto.hive.parquet.reader.ParquetMetadataReader) HiveClientConfig(com.facebook.presto.hive.HiveClientConfig) HiveSessionProperties.isParquetPredicatePushdownEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetPredicatePushdownEnabled) ImmutableSet(com.google.common.collect.ImmutableSet) MessageType(parquet.schema.MessageType) Properties(java.util.Properties) ParquetPredicateUtils.predicateMatches(com.facebook.presto.hive.parquet.predicate.ParquetPredicateUtils.predicateMatches) Set(java.util.Set) IOException(java.io.IOException) AggregatedMemoryContext(com.facebook.presto.hive.parquet.memory.AggregatedMemoryContext) FileMetaData(parquet.hadoop.metadata.FileMetaData) BlockMetaData(parquet.hadoop.metadata.BlockMetaData) HivePageSourceFactory(com.facebook.presto.hive.HivePageSourceFactory) String.format(java.lang.String.format) ConnectorSession(com.facebook.presto.spi.ConnectorSession) Objects(java.util.Objects) TupleDomain(com.facebook.presto.spi.predicate.TupleDomain) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) ConnectorPageSource(com.facebook.presto.spi.ConnectorPageSource) ParquetPredicateUtils.buildParquetPredicate(com.facebook.presto.hive.parquet.predicate.ParquetPredicateUtils.buildParquetPredicate) Optional(java.util.Optional) HiveColumnHandle(com.facebook.presto.hive.HiveColumnHandle) HiveUtil.getDeserializerClassName(com.facebook.presto.hive.HiveUtil.getDeserializerClassName) HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) BlockMetaData(parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(parquet.hadoop.metadata.ParquetMetadata) ArrayList(java.util.ArrayList) ParquetReader(com.facebook.presto.hive.parquet.reader.ParquetReader) PrestoException(com.facebook.presto.spi.PrestoException) IOException(java.io.IOException) AggregatedMemoryContext(com.facebook.presto.hive.parquet.memory.AggregatedMemoryContext) PrestoException(com.facebook.presto.spi.PrestoException) IOException(java.io.IOException) ParquetTypeUtils.getParquetType(com.facebook.presto.hive.parquet.ParquetTypeUtils.getParquetType) MessageType(parquet.schema.MessageType) FileSystem(org.apache.hadoop.fs.FileSystem) FileMetaData(parquet.hadoop.metadata.FileMetaData) MessageType(parquet.schema.MessageType) ParquetPredicate(com.facebook.presto.hive.parquet.predicate.ParquetPredicate) ParquetPredicateUtils.buildParquetPredicate(com.facebook.presto.hive.parquet.predicate.ParquetPredicateUtils.buildParquetPredicate)

Aggregations

HdfsEnvironment (com.facebook.presto.hive.HdfsEnvironment)2 HiveColumnHandle (com.facebook.presto.hive.HiveColumnHandle)2 REGULAR (com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR)2 HIVE_CANNOT_OPEN_SPLIT (com.facebook.presto.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT)2 HIVE_MISSING_DATA (com.facebook.presto.hive.HiveErrorCode.HIVE_MISSING_DATA)2 HdfsParquetDataSource.buildHdfsParquetDataSource (com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource)2 ParquetTypeUtils.getParquetType (com.facebook.presto.hive.parquet.ParquetTypeUtils.getParquetType)2 ParquetPredicate (com.facebook.presto.hive.parquet.predicate.ParquetPredicate)2 ParquetPredicateUtils.buildParquetPredicate (com.facebook.presto.hive.parquet.predicate.ParquetPredicateUtils.buildParquetPredicate)2 ParquetPredicateUtils.predicateMatches (com.facebook.presto.hive.parquet.predicate.ParquetPredicateUtils.predicateMatches)2 PrestoException (com.facebook.presto.spi.PrestoException)2 TupleDomain (com.facebook.presto.spi.predicate.TupleDomain)2 TypeManager (com.facebook.presto.spi.type.TypeManager)2 IOException (java.io.IOException)2 String.format (java.lang.String.format)2 List (java.util.List)2 Objects (java.util.Objects)2 Objects.requireNonNull (java.util.Objects.requireNonNull)2 Optional (java.util.Optional)2 Properties (java.util.Properties)2