Search in sources :

Example 1 with HiveFileContext

use of com.facebook.presto.hive.HiveFileContext in project presto by prestodb.

the class IcebergPageSourceProvider method createParquetPageSource.

private static ConnectorPageSource createParquetPageSource(HdfsEnvironment hdfsEnvironment, String user, Configuration configuration, Path path, long start, long length, SchemaTableName tableName, List<IcebergColumnHandle> regularColumns, boolean useParquetColumnNames, DataSize maxReadBlockSize, boolean batchReaderEnabled, boolean verificationEnabled, TupleDomain<IcebergColumnHandle> effectivePredicate, FileFormatDataSourceStats fileFormatDataSourceStats, boolean columnIndexFilterEnabled) {
    AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext();
    ParquetDataSource dataSource = null;
    try {
        ExtendedFileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path, configuration);
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        long fileSize = fileStatus.getLen();
        long modificationTime = fileStatus.getModificationTime();
        HiveFileContext hiveFileContext = new HiveFileContext(true, NO_CACHE_CONSTRAINTS, Optional.empty(), Optional.of(fileSize), modificationTime, false);
        FSDataInputStream inputStream = fileSystem.openFile(path, hiveFileContext);
        dataSource = buildHdfsParquetDataSource(inputStream, path, fileFormatDataSourceStats);
        ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource, fileSize).getParquetMetadata();
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        // Mapping from Iceberg field ID to Parquet fields.
        Map<Integer, org.apache.parquet.schema.Type> parquetIdToField = fileSchema.getFields().stream().filter(field -> field.getId() != null).collect(toImmutableMap(field -> field.getId().intValue(), Function.identity()));
        List<org.apache.parquet.schema.Type> parquetFields = regularColumns.stream().map(column -> {
            if (parquetIdToField.isEmpty()) {
                // This is a migrated table
                return getParquetTypeByName(column.getName(), fileSchema);
            }
            return parquetIdToField.get(column.getId());
        }).collect(toList());
        // TODO: support subfield pushdown
        MessageType requestedSchema = new MessageType(fileSchema.getName(), parquetFields.stream().filter(Objects::nonNull).collect(toImmutableList()));
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath);
        final ParquetDataSource finalDataSource = dataSource;
        List<BlockMetaData> blocks = new ArrayList<>();
        List<ColumnIndexStore> blockIndexStores = new ArrayList<>();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            Optional<ColumnIndexStore> columnIndexStore = ColumnIndexFilterUtils.getColumnIndexStore(parquetPredicate, finalDataSource, block, descriptorsByPath, columnIndexFilterEnabled);
            if ((firstDataPage >= start) && (firstDataPage < (start + length)) && predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain, columnIndexStore, columnIndexFilterEnabled)) {
                blocks.add(block);
                blockIndexStores.add(columnIndexStore.orElse(null));
            }
        }
        MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
        ParquetReader parquetReader = new ParquetReader(messageColumnIO, blocks, dataSource, systemMemoryContext, maxReadBlockSize, batchReaderEnabled, verificationEnabled, parquetPredicate, blockIndexStores, columnIndexFilterEnabled);
        ImmutableList.Builder<String> namesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Type> prestoTypes = ImmutableList.builder();
        ImmutableList.Builder<Optional<Field>> internalFields = ImmutableList.builder();
        for (int columnIndex = 0; columnIndex < regularColumns.size(); columnIndex++) {
            IcebergColumnHandle column = regularColumns.get(columnIndex);
            namesBuilder.add(column.getName());
            org.apache.parquet.schema.Type parquetField = parquetFields.get(columnIndex);
            Type prestoType = column.getType();
            prestoTypes.add(prestoType);
            if (parquetField == null) {
                internalFields.add(Optional.empty());
            } else {
                internalFields.add(constructField(column.getType(), messageColumnIO.getChild(parquetField.getName())));
            }
        }
        return new ParquetPageSource(parquetReader, prestoTypes.build(), internalFields.build(), namesBuilder.build(), new RuntimeStats());
    } catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        } catch (IOException ignored) {
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof ParquetCorruptionException) {
            throw new PrestoException(ICEBERG_BAD_DATA, message, e);
        }
        if (e instanceof BlockMissingException) {
            throw new PrestoException(ICEBERG_MISSING_DATA, message, e);
        }
        throw new PrestoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) HiveSessionProperties.isUseParquetColumnNames(com.facebook.presto.hive.HiveSessionProperties.isUseParquetColumnNames) Maps.uniqueIndex(com.google.common.collect.Maps.uniqueIndex) FileStatus(org.apache.hadoop.fs.FileStatus) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ConnectorTransactionHandle(com.facebook.presto.spi.connector.ConnectorTransactionHandle) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) OrcDataSource(com.facebook.presto.orc.OrcDataSource) FileFormatDataSourceStats(com.facebook.presto.hive.FileFormatDataSourceStats) ConnectorPageSourceProvider(com.facebook.presto.spi.connector.ConnectorPageSourceProvider) ENGLISH(java.util.Locale.ENGLISH) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ParquetDataSource(com.facebook.presto.parquet.ParquetDataSource) ORC_ICEBERG_ID_KEY(com.facebook.presto.iceberg.TypeConverter.ORC_ICEBERG_ID_KEY) IcebergSessionProperties.getOrcLazyReadSmallRanges(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcLazyReadSmallRanges) ExtendedFileSystem(com.facebook.presto.hive.filesystem.ExtendedFileSystem) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HiveFileContext(com.facebook.presto.hive.HiveFileContext) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) ConnectorSession(com.facebook.presto.spi.ConnectorSession) ORC(com.facebook.presto.orc.OrcEncoding.ORC) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ICEBERG_BAD_DATA(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_BAD_DATA) ParquetPageSource(com.facebook.presto.hive.parquet.ParquetPageSource) HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) MetadataReader(com.facebook.presto.parquet.cache.MetadataReader) StandardTypes(com.facebook.presto.common.type.StandardTypes) REGULAR(com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR) DwrfKeyProvider(com.facebook.presto.orc.DwrfKeyProvider) TypeConverter.toHiveType(com.facebook.presto.iceberg.TypeConverter.toHiveType) OrcReaderOptions(com.facebook.presto.orc.OrcReaderOptions) IcebergSessionProperties.getOrcMaxReadBlockSize(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcMaxReadBlockSize) ArrayList(java.util.ArrayList) IcebergSessionProperties.getOrcTinyStripeThreshold(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcTinyStripeThreshold) ROOT_COLUMN_ID(com.facebook.presto.iceberg.IcebergOrcColumn.ROOT_COLUMN_ID) ICEBERG_MISSING_DATA(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_MISSING_DATA) DwrfEncryptionProvider(com.facebook.presto.orc.DwrfEncryptionProvider) OrcDataSourceId(com.facebook.presto.orc.OrcDataSourceId) IOException(java.io.IOException) UTC(org.joda.time.DateTimeZone.UTC) FileFormat(org.apache.iceberg.FileFormat) Domain(com.facebook.presto.common.predicate.Domain) ParquetReader(com.facebook.presto.parquet.reader.ParquetReader) ConnectorSplit(com.facebook.presto.spi.ConnectorSplit) HiveSessionProperties.getParquetMaxReadBlockSize(com.facebook.presto.hive.HiveSessionProperties.getParquetMaxReadBlockSize) ColumnHandle(com.facebook.presto.spi.ColumnHandle) IcebergSessionProperties.isOrcZstdJniDecompressionEnabled(com.facebook.presto.iceberg.IcebergSessionProperties.isOrcZstdJniDecompressionEnabled) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) OrcReader(com.facebook.presto.orc.OrcReader) ColumnIOConverter.constructField(org.apache.parquet.io.ColumnIOConverter.constructField) HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) HdfsOrcDataSource(com.facebook.presto.hive.orc.HdfsOrcDataSource) TupleDomainOrcPredicate(com.facebook.presto.orc.TupleDomainOrcPredicate) NO_CACHE_CONSTRAINTS(com.facebook.presto.hive.CacheQuota.NO_CACHE_CONSTRAINTS) IcebergSessionProperties.getOrcMaxBufferSize(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcMaxBufferSize) OrcBatchPageSource(com.facebook.presto.hive.orc.OrcBatchPageSource) SchemaTableName(com.facebook.presto.spi.SchemaTableName) SplitContext(com.facebook.presto.spi.SplitContext) ParquetTypeUtils.getDescriptors(com.facebook.presto.parquet.ParquetTypeUtils.getDescriptors) Path(org.apache.hadoop.fs.Path) EncryptionInformation(com.facebook.presto.hive.EncryptionInformation) RuntimeStats(com.facebook.presto.common.RuntimeStats) HdfsContext(com.facebook.presto.hive.HdfsContext) ProjectionBasedDwrfKeyProvider(com.facebook.presto.hive.orc.ProjectionBasedDwrfKeyProvider) HiveSessionProperties.isParquetBatchReadsEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReadsEnabled) HiveClientConfig(com.facebook.presto.hive.HiveClientConfig) StripeMetadataSourceFactory(com.facebook.presto.orc.StripeMetadataSourceFactory) ImmutableMap(com.google.common.collect.ImmutableMap) INITIAL_BATCH_SIZE(com.facebook.presto.orc.OrcReader.INITIAL_BATCH_SIZE) OrcPredicate(com.facebook.presto.orc.OrcPredicate) HiveDwrfEncryptionProvider(com.facebook.presto.hive.HiveDwrfEncryptionProvider) String.format(java.lang.String.format) IcebergSessionProperties.isOrcBloomFiltersEnabled(com.facebook.presto.iceberg.IcebergSessionProperties.isOrcBloomFiltersEnabled) ColumnIndexFilterUtils(com.facebook.presto.parquet.reader.ColumnIndexFilterUtils) Objects(java.util.Objects) MessageType(org.apache.parquet.schema.MessageType) DataSize(io.airlift.units.DataSize) List(java.util.List) HiveSessionProperties.isParquetBatchReaderVerificationEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReaderVerificationEnabled) NOT_SUPPORTED(com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED) HiveOrcAggregatedMemoryContext(com.facebook.presto.hive.HiveOrcAggregatedMemoryContext) Optional(java.util.Optional) HiveColumnHandle(com.facebook.presto.hive.HiveColumnHandle) OrcBatchRecordReader(com.facebook.presto.orc.OrcBatchRecordReader) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) IntStream(java.util.stream.IntStream) ConnectorTableLayoutHandle(com.facebook.presto.spi.ConnectorTableLayoutHandle) PredicateUtils.predicateMatches(com.facebook.presto.parquet.predicate.PredicateUtils.predicateMatches) PrestoException(com.facebook.presto.spi.PrestoException) Function(java.util.function.Function) Inject(javax.inject.Inject) ParquetTypeUtils.getParquetTypeByName(com.facebook.presto.parquet.ParquetTypeUtils.getParquetTypeByName) ImmutableList(com.google.common.collect.ImmutableList) ICEBERG_CANNOT_OPEN_SPLIT(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_CANNOT_OPEN_SPLIT) TypeManager(com.facebook.presto.common.type.TypeManager) Objects.requireNonNull(java.util.Objects.requireNonNull) Predicate(com.facebook.presto.parquet.predicate.Predicate) OrcType(com.facebook.presto.orc.metadata.OrcType) OrcFileTailSource(com.facebook.presto.orc.cache.OrcFileTailSource) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) PredicateUtils.buildPredicate(com.facebook.presto.parquet.predicate.PredicateUtils.buildPredicate) Type(com.facebook.presto.common.type.Type) IcebergSessionProperties.getOrcMaxMergeDistance(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcMaxMergeDistance) OrcAggregatedMemoryContext(com.facebook.presto.orc.OrcAggregatedMemoryContext) OrcEncoding(com.facebook.presto.orc.OrcEncoding) ParquetTypeUtils.getColumnIO(com.facebook.presto.parquet.ParquetTypeUtils.getColumnIO) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) AggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext) Field(com.facebook.presto.parquet.Field) Collectors.toList(java.util.stream.Collectors.toList) ConnectorPageSource(com.facebook.presto.spi.ConnectorPageSource) IcebergSessionProperties.getOrcStreamBufferSize(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcStreamBufferSize) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) FileStatus(org.apache.hadoop.fs.FileStatus) RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) ArrayList(java.util.ArrayList) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) TupleDomainOrcPredicate(com.facebook.presto.orc.TupleDomainOrcPredicate) OrcPredicate(com.facebook.presto.orc.OrcPredicate) Predicate(com.facebook.presto.parquet.predicate.Predicate) PredicateUtils.buildPredicate(com.facebook.presto.parquet.predicate.PredicateUtils.buildPredicate) HiveFileContext(com.facebook.presto.hive.HiveFileContext) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) Collectors.toList(java.util.stream.Collectors.toList) MessageType(org.apache.parquet.schema.MessageType) Optional(java.util.Optional) ParquetReader(com.facebook.presto.parquet.reader.ParquetReader) ParquetPageSource(com.facebook.presto.hive.parquet.ParquetPageSource) Objects(java.util.Objects) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList) RuntimeStats(com.facebook.presto.common.RuntimeStats) PrestoException(com.facebook.presto.spi.PrestoException) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) ParquetDataSource(com.facebook.presto.parquet.ParquetDataSource) HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) IOException(java.io.IOException) HiveOrcAggregatedMemoryContext(com.facebook.presto.hive.HiveOrcAggregatedMemoryContext) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) OrcAggregatedMemoryContext(com.facebook.presto.orc.OrcAggregatedMemoryContext) AggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) IOException(java.io.IOException) PrestoException(com.facebook.presto.spi.PrestoException) TypeConverter.toHiveType(com.facebook.presto.iceberg.TypeConverter.toHiveType) MessageType(org.apache.parquet.schema.MessageType) OrcType(com.facebook.presto.orc.metadata.OrcType) Type(com.facebook.presto.common.type.Type) ExtendedFileSystem(com.facebook.presto.hive.filesystem.ExtendedFileSystem)

Example 2 with HiveFileContext

use of com.facebook.presto.hive.HiveFileContext in project presto by prestodb.

the class RcFilePageSourceFactory method createPageSource.

@Override
public Optional<? extends ConnectorPageSource> createPageSource(Configuration configuration, ConnectorSession session, Path path, long start, long length, long fileSize, Storage storage, SchemaTableName tableName, Map<String, String> tableParameters, List<HiveColumnHandle> columns, TupleDomain<HiveColumnHandle> effectivePredicate, DateTimeZone hiveStorageTimeZone, HiveFileContext hiveFileContext, Optional<EncryptionInformation> encryptionInformation) {
    if (!columns.isEmpty() && columns.stream().allMatch(hiveColumnHandle -> hiveColumnHandle.getColumnType() == AGGREGATED)) {
        throw new UnsupportedOperationException("Partial aggregation pushdown only supported for ORC/Parquet files. " + "Table " + tableName.toString() + " has file (" + path.toString() + ") of format " + storage.getStorageFormat().getOutputFormat() + ". Set session property hive.pushdown_partial_aggregations_into_scan=false and execute query again");
    }
    RcFileEncoding rcFileEncoding;
    if (LazyBinaryColumnarSerDe.class.getName().equals(storage.getStorageFormat().getSerDe())) {
        rcFileEncoding = new BinaryRcFileEncoding();
    } else if (ColumnarSerDe.class.getName().equals(storage.getStorageFormat().getSerDe())) {
        rcFileEncoding = createTextVectorEncoding(getHiveSchema(storage.getSerdeParameters(), tableParameters), hiveStorageTimeZone);
    } else {
        return Optional.empty();
    }
    if (fileSize == 0) {
        throw new PrestoException(HIVE_BAD_DATA, "RCFile is empty: " + path);
    }
    FSDataInputStream inputStream;
    try {
        inputStream = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration).openFile(path, hiveFileContext);
    } catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
    }
    try {
        ImmutableMap.Builder<Integer, Type> readColumns = ImmutableMap.builder();
        for (HiveColumnHandle column : columns) {
            readColumns.put(column.getHiveColumnIndex(), column.getHiveType().getType(typeManager));
        }
        RcFileReader rcFileReader = new RcFileReader(new HdfsRcFileDataSource(path.toString(), inputStream, fileSize, stats), rcFileEncoding, readColumns.build(), new AircompressorCodecFactory(new HadoopCodecFactory(configuration.getClassLoader())), start, length, new DataSize(8, Unit.MEGABYTE));
        return Optional.of(new RcFilePageSource(rcFileReader, columns, typeManager));
    } catch (Throwable e) {
        try {
            inputStream.close();
        } catch (IOException ignored) {
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        String message = splitError(e, path, start, length);
        if (e instanceof RcFileCorruptionException) {
            throw new PrestoException(HIVE_BAD_DATA, message, e);
        }
        if (e.getClass().getSimpleName().equals("BlockMissingException")) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) DateTimeZone(org.joda.time.DateTimeZone) Arrays(java.util.Arrays) SERIALIZATION_LAST_COLUMN_TAKES_REST(org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST) FIELD_DELIM(org.apache.hadoop.hive.serde.serdeConstants.FIELD_DELIM) SERIALIZATION_NULL_FORMAT(org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_NULL_FORMAT) LazyBinaryColumnarSerDe(org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe) Unit(io.airlift.units.DataSize.Unit) RcFileCorruptionException(com.facebook.presto.rcfile.RcFileCorruptionException) AGGREGATED(com.facebook.presto.hive.HiveColumnHandle.ColumnType.AGGREGATED) SchemaTableName(com.facebook.presto.spi.SchemaTableName) Slices(io.airlift.slice.Slices) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) EncryptionInformation(com.facebook.presto.hive.EncryptionInformation) FileFormatDataSourceStats(com.facebook.presto.hive.FileFormatDataSourceStats) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ESCAPE_CHAR(org.apache.hadoop.hive.serde.serdeConstants.ESCAPE_CHAR) ImmutableMap(com.google.common.collect.ImmutableMap) HiveFileContext(com.facebook.presto.hive.HiveFileContext) FileNotFoundException(java.io.FileNotFoundException) String.format(java.lang.String.format) ConnectorSession(com.facebook.presto.spi.ConnectorSession) ColumnarSerDe(org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe) DEFAULT_NULL_SEQUENCE(com.facebook.presto.rcfile.text.TextRcFileEncoding.DEFAULT_NULL_SEQUENCE) DataSize(io.airlift.units.DataSize) List(java.util.List) HiveBatchPageSourceFactory(com.facebook.presto.hive.HiveBatchPageSourceFactory) Optional(java.util.Optional) HiveColumnHandle(com.facebook.presto.hive.HiveColumnHandle) RcFileEncoding(com.facebook.presto.rcfile.RcFileEncoding) Slice(io.airlift.slice.Slice) Strings.nullToEmpty(com.google.common.base.Strings.nullToEmpty) LazyUtils.getByte(org.apache.hadoop.hive.serde2.lazy.LazyUtils.getByte) PrestoException(com.facebook.presto.spi.PrestoException) HIVE_CANNOT_OPEN_SPLIT(com.facebook.presto.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) HIVE_MISSING_DATA(com.facebook.presto.hive.HiveErrorCode.HIVE_MISSING_DATA) HadoopCodecFactory(com.facebook.presto.rcfile.HadoopCodecFactory) Inject(javax.inject.Inject) SERIALIZATION_EXTEND_NESTING_LEVELS(org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters.SERIALIZATION_EXTEND_NESTING_LEVELS) MAPKEY_DELIM(org.apache.hadoop.hive.serde.serdeConstants.MAPKEY_DELIM) SERIALIZATION_FORMAT(org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_FORMAT) TypeManager(com.facebook.presto.common.type.TypeManager) DEFAULT_SEPARATORS(com.facebook.presto.rcfile.text.TextRcFileEncoding.DEFAULT_SEPARATORS) Objects.requireNonNull(java.util.Objects.requireNonNull) COLLECTION_DELIM(org.apache.hadoop.hive.serde.serdeConstants.COLLECTION_DELIM) HIVE_BAD_DATA(com.facebook.presto.hive.HiveErrorCode.HIVE_BAD_DATA) Type(com.facebook.presto.common.type.Type) AircompressorCodecFactory(com.facebook.presto.rcfile.AircompressorCodecFactory) Storage(com.facebook.presto.hive.metastore.Storage) Properties(java.util.Properties) RcFileReader(com.facebook.presto.rcfile.RcFileReader) IOException(java.io.IOException) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) TextRcFileEncoding(com.facebook.presto.rcfile.text.TextRcFileEncoding) BinaryRcFileEncoding(com.facebook.presto.rcfile.binary.BinaryRcFileEncoding) ConnectorPageSource(com.facebook.presto.spi.ConnectorPageSource) MetastoreUtil.getHiveSchema(com.facebook.presto.hive.metastore.MetastoreUtil.getHiveSchema) FileNotFoundException(java.io.FileNotFoundException) PrestoException(com.facebook.presto.spi.PrestoException) RcFileEncoding(com.facebook.presto.rcfile.RcFileEncoding) TextRcFileEncoding(com.facebook.presto.rcfile.text.TextRcFileEncoding) BinaryRcFileEncoding(com.facebook.presto.rcfile.binary.BinaryRcFileEncoding) IOException(java.io.IOException) RcFileReader(com.facebook.presto.rcfile.RcFileReader) RcFileCorruptionException(com.facebook.presto.rcfile.RcFileCorruptionException) FileNotFoundException(java.io.FileNotFoundException) PrestoException(com.facebook.presto.spi.PrestoException) IOException(java.io.IOException) ImmutableMap(com.google.common.collect.ImmutableMap) Type(com.facebook.presto.common.type.Type) RcFileCorruptionException(com.facebook.presto.rcfile.RcFileCorruptionException) HadoopCodecFactory(com.facebook.presto.rcfile.HadoopCodecFactory) DataSize(io.airlift.units.DataSize) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) LazyBinaryColumnarSerDe(org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe) BinaryRcFileEncoding(com.facebook.presto.rcfile.binary.BinaryRcFileEncoding) AircompressorCodecFactory(com.facebook.presto.rcfile.AircompressorCodecFactory) HiveColumnHandle(com.facebook.presto.hive.HiveColumnHandle)

Example 3 with HiveFileContext

use of com.facebook.presto.hive.HiveFileContext in project presto by prestodb.

the class ParquetPageSourceFactory method createParquetPageSource.

public static ConnectorPageSource createParquetPageSource(HdfsEnvironment hdfsEnvironment, String user, Configuration configuration, Path path, long start, long length, long fileSize, List<HiveColumnHandle> columns, SchemaTableName tableName, boolean useParquetColumnNames, DataSize maxReadBlockSize, boolean batchReaderEnabled, boolean verificationEnabled, TypeManager typeManager, StandardFunctionResolution functionResolution, TupleDomain<HiveColumnHandle> effectivePredicate, FileFormatDataSourceStats stats, HiveFileContext hiveFileContext, ParquetMetadataSource parquetMetadataSource, boolean columnIndexFilterEnabled) {
    AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext();
    ParquetDataSource dataSource = null;
    try {
        FSDataInputStream inputStream = hdfsEnvironment.getFileSystem(user, path, configuration).openFile(path, hiveFileContext);
        dataSource = buildHdfsParquetDataSource(inputStream, path, stats);
        ParquetMetadata parquetMetadata = parquetMetadataSource.getParquetMetadata(dataSource, fileSize, hiveFileContext.isCacheable()).getParquetMetadata();
        if (!columns.isEmpty() && columns.stream().allMatch(hiveColumnHandle -> hiveColumnHandle.getColumnType() == AGGREGATED)) {
            return new AggregatedParquetPageSource(columns, parquetMetadata, typeManager, functionResolution);
        }
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        Optional<MessageType> message = columns.stream().filter(column -> column.getColumnType() == REGULAR || isPushedDownSubfield(column)).map(column -> getColumnType(typeManager.getType(column.getTypeSignature()), fileSchema, useParquetColumnNames, column, tableName, path)).filter(Optional::isPresent).map(Optional::get).map(type -> new MessageType(fileSchema.getName(), type)).reduce(MessageType::union);
        MessageType requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));
        ImmutableList.Builder<BlockMetaData> footerBlocks = ImmutableList.builder();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= start && firstDataPage < start + length) {
                footerBlocks.add(block);
            }
        }
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath);
        final ParquetDataSource finalDataSource = dataSource;
        ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
        List<ColumnIndexStore> blockIndexStores = new ArrayList<>();
        for (BlockMetaData block : footerBlocks.build()) {
            Optional<ColumnIndexStore> columnIndexStore = ColumnIndexFilterUtils.getColumnIndexStore(parquetPredicate, finalDataSource, block, descriptorsByPath, columnIndexFilterEnabled);
            if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain, columnIndexStore, columnIndexFilterEnabled)) {
                blocks.add(block);
                blockIndexStores.add(columnIndexStore.orElse(null));
                hiveFileContext.incrementCounter("parquet.blocksRead", 1);
                hiveFileContext.incrementCounter("parquet.rowsRead", block.getRowCount());
                hiveFileContext.incrementCounter("parquet.totalBytesRead", block.getTotalByteSize());
            } else {
                hiveFileContext.incrementCounter("parquet.blocksSkipped", 1);
                hiveFileContext.incrementCounter("parquet.rowsSkipped", block.getRowCount());
                hiveFileContext.incrementCounter("parquet.totalBytesSkipped", block.getTotalByteSize());
            }
        }
        MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
        ParquetReader parquetReader = new ParquetReader(messageColumnIO, blocks.build(), dataSource, systemMemoryContext, maxReadBlockSize, batchReaderEnabled, verificationEnabled, parquetPredicate, blockIndexStores, columnIndexFilterEnabled);
        ImmutableList.Builder<String> namesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Optional<Field>> fieldsBuilder = ImmutableList.builder();
        for (HiveColumnHandle column : columns) {
            checkArgument(column.getColumnType() == REGULAR || column.getColumnType() == SYNTHESIZED, "column type must be regular or synthesized column");
            String name = column.getName();
            Type type = typeManager.getType(column.getTypeSignature());
            namesBuilder.add(name);
            typesBuilder.add(type);
            if (column.getColumnType() == SYNTHESIZED) {
                Subfield pushedDownSubfield = getPushedDownSubfield(column);
                List<String> nestedColumnPath = nestedColumnPath(pushedDownSubfield);
                Optional<ColumnIO> columnIO = findNestedColumnIO(lookupColumnByName(messageColumnIO, pushedDownSubfield.getRootName()), nestedColumnPath);
                if (columnIO.isPresent()) {
                    fieldsBuilder.add(constructField(type, columnIO.get()));
                } else {
                    fieldsBuilder.add(Optional.empty());
                }
            } else if (getParquetType(type, fileSchema, useParquetColumnNames, column, tableName, path).isPresent()) {
                String columnName = useParquetColumnNames ? name : fileSchema.getFields().get(column.getHiveColumnIndex()).getName();
                fieldsBuilder.add(constructField(type, lookupColumnByName(messageColumnIO, columnName)));
            } else {
                fieldsBuilder.add(Optional.empty());
            }
        }
        return new ParquetPageSource(parquetReader, typesBuilder.build(), fieldsBuilder.build(), namesBuilder.build(), hiveFileContext.getStats());
    } catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        } catch (IOException ignored) {
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        if (e instanceof ParquetCorruptionException) {
            throw new PrestoException(HIVE_BAD_DATA, e);
        }
        if (e instanceof AccessControlException) {
            throw new PrestoException(PERMISSION_DENIED, e.getMessage(), e);
        }
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e.getClass().getSimpleName().equals("BlockMissingException")) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) DateTimeZone(org.joda.time.DateTimeZone) TINYINT(com.facebook.presto.common.type.StandardTypes.TINYINT) HIVE_PARTITION_SCHEMA_MISMATCH(com.facebook.presto.hive.HiveErrorCode.HIVE_PARTITION_SCHEMA_MISMATCH) HiveSessionProperties.isUseParquetColumnNames(com.facebook.presto.hive.HiveSessionProperties.isUseParquetColumnNames) ROW(com.facebook.presto.common.type.StandardTypes.ROW) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) AGGREGATED(com.facebook.presto.hive.HiveColumnHandle.ColumnType.AGGREGATED) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) FileFormatDataSourceStats(com.facebook.presto.hive.FileFormatDataSourceStats) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ParquetDataSource(com.facebook.presto.parquet.ParquetDataSource) ParquetMetadataSource(com.facebook.presto.parquet.cache.ParquetMetadataSource) Set(java.util.Set) HiveFileContext(com.facebook.presto.hive.HiveFileContext) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) ARRAY(com.facebook.presto.common.type.StandardTypes.ARRAY) ConnectorSession(com.facebook.presto.spi.ConnectorSession) CHAR(com.facebook.presto.common.type.StandardTypes.CHAR) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetTypeUtils.columnPathFromSubfield(com.facebook.presto.parquet.ParquetTypeUtils.columnPathFromSubfield) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) HiveBatchPageSourceFactory(com.facebook.presto.hive.HiveBatchPageSourceFactory) ColumnIO(org.apache.parquet.io.ColumnIO) HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) StandardTypes(com.facebook.presto.common.type.StandardTypes) REGULAR(com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR) ArrayList(java.util.ArrayList) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName) HIVE_BAD_DATA(com.facebook.presto.hive.HiveErrorCode.HIVE_BAD_DATA) DATE(com.facebook.presto.common.type.StandardTypes.DATE) IOException(java.io.IOException) ParquetTypeUtils.nestedColumnPath(com.facebook.presto.parquet.ParquetTypeUtils.nestedColumnPath) Domain(com.facebook.presto.common.predicate.Domain) INTEGER(com.facebook.presto.common.type.StandardTypes.INTEGER) HiveColumnHandle.getPushedDownSubfield(com.facebook.presto.hive.HiveColumnHandle.getPushedDownSubfield) ParquetReader(com.facebook.presto.parquet.reader.ParquetReader) HiveSessionProperties.getParquetMaxReadBlockSize(com.facebook.presto.hive.HiveSessionProperties.getParquetMaxReadBlockSize) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) REAL(com.facebook.presto.common.type.StandardTypes.REAL) RowType(com.facebook.presto.common.type.RowType) ColumnIOConverter.constructField(org.apache.parquet.io.ColumnIOConverter.constructField) HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) HiveColumnHandle.isPushedDownSubfield(com.facebook.presto.hive.HiveColumnHandle.isPushedDownSubfield) StandardFunctionResolution(com.facebook.presto.spi.function.StandardFunctionResolution) ParquetTypeUtils.lookupColumnByName(com.facebook.presto.parquet.ParquetTypeUtils.lookupColumnByName) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) SchemaTableName(com.facebook.presto.spi.SchemaTableName) MAP(com.facebook.presto.common.type.StandardTypes.MAP) Locale(java.util.Locale) ParquetTypeUtils.getDescriptors(com.facebook.presto.parquet.ParquetTypeUtils.getDescriptors) Path(org.apache.hadoop.fs.Path) EncryptionInformation(com.facebook.presto.hive.EncryptionInformation) BIGINT(com.facebook.presto.common.type.StandardTypes.BIGINT) HiveSessionProperties.isParquetBatchReadsEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReadsEnabled) ImmutableSet(com.google.common.collect.ImmutableSet) GroupType(org.apache.parquet.schema.GroupType) ImmutableMap(com.google.common.collect.ImmutableMap) Collectors(java.util.stream.Collectors) ColumnIOConverter.findNestedColumnIO(org.apache.parquet.io.ColumnIOConverter.findNestedColumnIO) FileNotFoundException(java.io.FileNotFoundException) String.format(java.lang.String.format) ColumnIndexFilterUtils(com.facebook.presto.parquet.reader.ColumnIndexFilterUtils) VARCHAR(com.facebook.presto.common.type.StandardTypes.VARCHAR) MessageType(org.apache.parquet.schema.MessageType) DataSize(io.airlift.units.DataSize) List(java.util.List) HiveSessionProperties.isParquetBatchReaderVerificationEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReaderVerificationEnabled) Entry(java.util.Map.Entry) Optional(java.util.Optional) TIMESTAMP(com.facebook.presto.common.type.StandardTypes.TIMESTAMP) HiveColumnHandle(com.facebook.presto.hive.HiveColumnHandle) SMALLINT(com.facebook.presto.common.type.StandardTypes.SMALLINT) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) Strings.nullToEmpty(com.google.common.base.Strings.nullToEmpty) PredicateUtils.predicateMatches(com.facebook.presto.parquet.predicate.PredicateUtils.predicateMatches) PrestoException(com.facebook.presto.spi.PrestoException) HIVE_CANNOT_OPEN_SPLIT(com.facebook.presto.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) HIVE_MISSING_DATA(com.facebook.presto.hive.HiveErrorCode.HIVE_MISSING_DATA) ParquetTypeUtils.getSubfieldType(com.facebook.presto.parquet.ParquetTypeUtils.getSubfieldType) Inject(javax.inject.Inject) ParquetTypeUtils.getParquetTypeByName(com.facebook.presto.parquet.ParquetTypeUtils.getParquetTypeByName) DECIMAL(com.facebook.presto.common.type.StandardTypes.DECIMAL) Subfield(com.facebook.presto.common.Subfield) ImmutableList(com.google.common.collect.ImmutableList) TypeManager(com.facebook.presto.common.type.TypeManager) Objects.requireNonNull(java.util.Objects.requireNonNull) Predicate(com.facebook.presto.parquet.predicate.Predicate) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) PredicateUtils.buildPredicate(com.facebook.presto.parquet.predicate.PredicateUtils.buildPredicate) Type(com.facebook.presto.common.type.Type) VARBINARY(com.facebook.presto.common.type.StandardTypes.VARBINARY) Storage(com.facebook.presto.hive.metastore.Storage) ParquetTypeUtils.getColumnIO(com.facebook.presto.parquet.ParquetTypeUtils.getColumnIO) PRIMITIVE(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category.PRIMITIVE) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) AggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext) PERMISSION_DENIED(com.facebook.presto.spi.StandardErrorCode.PERMISSION_DENIED) Field(com.facebook.presto.parquet.Field) ConnectorPageSource(com.facebook.presto.spi.ConnectorPageSource) AccessControlException(org.apache.hadoop.security.AccessControlException) SYNTHESIZED(com.facebook.presto.hive.HiveColumnHandle.ColumnType.SYNTHESIZED) HiveSessionProperties.columnIndexFilterEnabled(com.facebook.presto.hive.HiveSessionProperties.columnIndexFilterEnabled) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ImmutableList(com.google.common.collect.ImmutableList) RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) ArrayList(java.util.ArrayList) FileNotFoundException(java.io.FileNotFoundException) PrestoException(com.facebook.presto.spi.PrestoException) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) Predicate(com.facebook.presto.parquet.predicate.Predicate) PredicateUtils.buildPredicate(com.facebook.presto.parquet.predicate.PredicateUtils.buildPredicate) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) MessageType(org.apache.parquet.schema.MessageType) HiveColumnHandle(com.facebook.presto.hive.HiveColumnHandle) ParquetTypeUtils.columnPathFromSubfield(com.facebook.presto.parquet.ParquetTypeUtils.columnPathFromSubfield) HiveColumnHandle.getPushedDownSubfield(com.facebook.presto.hive.HiveColumnHandle.getPushedDownSubfield) HiveColumnHandle.isPushedDownSubfield(com.facebook.presto.hive.HiveColumnHandle.isPushedDownSubfield) Subfield(com.facebook.presto.common.Subfield) ParquetDataSource(com.facebook.presto.parquet.ParquetDataSource) HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) Optional(java.util.Optional) RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetReader(com.facebook.presto.parquet.reader.ParquetReader) AccessControlException(org.apache.hadoop.security.AccessControlException) IOException(java.io.IOException) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) AggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext) ColumnIO(org.apache.parquet.io.ColumnIO) ColumnIOConverter.findNestedColumnIO(org.apache.parquet.io.ColumnIOConverter.findNestedColumnIO) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ParquetTypeUtils.getColumnIO(com.facebook.presto.parquet.ParquetTypeUtils.getColumnIO) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) PrestoException(com.facebook.presto.spi.PrestoException) AccessControlException(org.apache.hadoop.security.AccessControlException) RowType(com.facebook.presto.common.type.RowType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) ParquetTypeUtils.getSubfieldType(com.facebook.presto.parquet.ParquetTypeUtils.getSubfieldType) Type(com.facebook.presto.common.type.Type) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream)

Example 4 with HiveFileContext

use of com.facebook.presto.hive.HiveFileContext in project presto by prestodb.

the class OrcBatchPageSourceFactory method createOrcPageSource.

public static ConnectorPageSource createOrcPageSource(OrcEncoding orcEncoding, HdfsEnvironment hdfsEnvironment, String sessionUser, Configuration configuration, Path path, long start, long length, long fileSize, List<HiveColumnHandle> columns, boolean useOrcColumnNames, TupleDomain<HiveColumnHandle> effectivePredicate, DateTimeZone hiveStorageTimeZone, TypeManager typeManager, StandardFunctionResolution functionResolution, DataSize maxBufferSize, DataSize streamBufferSize, boolean lazyReadSmallRanges, boolean orcBloomFiltersEnabled, FileFormatDataSourceStats stats, int domainCompactionThreshold, OrcFileTailSource orcFileTailSource, StripeMetadataSourceFactory stripeMetadataSourceFactory, HiveFileContext hiveFileContext, OrcReaderOptions orcReaderOptions, Optional<EncryptionInformation> encryptionInformation, DwrfEncryptionProvider dwrfEncryptionProvider) {
    checkArgument(domainCompactionThreshold >= 1, "domainCompactionThreshold must be at least 1");
    OrcDataSource orcDataSource;
    try {
        FSDataInputStream inputStream = hdfsEnvironment.getFileSystem(sessionUser, path, configuration).openFile(path, hiveFileContext);
        orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSize, orcReaderOptions.getMaxMergeDistance(), maxBufferSize, streamBufferSize, lazyReadSmallRanges, inputStream, stats);
    } catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
    }
    OrcAggregatedMemoryContext systemMemoryUsage = new HiveOrcAggregatedMemoryContext();
    try {
        DwrfKeyProvider dwrfKeyProvider = new ProjectionBasedDwrfKeyProvider(encryptionInformation, columns, useOrcColumnNames, path);
        OrcReader reader = new OrcReader(orcDataSource, orcEncoding, orcFileTailSource, stripeMetadataSourceFactory, new HiveOrcAggregatedMemoryContext(), orcReaderOptions, hiveFileContext.isCacheable(), dwrfEncryptionProvider, dwrfKeyProvider, hiveFileContext.getStats());
        List<HiveColumnHandle> physicalColumns = getPhysicalHiveColumnHandles(columns, useOrcColumnNames, reader.getTypes(), path);
        ImmutableMap.Builder<Integer, Type> includedColumns = ImmutableMap.builder();
        ImmutableList.Builder<ColumnReference<HiveColumnHandle>> columnReferences = ImmutableList.builder();
        for (HiveColumnHandle column : physicalColumns) {
            if (column.getColumnType() == REGULAR) {
                Type type = typeManager.getType(column.getTypeSignature());
                includedColumns.put(column.getHiveColumnIndex(), type);
                columnReferences.add(new ColumnReference<>(column, column.getHiveColumnIndex(), type));
            }
        }
        if (!physicalColumns.isEmpty() && physicalColumns.stream().allMatch(hiveColumnHandle -> hiveColumnHandle.getColumnType() == AGGREGATED)) {
            return new AggregatedOrcPageSource(physicalColumns, reader.getFooter(), typeManager, functionResolution);
        }
        OrcPredicate predicate = new TupleDomainOrcPredicate<>(effectivePredicate, columnReferences.build(), orcBloomFiltersEnabled, Optional.of(domainCompactionThreshold));
        OrcBatchRecordReader recordReader = reader.createBatchRecordReader(includedColumns.build(), predicate, start, length, hiveStorageTimeZone, systemMemoryUsage, INITIAL_BATCH_SIZE);
        return new OrcBatchPageSource(recordReader, reader.getOrcDataSource(), physicalColumns, typeManager, systemMemoryUsage, stats, hiveFileContext.getStats());
    } catch (Exception e) {
        try {
            orcDataSource.close();
        } catch (IOException ignored) {
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        String message = splitError(e, path, start, length);
        if (e.getClass().getSimpleName().equals("BlockMissingException")) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) DateTimeZone(org.joda.time.DateTimeZone) TupleDomainOrcPredicate(com.facebook.presto.orc.TupleDomainOrcPredicate) HiveSessionProperties.getOrcTinyStripeThreshold(com.facebook.presto.hive.HiveSessionProperties.getOrcTinyStripeThreshold) StandardFunctionResolution(com.facebook.presto.spi.function.StandardFunctionResolution) FixedPageSource(com.facebook.presto.spi.FixedPageSource) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) AGGREGATED(com.facebook.presto.hive.HiveColumnHandle.ColumnType.AGGREGATED) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) SchemaTableName(com.facebook.presto.spi.SchemaTableName) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HiveSessionProperties.getOrcLazyReadSmallRanges(com.facebook.presto.hive.HiveSessionProperties.getOrcLazyReadSmallRanges) Path(org.apache.hadoop.fs.Path) EncryptionInformation(com.facebook.presto.hive.EncryptionInformation) HiveSessionProperties.isOrcBloomFiltersEnabled(com.facebook.presto.hive.HiveSessionProperties.isOrcBloomFiltersEnabled) OrcDataSource(com.facebook.presto.orc.OrcDataSource) FileFormatDataSourceStats(com.facebook.presto.hive.FileFormatDataSourceStats) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) HiveClientConfig(com.facebook.presto.hive.HiveClientConfig) StripeMetadataSourceFactory(com.facebook.presto.orc.StripeMetadataSourceFactory) ImmutableMap(com.google.common.collect.ImmutableMap) HiveSessionProperties.isOrcZstdJniDecompressionEnabled(com.facebook.presto.hive.HiveSessionProperties.isOrcZstdJniDecompressionEnabled) INITIAL_BATCH_SIZE(com.facebook.presto.orc.OrcReader.INITIAL_BATCH_SIZE) OrcPredicate(com.facebook.presto.orc.OrcPredicate) HiveFileContext(com.facebook.presto.hive.HiveFileContext) FileNotFoundException(java.io.FileNotFoundException) String.format(java.lang.String.format) ConnectorSession(com.facebook.presto.spi.ConnectorSession) ORC(com.facebook.presto.orc.OrcEncoding.ORC) DataSize(io.airlift.units.DataSize) List(java.util.List) HiveOrcAggregatedMemoryContext(com.facebook.presto.hive.HiveOrcAggregatedMemoryContext) HiveBatchPageSourceFactory(com.facebook.presto.hive.HiveBatchPageSourceFactory) Optional(java.util.Optional) HiveColumnHandle(com.facebook.presto.hive.HiveColumnHandle) OrcBatchRecordReader(com.facebook.presto.orc.OrcBatchRecordReader) HiveSessionProperties.getOrcMaxMergeDistance(com.facebook.presto.hive.HiveSessionProperties.getOrcMaxMergeDistance) Strings.nullToEmpty(com.google.common.base.Strings.nullToEmpty) REGULAR(com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR) DwrfKeyProvider(com.facebook.presto.orc.DwrfKeyProvider) OrcReaderOptions(com.facebook.presto.orc.OrcReaderOptions) PrestoException(com.facebook.presto.spi.PrestoException) HIVE_CANNOT_OPEN_SPLIT(com.facebook.presto.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) HIVE_MISSING_DATA(com.facebook.presto.hive.HiveErrorCode.HIVE_MISSING_DATA) HiveSessionProperties.getOrcMaxReadBlockSize(com.facebook.presto.hive.HiveSessionProperties.getOrcMaxReadBlockSize) Inject(javax.inject.Inject) ImmutableList(com.google.common.collect.ImmutableList) TypeManager(com.facebook.presto.common.type.TypeManager) HiveSessionProperties.getOrcMaxBufferSize(com.facebook.presto.hive.HiveSessionProperties.getOrcMaxBufferSize) Objects.requireNonNull(java.util.Objects.requireNonNull) OrcFileTailSource(com.facebook.presto.orc.cache.OrcFileTailSource) Type(com.facebook.presto.common.type.Type) DwrfEncryptionProvider(com.facebook.presto.orc.DwrfEncryptionProvider) OrcDataSourceId(com.facebook.presto.orc.OrcDataSourceId) ColumnReference(com.facebook.presto.orc.TupleDomainOrcPredicate.ColumnReference) Storage(com.facebook.presto.hive.metastore.Storage) OrcAggregatedMemoryContext(com.facebook.presto.orc.OrcAggregatedMemoryContext) OrcEncoding(com.facebook.presto.orc.OrcEncoding) HiveSessionProperties.getOrcStreamBufferSize(com.facebook.presto.hive.HiveSessionProperties.getOrcStreamBufferSize) NO_ENCRYPTION(com.facebook.presto.orc.DwrfEncryptionProvider.NO_ENCRYPTION) IOException(java.io.IOException) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) ConnectorPageSource(com.facebook.presto.spi.ConnectorPageSource) HiveUtil.getPhysicalHiveColumnHandles(com.facebook.presto.hive.HiveUtil.getPhysicalHiveColumnHandles) OrcReader(com.facebook.presto.orc.OrcReader) TupleDomainOrcPredicate(com.facebook.presto.orc.TupleDomainOrcPredicate) ImmutableList(com.google.common.collect.ImmutableList) FileNotFoundException(java.io.FileNotFoundException) PrestoException(com.facebook.presto.spi.PrestoException) HiveOrcAggregatedMemoryContext(com.facebook.presto.hive.HiveOrcAggregatedMemoryContext) OrcAggregatedMemoryContext(com.facebook.presto.orc.OrcAggregatedMemoryContext) DwrfKeyProvider(com.facebook.presto.orc.DwrfKeyProvider) HiveColumnHandle(com.facebook.presto.hive.HiveColumnHandle) OrcDataSource(com.facebook.presto.orc.OrcDataSource) OrcDataSourceId(com.facebook.presto.orc.OrcDataSourceId) OrcBatchRecordReader(com.facebook.presto.orc.OrcBatchRecordReader) HiveOrcAggregatedMemoryContext(com.facebook.presto.hive.HiveOrcAggregatedMemoryContext) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) PrestoException(com.facebook.presto.spi.PrestoException) IOException(java.io.IOException) ImmutableMap(com.google.common.collect.ImmutableMap) Type(com.facebook.presto.common.type.Type) OrcReader(com.facebook.presto.orc.OrcReader) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) TupleDomainOrcPredicate(com.facebook.presto.orc.TupleDomainOrcPredicate) OrcPredicate(com.facebook.presto.orc.OrcPredicate) ColumnReference(com.facebook.presto.orc.TupleDomainOrcPredicate.ColumnReference)

Example 5 with HiveFileContext

use of com.facebook.presto.hive.HiveFileContext in project presto by prestodb.

the class AlluxioCachingFileSystem method initialize.

@Override
public synchronized void initialize(URI uri, Configuration configuration) throws IOException {
    this.localCacheFileSystem = new LocalCacheFileSystem(dataTier, uriStatus -> {
        // CacheContext is the mechanism to pass the hiveFileContext to the source filesystem
        // hiveFileContext is critical to use to open file.
        CacheContext cacheContext = uriStatus.getCacheContext();
        checkState(cacheContext instanceof PrestoCacheContext);
        HiveFileContext hiveFileContext = ((PrestoCacheContext) cacheContext).getHiveFileContext();
        try {
            return dataTier.openFile(new Path(uriStatus.getPath()), hiveFileContext);
        } catch (Exception e) {
            throw new IOException("Failed to open file", e);
        }
    });
    this.cacheQuotaEnabled = configuration.getBoolean(USER_CLIENT_CACHE_QUOTA_ENABLED.getName(), false);
    localCacheFileSystem.initialize(uri, configuration);
}
Also used : UTF_8(java.nio.charset.StandardCharsets.UTF_8) ExtendedFileSystem(com.facebook.presto.hive.filesystem.ExtendedFileSystem) USER_CLIENT_CACHE_QUOTA_ENABLED(alluxio.conf.PropertyKey.USER_CLIENT_CACHE_QUOTA_ENABLED) CacheContext(alluxio.client.file.CacheContext) LocalCacheFileSystem(alluxio.hadoop.LocalCacheFileSystem) IOException(java.io.IOException) HiveFileContext(com.facebook.presto.hive.HiveFileContext) CachingFileSystem(com.facebook.presto.cache.CachingFileSystem) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Hashing.md5(com.google.common.hash.Hashing.md5) URIStatus(alluxio.client.file.URIStatus) FileInfo(alluxio.wire.FileInfo) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) URI(java.net.URI) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) Path(org.apache.hadoop.fs.Path) LocalCacheFileSystem(alluxio.hadoop.LocalCacheFileSystem) HiveFileContext(com.facebook.presto.hive.HiveFileContext) IOException(java.io.IOException) CacheContext(alluxio.client.file.CacheContext) IOException(java.io.IOException)

Aggregations

HiveFileContext (com.facebook.presto.hive.HiveFileContext)6 IOException (java.io.IOException)6 Configuration (org.apache.hadoop.conf.Configuration)6 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)6 Path (org.apache.hadoop.fs.Path)6 TupleDomain (com.facebook.presto.common.predicate.TupleDomain)5 Type (com.facebook.presto.common.type.Type)5 TypeManager (com.facebook.presto.common.type.TypeManager)5 EncryptionInformation (com.facebook.presto.hive.EncryptionInformation)5 FileFormatDataSourceStats (com.facebook.presto.hive.FileFormatDataSourceStats)5 HdfsEnvironment (com.facebook.presto.hive.HdfsEnvironment)5 HiveColumnHandle (com.facebook.presto.hive.HiveColumnHandle)5 ConnectorPageSource (com.facebook.presto.spi.ConnectorPageSource)5 ConnectorSession (com.facebook.presto.spi.ConnectorSession)5 PrestoException (com.facebook.presto.spi.PrestoException)5 SchemaTableName (com.facebook.presto.spi.SchemaTableName)5 ImmutableMap (com.google.common.collect.ImmutableMap)5 DataSize (io.airlift.units.DataSize)5 String.format (java.lang.String.format)5 List (java.util.List)5