Search in sources :

Example 1 with ParquetWriterOptions

use of com.facebook.presto.parquet.writer.ParquetWriterOptions in project presto by prestodb.

The class ParquetFileWriterFactory, method createFileWriter.

/**
 * Creates a {@link ParquetFileWriter} for {@code path}, or returns {@link Optional#empty()} when
 * this factory does not apply to the request.
 *
 * <p>The factory declines when the optimized Parquet writer is disabled for the session, or when
 * the storage format's output format is not {@link MapredParquetOutputFormat}.
 *
 * @param path target file to create
 * @param inputColumnNames names of the columns in the order they are supplied to the writer
 * @param storageFormat storage format whose output format is checked
 * @param schema table properties; the column names and column types are read from it
 * @param conf job configuration used to resolve the compression codec and the file system
 * @param session session supplying the writer page size, block size and user
 * @param encryptionInformation not consulted by this method
 * @return the writer, or empty when this factory does not handle the request
 * @throws PrestoException with {@code HIVE_WRITER_OPEN_ERROR} when an {@link IOException} occurs
 *         while obtaining the file system or creating the file
 */
@Override
public Optional<HiveFileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf conf, ConnectorSession session, Optional<EncryptionInformation> encryptionInformation) {
    // Decline unless the optimized writer is enabled AND the table is written as Parquet.
    if (!isParquetOptimizedWriterEnabled(session)
            || !MapredParquetOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }

    ParquetWriterOptions parquetWriterOptions = ParquetWriterOptions.builder()
            .setMaxPageSize(getParquetWriterPageSize(session))
            .setMaxBlockSize(getParquetWriterBlockSize(session))
            .build();
    CompressionCodecName compressionCodecName = getCompression(conf);

    // Column names are stored as a comma-separated list; blanks and surrounding whitespace are dropped.
    String columnNamesCsv = schema.getProperty(META_TABLE_COLUMNS, "");
    List<String> fileColumnNames = Splitter.on(',')
            .trimResults()
            .omitEmptyStrings()
            .splitToList(columnNamesCsv);

    String columnTypesSpec = schema.getProperty(META_TABLE_COLUMN_TYPES, "");
    List<Type> fileColumnTypes = toHiveTypes(columnTypesSpec).stream()
            .map(columnType -> columnType.getType(typeManager))
            .collect(toList());

    // For each file column, the position of that column in the input (indexOf yields -1 if absent).
    int[] fileInputColumnIndexes = fileColumnNames.stream()
            .mapToInt(inputColumnNames::indexOf)
            .toArray();

    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, conf);
        // Rollback removes just the created file (non-recursive delete).
        Callable<Void> deleteOnRollback = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        return Optional.of(new ParquetFileWriter(
                fileSystem.create(path),
                deleteOnRollback,
                fileColumnNames,
                fileColumnTypes,
                parquetWriterOptions,
                fileInputColumnIndexes,
                compressionCodecName));
    }
    catch (IOException ioException) {
        throw new PrestoException(HIVE_WRITER_OPEN_ERROR, "Error creating Parquet file", ioException);
    }
}
Also used : HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) DateTimeZone(org.joda.time.DateTimeZone) FileSystem(org.apache.hadoop.fs.FileSystem) Inject(com.google.inject.Inject) HiveSessionProperties.getParquetWriterPageSize(com.facebook.presto.hive.HiveSessionProperties.getParquetWriterPageSize) Callable(java.util.concurrent.Callable) META_TABLE_COLUMNS(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMNS) PrestoException(com.facebook.presto.spi.PrestoException) HiveSessionProperties.isParquetOptimizedWriterEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetOptimizedWriterEnabled) NodeVersion(com.facebook.presto.hive.NodeVersion) TypeManager(com.facebook.presto.common.type.TypeManager) Objects.requireNonNull(java.util.Objects.requireNonNull) Path(org.apache.hadoop.fs.Path) HiveSessionProperties.getParquetWriterBlockSize(com.facebook.presto.hive.HiveSessionProperties.getParquetWriterBlockSize) EncryptionInformation(com.facebook.presto.hive.EncryptionInformation) Splitter(com.google.common.base.Splitter) Type(com.facebook.presto.common.type.Type) HiveClientConfig(com.facebook.presto.hive.HiveClientConfig) META_TABLE_COLUMN_TYPES(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMN_TYPES) Properties(java.util.Properties) ParquetOutputFormat(org.apache.parquet.hadoop.ParquetOutputFormat) StorageFormat(com.facebook.presto.hive.metastore.StorageFormat) HiveFileWriterFactory(com.facebook.presto.hive.HiveFileWriterFactory) ParquetWriterOptions(com.facebook.presto.parquet.writer.ParquetWriterOptions) HiveFileWriter(com.facebook.presto.hive.HiveFileWriter) IOException(java.io.IOException) ConnectorSession(com.facebook.presto.spi.ConnectorSession) JobConf(org.apache.hadoop.mapred.JobConf) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) MapredParquetOutputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat) Optional(java.util.Optional) 
CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) HIVE_WRITER_OPEN_ERROR(com.facebook.presto.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR) HiveType.toHiveTypes(com.facebook.presto.hive.HiveType.toHiveTypes) PrestoException(com.facebook.presto.spi.PrestoException) IOException(java.io.IOException) Type(com.facebook.presto.common.type.Type) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) FileSystem(org.apache.hadoop.fs.FileSystem) ParquetWriterOptions(com.facebook.presto.parquet.writer.ParquetWriterOptions)

Example 2 with ParquetWriterOptions

use of com.facebook.presto.parquet.writer.ParquetWriterOptions in project presto by prestodb.

The class IcebergFileWriterFactory, method createParquetWriter.

/**
 * Creates an {@link IcebergParquetFileWriter} that writes {@code icebergSchema} data to
 * {@code outputPath}.
 *
 * <p>Column names and Presto column types are derived from the Iceberg schema's columns. The
 * writer's maximum page size and maximum block size are both taken from the session.
 *
 * @param outputPath file to create
 * @param icebergSchema schema whose columns define the file columns
 * @param jobConf configuration used to resolve the file system
 * @param session session supplying the user, writer sizes and compression codec
 * @param hdfsContext passed through to the created writer
 * @return the Parquet writer for {@code outputPath}
 * @throws PrestoException with {@code ICEBERG_WRITER_OPEN_ERROR} when an {@link IOException}
 *         occurs while obtaining the file system or creating the file
 */
private IcebergFileWriter createParquetWriter(Path outputPath, Schema icebergSchema, JobConf jobConf, ConnectorSession session, HdfsContext hdfsContext) {
    List<String> fileColumnNames = icebergSchema.columns().stream()
            .map(Types.NestedField::name)
            .collect(toImmutableList());
    List<Type> fileColumnTypes = icebergSchema.columns().stream()
            .map(column -> toPrestoType(column.type(), typeManager))
            .collect(toImmutableList());
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), outputPath, jobConf);
        // Rollback removes just the created file (non-recursive delete).
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(outputPath, false);
            return null;
        };
        // Page size and block size are distinct limits and need distinct setters. Calling
        // setMaxPageSize twice would overwrite the page size with the block size and leave the
        // block size at the builder's default.
        ParquetWriterOptions parquetWriterOptions = ParquetWriterOptions.builder()
                .setMaxPageSize(getParquetWriterPageSize(session))
                .setMaxBlockSize(getParquetWriterBlockSize(session))
                .build();
        // NOTE(review): getParquetCompressionCodec() appears to return an Optional; .get() will
        // throw if the session's codec has no Parquet equivalent — confirm that is validated upstream.
        return new IcebergParquetFileWriter(
                hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.create(outputPath)),
                rollbackAction,
                fileColumnNames,
                fileColumnTypes,
                convert(icebergSchema, "table"),
                makeTypeMap(fileColumnTypes, fileColumnNames),
                parquetWriterOptions,
                // Input columns map one-to-one onto file columns, in schema order.
                IntStream.range(0, fileColumnNames.size()).toArray(),
                getCompressionCodec(session).getParquetCompressionCodec().get(),
                outputPath,
                hdfsEnvironment,
                hdfsContext);
    }
    catch (IOException e) {
        throw new PrestoException(ICEBERG_WRITER_OPEN_ERROR, "Error creating Parquet file", e);
    }
}
Also used : HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) HdfsOrcDataSource(com.facebook.presto.hive.orc.HdfsOrcDataSource) Types(org.apache.iceberg.types.Types) FileSystem(org.apache.hadoop.fs.FileSystem) DataSink(com.facebook.presto.common.io.DataSink) IcebergSessionProperties.getOrcMaxBufferSize(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcMaxBufferSize) NodeVersion(com.facebook.presto.hive.NodeVersion) PRESTO_VERSION_NAME(com.facebook.presto.hive.HiveMetadata.PRESTO_VERSION_NAME) Path(org.apache.hadoop.fs.Path) HiveSessionProperties.getParquetWriterBlockSize(com.facebook.presto.hive.HiveSessionProperties.getParquetWriterBlockSize) OrcDataSource(com.facebook.presto.orc.OrcDataSource) FileFormatDataSourceStats(com.facebook.presto.hive.FileFormatDataSourceStats) HdfsContext(com.facebook.presto.hive.HdfsContext) TypeConverter.toPrestoType(com.facebook.presto.iceberg.TypeConverter.toPrestoType) ParquetSchemaUtil.convert(org.apache.iceberg.parquet.ParquetSchemaUtil.convert) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HiveDwrfEncryptionProvider(com.facebook.presto.hive.HiveDwrfEncryptionProvider) Schema(org.apache.iceberg.Schema) ConnectorSession(com.facebook.presto.spi.ConnectorSession) ORC(com.facebook.presto.orc.OrcEncoding.ORC) List(java.util.List) NOT_SUPPORTED(com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED) ICEBERG_WRITER_OPEN_ERROR(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_WRITER_OPEN_ERROR) IcebergSessionProperties.isOrcOptimizedWriterValidate(com.facebook.presto.iceberg.IcebergSessionProperties.isOrcOptimizedWriterValidate) Optional(java.util.Optional) OutputStreamDataSink(com.facebook.presto.common.io.OutputStreamDataSink) HiveSessionProperties(com.facebook.presto.hive.HiveSessionProperties) IntStream(java.util.stream.IntStream) 
HiveSessionProperties.getParquetWriterPageSize(com.facebook.presto.hive.HiveSessionProperties.getParquetWriterPageSize) Callable(java.util.concurrent.Callable) PrestoException(com.facebook.presto.spi.PrestoException) Supplier(java.util.function.Supplier) Inject(javax.inject.Inject) IcebergSessionProperties.getCompressionCodec(com.facebook.presto.iceberg.IcebergSessionProperties.getCompressionCodec) TypeManager(com.facebook.presto.common.type.TypeManager) Objects.requireNonNull(java.util.Objects.requireNonNull) PrimitiveTypeMapBuilder.makeTypeMap(com.facebook.presto.iceberg.util.PrimitiveTypeMapBuilder.makeTypeMap) TypeConverter.toOrcType(com.facebook.presto.iceberg.TypeConverter.toOrcType) OrcWriterStats(com.facebook.presto.orc.OrcWriterStats) Type(com.facebook.presto.common.type.Type) IcebergSessionProperties.getOrcMaxMergeDistance(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcMaxMergeDistance) DwrfEncryptionProvider(com.facebook.presto.orc.DwrfEncryptionProvider) OrcDataSourceId(com.facebook.presto.orc.OrcDataSourceId) DefaultOrcWriterFlushPolicy(com.facebook.presto.orc.DefaultOrcWriterFlushPolicy) OrcFileWriterConfig(com.facebook.presto.hive.OrcFileWriterConfig) ParquetWriterOptions(com.facebook.presto.parquet.writer.ParquetWriterOptions) IOException(java.io.IOException) UTC(org.joda.time.DateTimeZone.UTC) FileFormat(org.apache.iceberg.FileFormat) ICEBERG_WRITE_VALIDATION_FAILED(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_WRITE_VALIDATION_FAILED) JobConf(org.apache.hadoop.mapred.JobConf) IcebergSessionProperties.getOrcOptimizedWriterValidateMode(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcOptimizedWriterValidateMode) PRESTO_QUERY_ID_NAME(com.facebook.presto.hive.metastore.MetastoreUtil.PRESTO_QUERY_ID_NAME) IcebergSessionProperties.getOrcStreamBufferSize(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcStreamBufferSize) Types(org.apache.iceberg.types.Types) PrestoException(com.facebook.presto.spi.PrestoException) 
IOException(java.io.IOException) TypeConverter.toPrestoType(com.facebook.presto.iceberg.TypeConverter.toPrestoType) TypeConverter.toOrcType(com.facebook.presto.iceberg.TypeConverter.toOrcType) Type(com.facebook.presto.common.type.Type) FileSystem(org.apache.hadoop.fs.FileSystem) ParquetWriterOptions(com.facebook.presto.parquet.writer.ParquetWriterOptions)

Aggregations

Type (com.facebook.presto.common.type.Type)2 TypeManager (com.facebook.presto.common.type.TypeManager)2 HdfsEnvironment (com.facebook.presto.hive.HdfsEnvironment)2 HiveSessionProperties.getParquetWriterBlockSize (com.facebook.presto.hive.HiveSessionProperties.getParquetWriterBlockSize)2 HiveSessionProperties.getParquetWriterPageSize (com.facebook.presto.hive.HiveSessionProperties.getParquetWriterPageSize)2 NodeVersion (com.facebook.presto.hive.NodeVersion)2 ParquetWriterOptions (com.facebook.presto.parquet.writer.ParquetWriterOptions)2 ConnectorSession (com.facebook.presto.spi.ConnectorSession)2 PrestoException (com.facebook.presto.spi.PrestoException)2 IOException (java.io.IOException)2 List (java.util.List)2 Objects.requireNonNull (java.util.Objects.requireNonNull)2 Optional (java.util.Optional)2 Callable (java.util.concurrent.Callable)2 FileSystem (org.apache.hadoop.fs.FileSystem)2 Path (org.apache.hadoop.fs.Path)2 JobConf (org.apache.hadoop.mapred.JobConf)2 DataSink (com.facebook.presto.common.io.DataSink)1 OutputStreamDataSink (com.facebook.presto.common.io.OutputStreamDataSink)1 EncryptionInformation (com.facebook.presto.hive.EncryptionInformation)1