
Example 1 with ParquetWriterOptions

Use of io.trino.parquet.writer.ParquetWriterOptions in project trino by trinodb.

From class ParquetFileWriterFactory, method createFileWriter:

@Override
public Optional<FileWriter> createFileWriter(
        Path path,
        List<String> inputColumnNames,
        StorageFormat storageFormat,
        Properties schema,
        JobConf conf,
        ConnectorSession session,
        OptionalInt bucketNumber,
        AcidTransaction transaction,
        boolean useAcidSchema,
        WriterKind writerKind) {
    if (!HiveSessionProperties.isParquetOptimizedWriterEnabled(session)) {
        return Optional.empty();
    }
    if (!MapredParquetOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }
    ParquetWriterOptions parquetWriterOptions = ParquetWriterOptions.builder()
            .setMaxPageSize(HiveSessionProperties.getParquetWriterPageSize(session))
            .setMaxBlockSize(HiveSessionProperties.getParquetWriterBlockSize(session))
            .setBatchSize(HiveSessionProperties.getParquetBatchSize(session))
            .build();
    CompressionCodecName compressionCodecName = getCompression(conf);
    List<String> fileColumnNames = getColumnNames(schema);
    List<Type> fileColumnTypes = getColumnTypes(schema).stream().map(hiveType -> hiveType.getType(typeManager, getTimestampPrecision(session))).collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, conf);
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        ParquetSchemaConverter schemaConverter = new ParquetSchemaConverter(fileColumnTypes, fileColumnNames);
        return Optional.of(new ParquetFileWriter(
                fileSystem.create(path, false),
                rollbackAction,
                fileColumnTypes,
                schemaConverter.getMessageType(),
                schemaConverter.getPrimitiveTypes(),
                parquetWriterOptions,
                fileInputColumnIndexes,
                compressionCodecName,
                nodeVersion.toString()));
    } catch (IOException e) {
        throw new TrinoException(HIVE_WRITER_OPEN_ERROR, "Error creating Parquet file", e);
    }
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) Type(io.trino.spi.type.Type) Callable(java.util.concurrent.Callable) OptionalInt(java.util.OptionalInt) FileWriter(io.trino.plugin.hive.FileWriter) Inject(javax.inject.Inject) HiveUtil.getColumnNames(io.trino.plugin.hive.util.HiveUtil.getColumnNames) NodeVersion(io.trino.plugin.hive.NodeVersion) Objects.requireNonNull(java.util.Objects.requireNonNull) Path(org.apache.hadoop.fs.Path) ParquetWriterOptions(io.trino.parquet.writer.ParquetWriterOptions) StorageFormat(io.trino.plugin.hive.metastore.StorageFormat) HIVE_WRITER_OPEN_ERROR(io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) HiveUtil.getColumnTypes(io.trino.plugin.hive.util.HiveUtil.getColumnTypes) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) Properties(java.util.Properties) ParquetOutputFormat(org.apache.parquet.hadoop.ParquetOutputFormat) ParquetSchemaConverter(io.trino.parquet.writer.ParquetSchemaConverter) HiveSessionProperties.getTimestampPrecision(io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision) TrinoException(io.trino.spi.TrinoException) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) HiveSessionProperties(io.trino.plugin.hive.HiveSessionProperties) WriterKind(io.trino.plugin.hive.WriterKind) JobConf(org.apache.hadoop.mapred.JobConf) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) MapredParquetOutputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat) Optional(java.util.Optional) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) HiveFileWriterFactory(io.trino.plugin.hive.HiveFileWriterFactory) TypeManager(io.trino.spi.type.TypeManager)
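The options object itself does not depend on the Hive session; the same builder can be driven by fixed values. A minimal sketch, not taken from the Trino sources, assuming the size setters accept io.airlift.units.DataSize and setBatchSize accepts an int (as the session-property getters above suggest); the specific values are illustrative only.

import io.airlift.units.DataSize;
import io.trino.parquet.writer.ParquetWriterOptions;

import static io.airlift.units.DataSize.Unit.MEGABYTE;

public class ParquetWriterOptionsSketch {
    // Build writer options from fixed values instead of reading them from ConnectorSession.
    static ParquetWriterOptions fixedOptions() {
        return ParquetWriterOptions.builder()
                .setMaxPageSize(DataSize.of(1, MEGABYTE))    // upper bound for a single data page
                .setMaxBlockSize(DataSize.of(128, MEGABYTE)) // upper bound for a row group
                .setBatchSize(10_000)                        // rows buffered per write batch
                .build();
    }
}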

Example 2 with ParquetWriterOptions

Use of io.trino.parquet.writer.ParquetWriterOptions in project trino by trinodb.

From class IcebergFileWriterFactory, method createParquetWriter:

private IcebergFileWriter createParquetWriter(Path outputPath, Schema icebergSchema, JobConf jobConf, ConnectorSession session, HdfsContext hdfsContext) {
    List<String> fileColumnNames = icebergSchema.columns().stream().map(Types.NestedField::name).collect(toImmutableList());
    List<Type> fileColumnTypes = icebergSchema.columns().stream().map(column -> toTrinoType(column.type(), typeManager)).collect(toImmutableList());
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), outputPath, jobConf);
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(outputPath, false);
            return null;
        };
        ParquetWriterOptions parquetWriterOptions = ParquetWriterOptions.builder()
                .setMaxPageSize(getParquetWriterPageSize(session))
                .setMaxBlockSize(getParquetWriterBlockSize(session))
                .setBatchSize(getParquetWriterBatchSize(session))
                .build();
        return new IcebergParquetFileWriter(
                hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.create(outputPath)),
                rollbackAction,
                fileColumnTypes,
                convert(icebergSchema, "table"),
                makeTypeMap(fileColumnTypes, fileColumnNames),
                parquetWriterOptions,
                IntStream.range(0, fileColumnNames.size()).toArray(),
                getCompressionCodec(session).getParquetCompressionCodec(),
                nodeVersion.toString(),
                outputPath,
                hdfsEnvironment,
                hdfsContext);
    } catch (IOException e) {
        throw new TrinoException(ICEBERG_WRITER_OPEN_ERROR, "Error creating Parquet file", e);
    }
}
Also used : OutputStreamOrcDataSink(io.trino.orc.OutputStreamOrcDataSink) Types(org.apache.iceberg.types.Types) TypeConverter.toTrinoType(io.trino.plugin.iceberg.TypeConverter.toTrinoType) FileSystem(org.apache.hadoop.fs.FileSystem) IcebergSessionProperties.getOrcWriterMaxStripeSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterMaxStripeSize) IcebergSessionProperties.getParquetWriterPageSize(io.trino.plugin.iceberg.IcebergSessionProperties.getParquetWriterPageSize) OrcDataSink(io.trino.orc.OrcDataSink) OrcWriterStats(io.trino.orc.OrcWriterStats) NOT_SUPPORTED(io.trino.spi.StandardErrorCode.NOT_SUPPORTED) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) IcebergSessionProperties.isOrcWriterValidate(io.trino.plugin.iceberg.IcebergSessionProperties.isOrcWriterValidate) PrimitiveTypeMapBuilder.makeTypeMap(io.trino.plugin.iceberg.util.PrimitiveTypeMapBuilder.makeTypeMap) Path(org.apache.hadoop.fs.Path) PRESTO_QUERY_ID_NAME(io.trino.plugin.hive.HiveMetadata.PRESTO_QUERY_ID_NAME) OrcDataSource(io.trino.orc.OrcDataSource) IcebergSessionProperties.getParquetWriterBatchSize(io.trino.plugin.iceberg.IcebergSessionProperties.getParquetWriterBatchSize) ParquetSchemaUtil.convert(org.apache.iceberg.parquet.ParquetSchemaUtil.convert) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) TrinoException(io.trino.spi.TrinoException) Schema(org.apache.iceberg.Schema) IcebergSessionProperties.getCompressionCodec(io.trino.plugin.iceberg.IcebergSessionProperties.getCompressionCodec) IcebergSessionProperties.getOrcWriterMaxStripeRows(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterMaxStripeRows) OrcDataSourceId(io.trino.orc.OrcDataSourceId) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) List(java.util.List) ICEBERG_WRITER_OPEN_ERROR(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_WRITER_OPEN_ERROR) Optional(java.util.Optional) IntStream(java.util.stream.IntStream) Type(io.trino.spi.type.Type) Callable(java.util.concurrent.Callable) ICEBERG_WRITE_VALIDATION_FAILED(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_WRITE_VALIDATION_FAILED) Supplier(java.util.function.Supplier) Inject(javax.inject.Inject) Managed(org.weakref.jmx.Managed) IcebergSessionProperties.getOrcWriterMaxDictionaryMemory(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterMaxDictionaryMemory) NodeVersion(io.trino.plugin.hive.NodeVersion) OrcReaderOptions(io.trino.orc.OrcReaderOptions) Objects.requireNonNull(java.util.Objects.requireNonNull) TypeConverter.toOrcType(io.trino.plugin.iceberg.TypeConverter.toOrcType) ParquetWriterOptions(io.trino.parquet.writer.ParquetWriterOptions) IcebergSessionProperties.getOrcWriterMinStripeSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterMinStripeSize) OrcWriterConfig(io.trino.plugin.hive.orc.OrcWriterConfig) HdfsOrcDataSource(io.trino.plugin.hive.orc.HdfsOrcDataSource) IcebergSessionProperties.getOrcWriterValidateMode(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterValidateMode) MetricsConfig(org.apache.iceberg.MetricsConfig) IcebergSessionProperties.getParquetWriterBlockSize(io.trino.plugin.iceberg.IcebergSessionProperties.getParquetWriterBlockSize) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) 
IcebergSessionProperties.getOrcStringStatisticsLimit(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcStringStatisticsLimit) JobConf(org.apache.hadoop.mapred.JobConf) OrcWriterOptions(io.trino.orc.OrcWriterOptions) PRESTO_VERSION_NAME(io.trino.plugin.hive.HiveMetadata.PRESTO_VERSION_NAME) TypeManager(io.trino.spi.type.TypeManager)
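All three examples on this page share the same rollback idiom: before constructing the writer, they capture a Callable<Void> that deletes the partially written file so the writer can clean up if it has to abort. A minimal sketch of that idiom in isolation (the helper class and method names are illustrative, not part of the Trino API):

import java.util.concurrent.Callable;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class RollbackActions {
    // Returns an action that removes the single output file; the 'false' argument
    // means the delete is not recursive.
    static Callable<Void> deleteOnRollback(FileSystem fileSystem, Path path) {
        return () -> {
            fileSystem.delete(path, false);
            return null;
        };
    }
}

The action is passed to the file writer constructor (rollbackAction above), which presumably invokes it only when the write is rolled back.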

Example 3 with ParquetWriterOptions

Use of io.trino.parquet.writer.ParquetWriterOptions in project trino by trinodb.

From class DeltaLakePageSink, method createParquetFileWriter:

private FileWriter createParquetFileWriter(Path path) {
    ParquetWriterOptions parquetWriterOptions = ParquetWriterOptions.builder()
            .setMaxBlockSize(getParquetWriterBlockSize(session))
            .setMaxPageSize(getParquetWriterPageSize(session))
            .build();
    CompressionCodecName compressionCodecName = getCompressionCodec(session).getParquetCompressionCodec();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, conf);
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        List<Type> parquetTypes = dataColumnTypes.stream().map(type -> {
            if (type instanceof TimestampWithTimeZoneType) {
                verify(((TimestampWithTimeZoneType) type).getPrecision() == 3, "Unsupported type: %s", type);
                return TIMESTAMP_MILLIS;
            }
            return type;
        }).collect(toImmutableList());
        // we use identity column mapping; the input page already contains only data columns,
        // per DeltaLakePageSink.getDataPage()
        int[] identityMapping = new int[dataColumnTypes.size()];
        for (int i = 0; i < identityMapping.length; ++i) {
            identityMapping[i] = i;
        }
        ParquetSchemaConverter schemaConverter = new ParquetSchemaConverter(parquetTypes, dataColumnNames);
        return new ParquetFileWriter(
                fileSystem.create(path),
                rollbackAction,
                parquetTypes,
                schemaConverter.getMessageType(),
                schemaConverter.getPrimitiveTypes(),
                parquetWriterOptions,
                identityMapping,
                compressionCodecName,
                trinoVersion);
    } catch (IOException e) {
        throw new TrinoException(DELTA_LAKE_BAD_WRITE, "Error creating Parquet file", e);
    }
}
Also used : RecordFileWriter(io.trino.plugin.hive.RecordFileWriter) DateTimeZone(org.joda.time.DateTimeZone) FileSystem(org.apache.hadoop.fs.FileSystem) CompressionConfigUtil.configureCompression(io.trino.plugin.hive.util.CompressionConfigUtil.configureCompression) TransactionLogAccess(io.trino.plugin.deltalake.transactionlog.TransactionLogAccess) Slices.wrappedBuffer(io.airlift.slice.Slices.wrappedBuffer) PARQUET(io.trino.plugin.hive.HiveStorageFormat.PARQUET) TimestampWithTimeZoneType(io.trino.spi.type.TimestampWithTimeZoneType) Block(io.trino.spi.block.Block) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) ParquetFileWriter(io.trino.plugin.hive.parquet.ParquetFileWriter) Path(org.apache.hadoop.fs.Path) DeltaLakeSessionProperties.getParquetWriterPageSize(io.trino.plugin.deltalake.DeltaLakeSessionProperties.getParquetWriterPageSize) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ParquetSchemaConverter(io.trino.parquet.writer.ParquetSchemaConverter) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) DeltaLakeSessionProperties.isParquetOptimizedWriterEnabled(io.trino.plugin.deltalake.DeltaLakeSessionProperties.isParquetOptimizedWriterEnabled) TrinoException(io.trino.spi.TrinoException) FileUtils.escapePathName(org.apache.hadoop.hive.common.FileUtils.escapePathName) String.format(java.lang.String.format) Collectors.joining(java.util.stream.Collectors.joining) DeltaLakeSessionProperties.getParquetWriterBlockSize(io.trino.plugin.deltalake.DeltaLakeSessionProperties.getParquetWriterBlockSize) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Function.identity(java.util.function.Function.identity) FileUtils(org.apache.hadoop.hive.common.FileUtils) Optional(java.util.Optional) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) JsonCodec(io.airlift.json.JsonCodec) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) TIMESTAMP_MILLIS(io.trino.spi.type.TimestampType.TIMESTAMP_MILLIS) Type(io.trino.spi.type.Type) ConfigurationUtils.toJobConf(io.trino.plugin.hive.util.ConfigurationUtils.toJobConf) Page(io.trino.spi.Page) Callable(java.util.concurrent.Callable) CompletableFuture(java.util.concurrent.CompletableFuture) IOConstants(org.apache.hadoop.hive.ql.io.IOConstants) StorageFormat.fromHiveStorageFormat(io.trino.plugin.hive.metastore.StorageFormat.fromHiveStorageFormat) ArrayList(java.util.ArrayList) FileWriter(io.trino.plugin.hive.FileWriter) HiveType(io.trino.plugin.hive.HiveType) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) PageIndexerFactory(io.trino.spi.PageIndexerFactory) Objects.requireNonNull(java.util.Objects.requireNonNull) ParquetWriterOptions(io.trino.parquet.writer.ParquetWriterOptions) HiveWriteUtils(io.trino.plugin.hive.util.HiveWriteUtils) HiveTypeName(io.trino.plugin.hive.HiveTypeName) ConnectorPageSink(io.trino.spi.connector.ConnectorPageSink) Properties(java.util.Properties) DELTA_LAKE_BAD_WRITE(io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_BAD_WRITE) MoreFutures(io.airlift.concurrent.MoreFutures) HivePartitionKey(io.trino.plugin.hive.HivePartitionKey) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) Ints(com.google.common.primitives.Ints) JobConf(org.apache.hadoop.mapred.JobConf) 
DeltaLakeSessionProperties.getCompressionCodec(io.trino.plugin.deltalake.DeltaLakeSessionProperties.getCompressionCodec) PageIndexer(io.trino.spi.PageIndexer) Futures(com.google.common.util.concurrent.Futures) UUID.randomUUID(java.util.UUID.randomUUID) Collectors.toList(java.util.stream.Collectors.toList) TypeManager(io.trino.spi.type.TypeManager)
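The identity mapping loop in this example can be written with IntStream.range, the same idiom Example 2 uses to build its file input column indexes. An equivalent sketch (helper name is illustrative):

import java.util.stream.IntStream;

public class IdentityMappingSketch {
    // Position i of the input page maps to field i of the Parquet schema.
    static int[] identityMapping(int columnCount) {
        return IntStream.range(0, columnCount).toArray();
    }
}

In createParquetFileWriter this would replace the explicit loop with identityMapping(dataColumnTypes.size()).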

Aggregations

ParquetWriterOptions (io.trino.parquet.writer.ParquetWriterOptions) 3
HdfsEnvironment (io.trino.plugin.hive.HdfsEnvironment) 3
TrinoException (io.trino.spi.TrinoException) 3
ConnectorSession (io.trino.spi.connector.ConnectorSession) 3
Type (io.trino.spi.type.Type) 3
TypeManager (io.trino.spi.type.TypeManager) 3
IOException (java.io.IOException) 3
List (java.util.List) 3
Objects.requireNonNull (java.util.Objects.requireNonNull) 3
Optional (java.util.Optional) 3
Callable (java.util.concurrent.Callable) 3
FileSystem (org.apache.hadoop.fs.FileSystem) 3
Path (org.apache.hadoop.fs.Path) 3
JobConf (org.apache.hadoop.mapred.JobConf) 3
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList) 2
ParquetSchemaConverter (io.trino.parquet.writer.ParquetSchemaConverter) 2
FileWriter (io.trino.plugin.hive.FileWriter) 2
NodeVersion (io.trino.plugin.hive.NodeVersion) 2
Properties (java.util.Properties) 2
Collectors.toList (java.util.stream.Collectors.toList) 2