
Example 1 with ParquetSchemaConverter

Use of io.trino.parquet.writer.ParquetSchemaConverter in project trino by trinodb.

From the class ParquetTester, the method writeParquetColumnTrino:

private static void writeParquetColumnTrino(
        File outputFile,
        List<Type> types,
        List<String> columnNames,
        Iterator<?>[] values,
        int size,
        CompressionCodecName compressionCodecName) throws Exception {
    checkArgument(types.size() == columnNames.size() && types.size() == values.length);
    ParquetSchemaConverter schemaConverter = new ParquetSchemaConverter(types, columnNames);
    ParquetWriter writer = new ParquetWriter(
            new FileOutputStream(outputFile),
            schemaConverter.getMessageType(),
            schemaConverter.getPrimitiveTypes(),
            ParquetWriterOptions.builder()
                    .setMaxPageSize(DataSize.ofBytes(100))
                    .setMaxBlockSize(DataSize.ofBytes(100000))
                    .build(),
            compressionCodecName,
            "test-version");
    PageBuilder pageBuilder = new PageBuilder(types);
    for (int i = 0; i < types.size(); ++i) {
        Type type = types.get(i);
        Iterator<?> iterator = values[i];
        BlockBuilder blockBuilder = pageBuilder.getBlockBuilder(i);
        for (int j = 0; j < size; ++j) {
            checkState(iterator.hasNext());
            Object value = iterator.next();
            writeValue(type, blockBuilder, value);
        }
    }
    pageBuilder.declarePositions(size);
    writer.write(pageBuilder.build());
    writer.close();
}
Also used: HiveUtil.isMapType(io.trino.plugin.hive.util.HiveUtil.isMapType) HiveUtil.isRowType(io.trino.plugin.hive.util.HiveUtil.isRowType) MapType(io.trino.spi.type.MapType) CharType(io.trino.spi.type.CharType) RowType(io.trino.spi.type.RowType) ArrayType(io.trino.spi.type.ArrayType) MessageType(org.apache.parquet.schema.MessageType) DecimalType(io.trino.spi.type.DecimalType) Type(io.trino.spi.type.Type) VarcharType(io.trino.spi.type.VarcharType) HiveUtil.isArrayType(io.trino.plugin.hive.util.HiveUtil.isArrayType) HiveUtil.isStructuralType(io.trino.plugin.hive.util.HiveUtil.isStructuralType) ParquetWriter(io.trino.parquet.writer.ParquetWriter) FileOutputStream(java.io.FileOutputStream) PageBuilder(io.trino.spi.PageBuilder) ParquetSchemaConverter(io.trino.parquet.writer.ParquetSchemaConverter) BlockBuilder(io.trino.spi.block.BlockBuilder)
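
All three examples on this page follow the same pattern: build a ParquetSchemaConverter from parallel lists of Trino types and column names, then hand its message type and primitive-type map to the writer. Below is a minimal, self-contained sketch of just that step; the BIGINT/VARCHAR columns are illustrative assumptions, not taken from ParquetTester.

import io.trino.parquet.writer.ParquetSchemaConverter;
import io.trino.spi.type.BigintType;
import io.trino.spi.type.Type;
import io.trino.spi.type.VarcharType;
import org.apache.parquet.schema.MessageType;

import java.util.List;

public class SchemaConverterSketch {
    public static void main(String[] args) {
        // Illustrative columns; the two lists must be the same length.
        List<Type> types = List.of(BigintType.BIGINT, VarcharType.VARCHAR);
        List<String> names = List.of("id", "name");
        ParquetSchemaConverter converter = new ParquetSchemaConverter(types, names);
        // The Parquet message type describes the file schema to be written...
        MessageType messageType = converter.getMessageType();
        System.out.println(messageType);
        // ...and the primitive-type map ties each leaf column path back to its Trino type.
        System.out.println(converter.getPrimitiveTypes());
    }
}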

Example 2 with ParquetSchemaConverter

Use of io.trino.parquet.writer.ParquetSchemaConverter in project trino by trinodb.

From the class ParquetFileWriterFactory, the method createFileWriter:

@Override
public Optional<FileWriter> createFileWriter(
        Path path,
        List<String> inputColumnNames,
        StorageFormat storageFormat,
        Properties schema,
        JobConf conf,
        ConnectorSession session,
        OptionalInt bucketNumber,
        AcidTransaction transaction,
        boolean useAcidSchema,
        WriterKind writerKind) {
    if (!HiveSessionProperties.isParquetOptimizedWriterEnabled(session)) {
        return Optional.empty();
    }
    if (!MapredParquetOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }
    ParquetWriterOptions parquetWriterOptions = ParquetWriterOptions.builder()
            .setMaxPageSize(HiveSessionProperties.getParquetWriterPageSize(session))
            .setMaxBlockSize(HiveSessionProperties.getParquetWriterBlockSize(session))
            .setBatchSize(HiveSessionProperties.getParquetBatchSize(session))
            .build();
    CompressionCodecName compressionCodecName = getCompression(conf);
    List<String> fileColumnNames = getColumnNames(schema);
    List<Type> fileColumnTypes = getColumnTypes(schema).stream()
            .map(hiveType -> hiveType.getType(typeManager, getTimestampPrecision(session)))
            .collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream()
            .mapToInt(inputColumnNames::indexOf)
            .toArray();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, conf);
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        ParquetSchemaConverter schemaConverter = new ParquetSchemaConverter(fileColumnTypes, fileColumnNames);
        return Optional.of(new ParquetFileWriter(
                fileSystem.create(path, false),
                rollbackAction,
                fileColumnTypes,
                schemaConverter.getMessageType(),
                schemaConverter.getPrimitiveTypes(),
                parquetWriterOptions,
                fileInputColumnIndexes,
                compressionCodecName,
                nodeVersion.toString()));
    } catch (IOException e) {
        throw new TrinoException(HIVE_WRITER_OPEN_ERROR, "Error creating Parquet file", e);
    }
}
Also used: FileSystem(org.apache.hadoop.fs.FileSystem) Type(io.trino.spi.type.Type) Callable(java.util.concurrent.Callable) OptionalInt(java.util.OptionalInt) FileWriter(io.trino.plugin.hive.FileWriter) Inject(javax.inject.Inject) HiveUtil.getColumnNames(io.trino.plugin.hive.util.HiveUtil.getColumnNames) NodeVersion(io.trino.plugin.hive.NodeVersion) Objects.requireNonNull(java.util.Objects.requireNonNull) Path(org.apache.hadoop.fs.Path) ParquetWriterOptions(io.trino.parquet.writer.ParquetWriterOptions) StorageFormat(io.trino.plugin.hive.metastore.StorageFormat) HIVE_WRITER_OPEN_ERROR(io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) HiveUtil.getColumnTypes(io.trino.plugin.hive.util.HiveUtil.getColumnTypes) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) Properties(java.util.Properties) ParquetOutputFormat(org.apache.parquet.hadoop.ParquetOutputFormat) ParquetSchemaConverter(io.trino.parquet.writer.ParquetSchemaConverter) HiveSessionProperties.getTimestampPrecision(io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision) TrinoException(io.trino.spi.TrinoException) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) HiveSessionProperties(io.trino.plugin.hive.HiveSessionProperties) WriterKind(io.trino.plugin.hive.WriterKind) JobConf(org.apache.hadoop.mapred.JobConf) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) MapredParquetOutputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat) Optional(java.util.Optional) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) HiveFileWriterFactory(io.trino.plugin.hive.HiveFileWriterFactory) TypeManager(io.trino.spi.type.TypeManager)
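
Two details of this factory are easy to miss: the rollback Callable deletes the partially written file on failure, and fileInputColumnIndexes maps each file column to its position in the incoming pages, with indexOf yielding -1 for file columns the input does not supply. Here is a hypothetical illustration of the index mapping, using made-up column names rather than anything from the factory itself.

import java.util.Arrays;
import java.util.List;

public class ColumnIndexSketch {
    public static void main(String[] args) {
        // Hypothetical column lists, not taken from ParquetFileWriterFactory.
        List<String> fileColumnNames = List.of("a", "b", "c");
        List<String> inputColumnNames = List.of("c", "a");
        // Same expression as in createFileWriter: the input position of each
        // file column, or -1 when the input does not supply that column.
        int[] fileInputColumnIndexes = fileColumnNames.stream()
                .mapToInt(inputColumnNames::indexOf)
                .toArray();
        System.out.println(Arrays.toString(fileInputColumnIndexes)); // [1, -1, 0]
    }
}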

Example 3 with ParquetSchemaConverter

Use of io.trino.parquet.writer.ParquetSchemaConverter in project trino by trinodb.

From the class DeltaLakePageSink, the method createParquetFileWriter:

private FileWriter createParquetFileWriter(Path path) {
    ParquetWriterOptions parquetWriterOptions = ParquetWriterOptions.builder()
            .setMaxBlockSize(getParquetWriterBlockSize(session))
            .setMaxPageSize(getParquetWriterPageSize(session))
            .build();
    CompressionCodecName compressionCodecName = getCompressionCodec(session).getParquetCompressionCodec();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, conf);
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        List<Type> parquetTypes = dataColumnTypes.stream().map(type -> {
            if (type instanceof TimestampWithTimeZoneType) {
                verify(((TimestampWithTimeZoneType) type).getPrecision() == 3, "Unsupported type: %s", type);
                return TIMESTAMP_MILLIS;
            }
            return type;
        }).collect(toImmutableList());
        // We use an identity column mapping; the input page already contains
        // only data columns, per DeltaLakePageSink.getDataPage().
        int[] identityMapping = new int[dataColumnTypes.size()];
        for (int i = 0; i < identityMapping.length; ++i) {
            identityMapping[i] = i;
        }
        ParquetSchemaConverter schemaConverter = new ParquetSchemaConverter(parquetTypes, dataColumnNames);
        return new ParquetFileWriter(
                fileSystem.create(path),
                rollbackAction,
                parquetTypes,
                schemaConverter.getMessageType(),
                schemaConverter.getPrimitiveTypes(),
                parquetWriterOptions,
                identityMapping,
                compressionCodecName,
                trinoVersion);
    } catch (IOException e) {
        throw new TrinoException(DELTA_LAKE_BAD_WRITE, "Error creating Parquet file", e);
    }
}
Also used: RecordFileWriter(io.trino.plugin.hive.RecordFileWriter) DateTimeZone(org.joda.time.DateTimeZone) FileSystem(org.apache.hadoop.fs.FileSystem) CompressionConfigUtil.configureCompression(io.trino.plugin.hive.util.CompressionConfigUtil.configureCompression) TransactionLogAccess(io.trino.plugin.deltalake.transactionlog.TransactionLogAccess) Slices.wrappedBuffer(io.airlift.slice.Slices.wrappedBuffer) PARQUET(io.trino.plugin.hive.HiveStorageFormat.PARQUET) TimestampWithTimeZoneType(io.trino.spi.type.TimestampWithTimeZoneType) Block(io.trino.spi.block.Block) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) ParquetFileWriter(io.trino.plugin.hive.parquet.ParquetFileWriter) Path(org.apache.hadoop.fs.Path) DeltaLakeSessionProperties.getParquetWriterPageSize(io.trino.plugin.deltalake.DeltaLakeSessionProperties.getParquetWriterPageSize) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ParquetSchemaConverter(io.trino.parquet.writer.ParquetSchemaConverter) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) DeltaLakeSessionProperties.isParquetOptimizedWriterEnabled(io.trino.plugin.deltalake.DeltaLakeSessionProperties.isParquetOptimizedWriterEnabled) TrinoException(io.trino.spi.TrinoException) FileUtils.escapePathName(org.apache.hadoop.hive.common.FileUtils.escapePathName) String.format(java.lang.String.format) Collectors.joining(java.util.stream.Collectors.joining) DeltaLakeSessionProperties.getParquetWriterBlockSize(io.trino.plugin.deltalake.DeltaLakeSessionProperties.getParquetWriterBlockSize) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Function.identity(java.util.function.Function.identity) FileUtils(org.apache.hadoop.hive.common.FileUtils) Optional(java.util.Optional) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) JsonCodec(io.airlift.json.JsonCodec) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) TIMESTAMP_MILLIS(io.trino.spi.type.TimestampType.TIMESTAMP_MILLIS) Type(io.trino.spi.type.Type) ConfigurationUtils.toJobConf(io.trino.plugin.hive.util.ConfigurationUtils.toJobConf) Page(io.trino.spi.Page) Callable(java.util.concurrent.Callable) CompletableFuture(java.util.concurrent.CompletableFuture) IOConstants(org.apache.hadoop.hive.ql.io.IOConstants) StorageFormat.fromHiveStorageFormat(io.trino.plugin.hive.metastore.StorageFormat.fromHiveStorageFormat) ArrayList(java.util.ArrayList) FileWriter(io.trino.plugin.hive.FileWriter) HiveType(io.trino.plugin.hive.HiveType) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) PageIndexerFactory(io.trino.spi.PageIndexerFactory) Objects.requireNonNull(java.util.Objects.requireNonNull) ParquetWriterOptions(io.trino.parquet.writer.ParquetWriterOptions) HiveWriteUtils(io.trino.plugin.hive.util.HiveWriteUtils) HiveTypeName(io.trino.plugin.hive.HiveTypeName) ConnectorPageSink(io.trino.spi.connector.ConnectorPageSink) Properties(java.util.Properties) DELTA_LAKE_BAD_WRITE(io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_BAD_WRITE) MoreFutures(io.airlift.concurrent.MoreFutures) HivePartitionKey(io.trino.plugin.hive.HivePartitionKey) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) Ints(com.google.common.primitives.Ints) JobConf(org.apache.hadoop.mapred.JobConf) DeltaLakeSessionProperties.getCompressionCodec(io.trino.plugin.deltalake.DeltaLakeSessionProperties.getCompressionCodec) PageIndexer(io.trino.spi.PageIndexer) Futures(com.google.common.util.concurrent.Futures) UUID.randomUUID(java.util.UUID.randomUUID) Collectors.toList(java.util.stream.Collectors.toList) TypeManager(io.trino.spi.type.TypeManager)
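
The identity-mapping loop in this example simply writes input column i to file column i. Below is a sketch of an equivalent IntStream form; the three-column count is an assumption standing in for dataColumnTypes.size().

import java.util.Arrays;
import java.util.stream.IntStream;

public class IdentityMappingSketch {
    public static void main(String[] args) {
        int columnCount = 3; // stand-in for dataColumnTypes.size()
        // Equivalent to the for loop in createParquetFileWriter: each position maps to itself.
        int[] identityMapping = IntStream.range(0, columnCount).toArray();
        System.out.println(Arrays.toString(identityMapping)); // [0, 1, 2]
    }
}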

Aggregations

ParquetSchemaConverter (io.trino.parquet.writer.ParquetSchemaConverter): 3 uses
Type (io.trino.spi.type.Type): 3 uses
ParquetWriterOptions (io.trino.parquet.writer.ParquetWriterOptions): 2 uses
FileWriter (io.trino.plugin.hive.FileWriter): 2 uses
HdfsEnvironment (io.trino.plugin.hive.HdfsEnvironment): 2 uses
TrinoException (io.trino.spi.TrinoException): 2 uses
ConnectorSession (io.trino.spi.connector.ConnectorSession): 2 uses
TypeManager (io.trino.spi.type.TypeManager): 2 uses
IOException (java.io.IOException): 2 uses
List (java.util.List): 2 uses
Objects.requireNonNull (java.util.Objects.requireNonNull): 2 uses
Optional (java.util.Optional): 2 uses
Properties (java.util.Properties): 2 uses
Callable (java.util.concurrent.Callable): 2 uses
Collectors.toList (java.util.stream.Collectors.toList): 2 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 2 uses
Path (org.apache.hadoop.fs.Path): 2 uses
JobConf (org.apache.hadoop.mapred.JobConf): 2 uses
CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName): 2 uses
Verify.verify (com.google.common.base.Verify.verify): 1 use