
Example 1 with WriterKind

Use of io.trino.plugin.hive.WriterKind in project trino by trinodb.

From the class OrcFileWriterFactory, the method createFileWriter:

@Override
public Optional<FileWriter> createFileWriter(
        Path path,
        List<String> inputColumnNames,
        StorageFormat storageFormat,
        Properties schema,
        JobConf configuration,
        ConnectorSession session,
        OptionalInt bucketNumber,
        AcidTransaction transaction,
        boolean useAcidSchema,
        WriterKind writerKind) {
    if (!OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }
    CompressionKind compression = getCompression(schema, configuration);
    // existing tables and partitions may have columns in a different order than the writer is providing, so build
    // an index to rearrange columns in the proper order
    List<String> fileColumnNames = getColumnNames(schema);
    List<Type> fileColumnTypes = getColumnTypes(schema).stream()
            .map(hiveType -> hiveType.getType(typeManager, getTimestampPrecision(session)))
            .collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
    if (transaction.isAcidDeleteOperation(writerKind)) {
        // For delete, set the "row" column to -1
        fileInputColumnIndexes[fileInputColumnIndexes.length - 1] = -1;
    }
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
        OrcDataSink orcDataSink = createOrcDataSink(fileSystem, path);
        Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
        if (isOrcOptimizedWriterValidate(session)) {
            validationInputFactory = Optional.of(() -> {
                try {
                    return new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSystem.getFileStatus(path).getLen(), new OrcReaderOptions(), fileSystem.open(path), readStats);
                } catch (IOException e) {
                    throw new TrinoException(HIVE_WRITE_VALIDATION_FAILED, e);
                }
            });
        }
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        if (transaction.isInsert() && useAcidSchema) {
            // Only add the ACID columns if the request is for insert-type operations -- for delete operations,
            // the columns are added by the caller.  This is because the ACID columns for delete operations
            // depend on the rows being deleted, whereas the ACID columns for INSERT are completely determined
            // by bucket and writeId.
            Type rowType = createRowType(fileColumnNames, fileColumnTypes);
            fileColumnNames = ACID_COLUMN_NAMES;
            fileColumnTypes = createAcidColumnPrestoTypes(rowType);
        }
        return Optional.of(new OrcFileWriter(
                orcDataSink,
                writerKind,
                transaction,
                useAcidSchema,
                bucketNumber,
                rollbackAction,
                fileColumnNames,
                fileColumnTypes,
                createRootOrcType(fileColumnNames, fileColumnTypes),
                compression,
                getOrcWriterOptions(schema, orcWriterOptions)
                        .withStripeMinSize(getOrcOptimizedWriterMinStripeSize(session))
                        .withStripeMaxSize(getOrcOptimizedWriterMaxStripeSize(session))
                        .withStripeMaxRowCount(getOrcOptimizedWriterMaxStripeRows(session))
                        .withDictionaryMaxMemory(getOrcOptimizedWriterMaxDictionaryMemory(session))
                        .withMaxStringStatisticsLimit(getOrcStringStatisticsLimit(session)),
                fileInputColumnIndexes,
                ImmutableMap.<String, String>builder()
                        .put(PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(PRESTO_QUERY_ID_NAME, session.getQueryId())
                        .buildOrThrow(),
                validationInputFactory,
                getOrcOptimizedWriterValidateMode(session),
                stats));
    } catch (IOException e) {
        throw new TrinoException(HIVE_WRITER_OPEN_ERROR, "Error creating ORC file", e);
    }
}
Also used : OutputStreamOrcDataSink(io.trino.orc.OutputStreamOrcDataSink) FileSystem(org.apache.hadoop.fs.FileSystem) HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory(io.trino.plugin.hive.HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory) OrcDataSink(io.trino.orc.OrcDataSink) OrcWriterStats(io.trino.orc.OrcWriterStats) HiveUtil.getColumnNames(io.trino.plugin.hive.util.HiveUtil.getColumnNames) OrcConf(org.apache.orc.OrcConf) Path(org.apache.hadoop.fs.Path) PRESTO_QUERY_ID_NAME(io.trino.plugin.hive.HiveMetadata.PRESTO_QUERY_ID_NAME) OrcDataSource(io.trino.orc.OrcDataSource) ENGLISH(java.util.Locale.ENGLISH) StorageFormat(io.trino.plugin.hive.metastore.StorageFormat) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ImmutableMap(com.google.common.collect.ImmutableMap) AcidSchema.createRowType(io.trino.plugin.hive.acid.AcidSchema.createRowType) HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize(io.trino.plugin.hive.HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize) TrinoException(io.trino.spi.TrinoException) HiveSessionProperties.getOrcOptimizedWriterValidateMode(io.trino.plugin.hive.HiveSessionProperties.getOrcOptimizedWriterValidateMode) WriterKind(io.trino.plugin.hive.WriterKind) HiveSessionProperties.getOrcOptimizedWriterMinStripeSize(io.trino.plugin.hive.HiveSessionProperties.getOrcOptimizedWriterMinStripeSize) OrcDataSourceId(io.trino.orc.OrcDataSourceId) List(java.util.List) AcidSchema.createAcidColumnPrestoTypes(io.trino.plugin.hive.acid.AcidSchema.createAcidColumnPrestoTypes) HIVE_WRITE_VALIDATION_FAILED(io.trino.plugin.hive.HiveErrorCode.HIVE_WRITE_VALIDATION_FAILED) Optional(java.util.Optional) ACID_COLUMN_NAMES(io.trino.plugin.hive.acid.AcidSchema.ACID_COLUMN_NAMES) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) HiveFileWriterFactory(io.trino.plugin.hive.HiveFileWriterFactory) Flatten(org.weakref.jmx.Flatten) Type(io.trino.spi.type.Type) Callable(java.util.concurrent.Callable) OptionalInt(java.util.OptionalInt) Supplier(java.util.function.Supplier) FileWriter(io.trino.plugin.hive.FileWriter) Inject(javax.inject.Inject) Managed(org.weakref.jmx.Managed) NodeVersion(io.trino.plugin.hive.NodeVersion) OrcReaderOptions(io.trino.orc.OrcReaderOptions) Objects.requireNonNull(java.util.Objects.requireNonNull) HiveSessionProperties.isOrcOptimizedWriterValidate(io.trino.plugin.hive.HiveSessionProperties.isOrcOptimizedWriterValidate) HIVE_WRITER_OPEN_ERROR(io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR) HiveSessionProperties.getOrcStringStatisticsLimit(io.trino.plugin.hive.HiveSessionProperties.getOrcStringStatisticsLimit) HiveUtil.getColumnTypes(io.trino.plugin.hive.util.HiveUtil.getColumnTypes) Properties(java.util.Properties) HIVE_UNSUPPORTED_FORMAT(io.trino.plugin.hive.HiveErrorCode.HIVE_UNSUPPORTED_FORMAT) HiveSessionProperties.getTimestampPrecision(io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) HiveUtil.getOrcWriterOptions(io.trino.plugin.hive.util.HiveUtil.getOrcWriterOptions) CompressionKind(io.trino.orc.metadata.CompressionKind) JobConf(org.apache.hadoop.mapred.JobConf) Collectors.toList(java.util.stream.Collectors.toList) OrcType.createRootOrcType(io.trino.orc.metadata.OrcType.createRootOrcType) OrcWriterOptions(io.trino.orc.OrcWriterOptions) PRESTO_VERSION_NAME(io.trino.plugin.hive.HiveMetadata.PRESTO_VERSION_NAME) TypeManager(io.trino.spi.type.TypeManager) HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows(io.trino.plugin.hive.HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows)
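
The index rearrangement in the ORC example (fileInputColumnIndexes) and the delete-path special case can be illustrated outside of Trino. The sketch below is a minimal, self-contained approximation of that mapping logic with hypothetical column names; it is not Trino code, only the same List.indexOf pattern used in createFileWriter.

import java.util.Arrays;
import java.util.List;

public class ColumnIndexMappingSketch
{
    public static void main(String[] args)
    {
        // Column order stored in the existing table/partition schema (hypothetical names)
        List<String> fileColumnNames = List.of("ds", "user_id", "amount", "row");
        // Column order provided by the writer for this query (hypothetical names)
        List<String> inputColumnNames = List.of("user_id", "amount", "ds", "row");

        // Same pattern as createFileWriter: for each file column, find its position in the
        // writer's input; indexOf returns -1 when the input does not carry that column
        int[] fileInputColumnIndexes = fileColumnNames.stream()
                .mapToInt(inputColumnNames::indexOf)
                .toArray();
        System.out.println(Arrays.toString(fileInputColumnIndexes)); // prints [2, 0, 1, 3]

        // For ACID deletes, the ORC factory forces the last ("row") column index to -1,
        // because the caller supplies the ACID row column itself
        boolean isAcidDelete = true; // stand-in for transaction.isAcidDeleteOperation(writerKind)
        if (isAcidDelete) {
            fileInputColumnIndexes[fileInputColumnIndexes.length - 1] = -1;
        }
        System.out.println(Arrays.toString(fileInputColumnIndexes)); // prints [2, 0, 1, -1]
    }
}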

Example 2 with WriterKind

Use of io.trino.plugin.hive.WriterKind in project trino by trinodb.

From the class ParquetFileWriterFactory, the method createFileWriter:

@Override
public Optional<FileWriter> createFileWriter(
        Path path,
        List<String> inputColumnNames,
        StorageFormat storageFormat,
        Properties schema,
        JobConf conf,
        ConnectorSession session,
        OptionalInt bucketNumber,
        AcidTransaction transaction,
        boolean useAcidSchema,
        WriterKind writerKind) {
    if (!HiveSessionProperties.isParquetOptimizedWriterEnabled(session)) {
        return Optional.empty();
    }
    if (!MapredParquetOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }
    ParquetWriterOptions parquetWriterOptions = ParquetWriterOptions.builder()
            .setMaxPageSize(HiveSessionProperties.getParquetWriterPageSize(session))
            .setMaxBlockSize(HiveSessionProperties.getParquetWriterBlockSize(session))
            .setBatchSize(HiveSessionProperties.getParquetBatchSize(session))
            .build();
    CompressionCodecName compressionCodecName = getCompression(conf);
    List<String> fileColumnNames = getColumnNames(schema);
    List<Type> fileColumnTypes = getColumnTypes(schema).stream().map(hiveType -> hiveType.getType(typeManager, getTimestampPrecision(session))).collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, conf);
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        ParquetSchemaConverter schemaConverter = new ParquetSchemaConverter(fileColumnTypes, fileColumnNames);
        return Optional.of(new ParquetFileWriter(
                fileSystem.create(path, false),
                rollbackAction,
                fileColumnTypes,
                schemaConverter.getMessageType(),
                schemaConverter.getPrimitiveTypes(),
                parquetWriterOptions,
                fileInputColumnIndexes,
                compressionCodecName,
                nodeVersion.toString()));
    } catch (IOException e) {
        throw new TrinoException(HIVE_WRITER_OPEN_ERROR, "Error creating Parquet file", e);
    }
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) Type(io.trino.spi.type.Type) Callable(java.util.concurrent.Callable) OptionalInt(java.util.OptionalInt) FileWriter(io.trino.plugin.hive.FileWriter) Inject(javax.inject.Inject) HiveUtil.getColumnNames(io.trino.plugin.hive.util.HiveUtil.getColumnNames) NodeVersion(io.trino.plugin.hive.NodeVersion) Objects.requireNonNull(java.util.Objects.requireNonNull) Path(org.apache.hadoop.fs.Path) ParquetWriterOptions(io.trino.parquet.writer.ParquetWriterOptions) StorageFormat(io.trino.plugin.hive.metastore.StorageFormat) HIVE_WRITER_OPEN_ERROR(io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) HiveUtil.getColumnTypes(io.trino.plugin.hive.util.HiveUtil.getColumnTypes) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) Properties(java.util.Properties) ParquetOutputFormat(org.apache.parquet.hadoop.ParquetOutputFormat) ParquetSchemaConverter(io.trino.parquet.writer.ParquetSchemaConverter) HiveSessionProperties.getTimestampPrecision(io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision) TrinoException(io.trino.spi.TrinoException) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) HiveSessionProperties(io.trino.plugin.hive.HiveSessionProperties) WriterKind(io.trino.plugin.hive.WriterKind) JobConf(org.apache.hadoop.mapred.JobConf) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) MapredParquetOutputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat) Optional(java.util.Optional) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) HiveFileWriterFactory(io.trino.plugin.hive.HiveFileWriterFactory) TypeManager(io.trino.spi.type.TypeManager)
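
Both factories register the same kind of cleanup hook before handing back a FileWriter: a Callable<Void> that deletes the target path if the write is later abandoned. The sketch below reproduces that pattern in isolation; it is a minimal approximation that substitutes java.nio.file.Files for the Hadoop FileSystem used in the examples, with a hypothetical temporary file standing in for the HDFS path.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.Callable;

public class RollbackActionSketch
{
    public static void main(String[] args) throws Exception
    {
        // Stand-in for the path the factories write to (hypothetical local temp file)
        Path path = Files.createTempFile("writer-sketch", ".orc");

        // Same shape as rollbackAction in both factories: a Callable<Void> that removes
        // the partially written file when the write is abandoned
        Callable<Void> rollbackAction = () -> {
            Files.deleteIfExists(path); // the factories call fileSystem.delete(path, false)
            return null;
        };

        try {
            // ... a real writer would stream data to the file here and could fail ...
            throw new IOException("simulated write failure");
        }
        catch (IOException e) {
            rollbackAction.call();
            System.out.println("rolled back, file exists: " + Files.exists(path)); // false
        }
    }
}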

Aggregations

FileWriter (io.trino.plugin.hive.FileWriter) 2
HdfsEnvironment (io.trino.plugin.hive.HdfsEnvironment) 2
HIVE_WRITER_OPEN_ERROR (io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR) 2
HiveFileWriterFactory (io.trino.plugin.hive.HiveFileWriterFactory) 2
HiveSessionProperties.getTimestampPrecision (io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision) 2
NodeVersion (io.trino.plugin.hive.NodeVersion) 2
WriterKind (io.trino.plugin.hive.WriterKind) 2
AcidTransaction (io.trino.plugin.hive.acid.AcidTransaction) 2
StorageFormat (io.trino.plugin.hive.metastore.StorageFormat) 2
HiveUtil.getColumnNames (io.trino.plugin.hive.util.HiveUtil.getColumnNames) 2
HiveUtil.getColumnTypes (io.trino.plugin.hive.util.HiveUtil.getColumnTypes) 2
TrinoException (io.trino.spi.TrinoException) 2
ConnectorSession (io.trino.spi.connector.ConnectorSession) 2
Type (io.trino.spi.type.Type) 2
TypeManager (io.trino.spi.type.TypeManager) 2
IOException (java.io.IOException) 2
List (java.util.List) 2
Objects.requireNonNull (java.util.Objects.requireNonNull) 2
Optional (java.util.Optional) 2
OptionalInt (java.util.OptionalInt) 2