Search in sources :

Example 1 with HIVE_WRITER_OPEN_ERROR

use of io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR in project trino by trinodb.

the class RcFileFileWriterFactory method createFileWriter.

/**
 * Creates an RCFile writer for {@code path} when the storage format describes an RCFile table
 * with a serde this factory knows how to encode.
 *
 * <p>{@link Optional#empty()} is returned, without touching the file system, when the output
 * format is not {@code RCFileOutputFormat} or when the serde is neither
 * {@code LazyBinaryColumnarSerDe} (binary encoding) nor {@code ColumnarSerDe} (text encoding).
 *
 * <p>The writer receives a column index mapping, because the file schema may order columns
 * differently from {@code inputColumnNames}; a file column that is absent from the input maps to
 * {@code -1} (the result of {@link List#indexOf(Object)}).
 *
 * @param path target file; it is created with {@code FileSystem.create(path, false)}
 * @param inputColumnNames names of the columns in the order the caller will supply them
 * @param storageFormat storage format whose output format and serde select this factory
 * @param schema table/partition schema properties used to derive file column names and types
 * @param configuration job configuration; also the source of the optional compression codec name
 * @param session session supplying identity, query id, timestamp precision and the validation flag
 * @param bucketNumber not used by this factory
 * @param transaction not used by this factory
 * @param useAcidSchema not used by this factory
 * @param writerKind not used by this factory
 * @return the writer, or empty when this factory does not handle the storage format
 * @throws TrinoException with {@code HIVE_WRITER_OPEN_ERROR} if the file or the writer cannot be created
 */
@Override
public Optional<FileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session, OptionalInt bucketNumber, AcidTransaction transaction, boolean useAcidSchema, WriterKind writerKind) {
    if (!RCFileOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }
    RcFileEncoding rcFileEncoding;
    if (LazyBinaryColumnarSerDe.class.getName().equals(storageFormat.getSerde())) {
        rcFileEncoding = new BinaryRcFileEncoding(timeZone);
    } else if (ColumnarSerDe.class.getName().equals(storageFormat.getSerde())) {
        rcFileEncoding = createTextVectorEncoding(schema);
    } else {
        return Optional.empty();
    }
    Optional<String> codecName = Optional.ofNullable(configuration.get(FileOutputFormat.COMPRESS_CODEC));
    // existing tables and partitions may have columns in a different order than the writer is providing, so build
    // an index to rearrange columns in the proper order
    List<String> fileColumnNames = getColumnNames(schema);
    List<Type> fileColumnTypes = getColumnTypes(schema).stream().map(hiveType -> hiveType.getType(typeManager, getTimestampPrecision(session))).collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
        OutputStream outputStream = fileSystem.create(path, false);
        try {
            // Validation re-opens the written file lazily, so only a supplier is handed to the writer
            Optional<Supplier<RcFileDataSource>> validationInputFactory = Optional.empty();
            if (isRcfileOptimizedWriterValidate(session)) {
                validationInputFactory = Optional.of(() -> {
                    try {
                        return new HdfsRcFileDataSource(path.toString(), fileSystem.open(path), fileSystem.getFileStatus(path).getLen(), stats);
                    } catch (IOException e) {
                        throw new TrinoException(HIVE_WRITE_VALIDATION_FAILED, e);
                    }
                });
            }
            Callable<Void> rollbackAction = () -> {
                fileSystem.delete(path, false);
                return null;
            };
            return Optional.of(new RcFileFileWriter(outputStream, rollbackAction, rcFileEncoding, fileColumnTypes, codecName, fileInputColumnIndexes, ImmutableMap.<String, String>builder().put(PRESTO_VERSION_NAME, nodeVersion.toString()).put(PRESTO_QUERY_ID_NAME, session.getQueryId()).buildOrThrow(), validationInputFactory));
        } catch (Throwable failure) {
            // No writer took ownership of the stream, so release it here instead of leaking it;
            // the original failure stays the primary error
            try {
                outputStream.close();
            } catch (IOException | RuntimeException closeFailure) {
                if (failure != closeFailure) {
                    failure.addSuppressed(closeFailure);
                }
            }
            throw failure;
        }
    } catch (Exception e) {
        throw new TrinoException(HIVE_WRITER_OPEN_ERROR, "Error creating RCFile file", e);
    }
}
Also used : HdfsRcFileDataSource(io.trino.plugin.hive.rcfile.HdfsRcFileDataSource) BinaryRcFileEncoding(io.trino.rcfile.binary.BinaryRcFileEncoding) DateTimeZone(org.joda.time.DateTimeZone) FileSystem(org.apache.hadoop.fs.FileSystem) LazyBinaryColumnarSerDe(org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe) Type(io.trino.spi.type.Type) Callable(java.util.concurrent.Callable) RcFileDataSource(io.trino.rcfile.RcFileDataSource) OptionalInt(java.util.OptionalInt) Supplier(java.util.function.Supplier) Inject(javax.inject.Inject) HiveUtil.getColumnNames(io.trino.plugin.hive.util.HiveUtil.getColumnNames) HiveSessionProperties.isRcfileOptimizedWriterValidate(io.trino.plugin.hive.HiveSessionProperties.isRcfileOptimizedWriterValidate) Objects.requireNonNull(java.util.Objects.requireNonNull) Path(org.apache.hadoop.fs.Path) PRESTO_QUERY_ID_NAME(io.trino.plugin.hive.HiveMetadata.PRESTO_QUERY_ID_NAME) StorageFormat(io.trino.plugin.hive.metastore.StorageFormat) OutputStream(java.io.OutputStream) HIVE_WRITER_OPEN_ERROR(io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) HiveUtil.getColumnTypes(io.trino.plugin.hive.util.HiveUtil.getColumnTypes) Properties(java.util.Properties) ImmutableMap(com.google.common.collect.ImmutableMap) RCFileOutputFormat(org.apache.hadoop.hive.ql.io.RCFileOutputFormat) HiveSessionProperties.getTimestampPrecision(io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision) TrinoException(io.trino.spi.TrinoException) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) RcFileEncoding(io.trino.rcfile.RcFileEncoding) ColumnarSerDe(org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe) JobConf(org.apache.hadoop.mapred.JobConf) FileOutputFormat(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat) List(java.util.List) 
RcFilePageSourceFactory.createTextVectorEncoding(io.trino.plugin.hive.rcfile.RcFilePageSourceFactory.createTextVectorEncoding) Collectors.toList(java.util.stream.Collectors.toList) HIVE_WRITE_VALIDATION_FAILED(io.trino.plugin.hive.HiveErrorCode.HIVE_WRITE_VALIDATION_FAILED) Optional(java.util.Optional) PRESTO_VERSION_NAME(io.trino.plugin.hive.HiveMetadata.PRESTO_VERSION_NAME) TypeManager(io.trino.spi.type.TypeManager) OutputStream(java.io.OutputStream) BinaryRcFileEncoding(io.trino.rcfile.binary.BinaryRcFileEncoding) RcFileEncoding(io.trino.rcfile.RcFileEncoding) IOException(java.io.IOException) TrinoException(io.trino.spi.TrinoException) IOException(java.io.IOException) Type(io.trino.spi.type.Type) HdfsRcFileDataSource(io.trino.plugin.hive.rcfile.HdfsRcFileDataSource) FileSystem(org.apache.hadoop.fs.FileSystem) TrinoException(io.trino.spi.TrinoException) LazyBinaryColumnarSerDe(org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe) Supplier(java.util.function.Supplier) BinaryRcFileEncoding(io.trino.rcfile.binary.BinaryRcFileEncoding)

Example 2 with HIVE_WRITER_OPEN_ERROR

use of io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR in project trino by trinodb.

the class OrcFileWriterFactory method createFileWriter.

/**
 * Creates an ORC writer for {@code path}, or returns {@link Optional#empty()} when the storage
 * format's output format is not {@code OrcOutputFormat}.
 *
 * <p>File column names and types come from {@code schema}. Each file column is mapped to its
 * position in {@code inputColumnNames} ({@code -1} when absent). For ACID delete operations the
 * last mapping entry is forced to {@code -1}. For ACID inserts the writer is given the ACID
 * column layout wrapping the data columns as a row type.
 *
 * @return the writer, or empty when this factory does not handle the storage format
 * @throws TrinoException with {@code HIVE_WRITER_OPEN_ERROR} if an {@link IOException} occurs while opening the file
 */
@Override
public Optional<FileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session, OptionalInt bucketNumber, AcidTransaction transaction, boolean useAcidSchema, WriterKind writerKind) {
    boolean orcOutputFormat = OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat());
    if (!orcOutputFormat) {
        return Optional.empty();
    }

    CompressionKind compressionKind = getCompression(schema, configuration);

    // The table or partition may declare its columns in a different order than the caller
    // supplies them, so record where each file column sits in the input
    List<String> schemaColumnNames = getColumnNames(schema);
    List<Type> schemaColumnTypes = getColumnTypes(schema).stream()
            .map(hiveType -> hiveType.getType(typeManager, getTimestampPrecision(session)))
            .collect(toList());
    int[] inputIndexes = schemaColumnNames.stream()
            .mapToInt(inputColumnNames::indexOf)
            .toArray();
    if (transaction.isAcidDeleteOperation(writerKind)) {
        // Deletes carry no "row" column: mark it as absent
        inputIndexes[inputIndexes.length - 1] = -1;
    }

    try {
        FileSystem targetFileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
        OrcDataSink dataSink = createOrcDataSink(targetFileSystem, path);

        // When validation is enabled the writer re-reads the file through this supplier
        final Optional<Supplier<OrcDataSource>> validationSource;
        if (isOrcOptimizedWriterValidate(session)) {
            validationSource = Optional.of(() -> {
                try {
                    return new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), targetFileSystem.getFileStatus(path).getLen(), new OrcReaderOptions(), targetFileSystem.open(path), readStats);
                } catch (IOException e) {
                    throw new TrinoException(HIVE_WRITE_VALIDATION_FAILED, e);
                }
            });
        } else {
            validationSource = Optional.empty();
        }

        Callable<Void> deleteOnRollback = () -> {
            targetFileSystem.delete(path, false);
            return null;
        };

        List<String> writerColumnNames = schemaColumnNames;
        List<Type> writerColumnTypes = schemaColumnTypes;
        if (transaction.isInsert() && useAcidSchema) {
            // ACID columns are added here only for insert-type operations. For deletes the caller
            // adds them, because they depend on the rows being deleted, whereas for INSERT they
            // are fully determined by bucket and writeId.
            Type dataRowType = createRowType(schemaColumnNames, schemaColumnTypes);
            writerColumnNames = ACID_COLUMN_NAMES;
            writerColumnTypes = createAcidColumnPrestoTypes(dataRowType);
        }

        return Optional.of(new OrcFileWriter(
                dataSink,
                writerKind,
                transaction,
                useAcidSchema,
                bucketNumber,
                deleteOnRollback,
                writerColumnNames,
                writerColumnTypes,
                createRootOrcType(writerColumnNames, writerColumnTypes),
                compressionKind,
                getOrcWriterOptions(schema, orcWriterOptions)
                        .withStripeMinSize(getOrcOptimizedWriterMinStripeSize(session))
                        .withStripeMaxSize(getOrcOptimizedWriterMaxStripeSize(session))
                        .withStripeMaxRowCount(getOrcOptimizedWriterMaxStripeRows(session))
                        .withDictionaryMaxMemory(getOrcOptimizedWriterMaxDictionaryMemory(session))
                        .withMaxStringStatisticsLimit(getOrcStringStatisticsLimit(session)),
                inputIndexes,
                ImmutableMap.<String, String>builder()
                        .put(PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(PRESTO_QUERY_ID_NAME, session.getQueryId())
                        .buildOrThrow(),
                validationSource,
                getOrcOptimizedWriterValidateMode(session),
                stats));
    } catch (IOException e) {
        throw new TrinoException(HIVE_WRITER_OPEN_ERROR, "Error creating ORC file", e);
    }
}
Also used : OutputStreamOrcDataSink(io.trino.orc.OutputStreamOrcDataSink) FileSystem(org.apache.hadoop.fs.FileSystem) HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory(io.trino.plugin.hive.HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory) OrcDataSink(io.trino.orc.OrcDataSink) OrcWriterStats(io.trino.orc.OrcWriterStats) HiveUtil.getColumnNames(io.trino.plugin.hive.util.HiveUtil.getColumnNames) OrcConf(org.apache.orc.OrcConf) Path(org.apache.hadoop.fs.Path) PRESTO_QUERY_ID_NAME(io.trino.plugin.hive.HiveMetadata.PRESTO_QUERY_ID_NAME) OrcDataSource(io.trino.orc.OrcDataSource) ENGLISH(java.util.Locale.ENGLISH) StorageFormat(io.trino.plugin.hive.metastore.StorageFormat) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ImmutableMap(com.google.common.collect.ImmutableMap) AcidSchema.createRowType(io.trino.plugin.hive.acid.AcidSchema.createRowType) HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize(io.trino.plugin.hive.HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize) TrinoException(io.trino.spi.TrinoException) HiveSessionProperties.getOrcOptimizedWriterValidateMode(io.trino.plugin.hive.HiveSessionProperties.getOrcOptimizedWriterValidateMode) WriterKind(io.trino.plugin.hive.WriterKind) HiveSessionProperties.getOrcOptimizedWriterMinStripeSize(io.trino.plugin.hive.HiveSessionProperties.getOrcOptimizedWriterMinStripeSize) OrcDataSourceId(io.trino.orc.OrcDataSourceId) List(java.util.List) AcidSchema.createAcidColumnPrestoTypes(io.trino.plugin.hive.acid.AcidSchema.createAcidColumnPrestoTypes) HIVE_WRITE_VALIDATION_FAILED(io.trino.plugin.hive.HiveErrorCode.HIVE_WRITE_VALIDATION_FAILED) Optional(java.util.Optional) ACID_COLUMN_NAMES(io.trino.plugin.hive.acid.AcidSchema.ACID_COLUMN_NAMES) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) 
HiveFileWriterFactory(io.trino.plugin.hive.HiveFileWriterFactory) Flatten(org.weakref.jmx.Flatten) Type(io.trino.spi.type.Type) Callable(java.util.concurrent.Callable) OptionalInt(java.util.OptionalInt) Supplier(java.util.function.Supplier) FileWriter(io.trino.plugin.hive.FileWriter) Inject(javax.inject.Inject) Managed(org.weakref.jmx.Managed) NodeVersion(io.trino.plugin.hive.NodeVersion) OrcReaderOptions(io.trino.orc.OrcReaderOptions) Objects.requireNonNull(java.util.Objects.requireNonNull) HiveSessionProperties.isOrcOptimizedWriterValidate(io.trino.plugin.hive.HiveSessionProperties.isOrcOptimizedWriterValidate) HIVE_WRITER_OPEN_ERROR(io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR) HiveSessionProperties.getOrcStringStatisticsLimit(io.trino.plugin.hive.HiveSessionProperties.getOrcStringStatisticsLimit) HiveUtil.getColumnTypes(io.trino.plugin.hive.util.HiveUtil.getColumnTypes) Properties(java.util.Properties) HIVE_UNSUPPORTED_FORMAT(io.trino.plugin.hive.HiveErrorCode.HIVE_UNSUPPORTED_FORMAT) HiveSessionProperties.getTimestampPrecision(io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) HiveUtil.getOrcWriterOptions(io.trino.plugin.hive.util.HiveUtil.getOrcWriterOptions) CompressionKind(io.trino.orc.metadata.CompressionKind) JobConf(org.apache.hadoop.mapred.JobConf) Collectors.toList(java.util.stream.Collectors.toList) OrcType.createRootOrcType(io.trino.orc.metadata.OrcType.createRootOrcType) OrcWriterOptions(io.trino.orc.OrcWriterOptions) PRESTO_VERSION_NAME(io.trino.plugin.hive.HiveMetadata.PRESTO_VERSION_NAME) TypeManager(io.trino.spi.type.TypeManager) HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows(io.trino.plugin.hive.HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows) CompressionKind(io.trino.orc.metadata.CompressionKind) OrcDataSourceId(io.trino.orc.OrcDataSourceId) OutputStreamOrcDataSink(io.trino.orc.OutputStreamOrcDataSink) 
OrcDataSink(io.trino.orc.OrcDataSink) IOException(java.io.IOException) AcidSchema.createRowType(io.trino.plugin.hive.acid.AcidSchema.createRowType) Type(io.trino.spi.type.Type) OrcType.createRootOrcType(io.trino.orc.metadata.OrcType.createRootOrcType) OrcReaderOptions(io.trino.orc.OrcReaderOptions) FileSystem(org.apache.hadoop.fs.FileSystem) TrinoException(io.trino.spi.TrinoException) Supplier(java.util.function.Supplier)

Example 3 with HIVE_WRITER_OPEN_ERROR

use of io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR in project trino by trinodb.

the class ParquetFileWriterFactory method createFileWriter.

/**
 * Creates a Parquet writer for {@code path}.
 *
 * <p>Returns {@link Optional#empty()} when the optimized Parquet writer is disabled for the
 * session or when the storage format's output format is not {@code MapredParquetOutputFormat}.
 * File column names and types come from {@code schema}; each file column is mapped to its
 * position in {@code inputColumnNames} ({@code -1} when absent).
 *
 * @return the writer, or empty when this factory does not handle the request
 * @throws TrinoException with {@code HIVE_WRITER_OPEN_ERROR} if an {@link IOException} occurs while opening the file
 */
@Override
public Optional<FileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf conf, ConnectorSession session, OptionalInt bucketNumber, AcidTransaction transaction, boolean useAcidSchema, WriterKind writerKind) {
    boolean optimizedWriterDisabled = !HiveSessionProperties.isParquetOptimizedWriterEnabled(session);
    if (optimizedWriterDisabled || !MapredParquetOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }

    ParquetWriterOptions writerOptions = ParquetWriterOptions.builder()
            .setMaxPageSize(HiveSessionProperties.getParquetWriterPageSize(session))
            .setMaxBlockSize(HiveSessionProperties.getParquetWriterBlockSize(session))
            .setBatchSize(HiveSessionProperties.getParquetBatchSize(session))
            .build();
    CompressionCodecName codec = getCompression(conf);

    List<String> schemaColumnNames = getColumnNames(schema);
    List<Type> schemaColumnTypes = getColumnTypes(schema).stream()
            .map(hiveType -> hiveType.getType(typeManager, getTimestampPrecision(session)))
            .collect(toList());
    // Position of every file column within the caller-supplied column order
    int[] inputIndexes = schemaColumnNames.stream()
            .mapToInt(inputColumnNames::indexOf)
            .toArray();

    try {
        FileSystem targetFileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, conf);
        Callable<Void> deleteOnRollback = () -> {
            targetFileSystem.delete(path, false);
            return null;
        };
        ParquetSchemaConverter converter = new ParquetSchemaConverter(schemaColumnTypes, schemaColumnNames);
        return Optional.of(new ParquetFileWriter(
                targetFileSystem.create(path, false),
                deleteOnRollback,
                schemaColumnTypes,
                converter.getMessageType(),
                converter.getPrimitiveTypes(),
                writerOptions,
                inputIndexes,
                codec,
                nodeVersion.toString()));
    } catch (IOException e) {
        throw new TrinoException(HIVE_WRITER_OPEN_ERROR, "Error creating Parquet file", e);
    }
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) Type(io.trino.spi.type.Type) Callable(java.util.concurrent.Callable) OptionalInt(java.util.OptionalInt) FileWriter(io.trino.plugin.hive.FileWriter) Inject(javax.inject.Inject) HiveUtil.getColumnNames(io.trino.plugin.hive.util.HiveUtil.getColumnNames) NodeVersion(io.trino.plugin.hive.NodeVersion) Objects.requireNonNull(java.util.Objects.requireNonNull) Path(org.apache.hadoop.fs.Path) ParquetWriterOptions(io.trino.parquet.writer.ParquetWriterOptions) StorageFormat(io.trino.plugin.hive.metastore.StorageFormat) HIVE_WRITER_OPEN_ERROR(io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) HiveUtil.getColumnTypes(io.trino.plugin.hive.util.HiveUtil.getColumnTypes) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) Properties(java.util.Properties) ParquetOutputFormat(org.apache.parquet.hadoop.ParquetOutputFormat) ParquetSchemaConverter(io.trino.parquet.writer.ParquetSchemaConverter) HiveSessionProperties.getTimestampPrecision(io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision) TrinoException(io.trino.spi.TrinoException) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) HiveSessionProperties(io.trino.plugin.hive.HiveSessionProperties) WriterKind(io.trino.plugin.hive.WriterKind) JobConf(org.apache.hadoop.mapred.JobConf) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) MapredParquetOutputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat) Optional(java.util.Optional) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) HiveFileWriterFactory(io.trino.plugin.hive.HiveFileWriterFactory) TypeManager(io.trino.spi.type.TypeManager) IOException(java.io.IOException) Type(io.trino.spi.type.Type) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) FileSystem(org.apache.hadoop.fs.FileSystem) 
TrinoException(io.trino.spi.TrinoException) ParquetWriterOptions(io.trino.parquet.writer.ParquetWriterOptions) ParquetSchemaConverter(io.trino.parquet.writer.ParquetSchemaConverter)

Example 4 with HIVE_WRITER_OPEN_ERROR

use of io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR in project trino by trinodb.

the class HiveWriterFactory method createWriter.

/**
 * Creates the {@link HiveWriter} for the partition found at {@code position} of
 * {@code partitionColumns} and, for bucketed tables, for the given bucket.
 *
 * <p>The method proceeds in these steps:
 * <ol>
 * <li>validates {@code bucketNumber} against the configured bucket count;</li>
 * <li>derives the partition values and name, and looks up the existing partition when the
 * table exists and partition values are present;</li>
 * <li>selects the update mode, schema, write location and storage format depending on whether
 * the table/partition is new or existing and on {@code insertExistingPartitionsBehavior};</li>
 * <li>computes the target file path (an ACID sub-directory bucket path while an ACID
 * transaction is running, otherwise a computed file name plus format extension);</li>
 * <li>asks each registered {@code HiveFileWriterFactory} for a writer and falls back to
 * {@code RecordFileWriter} when none accepts the storage format;</li>
 * <li>wraps the writer in a {@code SortingFileWriter} when {@code sortedBy} is not empty.</li>
 * </ol>
 *
 * <p>The returned writer posts a {@code WriteCompletedEvent} on commit; a failure to obtain the
 * written size is tolerated there and reported as a missing size.
 *
 * @param partitionColumns page holding the partition column values
 * @param position row of {@code partitionColumns} to read the partition values from
 * @param bucketNumber bucket to write; must be present exactly when the table is bucketed
 * @return a writer bound to the resolved write path
 * @throws IllegalArgumentException if {@code bucketNumber} is inconsistent with the bucket count,
 * or the insert-existing-partitions behavior is not supported
 * @throws TrinoException if the target partition directory already exists, the table or partition
 * is read-only, the partition schema mismatches the table schema, a sorting column is missing
 * from the table, or the temporary sort file system cannot be opened
 */
public HiveWriter createWriter(Page partitionColumns, int position, OptionalInt bucketNumber) {
    if (bucketCount.isPresent()) {
        checkArgument(bucketNumber.isPresent(), "Bucket not provided for bucketed table");
        checkArgument(bucketNumber.getAsInt() < bucketCount.getAsInt(), "Bucket number %s must be less than bucket count %s", bucketNumber, bucketCount);
    } else {
        checkArgument(bucketNumber.isEmpty(), "Bucket number provided by for table that is not bucketed");
    }
    List<String> partitionValues = createPartitionValues(partitionColumnTypes, partitionColumns, position);
    Optional<String> partitionName;
    if (!partitionColumnNames.isEmpty()) {
        partitionName = Optional.of(FileUtils.makePartName(partitionColumnNames, partitionValues));
    } else {
        partitionName = Optional.empty();
    }
    // attempt to get the existing partition (if this is an existing partitioned table)
    Optional<Partition> partition = Optional.empty();
    if (!partitionValues.isEmpty() && table != null) {
        partition = pageSinkMetadataProvider.getPartition(partitionValues);
    }
    UpdateMode updateMode;
    Properties schema;
    WriteInfo writeInfo;
    StorageFormat outputStorageFormat;
    if (partition.isEmpty()) {
        if (table == null) {
            // Write to: a new partition in a new partitioned table,
            // or a new unpartitioned table.
            updateMode = UpdateMode.NEW;
            schema = new Properties();
            schema.setProperty(IOConstants.COLUMNS, dataColumns.stream().map(DataColumn::getName).collect(joining(",")));
            schema.setProperty(IOConstants.COLUMNS_TYPES, dataColumns.stream().map(DataColumn::getHiveType).map(HiveType::getHiveTypeName).map(HiveTypeName::toString).collect(joining(":")));
            if (partitionName.isEmpty()) {
                // new unpartitioned table
                writeInfo = locationService.getTableWriteInfo(locationHandle, false);
            } else {
                // a new partition in a new partitioned table
                writeInfo = locationService.getPartitionWriteInfo(locationHandle, partition, partitionName.get());
                if (!writeInfo.getWriteMode().isWritePathSameAsTargetPath()) {
                    // verify that the target directory for the partition does not already exist
                    if (HiveWriteUtils.pathExists(new HdfsContext(session), hdfsEnvironment, writeInfo.getTargetPath())) {
                        throw new TrinoException(HIVE_PATH_ALREADY_EXISTS, format("Target directory for new partition '%s' of table '%s.%s' already exists: %s", partitionName, schemaName, tableName, writeInfo.getTargetPath()));
                    }
                }
            }
        } else {
            // Write to: a new partition in an existing partitioned table,
            // or an existing unpartitioned table
            if (partitionName.isPresent()) {
                // a new partition in an existing partitioned table
                updateMode = UpdateMode.NEW;
                writeInfo = locationService.getPartitionWriteInfo(locationHandle, partition, partitionName.get());
            } else {
                switch(insertExistingPartitionsBehavior) {
                    case APPEND:
                        updateMode = UpdateMode.APPEND;
                        writeInfo = locationService.getTableWriteInfo(locationHandle, false);
                        break;
                    case OVERWRITE:
                        updateMode = UpdateMode.OVERWRITE;
                        writeInfo = locationService.getTableWriteInfo(locationHandle, true);
                        break;
                    case ERROR:
                        throw new TrinoException(HIVE_TABLE_READ_ONLY, "Unpartitioned Hive tables are immutable");
                    default:
                        throw new IllegalArgumentException("Unsupported insert existing table behavior: " + insertExistingPartitionsBehavior);
                }
            }
            schema = getHiveSchema(table);
        }
        if (partitionName.isPresent()) {
            // Write to a new partition
            outputStorageFormat = fromHiveStorageFormat(partitionStorageFormat);
        } else {
            // Write to a new/existing unpartitioned table
            outputStorageFormat = fromHiveStorageFormat(tableStorageFormat);
        }
    } else {
        switch(insertExistingPartitionsBehavior) {
            // Write to: an existing partition in an existing partitioned table
            case APPEND:
                // Append to an existing partition
                updateMode = UpdateMode.APPEND;
                // Check the column types in partition schema match the column types in table schema
                List<Column> tableColumns = table.getDataColumns();
                List<Column> existingPartitionColumns = partition.get().getColumns();
                for (int i = 0; i < min(existingPartitionColumns.size(), tableColumns.size()); i++) {
                    HiveType tableType = tableColumns.get(i).getType();
                    HiveType partitionType = existingPartitionColumns.get(i).getType();
                    if (!tableType.equals(partitionType)) {
                        throw new TrinoException(HIVE_PARTITION_SCHEMA_MISMATCH, format("" + "You are trying to write into an existing partition in a table. " + "The table schema has changed since the creation of the partition. " + "Inserting rows into such partition is not supported. " + "The column '%s' in table '%s' is declared as type '%s', " + "but partition '%s' declared column '%s' as type '%s'.", tableColumns.get(i).getName(), tableName, tableType, partitionName, existingPartitionColumns.get(i).getName(), partitionType));
                    }
                }
                HiveWriteUtils.checkPartitionIsWritable(partitionName.get(), partition.get());
                outputStorageFormat = partition.get().getStorage().getStorageFormat();
                schema = getHiveSchema(partition.get(), table);
                writeInfo = locationService.getPartitionWriteInfo(locationHandle, partition, partitionName.get());
                break;
            case OVERWRITE:
                // Overwrite an existing partition
                // 
                // The behavior of overwrite considered as if first dropping the partition and inserting a new partition, thus:
                // * No partition writable check is required.
                // * Table schema and storage format is used for the new partition (instead of existing partition schema and storage format).
                updateMode = UpdateMode.OVERWRITE;
                outputStorageFormat = fromHiveStorageFormat(partitionStorageFormat);
                schema = getHiveSchema(table);
                writeInfo = locationService.getPartitionWriteInfo(locationHandle, Optional.empty(), partitionName.get());
                break;
            case ERROR:
                throw new TrinoException(HIVE_PARTITION_READ_ONLY, "Cannot insert into an existing partition of Hive table: " + partitionName.get());
            default:
                throw new IllegalArgumentException(format("Unsupported insert existing partitions behavior: %s", insertExistingPartitionsBehavior));
        }
    }
    additionalTableParameters.forEach(schema::setProperty);
    validateSchema(partitionName, schema);
    // Unbucketed writes use bucket 0 when building an ACID bucket path
    int bucketToUse = bucketNumber.isEmpty() ? 0 : bucketNumber.getAsInt();
    Path path;
    String fileNameWithExtension;
    if (transaction.isAcidTransactionRunning()) {
        String subdir = computeAcidSubdir(transaction);
        Path subdirPath = new Path(writeInfo.getWritePath(), subdir);
        path = createHiveBucketPath(subdirPath, bucketToUse, table.getParameters());
        fileNameWithExtension = path.getName();
    } else {
        String fileName = computeFileName(bucketNumber);
        fileNameWithExtension = fileName + getFileExtension(conf, outputStorageFormat);
        path = new Path(writeInfo.getWritePath(), fileNameWithExtension);
    }
    boolean useAcidSchema = isCreateTransactionalTable || (table != null && isFullAcidTable(table.getParameters()));
    // The column name list is the same for every factory and for the fallback writer, so build it once
    List<String> dataColumnNames = dataColumns.stream().map(DataColumn::getName).collect(toList());
    FileWriter hiveFileWriter = null;
    for (HiveFileWriterFactory fileWriterFactory : fileWriterFactories) {
        Optional<FileWriter> fileWriter = fileWriterFactory.createFileWriter(path, dataColumnNames, outputStorageFormat, schema, conf, session, bucketNumber, transaction, useAcidSchema, WriterKind.INSERT);
        if (fileWriter.isPresent()) {
            hiveFileWriter = fileWriter.get();
            break;
        }
    }
    if (hiveFileWriter == null) {
        // No factory accepted the storage format: fall back to the generic record writer
        hiveFileWriter = new RecordFileWriter(path, dataColumnNames, outputStorageFormat, schema, partitionStorageFormat.getEstimatedWriterMemoryUsage(), conf, typeManager, parquetTimeZone, session);
    }
    // Captured before any sorting wrapper is applied, so the event reports the underlying writer class
    String writerImplementation = hiveFileWriter.getClass().getName();
    Consumer<HiveWriter> onCommit = hiveWriter -> {
        Optional<Long> size;
        try {
            size = Optional.of(hiveWriter.getWrittenBytes());
        } catch (RuntimeException e) {
            // Do not fail the query if file system is not available
            size = Optional.empty();
        }
        eventClient.post(new WriteCompletedEvent(session.getQueryId(), path.toString(), schemaName, tableName, partitionName.orElse(null), outputStorageFormat.getOutputFormat(), writerImplementation, nodeManager.getCurrentNode().getVersion(), nodeManager.getCurrentNode().getHost(), session.getIdentity().getPrincipal().map(Principal::getName).orElse(null), nodeManager.getEnvironment(), sessionProperties, size.orElse(null), hiveWriter.getRowCount()));
    };
    if (!sortedBy.isEmpty()) {
        FileSystem fileSystem;
        Path tempFilePath;
        if (sortedWritingTempStagingPathEnabled) {
            String tempPrefix = sortedWritingTempStagingPath.replace("${USER}", new HdfsContext(session).getIdentity().getUser());
            tempFilePath = new Path(tempPrefix, ".tmp-sort." + path.getParent().getName() + "." + path.getName());
        } else {
            tempFilePath = new Path(path.getParent(), ".tmp-sort." + path.getName());
        }
        try {
            Configuration configuration = new Configuration(conf);
            // Explicitly set the default FS to local file system to avoid getting HDFS when sortedWritingTempStagingPath specifies no scheme
            configuration.set(FS_DEFAULT_NAME_KEY, "file:///");
            fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), tempFilePath, configuration);
        } catch (IOException e) {
            throw new TrinoException(HIVE_WRITER_OPEN_ERROR, e);
        }
        List<Type> types = dataColumns.stream().map(column -> column.getHiveType().getType(typeManager, getTimestampPrecision(session))).collect(toImmutableList());
        Map<String, Integer> columnIndexes = new HashMap<>();
        for (int i = 0; i < dataColumns.size(); i++) {
            columnIndexes.put(dataColumns.get(i).getName(), i);
        }
        List<Integer> sortFields = new ArrayList<>();
        List<SortOrder> sortOrders = new ArrayList<>();
        for (SortingColumn column : sortedBy) {
            Integer index = columnIndexes.get(column.getColumnName());
            if (index == null) {
                // Reached only when the sorting column is missing from the data columns
                throw new TrinoException(HIVE_INVALID_METADATA, format("Sorting column '%s' does not exist in table '%s.%s'", column.getColumnName(), schemaName, tableName));
            }
            sortFields.add(index);
            sortOrders.add(column.getOrder().getSortOrder());
        }
        hiveFileWriter = new SortingFileWriter(fileSystem, tempFilePath, hiveFileWriter, sortBufferSize, maxOpenSortFiles, types, sortFields, sortOrders, pageSorter, typeManager.getTypeOperators(), OrcFileWriterFactory::createOrcDataSink);
    }
    return new HiveWriter(hiveFileWriter, partitionName, updateMode, fileNameWithExtension, writeInfo.getWritePath().toString(), writeInfo.getTargetPath().toString(), onCommit, hiveWriterStats);
}
Also used : DateTimeZone(org.joda.time.DateTimeZone) InsertExistingPartitionsBehavior(io.trino.plugin.hive.HiveSessionProperties.InsertExistingPartitionsBehavior) OrcFileWriterFactory(io.trino.plugin.hive.orc.OrcFileWriterFactory) FileSystem(org.apache.hadoop.fs.FileSystem) NodeManager(io.trino.spi.NodeManager) CompressionConfigUtil.configureCompression(io.trino.plugin.hive.util.CompressionConfigUtil.configureCompression) DIRECT_TO_TARGET_EXISTING_DIRECTORY(io.trino.plugin.hive.LocationHandle.WriteMode.DIRECT_TO_TARGET_EXISTING_DIRECTORY) AcidUtils.isInsertOnlyTable(org.apache.hadoop.hive.ql.io.AcidUtils.isInsertOnlyTable) HiveUtil.getColumnNames(io.trino.plugin.hive.util.HiveUtil.getColumnNames) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Matcher(java.util.regex.Matcher) Column(io.trino.plugin.hive.metastore.Column) HiveIgnoreKeyTextOutputFormat(org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat) Collectors.toMap(java.util.stream.Collectors.toMap) HIVE_PARTITION_SCHEMA_MISMATCH(io.trino.plugin.hive.HiveErrorCode.HIVE_PARTITION_SCHEMA_MISMATCH) Maps.immutableEntry(com.google.common.collect.Maps.immutableEntry) PageSorter(io.trino.spi.PageSorter) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HIVE_FILESYSTEM_ERROR(io.trino.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR) StorageFormat(io.trino.plugin.hive.metastore.StorageFormat) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) ImmutableSet(com.google.common.collect.ImmutableSet) Table(io.trino.plugin.hive.metastore.Table) ImmutableMap(com.google.common.collect.ImmutableMap) AcidUtils.isFullAcidTable(org.apache.hadoop.hive.ql.io.AcidUtils.isFullAcidTable) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) 
HiveSessionProperties.getTemporaryStagingDirectoryPath(io.trino.plugin.hive.HiveSessionProperties.getTemporaryStagingDirectoryPath) EventClient(io.airlift.event.client.EventClient) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) UUID(java.util.UUID) Math.min(java.lang.Math.min) HIVE_PATH_ALREADY_EXISTS(io.trino.plugin.hive.HiveErrorCode.HIVE_PATH_ALREADY_EXISTS) MetastoreUtil.getHiveSchema(io.trino.plugin.hive.metastore.MetastoreUtil.getHiveSchema) AcidUtils.deleteDeltaSubdir(org.apache.hadoop.hive.ql.io.AcidUtils.deleteDeltaSubdir) Sets(com.google.common.collect.Sets) String.format(java.lang.String.format) Collectors.joining(java.util.stream.Collectors.joining) ReflectionUtil(org.apache.hive.common.util.ReflectionUtil) DataSize(io.airlift.units.DataSize) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Principal(java.security.Principal) AcidUtils.deltaSubdir(org.apache.hadoop.hive.ql.io.AcidUtils.deltaSubdir) HivePageSinkMetadataProvider(io.trino.plugin.hive.metastore.HivePageSinkMetadataProvider) Entry(java.util.Map.Entry) Function.identity(java.util.function.Function.identity) FileUtils(org.apache.hadoop.hive.common.FileUtils) Optional(java.util.Optional) Pattern(java.util.regex.Pattern) Partition(io.trino.plugin.hive.metastore.Partition) Type(io.trino.spi.type.Type) ConfigurationUtils.toJobConf(io.trino.plugin.hive.util.ConfigurationUtils.toJobConf) Page(io.trino.spi.Page) HashMap(java.util.HashMap) HiveWriteUtils.createPartitionValues(io.trino.plugin.hive.util.HiveWriteUtils.createPartitionValues) IOConstants(org.apache.hadoop.hive.ql.io.IOConstants) StorageFormat.fromHiveStorageFormat(io.trino.plugin.hive.metastore.StorageFormat.fromHiveStorageFormat) OptionalInt(java.util.OptionalInt) HIVE_PARTITION_READ_ONLY(io.trino.plugin.hive.HiveErrorCode.HIVE_PARTITION_READ_ONLY) 
ArrayList(java.util.ArrayList) HiveSessionProperties.getCompressionCodec(io.trino.plugin.hive.HiveSessionProperties.getCompressionCodec) HashSet(java.util.HashSet) Strings(com.google.common.base.Strings) HIVE_INVALID_METADATA(io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA) ImmutableList(com.google.common.collect.ImmutableList) COMPRESSRESULT(org.apache.hadoop.hive.conf.HiveConf.ConfVars.COMPRESSRESULT) Objects.requireNonNull(java.util.Objects.requireNonNull) HiveWriteUtils(io.trino.plugin.hive.util.HiveWriteUtils) HiveSessionProperties.isTemporaryStagingDirectoryEnabled(io.trino.plugin.hive.HiveSessionProperties.isTemporaryStagingDirectoryEnabled) HIVE_WRITER_OPEN_ERROR(io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR) HiveSessionProperties.getInsertExistingPartitionsBehavior(io.trino.plugin.hive.HiveSessionProperties.getInsertExistingPartitionsBehavior) HiveUtil.getColumnTypes(io.trino.plugin.hive.util.HiveUtil.getColumnTypes) Properties(java.util.Properties) HiveConf(org.apache.hadoop.hive.conf.HiveConf) HIVE_UNSUPPORTED_FORMAT(io.trino.plugin.hive.HiveErrorCode.HIVE_UNSUPPORTED_FORMAT) HiveSessionProperties.getTimestampPrecision(io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) SortOrder(io.trino.spi.connector.SortOrder) JobConf(org.apache.hadoop.mapred.JobConf) Consumer(java.util.function.Consumer) UUID.randomUUID(java.util.UUID.randomUUID) Collectors.toList(java.util.stream.Collectors.toList) FS_DEFAULT_NAME_KEY(org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY) SortingColumn(io.trino.plugin.hive.metastore.SortingColumn) HIVE_TABLE_READ_ONLY(io.trino.plugin.hive.HiveErrorCode.HIVE_TABLE_READ_ONLY) UpdateMode(io.trino.plugin.hive.PartitionUpdate.UpdateMode) WriteInfo(io.trino.plugin.hive.LocationService.WriteInfo) TypeManager(io.trino.spi.type.TypeManager) Configuration(org.apache.hadoop.conf.Configuration) 
HashMap(java.util.HashMap) UpdateMode(io.trino.plugin.hive.PartitionUpdate.UpdateMode) ArrayList(java.util.ArrayList) StorageFormat(io.trino.plugin.hive.metastore.StorageFormat) StorageFormat.fromHiveStorageFormat(io.trino.plugin.hive.metastore.StorageFormat.fromHiveStorageFormat) Properties(java.util.Properties) Column(io.trino.plugin.hive.metastore.Column) SortingColumn(io.trino.plugin.hive.metastore.SortingColumn) WriteInfo(io.trino.plugin.hive.LocationService.WriteInfo) FileSystem(org.apache.hadoop.fs.FileSystem) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) Path(org.apache.hadoop.fs.Path) HiveSessionProperties.getTemporaryStagingDirectoryPath(io.trino.plugin.hive.HiveSessionProperties.getTemporaryStagingDirectoryPath) Partition(io.trino.plugin.hive.metastore.Partition) SortingColumn(io.trino.plugin.hive.metastore.SortingColumn) Optional(java.util.Optional) SortOrder(io.trino.spi.connector.SortOrder) IOException(java.io.IOException) Type(io.trino.spi.type.Type) TrinoException(io.trino.spi.TrinoException) Principal(java.security.Principal)

Aggregations

- HIVE_WRITER_OPEN_ERROR (io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR): 4
- HiveSessionProperties.getTimestampPrecision (io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision): 4
- AcidTransaction (io.trino.plugin.hive.acid.AcidTransaction): 4
- StorageFormat (io.trino.plugin.hive.metastore.StorageFormat): 4
- HiveUtil.getColumnNames (io.trino.plugin.hive.util.HiveUtil.getColumnNames): 4
- HiveUtil.getColumnTypes (io.trino.plugin.hive.util.HiveUtil.getColumnTypes): 4
- TrinoException (io.trino.spi.TrinoException): 4
- ConnectorSession (io.trino.spi.connector.ConnectorSession): 4
- Type (io.trino.spi.type.Type): 4
- TypeManager (io.trino.spi.type.TypeManager): 4
- IOException (java.io.IOException): 4
- List (java.util.List): 4
- Objects.requireNonNull (java.util.Objects.requireNonNull): 4
- Optional (java.util.Optional): 4
- OptionalInt (java.util.OptionalInt): 4
- Properties (java.util.Properties): 4
- Collectors.toList (java.util.stream.Collectors.toList): 4
- FileSystem (org.apache.hadoop.fs.FileSystem): 4
- Path (org.apache.hadoop.fs.Path): 4
- JobConf (org.apache.hadoop.mapred.JobConf): 4