Examples with HIVE_WRITER_OPEN_ERROR - io.prestosql.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR

Example 1 with HIVE_WRITER_OPEN_ERROR

use of io.prestosql.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR in project hetu-core by openlookeng.

the class HiveWriterFactory method createWriter.

/**
 * Builds the {@link HiveWriter} for a single write target (one partition and, optionally, one bucket).
 *
 * <p>The method proceeds in this order:
 * <ol>
 *   <li>validates {@code bucketNumber} against {@code bucketCount};</li>
 *   <li>chooses a base file name (bucketed, deterministic for snapshot, or UUID based);</li>
 *   <li>derives the partition name and looks up an existing partition when a table is present;</li>
 *   <li>decides {@code updateMode}, {@code schema}, {@code writeInfo} and {@code outputStorageFormat}
 *       from whether the table/partition already exist and from {@code insertExistingPartitionsBehavior};</li>
 *   <li>computes the output path, using the ACID directory layout when {@code isTxnTable()} is true;</li>
 *   <li>obtains the {@link FileSystem} and, when snapshot is enabled, renames the target file;</li>
 *   <li>creates the {@code HiveFileWriter} (snapshot temp writer, first matching factory, or
 *       {@code RecordFileWriter} as fallback) and optionally wraps it in a {@code SortingFileWriter};</li>
 *   <li>returns a {@code HiveWriter} carrying the commit callback.</li>
 * </ol>
 *
 * @param partitionValues values used to build the partition name and to look up an existing partition
 * @param bucketNumber bucket to write; must be present and less than {@code bucketCount} when the table is bucketed
 * @param vacuumOptions ACID options that override the computed ones; only used when present and
 *        {@code HiveACIDWriteType.isVacuum(acidWriteType)} is true
 * @param forMerge when snapshot is enabled, {@code false} selects the temporary sub-file writer and a no-op
 *        commit callback; {@code true} selects the regular writer path
 * @return the writer for the resolved path
 * @throws PrestoException if the target partition directory already exists, the table/partition is read-only
 *         under the configured insert behavior, partition and table column types differ, a sorting column
 *         is unknown, or the file system / ORC sink cannot be opened
 * @throws IllegalArgumentException if the bucket arguments are inconsistent or the insert behavior is unsupported
 */
private HiveWriter createWriter(List<String> partitionValues, OptionalInt bucketNumber, Optional<Options> vacuumOptions, boolean forMerge) {
    boolean isTxnTable = isTxnTable();
    // Bucket validation: a bucketed table requires an in-range bucket number; a non-bucketed table
    // accepts a bucket number only when it is transactional.
    if (bucketCount.isPresent()) {
        checkArgument(bucketNumber.isPresent(), "Bucket not provided for bucketed table");
        checkArgument(bucketNumber.getAsInt() < bucketCount.getAsInt(), "Bucket number %s must be less than bucket count %s", bucketNumber, bucketCount);
    } else {
        // NOTE(review): the message reads "provided by for" - looks like a typo; it is a runtime string, so left as-is here.
        checkArgument(isTxnTable || !bucketNumber.isPresent(), "Bucket number provided by for table that is not bucketed");
    }
    // Base file name (without extension); may be replaced later for insert-only ACID tables.
    String fileName;
    if (bucketNumber.isPresent()) {
        fileName = computeBucketedFileName(queryId, bucketNumber.getAsInt());
    } else {
        // Snapshot: don't use UUID. File name needs to be deterministic.
        if (isSnapshotEnabled) {
            fileName = String.format(ENGLISH, "%s_%d_%d_%d", queryId, session.getTaskId().getAsInt(), session.getPipelineId().getAsInt(), session.getDriverId().getAsInt());
        } else {
            fileName = queryId + "_" + randomUUID();
        }
    }
    // The partition name is present exactly when the table declares partition columns.
    Optional<String> partitionName;
    if (!partitionColumnNames.isEmpty()) {
        partitionName = Optional.of(FileUtils.makePartName(partitionColumnNames, partitionValues));
    } else {
        partitionName = Optional.empty();
    }
    // attempt to get the existing partition (if this is an existing partitioned table)
    Optional<Partition> partition = Optional.empty();
    if (!partitionValues.isEmpty() && table != null) {
        partition = pageSinkMetadataProvider.getPartition(partitionValues);
    }
    // The four locals below are assigned on every non-throwing path of the following if/else tree.
    UpdateMode updateMode;
    Properties schema;
    WriteInfo writeInfo;
    StorageFormat outputStorageFormat;
    if (!partition.isPresent()) {
        if (table == null) {
            // Write to: a new partition in a new partitioned table,
            // or a new unpartitioned table.
            updateMode = UpdateMode.NEW;
            // No table metadata exists yet, so the schema is assembled from the data columns.
            schema = new Properties();
            schema.setProperty(IOConstants.COLUMNS, dataColumns.stream().map(DataColumn::getName).collect(joining(",")));
            schema.setProperty(IOConstants.COLUMNS_TYPES, dataColumns.stream().map(DataColumn::getHiveType).map(HiveType::getHiveTypeName).map(HiveTypeName::toString).collect(joining(":")));
            setAdditionalSchemaProperties(schema);
            if (!partitionName.isPresent()) {
                // new unpartitioned table
                writeInfo = locationService.getTableWriteInfo(locationHandle, false);
            } else {
                // a new partition in a new partitioned table
                writeInfo = locationService.getPartitionWriteInfo(locationHandle, partition, partitionName.get());
                if (!writeInfo.getWriteMode().isWritePathSameAsTargetPath()) {
                    // verify that the target directory for the partition does not already exist
                    if (HiveWriteUtils.pathExists(new HdfsContext(session, schemaName, tableName), hdfsEnvironment, writeInfo.getTargetPath())) {
                        // NOTE(review): partitionName is an Optional here, so the message shows its Optional toString form - confirm this is intended.
                        throw new PrestoException(HIVE_PATH_ALREADY_EXISTS, format("Target directory for new partition '%s' of table '%s.%s' already exists: %s", partitionName, schemaName, tableName, writeInfo.getTargetPath()));
                    }
                }
            }
        } else {
            // Write to: a new partition in an existing partitioned table,
            // or an existing unpartitioned table
            if (partitionName.isPresent()) {
                // a new partition in an existing partitioned table
                updateMode = UpdateMode.NEW;
                writeInfo = locationService.getPartitionWriteInfo(locationHandle, partition, partitionName.get());
            } else {
                // Existing unpartitioned table: the insert behavior decides between append, overwrite and rejection.
                switch(insertExistingPartitionsBehavior) {
                    case APPEND:
                        checkState(!immutablePartitions);
                        updateMode = UpdateMode.APPEND;
                        writeInfo = locationService.getTableWriteInfo(locationHandle, false);
                        break;
                    case OVERWRITE:
                        updateMode = UpdateMode.OVERWRITE;
                        writeInfo = locationService.getTableWriteInfo(locationHandle, true);
                        break;
                    case ERROR:
                        throw new PrestoException(HIVE_TABLE_READ_ONLY, "Unpartitioned Hive tables are immutable");
                    default:
                        throw new IllegalArgumentException("Unsupported insert existing table behavior: " + insertExistingPartitionsBehavior);
                }
            }
            schema = getHiveSchema(table);
        }
        if (partitionName.isPresent()) {
            // Write to a new partition
            outputStorageFormat = fromHiveStorageFormat(partitionStorageFormat);
        } else {
            // Write to a new/existing unpartitioned table
            outputStorageFormat = fromHiveStorageFormat(tableStorageFormat);
        }
    } else {
        // Write to: an existing partition in an existing partitioned table
        if (insertExistingPartitionsBehavior == InsertExistingPartitionsBehavior.APPEND) {
            // Append to an existing partition
            checkState(!immutablePartitions);
            updateMode = UpdateMode.APPEND;
            // Check the column types in partition schema match the column types in table schema
            // (only the columns present in both lists are compared, position by position)
            List<Column> tableColumns = table.getDataColumns();
            List<Column> existingPartitionColumns = partition.get().getColumns();
            for (int i = 0; i < min(existingPartitionColumns.size(), tableColumns.size()); i++) {
                HiveType tableType = tableColumns.get(i).getType();
                HiveType partitionType = existingPartitionColumns.get(i).getType();
                if (!tableType.equals(partitionType)) {
                    // NOTE(review): partitionName is passed as an Optional, so it is rendered in Optional toString form - confirm.
                    throw new PrestoException(HIVE_PARTITION_SCHEMA_MISMATCH, format("" + "You are trying to write into an existing partition in a table. " + "The table schema has changed since the creation of the partition. " + "Inserting rows into such partition is not supported. " + "The column '%s' in table '%s' is declared as type '%s', " + "but partition '%s' declared column '%s' as type '%s'.", tableColumns.get(i).getName(), tableName, tableType, partitionName, existingPartitionColumns.get(i).getName(), partitionType));
                }
            }
            HiveWriteUtils.checkPartitionIsWritable(partitionName.get(), partition.get());
            // Appends keep the storage format and schema of the existing partition.
            outputStorageFormat = partition.get().getStorage().getStorageFormat();
            schema = getHiveSchema(partition.get(), table);
            writeInfo = locationService.getPartitionWriteInfo(locationHandle, partition, partitionName.get());
        } else if (insertExistingPartitionsBehavior == InsertExistingPartitionsBehavior.OVERWRITE) {
            // Overwrite an existing partition
            //
            // The behavior of overwrite considered as if first dropping the partition and inserting a new partition, thus:
            // * No partition writable check is required.
            // * Table schema and storage format is used for the new partition (instead of existing partition schema and storage format).
            updateMode = UpdateMode.OVERWRITE;
            outputStorageFormat = fromHiveStorageFormat(partitionStorageFormat);
            schema = getHiveSchema(table);
            // Optional.empty() is passed instead of the existing partition, matching the "drop then insert" model above.
            writeInfo = locationService.getPartitionWriteInfo(locationHandle, Optional.empty(), partitionName.get());
            checkWriteMode(writeInfo);
        } else if (insertExistingPartitionsBehavior == InsertExistingPartitionsBehavior.ERROR) {
            throw new PrestoException(HIVE_PARTITION_READ_ONLY, "Cannot insert into an existing partition of Hive table: " + partitionName.get());
        } else {
            throw new IllegalArgumentException(format("Unsupported insert existing partitions behavior: %s", insertExistingPartitionsBehavior));
        }
    }
    // Entries from additionalTableParameters are copied into the schema (replacing equal keys) before validation.
    schema.putAll(additionalTableParameters);
    // Schema validation is skipped for ACID DELETE writes.
    if (acidWriteType != HiveACIDWriteType.DELETE) {
        validateSchema(partitionName, schema);
    }
    Path path;
    Optional<AcidOutputFormat.Options> acidOptions;
    String fileNameWithExtension;
    if (isTxnTable) {
        WriteIdInfo writeIdInfo = locationHandle.getJsonSerializablewriteIdInfo().get();
        // Bucket falls back to 0 when no bucket number was supplied.
        AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).minimumWriteId(writeIdInfo.getMinWriteId()).maximumWriteId(writeIdInfo.getMaxWriteId()).statementId(writeIdInfo.getStatementId()).bucket(bucketNumber.isPresent() ? bucketNumber.getAsInt() : 0);
        if (acidWriteType == HiveACIDWriteType.DELETE) {
            // to support delete as insert
            options.writingDeleteDelta(true);
        } else if (acidWriteType == HiveACIDWriteType.INSERT_OVERWRITE) {
            // In case of ACID txn tables, dont delete old data. Just create new base in same partition.
            options.writingBase(true);
        }
        if (vacuumOptions.isPresent() && HiveACIDWriteType.isVacuum(acidWriteType)) {
            Options vOptions = vacuumOptions.get();
            // Use the original bucket file number itself.
            // Compacted delta directories will not have statementId
            // (these setters overwrite the write ids, base/delete-delta flags and bucket set above)
            options.maximumWriteId(vOptions.getMaximumWriteId()).minimumWriteId(vOptions.getMinimumWriteId()).writingBase(vOptions.isWritingBase()).writingDeleteDelta(vOptions.isWritingDeleteDelta()).bucket(vOptions.getBucketId()).statementId(-1);
        }
        if (AcidUtils.isInsertOnlyTable(schema)) {
            // Insert-only table: the base/delta sub-directory and file name are built here
            // instead of through AcidUtils.createFilename.
            String subdir;
            if (options.isWritingBase()) {
                subdir = AcidUtils.baseDir(options.getMaximumWriteId());
            } else if (HiveACIDWriteType.isVacuum(acidWriteType)) {
                // Only for Minor compacted delta will not have statement Id.
                subdir = AcidUtils.deltaSubdir(options.getMinimumWriteId(), options.getMaximumWriteId());
            } else {
                subdir = AcidUtils.deltaSubdir(options.getMinimumWriteId(), options.getMaximumWriteId(), options.getStatementId());
            }
            Path parentDir = new Path(writeInfo.getWritePath(), subdir);
            // Replaces the file name chosen earlier: zero-padded 6-digit bucket id, "_0", then the format's extension.
            fileName = String.format("%06d", options.getBucketId()) + "_0" + getFileExtension(conf, outputStorageFormat);
            path = new Path(parentDir, fileName);
            Properties properties = new Properties();
            properties.setProperty("transactional_properties", "insert_only");
            options.tableProperties(properties);
        } else {
            path = AcidUtils.createFilename(writeInfo.getWritePath(), options);
        }
        // In case of ACID entire delta directory should be renamed from staging directory.
        // Hence the value recorded here is the parent directory name, not the file name.
        fileNameWithExtension = path.getParent().getName();
        acidOptions = Optional.of(options);
    } else {
        // Non-transactional table: plain "<fileName><extension>" directly under the write path.
        fileNameWithExtension = fileName + getFileExtension(conf, outputStorageFormat);
        path = new Path(writeInfo.getWritePath(), fileNameWithExtension);
        acidOptions = Optional.empty();
    }
    FileSystem fileSystem;
    try {
        fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, conf);
    } catch (IOException e) {
        throw new PrestoException(HIVE_WRITER_OPEN_ERROR, e);
    }
    if (isSnapshotEnabled) {
        // Snapshot: use a recognizable name pattern, in case they need to be deleted/renamed
        String oldFileName = path.getName();
        String newFileName = toSnapshotFileName(oldFileName, queryId);
        path = new Path(path.getParent(), newFileName);
        // fileNameWithExtension is only updated when it holds the file name itself
        // (in the transactional branch above it holds the parent directory name).
        if (fileNameWithExtension.equals(oldFileName)) {
            fileNameWithExtension = newFileName;
        }
    }
    HiveFileWriter hiveFileWriter = null;
    if (isSnapshotEnabled && !forMerge) {
        // Add a suffix to file name for sub files
        String oldFileName = path.getName();
        String newFileName = toSnapshotSubFile(oldFileName);
        path = new Path(path.getParent(), newFileName);
        if (fileNameWithExtension.equals(oldFileName)) {
            fileNameWithExtension = newFileName;
        }
        // Always create a simple ORC writer for snapshot files. These will be merged in the end.
        logContainingFolderInfo(fileSystem, path, "Creating SnapshotTempFileWriter for %s", path);
        try {
            // NOTE(review): this local is never read inside the try block - looks like a leftover.
            Path finalPath = path;
            hiveFileWriter = new SnapshotTempFileWriter(orcFileWriterFactory.createOrcDataSink(session, fileSystem, path), dataColumns.stream().map(column -> column.getHiveType().getType(typeManager)).collect(Collectors.toList()));
        } catch (IOException e) {
            throw new PrestoException(HiveErrorCode.HIVE_WRITER_OPEN_ERROR, "Error creating ORC file", e);
        }
    } else {
        conf.set("table.write.path", writeInfo.getWritePath().toString());
        // Ask each registered factory in order; the first one that returns a writer wins.
        for (HiveFileWriterFactory fileWriterFactory : fileWriterFactories) {
            Optional<HiveFileWriter> fileWriter = fileWriterFactory.createFileWriter(path, dataColumns.stream().map(DataColumn::getName).collect(toList()), outputStorageFormat, schema, conf, session, acidOptions, Optional.of(acidWriteType));
            if (fileWriter.isPresent()) {
                hiveFileWriter = fileWriter.get();
                break;
            }
        }
        if (isSnapshotEnabled) {
            // TODO-cp-I2BZ0A: assuming all files to be of ORC type
            // This check runs before the RecordFileWriter fallback below, so with snapshot enabled
            // a missing factory writer (null) fails here as well.
            checkState(hiveFileWriter instanceof OrcFileWriter, "Only support ORC format with snapshot");
            logContainingFolderInfo(fileSystem, path, "Creating file writer for final result: %s", path);
        }
        if (hiveFileWriter == null) {
            // No factory produced a writer: fall back to the generic record writer.
            hiveFileWriter = new RecordFileWriter(path, dataColumns.stream().map(DataColumn::getName).collect(toList()), outputStorageFormat, schema, partitionStorageFormat.getEstimatedWriterSystemMemoryUsage(), conf, typeManager, parquetTimeZone, session);
        }
        if (isTxnTable) {
            hiveFileWriter.initWriter(true, path, fileSystem);
        }
    }
    // Effectively-final copies captured by the commit lambda below (path is reassigned above).
    Path finalPath = path;
    String writerImplementation = hiveFileWriter.getClass().getName();
    Consumer<HiveWriter> onCommit;
    if (isSnapshotEnabled && !forMerge) {
        // Only send "commit" event for the merged file
        onCommit = hiveWriter -> {
        };
    } else {
        // On commit, post a WriteCompletedEvent including the written file's size when it can be read.
        onCommit = hiveWriter -> {
            Optional<Long> size;
            try {
                size = Optional.of(hdfsEnvironment.getFileSystem(session.getUser(), finalPath, conf).getFileStatus(finalPath).getLen());
            } catch (IOException | RuntimeException e) {
                // Do not fail the query if file system is not available
                size = Optional.empty();
            }
            eventClient.post(new WriteCompletedEvent(session.getQueryId(), finalPath.toString(), schemaName, tableName, partitionName.orElse(null), outputStorageFormat.getOutputFormat(), writerImplementation, nodeManager.getCurrentNode().getVersion(), nodeManager.getCurrentNode().getHost(), session.getIdentity().getPrincipal().map(Principal::getName).orElse(null), nodeManager.getEnvironment(), sessionProperties, size.orElse(null), hiveWriter.getRowCount()));
        };
    }
    // Wrap the writer in a SortingFileWriter when the table declares sort columns, or for
    // update/delete writes on a transactional table.
    if (!sortedBy.isEmpty() || (isTxnTable() && HiveACIDWriteType.isUpdateOrDelete(acidWriteType))) {
        List<Type> types = dataColumns.stream().map(column -> column.getHiveType().getType(typeManager)).collect(Collectors.toList());
        // Column name -> position in dataColumns, used to resolve the sort columns.
        Map<String, Integer> columnIndexes = new HashMap<>();
        for (int i = 0; i < dataColumns.size(); i++) {
            columnIndexes.put(dataColumns.get(i).getName(), i);
        }
        if (sortedBy.isEmpty() && isTxnTable() && HiveACIDWriteType.isUpdateOrDelete(acidWriteType)) {
            // Add $rowId column as the last column in the page
            types.add(HiveColumnHandle.updateRowIdHandle().getHiveType().getType(typeManager));
            columnIndexes.put(HiveColumnHandle.UPDATE_ROW_ID_COLUMN_NAME, dataColumns.size());
        }
        List<Integer> sortFields = new ArrayList<>();
        List<SortOrder> sortOrders = new ArrayList<>();
        List<SortingColumn> sortigColumns = this.sortedBy;
        // Without declared sort columns, update/delete output is sorted by the row id column, ascending.
        if (sortedBy.isEmpty() && isTxnTable() && HiveACIDWriteType.isUpdateOrDelete(acidWriteType)) {
            sortigColumns = ImmutableList.of(new SortingColumn(HiveColumnHandle.UPDATE_ROW_ID_COLUMN_NAME, SortingColumn.Order.ASCENDING));
        }
        for (SortingColumn column : sortigColumns) {
            Integer index = columnIndexes.get(column.getColumnName());
            if (index == null) {
                throw new PrestoException(HIVE_INVALID_METADATA, format("Sorting column '%s' does not exist in table '%s.%s'", column.getColumnName(), schemaName, tableName));
            }
            sortFields.add(index);
            sortOrders.add(column.getOrder().getSortOrder());
        }
        FileSystem sortFileSystem = fileSystem;
        // The temp-file prefix passed to the sorting writer sits next to the output file: ".tmp-sort.<file name>".
        String child = ".tmp-sort." + path.getName();
        Path tempFilePrefix = new Path(path.getParent(), child);
        hiveFileWriter = new SortingFileWriter(sortFileSystem, tempFilePrefix, hiveFileWriter, sortBufferSize, maxOpenSortFiles, types, sortFields, sortOrders, pageSorter, (fs, p) -> orcFileWriterFactory.createOrcDataSink(session, fs, p));
    }
    return new HiveWriter(hiveFileWriter, partitionName, updateMode, fileNameWithExtension, writeInfo.getWritePath().toString(), writeInfo.getTargetPath().toString(), path.toString(), onCommit, // Snapshot: only update stats when merging files
    isSnapshotEnabled && !forMerge ? null : hiveWriterStats, hiveFileWriter.getExtraPartitionFiles());
}

Also used : DateTimeZone(org.joda.time.DateTimeZone) Arrays(java.util.Arrays) StorageFormat(io.prestosql.plugin.hive.metastore.StorageFormat) UpdateMode(io.prestosql.plugin.hive.PartitionUpdate.UpdateMode) FileSystem(org.apache.hadoop.fs.FileSystem) HIVE_FILESYSTEM_ERROR(io.prestosql.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR) HdfsContext(io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext) FileStatus(org.apache.hadoop.fs.FileStatus) SortOrder(io.prestosql.spi.block.SortOrder) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) ConnectorSession(io.prestosql.spi.connector.ConnectorSession) HiveIgnoreKeyTextOutputFormat(org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat) Collectors.toMap(java.util.stream.Collectors.toMap) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HiveUtil.getColumnNames(io.prestosql.plugin.hive.HiveUtil.getColumnNames) PropertyMetadata(io.prestosql.spi.session.PropertyMetadata) Path(org.apache.hadoop.fs.Path) HIVE_PARTITION_SCHEMA_MISMATCH(io.prestosql.plugin.hive.HiveErrorCode.HIVE_PARTITION_SCHEMA_MISMATCH) Type(io.prestosql.spi.type.Type) ENGLISH(java.util.Locale.ENGLISH) PrestoException(io.prestosql.spi.PrestoException) ImmutableSet(com.google.common.collect.ImmutableSet) HIVE_WRITER_OPEN_ERROR(io.prestosql.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR) ImmutableMap(com.google.common.collect.ImmutableMap) EventClient(io.airlift.event.client.EventClient) HIVE_UNSUPPORTED_FORMAT(io.prestosql.plugin.hive.HiveErrorCode.HIVE_UNSUPPORTED_FORMAT) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) Set(java.util.Set) HIVE_PATH_ALREADY_EXISTS(io.prestosql.plugin.hive.HiveErrorCode.HIVE_PATH_ALREADY_EXISTS) Math.min(java.lang.Math.min) Collectors(java.util.stream.Collectors) HiveWriteUtils.createPartitionValues(io.prestosql.plugin.hive.HiveWriteUtils.createPartitionValues) Sets(com.google.common.collect.Sets) 
String.format(java.lang.String.format) Collectors.joining(java.util.stream.Collectors.joining) Preconditions.checkState(com.google.common.base.Preconditions.checkState) ReflectionUtil(org.apache.hive.common.util.ReflectionUtil) DataSize(io.airlift.units.DataSize) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Principal(java.security.Principal) Table(io.prestosql.plugin.hive.metastore.Table) HIVE_TABLE_READ_ONLY(io.prestosql.plugin.hive.HiveErrorCode.HIVE_TABLE_READ_ONLY) HdfsOrcDataSource(io.prestosql.plugin.hive.orc.HdfsOrcDataSource) Function.identity(java.util.function.Function.identity) FileUtils(org.apache.hadoop.hive.common.FileUtils) Optional(java.util.Optional) SortingColumn(io.prestosql.plugin.hive.metastore.SortingColumn) ConfigurationUtils.toJobConf(io.prestosql.plugin.hive.util.ConfigurationUtils.toJobConf) AcidUtils(org.apache.hadoop.hive.ql.io.AcidUtils) HivePageSinkMetadataProvider(io.prestosql.plugin.hive.metastore.HivePageSinkMetadataProvider) Partition(io.prestosql.plugin.hive.metastore.Partition) Logger(io.airlift.log.Logger) HiveUtil.getColumnTypes(io.prestosql.plugin.hive.HiveUtil.getColumnTypes) MEGABYTE(io.airlift.units.DataSize.Unit.MEGABYTE) HashMap(java.util.HashMap) StorageFormat.fromHiveStorageFormat(io.prestosql.plugin.hive.metastore.StorageFormat.fromHiveStorageFormat) IOConstants(org.apache.hadoop.hive.ql.io.IOConstants) TempFileReader(io.prestosql.plugin.hive.util.TempFileReader) NOT_FOUND(io.prestosql.spi.StandardErrorCode.NOT_FOUND) OptionalInt(java.util.OptionalInt) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Strings(com.google.common.base.Strings) ImmutableList(com.google.common.collect.ImmutableList) COMPRESSRESULT(org.apache.hadoop.hive.conf.HiveConf.ConfVars.COMPRESSRESULT) Objects.requireNonNull(java.util.Objects.requireNonNull) HIVE_PARTITION_READ_ONLY(io.prestosql.plugin.hive.HiveErrorCode.HIVE_PARTITION_READ_ONLY) 
DIRECT_TO_TARGET_EXISTING_DIRECTORY(io.prestosql.plugin.hive.LocationHandle.WriteMode.DIRECT_TO_TARGET_EXISTING_DIRECTORY) Properties(java.util.Properties) InsertExistingPartitionsBehavior(io.prestosql.plugin.hive.HiveSessionProperties.InsertExistingPartitionsBehavior) HiveConf(org.apache.hadoop.hive.conf.HiveConf) TypeManager(io.prestosql.spi.type.TypeManager) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) HIVE_INVALID_METADATA(io.prestosql.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA) Page(io.prestosql.spi.Page) IOException(java.io.IOException) PageSorter(io.prestosql.spi.PageSorter) Options(org.apache.hadoop.hive.ql.io.AcidOutputFormat.Options) JobConf(org.apache.hadoop.mapred.JobConf) Consumer(java.util.function.Consumer) UUID.randomUUID(java.util.UUID.randomUUID) Collectors.toList(java.util.stream.Collectors.toList) MetastoreUtil.getHiveSchema(io.prestosql.plugin.hive.metastore.MetastoreUtil.getHiveSchema) Column(io.prestosql.plugin.hive.metastore.Column) NodeManager(io.prestosql.spi.NodeManager) WriteInfo(io.prestosql.plugin.hive.LocationService.WriteInfo) OrcDataSourceId(io.prestosql.orc.OrcDataSourceId) HashMap(java.util.HashMap) UpdateMode(io.prestosql.plugin.hive.PartitionUpdate.UpdateMode) ArrayList(java.util.ArrayList) WriteInfo(io.prestosql.plugin.hive.LocationService.WriteInfo) FileSystem(org.apache.hadoop.fs.FileSystem) Options(org.apache.hadoop.hive.ql.io.AcidOutputFormat.Options) Options(org.apache.hadoop.hive.ql.io.AcidOutputFormat.Options) PrestoException(io.prestosql.spi.PrestoException) StorageFormat(io.prestosql.plugin.hive.metastore.StorageFormat) StorageFormat.fromHiveStorageFormat(io.prestosql.plugin.hive.metastore.StorageFormat.fromHiveStorageFormat) Properties(java.util.Properties) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) SortingColumn(io.prestosql.plugin.hive.metastore.SortingColumn) Column(io.prestosql.plugin.hive.metastore.Column) 
HdfsContext(io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext) Path(org.apache.hadoop.fs.Path) Partition(io.prestosql.plugin.hive.metastore.Partition) SortingColumn(io.prestosql.plugin.hive.metastore.SortingColumn) SortOrder(io.prestosql.spi.block.SortOrder) IOException(java.io.IOException) Type(io.prestosql.spi.type.Type)

Example 2 with HIVE_WRITER_OPEN_ERROR

use of io.prestosql.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR in project boostkit-bigdata by kunpengcompute.

the class HiveWriterFactory method createWriter.

private HiveWriter createWriter(List<String> partitionValues, OptionalInt bucketNumber, Optional<Options> vacuumOptions, boolean forMerge) {
    boolean isTxnTable = isTxnTable();
    if (bucketCount.isPresent()) {
        checkArgument(bucketNumber.isPresent(), "Bucket not provided for bucketed table");
        checkArgument(bucketNumber.getAsInt() < bucketCount.getAsInt(), "Bucket number %s must be less than bucket count %s", bucketNumber, bucketCount);
    } else {
        checkArgument(isTxnTable || !bucketNumber.isPresent(), "Bucket number provided by for table that is not bucketed");
    }
    String fileName;
    if (bucketNumber.isPresent()) {
        fileName = computeBucketedFileName(queryId, bucketNumber.getAsInt());
    } else {
        // Snapshot: don't use UUID. File name needs to be deterministic.
        if (isSnapshotEnabled) {
            fileName = String.format(ENGLISH, "%s_%d_%d_%d", queryId, session.getTaskId().getAsInt(), session.getPipelineId().getAsInt(), session.getDriverId().getAsInt());
        } else {
            fileName = queryId + "_" + randomUUID();
        }
    }
    Optional<String> partitionName;
    if (!partitionColumnNames.isEmpty()) {
        partitionName = Optional.of(FileUtils.makePartName(partitionColumnNames, partitionValues));
    } else {
        partitionName = Optional.empty();
    }
    // attempt to get the existing partition (if this is an existing partitioned table)
    Optional<Partition> partition = Optional.empty();
    if (!partitionValues.isEmpty() && table != null) {
        partition = pageSinkMetadataProvider.getPartition(partitionValues);
    }
    UpdateMode updateMode;
    Properties schema;
    WriteInfo writeInfo;
    StorageFormat outputStorageFormat;
    if (!partition.isPresent()) {
        if (table == null) {
            // Write to: a new partition in a new partitioned table,
            // or a new unpartitioned table.
            updateMode = UpdateMode.NEW;
            schema = new Properties();
            schema.setProperty(IOConstants.COLUMNS, dataColumns.stream().map(DataColumn::getName).collect(joining(",")));
            schema.setProperty(IOConstants.COLUMNS_TYPES, dataColumns.stream().map(DataColumn::getHiveType).map(HiveType::getHiveTypeName).map(HiveTypeName::toString).collect(joining(":")));
            setAdditionalSchemaProperties(schema);
            if (!partitionName.isPresent()) {
                // new unpartitioned table
                writeInfo = locationService.getTableWriteInfo(locationHandle, false);
            } else {
                // a new partition in a new partitioned table
                writeInfo = locationService.getPartitionWriteInfo(locationHandle, partition, partitionName.get());
                if (!writeInfo.getWriteMode().isWritePathSameAsTargetPath()) {
                    // verify that the target directory for the partition does not already exist
                    if (HiveWriteUtils.pathExists(new HdfsContext(session, schemaName, tableName), hdfsEnvironment, writeInfo.getTargetPath())) {
                        throw new PrestoException(HIVE_PATH_ALREADY_EXISTS, format("Target directory for new partition '%s' of table '%s.%s' already exists: %s", partitionName, schemaName, tableName, writeInfo.getTargetPath()));
                    }
                }
            }
        } else {
            // or an existing unpartitioned table
            if (partitionName.isPresent()) {
                // a new partition in an existing partitioned table
                updateMode = UpdateMode.NEW;
                writeInfo = locationService.getPartitionWriteInfo(locationHandle, partition, partitionName.get());
            } else {
                switch(insertExistingPartitionsBehavior) {
                    case APPEND:
                        checkState(!immutablePartitions);
                        updateMode = UpdateMode.APPEND;
                        writeInfo = locationService.getTableWriteInfo(locationHandle, false);
                        break;
                    case OVERWRITE:
                        updateMode = UpdateMode.OVERWRITE;
                        writeInfo = locationService.getTableWriteInfo(locationHandle, true);
                        break;
                    case ERROR:
                        throw new PrestoException(HIVE_TABLE_READ_ONLY, "Unpartitioned Hive tables are immutable");
                    default:
                        throw new IllegalArgumentException("Unsupported insert existing table behavior: " + insertExistingPartitionsBehavior);
                }
            }
            schema = getHiveSchema(table);
        }
        if (partitionName.isPresent()) {
            // Write to a new partition
            outputStorageFormat = fromHiveStorageFormat(partitionStorageFormat);
        } else {
            // Write to a new/existing unpartitioned table
            outputStorageFormat = fromHiveStorageFormat(tableStorageFormat);
        }
    } else {
        // Write to: an existing partition in an existing partitioned table
        if (insertExistingPartitionsBehavior == InsertExistingPartitionsBehavior.APPEND) {
            // Append to an existing partition
            checkState(!immutablePartitions);
            updateMode = UpdateMode.APPEND;
            // Check the column types in partition schema match the column types in table schema
            List<Column> tableColumns = table.getDataColumns();
            List<Column> existingPartitionColumns = partition.get().getColumns();
            for (int i = 0; i < min(existingPartitionColumns.size(), tableColumns.size()); i++) {
                HiveType tableType = tableColumns.get(i).getType();
                HiveType partitionType = existingPartitionColumns.get(i).getType();
                if (!tableType.equals(partitionType)) {
                    throw new PrestoException(HIVE_PARTITION_SCHEMA_MISMATCH, format("" + "You are trying to write into an existing partition in a table. " + "The table schema has changed since the creation of the partition. " + "Inserting rows into such partition is not supported. " + "The column '%s' in table '%s' is declared as type '%s', " + "but partition '%s' declared column '%s' as type '%s'.", tableColumns.get(i).getName(), tableName, tableType, partitionName, existingPartitionColumns.get(i).getName(), partitionType));
                }
            }
            HiveWriteUtils.checkPartitionIsWritable(partitionName.get(), partition.get());
            outputStorageFormat = partition.get().getStorage().getStorageFormat();
            schema = getHiveSchema(partition.get(), table);
            writeInfo = locationService.getPartitionWriteInfo(locationHandle, partition, partitionName.get());
        } else if (insertExistingPartitionsBehavior == InsertExistingPartitionsBehavior.OVERWRITE) {
            // Overwrite an existing partition
            // 
            // The behavior of overwrite considered as if first dropping the partition and inserting a new partition, thus:
            // * No partition writable check is required.
            // * Table schema and storage format is used for the new partition (instead of existing partition schema and storage format).
            updateMode = UpdateMode.OVERWRITE;
            outputStorageFormat = fromHiveStorageFormat(partitionStorageFormat);
            schema = getHiveSchema(table);
            writeInfo = locationService.getPartitionWriteInfo(locationHandle, Optional.empty(), partitionName.get());
            checkWriteMode(writeInfo);
        } else if (insertExistingPartitionsBehavior == InsertExistingPartitionsBehavior.ERROR) {
            throw new PrestoException(HIVE_PARTITION_READ_ONLY, "Cannot insert into an existing partition of Hive table: " + partitionName.get());
        } else {
            throw new IllegalArgumentException(format("Unsupported insert existing partitions behavior: %s", insertExistingPartitionsBehavior));
        }
    }
    schema.putAll(additionalTableParameters);
    if (acidWriteType != HiveACIDWriteType.DELETE) {
        validateSchema(partitionName, schema);
    }
    Path path;
    Optional<AcidOutputFormat.Options> acidOptions;
    String fileNameWithExtension;
    if (isTxnTable) {
        WriteIdInfo writeIdInfo = locationHandle.getJsonSerializablewriteIdInfo().get();
        AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).minimumWriteId(writeIdInfo.getMinWriteId()).maximumWriteId(writeIdInfo.getMaxWriteId()).statementId(writeIdInfo.getStatementId()).bucket(bucketNumber.isPresent() ? bucketNumber.getAsInt() : 0);
        if (acidWriteType == HiveACIDWriteType.DELETE) {
            // to support delete as insert
            options.writingDeleteDelta(true);
        } else if (acidWriteType == HiveACIDWriteType.INSERT_OVERWRITE) {
            // In case of ACID txn tables, don't delete old data. Just create a new base in the same partition.
            options.writingBase(true);
        }
        if (vacuumOptions.isPresent() && HiveACIDWriteType.isVacuum(acidWriteType)) {
            Options vOptions = vacuumOptions.get();
            // Use the original bucket file number itself.
            // Compacted delta directories will not have statementId
            options.maximumWriteId(vOptions.getMaximumWriteId()).minimumWriteId(vOptions.getMinimumWriteId()).writingBase(vOptions.isWritingBase()).writingDeleteDelta(vOptions.isWritingDeleteDelta()).bucket(vOptions.getBucketId()).statementId(-1);
        }
        if (AcidUtils.isInsertOnlyTable(schema)) {
            String subdir;
            if (options.isWritingBase()) {
                subdir = AcidUtils.baseDir(options.getMaximumWriteId());
            } else if (HiveACIDWriteType.isVacuum(acidWriteType)) {
                // Vacuum without writing a base (presumably minor compaction): the delta directory name carries no statement id.
                subdir = AcidUtils.deltaSubdir(options.getMinimumWriteId(), options.getMaximumWriteId());
            } else {
                subdir = AcidUtils.deltaSubdir(options.getMinimumWriteId(), options.getMaximumWriteId(), options.getStatementId());
            }
            Path parentDir = new Path(writeInfo.getWritePath(), subdir);
            fileName = String.format("%06d", options.getBucketId()) + "_0" + getFileExtension(conf, outputStorageFormat);
            path = new Path(parentDir, fileName);
            Properties properties = new Properties();
            properties.setProperty("transactional_properties", "insert_only");
            options.tableProperties(properties);
        } else {
            path = AcidUtils.createFilename(writeInfo.getWritePath(), options);
        }
        // In case of ACID entire delta directory should be renamed from staging directory.
        fileNameWithExtension = path.getParent().getName();
        acidOptions = Optional.of(options);
    } else {
        fileNameWithExtension = fileName + getFileExtension(conf, outputStorageFormat);
        path = new Path(writeInfo.getWritePath(), fileNameWithExtension);
        acidOptions = Optional.empty();
    }
    FileSystem fileSystem;
    try {
        fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, conf);
    } catch (IOException e) {
        throw new PrestoException(HIVE_WRITER_OPEN_ERROR, e);
    }
    if (isSnapshotEnabled) {
        // Snapshot: use a recognizable name pattern, in case they need to be deleted/renamed
        String oldFileName = path.getName();
        String newFileName = toSnapshotFileName(oldFileName, queryId);
        path = new Path(path.getParent(), newFileName);
        if (fileNameWithExtension.equals(oldFileName)) {
            fileNameWithExtension = newFileName;
        }
    }
    HiveFileWriter hiveFileWriter = null;
    if (isSnapshotEnabled && !forMerge) {
        // Add a suffix to file name for sub files
        String oldFileName = path.getName();
        String newFileName = toSnapshotSubFile(oldFileName);
        path = new Path(path.getParent(), newFileName);
        if (fileNameWithExtension.equals(oldFileName)) {
            fileNameWithExtension = newFileName;
        }
        // Always create a simple ORC writer for snapshot files. These will be merged in the end.
        logContainingFolderInfo(fileSystem, path, "Creating SnapshotTempFileWriter for %s", path);
        try {
            Path finalPath = path;
            hiveFileWriter = new SnapshotTempFileWriter(orcFileWriterFactory.createOrcDataSink(session, fileSystem, path), dataColumns.stream().map(column -> column.getHiveType().getType(typeManager)).collect(Collectors.toList()));
        } catch (IOException e) {
            throw new PrestoException(HiveErrorCode.HIVE_WRITER_OPEN_ERROR, "Error creating ORC file", e);
        }
    } else {
        conf.set("table.write.path", writeInfo.getWritePath().toString());
        for (HiveFileWriterFactory fileWriterFactory : fileWriterFactories) {
            Optional<HiveFileWriter> fileWriter = fileWriterFactory.createFileWriter(path, dataColumns.stream().map(DataColumn::getName).collect(toList()), outputStorageFormat, schema, conf, session, acidOptions, Optional.of(acidWriteType));
            if (fileWriter.isPresent()) {
                hiveFileWriter = fileWriter.get();
                break;
            }
        }
        if (isSnapshotEnabled) {
            // TODO-cp-I2BZ0A: assuming all files to be of ORC type
            checkState(hiveFileWriter instanceof OrcFileWriter, "Only support ORC format with snapshot");
            logContainingFolderInfo(fileSystem, path, "Creating file writer for final result: %s", path);
        }
        if (hiveFileWriter == null) {
            hiveFileWriter = new RecordFileWriter(path, dataColumns.stream().map(DataColumn::getName).collect(toList()), outputStorageFormat, schema, partitionStorageFormat.getEstimatedWriterSystemMemoryUsage(), conf, typeManager, parquetTimeZone, session);
        }
        if (isTxnTable) {
            hiveFileWriter.initWriter(true, path, fileSystem);
        }
    }
    Path finalPath = path;
    String writerImplementation = hiveFileWriter.getClass().getName();
    Consumer<HiveWriter> onCommit;
    if (isSnapshotEnabled && !forMerge) {
        // Only send "commit" event for the merged file
        onCommit = hiveWriter -> {
        };
    } else {
        onCommit = hiveWriter -> {
            Optional<Long> size;
            try {
                size = Optional.of(hdfsEnvironment.getFileSystem(session.getUser(), finalPath, conf).getFileStatus(finalPath).getLen());
            } catch (IOException | RuntimeException e) {
                // Do not fail the query if file system is not available
                size = Optional.empty();
            }
            eventClient.post(new WriteCompletedEvent(session.getQueryId(), finalPath.toString(), schemaName, tableName, partitionName.orElse(null), outputStorageFormat.getOutputFormat(), writerImplementation, nodeManager.getCurrentNode().getVersion(), nodeManager.getCurrentNode().getHost(), session.getIdentity().getPrincipal().map(Principal::getName).orElse(null), nodeManager.getEnvironment(), sessionProperties, size.orElse(null), hiveWriter.getRowCount()));
        };
    }
    if (!sortedBy.isEmpty() || (isTxnTable() && HiveACIDWriteType.isUpdateOrDelete(acidWriteType))) {
        List<Type> types = dataColumns.stream().map(column -> column.getHiveType().getType(typeManager)).collect(Collectors.toList());
        Map<String, Integer> columnIndexes = new HashMap<>();
        for (int i = 0; i < dataColumns.size(); i++) {
            columnIndexes.put(dataColumns.get(i).getName(), i);
        }
        if (sortedBy.isEmpty() && isTxnTable() && HiveACIDWriteType.isUpdateOrDelete(acidWriteType)) {
            // Add $rowId column as the last column in the page
            types.add(HiveColumnHandle.updateRowIdHandle().getHiveType().getType(typeManager));
            columnIndexes.put(HiveColumnHandle.UPDATE_ROW_ID_COLUMN_NAME, dataColumns.size());
        }
        List<Integer> sortFields = new ArrayList<>();
        List<SortOrder> sortOrders = new ArrayList<>();
        List<SortingColumn> sortigColumns = this.sortedBy;
        if (sortedBy.isEmpty() && isTxnTable() && HiveACIDWriteType.isUpdateOrDelete(acidWriteType)) {
            sortigColumns = ImmutableList.of(new SortingColumn(HiveColumnHandle.UPDATE_ROW_ID_COLUMN_NAME, SortingColumn.Order.ASCENDING));
        }
        for (SortingColumn column : sortigColumns) {
            Integer index = columnIndexes.get(column.getColumnName());
            if (index == null) {
                throw new PrestoException(HIVE_INVALID_METADATA, format("Sorting column '%s' does not exist in table '%s.%s'", column.getColumnName(), schemaName, tableName));
            }
            sortFields.add(index);
            sortOrders.add(column.getOrder().getSortOrder());
        }
        FileSystem sortFileSystem = fileSystem;
        String child = ".tmp-sort." + path.getName();
        Path tempFilePrefix = new Path(path.getParent(), child);
        hiveFileWriter = new SortingFileWriter(sortFileSystem, tempFilePrefix, hiveFileWriter, sortBufferSize, maxOpenSortFiles, types, sortFields, sortOrders, pageSorter, (fs, p) -> orcFileWriterFactory.createOrcDataSink(session, fs, p));
    }
    return new HiveWriter(hiveFileWriter, partitionName, updateMode, fileNameWithExtension, writeInfo.getWritePath().toString(), writeInfo.getTargetPath().toString(), path.toString(), onCommit, // Snapshot: only update stats when merging files
    isSnapshotEnabled && !forMerge ? null : hiveWriterStats, hiveFileWriter.getExtraPartitionFiles());
}

Aggregations

Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)2 Preconditions.checkState (com.google.common.base.Preconditions.checkState)2 Strings (com.google.common.base.Strings)2 ImmutableList (com.google.common.collect.ImmutableList)2 ImmutableMap (com.google.common.collect.ImmutableMap)2 ImmutableMap.toImmutableMap (com.google.common.collect.ImmutableMap.toImmutableMap)2 ImmutableSet (com.google.common.collect.ImmutableSet)2 Sets (com.google.common.collect.Sets)2 EventClient (io.airlift.event.client.EventClient)2 Logger (io.airlift.log.Logger)2 DataSize (io.airlift.units.DataSize)2 MEGABYTE (io.airlift.units.DataSize.Unit.MEGABYTE)2 OrcDataSourceId (io.prestosql.orc.OrcDataSourceId)2 HdfsContext (io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext)2 HIVE_FILESYSTEM_ERROR (io.prestosql.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR)2 HIVE_INVALID_METADATA (io.prestosql.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA)2 HIVE_PARTITION_READ_ONLY (io.prestosql.plugin.hive.HiveErrorCode.HIVE_PARTITION_READ_ONLY)2 HIVE_PARTITION_SCHEMA_MISMATCH (io.prestosql.plugin.hive.HiveErrorCode.HIVE_PARTITION_SCHEMA_MISMATCH)2 HIVE_PATH_ALREADY_EXISTS (io.prestosql.plugin.hive.HiveErrorCode.HIVE_PATH_ALREADY_EXISTS)2 HIVE_TABLE_READ_ONLY (io.prestosql.plugin.hive.HiveErrorCode.HIVE_TABLE_READ_ONLY)2