
Example 1 with CompressionKind

Use of io.trino.orc.metadata.CompressionKind in project trino by trinodb.

From the class OrcFileWriterFactory, the method getCompression:

private static CompressionKind getCompression(Properties schema, JobConf configuration) {
    String compressionName = OrcConf.COMPRESS.getString(schema, configuration);
    if (compressionName == null) {
        // No compression configured for the table; fall back to ZLIB, the ORC default
        return CompressionKind.ZLIB;
    }
    CompressionKind compression;
    try {
        // Compression names are matched case-insensitively against the CompressionKind constants
        compression = CompressionKind.valueOf(compressionName.toUpperCase(ENGLISH));
    } catch (IllegalArgumentException e) {
        throw new TrinoException(HIVE_UNSUPPORTED_FORMAT, "Unknown ORC compression type " + compressionName);
    }
    return compression;
}
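
The lookup is case-insensitive (the configured name is upper-cased before CompressionKind.valueOf is called), and a missing property falls back to ZLIB. The following is a minimal, self-contained sketch of the same resolution logic, assuming a stand-in Compression enum rather than Trino's CompressionKind; the enum and helper below are illustrative, not part of Trino:

enum Compression { NONE, ZLIB, SNAPPY, LZ4, ZSTD }

// Hypothetical helper mirroring getCompression above: a null name falls back to ZLIB,
// and a name that matches no constant is rejected as unsupported.
static Compression resolveCompression(String configuredName) {
    if (configuredName == null) {
        return Compression.ZLIB;
    }
    try {
        return Compression.valueOf(configuredName.toUpperCase(java.util.Locale.ENGLISH));
    } catch (IllegalArgumentException e) {
        throw new IllegalArgumentException("Unknown ORC compression type " + configuredName, e);
    }
}

For example, resolveCompression("snappy") yields SNAPPY, resolveCompression(null) yields ZLIB, and resolveCompression("brotli") throws.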

Example 2 with CompressionKind

Use of io.trino.orc.metadata.CompressionKind in project trino by trinodb.

From the class OrcTester, the method assertRoundTrip:

private void assertRoundTrip(Type writeType, Type readType, List<?> writeValues, List<?> readValues) throws Exception {
    OrcWriterStats stats = new OrcWriterStats();
    for (CompressionKind compression : compressions) {
        // Hive's ORC code cannot handle LZ4 or ZSTD compression, or timestamp with time zone values
        boolean hiveSupported = (compression != LZ4) && (compression != ZSTD) && !isTimestampTz(writeType) && !isTimestampTz(readType);
        for (Format format : formats) {
            // write Hive, read Trino
            if (hiveSupported) {
                try (TempFile tempFile = new TempFile()) {
                    writeOrcColumnHive(tempFile.getFile(), format, compression, writeType, writeValues.iterator());
                    assertFileContentsTrino(readType, tempFile, readValues, false, false);
                }
            }
        }
        // write Trino, read Hive and Trino
        try (TempFile tempFile = new TempFile()) {
            writeOrcColumnTrino(tempFile.getFile(), compression, writeType, writeValues.iterator(), stats);
            if (hiveSupported) {
                assertFileContentsHive(readType, tempFile, readValues);
            }
            assertFileContentsTrino(readType, tempFile, readValues, false, false);
            if (skipBatchTestsEnabled) {
                assertFileContentsTrino(readType, tempFile, readValues, true, false);
            }
            if (skipStripeTestsEnabled) {
                assertFileContentsTrino(readType, tempFile, readValues, false, true);
            }
        }
    }
    assertEquals(stats.getWriterSizeInBytes(), 0);
}
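
The test sweeps a write/read matrix: for each compression kind it writes the values with Hive (when Hive supports the combination) and reads them back with Trino, then writes with Trino and reads back with both engines, optionally re-reading with batch or stripe skipping enabled, always asserting that the values read equal the values written. Below is a fully self-contained sketch of that write-then-read-then-assert shape; plain text I/O stands in for the ORC writers and readers purely to keep the sketch runnable, so the file format and helper name are illustrative only:

// Hypothetical, simplified round-trip check: write values out, read them back,
// and fail if they do not match. assertRoundTrip does the same per compression/format combination.
static void roundTripSketch(java.util.List<String> writeValues) throws Exception {
    java.io.File file = java.io.File.createTempFile("roundtrip", ".txt");
    try {
        java.nio.file.Files.write(file.toPath(), writeValues);                              // "writer" side
        java.util.List<String> readValues = java.nio.file.Files.readAllLines(file.toPath()); // "reader" side
        if (!readValues.equals(writeValues)) {
            throw new AssertionError("round trip mismatch: " + readValues);
        }
    }
    finally {
        file.delete();
    }
}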

Example 3 with CompressionKind

Use of io.trino.orc.metadata.CompressionKind in project trino by trinodb.

From the class OrcFileWriterFactory, the method createFileWriter:

@Override
public Optional<FileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session, OptionalInt bucketNumber, AcidTransaction transaction, boolean useAcidSchema, WriterKind writerKind) {
    if (!OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }
    CompressionKind compression = getCompression(schema, configuration);
    // existing tables and partitions may have columns in a different order than the writer is providing, so build
    // an index to rearrange columns in the proper order
    List<String> fileColumnNames = getColumnNames(schema);
    List<Type> fileColumnTypes = getColumnTypes(schema).stream()
            .map(hiveType -> hiveType.getType(typeManager, getTimestampPrecision(session)))
            .collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
    if (transaction.isAcidDeleteOperation(writerKind)) {
        // For delete, set the "row" column to -1
        fileInputColumnIndexes[fileInputColumnIndexes.length - 1] = -1;
    }
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
        OrcDataSink orcDataSink = createOrcDataSink(fileSystem, path);
        Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
        if (isOrcOptimizedWriterValidate(session)) {
            validationInputFactory = Optional.of(() -> {
                try {
                    return new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSystem.getFileStatus(path).getLen(), new OrcReaderOptions(), fileSystem.open(path), readStats);
                } catch (IOException e) {
                    throw new TrinoException(HIVE_WRITE_VALIDATION_FAILED, e);
                }
            });
        }
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        if (transaction.isInsert() && useAcidSchema) {
        // Only add the ACID columns if the request is for insert-type operations; for delete operations,
            // the columns are added by the caller.  This is because the ACID columns for delete operations
            // depend on the rows being deleted, whereas the ACID columns for INSERT are completely determined
            // by bucket and writeId.
            Type rowType = createRowType(fileColumnNames, fileColumnTypes);
            fileColumnNames = ACID_COLUMN_NAMES;
            fileColumnTypes = createAcidColumnPrestoTypes(rowType);
        }
        return Optional.of(new OrcFileWriter(
                orcDataSink,
                writerKind,
                transaction,
                useAcidSchema,
                bucketNumber,
                rollbackAction,
                fileColumnNames,
                fileColumnTypes,
                createRootOrcType(fileColumnNames, fileColumnTypes),
                compression,
                getOrcWriterOptions(schema, orcWriterOptions)
                        .withStripeMinSize(getOrcOptimizedWriterMinStripeSize(session))
                        .withStripeMaxSize(getOrcOptimizedWriterMaxStripeSize(session))
                        .withStripeMaxRowCount(getOrcOptimizedWriterMaxStripeRows(session))
                        .withDictionaryMaxMemory(getOrcOptimizedWriterMaxDictionaryMemory(session))
                        .withMaxStringStatisticsLimit(getOrcStringStatisticsLimit(session)),
                fileInputColumnIndexes,
                ImmutableMap.<String, String>builder()
                        .put(PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(PRESTO_QUERY_ID_NAME, session.getQueryId())
                        .buildOrThrow(),
                validationInputFactory,
                getOrcOptimizedWriterValidateMode(session),
                stats));
    } catch (IOException e) {
        throw new TrinoException(HIVE_WRITER_OPEN_ERROR, "Error creating ORC file", e);
    }
}
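
Because existing tables and partitions can declare columns in a different order than the writer provides, the factory builds fileInputColumnIndexes by looking up each file column name in the incoming input columns; a column the input does not supply maps to -1, and for ACID deletes the trailing "row" column is explicitly forced to -1. Below is a small self-contained sketch of that index construction; the column names are made up for illustration:

// Hypothetical data: the file's declared column order vs. the writer's input order
java.util.List<String> fileColumnNames = java.util.List.of("id", "name", "price");
java.util.List<String> inputColumnNames = java.util.List.of("price", "id", "name");

// Same lookup as in createFileWriter: the position of each file column within the input columns
int[] fileInputColumnIndexes = fileColumnNames.stream()
        .mapToInt(inputColumnNames::indexOf)
        .toArray();
// -> {1, 2, 0}; a file column missing from the input would map to -1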
