Examples with OrcDataSink - io.trino.orc.OrcDataSink

Example 1 with OrcDataSink

use of io.trino.orc.OrcDataSink in project trino by trinodb.

the class IcebergFileWriterFactory method createOrcWriter.

private IcebergFileWriter createOrcWriter(MetricsConfig metricsConfig, Path outputPath, Schema icebergSchema, JobConf jobConf, ConnectorSession session) {
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), outputPath, jobConf);
        OrcDataSink orcDataSink = hdfsEnvironment.doAs(session.getIdentity(), () -> new OutputStreamOrcDataSink(fileSystem.create(outputPath)));
        Callable<Void> rollbackAction = () -> {
            hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.delete(outputPath, false));
            return null;
        };
        List<Types.NestedField> columnFields = icebergSchema.columns();
        List<String> fileColumnNames = columnFields.stream().map(Types.NestedField::name).collect(toImmutableList());
        List<Type> fileColumnTypes = columnFields.stream().map(Types.NestedField::type).map(type -> toTrinoType(type, typeManager)).collect(toImmutableList());
        Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
        if (isOrcWriterValidate(session)) {
            validationInputFactory = Optional.of(() -> {
                try {
                    return new HdfsOrcDataSource(new OrcDataSourceId(outputPath.toString()), hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.getFileStatus(outputPath).getLen()), new OrcReaderOptions(), hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.open(outputPath)), readStats);
                } catch (IOException e) {
                    throw new TrinoException(ICEBERG_WRITE_VALIDATION_FAILED, e);
                }
            });
        }
        return new IcebergOrcFileWriter(metricsConfig, icebergSchema, orcDataSink, rollbackAction, fileColumnNames, fileColumnTypes, toOrcType(icebergSchema), getCompressionCodec(session).getOrcCompressionKind(), orcWriterOptions.withStripeMinSize(getOrcWriterMinStripeSize(session)).withStripeMaxSize(getOrcWriterMaxStripeSize(session)).withStripeMaxRowCount(getOrcWriterMaxStripeRows(session)).withDictionaryMaxMemory(getOrcWriterMaxDictionaryMemory(session)).withMaxStringStatisticsLimit(getOrcStringStatisticsLimit(session)), IntStream.range(0, fileColumnNames.size()).toArray(), ImmutableMap.<String, String>builder().put(PRESTO_VERSION_NAME, nodeVersion.toString()).put(PRESTO_QUERY_ID_NAME, session.getQueryId()).buildOrThrow(), validationInputFactory, getOrcWriterValidateMode(session), orcWriterStats);
    } catch (IOException e) {
        throw new TrinoException(ICEBERG_WRITER_OPEN_ERROR, "Error creating ORC file", e);
    }
}

Also used : OutputStreamOrcDataSink(io.trino.orc.OutputStreamOrcDataSink) Types(org.apache.iceberg.types.Types) TypeConverter.toTrinoType(io.trino.plugin.iceberg.TypeConverter.toTrinoType) FileSystem(org.apache.hadoop.fs.FileSystem) IcebergSessionProperties.getOrcWriterMaxStripeSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterMaxStripeSize) IcebergSessionProperties.getParquetWriterPageSize(io.trino.plugin.iceberg.IcebergSessionProperties.getParquetWriterPageSize) OrcDataSink(io.trino.orc.OrcDataSink) OrcWriterStats(io.trino.orc.OrcWriterStats) NOT_SUPPORTED(io.trino.spi.StandardErrorCode.NOT_SUPPORTED) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) IcebergSessionProperties.isOrcWriterValidate(io.trino.plugin.iceberg.IcebergSessionProperties.isOrcWriterValidate) PrimitiveTypeMapBuilder.makeTypeMap(io.trino.plugin.iceberg.util.PrimitiveTypeMapBuilder.makeTypeMap) Path(org.apache.hadoop.fs.Path) PRESTO_QUERY_ID_NAME(io.trino.plugin.hive.HiveMetadata.PRESTO_QUERY_ID_NAME) OrcDataSource(io.trino.orc.OrcDataSource) IcebergSessionProperties.getParquetWriterBatchSize(io.trino.plugin.iceberg.IcebergSessionProperties.getParquetWriterBatchSize) ParquetSchemaUtil.convert(org.apache.iceberg.parquet.ParquetSchemaUtil.convert) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) TrinoException(io.trino.spi.TrinoException) Schema(org.apache.iceberg.Schema) IcebergSessionProperties.getCompressionCodec(io.trino.plugin.iceberg.IcebergSessionProperties.getCompressionCodec) IcebergSessionProperties.getOrcWriterMaxStripeRows(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterMaxStripeRows) OrcDataSourceId(io.trino.orc.OrcDataSourceId) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) List(java.util.List) ICEBERG_WRITER_OPEN_ERROR(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_WRITER_OPEN_ERROR) Optional(java.util.Optional) IntStream(java.util.stream.IntStream) Type(io.trino.spi.type.Type) Callable(java.util.concurrent.Callable) ICEBERG_WRITE_VALIDATION_FAILED(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_WRITE_VALIDATION_FAILED) Supplier(java.util.function.Supplier) Inject(javax.inject.Inject) Managed(org.weakref.jmx.Managed) IcebergSessionProperties.getOrcWriterMaxDictionaryMemory(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterMaxDictionaryMemory) NodeVersion(io.trino.plugin.hive.NodeVersion) OrcReaderOptions(io.trino.orc.OrcReaderOptions) Objects.requireNonNull(java.util.Objects.requireNonNull) TypeConverter.toOrcType(io.trino.plugin.iceberg.TypeConverter.toOrcType) ParquetWriterOptions(io.trino.parquet.writer.ParquetWriterOptions) IcebergSessionProperties.getOrcWriterMinStripeSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterMinStripeSize) OrcWriterConfig(io.trino.plugin.hive.orc.OrcWriterConfig) HdfsOrcDataSource(io.trino.plugin.hive.orc.HdfsOrcDataSource) IcebergSessionProperties.getOrcWriterValidateMode(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterValidateMode) MetricsConfig(org.apache.iceberg.MetricsConfig) IcebergSessionProperties.getParquetWriterBlockSize(io.trino.plugin.iceberg.IcebergSessionProperties.getParquetWriterBlockSize) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) IcebergSessionProperties.getOrcStringStatisticsLimit(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcStringStatisticsLimit) JobConf(org.apache.hadoop.mapred.JobConf) OrcWriterOptions(io.trino.orc.OrcWriterOptions) PRESTO_VERSION_NAME(io.trino.plugin.hive.HiveMetadata.PRESTO_VERSION_NAME) TypeManager(io.trino.spi.type.TypeManager) Types(org.apache.iceberg.types.Types) OrcDataSourceId(io.trino.orc.OrcDataSourceId) OutputStreamOrcDataSink(io.trino.orc.OutputStreamOrcDataSink) OrcDataSink(io.trino.orc.OrcDataSink) HdfsOrcDataSource(io.trino.plugin.hive.orc.HdfsOrcDataSource) IOException(java.io.IOException) TypeConverter.toTrinoType(io.trino.plugin.iceberg.TypeConverter.toTrinoType) Type(io.trino.spi.type.Type) TypeConverter.toOrcType(io.trino.plugin.iceberg.TypeConverter.toOrcType) OrcReaderOptions(io.trino.orc.OrcReaderOptions) OutputStreamOrcDataSink(io.trino.orc.OutputStreamOrcDataSink) FileSystem(org.apache.hadoop.fs.FileSystem) TrinoException(io.trino.spi.TrinoException) Supplier(java.util.function.Supplier)

Example 2 with OrcDataSink

use of io.trino.orc.OrcDataSink in project trino by trinodb.

the class OrcFileWriterFactory method createFileWriter.

@Override
public Optional<FileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session, OptionalInt bucketNumber, AcidTransaction transaction, boolean useAcidSchema, WriterKind writerKind) {
    if (!OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }
    CompressionKind compression = getCompression(schema, configuration);
    // existing tables and partitions may have columns in a different order than the writer is providing, so build
    // an index to rearrange columns in the proper order
    List<String> fileColumnNames = getColumnNames(schema);
    List<Type> fileColumnTypes = getColumnTypes(schema).stream().map(hiveType -> hiveType.getType(typeManager, getTimestampPrecision(session))).collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
    if (transaction.isAcidDeleteOperation(writerKind)) {
        // For delete, set the "row" column to -1
        fileInputColumnIndexes[fileInputColumnIndexes.length - 1] = -1;
    }
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
        OrcDataSink orcDataSink = createOrcDataSink(fileSystem, path);
        Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
        if (isOrcOptimizedWriterValidate(session)) {
            validationInputFactory = Optional.of(() -> {
                try {
                    return new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSystem.getFileStatus(path).getLen(), new OrcReaderOptions(), fileSystem.open(path), readStats);
                } catch (IOException e) {
                    throw new TrinoException(HIVE_WRITE_VALIDATION_FAILED, e);
                }
            });
        }
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        if (transaction.isInsert() && useAcidSchema) {
            // Only add the ACID columns if the request is for insert-type operations - - for delete operations,
            // the columns are added by the caller.  This is because the ACID columns for delete operations
            // depend on the rows being deleted, whereas the ACID columns for INSERT are completely determined
            // by bucket and writeId.
            Type rowType = createRowType(fileColumnNames, fileColumnTypes);
            fileColumnNames = ACID_COLUMN_NAMES;
            fileColumnTypes = createAcidColumnPrestoTypes(rowType);
        }
        return Optional.of(new OrcFileWriter(orcDataSink, writerKind, transaction, useAcidSchema, bucketNumber, rollbackAction, fileColumnNames, fileColumnTypes, createRootOrcType(fileColumnNames, fileColumnTypes), compression, getOrcWriterOptions(schema, orcWriterOptions).withStripeMinSize(getOrcOptimizedWriterMinStripeSize(session)).withStripeMaxSize(getOrcOptimizedWriterMaxStripeSize(session)).withStripeMaxRowCount(getOrcOptimizedWriterMaxStripeRows(session)).withDictionaryMaxMemory(getOrcOptimizedWriterMaxDictionaryMemory(session)).withMaxStringStatisticsLimit(getOrcStringStatisticsLimit(session)), fileInputColumnIndexes, ImmutableMap.<String, String>builder().put(PRESTO_VERSION_NAME, nodeVersion.toString()).put(PRESTO_QUERY_ID_NAME, session.getQueryId()).buildOrThrow(), validationInputFactory, getOrcOptimizedWriterValidateMode(session), stats));
    } catch (IOException e) {
        throw new TrinoException(HIVE_WRITER_OPEN_ERROR, "Error creating ORC file", e);
    }
}

Also used : OutputStreamOrcDataSink(io.trino.orc.OutputStreamOrcDataSink) FileSystem(org.apache.hadoop.fs.FileSystem) HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory(io.trino.plugin.hive.HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory) OrcDataSink(io.trino.orc.OrcDataSink) OrcWriterStats(io.trino.orc.OrcWriterStats) HiveUtil.getColumnNames(io.trino.plugin.hive.util.HiveUtil.getColumnNames) OrcConf(org.apache.orc.OrcConf) Path(org.apache.hadoop.fs.Path) PRESTO_QUERY_ID_NAME(io.trino.plugin.hive.HiveMetadata.PRESTO_QUERY_ID_NAME) OrcDataSource(io.trino.orc.OrcDataSource) ENGLISH(java.util.Locale.ENGLISH) StorageFormat(io.trino.plugin.hive.metastore.StorageFormat) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ImmutableMap(com.google.common.collect.ImmutableMap) AcidSchema.createRowType(io.trino.plugin.hive.acid.AcidSchema.createRowType) HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize(io.trino.plugin.hive.HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize) TrinoException(io.trino.spi.TrinoException) HiveSessionProperties.getOrcOptimizedWriterValidateMode(io.trino.plugin.hive.HiveSessionProperties.getOrcOptimizedWriterValidateMode) WriterKind(io.trino.plugin.hive.WriterKind) HiveSessionProperties.getOrcOptimizedWriterMinStripeSize(io.trino.plugin.hive.HiveSessionProperties.getOrcOptimizedWriterMinStripeSize) OrcDataSourceId(io.trino.orc.OrcDataSourceId) List(java.util.List) AcidSchema.createAcidColumnPrestoTypes(io.trino.plugin.hive.acid.AcidSchema.createAcidColumnPrestoTypes) HIVE_WRITE_VALIDATION_FAILED(io.trino.plugin.hive.HiveErrorCode.HIVE_WRITE_VALIDATION_FAILED) Optional(java.util.Optional) ACID_COLUMN_NAMES(io.trino.plugin.hive.acid.AcidSchema.ACID_COLUMN_NAMES) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) HiveFileWriterFactory(io.trino.plugin.hive.HiveFileWriterFactory) Flatten(org.weakref.jmx.Flatten) Type(io.trino.spi.type.Type) Callable(java.util.concurrent.Callable) OptionalInt(java.util.OptionalInt) Supplier(java.util.function.Supplier) FileWriter(io.trino.plugin.hive.FileWriter) Inject(javax.inject.Inject) Managed(org.weakref.jmx.Managed) NodeVersion(io.trino.plugin.hive.NodeVersion) OrcReaderOptions(io.trino.orc.OrcReaderOptions) Objects.requireNonNull(java.util.Objects.requireNonNull) HiveSessionProperties.isOrcOptimizedWriterValidate(io.trino.plugin.hive.HiveSessionProperties.isOrcOptimizedWriterValidate) HIVE_WRITER_OPEN_ERROR(io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR) HiveSessionProperties.getOrcStringStatisticsLimit(io.trino.plugin.hive.HiveSessionProperties.getOrcStringStatisticsLimit) HiveUtil.getColumnTypes(io.trino.plugin.hive.util.HiveUtil.getColumnTypes) Properties(java.util.Properties) HIVE_UNSUPPORTED_FORMAT(io.trino.plugin.hive.HiveErrorCode.HIVE_UNSUPPORTED_FORMAT) HiveSessionProperties.getTimestampPrecision(io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) HiveUtil.getOrcWriterOptions(io.trino.plugin.hive.util.HiveUtil.getOrcWriterOptions) CompressionKind(io.trino.orc.metadata.CompressionKind) JobConf(org.apache.hadoop.mapred.JobConf) Collectors.toList(java.util.stream.Collectors.toList) OrcType.createRootOrcType(io.trino.orc.metadata.OrcType.createRootOrcType) OrcWriterOptions(io.trino.orc.OrcWriterOptions) PRESTO_VERSION_NAME(io.trino.plugin.hive.HiveMetadata.PRESTO_VERSION_NAME) TypeManager(io.trino.spi.type.TypeManager) HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows(io.trino.plugin.hive.HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows) CompressionKind(io.trino.orc.metadata.CompressionKind) OrcDataSourceId(io.trino.orc.OrcDataSourceId) OutputStreamOrcDataSink(io.trino.orc.OutputStreamOrcDataSink) OrcDataSink(io.trino.orc.OrcDataSink) IOException(java.io.IOException) AcidSchema.createRowType(io.trino.plugin.hive.acid.AcidSchema.createRowType) Type(io.trino.spi.type.Type) OrcType.createRootOrcType(io.trino.orc.metadata.OrcType.createRootOrcType) OrcReaderOptions(io.trino.orc.OrcReaderOptions) FileSystem(org.apache.hadoop.fs.FileSystem) TrinoException(io.trino.spi.TrinoException) Supplier(java.util.function.Supplier)

Aggregations

ImmutableMap (com.google.common.collect.ImmutableMap)2 OrcDataSink (io.trino.orc.OrcDataSink)2 OrcDataSource (io.trino.orc.OrcDataSource)2 OrcDataSourceId (io.trino.orc.OrcDataSourceId)2 OrcReaderOptions (io.trino.orc.OrcReaderOptions)2 OrcWriterOptions (io.trino.orc.OrcWriterOptions)2 OrcWriterStats (io.trino.orc.OrcWriterStats)2 OutputStreamOrcDataSink (io.trino.orc.OutputStreamOrcDataSink)2 FileFormatDataSourceStats (io.trino.plugin.hive.FileFormatDataSourceStats)2 HdfsEnvironment (io.trino.plugin.hive.HdfsEnvironment)2 PRESTO_QUERY_ID_NAME (io.trino.plugin.hive.HiveMetadata.PRESTO_QUERY_ID_NAME)2 PRESTO_VERSION_NAME (io.trino.plugin.hive.HiveMetadata.PRESTO_VERSION_NAME)2 NodeVersion (io.trino.plugin.hive.NodeVersion)2 TrinoException (io.trino.spi.TrinoException)2 ConnectorSession (io.trino.spi.connector.ConnectorSession)2 Type (io.trino.spi.type.Type)2 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)1 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)1 CompressionKind (io.trino.orc.metadata.CompressionKind)1 OrcType.createRootOrcType (io.trino.orc.metadata.OrcType.createRootOrcType)1