Search in sources :

Example 16 with TypeManager

use of io.trino.spi.type.TypeManager in project trino by trinodb.

the class ParquetFileWriterFactory method createFileWriter.

@Override
public Optional<FileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf conf, ConnectorSession session, OptionalInt bucketNumber, AcidTransaction transaction, boolean useAcidSchema, WriterKind writerKind) {
    if (!HiveSessionProperties.isParquetOptimizedWriterEnabled(session)) {
        return Optional.empty();
    }
    if (!MapredParquetOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }
    ParquetWriterOptions parquetWriterOptions = ParquetWriterOptions.builder().setMaxPageSize(HiveSessionProperties.getParquetWriterPageSize(session)).setMaxBlockSize(HiveSessionProperties.getParquetWriterBlockSize(session)).setBatchSize(HiveSessionProperties.getParquetBatchSize(session)).build();
    CompressionCodecName compressionCodecName = getCompression(conf);
    List<String> fileColumnNames = getColumnNames(schema);
    List<Type> fileColumnTypes = getColumnTypes(schema).stream().map(hiveType -> hiveType.getType(typeManager, getTimestampPrecision(session))).collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, conf);
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        ParquetSchemaConverter schemaConverter = new ParquetSchemaConverter(fileColumnTypes, fileColumnNames);
        return Optional.of(new ParquetFileWriter(fileSystem.create(path, false), rollbackAction, fileColumnTypes, schemaConverter.getMessageType(), schemaConverter.getPrimitiveTypes(), parquetWriterOptions, fileInputColumnIndexes, compressionCodecName, nodeVersion.toString()));
    } catch (IOException e) {
        throw new TrinoException(HIVE_WRITER_OPEN_ERROR, "Error creating Parquet file", e);
    }
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) Type(io.trino.spi.type.Type) Callable(java.util.concurrent.Callable) OptionalInt(java.util.OptionalInt) FileWriter(io.trino.plugin.hive.FileWriter) Inject(javax.inject.Inject) HiveUtil.getColumnNames(io.trino.plugin.hive.util.HiveUtil.getColumnNames) NodeVersion(io.trino.plugin.hive.NodeVersion) Objects.requireNonNull(java.util.Objects.requireNonNull) Path(org.apache.hadoop.fs.Path) ParquetWriterOptions(io.trino.parquet.writer.ParquetWriterOptions) StorageFormat(io.trino.plugin.hive.metastore.StorageFormat) HIVE_WRITER_OPEN_ERROR(io.trino.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) HiveUtil.getColumnTypes(io.trino.plugin.hive.util.HiveUtil.getColumnTypes) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) Properties(java.util.Properties) ParquetOutputFormat(org.apache.parquet.hadoop.ParquetOutputFormat) ParquetSchemaConverter(io.trino.parquet.writer.ParquetSchemaConverter) HiveSessionProperties.getTimestampPrecision(io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision) TrinoException(io.trino.spi.TrinoException) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) HiveSessionProperties(io.trino.plugin.hive.HiveSessionProperties) WriterKind(io.trino.plugin.hive.WriterKind) JobConf(org.apache.hadoop.mapred.JobConf) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) MapredParquetOutputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat) Optional(java.util.Optional) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) HiveFileWriterFactory(io.trino.plugin.hive.HiveFileWriterFactory) TypeManager(io.trino.spi.type.TypeManager) IOException(java.io.IOException) Type(io.trino.spi.type.Type) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) FileSystem(org.apache.hadoop.fs.FileSystem) TrinoException(io.trino.spi.TrinoException) ParquetWriterOptions(io.trino.parquet.writer.ParquetWriterOptions) ParquetSchemaConverter(io.trino.parquet.writer.ParquetSchemaConverter)

Example 17 with TypeManager

use of io.trino.spi.type.TypeManager in project trino by trinodb.

the class IcebergPageSourceProvider method createOrcPageSource.

private static ReaderPageSource createOrcPageSource(HdfsEnvironment hdfsEnvironment, ConnectorIdentity identity, Configuration configuration, Path path, long start, long length, long fileSize, List<IcebergColumnHandle> columns, TupleDomain<IcebergColumnHandle> effectivePredicate, OrcReaderOptions options, FileFormatDataSourceStats stats, TypeManager typeManager, Optional<NameMapping> nameMapping) {
    OrcDataSource orcDataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSize, options, inputStream, stats);
        OrcReader reader = OrcReader.createOrcReader(orcDataSource, options).orElseThrow(() -> new TrinoException(ICEBERG_BAD_DATA, "ORC file is zero length"));
        List<OrcColumn> fileColumns = reader.getRootColumn().getNestedColumns();
        if (nameMapping.isPresent() && !hasIds(reader.getRootColumn())) {
            fileColumns = fileColumns.stream().map(orcColumn -> setMissingFieldIds(orcColumn, nameMapping.get(), ImmutableList.of(orcColumn.getColumnName()))).collect(toImmutableList());
        }
        Map<Integer, OrcColumn> fileColumnsByIcebergId = mapIdsToOrcFileColumns(fileColumns);
        TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder().setBloomFiltersEnabled(options.isBloomFiltersEnabled());
        Map<IcebergColumnHandle, Domain> effectivePredicateDomains = effectivePredicate.getDomains().orElseThrow(() -> new IllegalArgumentException("Effective predicate is none"));
        Optional<ReaderColumns> columnProjections = projectColumns(columns);
        Map<Integer, List<List<Integer>>> projectionsByFieldId = columns.stream().collect(groupingBy(column -> column.getBaseColumnIdentity().getId(), mapping(IcebergColumnHandle::getPath, toUnmodifiableList())));
        List<IcebergColumnHandle> readColumns = columnProjections.map(readerColumns -> (List<IcebergColumnHandle>) readerColumns.get().stream().map(IcebergColumnHandle.class::cast).collect(toImmutableList())).orElse(columns);
        List<OrcColumn> fileReadColumns = new ArrayList<>(readColumns.size());
        List<Type> fileReadTypes = new ArrayList<>(readColumns.size());
        List<ProjectedLayout> projectedLayouts = new ArrayList<>(readColumns.size());
        List<ColumnAdaptation> columnAdaptations = new ArrayList<>(readColumns.size());
        for (IcebergColumnHandle column : readColumns) {
            verify(column.isBaseColumn(), "Column projections must be based from a root column");
            OrcColumn orcColumn = fileColumnsByIcebergId.get(column.getId());
            if (orcColumn != null) {
                Type readType = getOrcReadType(column.getType(), typeManager);
                if (column.getType() == UUID && !"UUID".equals(orcColumn.getAttributes().get(ICEBERG_BINARY_TYPE))) {
                    throw new TrinoException(ICEBERG_BAD_DATA, format("Expected ORC column for UUID data to be annotated with %s=UUID: %s", ICEBERG_BINARY_TYPE, orcColumn));
                }
                List<List<Integer>> fieldIdProjections = projectionsByFieldId.get(column.getId());
                ProjectedLayout projectedLayout = IcebergOrcProjectedLayout.createProjectedLayout(orcColumn, fieldIdProjections);
                int sourceIndex = fileReadColumns.size();
                columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex));
                fileReadColumns.add(orcColumn);
                fileReadTypes.add(readType);
                projectedLayouts.add(projectedLayout);
                for (Map.Entry<IcebergColumnHandle, Domain> domainEntry : effectivePredicateDomains.entrySet()) {
                    IcebergColumnHandle predicateColumn = domainEntry.getKey();
                    OrcColumn predicateOrcColumn = fileColumnsByIcebergId.get(predicateColumn.getId());
                    if (predicateOrcColumn != null && column.getColumnIdentity().equals(predicateColumn.getBaseColumnIdentity())) {
                        predicateBuilder.addColumn(predicateOrcColumn.getColumnId(), domainEntry.getValue());
                    }
                }
            } else {
                columnAdaptations.add(ColumnAdaptation.nullColumn(column.getType()));
            }
        }
        AggregatedMemoryContext memoryUsage = newSimpleAggregatedMemoryContext();
        OrcDataSourceId orcDataSourceId = orcDataSource.getId();
        OrcRecordReader recordReader = reader.createRecordReader(fileReadColumns, fileReadTypes, projectedLayouts, predicateBuilder.build(), start, length, UTC, memoryUsage, INITIAL_BATCH_SIZE, exception -> handleException(orcDataSourceId, exception), new IdBasedFieldMapperFactory(readColumns));
        return new ReaderPageSource(new OrcPageSource(recordReader, columnAdaptations, orcDataSource, Optional.empty(), Optional.empty(), memoryUsage, stats), columnProjections);
    } catch (Exception e) {
        if (orcDataSource != null) {
            try {
                orcDataSource.close();
            } catch (IOException ignored) {
            }
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof BlockMissingException) {
            throw new TrinoException(ICEBERG_MISSING_DATA, message, e);
        }
        throw new TrinoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) Maps.uniqueIndex(com.google.common.collect.Maps.uniqueIndex) ORC_ICEBERG_ID_KEY(io.trino.plugin.iceberg.TypeConverter.ORC_ICEBERG_ID_KEY) ColumnAdaptation(io.trino.plugin.hive.orc.OrcPageSource.ColumnAdaptation) FileStatus(org.apache.hadoop.fs.FileStatus) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) NOT_SUPPORTED(io.trino.spi.StandardErrorCode.NOT_SUPPORTED) ConnectorTableHandle(io.trino.spi.connector.ConnectorTableHandle) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) ICEBERG_CANNOT_OPEN_SPLIT(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CANNOT_OPEN_SPLIT) UUID(io.trino.spi.type.UuidType.UUID) ENGLISH(java.util.Locale.ENGLISH) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ConnectorIdentity(io.trino.spi.security.ConnectorIdentity) ICEBERG_FILESYSTEM_ERROR(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_FILESYSTEM_ERROR) Domain(io.trino.spi.predicate.Domain) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ReaderColumns(io.trino.plugin.hive.ReaderColumns) Set(java.util.Set) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) OrcReaderConfig(io.trino.plugin.hive.orc.OrcReaderConfig) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnIO(org.apache.parquet.io.ColumnIO) IcebergSessionProperties.getOrcTinyStripeThreshold(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcTinyStripeThreshold) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) MappedField(org.apache.iceberg.mapping.MappedField) Collectors.groupingBy(java.util.stream.Collectors.groupingBy) IcebergSessionProperties.isOrcNestedLazy(io.trino.plugin.iceberg.IcebergSessionProperties.isOrcNestedLazy) IcebergSessionProperties.getOrcMaxBufferSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxBufferSize) HdfsParquetDataSource(io.trino.plugin.hive.parquet.HdfsParquetDataSource) ArrayList(java.util.ArrayList) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) TupleDomainOrcPredicateBuilder(io.trino.orc.TupleDomainOrcPredicate.TupleDomainOrcPredicateBuilder) OrcPageSource(io.trino.plugin.hive.orc.OrcPageSource) IcebergSessionProperties.getOrcMaxMergeDistance(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxMergeDistance) ICEBERG_MISSING_DATA(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_MISSING_DATA) AggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext) ColumnHandle(io.trino.spi.connector.ColumnHandle) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) VARBINARY(io.trino.spi.type.VarbinaryType.VARBINARY) MappedFields(org.apache.iceberg.mapping.MappedFields) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) OrcType(io.trino.orc.metadata.OrcType) Predicate(io.trino.parquet.predicate.Predicate) IcebergSessionProperties.isUseFileSizeFromMetadata(io.trino.plugin.iceberg.IcebergSessionProperties.isUseFileSizeFromMetadata) MapType(io.trino.spi.type.MapType) PredicateUtils.predicateMatches(io.trino.parquet.predicate.PredicateUtils.predicateMatches) ConnectorSplit(io.trino.spi.connector.ConnectorSplit) StandardTypes(io.trino.spi.type.StandardTypes) NameMappingParser(org.apache.iceberg.mapping.NameMappingParser) IcebergSessionProperties.getOrcLazyReadSmallRanges(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcLazyReadSmallRanges) IcebergSessionProperties.getParquetMaxReadBlockSize(io.trino.plugin.iceberg.IcebergSessionProperties.getParquetMaxReadBlockSize) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) UTC(org.joda.time.DateTimeZone.UTC) Field(io.trino.parquet.Field) Traverser(com.google.common.graph.Traverser) ParquetPageSource(io.trino.plugin.hive.parquet.ParquetPageSource) ProjectedLayout(io.trino.orc.OrcReader.ProjectedLayout) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) IcebergSessionProperties.getOrcStreamBufferSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcStreamBufferSize) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ParquetSchemaUtil(org.apache.iceberg.parquet.ParquetSchemaUtil) OrcColumn(io.trino.orc.OrcColumn) PredicateUtils.buildPredicate(io.trino.parquet.predicate.PredicateUtils.buildPredicate) MetadataReader(io.trino.parquet.reader.MetadataReader) ICEBERG_DOMAIN_COMPACTION_THRESHOLD(io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD) OrcRecordReader(io.trino.orc.OrcRecordReader) NameMapping(org.apache.iceberg.mapping.NameMapping) Path(org.apache.hadoop.fs.Path) OrcDataSource(io.trino.orc.OrcDataSource) ReaderProjectionsAdapter(io.trino.plugin.hive.ReaderProjectionsAdapter) RowType(io.trino.spi.type.RowType) ICEBERG_BAD_DATA(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_BAD_DATA) ImmutableMap(com.google.common.collect.ImmutableMap) INITIAL_BATCH_SIZE(io.trino.orc.OrcReader.INITIAL_BATCH_SIZE) ParquetReader(io.trino.parquet.reader.ParquetReader) FieldContext(io.trino.plugin.iceberg.IcebergParquetColumnIOConverter.FieldContext) ICEBERG_CURSOR_ERROR(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CURSOR_ERROR) TrinoException(io.trino.spi.TrinoException) ArrayType(io.trino.spi.type.ArrayType) ParquetTypeUtils.getColumnIO(io.trino.parquet.ParquetTypeUtils.getColumnIO) Collectors(java.util.stream.Collectors) String.format(java.lang.String.format) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) OrcDataSourceId(io.trino.orc.OrcDataSourceId) MessageType(org.apache.parquet.schema.MessageType) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) List(java.util.List) DynamicFilter(io.trino.spi.connector.DynamicFilter) Optional(java.util.Optional) IcebergSessionProperties.getOrcMaxReadBlockSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxReadBlockSize) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) Type(io.trino.spi.type.Type) HashMap(java.util.HashMap) TupleDomainOrcPredicate(io.trino.orc.TupleDomainOrcPredicate) Function(java.util.function.Function) Inject(javax.inject.Inject) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) OrcReaderOptions(io.trino.orc.OrcReaderOptions) Objects.requireNonNull(java.util.Objects.requireNonNull) Collectors.mapping(java.util.stream.Collectors.mapping) IcebergSessionProperties.isOrcBloomFiltersEnabled(io.trino.plugin.iceberg.IcebergSessionProperties.isOrcBloomFiltersEnabled) HdfsOrcDataSource(io.trino.plugin.hive.orc.HdfsOrcDataSource) ParquetReaderOptions(io.trino.parquet.ParquetReaderOptions) OrcReader(io.trino.orc.OrcReader) ConnectorPageSourceProvider(io.trino.spi.connector.ConnectorPageSourceProvider) ICEBERG_BINARY_TYPE(io.trino.plugin.iceberg.TypeConverter.ICEBERG_BINARY_TYPE) TupleDomain(io.trino.spi.predicate.TupleDomain) OrcReader.fullyProjectedLayout(io.trino.orc.OrcReader.fullyProjectedLayout) OrcCorruptionException(io.trino.orc.OrcCorruptionException) Collectors.toList(java.util.stream.Collectors.toList) ParquetTypeUtils.getDescriptors(io.trino.parquet.ParquetTypeUtils.getDescriptors) ParquetDataSource(io.trino.parquet.ParquetDataSource) TypeManager(io.trino.spi.type.TypeManager) ConnectorTransactionHandle(io.trino.spi.connector.ConnectorTransactionHandle) ArrayList(java.util.ArrayList) HdfsOrcDataSource(io.trino.plugin.hive.orc.HdfsOrcDataSource) FileSystem(org.apache.hadoop.fs.FileSystem) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ArrayList(java.util.ArrayList) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) Collectors.toList(java.util.stream.Collectors.toList) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) OrcDataSource(io.trino.orc.OrcDataSource) HdfsOrcDataSource(io.trino.plugin.hive.orc.HdfsOrcDataSource) OrcDataSourceId(io.trino.orc.OrcDataSourceId) OrcColumn(io.trino.orc.OrcColumn) OrcPageSource(io.trino.plugin.hive.orc.OrcPageSource) IOException(java.io.IOException) OrcRecordReader(io.trino.orc.OrcRecordReader) AggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) IOException(java.io.IOException) TrinoException(io.trino.spi.TrinoException) OrcCorruptionException(io.trino.orc.OrcCorruptionException) OrcType(io.trino.orc.metadata.OrcType) MapType(io.trino.spi.type.MapType) RowType(io.trino.spi.type.RowType) ArrayType(io.trino.spi.type.ArrayType) MessageType(org.apache.parquet.schema.MessageType) Type(io.trino.spi.type.Type) OrcReader(io.trino.orc.OrcReader) TupleDomainOrcPredicateBuilder(io.trino.orc.TupleDomainOrcPredicate.TupleDomainOrcPredicateBuilder) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) TrinoException(io.trino.spi.TrinoException) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) ReaderColumns(io.trino.plugin.hive.ReaderColumns) Domain(io.trino.spi.predicate.Domain) TupleDomain(io.trino.spi.predicate.TupleDomain) Map(java.util.Map) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap) ProjectedLayout(io.trino.orc.OrcReader.ProjectedLayout) OrcReader.fullyProjectedLayout(io.trino.orc.OrcReader.fullyProjectedLayout) ColumnAdaptation(io.trino.plugin.hive.orc.OrcPageSource.ColumnAdaptation)

Example 18 with TypeManager

use of io.trino.spi.type.TypeManager in project trino by trinodb.

the class IcebergFileWriterFactory method createParquetWriter.

private IcebergFileWriter createParquetWriter(Path outputPath, Schema icebergSchema, JobConf jobConf, ConnectorSession session, HdfsContext hdfsContext) {
    List<String> fileColumnNames = icebergSchema.columns().stream().map(Types.NestedField::name).collect(toImmutableList());
    List<Type> fileColumnTypes = icebergSchema.columns().stream().map(column -> toTrinoType(column.type(), typeManager)).collect(toImmutableList());
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), outputPath, jobConf);
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(outputPath, false);
            return null;
        };
        ParquetWriterOptions parquetWriterOptions = ParquetWriterOptions.builder().setMaxPageSize(getParquetWriterPageSize(session)).setMaxBlockSize(getParquetWriterBlockSize(session)).setBatchSize(getParquetWriterBatchSize(session)).build();
        return new IcebergParquetFileWriter(hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.create(outputPath)), rollbackAction, fileColumnTypes, convert(icebergSchema, "table"), makeTypeMap(fileColumnTypes, fileColumnNames), parquetWriterOptions, IntStream.range(0, fileColumnNames.size()).toArray(), getCompressionCodec(session).getParquetCompressionCodec(), nodeVersion.toString(), outputPath, hdfsEnvironment, hdfsContext);
    } catch (IOException e) {
        throw new TrinoException(ICEBERG_WRITER_OPEN_ERROR, "Error creating Parquet file", e);
    }
}
Also used : OutputStreamOrcDataSink(io.trino.orc.OutputStreamOrcDataSink) Types(org.apache.iceberg.types.Types) TypeConverter.toTrinoType(io.trino.plugin.iceberg.TypeConverter.toTrinoType) FileSystem(org.apache.hadoop.fs.FileSystem) IcebergSessionProperties.getOrcWriterMaxStripeSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterMaxStripeSize) IcebergSessionProperties.getParquetWriterPageSize(io.trino.plugin.iceberg.IcebergSessionProperties.getParquetWriterPageSize) OrcDataSink(io.trino.orc.OrcDataSink) OrcWriterStats(io.trino.orc.OrcWriterStats) NOT_SUPPORTED(io.trino.spi.StandardErrorCode.NOT_SUPPORTED) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) IcebergSessionProperties.isOrcWriterValidate(io.trino.plugin.iceberg.IcebergSessionProperties.isOrcWriterValidate) PrimitiveTypeMapBuilder.makeTypeMap(io.trino.plugin.iceberg.util.PrimitiveTypeMapBuilder.makeTypeMap) Path(org.apache.hadoop.fs.Path) PRESTO_QUERY_ID_NAME(io.trino.plugin.hive.HiveMetadata.PRESTO_QUERY_ID_NAME) OrcDataSource(io.trino.orc.OrcDataSource) IcebergSessionProperties.getParquetWriterBatchSize(io.trino.plugin.iceberg.IcebergSessionProperties.getParquetWriterBatchSize) ParquetSchemaUtil.convert(org.apache.iceberg.parquet.ParquetSchemaUtil.convert) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) TrinoException(io.trino.spi.TrinoException) Schema(org.apache.iceberg.Schema) IcebergSessionProperties.getCompressionCodec(io.trino.plugin.iceberg.IcebergSessionProperties.getCompressionCodec) IcebergSessionProperties.getOrcWriterMaxStripeRows(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterMaxStripeRows) OrcDataSourceId(io.trino.orc.OrcDataSourceId) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) List(java.util.List) ICEBERG_WRITER_OPEN_ERROR(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_WRITER_OPEN_ERROR) Optional(java.util.Optional) IntStream(java.util.stream.IntStream) Type(io.trino.spi.type.Type) Callable(java.util.concurrent.Callable) ICEBERG_WRITE_VALIDATION_FAILED(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_WRITE_VALIDATION_FAILED) Supplier(java.util.function.Supplier) Inject(javax.inject.Inject) Managed(org.weakref.jmx.Managed) IcebergSessionProperties.getOrcWriterMaxDictionaryMemory(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterMaxDictionaryMemory) NodeVersion(io.trino.plugin.hive.NodeVersion) OrcReaderOptions(io.trino.orc.OrcReaderOptions) Objects.requireNonNull(java.util.Objects.requireNonNull) TypeConverter.toOrcType(io.trino.plugin.iceberg.TypeConverter.toOrcType) ParquetWriterOptions(io.trino.parquet.writer.ParquetWriterOptions) IcebergSessionProperties.getOrcWriterMinStripeSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterMinStripeSize) OrcWriterConfig(io.trino.plugin.hive.orc.OrcWriterConfig) HdfsOrcDataSource(io.trino.plugin.hive.orc.HdfsOrcDataSource) IcebergSessionProperties.getOrcWriterValidateMode(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterValidateMode) MetricsConfig(org.apache.iceberg.MetricsConfig) IcebergSessionProperties.getParquetWriterBlockSize(io.trino.plugin.iceberg.IcebergSessionProperties.getParquetWriterBlockSize) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) IcebergSessionProperties.getOrcStringStatisticsLimit(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcStringStatisticsLimit) JobConf(org.apache.hadoop.mapred.JobConf) OrcWriterOptions(io.trino.orc.OrcWriterOptions) PRESTO_VERSION_NAME(io.trino.plugin.hive.HiveMetadata.PRESTO_VERSION_NAME) TypeManager(io.trino.spi.type.TypeManager) Types(org.apache.iceberg.types.Types) IOException(java.io.IOException) TypeConverter.toTrinoType(io.trino.plugin.iceberg.TypeConverter.toTrinoType) Type(io.trino.spi.type.Type) TypeConverter.toOrcType(io.trino.plugin.iceberg.TypeConverter.toOrcType) FileSystem(org.apache.hadoop.fs.FileSystem) TrinoException(io.trino.spi.TrinoException) ParquetWriterOptions(io.trino.parquet.writer.ParquetWriterOptions)

Example 19 with TypeManager

use of io.trino.spi.type.TypeManager in project trino by trinodb.

the class PartitionsSystemTableProvider method getSystemTable.

@Override
public Optional<SystemTable> getSystemTable(HiveMetadata metadata, ConnectorSession session, SchemaTableName tableName) {
    if (!PARTITIONS.matches(tableName)) {
        return Optional.empty();
    }
    SchemaTableName sourceTableName = PARTITIONS.getSourceTableName(tableName);
    Table sourceTable = metadata.getMetastore().getTable(sourceTableName.getSchemaName(), sourceTableName.getTableName()).orElse(null);
    if (sourceTable == null || isDeltaLakeTable(sourceTable) || isIcebergTable(sourceTable)) {
        return Optional.empty();
    }
    verifyOnline(sourceTableName, Optional.empty(), getProtectMode(sourceTable), sourceTable.getParameters());
    HiveTableHandle sourceTableHandle = new HiveTableHandle(sourceTableName.getSchemaName(), sourceTableName.getTableName(), sourceTable.getParameters(), getPartitionKeyColumnHandles(sourceTable, typeManager), getRegularColumnHandles(sourceTable, typeManager, getTimestampPrecision(session)), getHiveBucketHandle(session, sourceTable, typeManager));
    List<HiveColumnHandle> partitionColumns = sourceTableHandle.getPartitionColumns();
    if (partitionColumns.isEmpty()) {
        return Optional.empty();
    }
    List<Type> partitionColumnTypes = partitionColumns.stream().map(HiveColumnHandle::getType).collect(toImmutableList());
    List<ColumnMetadata> partitionSystemTableColumns = partitionColumns.stream().map(column -> ColumnMetadata.builder().setName(column.getName()).setType(column.getType()).setComment(column.getComment()).setHidden(column.isHidden()).build()).collect(toImmutableList());
    Map<Integer, HiveColumnHandle> fieldIdToColumnHandle = IntStream.range(0, partitionColumns.size()).boxed().collect(toImmutableMap(identity(), partitionColumns::get));
    return Optional.of(createSystemTable(new ConnectorTableMetadata(tableName, partitionSystemTableColumns), constraint -> {
        Constraint targetConstraint = new Constraint(constraint.transformKeys(fieldIdToColumnHandle::get));
        Iterable<List<Object>> records = () -> stream(partitionManager.getPartitions(metadata.getMetastore(), sourceTableHandle, targetConstraint).getPartitions()).map(hivePartition -> IntStream.range(0, partitionColumns.size()).mapToObj(fieldIdToColumnHandle::get).map(columnHandle -> hivePartition.getKeys().get(columnHandle).getValue()).collect(// nullable
        toList())).iterator();
        return new InMemoryRecordSet(partitionColumnTypes, records).cursor();
    }));
}
Also used : IntStream(java.util.stream.IntStream) Constraint(io.trino.spi.connector.Constraint) ColumnMetadata(io.trino.spi.connector.ColumnMetadata) HiveUtil.isDeltaLakeTable(io.trino.plugin.hive.util.HiveUtil.isDeltaLakeTable) Type(io.trino.spi.type.Type) ConnectorTableMetadata(io.trino.spi.connector.ConnectorTableMetadata) Inject(javax.inject.Inject) Map(java.util.Map) Objects.requireNonNull(java.util.Objects.requireNonNull) PARTITIONS(io.trino.plugin.hive.SystemTableHandler.PARTITIONS) HiveUtil.isIcebergTable(io.trino.plugin.hive.util.HiveUtil.isIcebergTable) Table(io.trino.plugin.hive.metastore.Table) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HiveUtil.getRegularColumnHandles(io.trino.plugin.hive.util.HiveUtil.getRegularColumnHandles) HiveSessionProperties.getTimestampPrecision(io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision) ConnectorSession(io.trino.spi.connector.ConnectorSession) InMemoryRecordSet(io.trino.spi.connector.InMemoryRecordSet) SchemaTableName(io.trino.spi.connector.SchemaTableName) Streams.stream(com.google.common.collect.Streams.stream) HiveBucketing.getHiveBucketHandle(io.trino.plugin.hive.util.HiveBucketing.getHiveBucketHandle) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Collectors.toList(java.util.stream.Collectors.toList) MetastoreUtil.verifyOnline(io.trino.plugin.hive.metastore.MetastoreUtil.verifyOnline) MetastoreUtil.getProtectMode(io.trino.plugin.hive.metastore.MetastoreUtil.getProtectMode) Function.identity(java.util.function.Function.identity) Optional(java.util.Optional) SystemTables.createSystemTable(io.trino.plugin.hive.util.SystemTables.createSystemTable) HiveUtil.getPartitionKeyColumnHandles(io.trino.plugin.hive.util.HiveUtil.getPartitionKeyColumnHandles) TypeManager(io.trino.spi.type.TypeManager) SystemTable(io.trino.spi.connector.SystemTable) ColumnMetadata(io.trino.spi.connector.ColumnMetadata) HiveUtil.isDeltaLakeTable(io.trino.plugin.hive.util.HiveUtil.isDeltaLakeTable) HiveUtil.isIcebergTable(io.trino.plugin.hive.util.HiveUtil.isIcebergTable) Table(io.trino.plugin.hive.metastore.Table) SystemTables.createSystemTable(io.trino.plugin.hive.util.SystemTables.createSystemTable) SystemTable(io.trino.spi.connector.SystemTable) Constraint(io.trino.spi.connector.Constraint) SchemaTableName(io.trino.spi.connector.SchemaTableName) InMemoryRecordSet(io.trino.spi.connector.InMemoryRecordSet) Type(io.trino.spi.type.Type) ConnectorTableMetadata(io.trino.spi.connector.ConnectorTableMetadata)

Example 20 with TypeManager

use of io.trino.spi.type.TypeManager in project trino by trinodb.

the class S3SelectRecordCursorProvider method createRecordCursor.

@Override
public Optional<ReaderRecordCursorWithProjections> createRecordCursor(Configuration configuration, ConnectorSession session, Path path, long start, long length, long fileSize, Properties schema, List<HiveColumnHandle> columns, TupleDomain<HiveColumnHandle> effectivePredicate, TypeManager typeManager, boolean s3SelectPushdownEnabled) {
    if (!s3SelectPushdownEnabled) {
        return Optional.empty();
    }
    try {
        this.hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
    } catch (IOException e) {
        throw new TrinoException(HIVE_FILESYSTEM_ERROR, "Failed getting FileSystem: " + path, e);
    }
    Optional<ReaderColumns> projectedReaderColumns = projectBaseColumns(columns);
    // Ignore predicates on partial columns for now.
    effectivePredicate = effectivePredicate.filter((column, domain) -> column.isBaseColumn());
    String serdeName = getDeserializerClassName(schema);
    if (CSV_SERDES.contains(serdeName)) {
        List<HiveColumnHandle> readerColumns = projectedReaderColumns.map(ReaderColumns::get).map(readColumns -> readColumns.stream().map(HiveColumnHandle.class::cast).collect(toUnmodifiableList())).orElse(columns);
        IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(typeManager);
        String ionSqlQuery = queryBuilder.buildSql(readerColumns, effectivePredicate);
        S3SelectLineRecordReader recordReader = new S3SelectCsvRecordReader(configuration, path, start, length, schema, ionSqlQuery, s3ClientFactory);
        RecordCursor cursor = new S3SelectRecordCursor<>(configuration, path, recordReader, length, schema, readerColumns);
        return Optional.of(new ReaderRecordCursorWithProjections(cursor, projectedReaderColumns));
    }
    // unsupported serdes
    return Optional.empty();
}
Also used : ImmutableSet(com.google.common.collect.ImmutableSet) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) RecordCursor(io.trino.spi.connector.RecordCursor) Properties(java.util.Properties) HiveRecordCursorProvider(io.trino.plugin.hive.HiveRecordCursorProvider) LazySimpleSerDe(org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe) ReaderColumns(io.trino.plugin.hive.ReaderColumns) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) TupleDomain(io.trino.spi.predicate.TupleDomain) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) Inject(javax.inject.Inject) List(java.util.List) HivePageSourceProvider.projectBaseColumns(io.trino.plugin.hive.HivePageSourceProvider.projectBaseColumns) Configuration(org.apache.hadoop.conf.Configuration) Objects.requireNonNull(java.util.Objects.requireNonNull) Path(org.apache.hadoop.fs.Path) Optional(java.util.Optional) HiveUtil.getDeserializerClassName(io.trino.plugin.hive.util.HiveUtil.getDeserializerClassName) HIVE_FILESYSTEM_ERROR(io.trino.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) TypeManager(io.trino.spi.type.TypeManager) RecordCursor(io.trino.spi.connector.RecordCursor) IOException(java.io.IOException) TrinoException(io.trino.spi.TrinoException) ReaderColumns(io.trino.plugin.hive.ReaderColumns) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle)

Aggregations

TypeManager (io.trino.spi.type.TypeManager)35 List (java.util.List)28 Objects.requireNonNull (java.util.Objects.requireNonNull)28 Optional (java.util.Optional)27 ConnectorSession (io.trino.spi.connector.ConnectorSession)25 TrinoException (io.trino.spi.TrinoException)23 IOException (java.io.IOException)22 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)21 Path (org.apache.hadoop.fs.Path)21 Map (java.util.Map)20 TupleDomain (io.trino.spi.predicate.TupleDomain)19 ImmutableList (com.google.common.collect.ImmutableList)18 ImmutableMap (com.google.common.collect.ImmutableMap)18 Set (java.util.Set)18 FileSystem (org.apache.hadoop.fs.FileSystem)17 ImmutableMap.toImmutableMap (com.google.common.collect.ImmutableMap.toImmutableMap)15 ImmutableSet (com.google.common.collect.ImmutableSet)14 ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet)14 HdfsContext (io.trino.plugin.hive.HdfsEnvironment.HdfsContext)14 NOT_SUPPORTED (io.trino.spi.StandardErrorCode.NOT_SUPPORTED)14