Search in sources:

Example 1 with HdfsOrcDataSource

Use of io.trino.plugin.hive.orc.HdfsOrcDataSource in project trino by trinodb.

From the class IcebergFileWriterFactory, method createOrcWriter:

private IcebergFileWriter createOrcWriter(MetricsConfig metricsConfig, Path outputPath, Schema icebergSchema, JobConf jobConf, ConnectorSession session) {
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), outputPath, jobConf);
        OrcDataSink orcDataSink = hdfsEnvironment.doAs(session.getIdentity(), () -> new OutputStreamOrcDataSink(fileSystem.create(outputPath)));
        Callable<Void> rollbackAction = () -> {
            hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.delete(outputPath, false));
            return null;
        };
        List<Types.NestedField> columnFields = icebergSchema.columns();
        List<String> fileColumnNames = columnFields.stream().map(Types.NestedField::name).collect(toImmutableList());
        List<Type> fileColumnTypes = columnFields.stream().map(Types.NestedField::type).map(type -> toTrinoType(type, typeManager)).collect(toImmutableList());
        Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
        if (isOrcWriterValidate(session)) {
            validationInputFactory = Optional.of(() -> {
                try {
                    return new HdfsOrcDataSource(
                            new OrcDataSourceId(outputPath.toString()),
                            hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.getFileStatus(outputPath).getLen()),
                            new OrcReaderOptions(),
                            hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.open(outputPath)),
                            readStats);
                } catch (IOException e) {
                    throw new TrinoException(ICEBERG_WRITE_VALIDATION_FAILED, e);
                }
            });
        }
        return new IcebergOrcFileWriter(
                metricsConfig,
                icebergSchema,
                orcDataSink,
                rollbackAction,
                fileColumnNames,
                fileColumnTypes,
                toOrcType(icebergSchema),
                getCompressionCodec(session).getOrcCompressionKind(),
                orcWriterOptions
                        .withStripeMinSize(getOrcWriterMinStripeSize(session))
                        .withStripeMaxSize(getOrcWriterMaxStripeSize(session))
                        .withStripeMaxRowCount(getOrcWriterMaxStripeRows(session))
                        .withDictionaryMaxMemory(getOrcWriterMaxDictionaryMemory(session))
                        .withMaxStringStatisticsLimit(getOrcStringStatisticsLimit(session)),
                IntStream.range(0, fileColumnNames.size()).toArray(),
                ImmutableMap.<String, String>builder()
                        .put(PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(PRESTO_QUERY_ID_NAME, session.getQueryId())
                        .buildOrThrow(),
                validationInputFactory,
                getOrcWriterValidateMode(session),
                orcWriterStats);
    } catch (IOException e) {
        throw new TrinoException(ICEBERG_WRITER_OPEN_ERROR, "Error creating ORC file", e);
    }
}
Also used: OutputStreamOrcDataSink(io.trino.orc.OutputStreamOrcDataSink) Types(org.apache.iceberg.types.Types) TypeConverter.toTrinoType(io.trino.plugin.iceberg.TypeConverter.toTrinoType) FileSystem(org.apache.hadoop.fs.FileSystem) IcebergSessionProperties.getOrcWriterMaxStripeSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterMaxStripeSize) IcebergSessionProperties.getParquetWriterPageSize(io.trino.plugin.iceberg.IcebergSessionProperties.getParquetWriterPageSize) OrcDataSink(io.trino.orc.OrcDataSink) OrcWriterStats(io.trino.orc.OrcWriterStats) NOT_SUPPORTED(io.trino.spi.StandardErrorCode.NOT_SUPPORTED) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) IcebergSessionProperties.isOrcWriterValidate(io.trino.plugin.iceberg.IcebergSessionProperties.isOrcWriterValidate) PrimitiveTypeMapBuilder.makeTypeMap(io.trino.plugin.iceberg.util.PrimitiveTypeMapBuilder.makeTypeMap) Path(org.apache.hadoop.fs.Path) PRESTO_QUERY_ID_NAME(io.trino.plugin.hive.HiveMetadata.PRESTO_QUERY_ID_NAME) OrcDataSource(io.trino.orc.OrcDataSource) IcebergSessionProperties.getParquetWriterBatchSize(io.trino.plugin.iceberg.IcebergSessionProperties.getParquetWriterBatchSize) ParquetSchemaUtil.convert(org.apache.iceberg.parquet.ParquetSchemaUtil.convert) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) TrinoException(io.trino.spi.TrinoException) Schema(org.apache.iceberg.Schema) IcebergSessionProperties.getCompressionCodec(io.trino.plugin.iceberg.IcebergSessionProperties.getCompressionCodec) IcebergSessionProperties.getOrcWriterMaxStripeRows(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterMaxStripeRows) OrcDataSourceId(io.trino.orc.OrcDataSourceId) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) List(java.util.List) ICEBERG_WRITER_OPEN_ERROR(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_WRITER_OPEN_ERROR) Optional(java.util.Optional) IntStream(java.util.stream.IntStream) Type(io.trino.spi.type.Type) Callable(java.util.concurrent.Callable) ICEBERG_WRITE_VALIDATION_FAILED(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_WRITE_VALIDATION_FAILED) Supplier(java.util.function.Supplier) Inject(javax.inject.Inject) Managed(org.weakref.jmx.Managed) IcebergSessionProperties.getOrcWriterMaxDictionaryMemory(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterMaxDictionaryMemory) NodeVersion(io.trino.plugin.hive.NodeVersion) OrcReaderOptions(io.trino.orc.OrcReaderOptions) Objects.requireNonNull(java.util.Objects.requireNonNull) TypeConverter.toOrcType(io.trino.plugin.iceberg.TypeConverter.toOrcType) ParquetWriterOptions(io.trino.parquet.writer.ParquetWriterOptions) IcebergSessionProperties.getOrcWriterMinStripeSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterMinStripeSize) OrcWriterConfig(io.trino.plugin.hive.orc.OrcWriterConfig) HdfsOrcDataSource(io.trino.plugin.hive.orc.HdfsOrcDataSource) IcebergSessionProperties.getOrcWriterValidateMode(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcWriterValidateMode) MetricsConfig(org.apache.iceberg.MetricsConfig) IcebergSessionProperties.getParquetWriterBlockSize(io.trino.plugin.iceberg.IcebergSessionProperties.getParquetWriterBlockSize) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession)
IcebergSessionProperties.getOrcStringStatisticsLimit(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcStringStatisticsLimit) JobConf(org.apache.hadoop.mapred.JobConf) OrcWriterOptions(io.trino.orc.OrcWriterOptions) PRESTO_VERSION_NAME(io.trino.plugin.hive.HiveMetadata.PRESTO_VERSION_NAME) TypeManager(io.trino.spi.type.TypeManager)
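
Note that the validation input above is an Optional<Supplier<OrcDataSource>> rather than an open data source: nothing is read back at write time, and the file is only re-opened if ORC writer validation is enabled for the session. A hedged sketch of how such a factory might be consumed (the validate call is a placeholder for whatever verification the ORC writer performs, not an actual Trino API):

validationInputFactory.ifPresent(factory -> {
    // Hypothetical consumer: lazily re-open the just-written file and verify it.
    try (OrcDataSource source = factory.get()) {
        validate(source); // placeholder for the writer's validation pass
    }
    catch (IOException e) {
        throw new TrinoException(ICEBERG_WRITE_VALIDATION_FAILED, e);
    }
});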

Example 2 with HdfsOrcDataSource

Use of io.trino.plugin.hive.orc.HdfsOrcDataSource in project trino by trinodb.

From the class IcebergPageSourceProvider, method createOrcPageSource:

private static ReaderPageSource createOrcPageSource(
        HdfsEnvironment hdfsEnvironment,
        ConnectorIdentity identity,
        Configuration configuration,
        Path path,
        long start,
        long length,
        long fileSize,
        List<IcebergColumnHandle> columns,
        TupleDomain<IcebergColumnHandle> effectivePredicate,
        OrcReaderOptions options,
        FileFormatDataSourceStats stats,
        TypeManager typeManager,
        Optional<NameMapping> nameMapping) {
    OrcDataSource orcDataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSize, options, inputStream, stats);
        OrcReader reader = OrcReader.createOrcReader(orcDataSource, options).orElseThrow(() -> new TrinoException(ICEBERG_BAD_DATA, "ORC file is zero length"));
        List<OrcColumn> fileColumns = reader.getRootColumn().getNestedColumns();
        if (nameMapping.isPresent() && !hasIds(reader.getRootColumn())) {
            fileColumns = fileColumns.stream().map(orcColumn -> setMissingFieldIds(orcColumn, nameMapping.get(), ImmutableList.of(orcColumn.getColumnName()))).collect(toImmutableList());
        }
        Map<Integer, OrcColumn> fileColumnsByIcebergId = mapIdsToOrcFileColumns(fileColumns);
        TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder().setBloomFiltersEnabled(options.isBloomFiltersEnabled());
        Map<IcebergColumnHandle, Domain> effectivePredicateDomains = effectivePredicate.getDomains().orElseThrow(() -> new IllegalArgumentException("Effective predicate is none"));
        Optional<ReaderColumns> columnProjections = projectColumns(columns);
        Map<Integer, List<List<Integer>>> projectionsByFieldId = columns.stream().collect(groupingBy(column -> column.getBaseColumnIdentity().getId(), mapping(IcebergColumnHandle::getPath, toUnmodifiableList())));
        List<IcebergColumnHandle> readColumns = columnProjections.map(readerColumns -> (List<IcebergColumnHandle>) readerColumns.get().stream().map(IcebergColumnHandle.class::cast).collect(toImmutableList())).orElse(columns);
        List<OrcColumn> fileReadColumns = new ArrayList<>(readColumns.size());
        List<Type> fileReadTypes = new ArrayList<>(readColumns.size());
        List<ProjectedLayout> projectedLayouts = new ArrayList<>(readColumns.size());
        List<ColumnAdaptation> columnAdaptations = new ArrayList<>(readColumns.size());
        for (IcebergColumnHandle column : readColumns) {
            verify(column.isBaseColumn(), "Column projections must be based on a root column");
            OrcColumn orcColumn = fileColumnsByIcebergId.get(column.getId());
            if (orcColumn != null) {
                Type readType = getOrcReadType(column.getType(), typeManager);
                if (column.getType() == UUID && !"UUID".equals(orcColumn.getAttributes().get(ICEBERG_BINARY_TYPE))) {
                    throw new TrinoException(ICEBERG_BAD_DATA, format("Expected ORC column for UUID data to be annotated with %s=UUID: %s", ICEBERG_BINARY_TYPE, orcColumn));
                }
                List<List<Integer>> fieldIdProjections = projectionsByFieldId.get(column.getId());
                ProjectedLayout projectedLayout = IcebergOrcProjectedLayout.createProjectedLayout(orcColumn, fieldIdProjections);
                int sourceIndex = fileReadColumns.size();
                columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex));
                fileReadColumns.add(orcColumn);
                fileReadTypes.add(readType);
                projectedLayouts.add(projectedLayout);
                for (Map.Entry<IcebergColumnHandle, Domain> domainEntry : effectivePredicateDomains.entrySet()) {
                    IcebergColumnHandle predicateColumn = domainEntry.getKey();
                    OrcColumn predicateOrcColumn = fileColumnsByIcebergId.get(predicateColumn.getId());
                    if (predicateOrcColumn != null && column.getColumnIdentity().equals(predicateColumn.getBaseColumnIdentity())) {
                        predicateBuilder.addColumn(predicateOrcColumn.getColumnId(), domainEntry.getValue());
                    }
                }
            } else {
                columnAdaptations.add(ColumnAdaptation.nullColumn(column.getType()));
            }
        }
        AggregatedMemoryContext memoryUsage = newSimpleAggregatedMemoryContext();
        OrcDataSourceId orcDataSourceId = orcDataSource.getId();
        OrcRecordReader recordReader = reader.createRecordReader(
                fileReadColumns,
                fileReadTypes,
                projectedLayouts,
                predicateBuilder.build(),
                start,
                length,
                UTC,
                memoryUsage,
                INITIAL_BATCH_SIZE,
                exception -> handleException(orcDataSourceId, exception),
                new IdBasedFieldMapperFactory(readColumns));
        return new ReaderPageSource(new OrcPageSource(recordReader, columnAdaptations, orcDataSource, Optional.empty(), Optional.empty(), memoryUsage, stats), columnProjections);
    } catch (Exception e) {
        if (orcDataSource != null) {
            try {
                orcDataSource.close();
            } catch (IOException ignored) {
            }
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof BlockMissingException) {
            throw new TrinoException(ICEBERG_MISSING_DATA, message, e);
        }
        throw new TrinoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used: FileSystem(org.apache.hadoop.fs.FileSystem) Maps.uniqueIndex(com.google.common.collect.Maps.uniqueIndex) ORC_ICEBERG_ID_KEY(io.trino.plugin.iceberg.TypeConverter.ORC_ICEBERG_ID_KEY) ColumnAdaptation(io.trino.plugin.hive.orc.OrcPageSource.ColumnAdaptation) FileStatus(org.apache.hadoop.fs.FileStatus) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) NOT_SUPPORTED(io.trino.spi.StandardErrorCode.NOT_SUPPORTED) ConnectorTableHandle(io.trino.spi.connector.ConnectorTableHandle) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) ICEBERG_CANNOT_OPEN_SPLIT(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CANNOT_OPEN_SPLIT) UUID(io.trino.spi.type.UuidType.UUID) ENGLISH(java.util.Locale.ENGLISH) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ConnectorIdentity(io.trino.spi.security.ConnectorIdentity) ICEBERG_FILESYSTEM_ERROR(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_FILESYSTEM_ERROR) Domain(io.trino.spi.predicate.Domain) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ReaderColumns(io.trino.plugin.hive.ReaderColumns) Set(java.util.Set) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) OrcReaderConfig(io.trino.plugin.hive.orc.OrcReaderConfig) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnIO(org.apache.parquet.io.ColumnIO) IcebergSessionProperties.getOrcTinyStripeThreshold(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcTinyStripeThreshold) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) MappedField(org.apache.iceberg.mapping.MappedField) Collectors.groupingBy(java.util.stream.Collectors.groupingBy) IcebergSessionProperties.isOrcNestedLazy(io.trino.plugin.iceberg.IcebergSessionProperties.isOrcNestedLazy) IcebergSessionProperties.getOrcMaxBufferSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxBufferSize) HdfsParquetDataSource(io.trino.plugin.hive.parquet.HdfsParquetDataSource) ArrayList(java.util.ArrayList) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) TupleDomainOrcPredicateBuilder(io.trino.orc.TupleDomainOrcPredicate.TupleDomainOrcPredicateBuilder) OrcPageSource(io.trino.plugin.hive.orc.OrcPageSource) IcebergSessionProperties.getOrcMaxMergeDistance(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxMergeDistance) ICEBERG_MISSING_DATA(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_MISSING_DATA) AggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext) ColumnHandle(io.trino.spi.connector.ColumnHandle) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) VARBINARY(io.trino.spi.type.VarbinaryType.VARBINARY) MappedFields(org.apache.iceberg.mapping.MappedFields) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) OrcType(io.trino.orc.metadata.OrcType) Predicate(io.trino.parquet.predicate.Predicate) IcebergSessionProperties.isUseFileSizeFromMetadata(io.trino.plugin.iceberg.IcebergSessionProperties.isUseFileSizeFromMetadata) MapType(io.trino.spi.type.MapType)
PredicateUtils.predicateMatches(io.trino.parquet.predicate.PredicateUtils.predicateMatches) ConnectorSplit(io.trino.spi.connector.ConnectorSplit) StandardTypes(io.trino.spi.type.StandardTypes) NameMappingParser(org.apache.iceberg.mapping.NameMappingParser) IcebergSessionProperties.getOrcLazyReadSmallRanges(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcLazyReadSmallRanges) IcebergSessionProperties.getParquetMaxReadBlockSize(io.trino.plugin.iceberg.IcebergSessionProperties.getParquetMaxReadBlockSize) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) UTC(org.joda.time.DateTimeZone.UTC) Field(io.trino.parquet.Field) Traverser(com.google.common.graph.Traverser) ParquetPageSource(io.trino.plugin.hive.parquet.ParquetPageSource) ProjectedLayout(io.trino.orc.OrcReader.ProjectedLayout) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) IcebergSessionProperties.getOrcStreamBufferSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcStreamBufferSize) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ParquetSchemaUtil(org.apache.iceberg.parquet.ParquetSchemaUtil) OrcColumn(io.trino.orc.OrcColumn) PredicateUtils.buildPredicate(io.trino.parquet.predicate.PredicateUtils.buildPredicate) MetadataReader(io.trino.parquet.reader.MetadataReader) ICEBERG_DOMAIN_COMPACTION_THRESHOLD(io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD) OrcRecordReader(io.trino.orc.OrcRecordReader) NameMapping(org.apache.iceberg.mapping.NameMapping) Path(org.apache.hadoop.fs.Path) OrcDataSource(io.trino.orc.OrcDataSource) ReaderProjectionsAdapter(io.trino.plugin.hive.ReaderProjectionsAdapter) RowType(io.trino.spi.type.RowType) ICEBERG_BAD_DATA(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_BAD_DATA) ImmutableMap(com.google.common.collect.ImmutableMap) INITIAL_BATCH_SIZE(io.trino.orc.OrcReader.INITIAL_BATCH_SIZE) ParquetReader(io.trino.parquet.reader.ParquetReader) FieldContext(io.trino.plugin.iceberg.IcebergParquetColumnIOConverter.FieldContext) ICEBERG_CURSOR_ERROR(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CURSOR_ERROR) TrinoException(io.trino.spi.TrinoException) ArrayType(io.trino.spi.type.ArrayType) ParquetTypeUtils.getColumnIO(io.trino.parquet.ParquetTypeUtils.getColumnIO) Collectors(java.util.stream.Collectors) String.format(java.lang.String.format) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) OrcDataSourceId(io.trino.orc.OrcDataSourceId) MessageType(org.apache.parquet.schema.MessageType) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) List(java.util.List) DynamicFilter(io.trino.spi.connector.DynamicFilter) Optional(java.util.Optional) IcebergSessionProperties.getOrcMaxReadBlockSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxReadBlockSize) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) Type(io.trino.spi.type.Type) HashMap(java.util.HashMap) TupleDomainOrcPredicate(io.trino.orc.TupleDomainOrcPredicate) Function(java.util.function.Function) Inject(javax.inject.Inject) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) OrcReaderOptions(io.trino.orc.OrcReaderOptions) Objects.requireNonNull(java.util.Objects.requireNonNull) Collectors.mapping(java.util.stream.Collectors.mapping) 
IcebergSessionProperties.isOrcBloomFiltersEnabled(io.trino.plugin.iceberg.IcebergSessionProperties.isOrcBloomFiltersEnabled) HdfsOrcDataSource(io.trino.plugin.hive.orc.HdfsOrcDataSource) ParquetReaderOptions(io.trino.parquet.ParquetReaderOptions) OrcReader(io.trino.orc.OrcReader) ConnectorPageSourceProvider(io.trino.spi.connector.ConnectorPageSourceProvider) ICEBERG_BINARY_TYPE(io.trino.plugin.iceberg.TypeConverter.ICEBERG_BINARY_TYPE) TupleDomain(io.trino.spi.predicate.TupleDomain) OrcReader.fullyProjectedLayout(io.trino.orc.OrcReader.fullyProjectedLayout) OrcCorruptionException(io.trino.orc.OrcCorruptionException) Collectors.toList(java.util.stream.Collectors.toList) ParquetTypeUtils.getDescriptors(io.trino.parquet.ParquetTypeUtils.getDescriptors) ParquetDataSource(io.trino.parquet.ParquetDataSource) TypeManager(io.trino.spi.type.TypeManager) ConnectorTransactionHandle(io.trino.spi.connector.ConnectorTransactionHandle)
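
Stripped of the projection and predicate bookkeeping, the core read path in this example is short: wrap the HDFS input stream in an HdfsOrcDataSource, then ask OrcReader for a reader over it. A minimal sketch under the same assumptions as the example (fileSystem, path, fileSize, options, and stats in scope; error handling elided, where the example closes the data source and remaps exceptions):

// Minimal ORC open path, distilled from createOrcPageSource above.
OrcDataSource dataSource = new HdfsOrcDataSource(
        new OrcDataSourceId(path.toString()),
        fileSize,
        options,
        fileSystem.open(path), // the example wraps this call in hdfsEnvironment.doAs(identity, ...)
        stats);
OrcReader reader = OrcReader.createOrcReader(dataSource, options)
        .orElseThrow(() -> new TrinoException(ICEBERG_BAD_DATA, "ORC file is zero length"));
List<OrcColumn> fileColumns = reader.getRootColumn().getNestedColumns();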

Example 3 with HdfsOrcDataSource

Use of io.trino.plugin.hive.orc.HdfsOrcDataSource in project trino by trinodb.

From the class SortingFileWriter, method mergeFiles:

private void mergeFiles(Iterable<TempFile> files, Consumer<Page> consumer) {
    try (Closer closer = Closer.create()) {
        Collection<Iterator<Page>> iterators = new ArrayList<>();
        for (TempFile tempFile : files) {
            Path file = tempFile.getPath();
            OrcDataSource dataSource = new HdfsOrcDataSource(new OrcDataSourceId(file.toString()), fileSystem.getFileStatus(file).getLen(), new OrcReaderOptions(), fileSystem.open(file), new FileFormatDataSourceStats());
            closer.register(dataSource);
            iterators.add(new TempFileReader(types, dataSource));
        }
        new MergingPageIterator(iterators, types, sortFields, sortOrders, typeOperators).forEachRemaining(consumer);
        for (TempFile tempFile : files) {
            Path file = tempFile.getPath();
            if (!fileSystem.delete(file, false)) {
                throw new IOException("Failed to delete temporary file: " + file);
            }
        }
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
}
Also used: Closer(com.google.common.io.Closer) Path(org.apache.hadoop.fs.Path) OrcDataSource(io.trino.orc.OrcDataSource) HdfsOrcDataSource(io.trino.plugin.hive.orc.HdfsOrcDataSource) MergingPageIterator(io.trino.plugin.hive.util.MergingPageIterator) OrcDataSourceId(io.trino.orc.OrcDataSourceId) ArrayList(java.util.ArrayList) UncheckedIOException(java.io.UncheckedIOException) IOException(java.io.IOException) OrcReaderOptions(io.trino.orc.OrcReaderOptions) Iterator(java.util.Iterator) TempFileReader(io.trino.plugin.hive.util.TempFileReader)
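
The detail worth copying from this example is the resource handling: each HdfsOrcDataSource is registered with a Closer the moment it is created, so a failure while opening a later temp file, or during the merge itself, still closes every source already open. The idiom in isolation (types, fileSystem, and file assumed to be in scope as above):

try (Closer closer = Closer.create()) {
    OrcDataSource dataSource = new HdfsOrcDataSource(
            new OrcDataSourceId(file.toString()),
            fileSystem.getFileStatus(file).getLen(),
            new OrcReaderOptions(),
            fileSystem.open(file),
            new FileFormatDataSourceStats());
    // Register immediately: Closer.close() runs even if later work throws.
    closer.register(dataSource);
    // ... read pages from the data source here ...
}
catch (IOException e) {
    throw new UncheckedIOException(e);
}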

Aggregations

OrcDataSource (io.trino.orc.OrcDataSource)3 OrcDataSourceId (io.trino.orc.OrcDataSourceId)3 OrcReaderOptions (io.trino.orc.OrcReaderOptions)3 HdfsOrcDataSource (io.trino.plugin.hive.orc.HdfsOrcDataSource)3 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)2 ImmutableMap (com.google.common.collect.ImmutableMap)2 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 Path (org.apache.hadoop.fs.Path)2 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)1 Preconditions.checkState (com.google.common.base.Preconditions.checkState)1 Verify.verify (com.google.common.base.Verify.verify)1 ImmutableList (com.google.common.collect.ImmutableList)1 ImmutableMap.toImmutableMap (com.google.common.collect.ImmutableMap.toImmutableMap)1 ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet)1 Maps.uniqueIndex (com.google.common.collect.Maps.uniqueIndex)1 Traverser (com.google.common.graph.Traverser)1 Closer (com.google.common.io.Closer)1 AggregatedMemoryContext (io.trino.memory.context.AggregatedMemoryContext)1 AggregatedMemoryContext.newSimpleAggregatedMemoryContext (io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext)1
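
As the aggregation counts show, all three examples construct the data source the same way: an OrcDataSourceId naming the file, the file length, an OrcReaderOptions, the opened HDFS input stream, and a FileFormatDataSourceStats sink. The shared shape, with illustrative variable names (path, fileSystem, and stats are placeholders, not names from the Trino sources):

OrcDataSource source = new HdfsOrcDataSource(
        new OrcDataSourceId(path.toString()),    // identifies the file in error messages
        fileSystem.getFileStatus(path).getLen(), // estimated length of the file
        new OrcReaderOptions(),                  // read tuning: buffer sizes, bloom filters, ...
        fileSystem.open(path),                   // the underlying FSDataInputStream
        stats);                                  // collects read statistics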