Example 6 with OrcFileTailSource

Use of com.facebook.presto.orc.cache.OrcFileTailSource in project presto by prestodb.

The class IcebergPageSourceProvider, method createBatchOrcPageSource:

private static ConnectorPageSource createBatchOrcPageSource(
        HdfsEnvironment hdfsEnvironment,
        String user,
        Configuration configuration,
        Path path,
        long start,
        long length,
        boolean isCacheable,
        List<IcebergColumnHandle> regularColumns,
        TypeManager typeManager,
        TupleDomain<IcebergColumnHandle> effectivePredicate,
        OrcReaderOptions options,
        OrcEncoding orcEncoding,
        DataSize maxBufferSize,
        DataSize streamBufferSize,
        boolean lazyReadSmallRanges,
        boolean orcBloomFiltersEnabled,
        int domainCompactionThreshold,
        OrcFileTailSource orcFileTailSource,
        StripeMetadataSourceFactory stripeMetadataSourceFactory,
        FileFormatDataSourceStats stats,
        Optional<EncryptionInformation> encryptionInformation,
        DwrfEncryptionProvider dwrfEncryptionProvider)
{
    OrcDataSource orcDataSource = null;
    try {
        ExtendedFileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path, configuration);
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        long fileSize = fileStatus.getLen();
        long modificationTime = fileStatus.getModificationTime();
        HiveFileContext hiveFileContext = new HiveFileContext(true, NO_CACHE_CONSTRAINTS, Optional.empty(), Optional.of(fileSize), modificationTime, false);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(user, () -> fileSystem.openFile(path, hiveFileContext));
        orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSize, options.getMaxMergeDistance(), maxBufferSize, streamBufferSize, lazyReadSmallRanges, inputStream, stats);
        // Todo: pass real columns to ProjectionBasedDwrfKeyProvider instead of ImmutableList.of()
        DwrfKeyProvider dwrfKeyProvider = new ProjectionBasedDwrfKeyProvider(encryptionInformation, ImmutableList.of(), true, path);
        RuntimeStats runtimeStats = new RuntimeStats();
        OrcReader reader = new OrcReader(orcDataSource, orcEncoding, orcFileTailSource, stripeMetadataSourceFactory, new HiveOrcAggregatedMemoryContext(), options, isCacheable, dwrfEncryptionProvider, dwrfKeyProvider, runtimeStats);
        List<HiveColumnHandle> physicalColumnHandles = new ArrayList<>(regularColumns.size());
        ImmutableMap.Builder<Integer, Type> includedColumns = ImmutableMap.builder();
        ImmutableList.Builder<TupleDomainOrcPredicate.ColumnReference<HiveColumnHandle>> columnReferences = ImmutableList.builder();
        List<IcebergOrcColumn> fileOrcColumns = getFileOrcColumns(reader);
        Map<Integer, IcebergOrcColumn> fileOrcColumnByIcebergId = fileOrcColumns.stream()
                .filter(orcColumn -> orcColumn.getAttributes().containsKey(ORC_ICEBERG_ID_KEY))
                .collect(toImmutableMap(
                        orcColumn -> Integer.parseInt(orcColumn.getAttributes().get(ORC_ICEBERG_ID_KEY)),
                        orcColumn -> IcebergOrcColumn.copy(orcColumn)
                                .setIcebergColumnId(Optional.of(Integer.parseInt(orcColumn.getAttributes().get(ORC_ICEBERG_ID_KEY))))));
        Map<String, IcebergOrcColumn> fileOrcColumnsByName = uniqueIndex(fileOrcColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
        int nextMissingColumnIndex = fileOrcColumnsByName.size();
        for (IcebergColumnHandle column : regularColumns) {
            IcebergOrcColumn icebergOrcColumn;
            boolean isExcludeColumn = false;
            if (fileOrcColumnByIcebergId.isEmpty()) {
                icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
            } else {
                icebergOrcColumn = fileOrcColumnByIcebergId.get(column.getId());
                if (icebergOrcColumn == null) {
                    // No ORC column found by Iceberg id in 'fileOrcColumnByIcebergId'; schema evolution may have occurred, so fall back to looking the column up by name.
                    icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
                    if (icebergOrcColumn != null) {
                        isExcludeColumn = true;
                    }
                }
            }
            if (icebergOrcColumn != null) {
                // Todo: use the ORC file's column name here
                HiveColumnHandle columnHandle = new HiveColumnHandle(column.getName(), toHiveType(column.getType()), column.getType().getTypeSignature(), icebergOrcColumn.getOrcColumnId(), icebergOrcColumn.getColumnType(), Optional.empty(), Optional.empty());
                physicalColumnHandles.add(columnHandle);
                // Skip SchemaEvolution column
                if (!isExcludeColumn) {
                    includedColumns.put(columnHandle.getHiveColumnIndex(), typeManager.getType(columnHandle.getTypeSignature()));
                    columnReferences.add(new TupleDomainOrcPredicate.ColumnReference<>(columnHandle, columnHandle.getHiveColumnIndex(), typeManager.getType(columnHandle.getTypeSignature())));
                }
            } else {
                physicalColumnHandles.add(new HiveColumnHandle(column.getName(), toHiveType(column.getType()), column.getType().getTypeSignature(), nextMissingColumnIndex++, REGULAR, Optional.empty(), Optional.empty()));
            }
        }
        TupleDomain<HiveColumnHandle> hiveColumnHandleTupleDomain = effectivePredicate.transform(column -> {
            IcebergOrcColumn icebergOrcColumn;
            if (fileOrcColumnByIcebergId.isEmpty()) {
                icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
            } else {
                icebergOrcColumn = fileOrcColumnByIcebergId.get(column.getId());
                if (icebergOrcColumn == null) {
                // No ORC column found by Iceberg id in 'fileOrcColumnByIcebergId'; schema evolution may have occurred, so fall back to looking the column up by name.
                    icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
                }
            }
            // Note: HiveColumnHandle.hiveColumnIndex starts from 0, while IcebergColumnHandle.id starts from 1
            return new HiveColumnHandle(column.getName(), toHiveType(column.getType()), column.getType().getTypeSignature(), icebergOrcColumn != null ? icebergOrcColumn.getOrcColumnId() : column.getId() - 1, icebergOrcColumn != null ? icebergOrcColumn.getColumnType() : REGULAR, Optional.empty(), Optional.empty());
        });
        OrcPredicate predicate = new TupleDomainOrcPredicate<>(hiveColumnHandleTupleDomain, columnReferences.build(), orcBloomFiltersEnabled, Optional.of(domainCompactionThreshold));
        OrcAggregatedMemoryContext systemMemoryUsage = new HiveOrcAggregatedMemoryContext();
        OrcBatchRecordReader recordReader = reader.createBatchRecordReader(includedColumns.build(), predicate, start, length, UTC, systemMemoryUsage, INITIAL_BATCH_SIZE);
        return new OrcBatchPageSource(recordReader, orcDataSource, physicalColumnHandles, typeManager, systemMemoryUsage, stats, runtimeStats);
    } catch (Exception e) {
        if (orcDataSource != null) {
            try {
                orcDataSource.close();
            } catch (IOException ignored) {
            }
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof BlockMissingException) {
            throw new PrestoException(ICEBERG_MISSING_DATA, message, e);
        }
        throw new PrestoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
    }
}
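
The id-then-name fallback above is the core of this example: columns are resolved through the iceberg.id attribute (ORC_ICEBERG_ID_KEY) written into the ORC metadata, and only when that lookup fails (for instance, when the file predates a schema change) does the code fall back to a case-insensitive name match, excluding such columns from predicate pushdown. Below is a minimal standalone sketch of that resolution rule, with plain String stand-ins for IcebergOrcColumn; all names in it are illustrative, not Presto API.

import java.util.Locale;
import java.util.Map;
import java.util.Optional;

public final class OrcColumnResolution
{
    private OrcColumnResolution() {}

    // Resolve an Iceberg column to an ORC column: prefer the iceberg.id
    // attribute mapping; fall back to a lower-cased name match when the
    // file was written before the column id existed (schema evolution).
    public static Optional<String> resolveOrcColumn(
            Map<Integer, String> orcColumnsByIcebergId,
            Map<String, String> orcColumnsByLowerCaseName,
            int icebergColumnId,
            String icebergColumnName)
    {
        if (orcColumnsByIcebergId.isEmpty()) {
            // The file carries no iceberg.id attributes at all: name lookup only
            return Optional.ofNullable(orcColumnsByLowerCaseName.get(icebergColumnName.toLowerCase(Locale.ENGLISH)));
        }
        String byId = orcColumnsByIcebergId.get(icebergColumnId);
        if (byId != null) {
            return Optional.of(byId);
        }
        // Id miss: schema evolution may have occurred, so match by name;
        // callers should skip predicate pushdown for columns found this way
        return Optional.ofNullable(orcColumnsByLowerCaseName.get(icebergColumnName.toLowerCase(Locale.ENGLISH)));
    }
}
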
Also used : RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) HiveSessionProperties.isUseParquetColumnNames(com.facebook.presto.hive.HiveSessionProperties.isUseParquetColumnNames) Maps.uniqueIndex(com.google.common.collect.Maps.uniqueIndex) FileStatus(org.apache.hadoop.fs.FileStatus) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ConnectorTransactionHandle(com.facebook.presto.spi.connector.ConnectorTransactionHandle) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) OrcDataSource(com.facebook.presto.orc.OrcDataSource) FileFormatDataSourceStats(com.facebook.presto.hive.FileFormatDataSourceStats) ConnectorPageSourceProvider(com.facebook.presto.spi.connector.ConnectorPageSourceProvider) ENGLISH(java.util.Locale.ENGLISH) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ParquetDataSource(com.facebook.presto.parquet.ParquetDataSource) ORC_ICEBERG_ID_KEY(com.facebook.presto.iceberg.TypeConverter.ORC_ICEBERG_ID_KEY) IcebergSessionProperties.getOrcLazyReadSmallRanges(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcLazyReadSmallRanges) ExtendedFileSystem(com.facebook.presto.hive.filesystem.ExtendedFileSystem) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HiveFileContext(com.facebook.presto.hive.HiveFileContext) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) ConnectorSession(com.facebook.presto.spi.ConnectorSession) ORC(com.facebook.presto.orc.OrcEncoding.ORC) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ICEBERG_BAD_DATA(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_BAD_DATA) ParquetPageSource(com.facebook.presto.hive.parquet.ParquetPageSource) HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) MetadataReader(com.facebook.presto.parquet.cache.MetadataReader) StandardTypes(com.facebook.presto.common.type.StandardTypes) REGULAR(com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR) DwrfKeyProvider(com.facebook.presto.orc.DwrfKeyProvider) TypeConverter.toHiveType(com.facebook.presto.iceberg.TypeConverter.toHiveType) OrcReaderOptions(com.facebook.presto.orc.OrcReaderOptions) IcebergSessionProperties.getOrcMaxReadBlockSize(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcMaxReadBlockSize) ArrayList(java.util.ArrayList) IcebergSessionProperties.getOrcTinyStripeThreshold(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcTinyStripeThreshold) ROOT_COLUMN_ID(com.facebook.presto.iceberg.IcebergOrcColumn.ROOT_COLUMN_ID) ICEBERG_MISSING_DATA(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_MISSING_DATA) DwrfEncryptionProvider(com.facebook.presto.orc.DwrfEncryptionProvider) OrcDataSourceId(com.facebook.presto.orc.OrcDataSourceId) IOException(java.io.IOException) UTC(org.joda.time.DateTimeZone.UTC) FileFormat(org.apache.iceberg.FileFormat) Domain(com.facebook.presto.common.predicate.Domain) ParquetReader(com.facebook.presto.parquet.reader.ParquetReader) ConnectorSplit(com.facebook.presto.spi.ConnectorSplit) HiveSessionProperties.getParquetMaxReadBlockSize(com.facebook.presto.hive.HiveSessionProperties.getParquetMaxReadBlockSize) ColumnHandle(com.facebook.presto.spi.ColumnHandle) 
IcebergSessionProperties.isOrcZstdJniDecompressionEnabled(com.facebook.presto.iceberg.IcebergSessionProperties.isOrcZstdJniDecompressionEnabled) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) OrcReader(com.facebook.presto.orc.OrcReader) ColumnIOConverter.constructField(org.apache.parquet.io.ColumnIOConverter.constructField) HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) HdfsOrcDataSource(com.facebook.presto.hive.orc.HdfsOrcDataSource) TupleDomainOrcPredicate(com.facebook.presto.orc.TupleDomainOrcPredicate) NO_CACHE_CONSTRAINTS(com.facebook.presto.hive.CacheQuota.NO_CACHE_CONSTRAINTS) IcebergSessionProperties.getOrcMaxBufferSize(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcMaxBufferSize) OrcBatchPageSource(com.facebook.presto.hive.orc.OrcBatchPageSource) SchemaTableName(com.facebook.presto.spi.SchemaTableName) SplitContext(com.facebook.presto.spi.SplitContext) ParquetTypeUtils.getDescriptors(com.facebook.presto.parquet.ParquetTypeUtils.getDescriptors) Path(org.apache.hadoop.fs.Path) EncryptionInformation(com.facebook.presto.hive.EncryptionInformation) RuntimeStats(com.facebook.presto.common.RuntimeStats) HdfsContext(com.facebook.presto.hive.HdfsContext) ProjectionBasedDwrfKeyProvider(com.facebook.presto.hive.orc.ProjectionBasedDwrfKeyProvider) HiveSessionProperties.isParquetBatchReadsEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReadsEnabled) HiveClientConfig(com.facebook.presto.hive.HiveClientConfig) StripeMetadataSourceFactory(com.facebook.presto.orc.StripeMetadataSourceFactory) ImmutableMap(com.google.common.collect.ImmutableMap) INITIAL_BATCH_SIZE(com.facebook.presto.orc.OrcReader.INITIAL_BATCH_SIZE) OrcPredicate(com.facebook.presto.orc.OrcPredicate) HiveDwrfEncryptionProvider(com.facebook.presto.hive.HiveDwrfEncryptionProvider) String.format(java.lang.String.format) IcebergSessionProperties.isOrcBloomFiltersEnabled(com.facebook.presto.iceberg.IcebergSessionProperties.isOrcBloomFiltersEnabled) ColumnIndexFilterUtils(com.facebook.presto.parquet.reader.ColumnIndexFilterUtils) Objects(java.util.Objects) MessageType(org.apache.parquet.schema.MessageType) DataSize(io.airlift.units.DataSize) List(java.util.List) HiveSessionProperties.isParquetBatchReaderVerificationEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReaderVerificationEnabled) NOT_SUPPORTED(com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED) HiveOrcAggregatedMemoryContext(com.facebook.presto.hive.HiveOrcAggregatedMemoryContext) Optional(java.util.Optional) HiveColumnHandle(com.facebook.presto.hive.HiveColumnHandle) OrcBatchRecordReader(com.facebook.presto.orc.OrcBatchRecordReader) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) IntStream(java.util.stream.IntStream) ConnectorTableLayoutHandle(com.facebook.presto.spi.ConnectorTableLayoutHandle) PredicateUtils.predicateMatches(com.facebook.presto.parquet.predicate.PredicateUtils.predicateMatches) PrestoException(com.facebook.presto.spi.PrestoException) Function(java.util.function.Function) Inject(javax.inject.Inject) ParquetTypeUtils.getParquetTypeByName(com.facebook.presto.parquet.ParquetTypeUtils.getParquetTypeByName) ImmutableList(com.google.common.collect.ImmutableList) ICEBERG_CANNOT_OPEN_SPLIT(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_CANNOT_OPEN_SPLIT) TypeManager(com.facebook.presto.common.type.TypeManager) Objects.requireNonNull(java.util.Objects.requireNonNull) 
Predicate(com.facebook.presto.parquet.predicate.Predicate) OrcType(com.facebook.presto.orc.metadata.OrcType) OrcFileTailSource(com.facebook.presto.orc.cache.OrcFileTailSource) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) PredicateUtils.buildPredicate(com.facebook.presto.parquet.predicate.PredicateUtils.buildPredicate) Type(com.facebook.presto.common.type.Type) IcebergSessionProperties.getOrcMaxMergeDistance(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcMaxMergeDistance) OrcAggregatedMemoryContext(com.facebook.presto.orc.OrcAggregatedMemoryContext) OrcEncoding(com.facebook.presto.orc.OrcEncoding) ParquetTypeUtils.getColumnIO(com.facebook.presto.parquet.ParquetTypeUtils.getColumnIO) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) AggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext) Field(com.facebook.presto.parquet.Field) Collectors.toList(java.util.stream.Collectors.toList) ConnectorPageSource(com.facebook.presto.spi.ConnectorPageSource) IcebergSessionProperties.getOrcStreamBufferSize(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcStreamBufferSize)
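
For context, the orcFileTailSource argument threaded through this method is typically a StorageOrcFileTailSource, optionally wrapped in a CachingOrcFileTailSource keyed by OrcDataSourceId (both classes appear in the Aggregations list below). The following is a rough sketch of that wiring; the cache bounds are illustrative assumptions rather than Presto defaults, and the CachingOrcFileTailSource constructor argument order is assumed.

import com.facebook.presto.orc.OrcDataSourceId;
import com.facebook.presto.orc.cache.CachingOrcFileTailSource;
import com.facebook.presto.orc.cache.OrcFileTailSource;
import com.facebook.presto.orc.cache.StorageOrcFileTailSource;
import com.facebook.presto.orc.metadata.OrcFileTail;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

import java.util.concurrent.TimeUnit;

public final class OrcFileTailSources
{
    private OrcFileTailSources() {}

    // Wrap the storage-backed source in a Guava cache so repeated splits
    // over the same file skip re-reading and re-parsing the file tail.
    // The maximumSize and expiry below are assumed values for illustration.
    public static OrcFileTailSource createCachingFileTailSource()
    {
        Cache<OrcDataSourceId, OrcFileTail> cache = CacheBuilder.newBuilder()
                .maximumSize(1_000)
                .expireAfterAccess(10, TimeUnit.MINUTES)
                .recordStats()
                .build();
        return new CachingOrcFileTailSource(new StorageOrcFileTailSource(), cache);
    }
}
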

Example 7 with OrcFileTailSource

Use of com.facebook.presto.orc.cache.OrcFileTailSource in project presto by prestodb.

The class OrcFileRewriter, method rewrite:

public OrcFileInfo rewrite(FileSystem fileSystem, Map<String, Type> allColumnTypes, Path input, Path output, BitSet rowsToDelete) throws IOException {
    try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(FileSystem.class.getClassLoader());
        OrcDataSource dataSource = orcDataEnvironment.createOrcDataSource(fileSystem, input, readerAttributes)) {
        OrcReader reader = new OrcReader(dataSource, ORC, orcFileTailSource, stripeMetadataSourceFactory, new RaptorOrcAggregatedMemoryContext(), new OrcReaderOptions(readerAttributes.getMaxMergeDistance(), readerAttributes.getTinyStripeThreshold(), HUGE_MAX_READ_BLOCK_SIZE, readerAttributes.isZstdJniDecompressionEnabled()), false, NO_ENCRYPTION, DwrfKeyProvider.EMPTY, new RuntimeStats());
        if (reader.getFooter().getNumberOfRows() < rowsToDelete.length()) {
            throw new IOException("File has fewer rows than deletion vector");
        }
        int deleteRowCount = rowsToDelete.cardinality();
        if (reader.getFooter().getNumberOfRows() == deleteRowCount) {
            return new OrcFileInfo(0, 0);
        }
        if (reader.getFooter().getNumberOfRows() >= Integer.MAX_VALUE) {
            throw new IOException("File has too many rows");
        }
        int inputRowCount = toIntExact(reader.getFooter().getNumberOfRows());
        Map<String, Integer> currentColumnIds = IntStream.range(0, reader.getColumnNames().size()).boxed().collect(toMap(reader.getColumnNames()::get, i -> i));
        ImmutableList.Builder<Type> writerColumnTypesBuilder = ImmutableList.builder();
        ImmutableList.Builder<String> writerColumnIdsBuilder = ImmutableList.builder();
        ImmutableList.Builder<Integer> readerColumnIndexBuilder = ImmutableList.builder();
        // Build the writer columns, preserving numeric column-id order
        Map<String, Type> orderedAllColumnTypes = new TreeMap<>(Comparator.comparingLong(Long::parseLong));
        orderedAllColumnTypes.putAll(allColumnTypes);
        for (Map.Entry<String, Type> columnType : orderedAllColumnTypes.entrySet()) {
            // Keep only the intersection of the provided columns and the columns actually present in the file
            Integer currentColumnIndex = currentColumnIds.get(columnType.getKey());
            if (currentColumnIndex != null) {
                readerColumnIndexBuilder.add(currentColumnIndex);
                writerColumnTypesBuilder.add(columnType.getValue());
                writerColumnIdsBuilder.add(columnType.getKey());
            }
        }
        List<Type> writerColumnTypes = writerColumnTypesBuilder.build();
        List<String> writerColumnIds = writerColumnIdsBuilder.build();
        List<Integer> readerColumnIndex = readerColumnIndexBuilder.build();
        Map<Integer, Type> readerColumns = IntStream.range(0, readerColumnIndex.size()).boxed().collect(toMap(readerColumnIndex::get, writerColumnTypes::get));
        if (writerColumnTypes.isEmpty()) {
            // no intersection; directly return
            return new OrcFileInfo(0, 0);
        }
        StorageTypeConverter converter = new StorageTypeConverter(typeManager);
        List<Type> writerStorageTypes = writerColumnTypes.stream().map(converter::toStorageType).collect(toImmutableList());
        long start = System.nanoTime();
        Map<String, String> userMetadata = ImmutableMap.of();
        if (reader.getFooter().getUserMetadata().containsKey(OrcFileMetadata.KEY)) {
            // build metadata if the original file has it
            ImmutableMap.Builder<Long, TypeSignature> metadataBuilder = ImmutableMap.builder();
            for (int i = 0; i < writerColumnIds.size(); i++) {
                metadataBuilder.put(Long.parseLong(writerColumnIds.get(i)), writerColumnTypes.get(i).getTypeSignature());
            }
            userMetadata = ImmutableMap.of(OrcFileMetadata.KEY, METADATA_CODEC.toJson(new OrcFileMetadata(metadataBuilder.build())));
        }
        StorageTypeConverter storageTypeConverter = new StorageTypeConverter(typeManager);
        try (Closer<OrcBatchRecordReader, IOException> recordReader = closer(reader.createBatchRecordReader(storageTypeConverter.toStorageTypes(readerColumns), TRUE, DEFAULT_STORAGE_TIMEZONE, new RaptorOrcAggregatedMemoryContext(), INITIAL_BATCH_SIZE), OrcBatchRecordReader::close);
            Closer<OrcWriter, IOException> writer = closer(new OrcWriter(orcDataEnvironment.createOrcDataSink(fileSystem, output), writerColumnIds, writerStorageTypes, ORC, compression, Optional.empty(), NO_ENCRYPTION, getDefaultOrcWriterOptions(), userMetadata, DEFAULT_STORAGE_TIMEZONE, validate, HASHED, stats), OrcWriter::close)) {
            OrcFileInfo fileInfo = rewrite(recordReader.get(), writer.get(), rowsToDelete, writerColumnTypes, readerColumnIndexBuilder.build());
            log.debug("Rewrote file %s in %s (input rows: %s, output rows: %s)", input.getName(), nanosSince(start), inputRowCount, inputRowCount - deleteRowCount);
            return fileInfo;
        }
    } catch (NotSupportedException e) {
        throw new PrestoException(NOT_SUPPORTED, e.getMessage(), e);
    }
}
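
To make the calling convention concrete, here is a hypothetical caller sketch. Construction of the OrcFileRewriter itself is omitted (it requires a TypeManager, a compression kind, an OrcFileTailSource, and a storage environment), the package locations of OrcFileRewriter and OrcFileInfo are inferred from the Raptor imports listed below, and allColumnTypes is keyed by Raptor column ids rendered as decimal strings, matching the Long.parseLong ordering in the method.

import com.facebook.presto.common.type.Type;
import com.facebook.presto.raptor.storage.OrcFileInfo;
import com.facebook.presto.raptor.storage.OrcFileRewriter;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;
import java.util.BitSet;
import java.util.Map;

public final class OrcFileRewriterUsage
{
    private OrcFileRewriterUsage() {}

    // Sketch only: the rewriter instance and the column-type map are assumed
    // to be built elsewhere; this shows how the deletion vector is populated.
    public static OrcFileInfo deleteRows(
            OrcFileRewriter rewriter,
            FileSystem fileSystem,
            Map<String, Type> allColumnTypes,
            Path input,
            Path output)
            throws IOException
    {
        BitSet rowsToDelete = new BitSet();
        rowsToDelete.set(3);       // delete row 3
        rowsToDelete.set(10, 13);  // delete rows 10 through 12 (end index exclusive)
        return rewriter.rewrite(fileSystem, allColumnTypes, input, output, rowsToDelete);
    }
}
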
Also used : JsonCodec(com.facebook.airlift.json.JsonCodec) Page(com.facebook.presto.common.Page) NotSupportedException(com.facebook.presto.common.NotSupportedException) DEFAULT_STORAGE_TIMEZONE(com.facebook.presto.raptor.storage.OrcStorageManager.DEFAULT_STORAGE_TIMEZONE) FileSystem(org.apache.hadoop.fs.FileSystem) OrcWriter(com.facebook.presto.orc.OrcWriter) TypeSignature(com.facebook.presto.common.type.TypeSignature) Closer.closer(com.facebook.presto.raptor.util.Closer.closer) Collectors.toMap(java.util.stream.Collectors.toMap) HUGE_MAX_READ_BLOCK_SIZE(com.facebook.presto.raptor.storage.OrcStorageManager.HUGE_MAX_READ_BLOCK_SIZE) Duration.nanosSince(io.airlift.units.Duration.nanosSince) Map(java.util.Map) RaptorOrcAggregatedMemoryContext(com.facebook.presto.raptor.RaptorOrcAggregatedMemoryContext) Path(org.apache.hadoop.fs.Path) RuntimeStats(com.facebook.presto.common.RuntimeStats) OrcDataSource(com.facebook.presto.orc.OrcDataSource) StripeMetadataSourceFactory(com.facebook.presto.orc.StripeMetadataSourceFactory) WriterStats(com.facebook.presto.orc.WriterStats) ImmutableMap(com.google.common.collect.ImmutableMap) Closer(com.facebook.presto.raptor.util.Closer) INITIAL_BATCH_SIZE(com.facebook.presto.orc.OrcReader.INITIAL_BATCH_SIZE) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) JsonCodec.jsonCodec(com.facebook.airlift.json.JsonCodec.jsonCodec) ORC(com.facebook.presto.orc.OrcEncoding.ORC) List(java.util.List) TRUE(com.facebook.presto.orc.OrcPredicate.TRUE) NOT_SUPPORTED(com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED) CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) Optional(java.util.Optional) OrcWriterOptions.getDefaultOrcWriterOptions(com.facebook.presto.orc.OrcWriterOptions.getDefaultOrcWriterOptions) OrcBatchRecordReader(com.facebook.presto.orc.OrcBatchRecordReader) IntStream(java.util.stream.IntStream) Logger(com.facebook.airlift.log.Logger) DwrfKeyProvider(com.facebook.presto.orc.DwrfKeyProvider) OrcReaderOptions(com.facebook.presto.orc.OrcReaderOptions) PrestoException(com.facebook.presto.spi.PrestoException) InterruptedIOException(java.io.InterruptedIOException) ImmutableList(com.google.common.collect.ImmutableList) TypeManager(com.facebook.presto.common.type.TypeManager) Objects.requireNonNull(java.util.Objects.requireNonNull) ThreadContextClassLoader(com.facebook.presto.spi.classloader.ThreadContextClassLoader) Math.toIntExact(java.lang.Math.toIntExact) OrcFileTailSource(com.facebook.presto.orc.cache.OrcFileTailSource) HASHED(com.facebook.presto.orc.OrcWriteValidation.OrcWriteValidationMode.HASHED) Type(com.facebook.presto.common.type.Type) NO_ENCRYPTION(com.facebook.presto.orc.DwrfEncryptionProvider.NO_ENCRYPTION) IOException(java.io.IOException) TreeMap(java.util.TreeMap) BitSet(java.util.BitSet) Block(com.facebook.presto.common.block.Block) OrcReader(com.facebook.presto.orc.OrcReader) Comparator(java.util.Comparator)

Aggregations

OrcFileTailSource (com.facebook.presto.orc.cache.OrcFileTailSource): 7
List (java.util.List): 7
Optional (java.util.Optional): 7
StripeMetadataSourceFactory (com.facebook.presto.orc.StripeMetadataSourceFactory): 6
OrcDataSourceId (com.facebook.presto.orc.OrcDataSourceId): 5
CachingOrcFileTailSource (com.facebook.presto.orc.cache.CachingOrcFileTailSource): 4
StorageOrcFileTailSource (com.facebook.presto.orc.cache.StorageOrcFileTailSource): 4
OrcFileTail (com.facebook.presto.orc.metadata.OrcFileTail): 4
RowGroupIndex (com.facebook.presto.orc.metadata.RowGroupIndex): 4
Cache (com.google.common.cache.Cache): 4
CacheBuilder (com.google.common.cache.CacheBuilder): 4
Slice (io.airlift.slice.Slice): 4
Math.toIntExact (java.lang.Math.toIntExact): 4
MILLISECONDS (java.util.concurrent.TimeUnit.MILLISECONDS): 4
ConfigBinder.configBinder (com.facebook.airlift.configuration.ConfigBinder.configBinder): 3
Type (com.facebook.presto.common.type.Type): 3
TypeManager (com.facebook.presto.common.type.TypeManager): 3
CachingStripeMetadataSource (com.facebook.presto.orc.CachingStripeMetadataSource): 3
DwrfAwareStripeMetadataSourceFactory (com.facebook.presto.orc.DwrfAwareStripeMetadataSourceFactory): 3
DwrfKeyProvider (com.facebook.presto.orc.DwrfKeyProvider): 3