Search in sources :

Example 6 with StripeMetadataSourceFactory

Use of com.facebook.presto.orc.StripeMetadataSourceFactory in project presto by prestodb.

From the class OrcFileRewriter, method rewrite.

/**
 * Rewrites the ORC file at {@code input} into {@code output}, passing {@code rowsToDelete} to the
 * row-level rewrite and keeping only the columns that exist both in the file and in
 * {@code allColumnTypes}.
 *
 * <p>The method returns {@code new OrcFileInfo(0, 0)} without opening a writer when the number of
 * set bits in {@code rowsToDelete} equals the file's row count, or when none of the requested
 * columns is present in the file.
 *
 * @param fileSystem file system used to open the input and to create the output sink
 * @param allColumnTypes column types keyed by column id; every key must parse as a {@code long}
 * @param input path of the ORC file to read
 * @param output path of the ORC file to write
 * @param rowsToDelete deletion vector; its logical length must not exceed the file's row count
 * @return the {@code OrcFileInfo} produced by the row-level rewrite, or {@code OrcFileInfo(0, 0)}
 *         in the short-circuit cases described above
 * @throws IOException if the deletion vector is longer than the file, if the file has
 *         {@code Integer.MAX_VALUE} or more rows, or if reading or writing fails
 * @throws PrestoException with {@code NOT_SUPPORTED} when a {@code NotSupportedException} is raised
 */
public OrcFileInfo rewrite(FileSystem fileSystem, Map<String, Type> allColumnTypes, Path input, Path output, BitSet rowsToDelete) throws IOException {
    try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(FileSystem.class.getClassLoader());
        OrcDataSource dataSource = orcDataEnvironment.createOrcDataSource(fileSystem, input, readerAttributes)) {
        OrcReaderOptions readerOptions = new OrcReaderOptions(
                readerAttributes.getMaxMergeDistance(),
                readerAttributes.getTinyStripeThreshold(),
                HUGE_MAX_READ_BLOCK_SIZE,
                readerAttributes.isZstdJniDecompressionEnabled());
        OrcReader reader = new OrcReader(
                dataSource,
                ORC,
                orcFileTailSource,
                stripeMetadataSourceFactory,
                new RaptorOrcAggregatedMemoryContext(),
                readerOptions,
                false,
                NO_ENCRYPTION,
                DwrfKeyProvider.EMPTY,
                new RuntimeStats());

        // Read the footer row count once; every validation below works on the same value.
        long fileRowCount = reader.getFooter().getNumberOfRows();
        if (fileRowCount < rowsToDelete.length()) {
            throw new IOException("File has fewer rows than deletion vector");
        }
        int deleteRowCount = rowsToDelete.cardinality();
        if (fileRowCount == deleteRowCount) {
            // Every row is marked for deletion.
            return new OrcFileInfo(0, 0);
        }
        if (fileRowCount >= Integer.MAX_VALUE) {
            throw new IOException("File has too many rows");
        }
        int inputRowCount = toIntExact(fileRowCount);

        // Column name in the file -> ordinal position in the file.
        List<String> fileColumnNames = reader.getColumnNames();
        Map<String, Integer> currentColumnIds = IntStream.range(0, fileColumnNames.size())
                .boxed()
                .collect(toMap(fileColumnNames::get, i -> i));

        ImmutableList.Builder<Type> writerColumnTypesBuilder = ImmutableList.builder();
        ImmutableList.Builder<String> writerColumnIdsBuilder = ImmutableList.builder();
        ImmutableList.Builder<Integer> readerColumnIndexBuilder = ImmutableList.builder();

        // Build columns for writer; keep the right ordinal (keys are sorted by their numeric value)
        Map<String, Type> orderedAllColumnTypes = new TreeMap<>(Comparator.comparingLong(Long::parseLong));
        orderedAllColumnTypes.putAll(allColumnTypes);
        for (Map.Entry<String, Type> columnType : orderedAllColumnTypes.entrySet()) {
            // Get the intersection of the provided columns and the actual columns
            Integer currentColumnIndex = currentColumnIds.get(columnType.getKey());
            if (currentColumnIndex != null) {
                readerColumnIndexBuilder.add(currentColumnIndex);
                writerColumnTypesBuilder.add(columnType.getValue());
                writerColumnIdsBuilder.add(columnType.getKey());
            }
        }
        List<Type> writerColumnTypes = writerColumnTypesBuilder.build();
        List<String> writerColumnIds = writerColumnIdsBuilder.build();
        List<Integer> readerColumnIndex = readerColumnIndexBuilder.build();

        // File ordinal -> requested type, for the columns that will be read.
        Map<Integer, Type> readerColumns = IntStream.range(0, readerColumnIndex.size())
                .boxed()
                .collect(toMap(readerColumnIndex::get, writerColumnTypes::get));

        if (writerColumnTypes.isEmpty()) {
            // no intersection; directly return
            return new OrcFileInfo(0, 0);
        }

        // One converter, built from the single typeManager, serves both the writer-side and
        // the reader-side type mapping.
        StorageTypeConverter storageTypeConverter = new StorageTypeConverter(typeManager);
        List<Type> writerStorageTypes = writerColumnTypes.stream()
                .map(storageTypeConverter::toStorageType)
                .collect(toImmutableList());

        long start = System.nanoTime();

        Map<String, String> userMetadata = ImmutableMap.of();
        if (reader.getFooter().getUserMetadata().containsKey(OrcFileMetadata.KEY)) {
            // build metadata if the original file has it
            ImmutableMap.Builder<Long, TypeSignature> metadataBuilder = ImmutableMap.builder();
            for (int i = 0; i < writerColumnIds.size(); i++) {
                metadataBuilder.put(Long.parseLong(writerColumnIds.get(i)), writerColumnTypes.get(i).getTypeSignature());
            }
            userMetadata = ImmutableMap.of(OrcFileMetadata.KEY, METADATA_CODEC.toJson(new OrcFileMetadata(metadataBuilder.build())));
        }

        try (Closer<OrcBatchRecordReader, IOException> recordReader = closer(reader.createBatchRecordReader(storageTypeConverter.toStorageTypes(readerColumns), TRUE, DEFAULT_STORAGE_TIMEZONE, new RaptorOrcAggregatedMemoryContext(), INITIAL_BATCH_SIZE), OrcBatchRecordReader::close);
            Closer<OrcWriter, IOException> writer = closer(new OrcWriter(orcDataEnvironment.createOrcDataSink(fileSystem, output), writerColumnIds, writerStorageTypes, ORC, compression, Optional.empty(), NO_ENCRYPTION, getDefaultOrcWriterOptions(), userMetadata, DEFAULT_STORAGE_TIMEZONE, validate, HASHED, stats), OrcWriter::close)) {
            // Reuse the list built above instead of building a second copy from the builder.
            OrcFileInfo fileInfo = rewrite(recordReader.get(), writer.get(), rowsToDelete, writerColumnTypes, readerColumnIndex);
            log.debug("Rewrote file %s in %s (input rows: %s, output rows: %s)", input.getName(), nanosSince(start), inputRowCount, inputRowCount - deleteRowCount);
            return fileInfo;
        }
    } catch (NotSupportedException e) {
        // Surface unsupported-feature failures as a Presto error while keeping the cause.
        throw new PrestoException(NOT_SUPPORTED, e.getMessage(), e);
    }
}
Also used : JsonCodec(com.facebook.airlift.json.JsonCodec) Page(com.facebook.presto.common.Page) NotSupportedException(com.facebook.presto.common.NotSupportedException) DEFAULT_STORAGE_TIMEZONE(com.facebook.presto.raptor.storage.OrcStorageManager.DEFAULT_STORAGE_TIMEZONE) FileSystem(org.apache.hadoop.fs.FileSystem) OrcWriter(com.facebook.presto.orc.OrcWriter) TypeSignature(com.facebook.presto.common.type.TypeSignature) Closer.closer(com.facebook.presto.raptor.util.Closer.closer) Collectors.toMap(java.util.stream.Collectors.toMap) HUGE_MAX_READ_BLOCK_SIZE(com.facebook.presto.raptor.storage.OrcStorageManager.HUGE_MAX_READ_BLOCK_SIZE) Duration.nanosSince(io.airlift.units.Duration.nanosSince) Map(java.util.Map) RaptorOrcAggregatedMemoryContext(com.facebook.presto.raptor.RaptorOrcAggregatedMemoryContext) Path(org.apache.hadoop.fs.Path) RuntimeStats(com.facebook.presto.common.RuntimeStats) OrcDataSource(com.facebook.presto.orc.OrcDataSource) StripeMetadataSourceFactory(com.facebook.presto.orc.StripeMetadataSourceFactory) WriterStats(com.facebook.presto.orc.WriterStats) ImmutableMap(com.google.common.collect.ImmutableMap) Closer(com.facebook.presto.raptor.util.Closer) INITIAL_BATCH_SIZE(com.facebook.presto.orc.OrcReader.INITIAL_BATCH_SIZE) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) JsonCodec.jsonCodec(com.facebook.airlift.json.JsonCodec.jsonCodec) ORC(com.facebook.presto.orc.OrcEncoding.ORC) List(java.util.List) TRUE(com.facebook.presto.orc.OrcPredicate.TRUE) NOT_SUPPORTED(com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED) CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) Optional(java.util.Optional) OrcWriterOptions.getDefaultOrcWriterOptions(com.facebook.presto.orc.OrcWriterOptions.getDefaultOrcWriterOptions) OrcBatchRecordReader(com.facebook.presto.orc.OrcBatchRecordReader) IntStream(java.util.stream.IntStream) Logger(com.facebook.airlift.log.Logger) 
DwrfKeyProvider(com.facebook.presto.orc.DwrfKeyProvider) OrcReaderOptions(com.facebook.presto.orc.OrcReaderOptions) PrestoException(com.facebook.presto.spi.PrestoException) InterruptedIOException(java.io.InterruptedIOException) ImmutableList(com.google.common.collect.ImmutableList) TypeManager(com.facebook.presto.common.type.TypeManager) Objects.requireNonNull(java.util.Objects.requireNonNull) ThreadContextClassLoader(com.facebook.presto.spi.classloader.ThreadContextClassLoader) Math.toIntExact(java.lang.Math.toIntExact) OrcFileTailSource(com.facebook.presto.orc.cache.OrcFileTailSource) HASHED(com.facebook.presto.orc.OrcWriteValidation.OrcWriteValidationMode.HASHED) Type(com.facebook.presto.common.type.Type) NO_ENCRYPTION(com.facebook.presto.orc.DwrfEncryptionProvider.NO_ENCRYPTION) IOException(java.io.IOException) TreeMap(java.util.TreeMap) BitSet(java.util.BitSet) Block(com.facebook.presto.common.block.Block) OrcReader(com.facebook.presto.orc.OrcReader) Comparator(java.util.Comparator) RuntimeStats(com.facebook.presto.common.RuntimeStats) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList) OrcWriter(com.facebook.presto.orc.OrcWriter) PrestoException(com.facebook.presto.spi.PrestoException) RaptorOrcAggregatedMemoryContext(com.facebook.presto.raptor.RaptorOrcAggregatedMemoryContext) TypeSignature(com.facebook.presto.common.type.TypeSignature) OrcReaderOptions(com.facebook.presto.orc.OrcReaderOptions) FileSystem(org.apache.hadoop.fs.FileSystem) ThreadContextClassLoader(com.facebook.presto.spi.classloader.ThreadContextClassLoader) OrcDataSource(com.facebook.presto.orc.OrcDataSource) OrcBatchRecordReader(com.facebook.presto.orc.OrcBatchRecordReader) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException) TreeMap(java.util.TreeMap) ImmutableMap(com.google.common.collect.ImmutableMap) Type(com.facebook.presto.common.type.Type) 
OrcReader(com.facebook.presto.orc.OrcReader) NotSupportedException(com.facebook.presto.common.NotSupportedException) Collectors.toMap(java.util.stream.Collectors.toMap) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) TreeMap(java.util.TreeMap)

Aggregations

StripeMetadataSourceFactory (com.facebook.presto.orc.StripeMetadataSourceFactory)6 OrcFileTailSource (com.facebook.presto.orc.cache.OrcFileTailSource)6 OrcDataSourceId (com.facebook.presto.orc.OrcDataSourceId)5 List (java.util.List)5 Optional (java.util.Optional)5 Objects.requireNonNull (java.util.Objects.requireNonNull)4 ConfigBinder.configBinder (com.facebook.airlift.configuration.ConfigBinder.configBinder)3 Type (com.facebook.presto.common.type.Type)3 TypeManager (com.facebook.presto.common.type.TypeManager)3 FileFormatDataSourceStats (com.facebook.presto.hive.FileFormatDataSourceStats)3 HdfsEnvironment (com.facebook.presto.hive.HdfsEnvironment)3 HiveClientConfig (com.facebook.presto.hive.HiveClientConfig)3 CachingStripeMetadataSource (com.facebook.presto.orc.CachingStripeMetadataSource)3 DwrfAwareStripeMetadataSourceFactory (com.facebook.presto.orc.DwrfAwareStripeMetadataSourceFactory)3 DwrfKeyProvider (com.facebook.presto.orc.DwrfKeyProvider)3 OrcBatchRecordReader (com.facebook.presto.orc.OrcBatchRecordReader)3 OrcDataSource (com.facebook.presto.orc.OrcDataSource)3 ORC (com.facebook.presto.orc.OrcEncoding.ORC)3 OrcReader (com.facebook.presto.orc.OrcReader)3 INITIAL_BATCH_SIZE (com.facebook.presto.orc.OrcReader.INITIAL_BATCH_SIZE)3