Search in sources :

Example 6 with ORC

use of com.facebook.presto.orc.OrcEncoding.ORC in project presto by prestodb.

the class TestOrcReaderPositions method testStripeSkippingWithAppendNumber.

/**
 * Reads a multi-stripe ORC file through a selective record reader with a stripe-filtering
 * predicate and checks the values found in block 1 of every returned page.
 *
 * The predicate accepts the call where {@code numberOfRows == 100} and otherwise only accepts
 * statistics whose column-0 integer range is [60, 117] or [180, 237]. The values collected from
 * block 1 are expected to be 20..39 followed by 60..79.
 * NOTE(review): block 1 is presumably the appended row-number column enabled by the trailing
 * {@code true} argument of createCustomOrcSelectiveRecordReader - confirm against OrcTester.
 */
@Test
public void testStripeSkippingWithAppendNumber() throws Exception {
    try (TempFile tempFile = new TempFile()) {
        createMultiStripeFile(tempFile.getFile());
        // Every stripe has 20 rows and there are 5 stripes in total;
        // the predicate below selects the second and fourth stripes.
        OrcPredicate predicate = (numberOfRows, statisticsByColumnIndex) -> {
            if (numberOfRows == 100) {
                // 100 rows = 5 stripes * 20 rows; accept so that the per-stripe checks get to run
                return true;
            }
            IntegerStatistics stats = statisticsByColumnIndex.get(0).getIntegerStatistics();
            return ((stats.getMin() == 60) && (stats.getMax() == 117)) || ((stats.getMin() == 180) && (stats.getMax() == 237));
        };
        List<Long> expectedValues = new ArrayList<>();
        LongStream.range(20, 40).boxed().forEach(expectedValues::add);
        LongStream.range(60, 80).boxed().forEach(expectedValues::add);
        List<Long> actualValues = new ArrayList<>();
        // try-with-resources so the reader is released even when an assertion or a read fails
        try (OrcSelectiveRecordReader reader = createCustomOrcSelectiveRecordReader(tempFile, ORC, predicate, BIGINT, MAX_BATCH_SIZE, false, true)) {
            assertNotNull(reader);
            // getNextPage() returns null once the reader is exhausted
            for (Page page = reader.getNextPage(); page != null; page = reader.getNextPage()) {
                Block rowNumberBlock = page.getBlock(1);
                for (int position = 0; position < page.getPositionCount(); position++) {
                    actualValues.add(rowNumberBlock.getLong(position));
                }
            }
        }
        assertEquals(actualValues, expectedValues);
    }
}
Also used : OrcFile(org.apache.hadoop.hive.ql.io.orc.OrcFile) Page(com.facebook.presto.common.Page) OrcTester.createCustomOrcSelectiveRecordReader(com.facebook.presto.orc.OrcTester.createCustomOrcSelectiveRecordReader) Writable(org.apache.hadoop.io.Writable) Test(org.testng.annotations.Test) ORC_12(com.facebook.presto.orc.OrcTester.Format.ORC_12) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) ByteBuffer(java.nio.ByteBuffer) Writer(org.apache.hadoop.hive.ql.io.orc.Writer) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) OrcTester.createOrcRecordWriter(com.facebook.presto.orc.OrcTester.createOrcRecordWriter) Path(org.apache.hadoop.fs.Path) Predicate(com.facebook.presto.common.relation.Predicate) RuntimeStats(com.facebook.presto.common.RuntimeStats) NullMemoryManager(org.apache.orc.NullMemoryManager) ImmutableMap(com.google.common.collect.ImmutableMap) Footer(com.facebook.presto.orc.metadata.Footer) SqlFunctionProperties(com.facebook.presto.common.function.SqlFunctionProperties) INITIAL_BATCH_SIZE(com.facebook.presto.orc.OrcReader.INITIAL_BATCH_SIZE) Math.min(java.lang.Math.min) Assert.assertNotNull(org.testng.Assert.assertNotNull) NOOP_ORC_AGGREGATED_MEMORY_CONTEXT(com.facebook.presto.orc.NoopOrcAggregatedMemoryContext.NOOP_ORC_AGGREGATED_MEMORY_CONTEXT) ConnectorSession(com.facebook.presto.spi.ConnectorSession) ORC(com.facebook.presto.orc.OrcEncoding.ORC) DataSize(io.airlift.units.DataSize) List(java.util.List) OrcTester.createCustomOrcRecordReader(com.facebook.presto.orc.OrcTester.createCustomOrcRecordReader) CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) MAX_BLOCK_SIZE(com.facebook.presto.orc.OrcTester.MAX_BLOCK_SIZE) Slice(io.airlift.slice.Slice) VARCHAR(com.facebook.presto.common.type.VarcharType.VARCHAR) 
MEGABYTE(io.airlift.units.DataSize.Unit.MEGABYTE) Assert.assertEquals(org.testng.Assert.assertEquals) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) FilterFunction(com.facebook.presto.common.predicate.FilterFunction) MAX_BATCH_SIZE(com.facebook.presto.orc.OrcReader.MAX_BATCH_SIZE) BIGINT(com.facebook.presto.common.type.BigintType.BIGINT) LongStream(java.util.stream.LongStream) BATCH_SIZE_GROWTH_FACTOR(com.facebook.presto.orc.OrcReader.BATCH_SIZE_GROWTH_FACTOR) UTF_8(java.nio.charset.StandardCharsets.UTF_8) SNAPPY(org.apache.hadoop.hive.ql.io.orc.CompressionKind.SNAPPY) NO_ENCRYPTION(com.facebook.presto.orc.DwrfEncryptionProvider.NO_ENCRYPTION) IOException(java.io.IOException) TestingConnectorSession(com.facebook.presto.testing.TestingConnectorSession) Field(java.lang.reflect.Field) Maps(com.google.common.collect.Maps) File(java.io.File) SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) StorageOrcFileTailSource(com.facebook.presto.orc.cache.StorageOrcFileTailSource) Serializer(org.apache.hadoop.hive.serde2.Serializer) IntegerStatistics(com.facebook.presto.orc.metadata.statistics.IntegerStatistics) Assert.assertTrue(org.testng.Assert.assertTrue) Block(com.facebook.presto.common.block.Block) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) OrcTester.createSettableStructObjectInspector(com.facebook.presto.orc.OrcTester.createSettableStructObjectInspector) OrcTester.createCustomOrcSelectiveRecordReader(com.facebook.presto.orc.OrcTester.createCustomOrcSelectiveRecordReader) ArrayList(java.util.ArrayList) Block(com.facebook.presto.common.block.Block) Page(com.facebook.presto.common.Page) IntegerStatistics(com.facebook.presto.orc.metadata.statistics.IntegerStatistics) Test(org.testng.annotations.Test)

Example 7 with ORC

use of com.facebook.presto.orc.OrcEncoding.ORC in project presto by prestodb.

the class IcebergPageSourceProvider method createBatchOrcPageSource.

/**
 * Builds a batch ORC page source for one Iceberg split.
 *
 * The method opens {@code path} through {@code hdfsEnvironment}, wraps it in an
 * {@link HdfsOrcDataSource} and an {@link OrcReader}, maps every requested Iceberg column to a
 * {@link HiveColumnHandle}, translates {@code effectivePredicate} into a
 * {@link TupleDomainOrcPredicate} and finally returns an {@link OrcBatchPageSource} reading the
 * byte range {@code [start, start + length)} handed to {@code createBatchRecordReader}.
 *
 * Column resolution: if at least one ORC column in the file carries the
 * {@code ORC_ICEBERG_ID_KEY} attribute, columns are looked up by Iceberg id first and by name
 * as a fallback; otherwise they are looked up by name only. Requested columns that cannot be
 * found in the file get a handle with a synthetic index past the known file columns.
 *
 * @throws PrestoException rethrown unchanged if raised inside the method; otherwise
 *         {@code ICEBERG_MISSING_DATA} for a {@link BlockMissingException} and
 *         {@code ICEBERG_CANNOT_OPEN_SPLIT} for any other exception. The data source, if
 *         already opened, is closed before the exception is propagated.
 */
private static ConnectorPageSource createBatchOrcPageSource(HdfsEnvironment hdfsEnvironment, String user, Configuration configuration, Path path, long start, long length, boolean isCacheable, List<IcebergColumnHandle> regularColumns, TypeManager typeManager, TupleDomain<IcebergColumnHandle> effectivePredicate, OrcReaderOptions options, OrcEncoding orcEncoding, DataSize maxBufferSize, DataSize streamBufferSize, boolean lazyReadSmallRanges, boolean orcBloomFiltersEnabled, int domainCompactionThreshold, OrcFileTailSource orcFileTailSource, StripeMetadataSourceFactory stripeMetadataSourceFactory, FileFormatDataSourceStats stats, Optional<EncryptionInformation> encryptionInformation, DwrfEncryptionProvider dwrfEncryptionProvider) {
    // Declared outside the try block so the catch handler can close it on failure.
    OrcDataSource orcDataSource = null;
    try {
        ExtendedFileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path, configuration);
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        long fileSize = fileStatus.getLen();
        long modificationTime = fileStatus.getModificationTime();
        HiveFileContext hiveFileContext = new HiveFileContext(true, NO_CACHE_CONSTRAINTS, Optional.empty(), Optional.of(fileSize), modificationTime, false);
        // The file is opened on behalf of 'user' via doAs.
        FSDataInputStream inputStream = hdfsEnvironment.doAs(user, () -> fileSystem.openFile(path, hiveFileContext));
        orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSize, options.getMaxMergeDistance(), maxBufferSize, streamBufferSize, lazyReadSmallRanges, inputStream, stats);
        // Todo: pass real columns to ProjectionBasedDwrfKeyProvider instead of ImmutableList.of()
        DwrfKeyProvider dwrfKeyProvider = new ProjectionBasedDwrfKeyProvider(encryptionInformation, ImmutableList.of(), true, path);
        RuntimeStats runtimeStats = new RuntimeStats();
        OrcReader reader = new OrcReader(orcDataSource, orcEncoding, orcFileTailSource, stripeMetadataSourceFactory, new HiveOrcAggregatedMemoryContext(), options, isCacheable, dwrfEncryptionProvider, dwrfKeyProvider, runtimeStats);
        // One handle per requested column, in the order of 'regularColumns'.
        List<HiveColumnHandle> physicalColumnHandles = new ArrayList<>(regularColumns.size());
        // Columns actually read from the file, keyed by hive column index.
        ImmutableMap.Builder<Integer, Type> includedColumns = ImmutableMap.builder();
        ImmutableList.Builder<TupleDomainOrcPredicate.ColumnReference<HiveColumnHandle>> columnReferences = ImmutableList.builder();
        List<IcebergOrcColumn> fileOrcColumns = getFileOrcColumns(reader);
        // Index of the file columns that carry an Iceberg id attribute, keyed by the parsed id; empty when the file has no such attributes.
        Map<Integer, IcebergOrcColumn> fileOrcColumnByIcebergId = fileOrcColumns.stream().filter(orcColumn -> orcColumn.getAttributes().containsKey(ORC_ICEBERG_ID_KEY)).collect(toImmutableMap(orcColumn -> Integer.parseInt(orcColumn.getAttributes().get(ORC_ICEBERG_ID_KEY)), orcColumn -> IcebergOrcColumn.copy(orcColumn).setIcebergColumnId(Optional.of(Integer.parseInt(orcColumn.getAttributes().get(ORC_ICEBERG_ID_KEY))))));
        // Index of all file columns keyed by lower-cased (ENGLISH locale) column name.
        // NOTE(review): the lookups below pass column.getName() unchanged - presumably Iceberg column names are already lower case; confirm.
        Map<String, IcebergOrcColumn> fileOrcColumnsByName = uniqueIndex(fileOrcColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
        // Synthetic indexes for requested columns missing from the file start after the name-indexed file columns.
        int nextMissingColumnIndex = fileOrcColumnsByName.size();
        for (IcebergColumnHandle column : regularColumns) {
            IcebergOrcColumn icebergOrcColumn;
            // Set when the column was matched by name although the file has Iceberg ids; such a column is not added to includedColumns/columnReferences.
            boolean isExcludeColumn = false;
            if (fileOrcColumnByIcebergId.isEmpty()) {
                icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
            } else {
                icebergOrcColumn = fileOrcColumnByIcebergId.get(column.getId());
                if (icebergOrcColumn == null) {
                    // Cannot get orc column from 'fileOrcColumnByIcebergId', which means SchemaEvolution may have happened, so we get orc column by column name.
                    icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
                    if (icebergOrcColumn != null) {
                        isExcludeColumn = true;
                    }
                }
            }
            if (icebergOrcColumn != null) {
                HiveColumnHandle columnHandle = new HiveColumnHandle(// Todo: using orc file column name
                column.getName(), toHiveType(column.getType()), column.getType().getTypeSignature(), icebergOrcColumn.getOrcColumnId(), icebergOrcColumn.getColumnType(), Optional.empty(), Optional.empty());
                physicalColumnHandles.add(columnHandle);
                // Skip SchemaEvolution column
                if (!isExcludeColumn) {
                    includedColumns.put(columnHandle.getHiveColumnIndex(), typeManager.getType(columnHandle.getTypeSignature()));
                    columnReferences.add(new TupleDomainOrcPredicate.ColumnReference<>(columnHandle, columnHandle.getHiveColumnIndex(), typeManager.getType(columnHandle.getTypeSignature())));
                }
            } else {
                // Column not present in the file: still emit a handle, using the next synthetic index.
                physicalColumnHandles.add(new HiveColumnHandle(column.getName(), toHiveType(column.getType()), column.getType().getTypeSignature(), nextMissingColumnIndex++, REGULAR, Optional.empty(), Optional.empty()));
            }
        }
        // Re-key the predicate from Iceberg handles to Hive handles using the same id-then-name lookup as the loop above.
        TupleDomain<HiveColumnHandle> hiveColumnHandleTupleDomain = effectivePredicate.transform(column -> {
            IcebergOrcColumn icebergOrcColumn;
            if (fileOrcColumnByIcebergId.isEmpty()) {
                icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
            } else {
                icebergOrcColumn = fileOrcColumnByIcebergId.get(column.getId());
                if (icebergOrcColumn == null) {
                    // Cannot get orc column from 'fileOrcColumnByIcebergId', which means SchemaEvolution may have happened, so we get orc column by column name.
                    icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
                }
            }
            return new HiveColumnHandle(column.getName(), toHiveType(column.getType()), column.getType().getTypeSignature(), // Note: the HiveColumnHandle.hiveColumnIndex starts from '0' while the IcebergColumnHandle.id starts from '1'
            icebergOrcColumn != null ? icebergOrcColumn.getOrcColumnId() : column.getId() - 1, icebergOrcColumn != null ? icebergOrcColumn.getColumnType() : REGULAR, Optional.empty(), Optional.empty());
        });
        OrcPredicate predicate = new TupleDomainOrcPredicate<>(hiveColumnHandleTupleDomain, columnReferences.build(), orcBloomFiltersEnabled, Optional.of(domainCompactionThreshold));
        OrcAggregatedMemoryContext systemMemoryUsage = new HiveOrcAggregatedMemoryContext();
        OrcBatchRecordReader recordReader = reader.createBatchRecordReader(includedColumns.build(), predicate, start, length, UTC, systemMemoryUsage, INITIAL_BATCH_SIZE);
        return new OrcBatchPageSource(recordReader, orcDataSource, physicalColumnHandles, typeManager, systemMemoryUsage, stats, runtimeStats);
    } catch (Exception e) {
        // Release the data source before translating the failure; nothing else owns it at this point.
        if (orcDataSource != null) {
            try {
                orcDataSource.close();
            } catch (IOException ignored) {
                // A failure to close is dropped so that the original exception 'e' is the one reported.
            }
        }
        // PrestoExceptions already carry an error code and are propagated as-is.
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof BlockMissingException) {
            throw new PrestoException(ICEBERG_MISSING_DATA, message, e);
        }
        throw new PrestoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) HiveSessionProperties.isUseParquetColumnNames(com.facebook.presto.hive.HiveSessionProperties.isUseParquetColumnNames) Maps.uniqueIndex(com.google.common.collect.Maps.uniqueIndex) FileStatus(org.apache.hadoop.fs.FileStatus) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ConnectorTransactionHandle(com.facebook.presto.spi.connector.ConnectorTransactionHandle) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) OrcDataSource(com.facebook.presto.orc.OrcDataSource) FileFormatDataSourceStats(com.facebook.presto.hive.FileFormatDataSourceStats) ConnectorPageSourceProvider(com.facebook.presto.spi.connector.ConnectorPageSourceProvider) ENGLISH(java.util.Locale.ENGLISH) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ParquetDataSource(com.facebook.presto.parquet.ParquetDataSource) ORC_ICEBERG_ID_KEY(com.facebook.presto.iceberg.TypeConverter.ORC_ICEBERG_ID_KEY) IcebergSessionProperties.getOrcLazyReadSmallRanges(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcLazyReadSmallRanges) ExtendedFileSystem(com.facebook.presto.hive.filesystem.ExtendedFileSystem) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HiveFileContext(com.facebook.presto.hive.HiveFileContext) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) ConnectorSession(com.facebook.presto.spi.ConnectorSession) ORC(com.facebook.presto.orc.OrcEncoding.ORC) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ICEBERG_BAD_DATA(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_BAD_DATA) ParquetPageSource(com.facebook.presto.hive.parquet.ParquetPageSource) 
HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) MetadataReader(com.facebook.presto.parquet.cache.MetadataReader) StandardTypes(com.facebook.presto.common.type.StandardTypes) REGULAR(com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR) DwrfKeyProvider(com.facebook.presto.orc.DwrfKeyProvider) TypeConverter.toHiveType(com.facebook.presto.iceberg.TypeConverter.toHiveType) OrcReaderOptions(com.facebook.presto.orc.OrcReaderOptions) IcebergSessionProperties.getOrcMaxReadBlockSize(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcMaxReadBlockSize) ArrayList(java.util.ArrayList) IcebergSessionProperties.getOrcTinyStripeThreshold(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcTinyStripeThreshold) ROOT_COLUMN_ID(com.facebook.presto.iceberg.IcebergOrcColumn.ROOT_COLUMN_ID) ICEBERG_MISSING_DATA(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_MISSING_DATA) DwrfEncryptionProvider(com.facebook.presto.orc.DwrfEncryptionProvider) OrcDataSourceId(com.facebook.presto.orc.OrcDataSourceId) IOException(java.io.IOException) UTC(org.joda.time.DateTimeZone.UTC) FileFormat(org.apache.iceberg.FileFormat) Domain(com.facebook.presto.common.predicate.Domain) ParquetReader(com.facebook.presto.parquet.reader.ParquetReader) ConnectorSplit(com.facebook.presto.spi.ConnectorSplit) HiveSessionProperties.getParquetMaxReadBlockSize(com.facebook.presto.hive.HiveSessionProperties.getParquetMaxReadBlockSize) ColumnHandle(com.facebook.presto.spi.ColumnHandle) IcebergSessionProperties.isOrcZstdJniDecompressionEnabled(com.facebook.presto.iceberg.IcebergSessionProperties.isOrcZstdJniDecompressionEnabled) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) OrcReader(com.facebook.presto.orc.OrcReader) ColumnIOConverter.constructField(org.apache.parquet.io.ColumnIOConverter.constructField) 
HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) HdfsOrcDataSource(com.facebook.presto.hive.orc.HdfsOrcDataSource) TupleDomainOrcPredicate(com.facebook.presto.orc.TupleDomainOrcPredicate) NO_CACHE_CONSTRAINTS(com.facebook.presto.hive.CacheQuota.NO_CACHE_CONSTRAINTS) IcebergSessionProperties.getOrcMaxBufferSize(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcMaxBufferSize) OrcBatchPageSource(com.facebook.presto.hive.orc.OrcBatchPageSource) SchemaTableName(com.facebook.presto.spi.SchemaTableName) SplitContext(com.facebook.presto.spi.SplitContext) ParquetTypeUtils.getDescriptors(com.facebook.presto.parquet.ParquetTypeUtils.getDescriptors) Path(org.apache.hadoop.fs.Path) EncryptionInformation(com.facebook.presto.hive.EncryptionInformation) RuntimeStats(com.facebook.presto.common.RuntimeStats) HdfsContext(com.facebook.presto.hive.HdfsContext) ProjectionBasedDwrfKeyProvider(com.facebook.presto.hive.orc.ProjectionBasedDwrfKeyProvider) HiveSessionProperties.isParquetBatchReadsEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReadsEnabled) HiveClientConfig(com.facebook.presto.hive.HiveClientConfig) StripeMetadataSourceFactory(com.facebook.presto.orc.StripeMetadataSourceFactory) ImmutableMap(com.google.common.collect.ImmutableMap) INITIAL_BATCH_SIZE(com.facebook.presto.orc.OrcReader.INITIAL_BATCH_SIZE) OrcPredicate(com.facebook.presto.orc.OrcPredicate) HiveDwrfEncryptionProvider(com.facebook.presto.hive.HiveDwrfEncryptionProvider) String.format(java.lang.String.format) IcebergSessionProperties.isOrcBloomFiltersEnabled(com.facebook.presto.iceberg.IcebergSessionProperties.isOrcBloomFiltersEnabled) ColumnIndexFilterUtils(com.facebook.presto.parquet.reader.ColumnIndexFilterUtils) Objects(java.util.Objects) MessageType(org.apache.parquet.schema.MessageType) DataSize(io.airlift.units.DataSize) List(java.util.List) 
HiveSessionProperties.isParquetBatchReaderVerificationEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReaderVerificationEnabled) NOT_SUPPORTED(com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED) HiveOrcAggregatedMemoryContext(com.facebook.presto.hive.HiveOrcAggregatedMemoryContext) Optional(java.util.Optional) HiveColumnHandle(com.facebook.presto.hive.HiveColumnHandle) OrcBatchRecordReader(com.facebook.presto.orc.OrcBatchRecordReader) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) IntStream(java.util.stream.IntStream) ConnectorTableLayoutHandle(com.facebook.presto.spi.ConnectorTableLayoutHandle) PredicateUtils.predicateMatches(com.facebook.presto.parquet.predicate.PredicateUtils.predicateMatches) PrestoException(com.facebook.presto.spi.PrestoException) Function(java.util.function.Function) Inject(javax.inject.Inject) ParquetTypeUtils.getParquetTypeByName(com.facebook.presto.parquet.ParquetTypeUtils.getParquetTypeByName) ImmutableList(com.google.common.collect.ImmutableList) ICEBERG_CANNOT_OPEN_SPLIT(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_CANNOT_OPEN_SPLIT) TypeManager(com.facebook.presto.common.type.TypeManager) Objects.requireNonNull(java.util.Objects.requireNonNull) Predicate(com.facebook.presto.parquet.predicate.Predicate) OrcType(com.facebook.presto.orc.metadata.OrcType) OrcFileTailSource(com.facebook.presto.orc.cache.OrcFileTailSource) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) PredicateUtils.buildPredicate(com.facebook.presto.parquet.predicate.PredicateUtils.buildPredicate) Type(com.facebook.presto.common.type.Type) IcebergSessionProperties.getOrcMaxMergeDistance(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcMaxMergeDistance) OrcAggregatedMemoryContext(com.facebook.presto.orc.OrcAggregatedMemoryContext) OrcEncoding(com.facebook.presto.orc.OrcEncoding) 
ParquetTypeUtils.getColumnIO(com.facebook.presto.parquet.ParquetTypeUtils.getColumnIO) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) AggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext) Field(com.facebook.presto.parquet.Field) Collectors.toList(java.util.stream.Collectors.toList) ConnectorPageSource(com.facebook.presto.spi.ConnectorPageSource) IcebergSessionProperties.getOrcStreamBufferSize(com.facebook.presto.iceberg.IcebergSessionProperties.getOrcStreamBufferSize) TupleDomainOrcPredicate(com.facebook.presto.orc.TupleDomainOrcPredicate) FileStatus(org.apache.hadoop.fs.FileStatus) RuntimeStats(com.facebook.presto.common.RuntimeStats) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList) ArrayList(java.util.ArrayList) HdfsOrcDataSource(com.facebook.presto.hive.orc.HdfsOrcDataSource) ProjectionBasedDwrfKeyProvider(com.facebook.presto.hive.orc.ProjectionBasedDwrfKeyProvider) PrestoException(com.facebook.presto.spi.PrestoException) HiveFileContext(com.facebook.presto.hive.HiveFileContext) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) DwrfKeyProvider(com.facebook.presto.orc.DwrfKeyProvider) ProjectionBasedDwrfKeyProvider(com.facebook.presto.hive.orc.ProjectionBasedDwrfKeyProvider) HiveOrcAggregatedMemoryContext(com.facebook.presto.hive.HiveOrcAggregatedMemoryContext) OrcAggregatedMemoryContext(com.facebook.presto.orc.OrcAggregatedMemoryContext) HiveColumnHandle(com.facebook.presto.hive.HiveColumnHandle) OrcDataSource(com.facebook.presto.orc.OrcDataSource) HdfsOrcDataSource(com.facebook.presto.hive.orc.HdfsOrcDataSource) OrcDataSourceId(com.facebook.presto.orc.OrcDataSourceId) OrcBatchRecordReader(com.facebook.presto.orc.OrcBatchRecordReader) HiveOrcAggregatedMemoryContext(com.facebook.presto.hive.HiveOrcAggregatedMemoryContext) OrcBatchPageSource(com.facebook.presto.hive.orc.OrcBatchPageSource) 
IOException(java.io.IOException) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) ImmutableMap(com.google.common.collect.ImmutableMap) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) IOException(java.io.IOException) PrestoException(com.facebook.presto.spi.PrestoException) TypeConverter.toHiveType(com.facebook.presto.iceberg.TypeConverter.toHiveType) MessageType(org.apache.parquet.schema.MessageType) OrcType(com.facebook.presto.orc.metadata.OrcType) Type(com.facebook.presto.common.type.Type) OrcReader(com.facebook.presto.orc.OrcReader) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ExtendedFileSystem(com.facebook.presto.hive.filesystem.ExtendedFileSystem) TupleDomainOrcPredicate(com.facebook.presto.orc.TupleDomainOrcPredicate) OrcPredicate(com.facebook.presto.orc.OrcPredicate)

Example 8 with ORC

use of com.facebook.presto.orc.OrcEncoding.ORC in project presto by prestodb.

the class OrcFileWriterFactory method createFileWriter.

/**
 * Creates an optimized ORC/DWRF file writer for {@code path}.
 *
 * Returns {@link Optional#empty()} when the optimized ORC writer is disabled for the session or
 * when the storage format's output format is neither the Hive ORC output format (ORC encoding)
 * nor the Facebook hive-dwrf output format (DWRF encoding).
 *
 * @throws PrestoException with {@code HIVE_WRITER_OPEN_ERROR} when an {@link IOException}
 *         is raised while the writer is being set up
 */
@Override
public Optional<HiveFileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session, Optional<EncryptionInformation> encryptionInformation) {
    if (!HiveSessionProperties.isOrcOptimizedWriterEnabled(session)) {
        return Optional.empty();
    }

    // Pick the encoding from the declared output format; anything else is not handled by this factory.
    String outputFormat = storageFormat.getOutputFormat();
    OrcEncoding orcEncoding;
    if (OrcOutputFormat.class.getName().equals(outputFormat)) {
        orcEncoding = ORC;
    } else if (com.facebook.hive.orc.OrcOutputFormat.class.getName().equals(outputFormat)) {
        orcEncoding = DWRF;
    } else {
        return Optional.empty();
    }

    CompressionKind compression = getCompression(schema, configuration, orcEncoding);

    // Existing tables and partitions may store columns in a different order than the writer
    // supplies them, so compute, for every file column, its position among the input columns.
    String columnNamesProperty = schema.getProperty(META_TABLE_COLUMNS, "");
    List<String> fileColumnNames = Splitter.on(',').trimResults().omitEmptyStrings().splitToList(columnNamesProperty);
    String columnTypesProperty = schema.getProperty(META_TABLE_COLUMN_TYPES, "");
    List<Type> fileColumnTypes = toHiveTypes(columnTypesProperty).stream()
            .map(hiveType -> hiveType.getType(typeManager))
            .collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream()
            .mapToInt(inputColumnNames::indexOf)
            .toArray();

    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        DataSink dataSink = createDataSink(session, fileSystem, path);

        // When write validation is on, hand the writer a way to re-open the written file.
        Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
        if (HiveSessionProperties.isOrcOptimizedWriterValidate(session)) {
            validationInputFactory = Optional.of(() -> {
                try {
                    OrcDataSourceId dataSourceId = new OrcDataSourceId(path.toString());
                    long writtenLength = fileSystem.getFileStatus(path).getLen();
                    return new HdfsOrcDataSource(
                            dataSourceId,
                            writtenLength,
                            getOrcMaxMergeDistance(session),
                            getOrcMaxBufferSize(session),
                            getOrcStreamBufferSize(session),
                            false,
                            fileSystem.open(path),
                            readStats);
                } catch (IOException e) {
                    throw new PrestoException(HIVE_WRITE_VALIDATION_FAILED, e);
                }
            });
        }

        // Rollback removes the target file (non-recursive delete).
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };

        Optional<DwrfWriterEncryption> dwrfWriterEncryption = createDwrfEncryption(encryptionInformation, fileColumnNames, fileColumnTypes);

        return Optional.of(new OrcFileWriter(
                dataSink,
                rollbackAction,
                orcEncoding,
                fileColumnNames,
                fileColumnTypes,
                compression,
                orcFileWriterConfig.toOrcWriterOptionsBuilder()
                        .withFlushPolicy(DefaultOrcWriterFlushPolicy.builder()
                                .withStripeMinSize(getOrcOptimizedWriterMinStripeSize(session))
                                .withStripeMaxSize(getOrcOptimizedWriterMaxStripeSize(session))
                                .withStripeMaxRowCount(getOrcOptimizedWriterMaxStripeRows(session))
                                .build())
                        .withDictionaryMaxMemory(getOrcOptimizedWriterMaxDictionaryMemory(session))
                        .withMaxStringStatisticsLimit(getOrcStringStatisticsLimit(session))
                        .withIgnoreDictionaryRowGroupSizes(isExecutionBasedMemoryAccountingEnabled(session))
                        .withDwrfStripeCacheEnabled(isDwrfWriterStripeCacheEnabled(session))
                        .withDwrfStripeCacheMaxSize(getDwrfWriterStripeCacheeMaxSize(session))
                        .build(),
                fileInputColumnIndexes,
                ImmutableMap.<String, String>builder()
                        .put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(MetastoreUtil.PRESTO_QUERY_ID_NAME, session.getQueryId())
                        .build(),
                hiveStorageTimeZone,
                validationInputFactory,
                getOrcOptimizedWriterValidateMode(session),
                stats,
                dwrfEncryptionProvider,
                dwrfWriterEncryption));
    } catch (IOException e) {
        throw new PrestoException(HIVE_WRITER_OPEN_ERROR, "Error creating " + orcEncoding + " file. " + e.getMessage(), e);
    }
}
Also used : HdfsOrcDataSource(com.facebook.presto.hive.orc.HdfsOrcDataSource) DateTimeZone(org.joda.time.DateTimeZone) FileSystem(org.apache.hadoop.fs.FileSystem) HiveSessionProperties.getDwrfWriterStripeCacheeMaxSize(com.facebook.presto.hive.HiveSessionProperties.getDwrfWriterStripeCacheeMaxSize) HIVE_WRITE_VALIDATION_FAILED(com.facebook.presto.hive.HiveErrorCode.HIVE_WRITE_VALIDATION_FAILED) DataSink(com.facebook.presto.common.io.DataSink) HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory(com.facebook.presto.hive.HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory) OrcConf(org.apache.orc.OrcConf) HiveSessionProperties.getOrcOptimizedWriterMinStripeSize(com.facebook.presto.hive.HiveSessionProperties.getOrcOptimizedWriterMinStripeSize) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HiveSessionProperties.getOrcOptimizedWriterValidateMode(com.facebook.presto.hive.HiveSessionProperties.getOrcOptimizedWriterValidateMode) OrcDataSource(com.facebook.presto.orc.OrcDataSource) Splitter(com.google.common.base.Splitter) ENGLISH(java.util.Locale.ENGLISH) WriterEncryptionGroup(com.facebook.presto.orc.WriterEncryptionGroup) CRYPTO_SERVICE(com.facebook.presto.orc.metadata.KeyProvider.CRYPTO_SERVICE) HIVE_UNSUPPORTED_FORMAT(com.facebook.presto.hive.HiveErrorCode.HIVE_UNSUPPORTED_FORMAT) KeyProvider(com.facebook.presto.orc.metadata.KeyProvider) META_TABLE_COLUMN_TYPES(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMN_TYPES) StorageFormat(com.facebook.presto.hive.metastore.StorageFormat) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HiveSessionProperties.isDwrfWriterStripeCacheEnabled(com.facebook.presto.hive.HiveSessionProperties.isDwrfWriterStripeCacheEnabled) HiveSessionProperties.isExecutionBasedMemoryAccountingEnabled(com.facebook.presto.hive.HiveSessionProperties.isExecutionBasedMemoryAccountingEnabled) 
ConnectorSession(com.facebook.presto.spi.ConnectorSession) ORC(com.facebook.presto.orc.OrcEncoding.ORC) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) DWRF(com.facebook.presto.orc.OrcEncoding.DWRF) CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) ImmutableListMultimap(com.google.common.collect.ImmutableListMultimap) Optional(java.util.Optional) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) HiveType.toHiveTypes(com.facebook.presto.hive.HiveType.toHiveTypes) IntStream(java.util.stream.IntStream) HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize(com.facebook.presto.hive.HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize) Slice(io.airlift.slice.Slice) HiveSessionProperties.getOrcMaxMergeDistance(com.facebook.presto.hive.HiveSessionProperties.getOrcMaxMergeDistance) Flatten(org.weakref.jmx.Flatten) Callable(java.util.concurrent.Callable) META_TABLE_COLUMNS(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMNS) DwrfWriterEncryption(com.facebook.presto.orc.DwrfWriterEncryption) DataSinkFactory(com.facebook.presto.hive.datasink.DataSinkFactory) PrestoException(com.facebook.presto.spi.PrestoException) Supplier(java.util.function.Supplier) UNKNOWN(com.facebook.presto.orc.metadata.KeyProvider.UNKNOWN) Inject(javax.inject.Inject) MetastoreUtil(com.facebook.presto.hive.metastore.MetastoreUtil) Managed(org.weakref.jmx.Managed) TypeManager(com.facebook.presto.common.type.TypeManager) HiveSessionProperties.getOrcMaxBufferSize(com.facebook.presto.hive.HiveSessionProperties.getOrcMaxBufferSize) Objects.requireNonNull(java.util.Objects.requireNonNull) OrcType(com.facebook.presto.orc.metadata.OrcType) OrcWriterStats(com.facebook.presto.orc.OrcWriterStats) Type(com.facebook.presto.common.type.Type) DwrfEncryptionProvider(com.facebook.presto.orc.DwrfEncryptionProvider) OrcDataSourceId(com.facebook.presto.orc.OrcDataSourceId) 
OrcEncoding(com.facebook.presto.orc.OrcEncoding) Properties(java.util.Properties) DefaultOrcWriterFlushPolicy(com.facebook.presto.orc.DefaultOrcWriterFlushPolicy) HiveSessionProperties.getOrcStreamBufferSize(com.facebook.presto.hive.HiveSessionProperties.getOrcStreamBufferSize) IOException(java.io.IOException) HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows(com.facebook.presto.hive.HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows) JobConf(org.apache.hadoop.mapred.JobConf) Collectors.toList(java.util.stream.Collectors.toList) HiveSessionProperties.getOrcStringStatisticsLimit(com.facebook.presto.hive.HiveSessionProperties.getOrcStringStatisticsLimit) HIVE_WRITER_OPEN_ERROR(com.facebook.presto.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR) DataSink(com.facebook.presto.common.io.DataSink) CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) OrcDataSourceId(com.facebook.presto.orc.OrcDataSourceId) HdfsOrcDataSource(com.facebook.presto.hive.orc.HdfsOrcDataSource) PrestoException(com.facebook.presto.spi.PrestoException) OrcEncoding(com.facebook.presto.orc.OrcEncoding) IOException(java.io.IOException) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) OrcType(com.facebook.presto.orc.metadata.OrcType) Type(com.facebook.presto.common.type.Type) FileSystem(org.apache.hadoop.fs.FileSystem) DwrfWriterEncryption(com.facebook.presto.orc.DwrfWriterEncryption) Supplier(java.util.function.Supplier)

Example 9 with ORC

use of com.facebook.presto.orc.OrcEncoding.ORC in project presto by prestodb.

the class OrcStorageManager method toOrcFileType.

/**
 * Maps a Raptor column type to the type used for it inside an ORC file.
 *
 * <p>{@code TIMESTAMP} is replaced by {@code BIGINT}. Array, map and row types are rebuilt
 * with their element, key/value and field types mapped recursively. Any other type is
 * returned unchanged.
 *
 * @param raptorType the Raptor column type to translate
 * @param typeManager used to resolve the parameterized map type
 * @return the type to use for the column in the ORC file
 */
static Type toOrcFileType(Type raptorType, TypeManager typeManager) {
    // Timestamps are written as BIGINT to avoid the poor timestamp encoding in ORC
    if (raptorType == TimestampType.TIMESTAMP) {
        return BIGINT;
    }

    if (raptorType instanceof ArrayType) {
        ArrayType arrayType = (ArrayType) raptorType;
        return new ArrayType(toOrcFileType(arrayType.getElementType(), typeManager));
    }

    if (raptorType instanceof MapType) {
        MapType mapType = (MapType) raptorType;
        // Key is converted before value, matching the declaration order of the map type
        TypeSignature keySignature = toOrcFileType(mapType.getKeyType(), typeManager).getTypeSignature();
        TypeSignature valueSignature = toOrcFileType(mapType.getValueType(), typeManager).getTypeSignature();
        List<TypeSignatureParameter> mapParameters = ImmutableList.of(
                TypeSignatureParameter.of(keySignature),
                TypeSignatureParameter.of(valueSignature));
        return typeManager.getParameterizedType(StandardTypes.MAP, mapParameters);
    }

    if (raptorType instanceof RowType) {
        // Rebuild the row field by field, keeping each field's name and converting its type
        ImmutableList.Builder<RowType.Field> convertedFields = ImmutableList.builder();
        for (RowType.Field field : ((RowType) raptorType).getFields()) {
            convertedFields.add(new RowType.Field(field.getName(), toOrcFileType(field.getType(), typeManager)));
        }
        return RowType.from(convertedFields.build());
    }

    return raptorType;
}
Also used : ArrayType(com.facebook.presto.common.type.ArrayType) CharType.createCharType(com.facebook.presto.common.type.CharType.createCharType) Page(com.facebook.presto.common.Page) DateTimeZone(org.joda.time.DateTimeZone) VarcharType.createUnboundedVarcharType(com.facebook.presto.common.type.VarcharType.createUnboundedVarcharType) FileSystem(org.apache.hadoop.fs.FileSystem) TypeSignature(com.facebook.presto.common.type.TypeSignature) RaptorColumnHandle.isShardRowIdColumn(com.facebook.presto.raptor.RaptorColumnHandle.isShardRowIdColumn) Future(java.util.concurrent.Future) Map(java.util.Map) OrcDataSource(com.facebook.presto.orc.OrcDataSource) ColumnInfo(com.facebook.presto.raptor.metadata.ColumnInfo) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) RAPTOR_LOCAL_DISK_FULL(com.facebook.presto.raptor.RaptorErrorCode.RAPTOR_LOCAL_DISK_FULL) RAPTOR_RECOVERY_ERROR(com.facebook.presto.raptor.RaptorErrorCode.RAPTOR_RECOVERY_ERROR) HiveFileContext(com.facebook.presto.hive.HiveFileContext) JsonCodec.jsonCodec(com.facebook.airlift.json.JsonCodec.jsonCodec) NULL_COLUMN(com.facebook.presto.raptor.storage.OrcPageSource.NULL_COLUMN) ORC(com.facebook.presto.orc.OrcEncoding.ORC) StandardTypes(com.facebook.presto.common.type.StandardTypes) DecimalType(com.facebook.presto.common.type.DecimalType) ColumnStats(com.facebook.presto.raptor.metadata.ColumnStats) TypeSignatureParameter(com.facebook.presto.common.type.TypeSignatureParameter) DwrfKeyProvider(com.facebook.presto.orc.DwrfKeyProvider) OrcReaderOptions(com.facebook.presto.orc.OrcReaderOptions) OrcOptimizedWriterStage(com.facebook.presto.raptor.storage.StorageManagerConfig.OrcOptimizedWriterStage) ArrayList(java.util.ArrayList) ROWID_COLUMN(com.facebook.presto.raptor.storage.OrcPageSource.ROWID_COLUMN) OptionalLong(java.util.OptionalLong) MoreFutures.allAsList(com.facebook.airlift.concurrent.MoreFutures.allAsList) 
BOOLEAN(com.facebook.presto.common.type.BooleanType.BOOLEAN) ArrayType(com.facebook.presto.common.type.ArrayType) RaptorColumnHandle(com.facebook.presto.raptor.RaptorColumnHandle) OrcWriterStats(com.facebook.presto.orc.OrcWriterStats) ColumnReference(com.facebook.presto.orc.TupleDomainOrcPredicate.ColumnReference) BIGINT(com.facebook.presto.common.type.BigintType.BIGINT) StorageStripeMetadataSource(com.facebook.presto.orc.StorageStripeMetadataSource) IOException(java.io.IOException) UTC(org.joda.time.DateTimeZone.UTC) Throwables.throwIfInstanceOf(com.google.common.base.Throwables.throwIfInstanceOf) ExecutionException(java.util.concurrent.ExecutionException) RAPTOR_ERROR(com.facebook.presto.raptor.RaptorErrorCode.RAPTOR_ERROR) BUCKET_NUMBER_COLUMN(com.facebook.presto.raptor.storage.OrcPageSource.BUCKET_NUMBER_COLUMN) RaptorConnectorId(com.facebook.presto.raptor.RaptorConnectorId) OrcReader(com.facebook.presto.orc.OrcReader) RowType(com.facebook.presto.common.type.RowType) JsonCodec(com.facebook.airlift.json.JsonCodec) TupleDomainOrcPredicate(com.facebook.presto.orc.TupleDomainOrcPredicate) TimeoutException(java.util.concurrent.TimeoutException) DataSink(com.facebook.presto.common.io.DataSink) Duration(io.airlift.units.Duration) PreDestroy(javax.annotation.PreDestroy) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) RaptorOrcAggregatedMemoryContext(com.facebook.presto.raptor.RaptorOrcAggregatedMemoryContext) Path(org.apache.hadoop.fs.Path) RuntimeStats(com.facebook.presto.common.RuntimeStats) RaptorColumnHandle.isBucketNumberColumn(com.facebook.presto.raptor.RaptorColumnHandle.isBucketNumberColumn) HdfsContext(com.facebook.presto.hive.HdfsContext) StripeMetadataSourceFactory(com.facebook.presto.orc.StripeMetadataSourceFactory) ImmutableSet(com.google.common.collect.ImmutableSet) NodeManager(com.facebook.presto.spi.NodeManager) ImmutableMap(com.google.common.collect.ImmutableMap) DOUBLE(com.facebook.presto.common.type.DoubleType.DOUBLE) 
INITIAL_BATCH_SIZE(com.facebook.presto.orc.OrcReader.INITIAL_BATCH_SIZE) RAPTOR_RECOVERY_TIMEOUT(com.facebook.presto.raptor.RaptorErrorCode.RAPTOR_RECOVERY_TIMEOUT) OrcPredicate(com.facebook.presto.orc.OrcPredicate) UUID(java.util.UUID) RaptorColumnHandle.isHiddenColumn(com.facebook.presto.raptor.RaptorColumnHandle.isHiddenColumn) Math.min(java.lang.Math.min) ENABLED_AND_VALIDATED(com.facebook.presto.raptor.storage.StorageManagerConfig.OrcOptimizedWriterStage.ENABLED_AND_VALIDATED) String.format(java.lang.String.format) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Threads.daemonThreadsNamed(com.facebook.airlift.concurrent.Threads.daemonThreadsNamed) DataSize(io.airlift.units.DataSize) List(java.util.List) CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) Optional(java.util.Optional) OrcBatchRecordReader(com.facebook.presto.orc.OrcBatchRecordReader) TimestampType(com.facebook.presto.common.type.TimestampType) BackupManager(com.facebook.presto.raptor.backup.BackupManager) FileSystemUtil.xxhash64(com.facebook.presto.raptor.filesystem.FileSystemUtil.xxhash64) MapType(com.facebook.presto.common.type.MapType) CompletableFuture(java.util.concurrent.CompletableFuture) BackupStore(com.facebook.presto.raptor.backup.BackupStore) PrestoException(com.facebook.presto.spi.PrestoException) OptionalInt(java.util.OptionalInt) VarcharType.createVarcharType(com.facebook.presto.common.type.VarcharType.createVarcharType) Inject(javax.inject.Inject) ImmutableList(com.google.common.collect.ImmutableList) TypeManager(com.facebook.presto.common.type.TypeManager) Objects.requireNonNull(java.util.Objects.requireNonNull) OrcType(com.facebook.presto.orc.metadata.OrcType) Math.toIntExact(java.lang.Math.toIntExact) OrcFileTailSource(com.facebook.presto.orc.cache.OrcFileTailSource) Type(com.facebook.presto.common.type.Type) ExecutorService(java.util.concurrent.ExecutorService) NamedTypeSignature(com.facebook.presto.common.type.NamedTypeSignature) 
OrcAggregatedMemoryContext(com.facebook.presto.orc.OrcAggregatedMemoryContext) RaptorColumnHandle.isShardUuidColumn(com.facebook.presto.raptor.RaptorColumnHandle.isShardUuidColumn) PETABYTE(io.airlift.units.DataSize.Unit.PETABYTE) NO_ENCRYPTION(com.facebook.presto.orc.DwrfEncryptionProvider.NO_ENCRYPTION) VARBINARY(com.facebook.presto.common.type.VarbinaryType.VARBINARY) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) ShardInfo(com.facebook.presto.raptor.metadata.ShardInfo) Executors.newFixedThreadPool(java.util.concurrent.Executors.newFixedThreadPool) ShardRecorder(com.facebook.presto.raptor.metadata.ShardRecorder) TimeUnit(java.util.concurrent.TimeUnit) ShardStats.computeColumnStats(com.facebook.presto.raptor.storage.ShardStats.computeColumnStats) Collectors.toList(java.util.stream.Collectors.toList) ConnectorPageSource(com.facebook.presto.spi.ConnectorPageSource) Executors.newCachedThreadPool(java.util.concurrent.Executors.newCachedThreadPool) Closeable(java.io.Closeable) SHARD_UUID_COLUMN(com.facebook.presto.raptor.storage.OrcPageSource.SHARD_UUID_COLUMN) RowFieldName(com.facebook.presto.common.type.RowFieldName) VisibleForTesting(com.google.common.annotations.VisibleForTesting) BitSet(java.util.BitSet) Block(com.facebook.presto.common.block.Block) CharType.createCharType(com.facebook.presto.common.type.CharType.createCharType) VarcharType.createUnboundedVarcharType(com.facebook.presto.common.type.VarcharType.createUnboundedVarcharType) DecimalType(com.facebook.presto.common.type.DecimalType) ArrayType(com.facebook.presto.common.type.ArrayType) RowType(com.facebook.presto.common.type.RowType) TimestampType(com.facebook.presto.common.type.TimestampType) MapType(com.facebook.presto.common.type.MapType) VarcharType.createVarcharType(com.facebook.presto.common.type.VarcharType.createVarcharType) OrcType(com.facebook.presto.orc.metadata.OrcType) Type(com.facebook.presto.common.type.Type) TypeSignature(com.facebook.presto.common.type.TypeSignature) 
NamedTypeSignature(com.facebook.presto.common.type.NamedTypeSignature) RowType(com.facebook.presto.common.type.RowType) MapType(com.facebook.presto.common.type.MapType)

Example 10 with ORC

use of com.facebook.presto.orc.OrcEncoding.ORC in project presto by prestodb.

the class OrcFileRewriter method rewrite.

/**
 * Rewrites the ORC file at {@code input} into {@code output}, applying the deletion vector
 * {@code rowsToDelete} and keeping only the columns that are present both in
 * {@code allColumnTypes} and in the input file.
 *
 * <p>Columns are written in ascending numeric order of their ids (the keys of
 * {@code allColumnTypes} are parsed as {@code long} values). If the input file carries
 * {@code OrcFileMetadata} user metadata, equivalent metadata is generated for the written
 * columns.
 *
 * @param fileSystem file system used to open the input and create the output
 * @param allColumnTypes column id to type mapping of the columns that should be kept
 * @param input path of the existing ORC file
 * @param output path of the file to write
 * @param rowsToDelete deletion vector; its length may not exceed the file's row count
 * @return the result of the row-level rewrite, or {@code new OrcFileInfo(0, 0)} when every
 *         row is deleted or when no requested column exists in the input file
 * @throws IOException if the file has fewer rows than the deletion vector, has
 *         {@code Integer.MAX_VALUE} or more rows, or reading/writing fails
 * @throws PrestoException with {@code NOT_SUPPORTED} if a {@code NotSupportedException} is raised
 */
public OrcFileInfo rewrite(FileSystem fileSystem, Map<String, Type> allColumnTypes, Path input, Path output, BitSet rowsToDelete) throws IOException {
    try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(FileSystem.class.getClassLoader());
        OrcDataSource dataSource = orcDataEnvironment.createOrcDataSource(fileSystem, input, readerAttributes)) {
        OrcReader reader = new OrcReader(dataSource, ORC, orcFileTailSource, stripeMetadataSourceFactory, new RaptorOrcAggregatedMemoryContext(), new OrcReaderOptions(readerAttributes.getMaxMergeDistance(), readerAttributes.getTinyStripeThreshold(), HUGE_MAX_READ_BLOCK_SIZE, readerAttributes.isZstdJniDecompressionEnabled()), false, NO_ENCRYPTION, DwrfKeyProvider.EMPTY, new RuntimeStats());

        // Read the row count once; it is validated against the deletion vector and the int range
        long totalRowCount = reader.getFooter().getNumberOfRows();
        if (totalRowCount < rowsToDelete.length()) {
            throw new IOException("File has fewer rows than deletion vector");
        }
        int deleteRowCount = rowsToDelete.cardinality();
        if (totalRowCount == deleteRowCount) {
            // Every row is deleted; nothing to write
            return new OrcFileInfo(0, 0);
        }
        if (totalRowCount >= Integer.MAX_VALUE) {
            throw new IOException("File has too many rows");
        }
        int inputRowCount = toIntExact(totalRowCount);

        // Column name -> ordinal position in the input file
        Map<String, Integer> currentColumnIds = IntStream.range(0, reader.getColumnNames().size()).boxed().collect(toMap(reader.getColumnNames()::get, i -> i));
        ImmutableList.Builder<Type> writerColumnTypesBuilder = ImmutableList.builder();
        ImmutableList.Builder<String> writerColumnIdsBuilder = ImmutableList.builder();
        ImmutableList.Builder<Integer> readerColumnIndexBuilder = ImmutableList.builder();
        // Build columns for writer; keep the right ordinal (column ids sorted numerically)
        Map<String, Type> orderedAllColumnTypes = new TreeMap<>(Comparator.comparingLong(Long::parseLong));
        orderedAllColumnTypes.putAll(allColumnTypes);
        for (Map.Entry<String, Type> columnType : orderedAllColumnTypes.entrySet()) {
            // Get the intersection of the provided columns and the actual columns
            Integer currentColumnIndex = currentColumnIds.get(columnType.getKey());
            if (currentColumnIndex != null) {
                readerColumnIndexBuilder.add(currentColumnIndex);
                writerColumnTypesBuilder.add(columnType.getValue());
                writerColumnIdsBuilder.add(columnType.getKey());
            }
        }
        List<Type> writerColumnTypes = writerColumnTypesBuilder.build();
        List<String> writerColumnIds = writerColumnIdsBuilder.build();
        List<Integer> readerColumnIndex = readerColumnIndexBuilder.build();
        if (writerColumnTypes.isEmpty()) {
            // no intersection; directly return
            return new OrcFileInfo(0, 0);
        }

        // Input file ordinal -> type of the column to read at that ordinal
        Map<Integer, Type> readerColumns = IntStream.range(0, readerColumnIndex.size()).boxed().collect(toMap(readerColumnIndex::get, writerColumnTypes::get));

        // A single converter serves both the writer storage types and the reader storage types
        StorageTypeConverter converter = new StorageTypeConverter(typeManager);
        List<Type> writerStorageTypes = writerColumnTypes.stream().map(converter::toStorageType).collect(toImmutableList());
        long start = System.nanoTime();
        Map<String, String> userMetadata = ImmutableMap.of();
        if (reader.getFooter().getUserMetadata().containsKey(OrcFileMetadata.KEY)) {
            // build metadata if the original file has it
            ImmutableMap.Builder<Long, TypeSignature> metadataBuilder = ImmutableMap.builder();
            for (int i = 0; i < writerColumnIds.size(); i++) {
                metadataBuilder.put(Long.parseLong(writerColumnIds.get(i)), writerColumnTypes.get(i).getTypeSignature());
            }
            userMetadata = ImmutableMap.of(OrcFileMetadata.KEY, METADATA_CODEC.toJson(new OrcFileMetadata(metadataBuilder.build())));
        }
        try (Closer<OrcBatchRecordReader, IOException> recordReader = closer(reader.createBatchRecordReader(converter.toStorageTypes(readerColumns), TRUE, DEFAULT_STORAGE_TIMEZONE, new RaptorOrcAggregatedMemoryContext(), INITIAL_BATCH_SIZE), OrcBatchRecordReader::close);
            Closer<OrcWriter, IOException> writer = closer(new OrcWriter(orcDataEnvironment.createOrcDataSink(fileSystem, output), writerColumnIds, writerStorageTypes, ORC, compression, Optional.empty(), NO_ENCRYPTION, getDefaultOrcWriterOptions(), userMetadata, DEFAULT_STORAGE_TIMEZONE, validate, HASHED, stats), OrcWriter::close)) {
            // Reuse the already-built index list instead of building it a second time
            OrcFileInfo fileInfo = rewrite(recordReader.get(), writer.get(), rowsToDelete, writerColumnTypes, readerColumnIndex);
            log.debug("Rewrote file %s in %s (input rows: %s, output rows: %s)", input.getName(), nanosSince(start), inputRowCount, inputRowCount - deleteRowCount);
            return fileInfo;
        }
    } catch (NotSupportedException e) {
        throw new PrestoException(NOT_SUPPORTED, e.getMessage(), e);
    }
}
Also used : JsonCodec(com.facebook.airlift.json.JsonCodec) Page(com.facebook.presto.common.Page) NotSupportedException(com.facebook.presto.common.NotSupportedException) DEFAULT_STORAGE_TIMEZONE(com.facebook.presto.raptor.storage.OrcStorageManager.DEFAULT_STORAGE_TIMEZONE) FileSystem(org.apache.hadoop.fs.FileSystem) OrcWriter(com.facebook.presto.orc.OrcWriter) TypeSignature(com.facebook.presto.common.type.TypeSignature) Closer.closer(com.facebook.presto.raptor.util.Closer.closer) Collectors.toMap(java.util.stream.Collectors.toMap) HUGE_MAX_READ_BLOCK_SIZE(com.facebook.presto.raptor.storage.OrcStorageManager.HUGE_MAX_READ_BLOCK_SIZE) Duration.nanosSince(io.airlift.units.Duration.nanosSince) Map(java.util.Map) RaptorOrcAggregatedMemoryContext(com.facebook.presto.raptor.RaptorOrcAggregatedMemoryContext) Path(org.apache.hadoop.fs.Path) RuntimeStats(com.facebook.presto.common.RuntimeStats) OrcDataSource(com.facebook.presto.orc.OrcDataSource) StripeMetadataSourceFactory(com.facebook.presto.orc.StripeMetadataSourceFactory) WriterStats(com.facebook.presto.orc.WriterStats) ImmutableMap(com.google.common.collect.ImmutableMap) Closer(com.facebook.presto.raptor.util.Closer) INITIAL_BATCH_SIZE(com.facebook.presto.orc.OrcReader.INITIAL_BATCH_SIZE) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) JsonCodec.jsonCodec(com.facebook.airlift.json.JsonCodec.jsonCodec) ORC(com.facebook.presto.orc.OrcEncoding.ORC) List(java.util.List) TRUE(com.facebook.presto.orc.OrcPredicate.TRUE) NOT_SUPPORTED(com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED) CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) Optional(java.util.Optional) OrcWriterOptions.getDefaultOrcWriterOptions(com.facebook.presto.orc.OrcWriterOptions.getDefaultOrcWriterOptions) OrcBatchRecordReader(com.facebook.presto.orc.OrcBatchRecordReader) IntStream(java.util.stream.IntStream) Logger(com.facebook.airlift.log.Logger) 
DwrfKeyProvider(com.facebook.presto.orc.DwrfKeyProvider) OrcReaderOptions(com.facebook.presto.orc.OrcReaderOptions) PrestoException(com.facebook.presto.spi.PrestoException) InterruptedIOException(java.io.InterruptedIOException) ImmutableList(com.google.common.collect.ImmutableList) TypeManager(com.facebook.presto.common.type.TypeManager) Objects.requireNonNull(java.util.Objects.requireNonNull) ThreadContextClassLoader(com.facebook.presto.spi.classloader.ThreadContextClassLoader) Math.toIntExact(java.lang.Math.toIntExact) OrcFileTailSource(com.facebook.presto.orc.cache.OrcFileTailSource) HASHED(com.facebook.presto.orc.OrcWriteValidation.OrcWriteValidationMode.HASHED) Type(com.facebook.presto.common.type.Type) NO_ENCRYPTION(com.facebook.presto.orc.DwrfEncryptionProvider.NO_ENCRYPTION) IOException(java.io.IOException) TreeMap(java.util.TreeMap) BitSet(java.util.BitSet) Block(com.facebook.presto.common.block.Block) OrcReader(com.facebook.presto.orc.OrcReader) Comparator(java.util.Comparator) RuntimeStats(com.facebook.presto.common.RuntimeStats) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList) OrcWriter(com.facebook.presto.orc.OrcWriter) PrestoException(com.facebook.presto.spi.PrestoException) RaptorOrcAggregatedMemoryContext(com.facebook.presto.raptor.RaptorOrcAggregatedMemoryContext) TypeSignature(com.facebook.presto.common.type.TypeSignature) OrcReaderOptions(com.facebook.presto.orc.OrcReaderOptions) FileSystem(org.apache.hadoop.fs.FileSystem) ThreadContextClassLoader(com.facebook.presto.spi.classloader.ThreadContextClassLoader) OrcDataSource(com.facebook.presto.orc.OrcDataSource) OrcBatchRecordReader(com.facebook.presto.orc.OrcBatchRecordReader) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException) TreeMap(java.util.TreeMap) ImmutableMap(com.google.common.collect.ImmutableMap) Type(com.facebook.presto.common.type.Type) 
OrcReader(com.facebook.presto.orc.OrcReader) NotSupportedException(com.facebook.presto.common.NotSupportedException) Collectors.toMap(java.util.stream.Collectors.toMap) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) TreeMap(java.util.TreeMap)

Aggregations

ORC (com.facebook.presto.orc.OrcEncoding.ORC)10 ImmutableMap (com.google.common.collect.ImmutableMap)10 IOException (java.io.IOException)10 List (java.util.List)10 Map (java.util.Map)8 Path (org.apache.hadoop.fs.Path)8 RuntimeStats (com.facebook.presto.common.RuntimeStats)7 INITIAL_BATCH_SIZE (com.facebook.presto.orc.OrcReader.INITIAL_BATCH_SIZE)7 CompressionKind (com.facebook.presto.orc.metadata.CompressionKind)7 ImmutableList (com.google.common.collect.ImmutableList)7 Page (com.facebook.presto.common.Page)6 Block (com.facebook.presto.common.block.Block)6 BIGINT (com.facebook.presto.common.type.BigintType.BIGINT)6 NO_ENCRYPTION (com.facebook.presto.orc.DwrfEncryptionProvider.NO_ENCRYPTION)6 ConnectorSession (com.facebook.presto.spi.ConnectorSession)6 DataSize (io.airlift.units.DataSize)6 ArrayList (java.util.ArrayList)6 Type (com.facebook.presto.common.type.Type)5 TypeManager (com.facebook.presto.common.type.TypeManager)5 VARCHAR (com.facebook.presto.common.type.VarcharType.VARCHAR)5