Search in sources :

Example 1 with PARQUET

use of com.facebook.presto.hive.HiveStorageFormat.PARQUET in project presto by prestodb.

the class HiveSplitManager method getPartitionMetadata.

private Iterable<HivePartitionMetadata> getPartitionMetadata(SemiTransactionalHiveMetastore metastore, Table table, SchemaTableName tableName, List<HivePartition> hivePartitions, Optional<HiveBucketHandle> hiveBucketHandle, ConnectorSession session, WarningCollector warningCollector, Optional<Set<HiveColumnHandle>> requestedColumns, Map<String, HiveColumnHandle> predicateColumns, Optional<Map<Subfield, Domain>> domains) {
    if (hivePartitions.isEmpty()) {
        return ImmutableList.of();
    }
    Optional<Set<HiveColumnHandle>> allRequestedColumns = mergeRequestedAndPredicateColumns(requestedColumns, ImmutableSet.copyOf(predicateColumns.values()));
    if (hivePartitions.size() == 1) {
        HivePartition firstPartition = getOnlyElement(hivePartitions);
        if (firstPartition.getPartitionId().equals(UNPARTITIONED_ID)) {
            return ImmutableList.of(new HivePartitionMetadata(firstPartition, Optional.empty(), TableToPartitionMapping.empty(), encryptionInformationProvider.getReadEncryptionInformation(session, table, allRequestedColumns), ImmutableSet.of()));
        }
    }
    StorageFormat storageFormat = table.getStorage().getStorageFormat();
    Optional<HiveStorageFormat> hiveStorageFormat = getHiveStorageFormat(storageFormat);
    Optional<HiveStorageFormat> resolvedHiveStorageFormat;
    if (isUseParquetColumnNames(session)) {
        // Use Hive Storage Format as Parquet if table is of HUDI format
        resolvedHiveStorageFormat = (!hiveStorageFormat.isPresent() && isHudiFormat(storageFormat)) ? Optional.of(PARQUET) : hiveStorageFormat;
    } else {
        resolvedHiveStorageFormat = hiveStorageFormat;
    }
    Iterable<List<HivePartition>> partitionNameBatches = partitionExponentially(hivePartitions, minPartitionBatchSize, maxPartitionBatchSize);
    Iterable<List<HivePartitionMetadata>> partitionBatches = transform(partitionNameBatches, partitionBatch -> {
        Map<String, PartitionSplitInfo> partitionSplitInfo = getPartitionSplitInfo(session, metastore, tableName, partitionBatch, predicateColumns, domains);
        if (partitionBatch.size() != partitionSplitInfo.size()) {
            throw new PrestoException(GENERIC_INTERNAL_ERROR, format("Expected %s partitions but found %s", partitionBatch.size(), partitionSplitInfo.size()));
        }
        Map<String, Partition> partitions = partitionSplitInfo.entrySet().stream().collect(toImmutableMap(Entry::getKey, entry -> entry.getValue().getPartition()));
        Optional<Map<String, EncryptionInformation>> encryptionInformationForPartitions = encryptionInformationProvider.getReadEncryptionInformation(session, table, allRequestedColumns, partitions);
        ImmutableList.Builder<HivePartitionMetadata> results = ImmutableList.builder();
        Map<String, Set<String>> partitionsNotReadable = new HashMap<>();
        int unreadablePartitionsSkipped = 0;
        for (HivePartition hivePartition : partitionBatch) {
            Partition partition = partitions.get(hivePartition.getPartitionId());
            if (partitionSplitInfo.get(hivePartition.getPartitionId()).isPruned()) {
                continue;
            }
            if (partition == null) {
                throw new PrestoException(GENERIC_INTERNAL_ERROR, "Partition not loaded: " + hivePartition);
            }
            String partitionName = makePartName(table.getPartitionColumns(), partition.getValues());
            Optional<EncryptionInformation> encryptionInformation = encryptionInformationForPartitions.map(metadata -> metadata.get(hivePartition.getPartitionId()));
            if (!isOfflineDataDebugModeEnabled(session)) {
                // verify partition is online
                verifyOnline(tableName, Optional.of(partitionName), getProtectMode(partition), partition.getParameters());
                // verify partition is not marked as non-readable
                String reason = partition.getParameters().get(OBJECT_NOT_READABLE);
                if (!isNullOrEmpty(reason)) {
                    if (!shouldIgnoreUnreadablePartition(session) || !partition.isEligibleToIgnore()) {
                        throw new HiveNotReadableException(tableName, Optional.of(partitionName), reason);
                    }
                    unreadablePartitionsSkipped++;
                    if (partitionsNotReadable.size() <= 3) {
                        partitionsNotReadable.putIfAbsent(reason, new HashSet<>(ImmutableSet.of(partitionName)));
                        if (partitionsNotReadable.get(reason).size() <= 3) {
                            partitionsNotReadable.get(reason).add(partitionName);
                        }
                    }
                    continue;
                }
            }
            // Verify that the partition schema matches the table schema.
            // Either adding or dropping columns from the end of the table
            // without modifying existing partitions is allowed, but every
            // column that exists in both the table and partition must have
            // the same type.
            List<Column> tableColumns = table.getDataColumns();
            List<Column> partitionColumns = partition.getColumns();
            if ((tableColumns == null) || (partitionColumns == null)) {
                throw new PrestoException(HIVE_INVALID_METADATA, format("Table '%s' or partition '%s' has null columns", tableName, partitionName));
            }
            TableToPartitionMapping tableToPartitionMapping = getTableToPartitionMapping(session, resolvedHiveStorageFormat, tableName, partitionName, tableColumns, partitionColumns);
            if (hiveBucketHandle.isPresent() && !hiveBucketHandle.get().isVirtuallyBucketed()) {
                Optional<HiveBucketProperty> partitionBucketProperty = partition.getStorage().getBucketProperty();
                if (!partitionBucketProperty.isPresent()) {
                    throw new PrestoException(HIVE_PARTITION_SCHEMA_MISMATCH, format("Hive table (%s) is bucketed but partition (%s) is not bucketed", hivePartition.getTableName(), hivePartition.getPartitionId()));
                }
                int tableBucketCount = hiveBucketHandle.get().getTableBucketCount();
                int partitionBucketCount = partitionBucketProperty.get().getBucketCount();
                List<String> tableBucketColumns = hiveBucketHandle.get().getColumns().stream().map(HiveColumnHandle::getName).collect(toImmutableList());
                List<String> partitionBucketColumns = partitionBucketProperty.get().getBucketedBy();
                if (!tableBucketColumns.equals(partitionBucketColumns) || !isBucketCountCompatible(tableBucketCount, partitionBucketCount)) {
                    throw new PrestoException(HIVE_PARTITION_SCHEMA_MISMATCH, format("Hive table (%s) bucketing (columns=%s, buckets=%s) is not compatible with partition (%s) bucketing (columns=%s, buckets=%s)", hivePartition.getTableName(), tableBucketColumns, tableBucketCount, hivePartition.getPartitionId(), partitionBucketColumns, partitionBucketCount));
                }
            }
            results.add(new HivePartitionMetadata(hivePartition, Optional.of(partition), tableToPartitionMapping, encryptionInformation, partitionSplitInfo.get(hivePartition.getPartitionId()).getRedundantColumnDomains()));
        }
        if (unreadablePartitionsSkipped > 0) {
            StringBuilder warningMessage = new StringBuilder(format("Table '%s' has %s out of %s partitions unreadable: ", tableName, unreadablePartitionsSkipped, partitionBatch.size()));
            for (Entry<String, Set<String>> entry : partitionsNotReadable.entrySet()) {
                warningMessage.append(String.join(", ", entry.getValue())).append("... are due to ").append(entry.getKey()).append(". ");
            }
            warningCollector.add(new PrestoWarning(PARTITION_NOT_READABLE, warningMessage.toString()));
        }
        return results.build();
    });
    return concat(partitionBatches);
}
Also used : WarningCollector(com.facebook.presto.spi.WarningCollector) HiveStorageFormat.getHiveStorageFormat(com.facebook.presto.hive.HiveStorageFormat.getHiveStorageFormat) MetastoreUtil.makePartName(com.facebook.presto.hive.metastore.MetastoreUtil.makePartName) ConnectorSplitSource(com.facebook.presto.spi.ConnectorSplitSource) CounterStat(com.facebook.airlift.stats.CounterStat) MetastoreContext(com.facebook.presto.hive.metastore.MetastoreContext) HiveSessionProperties.isOfflineDataDebugModeEnabled(com.facebook.presto.hive.HiveSessionProperties.isOfflineDataDebugModeEnabled) GENERIC_INTERNAL_ERROR(com.facebook.presto.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR) HIVE_PARTITION_SCHEMA_MISMATCH(com.facebook.presto.hive.HiveErrorCode.HIVE_PARTITION_SCHEMA_MISMATCH) HiveSessionProperties.isUseParquetColumnNames(com.facebook.presto.hive.HiveSessionProperties.isUseParquetColumnNames) SERVER_SHUTTING_DOWN(com.facebook.presto.spi.StandardErrorCode.SERVER_SHUTTING_DOWN) ConnectorTransactionHandle(com.facebook.presto.spi.connector.ConnectorTransactionHandle) IntegerStatistics(com.facebook.presto.hive.metastore.IntegerStatistics) Map(java.util.Map) DecimalStatistics(com.facebook.presto.hive.metastore.DecimalStatistics) ENGLISH(java.util.Locale.ENGLISH) StorageFormat(com.facebook.presto.hive.metastore.StorageFormat) HIVE_INVALID_METADATA(com.facebook.presto.hive.HiveErrorCode.HIVE_INVALID_METADATA) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) Decimals.encodeScaledValue(com.facebook.presto.common.type.Decimals.encodeScaledValue) SemiTransactionalHiveMetastore(com.facebook.presto.hive.metastore.SemiTransactionalHiveMetastore) HiveSessionProperties.getHiveMaxInitialSplitSize(com.facebook.presto.hive.HiveSessionProperties.getHiveMaxInitialSplitSize) ConnectorSession(com.facebook.presto.spi.ConnectorSession) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Stream(java.util.stream.Stream) Decimals.isShortDecimal(com.facebook.presto.common.type.Decimals.isShortDecimal) TableToPartitionMapping.mapColumnsByIndex(com.facebook.presto.hive.TableToPartitionMapping.mapColumnsByIndex) MetastoreUtil.getMetastoreHeaders(com.facebook.presto.hive.metastore.MetastoreUtil.getMetastoreHeaders) Iterables(com.google.common.collect.Iterables) Table(com.facebook.presto.hive.metastore.Table) REGULAR(com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR) HiveSessionProperties.shouldIgnoreUnreadablePartition(com.facebook.presto.hive.HiveSessionProperties.shouldIgnoreUnreadablePartition) Collectors.groupingBy(java.util.stream.Collectors.groupingBy) Strings.isNullOrEmpty(com.google.common.base.Strings.isNullOrEmpty) HiveColumnStatistics(com.facebook.presto.hive.metastore.HiveColumnStatistics) HiveSessionProperties.isPartitionStatisticsBasedOptimizationEnabled(com.facebook.presto.hive.HiveSessionProperties.isPartitionStatisticsBasedOptimizationEnabled) Lists(com.google.common.collect.Lists) Managed(org.weakref.jmx.Managed) PrestoWarning(com.facebook.presto.spi.PrestoWarning) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) HiveBucketFilter(com.facebook.presto.hive.HiveBucketing.HiveBucketFilter) Executor(java.util.concurrent.Executor) DoubleStatistics(com.facebook.presto.hive.metastore.DoubleStatistics) AbstractIterator(com.google.common.collect.AbstractIterator) Iterables.getOnlyElement(com.google.common.collect.Iterables.getOnlyElement) Collectors.reducing(java.util.stream.Collectors.reducing) Domain(com.facebook.presto.common.predicate.Domain) ColumnHandle(com.facebook.presto.spi.ColumnHandle) PartitionStatistics(com.facebook.presto.hive.metastore.PartitionStatistics) BucketSplitInfo.createBucketSplitInfo(com.facebook.presto.hive.StoragePartitionLoader.BucketSplitInfo.createBucketSplitInfo) ValueSet(com.facebook.presto.common.predicate.ValueSet) Iterables.transform(com.google.common.collect.Iterables.transform) HoodieParquetRealtimeInputFormat(org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat) SortedRangeSet(com.facebook.presto.common.predicate.SortedRangeSet) Float.floatToIntBits(java.lang.Float.floatToIntBits) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) SchemaTableName(com.facebook.presto.spi.SchemaTableName) ParquetHiveSerDe(org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe) Iterables.concat(com.google.common.collect.Iterables.concat) HiveType.getPrimitiveType(com.facebook.presto.hive.HiveType.getPrimitiveType) PrimitiveTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo) GROUPED_SCHEDULING(com.facebook.presto.spi.connector.ConnectorSplitManager.SplitSchedulingStrategy.GROUPED_SCHEDULING) ConnectorSplitManager(com.facebook.presto.spi.connector.ConnectorSplitManager) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) HiveSessionProperties.getLeaseDuration(com.facebook.presto.hive.HiveSessionProperties.getLeaseDuration) Math.min(java.lang.Math.min) String.format(java.lang.String.format) Range(com.facebook.presto.common.predicate.Range) DateStatistics(com.facebook.presto.hive.metastore.DateStatistics) DataSize(io.airlift.units.DataSize) List(java.util.List) Entry(java.util.Map.Entry) Optional(java.util.Optional) Nested(org.weakref.jmx.Nested) MetastoreUtil.getProtectMode(com.facebook.presto.hive.metastore.MetastoreUtil.getProtectMode) Column(com.facebook.presto.hive.metastore.Column) ConnectorTableLayoutHandle(com.facebook.presto.spi.ConnectorTableLayoutHandle) HIVE_PARTITION_DROPPED_DURING_QUERY(com.facebook.presto.hive.HiveErrorCode.HIVE_PARTITION_DROPPED_DURING_QUERY) HashMap(java.util.HashMap) PrestoException(com.facebook.presto.spi.PrestoException) HiveColumnHandle.isPathColumnHandle(com.facebook.presto.hive.HiveColumnHandle.isPathColumnHandle) PARQUET(com.facebook.presto.hive.HiveStorageFormat.PARQUET) Partition(com.facebook.presto.hive.metastore.Partition) Inject(javax.inject.Inject) HashSet(java.util.HashSet) UNPARTITIONED_ID(com.facebook.presto.hive.HivePartition.UNPARTITIONED_ID) RejectedExecutionException(java.util.concurrent.RejectedExecutionException) BoundedExecutor(com.facebook.airlift.concurrent.BoundedExecutor) Subfield(com.facebook.presto.common.Subfield) ImmutableList(com.google.common.collect.ImmutableList) PARTITION_NOT_READABLE(com.facebook.presto.hive.HiveWarningCode.PARTITION_NOT_READABLE) Objects.requireNonNull(java.util.Objects.requireNonNull) MetastoreUtil.verifyOnline(com.facebook.presto.hive.metastore.MetastoreUtil.verifyOnline) HoodieParquetInputFormat(org.apache.hudi.hadoop.HoodieParquetInputFormat) Type(com.facebook.presto.common.type.Type) ExecutorService(java.util.concurrent.ExecutorService) HIVE_TRANSACTION_NOT_FOUND(com.facebook.presto.hive.HiveErrorCode.HIVE_TRANSACTION_NOT_FOUND) Iterator(java.util.Iterator) FixedSplitSource(com.facebook.presto.spi.FixedSplitSource) PRIMITIVE(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category.PRIMITIVE) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) TableNotFoundException(com.facebook.presto.spi.TableNotFoundException) Ordering(com.google.common.collect.Ordering) MetastoreUtil.isUserDefinedTypeEncodingEnabled(com.facebook.presto.hive.metastore.MetastoreUtil.isUserDefinedTypeEncodingEnabled) VisibleForTesting(com.google.common.annotations.VisibleForTesting) Set(java.util.Set) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) ValueSet(com.facebook.presto.common.predicate.ValueSet) SortedRangeSet(com.facebook.presto.common.predicate.SortedRangeSet) ImmutableSet(com.google.common.collect.ImmutableSet) HashSet(java.util.HashSet) HashMap(java.util.HashMap) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList) PrestoException(com.facebook.presto.spi.PrestoException) HiveStorageFormat.getHiveStorageFormat(com.facebook.presto.hive.HiveStorageFormat.getHiveStorageFormat) StorageFormat(com.facebook.presto.hive.metastore.StorageFormat) HiveStorageFormat.getHiveStorageFormat(com.facebook.presto.hive.HiveStorageFormat.getHiveStorageFormat) Column(com.facebook.presto.hive.metastore.Column) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) HiveSessionProperties.shouldIgnoreUnreadablePartition(com.facebook.presto.hive.HiveSessionProperties.shouldIgnoreUnreadablePartition) Partition(com.facebook.presto.hive.metastore.Partition) PrestoWarning(com.facebook.presto.spi.PrestoWarning) Map(java.util.Map) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap)

Example 2 with PARQUET

use of com.facebook.presto.hive.HiveStorageFormat.PARQUET in project presto by prestodb.

the class TestHiveFileFormats method testParquetPageSourceSchemaEvolution.

@Test(dataProvider = "rowCount")
public void testParquetPageSourceSchemaEvolution(int rowCount) throws Exception {
    List<TestColumn> writeColumns = getTestColumnsSupportedByParquet();
    // test index-based access
    List<TestColumn> readColumns = writeColumns.stream().map(column -> new TestColumn(column.getName() + "_new", column.getObjectInspector(), column.getWriteValue(), column.getExpectedValue(), column.isPartitionKey())).collect(toList());
    assertThatFileFormat(PARQUET).withWriteColumns(writeColumns).withReadColumns(readColumns).withSession(parquetPageSourceSession).withRowsCount(rowCount).isReadableByPageSource(new ParquetPageSourceFactory(FUNCTION_AND_TYPE_MANAGER, FUNCTION_RESOLUTION, HDFS_ENVIRONMENT, STATS, METADATA_READER));
    // test name-based access
    readColumns = Lists.reverse(writeColumns);
    assertThatFileFormat(PARQUET).withWriteColumns(writeColumns).withReadColumns(readColumns).withSession(parquetPageSourceSessionUseName).isReadableByPageSource(new ParquetPageSourceFactory(FUNCTION_AND_TYPE_MANAGER, FUNCTION_RESOLUTION, HDFS_ENVIRONMENT, STATS, METADATA_READER));
}
Also used : RecordPageSource(com.facebook.presto.spi.RecordPageSource) DateTimeZone(org.joda.time.DateTimeZone) Arrays(java.util.Arrays) VarcharType.createUnboundedVarcharType(com.facebook.presto.common.type.VarcharType.createUnboundedVarcharType) PrimitiveObjectInspectorFactory.javaLongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaLongObjectInspector) Test(org.testng.annotations.Test) HIVE_PARTITION_SCHEMA_MISMATCH(com.facebook.presto.hive.HiveErrorCode.HIVE_PARTITION_SCHEMA_MISMATCH) PrimitiveObjectInspectorFactory.javaTimestampObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaTimestampObjectInspector) FileSplit(org.apache.hadoop.mapred.FileSplit) Slices(io.airlift.slice.Slices) Configuration(org.apache.hadoop.conf.Configuration) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) LzoCodec(io.airlift.compress.lzo.LzoCodec) DwrfBatchPageSourceFactory(com.facebook.presto.hive.orc.DwrfBatchPageSourceFactory) PrimitiveObjectInspectorFactory.javaFloatObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaFloatObjectInspector) PrimitiveObjectInspectorFactory.javaDoubleObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDoubleObjectInspector) StorageFormat(com.facebook.presto.hive.metastore.StorageFormat) MapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector) StructuralTestUtil.rowBlockOf(com.facebook.presto.tests.StructuralTestUtil.rowBlockOf) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ConnectorSession(com.facebook.presto.spi.ConnectorSession) INTEGER(com.facebook.presto.common.type.IntegerType.INTEGER) HIVE_CLIENT_CONFIG(com.facebook.presto.hive.HiveTestUtils.HIVE_CLIENT_CONFIG) Iterables.filter(com.google.common.collect.Iterables.filter) PrimitiveObjectInspectorFactory.javaIntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaIntObjectInspector) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) MetadataReader(com.facebook.presto.parquet.cache.MetadataReader) ListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector) PrimitiveCategory(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory) FUNCTION_RESOLUTION(com.facebook.presto.hive.HiveTestUtils.FUNCTION_RESOLUTION) ROW_EXPRESSION_SERVICE(com.facebook.presto.hive.HiveTestUtils.ROW_EXPRESSION_SERVICE) Lists(com.google.common.collect.Lists) RCTEXT(com.facebook.presto.hive.HiveStorageFormat.RCTEXT) BOOLEAN(com.facebook.presto.common.type.BooleanType.BOOLEAN) CSV(com.facebook.presto.hive.HiveStorageFormat.CSV) ArrayType(com.facebook.presto.common.type.ArrayType) JSON(com.facebook.presto.hive.HiveStorageFormat.JSON) BIGINT(com.facebook.presto.common.type.BigintType.BIGINT) StorageStripeMetadataSource(com.facebook.presto.orc.StorageStripeMetadataSource) IOException(java.io.IOException) ObjectInspectorFactory.getStandardStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardStructObjectInspector) TestingConnectorSession(com.facebook.presto.testing.TestingConnectorSession) File(java.io.File) FUNCTION_AND_TYPE_MANAGER(com.facebook.presto.hive.HiveTestUtils.FUNCTION_AND_TYPE_MANAGER) RCBINARY(com.facebook.presto.hive.HiveStorageFormat.RCBINARY) HDFS_ENVIRONMENT(com.facebook.presto.hive.HiveTestUtils.HDFS_ENVIRONMENT) ParquetPageSourceFactory(com.facebook.presto.hive.parquet.ParquetPageSourceFactory) ParquetFileWriterFactory(com.facebook.presto.hive.parquet.ParquetFileWriterFactory) RowType(com.facebook.presto.common.type.RowType) ORC(com.facebook.presto.hive.HiveStorageFormat.ORC) CacheConfig(com.facebook.presto.cache.CacheConfig) SchemaTableName(com.facebook.presto.spi.SchemaTableName) AVRO(com.facebook.presto.hive.HiveStorageFormat.AVRO) TEXTFILE(com.facebook.presto.hive.HiveStorageFormat.TEXTFILE) StripeMetadataSourceFactory(com.facebook.presto.orc.StripeMetadataSourceFactory) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) DOUBLE(com.facebook.presto.common.type.DoubleType.DOUBLE) TimeZone(java.util.TimeZone) BeforeClass(org.testng.annotations.BeforeClass) DWRF(com.facebook.presto.hive.HiveStorageFormat.DWRF) Assert.assertNotNull(org.testng.Assert.assertNotNull) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) TRUE_CONSTANT(com.facebook.presto.expressions.LogicalRowExpressions.TRUE_CONSTANT) RecordCursor(com.facebook.presto.spi.RecordCursor) List(java.util.List) StructuralTestUtil.arrayBlockOf(com.facebook.presto.tests.StructuralTestUtil.arrayBlockOf) VarcharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo) Optional(java.util.Optional) ErrorCodeSupplier(com.facebook.presto.spi.ErrorCodeSupplier) DataProvider(org.testng.annotations.DataProvider) Column(com.facebook.presto.hive.metastore.Column) PrimitiveObjectInspectorFactory.javaBooleanObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaBooleanObjectInspector) HiveTestUtils.getTypes(com.facebook.presto.hive.HiveTestUtils.getTypes) HIVE_INVALID_PARTITION_VALUE(com.facebook.presto.hive.HiveErrorCode.HIVE_INVALID_PARTITION_VALUE) RcFilePageSourceFactory(com.facebook.presto.hive.rcfile.RcFilePageSourceFactory) Assert.assertEquals(org.testng.Assert.assertEquals) PrestoException(com.facebook.presto.spi.PrestoException) OptionalInt(java.util.OptionalInt) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector) LzopCodec(io.airlift.compress.lzo.LzopCodec) PARQUET(com.facebook.presto.hive.HiveStorageFormat.PARQUET) NO_ENCRYPTION(com.facebook.presto.hive.HiveDwrfEncryptionProvider.NO_ENCRYPTION) StructuralTestUtil.mapBlockOf(com.facebook.presto.tests.StructuralTestUtil.mapBlockOf) HiveVarchar(org.apache.hadoop.hive.common.type.HiveVarchar) ImmutableList(com.google.common.collect.ImmutableList) PrimitiveObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector) SESSION(com.facebook.presto.hive.HiveTestUtils.SESSION) Objects.requireNonNull(java.util.Objects.requireNonNull) DEFAULT_HIVE_FILE_CONTEXT(com.facebook.presto.hive.HiveFileContext.DEFAULT_HIVE_FILE_CONTEXT) SEQUENCEFILE(com.facebook.presto.hive.HiveStorageFormat.SEQUENCEFILE) ObjectInspectorFactory.getStandardMapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardMapObjectInspector) Storage(com.facebook.presto.hive.metastore.Storage) Assert.fail(org.testng.Assert.fail) OrcBatchPageSourceFactory(com.facebook.presto.hive.orc.OrcBatchPageSourceFactory) OutputStreamDataSinkFactory(com.facebook.presto.hive.datasink.OutputStreamDataSinkFactory) ObjectInspectorFactory.getStandardListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardListObjectInspector) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) StorageOrcFileTailSource(com.facebook.presto.orc.cache.StorageOrcFileTailSource) Collectors.toList(java.util.stream.Collectors.toList) ConnectorPageSource(com.facebook.presto.spi.ConnectorPageSource) Assert.assertTrue(org.testng.Assert.assertTrue) PrimitiveObjectInspectorFactory.javaStringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaStringObjectInspector) ParquetPageSourceFactory(com.facebook.presto.hive.parquet.ParquetPageSourceFactory) Test(org.testng.annotations.Test)

Example 3 with PARQUET

use of com.facebook.presto.hive.HiveStorageFormat.PARQUET in project presto by prestodb.

the class TestHiveFileFormats method testOptimizedParquetWriter.

@Test(dataProvider = "rowCount")
public void testOptimizedParquetWriter(int rowCount) throws Exception {
    TestingConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveClientConfig().setOrcOptimizedWriterEnabled(true).setOrcWriterValidationPercentage(100.0), new OrcFileWriterConfig(), new ParquetFileWriterConfig().setParquetOptimizedWriterEnabled(true), new CacheConfig()).getSessionProperties());
    // A Presto page can not contain a map with null keys, so a page based writer can not write null keys
    List<TestColumn> testColumns = TEST_COLUMNS.stream().filter(testColumn -> !testColumn.getName().equals("t_map_null_key") && !testColumn.getName().equals("t_map_null_key_complex_value") && !testColumn.getName().equals("t_map_null_key_complex_key_value")).collect(toList());
    assertThatFileFormat(PARQUET).withSession(session).withColumns(testColumns).withRowsCount(rowCount).withFileWriterFactory(new ParquetFileWriterFactory(HDFS_ENVIRONMENT, FUNCTION_AND_TYPE_MANAGER, new NodeVersion("test"), HIVE_STORAGE_TIME_ZONE)).isReadableByPageSource(new ParquetPageSourceFactory(FUNCTION_AND_TYPE_MANAGER, FUNCTION_RESOLUTION, HDFS_ENVIRONMENT, STATS, METADATA_READER));
}
Also used : RecordPageSource(com.facebook.presto.spi.RecordPageSource) DateTimeZone(org.joda.time.DateTimeZone) Arrays(java.util.Arrays) VarcharType.createUnboundedVarcharType(com.facebook.presto.common.type.VarcharType.createUnboundedVarcharType) PrimitiveObjectInspectorFactory.javaLongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaLongObjectInspector) Test(org.testng.annotations.Test) HIVE_PARTITION_SCHEMA_MISMATCH(com.facebook.presto.hive.HiveErrorCode.HIVE_PARTITION_SCHEMA_MISMATCH) PrimitiveObjectInspectorFactory.javaTimestampObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaTimestampObjectInspector) FileSplit(org.apache.hadoop.mapred.FileSplit) Slices(io.airlift.slice.Slices) Configuration(org.apache.hadoop.conf.Configuration) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) LzoCodec(io.airlift.compress.lzo.LzoCodec) DwrfBatchPageSourceFactory(com.facebook.presto.hive.orc.DwrfBatchPageSourceFactory) PrimitiveObjectInspectorFactory.javaFloatObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaFloatObjectInspector) PrimitiveObjectInspectorFactory.javaDoubleObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDoubleObjectInspector) StorageFormat(com.facebook.presto.hive.metastore.StorageFormat) MapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector) StructuralTestUtil.rowBlockOf(com.facebook.presto.tests.StructuralTestUtil.rowBlockOf) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ConnectorSession(com.facebook.presto.spi.ConnectorSession) INTEGER(com.facebook.presto.common.type.IntegerType.INTEGER) HIVE_CLIENT_CONFIG(com.facebook.presto.hive.HiveTestUtils.HIVE_CLIENT_CONFIG) Iterables.filter(com.google.common.collect.Iterables.filter) PrimitiveObjectInspectorFactory.javaIntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaIntObjectInspector) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) MetadataReader(com.facebook.presto.parquet.cache.MetadataReader) ListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector) PrimitiveCategory(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory) FUNCTION_RESOLUTION(com.facebook.presto.hive.HiveTestUtils.FUNCTION_RESOLUTION) ROW_EXPRESSION_SERVICE(com.facebook.presto.hive.HiveTestUtils.ROW_EXPRESSION_SERVICE) Lists(com.google.common.collect.Lists) RCTEXT(com.facebook.presto.hive.HiveStorageFormat.RCTEXT) BOOLEAN(com.facebook.presto.common.type.BooleanType.BOOLEAN) CSV(com.facebook.presto.hive.HiveStorageFormat.CSV) ArrayType(com.facebook.presto.common.type.ArrayType) JSON(com.facebook.presto.hive.HiveStorageFormat.JSON) BIGINT(com.facebook.presto.common.type.BigintType.BIGINT) StorageStripeMetadataSource(com.facebook.presto.orc.StorageStripeMetadataSource) IOException(java.io.IOException) ObjectInspectorFactory.getStandardStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardStructObjectInspector) TestingConnectorSession(com.facebook.presto.testing.TestingConnectorSession) File(java.io.File) FUNCTION_AND_TYPE_MANAGER(com.facebook.presto.hive.HiveTestUtils.FUNCTION_AND_TYPE_MANAGER) RCBINARY(com.facebook.presto.hive.HiveStorageFormat.RCBINARY) HDFS_ENVIRONMENT(com.facebook.presto.hive.HiveTestUtils.HDFS_ENVIRONMENT) ParquetPageSourceFactory(com.facebook.presto.hive.parquet.ParquetPageSourceFactory) ParquetFileWriterFactory(com.facebook.presto.hive.parquet.ParquetFileWriterFactory) RowType(com.facebook.presto.common.type.RowType) ORC(com.facebook.presto.hive.HiveStorageFormat.ORC) CacheConfig(com.facebook.presto.cache.CacheConfig) SchemaTableName(com.facebook.presto.spi.SchemaTableName) AVRO(com.facebook.presto.hive.HiveStorageFormat.AVRO) TEXTFILE(com.facebook.presto.hive.HiveStorageFormat.TEXTFILE) StripeMetadataSourceFactory(com.facebook.presto.orc.StripeMetadataSourceFactory) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) DOUBLE(com.facebook.presto.common.type.DoubleType.DOUBLE) TimeZone(java.util.TimeZone) BeforeClass(org.testng.annotations.BeforeClass) DWRF(com.facebook.presto.hive.HiveStorageFormat.DWRF) Assert.assertNotNull(org.testng.Assert.assertNotNull) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) TRUE_CONSTANT(com.facebook.presto.expressions.LogicalRowExpressions.TRUE_CONSTANT) RecordCursor(com.facebook.presto.spi.RecordCursor) List(java.util.List) StructuralTestUtil.arrayBlockOf(com.facebook.presto.tests.StructuralTestUtil.arrayBlockOf) VarcharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo) Optional(java.util.Optional) ErrorCodeSupplier(com.facebook.presto.spi.ErrorCodeSupplier) DataProvider(org.testng.annotations.DataProvider) Column(com.facebook.presto.hive.metastore.Column) PrimitiveObjectInspectorFactory.javaBooleanObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaBooleanObjectInspector) HiveTestUtils.getTypes(com.facebook.presto.hive.HiveTestUtils.getTypes) HIVE_INVALID_PARTITION_VALUE(com.facebook.presto.hive.HiveErrorCode.HIVE_INVALID_PARTITION_VALUE) RcFilePageSourceFactory(com.facebook.presto.hive.rcfile.RcFilePageSourceFactory) Assert.assertEquals(org.testng.Assert.assertEquals) PrestoException(com.facebook.presto.spi.PrestoException) OptionalInt(java.util.OptionalInt) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector) LzopCodec(io.airlift.compress.lzo.LzopCodec) PARQUET(com.facebook.presto.hive.HiveStorageFormat.PARQUET) NO_ENCRYPTION(com.facebook.presto.hive.HiveDwrfEncryptionProvider.NO_ENCRYPTION) StructuralTestUtil.mapBlockOf(com.facebook.presto.tests.StructuralTestUtil.mapBlockOf) HiveVarchar(org.apache.hadoop.hive.common.type.HiveVarchar) ImmutableList(com.google.common.collect.ImmutableList) PrimitiveObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector) SESSION(com.facebook.presto.hive.HiveTestUtils.SESSION) Objects.requireNonNull(java.util.Objects.requireNonNull) DEFAULT_HIVE_FILE_CONTEXT(com.facebook.presto.hive.HiveFileContext.DEFAULT_HIVE_FILE_CONTEXT) SEQUENCEFILE(com.facebook.presto.hive.HiveStorageFormat.SEQUENCEFILE) ObjectInspectorFactory.getStandardMapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardMapObjectInspector) Storage(com.facebook.presto.hive.metastore.Storage) Assert.fail(org.testng.Assert.fail) OrcBatchPageSourceFactory(com.facebook.presto.hive.orc.OrcBatchPageSourceFactory) OutputStreamDataSinkFactory(com.facebook.presto.hive.datasink.OutputStreamDataSinkFactory) ObjectInspectorFactory.getStandardListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardListObjectInspector) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) StorageOrcFileTailSource(com.facebook.presto.orc.cache.StorageOrcFileTailSource) Collectors.toList(java.util.stream.Collectors.toList) ConnectorPageSource(com.facebook.presto.spi.ConnectorPageSource) Assert.assertTrue(org.testng.Assert.assertTrue) PrimitiveObjectInspectorFactory.javaStringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaStringObjectInspector) TestingConnectorSession(com.facebook.presto.testing.TestingConnectorSession) ParquetFileWriterFactory(com.facebook.presto.hive.parquet.ParquetFileWriterFactory) CacheConfig(com.facebook.presto.cache.CacheConfig) ParquetPageSourceFactory(com.facebook.presto.hive.parquet.ParquetPageSourceFactory) Test(org.testng.annotations.Test)

Aggregations

TupleDomain (com.facebook.presto.common.predicate.TupleDomain)3 HIVE_PARTITION_SCHEMA_MISMATCH (com.facebook.presto.hive.HiveErrorCode.HIVE_PARTITION_SCHEMA_MISMATCH)3 PARQUET (com.facebook.presto.hive.HiveStorageFormat.PARQUET)3 Column (com.facebook.presto.hive.metastore.Column)3 StorageFormat (com.facebook.presto.hive.metastore.StorageFormat)3 ConnectorSession (com.facebook.presto.spi.ConnectorSession)3 PrestoException (com.facebook.presto.spi.PrestoException)3 SchemaTableName (com.facebook.presto.spi.SchemaTableName)3 ImmutableList (com.google.common.collect.ImmutableList)3 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)3 ImmutableMap (com.google.common.collect.ImmutableMap)3 ImmutableSet (com.google.common.collect.ImmutableSet)3 Lists (com.google.common.collect.Lists)3 List (java.util.List)3 Objects.requireNonNull (java.util.Objects.requireNonNull)3 Optional (java.util.Optional)3 CacheConfig (com.facebook.presto.cache.CacheConfig)2 ArrayType (com.facebook.presto.common.type.ArrayType)2 BIGINT (com.facebook.presto.common.type.BigintType.BIGINT)2 BOOLEAN (com.facebook.presto.common.type.BooleanType.BOOLEAN)2