
Example 71 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project presto by prestodb.

In class TestTupleDomainParquetPredicate, method testDouble:

@Test
public void testDouble() throws Exception {
    ColumnDescriptor columnDescriptor = createColumnDescriptor(PrimitiveTypeName.DOUBLE, "DoubleColumn");
    assertEquals(getDomain(columnDescriptor, DOUBLE, 0, null, ID), Domain.all(DOUBLE));
    assertEquals(getDomain(columnDescriptor, DOUBLE, 10, doubleColumnStats(42.24, 42.24), ID), singleValue(DOUBLE, 42.24));
    assertEquals(getDomain(columnDescriptor, DOUBLE, 10, doubleColumnStats(3.3, 42.24), ID), create(ValueSet.ofRanges(range(DOUBLE, 3.3, true, 42.24, true)), false));
    assertEquals(getDomain(columnDescriptor, DOUBLE, 10, doubleColumnStats(NaN, NaN), ID), Domain.notNull(DOUBLE));
    assertEquals(getDomain(columnDescriptor, DOUBLE, 10, doubleColumnStats(NaN, NaN, true), ID), Domain.all(DOUBLE));
    assertEquals(getDomain(columnDescriptor, DOUBLE, 10, doubleColumnStats(3.3, NaN), ID), Domain.notNull(DOUBLE));
    assertEquals(getDomain(columnDescriptor, DOUBLE, 10, doubleColumnStats(3.3, NaN, true), ID), Domain.all(DOUBLE));
    assertEquals(getDomain(DOUBLE, doubleDictionaryDescriptor(NaN)), Domain.all(DOUBLE));
    assertEquals(getDomain(DOUBLE, doubleDictionaryDescriptor(3.3, NaN)), Domain.all(DOUBLE));
    // fail on corrupted statistics
    assertThatExceptionOfType(ParquetCorruptionException.class).isThrownBy(() -> getDomain(columnDescriptor, DOUBLE, 10, doubleColumnStats(42.24, 3.3), ID)).withMessage("Corrupted statistics for column \"[] required double DoubleColumn\" in Parquet file \"testFile\": [min: 42.24, max: 3.3, num_nulls: 0]");
}
Also used : ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) Test(org.testng.annotations.Test)
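
The createColumnDescriptor helper itself is not shown on this page. A minimal sketch of what it plausibly does, inferred from the corrupted-statistics message above ("[] required double DoubleColumn" suggests an empty path and a REQUIRED primitive); the helper name and the zero repetition/definition levels are assumptions, not the verified Presto source:

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import static org.apache.parquet.schema.Type.Repetition.REQUIRED;

// Hypothetical reconstruction of the test helper: an empty path and a REQUIRED
// primitive leave both the max repetition and max definition levels at 0.
static ColumnDescriptor createColumnDescriptor(PrimitiveTypeName typeName, String columnName) {
    return new ColumnDescriptor(new String[] {}, new PrimitiveType(REQUIRED, typeName, columnName), 0, 0);
}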

Example 72 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project presto by prestodb.

In class ParquetPageSourceFactory, method createParquetPageSource:

public static ConnectorPageSource createParquetPageSource(HdfsEnvironment hdfsEnvironment, String user, Configuration configuration, Path path, long start, long length, long fileSize, List<HiveColumnHandle> columns, SchemaTableName tableName, boolean useParquetColumnNames, DataSize maxReadBlockSize, boolean batchReaderEnabled, boolean verificationEnabled, TypeManager typeManager, StandardFunctionResolution functionResolution, TupleDomain<HiveColumnHandle> effectivePredicate, FileFormatDataSourceStats stats, HiveFileContext hiveFileContext, ParquetMetadataSource parquetMetadataSource, boolean columnIndexFilterEnabled) {
    AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext();
    ParquetDataSource dataSource = null;
    try {
        FSDataInputStream inputStream = hdfsEnvironment.getFileSystem(user, path, configuration).openFile(path, hiveFileContext);
        dataSource = buildHdfsParquetDataSource(inputStream, path, stats);
        ParquetMetadata parquetMetadata = parquetMetadataSource.getParquetMetadata(dataSource, fileSize, hiveFileContext.isCacheable()).getParquetMetadata();
        if (!columns.isEmpty() && columns.stream().allMatch(hiveColumnHandle -> hiveColumnHandle.getColumnType() == AGGREGATED)) {
            return new AggregatedParquetPageSource(columns, parquetMetadata, typeManager, functionResolution);
        }
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        Optional<MessageType> message = columns.stream().filter(column -> column.getColumnType() == REGULAR || isPushedDownSubfield(column)).map(column -> getColumnType(typeManager.getType(column.getTypeSignature()), fileSchema, useParquetColumnNames, column, tableName, path)).filter(Optional::isPresent).map(Optional::get).map(type -> new MessageType(fileSchema.getName(), type)).reduce(MessageType::union);
        MessageType requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));
        ImmutableList.Builder<BlockMetaData> footerBlocks = ImmutableList.builder();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= start && firstDataPage < start + length) {
                footerBlocks.add(block);
            }
        }
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath);
        final ParquetDataSource finalDataSource = dataSource;
        ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
        List<ColumnIndexStore> blockIndexStores = new ArrayList<>();
        for (BlockMetaData block : footerBlocks.build()) {
            Optional<ColumnIndexStore> columnIndexStore = ColumnIndexFilterUtils.getColumnIndexStore(parquetPredicate, finalDataSource, block, descriptorsByPath, columnIndexFilterEnabled);
            if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain, columnIndexStore, columnIndexFilterEnabled)) {
                blocks.add(block);
                blockIndexStores.add(columnIndexStore.orElse(null));
                hiveFileContext.incrementCounter("parquet.blocksRead", 1);
                hiveFileContext.incrementCounter("parquet.rowsRead", block.getRowCount());
                hiveFileContext.incrementCounter("parquet.totalBytesRead", block.getTotalByteSize());
            } else {
                hiveFileContext.incrementCounter("parquet.blocksSkipped", 1);
                hiveFileContext.incrementCounter("parquet.rowsSkipped", block.getRowCount());
                hiveFileContext.incrementCounter("parquet.totalBytesSkipped", block.getTotalByteSize());
            }
        }
        MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
        ParquetReader parquetReader = new ParquetReader(messageColumnIO, blocks.build(), dataSource, systemMemoryContext, maxReadBlockSize, batchReaderEnabled, verificationEnabled, parquetPredicate, blockIndexStores, columnIndexFilterEnabled);
        ImmutableList.Builder<String> namesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Optional<Field>> fieldsBuilder = ImmutableList.builder();
        for (HiveColumnHandle column : columns) {
            checkArgument(column.getColumnType() == REGULAR || column.getColumnType() == SYNTHESIZED, "column type must be regular or synthesized column");
            String name = column.getName();
            Type type = typeManager.getType(column.getTypeSignature());
            namesBuilder.add(name);
            typesBuilder.add(type);
            if (column.getColumnType() == SYNTHESIZED) {
                Subfield pushedDownSubfield = getPushedDownSubfield(column);
                List<String> nestedColumnPath = nestedColumnPath(pushedDownSubfield);
                Optional<ColumnIO> columnIO = findNestedColumnIO(lookupColumnByName(messageColumnIO, pushedDownSubfield.getRootName()), nestedColumnPath);
                if (columnIO.isPresent()) {
                    fieldsBuilder.add(constructField(type, columnIO.get()));
                } else {
                    fieldsBuilder.add(Optional.empty());
                }
            } else if (getParquetType(type, fileSchema, useParquetColumnNames, column, tableName, path).isPresent()) {
                String columnName = useParquetColumnNames ? name : fileSchema.getFields().get(column.getHiveColumnIndex()).getName();
                fieldsBuilder.add(constructField(type, lookupColumnByName(messageColumnIO, columnName)));
            } else {
                fieldsBuilder.add(Optional.empty());
            }
        }
        return new ParquetPageSource(parquetReader, typesBuilder.build(), fieldsBuilder.build(), namesBuilder.build(), hiveFileContext.getStats());
    } catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        } catch (IOException ignored) {
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        if (e instanceof ParquetCorruptionException) {
            throw new PrestoException(HIVE_BAD_DATA, e);
        }
        if (e instanceof AccessControlException) {
            throw new PrestoException(PERMISSION_DENIED, e.getMessage(), e);
        }
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e.getClass().getSimpleName().equals("BlockMissingException")) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) DateTimeZone(org.joda.time.DateTimeZone) TINYINT(com.facebook.presto.common.type.StandardTypes.TINYINT) HIVE_PARTITION_SCHEMA_MISMATCH(com.facebook.presto.hive.HiveErrorCode.HIVE_PARTITION_SCHEMA_MISMATCH) HiveSessionProperties.isUseParquetColumnNames(com.facebook.presto.hive.HiveSessionProperties.isUseParquetColumnNames) ROW(com.facebook.presto.common.type.StandardTypes.ROW) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) AGGREGATED(com.facebook.presto.hive.HiveColumnHandle.ColumnType.AGGREGATED) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) FileFormatDataSourceStats(com.facebook.presto.hive.FileFormatDataSourceStats) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ParquetDataSource(com.facebook.presto.parquet.ParquetDataSource) ParquetMetadataSource(com.facebook.presto.parquet.cache.ParquetMetadataSource) Set(java.util.Set) HiveFileContext(com.facebook.presto.hive.HiveFileContext) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) ARRAY(com.facebook.presto.common.type.StandardTypes.ARRAY) ConnectorSession(com.facebook.presto.spi.ConnectorSession) CHAR(com.facebook.presto.common.type.StandardTypes.CHAR) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetTypeUtils.columnPathFromSubfield(com.facebook.presto.parquet.ParquetTypeUtils.columnPathFromSubfield) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) HiveBatchPageSourceFactory(com.facebook.presto.hive.HiveBatchPageSourceFactory) ColumnIO(org.apache.parquet.io.ColumnIO) HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) StandardTypes(com.facebook.presto.common.type.StandardTypes) REGULAR(com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR) ArrayList(java.util.ArrayList) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName) HIVE_BAD_DATA(com.facebook.presto.hive.HiveErrorCode.HIVE_BAD_DATA) DATE(com.facebook.presto.common.type.StandardTypes.DATE) IOException(java.io.IOException) ParquetTypeUtils.nestedColumnPath(com.facebook.presto.parquet.ParquetTypeUtils.nestedColumnPath) Domain(com.facebook.presto.common.predicate.Domain) INTEGER(com.facebook.presto.common.type.StandardTypes.INTEGER) HiveColumnHandle.getPushedDownSubfield(com.facebook.presto.hive.HiveColumnHandle.getPushedDownSubfield) ParquetReader(com.facebook.presto.parquet.reader.ParquetReader) HiveSessionProperties.getParquetMaxReadBlockSize(com.facebook.presto.hive.HiveSessionProperties.getParquetMaxReadBlockSize) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) REAL(com.facebook.presto.common.type.StandardTypes.REAL) RowType(com.facebook.presto.common.type.RowType) ColumnIOConverter.constructField(org.apache.parquet.io.ColumnIOConverter.constructField) HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) HiveColumnHandle.isPushedDownSubfield(com.facebook.presto.hive.HiveColumnHandle.isPushedDownSubfield) StandardFunctionResolution(com.facebook.presto.spi.function.StandardFunctionResolution) ParquetTypeUtils.lookupColumnByName(com.facebook.presto.parquet.ParquetTypeUtils.lookupColumnByName) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) SchemaTableName(com.facebook.presto.spi.SchemaTableName) 
MAP(com.facebook.presto.common.type.StandardTypes.MAP) Locale(java.util.Locale) ParquetTypeUtils.getDescriptors(com.facebook.presto.parquet.ParquetTypeUtils.getDescriptors) Path(org.apache.hadoop.fs.Path) EncryptionInformation(com.facebook.presto.hive.EncryptionInformation) BIGINT(com.facebook.presto.common.type.StandardTypes.BIGINT) HiveSessionProperties.isParquetBatchReadsEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReadsEnabled) ImmutableSet(com.google.common.collect.ImmutableSet) GroupType(org.apache.parquet.schema.GroupType) ImmutableMap(com.google.common.collect.ImmutableMap) Collectors(java.util.stream.Collectors) ColumnIOConverter.findNestedColumnIO(org.apache.parquet.io.ColumnIOConverter.findNestedColumnIO) FileNotFoundException(java.io.FileNotFoundException) String.format(java.lang.String.format) ColumnIndexFilterUtils(com.facebook.presto.parquet.reader.ColumnIndexFilterUtils) VARCHAR(com.facebook.presto.common.type.StandardTypes.VARCHAR) MessageType(org.apache.parquet.schema.MessageType) DataSize(io.airlift.units.DataSize) List(java.util.List) HiveSessionProperties.isParquetBatchReaderVerificationEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetBatchReaderVerificationEnabled) Entry(java.util.Map.Entry) Optional(java.util.Optional) TIMESTAMP(com.facebook.presto.common.type.StandardTypes.TIMESTAMP) HiveColumnHandle(com.facebook.presto.hive.HiveColumnHandle) SMALLINT(com.facebook.presto.common.type.StandardTypes.SMALLINT) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) Strings.nullToEmpty(com.google.common.base.Strings.nullToEmpty) PredicateUtils.predicateMatches(com.facebook.presto.parquet.predicate.PredicateUtils.predicateMatches) PrestoException(com.facebook.presto.spi.PrestoException) HIVE_CANNOT_OPEN_SPLIT(com.facebook.presto.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) HIVE_MISSING_DATA(com.facebook.presto.hive.HiveErrorCode.HIVE_MISSING_DATA) ParquetTypeUtils.getSubfieldType(com.facebook.presto.parquet.ParquetTypeUtils.getSubfieldType) Inject(javax.inject.Inject) ParquetTypeUtils.getParquetTypeByName(com.facebook.presto.parquet.ParquetTypeUtils.getParquetTypeByName) DECIMAL(com.facebook.presto.common.type.StandardTypes.DECIMAL) Subfield(com.facebook.presto.common.Subfield) ImmutableList(com.google.common.collect.ImmutableList) TypeManager(com.facebook.presto.common.type.TypeManager) Objects.requireNonNull(java.util.Objects.requireNonNull) Predicate(com.facebook.presto.parquet.predicate.Predicate) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) PredicateUtils.buildPredicate(com.facebook.presto.parquet.predicate.PredicateUtils.buildPredicate) Type(com.facebook.presto.common.type.Type) VARBINARY(com.facebook.presto.common.type.StandardTypes.VARBINARY) Storage(com.facebook.presto.hive.metastore.Storage) ParquetTypeUtils.getColumnIO(com.facebook.presto.parquet.ParquetTypeUtils.getColumnIO) PRIMITIVE(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category.PRIMITIVE) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) AggregatedMemoryContext(com.facebook.presto.memory.context.AggregatedMemoryContext) PERMISSION_DENIED(com.facebook.presto.spi.StandardErrorCode.PERMISSION_DENIED) Field(com.facebook.presto.parquet.Field) ConnectorPageSource(com.facebook.presto.spi.ConnectorPageSource) AccessControlException(org.apache.hadoop.security.AccessControlException) 
SYNTHESIZED(com.facebook.presto.hive.HiveColumnHandle.ColumnType.SYNTHESIZED) HiveSessionProperties.columnIndexFilterEnabled(com.facebook.presto.hive.HiveSessionProperties.columnIndexFilterEnabled)
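
The row-group selection loop in createParquetPageSource keys each block off the first data page offset of its first column chunk. Restated as a tiny standalone predicate (hypothetical helper name; same logic as the loop above):

import org.apache.parquet.hadoop.metadata.BlockMetaData;

// A row group belongs to this split iff the first data page of its first column chunk
// starts inside [start, start + length); this way each row group is read by exactly one
// split even when the split boundary falls in the middle of the row group.
private static boolean blockBelongsToSplit(BlockMetaData block, long start, long length)
{
    long firstDataPageOffset = block.getColumns().get(0).getFirstDataPageOffset();
    return firstDataPageOffset >= start && firstDataPageOffset < start + length;
}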

Example 73 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project presto by prestodb.

In class IcebergPageSourceProvider, method getParquetTupleDomain:

private static TupleDomain<ColumnDescriptor> getParquetTupleDomain(Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<IcebergColumnHandle> effectivePredicate) {
    if (effectivePredicate.isNone()) {
        return TupleDomain.none();
    }
    ImmutableMap.Builder<ColumnDescriptor, Domain> predicate = ImmutableMap.builder();
    effectivePredicate.getDomains().get().forEach((columnHandle, domain) -> {
        String baseType = columnHandle.getType().getTypeSignature().getBase();
        // skip looking up predicates for complex types as Parquet only stores stats for primitives
        if (!baseType.equals(StandardTypes.MAP) && !baseType.equals(StandardTypes.ARRAY) && !baseType.equals(StandardTypes.ROW)) {
            RichColumnDescriptor descriptor = descriptorsByPath.get(ImmutableList.of(columnHandle.getName()));
            if (descriptor != null) {
                predicate.put(descriptor, domain);
            }
        }
    });
    return TupleDomain.withColumnDomains(predicate.build());
}
Also used : RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) Domain(com.facebook.presto.common.predicate.Domain) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) ImmutableMap(com.google.common.collect.ImmutableMap)
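
Note that only top-level primitive columns can contribute here: getDescriptors keys each leaf by its path in the file schema, while the lookup above uses a single-element key built from the Iceberg column name. An illustrative sketch with hypothetical column names ("id" as a top-level primitive, "address.zip" as a nested leaf):

Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
// Top-level primitive: keyed by its own name, so the predicate lookup can succeed.
RichColumnDescriptor idDescriptor = descriptorsByPath.get(ImmutableList.of("id"));
// Nested leaf: keyed by its full path, so the single-element key used above never finds it
// and no row-group pruning is attempted for predicates on nested fields.
RichColumnDescriptor zipDescriptor = descriptorsByPath.get(ImmutableList.of("address", "zip"));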

Example 74 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project presto by prestodb.

In class ParquetReader, method readPrimitive:

private ColumnChunk readPrimitive(PrimitiveField field) throws IOException {
    ColumnDescriptor columnDescriptor = field.getDescriptor();
    int fieldId = field.getId();
    ColumnReader columnReader = columnReaders[fieldId];
    if (!columnReader.isInitialized()) {
        validateParquet(currentBlockMetadata.getRowCount() > 0, "Row group has 0 rows");
        ColumnChunkMetaData metadata = getColumnChunkMetaData(columnDescriptor);
        long startingPosition = metadata.getStartingPos();
        int totalSize = toIntExact(metadata.getTotalSize());
        if (shouldUseColumnIndex(metadata.getPath())) {
            OffsetIndex offsetIndex = blockIndexStores.get(currentBlock).getOffsetIndex(metadata.getPath());
            OffsetIndex filteredOffsetIndex = ColumnIndexFilterUtils.filterOffsetIndex(offsetIndex, currentGroupRowRanges, blocks.get(currentBlock).getRowCount());
            List<OffsetRange> offsetRanges = ColumnIndexFilterUtils.calculateOffsetRanges(filteredOffsetIndex, metadata, offsetIndex.getOffset(0), startingPosition);
            List<OffsetRange> consecutiveRanges = concatRanges(offsetRanges);
            List<ByteBuffer> buffers = allocateBlocks(consecutiveRanges);
            for (int i = 0; i < consecutiveRanges.size(); i++) {
                ByteBuffer buffer = buffers.get(i);
                dataSource.readFully(startingPosition + consecutiveRanges.get(i).getOffset(), buffer.array());
            }
            PageReader pageReader = createPageReader(buffers, totalSize, metadata, columnDescriptor, filteredOffsetIndex);
            columnReader.init(pageReader, field, currentGroupRowRanges);
            if (enableVerification) {
                ColumnReader verificationColumnReader = verificationColumnReaders[field.getId()];
                PageReader pageReaderVerification = createPageReader(buffers, totalSize, metadata, columnDescriptor, filteredOffsetIndex);
                verificationColumnReader.init(pageReaderVerification, field, currentGroupRowRanges);
            }
        } else {
            byte[] buffer = allocateBlock(totalSize);
            dataSource.readFully(startingPosition, buffer);
            PageReader pageReader = createPageReader(buffer, totalSize, metadata, columnDescriptor);
            columnReader.init(pageReader, field, null);
            if (enableVerification) {
                ColumnReader verificationColumnReader = verificationColumnReaders[field.getId()];
                PageReader pageReaderVerification = createPageReader(buffer, totalSize, metadata, columnDescriptor);
                verificationColumnReader.init(pageReaderVerification, field, null);
            }
        }
    }
    ColumnChunk columnChunk = columnReader.readNext();
    columnChunk = typeCoercion(columnChunk, field.getDescriptor().getPrimitiveType().getPrimitiveTypeName(), field.getType());
    if (enableVerification) {
        ColumnReader verificationColumnReader = verificationColumnReaders[field.getId()];
        ColumnChunk expected = verificationColumnReader.readNext();
        ParquetResultVerifierUtils.verifyColumnChunks(columnChunk, expected, columnDescriptor.getPath().length > 1, field, dataSource.getId());
    }
    // update max size per primitive column chunk
    long bytesPerCell = columnChunk.getBlock().getSizeInBytes() / batchSize;
    if (maxBytesPerCell[fieldId] < bytesPerCell) {
        // update batch size
        maxCombinedBytesPerRow = maxCombinedBytesPerRow - maxBytesPerCell[fieldId] + bytesPerCell;
        maxBatchSize = toIntExact(min(maxBatchSize, max(1, maxReadBlockBytes / maxCombinedBytesPerRow)));
        maxBytesPerCell[fieldId] = bytesPerCell;
    }
    return columnChunk;
}
Also used : ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ByteBuffer(java.nio.ByteBuffer) OffsetRange(com.facebook.presto.parquet.reader.ColumnIndexFilterUtils.OffsetRange) ColumnReader(com.facebook.presto.parquet.ColumnReader) OffsetIndex(org.apache.parquet.internal.column.columnindex.OffsetIndex)
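
The final block of readPrimitive adapts the batch size to the largest cells seen so far. The cap can be restated as a standalone helper (hypothetical; same arithmetic as the code above). For example, with maxReadBlockBytes = 16 MB and roughly 2 KB estimated per row, the next batch is limited to 8192 rows:

// Never grow the batch, never drop below one row, and keep
// (rows per batch) * (estimated bytes per row) within the read-block budget.
static int capBatchSize(int currentMaxBatchSize, long maxReadBlockBytes, long maxCombinedBytesPerRow)
{
    return Math.toIntExact(Math.min(currentMaxBatchSize, Math.max(1L, maxReadBlockBytes / maxCombinedBytesPerRow)));
}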

Example 75 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project presto by prestodb.

In class TupleDomainParquetPredicate, method getDomain:

/**
 * Get a domain for the ranges defined by each pair of elements from {@code minimums} and {@code maximums}.
 * Both arrays must have the same length.
 */
private static Domain getDomain(ColumnDescriptor column, Type type, List<Object> minimums, List<Object> maximums, boolean hasNullValue) {
    checkArgument(minimums.size() == maximums.size(), "Expected minimums and maximums to have the same size");
    List<Range> ranges = new ArrayList<>();
    if (type.equals(BOOLEAN)) {
        boolean hasTrueValues = minimums.stream().anyMatch(value -> (boolean) value) || maximums.stream().anyMatch(value -> (boolean) value);
        boolean hasFalseValues = minimums.stream().anyMatch(value -> !(boolean) value) || maximums.stream().anyMatch(value -> !(boolean) value);
        if (hasTrueValues && hasFalseValues) {
            return Domain.all(type);
        }
        if (hasTrueValues) {
            return Domain.create(ValueSet.of(type, true), hasNullValue);
        }
        if (hasFalseValues) {
            return Domain.create(ValueSet.of(type, false), hasNullValue);
        }
        // All nulls case is handled earlier
        throw new VerifyException("Impossible boolean statistics");
    }
    if ((type.equals(BIGINT) || type.equals(TINYINT) || type.equals(SMALLINT) || type.equals(INTEGER))) {
        for (int i = 0; i < minimums.size(); i++) {
            long min = asLong(minimums.get(i));
            long max = asLong(maximums.get(i));
            if (isStatisticsOverflow(type, min, max)) {
                return Domain.create(ValueSet.all(type), hasNullValue);
            }
            ranges.add(Range.range(type, min, true, max, true));
        }
        checkArgument(!ranges.isEmpty(), "cannot use empty ranges");
        return Domain.create(ValueSet.ofRanges(ranges), hasNullValue);
    }
    if (type.equals(REAL)) {
        for (int i = 0; i < minimums.size(); i++) {
            Float min = (Float) minimums.get(i);
            Float max = (Float) maximums.get(i);
            if (min.isNaN() || max.isNaN()) {
                return Domain.create(ValueSet.all(type), hasNullValue);
            }
            ranges.add(Range.range(type, (long) floatToRawIntBits(min), true, (long) floatToRawIntBits(max), true));
        }
        checkArgument(!ranges.isEmpty(), "cannot use empty ranges");
        return Domain.create(ValueSet.ofRanges(ranges), hasNullValue);
    }
    if (type.equals(DOUBLE)) {
        for (int i = 0; i < minimums.size(); i++) {
            Double min = (Double) minimums.get(i);
            Double max = (Double) maximums.get(i);
            if (min.isNaN() || max.isNaN()) {
                return Domain.create(ValueSet.all(type), hasNullValue);
            }
            ranges.add(Range.range(type, min, true, max, true));
        }
        checkArgument(!ranges.isEmpty(), "cannot use empty ranges");
        return Domain.create(ValueSet.ofRanges(ranges), hasNullValue);
    }
    if (isVarcharType(type)) {
        for (int i = 0; i < minimums.size(); i++) {
            Slice min = Slices.wrappedBuffer(((Binary) minimums.get(i)).toByteBuffer());
            Slice max = Slices.wrappedBuffer(((Binary) maximums.get(i)).toByteBuffer());
            ranges.add(Range.range(type, min, true, max, true));
        }
        checkArgument(!ranges.isEmpty(), "cannot use empty ranges");
        return Domain.create(ValueSet.ofRanges(ranges), hasNullValue);
    }
    if (type.equals(DATE)) {
        for (int i = 0; i < minimums.size(); i++) {
            long min = asLong(minimums.get(i));
            long max = asLong(maximums.get(i));
            if (isStatisticsOverflow(type, min, max)) {
                return Domain.create(ValueSet.all(type), hasNullValue);
            }
            ranges.add(Range.range(type, min, true, max, true));
        }
        checkArgument(!ranges.isEmpty(), "cannot use empty ranges");
        return Domain.create(ValueSet.ofRanges(ranges), hasNullValue);
    }
    return Domain.create(ValueSet.all(type), hasNullValue);
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) PredicateUtils.isStatisticsOverflow(com.facebook.presto.parquet.predicate.PredicateUtils.isStatisticsOverflow) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) FilterApi(org.apache.parquet.filter2.predicate.FilterApi) ByteBuffer(java.nio.ByteBuffer) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Slices(io.airlift.slice.Slices) Map(java.util.Map) Varchars.isVarcharType(com.facebook.presto.common.type.Varchars.isVarcharType) UserDefinedPredicate(org.apache.parquet.filter2.predicate.UserDefinedPredicate) DOUBLE(com.facebook.presto.common.type.DoubleType.DOUBLE) BINARY(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) String.format(java.lang.String.format) Range(com.facebook.presto.common.predicate.Range) Binary(org.apache.parquet.io.api.Binary) Serializable(java.io.Serializable) INT64(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64) List(java.util.List) LITTLE_ENDIAN(java.nio.ByteOrder.LITTLE_ENDIAN) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) INTEGER(com.facebook.presto.common.type.IntegerType.INTEGER) Optional(java.util.Optional) DictionaryPage(com.facebook.presto.parquet.DictionaryPage) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Slice(io.airlift.slice.Slice) TINYINT(com.facebook.presto.common.type.TinyintType.TINYINT) HashMap(java.util.HashMap) INT32(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32) Function(java.util.function.Function) DATE(com.facebook.presto.common.type.DateType.DATE) REAL(com.facebook.presto.common.type.RealType.REAL) ArrayList(java.util.ArrayList) Float.floatToRawIntBits(java.lang.Float.floatToRawIntBits) ImmutableList(com.google.common.collect.ImmutableList) Objects.requireNonNull(java.util.Objects.requireNonNull) ParquetDataSourceId(com.facebook.presto.parquet.ParquetDataSourceId) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName) BOOLEAN(com.facebook.presto.common.type.BooleanType.BOOLEAN) Type(com.facebook.presto.common.type.Type) VerifyException(com.google.common.base.VerifyException) Statistics(org.apache.parquet.column.statistics.Statistics) BIGINT(com.facebook.presto.common.type.BigintType.BIGINT) Domain(com.facebook.presto.common.predicate.Domain) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) Dictionary(com.facebook.presto.parquet.dictionary.Dictionary) FLOAT(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT) SMALLINT(com.facebook.presto.common.type.SmallintType.SMALLINT) VisibleForTesting(com.google.common.annotations.VisibleForTesting) ValueSet(com.facebook.presto.common.predicate.ValueSet) VerifyException(com.google.common.base.VerifyException) Slice(io.airlift.slice.Slice) ArrayList(java.util.ArrayList) Range(com.facebook.presto.common.predicate.Range)
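
As a concrete reading of the BIGINT branch above: a single row group whose statistics are min = 3 and max = 42, with no nulls, produces the closed range [3, 42]. A hedged sketch of the equivalent value, using the Presto predicate API already imported above (the numbers are illustrative):

// What the BIGINT/TINYINT/SMALLINT/INTEGER branch builds for one non-overflowing
// (min, max) pair: a single inclusive range, with hasNullValue = false.
Domain expected = Domain.create(ValueSet.ofRanges(Range.range(BIGINT, 3L, true, 42L, true)), false);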

Aggregations

ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 88 usages
MessageType (org.apache.parquet.schema.MessageType): 33 usages
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 18 usages
Test (org.testng.annotations.Test): 18 usages
RichColumnDescriptor (com.facebook.presto.parquet.RichColumnDescriptor): 16 usages
ArrayList (java.util.ArrayList): 16 usages
GroupType (org.apache.parquet.schema.GroupType): 14 usages
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 12 usages
Test (org.junit.Test): 12 usages
Domain (com.facebook.presto.common.predicate.Domain): 11 usages
TupleDomain (com.facebook.presto.common.predicate.TupleDomain): 11 usages
Path (org.apache.hadoop.fs.Path): 11 usages
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 11 usages
List (java.util.List): 10 usages
ImmutableList (com.google.common.collect.ImmutableList): 9 usages
HashMap (java.util.HashMap): 9 usages
Configuration (org.apache.hadoop.conf.Configuration): 9 usages
Type (org.apache.parquet.schema.Type): 9 usages
HiveColumnHandle (com.facebook.presto.hive.HiveColumnHandle): 8 usages
IOException (java.io.IOException): 7 usages