Search in sources :

Example 1 with TrinoColumnIndexStore

use of io.trino.parquet.reader.TrinoColumnIndexStore in project trino by trinodb.

the class ParquetPageSourceFactory method getColumnIndexStore.

private static Optional<ColumnIndexStore> getColumnIndexStore(ParquetDataSource dataSource, BlockMetaData blockMetadata, Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<ColumnDescriptor> parquetTupleDomain, ParquetReaderOptions options) {
    if (!options.isUseColumnIndex() || parquetTupleDomain.isAll() || parquetTupleDomain.isNone()) {
        return Optional.empty();
    }
    boolean hasColumnIndex = false;
    for (ColumnChunkMetaData column : blockMetadata.getColumns()) {
        if (column.getColumnIndexReference() != null && column.getOffsetIndexReference() != null) {
            hasColumnIndex = true;
            break;
        }
    }
    if (!hasColumnIndex) {
        return Optional.empty();
    }
    Set<ColumnPath> columnsReadPaths = new HashSet<>(descriptorsByPath.size());
    for (List<String> path : descriptorsByPath.keySet()) {
        columnsReadPaths.add(ColumnPath.get(path.toArray(new String[0])));
    }
    Map<ColumnDescriptor, Domain> parquetDomains = parquetTupleDomain.getDomains().orElseThrow(() -> new IllegalStateException("Predicate other than none should have domains"));
    Set<ColumnPath> columnsFilteredPaths = parquetDomains.keySet().stream().map(column -> ColumnPath.get(column.getPath())).collect(toImmutableSet());
    return Optional.of(new TrinoColumnIndexStore(dataSource, blockMetadata, columnsReadPaths, columnsFilteredPaths));
}
Also used : DateTimeZone(org.joda.time.DateTimeZone) HIVE_MISSING_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_MISSING_DATA) FileSystem(org.apache.hadoop.fs.FileSystem) HIVE_CANNOT_OPEN_SPLIT(io.trino.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) MetadataReader(io.trino.parquet.reader.MetadataReader) HiveSessionProperties.isParquetUseColumnIndex(io.trino.plugin.hive.HiveSessionProperties.isParquetUseColumnIndex) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) HiveSessionProperties.isUseParquetColumnNames(io.trino.plugin.hive.HiveSessionProperties.isUseParquetColumnNames) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) HiveParquetColumnIOConverter.constructField(io.trino.plugin.hive.parquet.HiveParquetColumnIOConverter.constructField) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HiveUtil.getDeserializerClassName(io.trino.plugin.hive.util.HiveUtil.getDeserializerClassName) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) ImmutableSet(com.google.common.collect.ImmutableSet) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ConnectorIdentity(io.trino.spi.security.ConnectorIdentity) GroupType(org.apache.parquet.schema.GroupType) ImmutableMap(com.google.common.collect.ImmutableMap) Domain(io.trino.spi.predicate.Domain) ParquetReader(io.trino.parquet.reader.ParquetReader) ReaderColumns(io.trino.plugin.hive.ReaderColumns) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) ParquetTypeUtils.getColumnIO(io.trino.parquet.ParquetTypeUtils.getColumnIO) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) FileNotFoundException(java.io.FileNotFoundException) String.format(java.lang.String.format) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) MessageType(org.apache.parquet.schema.MessageType) List(java.util.List) HiveSessionProperties.getParquetMaxReadBlockSize(io.trino.plugin.hive.HiveSessionProperties.getParquetMaxReadBlockSize) BIGINT(io.trino.spi.type.BigintType.BIGINT) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) Entry(java.util.Map.Entry) Optional(java.util.Optional) HivePageSourceFactory(io.trino.plugin.hive.HivePageSourceFactory) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Strings.nullToEmpty(com.google.common.base.Strings.nullToEmpty) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) Type(io.trino.spi.type.Type) OptionalInt(java.util.OptionalInt) HiveSessionProperties.isParquetIgnoreStatistics(io.trino.plugin.hive.HiveSessionProperties.isParquetIgnoreStatistics) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) Inject(javax.inject.Inject) HashSet(java.util.HashSet) HiveType(io.trino.plugin.hive.HiveType) ParquetTypeUtils.lookupColumnByName(io.trino.parquet.ParquetTypeUtils.lookupColumnByName) ImmutableList(com.google.common.collect.ImmutableList) Objects.requireNonNull(java.util.Objects.requireNonNull) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ParquetTypeUtils.getParquetTypeByName(io.trino.parquet.ParquetTypeUtils.getParquetTypeByName) ParquetReaderOptions(io.trino.parquet.ParquetReaderOptions) Predicate(io.trino.parquet.predicate.Predicate) HIVE_BAD_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA) HivePageSourceProvider.projectSufficientColumns(io.trino.plugin.hive.HivePageSourceProvider.projectSufficientColumns) Properties(java.util.Properties) TrinoColumnIndexStore(io.trino.parquet.reader.TrinoColumnIndexStore) PredicateUtils.predicateMatches(io.trino.parquet.predicate.PredicateUtils.predicateMatches) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) TupleDomain(io.trino.spi.predicate.TupleDomain) PRIMITIVE(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category.PRIMITIVE) AcidInfo(io.trino.plugin.hive.AcidInfo) ParquetTypeUtils.getDescriptors(io.trino.parquet.ParquetTypeUtils.getDescriptors) HivePageSourceProvider.projectBaseColumns(io.trino.plugin.hive.HivePageSourceProvider.projectBaseColumns) Field(io.trino.parquet.Field) ParquetDataSource(io.trino.parquet.ParquetDataSource) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) PredicateUtils.buildPredicate(io.trino.parquet.predicate.PredicateUtils.buildPredicate) HiveConfig(io.trino.plugin.hive.HiveConfig) REGULAR(io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) TrinoColumnIndexStore(io.trino.parquet.reader.TrinoColumnIndexStore) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Domain(io.trino.spi.predicate.Domain) TupleDomain(io.trino.spi.predicate.TupleDomain) HashSet(java.util.HashSet)

Aggregations

Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)1 Strings.nullToEmpty (com.google.common.base.Strings.nullToEmpty)1 ImmutableList (com.google.common.collect.ImmutableList)1 ImmutableMap (com.google.common.collect.ImmutableMap)1 ImmutableSet (com.google.common.collect.ImmutableSet)1 ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet)1 AggregatedMemoryContext.newSimpleAggregatedMemoryContext (io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext)1 Field (io.trino.parquet.Field)1 ParquetCorruptionException (io.trino.parquet.ParquetCorruptionException)1 ParquetDataSource (io.trino.parquet.ParquetDataSource)1 ParquetDataSourceId (io.trino.parquet.ParquetDataSourceId)1 ParquetReaderOptions (io.trino.parquet.ParquetReaderOptions)1 ParquetTypeUtils.getColumnIO (io.trino.parquet.ParquetTypeUtils.getColumnIO)1 ParquetTypeUtils.getDescriptors (io.trino.parquet.ParquetTypeUtils.getDescriptors)1 ParquetTypeUtils.getParquetTypeByName (io.trino.parquet.ParquetTypeUtils.getParquetTypeByName)1 ParquetTypeUtils.lookupColumnByName (io.trino.parquet.ParquetTypeUtils.lookupColumnByName)1 RichColumnDescriptor (io.trino.parquet.RichColumnDescriptor)1 Predicate (io.trino.parquet.predicate.Predicate)1 PredicateUtils.buildPredicate (io.trino.parquet.predicate.PredicateUtils.buildPredicate)1 PredicateUtils.predicateMatches (io.trino.parquet.predicate.PredicateUtils.predicateMatches)1