Search in sources :

Example 1 with Predicate

use of io.trino.parquet.predicate.Predicate in project trino by trinodb.

the class ParquetPageSourceFactory method createPageSource.

/**
 * This method is available for other callers to use directly.
 */
public static ReaderPageSource createPageSource(Path path, long start, long length, long estimatedFileSize, List<HiveColumnHandle> columns, TupleDomain<HiveColumnHandle> effectivePredicate, boolean useColumnNames, HdfsEnvironment hdfsEnvironment, Configuration configuration, ConnectorIdentity identity, DateTimeZone timeZone, FileFormatDataSourceStats stats, ParquetReaderOptions options) {
    // Ignore predicates on partial columns for now.
    effectivePredicate = effectivePredicate.filter((column, domain) -> column.isBaseColumn());
    MessageType fileSchema;
    MessageType requestedSchema;
    MessageColumnIO messageColumn;
    ParquetReader parquetReader;
    ParquetDataSource dataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        dataSource = new HdfsParquetDataSource(new ParquetDataSourceId(path.toString()), estimatedFileSize, inputStream, stats, options);
        ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource);
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        fileSchema = fileMetaData.getSchema();
        Optional<MessageType> message = projectSufficientColumns(columns).map(projection -> projection.get().stream().map(HiveColumnHandle.class::cast).collect(toUnmodifiableList())).orElse(columns).stream().filter(column -> column.getColumnType() == REGULAR).map(column -> getColumnType(column, fileSchema, useColumnNames)).filter(Optional::isPresent).map(Optional::get).map(type -> new MessageType(fileSchema.getName(), type)).reduce(MessageType::union);
        requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));
        messageColumn = getColumnIO(fileSchema, requestedSchema);
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = options.isIgnoreStatistics() ? TupleDomain.all() : getParquetTupleDomain(descriptorsByPath, effectivePredicate, fileSchema, useColumnNames);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath, timeZone);
        long nextStart = 0;
        ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
        ImmutableList.Builder<Long> blockStarts = ImmutableList.builder();
        ImmutableList.Builder<Optional<ColumnIndexStore>> columnIndexes = ImmutableList.builder();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            Optional<ColumnIndexStore> columnIndex = getColumnIndexStore(dataSource, block, descriptorsByPath, parquetTupleDomain, options);
            if (start <= firstDataPage && firstDataPage < start + length && predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain, columnIndex)) {
                blocks.add(block);
                blockStarts.add(nextStart);
                columnIndexes.add(columnIndex);
            }
            nextStart += block.getRowCount();
        }
        parquetReader = new ParquetReader(Optional.ofNullable(fileMetaData.getCreatedBy()), messageColumn, blocks.build(), Optional.of(blockStarts.build()), dataSource, timeZone, newSimpleAggregatedMemoryContext(), options, parquetPredicate, columnIndexes.build());
    } catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        } catch (IOException ignored) {
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        if (e instanceof ParquetCorruptionException) {
            throw new TrinoException(HIVE_BAD_DATA, e);
        }
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof BlockMissingException) {
            throw new TrinoException(HIVE_MISSING_DATA, message, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
    Optional<ReaderColumns> readerProjections = projectBaseColumns(columns);
    List<HiveColumnHandle> baseColumns = readerProjections.map(projection -> projection.get().stream().map(HiveColumnHandle.class::cast).collect(toUnmodifiableList())).orElse(columns);
    for (HiveColumnHandle column : baseColumns) {
        checkArgument(column == PARQUET_ROW_INDEX_COLUMN || column.getColumnType() == REGULAR, "column type must be REGULAR: %s", column);
    }
    ImmutableList.Builder<Type> trinoTypes = ImmutableList.builder();
    ImmutableList.Builder<Optional<Field>> internalFields = ImmutableList.builder();
    ImmutableList.Builder<Boolean> rowIndexColumns = ImmutableList.builder();
    for (HiveColumnHandle column : baseColumns) {
        trinoTypes.add(column.getBaseType());
        rowIndexColumns.add(column == PARQUET_ROW_INDEX_COLUMN);
        if (column == PARQUET_ROW_INDEX_COLUMN) {
            internalFields.add(Optional.empty());
        } else {
            internalFields.add(Optional.ofNullable(getParquetType(column, fileSchema, useColumnNames)).flatMap(field -> {
                String columnName = useColumnNames ? column.getBaseColumnName() : fileSchema.getFields().get(column.getBaseHiveColumnIndex()).getName();
                return constructField(column.getBaseType(), lookupColumnByName(messageColumn, columnName));
            }));
        }
    }
    ConnectorPageSource parquetPageSource = new ParquetPageSource(parquetReader, trinoTypes.build(), rowIndexColumns.build(), internalFields.build());
    return new ReaderPageSource(parquetPageSource, readerProjections);
}
Also used : DateTimeZone(org.joda.time.DateTimeZone) HIVE_MISSING_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_MISSING_DATA) FileSystem(org.apache.hadoop.fs.FileSystem) HIVE_CANNOT_OPEN_SPLIT(io.trino.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) MetadataReader(io.trino.parquet.reader.MetadataReader) HiveSessionProperties.isParquetUseColumnIndex(io.trino.plugin.hive.HiveSessionProperties.isParquetUseColumnIndex) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) HiveSessionProperties.isUseParquetColumnNames(io.trino.plugin.hive.HiveSessionProperties.isUseParquetColumnNames) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) HiveParquetColumnIOConverter.constructField(io.trino.plugin.hive.parquet.HiveParquetColumnIOConverter.constructField) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HiveUtil.getDeserializerClassName(io.trino.plugin.hive.util.HiveUtil.getDeserializerClassName) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) ImmutableSet(com.google.common.collect.ImmutableSet) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ConnectorIdentity(io.trino.spi.security.ConnectorIdentity) GroupType(org.apache.parquet.schema.GroupType) ImmutableMap(com.google.common.collect.ImmutableMap) Domain(io.trino.spi.predicate.Domain) ParquetReader(io.trino.parquet.reader.ParquetReader) ReaderColumns(io.trino.plugin.hive.ReaderColumns) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) ParquetTypeUtils.getColumnIO(io.trino.parquet.ParquetTypeUtils.getColumnIO) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) FileNotFoundException(java.io.FileNotFoundException) String.format(java.lang.String.format) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) MessageType(org.apache.parquet.schema.MessageType) List(java.util.List) HiveSessionProperties.getParquetMaxReadBlockSize(io.trino.plugin.hive.HiveSessionProperties.getParquetMaxReadBlockSize) BIGINT(io.trino.spi.type.BigintType.BIGINT) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) Entry(java.util.Map.Entry) Optional(java.util.Optional) HivePageSourceFactory(io.trino.plugin.hive.HivePageSourceFactory) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Strings.nullToEmpty(com.google.common.base.Strings.nullToEmpty) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) Type(io.trino.spi.type.Type) OptionalInt(java.util.OptionalInt) HiveSessionProperties.isParquetIgnoreStatistics(io.trino.plugin.hive.HiveSessionProperties.isParquetIgnoreStatistics) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) Inject(javax.inject.Inject) HashSet(java.util.HashSet) HiveType(io.trino.plugin.hive.HiveType) ParquetTypeUtils.lookupColumnByName(io.trino.parquet.ParquetTypeUtils.lookupColumnByName) ImmutableList(com.google.common.collect.ImmutableList) Objects.requireNonNull(java.util.Objects.requireNonNull) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ParquetTypeUtils.getParquetTypeByName(io.trino.parquet.ParquetTypeUtils.getParquetTypeByName) ParquetReaderOptions(io.trino.parquet.ParquetReaderOptions) Predicate(io.trino.parquet.predicate.Predicate) HIVE_BAD_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA) HivePageSourceProvider.projectSufficientColumns(io.trino.plugin.hive.HivePageSourceProvider.projectSufficientColumns) Properties(java.util.Properties) TrinoColumnIndexStore(io.trino.parquet.reader.TrinoColumnIndexStore) PredicateUtils.predicateMatches(io.trino.parquet.predicate.PredicateUtils.predicateMatches) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) TupleDomain(io.trino.spi.predicate.TupleDomain) PRIMITIVE(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category.PRIMITIVE) AcidInfo(io.trino.plugin.hive.AcidInfo) ParquetTypeUtils.getDescriptors(io.trino.parquet.ParquetTypeUtils.getDescriptors) HivePageSourceProvider.projectBaseColumns(io.trino.plugin.hive.HivePageSourceProvider.projectBaseColumns) Field(io.trino.parquet.Field) ParquetDataSource(io.trino.parquet.ParquetDataSource) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) PredicateUtils.buildPredicate(io.trino.parquet.predicate.PredicateUtils.buildPredicate) HiveConfig(io.trino.plugin.hive.HiveConfig) REGULAR(io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ImmutableList(com.google.common.collect.ImmutableList) FileNotFoundException(java.io.FileNotFoundException) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) Predicate(io.trino.parquet.predicate.Predicate) PredicateUtils.buildPredicate(io.trino.parquet.predicate.PredicateUtils.buildPredicate) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) FileSystem(org.apache.hadoop.fs.FileSystem) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) TrinoColumnIndexStore(io.trino.parquet.reader.TrinoColumnIndexStore) List(java.util.List) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) ImmutableList(com.google.common.collect.ImmutableList) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) MessageType(org.apache.parquet.schema.MessageType) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) ParquetDataSource(io.trino.parquet.ParquetDataSource) Optional(java.util.Optional) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ParquetReader(io.trino.parquet.reader.ParquetReader) IOException(java.io.IOException) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) TrinoException(io.trino.spi.TrinoException) FileNotFoundException(java.io.FileNotFoundException) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) IOException(java.io.IOException) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(io.trino.spi.type.Type) HiveType(io.trino.plugin.hive.HiveType) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) TrinoException(io.trino.spi.TrinoException) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) ReaderColumns(io.trino.plugin.hive.ReaderColumns)

Example 2 with Predicate

use of io.trino.parquet.predicate.Predicate in project trino by trinodb.

the class IcebergPageSourceProvider method createParquetPageSource.

private static ReaderPageSource createParquetPageSource(HdfsEnvironment hdfsEnvironment, ConnectorIdentity identity, Configuration configuration, Path path, long start, long length, long fileSize, List<IcebergColumnHandle> regularColumns, ParquetReaderOptions options, TupleDomain<IcebergColumnHandle> effectivePredicate, FileFormatDataSourceStats fileFormatDataSourceStats, Optional<NameMapping> nameMapping) {
    AggregatedMemoryContext memoryContext = newSimpleAggregatedMemoryContext();
    ParquetDataSource dataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        dataSource = new HdfsParquetDataSource(new ParquetDataSourceId(path.toString()), fileSize, inputStream, fileFormatDataSourceStats, options);
        // extra variable required for lambda below
        ParquetDataSource theDataSource = dataSource;
        ParquetMetadata parquetMetadata = hdfsEnvironment.doAs(identity, () -> MetadataReader.readFooter(theDataSource));
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        if (nameMapping.isPresent() && !ParquetSchemaUtil.hasIds(fileSchema)) {
            // NameMapping conversion is necessary because MetadataReader converts all column names to lowercase and NameMapping is case sensitive
            fileSchema = ParquetSchemaUtil.applyNameMapping(fileSchema, convertToLowercase(nameMapping.get()));
        }
        // Mapping from Iceberg field ID to Parquet fields.
        Map<Integer, org.apache.parquet.schema.Type> parquetIdToField = fileSchema.getFields().stream().filter(field -> field.getId() != null).collect(toImmutableMap(field -> field.getId().intValue(), Function.identity()));
        Optional<ReaderColumns> columnProjections = projectColumns(regularColumns);
        List<IcebergColumnHandle> readColumns = columnProjections.map(readerColumns -> (List<IcebergColumnHandle>) readerColumns.get().stream().map(IcebergColumnHandle.class::cast).collect(toImmutableList())).orElse(regularColumns);
        List<org.apache.parquet.schema.Type> parquetFields = readColumns.stream().map(column -> parquetIdToField.get(column.getId())).collect(toList());
        MessageType requestedSchema = new MessageType(fileSchema.getName(), parquetFields.stream().filter(Objects::nonNull).collect(toImmutableList()));
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath, UTC);
        List<BlockMetaData> blocks = new ArrayList<>();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (start <= firstDataPage && firstDataPage < start + length && predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain)) {
                blocks.add(block);
            }
        }
        MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
        ParquetReader parquetReader = new ParquetReader(Optional.ofNullable(fileMetaData.getCreatedBy()), messageColumnIO, blocks, Optional.empty(), dataSource, UTC, memoryContext, options);
        ImmutableList.Builder<Type> trinoTypes = ImmutableList.builder();
        ImmutableList.Builder<Optional<Field>> internalFields = ImmutableList.builder();
        for (int columnIndex = 0; columnIndex < readColumns.size(); columnIndex++) {
            IcebergColumnHandle column = readColumns.get(columnIndex);
            org.apache.parquet.schema.Type parquetField = parquetFields.get(columnIndex);
            Type trinoType = column.getBaseType();
            trinoTypes.add(trinoType);
            if (parquetField == null) {
                internalFields.add(Optional.empty());
            } else {
                // The top level columns are already mapped by name/id appropriately.
                ColumnIO columnIO = messageColumnIO.getChild(parquetField.getName());
                internalFields.add(IcebergParquetColumnIOConverter.constructField(new FieldContext(trinoType, column.getColumnIdentity()), columnIO));
            }
        }
        return new ReaderPageSource(new ParquetPageSource(parquetReader, trinoTypes.build(), internalFields.build()), columnProjections);
    } catch (IOException | RuntimeException e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        } catch (IOException ignored) {
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof ParquetCorruptionException) {
            throw new TrinoException(ICEBERG_BAD_DATA, message, e);
        }
        if (e instanceof BlockMissingException) {
            throw new TrinoException(ICEBERG_MISSING_DATA, message, e);
        }
        throw new TrinoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) Maps.uniqueIndex(com.google.common.collect.Maps.uniqueIndex) ORC_ICEBERG_ID_KEY(io.trino.plugin.iceberg.TypeConverter.ORC_ICEBERG_ID_KEY) ColumnAdaptation(io.trino.plugin.hive.orc.OrcPageSource.ColumnAdaptation) FileStatus(org.apache.hadoop.fs.FileStatus) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) NOT_SUPPORTED(io.trino.spi.StandardErrorCode.NOT_SUPPORTED) ConnectorTableHandle(io.trino.spi.connector.ConnectorTableHandle) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) ICEBERG_CANNOT_OPEN_SPLIT(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CANNOT_OPEN_SPLIT) UUID(io.trino.spi.type.UuidType.UUID) ENGLISH(java.util.Locale.ENGLISH) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ConnectorIdentity(io.trino.spi.security.ConnectorIdentity) ICEBERG_FILESYSTEM_ERROR(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_FILESYSTEM_ERROR) Domain(io.trino.spi.predicate.Domain) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ReaderColumns(io.trino.plugin.hive.ReaderColumns) Set(java.util.Set) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) OrcReaderConfig(io.trino.plugin.hive.orc.OrcReaderConfig) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnIO(org.apache.parquet.io.ColumnIO) IcebergSessionProperties.getOrcTinyStripeThreshold(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcTinyStripeThreshold) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) MappedField(org.apache.iceberg.mapping.MappedField) Collectors.groupingBy(java.util.stream.Collectors.groupingBy) IcebergSessionProperties.isOrcNestedLazy(io.trino.plugin.iceberg.IcebergSessionProperties.isOrcNestedLazy) IcebergSessionProperties.getOrcMaxBufferSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxBufferSize) HdfsParquetDataSource(io.trino.plugin.hive.parquet.HdfsParquetDataSource) ArrayList(java.util.ArrayList) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) TupleDomainOrcPredicateBuilder(io.trino.orc.TupleDomainOrcPredicate.TupleDomainOrcPredicateBuilder) OrcPageSource(io.trino.plugin.hive.orc.OrcPageSource) IcebergSessionProperties.getOrcMaxMergeDistance(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxMergeDistance) ICEBERG_MISSING_DATA(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_MISSING_DATA) AggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext) ColumnHandle(io.trino.spi.connector.ColumnHandle) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) VARBINARY(io.trino.spi.type.VarbinaryType.VARBINARY) MappedFields(org.apache.iceberg.mapping.MappedFields) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) OrcType(io.trino.orc.metadata.OrcType) Predicate(io.trino.parquet.predicate.Predicate) IcebergSessionProperties.isUseFileSizeFromMetadata(io.trino.plugin.iceberg.IcebergSessionProperties.isUseFileSizeFromMetadata) MapType(io.trino.spi.type.MapType) PredicateUtils.predicateMatches(io.trino.parquet.predicate.PredicateUtils.predicateMatches) ConnectorSplit(io.trino.spi.connector.ConnectorSplit) StandardTypes(io.trino.spi.type.StandardTypes) NameMappingParser(org.apache.iceberg.mapping.NameMappingParser) IcebergSessionProperties.getOrcLazyReadSmallRanges(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcLazyReadSmallRanges) IcebergSessionProperties.getParquetMaxReadBlockSize(io.trino.plugin.iceberg.IcebergSessionProperties.getParquetMaxReadBlockSize) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) UTC(org.joda.time.DateTimeZone.UTC) Field(io.trino.parquet.Field) Traverser(com.google.common.graph.Traverser) ParquetPageSource(io.trino.plugin.hive.parquet.ParquetPageSource) ProjectedLayout(io.trino.orc.OrcReader.ProjectedLayout) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) IcebergSessionProperties.getOrcStreamBufferSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcStreamBufferSize) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ParquetSchemaUtil(org.apache.iceberg.parquet.ParquetSchemaUtil) OrcColumn(io.trino.orc.OrcColumn) PredicateUtils.buildPredicate(io.trino.parquet.predicate.PredicateUtils.buildPredicate) MetadataReader(io.trino.parquet.reader.MetadataReader) ICEBERG_DOMAIN_COMPACTION_THRESHOLD(io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD) OrcRecordReader(io.trino.orc.OrcRecordReader) NameMapping(org.apache.iceberg.mapping.NameMapping) Path(org.apache.hadoop.fs.Path) OrcDataSource(io.trino.orc.OrcDataSource) ReaderProjectionsAdapter(io.trino.plugin.hive.ReaderProjectionsAdapter) RowType(io.trino.spi.type.RowType) ICEBERG_BAD_DATA(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_BAD_DATA) ImmutableMap(com.google.common.collect.ImmutableMap) INITIAL_BATCH_SIZE(io.trino.orc.OrcReader.INITIAL_BATCH_SIZE) ParquetReader(io.trino.parquet.reader.ParquetReader) FieldContext(io.trino.plugin.iceberg.IcebergParquetColumnIOConverter.FieldContext) ICEBERG_CURSOR_ERROR(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CURSOR_ERROR) TrinoException(io.trino.spi.TrinoException) ArrayType(io.trino.spi.type.ArrayType) ParquetTypeUtils.getColumnIO(io.trino.parquet.ParquetTypeUtils.getColumnIO) Collectors(java.util.stream.Collectors) String.format(java.lang.String.format) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) OrcDataSourceId(io.trino.orc.OrcDataSourceId) MessageType(org.apache.parquet.schema.MessageType) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) List(java.util.List) DynamicFilter(io.trino.spi.connector.DynamicFilter) Optional(java.util.Optional) IcebergSessionProperties.getOrcMaxReadBlockSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxReadBlockSize) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) Type(io.trino.spi.type.Type) HashMap(java.util.HashMap) TupleDomainOrcPredicate(io.trino.orc.TupleDomainOrcPredicate) Function(java.util.function.Function) Inject(javax.inject.Inject) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) OrcReaderOptions(io.trino.orc.OrcReaderOptions) Objects.requireNonNull(java.util.Objects.requireNonNull) Collectors.mapping(java.util.stream.Collectors.mapping) IcebergSessionProperties.isOrcBloomFiltersEnabled(io.trino.plugin.iceberg.IcebergSessionProperties.isOrcBloomFiltersEnabled) HdfsOrcDataSource(io.trino.plugin.hive.orc.HdfsOrcDataSource) ParquetReaderOptions(io.trino.parquet.ParquetReaderOptions) OrcReader(io.trino.orc.OrcReader) ConnectorPageSourceProvider(io.trino.spi.connector.ConnectorPageSourceProvider) ICEBERG_BINARY_TYPE(io.trino.plugin.iceberg.TypeConverter.ICEBERG_BINARY_TYPE) TupleDomain(io.trino.spi.predicate.TupleDomain) OrcReader.fullyProjectedLayout(io.trino.orc.OrcReader.fullyProjectedLayout) OrcCorruptionException(io.trino.orc.OrcCorruptionException) Collectors.toList(java.util.stream.Collectors.toList) ParquetTypeUtils.getDescriptors(io.trino.parquet.ParquetTypeUtils.getDescriptors) ParquetDataSource(io.trino.parquet.ParquetDataSource) TypeManager(io.trino.spi.type.TypeManager) ConnectorTransactionHandle(io.trino.spi.connector.ConnectorTransactionHandle) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ArrayList(java.util.ArrayList) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) Predicate(io.trino.parquet.predicate.Predicate) PredicateUtils.buildPredicate(io.trino.parquet.predicate.PredicateUtils.buildPredicate) TupleDomainOrcPredicate(io.trino.orc.TupleDomainOrcPredicate) FileSystem(org.apache.hadoop.fs.FileSystem) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ArrayList(java.util.ArrayList) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) Collectors.toList(java.util.stream.Collectors.toList) MessageType(org.apache.parquet.schema.MessageType) Optional(java.util.Optional) ParquetReader(io.trino.parquet.reader.ParquetReader) ParquetPageSource(io.trino.plugin.hive.parquet.ParquetPageSource) ColumnIO(org.apache.parquet.io.ColumnIO) ParquetTypeUtils.getColumnIO(io.trino.parquet.ParquetTypeUtils.getColumnIO) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) Objects(java.util.Objects) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) HdfsParquetDataSource(io.trino.plugin.hive.parquet.HdfsParquetDataSource) ParquetDataSource(io.trino.parquet.ParquetDataSource) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) FieldContext(io.trino.plugin.iceberg.IcebergParquetColumnIOConverter.FieldContext) IOException(java.io.IOException) AggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) HdfsParquetDataSource(io.trino.plugin.hive.parquet.HdfsParquetDataSource) OrcType(io.trino.orc.metadata.OrcType) MapType(io.trino.spi.type.MapType) RowType(io.trino.spi.type.RowType) ArrayType(io.trino.spi.type.ArrayType) MessageType(org.apache.parquet.schema.MessageType) Type(io.trino.spi.type.Type) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) TrinoException(io.trino.spi.TrinoException) ReaderColumns(io.trino.plugin.hive.ReaderColumns)

Example 3 with Predicate

use of io.trino.parquet.predicate.Predicate in project trino by trinodb.

the class ParquetPageSourceFactory method getColumnIndexStore.

private static Optional<ColumnIndexStore> getColumnIndexStore(ParquetDataSource dataSource, BlockMetaData blockMetadata, Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<ColumnDescriptor> parquetTupleDomain, ParquetReaderOptions options) {
    if (!options.isUseColumnIndex() || parquetTupleDomain.isAll() || parquetTupleDomain.isNone()) {
        return Optional.empty();
    }
    boolean hasColumnIndex = false;
    for (ColumnChunkMetaData column : blockMetadata.getColumns()) {
        if (column.getColumnIndexReference() != null && column.getOffsetIndexReference() != null) {
            hasColumnIndex = true;
            break;
        }
    }
    if (!hasColumnIndex) {
        return Optional.empty();
    }
    Set<ColumnPath> columnsReadPaths = new HashSet<>(descriptorsByPath.size());
    for (List<String> path : descriptorsByPath.keySet()) {
        columnsReadPaths.add(ColumnPath.get(path.toArray(new String[0])));
    }
    Map<ColumnDescriptor, Domain> parquetDomains = parquetTupleDomain.getDomains().orElseThrow(() -> new IllegalStateException("Predicate other than none should have domains"));
    Set<ColumnPath> columnsFilteredPaths = parquetDomains.keySet().stream().map(column -> ColumnPath.get(column.getPath())).collect(toImmutableSet());
    return Optional.of(new TrinoColumnIndexStore(dataSource, blockMetadata, columnsReadPaths, columnsFilteredPaths));
}
Also used : DateTimeZone(org.joda.time.DateTimeZone) HIVE_MISSING_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_MISSING_DATA) FileSystem(org.apache.hadoop.fs.FileSystem) HIVE_CANNOT_OPEN_SPLIT(io.trino.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) MetadataReader(io.trino.parquet.reader.MetadataReader) HiveSessionProperties.isParquetUseColumnIndex(io.trino.plugin.hive.HiveSessionProperties.isParquetUseColumnIndex) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) HiveSessionProperties.isUseParquetColumnNames(io.trino.plugin.hive.HiveSessionProperties.isUseParquetColumnNames) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) HiveParquetColumnIOConverter.constructField(io.trino.plugin.hive.parquet.HiveParquetColumnIOConverter.constructField) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HiveUtil.getDeserializerClassName(io.trino.plugin.hive.util.HiveUtil.getDeserializerClassName) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) ImmutableSet(com.google.common.collect.ImmutableSet) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ConnectorIdentity(io.trino.spi.security.ConnectorIdentity) GroupType(org.apache.parquet.schema.GroupType) ImmutableMap(com.google.common.collect.ImmutableMap) Domain(io.trino.spi.predicate.Domain) ParquetReader(io.trino.parquet.reader.ParquetReader) ReaderColumns(io.trino.plugin.hive.ReaderColumns) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) ParquetTypeUtils.getColumnIO(io.trino.parquet.ParquetTypeUtils.getColumnIO) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) FileNotFoundException(java.io.FileNotFoundException) String.format(java.lang.String.format) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) MessageType(org.apache.parquet.schema.MessageType) List(java.util.List) HiveSessionProperties.getParquetMaxReadBlockSize(io.trino.plugin.hive.HiveSessionProperties.getParquetMaxReadBlockSize) BIGINT(io.trino.spi.type.BigintType.BIGINT) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) Entry(java.util.Map.Entry) Optional(java.util.Optional) HivePageSourceFactory(io.trino.plugin.hive.HivePageSourceFactory) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Strings.nullToEmpty(com.google.common.base.Strings.nullToEmpty) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) Type(io.trino.spi.type.Type) OptionalInt(java.util.OptionalInt) HiveSessionProperties.isParquetIgnoreStatistics(io.trino.plugin.hive.HiveSessionProperties.isParquetIgnoreStatistics) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) Inject(javax.inject.Inject) HashSet(java.util.HashSet) HiveType(io.trino.plugin.hive.HiveType) ParquetTypeUtils.lookupColumnByName(io.trino.parquet.ParquetTypeUtils.lookupColumnByName) ImmutableList(com.google.common.collect.ImmutableList) Objects.requireNonNull(java.util.Objects.requireNonNull) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ParquetTypeUtils.getParquetTypeByName(io.trino.parquet.ParquetTypeUtils.getParquetTypeByName) ParquetReaderOptions(io.trino.parquet.ParquetReaderOptions) Predicate(io.trino.parquet.predicate.Predicate) HIVE_BAD_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA) HivePageSourceProvider.projectSufficientColumns(io.trino.plugin.hive.HivePageSourceProvider.projectSufficientColumns) Properties(java.util.Properties) TrinoColumnIndexStore(io.trino.parquet.reader.TrinoColumnIndexStore) PredicateUtils.predicateMatches(io.trino.parquet.predicate.PredicateUtils.predicateMatches) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) TupleDomain(io.trino.spi.predicate.TupleDomain) PRIMITIVE(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category.PRIMITIVE) AcidInfo(io.trino.plugin.hive.AcidInfo) ParquetTypeUtils.getDescriptors(io.trino.parquet.ParquetTypeUtils.getDescriptors) HivePageSourceProvider.projectBaseColumns(io.trino.plugin.hive.HivePageSourceProvider.projectBaseColumns) Field(io.trino.parquet.Field) ParquetDataSource(io.trino.parquet.ParquetDataSource) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) PredicateUtils.buildPredicate(io.trino.parquet.predicate.PredicateUtils.buildPredicate) HiveConfig(io.trino.plugin.hive.HiveConfig) REGULAR(io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) TrinoColumnIndexStore(io.trino.parquet.reader.TrinoColumnIndexStore) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Domain(io.trino.spi.predicate.Domain) TupleDomain(io.trino.spi.predicate.TupleDomain) HashSet(java.util.HashSet)

Example 4 with Predicate

use of io.trino.parquet.predicate.Predicate in project trino by trinodb.

the class IcebergPageSourceProvider method createOrcPageSource.

private static ReaderPageSource createOrcPageSource(HdfsEnvironment hdfsEnvironment, ConnectorIdentity identity, Configuration configuration, Path path, long start, long length, long fileSize, List<IcebergColumnHandle> columns, TupleDomain<IcebergColumnHandle> effectivePredicate, OrcReaderOptions options, FileFormatDataSourceStats stats, TypeManager typeManager, Optional<NameMapping> nameMapping) {
    OrcDataSource orcDataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSize, options, inputStream, stats);
        OrcReader reader = OrcReader.createOrcReader(orcDataSource, options).orElseThrow(() -> new TrinoException(ICEBERG_BAD_DATA, "ORC file is zero length"));
        List<OrcColumn> fileColumns = reader.getRootColumn().getNestedColumns();
        if (nameMapping.isPresent() && !hasIds(reader.getRootColumn())) {
            fileColumns = fileColumns.stream().map(orcColumn -> setMissingFieldIds(orcColumn, nameMapping.get(), ImmutableList.of(orcColumn.getColumnName()))).collect(toImmutableList());
        }
        Map<Integer, OrcColumn> fileColumnsByIcebergId = mapIdsToOrcFileColumns(fileColumns);
        TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder().setBloomFiltersEnabled(options.isBloomFiltersEnabled());
        Map<IcebergColumnHandle, Domain> effectivePredicateDomains = effectivePredicate.getDomains().orElseThrow(() -> new IllegalArgumentException("Effective predicate is none"));
        Optional<ReaderColumns> columnProjections = projectColumns(columns);
        Map<Integer, List<List<Integer>>> projectionsByFieldId = columns.stream().collect(groupingBy(column -> column.getBaseColumnIdentity().getId(), mapping(IcebergColumnHandle::getPath, toUnmodifiableList())));
        List<IcebergColumnHandle> readColumns = columnProjections.map(readerColumns -> (List<IcebergColumnHandle>) readerColumns.get().stream().map(IcebergColumnHandle.class::cast).collect(toImmutableList())).orElse(columns);
        List<OrcColumn> fileReadColumns = new ArrayList<>(readColumns.size());
        List<Type> fileReadTypes = new ArrayList<>(readColumns.size());
        List<ProjectedLayout> projectedLayouts = new ArrayList<>(readColumns.size());
        List<ColumnAdaptation> columnAdaptations = new ArrayList<>(readColumns.size());
        for (IcebergColumnHandle column : readColumns) {
            verify(column.isBaseColumn(), "Column projections must be based from a root column");
            OrcColumn orcColumn = fileColumnsByIcebergId.get(column.getId());
            if (orcColumn != null) {
                Type readType = getOrcReadType(column.getType(), typeManager);
                if (column.getType() == UUID && !"UUID".equals(orcColumn.getAttributes().get(ICEBERG_BINARY_TYPE))) {
                    throw new TrinoException(ICEBERG_BAD_DATA, format("Expected ORC column for UUID data to be annotated with %s=UUID: %s", ICEBERG_BINARY_TYPE, orcColumn));
                }
                List<List<Integer>> fieldIdProjections = projectionsByFieldId.get(column.getId());
                ProjectedLayout projectedLayout = IcebergOrcProjectedLayout.createProjectedLayout(orcColumn, fieldIdProjections);
                int sourceIndex = fileReadColumns.size();
                columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex));
                fileReadColumns.add(orcColumn);
                fileReadTypes.add(readType);
                projectedLayouts.add(projectedLayout);
                for (Map.Entry<IcebergColumnHandle, Domain> domainEntry : effectivePredicateDomains.entrySet()) {
                    IcebergColumnHandle predicateColumn = domainEntry.getKey();
                    OrcColumn predicateOrcColumn = fileColumnsByIcebergId.get(predicateColumn.getId());
                    if (predicateOrcColumn != null && column.getColumnIdentity().equals(predicateColumn.getBaseColumnIdentity())) {
                        predicateBuilder.addColumn(predicateOrcColumn.getColumnId(), domainEntry.getValue());
                    }
                }
            } else {
                columnAdaptations.add(ColumnAdaptation.nullColumn(column.getType()));
            }
        }
        AggregatedMemoryContext memoryUsage = newSimpleAggregatedMemoryContext();
        OrcDataSourceId orcDataSourceId = orcDataSource.getId();
        OrcRecordReader recordReader = reader.createRecordReader(fileReadColumns, fileReadTypes, projectedLayouts, predicateBuilder.build(), start, length, UTC, memoryUsage, INITIAL_BATCH_SIZE, exception -> handleException(orcDataSourceId, exception), new IdBasedFieldMapperFactory(readColumns));
        return new ReaderPageSource(new OrcPageSource(recordReader, columnAdaptations, orcDataSource, Optional.empty(), Optional.empty(), memoryUsage, stats), columnProjections);
    } catch (Exception e) {
        if (orcDataSource != null) {
            try {
                orcDataSource.close();
            } catch (IOException ignored) {
            }
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof BlockMissingException) {
            throw new TrinoException(ICEBERG_MISSING_DATA, message, e);
        }
        throw new TrinoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) Maps.uniqueIndex(com.google.common.collect.Maps.uniqueIndex) ORC_ICEBERG_ID_KEY(io.trino.plugin.iceberg.TypeConverter.ORC_ICEBERG_ID_KEY) ColumnAdaptation(io.trino.plugin.hive.orc.OrcPageSource.ColumnAdaptation) FileStatus(org.apache.hadoop.fs.FileStatus) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) NOT_SUPPORTED(io.trino.spi.StandardErrorCode.NOT_SUPPORTED) ConnectorTableHandle(io.trino.spi.connector.ConnectorTableHandle) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) ICEBERG_CANNOT_OPEN_SPLIT(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CANNOT_OPEN_SPLIT) UUID(io.trino.spi.type.UuidType.UUID) ENGLISH(java.util.Locale.ENGLISH) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ConnectorIdentity(io.trino.spi.security.ConnectorIdentity) ICEBERG_FILESYSTEM_ERROR(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_FILESYSTEM_ERROR) Domain(io.trino.spi.predicate.Domain) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ReaderColumns(io.trino.plugin.hive.ReaderColumns) Set(java.util.Set) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) OrcReaderConfig(io.trino.plugin.hive.orc.OrcReaderConfig) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnIO(org.apache.parquet.io.ColumnIO) IcebergSessionProperties.getOrcTinyStripeThreshold(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcTinyStripeThreshold) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) MappedField(org.apache.iceberg.mapping.MappedField) Collectors.groupingBy(java.util.stream.Collectors.groupingBy) IcebergSessionProperties.isOrcNestedLazy(io.trino.plugin.iceberg.IcebergSessionProperties.isOrcNestedLazy) IcebergSessionProperties.getOrcMaxBufferSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxBufferSize) HdfsParquetDataSource(io.trino.plugin.hive.parquet.HdfsParquetDataSource) ArrayList(java.util.ArrayList) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) TupleDomainOrcPredicateBuilder(io.trino.orc.TupleDomainOrcPredicate.TupleDomainOrcPredicateBuilder) OrcPageSource(io.trino.plugin.hive.orc.OrcPageSource) IcebergSessionProperties.getOrcMaxMergeDistance(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxMergeDistance) ICEBERG_MISSING_DATA(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_MISSING_DATA) AggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext) ColumnHandle(io.trino.spi.connector.ColumnHandle) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) VARBINARY(io.trino.spi.type.VarbinaryType.VARBINARY) MappedFields(org.apache.iceberg.mapping.MappedFields) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) OrcType(io.trino.orc.metadata.OrcType) Predicate(io.trino.parquet.predicate.Predicate) IcebergSessionProperties.isUseFileSizeFromMetadata(io.trino.plugin.iceberg.IcebergSessionProperties.isUseFileSizeFromMetadata) MapType(io.trino.spi.type.MapType) PredicateUtils.predicateMatches(io.trino.parquet.predicate.PredicateUtils.predicateMatches) ConnectorSplit(io.trino.spi.connector.ConnectorSplit) StandardTypes(io.trino.spi.type.StandardTypes) NameMappingParser(org.apache.iceberg.mapping.NameMappingParser) IcebergSessionProperties.getOrcLazyReadSmallRanges(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcLazyReadSmallRanges) IcebergSessionProperties.getParquetMaxReadBlockSize(io.trino.plugin.iceberg.IcebergSessionProperties.getParquetMaxReadBlockSize) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) UTC(org.joda.time.DateTimeZone.UTC) Field(io.trino.parquet.Field) Traverser(com.google.common.graph.Traverser) ParquetPageSource(io.trino.plugin.hive.parquet.ParquetPageSource) ProjectedLayout(io.trino.orc.OrcReader.ProjectedLayout) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) IcebergSessionProperties.getOrcStreamBufferSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcStreamBufferSize) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ParquetSchemaUtil(org.apache.iceberg.parquet.ParquetSchemaUtil) OrcColumn(io.trino.orc.OrcColumn) PredicateUtils.buildPredicate(io.trino.parquet.predicate.PredicateUtils.buildPredicate) MetadataReader(io.trino.parquet.reader.MetadataReader) ICEBERG_DOMAIN_COMPACTION_THRESHOLD(io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD) OrcRecordReader(io.trino.orc.OrcRecordReader) NameMapping(org.apache.iceberg.mapping.NameMapping) Path(org.apache.hadoop.fs.Path) OrcDataSource(io.trino.orc.OrcDataSource) ReaderProjectionsAdapter(io.trino.plugin.hive.ReaderProjectionsAdapter) RowType(io.trino.spi.type.RowType) ICEBERG_BAD_DATA(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_BAD_DATA) ImmutableMap(com.google.common.collect.ImmutableMap) INITIAL_BATCH_SIZE(io.trino.orc.OrcReader.INITIAL_BATCH_SIZE) ParquetReader(io.trino.parquet.reader.ParquetReader) FieldContext(io.trino.plugin.iceberg.IcebergParquetColumnIOConverter.FieldContext) ICEBERG_CURSOR_ERROR(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CURSOR_ERROR) TrinoException(io.trino.spi.TrinoException) ArrayType(io.trino.spi.type.ArrayType) ParquetTypeUtils.getColumnIO(io.trino.parquet.ParquetTypeUtils.getColumnIO) Collectors(java.util.stream.Collectors) String.format(java.lang.String.format) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) OrcDataSourceId(io.trino.orc.OrcDataSourceId) MessageType(org.apache.parquet.schema.MessageType) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) List(java.util.List) DynamicFilter(io.trino.spi.connector.DynamicFilter) Optional(java.util.Optional) IcebergSessionProperties.getOrcMaxReadBlockSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxReadBlockSize) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) Type(io.trino.spi.type.Type) HashMap(java.util.HashMap) TupleDomainOrcPredicate(io.trino.orc.TupleDomainOrcPredicate) Function(java.util.function.Function) Inject(javax.inject.Inject) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) OrcReaderOptions(io.trino.orc.OrcReaderOptions) Objects.requireNonNull(java.util.Objects.requireNonNull) Collectors.mapping(java.util.stream.Collectors.mapping) IcebergSessionProperties.isOrcBloomFiltersEnabled(io.trino.plugin.iceberg.IcebergSessionProperties.isOrcBloomFiltersEnabled) HdfsOrcDataSource(io.trino.plugin.hive.orc.HdfsOrcDataSource) ParquetReaderOptions(io.trino.parquet.ParquetReaderOptions) OrcReader(io.trino.orc.OrcReader) ConnectorPageSourceProvider(io.trino.spi.connector.ConnectorPageSourceProvider) ICEBERG_BINARY_TYPE(io.trino.plugin.iceberg.TypeConverter.ICEBERG_BINARY_TYPE) TupleDomain(io.trino.spi.predicate.TupleDomain) OrcReader.fullyProjectedLayout(io.trino.orc.OrcReader.fullyProjectedLayout) OrcCorruptionException(io.trino.orc.OrcCorruptionException) Collectors.toList(java.util.stream.Collectors.toList) ParquetTypeUtils.getDescriptors(io.trino.parquet.ParquetTypeUtils.getDescriptors) ParquetDataSource(io.trino.parquet.ParquetDataSource) TypeManager(io.trino.spi.type.TypeManager) ConnectorTransactionHandle(io.trino.spi.connector.ConnectorTransactionHandle) ArrayList(java.util.ArrayList) HdfsOrcDataSource(io.trino.plugin.hive.orc.HdfsOrcDataSource) FileSystem(org.apache.hadoop.fs.FileSystem) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ArrayList(java.util.ArrayList) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) Collectors.toList(java.util.stream.Collectors.toList) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) OrcDataSource(io.trino.orc.OrcDataSource) HdfsOrcDataSource(io.trino.plugin.hive.orc.HdfsOrcDataSource) OrcDataSourceId(io.trino.orc.OrcDataSourceId) OrcColumn(io.trino.orc.OrcColumn) OrcPageSource(io.trino.plugin.hive.orc.OrcPageSource) IOException(java.io.IOException) OrcRecordReader(io.trino.orc.OrcRecordReader) AggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) IOException(java.io.IOException) TrinoException(io.trino.spi.TrinoException) OrcCorruptionException(io.trino.orc.OrcCorruptionException) OrcType(io.trino.orc.metadata.OrcType) MapType(io.trino.spi.type.MapType) RowType(io.trino.spi.type.RowType) ArrayType(io.trino.spi.type.ArrayType) MessageType(org.apache.parquet.schema.MessageType) Type(io.trino.spi.type.Type) OrcReader(io.trino.orc.OrcReader) TupleDomainOrcPredicateBuilder(io.trino.orc.TupleDomainOrcPredicate.TupleDomainOrcPredicateBuilder) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) TrinoException(io.trino.spi.TrinoException) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) ReaderColumns(io.trino.plugin.hive.ReaderColumns) Domain(io.trino.spi.predicate.Domain) TupleDomain(io.trino.spi.predicate.TupleDomain) Map(java.util.Map) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap) ProjectedLayout(io.trino.orc.OrcReader.ProjectedLayout) OrcReader.fullyProjectedLayout(io.trino.orc.OrcReader.fullyProjectedLayout) ColumnAdaptation(io.trino.plugin.hive.orc.OrcPageSource.ColumnAdaptation)

Aggregations

ImmutableList (com.google.common.collect.ImmutableList)4 ImmutableMap (com.google.common.collect.ImmutableMap)4 ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet)4 AggregatedMemoryContext.newSimpleAggregatedMemoryContext (io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext)4 Field (io.trino.parquet.Field)4 ParquetCorruptionException (io.trino.parquet.ParquetCorruptionException)4 ParquetDataSource (io.trino.parquet.ParquetDataSource)4 ParquetDataSourceId (io.trino.parquet.ParquetDataSourceId)4 ParquetReaderOptions (io.trino.parquet.ParquetReaderOptions)4 ParquetTypeUtils.getColumnIO (io.trino.parquet.ParquetTypeUtils.getColumnIO)4 ParquetTypeUtils.getDescriptors (io.trino.parquet.ParquetTypeUtils.getDescriptors)4 RichColumnDescriptor (io.trino.parquet.RichColumnDescriptor)4 Predicate (io.trino.parquet.predicate.Predicate)4 PredicateUtils.buildPredicate (io.trino.parquet.predicate.PredicateUtils.buildPredicate)4 PredicateUtils.predicateMatches (io.trino.parquet.predicate.PredicateUtils.predicateMatches)4 MetadataReader (io.trino.parquet.reader.MetadataReader)4 ParquetReader (io.trino.parquet.reader.ParquetReader)4 FileFormatDataSourceStats (io.trino.plugin.hive.FileFormatDataSourceStats)4 HdfsEnvironment (io.trino.plugin.hive.HdfsEnvironment)4 ReaderColumns (io.trino.plugin.hive.ReaderColumns)4