Search in sources :

Example 1 with ParquetReaderOptions

use of io.trino.parquet.ParquetReaderOptions in project trino by trinodb.

the class ParquetPageSourceFactory method createPageSource.

/**
 * This method is available for other callers to use directly.
 */
public static ReaderPageSource createPageSource(Path path, long start, long length, long estimatedFileSize, List<HiveColumnHandle> columns, TupleDomain<HiveColumnHandle> effectivePredicate, boolean useColumnNames, HdfsEnvironment hdfsEnvironment, Configuration configuration, ConnectorIdentity identity, DateTimeZone timeZone, FileFormatDataSourceStats stats, ParquetReaderOptions options) {
    // Ignore predicates on partial columns for now.
    effectivePredicate = effectivePredicate.filter((column, domain) -> column.isBaseColumn());
    MessageType fileSchema;
    MessageType requestedSchema;
    MessageColumnIO messageColumn;
    ParquetReader parquetReader;
    ParquetDataSource dataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        dataSource = new HdfsParquetDataSource(new ParquetDataSourceId(path.toString()), estimatedFileSize, inputStream, stats, options);
        ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource);
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        fileSchema = fileMetaData.getSchema();
        Optional<MessageType> message = projectSufficientColumns(columns).map(projection -> projection.get().stream().map(HiveColumnHandle.class::cast).collect(toUnmodifiableList())).orElse(columns).stream().filter(column -> column.getColumnType() == REGULAR).map(column -> getColumnType(column, fileSchema, useColumnNames)).filter(Optional::isPresent).map(Optional::get).map(type -> new MessageType(fileSchema.getName(), type)).reduce(MessageType::union);
        requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));
        messageColumn = getColumnIO(fileSchema, requestedSchema);
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = options.isIgnoreStatistics() ? TupleDomain.all() : getParquetTupleDomain(descriptorsByPath, effectivePredicate, fileSchema, useColumnNames);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath, timeZone);
        long nextStart = 0;
        ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
        ImmutableList.Builder<Long> blockStarts = ImmutableList.builder();
        ImmutableList.Builder<Optional<ColumnIndexStore>> columnIndexes = ImmutableList.builder();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            Optional<ColumnIndexStore> columnIndex = getColumnIndexStore(dataSource, block, descriptorsByPath, parquetTupleDomain, options);
            if (start <= firstDataPage && firstDataPage < start + length && predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain, columnIndex)) {
                blocks.add(block);
                blockStarts.add(nextStart);
                columnIndexes.add(columnIndex);
            }
            nextStart += block.getRowCount();
        }
        parquetReader = new ParquetReader(Optional.ofNullable(fileMetaData.getCreatedBy()), messageColumn, blocks.build(), Optional.of(blockStarts.build()), dataSource, timeZone, newSimpleAggregatedMemoryContext(), options, parquetPredicate, columnIndexes.build());
    } catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        } catch (IOException ignored) {
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        if (e instanceof ParquetCorruptionException) {
            throw new TrinoException(HIVE_BAD_DATA, e);
        }
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof BlockMissingException) {
            throw new TrinoException(HIVE_MISSING_DATA, message, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
    Optional<ReaderColumns> readerProjections = projectBaseColumns(columns);
    List<HiveColumnHandle> baseColumns = readerProjections.map(projection -> projection.get().stream().map(HiveColumnHandle.class::cast).collect(toUnmodifiableList())).orElse(columns);
    for (HiveColumnHandle column : baseColumns) {
        checkArgument(column == PARQUET_ROW_INDEX_COLUMN || column.getColumnType() == REGULAR, "column type must be REGULAR: %s", column);
    }
    ImmutableList.Builder<Type> trinoTypes = ImmutableList.builder();
    ImmutableList.Builder<Optional<Field>> internalFields = ImmutableList.builder();
    ImmutableList.Builder<Boolean> rowIndexColumns = ImmutableList.builder();
    for (HiveColumnHandle column : baseColumns) {
        trinoTypes.add(column.getBaseType());
        rowIndexColumns.add(column == PARQUET_ROW_INDEX_COLUMN);
        if (column == PARQUET_ROW_INDEX_COLUMN) {
            internalFields.add(Optional.empty());
        } else {
            internalFields.add(Optional.ofNullable(getParquetType(column, fileSchema, useColumnNames)).flatMap(field -> {
                String columnName = useColumnNames ? column.getBaseColumnName() : fileSchema.getFields().get(column.getBaseHiveColumnIndex()).getName();
                return constructField(column.getBaseType(), lookupColumnByName(messageColumn, columnName));
            }));
        }
    }
    ConnectorPageSource parquetPageSource = new ParquetPageSource(parquetReader, trinoTypes.build(), rowIndexColumns.build(), internalFields.build());
    return new ReaderPageSource(parquetPageSource, readerProjections);
}
Also used : DateTimeZone(org.joda.time.DateTimeZone) HIVE_MISSING_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_MISSING_DATA) FileSystem(org.apache.hadoop.fs.FileSystem) HIVE_CANNOT_OPEN_SPLIT(io.trino.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) MetadataReader(io.trino.parquet.reader.MetadataReader) HiveSessionProperties.isParquetUseColumnIndex(io.trino.plugin.hive.HiveSessionProperties.isParquetUseColumnIndex) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) HiveSessionProperties.isUseParquetColumnNames(io.trino.plugin.hive.HiveSessionProperties.isUseParquetColumnNames) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) HiveParquetColumnIOConverter.constructField(io.trino.plugin.hive.parquet.HiveParquetColumnIOConverter.constructField) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HiveUtil.getDeserializerClassName(io.trino.plugin.hive.util.HiveUtil.getDeserializerClassName) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) ImmutableSet(com.google.common.collect.ImmutableSet) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ConnectorIdentity(io.trino.spi.security.ConnectorIdentity) GroupType(org.apache.parquet.schema.GroupType) ImmutableMap(com.google.common.collect.ImmutableMap) Domain(io.trino.spi.predicate.Domain) ParquetReader(io.trino.parquet.reader.ParquetReader) ReaderColumns(io.trino.plugin.hive.ReaderColumns) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) ParquetTypeUtils.getColumnIO(io.trino.parquet.ParquetTypeUtils.getColumnIO) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) FileNotFoundException(java.io.FileNotFoundException) String.format(java.lang.String.format) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) MessageType(org.apache.parquet.schema.MessageType) List(java.util.List) HiveSessionProperties.getParquetMaxReadBlockSize(io.trino.plugin.hive.HiveSessionProperties.getParquetMaxReadBlockSize) BIGINT(io.trino.spi.type.BigintType.BIGINT) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) Entry(java.util.Map.Entry) Optional(java.util.Optional) HivePageSourceFactory(io.trino.plugin.hive.HivePageSourceFactory) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Strings.nullToEmpty(com.google.common.base.Strings.nullToEmpty) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) Type(io.trino.spi.type.Type) OptionalInt(java.util.OptionalInt) HiveSessionProperties.isParquetIgnoreStatistics(io.trino.plugin.hive.HiveSessionProperties.isParquetIgnoreStatistics) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) Inject(javax.inject.Inject) HashSet(java.util.HashSet) HiveType(io.trino.plugin.hive.HiveType) ParquetTypeUtils.lookupColumnByName(io.trino.parquet.ParquetTypeUtils.lookupColumnByName) ImmutableList(com.google.common.collect.ImmutableList) Objects.requireNonNull(java.util.Objects.requireNonNull) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ParquetTypeUtils.getParquetTypeByName(io.trino.parquet.ParquetTypeUtils.getParquetTypeByName) ParquetReaderOptions(io.trino.parquet.ParquetReaderOptions) Predicate(io.trino.parquet.predicate.Predicate) HIVE_BAD_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA) HivePageSourceProvider.projectSufficientColumns(io.trino.plugin.hive.HivePageSourceProvider.projectSufficientColumns) Properties(java.util.Properties) TrinoColumnIndexStore(io.trino.parquet.reader.TrinoColumnIndexStore) PredicateUtils.predicateMatches(io.trino.parquet.predicate.PredicateUtils.predicateMatches) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) TupleDomain(io.trino.spi.predicate.TupleDomain) PRIMITIVE(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category.PRIMITIVE) AcidInfo(io.trino.plugin.hive.AcidInfo) ParquetTypeUtils.getDescriptors(io.trino.parquet.ParquetTypeUtils.getDescriptors) HivePageSourceProvider.projectBaseColumns(io.trino.plugin.hive.HivePageSourceProvider.projectBaseColumns) Field(io.trino.parquet.Field) ParquetDataSource(io.trino.parquet.ParquetDataSource) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) PredicateUtils.buildPredicate(io.trino.parquet.predicate.PredicateUtils.buildPredicate) HiveConfig(io.trino.plugin.hive.HiveConfig) REGULAR(io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ImmutableList(com.google.common.collect.ImmutableList) FileNotFoundException(java.io.FileNotFoundException) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) Predicate(io.trino.parquet.predicate.Predicate) PredicateUtils.buildPredicate(io.trino.parquet.predicate.PredicateUtils.buildPredicate) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) FileSystem(org.apache.hadoop.fs.FileSystem) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) TrinoColumnIndexStore(io.trino.parquet.reader.TrinoColumnIndexStore) List(java.util.List) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) ImmutableList(com.google.common.collect.ImmutableList) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) MessageType(org.apache.parquet.schema.MessageType) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) ParquetDataSource(io.trino.parquet.ParquetDataSource) Optional(java.util.Optional) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ParquetReader(io.trino.parquet.reader.ParquetReader) IOException(java.io.IOException) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) TrinoException(io.trino.spi.TrinoException) FileNotFoundException(java.io.FileNotFoundException) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) IOException(java.io.IOException) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(io.trino.spi.type.Type) HiveType(io.trino.plugin.hive.HiveType) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) TrinoException(io.trino.spi.TrinoException) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) ReaderColumns(io.trino.plugin.hive.ReaderColumns)

Example 2 with ParquetReaderOptions

use of io.trino.parquet.ParquetReaderOptions in project trino by trinodb.

the class TestHdfsParquetDataSource method testPlanReadOrdering.

@Test(dataProvider = "testPlanReadOrderingProvider")
public void testPlanReadOrdering(DataSize maxBufferSize) {
    Slice testingInput = Slices.wrappedIntArray(IntStream.range(0, 1000).toArray());
    HdfsParquetDataSource dataSource = new HdfsParquetDataSource(new ParquetDataSourceId("test"), 0, new FSDataInputStream(new TestingSliceInputStream(testingInput.getInput())), new FileFormatDataSourceStats(), new ParquetReaderOptions().withMaxBufferSize(maxBufferSize));
    ListMultimap<String, ChunkReader> chunkReaders = dataSource.planRead(ImmutableListMultimap.<String, DiskRange>builder().putAll("test", new DiskRange(0, 300), new DiskRange(400, 100), new DiskRange(700, 200)).build());
    assertThat(chunkReaders.get("test")).map(ChunkReader::read).isEqualTo(ImmutableList.of(testingInput.slice(0, 300), testingInput.slice(400, 100), testingInput.slice(700, 200)));
}
Also used : Slice(io.airlift.slice.Slice) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) ChunkReader(io.trino.parquet.ChunkReader) DiskRange(io.trino.parquet.DiskRange) ParquetReaderOptions(io.trino.parquet.ParquetReaderOptions) Test(org.testng.annotations.Test)

Example 3 with ParquetReaderOptions

use of io.trino.parquet.ParquetReaderOptions in project trino by trinodb.

the class IcebergPageSourceProvider method createParquetPageSource.

private static ReaderPageSource createParquetPageSource(HdfsEnvironment hdfsEnvironment, ConnectorIdentity identity, Configuration configuration, Path path, long start, long length, long fileSize, List<IcebergColumnHandle> regularColumns, ParquetReaderOptions options, TupleDomain<IcebergColumnHandle> effectivePredicate, FileFormatDataSourceStats fileFormatDataSourceStats, Optional<NameMapping> nameMapping) {
    AggregatedMemoryContext memoryContext = newSimpleAggregatedMemoryContext();
    ParquetDataSource dataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        dataSource = new HdfsParquetDataSource(new ParquetDataSourceId(path.toString()), fileSize, inputStream, fileFormatDataSourceStats, options);
        // extra variable required for lambda below
        ParquetDataSource theDataSource = dataSource;
        ParquetMetadata parquetMetadata = hdfsEnvironment.doAs(identity, () -> MetadataReader.readFooter(theDataSource));
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        if (nameMapping.isPresent() && !ParquetSchemaUtil.hasIds(fileSchema)) {
            // NameMapping conversion is necessary because MetadataReader converts all column names to lowercase and NameMapping is case sensitive
            fileSchema = ParquetSchemaUtil.applyNameMapping(fileSchema, convertToLowercase(nameMapping.get()));
        }
        // Mapping from Iceberg field ID to Parquet fields.
        Map<Integer, org.apache.parquet.schema.Type> parquetIdToField = fileSchema.getFields().stream().filter(field -> field.getId() != null).collect(toImmutableMap(field -> field.getId().intValue(), Function.identity()));
        Optional<ReaderColumns> columnProjections = projectColumns(regularColumns);
        List<IcebergColumnHandle> readColumns = columnProjections.map(readerColumns -> (List<IcebergColumnHandle>) readerColumns.get().stream().map(IcebergColumnHandle.class::cast).collect(toImmutableList())).orElse(regularColumns);
        List<org.apache.parquet.schema.Type> parquetFields = readColumns.stream().map(column -> parquetIdToField.get(column.getId())).collect(toList());
        MessageType requestedSchema = new MessageType(fileSchema.getName(), parquetFields.stream().filter(Objects::nonNull).collect(toImmutableList()));
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath, UTC);
        List<BlockMetaData> blocks = new ArrayList<>();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (start <= firstDataPage && firstDataPage < start + length && predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain)) {
                blocks.add(block);
            }
        }
        MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
        ParquetReader parquetReader = new ParquetReader(Optional.ofNullable(fileMetaData.getCreatedBy()), messageColumnIO, blocks, Optional.empty(), dataSource, UTC, memoryContext, options);
        ImmutableList.Builder<Type> trinoTypes = ImmutableList.builder();
        ImmutableList.Builder<Optional<Field>> internalFields = ImmutableList.builder();
        for (int columnIndex = 0; columnIndex < readColumns.size(); columnIndex++) {
            IcebergColumnHandle column = readColumns.get(columnIndex);
            org.apache.parquet.schema.Type parquetField = parquetFields.get(columnIndex);
            Type trinoType = column.getBaseType();
            trinoTypes.add(trinoType);
            if (parquetField == null) {
                internalFields.add(Optional.empty());
            } else {
                // The top level columns are already mapped by name/id appropriately.
                ColumnIO columnIO = messageColumnIO.getChild(parquetField.getName());
                internalFields.add(IcebergParquetColumnIOConverter.constructField(new FieldContext(trinoType, column.getColumnIdentity()), columnIO));
            }
        }
        return new ReaderPageSource(new ParquetPageSource(parquetReader, trinoTypes.build(), internalFields.build()), columnProjections);
    } catch (IOException | RuntimeException e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        } catch (IOException ignored) {
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof ParquetCorruptionException) {
            throw new TrinoException(ICEBERG_BAD_DATA, message, e);
        }
        if (e instanceof BlockMissingException) {
            throw new TrinoException(ICEBERG_MISSING_DATA, message, e);
        }
        throw new TrinoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) Maps.uniqueIndex(com.google.common.collect.Maps.uniqueIndex) ORC_ICEBERG_ID_KEY(io.trino.plugin.iceberg.TypeConverter.ORC_ICEBERG_ID_KEY) ColumnAdaptation(io.trino.plugin.hive.orc.OrcPageSource.ColumnAdaptation) FileStatus(org.apache.hadoop.fs.FileStatus) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) NOT_SUPPORTED(io.trino.spi.StandardErrorCode.NOT_SUPPORTED) ConnectorTableHandle(io.trino.spi.connector.ConnectorTableHandle) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) ICEBERG_CANNOT_OPEN_SPLIT(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CANNOT_OPEN_SPLIT) UUID(io.trino.spi.type.UuidType.UUID) ENGLISH(java.util.Locale.ENGLISH) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ConnectorIdentity(io.trino.spi.security.ConnectorIdentity) ICEBERG_FILESYSTEM_ERROR(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_FILESYSTEM_ERROR) Domain(io.trino.spi.predicate.Domain) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ReaderColumns(io.trino.plugin.hive.ReaderColumns) Set(java.util.Set) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) OrcReaderConfig(io.trino.plugin.hive.orc.OrcReaderConfig) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnIO(org.apache.parquet.io.ColumnIO) IcebergSessionProperties.getOrcTinyStripeThreshold(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcTinyStripeThreshold) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) MappedField(org.apache.iceberg.mapping.MappedField) Collectors.groupingBy(java.util.stream.Collectors.groupingBy) IcebergSessionProperties.isOrcNestedLazy(io.trino.plugin.iceberg.IcebergSessionProperties.isOrcNestedLazy) IcebergSessionProperties.getOrcMaxBufferSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxBufferSize) HdfsParquetDataSource(io.trino.plugin.hive.parquet.HdfsParquetDataSource) ArrayList(java.util.ArrayList) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) TupleDomainOrcPredicateBuilder(io.trino.orc.TupleDomainOrcPredicate.TupleDomainOrcPredicateBuilder) OrcPageSource(io.trino.plugin.hive.orc.OrcPageSource) IcebergSessionProperties.getOrcMaxMergeDistance(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxMergeDistance) ICEBERG_MISSING_DATA(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_MISSING_DATA) AggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext) ColumnHandle(io.trino.spi.connector.ColumnHandle) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) VARBINARY(io.trino.spi.type.VarbinaryType.VARBINARY) MappedFields(org.apache.iceberg.mapping.MappedFields) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) OrcType(io.trino.orc.metadata.OrcType) Predicate(io.trino.parquet.predicate.Predicate) IcebergSessionProperties.isUseFileSizeFromMetadata(io.trino.plugin.iceberg.IcebergSessionProperties.isUseFileSizeFromMetadata) MapType(io.trino.spi.type.MapType) PredicateUtils.predicateMatches(io.trino.parquet.predicate.PredicateUtils.predicateMatches) ConnectorSplit(io.trino.spi.connector.ConnectorSplit) StandardTypes(io.trino.spi.type.StandardTypes) NameMappingParser(org.apache.iceberg.mapping.NameMappingParser) IcebergSessionProperties.getOrcLazyReadSmallRanges(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcLazyReadSmallRanges) IcebergSessionProperties.getParquetMaxReadBlockSize(io.trino.plugin.iceberg.IcebergSessionProperties.getParquetMaxReadBlockSize) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) UTC(org.joda.time.DateTimeZone.UTC) Field(io.trino.parquet.Field) Traverser(com.google.common.graph.Traverser) ParquetPageSource(io.trino.plugin.hive.parquet.ParquetPageSource) ProjectedLayout(io.trino.orc.OrcReader.ProjectedLayout) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) IcebergSessionProperties.getOrcStreamBufferSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcStreamBufferSize) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ParquetSchemaUtil(org.apache.iceberg.parquet.ParquetSchemaUtil) OrcColumn(io.trino.orc.OrcColumn) PredicateUtils.buildPredicate(io.trino.parquet.predicate.PredicateUtils.buildPredicate) MetadataReader(io.trino.parquet.reader.MetadataReader) ICEBERG_DOMAIN_COMPACTION_THRESHOLD(io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD) OrcRecordReader(io.trino.orc.OrcRecordReader) NameMapping(org.apache.iceberg.mapping.NameMapping) Path(org.apache.hadoop.fs.Path) OrcDataSource(io.trino.orc.OrcDataSource) ReaderProjectionsAdapter(io.trino.plugin.hive.ReaderProjectionsAdapter) RowType(io.trino.spi.type.RowType) ICEBERG_BAD_DATA(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_BAD_DATA) ImmutableMap(com.google.common.collect.ImmutableMap) INITIAL_BATCH_SIZE(io.trino.orc.OrcReader.INITIAL_BATCH_SIZE) ParquetReader(io.trino.parquet.reader.ParquetReader) FieldContext(io.trino.plugin.iceberg.IcebergParquetColumnIOConverter.FieldContext) ICEBERG_CURSOR_ERROR(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CURSOR_ERROR) TrinoException(io.trino.spi.TrinoException) ArrayType(io.trino.spi.type.ArrayType) ParquetTypeUtils.getColumnIO(io.trino.parquet.ParquetTypeUtils.getColumnIO) Collectors(java.util.stream.Collectors) String.format(java.lang.String.format) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) OrcDataSourceId(io.trino.orc.OrcDataSourceId) MessageType(org.apache.parquet.schema.MessageType) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) List(java.util.List) DynamicFilter(io.trino.spi.connector.DynamicFilter) Optional(java.util.Optional) IcebergSessionProperties.getOrcMaxReadBlockSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxReadBlockSize) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) Type(io.trino.spi.type.Type) HashMap(java.util.HashMap) TupleDomainOrcPredicate(io.trino.orc.TupleDomainOrcPredicate) Function(java.util.function.Function) Inject(javax.inject.Inject) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) OrcReaderOptions(io.trino.orc.OrcReaderOptions) Objects.requireNonNull(java.util.Objects.requireNonNull) Collectors.mapping(java.util.stream.Collectors.mapping) IcebergSessionProperties.isOrcBloomFiltersEnabled(io.trino.plugin.iceberg.IcebergSessionProperties.isOrcBloomFiltersEnabled) HdfsOrcDataSource(io.trino.plugin.hive.orc.HdfsOrcDataSource) ParquetReaderOptions(io.trino.parquet.ParquetReaderOptions) OrcReader(io.trino.orc.OrcReader) ConnectorPageSourceProvider(io.trino.spi.connector.ConnectorPageSourceProvider) ICEBERG_BINARY_TYPE(io.trino.plugin.iceberg.TypeConverter.ICEBERG_BINARY_TYPE) TupleDomain(io.trino.spi.predicate.TupleDomain) OrcReader.fullyProjectedLayout(io.trino.orc.OrcReader.fullyProjectedLayout) OrcCorruptionException(io.trino.orc.OrcCorruptionException) Collectors.toList(java.util.stream.Collectors.toList) ParquetTypeUtils.getDescriptors(io.trino.parquet.ParquetTypeUtils.getDescriptors) ParquetDataSource(io.trino.parquet.ParquetDataSource) TypeManager(io.trino.spi.type.TypeManager) ConnectorTransactionHandle(io.trino.spi.connector.ConnectorTransactionHandle) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ArrayList(java.util.ArrayList) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) Predicate(io.trino.parquet.predicate.Predicate) PredicateUtils.buildPredicate(io.trino.parquet.predicate.PredicateUtils.buildPredicate) TupleDomainOrcPredicate(io.trino.orc.TupleDomainOrcPredicate) FileSystem(org.apache.hadoop.fs.FileSystem) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ArrayList(java.util.ArrayList) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) Collectors.toList(java.util.stream.Collectors.toList) MessageType(org.apache.parquet.schema.MessageType) Optional(java.util.Optional) ParquetReader(io.trino.parquet.reader.ParquetReader) ParquetPageSource(io.trino.plugin.hive.parquet.ParquetPageSource) ColumnIO(org.apache.parquet.io.ColumnIO) ParquetTypeUtils.getColumnIO(io.trino.parquet.ParquetTypeUtils.getColumnIO) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) Objects(java.util.Objects) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) HdfsParquetDataSource(io.trino.plugin.hive.parquet.HdfsParquetDataSource) ParquetDataSource(io.trino.parquet.ParquetDataSource) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) FieldContext(io.trino.plugin.iceberg.IcebergParquetColumnIOConverter.FieldContext) IOException(java.io.IOException) AggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) HdfsParquetDataSource(io.trino.plugin.hive.parquet.HdfsParquetDataSource) OrcType(io.trino.orc.metadata.OrcType) MapType(io.trino.spi.type.MapType) RowType(io.trino.spi.type.RowType) ArrayType(io.trino.spi.type.ArrayType) MessageType(org.apache.parquet.schema.MessageType) Type(io.trino.spi.type.Type) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) TrinoException(io.trino.spi.TrinoException) ReaderColumns(io.trino.plugin.hive.ReaderColumns)

Example 4 with ParquetReaderOptions

use of io.trino.parquet.ParquetReaderOptions in project trino by trinodb.

the class ParquetPageSourceFactory method getColumnIndexStore.

private static Optional<ColumnIndexStore> getColumnIndexStore(ParquetDataSource dataSource, BlockMetaData blockMetadata, Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<ColumnDescriptor> parquetTupleDomain, ParquetReaderOptions options) {
    if (!options.isUseColumnIndex() || parquetTupleDomain.isAll() || parquetTupleDomain.isNone()) {
        return Optional.empty();
    }
    boolean hasColumnIndex = false;
    for (ColumnChunkMetaData column : blockMetadata.getColumns()) {
        if (column.getColumnIndexReference() != null && column.getOffsetIndexReference() != null) {
            hasColumnIndex = true;
            break;
        }
    }
    if (!hasColumnIndex) {
        return Optional.empty();
    }
    Set<ColumnPath> columnsReadPaths = new HashSet<>(descriptorsByPath.size());
    for (List<String> path : descriptorsByPath.keySet()) {
        columnsReadPaths.add(ColumnPath.get(path.toArray(new String[0])));
    }
    Map<ColumnDescriptor, Domain> parquetDomains = parquetTupleDomain.getDomains().orElseThrow(() -> new IllegalStateException("Predicate other than none should have domains"));
    Set<ColumnPath> columnsFilteredPaths = parquetDomains.keySet().stream().map(column -> ColumnPath.get(column.getPath())).collect(toImmutableSet());
    return Optional.of(new TrinoColumnIndexStore(dataSource, blockMetadata, columnsReadPaths, columnsFilteredPaths));
}
Also used : DateTimeZone(org.joda.time.DateTimeZone) HIVE_MISSING_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_MISSING_DATA) FileSystem(org.apache.hadoop.fs.FileSystem) HIVE_CANNOT_OPEN_SPLIT(io.trino.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) MetadataReader(io.trino.parquet.reader.MetadataReader) HiveSessionProperties.isParquetUseColumnIndex(io.trino.plugin.hive.HiveSessionProperties.isParquetUseColumnIndex) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) HiveSessionProperties.isUseParquetColumnNames(io.trino.plugin.hive.HiveSessionProperties.isUseParquetColumnNames) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) HiveParquetColumnIOConverter.constructField(io.trino.plugin.hive.parquet.HiveParquetColumnIOConverter.constructField) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HiveUtil.getDeserializerClassName(io.trino.plugin.hive.util.HiveUtil.getDeserializerClassName) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) ImmutableSet(com.google.common.collect.ImmutableSet) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ConnectorIdentity(io.trino.spi.security.ConnectorIdentity) GroupType(org.apache.parquet.schema.GroupType) ImmutableMap(com.google.common.collect.ImmutableMap) Domain(io.trino.spi.predicate.Domain) ParquetReader(io.trino.parquet.reader.ParquetReader) ReaderColumns(io.trino.plugin.hive.ReaderColumns) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) ParquetTypeUtils.getColumnIO(io.trino.parquet.ParquetTypeUtils.getColumnIO) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) FileNotFoundException(java.io.FileNotFoundException) String.format(java.lang.String.format) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) MessageType(org.apache.parquet.schema.MessageType) List(java.util.List) HiveSessionProperties.getParquetMaxReadBlockSize(io.trino.plugin.hive.HiveSessionProperties.getParquetMaxReadBlockSize) BIGINT(io.trino.spi.type.BigintType.BIGINT) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) Entry(java.util.Map.Entry) Optional(java.util.Optional) HivePageSourceFactory(io.trino.plugin.hive.HivePageSourceFactory) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Strings.nullToEmpty(com.google.common.base.Strings.nullToEmpty) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) Type(io.trino.spi.type.Type) OptionalInt(java.util.OptionalInt) HiveSessionProperties.isParquetIgnoreStatistics(io.trino.plugin.hive.HiveSessionProperties.isParquetIgnoreStatistics) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) Inject(javax.inject.Inject) HashSet(java.util.HashSet) HiveType(io.trino.plugin.hive.HiveType) ParquetTypeUtils.lookupColumnByName(io.trino.parquet.ParquetTypeUtils.lookupColumnByName) ImmutableList(com.google.common.collect.ImmutableList) Objects.requireNonNull(java.util.Objects.requireNonNull) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ParquetTypeUtils.getParquetTypeByName(io.trino.parquet.ParquetTypeUtils.getParquetTypeByName) ParquetReaderOptions(io.trino.parquet.ParquetReaderOptions) Predicate(io.trino.parquet.predicate.Predicate) HIVE_BAD_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA) HivePageSourceProvider.projectSufficientColumns(io.trino.plugin.hive.HivePageSourceProvider.projectSufficientColumns) Properties(java.util.Properties) TrinoColumnIndexStore(io.trino.parquet.reader.TrinoColumnIndexStore) PredicateUtils.predicateMatches(io.trino.parquet.predicate.PredicateUtils.predicateMatches) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) TupleDomain(io.trino.spi.predicate.TupleDomain) PRIMITIVE(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category.PRIMITIVE) AcidInfo(io.trino.plugin.hive.AcidInfo) ParquetTypeUtils.getDescriptors(io.trino.parquet.ParquetTypeUtils.getDescriptors) HivePageSourceProvider.projectBaseColumns(io.trino.plugin.hive.HivePageSourceProvider.projectBaseColumns) Field(io.trino.parquet.Field) ParquetDataSource(io.trino.parquet.ParquetDataSource) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) PredicateUtils.buildPredicate(io.trino.parquet.predicate.PredicateUtils.buildPredicate) HiveConfig(io.trino.plugin.hive.HiveConfig) REGULAR(io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) TrinoColumnIndexStore(io.trino.parquet.reader.TrinoColumnIndexStore) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Domain(io.trino.spi.predicate.Domain) TupleDomain(io.trino.spi.predicate.TupleDomain) HashSet(java.util.HashSet)

Example 5 with ParquetReaderOptions

use of io.trino.parquet.ParquetReaderOptions in project trino by trinodb.

the class DeltaLakePageSourceProvider method createPageSource.

@Override
public ConnectorPageSource createPageSource(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorSplit connectorSplit, ConnectorTableHandle connectorTable, List<ColumnHandle> columns, DynamicFilter dynamicFilter) {
    DeltaLakeSplit split = (DeltaLakeSplit) connectorSplit;
    DeltaLakeTableHandle table = (DeltaLakeTableHandle) connectorTable;
    // We reach here when we could not prune the split using file level stats, table predicate
    // and the dynamic filter in the coordinator during split generation. The file level stats
    // in DeltaLakeSplit#filePredicate could help to prune this split when a more selective dynamic filter
    // is available now, without having to access parquet file footer for row-group stats.
    // We avoid sending DeltaLakeSplit#splitPredicate to workers by using table.getPredicate() here.
    TupleDomain<DeltaLakeColumnHandle> filteredSplitPredicate = TupleDomain.intersect(ImmutableList.of(table.getNonPartitionConstraint(), split.getStatisticsPredicate(), dynamicFilter.getCurrentPredicate().transformKeys(DeltaLakeColumnHandle.class::cast)));
    if (filteredSplitPredicate.isNone()) {
        return new EmptyPageSource();
    }
    List<DeltaLakeColumnHandle> deltaLakeColumns = columns.stream().map(DeltaLakeColumnHandle.class::cast).collect(toImmutableList());
    Map<String, Optional<String>> partitionKeys = split.getPartitionKeys();
    List<DeltaLakeColumnHandle> regularColumns = deltaLakeColumns.stream().filter(column -> column.getColumnType() == REGULAR).collect(toImmutableList());
    List<HiveColumnHandle> hiveColumnHandles = regularColumns.stream().map(DeltaLakeColumnHandle::toHiveColumnHandle).collect(toImmutableList());
    Path path = new Path(split.getPath());
    HdfsContext hdfsContext = new HdfsContext(session);
    TupleDomain<HiveColumnHandle> parquetPredicate = getParquetTupleDomain(filteredSplitPredicate.simplify(domainCompactionThreshold));
    if (table.getWriteType().isPresent()) {
        return new DeltaLakeUpdatablePageSource(table, deltaLakeColumns, partitionKeys, split.getPath(), split.getFileSize(), split.getFileModifiedTime(), session, executorService, hdfsEnvironment, hdfsContext, parquetDateTimeZone, parquetReaderOptions, parquetPredicate, typeManager, updateResultJsonCodec);
    }
    ReaderPageSource pageSource = ParquetPageSourceFactory.createPageSource(path, split.getStart(), split.getLength(), split.getFileSize(), hiveColumnHandles, parquetPredicate, true, hdfsEnvironment, hdfsEnvironment.getConfiguration(hdfsContext, path), session.getIdentity(), parquetDateTimeZone, fileFormatDataSourceStats, parquetReaderOptions.withMaxReadBlockSize(getParquetMaxReadBlockSize(session)).withUseColumnIndex(isParquetUseColumnIndex(session)));
    verify(pageSource.getReaderColumns().isEmpty(), "All columns expected to be base columns");
    return new DeltaLakePageSource(deltaLakeColumns, partitionKeys, pageSource.get(), split.getPath(), split.getFileSize(), split.getFileModifiedTime());
}
Also used : DateTimeZone(org.joda.time.DateTimeZone) HiveSessionProperties.isParquetUseColumnIndex(io.trino.plugin.hive.HiveSessionProperties.isParquetUseColumnIndex) Inject(javax.inject.Inject) ParquetPageSourceFactory(io.trino.plugin.hive.parquet.ParquetPageSourceFactory) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) ConnectorTableHandle(io.trino.spi.connector.ConnectorTableHandle) Map(java.util.Map) Objects.requireNonNull(java.util.Objects.requireNonNull) ColumnHandle(io.trino.spi.connector.ColumnHandle) Path(org.apache.hadoop.fs.Path) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) ExecutorService(java.util.concurrent.ExecutorService) ParquetReaderOptions(io.trino.parquet.ParquetReaderOptions) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ImmutableMap(com.google.common.collect.ImmutableMap) ConnectorSplit(io.trino.spi.connector.ConnectorSplit) Domain(io.trino.spi.predicate.Domain) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ConnectorPageSourceProvider(io.trino.spi.connector.ConnectorPageSourceProvider) StandardTypes(io.trino.spi.type.StandardTypes) ConnectorSession(io.trino.spi.connector.ConnectorSession) TupleDomain(io.trino.spi.predicate.TupleDomain) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) List(java.util.List) DeltaLakeSessionProperties.getParquetMaxReadBlockSize(io.trino.plugin.deltalake.DeltaLakeSessionProperties.getParquetMaxReadBlockSize) DynamicFilter(io.trino.spi.connector.DynamicFilter) Optional(java.util.Optional) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig) EmptyPageSource(io.trino.spi.connector.EmptyPageSource) TypeManager(io.trino.spi.type.TypeManager) HiveConfig(io.trino.plugin.hive.HiveConfig) REGULAR(io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR) JsonCodec(io.airlift.json.JsonCodec) ConnectorTransactionHandle(io.trino.spi.connector.ConnectorTransactionHandle) Path(org.apache.hadoop.fs.Path) Optional(java.util.Optional) EmptyPageSource(io.trino.spi.connector.EmptyPageSource) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle)

Aggregations

ParquetReaderOptions (io.trino.parquet.ParquetReaderOptions)5 FileFormatDataSourceStats (io.trino.plugin.hive.FileFormatDataSourceStats)5 ImmutableList (com.google.common.collect.ImmutableList)4 ImmutableMap (com.google.common.collect.ImmutableMap)4 ParquetDataSourceId (io.trino.parquet.ParquetDataSourceId)4 HdfsEnvironment (io.trino.plugin.hive.HdfsEnvironment)4 ReaderPageSource (io.trino.plugin.hive.ReaderPageSource)4 ConnectorPageSource (io.trino.spi.connector.ConnectorPageSource)4 ConnectorSession (io.trino.spi.connector.ConnectorSession)4 Domain (io.trino.spi.predicate.Domain)4 TupleDomain (io.trino.spi.predicate.TupleDomain)4 List (java.util.List)4 Map (java.util.Map)4 Objects.requireNonNull (java.util.Objects.requireNonNull)4 Optional (java.util.Optional)4 Inject (javax.inject.Inject)4 ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet)3 AggregatedMemoryContext.newSimpleAggregatedMemoryContext (io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext)3 Field (io.trino.parquet.Field)3 ParquetCorruptionException (io.trino.parquet.ParquetCorruptionException)3