Example 11 with Domain

Use of io.trino.spi.predicate.Domain in project trino by trinodb.

The class OrcPageSourceFactory, method createOrcPageSource:

private ConnectorPageSource createOrcPageSource(HdfsEnvironment hdfsEnvironment, ConnectorIdentity identity, Configuration configuration, Path path, long start, long length, long estimatedFileSize, List<HiveColumnHandle> columns, List<HiveColumnHandle> projections, boolean useOrcColumnNames, boolean isFullAcid, TupleDomain<HiveColumnHandle> effectivePredicate, DateTimeZone legacyFileTimeZone, OrcReaderOptions options, Optional<AcidInfo> acidInfo, OptionalInt bucketNumber, boolean originalFile, AcidTransaction transaction, FileFormatDataSourceStats stats) {
    for (HiveColumnHandle column : columns) {
        checkArgument(column.getColumnType() == REGULAR, "column type must be regular: %s", column);
    }
    checkArgument(!effectivePredicate.isNone());
    OrcDataSource orcDataSource;
    boolean originalFilesPresent = acidInfo.isPresent() && !acidInfo.get().getOriginalFiles().isEmpty();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), estimatedFileSize, options, inputStream, stats);
    } catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
    }
    AggregatedMemoryContext memoryUsage = newSimpleAggregatedMemoryContext();
    try {
        Optional<OrcReader> optionalOrcReader = OrcReader.createOrcReader(orcDataSource, options);
        if (optionalOrcReader.isEmpty()) {
            return new EmptyPageSource();
        }
        OrcReader reader = optionalOrcReader.get();
        if (!originalFile && acidInfo.isPresent() && !acidInfo.get().isOrcAcidVersionValidated()) {
            validateOrcAcidVersion(path, reader);
        }
        List<OrcColumn> fileColumns = reader.getRootColumn().getNestedColumns();
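        // Full ACID reads add three metadata columns (originalTransaction, bucket, rowId)
        // on top of the user columns, so reserve capacity for them up front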
        int actualColumnCount = columns.size() + (isFullAcid ? 3 : 0);
        List<OrcColumn> fileReadColumns = new ArrayList<>(actualColumnCount);
        List<Type> fileReadTypes = new ArrayList<>(actualColumnCount);
        List<OrcReader.ProjectedLayout> fileReadLayouts = new ArrayList<>(actualColumnCount);
        if (isFullAcid && !originalFilesPresent) {
            verifyAcidSchema(reader, path);
            Map<String, OrcColumn> acidColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
            fileColumns = ensureColumnNameConsistency(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ROW_STRUCT.toLowerCase(ENGLISH)).getNestedColumns(), columns);
            fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ORIGINAL_TRANSACTION.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadLayouts.add(fullyProjectedLayout());
            fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_BUCKET.toLowerCase(ENGLISH)));
            fileReadTypes.add(INTEGER);
            fileReadLayouts.add(fullyProjectedLayout());
            fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ROW_ID.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadLayouts.add(fullyProjectedLayout());
        }
        Map<String, OrcColumn> fileColumnsByName = ImmutableMap.of();
        if (useOrcColumnNames || isFullAcid) {
            verifyFileHasColumnNames(fileColumns, path);
            // Convert column names read from ORC files to lower case to be consistent with those stored in Hive Metastore
            fileColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
        }
        Map<String, List<List<String>>> projectionsByColumnName = ImmutableMap.of();
        Map<Integer, List<List<String>>> projectionsByColumnIndex = ImmutableMap.of();
        if (useOrcColumnNames || isFullAcid) {
            projectionsByColumnName = projections.stream().collect(Collectors.groupingBy(HiveColumnHandle::getBaseColumnName, mapping(OrcPageSourceFactory::getDereferencesAsList, toList())));
        } else {
            projectionsByColumnIndex = projections.stream().collect(Collectors.groupingBy(HiveColumnHandle::getBaseHiveColumnIndex, mapping(OrcPageSourceFactory::getDereferencesAsList, toList())));
        }
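        // The predicate builder collects per-column Domains; the ORC reader uses them
        // (plus optional bloom filters) to skip stripes and row groups via file statistics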
        TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder()
                .setBloomFiltersEnabled(options.isBloomFiltersEnabled())
                .setDomainCompactionThreshold(domainCompactionThreshold);
        Map<HiveColumnHandle, Domain> effectivePredicateDomains = effectivePredicate.getDomains().orElseThrow(() -> new IllegalArgumentException("Effective predicate is none"));
        List<ColumnAdaptation> columnAdaptations = new ArrayList<>(columns.size());
        for (HiveColumnHandle column : columns) {
            OrcColumn orcColumn = null;
            OrcReader.ProjectedLayout projectedLayout = null;
            Map<Optional<HiveColumnProjectionInfo>, Domain> columnDomains = null;
            if (useOrcColumnNames || isFullAcid) {
                String columnName = column.getName().toLowerCase(ENGLISH);
                orcColumn = fileColumnsByName.get(columnName);
                if (orcColumn != null) {
                    projectedLayout = createProjectedLayout(orcColumn, projectionsByColumnName.get(columnName));
                    columnDomains = effectivePredicateDomains.entrySet().stream()
                            .filter(columnDomain -> columnDomain.getKey().getBaseColumnName().toLowerCase(ENGLISH).equals(columnName))
                            .collect(toImmutableMap(columnDomain -> columnDomain.getKey().getHiveColumnProjectionInfo(), Map.Entry::getValue));
                }
            } else if (column.getBaseHiveColumnIndex() < fileColumns.size()) {
                orcColumn = fileColumns.get(column.getBaseHiveColumnIndex());
                if (orcColumn != null) {
                    projectedLayout = createProjectedLayout(orcColumn, projectionsByColumnIndex.get(column.getBaseHiveColumnIndex()));
                    columnDomains = effectivePredicateDomains.entrySet().stream()
                            .filter(columnDomain -> columnDomain.getKey().getBaseHiveColumnIndex() == column.getBaseHiveColumnIndex())
                            .collect(toImmutableMap(columnDomain -> columnDomain.getKey().getHiveColumnProjectionInfo(), Map.Entry::getValue));
                }
            }
            Type readType = column.getType();
            if (orcColumn != null) {
                int sourceIndex = fileReadColumns.size();
                columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex));
                fileReadColumns.add(orcColumn);
                fileReadTypes.add(readType);
                fileReadLayouts.add(projectedLayout);
                // Add predicates on top-level and nested columns
                for (Map.Entry<Optional<HiveColumnProjectionInfo>, Domain> columnDomain : columnDomains.entrySet()) {
                    OrcColumn nestedColumn = getNestedColumn(orcColumn, columnDomain.getKey());
                    if (nestedColumn != null) {
                        predicateBuilder.addColumn(nestedColumn.getColumnId(), columnDomain.getValue());
                    }
                }
            } else {
                columnAdaptations.add(ColumnAdaptation.nullColumn(readType));
            }
        }
        OrcRecordReader recordReader = reader.createRecordReader(
                fileReadColumns,
                fileReadTypes,
                fileReadLayouts,
                predicateBuilder.build(),
                start,
                length,
                legacyFileTimeZone,
                memoryUsage,
                INITIAL_BATCH_SIZE,
                exception -> handleException(orcDataSource.getId(), exception),
                NameBasedFieldMapper::create);
        Optional<OrcDeletedRows> deletedRows = acidInfo.map(info -> new OrcDeletedRows(
                path.getName(),
                new OrcDeleteDeltaPageSourceFactory(options, identity, configuration, hdfsEnvironment, stats),
                identity,
                configuration,
                hdfsEnvironment,
                info,
                bucketNumber,
                memoryUsage));
        Optional<Long> originalFileRowId = acidInfo
                .filter(OrcPageSourceFactory::hasOriginalFiles)
                .map(info -> OriginalFilesUtils.getPrecedingRowCount(acidInfo.get().getOriginalFiles(), path, hdfsEnvironment, identity, options, configuration, stats));
        if (transaction.isDelete()) {
            if (originalFile) {
                int bucket = bucketNumber.orElse(0);
                long startingRowId = originalFileRowId.orElse(0L);
                columnAdaptations.add(ColumnAdaptation.originalFileRowIdColumn(startingRowId, bucket));
            } else {
                columnAdaptations.add(ColumnAdaptation.rowIdColumn());
            }
        } else if (transaction.isUpdate()) {
            HiveUpdateProcessor updateProcessor = transaction.getUpdateProcessor().orElseThrow(() -> new IllegalArgumentException("updateProcessor not present"));
            List<HiveColumnHandle> dependencyColumns = projections.stream().filter(HiveColumnHandle::isBaseColumn).collect(toImmutableList());
            if (originalFile) {
                int bucket = bucketNumber.orElse(0);
                long startingRowId = originalFileRowId.orElse(0L);
                columnAdaptations.add(updatedRowColumnsWithOriginalFiles(startingRowId, bucket, updateProcessor, dependencyColumns));
            } else {
                columnAdaptations.add(updatedRowColumns(updateProcessor, dependencyColumns));
            }
        }
        return new OrcPageSource(recordReader, columnAdaptations, orcDataSource, deletedRows, originalFileRowId, memoryUsage, stats);
    } catch (Exception e) {
        try {
            orcDataSource.close();
        } catch (IOException ignored) {
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        String message = splitError(e, path, start, length);
        if (e instanceof BlockMissingException) {
            throw new TrinoException(HIVE_MISSING_DATA, message, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
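A minimal, self-contained sketch (not taken from the Trino codebase) of how an effectivePredicate of the kind consumed above can be assembled. String keys stand in for HiveColumnHandle, and the column name and range bounds are invented:

import com.google.common.collect.ImmutableMap;
import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.Range;
import io.trino.spi.predicate.TupleDomain;
import io.trino.spi.predicate.ValueSet;

import static io.trino.spi.type.BigintType.BIGINT;

public class DomainSketch
{
    public static void main(String[] args)
    {
        // "id > 100 AND id <= 200"; nullAllowed = false means NULL never matches
        Domain idDomain = Domain.create(
                ValueSet.ofRanges(Range.range(BIGINT, 100L, false, 200L, true)),
                false);

        // A TupleDomain is a conjunction of per-column Domains. Keyed by String
        // here only to keep the sketch self-contained; the factory above keys
        // by HiveColumnHandle and hands each Domain to predicateBuilder.addColumn
        TupleDomain<String> effectivePredicate =
                TupleDomain.withColumnDomains(ImmutableMap.of("id", idDomain));

        System.out.println(effectivePredicate.getDomains().orElseThrow());
    }
}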
Also used : DateTimeZone(org.joda.time.DateTimeZone) HiveUpdateProcessor(io.trino.plugin.hive.HiveUpdateProcessor) FileSystem(org.apache.hadoop.fs.FileSystem) HIVE_CANNOT_OPEN_SPLIT(io.trino.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) OrcTypeKind(io.trino.orc.metadata.OrcType.OrcTypeKind) Maps.uniqueIndex(com.google.common.collect.Maps.uniqueIndex) ColumnAdaptation(io.trino.plugin.hive.orc.OrcPageSource.ColumnAdaptation) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) NOT_SUPPORTED(io.trino.spi.StandardErrorCode.NOT_SUPPORTED) HiveSessionProperties.getOrcLazyReadSmallRanges(io.trino.plugin.hive.HiveSessionProperties.getOrcLazyReadSmallRanges) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) ENGLISH(java.util.Locale.ENGLISH) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) HiveSessionProperties.getOrcTinyStripeThreshold(io.trino.plugin.hive.HiveSessionProperties.getOrcTinyStripeThreshold) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ConnectorIdentity(io.trino.spi.security.ConnectorIdentity) Domain(io.trino.spi.predicate.Domain) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ReaderColumns(io.trino.plugin.hive.ReaderColumns) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Slice(io.airlift.slice.Slice) STRUCT(io.trino.orc.metadata.OrcType.OrcTypeKind.STRUCT) ColumnAdaptation.updatedRowColumns(io.trino.plugin.hive.orc.OrcPageSource.ColumnAdaptation.updatedRowColumns) ArrayList(java.util.ArrayList) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) TupleDomainOrcPredicateBuilder(io.trino.orc.TupleDomainOrcPredicate.TupleDomainOrcPredicateBuilder) LONG(io.trino.orc.metadata.OrcType.OrcTypeKind.LONG) AggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext) INT(io.trino.orc.metadata.OrcType.OrcTypeKind.INT) HIVE_BAD_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA) Properties(java.util.Properties) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) NameBasedFieldMapper(io.trino.orc.NameBasedFieldMapper) HivePageSourceProvider.projectBaseColumns(io.trino.plugin.hive.HivePageSourceProvider.projectBaseColumns) HiveSessionProperties.isOrcNestedLazy(io.trino.plugin.hive.HiveSessionProperties.isOrcNestedLazy) OrcColumn(io.trino.orc.OrcColumn) HIVE_MISSING_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_MISSING_DATA) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) OrcRecordReader(io.trino.orc.OrcRecordReader) Path(org.apache.hadoop.fs.Path) OrcDataSource(io.trino.orc.OrcDataSource) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) INTEGER(io.trino.spi.type.IntegerType.INTEGER) ImmutableMap(com.google.common.collect.ImmutableMap) AcidUtils.isFullAcidTable(org.apache.hadoop.hive.ql.io.AcidUtils.isFullAcidTable) INITIAL_BATCH_SIZE(io.trino.orc.OrcReader.INITIAL_BATCH_SIZE) OrcPageSource.handleException(io.trino.plugin.hive.orc.OrcPageSource.handleException) TrinoException(io.trino.spi.TrinoException) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) String.format(java.lang.String.format) 
OrcDataSourceId(io.trino.orc.OrcDataSourceId) List(java.util.List) BIGINT(io.trino.spi.type.BigintType.BIGINT) Function.identity(java.util.function.Function.identity) Optional(java.util.Optional) PRESTO_WRITER_ID(io.trino.orc.metadata.OrcMetadataWriter.PRESTO_WRITER_ID) HivePageSourceFactory(io.trino.plugin.hive.HivePageSourceFactory) Pattern(java.util.regex.Pattern) TRINO_WRITER_ID(io.trino.orc.metadata.OrcMetadataWriter.TRINO_WRITER_ID) Strings.nullToEmpty(com.google.common.base.Strings.nullToEmpty) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) HiveUtil.isDeserializerClass(io.trino.plugin.hive.util.HiveUtil.isDeserializerClass) Type(io.trino.spi.type.Type) TupleDomainOrcPredicate(io.trino.orc.TupleDomainOrcPredicate) AcidSchema(io.trino.plugin.hive.acid.AcidSchema) HiveSessionProperties.isUseOrcColumnNames(io.trino.plugin.hive.HiveSessionProperties.isUseOrcColumnNames) OptionalInt(java.util.OptionalInt) Inject(javax.inject.Inject) HiveSessionProperties.getOrcStreamBufferSize(io.trino.plugin.hive.HiveSessionProperties.getOrcStreamBufferSize) ImmutableList(com.google.common.collect.ImmutableList) OrcReaderOptions(io.trino.orc.OrcReaderOptions) Objects.requireNonNull(java.util.Objects.requireNonNull) Collectors.mapping(java.util.stream.Collectors.mapping) HiveSessionProperties.isOrcBloomFiltersEnabled(io.trino.plugin.hive.HiveSessionProperties.isOrcBloomFiltersEnabled) HiveSessionProperties.getOrcMaxReadBlockSize(io.trino.plugin.hive.HiveSessionProperties.getOrcMaxReadBlockSize) OrcReader(io.trino.orc.OrcReader) HiveSessionProperties.getOrcMaxBufferSize(io.trino.plugin.hive.HiveSessionProperties.getOrcMaxBufferSize) NameBasedProjectedLayout.createProjectedLayout(io.trino.orc.OrcReader.NameBasedProjectedLayout.createProjectedLayout) UTF_8(java.nio.charset.StandardCharsets.UTF_8) TupleDomain(io.trino.spi.predicate.TupleDomain) OrcReader.fullyProjectedLayout(io.trino.orc.OrcReader.fullyProjectedLayout) Maps(com.google.common.collect.Maps) HiveSessionProperties.getOrcMaxMergeDistance(io.trino.plugin.hive.HiveSessionProperties.getOrcMaxMergeDistance) ColumnAdaptation.updatedRowColumnsWithOriginalFiles(io.trino.plugin.hive.orc.OrcPageSource.ColumnAdaptation.updatedRowColumnsWithOriginalFiles) AcidInfo(io.trino.plugin.hive.AcidInfo) HiveColumnProjectionInfo(io.trino.plugin.hive.HiveColumnProjectionInfo) Collectors.toList(java.util.stream.Collectors.toList) EmptyPageSource(io.trino.spi.connector.EmptyPageSource) HIVE_FILE_MISSING_COLUMN_NAMES(io.trino.plugin.hive.HiveErrorCode.HIVE_FILE_MISSING_COLUMN_NAMES) HiveConfig(io.trino.plugin.hive.HiveConfig) REGULAR(io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR)

Example 12 with Domain

Use of io.trino.spi.predicate.Domain in project trino by trinodb.

The class ParquetPageSourceFactory, method createPageSource:

/**
 * This method is available for other callers to use directly.
 */
public static ReaderPageSource createPageSource(Path path, long start, long length, long estimatedFileSize, List<HiveColumnHandle> columns, TupleDomain<HiveColumnHandle> effectivePredicate, boolean useColumnNames, HdfsEnvironment hdfsEnvironment, Configuration configuration, ConnectorIdentity identity, DateTimeZone timeZone, FileFormatDataSourceStats stats, ParquetReaderOptions options) {
    // Ignore predicates on partial columns for now.
    effectivePredicate = effectivePredicate.filter((column, domain) -> column.isBaseColumn());
    MessageType fileSchema;
    MessageType requestedSchema;
    MessageColumnIO messageColumn;
    ParquetReader parquetReader;
    ParquetDataSource dataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        dataSource = new HdfsParquetDataSource(new ParquetDataSourceId(path.toString()), estimatedFileSize, inputStream, stats, options);
        ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource);
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        fileSchema = fileMetaData.getSchema();
        Optional<MessageType> message = projectSufficientColumns(columns)
                .map(projection -> projection.get().stream()
                        .map(HiveColumnHandle.class::cast)
                        .collect(toUnmodifiableList()))
                .orElse(columns).stream()
                .filter(column -> column.getColumnType() == REGULAR)
                .map(column -> getColumnType(column, fileSchema, useColumnNames))
                .filter(Optional::isPresent)
                .map(Optional::get)
                .map(type -> new MessageType(fileSchema.getName(), type))
                .reduce(MessageType::union);
        requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));
        messageColumn = getColumnIO(fileSchema, requestedSchema);
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = options.isIgnoreStatistics() ? TupleDomain.all() : getParquetTupleDomain(descriptorsByPath, effectivePredicate, fileSchema, useColumnNames);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath, timeZone);
        long nextStart = 0;
        ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
        ImmutableList.Builder<Long> blockStarts = ImmutableList.builder();
        ImmutableList.Builder<Optional<ColumnIndexStore>> columnIndexes = ImmutableList.builder();
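        // Keep only row groups whose first data page lies inside this split and whose
        // statistics (and optional column indexes) can possibly match the predicate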
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            Optional<ColumnIndexStore> columnIndex = getColumnIndexStore(dataSource, block, descriptorsByPath, parquetTupleDomain, options);
            if (start <= firstDataPage && firstDataPage < start + length && predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain, columnIndex)) {
                blocks.add(block);
                blockStarts.add(nextStart);
                columnIndexes.add(columnIndex);
            }
            nextStart += block.getRowCount();
        }
        parquetReader = new ParquetReader(
                Optional.ofNullable(fileMetaData.getCreatedBy()),
                messageColumn,
                blocks.build(),
                Optional.of(blockStarts.build()),
                dataSource,
                timeZone,
                newSimpleAggregatedMemoryContext(),
                options,
                parquetPredicate,
                columnIndexes.build());
    } catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        } catch (IOException ignored) {
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        if (e instanceof ParquetCorruptionException) {
            throw new TrinoException(HIVE_BAD_DATA, e);
        }
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof BlockMissingException) {
            throw new TrinoException(HIVE_MISSING_DATA, message, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
    Optional<ReaderColumns> readerProjections = projectBaseColumns(columns);
    List<HiveColumnHandle> baseColumns = readerProjections.map(projection -> projection.get().stream().map(HiveColumnHandle.class::cast).collect(toUnmodifiableList())).orElse(columns);
    for (HiveColumnHandle column : baseColumns) {
        checkArgument(column == PARQUET_ROW_INDEX_COLUMN || column.getColumnType() == REGULAR, "column type must be REGULAR: %s", column);
    }
    ImmutableList.Builder<Type> trinoTypes = ImmutableList.builder();
    ImmutableList.Builder<Optional<Field>> internalFields = ImmutableList.builder();
    ImmutableList.Builder<Boolean> rowIndexColumns = ImmutableList.builder();
    for (HiveColumnHandle column : baseColumns) {
        trinoTypes.add(column.getBaseType());
        rowIndexColumns.add(column == PARQUET_ROW_INDEX_COLUMN);
        if (column == PARQUET_ROW_INDEX_COLUMN) {
            internalFields.add(Optional.empty());
        } else {
            internalFields.add(Optional.ofNullable(getParquetType(column, fileSchema, useColumnNames)).flatMap(field -> {
                String columnName = useColumnNames ? column.getBaseColumnName() : fileSchema.getFields().get(column.getBaseHiveColumnIndex()).getName();
                return constructField(column.getBaseType(), lookupColumnByName(messageColumn, columnName));
            }));
        }
    }
    ConnectorPageSource parquetPageSource = new ParquetPageSource(parquetReader, trinoTypes.build(), rowIndexColumns.build(), internalFields.build());
    return new ReaderPageSource(parquetPageSource, readerProjections);
}
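The first statement of this method, effectivePredicate.filter(...), drops predicate entries on projected (non-base) columns rather than weakening them. A small sketch of TupleDomain.filter with that reading; the String keys and the dot-based naming are invented for illustration:

import com.google.common.collect.ImmutableMap;
import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.TupleDomain;

import static io.trino.spi.type.BigintType.BIGINT;

public class FilterSketch
{
    public static void main(String[] args)
    {
        TupleDomain<String> predicate = TupleDomain.withColumnDomains(ImmutableMap.of(
                "base_col", Domain.singleValue(BIGINT, 1L),
                "base_col.nested", Domain.singleValue(BIGINT, 2L)));

        // Mirrors effectivePredicate.filter((column, domain) -> column.isBaseColumn()):
        // entries that fail the test are removed from the conjunction entirely
        TupleDomain<String> pruned = predicate.filter((name, domain) -> !name.contains("."));

        System.out.println(pruned.getDomains().orElseThrow().keySet()); // [base_col]
    }
}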
Also used : DateTimeZone(org.joda.time.DateTimeZone) HIVE_MISSING_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_MISSING_DATA) FileSystem(org.apache.hadoop.fs.FileSystem) HIVE_CANNOT_OPEN_SPLIT(io.trino.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) MetadataReader(io.trino.parquet.reader.MetadataReader) HiveSessionProperties.isParquetUseColumnIndex(io.trino.plugin.hive.HiveSessionProperties.isParquetUseColumnIndex) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) HiveSessionProperties.isUseParquetColumnNames(io.trino.plugin.hive.HiveSessionProperties.isUseParquetColumnNames) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) HiveParquetColumnIOConverter.constructField(io.trino.plugin.hive.parquet.HiveParquetColumnIOConverter.constructField) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HiveUtil.getDeserializerClassName(io.trino.plugin.hive.util.HiveUtil.getDeserializerClassName) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) ImmutableSet(com.google.common.collect.ImmutableSet) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ConnectorIdentity(io.trino.spi.security.ConnectorIdentity) GroupType(org.apache.parquet.schema.GroupType) ImmutableMap(com.google.common.collect.ImmutableMap) Domain(io.trino.spi.predicate.Domain) ParquetReader(io.trino.parquet.reader.ParquetReader) ReaderColumns(io.trino.plugin.hive.ReaderColumns) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) ParquetTypeUtils.getColumnIO(io.trino.parquet.ParquetTypeUtils.getColumnIO) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) FileNotFoundException(java.io.FileNotFoundException) String.format(java.lang.String.format) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) MessageType(org.apache.parquet.schema.MessageType) List(java.util.List) HiveSessionProperties.getParquetMaxReadBlockSize(io.trino.plugin.hive.HiveSessionProperties.getParquetMaxReadBlockSize) BIGINT(io.trino.spi.type.BigintType.BIGINT) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) Entry(java.util.Map.Entry) Optional(java.util.Optional) HivePageSourceFactory(io.trino.plugin.hive.HivePageSourceFactory) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Strings.nullToEmpty(com.google.common.base.Strings.nullToEmpty) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) Type(io.trino.spi.type.Type) OptionalInt(java.util.OptionalInt) HiveSessionProperties.isParquetIgnoreStatistics(io.trino.plugin.hive.HiveSessionProperties.isParquetIgnoreStatistics) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) Inject(javax.inject.Inject) HashSet(java.util.HashSet) HiveType(io.trino.plugin.hive.HiveType) ParquetTypeUtils.lookupColumnByName(io.trino.parquet.ParquetTypeUtils.lookupColumnByName) 
ImmutableList(com.google.common.collect.ImmutableList) Objects.requireNonNull(java.util.Objects.requireNonNull) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ParquetTypeUtils.getParquetTypeByName(io.trino.parquet.ParquetTypeUtils.getParquetTypeByName) ParquetReaderOptions(io.trino.parquet.ParquetReaderOptions) Predicate(io.trino.parquet.predicate.Predicate) HIVE_BAD_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA) HivePageSourceProvider.projectSufficientColumns(io.trino.plugin.hive.HivePageSourceProvider.projectSufficientColumns) Properties(java.util.Properties) TrinoColumnIndexStore(io.trino.parquet.reader.TrinoColumnIndexStore) PredicateUtils.predicateMatches(io.trino.parquet.predicate.PredicateUtils.predicateMatches) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) TupleDomain(io.trino.spi.predicate.TupleDomain) PRIMITIVE(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category.PRIMITIVE) AcidInfo(io.trino.plugin.hive.AcidInfo) ParquetTypeUtils.getDescriptors(io.trino.parquet.ParquetTypeUtils.getDescriptors) HivePageSourceProvider.projectBaseColumns(io.trino.plugin.hive.HivePageSourceProvider.projectBaseColumns) Field(io.trino.parquet.Field) ParquetDataSource(io.trino.parquet.ParquetDataSource) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) PredicateUtils.buildPredicate(io.trino.parquet.predicate.PredicateUtils.buildPredicate) HiveConfig(io.trino.plugin.hive.HiveConfig) REGULAR(io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR)

Example 13 with Domain

Use of io.trino.spi.predicate.Domain in project trino by trinodb.

The class HiveBucketing, method getHiveBucketFilter:

public static Optional<HiveBucketFilter> getHiveBucketFilter(HiveTableHandle hiveTable, TupleDomain<ColumnHandle> effectivePredicate) {
    if (hiveTable.getBucketHandle().isEmpty()) {
        return Optional.empty();
    }
    HiveBucketProperty hiveBucketProperty = hiveTable.getBucketHandle().get().toTableBucketProperty();
    List<Column> dataColumns = hiveTable.getDataColumns().stream().map(HiveColumnHandle::toMetastoreColumn).collect(toImmutableList());
    Optional<Map<ColumnHandle, List<NullableValue>>> bindings = TupleDomain.extractDiscreteValues(effectivePredicate);
    if (bindings.isEmpty()) {
        return Optional.empty();
    }
    Optional<Set<Integer>> buckets = getHiveBuckets(hiveBucketProperty, dataColumns, bindings.get());
    if (buckets.isPresent()) {
        return Optional.of(new HiveBucketFilter(buckets.get()));
    }
    Optional<Domain> domain = effectivePredicate.getDomains().flatMap(domains -> domains.entrySet().stream()
            .filter(entry -> ((HiveColumnHandle) entry.getKey()).getName().equals(BUCKET_COLUMN_NAME))
            .findFirst()
            .map(Entry::getValue));
    if (domain.isEmpty()) {
        return Optional.empty();
    }
    ValueSet values = domain.get().getValues();
    ImmutableSet.Builder<Integer> builder = ImmutableSet.builder();
    int bucketCount = hiveBucketProperty.getBucketCount();
    for (int i = 0; i < bucketCount; i++) {
        if (values.containsValue((long) i)) {
            builder.add(i);
        }
    }
    return Optional.of(new HiveBucketFilter(builder.build()));
}
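The closing loop above enumerates every possible bucket number and keeps the ones whose value falls inside the $bucket column's Domain. The same containment test in isolation; the bucket count and the IN-list values are invented:

import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.ValueSet;

import static io.trino.spi.type.IntegerType.INTEGER;

public class BucketPruneSketch
{
    public static void main(String[] args)
    {
        // Domain as it might arrive from a predicate like "$bucket" IN (2, 5);
        // INTEGER values are carried as Java longs in the Trino SPI
        Domain bucketDomain = Domain.create(ValueSet.of(INTEGER, 2L, 5L), false);
        ValueSet values = bucketDomain.getValues();

        int bucketCount = 8; // hypothetical table bucket count
        for (int i = 0; i < bucketCount; i++) {
            if (values.containsValue((long) i)) {
                System.out.println("scan bucket " + i); // prints 2 and 5
            }
        }
    }
}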
Also used : JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) BUCKETING_V2(io.trino.plugin.hive.util.HiveBucketing.BucketingVersion.BUCKETING_V2) Lists.cartesianProduct(com.google.common.collect.Lists.cartesianProduct) BUCKETING_V1(io.trino.plugin.hive.util.HiveBucketing.BucketingVersion.BUCKETING_V1) MapTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo) Column(io.trino.plugin.hive.metastore.Column) Map(java.util.Map) PrimitiveTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) ImmutableSet(com.google.common.collect.ImmutableSet) Table(io.trino.plugin.hive.metastore.Table) Domain(io.trino.spi.predicate.Domain) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) HiveTimestampPrecision(io.trino.plugin.hive.HiveTimestampPrecision) Collectors(java.util.stream.Collectors) String.format(java.lang.String.format) ValueSet(io.trino.spi.predicate.ValueSet) Objects(java.util.Objects) List(java.util.List) BUCKET_COLUMN_NAME(io.trino.plugin.hive.HiveColumnHandle.BUCKET_COLUMN_NAME) StandardErrorCode(io.trino.spi.StandardErrorCode) Entry(java.util.Map.Entry) Function.identity(java.util.function.Function.identity) Optional(java.util.Optional) NullableValue(io.trino.spi.predicate.NullableValue) Page(io.trino.spi.Page) HashMap(java.util.HashMap) HiveBucketProperty(io.trino.plugin.hive.HiveBucketProperty) HashSet(java.util.HashSet) HiveType(io.trino.plugin.hive.HiveType) HIVE_INVALID_METADATA(io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA) ImmutableList(com.google.common.collect.ImmutableList) HiveTableHandle(io.trino.plugin.hive.HiveTableHandle) PrimitiveObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector) ColumnHandle(io.trino.spi.connector.ColumnHandle) ListTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo) TABLE_BUCKETING_VERSION(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.TABLE_BUCKETING_VERSION) HiveBucketHandle(io.trino.plugin.hive.HiveBucketHandle) HiveUtil.getRegularColumnHandles(io.trino.plugin.hive.util.HiveUtil.getRegularColumnHandles) HiveSessionProperties.getTimestampPrecision(io.trino.plugin.hive.HiveSessionProperties.getTimestampPrecision) SPARK_TABLE_PROVIDER_KEY(io.trino.plugin.hive.util.HiveUtil.SPARK_TABLE_PROVIDER_KEY) ConnectorSession(io.trino.spi.connector.ConnectorSession) TupleDomain(io.trino.spi.predicate.TupleDomain) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) SortingColumn(io.trino.plugin.hive.metastore.SortingColumn) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) VisibleForTesting(com.google.common.annotations.VisibleForTesting) TypeManager(io.trino.spi.type.TypeManager)

Example 14 with Domain

Use of io.trino.spi.predicate.Domain in project trino by trinodb.

The class AbstractTestHive, method doTestTransactionDeleteInsert:

protected void doTestTransactionDeleteInsert(HiveStorageFormat storageFormat, boolean allowInsertExisting, List<TransactionDeleteInsertTestCase> testCases) throws Exception {
    // There are 4 types of operations on a partition: add, drop, alter (drop then add), insert existing.
    // There are 12 partitions in this test, 3 for each type.
    // 3 is chosen to verify that cleanups, commit aborts, rollbacks are always as complete as possible regardless of failure.
    MaterializedResult beforeData = MaterializedResult.resultBuilder(SESSION, BIGINT, createUnboundedVarcharType(), createUnboundedVarcharType()).row(110L, "a", "alter1").row(120L, "a", "insert1").row(140L, "a", "drop1").row(210L, "b", "drop2").row(310L, "c", "alter2").row(320L, "c", "alter3").row(510L, "e", "drop3").row(610L, "f", "insert2").row(620L, "f", "insert3").build();
    Domain domainToDrop = Domain.create(ValueSet.of(createUnboundedVarcharType(), utf8Slice("alter1"), utf8Slice("alter2"), utf8Slice("alter3"), utf8Slice("drop1"), utf8Slice("drop2"), utf8Slice("drop3")), false);
    List<MaterializedRow> extraRowsForInsertExisting = ImmutableList.of();
    if (allowInsertExisting) {
        extraRowsForInsertExisting = MaterializedResult.resultBuilder(SESSION, BIGINT, createUnboundedVarcharType(), createUnboundedVarcharType()).row(121L, "a", "insert1").row(611L, "f", "insert2").row(621L, "f", "insert3").build().getMaterializedRows();
    }
    MaterializedResult insertData = MaterializedResult.resultBuilder(SESSION, BIGINT, createUnboundedVarcharType(), createUnboundedVarcharType()).row(111L, "a", "alter1").row(131L, "a", "add1").row(221L, "b", "add2").row(311L, "c", "alter2").row(321L, "c", "alter3").row(411L, "d", "add3").rows(extraRowsForInsertExisting).build();
    MaterializedResult afterData = MaterializedResult.resultBuilder(SESSION, BIGINT, createUnboundedVarcharType(), createUnboundedVarcharType()).row(120L, "a", "insert1").row(610L, "f", "insert2").row(620L, "f", "insert3").rows(insertData.getMaterializedRows()).build();
    for (TransactionDeleteInsertTestCase testCase : testCases) {
        SchemaTableName temporaryDeleteInsert = temporaryTable("delete_insert");
        try {
            createEmptyTable(temporaryDeleteInsert, storageFormat, ImmutableList.of(new Column("col1", HIVE_LONG, Optional.empty())), ImmutableList.of(new Column("pk1", HIVE_STRING, Optional.empty()), new Column("pk2", HIVE_STRING, Optional.empty())));
            insertData(temporaryDeleteInsert, beforeData);
            try {
                doTestTransactionDeleteInsert(storageFormat, temporaryDeleteInsert, domainToDrop, insertData, testCase.isExpectCommitedData() ? afterData : beforeData, testCase.getTag(), testCase.isExpectQuerySucceed(), testCase.getConflictTrigger());
            } catch (AssertionError e) {
                throw new AssertionError(format("Test case: %s", testCase), e);
            }
        } finally {
            dropTable(temporaryDeleteInsert);
        }
    }
}
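domainToDrop above is a multi-value varchar Domain with nulls excluded (nullAllowed = false). A standalone illustration of what such a Domain accepts; the two-value ValueSet is a trimmed-down stand-in for the six-value one in the test:

import io.airlift.slice.Slices;
import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.ValueSet;

import static io.trino.spi.type.VarcharType.createUnboundedVarcharType;

public class DropDomainSketch
{
    public static void main(String[] args)
    {
        Domain domainToDrop = Domain.create(
                ValueSet.of(createUnboundedVarcharType(), Slices.utf8Slice("alter1"), Slices.utf8Slice("drop1")),
                false);

        System.out.println(domainToDrop.includesNullableValue(Slices.utf8Slice("drop1")));   // true
        System.out.println(domainToDrop.includesNullableValue(Slices.utf8Slice("insert1"))); // false
        System.out.println(domainToDrop.includesNullableValue(null));                        // false, nulls excluded
    }
}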
Also used : HiveColumnHandle.createBaseColumn(io.trino.plugin.hive.HiveColumnHandle.createBaseColumn) Column(io.trino.plugin.hive.metastore.Column) ViewColumn(io.trino.spi.connector.ConnectorViewDefinition.ViewColumn) SortingColumn(io.trino.plugin.hive.metastore.SortingColumn) MaterializedResult(io.trino.testing.MaterializedResult) Domain(io.trino.spi.predicate.Domain) TupleDomain(io.trino.spi.predicate.TupleDomain) CatalogSchemaTableName(io.trino.spi.connector.CatalogSchemaTableName) SchemaTableName(io.trino.spi.connector.SchemaTableName) MaterializedRow(io.trino.testing.MaterializedRow)

Example 15 with Domain

Use of io.trino.spi.predicate.Domain in project trino by trinodb.

The class TestHiveMetadata, method testCreatePredicateWithNaN:

@Test
public void testCreatePredicateWithNaN() {
    HiveColumnHandle columnHandle = DOUBLE_COLUMN_HANDLE;
    ImmutableList.Builder<HivePartition> partitions = ImmutableList.builder();
    partitions.add(new HivePartition(new SchemaTableName("test", "test"), "p1", ImmutableMap.of(columnHandle, NullableValue.of(DOUBLE, Double.NaN))));
    partitions.add(new HivePartition(new SchemaTableName("test", "test"), "p2", ImmutableMap.of(columnHandle, NullableValue.of(DOUBLE, 4.2))));
    Domain domain = createPredicate(ImmutableList.of(columnHandle), partitions.build()).getDomains().orElseThrow().get(columnHandle);
    assertEquals(domain, Domain.notNull(DOUBLE));
}
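The assertion holds because NaN cannot be captured by any Range-based ValueSet, so the tightest Domain that still covers a partition keyed by NaN is notNull. A sketch of that behavior; the expectation that notNull(DOUBLE) admits NaN is my assumption, based on Trino ordering NaN as an ordinary (largest) double:

import io.trino.spi.predicate.Domain;

import static io.trino.spi.type.DoubleType.DOUBLE;

public class NaNDomainSketch
{
    public static void main(String[] args)
    {
        Domain domain = Domain.notNull(DOUBLE);

        System.out.println(domain.includesNullableValue(Double.NaN)); // expected: true
        System.out.println(domain.includesNullableValue(4.2));        // expected: true
        System.out.println(domain.includesNullableValue(null));       // false by construction
    }
}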
Also used : ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList) Domain(io.trino.spi.predicate.Domain) SchemaTableName(io.trino.spi.connector.SchemaTableName) Test(org.testng.annotations.Test)

Aggregations

Domain (io.trino.spi.predicate.Domain): 120
TupleDomain (io.trino.spi.predicate.TupleDomain): 107
ColumnHandle (io.trino.spi.connector.ColumnHandle): 51
Test (org.testng.annotations.Test): 45
Map (java.util.Map): 38
ImmutableList (com.google.common.collect.ImmutableList): 36
ImmutableMap (com.google.common.collect.ImmutableMap): 33
List (java.util.List): 27
Optional (java.util.Optional): 25
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList): 23
Type (io.trino.spi.type.Type): 23
ConnectorSession (io.trino.spi.connector.ConnectorSession): 21
SchemaTableName (io.trino.spi.connector.SchemaTableName): 21
Objects.requireNonNull (java.util.Objects.requireNonNull): 21
Set (java.util.Set): 20
Range (io.trino.spi.predicate.Range): 19
ValueSet (io.trino.spi.predicate.ValueSet): 18
Constraint (io.trino.spi.connector.Constraint): 17
String.format (java.lang.String.format): 17
ImmutableSet (com.google.common.collect.ImmutableSet): 16
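As the counts suggest, Domain almost always travels inside a TupleDomain. A closing sketch of the two composition operations that recur across these examples; the String keys and ranges are invented:

import com.google.common.collect.ImmutableMap;
import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.Range;
import io.trino.spi.predicate.TupleDomain;
import io.trino.spi.predicate.ValueSet;

import static io.trino.spi.type.BigintType.BIGINT;

public class ComposeSketch
{
    public static void main(String[] args)
    {
        TupleDomain<String> lower = TupleDomain.withColumnDomains(ImmutableMap.of(
                "x", Domain.create(ValueSet.ofRanges(Range.greaterThan(BIGINT, 0L)), false)));
        TupleDomain<String> upper = TupleDomain.withColumnDomains(ImmutableMap.of(
                "x", Domain.create(ValueSet.ofRanges(Range.lessThan(BIGINT, 10L)), false)));

        // intersect is exact: the AND of the conjuncts, i.e. 0 < x AND x < 10
        System.out.println(lower.intersect(upper));

        // columnWiseUnion ORs column by column, an over-approximation of the
        // true OR of the two predicates (sound for pruning, not for filtering)
        System.out.println(TupleDomain.columnWiseUnion(lower, upper));
    }
}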