Search in sources:

Example 1 with REGULAR

Use of io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR in project trino by trinodb.

From class OrcPageSourceFactory, method createOrcPageSource.

private ConnectorPageSource createOrcPageSource(HdfsEnvironment hdfsEnvironment, ConnectorIdentity identity, Configuration configuration, Path path, long start, long length, long estimatedFileSize, List<HiveColumnHandle> columns, List<HiveColumnHandle> projections, boolean useOrcColumnNames, boolean isFullAcid, TupleDomain<HiveColumnHandle> effectivePredicate, DateTimeZone legacyFileTimeZone, OrcReaderOptions options, Optional<AcidInfo> acidInfo, OptionalInt bucketNumber, boolean originalFile, AcidTransaction transaction, FileFormatDataSourceStats stats) {
    for (HiveColumnHandle column : columns) {
        checkArgument(column.getColumnType() == REGULAR, "column type must be regular: %s", column);
    }
    checkArgument(!effectivePredicate.isNone());
    OrcDataSource orcDataSource;
    boolean originalFilesPresent = acidInfo.isPresent() && !acidInfo.get().getOriginalFiles().isEmpty();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), estimatedFileSize, options, inputStream, stats);
    } catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
    }
    AggregatedMemoryContext memoryUsage = newSimpleAggregatedMemoryContext();
    try {
        Optional<OrcReader> optionalOrcReader = OrcReader.createOrcReader(orcDataSource, options);
        if (optionalOrcReader.isEmpty()) {
            return new EmptyPageSource();
        }
        OrcReader reader = optionalOrcReader.get();
        if (!originalFile && acidInfo.isPresent() && !acidInfo.get().isOrcAcidVersionValidated()) {
            validateOrcAcidVersion(path, reader);
        }
        List<OrcColumn> fileColumns = reader.getRootColumn().getNestedColumns();
        int actualColumnCount = columns.size() + (isFullAcid ? 3 : 0);
        List<OrcColumn> fileReadColumns = new ArrayList<>(actualColumnCount);
        List<Type> fileReadTypes = new ArrayList<>(actualColumnCount);
        List<OrcReader.ProjectedLayout> fileReadLayouts = new ArrayList<>(actualColumnCount);
        if (isFullAcid && !originalFilesPresent) {
            verifyAcidSchema(reader, path);
            Map<String, OrcColumn> acidColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
            fileColumns = ensureColumnNameConsistency(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ROW_STRUCT.toLowerCase(ENGLISH)).getNestedColumns(), columns);
            fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ORIGINAL_TRANSACTION.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadLayouts.add(fullyProjectedLayout());
            fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_BUCKET.toLowerCase(ENGLISH)));
            fileReadTypes.add(INTEGER);
            fileReadLayouts.add(fullyProjectedLayout());
            fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ROW_ID.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadLayouts.add(fullyProjectedLayout());
        }
        Map<String, OrcColumn> fileColumnsByName = ImmutableMap.of();
        if (useOrcColumnNames || isFullAcid) {
            verifyFileHasColumnNames(fileColumns, path);
            // Convert column names read from ORC files to lower case to be consistent with those stored in Hive Metastore
            fileColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
        }
        Map<String, List<List<String>>> projectionsByColumnName = ImmutableMap.of();
        Map<Integer, List<List<String>>> projectionsByColumnIndex = ImmutableMap.of();
        if (useOrcColumnNames || isFullAcid) {
            projectionsByColumnName = projections.stream().collect(Collectors.groupingBy(HiveColumnHandle::getBaseColumnName, mapping(OrcPageSourceFactory::getDereferencesAsList, toList())));
        } else {
            projectionsByColumnIndex = projections.stream().collect(Collectors.groupingBy(HiveColumnHandle::getBaseHiveColumnIndex, mapping(OrcPageSourceFactory::getDereferencesAsList, toList())));
        }
        TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder().setBloomFiltersEnabled(options.isBloomFiltersEnabled()).setDomainCompactionThreshold(domainCompactionThreshold);
        Map<HiveColumnHandle, Domain> effectivePredicateDomains = effectivePredicate.getDomains().orElseThrow(() -> new IllegalArgumentException("Effective predicate is none"));
        List<ColumnAdaptation> columnAdaptations = new ArrayList<>(columns.size());
        for (HiveColumnHandle column : columns) {
            OrcColumn orcColumn = null;
            OrcReader.ProjectedLayout projectedLayout = null;
            Map<Optional<HiveColumnProjectionInfo>, Domain> columnDomains = null;
            if (useOrcColumnNames || isFullAcid) {
                String columnName = column.getName().toLowerCase(ENGLISH);
                orcColumn = fileColumnsByName.get(columnName);
                if (orcColumn != null) {
                    projectedLayout = createProjectedLayout(orcColumn, projectionsByColumnName.get(columnName));
                    columnDomains = effectivePredicateDomains.entrySet().stream().filter(columnDomain -> columnDomain.getKey().getBaseColumnName().toLowerCase(ENGLISH).equals(columnName)).collect(toImmutableMap(columnDomain -> columnDomain.getKey().getHiveColumnProjectionInfo(), Map.Entry::getValue));
                }
            } else if (column.getBaseHiveColumnIndex() < fileColumns.size()) {
                orcColumn = fileColumns.get(column.getBaseHiveColumnIndex());
                if (orcColumn != null) {
                    projectedLayout = createProjectedLayout(orcColumn, projectionsByColumnIndex.get(column.getBaseHiveColumnIndex()));
                    columnDomains = effectivePredicateDomains.entrySet().stream().filter(columnDomain -> columnDomain.getKey().getBaseHiveColumnIndex() == column.getBaseHiveColumnIndex()).collect(toImmutableMap(columnDomain -> columnDomain.getKey().getHiveColumnProjectionInfo(), Map.Entry::getValue));
                }
            }
            Type readType = column.getType();
            if (orcColumn != null) {
                int sourceIndex = fileReadColumns.size();
                columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex));
                fileReadColumns.add(orcColumn);
                fileReadTypes.add(readType);
                fileReadLayouts.add(projectedLayout);
                // Add predicates on top-level and nested columns
                for (Map.Entry<Optional<HiveColumnProjectionInfo>, Domain> columnDomain : columnDomains.entrySet()) {
                    OrcColumn nestedColumn = getNestedColumn(orcColumn, columnDomain.getKey());
                    if (nestedColumn != null) {
                        predicateBuilder.addColumn(nestedColumn.getColumnId(), columnDomain.getValue());
                    }
                }
            } else {
                columnAdaptations.add(ColumnAdaptation.nullColumn(readType));
            }
        }
        OrcRecordReader recordReader = reader.createRecordReader(
                fileReadColumns,
                fileReadTypes,
                fileReadLayouts,
                predicateBuilder.build(),
                start,
                length,
                legacyFileTimeZone,
                memoryUsage,
                INITIAL_BATCH_SIZE,
                exception -> handleException(orcDataSource.getId(), exception),
                NameBasedFieldMapper::create);
        Optional<OrcDeletedRows> deletedRows = acidInfo.map(info -> new OrcDeletedRows(
                path.getName(),
                new OrcDeleteDeltaPageSourceFactory(options, identity, configuration, hdfsEnvironment, stats),
                identity,
                configuration,
                hdfsEnvironment,
                info,
                bucketNumber,
                memoryUsage));
        Optional<Long> originalFileRowId = acidInfo.filter(OrcPageSourceFactory::hasOriginalFiles)
                .map(info -> OriginalFilesUtils.getPrecedingRowCount(
                        acidInfo.get().getOriginalFiles(),
                        path,
                        hdfsEnvironment,
                        identity,
                        options,
                        configuration,
                        stats));
        if (transaction.isDelete()) {
            if (originalFile) {
                int bucket = bucketNumber.orElse(0);
                long startingRowId = originalFileRowId.orElse(0L);
                columnAdaptations.add(ColumnAdaptation.originalFileRowIdColumn(startingRowId, bucket));
            } else {
                columnAdaptations.add(ColumnAdaptation.rowIdColumn());
            }
        } else if (transaction.isUpdate()) {
            HiveUpdateProcessor updateProcessor = transaction.getUpdateProcessor().orElseThrow(() -> new IllegalArgumentException("updateProcessor not present"));
            List<HiveColumnHandle> dependencyColumns = projections.stream().filter(HiveColumnHandle::isBaseColumn).collect(toImmutableList());
            if (originalFile) {
                int bucket = bucketNumber.orElse(0);
                long startingRowId = originalFileRowId.orElse(0L);
                columnAdaptations.add(updatedRowColumnsWithOriginalFiles(startingRowId, bucket, updateProcessor, dependencyColumns));
            } else {
                columnAdaptations.add(updatedRowColumns(updateProcessor, dependencyColumns));
            }
        }
        return new OrcPageSource(recordReader, columnAdaptations, orcDataSource, deletedRows, originalFileRowId, memoryUsage, stats);
    } catch (Exception e) {
        try {
            orcDataSource.close();
        } catch (IOException ignored) {
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        String message = splitError(e, path, start, length);
        if (e instanceof BlockMissingException) {
            throw new TrinoException(HIVE_MISSING_DATA, message, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
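
For reference, below is a minimal, self-contained sketch of the invariant this factory enforces on its columns argument: every handle passed in must be a REGULAR (data) column. It reuses only calls that appear elsewhere on this page (createBaseColumn, getColumnType, checkArgument); the class name, column names, and types are hypothetical.

import com.google.common.collect.ImmutableList;
import io.trino.plugin.hive.HiveColumnHandle;
import java.util.List;
import java.util.Optional;

import static com.google.common.base.Preconditions.checkArgument;
import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR;
import static io.trino.plugin.hive.HiveColumnHandle.createBaseColumn;
import static io.trino.plugin.hive.HiveType.HIVE_INT;
import static io.trino.spi.type.IntegerType.INTEGER;

public class RegularColumnCheckSketch
{
    public static void main(String[] args)
    {
        // Two REGULAR (data) column handles, built the same way as in the tests further down this page.
        List<HiveColumnHandle> columns = ImmutableList.of(
                createBaseColumn("a", 0, HIVE_INT, INTEGER, REGULAR, Optional.empty()),
                createBaseColumn("b", 1, HIVE_INT, INTEGER, REGULAR, Optional.empty()));

        // Mirrors the guard at the top of createOrcPageSource: every column handed to the
        // page source factory must be a REGULAR column.
        for (HiveColumnHandle column : columns) {
            checkArgument(column.getColumnType() == REGULAR, "column type must be regular: %s", column);
        }
        System.out.println("all " + columns.size() + " columns are REGULAR");
    }
}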

Example 2 with REGULAR

Use of io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR in project trino by trinodb.

From class ParquetPageSourceFactory, method createPageSource.

/**
 * This method is available for other callers to use directly.
 */
public static ReaderPageSource createPageSource(Path path, long start, long length, long estimatedFileSize, List<HiveColumnHandle> columns, TupleDomain<HiveColumnHandle> effectivePredicate, boolean useColumnNames, HdfsEnvironment hdfsEnvironment, Configuration configuration, ConnectorIdentity identity, DateTimeZone timeZone, FileFormatDataSourceStats stats, ParquetReaderOptions options) {
    // Ignore predicates on partial columns for now.
    effectivePredicate = effectivePredicate.filter((column, domain) -> column.isBaseColumn());
    MessageType fileSchema;
    MessageType requestedSchema;
    MessageColumnIO messageColumn;
    ParquetReader parquetReader;
    ParquetDataSource dataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        dataSource = new HdfsParquetDataSource(new ParquetDataSourceId(path.toString()), estimatedFileSize, inputStream, stats, options);
        ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource);
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        fileSchema = fileMetaData.getSchema();
        Optional<MessageType> message = projectSufficientColumns(columns)
                .map(projection -> projection.get().stream()
                        .map(HiveColumnHandle.class::cast)
                        .collect(toUnmodifiableList()))
                .orElse(columns)
                .stream()
                .filter(column -> column.getColumnType() == REGULAR)
                .map(column -> getColumnType(column, fileSchema, useColumnNames))
                .filter(Optional::isPresent)
                .map(Optional::get)
                .map(type -> new MessageType(fileSchema.getName(), type))
                .reduce(MessageType::union);
        requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));
        messageColumn = getColumnIO(fileSchema, requestedSchema);
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = options.isIgnoreStatistics() ? TupleDomain.all() : getParquetTupleDomain(descriptorsByPath, effectivePredicate, fileSchema, useColumnNames);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath, timeZone);
        long nextStart = 0;
        ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
        ImmutableList.Builder<Long> blockStarts = ImmutableList.builder();
        ImmutableList.Builder<Optional<ColumnIndexStore>> columnIndexes = ImmutableList.builder();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            Optional<ColumnIndexStore> columnIndex = getColumnIndexStore(dataSource, block, descriptorsByPath, parquetTupleDomain, options);
            if (start <= firstDataPage && firstDataPage < start + length && predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain, columnIndex)) {
                blocks.add(block);
                blockStarts.add(nextStart);
                columnIndexes.add(columnIndex);
            }
            nextStart += block.getRowCount();
        }
        parquetReader = new ParquetReader(Optional.ofNullable(fileMetaData.getCreatedBy()), messageColumn, blocks.build(), Optional.of(blockStarts.build()), dataSource, timeZone, newSimpleAggregatedMemoryContext(), options, parquetPredicate, columnIndexes.build());
    } catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        } catch (IOException ignored) {
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        if (e instanceof ParquetCorruptionException) {
            throw new TrinoException(HIVE_BAD_DATA, e);
        }
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof BlockMissingException) {
            throw new TrinoException(HIVE_MISSING_DATA, message, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
    Optional<ReaderColumns> readerProjections = projectBaseColumns(columns);
    List<HiveColumnHandle> baseColumns = readerProjections.map(projection -> projection.get().stream().map(HiveColumnHandle.class::cast).collect(toUnmodifiableList())).orElse(columns);
    for (HiveColumnHandle column : baseColumns) {
        checkArgument(column == PARQUET_ROW_INDEX_COLUMN || column.getColumnType() == REGULAR, "column type must be REGULAR: %s", column);
    }
    ImmutableList.Builder<Type> trinoTypes = ImmutableList.builder();
    ImmutableList.Builder<Optional<Field>> internalFields = ImmutableList.builder();
    ImmutableList.Builder<Boolean> rowIndexColumns = ImmutableList.builder();
    for (HiveColumnHandle column : baseColumns) {
        trinoTypes.add(column.getBaseType());
        rowIndexColumns.add(column == PARQUET_ROW_INDEX_COLUMN);
        if (column == PARQUET_ROW_INDEX_COLUMN) {
            internalFields.add(Optional.empty());
        } else {
            internalFields.add(Optional.ofNullable(getParquetType(column, fileSchema, useColumnNames)).flatMap(field -> {
                String columnName = useColumnNames ? column.getBaseColumnName() : fileSchema.getFields().get(column.getBaseHiveColumnIndex()).getName();
                return constructField(column.getBaseType(), lookupColumnByName(messageColumn, columnName));
            }));
        }
    }
    ConnectorPageSource parquetPageSource = new ParquetPageSource(parquetReader, trinoTypes.build(), rowIndexColumns.build(), internalFields.build());
    return new ReaderPageSource(parquetPageSource, readerProjections);
}
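
The first statement above drops predicates that refer to projected (non-base) columns before the predicate is used to prune row groups. Below is a minimal sketch of that pruning step in isolation; the class name and the sample predicate are hypothetical, and TupleDomain.withColumnDomains is assumed to be the standard Trino SPI factory (it does not appear in the excerpt above).

import io.trino.plugin.hive.HiveColumnHandle;
import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.TupleDomain;
import java.util.Map;
import java.util.Optional;

import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR;
import static io.trino.plugin.hive.HiveColumnHandle.createBaseColumn;
import static io.trino.plugin.hive.HiveType.HIVE_INT;
import static io.trino.spi.type.IntegerType.INTEGER;

public class PredicatePruningSketch
{
    public static void main(String[] args)
    {
        // A REGULAR base column and a single-value predicate on it (both hypothetical).
        HiveColumnHandle a = createBaseColumn("a", 0, HIVE_INT, INTEGER, REGULAR, Optional.empty());
        TupleDomain<HiveColumnHandle> effectivePredicate =
                TupleDomain.withColumnDomains(Map.of(a, Domain.singleValue(INTEGER, 5L)));

        // Same call as at the top of createPageSource: keep only predicates on base columns,
        // dropping any that refer to projected (dereferenced) sub-fields.
        TupleDomain<HiveColumnHandle> pruned =
                effectivePredicate.filter((column, domain) -> column.isBaseColumn());

        System.out.println(pruned.getDomains());
    }
}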

Example 3 with REGULAR

Use of io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR in project trino by trinodb.

From class TestConnectorPushdownRulesWithHive, method testPredicatePushdown.

@Test
public void testPredicatePushdown() {
    String tableName = "predicate_test";
    tester().getQueryRunner().execute(format("CREATE TABLE %s (a, b) AS SELECT 5, 6", tableName));
    PushPredicateIntoTableScan pushPredicateIntoTableScan = new PushPredicateIntoTableScan(tester().getPlannerContext(), tester().getTypeAnalyzer());
    HiveTableHandle hiveTable = new HiveTableHandle(SCHEMA_NAME, tableName, ImmutableMap.of(), ImmutableList.of(), ImmutableList.of(), Optional.empty());
    TableHandle table = new TableHandle(new CatalogName(HIVE_CATALOG_NAME), hiveTable, new HiveTransactionHandle(false));
    HiveColumnHandle column = createBaseColumn("a", 0, HIVE_INT, INTEGER, REGULAR, Optional.empty());
    tester().assertThat(pushPredicateIntoTableScan)
            .on(p -> p.filter(
                    PlanBuilder.expression("a = 5"),
                    p.tableScan(
                            table,
                            ImmutableList.of(p.symbol("a", INTEGER)),
                            ImmutableMap.of(p.symbol("a", INTEGER), column))))
            .matches(filter(
                    "a = 5",
                    tableScan(
                            tableHandle -> ((HiveTableHandle) tableHandle).getCompactEffectivePredicate().getDomains().get()
                                    .equals(ImmutableMap.of(column, Domain.singleValue(INTEGER, 5L))),
                            TupleDomain.all(),
                            ImmutableMap.of("a", column::equals))));
    metastore.dropTable(SCHEMA_NAME, tableName, true);
}

Example 4 with REGULAR

Use of io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR in project trino by trinodb.

From class TestConnectorPushdownRulesWithHive, method testColumnPruningProjectionPushdown.

@Test
public void testColumnPruningProjectionPushdown() {
    String tableName = "column_pruning_projection_test";
    tester().getQueryRunner().execute(format("CREATE TABLE %s (a, b) AS SELECT 5, 6", tableName));
    PruneTableScanColumns pruneTableScanColumns = new PruneTableScanColumns(tester().getMetadata());
    HiveTableHandle hiveTable = new HiveTableHandle(SCHEMA_NAME, tableName, ImmutableMap.of(), ImmutableList.of(), ImmutableList.of(), Optional.empty());
    TableHandle table = new TableHandle(new CatalogName(HIVE_CATALOG_NAME), hiveTable, new HiveTransactionHandle(false));
    HiveColumnHandle columnA = createBaseColumn("a", 0, HIVE_INT, INTEGER, REGULAR, Optional.empty());
    HiveColumnHandle columnB = createBaseColumn("b", 1, HIVE_INT, INTEGER, REGULAR, Optional.empty());
    tester().assertThat(pruneTableScanColumns).on(p -> {
        Symbol symbolA = p.symbol("a", INTEGER);
        Symbol symbolB = p.symbol("b", INTEGER);
        return p.project(Assignments.of(p.symbol("x"), symbolA.toSymbolReference()), p.tableScan(table, ImmutableList.of(symbolA, symbolB), ImmutableMap.of(symbolA, columnA, symbolB, columnB)));
    }).matches(strictProject(
            ImmutableMap.of("expr", expression("COLA")),
            tableScan(
                    hiveTable.withProjectedColumns(ImmutableSet.of(columnA))::equals,
                    TupleDomain.all(),
                    ImmutableMap.of("COLA", columnA::equals))));
    metastore.dropTable(SCHEMA_NAME, tableName, true);
}

Example 5 with REGULAR

Use of io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR in project trino by trinodb.

From class HiveUtil, method createRecordReader.

public static RecordReader<?, ?> createRecordReader(Configuration configuration, Path path, long start, long length, Properties schema, List<HiveColumnHandle> columns) {
    // determine which hive columns we will read
    List<HiveColumnHandle> readColumns = columns.stream()
            .filter(column -> column.getColumnType() == REGULAR)
            .collect(toImmutableList());
    // Projected columns are not supported here
    readColumns.forEach(readColumn -> checkArgument(readColumn.isBaseColumn(), "column %s is not a base column", readColumn.getName()));
    List<Integer> readHiveColumnIndexes = readColumns.stream().map(HiveColumnHandle::getBaseHiveColumnIndex).collect(toImmutableList());
    // Tell hive the columns we would like to read, this lets hive optimize reading column oriented files
    configuration = copy(configuration);
    setReadColumns(configuration, readHiveColumnIndexes);
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, true);
    JobConf jobConf = toJobConf(configuration);
    FileSplit fileSplit = new FileSplit(path, start, length, (String[]) null);
    // propagate serialization configuration to getRecordReader
    schema.stringPropertyNames().stream().filter(name -> name.startsWith("serialization.")).forEach(name -> jobConf.set(name, schema.getProperty(name)));
    configureCompressionCodecs(jobConf);
    try {
        // raw type on WritableComparable can't be fixed because Utilities#skipHeader takes them raw
        @SuppressWarnings({ "rawtypes", "unchecked" }) RecordReader<WritableComparable, Writable> recordReader = (RecordReader<WritableComparable, Writable>) inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL);
        int headerCount = getHeaderCount(schema);
        // Only skip header rows when the split is at the beginning of the file
        if (start == 0 && headerCount > 0) {
            Utilities.skipHeader(recordReader, headerCount, recordReader.createKey(), recordReader.createValue());
        }
        int footerCount = getFooterCount(schema);
        if (footerCount > 0) {
            recordReader = new FooterAwareRecordReader<>(recordReader, footerCount, jobConf);
        }
        return recordReader;
    } catch (IOException e) {
        if (e instanceof TextLineLengthLimitExceededException) {
            throw new TrinoException(HIVE_BAD_DATA, "Line too long in text file: " + path, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s", path, start, length, getInputFormatName(schema), firstNonNull(e.getMessage(), e.getClass().getName())), e);
    }
}
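
A minimal sketch of the column-selection step at the top of createRecordReader: only REGULAR (data) columns are read from the file, so partition-key handles are filtered out and only the base Hive column indexes of the remaining columns are collected. Everything except the class name, the column names, and the hypothetical partition-key handle appears elsewhere on this page.

import com.google.common.collect.ImmutableList;
import io.trino.plugin.hive.HiveColumnHandle;
import java.util.List;
import java.util.Optional;

import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.PARTITION_KEY;
import static io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR;
import static io.trino.plugin.hive.HiveColumnHandle.createBaseColumn;
import static io.trino.plugin.hive.HiveType.HIVE_INT;
import static io.trino.spi.type.IntegerType.INTEGER;

public class ReadColumnSelectionSketch
{
    public static void main(String[] args)
    {
        // One partition column and two data columns (all hypothetical).
        List<HiveColumnHandle> columns = ImmutableList.of(
                createBaseColumn("ds", 2, HIVE_INT, INTEGER, PARTITION_KEY, Optional.empty()),
                createBaseColumn("a", 0, HIVE_INT, INTEGER, REGULAR, Optional.empty()),
                createBaseColumn("b", 1, HIVE_INT, INTEGER, REGULAR, Optional.empty()));

        // Same selection as createRecordReader: keep REGULAR columns only and collect the
        // base Hive column indexes that setReadColumns() would pass down to the input format.
        List<Integer> readHiveColumnIndexes = columns.stream()
                .filter(column -> column.getColumnType() == REGULAR)
                .map(HiveColumnHandle::getBaseHiveColumnIndex)
                .collect(toImmutableList());

        System.out.println(readHiveColumnIndexes); // [0, 1]
    }
}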

Aggregations

- ImmutableList (com.google.common.collect.ImmutableList): 7
- HiveColumnHandle (io.trino.plugin.hive.HiveColumnHandle): 7
- REGULAR (io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR): 7
- BIGINT (io.trino.spi.type.BigintType.BIGINT): 7
- Type (io.trino.spi.type.Type): 7
- IOException (java.io.IOException): 7
- String.format (java.lang.String.format): 7
- Optional (java.util.Optional): 7
- ImmutableMap (com.google.common.collect.ImmutableMap): 6
- ImmutableSet (com.google.common.collect.ImmutableSet): 6
- HdfsEnvironment (io.trino.plugin.hive.HdfsEnvironment): 6
- Domain (io.trino.spi.predicate.Domain): 6
- TupleDomain (io.trino.spi.predicate.TupleDomain): 6
- INTEGER (io.trino.spi.type.IntegerType.INTEGER): 6
- HiveColumnHandle.createBaseColumn (io.trino.plugin.hive.HiveColumnHandle.createBaseColumn): 5
- HiveColumnProjectionInfo (io.trino.plugin.hive.HiveColumnProjectionInfo): 5
- Files (com.google.common.io.Files): 4
- MoreFiles.deleteRecursively (com.google.common.io.MoreFiles.deleteRecursively): 4
- ALLOW_INSECURE (com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE): 4
- Session (io.trino.Session): 4