Search in sources :

Example 1 with BucketValidator

use of io.trino.plugin.hive.HivePageSource.BucketValidator in project trino by trinodb.

In class HivePageSourceProvider, the method createBucketValidator:

/**
 * Builds a {@link BucketValidator} that checks rows in the file actually hash to the
 * expected bucket number.
 *
 * @param path file path, carried by the validator (presumably for error reporting — confirm in BucketValidator)
 * @param bucketValidation validation spec from the split; empty disables validation entirely
 * @param bucketNumber bucket this file is expected to contain; must be present whenever
 *        {@code bucketValidation} is present (enforced via {@code orElseThrow})
 * @param columnMappings column mappings for the page source being assembled
 * @return a validator, or empty when validation is disabled or a bucket column is not read
 */
private static Optional<BucketValidator> createBucketValidator(Path path, Optional<BucketValidation> bucketValidation, OptionalInt bucketNumber, List<ColumnMapping> columnMappings) {
    return bucketValidation.flatMap(validation -> {
        // Index the base columns read by this query, keyed by their Hive column index
        Map<Integer, ColumnMapping> baseColumnsByHiveIndex = columnMappings.stream()
                .filter(candidate -> candidate.getHiveColumnHandle().isBaseColumn())
                .collect(toImmutableMap(candidate -> candidate.getHiveColumnHandle().getBaseHiveColumnIndex(), identity()));

        List<HiveColumnHandle> bucketColumns = validation.getBucketColumns();
        int[] blockIndices = new int[bucketColumns.size()];
        List<TypeInfo> columnTypeInfos = new ArrayList<>();
        for (int position = 0; position < bucketColumns.size(); position++) {
            HiveColumnHandle bucketColumn = bucketColumns.get(position);
            ColumnMapping mapping = baseColumnsByHiveIndex.get(bucketColumn.getBaseHiveColumnIndex());
            if (mapping == null) {
                // This bucket column is not read by the query, so the values needed to
                // recompute the bucket hash are unavailable; skip validation for this split.
                return Optional.empty();
            }
            blockIndices[position] = mapping.getIndex();
            columnTypeInfos.add(mapping.getHiveColumnHandle().getHiveType().getTypeInfo());
        }

        return Optional.of(new BucketValidator(
                path,
                blockIndices,
                columnTypeInfos,
                validation.getBucketingVersion(),
                validation.getBucketCount(),
                bucketNumber.orElseThrow()));
    });
}
Also used : PARTITION_KEY(io.trino.plugin.hive.HiveColumnHandle.ColumnType.PARTITION_KEY) OrcFileWriterFactory(io.trino.plugin.hive.orc.OrcFileWriterFactory) GENERIC_INSUFFICIENT_RESOURCES(io.trino.spi.StandardErrorCode.GENERIC_INSUFFICIENT_RESOURCES) Maps.uniqueIndex(com.google.common.collect.Maps.uniqueIndex) HiveBucketFilter(io.trino.plugin.hive.util.HiveBucketing.HiveBucketFilter) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) PREFILLED(io.trino.plugin.hive.HivePageSourceProvider.ColumnMappingKind.PREFILLED) ConnectorTableHandle(io.trino.spi.connector.ConnectorTableHandle) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) ORIGINAL_FILE_PATH_MATCHER(io.trino.plugin.hive.HiveUpdatablePageSource.ORIGINAL_FILE_PATH_MATCHER) Path(org.apache.hadoop.fs.Path) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) BucketValidation(io.trino.plugin.hive.HiveSplit.BucketValidation) BiMap(com.google.common.collect.BiMap) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) ImmutableSet(com.google.common.collect.ImmutableSet) Domain(io.trino.spi.predicate.Domain) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ACID_ROW_STRUCT_COLUMN_ID(io.trino.plugin.hive.HiveUpdatablePageSource.ACID_ROW_STRUCT_COLUMN_ID) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) String.format(java.lang.String.format) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) ReaderRecordCursorWithProjections(io.trino.plugin.hive.HiveRecordCursorProvider.ReaderRecordCursorWithProjections) BucketingVersion(io.trino.plugin.hive.util.HiveBucketing.BucketingVersion) DynamicFilter(io.trino.spi.connector.DynamicFilter) Function.identity(java.util.function.Function.identity) 
Optional(java.util.Optional) NullableValue(io.trino.spi.predicate.NullableValue) Type(io.trino.spi.type.Type) HashMap(java.util.HashMap) OrcTypeToHiveTypeTranslator.fromOrcTypeToHiveType(io.trino.plugin.hive.orc.OrcTypeToHiveTypeTranslator.fromOrcTypeToHiveType) HiveBucketing.getHiveBucketFilter(io.trino.plugin.hive.util.HiveBucketing.getHiveBucketFilter) OptionalInt(java.util.OptionalInt) ArrayList(java.util.ArrayList) ImmutableBiMap(com.google.common.collect.ImmutableBiMap) Inject(javax.inject.Inject) HashSet(java.util.HashSet) ImmutableList(com.google.common.collect.ImmutableList) OrcPageSource(io.trino.plugin.hive.orc.OrcPageSource) ColumnMapping.toColumnHandles(io.trino.plugin.hive.HivePageSourceProvider.ColumnMapping.toColumnHandles) RecordPageSource(io.trino.spi.connector.RecordPageSource) Objects.requireNonNull(java.util.Objects.requireNonNull) HiveUtil.getPrefilledColumnValue(io.trino.plugin.hive.util.HiveUtil.getPrefilledColumnValue) ColumnHandle(io.trino.spi.connector.ColumnHandle) BucketConversion(io.trino.plugin.hive.HiveSplit.BucketConversion) OrcType(io.trino.orc.metadata.OrcType) RecordCursor(io.trino.spi.connector.RecordCursor) Properties(java.util.Properties) ConnectorSplit(io.trino.spi.connector.ConnectorSplit) ConnectorPageSourceProvider(io.trino.spi.connector.ConnectorPageSourceProvider) ColumnMetadata(io.trino.orc.metadata.ColumnMetadata) ConnectorSession(io.trino.spi.connector.ConnectorSession) TupleDomain(io.trino.spi.predicate.TupleDomain) HiveColumnHandle.isRowIdColumnHandle(io.trino.plugin.hive.HiveColumnHandle.isRowIdColumnHandle) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) SYNTHESIZED(io.trino.plugin.hive.HiveColumnHandle.ColumnType.SYNTHESIZED) Collectors.toList(java.util.stream.Collectors.toList) VisibleForTesting(com.google.common.annotations.VisibleForTesting) EmptyPageSource(io.trino.spi.connector.EmptyPageSource) BucketValidator(io.trino.plugin.hive.HivePageSource.BucketValidator) 
TypeManager(io.trino.spi.type.TypeManager) OrcColumnId(io.trino.orc.metadata.OrcColumnId) ConnectorTransactionHandle(io.trino.spi.connector.ConnectorTransactionHandle) REGULAR(io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR) BucketValidator(io.trino.plugin.hive.HivePageSource.BucketValidator) ArrayList(java.util.ArrayList) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo)

Example 2 with BucketValidator

use of io.trino.plugin.hive.HivePageSource.BucketValidator in project trino by trinodb.

In class HivePageSourceProvider, the method createHivePageSource:

/**
 * Creates a page source for a Hive split. Page source factories are tried first (preferred,
 * columnar path); if none accepts the split, record cursor providers are tried as a
 * row-oriented fallback. Returns empty if no reader can handle the split.
 *
 * @param effectivePredicate a statically-unsatisfiable predicate short-circuits to an empty source
 * @param bucketConversion present when the file's bucketing differs from the table's and rows must be re-bucketed
 * @param bucketValidation present when rows should be verified against the expected bucket number
 * @return a page source wrapping the first reader that accepts the split, or empty if none does
 */
public static Optional<ConnectorPageSource> createHivePageSource(Set<HivePageSourceFactory> pageSourceFactories, Set<HiveRecordCursorProvider> cursorProviders, Configuration configuration, ConnectorSession session, Path path, OptionalInt bucketNumber, long start, long length, long estimatedFileSize, Properties schema, TupleDomain<HiveColumnHandle> effectivePredicate, List<HiveColumnHandle> columns, TypeManager typeManager, Optional<BucketConversion> bucketConversion, Optional<BucketValidation> bucketValidation, boolean s3SelectPushdownEnabled, Optional<AcidInfo> acidInfo, boolean originalFile, AcidTransaction transaction, List<ColumnMapping> columnMappings) {
    // No row can satisfy the predicate, so skip reading the file entirely.
    if (effectivePredicate.isNone()) {
        return Optional.of(new EmptyPageSource());
    }
    List<ColumnMapping> regularAndInterimColumnMappings = ColumnMapping.extractRegularAndInterimColumnMappings(columnMappings);
    Optional<BucketAdaptation> bucketAdaptation = createBucketAdaptation(bucketConversion, bucketNumber, regularAndInterimColumnMappings);
    Optional<BucketValidator> bucketValidator = createBucketValidator(path, bucketValidation, bucketNumber, regularAndInterimColumnMappings);
    // Preferred path: the first factory that returns a page source wins.
    for (HivePageSourceFactory pageSourceFactory : pageSourceFactories) {
        List<HiveColumnHandle> desiredColumns = toColumnHandles(regularAndInterimColumnMappings, true, typeManager);
        Optional<ReaderPageSource> readerWithProjections = pageSourceFactory.createPageSource(configuration, session, path, start, length, estimatedFileSize, schema, desiredColumns, effectivePredicate, acidInfo, bucketNumber, originalFile, transaction);
        if (readerWithProjections.isPresent()) {
            ConnectorPageSource pageSource = readerWithProjections.get().get();
            Optional<ReaderColumns> readerProjections = readerWithProjections.get().getReaderColumns();
            Optional<ReaderProjectionsAdapter> adapter = Optional.empty();
            // If the reader produced a different column layout than requested, adapt it back.
            if (readerProjections.isPresent()) {
                adapter = Optional.of(hiveProjectionsAdapter(desiredColumns, readerProjections.get()));
            }
            return Optional.of(new HivePageSource(columnMappings, bucketAdaptation, bucketValidator, adapter, typeManager, pageSource));
        }
    }
    // Fallback path: row-oriented record cursors, with wrapping applied in a fixed order:
    // projections adapter -> bucket adaptation -> coercion -> bucket validation.
    for (HiveRecordCursorProvider provider : cursorProviders) {
        // GenericHiveRecordCursor will automatically do the coercion without HiveCoercionRecordCursor
        boolean doCoercion = !(provider instanceof GenericHiveRecordCursorProvider);
        List<HiveColumnHandle> desiredColumns = toColumnHandles(regularAndInterimColumnMappings, doCoercion, typeManager);
        Optional<ReaderRecordCursorWithProjections> readerWithProjections = provider.createRecordCursor(configuration, session, path, start, length, estimatedFileSize, schema, desiredColumns, effectivePredicate, typeManager, s3SelectPushdownEnabled);
        if (readerWithProjections.isPresent()) {
            RecordCursor delegate = readerWithProjections.get().getRecordCursor();
            Optional<ReaderColumns> projections = readerWithProjections.get().getProjectedReaderColumns();
            if (projections.isPresent()) {
                ReaderProjectionsAdapter projectionsAdapter = hiveProjectionsAdapter(desiredColumns, projections.get());
                delegate = new HiveReaderProjectionsAdaptingRecordCursor(delegate, projectionsAdapter);
            }
            // The cursor fallback cannot read ACID files; fail fast rather than return wrong data.
            checkArgument(acidInfo.isEmpty(), "Acid is not supported");
            if (bucketAdaptation.isPresent()) {
                delegate = new HiveBucketAdapterRecordCursor(bucketAdaptation.get().getBucketColumnIndices(), bucketAdaptation.get().getBucketColumnHiveTypes(), bucketAdaptation.get().getBucketingVersion(), bucketAdaptation.get().getTableBucketCount(), bucketAdaptation.get().getPartitionBucketCount(), bucketAdaptation.get().getBucketToKeep(), typeManager, delegate);
            }
            // Need to wrap RcText and RcBinary into a wrapper, which will do the coercion for mismatch columns
            if (doCoercion) {
                delegate = new HiveCoercionRecordCursor(regularAndInterimColumnMappings, typeManager, delegate);
            }
            // bucket adaptation already validates that data is in the right bucket
            if (bucketAdaptation.isEmpty() && bucketValidator.isPresent()) {
                delegate = bucketValidator.get().wrapRecordCursor(delegate, typeManager);
            }
            HiveRecordCursor hiveRecordCursor = new HiveRecordCursor(columnMappings, delegate);
            List<Type> columnTypes = columns.stream().map(HiveColumnHandle::getType).collect(toList());
            return Optional.of(new RecordPageSource(columnTypes, hiveRecordCursor));
        }
    }
    // No factory or cursor provider accepted this split.
    return Optional.empty();
}
Also used : BucketValidator(io.trino.plugin.hive.HivePageSource.BucketValidator) RecordCursor(io.trino.spi.connector.RecordCursor) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) RecordPageSource(io.trino.spi.connector.RecordPageSource) EmptyPageSource(io.trino.spi.connector.EmptyPageSource) Type(io.trino.spi.type.Type) OrcTypeToHiveTypeTranslator.fromOrcTypeToHiveType(io.trino.plugin.hive.orc.OrcTypeToHiveTypeTranslator.fromOrcTypeToHiveType) OrcType(io.trino.orc.metadata.OrcType) ReaderRecordCursorWithProjections(io.trino.plugin.hive.HiveRecordCursorProvider.ReaderRecordCursorWithProjections)

Aggregations

OrcType (io.trino.orc.metadata.OrcType)2 BucketValidator (io.trino.plugin.hive.HivePageSource.BucketValidator)2 ReaderRecordCursorWithProjections (io.trino.plugin.hive.HiveRecordCursorProvider.ReaderRecordCursorWithProjections)2 OrcTypeToHiveTypeTranslator.fromOrcTypeToHiveType (io.trino.plugin.hive.orc.OrcTypeToHiveTypeTranslator.fromOrcTypeToHiveType)2 ConnectorPageSource (io.trino.spi.connector.ConnectorPageSource)2 EmptyPageSource (io.trino.spi.connector.EmptyPageSource)2 RecordCursor (io.trino.spi.connector.RecordCursor)2 RecordPageSource (io.trino.spi.connector.RecordPageSource)2 Type (io.trino.spi.type.Type)2 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)1 Preconditions.checkState (com.google.common.base.Preconditions.checkState)1 BiMap (com.google.common.collect.BiMap)1 ImmutableBiMap (com.google.common.collect.ImmutableBiMap)1 ImmutableList (com.google.common.collect.ImmutableList)1 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)1 ImmutableMap.toImmutableMap (com.google.common.collect.ImmutableMap.toImmutableMap)1 ImmutableSet (com.google.common.collect.ImmutableSet)1 Maps.uniqueIndex (com.google.common.collect.Maps.uniqueIndex)1 ColumnMetadata (io.trino.orc.metadata.ColumnMetadata)1