Search in sources :

Example 1 with HiveBucketFilter

use of com.facebook.presto.hive.HiveBucketing.HiveBucketFilter in project presto by prestodb.

From the class HiveSplitManager, method getSplits.

/**
 * Creates the split source for a Hive table layout.
 *
 * <p>The table is looked up through the metastore of the given transaction. Unless offline data
 * debug mode is enabled for the session, a table whose parameters carry a non-empty
 * {@code OBJECT_NOT_READABLE} value is rejected. The bucket handle of the layout is validated
 * against the requested scheduling strategy, the partitions are sorted by partition id in
 * reverse natural order, and a {@link BackgroundHiveSplitLoader} is started against the split
 * source that is returned.
 *
 * @param transaction handle used to look up the transactional metadata
 * @param session session supplying identity, query id and session properties
 * @param layoutHandle table layout; cast to {@link HiveTableLayoutHandle}
 * @param splitSchedulingContext supplies the scheduling strategy and the warning collector
 * @return an empty {@link FixedSplitSource} when the layout has no partitions, otherwise a
 *         {@link HiveSplitSource} matching the scheduling strategy
 * @throws PrestoException if the transaction is unknown, the layout carries no partitions,
 *         grouped scheduling is requested without a bucket handle, or the read bucket count
 *         exceeds the table bucket count
 * @throws TableNotFoundException if the metastore does not return the table
 * @throws HiveNotReadableException if the table is marked as not readable
 * @throws IllegalArgumentException if the scheduling strategy is not one of the handled values
 */
@Override
public ConnectorSplitSource getSplits(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorTableLayoutHandle layoutHandle, SplitSchedulingContext splitSchedulingContext) {
    HiveTableLayoutHandle layout = (HiveTableLayoutHandle) layoutHandle;
    SchemaTableName tableName = layout.getSchemaTableName();
    // get table metadata
    TransactionalMetadata metadata = hiveTransactionManager.get(transaction);
    if (metadata == null) {
        throw new PrestoException(HIVE_TRANSACTION_NOT_FOUND, format("Transaction not found: %s", transaction));
    }
    SemiTransactionalHiveMetastore metastore = metadata.getMetastore();
    Table table = metastore.getTable(new MetastoreContext(session.getIdentity(), session.getQueryId(), session.getClientInfo(), session.getSource(), getMetastoreHeaders(session), isUserDefinedTypeEncodingEnabled(session), metastore.getColumnConverterProvider()), tableName.getSchemaName(), tableName.getTableName()).orElseThrow(() -> new TableNotFoundException(tableName));
    if (!isOfflineDataDebugModeEnabled(session)) {
        // verify table is not marked as non-readable
        String tableNotReadable = table.getParameters().get(OBJECT_NOT_READABLE);
        if (!isNullOrEmpty(tableNotReadable)) {
            throw new HiveNotReadableException(tableName, Optional.empty(), tableNotReadable);
        }
    }
    // get partitions
    List<HivePartition> partitions = layout.getPartitions().orElseThrow(() -> new PrestoException(GENERIC_INTERNAL_ERROR, "Layout does not contain partitions"));
    // short circuit if we don't have any partitions
    if (Iterables.getFirst(partitions, null) == null) {
        return new FixedSplitSource(ImmutableList.of());
    }
    Optional<HiveBucketFilter> bucketFilter = layout.getBucketFilter();
    // validate bucketed execution
    Optional<HiveBucketHandle> bucketHandle = layout.getBucketHandle();
    if ((splitSchedulingContext.getSplitSchedulingStrategy() == GROUPED_SCHEDULING) && !bucketHandle.isPresent()) {
        throw new PrestoException(GENERIC_INTERNAL_ERROR, "SchedulingPolicy is bucketed, but BucketHandle is not present");
    }
    if (bucketHandle.isPresent()) {
        HiveBucketHandle handle = bucketHandle.get();
        if (handle.getReadBucketCount() > handle.getTableBucketCount()) {
            // The message contains placeholders, so it has to go through format() with both counts;
            // otherwise the literal "%s" markers end up in the error shown to the user.
            throw new PrestoException(GENERIC_INTERNAL_ERROR, format("readBucketCount (%s) is greater than the tableBucketCount (%s) which generally points to an issue in plan generation", handle.getReadBucketCount(), handle.getTableBucketCount()));
        }
    }
    // sort partitions
    partitions = Ordering.natural().onResultOf(HivePartition::getPartitionId).reverse().sortedCopy(partitions);
    Iterable<HivePartitionMetadata> hivePartitions = getPartitionMetadata(metastore, table, tableName, partitions, bucketHandle, session, splitSchedulingContext.getWarningCollector(), layout.getRequestedColumns(), layout.getPredicateColumns(), layout.getDomainPredicate().getDomains());
    // Avoid over-committing split loader concurrency: never use more loader threads than there are partitions
    int loaderConcurrency = min(splitLoaderConcurrency, partitions.size());
    HiveSplitLoader hiveSplitLoader = new BackgroundHiveSplitLoader(table, hivePartitions, getPathDomain(layout.getDomainPredicate(), layout.getPredicateColumns()), createBucketSplitInfo(bucketHandle, bucketFilter), session, hdfsEnvironment, namenodeStats, directoryLister, executor, loaderConcurrency, recursiveDfsWalkerEnabled, splitSchedulingContext.schedulerUsesHostAddresses(), layout.isPartialAggregationsPushedDown());
    HiveSplitSource splitSource;
    CacheQuotaRequirement cacheQuotaRequirement = cacheQuotaRequirementProvider.getCacheQuotaRequirement(table.getDatabaseName(), table.getTableName());
    switch(splitSchedulingContext.getSplitSchedulingStrategy()) {
        case UNGROUPED_SCHEDULING:
            splitSource = HiveSplitSource.allAtOnce(session, table.getDatabaseName(), table.getTableName(), cacheQuotaRequirement, getHiveMaxInitialSplitSize(session), maxOutstandingSplits, maxOutstandingSplitsSize, hiveSplitLoader, executor, new CounterStat());
            break;
        case GROUPED_SCHEDULING:
            splitSource = HiveSplitSource.bucketed(session, table.getDatabaseName(), table.getTableName(), cacheQuotaRequirement, getHiveMaxInitialSplitSize(session), maxOutstandingSplits, maxOutstandingSplitsSize, hiveSplitLoader, executor, new CounterStat());
            break;
        case REWINDABLE_GROUPED_SCHEDULING:
            splitSource = HiveSplitSource.bucketedRewindable(session, table.getDatabaseName(), table.getTableName(), cacheQuotaRequirement, getHiveMaxInitialSplitSize(session), maxOutstandingSplitsSize, hiveSplitLoader, executor, new CounterStat());
            break;
        default:
            throw new IllegalArgumentException("Unknown splitSchedulingStrategy: " + splitSchedulingContext.getSplitSchedulingStrategy());
    }
    // the loader pushes splits into the source in the background once started
    hiveSplitLoader.start(splitSource);
    return splitSource;
}
Also used : SemiTransactionalHiveMetastore(com.facebook.presto.hive.metastore.SemiTransactionalHiveMetastore) CounterStat(com.facebook.airlift.stats.CounterStat) PrestoException(com.facebook.presto.spi.PrestoException) TableNotFoundException(com.facebook.presto.spi.TableNotFoundException) FixedSplitSource(com.facebook.presto.spi.FixedSplitSource) Table(com.facebook.presto.hive.metastore.Table) MetastoreContext(com.facebook.presto.hive.metastore.MetastoreContext) SchemaTableName(com.facebook.presto.spi.SchemaTableName) HiveBucketFilter(com.facebook.presto.hive.HiveBucketing.HiveBucketFilter)

Example 2 with HiveBucketFilter

use of com.facebook.presto.hive.HiveBucketing.HiveBucketFilter in project presto by prestodb.

From the class HivePartitionManager, method getPartitions.

/**
 * Resolves the partitions of a Hive table that match a constraint and packages them, together
 * with the derived predicates and bucketing information, into a {@link HivePartitionResult}.
 *
 * <p>Bucketing information is discarded when the predicate does not use the Hive bucket column
 * and {@code queryAccessesTooManyBuckets} reports true for the matched partitions.
 *
 * @param metastore metastore used to load the table and its partitions
 * @param tableHandle table to inspect; cast to {@link HiveTableHandle}
 * @param constraint constraint whose summary is used as the effective predicate
 * @param session session supplying the relevant session properties
 * @return the matched partitions plus the compacted, remaining and enforced tuple domains
 */
public HivePartitionResult getPartitions(SemiTransactionalHiveMetastore metastore, ConnectorTableHandle tableHandle, Constraint<ColumnHandle> constraint, ConnectorSession session) {
    HiveTableHandle handle = (HiveTableHandle) tableHandle;
    TupleDomain<ColumnHandle> predicate = constraint.getSummary();
    SchemaTableName schemaTableName = handle.getSchemaTableName();
    Table hiveTable = getTable(session, metastore, schemaTableName, isOfflineDataDebugModeEnabled(session));
    List<HiveColumnHandle> partitionKeys = getPartitionKeyColumnHandles(hiveTable);
    List<HivePartition> matchedPartitions = getPartitionsAsList(getPartitionsIterator(metastore, tableHandle, constraint, session).iterator());

    Optional<HiveBucketHandle> bucketing = getBucketHandle(hiveTable, session, predicate);
    Optional<HiveBucketFilter> filter = bucketing.flatMap(ignored -> getHiveBucketFilter(hiveTable, predicate));

    // Fall back to non-bucketed handling when the bucket column is not referenced and too many buckets would be read.
    boolean discardBucketing = !queryUsesHiveBucketColumn(predicate)
            && bucketing.isPresent()
            && queryAccessesTooManyBuckets(bucketing.get(), filter, matchedPartitions, session);
    if (discardBucketing) {
        bucketing = Optional.empty();
        filter = Optional.empty();
    }

    TupleDomain<ColumnHandle> compacted;
    TupleDomain<ColumnHandle> remaining;
    TupleDomain<ColumnHandle> enforced;
    Optional<HiveBucketFilter> resultFilter;
    if (predicate.isNone()) {
        // Nothing can match: every domain is "none" and no bucket filter is reported.
        compacted = TupleDomain.none();
        remaining = TupleDomain.none();
        enforced = TupleDomain.none();
        resultFilter = Optional.empty();
    }
    else {
        compacted = predicate.compact(domainCompactionThreshold);
        if (partitionKeys.isEmpty()) {
            // No partition keys: the whole predicate remains and nothing is enforced.
            remaining = predicate;
            enforced = TupleDomain.all();
        }
        else {
            // All partition key domains will be fully evaluated, so they are excluded from the remaining domain.
            remaining = TupleDomain.withColumnDomains(Maps.filterKeys(predicate.getDomains().get(), not(Predicates.in(partitionKeys))));
            enforced = TupleDomain.withColumnDomains(Maps.filterKeys(predicate.getDomains().get(), Predicates.in(partitionKeys)));
        }
        resultFilter = filter;
    }
    return new HivePartitionResult(partitionKeys, hiveTable.getDataColumns(), hiveTable.getParameters(), matchedPartitions, compacted, remaining, enforced, bucketing, resultFilter);
}
Also used : ColumnHandle(com.facebook.presto.spi.ColumnHandle) Table(com.facebook.presto.hive.metastore.Table) SchemaTableName(com.facebook.presto.spi.SchemaTableName) HiveBucketing.getHiveBucketFilter(com.facebook.presto.hive.HiveBucketing.getHiveBucketFilter) HiveBucketFilter(com.facebook.presto.hive.HiveBucketing.HiveBucketFilter) HiveBucketing.getHiveBucketHandle(com.facebook.presto.hive.HiveBucketing.getHiveBucketHandle)

Aggregations

HiveBucketFilter (com.facebook.presto.hive.HiveBucketing.HiveBucketFilter)2 Table (com.facebook.presto.hive.metastore.Table)2 SchemaTableName (com.facebook.presto.spi.SchemaTableName)2 CounterStat (com.facebook.airlift.stats.CounterStat)1 HiveBucketing.getHiveBucketFilter (com.facebook.presto.hive.HiveBucketing.getHiveBucketFilter)1 HiveBucketing.getHiveBucketHandle (com.facebook.presto.hive.HiveBucketing.getHiveBucketHandle)1 MetastoreContext (com.facebook.presto.hive.metastore.MetastoreContext)1 SemiTransactionalHiveMetastore (com.facebook.presto.hive.metastore.SemiTransactionalHiveMetastore)1 ColumnHandle (com.facebook.presto.spi.ColumnHandle)1 FixedSplitSource (com.facebook.presto.spi.FixedSplitSource)1 PrestoException (com.facebook.presto.spi.PrestoException)1 TableNotFoundException (com.facebook.presto.spi.TableNotFoundException)1