Use of com.facebook.presto.hive.HiveBucketing.HiveBucketFilter in project presto by prestodb.
The class HiveSplitManager, method getSplits:
@Override
public ConnectorSplitSource getSplits(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorTableLayoutHandle layoutHandle, SplitSchedulingContext splitSchedulingContext)
{
    HiveTableLayoutHandle layout = (HiveTableLayoutHandle) layoutHandle;
    SchemaTableName tableName = layout.getSchemaTableName();

    // get table metadata
    TransactionalMetadata metadata = hiveTransactionManager.get(transaction);
    if (metadata == null) {
        throw new PrestoException(HIVE_TRANSACTION_NOT_FOUND, format("Transaction not found: %s", transaction));
    }
    SemiTransactionalHiveMetastore metastore = metadata.getMetastore();
    Table table = metastore.getTable(
            new MetastoreContext(
                    session.getIdentity(),
                    session.getQueryId(),
                    session.getClientInfo(),
                    session.getSource(),
                    getMetastoreHeaders(session),
                    isUserDefinedTypeEncodingEnabled(session),
                    metastore.getColumnConverterProvider()),
            tableName.getSchemaName(),
            tableName.getTableName())
            .orElseThrow(() -> new TableNotFoundException(tableName));

    if (!isOfflineDataDebugModeEnabled(session)) {
        // verify table is not marked as non-readable
        String tableNotReadable = table.getParameters().get(OBJECT_NOT_READABLE);
        if (!isNullOrEmpty(tableNotReadable)) {
            throw new HiveNotReadableException(tableName, Optional.empty(), tableNotReadable);
        }
    }

    // get partitions
    List<HivePartition> partitions = layout.getPartitions()
            .orElseThrow(() -> new PrestoException(GENERIC_INTERNAL_ERROR, "Layout does not contain partitions"));

    // short circuit if we don't have any partitions
    HivePartition partition = Iterables.getFirst(partitions, null);
    if (partition == null) {
        return new FixedSplitSource(ImmutableList.of());
    }

    Optional<HiveBucketFilter> bucketFilter = layout.getBucketFilter();

    // validate bucketed execution
    Optional<HiveBucketHandle> bucketHandle = layout.getBucketHandle();
    if ((splitSchedulingContext.getSplitSchedulingStrategy() == GROUPED_SCHEDULING) && !bucketHandle.isPresent()) {
        throw new PrestoException(GENERIC_INTERNAL_ERROR, "SchedulingPolicy is bucketed, but BucketHandle is not present");
    }
    if (bucketHandle.isPresent()) {
        if (bucketHandle.get().getReadBucketCount() > bucketHandle.get().getTableBucketCount()) {
            throw new PrestoException(GENERIC_INTERNAL_ERROR, format(
                    "readBucketCount (%s) is greater than the tableBucketCount (%s) which generally points to an issue in plan generation",
                    bucketHandle.get().getReadBucketCount(),
                    bucketHandle.get().getTableBucketCount()));
        }
    }

    // sort partitions
    partitions = Ordering.natural().onResultOf(HivePartition::getPartitionId).reverse().sortedCopy(partitions);

    Iterable<HivePartitionMetadata> hivePartitions = getPartitionMetadata(
            metastore,
            table,
            tableName,
            partitions,
            bucketHandle,
            session,
            splitSchedulingContext.getWarningCollector(),
            layout.getRequestedColumns(),
            layout.getPredicateColumns(),
            layout.getDomainPredicate().getDomains());

    HiveSplitLoader hiveSplitLoader = new BackgroundHiveSplitLoader(
            table,
            hivePartitions,
            getPathDomain(layout.getDomainPredicate(), layout.getPredicateColumns()),
            createBucketSplitInfo(bucketHandle, bucketFilter),
            session,
            hdfsEnvironment,
            namenodeStats,
            directoryLister,
            executor,
            // Avoid over-committing split loader concurrency
            min(splitLoaderConcurrency, partitions.size()),
            recursiveDfsWalkerEnabled,
            splitSchedulingContext.schedulerUsesHostAddresses(),
            layout.isPartialAggregationsPushedDown());
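
    // The loader discovers files and produces splits asynchronously on the executor;
    // the split source created below buffers them according to the scheduling strategy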
    HiveSplitSource splitSource;
    CacheQuotaRequirement cacheQuotaRequirement = cacheQuotaRequirementProvider.getCacheQuotaRequirement(table.getDatabaseName(), table.getTableName());
    switch (splitSchedulingContext.getSplitSchedulingStrategy()) {
        case UNGROUPED_SCHEDULING:
            splitSource = HiveSplitSource.allAtOnce(session, table.getDatabaseName(), table.getTableName(), cacheQuotaRequirement, getHiveMaxInitialSplitSize(session), maxOutstandingSplits, maxOutstandingSplitsSize, hiveSplitLoader, executor, new CounterStat());
            break;
        case GROUPED_SCHEDULING:
            splitSource = HiveSplitSource.bucketed(session, table.getDatabaseName(), table.getTableName(), cacheQuotaRequirement, getHiveMaxInitialSplitSize(session), maxOutstandingSplits, maxOutstandingSplitsSize, hiveSplitLoader, executor, new CounterStat());
            break;
        case REWINDABLE_GROUPED_SCHEDULING:
            splitSource = HiveSplitSource.bucketedRewindable(session, table.getDatabaseName(), table.getTableName(), cacheQuotaRequirement, getHiveMaxInitialSplitSize(session), maxOutstandingSplitsSize, hiveSplitLoader, executor, new CounterStat());
            break;
        default:
            throw new IllegalArgumentException("Unknown splitSchedulingStrategy: " + splitSchedulingContext.getSplitSchedulingStrategy());
    }

    hiveSplitLoader.start(splitSource);
    return splitSource;
}
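
The partition sort in getSplits uses Guava's Ordering to put the highest partition ids first, which for date-style partition ids means the newest partitions are loaded first. A minimal, self-contained sketch of the same reverse-sort pattern; the Partition record here is a hypothetical stand-in for HivePartition:

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Ordering;
import java.util.List;

public class PartitionSortSketch {
    // Hypothetical stand-in for HivePartition; only the partition id matters for the sort
    record Partition(String partitionId) {}

    public static void main(String[] args) {
        List<Partition> partitions = ImmutableList.of(
                new Partition("ds=2021-01-01"),
                new Partition("ds=2021-01-03"),
                new Partition("ds=2021-01-02"));
        // Natural ordering on the partition id, reversed, as in getSplits above
        List<Partition> sorted = Ordering.natural()
                .onResultOf(Partition::partitionId)
                .reverse()
                .sortedCopy(partitions);
        // prints ds=2021-01-03, ds=2021-01-02, ds=2021-01-01
        sorted.forEach(p -> System.out.println(p.partitionId()));
    }
}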

Use of com.facebook.presto.hive.HiveBucketing.HiveBucketFilter in project presto by prestodb.
The class HivePartitionManager, method getPartitions:
public HivePartitionResult getPartitions(SemiTransactionalHiveMetastore metastore, ConnectorTableHandle tableHandle, Constraint<ColumnHandle> constraint, ConnectorSession session)
{
    HiveTableHandle hiveTableHandle = (HiveTableHandle) tableHandle;
    TupleDomain<ColumnHandle> effectivePredicate = constraint.getSummary();
    SchemaTableName tableName = hiveTableHandle.getSchemaTableName();
    Table table = getTable(session, metastore, tableName, isOfflineDataDebugModeEnabled(session));
    List<HiveColumnHandle> partitionColumns = getPartitionKeyColumnHandles(table);
    List<HivePartition> partitions = getPartitionsAsList(getPartitionsIterator(metastore, tableHandle, constraint, session).iterator());

    Optional<HiveBucketHandle> hiveBucketHandle = getBucketHandle(table, session, effectivePredicate);
    Optional<HiveBucketFilter> bucketFilter = hiveBucketHandle.flatMap(value -> getHiveBucketFilter(table, effectivePredicate));
    // fall back to non-bucketed execution when the query does not filter on the bucket
    // column and would otherwise have to read too many buckets
    if (!queryUsesHiveBucketColumn(effectivePredicate) && hiveBucketHandle.isPresent() && queryAccessesTooManyBuckets(hiveBucketHandle.get(), bucketFilter, partitions, session)) {
        hiveBucketHandle = Optional.empty();
        bucketFilter = Optional.empty();
    }

    if (effectivePredicate.isNone()) {
        return new HivePartitionResult(partitionColumns, table.getDataColumns(), table.getParameters(), partitions, TupleDomain.none(), TupleDomain.none(), TupleDomain.none(), hiveBucketHandle, Optional.empty());
    }

    TupleDomain<ColumnHandle> compactEffectivePredicate = effectivePredicate.compact(domainCompactionThreshold);
    if (partitionColumns.isEmpty()) {
        return new HivePartitionResult(partitionColumns, table.getDataColumns(), table.getParameters(), partitions, compactEffectivePredicate, effectivePredicate, TupleDomain.all(), hiveBucketHandle, bucketFilter);
    }

    // All partition key domains will be fully evaluated, so we don't need to include those
    TupleDomain<ColumnHandle> remainingTupleDomain = TupleDomain.withColumnDomains(Maps.filterKeys(effectivePredicate.getDomains().get(), not(Predicates.in(partitionColumns))));
    TupleDomain<ColumnHandle> enforcedTupleDomain = TupleDomain.withColumnDomains(Maps.filterKeys(effectivePredicate.getDomains().get(), Predicates.in(partitionColumns)));
    return new HivePartitionResult(partitionColumns, table.getDataColumns(), table.getParameters(), partitions, compactEffectivePredicate, remainingTupleDomain, enforcedTupleDomain, hiveBucketHandle, bucketFilter);
}
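
The last two statements of getPartitions split the predicate with Guava's Maps.filterKeys: domains on partition columns are enforced by pruning, while the rest must still be evaluated when splits are read. A minimal sketch of the same split on a plain map; column and domain types are simplified to strings here, and TupleDomain is left out:

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import java.util.List;
import java.util.Map;

import static com.google.common.base.Predicates.in;
import static com.google.common.base.Predicates.not;

public class PredicateSplitSketch {
    public static void main(String[] args) {
        // Hypothetical stand-in for TupleDomain's column-to-domain map
        Map<String, String> columnDomains = ImmutableMap.of(
                "ds", "ds = '2021-01-01'",
                "user_id", "user_id > 100");
        List<String> partitionColumns = ImmutableList.of("ds");

        // Domains on partition columns are fully evaluated during pruning (enforced);
        // everything else remains to be applied when the splits are read
        Map<String, String> enforced = Maps.filterKeys(columnDomains, in(partitionColumns));
        Map<String, String> remaining = Maps.filterKeys(columnDomains, not(in(partitionColumns)));

        System.out.println("enforced:  " + enforced);   // {ds=ds = '2021-01-01'}
        System.out.println("remaining: " + remaining);  // {user_id=user_id > 100}
    }
}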