Search in sources:

Example 1 with GROUPED_SCHEDULING

Use of io.prestosql.spi.connector.ConnectorSplitManager.SplitSchedulingStrategy.GROUPED_SCHEDULING in the project hetu-core by openlookeng.

From the class HiveSplitManager, method getSplits:

@Override
public ConnectorSplitSource getSplits(
        ConnectorTransactionHandle transaction,
        ConnectorSession session,
        ConnectorTableHandle tableHandle,
        SplitSchedulingStrategy splitSchedulingStrategy,
        Supplier<List<Set<DynamicFilter>>> dynamicFilterSupplier,
        Optional<QueryType> queryType,
        Map<String, Object> queryInfo,
        Set<TupleDomain<ColumnMetadata>> userDefinedCachePredicates,
        boolean partOfReuse)
{
    HiveTableHandle hiveTable = (HiveTableHandle) tableHandle;
    SchemaTableName tableName = hiveTable.getSchemaTableName();
    // get table metadata
    SemiTransactionalHiveMetastore metastore = metastoreProvider.apply((HiveTransactionHandle) transaction);
    Table table = metastore.getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()).orElseThrow(() -> new TableNotFoundException(tableName));
    if (table.getStorage().getStorageFormat().getInputFormat().contains("carbon")) {
        throw new PrestoException(NOT_SUPPORTED, "Hive connector can't read carbondata tables");
    }
    // verify table is not marked as non-readable
    String tableNotReadable = table.getParameters().get(OBJECT_NOT_READABLE);
    if (!isNullOrEmpty(tableNotReadable)) {
        throw new HiveNotReadableException(tableName, Optional.empty(), tableNotReadable);
    }
    // get partitions
    List<HivePartition> partitions = partitionManager.getOrLoadPartitions(session, metastore, new HiveIdentity(session), hiveTable);
    // short circuit if we don't have any partitions
    if (partitions.isEmpty()) {
        return new FixedSplitSource(ImmutableList.of());
    }
    // get the bucket filter from the table handle, if any
    Optional<HiveBucketing.HiveBucketFilter> bucketFilter = hiveTable.getBucketFilter();
    // validate bucketed execution
    Optional<HiveBucketHandle> bucketHandle = hiveTable.getBucketHandle();
    if ((splitSchedulingStrategy == GROUPED_SCHEDULING) && !bucketHandle.isPresent()) {
        throw new PrestoException(GENERIC_INTERNAL_ERROR, "SchedulingPolicy is bucketed, but BucketHandle is not present");
    }
    // sort partitions
    partitions = Ordering.natural().onResultOf(HivePartition::getPartitionId).reverse().sortedCopy(partitions);
    Iterable<HivePartitionMetadata> hivePartitions = getPartitionMetadata(session, metastore, table, tableName, partitions, bucketHandle.map(HiveBucketHandle::toTableBucketProperty));
    HiveSplitLoader hiveSplitLoader = new BackgroundHiveSplitLoader(
            table,
            hivePartitions,
            hiveTable.getCompactEffectivePredicate(),
            BackgroundHiveSplitLoader.BucketSplitInfo.createBucketSplitInfo(bucketHandle, bucketFilter),
            session,
            hdfsEnvironment,
            namenodeStats,
            directoryLister,
            executor,
            splitLoaderConcurrency,
            recursiveDfsWalkerEnabled,
            metastore.getValidWriteIds(session, hiveTable, queryType.map(t -> t == QueryType.VACUUM).orElse(false))
                    .map(validTxnWriteIdList -> validTxnWriteIdList.getTableValidWriteIdList(table.getDatabaseName() + "." + table.getTableName())),
            dynamicFilterSupplier,
            queryType,
            queryInfo,
            typeManager);
    HiveSplitSource splitSource;
    HiveStorageFormat hiveStorageFormat = HiveMetadata.extractHiveStorageFormat(table);
    switch (splitSchedulingStrategy) {
        case UNGROUPED_SCHEDULING:
            // For reuse, we should make sure to have the same split size at all times for a table.
            splitSource = HiveSplitSource.allAtOnce(session, table.getDatabaseName(), table.getTableName(),
                    partOfReuse ? 0 : maxInitialSplits, maxOutstandingSplits, maxOutstandingSplitsSize, maxSplitsPerSecond,
                    hiveSplitLoader, executor, new CounterStat(), dynamicFilterSupplier, userDefinedCachePredicates,
                    typeManager, hiveConfig, hiveStorageFormat);
            break;
        case GROUPED_SCHEDULING:
            // For reuse, we should make sure to have the same split size at all times for a table.
            splitSource = HiveSplitSource.bucketed(session, table.getDatabaseName(), table.getTableName(),
                    partOfReuse ? 0 : maxInitialSplits, maxOutstandingSplits, maxOutstandingSplitsSize, maxSplitsPerSecond,
                    hiveSplitLoader, executor, new CounterStat(), dynamicFilterSupplier, userDefinedCachePredicates,
                    typeManager, hiveConfig, hiveStorageFormat);
            break;
        default:
            throw new IllegalArgumentException("Unknown splitSchedulingStrategy: " + splitSchedulingStrategy);
    }
    hiveSplitLoader.start(splitSource);
    if (queryType.isPresent() && queryType.get() == QueryType.VACUUM) {
        HdfsContext hdfsContext = new HdfsContext(session, table.getDatabaseName(), table.getTableName());
        return new HiveVacuumSplitSource(splitSource, (HiveVacuumTableHandle) queryInfo.get("vacuumHandle"), hdfsEnvironment, hdfsContext, session);
    }
    return splitSource;
}
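
The method dispatches on splitSchedulingStrategy: GROUPED_SCHEDULING is only legal for bucketed tables (hence the BucketHandle check) and routes to HiveSplitSource.bucketed, while UNGROUPED_SCHEDULING routes to HiveSplitSource.allAtOnce. The sketch below distills that dispatch into a minimal, self-contained form; the enum and the string "factory results" are hypothetical stand-ins for the real Presto SPI types, not the actual API.

import java.util.Optional;

public final class SplitSourceDispatchSketch {
    // Hypothetical stand-in for the SPI enum used above.
    enum SplitSchedulingStrategy { UNGROUPED_SCHEDULING, GROUPED_SCHEDULING }

    static String createSplitSource(SplitSchedulingStrategy strategy, Optional<String> bucketHandle) {
        // Grouped scheduling hands splits to the engine one bucket group at a time,
        // so it is only valid when the table exposes bucketing metadata.
        if (strategy == SplitSchedulingStrategy.GROUPED_SCHEDULING && !bucketHandle.isPresent()) {
            throw new IllegalStateException("SchedulingPolicy is bucketed, but BucketHandle is not present");
        }
        switch (strategy) {
            case UNGROUPED_SCHEDULING:
                return "allAtOnce split source"; // one global queue of splits
            case GROUPED_SCHEDULING:
                return "bucketed split source";  // one queue per bucket group
            default:
                throw new IllegalArgumentException("Unknown splitSchedulingStrategy: " + strategy);
        }
    }

    public static void main(String[] args) {
        System.out.println(createSplitSource(SplitSchedulingStrategy.UNGROUPED_SCHEDULING, Optional.empty()));
        System.out.println(createSplitSource(SplitSchedulingStrategy.GROUPED_SCHEDULING, Optional.of("bucketHandle")));
    }
}

Checking the bucket-handle precondition once, before the switch, as the real code does, keeps the GROUPED_SCHEDULING branch free of Optional unwrapping.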

Example 2 with GROUPED_SCHEDULING

Use of io.prestosql.spi.connector.ConnectorSplitManager.SplitSchedulingStrategy.GROUPED_SCHEDULING in the project boostkit-bigdata by kunpengcompute.

From the class HiveSplitManager, method getSplits:

@Override
public ConnectorSplitSource getSplits(
        ConnectorTransactionHandle transaction,
        ConnectorSession session,
        ConnectorTableHandle tableHandle,
        SplitSchedulingStrategy splitSchedulingStrategy,
        Supplier<List<Set<DynamicFilter>>> dynamicFilterSupplier,
        Optional<QueryType> queryType,
        Map<String, Object> queryInfo,
        Set<TupleDomain<ColumnMetadata>> userDefinedCachePredicates,
        boolean partOfReuse)
{
    HiveTableHandle hiveTable = (HiveTableHandle) tableHandle;
    SchemaTableName tableName = hiveTable.getSchemaTableName();
    // get table metadata
    SemiTransactionalHiveMetastore metastore = metastoreProvider.apply((HiveTransactionHandle) transaction);
    Table table = metastore.getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()).orElseThrow(() -> new TableNotFoundException(tableName));
    if (table.getStorage().getStorageFormat().getInputFormat().contains("carbon")) {
        throw new PrestoException(NOT_SUPPORTED, "Hive connector can't read carbondata tables");
    }
    // verify table is not marked as non-readable
    String tableNotReadable = table.getParameters().get(OBJECT_NOT_READABLE);
    if (!isNullOrEmpty(tableNotReadable)) {
        throw new HiveNotReadableException(tableName, Optional.empty(), tableNotReadable);
    }
    // get partitions
    List<HivePartition> partitions = partitionManager.getOrLoadPartitions(session, metastore, new HiveIdentity(session), hiveTable);
    // short circuit if we don't have any partitions
    if (partitions.isEmpty()) {
        return new FixedSplitSource(ImmutableList.of());
    }
    // get the bucket filter from the table handle, if any
    Optional<HiveBucketing.HiveBucketFilter> bucketFilter = hiveTable.getBucketFilter();
    // validate bucketed execution
    Optional<HiveBucketHandle> bucketHandle = hiveTable.getBucketHandle();
    if ((splitSchedulingStrategy == GROUPED_SCHEDULING) && !bucketHandle.isPresent()) {
        throw new PrestoException(GENERIC_INTERNAL_ERROR, "SchedulingPolicy is bucketed, but BucketHandle is not present");
    }
    // sort partitions
    partitions = Ordering.natural().onResultOf(HivePartition::getPartitionId).reverse().sortedCopy(partitions);
    Iterable<HivePartitionMetadata> hivePartitions = getPartitionMetadata(session, metastore, table, tableName, partitions, bucketHandle.map(HiveBucketHandle::toTableBucketProperty));
    HiveSplitLoader hiveSplitLoader = new BackgroundHiveSplitLoader(
            table,
            hivePartitions,
            hiveTable.getCompactEffectivePredicate(),
            BackgroundHiveSplitLoader.BucketSplitInfo.createBucketSplitInfo(bucketHandle, bucketFilter),
            session,
            hdfsEnvironment,
            namenodeStats,
            directoryLister,
            executor,
            splitLoaderConcurrency,
            recursiveDfsWalkerEnabled,
            metastore.getValidWriteIds(session, hiveTable, queryType.map(t -> t == QueryType.VACUUM).orElse(false))
                    .map(validTxnWriteIdList -> validTxnWriteIdList.getTableValidWriteIdList(table.getDatabaseName() + "." + table.getTableName())),
            dynamicFilterSupplier,
            queryType,
            queryInfo,
            typeManager);
    HiveSplitSource splitSource;
    HiveStorageFormat hiveStorageFormat = HiveMetadata.extractHiveStorageFormat(table);
    switch (splitSchedulingStrategy) {
        case UNGROUPED_SCHEDULING:
            // For reuse, we should make sure to have the same split size at all times for a table.
            splitSource = HiveSplitSource.allAtOnce(session, table.getDatabaseName(), table.getTableName(),
                    partOfReuse ? 0 : maxInitialSplits, maxOutstandingSplits, maxOutstandingSplitsSize, maxSplitsPerSecond,
                    hiveSplitLoader, executor, new CounterStat(), dynamicFilterSupplier, userDefinedCachePredicates,
                    typeManager, hiveConfig, hiveStorageFormat);
            break;
        case GROUPED_SCHEDULING:
            // For reuse, we should make sure to have the same split size at all times for a table.
            splitSource = HiveSplitSource.bucketed(session, table.getDatabaseName(), table.getTableName(),
                    partOfReuse ? 0 : maxInitialSplits, maxOutstandingSplits, maxOutstandingSplitsSize, maxSplitsPerSecond,
                    hiveSplitLoader, executor, new CounterStat(), dynamicFilterSupplier, userDefinedCachePredicates,
                    typeManager, hiveConfig, hiveStorageFormat);
            break;
        default:
            throw new IllegalArgumentException("Unknown splitSchedulingStrategy: " + splitSchedulingStrategy);
    }
    hiveSplitLoader.start(splitSource);
    if (queryType.isPresent() && queryType.get() == QueryType.VACUUM) {
        HdfsContext hdfsContext = new HdfsContext(session, table.getDatabaseName(), table.getTableName());
        return new HiveVacuumSplitSource(splitSource, (HiveVacuumTableHandle) queryInfo.get("vacuumHandle"), hdfsEnvironment, hdfsContext, session);
    }
    return splitSource;
}
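
The tail of getSplits handles VACUUM queries by wrapping the ordinary split source in a HiveVacuumSplitSource before returning it. A minimal sketch of that decorator shape follows; the SplitSource interface here is a hypothetical stand-in for the real ConnectorSplitSource SPI, and the pass-through body is an assumption, not the actual vacuum grouping logic.

import java.util.List;

public final class VacuumSplitSourceSketch {
    // Hypothetical minimal split-source interface for illustration only.
    interface SplitSource {
        List<String> getNextBatch(int maxSize);
        boolean isFinished();
    }

    // Decorator that intercepts batches from the underlying source, mirroring
    // how HiveVacuumSplitSource wraps the regular split source for VACUUM.
    static final class VacuumSplitSource implements SplitSource {
        private final SplitSource delegate;

        VacuumSplitSource(SplitSource delegate) {
            this.delegate = delegate;
        }

        @Override
        public List<String> getNextBatch(int maxSize) {
            // A real implementation would regroup splits for compaction;
            // this sketch simply forwards the batch unchanged.
            return delegate.getNextBatch(maxSize);
        }

        @Override
        public boolean isFinished() {
            return delegate.isFinished();
        }
    }
}

Keeping the vacuum-specific behavior in a decorator means the ordinary read path is untouched when queryType is not VACUUM.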
