Search in sources :

Example 1 with PrestoSparkTaskSourceRdd

use of com.facebook.presto.spark.classloader_interface.PrestoSparkTaskSourceRdd in project presto by prestodb.

the class PrestoSparkRddFactory method createRdd.

private <T extends PrestoSparkTaskOutput> JavaPairRDD<MutablePartitionId, T> createRdd(JavaSparkContext sparkContext, Session session, PlanFragment fragment, PrestoSparkTaskExecutorFactoryProvider executorFactoryProvider, CollectionAccumulator<SerializedTaskInfo> taskInfoCollector, CollectionAccumulator<PrestoSparkShuffleStats> shuffleStatsCollector, TableWriteInfo tableWriteInfo, Map<PlanFragmentId, JavaPairRDD<MutablePartitionId, PrestoSparkMutableRow>> rddInputs, Map<PlanFragmentId, Broadcast<?>> broadcastInputs, Class<T> outputType) {
    checkInputs(fragment.getRemoteSourceNodes(), rddInputs, broadcastInputs);
    PrestoSparkTaskDescriptor taskDescriptor = new PrestoSparkTaskDescriptor(session.toSessionRepresentation(), session.getIdentity().getExtraCredentials(), fragment, tableWriteInfo);
    SerializedPrestoSparkTaskDescriptor serializedTaskDescriptor = new SerializedPrestoSparkTaskDescriptor(taskDescriptorJsonCodec.toJsonBytes(taskDescriptor));
    Optional<Integer> numberOfShufflePartitions = Optional.empty();
    Map<String, RDD<Tuple2<MutablePartitionId, PrestoSparkMutableRow>>> shuffleInputRddMap = new HashMap<>();
    for (Map.Entry<PlanFragmentId, JavaPairRDD<MutablePartitionId, PrestoSparkMutableRow>> input : rddInputs.entrySet()) {
        RDD<Tuple2<MutablePartitionId, PrestoSparkMutableRow>> rdd = input.getValue().rdd();
        shuffleInputRddMap.put(input.getKey().toString(), rdd);
        if (!numberOfShufflePartitions.isPresent()) {
            numberOfShufflePartitions = Optional.of(rdd.getNumPartitions());
        } else {
            checkArgument(numberOfShufflePartitions.get() == rdd.getNumPartitions(), "Incompatible number of input partitions: %s != %s", numberOfShufflePartitions.get(), rdd.getNumPartitions());
        }
    }
    PrestoSparkTaskProcessor<T> taskProcessor = new PrestoSparkTaskProcessor<>(executorFactoryProvider, serializedTaskDescriptor, taskInfoCollector, shuffleStatsCollector, toTaskProcessorBroadcastInputs(broadcastInputs), outputType);
    Optional<PrestoSparkTaskSourceRdd> taskSourceRdd;
    List<TableScanNode> tableScans = findTableScanNodes(fragment.getRoot());
    if (!tableScans.isEmpty()) {
        try (CloseableSplitSourceProvider splitSourceProvider = new CloseableSplitSourceProvider(splitManager::getSplits)) {
            SplitSourceFactory splitSourceFactory = new SplitSourceFactory(splitSourceProvider, WarningCollector.NOOP);
            Map<PlanNodeId, SplitSource> splitSources = splitSourceFactory.createSplitSources(fragment, session, tableWriteInfo);
            taskSourceRdd = Optional.of(createTaskSourcesRdd(fragment.getId(), sparkContext, session, fragment.getPartitioning(), tableScans, splitSources, numberOfShufflePartitions));
        }
    } else if (rddInputs.size() == 0) {
        checkArgument(fragment.getPartitioning().equals(SINGLE_DISTRIBUTION), "SINGLE_DISTRIBUTION partitioning is expected: %s", fragment.getPartitioning());
        // In case of no inputs we still need to schedule a task.
        // Task with no inputs may produce results (e.g.: ValuesNode).
        // To force the task to be scheduled we create a PrestoSparkTaskSourceRdd that contains exactly one partition.
        // Since there's also no table scans in the fragment, the list of TaskSource's for this partition is empty.
        taskSourceRdd = Optional.of(new PrestoSparkTaskSourceRdd(sparkContext.sc(), ImmutableList.of(ImmutableList.of())));
    } else {
        taskSourceRdd = Optional.empty();
    }
    return JavaPairRDD.fromRDD(PrestoSparkTaskRdd.create(sparkContext.sc(), taskSourceRdd, shuffleInputRddMap, taskProcessor), classTag(MutablePartitionId.class), classTag(outputType));
}
Also used : SerializedPrestoSparkTaskDescriptor(com.facebook.presto.spark.classloader_interface.SerializedPrestoSparkTaskDescriptor) PrestoSparkTaskDescriptor(com.facebook.presto.spark.PrestoSparkTaskDescriptor) HashMap(java.util.HashMap) SplitSourceFactory(com.facebook.presto.sql.planner.SplitSourceFactory) PlanNodeId(com.facebook.presto.spi.plan.PlanNodeId) MutablePartitionId(com.facebook.presto.spark.classloader_interface.MutablePartitionId) RDD(org.apache.spark.rdd.RDD) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) PlanFragmentId(com.facebook.presto.sql.planner.plan.PlanFragmentId) PrestoSparkMutableRow(com.facebook.presto.spark.classloader_interface.PrestoSparkMutableRow) PrestoSparkTaskSourceRdd(com.facebook.presto.spark.classloader_interface.PrestoSparkTaskSourceRdd) PrestoSparkTaskProcessor(com.facebook.presto.spark.classloader_interface.PrestoSparkTaskProcessor) SerializedPrestoSparkTaskDescriptor(com.facebook.presto.spark.classloader_interface.SerializedPrestoSparkTaskDescriptor) CloseableSplitSourceProvider(com.facebook.presto.split.CloseableSplitSourceProvider) TableScanNode(com.facebook.presto.spi.plan.TableScanNode) Tuple2(scala.Tuple2) SplitSource(com.facebook.presto.split.SplitSource) Map(java.util.Map) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) HashMap(java.util.HashMap)

Example 2 with PrestoSparkTaskSourceRdd

use of com.facebook.presto.spark.classloader_interface.PrestoSparkTaskSourceRdd in project presto by prestodb.

the class PrestoSparkRddFactory method createTaskSourcesRdd.

private PrestoSparkTaskSourceRdd createTaskSourcesRdd(PlanFragmentId fragmentId, JavaSparkContext sparkContext, Session session, PartitioningHandle partitioning, List<TableScanNode> tableScans, Map<PlanNodeId, SplitSource> splitSources, Optional<Integer> numberOfShufflePartitions) {
    ListMultimap<Integer, SerializedPrestoSparkTaskSource> taskSourcesMap = ArrayListMultimap.create();
    for (TableScanNode tableScan : tableScans) {
        int totalNumberOfSplits = 0;
        SplitSource splitSource = requireNonNull(splitSources.get(tableScan.getId()), "split source is missing for table scan node with id: " + tableScan.getId());
        try (PrestoSparkSplitAssigner splitAssigner = createSplitAssigner(session, tableScan.getId(), splitSource, partitioning)) {
            while (true) {
                Optional<SetMultimap<Integer, ScheduledSplit>> batch = splitAssigner.getNextBatch();
                if (!batch.isPresent()) {
                    break;
                }
                int numberOfSplitsInCurrentBatch = batch.get().size();
                log.info("Found %s splits for table scan node with id %s", numberOfSplitsInCurrentBatch, tableScan.getId());
                totalNumberOfSplits += numberOfSplitsInCurrentBatch;
                taskSourcesMap.putAll(createTaskSources(tableScan.getId(), batch.get()));
            }
        }
        log.info("Total number of splits for table scan node with id %s: %s", tableScan.getId(), totalNumberOfSplits);
    }
    long allTaskSourcesSerializedSizeInBytes = taskSourcesMap.values().stream().mapToLong(serializedTaskSource -> serializedTaskSource.getBytes().length).sum();
    log.info("Total serialized size of all task sources for fragment %s: %s", fragmentId, DataSize.succinctBytes(allTaskSourcesSerializedSizeInBytes));
    List<List<SerializedPrestoSparkTaskSource>> taskSourcesByPartitionId = new ArrayList<>();
    // If the fragment contains any shuffle inputs, this value will be present
    if (numberOfShufflePartitions.isPresent()) {
        // non bucketed tables match, an empty partition must be inserted if bucket is missing.
        for (int partitionId = 0; partitionId < numberOfShufflePartitions.get(); partitionId++) {
            // Eagerly remove task sources from the map to let GC reclaim the memory
            // If task sources are missing for a partition the removeAll returns an empty list
            taskSourcesByPartitionId.add(requireNonNull(taskSourcesMap.removeAll(partitionId), "taskSources is null"));
        }
    } else {
        taskSourcesByPartitionId.addAll(Multimaps.asMap(taskSourcesMap).values());
    }
    return new PrestoSparkTaskSourceRdd(sparkContext.sc(), taskSourcesByPartitionId);
}
Also used : ArrayListMultimap(com.google.common.collect.ArrayListMultimap) WarningCollector(com.facebook.presto.spi.WarningCollector) JsonCodec(com.facebook.airlift.json.JsonCodec) ListMultimap(com.google.common.collect.ListMultimap) RemoteSourceNode(com.facebook.presto.sql.planner.plan.RemoteSourceNode) PrestoSparkTaskRdd(com.facebook.presto.spark.classloader_interface.PrestoSparkTaskRdd) SplitSourceFactory(com.facebook.presto.sql.planner.SplitSourceFactory) PrestoSparkUtils.serializeZstdCompressed(com.facebook.presto.spark.util.PrestoSparkUtils.serializeZstdCompressed) TableWriteInfo(com.facebook.presto.execution.scheduler.TableWriteInfo) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Sets.difference(com.google.common.collect.Sets.difference) PlanFragment(com.facebook.presto.sql.planner.PlanFragment) MutablePartitionId(com.facebook.presto.spark.classloader_interface.MutablePartitionId) PrestoSparkShuffleStats(com.facebook.presto.spark.classloader_interface.PrestoSparkShuffleStats) Map(java.util.Map) Sets.union(com.google.common.collect.Sets.union) FIXED_BROADCAST_DISTRIBUTION(com.facebook.presto.sql.planner.SystemPartitioningHandle.FIXED_BROADCAST_DISTRIBUTION) SplitSource(com.facebook.presto.split.SplitSource) Broadcast(org.apache.spark.broadcast.Broadcast) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) SplitManager(com.facebook.presto.split.SplitManager) Tuple2(scala.Tuple2) SOURCE_DISTRIBUTION(com.facebook.presto.sql.planner.SystemPartitioningHandle.SOURCE_DISTRIBUTION) Codec(com.facebook.airlift.json.Codec) String.format(java.lang.String.format) PrestoSparkTaskProcessor(com.facebook.presto.spark.classloader_interface.PrestoSparkTaskProcessor) DataSize(io.airlift.units.DataSize) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) NOT_SUPPORTED(com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED) PrestoSparkTaskExecutorFactoryProvider(com.facebook.presto.spark.classloader_interface.PrestoSparkTaskExecutorFactoryProvider) SerializedPrestoSparkTaskDescriptor(com.facebook.presto.spark.classloader_interface.SerializedPrestoSparkTaskDescriptor) SerializedTaskInfo(com.facebook.presto.spark.classloader_interface.SerializedTaskInfo) Optional(java.util.Optional) FIXED_HASH_DISTRIBUTION(com.facebook.presto.sql.planner.SystemPartitioningHandle.FIXED_HASH_DISTRIBUTION) RDD(org.apache.spark.rdd.RDD) PrestoSparkUtils.classTag(com.facebook.presto.spark.util.PrestoSparkUtils.classTag) PlanNodeId(com.facebook.presto.spi.plan.PlanNodeId) ARBITRARY_DISTRIBUTION(com.facebook.presto.sql.planner.SystemPartitioningHandle.ARBITRARY_DISTRIBUTION) Logger(com.facebook.airlift.log.Logger) FIXED_ARBITRARY_DISTRIBUTION(com.facebook.presto.sql.planner.SystemPartitioningHandle.FIXED_ARBITRARY_DISTRIBUTION) SINGLE_DISTRIBUTION(com.facebook.presto.sql.planner.SystemPartitioningHandle.SINGLE_DISTRIBUTION) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) HashMap(java.util.HashMap) PrestoException(com.facebook.presto.spi.PrestoException) Multimaps(com.google.common.collect.Multimaps) ArrayList(java.util.ArrayList) Inject(javax.inject.Inject) PrestoSparkTaskSourceRdd(com.facebook.presto.spark.classloader_interface.PrestoSparkTaskSourceRdd) PrestoSparkTaskOutput(com.facebook.presto.spark.classloader_interface.PrestoSparkTaskOutput) ImmutableList(com.google.common.collect.ImmutableList) Objects.requireNonNull(java.util.Objects.requireNonNull) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) ScheduledSplit(com.facebook.presto.execution.ScheduledSplit) PlanFragmentId(com.facebook.presto.sql.planner.plan.PlanFragmentId) CloseableSplitSourceProvider(com.facebook.presto.split.CloseableSplitSourceProvider) FIXED_PASSTHROUGH_DISTRIBUTION(com.facebook.presto.sql.planner.SystemPartitioningHandle.FIXED_PASSTHROUGH_DISTRIBUTION) PlanNodeSearcher.searchFrom(com.facebook.presto.sql.planner.optimizations.PlanNodeSearcher.searchFrom) PrestoSparkTaskDescriptor(com.facebook.presto.spark.PrestoSparkTaskDescriptor) SerializedPrestoSparkTaskSource(com.facebook.presto.spark.classloader_interface.SerializedPrestoSparkTaskSource) PrestoSparkMutableRow(com.facebook.presto.spark.classloader_interface.PrestoSparkMutableRow) Session(com.facebook.presto.Session) TaskSource(com.facebook.presto.execution.TaskSource) SCALED_WRITER_DISTRIBUTION(com.facebook.presto.sql.planner.SystemPartitioningHandle.SCALED_WRITER_DISTRIBUTION) CollectionAccumulator(org.apache.spark.util.CollectionAccumulator) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SetMultimap(com.google.common.collect.SetMultimap) PlanNode(com.facebook.presto.spi.plan.PlanNode) TableScanNode(com.facebook.presto.spi.plan.TableScanNode) PartitioningHandle(com.facebook.presto.sql.planner.PartitioningHandle) PartitioningProviderManager(com.facebook.presto.sql.planner.PartitioningProviderManager) COORDINATOR_DISTRIBUTION(com.facebook.presto.sql.planner.SystemPartitioningHandle.COORDINATOR_DISTRIBUTION) ArrayList(java.util.ArrayList) SerializedPrestoSparkTaskSource(com.facebook.presto.spark.classloader_interface.SerializedPrestoSparkTaskSource) SetMultimap(com.google.common.collect.SetMultimap) TableScanNode(com.facebook.presto.spi.plan.TableScanNode) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) SplitSource(com.facebook.presto.split.SplitSource) PrestoSparkTaskSourceRdd(com.facebook.presto.spark.classloader_interface.PrestoSparkTaskSourceRdd)

Aggregations

PrestoSparkTaskDescriptor (com.facebook.presto.spark.PrestoSparkTaskDescriptor)2 MutablePartitionId (com.facebook.presto.spark.classloader_interface.MutablePartitionId)2 PrestoSparkMutableRow (com.facebook.presto.spark.classloader_interface.PrestoSparkMutableRow)2 PrestoSparkTaskProcessor (com.facebook.presto.spark.classloader_interface.PrestoSparkTaskProcessor)2 PrestoSparkTaskSourceRdd (com.facebook.presto.spark.classloader_interface.PrestoSparkTaskSourceRdd)2 SerializedPrestoSparkTaskDescriptor (com.facebook.presto.spark.classloader_interface.SerializedPrestoSparkTaskDescriptor)2 PlanNodeId (com.facebook.presto.spi.plan.PlanNodeId)2 TableScanNode (com.facebook.presto.spi.plan.TableScanNode)2 CloseableSplitSourceProvider (com.facebook.presto.split.CloseableSplitSourceProvider)2 SplitSource (com.facebook.presto.split.SplitSource)2 SplitSourceFactory (com.facebook.presto.sql.planner.SplitSourceFactory)2 PlanFragmentId (com.facebook.presto.sql.planner.plan.PlanFragmentId)2 ImmutableMap.toImmutableMap (com.google.common.collect.ImmutableMap.toImmutableMap)2 HashMap (java.util.HashMap)2 Map (java.util.Map)2 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)2 RDD (org.apache.spark.rdd.RDD)2 Tuple2 (scala.Tuple2)2 Codec (com.facebook.airlift.json.Codec)1 JsonCodec (com.facebook.airlift.json.JsonCodec)1