use of com.facebook.presto.spark.classloader_interface.PrestoSparkTaskProcessor in project presto by prestodb.
the class PrestoSparkRddFactory method createRdd.
private <T extends PrestoSparkTaskOutput> JavaPairRDD<MutablePartitionId, T> createRdd(JavaSparkContext sparkContext, Session session, PlanFragment fragment, PrestoSparkTaskExecutorFactoryProvider executorFactoryProvider, CollectionAccumulator<SerializedTaskInfo> taskInfoCollector, CollectionAccumulator<PrestoSparkShuffleStats> shuffleStatsCollector, TableWriteInfo tableWriteInfo, Map<PlanFragmentId, JavaPairRDD<MutablePartitionId, PrestoSparkMutableRow>> rddInputs, Map<PlanFragmentId, Broadcast<?>> broadcastInputs, Class<T> outputType) {
checkInputs(fragment.getRemoteSourceNodes(), rddInputs, broadcastInputs);
PrestoSparkTaskDescriptor taskDescriptor = new PrestoSparkTaskDescriptor(session.toSessionRepresentation(), session.getIdentity().getExtraCredentials(), fragment, tableWriteInfo);
SerializedPrestoSparkTaskDescriptor serializedTaskDescriptor = new SerializedPrestoSparkTaskDescriptor(taskDescriptorJsonCodec.toJsonBytes(taskDescriptor));
Optional<Integer> numberOfShufflePartitions = Optional.empty();
Map<String, RDD<Tuple2<MutablePartitionId, PrestoSparkMutableRow>>> shuffleInputRddMap = new HashMap<>();
for (Map.Entry<PlanFragmentId, JavaPairRDD<MutablePartitionId, PrestoSparkMutableRow>> input : rddInputs.entrySet()) {
RDD<Tuple2<MutablePartitionId, PrestoSparkMutableRow>> rdd = input.getValue().rdd();
shuffleInputRddMap.put(input.getKey().toString(), rdd);
if (!numberOfShufflePartitions.isPresent()) {
numberOfShufflePartitions = Optional.of(rdd.getNumPartitions());
} else {
checkArgument(numberOfShufflePartitions.get() == rdd.getNumPartitions(), "Incompatible number of input partitions: %s != %s", numberOfShufflePartitions.get(), rdd.getNumPartitions());
}
}
PrestoSparkTaskProcessor<T> taskProcessor = new PrestoSparkTaskProcessor<>(executorFactoryProvider, serializedTaskDescriptor, taskInfoCollector, shuffleStatsCollector, toTaskProcessorBroadcastInputs(broadcastInputs), outputType);
Optional<PrestoSparkTaskSourceRdd> taskSourceRdd;
List<TableScanNode> tableScans = findTableScanNodes(fragment.getRoot());
if (!tableScans.isEmpty()) {
try (CloseableSplitSourceProvider splitSourceProvider = new CloseableSplitSourceProvider(splitManager::getSplits)) {
SplitSourceFactory splitSourceFactory = new SplitSourceFactory(splitSourceProvider, WarningCollector.NOOP);
Map<PlanNodeId, SplitSource> splitSources = splitSourceFactory.createSplitSources(fragment, session, tableWriteInfo);
taskSourceRdd = Optional.of(createTaskSourcesRdd(fragment.getId(), sparkContext, session, fragment.getPartitioning(), tableScans, splitSources, numberOfShufflePartitions));
}
} else if (rddInputs.size() == 0) {
checkArgument(fragment.getPartitioning().equals(SINGLE_DISTRIBUTION), "SINGLE_DISTRIBUTION partitioning is expected: %s", fragment.getPartitioning());
// In case of no inputs we still need to schedule a task.
// Task with no inputs may produce results (e.g.: ValuesNode).
// To force the task to be scheduled we create a PrestoSparkTaskSourceRdd that contains exactly one partition.
// Since there's also no table scans in the fragment, the list of TaskSource's for this partition is empty.
taskSourceRdd = Optional.of(new PrestoSparkTaskSourceRdd(sparkContext.sc(), ImmutableList.of(ImmutableList.of())));
} else {
taskSourceRdd = Optional.empty();
}
return JavaPairRDD.fromRDD(PrestoSparkTaskRdd.create(sparkContext.sc(), taskSourceRdd, shuffleInputRddMap, taskProcessor), classTag(MutablePartitionId.class), classTag(outputType));
}
Aggregations